Транспонировать в табличный формат, используя массивы и словарь - PullRequest
0 голосов
/ 01 мая 2020

Я пытаюсь преобразовать данные из трёх столбцов в табличный формат. Уникальные элементы из первого столбца я получаю без проблем, но не знаю, как сделать то же самое с заголовками, так как заголовки, в отличие от значений первого столбца, не являются уникальными.

' Attempt to pivot the three-column list in A1:C<last used row> into a table held
' in array b: keys from column 1 go down b's first column, values from column 2
' go across b's first row, and values from column 3 fill the body.
' As written the result is only built in memory; nothing writes b to the sheet.
Sub Test()
    ' d2, s and m are declared but never used below.
    Dim a, d As Object, d2 As Object, s As String, i As Long, m As Long
    Dim k As Long, c As Long

    ' Load the source range (including its first row) into a 1-based 2D array.
    a = Range("A1:C" & Cells(Rows.Count, 1).End(xlUp).Row).Value
    ' b is square: both dimensions are sized by the number of source rows.
    ReDim b(1 To UBound(a, 1), 1 To UBound(a, 1))
    ' k = last output row used, c = current output column; copy the top-left cell.
    k = 1: c = 2: b(1, 1) = a(1, 1)
    Set d = CreateObject("Scripting.Dictionary")

    For i = 2 To UBound(a)
        If Not d.Exists(a(i, 1)) Then
            ' First time this key is seen: start a new output row for it.
            k = k + 1
            b(k, 1) = a(i, 1)

            ' Register the key so later rows with the same key take the Else branch.
            d(a(i, 1)) = a(i, 2)

            ' Header and value are written at the current column c; c is not advanced here.
            b(1, c) = a(i, 2)
            b(k, c) = a(i, 3)
        Else
            ' Key already seen: always move one column to the right, without checking
            ' whether a column with this header already exists.
            c = c + 1
            b(1, c) = a(i, 2)
            ' k is the row of the most recently added key, which is not necessarily
            ' the row that belongs to a(i, 1).
            b(k, c) = a(i, 3)
        End If
    Next i

    'Range("E10").Resize(d.Count).Value = Application.Transpose(d.keys)

End Sub

Вот снимок «до» и «после», чтобы лучше объяснить проблему: enter image description here

1 Ответ

1 голос
/ 03 мая 2020

Следующий код должен показать вам необходимую логику. Мне пришлось сначала написать его на Python, а затем перевести на VBA (код на Python написан не в стиле pythonic, а так, чтобы его можно было перевести на VBA).

Option Explicit

Public Sub ReshapeData()
    ' Reshape the three-column list in A2:C11 of the active sheet
    ' (column 1 = row key, column 2 = column name, column 3 = value)
    ' into a cross-tab written to the same sheet starting at E1.
    '
    ' Output layout: row 1 holds "Header1" followed by the column names; each
    ' distinct key gets one row, in order of first appearance. A name is repeated
    ' as a header as many times as the largest number of occurrences of that name
    ' under any single key, so every source value has its own cell.
    '
    ' Scripting.Dictionary is early-bound here, so the project needs a reference
    ' to Microsoft Scripting Runtime. Relies on get_col_number in this module.

    Dim df1(), headers As Object, rows As Scripting.Dictionary
    Dim seenCount As Scripting.Dictionary, colCount As Scripting.Dictionary

    df1 = ActiveSheet.Range("A2:C11").Value      'no headers and 2d indexed from 1
    Set headers = CreateObject("System.Collections.ArrayList")
    Set rows = New Scripting.Dictionary
    Set seenCount = New Scripting.Dictionary     'occurrences so far of each "key:name" pair
    Set colCount = New Scripting.Dictionary      'columns already allocated to each name

    'Populate headers

    headers.Add "Header1"

    Dim r As Long
    Dim header1 As String, name As String, curr_id As String

    For r = LBound(df1, 1) To UBound(df1, 1)     'loop rows r = 1; no headers

        header1 = df1(r, 1)
        name = df1(r, 2)

        rows(header1) = vbNullString             'collect distinct keys in first-seen order

        curr_id = header1 & ":" & name

        ' Reading a missing dictionary key yields Empty, which counts as 0 here.
        seenCount(curr_id) = seenCount(curr_id) + 1

        ' Allocate another column for this name only when this key now has more
        ' occurrences of it than there are columns. Counting occurrences (rather
        ' than comparing values) keeps repeated equal values from losing their
        ' column and avoids surplus columns when several keys repeat a name.
        If seenCount(curr_id) > colCount(name) Then
            headers.Add name
            colCount(name) = colCount(name) + 1
        End If
    Next r

    Dim nRow As Long, nCol As Long
    ' determine array dimensions (will use df in python)
    nRow = rows.Count + 1                        '+1 for the header row
    nCol = headers.Count

    Dim rowsArr()

    rowsArr = rows.keys

    For r = LBound(rowsArr) To UBound(rowsArr)   'starts at 0 updated dict with these
        rows(rowsArr(r)) = r + 2                 ' as headers will be in row 1
    Next r

    ' generate array (vba)/df(python)

    Dim df()
    ReDim df(1 To nRow, 1 To nCol)

    Dim headersArr()

    headersArr = headers.ToArray                 'assumed to be a 0-based array

    For r = LBound(headersArr) To UBound(headersArr) ' r = 0
        df(1, r + 1) = headersArr(r)
    Next r

    'populate first column of array (vba)/df(python)

    For r = LBound(rowsArr) To UBound(rowsArr)   ' r = 0
        df(r + 2, 1) = rowsArr(r)                'assuming 0 indexing
    Next r

    ' Appropriately populate vba array  | python df
    Dim id_count As Scripting.Dictionary, row As Long, col As Long

    Set id_count = New Scripting.Dictionary

    For r = LBound(df1, 1) To UBound(df1, 1)     ' r = 1

        header1 = df1(r, 1)
        name = df1(r, 2)
        curr_id = header1 & ":" & name
        row = rows(header1)
        id_count(curr_id) = id_count(curr_id) + 1

        ' The n-th occurrence of a key/name pair goes into the n-th column carrying that name.
        col = get_col_number(headersArr, name, id_count(curr_id))

        df(row, col) = df1(r, 3)
    Next r

    ActiveSheet.Cells(1, 5).Resize(UBound(df, 1), UBound(df, 2)).Value = df
End Sub

Public Function get_col_number(ByRef headersArr(), ByVal name As String, ByVal required_match As Long) As Long
    ' Return the 1-based column number of the required_match-th element of
    ' headersArr equal to name (required_match counts from 1).
    ' The result is the array index plus 1, so it is a correct column number
    ' when headersArr is 0-based. Returns 0 when there is no such occurrence.

    Dim idx As Long, hits As Long

    For idx = LBound(headersArr) To UBound(headersArr)
        If headersArr(idx) = name Then
            hits = hits + 1
            If hits = required_match Then
                get_col_number = idx + 1
                Exit For
            End If
        End If
    Next idx

End Function

Тестовый скрипт на Python (написан не в стиле pythonic по причине, указанной выше):

import pandas as pd

def get_col_number(headers:list, name:str, required_match:int)->int:
    """Return the index in *headers* of one particular occurrence of *name*.

    A name can appear several times in *headers*; *required_match* selects
    which occurrence is wanted, counting from 0. The selection is a plain
    list index into the matching positions, so an occurrence that does not
    exist raises ``IndexError``.
    """
    positions = []
    for idx, header in enumerate(headers):
        if header == name:
            positions.append(idx)
    return positions[required_match]

# Sample input in long form, one record per row: (Header1 key, Header2 name, Header3 value).
df1 = pd.DataFrame(
    [
        ('A100', 'Ahmed', 'Value1'),
        ('A100', 'Yasser', 'Value2'),
        ('A100', 'Ahmed', 'Value3'),
        ('B200', 'Yasser', 'Value5'),
        ('C300', 'Ahmed', 'Value6'),
        ('C300', 'Khalil', 'Value7'),
        ('D400', 'Yasser', 'Value9'),
        ('D400', 'Ahmed', 'Value10'),
        ('A100', 'Ahmed', 'Value4'),
        ('C300', 'Yasser', 'Value8'),
    ],
    columns=['Header1', 'Header2', 'Header3'],
)


def main():
    """Pivot the module-level ``df1`` into a wide table and print input and output.

    Column 0 of ``df1`` supplies the row keys, column 1 the column names and
    column 2 the cell values. A name is repeated as an output column as many
    times as the largest number of occurrences of that name under any single
    key, so every input value gets its own cell.

    The code is deliberately written in a loop-and-dictionary style so that it
    can be translated to VBA line by line.
    """

    print('input dataframe.....')
    print(100 * '==')
    print(df1)

    headers = ['Header1'] # this would need to be an array in vba or an arrayList?
                          # CreateObject("System.Collections.ArrayList") then have .Add and finally .ToArray to retrieve as array
    rows = {}
    seen_count = {}  # occurrences so far of each header1:name id (Scripting.Dictionary in VBA)
    col_count = {}   # number of columns already allocated to each name

    # Iterate by position, not by index label, because the rows are read with iloc.
    for r in range(len(df1)):  # VBA: For r = LBound(arr, 1) To UBound(arr, 1)
        header1 = df1.iloc[r,0]
        name = df1.iloc[r,1]
        rows[header1] = ''  # collect distinct keys in first-seen order

        curr_id = ':'.join([header1, name]) #concatenate header1 and header2 to give an id
        seen_count[curr_id] = seen_count.get(curr_id, 0) + 1

        # Allocate another column for this name only when this key now has more
        # occurrences of it than there are columns. Counting occurrences (rather
        # than comparing values) keeps repeated equal values from losing their
        # column and avoids surplus columns when several keys repeat a name.
        if seen_count[curr_id] > col_count.get(name, 0):
            headers.append(name)
            col_count[name] = col_count.get(name, 0) + 1

    # determine array dimensions (df in python)
    nrow = len(rows)
    ncol = len(headers)
    rows = {r:n for n, r in enumerate(rows.keys())} # for python need to know row number for Header1 values so updated dict with these

    # generate array (vba)/df(python)

    df = pd.DataFrame([["" for c in range(ncol)] for r in range(nrow)])  #this would be a dimensioned array in vba
    df.columns = headers

    #populate first column of array (vba)/will use df(python)
    for r, key in enumerate(rows.keys()):  # this would be for r = lbound(arr,1)  where arr = rows.keys() ; key = arr(r)
        df.iloc[r,0] = key

    # Appropriately populate vba array  | python use a df
    id_count = {}

    for r in range(len(df1)):    # loop df rows; this would be rows of VBA array i.e. dimension 1
        header1 = df1.iloc[r,0]
        name = df1.iloc[r,1]
        curr_id = ':'.join([header1, name])
        row = rows[header1]

        # 0-based occurrence number of this id, matching get_col_number's required_match
        if curr_id in id_count:
            id_count[curr_id]+=1
        else:
            id_count[curr_id] = 0

        col = get_col_number(headers, name,id_count[curr_id])  # the n-th occurrence goes to the n-th column with that name

        try:
            df.iloc[row ,col] = df1.iloc[r,2]  # headers won't be included in python indexing. VBA indexing will include headers as row 0.
        except IndexError:
            # Report the offending input row and target cell, then carry on with the rest.
            print(list(df1.iloc[r]), row, col)

    print()
    print('output dataframe.....')
    print(100 * '==')
    print(df)

# Run the demonstration only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...