Как ускорить VBA для извлечения данных из Word в Excel - PullRequest
0 голосов
/ 15 января 2020

Я новичок в stackoverflow и новичок ie в кодировке VBA. На моей работе нам предоставляют данные об отправке в форме Ms Word, что не очень полезно. Я нашел способ передачи данных с помощью VBA и получил полностью функциональный код. Однако набор данных содержит сотни тысяч записей. Я попытался запустить данные за месяц с 200 тыс. Записей, и это заняло 5 дней. Просто интересно, есть ли в моем коде что-нибудь, что я мог бы улучшить, чтобы ускорить процесс. Я пытался отключить обновления экрана, события, расчеты, но это мало что дало. Заранее благодарим за помощь.

Sub Word_to_Excel()

Dim FName As String, FD As FileDialog
Dim wdApp As Object
Dim wdDoc As Object
Dim WDR, WDCheck, ShipmentID As Object
Dim ExR As Range
Dim file
Dim Path As String
Dim ImportDate As Object
Dim ImportValue As String
Dim ShipmentIDcheck As String
Dim objResult



    Set objShell = CreateObject("WScript.Shell")

    Set ExR = Selection ' current location in Excel Sheet

    ' Select Folder containing word documents
    Set FD = Application.FileDialog(msoFileDialogFolderPicker)
    FD.Show
    FName = FD.SelectedItems(1)

    file = Dir(FName & "\*.docx")

    Set wdApp = CreateObject("Word.Application")

    ' Open word document in the folder, run macro, close it and open the next word document until there are none left
    Do While file <> ""
    wdApp.Documents.Open Filename:=FName & "\" & file
    wdApp.ActiveWindow.ActivePane.View.Type = 1
    wdApp.Visible = True

    ' Once the word doc is open, go to beginning of document and search for CTY/SITE/SORT:
    wdApp.Selection.HomeKey Unit:=6
    wdApp.Selection.Find.ClearFormatting
    wdApp.Selection.Find.Execute "CTY/SITE/SORT:"
    Set WDCheck = wdApp.Selection

    ' If "CTY/SITE/SORT:" is found, then look for Shipment ID
    Do While WDCheck = "CTY/SITE/SORT:"

    ' Find first shipment
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=11
    wdApp.Selection.MoveRight Unit:=1, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set ShipmentID = wdApp.Selection
    ShipmentIDcheck = Replace(ShipmentID, " ", "")

    ' Transfer information from Word to Excel for a Shipment ID and go to the next one.
    ' Shipment ID should be a string that is 11 characters long
    ' If Shipment ID no longer exist, go to next page by searching for the next CTY/SITE/SORT:
    Do While Len(Trim(ShipmentIDcheck)) = 11
        i = i + 1
        ExR(i, 1) = file
        ExR(i, 2) = ShipmentIDcheck

    ' Consignee Name
    wdApp.Selection.MoveUp Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=12
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 3) = Trim(WDR)

    ' Importer Name
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 8) = Trim(WDR)

    ' Shipper Name
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 13) = Trim(WDR)

    ' Quantity
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=10, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 19) = Trim(WDR)

    ' Weight
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=12, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 20) = Trim(WDR)

    ' Value
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=12, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 21) = Trim(WDR)

    ' Broker
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 23) = Trim(WDR)

    ' Consignee Street
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=13
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 4) = Trim(WDR)

    ' Importer Street
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 9) = Trim(WDR)

    ' Shipper Street
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 14) = Trim(WDR)

    ' Description
    wdApp.Selection.MoveRight Unit:=1, Count:=8
    wdApp.Selection.MoveRight Unit:=1, Count:=40, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 18) = Trim(WDR)

    ' Consignee City
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=13
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 5) = Trim(WDR)

    ' Consignee Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 6) = Trim(WDR)

    ' Consignee Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 7) = Trim(WDR)

    ' Importer City
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 10) = Trim(WDR)

    ' Importer Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 11) = Trim(WDR)

    ' Importer Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 12) = Trim(WDR)

    ' Shipper City
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 15) = Trim(WDR)

    ' Shipper Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 16) = Trim(WDR)

    ' Shipper Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 17) = Trim(WDR)

    ' Country of Origin
    wdApp.Selection.MoveRight Unit:=1, Count:=29
    wdApp.Selection.MoveRight Unit:=1, Count:=21, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 22) = Trim(WDR)

    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set ShipmentID = wdApp.Selection
    ' Remove spaces from selection. Selection is then used to check if it is a shipment ID.
    ' If it is, then data for that shipment ID is transferred. If not, macro will go to the next page in the Word Doc.
    ShipmentIDcheck = Replace(ShipmentID, " ", "")

    ActiveCell.Offset(1).Select

    Loop

    'Simulate keyboard press "NUMLOCK" to prevent screen from locking
    objResult = objShell.SendKeys("{NUMLOCK}")

    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.Find.ClearFormatting
    wdApp.Selection.Find.Execute "CTY/SITE/SORT:"
    Set WDCheck = wdApp.Selection

    Loop

    wdApp.ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
    ActiveWorkbook.Save
    file = Dir()

    Loop

    wdApp.Quit

    MsgBox "Data extraction completed at:" & vbNewLine & Format(Now, "mmmm d, yyyy hh:mm AM/PM")

End Sub

Вот как форматируется набор данных. Есть несколько документов Word, содержащих страницы и страницы этого набора данных в день. Количество отправлений на странице варьируется. Но формат одинаков во всем. В документах word нет таблиц, только текст, разделенный пробелами. CTY / SITE / SORT: уникален для каждой страницы, и я использовал его как опорную точку. если макрос находит его, он идет вниз на 11 строк и получает первый идентификатор отгрузки и другую информацию. Затем он проверяет следующий идентификатор доставки. Если его там нет, то он переходит на следующую страницу и повторяет процесс.

REPORT NUM   : ABC1234                                   OPERATIONS SYSTEM                                       PAGE NUM:   2      
 CTY/SITE/SORT: CA 00123                                    SUMMARY CARGO                                         RUN TIME: 07:33:43 
 SORT DATE    :                                            INBOUND - SCAN                                         RUN DATE: 01AUG19  
                                                                                                                  OPER ID : ABC123  
 MVMT: 12345678   MVMT DT: 01AUG19    MAWB:                  PROD TYP:      DTY TYP:      IMP CTY:      EXP CTY:      BL TYP:        
                        COURIER REMISSION  MANIFEST               EXPORT SITE: US 12345                                

 GCCN ID:               EXPECTED SHPTS:           EXPECTED PKGS:             EXPECTED WEIGHT:                                        

 CUSTOMS NUM     CONSIGNEE NAME           IMPORTER NAME           SHIPPER NAME        CSA    QTY     WGT(LBS)   VALUE  BROKER        
 SHIPMENT ID                                                                               DESCRIPTION           (CAD) CTRY OF ORIGIN
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
       TOTAL FOR DUTY TYPE COURIER REMISSION                                                                                         
       TOTAL SHIPMENTS:                      4                                                                                       
       TOTAL PACKAGES :                      4                                                                                       
       TOTAL WEIGHT   :                     70.9 LBS                                                                                 
       TOTAL VALUES   :                   4000.00                                                                                         
* * *                                      

Я использовал следующий код для очистки набора данных и упорядочения их по одной записи в строке, и каждая строка разделяется абзац (спасибо, макропод). Поскольку данные организованы в столбцы, разделенные пробелами, я могу сохранить их в виде файла .txt и импортировать в Excel. Теперь задача состоит в том, чтобы применить код ко всем документам в папке и создать файл .txt для каждого. Или было бы еще лучше, если бы код мог объединить все данные из очищенных файлов .docx в один файл .txt.

Sub CleanWordDoc()
Application.ScreenUpdating = False
Dim p As Long, StrOut As String

With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "REPORT NUM   : * CTRY OF ORIGIN^13" 'Clean header on each page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}ACTUAL SHP TOTAL*[ ]{20,}^13^m" 'Clean footer on some pages
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}TOTAL FOR DUTY*[ ]{20,}^13^m" 'Clean more footers
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   :*SUMMARY*[\*] [\*][ ]{20,}^13" 'Clean last page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^m^13" ' Clean all page breaks
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13^13" ' Clean empty paragraphs
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "<[ ]{1,}^13" ' Clean spaces and paragraphs at the beginning of doc
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "(*)^13(*)^13(*^13)" ' Combine 3 paragraphs into one
    .Replacement.Text = "\1 \2 \3"
    .Execute Replace:=wdReplaceAll
  End With

End With
Application.ScreenUpdating = True
End Sub

И вот так выглядит очищенный файл .docx (с сотнями записей ):

 12345678900 ABC COMPANY             DEF COMPANY             XYZ COMPANY                      1          1.1       123.45 AAABROKER    A0B12345LFD ABC ADDRESS             DEF ADDRESS             XYZ ADDRESS                   BICYCLE PARTS                                           VANCOUVER     BC V1A1A1 MARKHAM       ON L1L1L1 SHENZHEN         512323                            CN                   
 98765432100 ABC COMPANY             DEF COMPANY             XYZ COMPANY                      1          1.1       123.45 AAABROKER    A0B12345LFD ABC ADDRESS             DEF ADDRESS             XYZ ADDRESS                   BICYCLE PARTS                                           VANCOUVER     BC V1A1A1 MARKHAM       ON L1L1L1 SHENZHEN         512323                            CN                   

Ответы [ 3 ]

0 голосов
/ 28 января 2020

Ваш код работает медленно, потому что вы заставляете Word анализировать ваши данные. Было бы намного быстрее обработать его как обычный текст.

Я бы выбрал подход, чтобы убедить вашего поставщика данных предоставить его в виде текстовых файлов. Если это невозможно, то напишите программу VBA для преобразования каждого файла Word в текст.

Как только это будет сделано, используйте простую обработку текстового файла, чтобы прочитать каждую строку из файлов, проанализировать и извлечь необходимые данные в массив вариантов , затем запишите результат в Excel.

Примечание: я не включил полный код для анализа и извлечения ваших данных, я оставлю это вам. В комплекте небольшой фрагмент, чтобы вы начали.

Примерно так

Option Explicit

Sub Demo()
    Dim t1 As Single, t2 As Single
    Dim DataFile As String
    Dim DataPath As String
    Dim SavePath As String
    Dim rw As Long
    Dim ws As Worksheet
    Dim WordApp As Word.Application

    On Error GoTo EH

    'identify sheet to take results
    Set ws = ActiveSheet

    t1 = Timer() '<~~ only used to report run time

    ' Create an instance of Word
    Set WordApp = New Word.Application
    WordApp.Visible = False

    ' Set up path to data files
    DataPath = "C:\Data\Temp\SO\" '<~~ update to suit
    SavePath = DataPath & "Txt\" '<~~ optional: save text files to a seperate subfolder

    ' Get first word file in directory
    DataFile = Dir(DataPath & "*.docx")
    Do While DataFile <> vbNullString
        Debug.Print "Convert ", DataFile
        ' Open in word, save as text
        ConvertToText WordApp, DataPath, DataFile, SavePath
        DoEvents

        ' Get next file
        DataFile = Dir
    Loop

    ' Tidy up
    WordApp.Quit
    Set WordApp = Nothing

    t2 = Timer

    Debug.Print "Convert Time", t2 - t1


    t1 = Timer()
    ' Get first text file in directory
    DataFile = Dir(SavePath & "*.txt")
    rw = 1
    Do While DataFile <> vbNullString
        Debug.Print "Read ", DataFile
        ' process the file
        ReadFile ws, SavePath, DataFile, rw
        DoEvents
        ' Get next file
        DataFile = Dir
    Loop


    t2 = Timer

    Debug.Print "Read Time", t2 - t1

Exit Sub
EH:
    On Error Resume Next
    ' Tidy up
    If Not WordApp Is Nothing Then WordApp.Quit
    Set WordApp = Nothing

End Sub

Sub ConvertToText(WordApp As Word.Application, ByVal FilePath As String, ByVal FileName As String, ByVal SavePath As String)
    Dim WordDoc As Word.Document
    Dim i As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Open the file
    Set WordDoc = WordApp.Documents.Open(FilePath & FileName)

    ' generate Text file name
    i = InStrRev(FileName, ".")
    FileName = Left$(FileName, i) & "txt"

    ' Save as text
    WordDoc.SaveAs2 _
        FileName:=SavePath & FileName, _
        FileFormat:=wdFormatText, _
        AddToRecentFiles:=False, _
        EmbedTrueTypeFonts:=False, _
        SaveNativePictureFormat:=False, _
        SaveFormsData:=False, _
        SaveAsAOCELetter:=False, _
        Encoding:=1252, _
        InsertLineBreaks:=False, _
        AllowSubstitutions:=False, _
        LineEnding:=0, _
        CompatibilityMode:=0

EH:
    On Error Resume Next
    ' Close file
    WordDoc.Close False

End Sub

Sub ReadFile(ws As Worksheet, FilePath As String, FileName As String, ByRef rw)
    'parse text file
    Dim Ln As String
    Dim FileNum As Integer

    Dim ExtractedData() As Variant
    Dim idx As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Text file handling
    FileNum = FreeFile
    Open FilePath & FileName For Input As FileNum

    ' Restults array.
    ReDim ExtractedData(1 To 1000000, 1 To 1) ' Excel sheet can hold at most 1048576 rows
    idx = 0
    Do While Not EOF(FileNum)
        ' Read a line from file
        Line Input #FileNum, Ln

        ' Add your code to extract required data here
        If Ln Like " [A-Z][A-Z][A-z]########*" Then
            idx = idx + 1
            ExtractedData(idx, 1) = Ln
        End If
        '============================================
    Loop
    ' Place extracted data onto sheet
    ws.Cells(rw, 1).Resize(idx, 1) = ExtractedData
    ' Update row num for next file
    rw = rw + idx

EH:
    On Error Resume Next
    ' Clean Up
    Close #FileNum
End Sub



0 голосов
/ 04 февраля 2020

Проблема решена. Спасибо @chris neilsen и @macropod за помощь.

Это законченный код, который я использовал, и он может обрабатывать данные в считанные минуты, а не дни:

Option Explicit

Sub ConvertWordtoExcel()
    Dim t1 As Single, t2 As Single
    Dim DataFile As String
    Dim DataPath As String
    Dim SavePath As String
    Dim SavePathFolder As String
    Dim rw As Long
    Dim ws As Worksheet
    Dim WordApp As Word.Application
    Dim FD As FileDialog

    On Error GoTo EH

    'identify sheet to take results
    Set ws = ActiveSheet

    t1 = Timer() '<~~ only used to report run time

    ' Create an instance of Word
    Set WordApp = New Word.Application
    WordApp.Visible = False

    ' Set up path to data files
    Set FD = Application.FileDialog(msoFileDialogFolderPicker) 'Open Folder Picker
    FD.Show
    DataPath = FD.SelectedItems(1) & "\"
    Debug.Print "Folder", DataPath
    SavePath = DataPath & "Txt\" '<~~ save text files to a separate subfolder called Txt
    SavePathFolder = Dir(SavePath, vbDirectory) ' If the Txt subfolder does not exist, create it
        If SavePathFolder = vbNullString Then
            VBA.FileSystem.MkDir (SavePath)
        End If

    ' Get first word file in directory
    DataFile = Dir(DataPath & "*.docx")
    Do While DataFile <> vbNullString
        Debug.Print "Convert ", DataFile
        ' Open in word, save as text
        ConvertToText WordApp, DataPath, DataFile, SavePath
        DoEvents

        ' Get next file
        DataFile = Dir
    Loop

    ' Tidy up
    WordApp.Quit
    Set WordApp = Nothing

    t2 = Timer

    Debug.Print "Convert Time", t2 - t1


    t1 = Timer()
    ' Get first text file in directory
    DataFile = Dir(SavePath & "*.txt")
    rw = 1
    Do While DataFile <> vbNullString
        Debug.Print "Read ", DataFile
        ' process the file
        ReadFile ws, SavePath, DataFile, rw
        DoEvents
        ' Get next file
        DataFile = Dir
    Loop


    t2 = Timer

    Debug.Print "Read Time", t2 - t1

Exit Sub
EH:
    On Error Resume Next
    ' Tidy up
    If Not WordApp Is Nothing Then WordApp.Quit
    Set WordApp = Nothing

End Sub

Sub ConvertToText(WordApp As Word.Application, ByVal FilePath As String, ByVal FileName As String, ByVal SavePath As String)
    Dim WordDoc As Word.Document
    Dim i As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Open the file
    Set WordDoc = WordApp.Documents.Open(FilePath & FileName)

    With WordDoc.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "[ ]{2,}[^13]{1,}(REPORT NUM   :)" 'Clean header on each page
    .Replacement.Text = "\1"
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   : * CTRY OF ORIGIN^13" 'Clean header on each page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}ACTUAL SHP TOTAL*[ ]{20,}^13^m" 'Clean footer on some pages
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}TOTAL FOR DUTY*[ ]{20,}^13^m" 'Clean more footers
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   :*SUMMARY*[\*] [\*][ ]{20,}[^13]{1,}" 'Clean last page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13^m" ' Clean all page breaks
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "[^13]{2,}" ' Clean empty paragraphs
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "(*)^13(*)^13(*)^13" ' Combine 3 paragraphs into one and add file name at the end
    .Replacement.Text = "\1 \2 \3 " + FileName + "^13"
    .Execute Replace:=wdReplaceAll
  End With
  End With


    ' generate Text file name
    i = InStrRev(FileName, ".")
    FileName = Left$(FileName, i) & "txt"

    ' Save as text
    WordDoc.SaveAs2 _
        FileName:=SavePath & FileName, _
        FileFormat:=wdFormatText, _
        AddToRecentFiles:=False, _
        EmbedTrueTypeFonts:=False, _
        SaveNativePictureFormat:=False, _
        SaveFormsData:=False, _
        SaveAsAOCELetter:=False, _
        Encoding:=1252, _
        InsertLineBreaks:=False, _
        AllowSubstitutions:=False, _
        LineEnding:=0, _
        CompatibilityMode:=0

EH:
    On Error Resume Next
    ' Close file
    WordDoc.Close False

End Sub

Sub ReadFile(ws As Worksheet, FilePath As String, FileName As String, ByRef rw)
    'parse text file
    Dim Ln As String
    Dim FileNum As Integer

    Dim ExtractedData() As Variant
    Dim idx As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Text file handling
    FileNum = FreeFile
    Open FilePath & FileName For Input As FileNum

    ' Restults array.
    ReDim ExtractedData(1 To 1000000, 1 To 1) ' Excel sheet can hold at most 1048576 rows
    idx = 0
    Do While Not EOF(FileNum)
        ' Read a line from file
        Line Input #FileNum, Ln

        ' Add your code to extract required data here
        'If Ln Like " [A-Z][A-Z][A-z]########*" Then
            If Ln Like " *" Then
            idx = idx + 1
            ExtractedData(idx, 1) = Ln
        End If
        'End If
        '============================================
    Loop
    ' Place extracted data onto sheet
    ws.Cells(rw, 1).Resize(idx, 1) = ExtractedData
    ' Update row num for next file
    rw = rw + idx

EH:
    On Error Resume Next
    ' Clean Up
    Close #FileNum
End Sub
0 голосов
/ 15 января 2020

Для меня неясно, где или как «CTY / SITE / SORT:» относится к тому, что вы делаете, поскольку оно не отображается в фрагменте данных, который вы опубликовали. Следующий макрос Word показывает, как вы можете go проанализировать документ, который содержит только данные в опубликованном вами фрагменте данных. Как закодировано, он просто выводит первую такую ​​запись в конце документа - код, необходимый для генерации вывода для всего документа, был закомментирован. Комментарии в коде показывают, как структурирован вывод. Замена всех экземпляров vbCr, кроме последнего, на vbTab - это все, что требуется для преобразования вывода для каждой записи в одну строку для Excel.

Чтобы код, управляемый Excel, обрабатывал целую папку документов Word, см. Пример: https://www.excelguru.ca/forums/showthread.php?8900-Help-with-VBA-to-extract-data-from-Word-to-Excel&p=36586&viewfull=1#post36586. Как вы увидите, нет необходимости использовать Selection - что приводит к значительному снижению производительности.

Sub Demo()
Application.ScreenUpdating = False
Dim p As Long, StrOut As String
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "^13[!^13]@^13 <[A-Z]{3}[0-9]{8}"
    .Replacement.Text = "^p^&"
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM * CTRY OF ORIGIN^13"
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13[ ]@TOTAL FOR DUTY * TOTAL VALUES[!^13]@^13*^13"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
    .Text = "(^t[A-Z]{2}) ([A-Z0-9]{5,})[ ^t]"
    .Replacement.Text = "\1^t\2^t"
    .Execute Replace:=wdReplaceAll
    .Text = "([0-9]{1,}.[0-9]{2}) "
    .Replacement.Text = "\1^t"
    .Execute Replace:=wdReplaceAll
    .Text = "^13 (<[A-Z]{3}[0-9]{8}) "
    .Replacement.Text = "^p\1^t"
    .Execute Replace:=wdReplaceAll
    .Text = "^13"
    .Replacement.Text = "^l"
    .Execute Replace:=wdReplaceAll
    .Text = "[^l]{2,}"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "^l"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
  End With
  For p = 2 To .Paragraphs.Count - 1
    With .Paragraphs(p).Range
      'StrOut =
      'Shipment ID, Description, Quantity, Weight, Value, Broker, Country of Origin
      'Consignee Name, Consignee Street, Consignee City, Consignee State, Consignee Zip,
      'Importer Name, Importer Street, Importer City, Importer State, Importer Zip,
      'Shipper Name, Shipper Street, Shipper City, Shipper State, Shipper Zip,
      StrOut = StrOut & Split(.Text, vbTab)(8) & vbTab & Split(.Text, vbTab)(12) & vbTab & Split(.Text, vbTab)(4) & vbTab & Split(.Text, vbTab)(5) & vbTab & Split(.Text, vbTab)(6) & vbTab & Split(.Text, vbTab)(7) & vbTab & Split(.Text, vbTab)(24) & vbCr & _
        Split(.Text, vbTab)(1) & vbTab & Split(.Text, vbTab)(9) & vbTab & Split(.Text, vbTab)(15) & vbTab & Split(.Text, vbTab)(16) & vbTab & Split(.Text, vbTab)(17) & vbCr & _
        Split(.Text, vbTab)(2) & vbTab & Split(.Text, vbTab)(10) & vbTab & Split(.Text, vbTab)(18) & vbTab & Split(.Text, vbTab)(19) & vbTab & Split(.Text, vbTab)(20) & vbCr & _
        Split(.Text, vbTab)(3) & vbTab & Split(.Text, vbTab)(11) & vbTab & Split(.Text, vbTab)(21) & vbTab & Split(.Text, vbTab)(22) & vbTab & Split(.Text, vbTab)(23) & vbCr
    End With
  Next
  'Instead of .InsertAfter, write StrOut to Excel
  .InsertAfter vbCr & StrOut
End With
Application.ScreenUpdating = True
End Sub

Для заполнения таблицы вы можете использовать что-то вроде:

Dim StrRow As String, lRow As Long, r As Long, c As Long
With ActiveSheet
  lRow = .UsedRange.Cells.SpecialCells(xlCellTypeLastCell).Row + 1
  For r = 0 To UBound(Split(StrOut, vbCr))
    StrRow = Split(StrOut, vbCr)(r)
    For c = 0 To UBound(Split(StrRow, vbTab))
      .Cells(r + lRow, c + 1).Value = Split(StrRow, vbTab)(c)
    Next
  Next
End With
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...