Виджет даты VBA Scrape из результатов поиска - PullRequest
0 голосов
/ 25 октября 2019

при поиске определенного события. например, «дата оскар 2018», Google показывает виджет с датой события, прежде чем любые результаты поиска. Мне нужно получить эту дату в Excel, но она кажется сложной с точки зрения фактического кодирования. Я возился с этими функциями, но не получил никаких результатов. div Меня интересует:

<div class="Z0LcW">5 March 2018, 1:00 am GMT</div>

Вот полный код, который я пытаюсь использовать:

Option Explicit

Public Sub Example()
    Call GoogleSearchDescription("oscars 2018 date")
End Sub

Public Function GoogleSearchDescription(ByVal SearchTerm As String) As String
    Dim Query As String: Query = "https://www.google.com/search?q=" & URLEncode(SearchTerm)
    Dim HTML As String: HTML = GetHTML(Query)
    Dim Description() As String: Description = RegExer(HTML, "(<div class=""Z0LcW"">[\w\s.<>/]+<\/div>)")
    Description(0) = FilterHTML(Description(0))
    Debug.Print Description(0)
    Debug.Print "ok"
End Function

Public Function GetHTML(ByVal URL As String) As String
    On Error Resume Next
    Dim HTML As Object
    With CreateObject("InternetExplorer.Application")
        .navigate URL
        Do Until .ReadyState = 4: DoEvents: Loop
        Do While .Busy: DoEvents: Loop
        Set HTML = .Document.Body
        GetHTML = HTML.innerHTML
        .Quit
    End With
    Set HTML = Nothing
End Function

Private Function URLEncode(ByVal UnformattedString As String) As String
    'CAUTION: This function URLEncodes strings to match Google Maps API URL specifications, see note below for details
    'Note: We convert spaces to + signs, and skip converting plus signs to anything because they replace spaces
    'We also skip ampersands [&] as they should not be parsed out of a valid query
    Dim Index As Long, ReservedChars As String: ReservedChars = "!#$'()*/:;=?@[]""-.<>\^_`{|}~"
    'Convert all % symbols to encoding, as the unformatted string should not already contain URL Encoded characters
    UnformattedString = Replace(UnformattedString, "%", "%" & Asc("%"))
    'Convert spaces to plus signs to match Google URI query specifications
    UnformattedString = Replace(UnformattedString, " ", "+")
    'Iterate through the reserved characters for encoding
    For Index = 1 To (Len(ReservedChars) - 1)
        UnformattedString = Replace(UnformattedString, Mid(ReservedChars, Index, 1), "%" & Asc(Mid(ReservedChars, Index, 1)))
    Next Index
    'Return URL encoded string
    URLEncode = UnformattedString
End Function

Private Function FilterHTML(ByVal RawHTML As String) As String
    If Len(RawHTML) = 0 Then Exit Function
    Dim HTMLEntities As Variant, HTMLReplacements As Variant, Counter As Long
    Const REG_HTMLTAGS = "(<[\w\s""':.=-]*>|<\/[\w\s""':.=-]*>)" 'Used to remove HTML formating from each step in the queried directions
    HTMLEntities = Array("&nbsp;", "&lt;", "&gt;", "&amp;", "&quot;", "&apos;")
    HTMLReplacements = Array(" ", "<", ">", "&", """", "'")
    'Parse HTML Entities into plaintext
    For Counter = 0 To UBound(HTMLEntities)
        RawHTML = Replace(RawHTML, HTMLEntities(Counter), HTMLReplacements(Counter))
    Next Counter
    'Remove any stray HTML tags
    Dim TargetTags() As String: TargetTags = RegExer(RawHTML, REG_HTMLTAGS)
    'Preemptively remove new line characters with actual new lines to separate any conjoined lines.
    RawHTML = Replace(RawHTML, "<b>", " ")
    For Counter = 0 To UBound(TargetTags)
        RawHTML = Replace(RawHTML, TargetTags(Counter), "")
    Next Counter

    FilterHTML = RawHTML
End Function

Public Function RegExer(ByVal RawData As String, ByVal RegExPattern As String) As String()
    'Outputs an array of strings for each matching expression
    Dim RegEx As Object: Set RegEx = CreateObject("VBScript.RegExp")
    Dim Matches As Object
    Dim Match As Variant
    Dim Output() As String
    Dim OutputUBound As Integer
    Dim Counter As Long

    With RegEx
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = RegExPattern
    End With

    If RegEx.test(RawData) Then
        Set Matches = RegEx.Execute(RawData)
        For Each Match In Matches
            OutputUBound = OutputUBound + 1
        Next Match
        ReDim Output(OutputUBound - 1) As String
        For Each Match In Matches
            Output(Counter) = Matches(Counter)
            Counter = Counter + 1
        Next Match
        RegExer = Output
    Else
        ReDim Output(0) As String
        RegExer = Output
    End If
End Function

1 Ответ

0 голосов
/ 25 октября 2019

Вы можете использовать данные из Интернета, с этим запросом

https://www.google.com/search?q=oscars+2018+date&oq=oscars+2018

, затем проверить всю страницу и импортировать. это для меня это было в строке 27.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...