Пожалуйста, прочитайте комментарии в макросе. Не стесняйтесь упорядочить текст другим способом или получить дату сообщения из строки или чего-либо еще:
Редактировать 2: Я удалил первое редактирование, потому что указал на ошибки в макрос. Но я исправил их сейчас и заменил код макроса с помощью этого редактирования
Редактировать 3: Я заменил второй макрос на тот, который работает сейчас
Sub ExtractCoronaVirusCountryInfos()
'To get the clear text for each country we must restruct the html code of parts of the page
'It's necessary to delete some tags (p and span) and place some new tags (div and p)
'To manipulate the html code like we need it we use tools of the dom (document object model)
'and tools to make string operations on the html code
Dim url As String
Dim ie As Object
Dim nodeTextContainer As Object
Dim nodeAllP As Object
Dim nodeOneP As Object
Dim nodeNewBody As Object
Dim nodeAllDiv As Object
Dim nodeOneDiv As Object
Dim htmlString As String
Dim tableRow As Long
Dim tableColumn As Long
Dim countryName As String
Dim infoDate As String
Dim infoText As String
Dim p As Long
Dim openingArrowBracketIndex As Long
Dim closingArrowBracketIndex As Long
Dim openingRealBrTagComment As Long
Dim closingRealBrTagComment As Long
Dim openingRealBrTagStyle As Long
Dim closingRealBrTagStyle As Long
tableRow = 2
tableColumn = 1
url = "https://www.iatatravelcentre.com/international-travel-document-news/1580226297.htm"
'Initialize Internet Explorer, set visibility,
'call URL and wait until page is fully loaded
Set ie = CreateObject("internetexplorer.application")
ie.Visible = True
ie.navigate url
Do Until ie.readyState = 4: DoEvents: Loop
'Application.Wait Now + TimeSerial(0, 0, 2)
'Get the text container
Set nodeTextContainer = ie.document.getElementsByClassName("middle")(0)
'
'Get all p-tags
'They contain the text we want
Set nodeAllP = nodeTextContainer.getElementsByTagName("p")
'
'Kick the p tags (only the opening and closing strings)
'and concatinate the results of this operation
'We can do this very easy by getting the innerhtml
For Each nodeOneP In nodeAllP
htmlString = htmlString & nodeOneP.innerhtml
Next nodeOneP
'Now we want to kick the span tags. But we can't do that in the same way
'like with the p tags because there are nested span tags in the document
'Let's see what's the problem with nested tags
'
'HTML code example with two nested span tags:
'<span>
' <span>
' Data to show
' </span>
'</span
'
'VBA code to build a node collection:
'Set nodeAllSpan = ie.document.getElementsByTagName("span")
'
'Now there are two elements in the node collection:
'nodeAllSpan(0) = <span><span>Data to show</span></span>
'nodeAllSpan(1) = <span>Data to show</span>
'
'The Text we want is doubled!
'If we take the innertext of the whole collection we get this:
'Data to showData to show
'
'That is realy not our goal. Thats the reason we use string operations to delete
'all span tags. For the closing parts </span> it's easy with replace. The opening
'parts are unknown because they can have style information, attributes and even
'more. So we must search first for '<span'. Than for '>' after the before found
'position in string. Then we can delete the tag and go on for the next one
'
'First we replace the closing parts of all span tags with an empty string
htmlString = Replace(htmlString, "</span>", "")
'
'With the following part of the macro we delete the opening parts of all span tags
'We must search the whole string after each manipulation again so we need a loop
'until there is no more span tag
Do
openingArrowBracketIndex = InStr(1, htmlString, "<span")
closingArrowBracketIndex = InStr(openingArrowBracketIndex + 1, htmlString, ">")
If openingArrowBracketIndex > 1 Then
openingArrowBracketIndex = openingArrowBracketIndex - 1
End If
htmlString = Left(htmlString, openingArrowBracketIndex) & Mid(htmlString, closingArrowBracketIndex + 1)
Loop Until openingArrowBracketIndex = 0
'Now we have a string that starts with some text we don't need and some text at the end we don't need
'But we also have a string with a pattern we can use to place new html tags which can be used to get
'the text in that way we want
'
'The start text will lost automatically. The end text too with a little manipulation before placing all other new tags
htmlString = Replace(htmlString, "<br><br><br>", "</div>")
'Now we place the new structure
htmlString = Replace(htmlString, "<br><strong><br>", "<strong>")
htmlString = Replace(htmlString, "<strong><br><br>", "<strong>")
htmlString = Replace(htmlString, "<br><br><strong>", "<strong>")
htmlString = Replace(htmlString, "<br><br><a name=" & Chr(34) & "_GoBack" & Chr(34) & "></a><strong>", "<strong>")
htmlString = Replace(htmlString, "<br><strong>", "<strong>")
htmlString = Replace(htmlString, "<strong><br>", "<strong>")
htmlString = Replace(htmlString, "<strong>", "</p></div><div><strong>")
htmlString = Replace(htmlString, "</strong>", "</strong><p>")
htmlString = Replace(htmlString, "<br>", "</p><p>")
'Our htmlString contains all info we want. So we can
'use the ie to generate a new dom object
ie.Quit
Set ie = CreateObject("internetexplorer.application")
ie.Visible = True
ie.navigate "about:blank"
Do Until ie.readyState = 4: DoEvents: Loop
'First we encapsulate our htmlString in a body tag to be able to query it afterwards
htmlString = "<body>" & htmlString & "</body>"
'
'Than we use a little trick to get the htmlString as dom object
ie.document.Write (htmlString)
Set nodeNewBody = ie.document.getElementsByTagName("body")(0)
'Now we can get the text like we want it
'
'The information for every single country is placed now in a div tag
'By creating a node collection of all div tags we lost automatically
'the not needed text at the start and at the end
Set nodeAllDiv = ie.document.getElementsByTagName("div")
'
'Place data for each country in the excel table
For Each nodeOneDiv In nodeAllDiv
'Get country name
countryName = Trim(nodeOneDiv.getElementsByTagName("strong")(0).innertext)
ActiveSheet.Cells(tableRow, tableColumn).Value = countryName
tableColumn = tableColumn + 1
'Get date of message
'The date string is placed allways in the first p tag
infoDate = Trim(nodeOneDiv.getElementsByTagName("p")(0).innertext)
ActiveSheet.Cells(tableRow, tableColumn).Value = infoDate
tableColumn = tableColumn + 1
'Get the message itself
'The text of the message is placed from p tag 2 till the last p tag
Set nodeAllP = nodeOneDiv.getElementsByTagName("p")
'
For p = 1 To nodeAllP.Length - 1
infoText = infoText & Trim(nodeAllP(p).innertext) & Chr(10)
Next p
'
'Write Infotext to table without the last new line
ActiveSheet.Cells(tableRow, tableColumn).Value = Left(infoText, Len(infoText) - 1)
infoText = ""
tableColumn = 1
tableRow = tableRow + 1
Next nodeOneDiv
ie.Quit
End Sub