Я ошибся в мышлении. Чтобы загрузить содержимое Dynami c, когда сервер сообщает, что страница загружена, необходима еще одна пауза. Поэтому вам придется делать паузу на каждой странице, потому что вы не можете это проверить. Во всяком случае, я не знаю как. Итак, теперь я реализовал это с помощью более быстрого MSXML2.
Я добавил счетчики для всех найденных ссылок. Если найдено более одной ссылки, в ячейку устанавливается комментарий с найденным счетчиком. Если вы go снова перейдете по тем же URL-адресам, комментарии и найденные ссылки будут удалены и, при необходимости, сброшены.
Счетчики должны дать вам обзор, если вам нужен другой макет таблицы для получения всех ссылок, а не только последний. В этом случае вам понадобится ваша собственная таблица для ввода всех ссылок.
Попробуйте следующий макрос. Внимательно прочтите комментарии.
Sub ScrapeSoMeAndMailAddresses()
'******************************************************
'The macro works on the sheet from which it was started
'******************************************************
Dim url As String
Dim http As Object
Dim htmlDoc As Object
Dim nodeAllLinks As Object
Dim nodeOneLink As Object
Dim currentRow As Long
'Counters to check, if there are more than one address per page
Dim addressCounters(5) As Long
Dim checkCounters As Long
'Initialize variables
currentRow = 2
Set htmlDoc = CreateObject("htmlfile")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
'Loop over all URLs in column A in the active table
Do While ActiveSheet.Cells(currentRow, 1).Value <> ""
'Scroll if current row > 14
If currentRow > 14 Then
ActiveWindow.SmallScroll down:=1
End If
'Clear content in current row
ActiveSheet.Range(Cells(currentRow, 2), Cells(currentRow, 9)).ClearContents
'Get next url from table
url = ActiveSheet.Cells(currentRow, 1).Value
'Load page
http.Open "GET", url, False
http.send
'Check if page loading was successful
If http.Status = 200 Then
'Build html document for DOM operations
htmlDoc.body.innerHTML = http.responseText
'Create node list from all links of the page
Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
'Walk through all links of the node list
For Each nodeOneLink In nodeAllLinks
'Check for mail address
If InStr(1, nodeOneLink.href, "mailto:") Then
'Write mail address to table in column B
ActiveSheet.Cells(currentRow, 2).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
'Increment mail address counter
addressCounters(0) = addressCounters(0) + 1
End If
'Check for Facebook address
If InStr(1, UCase(nodeOneLink.href), "FACEBOOK") Then
'Write Facebook address to table in column C
ActiveSheet.Cells(currentRow, 3).Value = nodeOneLink.href
'Increment Facebook counter
addressCounters(1) = addressCounters(1) + 1
End If
'Check for Instagram address
If InStr(1, UCase(nodeOneLink.href), "INSTAGRAM") Then
'Write INSTAGRAM address to table in column D
ActiveSheet.Cells(currentRow, 4).Value = nodeOneLink.href
'Increment Instagram counter
addressCounters(2) = addressCounters(2) + 1
End If
'Check for Twitter address
If InStr(1, UCase(nodeOneLink.href), "TWITTER") Then
'Write Twitter address to table in column E
ActiveSheet.Cells(currentRow, 5).Value = nodeOneLink.href
'Increment Twitter counter
addressCounters(3) = addressCounters(3) + 1
End If
'Check for YouTube address
If InStr(1, UCase(nodeOneLink.href), "YOUTUBE") Then
'Write YouTube address to table in column F
ActiveSheet.Cells(currentRow, 6).Value = nodeOneLink.href
'Increment YouTube counter
addressCounters(4) = addressCounters(4) + 1
End If
'Check for LinkedIn address
If InStr(1, UCase(nodeOneLink.href), "LINKEDIN") Then
'Write LinkedIn address to table in column G
ActiveSheet.Cells(currentRow, 7).Value = nodeOneLink.href
'Increment LinkedIn counter
addressCounters(5) = addressCounters(5) + 1
End If
Next nodeOneLink
'Check counters
For checkCounters = 0 To 5
'Delete comment if there is one
If Not ActiveSheet.Cells(currentRow, checkCounters + 2).Comment Is Nothing Then
ActiveSheet.Cells(currentRow, checkCounters + 2).Comment.Delete
End If
'Set comment if more than 1 link were found
If addressCounters(checkCounters) > 1 Then
ActiveSheet.Cells(currentRow, checkCounters + 2).AddComment Text:=CStr(addressCounters(checkCounters))
ActiveSheet.Cells(currentRow, checkCounters + 2).Comment.Shape.TextFrame.AutoSize = True
End If
Next checkCounters
Else
'Page not loaded
'Write message in column H
ActiveSheet.Cells(currentRow, 9).Value = "Error with website address"
End If
'Prepare for next page
Erase addressCounters
currentRow = currentRow + 1
Loop
'Clean up
Set http = Nothing
Set htmlDoc = Nothing
Set nodeAllLinks = Nothing
Set nodeOneLink = Nothing
'Check if this works now
Complete.Show
End Sub
Изменить: исправленный код
Поскольку макрос выше является более или менее подтверждением концепции, я разработал версию, которая соответствует требованиям первого комментарий под этим сообщением. Больше нечего читать со страниц, зато много работы; -)
Вам нужно создать другую таблицу, которая будет содержать все адреса после запуска макроса. Вы должны один раз создать заголовок вручную. У меня не реализована проверка на дублирование адресов!
Код макроса не идеален (все в одном, без функций), но он намного гибче, чем версия 1. Если вы понимаете код, при необходимости будет довольно легко интегрировать дополнительные платформы.
О том, что с Sheet3
: Вы можете получить доступ к листу из VBA либо по имени, которое он имеет на вкладке в обычном представлении Excel, либо по его индексу. Sheet3
- это имя, насколько я понял. В верхней части части Initialize Variables вы можете легко изменить два имени по умолчанию Sheet3
и Sheet4
, чтобы они соответствовали именам, используемым на вкладках. Больше ничего менять не нужно.
Если вы запустите макрос, сначала будет удалено содержимое обоих листов.
Внимательно прочтите комментарии в коде VBA !
Sub ScrapeSoMeAndMailAddresses()
'Columns for both tables
Const colUrl As Long = 1 'Must always be the first column
Const colMail As Long = 2 'Must always be the first column before SoMe platforms
Const colFacebook As Long = 3
Const colInstagram As Long = 4
Const colTwitter As Long = 5
Const colYouTube As Long = 6
Const colLinkedIn As Long = 7 'Must always be the last column of SoMe platforms
Const colError As Long = 9 'Must always be the last column
Dim url As String
Dim http As Object
Dim htmlDoc As Object
Dim nodeAllLinks As Object
Dim nodeOneLink As Object
Dim pageLoadSuccessful As Boolean
Dim tableUrlsOneAddressLeft As String
Dim tableAllAddresses As String
Dim currentRowTableUrls As Long
Dim lastRowTableUrls As Long
Dim currentRowsTableAll(colUrl To colLinkedIn) As Long
Dim lastRowTableAll As Long
Dim addressCounters(colMail To colLinkedIn) As Long
Dim checkCounters As Long
'Initialize variables
tableUrlsOneAddressLeft = "Sheet3"
currentRowTableUrls = 2 'First row for content
tableAllAddresses = "Sheet4"
For checkCounters = colUrl To colLinkedIn
currentRowsTableAll(checkCounters) = 2 'First rows for content
Next checkCounters
Set htmlDoc = CreateObject("htmlfile")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
'Clear all contents and comments in the URL source sheet from email column to error column
lastRowTableUrls = Sheets(tableUrlsOneAddressLeft).Cells(Rows.count, colUrl).End(xlUp).row
Sheets(tableUrlsOneAddressLeft).Range(Cells(currentRowTableUrls, colMail), Cells(lastRowTableUrls, colError)).ClearContents
Sheets(tableUrlsOneAddressLeft).Range(Cells(currentRowTableUrls, colMail), Cells(lastRowTableUrls, colError)).ClearComments
'Delete all rows except headline in the sheet with all addresses
lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.count, colUrl).End(xlUp).row
Sheets(tableAllAddresses).Rows(currentRowsTableAll(colUrl) & ":" & lastRowTableAll).Delete Shift:=xlUp
'Loop over all URLs in column A in the URL source sheet
Do While Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Value <> ""
'**************************************
'Scroll for visual monitoring
'Comment out the following three lines,
'if Sheet3 is not visible while the
'macro runs
'Otherwise a runtime error will occur
If currentRowTableUrls > 14 Then
ActiveWindow.SmallScroll down:=1
End If
'**************************************
'Get next url from the URL source sheet
url = Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colUrl).Value
'Try to load page
'Temporarily disable error
'handling if there is a timeout
On Error Resume Next
http.Open "GET", url, False
http.send
On Error GoTo 0
'Check if page loading was successful
If http.Status = 200 Then
pageLoadSuccessful = True
End If
If pageLoadSuccessful Then
'Build html document for DOM operations
htmlDoc.body.innerHTML = http.responseText
'Create node list from all links of the page
Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
'Walk through all links of the node list
For Each nodeOneLink In nodeAllLinks
'Check for mail address
If InStr(1, nodeOneLink.href, "mailto:") Then
'Write mail address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colMail), colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colMail) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment mail counters
currentRowsTableAll(colMail) = currentRowsTableAll(colMail) + 1
addressCounters(colMail) = addressCounters(colMail) + 1
End If
'Check for Facebook address
If InStr(1, UCase(nodeOneLink.href), "FACEBOOK") Then
'Write Facebook address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colFacebook).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colFacebook), colFacebook).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colFacebook) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment Facebook counters
currentRowsTableAll(colFacebook) = currentRowsTableAll(colFacebook) + 1
addressCounters(colFacebook) = addressCounters(colFacebook) + 1
End If
'Check for Instagram address
If InStr(1, UCase(nodeOneLink.href), "INSTAGRAM") Then
'Write INSTAGRAM address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colInstagram).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colInstagram), colInstagram).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colInstagram) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment Instagram counters
currentRowsTableAll(colInstagram) = currentRowsTableAll(colInstagram) + 1
addressCounters(colInstagram) = addressCounters(colInstagram) + 1
End If
'Check for Twitter address
If InStr(1, UCase(nodeOneLink.href), "TWITTER") Then
'Write Twitter address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colTwitter).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colTwitter), colTwitter).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colTwitter) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment Twitter counters
currentRowsTableAll(colTwitter) = currentRowsTableAll(colTwitter) + 1
addressCounters(colTwitter) = addressCounters(colTwitter) + 1
End If
'Check for YouTube address
If InStr(1, UCase(nodeOneLink.href), "YOUTUBE") Then
'Write YouTube address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colYouTube).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colYouTube), colYouTube).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colYouTube) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment YouTube counters
currentRowsTableAll(colYouTube) = currentRowsTableAll(colYouTube) + 1
addressCounters(colYouTube) = addressCounters(colYouTube) + 1
End If
'Check for LinkedIn address
If InStr(1, UCase(nodeOneLink.href), "LINKEDIN") Then
'Write LinkedIn address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colLinkedIn).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colLinkedIn), colLinkedIn).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colLinkedIn) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment LinkedIn counters
currentRowsTableAll(colLinkedIn) = currentRowsTableAll(colLinkedIn) + 1
addressCounters(colLinkedIn) = addressCounters(colLinkedIn) + 1
End If
Next nodeOneLink
'Check address counters
For checkCounters = colMail To colLinkedIn
'Set comment if more than 1 link were found
If addressCounters(checkCounters) > 1 Then
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).AddComment Text:=CStr(addressCounters(checkCounters))
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).Comment.Shape.TextFrame.AutoSize = True
End If
Next checkCounters
Else
'Page not loaded
'Write message URL table
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colError).Value = "Error with URL or timeout"
End If
'Prepare for next page
pageLoadSuccessful = False
Erase addressCounters
lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.count, colUrl).End(xlUp).row
For checkCounters = colUrl To colLinkedIn
currentRowsTableAll(checkCounters) = lastRowTableAll + 1 'First rows for next page content
Next checkCounters
currentRowTableUrls = currentRowTableUrls + 1
Loop
'Clean up
Set http = Nothing
Set htmlDoc = Nothing
Set nodeAllLinks = Nothing
Set nodeOneLink = Nothing
'Check if this works now
Complete.Show
End Sub