Я понимаю, что вы спрашивали эти две недели go, но, возможно, вы все еще ищете ответ.
На момент написания, я думаю, что код ниже работает для меня. Я говорю во время написания, потому что у меня создается впечатление, что некоторые из id
(в HTML, полученных от сервера) периодически меняются - что нарушает код.
Вот что В настоящее время у меня есть:
Код немного беспорядок, не стесняйтесь рефакторинг. Точка входа: ScrapeAllItemsFromEbayShop
.
Option Explicit
Private Function GetUrlForShopPageN(ByVal N As Long) As String
' Should return the store URL for page N,
' where N is some 1-based page index present in the query string.
GetUrlForShopPageN = "https://www.ebay.com/str/customwheelandperformancedepot?_pgn=" & N
End Function
Private Function GetHtmlForShopPageN(ByVal webClient As WinHttp.WinHttpRequest, ByVal N As Long) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for page N,
' where N is some 1-based page index present in the query string.
Dim targetUrl As String
targetUrl = GetUrlForShopPageN(N)
With webClient
.Open "GET", targetUrl, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForShopPageN = New MSHTML.HTMLDocument
GetHtmlForShopPageN.body.innerHTML = .responseText
End With
End Function
Private Function DoesShopPageNotContainResults(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return a boolean representing whether the htmlResponse contains zero results.
DoesShopPageNotContainResults = (htmlResponse.getElementsByClassName("srp-controls").Length = 0)
End Function
Private Function GetUrlsOfItemsToScrape() As Collection
' Should return a collection of strings, representings the URLs of items.
Set GetUrlsOfItemsToScrape = New Collection
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Do While True
Dim pageIndex As Long
pageIndex = pageIndex + 1
Dim htmlResponse As MSHTML.HTMLDocument
Set htmlResponse = GetHtmlForShopPageN(webClient, pageIndex)
If DoesShopPageNotContainResults(htmlResponse) Then Exit Do
Dim anchor As MSHTML.IHTMLElement
For Each anchor In htmlResponse.getElementsByClassName("s-item__link")
Debug.Assert StrComp(LCase$(Left$(anchor.getAttribute("href"), 25)), "https://www.ebay.com/itm/", vbBinaryCompare) = 0
GetUrlsOfItemsToScrape.Add anchor.getAttribute("href")
If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do ' Delete this line completely once you think everything is working.
Next anchor
If (0 = (pageIndex Mod 10)) Then DoEvents
Loop
End Function
Private Function DoesTextContainAnyOf(ByVal textToCheck As String, stringsToCheck As Variant) As Boolean
' Should return a boolean representing whether any of "stringsToCheck"
' can be found within "textToCheck". Performs a case-sensitive search.
Dim i As Long
For i = LBound(stringsToCheck) To UBound(stringsToCheck)
If InStr(1, textToCheck, stringsToCheck(i), vbBinaryCompare) Then
DoesTextContainAnyOf = True
Exit For
End If
Next i
End Function
Private Function IsItemAWheelOnly(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return True if, based on the HTML, the item is inferred to be a "wheel".
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlResponse.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tireAndPackageIdentifiers As Variant
tireAndPackageIdentifiers = Array("tire", "section width", "aspect ratio")
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
If DoesTextContainAnyOf(LCase$(tableRow.Cells(columnIndex).innerText), tireAndPackageIdentifiers) Then Exit Function
Next columnIndex
Next tableRow
IsItemAWheelOnly = True
End Function
Private Function GetHtmlForItem(ByVal webClient As WinHttp.WinHttpRequest, ByVal urlForItem As String) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for a given item.
With webClient
.Open "GET", urlForItem, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForItem = New MSHTML.HTMLDocument
GetHtmlForItem.body.innerHTML = .responseText
End With
End Function
Private Sub ScrapeAllItemsFromEbayShop()
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Dim urlsOfItemsToScrape As Collection
Set urlsOfItemsToScrape = GetUrlsOfItemsToScrape()
Dim rowWriteIndex As Long
rowWriteIndex = 1 ' Skip row 1/headers
Dim destinationSheet As Worksheet
Set destinationSheet = ThisWorkbook.Worksheets("Sheet1")
destinationSheet.Cells.ClearContents
Dim columnIndexes As Collection
Set columnIndexes = New Collection
Dim urlOfItem As Variant
For Each urlOfItem In urlsOfItemsToScrape
Debug.Print urlOfItem
Dim htmlOfItemPage As MSHTML.HTMLDocument
Set htmlOfItemPage = GetHtmlForItem(webClient, urlOfItem)
Dim nameValuePairs As Collection
If IsItemAWheelOnly(htmlOfItemPage) Then
Set nameValuePairs = CreateNameValuePairsForWheelOnly(htmlOfItemPage)
Else
Set nameValuePairs = CreateNameValuePairsForWheelAndTirePackage(htmlOfItemPage)
End If
rowWriteIndex = rowWriteIndex + 1
Dim nameValuePair As Variant
For Each nameValuePair In nameValuePairs
Dim columnWriteIndex As Long
columnWriteIndex = GetColumnIndexOfHeader(columnIndexes, nameValuePair(0))
If columnWriteIndex = 0 Then
columnWriteIndex = columnIndexes.Count + 1
columnIndexes.Add columnWriteIndex, Key:=nameValuePair(0)
destinationSheet.Cells(1, columnWriteIndex).Value = nameValuePair(0)
End If
destinationSheet.Cells(rowWriteIndex, columnWriteIndex).Value = nameValuePair(1)
Next nameValuePair
DoEvents
Next urlOfItem
End Sub
Private Function CreateNameValuePairsForWheelAndTirePackage(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_div")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Set CreateNameValuePairsForWheelAndTirePackage = outputCollection
End Function
Private Function CreateNameValuePairsForWheelOnly(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_wrapper_ctr")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlOfItemPage.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
outputCollection.Add CreateNameValuePair(tableRow.Cells(columnIndex).innerText, tableRow.Cells(columnIndex + 1).innerText)
Next columnIndex
Next tableRow
Set CreateNameValuePairsForWheelOnly = outputCollection
End Function
Private Function CreateNameValuePair(ByVal someName As String, ByVal someValue As String) As String()
Dim outputArray(0 To 1) As String
outputArray(0) = someName
outputArray(1) = someValue
CreateNameValuePair = outputArray
End Function
Private Function GetColumnIndexOfHeader(ByVal columnIndexes As Collection, ByVal header As String) As Long
' Should return a 1-based column index associated with "header".
' If "header" does not exist within collection, 0 is returned.
On Error Resume Next
GetColumnIndexOfHeader = columnIndexes(header)
On Error GoTo 0
End Function
Этот код работает медленно по ряду причин:
- Много времени тратится на ожидание ответа от сервера.
- Каждый элемент очищается последовательно и синхронно.
- Вывод записывается на лист по одной ячейке за раз (вместо использования массивов и сокращения числа операций чтения / записи, связанных с листом). ).
- Нет переключения
Application.Calculation
или Application.ScreenUpdating
.
Как только вы думаете, что код работает, вы захотите избавиться от этой строки If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do
в функция GetUrlsOfItemsToScrape
. В противном случае вы не очистите все элементы.
Я оставил DoEvents
внутри циклов Do
, чтобы обеспечить адаптивность (возможно, за счет некоторой производительности)