Вот что я вижу на вкладке предпросмотра (Preview):
{C: "d-64E78C04-B,0|Wm,1|Wn,2",…}
C: "d-64E78C04-B,0|Wm,1|Wn,2"
M: [{H: "InsuranceOfferRequestHub", M: "ReceiveCaptcha", A: [{id: 6057043, valid: true}]}]
А вот что я вижу, когда перехожу по ссылке, которая выглядит примерно так: https://forsikringsguiden.dk/signalr/poll?transport=longPolling&clientProtocol=2.1&connectionToken=%2BfuC59NdzGOF9AM7vTrn%2FGdFVp1ichfhw3mXduUlCtyTWG1ZaAUfvp5H2KM8JC9Uns8MhINiAfIO0NLxsVpwLjH8Pz3uWcNN9TZMuwBr0mzNqG9JuRmris4TabhPOUHp&connectionData=%5B%7B%22name%22%3A%22insuranceofferrequesthub%22%7D%5D Срок действия ссылки, вероятно, истекает, так как она постоянно меняется.
Открыв ссылку выше, которую я нахожу в Chrome (Проверка элемента -> Предварительный просмотр -> XHR), я получаю следующий вывод:
{'C': 'd-64E78C04-B,0|Wk,11|Wl,2', 'M': []}
Вот полный код ниже.
Я использую комбинацию Selenium и Requests. Используя Selenium, я заполняю несколько форм и заканчиваю на последней странице https://forsikringsguiden.dk/resultat. Это обзор цен для различных фирм и страховых компаний.
from seleniumwire import webdriver
from openpyxl import Workbook
from openpyxl import load_workbook
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import requests
class Main:
    """Fill in the forsikringsguiden.dk forms and log the SignalR poll URLs.

    A Selenium Wire Chrome session is opened on ``BASE_URL``; after the form
    flow in :meth:`fill_fields` has run, every captured request whose path
    contains ``poll?transport`` is written to column 1 of an Excel workbook
    (one URL per row, below an ``indbo`` header).
    """

    BASE_URL = 'https://forsikringsguiden.dk'
    # Class-level defaults; real values are assigned per instance in __init__.
    browser = None
    fname = ''
    workbook = None
    worksheet = None
    row = 1  # next worksheet row to write (1-based, as openpyxl expects)

    def __init__(self):
        """Start Chrome, open ``BASE_URL`` and create the output workbook."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument(
            'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"')
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        self.browser.get(self.BASE_URL)
        # Raw string: the backslashes are Windows path separators, not escape
        # sequences (the non-raw form relies on invalid escapes such as "\D").
        self.fname = r"F:\*\*\Data_Mining\Forsikringsguiden\XHR_URLS.xlsx"
        self.workbook = Workbook()
        self.worksheet = self.workbook.worksheets[0]
        self.write_headers()

    def write_headers(self):
        """Write the ``indbo`` column header and advance to the next row."""
        self.worksheet.cell(row=self.row, column=1).value = 'indbo'
        self.row = self.row + 1

    def write(self, col, val):
        """Store *val* in column *col* of the current row, then save the file.

        The workbook is saved after every row, so rows already written are on
        disk even if a later step of the scrape fails.
        """
        self.worksheet.cell(row=self.row, column=col).value = val
        self.row = self.row + 1
        self.workbook.save(self.fname)

    def fill_fields(self):
        """Click through the form pages, then record the poll request URLs.

        The flow depends on fixed ``time.sleep`` pauses and absolute XPaths, so
        it is tied to the page layout at the time of writing.
        """
        pause = 6  # seconds to wait after most steps
        browser = self.browser

        browser.maximize_window()
        time.sleep(pause)
        # Button inside the <app-cookie> footer component.
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[3]/app-footer/app-cookie/section/div/div/div/div[2]/div/button').click()
        time.sleep(3)
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[2]/app-start/section[1]/div/div/div/div/div/div[3]').click()
        time.sleep(6)
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[2]/app-start/section[1]/div/div/div/div/div/div[7]/div/div[2]/div/a[2]').click()
        time.sleep(5)
        browser.execute_script("window.scrollBy(0,900)", "")

        # Text input under the element with id "vogntype"; it receives an address.
        address_input = browser.find_element_by_xpath('//*[@id="vogntype"]/div/div/input')
        address_input.click()
        address_input.send_keys("Kongshaven 86, 2500 Valby")
        time.sleep(3)

        age_input = browser.find_element_by_xpath('//*[@id="rankingAge"]')
        age_input.click()
        age_input.send_keys("40")
        time.sleep(pause)

        # Two <select> elements; typing "2" followed by TAB picks an option
        # and moves focus on.
        first_select_xpath = '//*[@id="wrapper"]/div[2]/app-common/section/div/div/fieldset/div[1]/div/div[2]/div[2]/div/select'
        browser.find_element_by_xpath(first_select_xpath).click()
        time.sleep(pause)
        # Looked up again on purpose: the page had a full pause to re-render.
        browser.find_element_by_xpath(first_select_xpath).send_keys("2", Keys.TAB)
        time.sleep(pause)
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[2]/app-common/section/div/div/fieldset/div[1]/div/div[3]/div[1]/div/select').send_keys("2", Keys.TAB)
        time.sleep(pause)

        browser.find_element_by_xpath('//*[@id="wrapper"]/div[2]/app-common/section/div/div/fieldset/div[2]/div/div[2]/div/button').click()
        browser.execute_script("window.scrollBy(0,900)", "")
        time.sleep(pause)
        browser.find_element_by_xpath('/html/body/app-root/app-product-layout/div/div[2]/app-furnishing/section/div/div/app-product-navigate/div/div/div/div/div[2]/div/div/button').click()
        time.sleep(pause)

        # Longer pause before the captured traffic is read.
        time.sleep(15)
        self._record_poll_requests()

    def _record_poll_requests(self):
        """Print and save the path of every answered ``poll?transport`` request."""
        answered_polls = (
            request for request in self.browser.requests
            if request.response and 'poll?transport' in request.path
        )
        for count, request in enumerate(answered_polls, start=1):
            print(count, request.path)
            self.write(1, request.path)
# Launch the browser (Main.__init__ opens BASE_URL and creates the workbook),
# then run the form flow, which also writes the captured poll URLs to the file.
bj = Main()
bj.fill_fields()
#%%
import pandas as pd

# Replay the first captured poll URL with plain `requests`.
# Raw string: the backslashes are Windows path separators, not escape sequences.
site = pd.read_excel(r"F:\*\*\Data_Mining\Forsikringsguiden\XHR_URLS.xlsx", header=0)
# First data row of the "indbo" column, i.e. the first URL that was written.
xhr = site.iloc[0]["indbo"]
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Referer': 'https://forsikringsguiden.dk/resultat',
}
# The URL is a long-polling endpoint (transport=longPolling), so the server may
# hold the request open for a while: use a short connect timeout and a generous
# read timeout instead of waiting forever.
# The `with` block releases the connection even when decoding the body fails.
with requests.get(url=xhr, headers=header, timeout=(10, 130)) as request_data:
    # Fail with the HTTP status (e.g. an expired connectionToken) rather than
    # with an opaque JSON decoding error.
    request_data.raise_for_status()
    req_json = request_data.json()
print(req_json)
Может быть, я упускаю какие-то другие заголовки?