Я пытаюсь извлечь некоторые данные (используя python) с веб-сайта: https://www.seminovosmovida.com.br/busca?q=
К сожалению, часть значений в столбце 'marca' фрейма данных df_final приходит как "NaN" (я вручную проверил по базе данных веб-сайта: пропущенных значений там нет, данные есть!).
Ребята, не могли бы вы помочь мне с этим, пожалуйста?
Ниже приведен код:
import logging
import math
import os
import smtplib
import ssl
import time
import urllib
from datetime import date, datetime, timedelta
from email.mime.text import MIMEText

import pandas as pd
import requests as rq
import win32com.client as win32
from selenium import webdriver
# Query setup: one HTTP session, the request headers and the query
# parameters used for the first search call.
session = rq.Session()

# Header values are fixed literals embedded in this script. The bearer
# token is written as adjacent string literals (header.payload.signature),
# which Python joins into a single string.
headers = {
    'authority': 'be-seminovos.movidacloud.com.br',
    'accept': 'application/json, text/plain, */*',
    'sec-fetch-dest': 'empty',
    'authorization': (
        'Bearer '
        'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9'
        '.eyJpc3MiOiJodHRwOlwvXC8xNzIuMjAuMC40OjgwODFcL2F1dGhcL2xvZ2luIiwiaWF0IjoxNTU2ODg1MDMxLCJleHAiOjE1NTY4ODg2MzEsIm5iZiI6MTU1Njg4NTAzMSwianRpIjoiaUdSV0JuVWVpcTMyNzNEcyIsInN1YiI6MSwicHJ2IjoiODdlMGFmMWVmOWZkMTU4MTJmZGVjOTcxNTNhMTRlMGIwNDc1NDZhYSJ9'
        '.WwHWiK0qUUGxyEMiI_owE5YoyykmW__fA1RaEAxqO0k'
    ),
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Mobile Safari/537.36',
    'content-type': 'application/json',
    'origin': 'https://www.seminovosmovida.com.br',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-mode': 'cors',
    'referer': 'https://www.seminovosmovida.com.br/busca?q=',
    'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
}

# Query-string pairs for the first request: an empty search term and the
# search path (sorted by ascending price) passed through as 'applyURL'.
params = (
    ('terms', ''),
    ('applyURL', '/searchapi/v3/search?apikey=movidaseminovos&sortBy=ascPrice'),
)
# Number of pages and response.
# This first request is used only to read the total number of matching
# items, from which the number of result pages is derived.
# NOTE(review): 20 is the page size the original pagination math divides
# by - confirm it matches what the API actually returns per page.
PAGE_SIZE = 20

response = session.get(
    'https://be-seminovos.movidacloud.com.br/api/v1/chaordic/search/search',
    headers=headers,
    params=params,
    timeout=30,  # do not hang forever if the server never answers
)
# Surface HTTP errors here instead of failing later while parsing an
# error body as if it were a search result.
response.raise_for_status()

# Decode the JSON body directly. The previous pd.read_json(..., encoding='ansi')
# call relied on a codec alias that only exists on Windows and on passing
# a literal JSON string, which newer pandas versions deprecate.
n_items = int(response.json()['size'])
n_pages = math.ceil(n_items / PAGE_SIZE)
# Build one query-parameter tuple per result page (pages are 1-based).
#
# Every page uses the same 'sortBy=ascPrice' ordering. Previously only
# page 1 was sorted while pages 2..N used the default order, so the pages
# came from two differently ordered result sets and items could be
# duplicated or skipped across page boundaries.
base_apply_url = '/searchapi/v3/search?apikey=movidaseminovos&sortBy=ascPrice'
params_list = []
for pages in range(1, n_pages + 1):
    if pages == 1:
        # The first page needs no explicit page number.
        var = base_apply_url
    else:
        var = base_apply_url + '&terms=&page=' + str(pages)
    params = (
        ('terms', ''),
        ('applyURL', var),
    )
    params_list.append(params)
# Fetch every result page and turn its products into a small DataFrame
# with the columns id, price and the selected 'details' fields.
detail_columns = ['ano', 'quilometragem', 'cidade', 'estado_uf', 'marca', 'modelo']
df_list = []
for pl in params_list:
    try:
        # TLS verification is left at its default (enabled), matching the
        # first request to the same host; it was disabled here only.
        response = session.get(
            'https://be-seminovos.movidacloud.com.br/api/v1/chaordic/search/search',
            headers=headers,
            params=pl,
            timeout=30,  # do not hang forever on a stalled page request
        )
        response.raise_for_status()

        # One row per product; decoding with response.json() avoids the
        # Windows-only 'ansi' codec and the literal-string read_json path.
        df = pd.DataFrame(response.json()['products'])

        # Expand each product's 'details' dict into columns. A product
        # whose 'details' is missing or not a dict becomes an empty row
        # instead of making the whole page fail.
        details = pd.DataFrame(
            [item if isinstance(item, dict) else {} for item in df['details']]
        )
        # reindex() fills absent detail keys with NaN; .loc[:, [...]] would
        # raise KeyError and the whole page would be lost.
        details = details.reindex(columns=detail_columns)

        df = pd.concat([df['id'], df['price'], details], axis=1, sort=False)
    except (rq.RequestException, ValueError, KeyError, TypeError) as exc:
        # Still best-effort: skip the page, but say so. Silently dropping
        # pages hides exactly the kind of data loss being investigated.
        logging.warning('Skipping page %s: %r', pl, exc)
        continue
    df_list.append(df)
# Stack the per-page frames into a single table with a fresh 0..n-1 index.
df_final = pd.concat(df_list, axis=0, sort=False, ignore_index=True)

# Put the source columns in the desired order, then give them English names.
column_names = ['id','ano','price','quilometragem','cidade','estado_uf','marca','modelo']
english_names = dict(zip(
    column_names,
    ['id','year','price','km','city','uf','brand','model'],
))
df_final = df_final.reindex(columns=column_names).rename(columns=english_names)

# Stamp every row with the date on which the script was run.
today = date.today()
df_final = df_final.assign(RunDate=today)