I want to scrape multi-page data through the hrefs using Python and Selenium WebDriver, but the problem I am running into is concatenating the data from the first page through the last. I am using:
url = http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana
I get the error attached below when combining the data from multiple pages and exporting it to a single file.
Error screenshot:
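For context: the grid's row and pager links are not normal URLs; each href contains a javascript:__doPostBack(...) call, which I fire through execute_script. A minimal standalone sketch of that mechanism (grid id and selector taken from my code below):

from selenium import webdriver

d = webdriver.Chrome()  # assumes chromedriver is on PATH
d.get('http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana')

# a pager href looks like:
# javascript:__doPostBack('ctl00$ContentPlaceHolder5$grdUrbanSubZoneWiseRate','Page$2')
# executing that string runs the ASP.NET postback and reloads the grid
pager_hrefs = [a.get_attribute('href') for a in d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]
if pager_hrefs:
    d.execute_script(pager_hrefs[0])  # go to the first listed page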
My code:
import csv
import os
import time
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
# output is written at the end via DataFrame.to_csv, so no csv.writer is needed here
# choose Taluka and Village so the rates grid loads
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('1')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1458')

# read the first page of the grid into a DataFrame
tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table.columns = table.iloc[0]  # first row holds the header
table = table.iloc[1:]
table = table[table.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
# each 'Select$' href is a javascript:__doPostBack(...) call; executing it
# fills a textarea with the survey numbers for that row
surveyNo_scripts = [item.get_attribute('href') for item in
                    d.find_elements_by_css_selector(
                        "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
for i, script in enumerate(surveyNo_scripts):
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
print(table)
# rename and re-order columns as required
# loop while the grid still shows pager ('Page$') links
while [page.get_attribute('href') for page in
       d.find_elements_by_css_selector(
           "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]:
    try:
        page_no = [page.get_attribute('href') for page in
                   d.find_elements_by_css_selector(
                       "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]
        print(page_no)
        for script_page in page_no:
            # fire the pager postback, then re-read the grid for that page
            d.execute_script(script_page)
            tableElement = d.find_element_by_css_selector(
                "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
            table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
            table1.columns = table1.iloc[0]
            table1 = table1.iloc[1:]
            table1 = table1[table1.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
            surveyNo_scripts = [item.get_attribute('href') for item in
                                d.find_elements_by_css_selector(
                                    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
            for i, script in enumerate(surveyNo_scripts):
                d.execute_script(script)
                surveys = d.find_element_by_css_selector('textarea').text
                table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            # append this page's rows to the accumulated table; pd.concat
            # aligns the columns of both frames by default
            #table = table.append(table1.reindex(columns=table.columns))
            table = pd.concat([table, table1], axis=0, ignore_index=False)
            print(table)
    except Exception:
        # the pager links go stale after the last page, which ends the loop
        break
table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',',
             encoding='utf-8-sig', index=False)
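For reference, this is the accumulation pattern I am trying to get right, reduced to a minimal sketch: scrape_current_page is a hypothetical helper standing in for the read_html / header-fix / SurveyNo steps above, and d and page_no come from my code:

import pandas as pd

def scrape_current_page(driver):
    # hypothetical helper: same read_html + header-fix steps as above
    el = driver.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
    df = pd.read_html(el.get_attribute('outerHTML'))[0]
    df.columns = df.iloc[0]
    return df.iloc[1:]

frames = [scrape_current_page(d)]    # page 1
for script_page in page_no:          # pager postbacks collected above
    d.execute_script(script_page)
    frames.append(scrape_current_page(d))

# a single concat at the end keeps the columns aligned across all pages
result = pd.concat(frames, ignore_index=True)
result.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv",
              encoding='utf-8-sig', index=False)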