Я работаю над парсером и ищу помощь на моем последнем шаге, который экспортирует в csv с использованием pandas. Я хочу иметь возможность взять данные, которые я собрал в списке «Данные», и поместить их в строки под заголовками «Имя», «IssuedBy», «CertificationorDesignaationNumber», «CertfiedorDesignatedSince», «AccreditedBy» и «Expires». ". В настоящее время я получаю сообщение об ошибке:
"/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/bin/python" "/Users/nzalle/PycharmProjects/BCSP SCRAPING/Test.py"
Number of Pages to Scrape: 1
Traceback (most recent call last):
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 496, in _list_to_arrays
result = _convert_object_array(
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 580, in _convert_object_array
raise AssertionError(
AssertionError: 7 columns passed, passed data had 120 columns
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/Test.py", line 51, in <module>
df = pd.DataFrame([Data], columns=["Name", "IssuedBy", "CertificationorDesignaationNumber", "CertfiedorDesignatedSince", "AccreditedBy", "Expires", "temp"])
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 474, in __init__
arrays, columns = to_arrays(data, columns, dtype=dtype)
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 461, in to_arrays
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
File "/Users/nzalle/PycharmProjects/BCSP SCRAPING/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 500, in _list_to_arrays
raise ValueError(e) from e
ValueError: 7 columns passed, passed data had 120 columns
Вот мой текущий код:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
Data = []
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count: # Get links up to "count"
body.send_keys(Keys.END)
sleep(1)
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
for link in profile_count: # Calling up links
temp = link.get_attribute('href') # temp for
driver.execute_script("window.open('');") # open new tab
driver.switch_to.window(driver.window_handles[1]) # focus new tab
driver.get(temp)
# scrape code
Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
IssuedBy = "Board of Certified Safety Professionals"
CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
try:
AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
except NoSuchElementException:
AccreditedBy = "N/A"
try:
Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
except NoSuchElementException:
Expires = "N/A"
info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires
Data.extend(info)
driver.close()
driver.switch_to.window(driver.window_handles[0])
df = pd.DataFrame([Data], columns=["Name", "IssuedBy", "CertificationorDesignaationNumber", "CertfiedorDesignatedSince", "AccreditedBy", "Expires"])
df.to_csv('work.csv', index=False)
driver.close()
Пожалуйста, дайте мне знать, что вызывает эту ошибку и как я могу ее исправить. Спасибо!