Вот вариант решения, который использует возможности pandas
(результат потом легко экспортировать в Excel):
import re
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Scrape GSA Advantage offers for one part number into ``global_df``.
#
# Flow: search for ``part`` on the start page, collect every result link that
# carries a ``gsin=`` parameter, open each link, read its "greybox" offers
# table and append the selected columns to ``global_df``.

# Column layout shared by every per-link frame and by the accumulated result.
RESULT_COLUMNS = ['gsin', 'Part #', 'Contractor Part #', 'Contractor Name',
                  'Price', 'Deliv days', 'Min Order', 'FOB/Shipping']
# Positional columns of the parsed offers table that are kept, and the names
# they are given (same order).
OFFER_COLUMN_POSITIONS = [2, 8, 14, 16, 18]
OFFER_COLUMN_NAMES = ['Price', 'Contractor Name', 'Deliv days', 'Min Order',
                      'FOB/Shipping']
# Captures the itemNumber query parameter up to the next '&' or, when it is
# the last parameter, up to the end of the href.
ITEM_NUMBER_RE = re.compile(r'itemNumber=([^&]*)')

driver = webdriver.Chrome()
# A pandas DataFrame where all results will get stored
global_df = pd.DataFrame(columns=RESULT_COLUMNS)
part = "MCMS00001"

try:
    driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
    # find_element(By.<KIND>, value) replaces the find_element_by_* helpers,
    # which were removed in Selenium 4.3.
    element = driver.find_element(By.ID, "twotabsearchtextbox")
    element.send_keys(part)
    driver.find_element(By.CLASS_NAME, 'nav-submit-input').click()

    # Find links
    html = driver.page_source
    soup = BeautifulSoup(html, 'html')
    # De-duplicate the hrefs; sorting makes the visiting order reproducible.
    links = sorted({a.get('href')
                    for a in soup.find_all('a', href=re.compile('gsin='))})

    # Explore links
    for link in links:
        driver.get("https://www.gsaadvantage.gov" + link)

        # Sometimes a zip-code page shows up; fill it in when it does.
        # Only "element not present" is tolerated, other errors propagate.
        try:
            element = driver.find_element(By.ID, "zip")
            element.send_keys("91911")
            driver.find_element(By.NAME, 'submit').click()
        except NoSuchElementException:
            pass

        html = driver.page_source
        soup = BeautifulSoup(html, 'html')

        # Get the table with the expected elements
        table = soup.find('table', {'class': 'greybox'})
        if table is None:
            # No offers table on this page: nothing to extract for this link.
            continue

        # Extract itemNumbers
        item_numbers = []
        for tr in table.find_all('tr')[1:]:  # skip the header line
            item_number = tr.find('a', href=ITEM_NUMBER_RE)
            if item_number is not None:
                href = item_number.get('href')
                item_numbers.append(ITEM_NUMBER_RE.search(href).group(1))

        # Store table into a dataframe (StringIO: passing literal HTML to
        # read_html is deprecated in recent pandas versions).
        temp_df = pd.read_html(StringIO(str(table)))[0]

        # Handle missing columns
        for x in OFFER_COLUMN_POSITIONS:
            if x not in temp_df.columns:
                temp_df[x] = ""
        temp_df = temp_df[OFFER_COLUMN_POSITIONS].dropna(how='all')
        # Drop the first remaining row; copy so the column assignments below
        # act on an independent frame rather than on a slice.
        temp_df = temp_df.iloc[1:].copy()
        temp_df.columns = OFFER_COLUMN_NAMES
        # NOTE(review): this assignment requires len(item_numbers) to equal
        # the number of remaining rows; pandas raises ValueError otherwise.
        temp_df['Contractor Part #'] = item_numbers
        temp_df['Part #'] = part

        # Extracting gsin from the explored link which differs in case of
        # multiple links; cut at '&' so trailing parameters are not included.
        gsin = link.split("gsin=")[1].split("&")[0]
        temp_df['gsin'] = gsin
        temp_df = temp_df[RESULT_COLUMNS]

        # Append new results to the global dataframe
        global_df = pd.concat([global_df, temp_df])
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()
Наконец, фрейм данных global_df выглядит так:
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| | gsin | Part # | Contractor Part # | Contractor Name | Price | Deliv days | Min Order | FOB/Shipping |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| 1 | 11000058164089 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGX-XXL | UNIFIRE INC | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI | |
| 1 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-L | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MILREG-L | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000063336537 | MCMS00001 | MCMS00001 | o | $153.99 | $50.00 | D-CONUSND-AK,PR,HI | NaN |
| 3 | 11000063336537 | MCMS00001 | MCMS00001 | s dv | $157.16 | $50.00 | O-CONUS,AK,PR,HI | NaN |
| 1 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-S | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MILREG-S | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-M | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MILREG-M | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XS | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XS | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163625 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGXX-XXXL | UNIFIRE INC | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI | |
| 1 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XL | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XL | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+