BeautifulSoup & Selenium - неверные данные, скопированные в мою таблицу после исключения - PullRequest
0 голосов
/ 14 июня 2019

Первое размещение, конструктивная критика приветствуется.

Я пытаюсь автоматизировать задачу анализа рынка для своей компании. Я написал программу, которая в большинстве случаев работает нормально, но перестала работать, как только я добавил исключение для обработки запроса почтового индекса. Теперь BeautifulSoup загружает неверную исходную страницу и не может найти нужные мне данные.

Я попытался обновить переменную ссылки, из которой BeautifulSoup получает исходный код, но, похоже, это не работает.


# Global variables
# Working directory that holds the source pricelist workbook.
os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')
WB_ORIG = openpyxl.load_workbook("test_pricelist.xlsx")  # source pricelist workbook
SHEET_ORIG = WB_ORIG['Sheet1']  # column 1 holds the part IDs read by pricelist_manip()
MAX_ROW = SHEET_ORIG.max_row  # last populated row of the source sheet
NEW_WB_NAME = 'market_analysis_test.xlsx'  # output workbook file name


def pricelist_manip():
    """Search GSA Advantage for each part number in the source pricelist and
    record every offer (contractor name, contractor part number, unit price)
    in the market-analysis workbook.

    Reads globals: SHEET_ORIG, MAX_ROW, NEW_WB_NAME.
    Side effects: drives a Chrome browser; saves NEW_WB_NAME after each part.
    Output layout per offer row: A=item id, B=contractor part #,
    C=contractor name, D=unit price.
    """
    # First output row in the new sheet (row 1 is assumed to be a header).
    r = 2
    wb_new = openpyxl.load_workbook(NEW_WB_NAME)
    sheet_new = wb_new.active

    # chromedriver lives in a different directory than the workbooks.
    os.chdir(r"C:\Users\user\PycharmProjects\selenium\drivers")
    chrome_options = webdriver.ChromeOptions()
    # Disable image loading to speed up page loads.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=chrome_options)

    os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')

    driver.set_page_load_timeout(20)
    driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
    driver.maximize_window()

    # BUG FIX: range(2, MAX_ROW) skipped the last data row; include it.
    for i in range(2, MAX_ROW + 1):
        # Copy the manufacturer's part number into the search box and search.
        start_time = time.time()

        driver.set_page_load_timeout(20)
        item_id = SHEET_ORIG.cell(row=i, column=1).value
        search_box = driver.find_element_by_id("twotabsearchtextbox")
        search_box.clear()
        search_box.send_keys(item_id)
        search_box.send_keys(Keys.ENTER)

        product_links = driver.find_elements_by_xpath(
            "//img[@alt='Click to view product details']/following::a[1]")
        print(i - 1)
        print(item_id)
        # One hyperlink per product on the results page.
        links_list = [anchor.get_attribute("href") for anchor in product_links]

        for url in links_list:
            # Visit each product page and copy its offers into the sheet.
            driver.set_page_load_timeout(20)
            driver.get(url)

            # Some product pages first ask for a ZIP code. Fill it in and
            # reload the product page within the SAME browser session.
            try:
                zip_box = driver.find_element_by_id("zip")
                zip_box.clear()
                zip_box.send_keys('91911')
                zip_box.send_keys(Keys.ENTER)
                time.sleep(1)
                driver.get(url)
            except selenium.common.exceptions.NoSuchElementException:
                pass

            rows = r

            # BUG FIX: the original re-downloaded the page with
            # requests.get(), which starts a fresh HTTP session with no
            # ZIP-code context, so BeautifulSoup parsed the wrong page.
            # Parse the page the browser is actually displaying instead.
            soup = BeautifulSoup(driver.page_source, 'lxml')
            body = soup.find('body')
            try:
                part_num = body.find('td', width="78%").text
            except AttributeError:
                # Page layout did not match; fall back to the searched ID.
                print("AttributeError: Contractor Part Number defaulted")
                part_num = item_id

            comments = soup.find_all(string=lambda text: isinstance(text, Comment))

            # Pass 1: contractor names — one output row per offer.
            for comment in comments:
                if comment.strip() == 'Contractor display':
                    contractor = comment.find_next_sibling('td')
                    sheet_new['C' + str(r)] = contractor.text
                    sheet_new['A' + str(r)] = item_id
                    sheet_new['B' + str(r)] = part_num
                    r += 1

            # Pass 2: unit prices, aligned to the same rows as pass 1.
            r = rows
            for comment in comments:
                if comment.strip() == 'unit price':
                    price = comment.find_next_sibling('td')
                    sheet_new['D' + str(r)] = price.text
                    r += 1

            # Advance 3 rows before the next product page's offers.
            # NOTE(review): if a page has more than 3 offers, the next page's
            # rows will overwrite them — confirm the intended row spacing.
            r = rows + 3
            time.sleep(1)

        wb_new.save(NEW_WB_NAME)

        # Report the runtime of this iteration.
        duration = round(time.time() - start_time, 2)
        print("Runtime for this iteration was {} seconds.\n".format(duration))
        time.sleep(2)

    wb_new.save(NEW_WB_NAME)
    wb_new.close()

Я ожидаю, что код выведет следующее для каждого списка цен на единицу:

Part # - Contractor Part # - Contractor Name - Price

Пример, с которым у меня возникают проблемы, - при поиске DF2-850-FDPO-KH-S-R. Есть пять предложений, которые обычно записываются без проблем. В приведенном выше коде результаты не записываются.

1 Ответ

0 голосов
/ 14 июня 2019

Вот предложение, которое использует преимущества pandas (результаты можно потом легко экспортировать в Excel)

# Scrape GSA Advantage offers for one part number into a pandas DataFrame
# (which can later be exported to Excel, e.g. with DataFrame.to_excel).
import pandas as pd
import re
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
# A pandas DataFrame where all results will get stored.
global_df = pd.DataFrame(columns=['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days',  'Min Order', 'FOB/Shipping'])

part = "MCMS00001"  # the part number to search for
driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
element = driver.find_element_by_id("twotabsearchtextbox")
element.send_keys(part)
driver.find_element_by_class_name('nav-submit-input').click()


# Find product-detail links: every product link carries a "gsin=" parameter.
html = driver.page_source
soup= BeautifulSoup(html, 'html')
links = soup.find_all('a', href = re.compile('gsin='))
links = list(set([x.get('href') for x in links]))  # de-duplicate

# Explore each product link.
for link in links:
    driver.get("https://www.gsaadvantage.gov"+link)

    # Sometimes a ZIP-code prompt shows up first; fill it in and submit.
    try:
        element = driver.find_element_by_id("zip")
        element.send_keys("91911")
        driver.find_element_by_name('submit').click()
    except:  # NOTE(review): bare except hides real errors; narrow to NoSuchElementException.
        pass

    html = driver.page_source
    soup= BeautifulSoup(html, 'html')

    # Get the table with the expected elements (offers live in the 'greybox' table).
    table = soup.find('table', {'class':'greybox'})
    item_numbers = []

    # Extract contractor item numbers from each row's "itemNumber=" link.
    for tr in table.find_all('tr')[1:]: #skip the header line
        item_number = tr.find('a', href=re.compile('itemNumber='))

        if item_number != None:
            pattern = 'itemNumber=(.*?)\&'        
            item_numbers.append(re.findall(pattern, item_number.get('href'))[0])

    # Store the offers table into a DataFrame (pd.read_html returns a list of tables).
    temp_df = pd.read_html(str(soup.find('table', {'class':'greybox'})))[0]

    #EDIT START : handle missing columns
    # Some pages omit columns; create them empty so the selection below works.
    for x in [2,8,14,16,18]:
        if x not in temp_df.columns:
            temp_df[x] = ""

    temp_df = temp_df[[2,8,14,16,18]].dropna(how='all')
    temp_df = temp_df[1:]  # drop the repeated header row
    temp_df.columns = ['Price', 'Contractor Name', 'Deliv days',  'Min Order', 'FOB/Shipping']
    temp_df['Contractor Part #'] = item_numbers
    temp_df['Part #'] = part

    # Extracting gsin from the explored link which differs in case of multiple links
    gsin = link.split("gsin=")[1]
    temp_df['gsin'] = gsin

    temp_df = temp_df[['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days',  'Min Order', 'FOB/Shipping']]


    #EDIT END
    # Append new results to the global dataframe.
    global_df = pd.concat([global_df, temp_df])
Наконец, фрейм данных global_df выглядит так:

+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
|   |      gsin      |  Part #   |         Contractor Part #         |    Contractor Name    |  Price  |      Deliv days       |     Min Order      |    FOB/Shipping    |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| 1 | 11000058164089 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGX-XXL   | UNIFIRE INC           | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI   |                    |
| 1 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-L      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MILREG-L       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000063336537 | MCMS00001 | MCMS00001                         | o                     | $153.99 | $50.00                | D-CONUSND-AK,PR,HI | NaN                |
| 3 | 11000063336537 | MCMS00001 | MCMS00001                         | s dv                  | $157.16 | $50.00                | O-CONUS,AK,PR,HI   | NaN                |
| 1 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-S      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MILREG-S       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-M      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MILREG-M       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XS     | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XS      | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163625 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGXX-XXXL | UNIFIRE INC           | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI   |                    |
| 1 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XL     | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XL      | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...