Ищу альтернативу Selenium для скрапинга нескольких страниц - PullRequest
0 голосов
/ 18 апреля 2020

Я получаю желаемые результаты, но думаю, что код можно улучшить. Сейчас он довольно медленный и подвержен ошибкам при скрапинге нескольких страниц подряд. Код ниже собирает 5 признаков для 42 транспортных средств (21 на страницу). Всего для этих двух страниц я собираю 18 признаков (остальные 13 здесь не показаны), но это занимает слишком много времени, учитывая, что в общей сложности я хочу обработать 29 страниц.

Чтобы увидеть цену автомобиля, вам необходимо войти в систему, поэтому я использую Selenium, как показано в коде ниже.

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from selenium import webdriver
import time
from IPython.core.interactiveshell import InteractiveShell #optional

# Notebook display tweaks (optional)
InteractiveShell.ast_node_interactivity = "all"  # show the value of every bare expression in a cell
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_columns", None)

driver = webdriver.Chrome()
#driver.maximize_window() #optional

# Log in and search
urls = ["https://www.example.com/"]

# Search form inputs as (element name, text typed into it), in the order they are filled.
search_form = [
    ("searchScope", 'ALL PAST'),
    ("searchMake", 'PLYMOUTH'),
    ('searchModel', 'Cuda'),
    ('searchYearStart', "1970"),
    ('searchYearEnd', "1971"),
]

for url in urls:
    # Open the site and log in through the dropdown form (e-mail only).
    driver.get(url)
    time.sleep(1)
    login_toggle = driver.find_elements_by_class_name("dropdown-toggle")[0]
    login_toggle.click()
    time.sleep(1)
    email_box = driver.find_elements_by_name('email')[0]
    email_box.send_keys("arjenvgeffen@hotmail.com")
    time.sleep(1)
    submit_button = driver.find_elements_by_name("submit")[0]
    submit_button.click()
    time.sleep(2)

    # Go to the search page.
    link = driver.find_element_by_link_text('SEARCH')
    time.sleep(1)
    link.click()
    time.sleep(2)

    # Fill in every search field, pausing one second after each.
    for field_name, typed_text in search_form:
        driver.find_elements_by_name(field_name)[0].send_keys(typed_text)
        time.sleep(1)

    # Submit the search.
    driver.find_element_by_xpath("//button[. = 'Search']").click()
    time.sleep(1)

Приведенный ниже код собирает название транспортного средства (year_make_model_type), цену (которую можно увидеть только после входа в систему по электронной почте, как показано выше) и URL страницы. На следующем этапе page_urls будет использоваться для сбора информации с каждой страницы продукта. При скрапинге 29 страниц это занимает слишком много времени, а скрипт может пропускать страницы или зависать. Буду благодарен за любые улучшения!

# Scrape the listing pages (these fields can be read without opening each vehicle page)
BASE_URL = 'https://www.example.com'
LAST_PAGE = 2        # number of result pages to walk; raise it to cover more pages
PAGE_LOAD_WAIT = 5   # seconds granted to the browser before a page is read
PRICE_NOISE = re.compile(r"[$,]")  # characters stripped from the price text

page_urls = []
year_make_model_type = []
price = []

for page in range(1, LAST_PAGE + 1):
    if page > 1:
        # Page 1 is already on screen after the search; later pages are
        # reached through their pager link.
        driver.find_element_by_xpath('//a[@href="/search/page/%d/"]' % page).click()
    # The only wait that matters is the one after navigation; parsing the
    # captured HTML happens in memory and needs no pauses.
    time.sleep(PAGE_LOAD_WAIT)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Each title anchor supplies both the product URL and the vehicle name.
    for anchor in soup.find_all("a", class_="lot-title"):
        page_urls.append(BASE_URL + anchor.get('href'))
        year_make_model_type.append(anchor.text)

    # Prices are kept as digit strings ("$19,250" -> "19250"); blank stays blank.
    for tag in soup.find_all("span", class_="lot-price"):
        price.append(PRICE_NOISE.sub("", tag.text))

# Sanity checks: the three lists must have the same length to form a DataFrame
len(page_urls) #42
len(set(page_urls)) #42
len(price) #42
len(set(price)) #36
len(year_make_model_type) #42
len(set(year_make_model_type)) #13

# If you need to go back to the first page 
#driver.find_element_by_xpath('//a[@href="/search/page/1/"]').click()

# Create df
scraped_data = pd.DataFrame({'url': page_urls, 'year_make_model_type': year_make_model_type, 'price':price})
# Blank prices become NaN so the column can be cast to float.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
scraped_data['price'] = scraped_data['price'].replace('', np.nan).astype(float)
scraped_data.shape
scraped_data.head()

#driver.quit()

Этот последний фрагмент кода собирает основные характеристики (highlights) и flag_group для каждого транспортного средства со страницы его продукта.

# Create additional features per product url (have to click on product to be able to scrape these features)
def getAndParseURL(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch with a GET request.

    Returns:
        The page body parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Without a timeout requests waits indefinitely, so one stalled page
    # would freeze the whole scrape.
    result = requests.get(url, timeout=30)
    return BeautifulSoup(result.text, 'html.parser')

highlights = []
flag_group = []

# Add features per vehicle
for url in page_urls:
    # Download each product page once; both features come from the same parsed page.
    soup = getAndParseURL(url)

    # Vehicle highlights: the list text flattened to one comma-separated line,
    # or NaN when the page has no highlights list.
    highlights_list = soup.find("ul", class_ = "lot-highlights hidden-print")
    if highlights_list:
        highlights.append(highlights_list.text.strip().replace("\n", ", ").strip())
    else:
        highlights.append(np.nan)

    # Vehicle flag_group: first child of the first matching flag element, or NaN.
    flag = soup.find(class_=["flag flag-main","flag flag-star", "flag flag-feature"])
    flag_group.append(flag.contents[0] if flag else np.nan)

# Assign new features to existing df
scraped_data = scraped_data.assign(highlights=highlights, flag_group=flag_group)
scraped_data.shape
scraped_data.head()

Подскажите, пожалуйста, где, по вашему мнению, приведённый выше код можно улучшить. Спасибо, что уделили время!

Ответы [ 2 ]

1 голос
/ 18 апреля 2020

Вам действительно совсем не нужен весь этот очень длинный код.

Вам даже не нужен Selenium.

Вам не нужно постоянно повторять свой код и все эти вещи.

Код ниже должен легко решить вашу задачу!

Примечание: я обработал только первые 3 страницы; вы можете увеличить диапазон цикла до нужного числа страниц.

import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Form fields sent as the body of each search POST.
data = dict(
    searchScope="past",
    searchText="PLYMOUTH",
    searchMake="Plymouth",
    searchModel="Cuda",
    searchYearStart="1970",
    searchYearEnd="1971",
    submit="",
)

# Extra request headers sent with each search POST.
headers = dict(Referer="https://www.mecum.com")

# Form field sent as the body of the login POST.
login = dict(email="arjenvgeffen@hotmail.com")


def main(url):
    """Log in, walk search result pages 1-3 and print a table of the lots.

    Args:
        url: Search page URL template with one ``{}`` placeholder that
            receives the page number.
    """
    from urllib.parse import urljoin

    with requests.Session() as req:
        # The login response itself is not used; the request is made on the
        # same session that performs the searches below.
        req.post(
            "https://www.mecum.com/includes/login-action.cfm", data=login)
        p = PrettyTable()
        p.field_names = ["Name", "Url", "Price"]
        for item in range(1, 4):
            r = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            for tar in soup.select("div.lot"):
                # A lot with no <span>, or an empty one, has no price to show.
                span = tar.span
                price = span.text if span is not None and span.text else "N/A"

                hint = tar.select_one("a.lot-title")
                # Resolve the href against the page URL rather than slicing a
                # fixed-length prefix off the template.
                p.add_row([hint.text, urljoin(url, hint['href']), price])
        print(p)


main("https://www.mecum.com/search/page/{}/")

Выход:

+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
|                           Name                           |                                             Url                                              |  Price   |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
|                   1936 Plymouth  Coupe                   |                https://www.mecum.com/lots/HA0420-412309/1936-plymouth-coupe/                 |   N/A    |
|               1937 Plymouth Deluxe Pickup                |            https://www.mecum.com/lots/HA0420-412385/1937-plymouth-deluxe-pickup/             |   N/A    |
|                1951 Plymouth  Convertible                |             https://www.mecum.com/lots/HA0420-412744/1951-plymouth-convertible/              |   N/A    |
|                1968 Plymouth Road Runner                 |             https://www.mecum.com/lots/HA0420-412874/1968-plymouth-road-runner/              |   N/A    |
|                    1970 Plymouth Cuda                    |                 https://www.mecum.com/lots/HA0420-413047/1970-plymouth-cuda/                 |   N/A    |
|              1971 Plymouth Cuda Convertible              |           https://www.mecum.com/lots/HA0420-413138/1971-plymouth-cuda-convertible/           |   N/A    |
|                1968 Plymouth Road Runner                 |             https://www.mecum.com/lots/HA0420-427812/1968-plymouth-road-runner/              |   N/A    |
|                1969 Plymouth Road Runner                 |             https://www.mecum.com/lots/AZ0320-404226/1969-plymouth-road-runner/              | $19,250  |
|             1973 Plymouth Duster Police Car              |          https://www.mecum.com/lots/AZ0320-404232/1973-plymouth-duster-police-car/           | $18,700  |
|       1963 Plymouth Valiant Signet 200 Convertible       |    https://www.mecum.com/lots/AZ0320-404250/1963-plymouth-valiant-signet-200-convertible/    |  $3,850  |
|                   1946 Plymouth  Taxi                    |                 https://www.mecum.com/lots/AZ0320-404267/1946-plymouth-taxi/                 |  $3,300  |
|                    1969 Plymouth GTX                     |                 https://www.mecum.com/lots/AZ0320-404449/1969-plymouth-gtx/                  | $25,000  |
|                  1999 Plymouth Prowler                   |               https://www.mecum.com/lots/AZ0320-404457/1999-plymouth-prowler/                | $20,000  |
|        1967 Plymouth Barracuda Formula S Fastback        |     https://www.mecum.com/lots/AZ0320-404478/1967-plymouth-barracuda-formula-s-fastback/     | $33,000  |
|              1970 Plymouth Cuda Convertible              |           https://www.mecum.com/lots/AZ0320-404626/1970-plymouth-cuda-convertible/           | $51,700  |
|                    1967 Plymouth GTX                     |                 https://www.mecum.com/lots/AZ0320-404634/1967-plymouth-gtx/                  | $31,350  |
|               1970 Plymouth Cuda Resto Mod               |            https://www.mecum.com/lots/AZ0320-404636/1970-plymouth-cuda-resto-mod/            | $50,000  |
|                1969 Plymouth Road Runner                 |             https://www.mecum.com/lots/AZ0320-404656/1969-plymouth-road-runner/              | $34,100  |
|                    1970 Plymouth Cuda                    |                 https://www.mecum.com/lots/AZ0320-404858/1970-plymouth-cuda/                 | $70,000  |
|                 1970 Plymouth Superbird                  |              https://www.mecum.com/lots/AZ0320-404866/1970-plymouth-superbird/               | $143,000 |
|           1967 Plymouth Satellite Convertible            |        https://www.mecum.com/lots/AZ0320-404883/1967-plymouth-satellite-convertible/         | $30,800  |
|                  1970 Plymouth AAR Cuda                  |               https://www.mecum.com/lots/AZ0320-404897/1970-plymouth-aar-cuda/               | $71,500  |
|            1967 Plymouth Barracuda Resto Mod             |         https://www.mecum.com/lots/AZ0320-404918/1967-plymouth-barracuda-resto-mod/          | $60,500  |
|              1969 Plymouth GTX Convertible               |           https://www.mecum.com/lots/AZ0320-404950/1969-plymouth-gtx-convertible/            | $42,000  |
|                 1959 Plymouth Sport Fury                 |              https://www.mecum.com/lots/AZ0320-404972/1959-plymouth-sport-fury/              | $30,000  |
|                 1965 Plymouth Barracuda                  |              https://www.mecum.com/lots/AZ0320-405120/1965-plymouth-barracuda/               | $22,000  |
|                 1970 Plymouth Hemi Cuda                  |              https://www.mecum.com/lots/AZ0320-405220/1970-plymouth-hemi-cuda/               | $150,700 |
|                 1970 Plymouth Superbird                  |              https://www.mecum.com/lots/AZ0320-405229/1970-plymouth-superbird/               | $115,000 |
|                    1970 Plymouth Cuda                    |                 https://www.mecum.com/lots/AZ0320-405236/1970-plymouth-cuda/                 | $52,500  |
|                 1970 Plymouth Hemi Cuda                  |              https://www.mecum.com/lots/AZ0320-405266/1970-plymouth-hemi-cuda/               | $130,000 |
|              1968 Plymouth Hemi Road Runner              |           https://www.mecum.com/lots/AZ0320-405267/1968-plymouth-hemi-road-runner/           | $70,000  |
|              1969 Plymouth Hemi Road Runner              |           https://www.mecum.com/lots/AZ0320-405286/1969-plymouth-hemi-road-runner/           | $62,000  |
|                1969 Plymouth Road Runner                 |             https://www.mecum.com/lots/AZ0320-405304/1969-plymouth-road-runner/              | $120,000 |
|           1959 Plymouth Sport Fury Convertible           |        https://www.mecum.com/lots/AZ0320-405321/1959-plymouth-sport-fury-convertible/        | $70,000  |
|               1973 Plymouth Cuda Resto Mod               |            https://www.mecum.com/lots/AZ0320-405340/1973-plymouth-cuda-resto-mod/            | $75,000  |
|        1969 Plymouth Sport Satellite Convertible         |     https://www.mecum.com/lots/AZ0320-405384/1969-plymouth-sport-satellite-convertible/      | $37,400  |
|                  1970 Plymouth AAR Cuda                  |               https://www.mecum.com/lots/AZ0320-405385/1970-plymouth-aar-cuda/               | $55,000  |
|                1969 Plymouth Road Runner                 |             https://www.mecum.com/lots/AZ0320-423532/1969-plymouth-road-runner/              | $60,500  |
|                 1970 Plymouth Hemi Cuda                  |              https://www.mecum.com/lots/AZ0320-423534/1970-plymouth-hemi-cuda/               | $93,500  |
|              1968 Plymouth Hemi Road Runner              |           https://www.mecum.com/lots/AZ0320-423535/1968-plymouth-hemi-road-runner/           | $66,000  |
|                    1970 Plymouth Cuda                    |                 https://www.mecum.com/lots/AZ0320-423545/1970-plymouth-cuda/                 | $60,000  |
|  1940s-50s Desoto Plymouth Double-Sided Porcelain 45x42  |  https://www.mecum.com/lots/AZ0320-424465/1940s-50s-desoto-plymouth-double-sided-porcelain/  |  $2,950  |
|  1940s-50s Dodge Plymouth Double-Sided Porcelain 42-in   |  https://www.mecum.com/lots/AZ0320-424468/1940s-50s-dodge-plymouth-double-sided-porcelain/   |  $5,900  |
| 1940s-50s Chrysler Plymouth Double-Sided Porcelain 42-in | https://www.mecum.com/lots/AZ0320-424471/1940s-50s-chrysler-plymouth-double-sided-porcelain/ |  $3,776  |
|                1969 Plymouth Road Runner                 |             https://www.mecum.com/lots/AZ0320-424624/1969-plymouth-road-runner/              | $59,400  |
|           1965 Plymouth Sport Fury Convertible           |        https://www.mecum.com/lots/AZ0320-424629/1965-plymouth-sport-fury-convertible/        | $13,750  |
|          1970 Plymouth Road Runner Convertible           |       https://www.mecum.com/lots/AZ0320-428253/1970-plymouth-road-runner-convertible/        | $45,000  |
|           1970 Plymouth Barracuda Convertible            |        https://www.mecum.com/lots/AZ0320-428658/1970-plymouth-barracuda-convertible/         | $42,900  |
|                 1966 Plymouth Barracuda                  |              https://www.mecum.com/lots/FL0120-394693/1966-plymouth-barracuda/               |  $9,625  |
|                 1965 Plymouth Barracuda                  |              https://www.mecum.com/lots/FL0120-394746/1965-plymouth-barracuda/               |  $7,700  |
|                 1969 Plymouth Satellite                  |              https://www.mecum.com/lots/FL0120-394747/1969-plymouth-satellite/               |  $3,850  |
|                   1954 Plymouth Savoy                    |                https://www.mecum.com/lots/FL0120-394753/1954-plymouth-savoy/                 |  $7,150  |
|                 1952 Plymouth Police Car                 |              https://www.mecum.com/lots/FL0120-394828/1952-plymouth-police-car/              |   N/A    |
|                   1970 Plymouth Duster                   |                https://www.mecum.com/lots/FL0120-394921/1970-plymouth-duster/                | $26,400  |
|                 1965 Plymouth Barracuda                  |              https://www.mecum.com/lots/FL0120-394956/1965-plymouth-barracuda/               |  $8,800  |
|               1950 Plymouth Special Deluxe               |            https://www.mecum.com/lots/FL0120-394983/1950-plymouth-special-deluxe/            |  $8,250  |
|                1973 Plymouth Road Runner                 |             https://www.mecum.com/lots/FL0120-395009/1973-plymouth-road-runner/              | $21,000  |
|                1970 Plymouth Road Runner                 |             https://www.mecum.com/lots/FL0120-395013/1970-plymouth-road-runner/              | $51,700  |
|                 1969 Plymouth Barracuda                  |              https://www.mecum.com/lots/FL0120-395106/1969-plymouth-barracuda/               | $17,600  |
|           1966 Plymouth Satellite Convertible            |        https://www.mecum.com/lots/FL0120-395145/1966-plymouth-satellite-convertible/         | $26,400  |
|                1970 Plymouth Road Runner                 |             https://www.mecum.com/lots/FL0120-395341/1970-plymouth-road-runner/              | $47,300  |
|                    1970 Plymouth Cuda                    |                 https://www.mecum.com/lots/FL0120-395362/1970-plymouth-cuda/                 | $61,000  |
|            1999 Plymouth Prowler Convertible             |         https://www.mecum.com/lots/FL0120-395647/1999-plymouth-prowler-convertible/          | $30,800  |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
0 голосов
/ 20 апреля 2020

Я отредактировал код Ахмеда, чтобы получить желаемый результат (pandas df)

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Form fields sent as the body of each search POST.
data = dict(
    searchScope="past",
    searchMake="Plymouth",
    searchModel="Cuda",
    searchYearStart="1970",
    searchYearEnd="1971",
    submit="",
)

# Extra request headers sent with each search POST.
headers = dict(Referer="https://www.example.com")

# Form field sent as the body of the login POST.
login = dict(email="example@hotmail.com")


# Module-level accumulators, one independent list per output column.
price, urls, title, results = [], [], [], []


def main(url):
    """Log in, walk search result pages 1-29 and fill the module-level lists.

    For every ``div.lot`` on every page exactly one entry is appended to each
    of ``urls``, ``title``, ``price`` and ``results``, so the four lists stay
    the same length.

    Args:
        url: Search page URL template with one ``{}`` placeholder that
            receives the page number.
    """
    with requests.Session() as req:
        # The login response itself is not used; the request is made on the
        # same session that performs the searches below.
        req.post(
            "https://www.example.com/includes/login-action.cfm", data=login)
        for item in range(1, 30):
            response = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            for tar in soup.select("div.lot"):
                urls.append(tar.a.get('href'))
                title.append(tar.select_one("a.lot-title").text)

                # A lot with no <span>, or an empty one, has no price.
                span = tar.span
                price.append(span.text if span is not None and span.text else np.nan)

                # Record the class names of the lot's image container(s) as one
                # string; a lot without a container contributes ''.
                containers = tar.select("div[class*=lot-image-container]")
                results.append(' '.join(' '.join(box['class']) for box in containers))


main("https://www.example.com/search/page/{}/")


scraped_data = pd.DataFrame({'url': urls, 'year_make_model_type': title, 'price':price, 'results': results})    
scraped_data.shape

# Drop the generic container class and surrounding spaces so only the extra
# class names remain; rows with nothing left become NaN.
scraped_data["results"] = scraped_data["results"].str.replace("lot-image-container", "").str.strip()
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
scraped_data["results"] = scraped_data["results"].replace('', np.nan)
scraped_data.head()

Теперь я хочу извлечь признаки из списка страниц продукта, которые находятся в столбце 'url'. Ниже рабочий пример, но он слишком медленный. Я пытался ускорить его с помощью многопроцессорной обработки, но пока не разобрался. Я хочу извлечь около 10 дополнительных признаков для 500+ страниц, поэтому нужно решение побыстрее.

low_url = ['https://www.mecum.com/lots/KC1210-101030/1970-plymouth-cuda/',
 'https://www.mecum.com/lots/SC0510-91294/1970-plymouth-hemi-cuda/',
 'https://www.mecum.com/lots/KC1210-100686/1970-plymouth-barracuda-convertible/',
 'https://www.mecum.com/lots/KA0316-235834/1970-plymouth-barracuda-convertible/',
 'https://www.mecum.com/lots/FL0110-88180/1970-plymouth-barracuda/']


reserve = []
with requests.Session() as req:
    for url in low_url:
        response = req.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Look the flag up once: its first child when present, NaN otherwise.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        flag = soup.find(class_=["flag flag-no-reserve"])
        reserve.append(flag.contents[0] if flag else np.nan)

len(reserve)
len(set(reserve))
reserve

Out: ['No Reserve', nan, nan, 'No Reserve', nan]
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...