Adjusting web scraping code for a different site
0 votes
/ 07 March 2020

I am currently using this code to scrape reviews from TrustPilot. I would like to adjust the code to scrape reviews from https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create. However, unlike most other review sites, the reviews there are not split across multiple sub-pages; instead, there is a "show more reviews" button at the end of the page that reveals 3 additional reviews each time it is clicked.

Is it possible to adjust the code so that it can scrape all the reviews for this particular product from a site with this kind of structure?

from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
print('all imported successfully')

# Initialize an empty dataframe
df = pd.DataFrame()
for x in range(1, 44):  # TrustPilot paginates these reviews across 43 pages
    names = []
    headers = []
    bodies = []
    ratings = []
    published = []
    updated = []
    reported = []

    link = f'https://www.trustpilot.com/review/birchbox.com?page={x}'
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    articles = soup.find_all('article', {'class':'review'})
    for article in articles:
        names.append(article.find('div', attrs={'class': 'consumer-information__name'}).text.strip())
        headers.append(article.find('h2', attrs={'class':'review-content__title'}).text.strip())
        try:
            bodies.append(article.find('p', attrs={'class':'review-content__text'}).text.strip())
        except AttributeError:
            # some reviews have no body text
            bodies.append('')

        try:
            # the star rating is exposed via the alt text of the first rating image
            ratings.append(article.find_all("img", alt=True)[0]["alt"])
        except (AttributeError, IndexError):
            ratings.append('')
        # the dates are embedded as a JSON object inside the header div
        dateElements = article.find('div', attrs={'class':'review-content-header__dates'}).text.strip()
        jsonData = json.loads(dateElements)
        published.append(jsonData['publishedDate'])
        updated.append(jsonData['updatedDate'])
        reported.append(jsonData['reportedDate'])


    # Build this page's temporary dataframe, then append it to the "final" dataframe
    temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date': updated, 'Reported Date': reported})
    df = pd.concat([df, temp_df], sort=False).reset_index(drop=True)

print('pass1')


df.to_csv('BirchboxReviews2.0.csv', index=False, encoding='utf-8')
print('csv done')

Answers [2]

2 votes
/ 08 March 2020

You are dealing with a website that is loaded dynamically by JavaScript after the initial page load, where the comments are rendered by JS code on each scroll down.

I managed to track down the XHR request that the JS code uses to fetch the comments, and by calling it directly I could retrieve all the comments you asked for.

You don't need to use selenium, as that would slow down your process.

Here is how you can achieve your goal, given that each page holds 3 comments; we simply compute how many pages are needed to cover them all (e.g., 99 reviews → 99 / 3 = 33 full pages).

import requests
from bs4 import BeautifulSoup
import math


def PageNum():
    # read the total review count from the "show more reviews" button text
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    # 3 comments per page; return an int and pad the count so that range()'s
    # exclusive upper bound still covers the last (possibly partial) page
    if num % 3 == 0:
        return num // 3 + 1
    else:
        return math.ceil(num / 3) + 2


def Main():
    num = PageNum()
    print(f"Number of Pages {num}")
    # send the same AJAX marker header the site's own JS uses for its XHR calls
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            print("*" * 40)
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            # the response is a JSON-escaped HTML fragment, so the class
            # attribute appears as \"comment-body\" and newlines as literal \n
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"comment-body\"'):
                print(com.text[5:com.text.find(r"\n", 3)])


Main()

Sample output:

Number of Pages 49
Extracting Page# 1
****************************************
I think Boxycharm overall is the best beauty subscription. However, I think it's 
ridiculous that if you want to upgrade you have to pay the 25 for the first box and then add additional money to get the premium. Even though it's only one time, 
that's insane. So about 80 bucks just to switch to Premium. And suppose U do that and then my Boxy Premium shows up at my door. I open it ....and absolutely hate 
the majority if everything I have. Yeah I would be furious! Not worth taking a chance on. Boxy only shows up half the time with actual products or colors I use.  
I love getting the monthly boxes, just wish they would have followed my preferences for colors!
I used to really get excited for my boxes. But not so much anymore.  This months 
Fenty box choices lack! I am not a clown
Extracting Page# 2
****************************************
Love it its awsome
Boxycharm has always been a favorite subscription box, I’ve had it off and on , love most of the goodies.  I get frustrated when they don’t curate it to fit me and or customer service isn’t that helpful but overall a great box’!
I like BoxyCharm but to be honest I feel like some months they don’t even look at your beauty profile because I sometimes get things I clearly said I wasn’t interested in getting.
Extracting Page# 3
****************************************
The BEST sub box hands down. 
I love all the boxy charm boxes everything is amazing all full size products and 
the colors are outstanding
I absolutely love Boxycharm.  I have received amazing high end products.  My makeup cart is so full I have such a variety everyday. I love the new premium box and paired with Boxyluxe I recieve 15 products for $85 The products are worth anywhere from $500 to $700  total.  I used to spend $400 a month buying products at Ulta. I would HIGHLY recommend this subscription.  
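A side note on the escaped class name above: the literal \" and \n sequences suggest the endpoint returns the HTML fragment as a JSON-encoded string. If so, a slightly more robust variant is to decode the payload first and then parse plain HTML with the normal class names. A minimal sketch, assuming the response body is a single JSON string:

import json
import requests
from bs4 import BeautifulSoup

headers = {'X-Requested-With': 'XMLHttpRequest'}
r = requests.get(
    "https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page=1",
    headers=headers)
# assumption: the body is a JSON-encoded HTML string; json.loads() turns the
# escaped \" and \n sequences back into plain HTML (fall back to raw text otherwise)
html = json.loads(r.text) if r.text.startswith('"') else r.text
soup = BeautifulSoup(html, 'html.parser')
for com in soup.find_all("div", class_="comment-body"):
    print(com.get_text(strip=True))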
1 vote
/ 08 March 2020

I also worked out some code for your site. It uses Selenium to click the "show more reviews" button and to scroll; let me know if you have any doubts. I still suggest you go through the article first.

# -*- coding: utf-8 -*-
"""
Created on Sun Mar  8 18:09:45 2020

@author: prakharJ
"""

from selenium import webdriver
import time
import pandas as pd

names_found = []
comments_found = []
ratings_found = []
dateElements_found = []

# Web extraction of the subscription-boxes page
print("scheduled to run boxes web scraper")
driver = webdriver.Chrome(executable_path='Your/path/to/chromedriver.exe')
webpage = 'https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create'
driver.get(webpage)

SCROLL_PAUSE_TIME = 6

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # scroll most of the way down so the lazy-loaded content triggers
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight*0.80);")

    time.sleep(SCROLL_PAUSE_TIME)
    try:
        b = driver.find_element_by_class_name('show-more-reviews')
        b.click()
        time.sleep(SCROLL_PAUSE_TIME)
    except Exception:
        # no "show more reviews" button left; keep scrolling until the
        # page height stops changing
        pass

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")

    if new_height == last_height:
        break
    last_height = new_height

names_list = driver.find_elements_by_class_name('name')
comment_list = driver.find_elements_by_class_name('comment-body')
rating_list = driver.find_elements_by_xpath("//meta[@itemprop='ratingValue']")
date_list = driver.find_elements_by_class_name('comment-date')
for names in names_list:
    names_found.append(names.text)
for bodies in comment_list:
    try:
        comments_found.append(bodies.text)
    except Exception:
        comments_found.append('NA')
for ratings in rating_list:
    try:
        # the numeric rating lives in the meta tag's "content" attribute
        ratings_found.append(ratings.get_attribute("content"))
    except Exception:
        ratings_found.append('NA')
for dateElements in date_list:
    dateElements_found.append(dateElements.text)
# Collect everything into a single dataframe
temp_df = pd.DataFrame({'User Name': names_found, 'Body': comments_found, 'Rating': ratings_found, 'Published Date': dateElements_found})
print('extraction completed for the day and system goes into sleep mode')
driver.quit()
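To mirror the CSV export at the end of the question's TrustPilot script, the collected dataframe can then be written out; the file name here is just illustrative:

# hypothetical output file name, mirroring the question's CSV export step
temp_df.to_csv('BoxycharmReviews.csv', index=False, encoding='utf-8')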