Question

Я пытаюсь настроить проект на Python Scrapy и запустить его локально на своём ПК. Цель — изучить и понять, как он работает. Я добавил вызов функции start_requests() в "main", но она не вызывается. Буду благодарен за любую помощь или ссылки на соответствующие ресурсы.

Программа компилируется без ошибок, но просто открывает пустое окно браузера. Ожидаемый результат — пройти по списку кодов ASIN из файла .csv и собрать некоторые данные с соответствующих страниц.

# -*- coding: utf-8 -*-

import re
from os.path import splitext, basename

from bs4 import BeautifulSoup as bs
#from scrapy import Spider, Request

# Maps a country name to its short code and the domain suffix used for it.
country_domain = dict(
    US=dict(code='us', domain='com'),
    UK=dict(code='uk', domain='co.uk'),
    Germany=dict(code='de', domain='de'),
)


def get_asin_url(asin, domain='com'):
#function get_asin_url body omitted for clarity ...

def get_title(soup):
    """Return the product title found in *soup*, or "" when there is none.

    The title is the text of the ``<h1 id="title">`` element with leading
    and trailing whitespace removed and every internal run of whitespace
    collapsed into a single space.
    """
    heading = soup.find('h1', id='title')
    if not heading:
        return ""
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', heading.text.strip())

class AmazonbotSpider():
    """Build one product-page request per ASIN and extract the page title.

    The ASINs are read from a text file (one per line) and turned into URLs
    with ``get_asin_url`` for the domain configured for ``country``.

    NOTE(review): the ``from scrapy import Spider, Request`` import is
    commented out in this file and the class has no base class, so nothing
    calls ``start_requests`` automatically and ``Request`` is undefined once
    the generator is consumed. Presumably the class is meant to subclass
    ``scrapy.Spider`` and be run by Scrapy itself - confirm and restore the
    import.
    """
    print("I'm in class AmazonbotSpider")
    name = 'amazonbot'
    # NOTE(review): Scrapy appears to match allowed domains literally, so the
    # 'amazon.*' wildcard presumably matches nothing - verify against the docs.
    allowed_domains = ['amazon.*']
    start_urls = ['https://amazon.com/']
    custom_settings = {'FEED_URI': '%(input_filename)s_%(country)s_%(time)s.csv'}

    def __init__(self, asin_path='C:\\Users\\Chris K\\Documents\\0_Molzi\\AmazonScraping\\customScripts\\asins.csv', country='UK', *args, **kwargs):
        """Store the input path and country, then print the URL of every ASIN.

        asin_path: path of the text file holding one ASIN per line.
        country: key into ``country_domain``; an unknown key raises KeyError.
        Remaining positional and keyword arguments go to the base initializer.
        """
        print("I'm in __init__")
        super(AmazonbotSpider, self).__init__(*args, **kwargs)
        self.asin_path = asin_path
        self.country = country
        # Despite the attribute name, this holds the domain suffix ('co.uk').
        self.country_code = country_domain[country]['domain']
        self.input_filename = splitext(basename(asin_path))[0]

        # Debug output only: show the URL that each ASIN resolves to.
        for asin in self._read_asins():
            data = get_asin_url(asin, self.country_code)
            print("data: ",data)

    def _read_asins(self):
        """Return the stripped lines of ``self.asin_path`` as a list."""
        with open(self.asin_path, 'r') as fp:
            return [line.strip() for line in fp]

    def start_requests(self):
        """Yield one request per ASIN, carrying the ASIN in ``meta['item']``."""
        print("I'm in start_requests")
        for asin in self._read_asins():
            data = Request(get_asin_url(asin, self.country_code), callback=self.parse)
            data.meta['item'] = {'asin': asin}
            print("data: ",data)
            yield data

    def parse(self, response):
        """Yield the item from ``response.meta`` with 'name' and 'url' added."""
        print("I'm in parse")
        item = response.meta['item']
        soup = bs(response.text, 'lxml')
        # Remove any style tags
        for style_tag in soup.find_all('style'):
            style_tag.extract()
        item['name'] = get_title(soup)
        item['url'] = response.url
        yield item

if __name__ == "__main__":
    # Only the constructor runs here; start_requests() is never invoked.
    spider = AmazonbotSpider()
    print("I'm in __main__")

LeCoon zarakailloux · Answer 1 · 02 мая 2019

У оператора return в get_title неправильный отступ.

Линтер поможет вам писать хороший код на Python — попробуйте, например, pylint.

Chris Kotsiopoulos · Answer 2 · 03 мая 2019

Я перестал пытаться настроить вышеуказанный код. Вместо этого я использовал этот скрипт в качестве основы и просто добавил нужные мне методы. Использование Selenium - главное отличие:

from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv

# Module-level setting that the visible code never references; presumably the
# Amazon domain suffix used by the omitted methods - TODO confirm.
globaldomain = "es"

class AmazonScaper(object):

    def __init__(self, asins, output_file='results.csv', sleep=2):
        """Start a Chrome session and queue the ASINs to process.

        asins: iterable of ASINs, copied into a deque in the given order.
        output_file: stored as ``self.output_file``.
        sleep: stored as ``self.sleep``; run_crawler sleeps this many
            seconds after loading each page.
        """
        # Point executable_path at your own ChromeDriver binary.
        self.browser = webdriver.Chrome(executable_path='chromedriver.exe')
        pending = deque()
        pending.extend(asins)
        self.asin_queue = pending
        self.output_file = output_file
        self.sleep = sleep
        self.results = list()
    #method bodies omitted for clarity:
    def get_title(self, soup):
    def get_soldby(self,soup):
    def get_price_seller(self,soup):

    def run_crawler(self):
        """Process every queued ASIN and print the fields extracted for it.

        ASINs are taken from the left of ``self.asin_queue`` until it is
        empty. Each page is fetched with ``self.get_page`` and parsed with
        ``self.get_soup``; when a soup is returned, the title, seller and
        price/seller fields are extracted from it. One line per ASIN is
        printed, with the fields separated by ``^^^``.
        """
        while self.asin_queue:
            asin = self.asin_queue.popleft()
            # Reset the fields for every ASIN: without this a page that yields
            # no soup raises NameError on the first pass or repeats the
            # previous product's values on later ones.
            title = soldby = price_seller = ""
            html = self.get_page(asin)
            soup = self.get_soup(html)
            time.sleep(self.sleep)  # Wait for the configured delay
            if soup is not None:  # Parse only when a page was obtained
                title = self.get_title(soup)
                soldby = self.get_soldby(soup)
                price_seller = self.get_price_seller(soup)

                time.sleep(3)
            print(asin,"^^^",title,"^^^",price_seller,"^^^",soldby)
        # NOTE: browser shutdown and CSV export are currently disabled.
        #self.browser.quit()
        #self.csv_output() # Save the object data to csv

if __name__ == "__main__":
    # Read one ASIN per line, replacing spaces with '+'. The context manager
    # guarantees that the input file is closed once the list is built.
    with open('ASINs.txt') as asin_file:
        asins = [line.rstrip('\n').replace(' ', '+') for line in asin_file]
    ranker = AmazonScaper(asins)  # Create the scraper object
    ranker.run_crawler()  # Process every queued ASIN

Phil Gyford · Answer 3 · 02 мая 2019

Нужно увеличить отступ у start_requests() — сейчас это отдельная функция, а не метод класса AmazonbotSpider.

Инициализация программы Python BeautifulSoup

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 3 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Инициализация программы Python BeautifulSoup

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 3 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Нет похожих вопросов