Scraping to CSV with Scrapy & Python does not add the correct values
1 vote
/ 20 March 2020

I am trying to save some data to a CSV file with Scrapy; I run three chained scraping steps inside one of my spiders. But the CSV file that the crawl generates only saves the data extracted by the first callback, "parse", while I need to save the data from the third one, "parse_following_urls_2".

I even tried using pandas to create the CSV file, but Scrapy seems to have trouble with pandas.

To run the code: scrapy crawl goodmans -o goodmans.csv -t csv
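By the way, as far as I know the -t csv flag is optional when the output name already ends in .csv, and on newer Scrapy versions (2.1+) the same export can be configured once in settings.py via the FEEDS setting, roughly:

# settings.py -- equivalent to "-o goodmans.csv -t csv" on the command line
FEEDS = {
    'goodmans.csv': {'format': 'csv'},
}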

What can I do? Here is my code:

import os
import scrapy
from ..items import TutorialItem
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import Request

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

class GoodmanSpider(scrapy.Spider):
    name = "goodmans"
    start_urls = ['http://www.goodmans.net/d/1706/brands.htm']
    supplier_urls = []

    def parse(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            category = data.css('.SubDepartments a::text').extract()
            category_url = data.css('.SubDepartments a::attr(href)').extract()
            items['category'] = category
            items['category_url'] = category_url
            for cat, url in zip(category, category_url):
                item = dict(category=cat, category_url=url)
                yield item
                yield Request(url, callback=self.parse_following_urls, dont_filter=True)
                # yield {'Category': category, 'url': category_url}

    def parse_following_urls(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            article_url = data.css('.SubDepartments a::attr(href)').extract()
            items['article_url'] = article_url
            for url in article_url:
                item = dict(article_url=url)
                # yield item
                yield Request(url, callback=self.parse_following_urls_2, dont_filter=True)

    def parse_following_urls_2(self, response):
        items = TutorialItem()
        all_data = response.css('.recin')
        for data in all_data:
            article_url_2 = data.css('.recit a').xpath('@href').extract()
            article_name = data.css('.recit span::text').extract()
            article_price = data.css('.price span::text').extract()
            article_item_num = data.css('.itemno span::text').extract()
            article_image = data.css('.linkit img').xpath('@src').extract()
            items['article_url_2'] = article_url_2
            items['article_name'] = article_name
            items['article_price'] = article_price
            items['article_item_num'] = article_item_num
            items['article_image'] = article_image
            for au, an, ap, ain, ai in zip(article_url_2, article_name, article_price, article_item_num, article_image):
                item = dict(article_url_2=au, article_name=an, article_price=ap, article_item_num=ain, article_image=ai)
                df_result = pd.DataFrame.from_dict(item)
                path_file = os.path.join(BASE_DIR, 'csv/supplier_product_database.csv')
                my_csv = df_result.to_csv(path_file, index=None, header=True)
                yield item
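As far as I can tell, the to_csv call above also recreates supplier_product_database.csv for every item it handles (to_csv overwrites by default), so at most the last row would survive. A rough, untested append-mode sketch of that same write would be:

# Sketch only: build a one-row frame (from_dict with all-scalar values
# needs an index, hence the list wrapper) and append to the file,
# writing the header only when the file does not exist yet.
df_result = pd.DataFrame([item])
path_file = os.path.join(BASE_DIR, 'csv/supplier_product_database.csv')
df_result.to_csv(path_file, mode='a', index=False,
                 header=not os.path.exists(path_file))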

The items.py file:
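The text of items.py did not come through above, so here is only a sketch of fields consistent with what the spider assigns (my reconstruction, not the original file):

import scrapy

class TutorialItem(scrapy.Item):
    # Field names assumed from the spider code above; the real items.py may differ.
    category = scrapy.Field()
    category_url = scrapy.Field()
    article_url = scrapy.Field()
    article_url_2 = scrapy.Field()
    article_name = scrapy.Field()
    article_price = scrapy.Field()
    article_item_num = scrapy.Field()
    article_image = scrapy.Field()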

2 Answers

1 vote
/ 20 March 2020

I used response.xpath; I hope this helps you:

class GoodManSpider(scrapy.Spider):
    name = "goodmans"


    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

### scraped all links listed in goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)


### scraped all product links, scraped category name, and category url, and it will be carried by meta as dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products, meta={'category_name': category, 'category_link': category_url})

### scraped all necessary data and yield items
    def parse_products(self, response):
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()[0]
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()[0]
        article_item_number = response.xpath('//span[@itemprop="productID"]/text()').extract()[0]
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()[0]
        items = TutorialItem()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_number
        items['article_image'] = article_image

        yield items

I don't know what article_url_2 means; as far as I understand:

category = company name
category_url = company page link in goodman
article_name = product name
article_url = product_link
article_price = product price
article_item_number = product item number
article_image = product image
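A small aside: on newer Scrapy versions (1.7 and later, if I remember right) the same category hand-off can be done with cb_kwargs instead of meta, so the values arrive as named callback arguments. A sketch of just that part:

    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products,
                                 cb_kwargs={'category_name': category, 'category_link': category_url})

    def parse_products(self, response, category_name, category_link):
        # cb_kwargs values are injected straight into the callback signature,
        # so there is no need to read them back from response.meta.
        ...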
0 votes
/ 22 March 2020

Based on @bonifacio_kid's code, with a few small fixes, it now works as it should:

class GoodManSpider2(scrapy.Spider):
    name = "goodmans_2"


    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

### scraped all links listed in goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)


### scraped all product links, scraped category name, and category url, and it will be carried by meta as dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products, meta={'category_name': category, 'category_link': category_url})

### scraped all necessary data and yield items
    def parse_products(self, response):
        items = TutorialItem()
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()
        article_item_num = response.xpath('//span[@itemprop="productID"]/text()').extract()
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_num
        items['article_image'] = article_image

        for an, ap, ain, ai in zip(article_name, article_price, article_item_num, article_image):
            item = dict(supplier_url=article_url, supplier_item_name=an, min_price=ap, max_price=ap,
                        article_item_num=ain, article_image=ai)
            yield item
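One more optional tweak (not needed for the fix above): on recent Scrapy versions the .get() selector shortcut returns the first match or None, so each field is a plain string and the zip becomes unnecessary when a page holds a single product. A sketch of parse_products written that way:

    def parse_products(self, response):
        # .get() returns the first matching text (or None), avoiding IndexError
        # on pages where a field is missing.
        yield dict(
            supplier_url=response.url,
            category=response.meta['category_name'],
            category_url=response.meta['category_link'],
            supplier_item_name=response.xpath('//h1[@class="SectionTitle"]/text()').get(),
            min_price=response.xpath('//div[@id="ProdInfoL"]/div/span/text()').get(),
            max_price=response.xpath('//div[@id="ProdInfoL"]/div/span/text()').get(),
            article_item_num=response.xpath('//span[@itemprop="productID"]/text()').get(),
            article_image=response.xpath('//div[@id="FullPic"]/a/@href').get(),
        )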