Scrapy: экспортировать проанализированные данные в несколько файлов - PullRequest
0 голосов
/ 28 июня 2018

Я хочу распарсить страницы, а затем экспортировать одни поля в один CSV-файл, а другие — в другой. С помощью экспорта фидов (feed exports) мне удалось сделать это для одного файла следующим образом:

Настройки

# Scrapy feed-export settings for the single CSV file that already works.
# Fields to write to the feed; see the Scrapy feed-exports documentation
# for how the order of this tuple maps to CSV columns.
FEED_EXPORT_FIELDS = (
    'url',
    'group_url',
    'name',
    'streetAddress',
    'city',
    'addressCountry',
)
# Serialization format of the feed.
FEED_FORMAT = 'csv'
# Output location. %(name)s and %(time)s are placeholders substituted by
# Scrapy at run time (presumably spider name and start timestamp — confirm
# against the feed-exports documentation).
FEED_URI = 'output/%(name)s_%(time)s.csv'

Но, как я сказал выше, так данные экспортируются только в один CSV-файл. Я хотел бы иметь возможность выгружать другие поля в другой файл:

# Field list the asker would like to send to a second CSV file.
# NOTE(review): this is the desired configuration, not a working one —
# assigning FEED_EXPORT_FIELDS a second time would simply replace the first
# value, and neither field name is declared on the item class shown in this
# question.
FEED_EXPORT_FIELDS = (
    'employee',
    'employee_group',

)

Метод parse моего паука:

def parse(self, response):
    """Collect every ``ProductItemLoader`` field from ``response``.

    A ``BasicItemLoader`` is created for the response, one value source
    (literal value, CSS selector or XPath expression) is registered per
    item field, and the loaded item is returned.
    """

    def _strip(value):
        # First stage of the default input processor.
        return value.strip()

    loader = BasicItemLoader(item=ProductItemLoader(), response=response)
    # Default input processing: strip each value, then hand it to
    # replace_escape_chars.
    loader.default_input_processor = MapCompose(_strip, replace_escape_chars)
    # loader.default_output_processor = Compose(TakeFirst())

    # Values taken from the request itself rather than from the page body.
    loader.add_value('url', response.request.url)
    loader.add_value('group_url', response.meta.get('section', ''))

    # Name and postal address.
    loader.add_css('name', 'h1[itemprop="name"]::text')
    loader.add_css(
        'streetAddress',
        "div[itemprop=\"address\"] [itemprop=\"streetAddress\"]::text",
    )
    loader.add_css('city', "div[itemprop=\"address\"]>p::text")
    loader.add_css(
        'addressCountry',
        "div[itemprop=\"address\"] [itemprop=\"addressCountry\"]::text",
    )

    # Phone is the only field given an explicit extra processor.
    loader.add_css(
        'phone',
        ".phoneCompany>input[id*='freePhone']::attr(value)",
        TakeFirst(),
    )
    loader.add_css('summary', 'span[itemprop="description"]::text')

    # Table rows: the value sits in the cell following the labelled cell.
    loader.add_xpath(
        'year',
        "//td[contains(text(),'Year established')]/following-sibling::td/text()",
    )
    loader.add_xpath(
        'registry',
        "//td[contains(text(),'Registry of commerce')]/following-sibling::td/text()",
    )
    loader.add_xpath(
        'legal_form',
        "//td[contains(text(),'Legal form')]/following-sibling::td/text()",
    )
    loader.add_xpath(
        'vat',
        "//td[contains(text(),'VAT')]/following-sibling::td/text()",
    )
    loader.add_xpath(
        'fax',
        "//td[contains(text(),'Fax')]/following-sibling::td/text()",
    )

    loader.add_css('website', "[id*='webSite_presentation_']::text")
    loader.add_css('brands', "#company-tradenames .tradeName::text")
    loader.add_xpath(
        'banks',
        "//h3[contains(text(),'Banks')]/following-sibling::div//strong/text()",
    )

    # Import / export zones and countries.
    loader.add_css('export_area', "#exportZones>span:nth-of-type(2)::text")
    loader.add_css('import_area', "#importZones>span:nth-of-type(2)::text")
    loader.add_css('export_countries', "#exportCountries>span:nth-of-type(2)::text")
    loader.add_css('import_countries', "#importCountries>span:nth-of-type(2)::text")

    # Head count and turnover.
    loader.add_css('employees', ".employees.bloc .effectif p::text")
    loader.add_css(
        'turn_over',
        ".turnover.bloc li:nth-of-type(1)>p:nth-of-type(2)::text",
    )

    return loader.load_item()

и определение элементов (items):

class ProductItemLoader(scrapy.Item):
    """Item holding every field that the spider's ``parse`` method fills in.

    Despite the "Loader" suffix, this is a ``scrapy.Item`` subclass (a data
    container), not an item loader.
    """

    # Page the record came from and the section it was reached through.
    url = scrapy.Field()
    group_url = scrapy.Field()
    # Name, address and contact summary.
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    phone = scrapy.Field()
    summary = scrapy.Field()
    # Values read from the labelled table rows.
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    # Import / export zones and countries.
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    # Head count and turnover.
    employees = scrapy.Field()
    turn_over = scrapy.Field()

1 Ответ

0 голосов
/ 28 июня 2018

Чтобы сохранять разные поля в отдельные CSV-файлы, вам нужно разделить определение элементов (items) на несколько классов и направлять каждый класс в свой файл.

items.py:
import scrapy

class ProductItemLoader(scrapy.Item):
    """First of the two item classes; its fields go to their own CSV file.

    Despite the "Loader" suffix, this is a ``scrapy.Item`` subclass.

    NOTE(review): the ``parse`` method shown in the question loads all
    fields into this one item class; after this split the fields that moved
    to the second class presumably need their own item/loader — confirm in
    the spider.
    """

    url = scrapy.Field()
    group_url = scrapy.Field()
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    phone = scrapy.Field()
    summary = scrapy.Field()
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()

class EmployeeLoader(scrapy.Item):
    """Second of the two item classes; its fields go to a separate CSV file.

    Despite the "Loader" suffix, this is a ``scrapy.Item`` subclass.
    """

    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    employees = scrapy.Field()
    turn_over = scrapy.Field()

pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    """Return the class name of *item* as a string.

    The result (for example ``'ProductItemLoader'``) has the same form as
    the entries of the pipeline's ``fileNamesCsv`` list, so it can be
    compared against them and used as a dictionary key. Returning the class
    object itself would never compare equal to those strings.
    """
    return type(item).__name__

class YourSitePipelineHere(object):
    """Item pipeline that writes each item class to its own CSV file.

    One ``CsvItemExporter`` is opened per name in ``fileNamesCsv`` when the
    spider opens, and every processed item is routed to the exporter whose
    key equals the item's class name. Items of any other class pass through
    untouched.
    """

    # Names of the item classes handled here (as defined in items.py); each
    # name is also used to build the corresponding CSV file name.
    fileNamesCsv = ['ProductItemLoader','EmployeeLoader']

    # Columns written for each item class, keyed by the item class name.
    fieldsToExport = {
        'ProductItemLoader': [
            'url', 'group_url', 'name', 'streetAddress', 'addressCountry',
            'city', 'phone', 'summary', 'year', 'registry', 'legal_form',
            'vat',
        ],
        'EmployeeLoader': [
            'fax', 'website', 'brands', 'banks', 'import_area',
            'import_countries', 'export_area', 'export_countries',
            'employees', 'turn_over',
        ],
    }

    def __init__(self):
        # Open file handles and their exporters, both keyed by class name.
        self.files = {}
        self.exporters = {}
        # Open and close the CSV files together with the spider.
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one CSV file and exporter for every handled item class."""
        for name in self.fileNamesCsv:
            # NOTE(review): "/projec_name" looks like a placeholder prefix;
            # as written the files land in the filesystem root — replace it
            # with the real output location.
            csv_file = open("/projec_name"+name+'.csv','wb')
            # Register the handle immediately so spider_closed can close it
            # even if a later step in this loop fails.
            self.files[name] = csv_file

            exporter = CsvItemExporter(csv_file)
            fields = self.fieldsToExport.get(name)
            if fields is not None:
                # Copy so the class-level list is never shared or mutated.
                exporter.fields_to_export = list(fields)
            exporter.start_exporting()
            self.exporters[name] = exporter

    def spider_closed(self, spider):
        """Finish every exporter, then close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export *item* to the CSV matching its class name and return it.

        The item is always returned unchanged so that later pipelines still
        receive it, whether or not an exporter exists for its class.
        """
        # Exporters are keyed by class *name*, so look up the name string
        # rather than the class object.
        exporter = self.exporters.get(type(item).__name__)
        if exporter is not None:
            exporter.export_item(item)
        return item

NeilR

...