При запуске я получаю две проблемы
Во-первых, элементу нужно поле:
Page = scrapy.Field()
страница https://medium.com/tag/python/archive/02/01
перенаправлена на https://medium.com/tag/python/archive
, но этот редирект блокируется настройкой
handle_httpstatus_list = [302]
После удаления handle_httpstatus_list
Я получаю данные с первой страницы
Результат (CSV)
Claps,Date,Name,Page,Publication,Read,Responses,Title
81K,,Daniel van Flymen,https://medium.com/tag/python/archive,,9 min read,383 responses,Learn Blockchains by Building One
25K,,Jonny Fox,https://medium.com/tag/python/archive,,6 min read,63 responses,Regex tutorial — A quick cheatsheet by examples
9.6K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,112 responses,"Building A Logistic Regression in Python, Step by Step"
5.8K,,Adi Bronshtein,https://medium.com/tag/python/archive,,9 min read,46 responses,Train/Test Split and Cross Validation in Python
7.8K,,Will Koehrsen,https://medium.com/tag/python/archive,,21 min read,42 responses,Random Forest in Python
7.2K,,Ted Petrou,https://medium.com/tag/python/archive,,24 min read,34 responses,Selecting Subsets of Data in Pandas: Part 1
11.1K,,Milo Spencer-Harper,https://medium.com/tag/python/archive,,6 min read,86 responses,How to build a simple neural network in 9 lines of Python code
5.2K,,Michael Galarnyk,https://medium.com/tag/python/archive,,8 min read,27 responses,PCA using Python (scikit-learn)
64K,,TK,https://medium.com/tag/python/archive,,11 min read,148 responses,Learning Python: From Zero to Hero
6.9K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,75 responses,An End-to-End Project on Time Series Analysis and Forecasting with Python
Код, который я использовал — всё в одном файле, без создания проекта:
import logging

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
from scrapy.spiders import CrawlSpider
from scrapy.utils.log import configure_logging
class MediumItem(scrapy.Item):
    """Container for one Medium article's metadata scraped from an archive page."""
    Title = scrapy.Field()        # article headline
    Name = scrapy.Field()         # author name
    Date = scrapy.Field()         # publication date
    Read = scrapy.Field()         # estimated reading time, e.g. "9 min read"
    Publication = scrapy.Field()  # hosting publication, if any
    Claps = scrapy.Field()        # clap count, e.g. "81K"
    Responses = scrapy.Field()    # response count, e.g. "383 responses"
    Page = scrapy.Field()         # URL of the archive page the item was found on
class DataSpider(CrawlSpider):
    """Scrape article metadata (title, author, claps, ...) from Medium tag archives.

    Yields one ``MediumItem`` per article card found on the archive page.
    """
    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    # Also mirror log records to stderr in addition to the log file.
    logging.getLogger().addHandler(logging.StreamHandler())
    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    # NOTE(review): handle_httpstatus_list = [302] must stay disabled — it would
    # stop Scrapy from following the redirect to the canonical archive URL.
    # handle_httpstatus_list = [302]

    def parse(self, response):
        """Extract one MediumItem per article card on the archive page."""
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            # Only cards with a "Read more" link point at a full article page.
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                # Each field keeps only the first matched value.
                l.default_output_processor = TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                # Fix: the Date field was declared but never populated; the card's
                # <time datetime="..."> attribute carries the publication date.
                l.add_xpath('Date', './/time/@datetime')
                l.add_css('Read', 'span::attr(title)')
                # Fix: the original class string fused "link--darkenlink--accent"
                # (missing space), so the Publication column was always empty.
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item()
# Run the spider as a standalone script — no Scrapy project scaffolding needed.
from scrapy.crawler import CrawlerProcess

settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # Export scraped items to a file; csv, json and xml are supported.
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
}
process = CrawlerProcess(settings)
process.crawl(DataSpider)
process.start()