Логику перехода на следующую страницу нужно перенести в функцию `parse`, поскольку кнопка «следующая страница» находится на странице, URL-адрес которой указан в `start_urls`.
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
class MedicalSpider(scrapy.Spider):
    """Crawl the WebMD diabetes blog listing and scrape every linked article.

    The crawl starts at the listing page in ``start_urls``. ``parse`` handles
    each listing page (article links plus pagination) and ``parse_article``
    turns a single article page into an item with ``headline`` and ``body``.
    """

    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        """Handle one listing page.

        Yields a request per article link found under
        ``.posts-list-post-content`` (handled by ``parse_article``), then a
        request for the ``.next`` pagination link, if present, which is fed
        back into this method.
        """
        article_urls = response.css(
            '.posts-list-post-content a ::attr(href)'
        ).getall()
        self.logger.debug(
            "Found %d article links on %s", len(article_urls), response.url
        )
        for url in article_urls:
            # Skip empty href values; there is nothing to follow.
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        # Pagination lives on the listing page itself, so it is handled here
        # rather than in parse_article.
        next_page = response.css('.next a ::attr(href)').get()
        self.logger.debug("Next page link: %s", next_page)
        if next_page:
            # response.follow resolves relative hrefs against response.url.
            yield response.follow(url=next_page, callback=self.parse)

    def parse_article(self, response):
        """Extract the headline and body text from one article page.

        Yields a single dict with two keys:
            ``headline``: text of the ``h1`` inside ``.blog-header-container``
                (``None`` when the selector matches nothing).
            ``body``: text of every matched paragraph, each followed by a
                newline; an empty string when no paragraph matches.
        """
        headline = response.css('.blog-header-container h1::text').get()
        paragraphs = response.css(
            '.article-body .article-page section p::text'
        ).getall()
        # Join once instead of growing the string with += in a loop.
        body = "".join(f"{text}\n" for text in paragraphs)
        yield {
            'headline': headline,
            'body': body,
        }