Я новичок в веб-скрапинге и не могу получить точное HTML-содержимое страницы,
чтобы выделить из него нужные разделы.
Как получить точное HTML-содержимое страницы в Python с помощью scrapy?
Я пробовал использовать библиотеку requests и запросы scrapy,
но получить точное HTML-содержимое страницы не удалось.
Я хочу использовать scrapy в Spyder / Jupyter Notebook.
Я пробовал разные варианты запросов, приведённые ниже, но ни один из них не работает.
# My Code:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Download the auction category page and save its raw HTML to disk."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per start URL; each response goes to ``parse``."""
        # An explicit list states the intent; the original only iterated
        # correctly because a stray trailing comma turned the string into
        # a one-element tuple.
        urls = [
            'https://www.rbauction.com/construction?cid=3279191388',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``quotes-<page>.html`` and log the name."""
        # Second-to-last "/"-separated piece of the URL. For the start URL
        # above that piece is the host name, since the path has one segment.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        # Binary mode so the body is stored exactly as it was received.
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# Build a crawler process from the Scrapy settings returned by
# get_project_settings().
process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)
# start() blocks until the crawl has finished, so the trailing
# process.stop() of the original had nothing left to stop and was dropped.
# NOTE(review): the underlying Twisted reactor presumably cannot be started
# twice in one interpreter -- in Spyder/Jupyter restart the kernel before
# re-running this snippet; confirm against the Scrapy documentation.
process.start()
#
import requests
from scrapy.http import TextResponse
import scrapy
from scrapy.http import HtmlResponse

url = 'https://www.rbauction.com/construction?cid=3279191388'

# Neither scrapy.Request(url) nor HtmlResponse(url) downloads anything:
# a Request only describes a fetch for the Scrapy engine, and an
# HtmlResponse built from a URL alone has an empty body, so every XPath
# query on it returns []. Fetch the page explicitly and hand the bytes over.
page = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()

# No explicit encoding is passed, leaving detection to Scrapy.
a = HtmlResponse(url=url, body=page.content)

# NOTE(review): if this still returns [], the targeted section is probably
# rendered by JavaScript and absent from the raw HTML -- confirm by
# inspecting page.text.
a.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a').extract()
#
import requests
import json
from lxml import html
import requests
# Download the category page. The timeout keeps the call from hanging
# indefinitely when the server does not answer.
source = requests.get(
    "https://www.rbauction.com/construction?cid=3279191388",
    timeout=30,
)
# Fail loudly on a 4xx/5xx reply rather than parsing an error page as if it
# were the requested content.
source.raise_for_status()
# Parse the raw bytes into an lxml element tree for XPath queries.
tree = html.fromstring(source.content)
#
import urllib.request
# FancyURLopener is a legacy class, deprecated since Python 3.3 and absent
# from newer releases; build_opener() returns an OpenerDirector that offers
# the same .open(url) call.
opener = urllib.request.build_opener()
url = "https://www.rbauction.com/construction?cid=3279191388"
# The context manager closes the HTTP response once the body has been read;
# the original left the handle open.
with opener.open(url) as f:
    content = f.read()
#
import scrapy
class QuotesSpider(scrapy.Spider):
    """Follow the selected category link and emit the HTML of each target page."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per start URL; each response goes to ``parse``."""
        urls = [
            'https://www.rbauction.com/construction?cid=3279191388'
        ]
        for start_url in urls:
            yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        """Request every link matched by the XPath, handled by ``parse_item``."""
        # Select the @href attribute, not the <a> element: extracting the
        # element yields its whole markup, which urljoin() cannot turn into
        # a usable URL.
        link_xpath = (
            '//*[@id="rba--category-page"]'
            '/div[3]/div[2]/div[1]/dl/dd[6]/a/@href'
        )
        for href in response.xpath(link_xpath):
            # urljoin() resolves a relative link against the page URL.
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_item)

    def parse_item(self, response):
        """Emit the URL and decoded HTML of a followed page.

        The original referenced this callback without defining it, which
        raises AttributeError as soon as ``parse`` finds a link.
        """
        yield {'url': response.url, 'html': response.text}
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Build a crawler process from the Scrapy settings returned by
# get_project_settings().
process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)
# start() blocks until the crawl has finished, so the trailing
# process.stop() of the original had nothing left to stop and was dropped.
process.start()  # the script will block here until the crawling is finished