У меня есть проект Scrapy, подключенный к проекту Django, и все работает нормально (т.е. когда я запускаю свой скребок, я могу сохранять элементы в БД).
Я пытаюсь добавить скребок для изображений в свой проект и не могу заставить его работать.Я могу заставить скребок для изображений Scrapy работать самостоятельно, но не при подключении к проекту Django
Я получаю следующую ошибку:
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/scrapy/pipelines/media.py", line 79, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/scrapy/pipelines/images.py", line 155, in get_media_requests
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/scrapy/pipelines/images.py", line 155, in <listcomp>
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/Users/junaid/Desktop/clscraper2/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 62, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
Вот мой проект:
Models.py
class atl_sale_listing(models.Model):
metro_area = models.CharField(max_length=40, null=False, blank=False)
listing_id = models.CharField(max_length=250, null=False, blank=False, unique=True) #must be unique
url = models.CharField(max_length=450, null=True, blank=True)
status = models.CharField(max_length=25, null=True, blank=True)
price = models.IntegerField(null=True, blank=True)
tax = models.FloatField(null=True, blank=True)
Items.py - Примечание. Я добавляю поле image_urls и изображения в объект элемента django
import scrapy
from scrapy_djangoitem import DjangoItem
from realestate_app.models import atl_sale_listing
class AtlSaleListingItem(DjangoItem):
django_model = atl_sale_listing
image_urls = scrapy.Field() #added
images = scrapy.Field() #added
spider.py
import scrapy
from re_scraper.items import AtlSaleListingItem
from scrapy.loader import ItemLoader
class AtlListings2Spider(scrapy.Spider):
name = "atl_buy_testing"
allowed_domains = ["www.something.com"]
start_urls = ['www.something.com/something2',
] #specify the filter in the url
def parse(self, response):
listings = response.xpath('//div[@class="cardone "]')
order = 1
for listing in listings:
url = listing.xpath('.//a/@href').extract_first()
yield scrapy.Request(url,
callback=self.parse_listing)
def parse_listing(self, response):
status = response.xpath('//*[@class="text-orange"]/text()').extract_first()
price = response.xpath('//*[@class="price"]/text()').extract_first()
image_urls = response.xpath('//img/@data-img')[0].extract() #added image field here
yield AtlSaleListingItem(
status = status,
price = price,
image_urls = image_urls,
)
settings.py
from random import random
import os
import sys
DJANGO_PROJECT_PATH = os.path.dirname(os.path.abspath(__file__))
DJANGO_SETTINGS_MODULE = 'realestate.settings'
sys.path.append(os.path.dirname(os.path.abspath('.')))
os.environ['DJANGO_SETTINGS_MODULE'] = 'realestate.settings'
import django
django.setup()
BOT_NAME = 're_scraper'
SPIDER_MODULES = ['re_scraper.spiders']
NEWSPIDER_MODULE = 're_scraper.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
're_scraper.pipelines.AtlListingPipeline': 5,
'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/Users/user1/desktop/movoto_images'
Любая помощь здесь будет высоко ценится