You can't assign a separate allowed_domain to each link in start_urls. Instead, you have to filter the URLs yourself in the process_request method of a DownloaderMiddleware, using an allowed domain that the spider attaches to each request's meta. Here is the spider code:
import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True,
    }

    rules = (Rule(LinkExtractor(), follow=True),)

    def start_requests(self):
        # Each row pairs a start URL with the domain it is allowed to crawl.
        df = pd.read_csv("places.csv")
        for key, row in df.iterrows():
            yield Request(url=row['start_url'],
                          callback=self.parse_item,
                          meta={'allowed_domain': row['allowed_domain']})
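Note that the allowed_domain meta key is only set on the requests yielded from start_requests; the follow-up requests generated by the Rule won't carry it, so the middleware will let them through unfiltered. A minimal sketch of how the value could be copied over with the rule's process_request hook (assuming Scrapy 2.0+, where the hook receives both the extracted request and the response it came from; propagate_allowed_domain is a name I made up):

def propagate_allowed_domain(request, response):
    # Carry the allowed_domain of the page that produced the link over
    # to the newly extracted request, so the middleware can filter it too.
    request.meta['allowed_domain'] = response.meta.get('allowed_domain')
    return request

rules = (Rule(LinkExtractor(), follow=True,
              process_request=propagate_allowed_domain),)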
Here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest


class MySpiderDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # Reduce the request URL to its registered domain, e.g. "example.co.uk".
        site = tldextract.extract(request.url)
        site = "{}.{}".format(site.domain, site.suffix)
        # Use .get() so requests without the meta key don't raise a KeyError.
        allowed_domain = request.meta.get('allowed_domain')
        if allowed_domain and allowed_domain not in site:
            raise IgnoreRequest("Filtered offsite request to %s" % site)
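Raising IgnoreRequest from process_request makes Scrapy drop the request without logging an error, which is what you want here. If you'd like to sanity-check the domain matching outside of Scrapy, here is a standalone sketch of the same logic (is_allowed is just an illustrative helper, not part of your project):

import tldextract

def is_allowed(url, allowed_domain):
    # Same reduction the middleware applies to request.url.
    site = tldextract.extract(url)
    site = "{}.{}".format(site.domain, site.suffix)
    return allowed_domain in site

print(is_allowed("https://shop.example.com/page", "example.com"))   # True
print(is_allowed("https://tracker.example.net/ad", "example.com"))  # False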