I built a scraper for a job site that saves all the prospective job data to a CSV file and then to a MySQL database. For some reason, the scraper stops after pulling the jobs from the first city in the list. Here's what I mean:
City list code:
Cities = {
    'cities': ['washingtondc',
               'newyork',
               'sanfrancisco',
               '...',
               '...']
}
Scrapy spider code:
# -*- coding: utf-8 -*-
from city_list import Cities
import scrapy, os, csv, glob, pymysql.cursors

class JobsSpider(scrapy.Spider):
    name = 'jobs'

    c_list = Cities['cities']
    for c in c_list:
        print(f'Searching {c} for jobs...')
        allowed_domains = [f'{c}.jobsite.com']
        start_urls = [f'https://{c}.jobsite.com/search/jobs/']

    def parse(self, response):
        listings = response.xpath('//li[@class="listings-path"]')
        for listing in listings:
            date = listing.xpath('.//*[@class="date-path"]/@datetime').extract_first()
            link = listing.xpath('.//a[@class="link-path"]/@href').extract_first()
            text = listing.xpath('.//a[@class="text-path"]/text()').extract_first()
            yield scrapy.Request(link,
                                 callback=self.parse_listing,
                                 meta={'date': date,
                                       'link': link,
                                       'text': text})
        next_page_url = response.xpath('//a[text()="next-path "]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

    def parse_listing(self, response):
        date = response.meta['date']
        link = response.meta['link']
        text = response.meta['text']
        compensation = response.xpath('//*[@class="compensation-path"]/span[1]/b/text()').extract_first()
        employment_type = response.xpath('//*[@class="employment-type-path"]/span[2]/b/text()').extract_first()
        images = response.xpath('//*[@id="images-path"]//@src').extract()
        address = response.xpath('//*[@id="address-path"]/text()').extract()
        yield {'date': date,
               'link': link,
               'text': text,
               'compensation': compensation,
               'type': employment_type,
               'images': images,
               'address': address}

    def close(self, reason):
        # Pick the most recently created CSV (the one this run just wrote)
        csv_file = max(glob.iglob('*.csv'), key=os.path.getctime)
        conn = pymysql.connect(host='localhost',
                               user='root',
                               password='**********',
                               db='jobs_database',
                               charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)
        cur = conn.cursor()
        csv_data = csv.reader(open(csv_file))
        for row in csv_data:
            cur.execute('INSERT INTO jobs_table(date, link, text, compensation, type, images, address) '
                        'VALUES(%s, %s, %s, %s, %s, %s, %s)', row)
        conn.commit()
        conn.close()
        print("Done Importing!")
The scraper runs fine, but it stops after grabbing the jobs for washingtondc and then exits.
How do I solve this problem?
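My guess is that the for loop in the class body only runs once, when the class object is created, and rebinds allowed_domains and start_urls on every pass, so only a single city's values are left by the time the spider actually starts crawling. A standalone sketch of what I mean (Demo is just a placeholder, not my real spider):

class Demo:
    # The loop executes at class-creation time; each pass rebinds the
    # same class attribute, so only one assignment survives.
    for c in ['washingtondc', 'newyork', 'sanfrancisco']:
        start_urls = [f'https://{c}.jobsite.com/search/jobs/']

print(Demo.start_urls)  # a single city's URL, not all three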
UPDATE - I changed the code above to
class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = []
    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(self, *args, **kwargs)
        c_list = Cities['cities']
        for c in c_list:
            print(f'Searching {c} for jobs...')
            self.allowed_domains.append(f'{c}.jobsearch.com')
            self.start_urls.append(f'https://{c}.jobsearch.com/search/jobs/')

    def parse(self, response):
        ...
and now I get "RecursionError: maximum recursion depth exceeded while calling a Python object".
Here's the traceback:
Traceback (most recent call last):
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 1034, in emit
msg = self.format(record)
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 880, in format
return fmt.format(record)
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 619, in format
record.message = record.getMessage()
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 380, in getMessage
msg = msg % self.args
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
[Previous line repeated 479 more times]
RecursionError: maximum recursion depth exceeded while calling a Python object
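If I had to guess, the extra self I pass to super().__init__() is the problem: Scrapy's Spider.__init__ takes the spider name as its first argument, so name ends up being the spider object itself, and the spider's __str__ then formats self.name, which calls __str__ again, and so on. A standalone sketch of the pattern I suspect (Base and Child are placeholders, not the Scrapy classes):

class Base:
    def __init__(self, name=None):
        self.name = name

    def __str__(self):
        # %r on self.name calls repr() on whatever name holds
        return '<%s %r>' % (type(self).__name__, self.name)

    __repr__ = __str__

class Child(Base):
    def __init__(self, *args, **kwargs):
        # Bug: the explicit self is passed on as the parent's first
        # parameter, so self.name becomes the instance itself
        super().__init__(self, *args, **kwargs)

str(Child())  # RecursionError: maximum recursion depth exceeded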