Я пытаюсь спарсить сайт, который отрисовывает все свои страницы с помощью JavaScript: https://www.jobteaser.com/en/job-offers
Изучив запросы с помощью инструментов разработчика в браузере, я обнаружил, что весь нужный мне контент загружается через AJAX в формате JSON.
Файл, возвращающий содержимое
Итак, я написал следующего паука, чтобы получить результаты конкретного поискового запроса:
import scrapy
from scrapy import Request
import json
class JobteaserSpider(scrapy.Spider):
    """Spider that loads a JobTeaser search page, then POSTs a JSON query batch.

    The start URL is fetched first; its response triggers a single POST to the
    Algolia ``/1/indexes/*/queries`` endpoint carrying ten query entries for
    the ``job_offers`` index. The raw body of that POST's response is emitted
    as the only item.
    """

    name = "jobteaser"
    start_urls = [
        'https://www.jobteaser.com/fr/job-offers?q%3Dbusiness%20analyst%26contract%3Dstage%2Cinternship%2Cwerkstudent%26location%3DFrance..France%26locale%3Dfr%2Cen',
    ]

    def parse(self, response):
        """Yield the POST request that carries the batched search queries.

        ``response`` (the start page) is not inspected; it only serves as the
        trigger for issuing the API request.
        """
        api_key = "..."
        index_name = "job_offers"

        # One pre-encoded parameter string per query entry, in the order they
        # are sent. They differ only in the ``facets`` value and, for the
        # contract / locale / location entries, in which facet filter is left
        # out of ``facetFilters``.
        query_strings = (
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=*&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=abroad_only&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=company_business_type&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=company_sectors&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=contract_duration&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=contract&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%5D&hitsPerPage=20&page=0&facets=locale&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=location&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=position_category&distinct=true&facetingAfterDistinct=true",
            "query=business%20analyst&facetFilters=%5B%5B%22contract%3Astage%22%2C%22contract%3Ainternship%22%2C%22contract%3Awerkstudent%22%5D%2C%5B%22location%3AFrance%22%5D%2C%5B%22locale%3Afr%22%2C%22locale%3Aen%22%5D%5D&hitsPerPage=20&page=0&facets=start_date&distinct=true&facetingAfterDistinct=true",
        )

        # Key order ("requests" first, then "apiKey") is kept so the
        # serialized JSON body is the same as before.
        payload = {
            "requests": [
                {"indexName": index_name, "params": query_string}
                for query_string in query_strings
            ],
            "apiKey": api_key,
        }

        yield scrapy.Request(
            url="https://9vcp793ivh-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.30.0&x-algolia-application-id=9VCP793IVH",
            method='POST',
            body=json.dumps(payload),
            headers={'Content-Type': 'application/json'},
            callback=self.parse_internship,
        )

    def parse_internship(self, response):
        """Emit the raw response body as a single item under the key ``"E"``."""
        yield {"E": response.body}
Заголовки также отправляются в формате JSON. Значение `USER_AGENT` было изменено, а `ROBOTSTXT_OBEY` установлено в `False`.
Несмотря на это, я получаю следующую ошибку:
DEBUG: Crawled (200) <GET https://www.jobteaser.com/fr/job-offers?q%3Dbusiness%20analyst%26contract%3Dstage%2Cinternship%2Cwerkstudent%26location%3DFrance..France%26locale%3Dfr%2Cen> (referer: None)
DEBUG: Crawled (400) <POST https://9vcp793ivh-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.30.0&x-algolia-application-id=9VCP793IVH> (referer: https://www.jobteaser.com/)
INFO: Ignoring response <400 https://9vcp793ivh-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.30.0&x-algolia-application-id=9VCP793IVH>: HTTP status code is not handled or not allowed
INFO: Closing spider (finished)
Возможно, неверен URL, на который отправляется запрос, но даже после тщательного анализа исходного запроса мне не удалось найти правильный.
Спасибо!