Я пытался отправить POST-запрос с помощью requests и Scrapy. Но сработал только вариант с requests.
Вот код с использованием requests:
import requests
# Raw HTTP request headers, one "Name: value" pair per line (the Cookie header
# is included here as the last line). Parsed into a dict by get_headers().
# NOTE(review): Content-Length is hard-coded to 23, while the urlencoded form
# of `data` below ("first=true&kd=python&pn=1") is 25 bytes -- presumably the
# headers were captured for a different search; confirm the HTTP client
# recomputes this header instead of sending the copied value.
# NOTE(review): the Cookie line looks like it carries real session/login
# tokens -- confirm these are not meant to be shared or committed.
raw_headers = '''Host: www.lagou.com
Connection: keep-alive
Content-Length: 23
Pragma: no-cache
Cache-Control: no-cache
Origin: https://www.lagou.com
X-Anit-Forge-Code: 0
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Accept: application/json, text/javascript, */*; q=0.01
X-Requested-With: XMLHttpRequest
X-Anit-Forge-Token: None
Referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7
Cookie: WEBTJ-ID=20180504193617-1632af03c50d9-0cfacb3d784272-33657f07-1296000-1632af03c5140e; _ga=GA1.2.728186241.1525433778; user_trace_token=20180504193615-5aaf0300-4f8f-11e8-8357-525400f775ce; LGUID=20180504193615-5aaf077c-4f8f-11e8-8357-525400f775ce; X_HTTP_TOKEN=fe969bafb2950f67843324b14f8cdbd0; LG_LOGIN_USER_ID=b235225329cf5721682a4926b55de8de569941c43e9c38689b38d7edf024d8ec; _putrc=301CD314998957B0123F89F2B170EADC; JSESSIONID=ABAAABAAADEAAFI4F9C70F4CBB13D2411863A16D1676402; login=true; unick=%E8%AD%9A%E7%A8%8B; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.456971435.1526656087; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1525433777,1526687924; hasDeliver=23; LGSID=20180519203312-cb891735-5b60-11e8-875b-5254005c3644; gate_login_token=fed1eb04f3a29f2e9b0f531b1926635d38a7961a01993a97ad8e0ccdbda664dc; witkey_login_authToken="VPXRNHs8Vp69Ud0vSKA5x8nFJqLnHGMr14LXYnIGUY8i0PGWzL9wFfP5wowqwjPYiU1E97SBgJhetuVyJBJ/n3/I4TZXhwVfa4t9gF7LQSzDeEi29qE8Cs+TDSX+paY5py1ypde3mn+mcVsWn05vXUuEKYzfwnbucwhdyxudaTZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526736412; LGRID=20180519212649-48d3c64d-5b68-11e8-b9a7-525400f775ce; TG-TRACK-CODE=search_code; SEARCH_ID=beee3527273149129e1933113f165cbc'''
# Endpoint that receives the POST; the query string carries the
# percent-encoded city and the needAddtionalResult flag.
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
# Form fields sent as the POST body.
data = {'first':'true','kd':'python','pn':'1'}
def get_headers(raw_headers):
    """Parse a raw block of HTTP headers into a dict.

    Args:
        raw_headers: Text with one ``Name: value`` pair per line. Blank
            lines (for example a trailing newline in pasted text) are
            ignored.

    Returns:
        dict mapping each header name to its value, both stripped of
        surrounding whitespace. If a name occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    headers = {}
    for line in raw_headers.split('\n'):
        line = line.strip()
        if not line:
            # Nothing to parse; avoids failing on leading/trailing newlines.
            continue
        # Split on the first ":" only, so values such as URLs stay intact.
        key, sep, value = line.partition(':')
        if not sep:
            raise ValueError('malformed header line: %r' % line)
        headers[key.strip()] = value.strip()
    return headers
# Turn the pasted header text into the dict passed to requests.
headers = get_headers(raw_headers)
# POST the form fields in `data` to `url` with the parsed headers.
r = requests.post(url, headers=headers, data=data)
# Print the response body as text.
print(r.text)
А вот код для Scrapy:
import scrapy
import json
# Raw HTTP request headers, one "Name: value" pair per line, without the
# Cookie header (cookies are kept separately in raw_cookies below).
# Parsed into a dict by get_headers().
# NOTE(review): Content-Length is hard-coded to 23, while the form fields the
# spider sends ("first=true&kd=python&pn=1" when urlencoded) are 25 bytes --
# presumably the headers were captured for a different search; confirm the
# copied value is not sent alongside the real body length.
raw_headers = '''Host: www.lagou.com
Connection: keep-alive
Content-Length: 23
Pragma: no-cache
Cache-Control: no-cache
Origin: https://www.lagou.com
X-Anit-Forge-Code: 0
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Accept: application/json, text/javascript, */*; q=0.01
X-Requested-With: XMLHttpRequest
X-Anit-Forge-Token: None
Referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'''
# Cookie header value: "name=value" pairs separated by "; ".
# Parsed into a dict by get_cookies().
# NOTE(review): looks like it carries real session/login tokens -- confirm
# these are not meant to be shared or committed.
raw_cookies = '''WEBTJ-ID=20180504193617-1632af03c50d9-0cfacb3d784272-33657f07-1296000-1632af03c5140e; _ga=GA1.2.728186241.1525433778; user_trace_token=20180504193615-5aaf0300-4f8f-11e8-8357-525400f775ce; LGUID=20180504193615-5aaf077c-4f8f-11e8-8357-525400f775ce; X_HTTP_TOKEN=fe969bafb2950f67843324b14f8cdbd0; LG_LOGIN_USER_ID=b235225329cf5721682a4926b55de8de569941c43e9c38689b38d7edf024d8ec; _putrc=301CD314998957B0123F89F2B170EADC; JSESSIONID=ABAAABAAADEAAFI4F9C70F4CBB13D2411863A16D1676402; login=true; unick=%E8%AD%9A%E7%A8%8B; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.456971435.1526656087; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1525433777,1526687924; hasDeliver=23; gate_login_token=fed1eb04f3a29f2e9b0f531b1926635d38a7961a01993a97ad8e0ccdbda664dc; witkey_login_authToken="VPXRNHs8Vp69Ud0vSKA5x8nFJqLnHGMr14LXYnIGUY8i0PGWzL9wFfP5wowqwjPYiU1E97SBgJhetuVyJBJ/n3/I4TZXhwVfa4t9gF7LQSzDeEi29qE8Cs+TDSX+paY5py1ypde3mn+mcVsWn05vXUuEKYzfwnbucwhdyxudaTZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180520073610-69150509-5bbd-11e8-8791-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_java%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526773014; LGRID=20180520073648-7fbe6b5d-5bbd-11e8-bafa-525400f775ce; SEARCH_ID=95a523e4cff74203bc730f8bb85db29f'''
def get_headers(raw_headers):
    """Parse a raw block of HTTP headers into a dict.

    Args:
        raw_headers: Text with one ``Name: value`` pair per line. Blank
            lines (for example a trailing newline in pasted text) are
            ignored.

    Returns:
        dict mapping each header name to its value, both stripped of
        surrounding whitespace. If a name occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    headers = {}
    for line in raw_headers.split('\n'):
        line = line.strip()
        if not line:
            # Nothing to parse; avoids failing on leading/trailing newlines.
            continue
        # Split on the first ":" only, so values such as URLs stay intact.
        key, sep, value = line.partition(':')
        if not sep:
            raise ValueError('malformed header line: %r' % line)
        headers[key.strip()] = value.strip()
    return headers
def get_cookies(raw_cookies):
    """Parse a ``Cookie`` header value into a dict.

    Args:
        raw_cookies: Text of ``name=value`` pairs separated by ``;``
            (whitespace around the separator is optional). Empty
            fragments, such as the one left by a trailing ``;``, are
            ignored.

    Returns:
        dict mapping each cookie name to its value, both stripped of
        surrounding whitespace. An empty input yields an empty dict. If a
        name occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-empty fragment contains no ``=``.
    """
    cookies = {}
    for pair in raw_cookies.split(';'):
        pair = pair.strip()
        if not pair:
            continue
        # Split on the first "=" only: values may themselves contain "="
        # (for example base64 padding).
        key, sep, value = pair.partition('=')
        if not sep:
            raise ValueError('malformed cookie pair: %r' % pair)
        cookies[key.strip()] = value.strip()
    return cookies
class LagouSpider(scrapy.Spider):
    """Spider that POSTs a job-search form to the positionAjax endpoint."""

    name = "lagou"

    def start_requests(self):
        """Yield one form POST per search URL.

        The request carries the headers parsed from ``raw_headers`` (minus
        ``Content-Length``, see below), the cookies parsed from
        ``raw_cookies`` and the search form fields.
        """
        urls = [
            'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
        ]
        # raw_headers contains a Content-Length copied from a captured
        # request; the body encoded from `formdata` below has its own length
        # (25 bytes for these fields), so forwarding the copied value would
        # contradict the real body. Drop it and leave the length to the
        # HTTP layer.
        headers = {
            key: value
            for key, value in get_headers(raw_headers).items()
            if key.lower() != 'content-length'
        }
        for url in urls:
            yield scrapy.FormRequest(
                url=url,
                method='POST',
                headers=headers,
                cookies=get_cookies(raw_cookies),
                formdata={'first': 'true', 'kd': 'python', 'pn': '1'},
                callback=self.parse,
            )

    def parse(self, response):
        """Print the body of the response as text."""
        print(response.text)
После проверки оказалось, что через requests я получаю правильный ответ в формате JSON, тогда как Scrapy получил вот это:
2018-05-20 07:57:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.lagou.com/lagouhtml/a4.html?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false> from <POST https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false>
2018-05-20 07:46:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.lagou.com/lagouhtml/a4.html?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false> (referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=)
****************************************
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
body {
font-family: "Hiragino Sans GB", "Microsoft Yahei", "SimSun", Arial, "Helvetica Neue", Helvetica;
background: #f8f9fc;
}
.i_error {
position: relative;
/*width: 654px;*/
width: 34.0625%;
/*height: 467px;*/
margin: 67px auto 0;
/*background: url(/lagouhtml/blocked_404.png) 0 0 no-repeat;*/
}
.i_logo {
position: absolute;
/*top: 116px;*/
top: 24.8394%;
/*left: 68px;*/
left: 9.785933%;
/*width: 110px;*/
width: 16.819572%;
/*height: 41px;*/
/*background: url(/lagouhtml/lagou_logo.png) 0 0 no-repeat;*/
}
.tip {
margin-top: 47px;
font-size: 24px;
line-height: 24px;
text-align: center;
color: #333;
}
.btn {
margin-top: 30px;
text-align: center;
}
.btn a {
padding: 0 18px;
font-size: 16px;
line-height: 16px;
text-decoration: none;
color: #777;
}
.btn a + a {
border-left: 1px solid #d5dadf;
}
.btn a:hover {
color: #00b38a;
/*text-decoration: underline;*/
}
.btn .green {
color: #00b38a;
}
</style>
</head>
<body>
<div class="i_error">
<img src="/lagouhtml/blocked_404.png" alt="404" width="100%">
<div class="i_logo"><img src="/lagouhtml/lagou_logo.png" alt="logo" width="100%"></div>
</div>
<p class="tip" id="tipEle">啊哦,出错了,您访问的链接不存在!</p>
<p class="btn"><a class="green" href="javascript:history.go(-1);">返回上页</a><a href="http://www.lagou.com">返回首页</a></p>
<script type="text/javascript">
var body = document.getElementsByTagName( 'body' )[0];
var screenWidth = body.offsetWidth;
if ( screenWidth <= 1366 ) {
document.getElementById( 'tipEle' ).style.fontSize = "18px";
}
</script>
<script type="text/javascript" src="https://www.lagou.com/upload/oss.js"></script></body>
</html>
По сути, это страница 404 после перенаправления 302. В файле settings.py я уже установил:
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True
Я также пытался отправить куки вместе с заголовками, но это тоже не сработало. Затем я попробовал поместить заголовки в settings.DEFAULT_REQUEST_HEADERS. По-прежнему безуспешно.
Интересно, есть ли различия между этими двумя запросами? Как мне их найти?
Буду благодарен за любую помощь!