Почему requests получает данные, а Scrapy — нет при POST-запросе - PullRequest
0 голосов
/ 20 мая 2018

Я пытался отправить POST-запрос с помощью requests и Scrapy. Но сработал только requests.

Вот код для requests:

import requests

raw_headers = '''Host: www.lagou.com
Connection: keep-alive
Content-Length: 23
Pragma: no-cache
Cache-Control: no-cache
Origin: https://www.lagou.com
X-Anit-Forge-Code: 0
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Accept: application/json, text/javascript, */*; q=0.01
X-Requested-With: XMLHttpRequest
X-Anit-Forge-Token: None
Referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7
Cookie: WEBTJ-ID=20180504193617-1632af03c50d9-0cfacb3d784272-33657f07-1296000-1632af03c5140e; _ga=GA1.2.728186241.1525433778; user_trace_token=20180504193615-5aaf0300-4f8f-11e8-8357-525400f775ce; LGUID=20180504193615-5aaf077c-4f8f-11e8-8357-525400f775ce; X_HTTP_TOKEN=fe969bafb2950f67843324b14f8cdbd0; LG_LOGIN_USER_ID=b235225329cf5721682a4926b55de8de569941c43e9c38689b38d7edf024d8ec; _putrc=301CD314998957B0123F89F2B170EADC; JSESSIONID=ABAAABAAADEAAFI4F9C70F4CBB13D2411863A16D1676402; login=true; unick=%E8%AD%9A%E7%A8%8B; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.456971435.1526656087; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1525433777,1526687924; hasDeliver=23; LGSID=20180519203312-cb891735-5b60-11e8-875b-5254005c3644; gate_login_token=fed1eb04f3a29f2e9b0f531b1926635d38a7961a01993a97ad8e0ccdbda664dc; witkey_login_authToken="VPXRNHs8Vp69Ud0vSKA5x8nFJqLnHGMr14LXYnIGUY8i0PGWzL9wFfP5wowqwjPYiU1E97SBgJhetuVyJBJ/n3/I4TZXhwVfa4t9gF7LQSzDeEi29qE8Cs+TDSX+paY5py1ypde3mn+mcVsWn05vXUuEKYzfwnbucwhdyxudaTZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526736412; LGRID=20180519212649-48d3c64d-5b68-11e8-b9a7-525400f775ce; TG-TRACK-CODE=search_code; SEARCH_ID=beee3527273149129e1933113f165cbc'''

url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'

data = {'first':'true','kd':'python','pn':'1'}

def get_headers(raw_headers):
    """Parse a raw block of HTTP header lines into a dict.

    Each non-blank line is split on its first colon into a name and a
    value, and surrounding whitespace is removed from both. Blank lines
    (including a completely empty input) are skipped instead of crashing.
    If a name appears more than once, the last value wins.

    Raises:
        ValueError: if a non-blank line contains no colon.
    """
    headers = {}
    for line in raw_headers.split('\n'):
        if not line.strip():
            # Tolerate empty input and stray blank lines from copy-paste.
            continue
        # Split on the first colon only: values such as URLs contain more.
        key, value = line.split(':', 1)
        headers[key.strip()] = value.strip()
    return headers

# Parse the copied header text into a dict of header name -> value.
headers = get_headers(raw_headers)
# POST the fields in `data` to `url`, sending the parsed headers along.
r = requests.post(url, headers=headers, data=data)
# Show the response body as text.
print(r.text)

И вот код для scrapy:

import scrapy
import json

raw_headers = '''Host: www.lagou.com
Connection: keep-alive
Content-Length: 23
Pragma: no-cache
Cache-Control: no-cache
Origin: https://www.lagou.com
X-Anit-Forge-Code: 0
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Accept: application/json, text/javascript, */*; q=0.01
X-Requested-With: XMLHttpRequest
X-Anit-Forge-Token: None
Referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'''

raw_cookies = '''WEBTJ-ID=20180504193617-1632af03c50d9-0cfacb3d784272-33657f07-1296000-1632af03c5140e; _ga=GA1.2.728186241.1525433778; user_trace_token=20180504193615-5aaf0300-4f8f-11e8-8357-525400f775ce; LGUID=20180504193615-5aaf077c-4f8f-11e8-8357-525400f775ce; X_HTTP_TOKEN=fe969bafb2950f67843324b14f8cdbd0; LG_LOGIN_USER_ID=b235225329cf5721682a4926b55de8de569941c43e9c38689b38d7edf024d8ec; _putrc=301CD314998957B0123F89F2B170EADC; JSESSIONID=ABAAABAAADEAAFI4F9C70F4CBB13D2411863A16D1676402; login=true; unick=%E8%AD%9A%E7%A8%8B; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.456971435.1526656087; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1525433777,1526687924; hasDeliver=23; gate_login_token=fed1eb04f3a29f2e9b0f531b1926635d38a7961a01993a97ad8e0ccdbda664dc; witkey_login_authToken="VPXRNHs8Vp69Ud0vSKA5x8nFJqLnHGMr14LXYnIGUY8i0PGWzL9wFfP5wowqwjPYiU1E97SBgJhetuVyJBJ/n3/I4TZXhwVfa4t9gF7LQSzDeEi29qE8Cs+TDSX+paY5py1ypde3mn+mcVsWn05vXUuEKYzfwnbucwhdyxudaTZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180520073610-69150509-5bbd-11e8-8791-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_java%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526773014; LGRID=20180520073648-7fbe6b5d-5bbd-11e8-bafa-525400f775ce; SEARCH_ID=95a523e4cff74203bc730f8bb85db29f'''

def get_headers(raw_headers):
    """Turn raw "Name: value" header lines into a dict.

    Every non-blank line is split on its first colon; the name and the
    value are both stripped of surrounding whitespace. Blank lines and an
    empty input string are ignored rather than raising. A repeated name
    keeps its last value.

    Raises:
        ValueError: if a non-blank line has no colon.
    """
    headers = {}
    for line in raw_headers.split('\n'):
        if not line.strip():
            # Nothing to parse on an empty line (or for empty input).
            continue
        # maxsplit=1 keeps colons inside the value (e.g. "https://...").
        key, value = line.split(':', 1)
        headers[key.strip()] = value.strip()
    return headers

def get_cookies(raw_cookies):
    """Parse a raw ``Cookie`` header value into a dict of name -> value.

    Pairs are separated by ';' (the space after it is optional) and each
    pair is split on its first '=' only, so values that themselves contain
    '=' (e.g. base64 padding) stay intact. Empty segments, as produced by
    empty input or a trailing ';', are ignored.

    Raises:
        ValueError: if a non-empty segment contains no '='.
    """
    cookies = {}
    for pair in raw_cookies.split(';'):
        pair = pair.strip()
        if not pair:
            # Empty input or trailing separator: nothing to record.
            continue
        key, value = pair.split('=', 1)  # only split the first "="
        cookies[key.strip()] = value.strip()
    return cookies

class LagouSpider(scrapy.Spider):
    """Spider that POSTs a form to lagou.com's positionAjax.json endpoint."""

    name = "lagou"

    def start_requests(self):
        """Yield one form POST per endpoint with the module-level headers and cookies."""
        endpoints = (
            'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false',
        )
        for endpoint in endpoints:
            # Headers and cookies are re-parsed for every request.
            request_headers = get_headers(raw_headers)
            request_cookies = get_cookies(raw_cookies)
            form_fields = {'first': 'true', 'kd': 'python', 'pn': '1'}
            yield scrapy.FormRequest(
                url=endpoint,
                method='POST',
                headers=request_headers,
                cookies=request_cookies,
                formdata=form_fields,
                callback=self.parse,
            )

    def parse(self, response):
        """Print the response body as text."""
        body_text = response.text
        print(body_text)

При тестировании через requests я получаю правильный ответ в формате JSON, в то время как Scrapy получил вот это:

2018-05-20 07:57:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.lagou.com/lagouhtml/a4.html?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false> from <POST https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false>
2018-05-20 07:46:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.lagou.com/lagouhtml/a4.html?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false> (referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=)
    ****************************************
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title></title>
        <style>
            * {
                margin: 0;
                padding: 0;
            }
            body {
                font-family: "Hiragino Sans GB", "Microsoft Yahei", "SimSun", Arial, "Helvetica Neue", Helvetica;
                background: #f8f9fc;                
            }
            .i_error {
                position: relative;
                /*width: 654px;*/
                width: 34.0625%;
                /*height: 467px;*/
                margin: 67px auto 0;    
                /*background: url(/lagouhtml/blocked_404.png) 0 0 no-repeat;*/
            }
            .i_logo {
                position: absolute;
                /*top: 116px;*/
                top: 24.8394%;
                /*left: 68px;*/
                left: 9.785933%;
                /*width: 110px;*/
                width: 16.819572%;
                /*height: 41px;*/
                /*background: url(/lagouhtml/lagou_logo.png) 0 0 no-repeat;*/
            }
            .tip {
                margin-top: 47px;
                font-size: 24px;
                line-height: 24px;
                text-align: center;
                color: #333;
            }
            .btn {
                margin-top: 30px;
                text-align: center;
            }
            .btn a {
                padding: 0 18px;
                font-size: 16px;
                line-height: 16px;
                text-decoration: none;
                color: #777;
            }
            .btn a + a {
                border-left: 1px solid #d5dadf;
            }
            .btn a:hover {
                color: #00b38a;
                /*text-decoration: underline;*/
            }
            .btn .green {
                color: #00b38a;
            }
        </style>
    </head>
    <body>
        <div class="i_error">
            <img src="/lagouhtml/blocked_404.png" alt="404" width="100%">
            <div class="i_logo"><img src="/lagouhtml/lagou_logo.png" alt="logo" width="100%"></div>
        </div>  
        <p class="tip" id="tipEle">啊哦,出错了,您访问的链接不存在!</p>
        <p class="btn"><a class="green" href="javascript:history.go(-1);">返回上页</a><a href="http://www.lagou.com">返回首页</a></p>

        <script type="text/javascript">
            var body = document.getElementsByTagName( 'body' )[0];
            var screenWidth = body.offsetWidth;
            if ( screenWidth <= 1366 ) {
                document.getElementById( 'tipEle' ).style.fontSize = "18px";
            }
        </script>
    <script type="text/javascript" src="https://www.lagou.com/upload/oss.js"></script></body>
    </html>

По сути, это страница 404 после перенаправления 302. В моем файле settings.py я уже установил:
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True

Я также пытался отправить куки вместе с заголовками, но это тоже не сработало. Затем я попытался поместить заголовки в настройку DEFAULT_REQUEST_HEADERS. Всё равно безуспешно.

Интересно, есть ли различия между этими двумя запросами? Как мне найти эти различия?

Буду благодарен за любую помощь!

1 Ответ

0 голосов
/ 20 мая 2018

Вы должны понимать, что Scrapy (как и requests) генерирует некоторые заголовки за вас.

import scrapy
import json

raw_headers = '''Content-Type: application/x-www-form-urlencoded; charset=UTF-8
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36
Referer: https://www.lagou.com/jobs/list_java?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput='''

raw_cookies = '''WEBTJ-ID=20180504193617-1632af03c50d9-0cfacb3d784272-33657f07-1296000-1632af03c5140e; _ga=GA1.2.728186241.1525433778; user_trace_token=20180504193615-5aaf0300-4f8f-11e8-8357-525400f775ce; LGUID=20180504193615-5aaf077c-4f8f-11e8-8357-525400f775ce; X_HTTP_TOKEN=fe969bafb2950f67843324b14f8cdbd0; LG_LOGIN_USER_ID=b235225329cf5721682a4926b55de8de569941c43e9c38689b38d7edf024d8ec; _putrc=301CD314998957B0123F89F2B170EADC; JSESSIONID=ABAAABAAADEAAFI4F9C70F4CBB13D2411863A16D1676402; login=true; unick=%E8%AD%9A%E7%A8%8B; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.456971435.1526656087; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1525433777,1526687924; hasDeliver=23; gate_login_token=fed1eb04f3a29f2e9b0f531b1926635d38a7961a01993a97ad8e0ccdbda664dc; witkey_login_authToken="VPXRNHs8Vp69Ud0vSKA5x8nFJqLnHGMr14LXYnIGUY8i0PGWzL9wFfP5wowqwjPYiU1E97SBgJhetuVyJBJ/n3/I4TZXhwVfa4t9gF7LQSzDeEi29qE8Cs+TDSX+paY5py1ypde3mn+mcVsWn05vXUuEKYzfwnbucwhdyxudaTZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180520073610-69150509-5bbd-11e8-8791-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_java%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526773014; LGRID=20180520073648-7fbe6b5d-5bbd-11e8-bafa-525400f775ce; SEARCH_ID=95a523e4cff74203bc730f8bb85db29f'''

def get_headers(raw_headers):
    """Build a header dict from newline-separated "Name: value" text.

    Non-blank lines are split on the first colon, and both the name and
    the value have surrounding whitespace removed. Blank lines, and an
    empty input string, are skipped so they no longer raise. When a name
    repeats, the later value replaces the earlier one.

    Raises:
        ValueError: if a non-blank line contains no colon.
    """
    headers = {}
    for line in raw_headers.split('\n'):
        if not line.strip():
            # Skip blank lines so pasted text with gaps still parses.
            continue
        # Only the first colon separates name from value.
        key, value = line.split(':', 1)
        headers[key.strip()] = value.strip()
    return headers

def get_cookies(raw_cookies):
    """Convert a raw ``Cookie`` header string into a name -> value dict.

    The string is split on ';' (with or without a following space), and
    each pair is split on its first '=' so that '=' characters inside the
    value are preserved. Empty segments, from empty input or a trailing
    ';', are skipped.

    Raises:
        ValueError: if a non-empty segment contains no '='.
    """
    cookies = {}
    for pair in raw_cookies.split(';'):
        pair = pair.strip()
        if not pair:
            # Ignore empties left by empty input or a dangling separator.
            continue
        key, value = pair.split('=', 1)  # only split the first "="
        cookies[key.strip()] = value.strip()
    return cookies

class LagouSpider(scrapy.Spider):
    """Spider that submits a form POST to lagou.com's positionAjax.json endpoint."""

    name = "lagou"

    def start_requests(self):
        """Yield a FormRequest per target URL, parsing headers and cookies once."""
        targets = [
            'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false',
        ]

        # Parsed a single time; every yielded request receives the same dicts.
        parsed_headers = get_headers(raw_headers)
        parsed_cookies = get_cookies(raw_cookies)
        form_fields = {'first': 'true', 'kd': 'python', 'pn': '1'}

        for target in targets:
            yield scrapy.FormRequest(
                url=target,
                method='POST',
                headers=parsed_headers,
                cookies=parsed_cookies,
                formdata=form_fields,
                callback=self.parse,
            )

    def parse(self, response):
        """Print the response text encoded as UTF-8 bytes."""
        # To see the headers attached to the request, print
        # response.request.headers here.
        encoded_body = response.text.encode('utf-8')
        print(encoded_body)
...