Отправить в запросе cookie, сгенерированный JavaScript - PullRequest
0 голосов
/ 25 апреля 2020

Я делаю скрипт, который делает вид, что сканирует определенный URL. Я понял, что для получения какой-то информации необходим cookie (wmt.breakpoint), который уже должен быть установлен, но когда его нет, веб-страница создает его через JS с помощью этого сценария:

<script>
        // Bootstrap script: stores the detected layout breakpoint in the
        // "wmt.breakpoint" cookie, prefetches the stylesheets for that
        // breakpoint, and then either shows the "nocookies" element (cookie
        // could not be written) or navigates to the intended URL.
        var breakpoint;

        // Mobile/tablet devices with a screen narrower than 960 get the short
        // name of the breakpoint that wmtBreakpointDetection reports for the
        // larger screen dimension; everything else falls back to "d".
        if(wmtBreakpointDetection.isMobileOrTabletDevice() && !(window.screen.width >=960)) {
            breakpoint = wmtBreakpointDetection.getBreakPointByWidth(Math.max(window.screen.width, window.screen.height)).shortName;
        } else {
            breakpoint = "d";
        }

        var oneYearFromToday = new Date(new Date().setFullYear(new Date().getFullYear() + 1));

        // The attribute must be spelled "expires" and carry an HTTP date
        // (toUTCString()). An unknown attribute such as "expired" is ignored by
        // the browser, which silently turns this into a session cookie.
        document.cookie = "wmt.breakpoint=" + breakpoint + "; path=/; expires=" + oneYearFromToday.toUTCString();

        // Prefetch the global and shelf-page stylesheets for the breakpoint.
        var head = document.getElementsByTagName("head")[0];

        var globalCSSPreload = document.createElement("link");
        globalCSSPreload.rel = "prefetch";

        var shelfCSSPreload = document.createElement("link");
        shelfCSSPreload.rel = "prefetch";

        switch (breakpoint) {
            case "m":
                globalCSSPreload.href = '/assets/css/3423909a99dd600da028633d212cbb0a-global-m.min.css';
                shelfCSSPreload.href = '/assets/css/6e54519a22f49739c21efb1bc1b232b2-shelfPages-m.min.css';
                break;
            case "t":
                globalCSSPreload.href = '/assets/css/6177bd71898adeb2b41a859a28aa6830-global-t.min.css';
                shelfCSSPreload.href = '/assets/css/6dedcf9549c4c3d516802e99edd62280-shelfPages-t.min.css';
                break;
            case "d":
                globalCSSPreload.href = '/assets/css/2783efac6768fa0e558b57e984f95ec3-global-d.min.css';
                shelfCSSPreload.href = '/assets/css/03f5b9e01267340f0fd90169e0ab7389-shelfPages-d.min.css';
                break;
        }

        // Only attach the <link> elements when a stylesheet was selected;
        // an unrecognised breakpoint leaves href empty.
        if (globalCSSPreload.href) {
            head.appendChild(globalCSSPreload);
        }

        if (shelfCSSPreload.href) {
            head.appendChild(shelfCSSPreload);
        }

        // The target URL is embedded with HTML-escaped ampersands; decode them
        // before comparing or navigating.
        var intendedUrl = "/en/grocery/N-117?icid=home%20page_HP_Category_Tile_Grocery_WM".replace(/&amp;/g, "&");
        var shouldReload = document.location.hash
            && document.location.pathname + document.location.search == intendedUrl;

        // If the cookie written above cannot be read back, reveal the
        // "nocookies" element instead of navigating.
        if(document.cookie.indexOf("wmt.breakpoint=") == -1){
            document.getElementById("nocookies").style.display = 'block';
        } else if (shouldReload) {
            // If we are redirecting to a URL with an anchor (ie:
            // "/en/help/legal#TermsofUse" and the current path
            // matches the path in the request, use reload() as
            // replace will not work.
            document.location.reload(true);
        } else {
            document.location.replace(intendedUrl);
        }
    </script>

Я думаю, что проблему создает этот блок: почему-то document.cookie не может прочитать значение cookie, но я не знаю, как это исправить.

// Reveals the "nocookies" element when document.cookie contains no "wmt.breakpoint=" entry.
if(document.cookie.indexOf("wmt.breakpoint=") == -1){
            document.getElementById("nocookies").style.display = 'block';
        }

Я делаю запрос следующим образом:

class WalmartSpider(scrapy.Spider):
    """Spider that requests the walmart.ca grocery listing page.

    Every start request carries a ``wmt.breakpoint=d`` cookie, the value the
    page's own JavaScript falls back to when it does not detect a mobile or
    tablet device.
    """

    name = 'walmart'
    allowed_domains = (
        'walmart.ca',
    )

    start_urls = (
        'https://www.walmart.ca/en/grocery/N-117',
    )

    def start_requests(self):
        """Yield one request per start URL with the breakpoint cookie attached."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                # The cookie domain has to match the host being crawled
                # (see start_urls), otherwise it does not belong to that site.
                cookies=[{'name': 'wmt.breakpoint',
                          'value': 'd',
                          'domain': 'www.walmart.ca',
                          'path': '/'}]
            )

    def parse(self, response):
        """Log the raw response body for inspection."""
        self.log(response.text)

Отладочный вывод Scrapy выглядит следующим образом:

2020-04-24 21:13:04 [scrapy.utils.log] INFO: Scrapy 2.0.1 started (bot: app)
2020-04-24 21:13:04 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.8.2 (default, Feb 26 2020, 22:21:03) - [GCC 9.2.1 20200130], pyOpenSSL 19.1.0 (OpenSSL 1.1.1f  31 Mar 2020), cryptography 2.9, Platform Linux-5.5.16-1-MANJARO-x86_64-with-glibc2.2.5
2020-04-24 21:13:04 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-04-24 21:13:04 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'app',
 'COOKIES_DEBUG': True,
 'DOWNLOAD_DELAY': 5,
 'NEWSPIDER_MODULE': 'app.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['app.spiders'],
 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:75.0) Gecko/20100101 '
               'Firefox/75.0'}
2020-04-24 21:13:04 [scrapy.extensions.telnet] INFO: Telnet Password: 0bd54d8669ebb658
2020-04-24 21:13:04 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2020-04-24 21:13:04 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'app.middlewares.AppDownloaderMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-04-24 21:13:04 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-04-24 21:13:04 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-04-24 21:13:04 [scrapy.core.engine] INFO: Spider opened
2020-04-24 21:13:04 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-04-24 21:13:04 [walmart] INFO: Spider opened: walmart
2020-04-24 21:13:04 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
<GET https://www.walmart.ca/robots.txt>
2020-04-24 21:13:04 [scrapy.downloadermiddlewares.cookies] DEBUG: Received cookies from: <200 https://www.walmart.ca/robots.txt>
Set-Cookie: wmt.c=1; Path=/; Domain=walmart.ca; Expires=Sat, 02 May 2020 02:00:42 GMT; Max-Age=604800

Set-Cookie: ENV=ak-dfw-prod; Path=/; Secure; Domain=.walmart.ca; max-age=1200

Set-Cookie: TS01f4281b=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; Path=/; Secure

Set-Cookie: TS0175e29f=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; path=/; domain=walmart.ca; Secure

Set-Cookie: TS011fb5f6=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; path=/; domain=.walmart.ca; Secure

2020-04-24 21:13:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.walmart.ca/robots.txt> (referer: None)
<GET https://www.walmart.ca/en/grocery/N-117>
2020-04-24 21:13:04 [scrapy.downloadermiddlewares.cookies] DEBUG: Sending cookies to: <GET https://www.walmart.ca/en/grocery/N-117>
Cookie: TS01f4281b=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; wmt.breakpoint=d; wmt.c=1; ENV=ak-dfw-prod; TS0175e29f=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; TS011fb5f6=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a

2020-04-24 21:13:12 [scrapy.downloadermiddlewares.cookies] DEBUG: Received cookies from: <200 https://www.walmart.ca/en/grocery/N-117>
Set-Cookie: walmart.shippingPostalCode=P7B3Z7; Max-Age=31540000; Expires=Sun, 25 Apr 2021 03:19:52 GMT; Path=/

Set-Cookie: defaultNearestStoreId=3124; Max-Age=31540000; Expires=Sun, 25 Apr 2021 03:19:52 GMT; Path=/

Set-Cookie: originalHttpReferer=; Max-Age=900; Expires=Sat, 25 Apr 2020 02:28:12 GMT; Path=/

Set-Cookie: zone=9; Max-Age=5184000; Expires=Wed, 24 Jun 2020 02:13:12 GMT; Path=/

Set-Cookie: deliveryCatchment=3124; Max-Age=5184000; Expires=Wed, 24 Jun 2020 02:13:12 GMT; Path=/

Set-Cookie: walmart.csrf=a2562be8657290158154b75a; SameSite=Lax; Path=/

Set-Cookie: wmt.c=1; Path=/; Domain=walmart.ca; Expires=Sat, 02 May 2020 02:04:29 GMT; Max-Age=604800

Set-Cookie: ENV=ak-dfw-prod; Path=/; Secure; Domain=.walmart.ca; max-age=1200

Set-Cookie: TS01f4281b=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; Path=/; Secure

Set-Cookie: TS0175e29f=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; path=/; domain=walmart.ca; Secure

Set-Cookie: TS011fb5f6=0130aff2320ca7973bd8b952821ff10fe914bfdca4adb7e44a8733fe88d83ff69a5b7c5d3f004279d1312252206bcacd73087a503a; path=/; domain=.walmart.ca; Secure

Set-Cookie: userSegment=10-percent; expires=Mon, 25-May-2020 02:13:12 GMT; path=/; domain=.www.walmart.ca

Заранее спасибо за помощь, которую вы могли бы мне оказать.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...