Веб-скрейпинг с помощью HtmlUnit 2.35.0 - PullRequest
1 голос
/ 06 июля 2019

Я пишу парсер, и одна страница загружается не полностью (выполняются не все скрипты). Если открыть страницу (https://hh.ru/employer/negotiations/change_topic?r=5598e4e9000318fe590000bde1526e666d5968) в браузере, всё работает, но в HtmlUnit некоторые скрипты, как мне кажется, не загружаются. На странице, загруженной в Firefox, кнопка активна (enabled). На той же странице, загруженной через HtmlUnit, у кнопки установлен атрибут disabled, поэтому я не могу отправить форму (даже если удалить этот атрибут, форма отправляется, но запрос не срабатывает). Я не могу понять, почему страница в HtmlUnit не работает.

Мои настройки:

// Builds an HtmlUnit client that emulates Firefox 60, loads `url` with
// browser-like request headers, and then waits for the page's background
// JavaScript to settle before the page is used.
WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60);
webClient.getCookieManager().setCookiesEnabled(true);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setTimeout(35000);
webClient.getOptions().setUseInsecureSSL(true);
webClient.getOptions().setRedirectEnabled(true);

// Overcome problems in JS: do not abort the page load on script errors or
// failing HTTP status codes, and keep CSS parsing problems out of the log.
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setPrintContentOnFailingStatusCode(false);
webClient.setCssErrorHandler(new SilentCssErrorHandler());
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
// NOTE(review): every other call here uses the local `webClient`; confirm that
// getWebClient() returns this same instance, otherwise the alert handler is
// installed on a different client.
getWebClient().setAlertHandler(new CollectingAlertHandler(new ArrayList<>()));
webClient.getOptions().setCssEnabled(true);

WebRequest requestSettings = new WebRequest(url, HttpMethod.GET);
requestSettings.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
requestSettings.setAdditionalHeader("Accept-Encoding", "gzip, deflate, br");
requestSettings.setAdditionalHeader("Accept-Language", "en-US,en;q=0.9");
requestSettings.setAdditionalHeader("Connection", "keep-alive");
requestSettings.setAdditionalHeader("Host", "hh.ru");
requestSettings.setAdditionalHeader("TE", "Trailers");
requestSettings.setAdditionalHeader("Upgrade-Insecure-Requests", "1");
// NOTE(review): this header advertises Chrome 65 while the client emulates
// FIREFOX_60; confirm the mismatch is intentional, since the server may
// render the page differently for the two browsers.
requestSettings.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");

webClient.getPage(requestSettings);

// Wait for background JavaScript only AFTER the page has been requested:
// before getPage() there are no scheduled jobs, so these calls would return
// immediately and the page would be used before its scripts have run.
webClient.waitForBackgroundJavaScript(100000);
webClient.waitForBackgroundJavaScriptStartingBefore(100000);

Когда я пытаюсь вызвать myButton.click(), я получаю эту ошибку:

12:57:25,905 [Thread-7] ERROR com.gargoylesoftware.htmlunit.javascript.DefaultJavaScriptErrorListener - Error during JavaScript execution
======= EXCEPTION START ========
Exception class=[net.sourceforge.htmlunit.corejs.javascript.EvaluatorException]
com.gargoylesoftware.htmlunit.ScriptException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:885)
    ...
    at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:859)
    at com.headhunter.hhelper.Headhunter.invite(Headhunter.java:233)
    at com.headhunter.hhelper.Headhunter.doInvite(Headhunter.java:150)
    at com.headhunter.hhelper.SearchController$2.run(SearchController.java:126)
    at java.lang.Thread.run(Thread.java:745)
Caused by: net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$HtmlUnitErrorReporter.error(HtmlUnitContextFactory.java:420)
    at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:259)
    ...
    at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1584)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$TimeoutContext.compileString(HtmlUnitContextFactory.java:222)
    at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1573)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$1.doRun(JavaScriptEngine.java:707)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:870)
    ... 42 more
Enclosed exception: 
net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$HtmlUnitErrorReporter.error(HtmlUnitContextFactory.java:420)
    at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:259)
    ...
    at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:859)
    at com.headhunter.hhelper.Headhunter.invite(Headhunter.java:233)
    at com.headhunter.hhelper.Headhunter.doInvite(Headhunter.java:150)
    at com.headhunter.hhelper.SearchController$2.run(SearchController.java:126)
    at java.lang.Thread.run(Thread.java:745)
== CALLING JAVASCRIPT ==

            window.bloko = {
                fontUrl: '/'
            };
            window.globalVars = {
                locale: '',
                country: '',
                area: '',
                build: '',
                lang: '' || 'RU',
                requestId: '',
                sentryDSN: '',
                siteId: '' || '1',
                staticHost: '',
                hhcdnHost: '',
                apiHost: '',
                timeStamp: '',
                userType: '' || 'anonymous',
                cryptedUserId: '',
                employerState: '',
                vishnuIframeSrc: '',
                login: "",
                userId: '',
                hhid: '',


                autotestsComponentsInitEnd: false,


                performanceObserverEnabled: true,

                features: {"hide_resume_photo_from_untrusted_users": true, "disable_counters": false, "sentry_logging": true, "fingerprinting_enable": true, "secure_portal_enabled": true, "secure_portal_employer_registration_only": false, "employer_extensions_to_detect": "{\"vera\": \"veraBar\", \"friendwork\": \"fwi-popup\", \"potok\": \"potok_io__chrome_extension_iframe\", \"extrasaur\": \"custom-table-iframe-div\"}", "anonymous_resume_enabled": true, "sentry_js_config": "{\r\n  \"ignorePaths\": {\r\n    \"regexps\": [\r\n      \"[\\\\da-f]+/[\\\\da-f-]+/main\\\\.js\",\r\n      \".*akamaihd\\\\.net.+$\",\r\n      \"\\\\/inj_js\\\\/common\\\\.js\",\r\n      \"fingerprintjs\",\r\n      \"ckeditor4.5\",\r\n      \"axios/lib/core/createError\"\r\n    ]\r\n  },\r\n  \"ignoreErrors\": {\r\n    \"strings\": [\r\n      \"'e.data.indexOf' is not a function\",\r\n      \"Load timeout for modules:\",\r\n      \"__gCrWeb.autofill.extractForms\",\r\n      \"HTML Parsing Error: Unable to modify the parent container element before the child element is closed\",\r\n      \"Uncaught exception: TypeError: Cannot convert 'd.body' to object\",\r\n      \"Node cannot be inserted at the specified point in the hierarchy\",\r\n      \"TypeError: \u041d\u0435\u0434\u043e\u043f\u0443\u0441\u0442\u0438\u043c\u044b\u0439 \u0432\u044b\u0437\u044b\u0432\u0430\u044e\u0449\u0438\u0439 \u043e\u0431\u044a\u0435\u043a\u0442\",\r\n      \"TypeError: Invalid calling object\",\r\n      \"TypeError: 'undefined' is not an object (evaluating 'doc.forms')\",\r\n      \"Uncaught exception: TypeError: Cannot convert 'a.mini' to object\",\r\n      \"window.zAdv\",\r\n      \"backbone in Function.e.Router [as extend]\",\r\n      \"this._doc.documentElement\",\r\n      \"Can't find variable: inf\",\r\n      \"SkypeClick2Call\",\r\n      \"\u0421\u0438\u043d\u0442\u0430\u043a\u0441\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043e\u0448\u0438\u0431\u043a\u0430\",\r\n      \"Invalid or unexpected token\",\r\n    
  \"Unexpected token <\",\r\n      \"Blocked a frame with origin\",\r\n      \"__show__deepen\",\r\n      \"expected expression, got '<'\",\r\n      \"Cannot read property 'forms' of undefined\",\r\n      \"GM_addStyle is not defined\",\r\n      \"can't redefine non-configurable property \\\"userAgent\\\"\",\r\n      \"Can't find varfiable: auto\",\r\n      \"only one instance of babel-polyfill is allowed\",\r\n      \"this.matches is not a function\",\r\n      \"NS_ERROR_NOT_INITIALIZED\",\r\n      \"NS_ERROR_UNEXPECTED\",\r\n      \"jQuery(...).size is not a function\",\r\n      \"Unexpected token ILLEGAL\",\r\n      \"Unexpected identifier\",\r\n      \"Unexpected end of input\",\r\n      \"yndx_svtn_e\",\r\n      \"TypeError: Cannot set property 'destroySlots' of undefined\",\r\n      \"Non-Error exception captured with keys: status, statusText\",\r\n      \"SyntaxError: The string did not match the expected pattern.\",\r\n      \"The operation is insecure\",\r\n      \"No identifiers allowed directly after numeric literal\",\r\n      \"wmrzz_time2 is not defined\",\r\n      \"Request failed with status code 403\",\r\n      \"SYNTAX_ERR: DOM Exception 12\",\r\n      \"maxthon\",\r\n      \"Request aborted\"\r\n    ],\r\n    \"regexps\": [\r\n      \"^undefined$\",\r\n      \"^Syntax error$\",\r\n      \"^\u041d\u0435\u043e\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u043d\u0430\u044f \u043e\u0448\u0438\u0431\u043a\u0430\\\\.$\",\r\n      \"^\u041d\u0435\u0434\u043e\u043f\u0443\u0441\u0442\u0438\u043c\u044b\u0439 \u0437\u043d\u0430\u043a$\",\r\n      \"^\\\\[object Event\\\\]$\",\r\n      \"\\\\bgST\\\\b\",\r\n      \"pixelPositionVal\",\r\n      \"\u041d\u0435\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u043f\u0430\u043c\u044f\u0442\u0438 \u0434\u043b\u044f \u0437\u0430\u0432\u0435\u0440\u0448\u0435\u043d\u0438\u044f \u043e\u043f\u0435\u0440\u0430\u0446\u0438\u0438[\\\\s\\\\S]+?fingerprint2\",\r\n      \"^illegal character$\",\r\n      
\"^Access is denied\\\\.\\\\s*$\",\r\n      \"^Timeout$\",\r\n      \"^Unexpected token else$\",\r\n      \"^\u041d\u0435\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u043f\u0430\u043c\u044f\u0442\u0438$\",\r\n      \"^\\\\[CKEDITOR.resourceManager.load\\\\] Resource name \\\"default\\\" was not found at\",\r\n      \"can't redefine non-configurable property \\\"AceScript\\\"\",\r\n      \"\u041e\u043f\u0435\u0440\u0430\u0446\u0438\u044f \u0431\u044b\u043b\u0430 \u043e\u0442\u043c\u0435\u043d\u0435\u043d\u0430 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u0435\u043c.\",\r\n      \"out of memory\",\r\n      \"Network Error\",\r\n      \"Loading chunk\",\r\n      \"^No error message$\",\r\n      \"^\\\"Timeout\\\"$\"\r\n    ]\r\n  }\r\n}", "vishnu_webim_integration": true, "iframe_fix_size_banners": "504,514,500,502,260,348,674,675,370,369,368,345,346", "personal_manager_rating_enabled": true, "fp_pro_enabled": true},
                variables: ,
                cssMaping: ,
                firebaseMessagingSenderId: '',
                google_dfp_sandbox: '',
            };

======= EXCEPTION END ========
12:57:25,919 [Thread-7] WARN  com.gargoylesoftware.htmlunit.html.HtmlScript - Script is not JavaScript (type: text/html, language: ). Skipping execution.
...