Извлечение JavaScript переменных в Python словари - PullRequest
1 голос
/ 27 мая 2020

Насколько я понимаю, чтобы выполнить JavaScript на клиенте и затем разобрать полученный HTML, мне нужно использовать PyQt5 вместе с BeautifulSoup. Я пытаюсь преобразовать переменную _Flourish_data в словарь Python.

Есть ли простой способ извлечь JavaScript-переменную _Flourish_data в словарь Python? Вот мой текущий код на Python, который получает JavaScript с помощью PyQt5 и BeautifulSoup:

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    """Load a URL with QWebEnginePage and capture the resulting HTML.

    Constructing an instance blocks: ``__init__`` starts the load and runs
    the Qt event loop until ``Callable`` has received the page's HTML and
    quit the loop. Afterwards the markup is available as ``self.html``
    (an empty string until then).
    """

    def __init__(self, url):
        """Start loading *url* and block until its HTML has been captured.

        Args:
            url: Address of the page to load, as a string.
        """
        # Reuse an already-running application object instead of always
        # constructing a new one, so that Page(...) can be created more
        # than once in the same process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous: it delivers the markup to the callback
        # and returns nothing, so its return value must not be stored in
        # self.html (that would overwrite the attribute with None).
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        """Store the delivered HTML and stop the event loop.

        Args:
            html_str: The page's HTML as passed by ``toHtml``.
        """
        self.html = html_str
        self.app.quit()

# Load the Flourish embed page; Page(...) blocks until the HTML has been
# captured and stored on page.html.
page = Page('https://flo.uri.sh/visualisation/2451841/embed?auto=1')
# Parse the captured markup with the 'html.parser' backend.
soup = bs.BeautifulSoup(page.html, 'html.parser')
# Collect every <script> element found in the document.
js_test = soup.find_all('script')
# Bare expression: displays the <script> element at index 5 when run in a
# REPL/notebook. NOTE(review): the hard-coded index depends on the page's
# current layout - confirm it still points at the script of interest.
js_test[5]

Результат существующего кода:

<script>
        // Copy the entries of `src` into `dest`, expanding dot-separated keys
        // into nested objects (a "layout.body_font" key is stored at
        // dest.layout.body_font). A falsy `dest` is replaced by a new empty
        // object. Returns `dest`.
        function _Flourish_unflattenInto(dest, src) {
            dest = dest || {};
            for (var k in src) {
                // `t` walks down from `dest` to the object that will hold the
                // final key segment.
                var t = dest;
                // `i` is the index of the next "." in `k`; `p` is the start of
                // the current segment. Each pass handles one segment before a dot.
                for (var i = k.indexOf("."), p = 0; i >= 0; i = k.indexOf(".", p = i+1)) {
                    var s = k.substring(p, i);
                    // Create the intermediate object only if it does not exist yet.
                    if (!(s in t)) t[s] = {};
                    t = t[s];
                }
                // The text after the last dot (or the whole key when it has no
                // dot) is the property that receives the value.
                t[k.substring(p)] = src[k];
            }
            return dest;
        }
        var _Flourish_settings = {"cell_fill_1":"#ffffff","cell_fill_2":"#ebebeb","cell_fill_direction":"horizontal","cell_font_size":"1","cell_height":20,"cell_horizontal_alignment":"center","cell_link_color":"#2886b2","cell_padding_horizontal":16,"cell_padding_vertical":11,"column_width_mode":"auto","column_widths":"10%, 10%, 10%, 10%, 50%, 10%","header_fill":"#181f6c","header_font_color":"#ffffff","header_font_default":false,"header_font_size":1.1,"header_horizontal_alignment":"center","header_style_default":true,"layout.body_font":{"name":"Source Sans Pro","url":"https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700"},"layout.layout_order":"stack-default","layout.space_between_sections":"0.5","mobile.view":true,"no_results_text":"Use the search bar to find your state","pagination_amount":41,"pagination_amount_search":"5","search_enabled":false,"search_hide_table":false,"search_placeholder":"Search to find your state","search_resize":true,"search_width":15};
        _Flourish_unflattenInto(window.template.state, _Flourish_settings);

        var _Flourish_data_column_names = {"rows":{"columns":["State ","Earliest/Planned Start Date for 20/21 Academic Year ","","","",""]}},
                _Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n  ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet 
determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n  ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
        // For every dataset key in _Flourish_data: expose the dataset on
        // window.template.data and attach the matching entry of
        // _Flourish_data_column_names as its `column_names` property.
        for (var _Flourish_dataset in _Flourish_data) {
                window.template.data[_Flourish_dataset] = _Flourish_data[_Flourish_dataset];
                window.template.data[_Flourish_dataset].column_names = _Flourish_data_column_names[_Flourish_dataset];
        }
        // Draw the template after all datasets have been assigned.
        window.template.draw();
    </script>

Мне нужна только переменная _Flourish_data из HTML, как показано ниже:

        _Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n  ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet 
determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n  ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};

Любая помощь будет принята с благодарностью!

Ответы [ 2 ]

1 голос
/ 27 мая 2020
import requests
import re
import json


def extract_flourish_data(html):
    """Return the object literal assigned to ``_Flourish_data`` in *html*.

    The assignment is located with a regex, but the literal itself is read
    by the JSON decoder, which stops exactly at the end of the object. This
    stays correct when the value spans several lines, when another object
    follows on the same line, or when a string value contains ``}`` / ``};``.

    Args:
        html: Page source containing ``_Flourish_data = {...}``.

    Returns:
        The decoded object as a ``dict``.

    Raises:
        ValueError: If no ``_Flourish_data = {`` assignment is present, or
            (as ``json.JSONDecodeError``, a ``ValueError`` subclass) if the
            literal is not valid JSON.
    """
    # The lookahead keeps the opening brace out of the match, so
    # marker.end() is the index of "{" itself. "_Flourish_data_column_names"
    # cannot match because "_" is neither whitespace nor "=".
    marker = re.search(r'_Flourish_data\s*=\s*(?=\{)', html)
    if marker is None:
        raise ValueError("no '_Flourish_data = {...}' assignment found")
    data, _end = json.JSONDecoder().raw_decode(html, marker.end())
    return data


def main(url):
    """Download *url*, print the keys of its ``_Flourish_data`` object.

    Args:
        url: Address of a Flourish embed page.

    Returns:
        The decoded ``_Flourish_data`` dict, so callers can use the data
        instead of only seeing its keys printed.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page does not contain a parsable
            ``_Flourish_data`` assignment.
    """
    # A timeout prevents hanging forever on an unresponsive server, and
    # raise_for_status() avoids parsing an error page as if it were data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    match = extract_flourish_data(r.text)
    print(match.keys())
    return match


main("https://flo.uri.sh/visualisation/2451841/embed?auto=1")
1 голос
/ 27 мая 2020

Выполнять JavaScript не нужно. Это можно сделать с помощью модулей json и re.

Например:

import re
import json
import requests

url = 'https://flo.uri.sh/visualisation/2451841/embed?auto=1'

# Fetch the raw page source. The timeout prevents hanging forever on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

# Find where the object literal assigned to _Flourish_data starts. The
# lookahead keeps "{" out of the match, so start.end() is the brace's index.
start = re.search(r'_Flourish_data\s*=\s*(?=\{)', html_data)
if start is None:
    raise ValueError("no '_Flourish_data = {...}' assignment found in page")

# Let the JSON decoder find the end of the object instead of a regex: it
# stops at the matching closing brace even if the literal spans several
# lines or a string value contains "};".
data, _end = json.JSONDecoder().raw_decode(html_data, start.end())

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# Print the first two columns of every row, whitespace-stripped, with the
# first column left-aligned in a 55-character field.
for row in data['rows']:
    print('{:<55}{}'.format(*map(str.strip, row['columns'][:2])))

Вывод:

Alabama                                                Varies by district
Alaska                                                 Varies by district
American Samoa                                         Unknown
Arizona                                                Varies by district
Arkansas                                               Varies by district
Bureau of Indian Education                             Varies by district
California                                             Varies by district
Colorado                                               Varies by district
Connecticut                                            Not yet determined
Delaware                                               Varies by district
Department of Defense Education Activity               Varies by district
District of Columbia                                   8/31/2020
Florida                                                Unknown
Georgia                                                Unknown
Guam                                                   Unknown
Hawaii                                                 Not yet determined
Idaho                                                  Varies by District
Illinois                                               Varies by district
Indiana                                                Not yet determined
Iowa                                                   Varies by district
Kansas                                                 Not yet determined
Kentucky                                               Unknown
Louisiana                                              Varies by district
Maine                                                  Varies by district
Maryland                                               Not yet determined
Massachusetts                                          Not yet determined
Michigan                                               Not yet determined
Minnesota                                              Not yet determined
Mississippi                                            Varies by district
Missouri                                               Varies by district
Montana                                                Varies by district
Nebraska                                               Varies by district
Nevada                                                 Varies by district
New Hampshire                                          Not yet determined
New Jersey                                             Varies by district
New Mexico                                             Unknown
New York                                               Not yet determined
North Carolina                                         8/17/2020
North Dakota                                           Varies by district
Northern Marianas                                      Unknown
Ohio                                                   Not yet determined
Oklahoma                                               Varies by district
Oregon                                                 Not yet determined
Pennsylvania                                           Varies by district
Puerto Rico                                            Unknown
Rhode Island                                           Not yet determined
South Carolina                                         Not yet determined
South Dakota                                           Varies by district
Tennessee                                              Varies by district
Texas                                                  Varies by district
U.S. Virgin Islands                                    Not yet determined
Utah                                                   Varies by district
Vermont                                                Not yet determined
Virginia                                               Not yet determined
Washington                                             Varies by District
West Virginia                                          Not yet determined
Wisconsin                                              Varies by district
Wyoming                                                Not yet determined
...