Как удалить данные, которые невозможно проверить и которые находятся внутри тега <svg> - PullRequest
0 голосов
/ 09 мая 2019

Я не могу собрать некоторые данные с веб-страницы Partywise Result. Я хочу извлечь по каждой партии данные {vote%, vote count} с этой страницы.

Код, который я пробовал до сих пор:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    """Fetch *url* and return the page parsed with BeautifulSoup's html.parser."""
    response = urllib.request.urlopen(url)
    return BeautifulSoup(response, "html.parser")
# chhattisgarh: scrape the party-wise result tables and dump them to CSV.
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    url = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    # url = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(url)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            # One CSV line per table: join all <td> texts, then append the state code.
            cells = [data.text for data in records.findAll('td')]
            edatas = edatas + "\n" + ",".join(cells) + "," + code

header = "Party,Won,Leading,Total,State code"
# 'with' guarantees the file is closed (the original leaked the handle).
with open(os.path.expanduser("per2014_result.csv"), "wb") as file:  # 2018
    # file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
    file.write(bytes(header, encoding="ascii", errors="ignore"))
    file.write(bytes(edatas, encoding="ascii", errors="ignore"))
    file.write(bytes(edata1, encoding="ascii", errors="ignore"))

Результат, от которого я ожидаю, - это процентная доля голосов

output page 1

Я хочу, чтобы вывод был в формате CSV, например:

INC,43.0%,6144192

и так далее со страницы 1

output for page 2

и так далее со страницы 2

1 Ответ

0 голосов
/ 09 мая 2019

Данные загружаются непосредственно из JavaScript внутри вашего div:

if(document.getElementById('piecharts26')!=null)

Таким образом, вы должны использовать безголовый (headless) браузер, например через Selenium ( ссылка здесь ), или использовать регулярное выражение:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    """Extract the google-charts row list embedded in the page's JavaScript.

    Finds the first ``data.addRows([...]);`` call in *html_page* and returns
    the parsed row list.  Returns an empty list when no call is present
    (the original raised NameError because ``result`` stayed unbound).
    """
    s = str(html_page)
    # Raw string with escaped dots: an unescaped '.' would match any character.
    m = re.search(r'data\.addRows\((.*?)\);', s)
    if not m:
        return []
    # The page uses single quotes; JSON requires double quotes.
    return json.loads(m.group(1).replace("'", '"'))


def soup(url):
    """Download *url* and return it as a parsed BeautifulSoup document."""
    page = urllib.request.urlopen(url)
    document = BeautifulSoup(page, "html.parser")
    return document


# chhattisgarh: print the chart rows and write the (still empty) CSV skeleton.
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    url = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    # url = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(url)

    result = get_data(soup2)

    print(result)

header = "Party,Won,Leading,Total,State code"
# 'with' closes the file deterministically (the original leaked the handle).
with open(os.path.expanduser("per2014_result.csv"), "wb") as file:  # 2018
    # file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
    file.write(bytes(header, encoding="ascii", errors="ignore"))
    file.write(bytes(edatas, encoding="ascii", errors="ignore"))
    file.write(bytes(edata1, encoding="ascii", errors="ignore"))

ВЫВОД:

[['INC {43.0%,6144192}', 6144192],
 ['BJP {33.0%,4707141}', 4707141],
 ['JCCJ {7.6%,1086581}', 1086581],
 ['IND {5.9%,839053}', 839053],
 ['BSP {3.9%,552313}', 552313],
 ['GGP {1.7%,247459}', 247459],
 ['AAAP {0.9%,123526}', 123526],
 ['CPI {0.3%,48255}', 48255],
 ['APoI {0.3%,42013}', 42013],
 ['SHS {0.2%,34678}', 34678],
 ['NCP {0.2%,28983}', 28983],
 ['SP {0.2%,21969}', 21969],
 ['BYPP {0.1%,8425}', 8425],
 ['CPM {0.1%,8348}', 8348],
 ['JD(U) {0.1%,8285}', 8285],
 ['CSM {0.1%,7783}', 7783],
 ['BMUP {0.1%,7419}', 7419],
 ['BSCP {0.0%,5546}', 5546],
 ['BTP {0.0%,5498}', 5498],
 ['RJsbhP {0.0%,5141}', 5141],
 ['RGOP {0.0%,5040}', 5040],
 ['IPBP {0.0%,4982}', 4982],
 ['NINSHAD {0.0%,4586}', 4586],
 ['PSPU {0.0%,4309}', 4309],
 ['BHBHP {0.0%,3780}', 3780],
 ['RPI(A) {0.0%,3257}', 3257],
 ['JAC {0.0%,3034}', 3034],
 ['CPIM {0.0%,3017}', 3017],
 ['NDPF {0.0%,2912}', 2912],
 ['AASPP {0.0%,2474}', 2474],
 ['BBC {0.0%,2089}', 2089],
 ['SWAP {0.0%,2023}', 2023],
 ['cvgrp {0.0%,1582}', 1582],
 ['bhmm {0.0%,1474}', 1474],
 ['AVVP {0.0%,1407}', 1407],
 ['LSWP {0.0%,1399}', 1399],
 ['CSP {0.0%,1232}', 1232],
 ['BPSGKD {0.0%,1093}', 1093],
 ['BKNP {0.0%,1085}', 1085],
 ['CGVP {0.0%,1053}', 1053],
 ['SUCI {0.0%,1048}', 1048],
 ['SUSP {0.0%,988}', 988],
 ['DPI {0.0%,970}', 970],
 ['RJBP {0.0%,717}', 717],
 ['ASSP {0.0%,701}', 701],
 ['BLRP {0.0%,570}', 570],
 ['BSHSP {0.0%,562}', 562],
 ['ABHM {0.0%,549}', 549],
 ['SSBD {0.0%,468}', 468],
 ['ABSSP {0.0%,436}', 436],
 ['BRSP {0.0%,429}', 429],
 ['ABSKP {0.0%,389}', 389],
 ['BSSP {0.0%,279}', 279],
 ['BNIP {0.0%,267}', 267],
 ['RMGP {0.0%,258}', 258],
 ['KMSP {0.0%,241}', 241],
 ['BHBP {0.0%,224}', 224],
 ['RP(K) {0.0%,202}', 202],
 ['CMM {0.0%,192}', 192],
 ['CHSJP {0.0%,183}', 183],
 ['RSSM {0.0%,72}', 72],
 ['AnAP {0.0%,66}', 66],
 ['NOTA {2.0%,282744}', 282744]]

Затем вы можете зациклить результат и сохранить его в файле csv

EDIT:

Смотрите это изменение, чтобы сохранить его в CSV-файле:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv
def get_data(html_page):
    """Extract the google-charts row list embedded in the page's JavaScript.

    Finds the first ``data.addRows([...]);`` call in *html_page* and returns
    the parsed row list.  Returns an empty list when no call is present
    (the original raised NameError because ``result`` stayed unbound).
    """
    s = str(html_page)
    # Raw string with escaped dots: an unescaped '.' would match any character.
    m = re.search(r'data\.addRows\((.*?)\);', s)
    if not m:
        return []
    # The page uses single quotes; JSON requires double quotes.
    return json.loads(m.group(1).replace("'", '"'))


def soup(url):
    """Retrieve *url* and hand the response straight to BeautifulSoup."""
    return BeautifulSoup(urllib.request.urlopen(url), "html.parser")


# Scrape each state's chart rows and export Party / Vote% / Count to CSV.
codes = ["S26"]
for code in codes:
    url = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    # url = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(url)

    result = get_data(soup2)
    header = ["Party", "Vote%", "Count", "State code"]

    results_export = [header]
    for r in result:
        # Each label looks like "INC {43.0%,6144192}": party name, then
        # "{percent,count}".  Split the name off, then take the percent.
        party, _, rest = r[0].partition(' {')
        percent = rest.split(',')[0]
        results_export.append([str(party), str(percent), str(r[1]), code])

    # newline='' is required by the csv module to avoid blank rows on
    # Windows; 'with' closes the file (the original leaked the handle).
    with open(os.path.expanduser("per2014_result.csv"), "w", newline="") as file:  # 2018
        writer = csv.writer(file)
        writer.writerows(results_export)

EDIT2:

def get_data(html_page):
    """Return the rows of the last non-empty ``data.addRows([...]);`` call.

    The result page embeds several charts and empty ones emit
    ``data.addRows([])``, so every match is scanned and the last payload
    that is not the literal empty list wins.  Returns [] when no
    non-empty call exists.
    """
    s = str(html_page)
    # Raw string with escaped dots: the original let '.' match any character.
    matches = re.findall(r'data\.addRows\((.*?)\);', s)
    result = '[]'
    for m in matches:
        if m != '[]':
            result = m
    # The page uses single quotes; JSON requires double quotes.
    return json.loads(result.replace("'", '"'))
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...