Question

Чтобы помочь бороться с COVID-19 здесь, на Филиппинах, я пытаюсь провести анализ данных. Мой источник данных — таблица случаев заболевания в Википедии. См. https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines

Пытаюсь получить таблицу в Python с помощью BeautifulSoup, но не могу получить содержимое столбцов [Учреждение госпитализации или консультации, Недавняя история поездок за границу]. Смотрите скриншот:

Screenshot of dataframe result

Что я делаю не так?

Вот мой код: (также можно найти здесь https://github.com/gio888/covid19_ph2/blob/master/covid_import_from_wikipedia.ipynb)

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the Wikipedia template page and pick the first table that has the
# 'wikitable' class.
url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')

n_columns = 0
n_rows=0
column_names = []

# First pass: size the frame. Count the rows that contain <td> cells, take the
# column count from the first such row, and take the header names from the
# first row that contains <th> cells.
for row in table.find_all('tr'):
   td_tags = row.find_all('td')
   if len(td_tags) > 0:
      n_rows+=1
      if n_columns == 0:
         n_columns = len(td_tags)

   th_tags = row.find_all('th') 
   if len(th_tags) > 0 and len(column_names) == 0:
      for th in th_tags:
         column_names.append(th.get_text())

# Use the scraped header names when there are any, otherwise positional labels.
columns = column_names if len(column_names) > 0 else range(0,n_columns)
# Pre-allocate an empty frame; cells are filled in by position below.
df = pd.DataFrame(columns = columns,index= range(0,n_rows))

# Second pass: copy each <td>'s text into the frame. row_marker only advances
# for rows that actually had <td> cells, matching the n_rows count above.
row_marker = 0
for row in table.find_all('tr'):
   column_marker = 0
   columns = row.find_all('td')
   for column in columns:
      # NOTE(review): get_text() returns only the text inside the cell. A cell
      # whose value is conveyed by a CSS class rather than by text would come
      # out empty here - presumably the cause of the blank columns; confirm
      # against the page markup.
      df.iat[row_marker,column_marker] = column.get_text()
      column_marker += 1
   if len(columns) > 0:
      row_marker += 1

# Best-effort numeric conversion: columns that cannot be cast to float are
# left unchanged.
for col in df:
   try:
      df[col] = df[col].astype(float)
   except ValueError:
      pass

# Bare expression: displays the frame when run as the last line of a notebook cell.
df

ahmed.soli · Answer 1 · 29 марта 2020

import pandas as pd
import requests
from bs4 import BeautifulSoup
#url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"

# Text labels for table cells whose value is carried by a CSS class instead of
# by text inside the cell (get_text() yields nothing useful for those).
css_content = {
    'status-a': 'Admitted',
    'status-r': 'Recovered',
    'status-d': 'Died',
    'yes': 'Yes',
    'no': 'No',
    'tba': 'TBA',
    "covid-sticky": 'skip_header'
}

# Classes probed on every data cell, in priority order (first match wins).
_CELL_CLASSES = ('status-a', 'status-r', 'status-d', 'yes', 'no', 'tba')


def Check_att(source, value, attribute='class'):
    """Return the label mapped to ``value`` when ``source`` carries it.

    The function is self-contained: it no longer reads a ``col_value`` global
    set by the caller, so it can be called from anywhere.

    Args:
        source: Any object exposing a BeautifulSoup-style ``attrs`` mapping,
            e.g. a ``<td class='x'>`` tag.
        value: The attribute value to look for, e.g. ``'status-a'``.
        attribute: Name of the attribute to inspect; defaults to ``'class'``.

    Returns:
        ``css_content[value]`` when ``value`` is one of the attribute's values
        (``''`` if ``value`` has no entry in ``css_content``); ``''`` when the
        attribute is missing or does not contain ``value``.
    """
    found = source.attrs.get(attribute) or []
    if isinstance(found, str):
        # A single-valued attribute comes back as one string; split it so that
        # 'no' does not match as a substring of e.g. 'note'.
        found = found.split()
    if value in found:
        return css_content.get(value, '')
    return ''


def _cell_text(cell):
    """Return the label of a class-coded cell, else the cell's stripped text."""
    for css_class in _CELL_CLASSES:
        label = Check_att(cell, css_class)
        if label:
            return label
    return cell.get_text().strip()


def _table_to_frame(table):
    """Build a DataFrame with one row per ``<tr>`` that contains ``<td>`` cells."""
    column_names = [
        th.get_text().strip() for th in table.select('tr.covid-sticky > th')
    ]

    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if cells:  # rows without <td> cells (headers) carry no data
            records.append([_cell_text(cell) for cell in cells])

    width = max([len(column_names)] + [len(record) for record in records])
    # Name cells beyond the known headers by position instead of failing.
    column_names += [
        'column_{}'.format(i) for i in range(len(column_names), width)
    ]
    # Pad short rows so every record has exactly `width` entries.
    padded = [record + [None] * (width - len(record)) for record in records]
    return pd.DataFrame(padded, columns=column_names)


def _convert_numeric_columns(df):
    """Cast every column that is entirely numeric to float, in place."""
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except (ValueError, TypeError):
            # Deliberate best effort: the column holds non-numeric text (or
            # padding), so it is kept unchanged.
            pass


if __name__ == "__main__":
    url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines'
    # Fail fast on a hung connection or an HTTP error page instead of parsing it.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', class_='wikitable')
    if table is None:
        raise SystemExit(
            "No table with class 'wikitable' found at {}".format(url))

    df = _table_to_frame(table)
    _convert_numeric_columns(df)

    print(df)

αԋɱҽԃ αмєяιcαη · Answer 2 · 28 марта 2020

from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Run Firefox without opening a visible window.
options = Options()
options.add_argument('--headless')
# Start exactly one browser. A second, option-less webdriver.Firefox() call
# would open a visible window and leave the headless instance running.
driver = webdriver.Firefox(options=options)

# (CSS class, replacement text) pairs: elements carrying the class get the
# text written into them so that the table parser can see a value.
# NOTE(review): class selectors are case-sensitive in standards-mode HTML and
# the BeautifulSoup answer uses 'tba' - confirm which spelling the page uses.
items = [["yes", "Yes"], ["no", "No"], [
    "TBA", "TBA"], ["status-d", "Died"], ["status-r", "Recovered"], ["status-a", "Admitted"]]

try:
    driver.get(
        "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines")

    for item in items:
        script = (
            "document.querySelectorAll('.{}').forEach((element) => element.innerHTML = '{}')".format(*item))
        driver.execute_script(script)

    # Snapshot the patched DOM while the browser is still alive.
    html = driver.page_source
finally:
    # Always release the browser, even if loading or scripting failed.
    driver.quit()

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
# Index 2 is taken from the original answer - presumably the case table is the
# third table on the page; verify if the page layout changes.
df = pd.read_html(StringIO(html))[2]
df.to_csv("data.csv", index=False)

Вывод: Просмотр онлайн

Почему BeautifulSoup не разбирает простую таблицу Википедии

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Почему BeautifulSoup не разбирает простую таблицу Википедии

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Нет похожих вопросов