Question

Сначала я пытался извлекать ссылки и даты по отдельности, но количество строк в получившихся фреймах данных не совпадало, и я не смог понять, как объединить два списка. Тогда я решил извлекать ссылку и дату одновременно, но теперь вообще не получаю никаких результатов.

Мой фрейм данных должен содержать только ссылку и отчётный период в формате «год-месяц».

Вот образец HTML:

<tr>
 <td headers="view-dlf-1-title-table-column--G7-URXF07Ms" class="views-field views-field-dlf-1-title">
 <a href="/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report-Items/Contract-Summary-2013-03">Contract Summary</a>          </td>
 <td headers="view-dlf-2-report-period-table-column--G7Rqagd92Ho" class="views-field views-field-dlf-2-report-period">2013-03          </td>
 </tr>

Вот мой текущий код:

import pandas as pd
from datetime import datetime
from lxml import html
import requests

def http_request_get(url, session=None, payload=None, parse=True):
    """Send a GET request and return the response body with its final URL.

    Args:
        url: Address to request.
        session: Optional object exposing a ``requests``-style ``get``
            method. When it is falsy, ``requests.get`` is used instead.
        payload: Optional mapping sent as query-string parameters.
            ``None`` is replaced by an empty dict.
        parse: When true, the body is parsed with ``lxml.html.fromstring``;
            otherwise the raw response text is returned.

    Returns:
        A ``(content, url)`` tuple: ``content`` is the parsed tree or the
        response text, ``url`` is the ``url`` attribute of the response.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for 4xx/5xx
            responses.
    """
    if payload is None:
        payload = {}

    # NOTE(review): verify=False switches off TLS certificate verification;
    # kept as in the original snippet, but confirm it is really required.
    if session:
        content = session.get(url, params=payload, verify=False, headers={"content-type": "text"})
    else:
        content = requests.get(url, params=payload, verify=False, headers={"content-type": "text"})

    content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)

    if parse:
        return html.fromstring(content.text), content.url
    else:
        return content.text, content.url

def get_html(link):
    """Fetch ``link`` and return only the parsed page.

    Calls ``http_request_get`` with the query parameter ``t`` set to an
    empty string and ``parse=True``, and discards the URL it returns.

    Args:
        link: Address of the page to fetch.

    Returns:
        The first element of the tuple returned by ``http_request_get``.
    """
    page_parsed, _ = http_request_get(url=link, payload={'t': ''}, parse=True)
    return page_parsed


# Two listing URLs for the same view (items_per_page=100), differing only in
# the trailing page=0 / page=1 parameter.
cmslinks=[
'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1']

# For each listing page: fetch it, pull link cells and row text, and join the
# two resulting frames on their positional index.
for cmslink in cmslinks:
   content, _ = http_request_get(url=cmslink,payload={'t':''},parse=True)
   # First table whose class attribute equals this exact string.
   table = content.cssselect('table[class="views-table views-view-table cols-2"]')[0]
   # NOTE(review): [headers="..."] is an exact-match attribute selector, while
   # the sample row shown earlier has
   # headers="view-dlf-1-title-table-column--G7-URXF07Ms", so this probably
   # matches nothing -- confirm against the live page.
   links = content.cssselect('td[headers="view-dlf-1-title-table-column"]')
   # NOTE(review): `links` holds <td> cells; in the sample row the href is on
   # the nested <a>, so .get('href') on the cell is presumably None.
   urls = [row.get('href') for row in links]         
   # NOTE(review): zip over the string 'ReportTime' pairs its individual
   # characters with the text nodes, so the dict keys are single letters.
   date = [dict(zip('ReportTime', row.xpath('td//text()'))) for row in table[0:]]
   df1 = pd.DataFrame(urls) 
   df2 = pd.DataFrame(date) 
   # Rebound on every pass, so after the loop only the last page's merge remains.
   mergedDf = df2.merge(df1, left_index=True, right_index=True)

0buz · Answer 1 · 02 мая 2020

Попробуйте:

import pandas as pd
from datetime import datetime
from lxml import html
import requests

def http_request_get(url, session=None, payload=None, parse=True):
    """Issue a GET request; hand back the body (parsed or raw) and the final URL.

    Args:
        url: Address to request.
        session: Optional object with a ``requests``-style ``get`` method;
            ``requests.get`` is used when it is falsy.
        payload: Optional query-string parameters; ``None`` means ``{}``.
        parse: Parse the body with ``lxml.html.fromstring`` when true,
            return the raw text otherwise.

    Returns:
        ``(body, url)``, where ``url`` is the response's ``url`` attribute.
    """
    params = {} if payload is None else payload

    # Prefer the caller's session when one is supplied.
    # NOTE(review): verify=False disables TLS certificate verification.
    fetch = session.get if session else requests.get
    response = fetch(url, params=params, verify=False, headers={"content-type": "text"})

    # Turn 4xx/5xx responses into an HTTPError instead of returning them.
    response.raise_for_status()

    body = html.fromstring(response.text) if parse else response.text
    return body, response.url

def get_html(link):
    """Return the parsed page for ``link``, dropping the URL that comes with it."""
    # http_request_get yields (parsed page, final URL); only the page is needed.
    document, _final_url = http_request_get(url=link, payload={'t': ''}, parse=True)
    return document


# Two listing URLs for the same view (items_per_page=100), differing only in
# the trailing page=0 / page=1 parameter.
cmslinks=[
'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1'
]

# Per-page attempt at building separate link and date frames and merging them
# on the positional index.
for cmslink in cmslinks:
   content, _ = http_request_get(url=cmslink,payload={'t':''},parse=True)
   # Here `table` is the whole list of matching tables (no [0] is taken).
   table = content.cssselect('table[class="views-table views-view-table cols-2"]')
   links = content.cssselect('td[headers="view-dlf-1-title-table-column"]')
   # NOTE(review): the XPath starts with //, i.e. it is absolute, so each `row`
   # should yield the same document-wide list of hrefs -- confirm with lxml docs.
   urls = [row.xpath("//a[contains(text(),'Enrollment by Contract')]/@href") for row in links]
   # NOTE(review): zip over the string 'ReportTime' pairs its individual
   # characters with the matched <td> elements, so the keys are single letters.
   date = [dict(zip('ReportTime', row.xpath("//td[@class='views-field views-field-dlf-2-report-period']"))) for row in table[0:]]
   df1 = pd.DataFrame(urls)
   df2 = pd.DataFrame(date)
   # Rebound on every pass, so after the loop only the last page's merge remains.
   mergedDf = df2.merge(df1, left_index=True, right_index=True)

# Build one table per listing page, then stack them into `full_table`.
page_tables = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    # read_html is given the URL itself; [0] takes the first table it returns.
    table = pd.read_html(cmslink)[0]
    # The XPath is absolute (leading //), so it is evaluated against the whole
    # document. Running it on `content` avoids the IndexError that `links[0]`
    # raised whenever the td[headers=...] selector matched nothing.
    urls = content.xpath("//td/a[contains(text(),'')]/@href")
    # Replace the link captions with the hrefs.
    # NOTE(review): this needs exactly one href per table row, otherwise pandas
    # raises ValueError on the length mismatch.
    table['Title'] = urls
    page_tables.append(table)

# DataFrame.append was removed in pandas 2.0; concat stacks the per-page frames
# the same way (row indices are kept as they were in each page's table).
full_table = pd.concat(page_tables) if page_tables else pd.DataFrame()

print(full_table)

Вывод: 166 строк × 2 столбца

chitown88 · Answer 2 · 02 мая 2020

Я бы использовал здесь BeautifulSoup. Это довольно простая библиотека для работы с HTML. Нужно просто выбрать теги <a>, у которых есть атрибут href (а именно ссылки, содержащие «Enrollment-by-Contract»), а затем для каждого такого элемента взять следующий тег <td> — в нём находится текст соседней ячейки таблицы.

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from lxml import html
import requests

def http_request_get(url, session=None, payload=None, parse=True):
    """Perform a GET request and return ``(body, final_url)``.

    Args:
        url: Address to request.
        session: Optional object with a ``requests``-style ``get`` method;
            the ``requests`` module itself is used when it is falsy.
        payload: Optional query-string parameters; ``None`` means ``{}``.
        parse: When true, wrap the body in ``BeautifulSoup`` using the
            ``'html.parser'`` backend; otherwise return the raw text.

    Returns:
        A tuple of the parsed or raw body and the response's ``url``.
    """
    if payload is None:
        payload = {}

    # One header set for both code paths: a browser-like User-Agent plus the
    # content-type hint the snippet has always sent.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        "content-type": "text",
    }

    # NOTE(review): verify=False disables TLS certificate verification.
    client = session if session else requests
    content = client.get(url, params=payload, verify=False, headers=request_headers)

    # Turn 4xx/5xx responses into an HTTPError instead of returning them.
    content.raise_for_status()

    if not parse:
        return content.text, content.url
    return BeautifulSoup(content.text, 'html.parser'), content.url

def get_html(link):
    """Fetch ``link`` and return just the parsed page, without its URL."""
    # http_request_get yields (parsed page, final URL); the URL is not needed.
    soup, _final_url = http_request_get(url=link, payload={'t': ''}, parse=True)
    return soup


# Two listing URLs for the same view (items_per_page=100), differing only in
# the trailing page=0 / page=1 parameter.
cmslinks = [
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1',
]

# Gather [href, report period] pairs from every listing page and build the
# frame once at the end.
rows = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    table = content.find('table')
    links = table.find_all('a', href=True)
    # Keep only the report links; the period is the text of the next <td>
    # that follows each link.
    urls = [
        [row.get('href'), row.find_next('td').text.strip()]
        for row in links
        if 'Enrollment-by-Contract' in row.get('href')
    ]
    rows.extend(urls)

# DataFrame.append was removed in pandas 2.0. Building the frame from the
# accumulated rows gives the same unnamed 0/1 columns and a fresh 0..n-1 index.
df = pd.DataFrame(rows)

Вывод:

print (df)
                                                     0        1
0    /Research-Statistics-Data-and-Systems/Statisti...  2019-10
1    /Research-Statistics-Data-and-Systems/Statisti...  2019-09
2    /Research-Statistics-Data-and-Systems/Statisti...  2019-08
3    /Research-Statistics-Data-and-Systems/Statisti...  2019-07
4    /Research-Statistics-Data-and-Systems/Statisti...  2019-06
5    /Research-Statistics-Data-and-Systems/Statisti...  2019-05
6    /Research-Statistics-Data-and-Systems/Statisti...  2019-04
7    /Research-Statistics-Data-and-Systems/Statisti...  2019-03
8    /Research-Statistics-Data-and-Systems/Statisti...  2019-02
9    /Research-Statistics-Data-and-Systems/Statisti...  2019-01
10   /Research-Statistics-Data-and-Systems/Statisti...  2018-12
11   /Research-Statistics-Data-and-Systems/Statisti...  2018-11
12   /Research-Statistics-Data-and-Systems/Statisti...  2018-10
13   /Research-Statistics-Data-and-Systems/Statisti...  2018-09
14   /Research-Statistics-Data-and-Systems/Statisti...  2018-08
15   /Research-Statistics-Data-and-Systems/Statisti...  2018-07
16   /Research-Statistics-Data-and-Systems/Statisti...  2018-06
17   /Research-Statistics-Data-and-Systems/Statisti...  2018-05
18   /Research-Statistics-Data-and-Systems/Statisti...  2018-04
19   /Research-Statistics-Data-and-Systems/Statisti...  2018-03
20   /Research-Statistics-Data-and-Systems/Statisti...  2018-02
21   /Research-Statistics-Data-and-Systems/Statisti...  2018-01
22   /Research-Statistics-Data-and-Systems/Statisti...  2017-12
23   /Research-Statistics-Data-and-Systems/Statisti...  2017-11
24   /Research-Statistics-Data-and-Systems/Statisti...  2017-10
25   /Research-Statistics-Data-and-Systems/Statisti...  2017-09
26   /Research-Statistics-Data-and-Systems/Statisti...  2017-08
27   /Research-Statistics-Data-and-Systems/Statisti...  2017-07
28   /Research-Statistics-Data-and-Systems/Statisti...  2017-06
29   /Research-Statistics-Data-and-Systems/Statisti...  2017-05
..                                                 ...      ...
129  /Research-Statistics-Data-and-Systems/Statisti...  2008-12
130  /Research-Statistics-Data-and-Systems/Statisti...  2008-11
131  /Research-Statistics-Data-and-Systems/Statisti...  2008-10
132  /Research-Statistics-Data-and-Systems/Statisti...  2008-09
133  /Research-Statistics-Data-and-Systems/Statisti...  2008-08
134  /Research-Statistics-Data-and-Systems/Statisti...  2008-07
135  /Research-Statistics-Data-and-Systems/Statisti...  2008-06
136  /Research-Statistics-Data-and-Systems/Statisti...  2008-05
137  /Research-Statistics-Data-and-Systems/Statisti...  2008-04
138  /Research-Statistics-Data-and-Systems/Statisti...  2008-03
139  /Research-Statistics-Data-and-Systems/Statisti...  2008-02
140  /Research-Statistics-Data-and-Systems/Statisti...  2008-01
141  /Research-Statistics-Data-and-Systems/Statisti...  2007-12
142  /Research-Statistics-Data-and-Systems/Statisti...  2007-11
143  /Research-Statistics-Data-and-Systems/Statisti...  2007-10
144  /Research-Statistics-Data-and-Systems/Statisti...  2007-09
145  /Research-Statistics-Data-and-Systems/Statisti...  2007-08
146  /Research-Statistics-Data-and-Systems/Statisti...  2007-07
147  /Research-Statistics-Data-and-Systems/Statisti...  2007-06
148  /Research-Statistics-Data-and-Systems/Statisti...  2007-05
149  /Research-Statistics-Data-and-Systems/Statisti...  2007-04
150  /Research-Statistics-Data-and-Systems/Statisti...  2007-03
151  /Research-Statistics-Data-and-Systems/Statisti...  2007-02
152  /Research-Statistics-Data-and-Systems/Statisti...  2007-01
153  /Research-Statistics-Data-and-Systems/Statisti...  2006-12
154  /Research-Statistics-Data-and-Systems/Statisti...  2006-11
155  /Research-Statistics-Data-and-Systems/Statisti...  2006-10
156  /Research-Statistics-Data-and-Systems/Statisti...  2006-09
157  /Research-Statistics-Data-and-Systems/Statisti...  2006-08
158  /Research-Statistics-Data-and-Systems/Statisti...  2012-11

[159 rows x 2 columns]

Создать фрейм данных из HTML Метки

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Создать фрейм данных из HTML Метки

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Похожие темы