I especially need help understanding why this code works one minute and then doesn't the next. So, can anyone help me figure out why this happens? I'm using the code below simply to pull some data from Yahoo Finance. From time to time I get the following set_index KeyError, which references three separate but similar lines (55, 93 and 131 to be exact; I've added comments next to each):
File "/Users/daniel/Desktop/CG.py", line 55, in <module>
df_balancesheet = df_balancesheet.set_index('0', drop=True, append=False,inplace=False, verify_integrity=False)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/frame.py", line 4303, in set_index
raise KeyError(f"None of {missing} are in the columns")
KeyError: "None of ['0'] are in the columns"
This error can occur 15 times in a row, and then suddenly the code runs, and runs smoothly... until it doesn't again. Below is most of the code I'm running:
from datetime import datetime
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
from urllib.request import urlopen as ur
from bs4 import BeautifulSoup as soup
print('Running Screen...')
collection = ['ar=180','r=21&ar=180','r=41&ar=180','r=61&ar=180','r=81&ar=180','r=101&ar=180','r=121&ar=180','r=141&ar=180','r=181&ar=180','r=201&ar=180','r=221&ar=180','r=241&ar=180']
for url in collection: #scrape multiple pages
    my_url = 'https://finviz.com/screener.ashx?v=141&f=cap_smallover,fa_eps5years_pos,fa_grossmargin_o10,fa_netmargin_pos,fa_opermargin_pos,fa_sales5years_o5,geo_usa,sh_avgvol_o1000,ta_beta_o0.5&ft=2&' + str(url)
    uClient = ur(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser") #find ticker
    stock_tickers = page_soup.findAll("a", {"class":"screener-link-primary"})
    for tickers in stock_tickers: #find all of the tickers
        ticker = tickers.text
        collection = [ticker]
        for url in collection: #loop over just this one ticker so url can be reused below
            #balance sheet data
            my_url1 = 'https://finance.yahoo.com/quote/' + str(url) + '/balance-sheet?p=' + str(url)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9',
                'Cache-Control': 'max-age=0',
                'Pragma': 'no-cache',
                'Referrer': 'https://google.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
            }
            page = requests.get(my_url1, headers=headers) #headers passed as a keyword so they go out as request headers, not query params
            tree = html.fromstring(page.content)
            tree.xpath("//h1/text()")
            table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
            #assert len(table_rows) > 0
            parsed_rows = []
            for table_row in table_rows:
                parsed_row = []
                el = table_row.xpath("./div")
                none_count = 0
                for rs in el:
                    try:
                        (text,) = rs.xpath('.//span/text()[1]')
                        parsed_row.append(text)
                    except ValueError:
                        parsed_row.append(np.NaN)
                        none_count += 1
                if (none_count < 4):
                    parsed_rows.append(parsed_row)
            df_balancesheet = pd.DataFrame(parsed_rows)
            df_balancesheet = df_balancesheet.set_index('0', drop=True, append=False, inplace=False, verify_integrity=False) #ERROR LINE 1
            df_balancesheet = df_balancesheet.transpose()
            cols = list(df_balancesheet.columns)
            cols[0] = 'Date'
            df_balancesheet = df_balancesheet.set_axis(cols, axis='columns', inplace=False)
            numeric_columns = list(df_balancesheet.columns)[1::]
            #income statement data
            my_url2 = 'https://finance.yahoo.com/quote/' + str(url) + '/financials?p=' + str(url)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9',
                'Cache-Control': 'max-age=0',
                'Pragma': 'no-cache',
                'Referrer': 'https://google.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
            }
            page = requests.get(my_url2, headers=headers)
            tree = html.fromstring(page.content)
            tree.xpath("//h1/text()")
            table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
            #assert len(table_rows) > 0
            parsed_rows = []
            for table_row in table_rows:
                parsed_row = []
                el = table_row.xpath("./div")
                none_count = 0
                for rs in el:
                    try:
                        (text,) = rs.xpath('.//span/text()[1]')
                        parsed_row.append(text)
                    except ValueError:
                        parsed_row.append(np.NaN)
                        none_count += 1
                if (none_count < 4):
                    parsed_rows.append(parsed_row)
            df_incomestatement = pd.DataFrame(parsed_rows)
            df_incomestatement = df_incomestatement.set_index('0', drop=True, append=False, inplace=False, verify_integrity=False) #ERROR LINE 2
            df_incomestatement = df_incomestatement.transpose()
            cols = list(df_incomestatement.columns)
            cols[0] = 'Date'
            df_incomestatement = df_incomestatement.set_axis(cols, axis='columns', inplace=False)
            numeric_columns = list(df_incomestatement.columns)[1::]
            #cash flow data
            my_url3 = 'https://finance.yahoo.com/quote/' + str(url) + '/cash-flow?p=' + str(url)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9',
                'Cache-Control': 'max-age=0',
                'Pragma': 'no-cache',
                'Referrer': 'https://google.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
            }
            page = requests.get(my_url3, headers=headers)
            tree = html.fromstring(page.content)
            tree.xpath("//h1/text()")
            table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
            #assert len(table_rows) > 0
            parsed_rows = []
            for table_row in table_rows:
                parsed_row = []
                el = table_row.xpath("./div")
                none_count = 0
                for rs in el:
                    try:
                        (text,) = rs.xpath('.//span/text()[1]')
                        parsed_row.append(text)
                    except ValueError:
                        parsed_row.append(np.NaN)
                        none_count += 1
                if (none_count < 4):
                    parsed_rows.append(parsed_row)
            df_cashflow = pd.DataFrame(parsed_rows)
            df_cashflow = df_cashflow.set_index('0', drop=True, append=False, inplace=False, verify_integrity=False) #ERROR LINE 3
            df_cashflow = df_cashflow.transpose()
            cols = list(df_cashflow.columns)
            cols[0] = 'Date'
            df_cashflow = df_cashflow.set_axis(cols, axis='columns', inplace=False)
            numeric_columns = list(df_cashflow.columns)[1::]
            writer = pd.ExcelWriter(ticker + '.xlsx')
            df_incomestatement.to_excel(writer, 'Income Statement')
            df_balancesheet.to_excel(writer, 'Balance Sheet')
            df_cashflow.to_excel(writer, 'Statement of Cash Flows')
            writer.save()
            print('Collecting data for ' + ticker + '...')
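One workaround I have been sketching (not sure it is the right fix) is to retry the fetch until the table divs actually show up, instead of building a DataFrame from an empty list. fetch_table_rows below is a hypothetical helper wrapping the same requests/lxml calls as above, and the retry count and backoff are arbitrary:

import time
import requests
from lxml import html

def fetch_table_rows(url, headers, retries=5):
    #hypothetical helper: same fetch/parse as above, retried until the
    #financials table ('D(tbr)' divs) is actually present in the response
    for attempt in range(retries):
        page = requests.get(url, headers=headers)
        tree = html.fromstring(page.content)
        table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
        if table_rows:
            return table_rows
        time.sleep(2 ** attempt) #arbitrary backoff before retrying
    raise RuntimeError('no data table for ' + url + ' after ' + str(retries) + ' tries')

Each section would then start with table_rows = fetch_table_rows(my_url1, headers) instead of the bare requests.get. One other thing I have not ruled out: pd.DataFrame(parsed_rows) labels its columns with the integers 0, 1, 2, ..., so maybe I need set_index(0) rather than set_index('0') even when the page parses.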
Thanks in advance if you can shed any light on this.