Set_Index Inconsistent KeyError
0 votes / 15 April 2020

I mainly need help understanding why this code works one minute and then doesn't the next. Can anyone explain why it behaves this way? I'm using the code below simply to pull some data from Yahoo Finance. From time to time I get the following set_index KeyError, which points at three separate but very similar lines (55, 93 and 131 to be exact; I've marked them with comments in the code):

File "/Users/daniel/Desktop/CG.py", line 55, in <module>
    df_balancesheet = df_balancesheet.set_index('0', drop=True, append=False,inplace=False, verify_integrity=False)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/frame.py", line 4303, in set_index
    raise KeyError(f"None of {missing} are in the columns")
KeyError: "None of ['0'] are in the columns"

This error can happen 15 times in a row, and then suddenly the code runs smoothly... until it doesn't again. Below is most of the code I'm running:

from datetime import datetime
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
from urllib.request import urlopen as ur
from bs4 import BeautifulSoup as soup

print('Running Screen...')
collection  = ['ar=180','r=21&ar=180','r=41&ar=180','r=61&ar=180','r=81&ar=180','r=101&ar=180','r=121&ar=180','r=141&ar=180','r=181&ar=180','r=201&ar=180','r=221&ar=180','r=241&ar=180']
for url in collection: #scrape multiple pages
        my_url = 'https://finviz.com/screener.ashx?v=141&f=cap_smallover,fa_eps5years_pos,fa_grossmargin_o10,fa_netmargin_pos,fa_opermargin_pos,fa_sales5years_o5,geo_usa,sh_avgvol_o1000,ta_beta_o0.5&ft=2&' + str(url)
        uClient = ur(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser") #find ticker
        stock_tickers = page_soup.findAll("a", {"class":"screener-link-primary"})
        for tickers in stock_tickers: #find all of the tickers
                ticker = tickers.text
                collection  = [ticker]
                for url in collection: #scrape multiple pages
                        #balance sheet data
                        my_url1 = 'https://finance.yahoo.com/quote/' + str(url) + '/balance-sheet?p=' + str(url) 
                        headers = {
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'en-US,en;q=0.9',
                            'Cache-Control': 'max-age=0',
                            'Pragma': 'no-cache',
                            'Referrer': 'https://google.com',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
                        }
                        page = requests.get(my_url1, headers)
                        tree = html.fromstring(page.content)
                        tree.xpath("//h1/text()")
                        table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
                        #assert len(table_rows) > 0
                        parsed_rows = []
                        for table_row in table_rows:
                            parsed_row = []
                            el = table_row.xpath("./div")
                            none_count = 0
                            for rs in el:
                                try:
                                    (text,) = rs.xpath('.//span/text()[1]')
                                    parsed_row.append(text)
                                except ValueError:
                                    parsed_row.append(np.NaN)
                                    none_count += 1
                            if (none_count < 4):
                                parsed_rows.append(parsed_row)
                        df_balancesheet = pd.DataFrame(parsed_rows)
                        df_balancesheet = df_balancesheet.set_index('0', drop=True, append=False,inplace=False, verify_integrity=False) ***#ERROR LINE 1***
                        df_balancesheet = df_balancesheet.transpose()
                        cols = list(df_balancesheet.columns)
                        cols[0] = 'Date'
                        df_balancesheet = df_balancesheet.set_axis(cols, axis='columns', inplace=False)
                        numeric_columns = list(df_balancesheet.columns)[1::]
                        #income statement data
                        my_url2 = 'https://finance.yahoo.com/quote/' + str(url) + '/financials?p=' + str(url) 
                        headers = {
                                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                                'Accept-Encoding': 'gzip, deflate, br',
                                'Accept-Language': 'en-US,en;q=0.9',
                                'Cache-Control': 'max-age=0',
                                'Pragma': 'no-cache',
                                'Referrer': 'https://google.com',
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
                        }
                        page = requests.get(my_url2, headers)
                        tree = html.fromstring(page.content)
                        tree.xpath("//h1/text()")
                        table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
                        #assert len(table_rows) > 0
                        parsed_rows = []
                        for table_row in table_rows:
                            parsed_row = []
                            el = table_row.xpath("./div")
                            none_count = 0
                            for rs in el:
                                try:
                                    (text,) = rs.xpath('.//span/text()[1]')
                                    parsed_row.append(text)
                                except ValueError:
                                    parsed_row.append(np.NaN)
                                    none_count += 1
                            if (none_count < 4):
                                parsed_rows.append(parsed_row)
                        df_incomestatement = pd.DataFrame(parsed_rows)
                        df_incomestatement = df_incomestatement.set_index('0', drop=True, append=False,inplace=False, verify_integrity=False) ***#ERROR LINE 2***
                        df_incomestatement = df_incomestatement.transpose()
                        cols = list(df_incomestatement.columns)
                        cols[0] = 'Date'
                        df_incomestatement = df_incomestatement.set_axis(cols, axis='columns', inplace=False)
                        numeric_columns = list(df_incomestatement.columns)[1::]
                        # cash flow data
                        my_url3 = 'https://finance.yahoo.com/quote/' + str(url) + '/cash-flow?p=' + str(url) 
                        headers = {
                                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                                'Accept-Encoding': 'gzip, deflate, br',
                                'Accept-Language': 'en-US,en;q=0.9',
                                'Cache-Control': 'max-age=0',
                                'Pragma': 'no-cache',
                                'Referrer': 'https://google.com',
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
                        }
                        page = requests.get(my_url3, headers)
                        tree = html.fromstring(page.content)
                        tree.xpath("//h1/text()")
                        table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
                        #assert len(table_rows) > 0
                        parsed_rows = []
                        for table_row in table_rows:
                            parsed_row = []
                            el = table_row.xpath("./div")
                            none_count = 0
                            for rs in el:
                                try:
                                    (text,) = rs.xpath('.//span/text()[1]')
                                    parsed_row.append(text)
                                except ValueError:
                                    parsed_row.append(np.NaN)
                                    none_count += 1
                            if (none_count < 4):
                                parsed_rows.append(parsed_row)
                        df_cashflow = pd.DataFrame(parsed_rows)
                        df_cashflow = df_cashflow.set_index('0', drop=True, append=False,inplace=False, verify_integrity=False) ***#ERROR LINE 3***
                        df_cashflow = df_cashflow.transpose()
                        cols = list(df_cashflow.columns)
                        cols[0] = 'Date'
                        df_cashflow = df_cashflow.set_axis(cols, axis='columns', inplace=False)
                        numeric_columns = list(df_cashflow.columns)[1::]
                        writer = pd.ExcelWriter(ticker + '.xlsx')
                        df_incomestatement.to_excel(writer,'Income Statement')
                        df_balancesheet.to_excel(writer,'Balance Sheet')
                        df_cashflow.to_excel(writer,'Statement of Cash Flows')
                        writer.save()
                        print('Collecting data for ' + ticker + '...')
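
For completeness, here is the fetch-and-parse step boiled down to a standalone snippet (with a hardcoded placeholder ticker; 'AAPL' is just an example). Whenever len(table_rows) comes back as 0, parsed_rows stays empty and the set_index call above is what fails:

import requests
import numpy as np
import pandas as pd
from lxml import html

ticker = 'AAPL'  # placeholder ticker, just for illustration
url = 'https://finance.yahoo.com/quote/' + ticker + '/balance-sheet?p=' + ticker
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
tree = html.fromstring(page.content)
table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
print(len(table_rows))  # 0 here means the DataFrame below will have no columns

parsed_rows = []
for table_row in table_rows:
    parsed_row = []
    for rs in table_row.xpath("./div"):
        try:
            (text,) = rs.xpath('.//span/text()[1]')
            parsed_row.append(text)
        except ValueError:
            parsed_row.append(np.NaN)
    parsed_rows.append(parsed_row)

df = pd.DataFrame(parsed_rows)
print(df.columns)  # shows which column labels actually exist before set_index is called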

Thanks in advance if you can shed any light on this.
