Итак, я отметил прямо в коде (в комментариях) некоторые проблемы, которые увидел в опубликованном вами коде.
Некоторые вещи, которые я заметил:
Не обрабатываются случаи, когда ничего не найдено: например, поисковый запрос 'PARIS-SAINT-GERMAIN-FOOTBALL' завершится неудачей, тогда как запрос 'PARIS SAINT GERMAIN FOOTBALL' сработает.
Упущенные возможности для упрощения: например, кадр данных строится циклом по `tr`, а затем по `td`, хотя можно просто применить `read_html` к `table`; используется `find_all`, хотя нужен только один тег `table` или `a`.
Перезапись переменных в циклах, а также опечатки, например:
for tr in table_rows:
    td = tr.find_all('td')
    row = row = [tr.text.strip() for tr in td]  # presumably a typo with row = row
Не проверяется, является ли кадр данных пустым
Риск получить неверный URL при использовании 'https://www.verif.com/': часть, которую вы к нему присоединяете, тоже начинается с "/".
Несогласованное именование: например, что такое `single_item`? Функция, которую я вижу, называется `single_text`.
Это всего лишь некоторые наблюдения, и, безусловно, еще есть возможности для улучшения.
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme, timeout=10):
    """Fetch the verif.com search-results page for a search term.

    Args:
        terme: Search term interpolated into the search URL path.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(terme, html)`` tuple: the term exactly as given and the
        response body as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    # Surface HTTP errors here so callers never parse an error page.
    response.raise_for_status()
    return terme, response.text
def crawl(keyword, delay=5):
    """Return the URL of the first search hit for ``keyword``.

    The search page is fetched with ``searchsport`` and the first anchor
    matching ``td.verif_col1 a[href]`` is resolved against the site root.

    Args:
        keyword: Search term passed to ``searchsport``.
        delay: Seconds to sleep after every attempt, successful or not,
            to throttle consecutive requests.

    Returns:
        The absolute URL of the first hit, or ``''`` when the page has no
        matching anchor or when any exception occurred (the exception is
        printed, not raised).
    """
    from urllib.parse import urljoin

    # Assigned up front so every path out of the function has a value.
    truelink = ''
    try:
        _, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        # Only the first hit is wanted, so select_one instead of a loop.
        a_tag = soup.select_one('td.verif_col1 a[href]')
        if a_tag is not None:
            # urljoin copes with both root-relative and absolute hrefs and
            # never produces a doubled or missing slash.
            truelink = urljoin('https://www.verif.com', a_tag['href'])
    except Exception as e:
        # Best effort: report the failure and fall back to "no result".
        print(e)
        truelink = ''
    finally:
        time.sleep(delay)
    # Returned outside ``finally`` so that non-Exception interrupts such as
    # KeyboardInterrupt are not swallowed.
    return truelink
def single_text(item_url, timeout=10):
    """Download a page and return its first ``.table`` element as a DataFrame.

    Args:
        item_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first DataFrame pandas parses out of the first element carrying
        the CSS class ``table``. An empty DataFrame is returned when no such
        element exists or when it contains no parsable table.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    from io import StringIO

    source_code = requests.get(
        item_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    print('nivo1 ok')
    # The page as HTML, with all of its tags.
    soup = bs(source_code.text, features="lxml")
    print('nivo2 ok')
    # Only the first element with class "table" is of interest.
    table = soup.select_one('.table')
    if table is None:
        return pd.DataFrame()
    try:
        # Literal HTML must be wrapped in a file-like object: passing a raw
        # string to read_html is deprecated in recent pandas releases.
        return pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        # read_html raises ValueError when the markup holds no <table>;
        # treat that the same as "element not found".
        return pd.DataFrame()
def main():
    """Look up each sample term and print its URL and first table row.

    Terms with no search hit are skipped silently; pages without table
    data print only the URL.
    """
    queries = ('PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL')
    for query in queries:
        link = crawl(query)
        if not link:
            # No search hit for this term.
            continue
        print(link)
        frame = single_text(link)
        if frame.empty:
            # Nothing tabular on the page.
            continue
        print(frame.head(1))
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Вариант с возвратом df из main():
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme, timeout=10):
    """Fetch the verif.com search-results page for a search term.

    Args:
        terme: Search term interpolated into the search URL path.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(terme, html)`` tuple: the term exactly as given and the
        response body as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    # Surface HTTP errors here so callers never parse an error page.
    response.raise_for_status()
    return terme, response.text
def crawl(keyword, delay=5):
    """Return the URL of the first search hit for ``keyword``.

    The search page is fetched with ``searchsport`` and the first anchor
    matching ``td.verif_col1 a[href]`` is resolved against the site root.

    Args:
        keyword: Search term passed to ``searchsport``.
        delay: Seconds to sleep after every attempt, successful or not,
            to throttle consecutive requests.

    Returns:
        The absolute URL of the first hit, or ``''`` when the page has no
        matching anchor or when any exception occurred (the exception is
        printed, not raised).
    """
    from urllib.parse import urljoin

    # Assigned up front so every path out of the function has a value.
    truelink = ''
    try:
        _, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        # Only the first hit is wanted, so select_one instead of a loop.
        a_tag = soup.select_one('td.verif_col1 a[href]')
        if a_tag is not None:
            # urljoin copes with both root-relative and absolute hrefs and
            # never produces a doubled or missing slash.
            truelink = urljoin('https://www.verif.com', a_tag['href'])
    except Exception as e:
        # Best effort: report the failure and fall back to "no result".
        print(e)
        truelink = ''
    finally:
        time.sleep(delay)
    # Returned outside ``finally`` so that non-Exception interrupts such as
    # KeyboardInterrupt are not swallowed.
    return truelink
def single_text(item_url, timeout=10):
    """Download a page and return its first ``.table`` element as a DataFrame.

    Args:
        item_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first DataFrame pandas parses out of the first element carrying
        the CSS class ``table``. An empty DataFrame is returned when no such
        element exists or when it contains no parsable table.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    from io import StringIO

    source_code = requests.get(
        item_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    print('nivo1 ok')
    # The page as HTML, with all of its tags.
    soup = bs(source_code.text, features="lxml")
    print('nivo2 ok')
    # Only the first element with class "table" is of interest.
    table = soup.select_one('.table')
    if table is None:
        return pd.DataFrame()
    try:
        # Literal HTML must be wrapped in a file-like object: passing a raw
        # string to read_html is deprecated in recent pandas releases.
        return pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        # read_html raises ValueError when the markup holds no <table>;
        # treat that the same as "element not found".
        return pd.DataFrame()
def main():
    """Look up each sample term and return the table of the last hit.

    Returns:
        The DataFrame from ``single_text`` for the last term that produced
        a URL, or an empty DataFrame when no term produced one.
    """
    terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']
    # Start from an empty frame so the return value is always defined,
    # even when every search comes back without a hit.
    df = pd.DataFrame()
    for term in terms:
        item_url = crawl(term)
        if item_url:
            df = single_text(item_url)
    return df
# Run the lookup and show the resulting frame only when executed as a script.
if __name__ == '__main__':
    df = main()
    print(df)