It works fine overall. I usually leave it running overnight, when the website's responses are faster and more stable. The problem is that when my internet connection drops, the script crashes and I lose a lot of time.
I want to improve the code with some error handling so that it keeps checking whether the internet connection is working and, once it is back, moves on to the next link instead of crashing. Does anyone know how to implement this?
This is my Python code:
# -*- coding: utf-8 -*-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from tqdm import tqdm
import datetime
import requests
import pandas
import os


class SigefRequests:
    """Class responsible for accessing, extracting and parsing sigef
    information into a csv file.

    The output file will be at ./data/outputs
    """

    def __init__(self, path):
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        self.url_list = self.reading_url_file(path)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) "
                          "Gecko/20100101 Firefox/54.0",
            "Connection": "close",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                      "*/*;q=0.8",
            "Upgrade-Insecure-Requests": "1"
        }
        self.session = requests.session()
        self.data = {
            'código': [],
            'denominação': [],
            'área': [],
            'data de entrada': [],
            'situação': [],
            'responsável técnico': [],
            'ART': [],
            'envio': [],
            'requerimento': [],
            'status': [],
            'data': [],
            'nome': [],
            'cpf/cnpj': [],
            'situação - georreferência': [],
            'natureza': [],
            'número de parcelas': [],
            'municípios': [],
            'código do imóvel': [],
            'shp - polígono': [],
            'shp - vértices': [],
            'shp - limites': [],
            'kml - polígono': [],
            'kml - vértices': [],
            'kml - limites': [],
            'csv - polígono': [],
            'csv - vértices': [],
            'csv - limites': [],
        }
        self.export_list = [
            "https://sigef.incra.gov.br/geo/exportar/parcela/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/csv/{}"
        ]

    # Used in __init__
    @staticmethod
    def reading_url_file(path):
        """This function reads the links.txt file and returns a list of links.

        Parameters
        ----------
        path : str
            The path to the links.txt file.
            (By default this file is in the data folder).

        Returns
        -------
        url_list : iterator
            The list of links.
        """
        return open(
            os.path.abspath('../' + path)
        ).readlines()

    # Used in __call__
    def requesting(self, url):
        """This function makes a GET request to the given sigef url.

        Parameters
        ----------
        url : str
            Sigef's URL.

        Returns
        -------
        response : requests.models.Response
            The GET request response.
        """
        return self.session.get(url, verify=False, headers=self.headers)

    # Used in __call__
    @staticmethod
    def soup(html):
        """This function parses the html.

        Parameters
        ----------
        html : requests.models.Response
            Unparsed html.

        Returns
        -------
        parsed_html : bs4.BeautifulSoup
            Parsed html.
        """
        return BeautifulSoup(html.content, 'html5lib')

    # Used in __call__
    def filtering_content(self, html):
        """This function filters the page content and looks for the relevant
        data.

        Parameters
        ----------
        html : bs4.BeautifulSoup
            Parsed html.

        Returns
        -------
        """
        tables = html.find_all('table', {
            'class': 'table table-hover tabela-atributos'
        })
        tables_ = [tables[0], tables[1], tables[2], tables[-1]]
        content_list = []
        for table in tables_:
            for row in table.find_all('td'):
                content_list.append((row.text.strip()))
        content_list.pop(content_list.index('Envio'))
        if 'Nenhum requerimento' in content_list:
            content_list.insert(9, '-')
            content_list.insert(9, '-')
        names = []
        for row in tables[3].find_all('th'):
            names.append(row.text)
        table_3_content = []
        for row in tables[3].find_all('td'):
            table_3_content.append(row.text.strip())
        content_list.append(table_3_content[1])
        content_list.append(table_3_content[2])
        content_list.append(table_3_content[names.index('Número parcelas')])
        content_list.append(table_3_content[-1])
        try:
            content_list.append(table_3_content[names.index(
                'Código do Imóvel (SNCR/INCRA)')])
        except ValueError:
            content_list.append('-')
        for elem in self.export_list:
            content_list.append(elem.format(content_list[0]))
        for elem in content_list:
            if u'\u2013' in elem:
                content_list[content_list.index(elem)] = \
                    elem.replace(u'\u2013', '-')
        for key, value in zip(self.data.keys(), content_list):
            self.data.get(key).append(value)
        self.parsing_to_csv()

    # Used in filtering_content
    def parsing_to_csv(self):
        """This function parses the acquired data into a csv file.

        Returns
        -------
        """
        pandas.DataFrame(self.data).set_index('código').to_csv(os.path.abspath(
            '../data/outputs/sigef-{}.csv'.format(datetime.date.today())),
            encoding='latin-1', sep=';'
        )

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            self.filtering_content(self.soup(self.requesting(url)))


if __name__ == '__main__':
    SigefRequests(r'data\links.txt').__call__()
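One thing I have already considered is mounting automatic retries on the session in __init__ (a rough, untested sketch; the total, backoff_factor and status_forcelist values below are placeholders I made up). As far as I understand, this only retries a few times over a few seconds, so it would probably still crash if the connection stays down for longer:

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Sketch: retry transient failures at the transport level before giving up.
# The numbers here are arbitrary placeholders, not tuned values.
retry = Retry(total=5, backoff_factor=2,
              status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
self.session.mount('https://', adapter)  # inside __init__, after creating the session
self.session.mount('http://', adapter)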
Here is an example of the error I get when it stops working:
(env) D:\Documentos\LAGESA\Programas\Scraper\up3\sigef-crawler\src>python crawler.py
12%|█████████▎ | 543/4493 [1:59:07<14:26:33, 13.16s/it]
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\joaop\AppData\Local\Programs\Python\Python38\lib\socket.py", line 918, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
conn.connect()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 308, in connect
conn = self._new_conn()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
retries = retries.increment(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\retry.py", line 439, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "crawler.py", line 212, in <module>
SigefRequests(r'data\links.txt').__call__()
File "crawler.py", line 208, in __call__
self.filtering_content(self.soup(self.requesting(url)))
File "crawler.py", line 110, in requesting
return self.session.get(url, verify=False, headers=self.headers)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 543, in get
return self.request('GET', url, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
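What I have in mind is something like the sketch below for __call__: catch requests.exceptions.ConnectionError, wait, and retry the same URL until the connection is back, then move on to the next link. The 60-second delay is an arbitrary value I picked. Is this a reasonable approach, or is there a better way?

import time

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            while True:
                try:
                    response = self.requesting(url)
                    break  # connection worked, stop retrying this URL
                except requests.exceptions.ConnectionError:
                    # Internet/DNS failure: wait and retry the same URL
                    # instead of crashing (60 s is an arbitrary delay).
                    time.sleep(60)
            self.filtering_content(self.soup(response))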
Thanks in advance for your help!