Creating exceptions for errors in a Python web-scraping script
1 vote
/ 07 August 2020
My script generally works fine. I usually leave it running overnight, when the website responds faster and more reliably, but whenever it crashes I lose a lot of time.

I want to improve the code with some error handling so that it keeps checking whether the internet connection is working and moves on to the next link once it is back, instead of crashing. Does anyone know how to implement this?

Here is my Python code:

# -*- coding: utf-8 -*-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from tqdm import tqdm

import datetime
import requests
import pandas
import os


class SigefRequests:
    """Class responsible for accessing, extracting and parsing sigef
    information into a csv file.

    The output file will be at ./data/outputs

    """
    def __init__(self, path):
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        self.url_list = self.reading_url_file(path)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) "
                          "Gecko/20100101 Firefox/54.0",
            "Connection": "close",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/"
                      ";q=0.8",
            "Upgrade-Insecure-Requests": "1"
        }
        self.session = requests.session()
        self.data = {
            'código': [],
            'denominação': [],
            'área': [],
            'data de entrada': [],
            'situação': [],
            'responsável técnico': [],
            'ART': [],
            'envio': [],
            'requerimento': [],
            'status': [],
            'data': [],
            'nome': [],
            'cpf/cnpj': [],
            'situação - georreferência': [],
            'natureza': [],
            'número de parcelas': [],
            'municípios': [],
            'código do imóvel': [],
            'shp - polígono': [],
            'shp - vértices': [],
            'shp - limites': [],
            'kml - polígono': [],
            'kml - vértices': [],
            'kml - limites': [],
            'csv - polígono': [],
            'csv - vértices': [],
            'csv - limites': [],
        }

        self.export_list = [
            "https://sigef.incra.gov.br/geo/exportar/parcela/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/csv/{}"
        ]

    # Used in __init__
    @staticmethod
    def reading_url_file(path):
        """This function reads the links.txt file and return a links list.

        Parameters
        ----------
        path : str
            The path to links.txt file.
            (By default this file is in data folder).

        Returns
        -------
        url_list : list
            The list of URLs read from the file.

        """
        return open(
            os.path.abspath('../' + path)
        ).readlines()

    # Used in __call__
    def requesting(self, url):
        """This function makes a GET requisition into the given sigef url.

        Parameters
        ----------
        url : str
            Sigef's URL.

        Returns
        -------
        response : requests.models.Response
            The GET request response.

        """
        return self.session.get(url, verify=False, headers=self.headers)

    # Used in __call__
    @staticmethod
    def soup(html):
        """This function parses the html.

        Parameters
        ----------
        html : requests.models.Response
            Unparsed html.

        Returns
        -------
        parsed_html : bs4.BeautifulSoup
            Parsed html.

        """
        return BeautifulSoup(html.content, 'html5lib')

    # Used in __call__
    def filtering_content(self, html):
        """This function filters the page content and looks for the relevant
        data.

        Parameters
        ----------
        html : bs4.BeautifulSoup
            Parsed html.

        Returns
        -------

        """
        tables = html.find_all('table', {
            'class': 'table table-hover tabela-atributos'
        })

        tables_ = [tables[0], tables[1], tables[2], tables[-1]]

        content_list = []
        for table in tables_:
            for row in table.find_all('td'):
                content_list.append((row.text.strip()))

        content_list.pop(content_list.index('Envio'))

        if 'Nenhum requerimento' in content_list:
            content_list.insert(9, '-')
            content_list.insert(9, '-')

        names = []
        for row in tables[3].find_all('th'):
            names.append(row.text)

        table_3_content = []
        for row in tables[3].find_all('td'):
            table_3_content.append(row.text.strip())

        content_list.append(table_3_content[1])
        content_list.append(table_3_content[2])
        content_list.append(table_3_content[names.index('Número parcelas')])
        content_list.append(table_3_content[-1])

        try:
            content_list.append(table_3_content[names.index(
                'Código do Imóvel (SNCR/INCRA)')])
        except ValueError:
            content_list.append('-')

        for elem in self.export_list:
            content_list.append(elem.format(content_list[0]))

        for elem in content_list:
            if u'\u2013' in elem:
                content_list[content_list.index(elem)] = \
                    elem.replace(u'\u2013', '-')

        for key, value in zip(self.data.keys(), content_list):
            self.data.get(key).append(value)

        self.parsing_to_csv()

    # Used in filtering_content
    def parsing_to_csv(self):
        """This function parses the acquired data into a csv file.

        Returns
        -------

        """
        pandas.DataFrame(self.data).set_index('código').to_csv(os.path.abspath(
            '../data/outputs/sigef-{}.csv'.format(datetime.date.today())),
            encoding='latin-1', sep=';'
        )

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            self.filtering_content(self.soup(self.requesting(url)))


if __name__ == '__main__':
    SigefRequests(r'data\links.txt').__call__()

Here is an example of the error I get when it stops working:

(env) D:\Documentos\LAGESA\Programas\Scraper\up3\sigef-crawler\src>python crawler.py
 12%|█████████▎                                                                   | 543/4493 [1:59:07<14:26:33, 13.16s/it]
Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "C:\Users\joaop\AppData\Local\Programs\Python\Python38\lib\socket.py", line 918, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
    conn.connect()
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 308, in connect
    conn = self._new_conn()
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
    retries = retries.increment(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\retry.py", line 439, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "crawler.py", line 212, in <module>
    SigefRequests(r'data\links.txt').__call__()
  File "crawler.py", line 208, in __call__
    self.filtering_content(self.soup(self.requesting(url)))
  File "crawler.py", line 110, in requesting
    return self.session.get(url, verify=False, headers=self.headers)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 543, in get
    return self.request('GET', url, **kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 516, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

Thanks in advance for your help!

1 Answer

0 votes
/ 26 August 2020

Hi João, in Python you can use a try statement to keep the program running when it hits a specific error.

Here is an example:

string = "string"

try:
    print(int(string))
except ValueError:
    print("it didn't work")

Without the try and except you would get:

Traceback (most recent call last):
  File "C:\Users\jojop\OneDrive\Desktop\python.py", line 4, in <module>
    print(int(string))
ValueError: invalid literal for int() with base 10: 'string'

The message shows the error type you can catch in the except clause, in this case ValueError.
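Applied to your crawler, here is a minimal sketch of the same idea (the retry count of 3 and the 30-second wait are arbitrary placeholders, and you would also need import time at the top of crawler.py). It replaces SigefRequests.__call__ so that a dropped connection is retried a few times and the link is then skipped instead of crashing, since the traceback above ends in requests.exceptions.ConnectionError:

def __call__(self, *args, **kwargs):
    for url in tqdm(self.url_list):
        for attempt in range(3):  # placeholder: try each link up to 3 times
            try:
                self.filtering_content(self.soup(self.requesting(url)))
                break  # success, move on to the next link
            except requests.exceptions.ConnectionError:
                time.sleep(30)  # placeholder: wait for the connection to come back
        else:
            # every attempt failed, skip this link instead of crashing
            print('skipping {}'.format(url.strip()))

That way a dropped connection only costs a few retries on the current link rather than the whole night's run.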
