Selenium ConnectionRefusedError: [WinError 10061] Невозможно установить соединение, так как целевая машина активно отказала ему - PullRequest
0 голосов
/ 13 апреля 2020

У меня есть сценарий python, который используется для очистки изображений из Google. Чтобы запустить скрипт, вам нужно создать файл с именем imgsearch_list.txt, в котором вы должны передать список в подобный кошке, собаке, чтобы искать этот список в Google.

Когда я запускаю скрипт, он выдает ошибку.

Вы можете увидеть код:

import re, os, sys, datetime, time
import pandas
from selenium import webdriver
from contextlib import closing
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM

class GoogleImageExtractor(object):

    def __init__(self, search_key = '' ):
        """ Google image search class
            Args:
                search_key to be entered.

        """
        if type(search_key) == str:
            ## convert to list even for one search keyword to standalize the pulling.
            self.g_search_key_list = [search_key]
        elif type(search_key) == list:
            self.g_search_key_list = search_key
        else:
            print ('google_search_keyword not of type str or list')
            raise

        self.g_search_key = ''

        ## user options
        self.image_dl_per_search = 200

        ## url construct string text
        self.prefix_of_search_url = "https://www.google.com.sg/search?q="
        self.postfix_of_search_url = '&source=lnms&tbm=isch&sa=X&ei=0eZEVbj3IJG5uATalICQAQ&ved=0CAcQ_AUoAQ&biw=939&bih=591'# non changable text
        self.target_url_str = ''

        ## storage
        self.pic_url_list = []
        self.pic_info_list = []

        ## file and folder path
        self.folder_main_dir_prefix = r'C:\Users\intel\Desktop\Scrappr'

    def reformat_search_for_spaces(self):
        """
            Method call immediately at the initialization stages
            get rid of the spaces and replace by the "+"
            Use in search term. Eg: "Cookie fast" to "Cookie+fast"

            steps:
            strip any lagging spaces if present
            replace the self.g_search_key
        """
        self.g_search_key = self.g_search_key.rstrip().replace(' ', '+')

    def set_num_image_to_dl(self, num_image):
        """ Set the number of image to download. Set to self.image_dl_per_search.
            Args:
                num_image (int): num of image to download.
        """
        self.image_dl_per_search = num_image

    def get_searchlist_fr_file(self, filename):
        """Get search list from filename. Ability to add in a lot of phrases.
            Will replace the self.g_search_key_list
            Args:
                filename (str): full file path
        """
        with open(filename,'r') as f:
            self.g_search_key_list = f.readlines()

    def formed_search_url(self):
        ''' Form the url either one selected key phrases or multiple search items.
            Get the url from the self.g_search_key_list
            Set to self.sp_search_url_list
        '''
        self.reformat_search_for_spaces()
        self.target_url_str = self.prefix_of_search_url + self.g_search_key +\
                                self.postfix_of_search_url

    def retrieve_source_fr_html(self):
        """ Make use of selenium. Retrieve from html table using pandas table.

        """
        driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe")
        driver.get(self.target_url_str)

        ## wait for log in then get the page source.
        try:
            driver.execute_script("window.scrollTo(0, 30000)")
            time.sleep(2)
            self.temp_page_source = driver.page_source
            #driver.find_element_by_css_selector('ksb _kvc').click()#cant find the class
            driver.find_element_by_id('smb').click() #ok
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, 60000)")
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, 60000)")

        except:
            print ('not able to find')
            driver.quit()

        self.page_source = driver.page_source

        driver.close()

    def extract_pic_url(self):
        """ extract all the raw pic url in list

        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')

        for tag in tag_list[:self.image_dl_per_search]:
            tar_str = re.search('imgurl=(.*)&imgrefurl', tag.attributes['href'])
            try:
                self.pic_url_list.append(tar_str.group(1))
            except:
                print ('error parsing', tag)

    def multi_search_download(self):
        """ Mutli search download"""
        for indiv_search in self.g_search_key_list:
            self.pic_url_list = []
            self.pic_info_list = []

            self.g_search_key = indiv_search

            self.formed_search_url()
            self.retrieve_source_fr_html()
            self.extract_pic_url()
            self.downloading_all_photos() #some download might not be jpg?? use selnium to download??
            self.save_infolist_to_file()

    def downloading_all_photos(self):
        """ download all photos to particular folder

        """
        self.create_folder()
        pic_counter = 1
        for url_link in self.pic_url_list:
            print (pic_counter)
            pic_prefix_str = self.g_search_key  + str(pic_counter)
            self.download_single_image(url_link.encode(), pic_prefix_str)
            pic_counter = pic_counter +1

    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename )

        valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive

        url = URL(url_link)
        if url.redirect:
            return # if there is re-direct, return

        if file_ext not in valid_image_ext_list:
            return #return if not valid image extension

        f = open(temp_filename_full_path, 'wb') # save as test.gif
        print (url_link)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link )
        try:
            f.write(url.download())#if have problem skip
        except:
            #if self.__print_download_fault:
            print ('Problem with processing this data: ', url_link)
            self.download_fault =1
        f.close()

    def create_folder(self):
        """
            Create a folder to put the log data segregate by date

        """
        self.gs_raw_dirpath = os.path.join(self.folder_main_dir_prefix, time.strftime("_%d_%b%y", time.localtime()))
        if not os.path.exists(self.gs_raw_dirpath):
            os.makedirs(self.gs_raw_dirpath)

    def save_infolist_to_file(self):
        """ Save the info list to file.

        """
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key + '_info.txt' )

        with  open(temp_filename_full_path, 'w') as f:
            for n in self.pic_info_list:
                f.write(n)
                f.write('\n')

if __name__ == '__main__':

    choice =4

    if choice ==4:
        """test the downloading of files"""
        w = GoogleImageExtractor('')#leave blanks if get the search list from file
        searchlist_filename = r'C:\Users\intel\Desktop\Scrappr\imgsearch_list.txt'
        w.set_num_image_to_dl(200)
        w.get_searchlist_fr_file(searchlist_filename)#replace the searclist
        w.multi_search_download()


Вот ошибка:

not able to find
Traceback (most recent call last):
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 156, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1230, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1276, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1225, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1004, in _send_output
    self.send(msg)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 944, in send
    self.connect()
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 184, in connect
    conn = self._new_conn()
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x0000000007017520>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:/Users/intel/Desktop/go.py", line 211, in <module>
    w.multi_search_download()
  File "c:/Users/intel/Desktop/go.py", line 133, in multi_search_download
    self.retrieve_source_fr_html()
  File "c:/Users/intel/Desktop/go.py", line 106, in retrieve_source_fr_html
    self.page_source = driver.page_source
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
    return self.execute(Command.GET_PAGE_SOURCE)['value']
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 319, in execute
    response = self.command_executor.execute(driver_command, params)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 374, in execute
    return self._request(command_info[0], url, body=data)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 397, in _request
    resp = self._conn.request(method, url, body=body, headers=headers)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\request.py", line 75, in request
    return self.request_encode_url(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\request.py", line 97, in request_encode_url
    return self.urlopen(method, url, **extra_kw)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\poolmanager.py", line 330, in urlopen
    response = conn.urlopen(method, u.request_uri, **kw)
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 747, in urlopen
    return self.urlopen(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 747, in urlopen
    return self.urlopen(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 747, in urlopen
    return self.urlopen(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 719, in urlopen
    retries = retries.increment(
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\retry.py", line 436, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=50181): Max retries exceeded with url: /session/a473cdecf0cbd7a585ac13d08f156b4a/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000000007017520>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

Любая помощь будет оценил ...

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...