Веб-скрейпинг сайта remax.com - PullRequest
2 голоса
/ 25 марта 2019

Я пытаюсь собрать с Remax.com некоторые данные, например размер участка или площадь объекта недвижимости в квадратных футах. Однако я получаю следующие ошибки:

---------------------------------------------------------------------------
Error                                     Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    440             try:
--> 441                 cnx.do_handshake()
    442             except OpenSSL.SSL.WantReadError:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in do_handshake(self)
   1715         result = _lib.SSL_do_handshake(self._ssl)
-> 1716         self._raise_ssl_error(self._ssl, result)
   1717 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
   1455         else:
-> 1456             _raise_current_error()
   1457 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\_util.py in exception_from_error_queue(exception_type)
     53 
---> 54     raise exception_type(errors)
     55 

Error: [('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')]

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    345         try:
--> 346             self._validate_conn(conn)
    347         except (SocketTimeout, BaseSSLError) as e:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
    849         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 850             conn.connect()
    851 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    325             server_hostname=hostname,
--> 326             ssl_context=context)
    327 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    328     if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
--> 329         return context.wrap_socket(sock, server_hostname=server_hostname)
    330 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    447             except OpenSSL.SSL.Error as e:
--> 448                 raise ssl.SSLError('bad handshake: %r' % e)
    449             break

SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    638             retries = retries.increment(method, url, error=e, _pool=self,
--> 639                                         _stacktrace=sys.exc_info()[2])
    640             retries.sleep()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    387         if new_retry.is_exhausted():
--> 388             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    389 

MaxRetryError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
<ipython-input-22-bcfdfdfb0a4e> in <module>()
----> 1 get_info('119 S IRENA AVE B, Redondo Beach, CA 90277')

<ipython-input-21-f3c942a87400> in get_info(address)
     32         }
     33 #         proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
---> 34         req_properties = requests.get("https://www.remax.com/api/listings", params=params)
     35         matching_properties_json = req_properties.json()
     36         for p in matching_properties_json[0]:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    504             if isinstance(e.reason, _SSLError):
    505                 # This branch is for urllib3 v1.22 and later.
--> 506                 raise SSLError(e, request=request)
    507 
    508             raise ConnectionError(e, request=request)

SSLError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

Вот мой код:

import urllib
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests

# Module-level geocoder shared by get_dir(); the timeout is explicitly set to None.
# NOTE(review): no user_agent is passed — confirm the installed geopy version accepts that.
geolocator = Nominatim(timeout=None)
def get_dir(address):
    """Geocode *address* and return corner coordinates from nearby geohash cells.

    The address is looked up with the module-level ``geolocator``, encoded as
    a geohash with precision 7, and two neighbouring cells are decoded.

    Returns a ``(nwlat, nwlon, selat, selon)`` tuple.
    """
    location = geolocator.geocode(address)
    lat = location.latitude
    lng = location.longitude
    # Encode the point with precision 7 and fetch the neighbouring cells.
    h = geolib.geohash.encode(lat, lng, 7)
    hashes = geolib.geohash.neighbours(h)
    NW = geohash.decode(hashes.nw)
    # NOTE(review): this decodes the north-east neighbour (`ne`), yet the value
    # is returned as the south-east corner — `hashes.se` looks intended.
    SE = geohash.decode(hashes.ne)
    nwlat = NW.lat
    nwlon = NW.lon
    selat = SE.lat
    selon = SE.lon
    return nwlat, nwlon, selat, selon

    def get_info(address):
    try:
        nwlat, nwlon, selat, selon = get_dir(address)
        params = {
        "nwlat" : nwlat,        
        "nwlong" : nwlon,     
        "selat" : selat,     
        "selong" : selon,   
        "Count" : 100,
        "pagenumber" : 1,
        "SiteID" : "68000000",
        "pageCount" : "10",
        "tab" : "map",
        "sh" : "true",
        "forcelatlong" : "true",
        "maplistings" : "1",
        "maplistcards" : "0",
        "sv" : "true",
        "sortorder" : "newest",
        "view" : "homeestimates",
        }
        proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
        req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)
        matching_properties_json = req_properties.json()
        for p in matching_properties_json[0]:
            print(f"{p['Address']:<40}  {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms',0))} baths | {p['SqFt']} sqft")
    except (AttributeError):
        return 'NaN'

# Example run for a sample address. get_info() prints the listings itself,
# so x only holds its return value (None, or 'NaN' on AttributeError).
x = get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')
print(x)

Я не уверен, как исправить эту проблему, поскольку я новичок в веб-скрейпинге. Я пытался добавить прокси в код, но по-прежнему получаю те же ошибки, что и в последней трассировке выше.

Обновление:

добавление

# NOTE(review): the proxy mapping only has an 'http' entry while the URL below
# is https:// — confirm the proxy is meant to apply to this request.
proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
# verify=False switches off TLS certificate verification for this call.
req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)

не даёт ошибки, но и ничего не выводит.

1 Ответ

3 голоса
/ 26 марта 2019

Похоже, есть ряд проблем:

  1. Прокси-сервер не является проблемой, поскольку вы сказали, что предыдущий вопрос работает без необходимости его настройки.

  2. Ваш вызов geohash.decode(hashes.ne) использует ne вместо se.

  3. По полученным координатам не находится ни одного подходящего объекта недвижимости; в этом случае API, по-видимому, возвращает ответ другого типа, который не содержит требуемых значений. Однако цена в нём есть.

  4. Убедитесь, что в вызове requests.get задан параметр verify=False. Предупреждение, которое при этом выводится, можно подавить.

Если немного увеличить квадрат поиска, API возвращает результаты:

import urllib
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests


# Disable the certificate warning (requests below are sent with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Nominatim's usage policy asks for an identifying User-Agent, and geopy 2.x
# raises ConfigurationError when the default one is used. timeout=None is
# kept from the original so geocoding calls are not time-limited.
geolocator = Nominatim(user_agent="remax-listing-lookup", timeout=None)


def get_dir(address):
    """Return the decoded geohash cells lying north-west and south-east of *address*.

    The address is geocoded with the module-level ``geolocator`` and encoded
    as a geohash with precision 7; the ``nw`` and ``se`` neighbours of that
    cell are then decoded.

    Returns a ``(NW, SE)`` tuple of decoded points. If the geocoder returns
    ``None``, reading its coordinates raises ``AttributeError``.
    """
    place = geolocator.geocode(address)
    cell = geohash.encode(place.latitude, place.longitude, 7)
    around = geohash.neighbours(cell)
    return geohash.decode(around.nw), geohash.decode(around.se)


def get_info(address, *, square_size=0.001, timeout=30):
    """Print the listings the RE/MAX API returns for the area around *address*.

    Args:
        address: Free-form street address to geocode.
        square_size: Margin, in degrees, by which the search square obtained
            from ``get_dir`` is widened on every side.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``None`` after printing one line per listing, or the string ``'NaN'``
        when ``get_dir`` raises ``AttributeError`` (presumably because the
        geocoder found no match for the address).
    """
    # Only the geocoding step is guarded, so unrelated AttributeErrors are
    # not silently turned into 'NaN'.
    try:
        NW, SE = get_dir(address)
    except AttributeError:
        return 'NaN'

    params = {
        "nwlat" : float(NW.lat) + square_size,
        "nwlong" : float(NW.lon) - square_size,
        "selat" : float(SE.lat) - square_size,
        "selong" : float(SE.lon) + square_size,
        "Count" : 100,
        "pagenumber" : 1,
        "SiteID" : "68000000",
        "pageCount" : "10",
        "tab" : "map",
        "sh" : "true",
        "forcelatlong" : "true",
        "maplistings" : "1",
        "maplistcards" : "0",
        "sv" : "true",
        "sortorder" : "newest",
        "view" : "homeestimates",
    }

    # NOTE: verify=False disables TLS certificate verification; it is kept
    # because it is the workaround for the "certificate verify failed" error.
    # The timeout prevents the call from blocking indefinitely.
    req_properties = requests.get(
        "https://www.remax.com/api/listings",
        params=params,
        verify=False,
        timeout=timeout,
    )
    matching_properties_json = req_properties.json()

    for p in matching_properties_json[0]:
        # Separate name so the `address` argument is not overwritten.
        full_address = f"{p['Address']}, {p['City']}, {p['State']}, {p['Zip']}"

        try:
            # `or 0` also covers a BathRooms value that is present but empty.
            print(f"  {full_address:<50} | {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms') or 0)} baths | {p['SqFt']} sqft")
        except KeyError:
            # Entries without 'SqFt' are reported with their price instead.
            print(f"None found - {full_address} - ${p['PriceFormatted']}")

# Example run for a sample address; get_info() prints the listings itself.
get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')

Отображается:

  1566 Glenneyre Street, Laguna Beach, CA, 92651     | 0 beds | 0 baths |  sqft
  1585 S Coast 4, Laguna Beach, CA, 92651            | 3 beds | 2 baths | 1448 sqft
  429 Shadow Lane, Laguna Beach, CA, 92651           | 2 beds | 2 baths | 1102 sqft
  243 Calliope Street 1, Laguna Beach, CA, 92651     | 2 beds | 2 baths | 1350 sqft        
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...