Как перебрать все страницы сайта? - PullRequest
1 голос
/ 18 апреля 2020

Я собираю с веб-сайта имена, адреса электронной почты, телефоны и адреса терапевтов. Данные с первой страницы я получил, но не могу перейти на остальные страницы (пагинация). Я использую requests и BeautifulSoup.

Веб-сайт Здесь

Код для первой страницы:

import requests
from bs4 import BeautifulSoup as bs

# Running count of therapist rows seen; the result loop further down increments
# it and uses it to number the printed records.
count = 0

# Cookies sent with the search POST.
# NOTE(review): the session id and anti-forgery token look like values captured
# from a single browser session -- presumably they expire; confirm before reuse.
cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}

# HTTP headers sent with the search POST. 'X-Requested-With' and
# 'X-MicrosoftAjax' are included, presumably so the server treats the request
# like the browser's asynchronous postback -- TODO confirm.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}


# Query-string parameters appended to the search URL: ZIP code 10001 and an
# empty name filter.
params = (
    ('zip', '10001'),
    ('name', ''),
)

# Form body of the search POST.
# '__EVENTTARGET' names the btnFilter control, and the txtPOSTALCODE0 /
# ddlDISTANCE0 / txtName_TL0 fields carry the search criteria (ZIP 10001,
# distance 5, empty name).
# NOTE(review): 'PageInstanceKey', '__RequestVerificationToken' and
# '__VIEWSTATE' are hard-coded here -- presumably copied from one page load and
# tied to that session; confirm they do not need to be re-read from a fresh GET.
data = {
  'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__WPPS': 's',
  '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
  '__CTRLKEY': '',
  '__SHIFTKEY': '',
  'ctl01_ScriptManager1_TSM': '',
  'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
  '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
  'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
  'PageIsDirty': 'false',
  'IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan': '1',
  'IsControlPostBackctl01$SearchField': '1',
  '__EVENTTARGET': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__EVENTARGUMENT': '',
  'NavMenuClientID': 'ctl01_Primary_NavMenu',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage1': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage2': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage3': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
  'IsControlPostBackctl01$FooterCopyright$FooterCopyright': '1',
  'IsControlPostBackctl01$FooterCopyright$tosol': '1',
  '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
  '__VIEWSTATEGENERATOR': '37E773F2',
  'ctl01$lastClickedElementId': '',
  'ctl01$SearchField$SearchTerms': 'Keyword Search',
  'ctl01_Primary_NavMenu_ClientState': '',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0': '10001',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0': '5',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0': '',
  '__ASYNCPOST': 'true',
  # NOTE(review): empty key/value pair -- looks like an artifact of converting a
  # captured request into code; verify whether the server needs it.
  '': ''
}

# Submit the search form; the response carries the first page of results.
response = requests.post(
    'https://www.therapistlocator.net/tl/therapist-finder.aspx',
    headers=headers, params=params, cookies=cookies, data=data)

# Parse the decoded body. Parsing str(response.content) would feed the parser
# the repr of a bytes object (b'...' with literal \r\n and \' escape
# sequences), which is what previously forced backslashes and quotes to be
# stripped out of every href by hand.
con = bs(response.text, 'lxml')

wrapper = con.find('div', class_='QueryDisplayWrapper')
if wrapper is None:
    # Report the problem instead of failing with AttributeError on None.
    print('No results container found (HTTP status {})'.format(
        response.status_code))
    therapists = []
else:
    therapists = wrapper.find_all('div', class_='row')

for therapist in therapists:
    count += 1

    # Look the name cell up once; rows without a profile link are skipped.
    name_div = therapist.find('div', class_='item name')
    name_link = name_div.find('a') if name_div is not None else None
    if name_link is None or not name_link.get('href'):
        continue

    name = name_link.text.strip()
    therapist_link = 'https://www.therapistlocator.net{}'.format(
        name_link.get('href'))

    therapist_info = requests.get(therapist_link)
    if not therapist_info.ok:
        continue

    dataa = bs(therapist_info.text, 'lxml')
    try:
        email = dataa.find('a', class_='PanelField').text.strip()

        fields = dataa.find_all('div', class_='PanelFieldValue')
        # The address lines are separated by <br>; joining the text fragments
        # with a space keeps the lines from running together.
        location = fields[0].find('span').get_text(' ', strip=True)
        phone = fields[1].find('span').text.strip()
    except (AttributeError, IndexError):
        # A profile without an e-mail link or without the expected fields is
        # skipped, but visibly rather than silently.
        print('Skipping {}: profile page is missing an expected field'.format(
            name))
        continue

    print('\n*********** '+str(count)+' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))

Остальные страницы, кажется, имеют один и тот же URL, поэтому я не смог перебрать их все.

На каждой странице 25 записей. Мне нужно получить их все.

Пример вывода для каждой записи:

Name: Marya B . Slater
Email: nycitytherapist@gmail.com
Phone: (646) 265-1555
Location: 360 W 34th St Apt 5P New York, NY  10001-2407

Ответы [ 2 ]

1 голос
/ 19 апреля 2020
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import pandas as pd

# Names of the form fields whose values identify the paging link being
# "clicked"; main() rewrites the value of each of these between page requests.
fish = ["ctl01$ScriptManager1", "ctl01$lastClickedElementId", "__EVENTTARGET"]

# Form body template for the paging POST. As written it targets the
# lnkFirstPage control with 25 results per page, ZIP 10001 and distance 5.
# The per-session fields ('ctl01_ScriptManager1_TSM', '__VIEWSTATE',
# 'PageInstanceKey', '__RequestVerificationToken') are not listed here; main()
# fills them in from the live search page.
data = {
    'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage',
    '__WPPS': 's',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
    'ctl01$lastClickedElementId': 'id|ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_lnkFirstPage',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    "ctl01_Primary_NavMenu_ClientState": "",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0": "10001",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0": "5",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0": "",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlResultsPerPage": "25",
    "ctl01_GenericWindow_ClientState": "",
    "ctl01_ObjectBrowser_ClientState": "",
    "ctl01_ObjectBrowserDialog_ClientState": "",
    "ctl01_WindowManager1_ClientState": "",
    "__EVENTTARGET": "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATEGENERATOR": "37E773F2",
    "__ClientContext": "{\"baseUrl\":\"/\",\"isAnonymous\":true,\"loggedInPartyId\":\"132791\",\"selectedPartyId\":\"132791\",\"websiteRoot\":\"http://www.therapistlocator.net/\",\"virtualDir\":\"\"}",
    "TemplateUserMessagesID": "ctl01_TemplateUserMessages_ctl00_Messages",
    "PageIsDirty": "false",
    "IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan": "1",
    "IsControlPostBackctl01$SearchField": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage1": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage2": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage3": "1",
    "IsControlPostBackctl01$FooterCopyright$FooterCopyright": "1",
    "IsControlPostBackctl01$FooterCopyright$tosol": "1",
    "__ASYNCPOST": "true",
    "RadAJAXControlID": "ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_rapLoading"
}

# Headers sent with each paging POST (the GET requests in main() do not use them).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Referer": "https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001"
}


def _field_text(fields, index):
    """Return the <span> text of ``fields[index]``, or "N/A" when it is absent."""
    if index >= len(fields) or fields[index].span is None:
        return "N/A"
    return fields[index].span.text


def main(url):
    """Collect therapist profiles from the search page at *url* into data.csv.

    Steps:
      1. GET *url* and read the per-session form fields from it.
      2. POST the paging form three times, collecting every ``viewprofile``
         link from each response.
      3. GET each profile and record name, address, phone and e-mail.
      4. Print the resulting DataFrame and write it to ``data.csv``.

    The number of pages is fixed at three, and the paging target is rewritten
    FirstPage -> SecondPage -> Last between requests.

    Args:
        url: Search page URL, including its query string.

    Raises:
        AttributeError: if the search page lacks one of the expected hidden
            fields, the Telerik script tag, or the PageInstanceKey marker.
    """
    # Function-scope import: the file only imports `unquote` from urllib.parse.
    from urllib.parse import urljoin

    # Work on a copy so the module-level template stays pristine and a second
    # call starts again from the first page.
    form = dict(data)

    with requests.Session() as req:

        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')

        form['ctl01_ScriptManager1_TSM'] = unquote(soup.select_one(
            "script[src*=Telerik]").get("src")).split("=", 3)[-1]
        form['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        form['PageInstanceKey'] = re.search(
            r'PageInstanceKey=(.+?)"', r.text).group(1)
        form['__RequestVerificationToken'] = soup.find(
            "input", id="__RequestVerificationToken").get("value")

        urls = []
        for num in range(1, 4):
            print(f"Extracting Links From Page {num}")

            r = req.post(url, data=form, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')

            # urljoin resolves hrefs against the page URL, so this no longer
            # depends on the origin being exactly 32 characters long.
            urls.extend(urljoin(url, link.get("href"))
                        for link in soup.select("a[href*=viewprofile]"))

            # Point the paging fields at the next link: everything after the
            # first "k" (the end of "lnk") is replaced with the new suffix.
            suffix = "SecondPage" if num == 1 else "Last"
            for f in fish:
                form[f] = re.sub(r'(k)(.+)', r"\1" + suffix, form[f])

        print(f"Collected {len(urls)} Links")
        done = []
        for x in urls:
            r = req.get(x)
            soup = BeautifulSoup(r.content, 'html.parser')
            load = soup.select("div.PanelFieldValue")
            # A missing field becomes "N/A" (same convention as the e-mail)
            # instead of aborting the whole run.
            name = _field_text(load, 2)
            add = _field_text(load, 0)
            ph = _field_text(load, 1)
            email_link = soup.select_one("a.PanelField")
            em = email_link.text if email_link is not None else "N/A"
            done.append([name, add, ph, em])

        df = pd.DataFrame.from_records(
            done, columns=["Name", "Address", "Phone", "Email"])
        print(df)
        df.to_csv("data.csv", index=False)


main("https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001")

Вывод: просмотр в режиме онлайн

enter image description here

1 голос
/ 18 апреля 2020

Этот сайт использует JavaScript. Когда вы нажимаете на ссылку следующей страницы, запускается JavaScript-функция, которая подгружает результаты. Чтобы программно переходить на другие страницы, можно использовать Selenium — инструмент автоматизации браузера.

См. Эти:

  1. https://selenium-python.readthedocs.io/getting-started.html
  2. https://selenium-python.readthedocs.io/
  3. https://selenium-python.readthedocs.io/navigating.html#interacting-with-the-page

Пагинация с помощью Selenium

Основные шаги

Вам необходимо разбить проблему на следующие шаги:

  1. Используйте Selenium (с python) BrowserAutomation для доступа к вашей странице.
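  2. Получите общее количество страниц (оно указано в разделе нумерации страниц в самом конце исходного кода страницы). В качестве альтернативы его можно вычислить с округлением вверх: total_pages = math.ceil(total_results / max_results), где max_results = 25 по умолчанию. Формула total_results//max_results + 1 даёт лишнюю страницу, когда total_results делится на max_results без остатка.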

  3. Для каждой страницы :

    1. Используйте BeautifulSoup, чтобы извлечь данные из исходного кода страницы, полученного с помощью Selenium.
    2. Используйте Selenium, чтобы щёлкнуть по ссылке на следующую страницу.
    3. Добавить результаты в dict или list или pandas.DataFrame, если хотите.
...