Я собираю имена, адреса электронной почты, телефоны и адреса терапевтов с веб-сайта. Данные с первой страницы я получил, но не могу перейти к остальным страницам результатов (пагинация). Я использую библиотеки requests и BeautifulSoup.
Веб-сайт: https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=
Код для первой страницы:
import requests
from bs4 import BeautifulSoup as bs
# Running number of result rows seen so far; printed in each record banner.
count = 0

# Cookies sent with the search POST.
# NOTE(review): these look like values copied from a single browser session;
# presumably the session id and verification token expire -- confirm.
cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}
# HTTP headers sent with the search POST. 'X-MicrosoftAjax' and
# 'X-Requested-With' mark the request as an AJAX call rather than a
# full-page form submission.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Query-string parameters appended to the search URL (same zip/name pair
# that appears in the Referer header above).
params = (('zip', '10001'), ('name', ''))
# Prefix that repeats in several of the form field names and values below;
# spelled out once so the individual entries stay readable.
_QUERY_CTRL = (
    'ctl01$TemplateBody$WebPartManager1'
    '$gwpciNewATSCustomQueryDisplayCommon'
    '$ciNewATSCustomQueryDisplayCommon'
)

# Form body of the search POST. Entry order is kept exactly as captured.
# NOTE(review): PageInstanceKey, __RequestVerificationToken and __VIEWSTATE
# look like values copied from one browser session; presumably they expire
# and must be re-read from a fresh page load -- confirm.
data = {
    'ctl01$ScriptManager1': _QUERY_CTRL + '$' + _QUERY_CTRL + '$rapLoadingPanel|' + _QUERY_CTRL + '$btnFilter',
    '__WPPS': 's',
    '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'ctl01_ScriptManager1_TSM': '',
    'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
    '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
    'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
    'PageIsDirty': 'false',
    'IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan': '1',
    'IsControlPostBackctl01$SearchField': '1',
    # The "Filter" button is named as the control that triggered the postback.
    '__EVENTTARGET': _QUERY_CTRL + '$btnFilter',
    '__EVENTARGUMENT': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2': '1',
    'IsControlPostBack' + _QUERY_CTRL: '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85': '1',
    'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage1': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage2': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPage3': '1',
    'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
    'IsControlPostBackctl01$FooterCopyright$FooterCopyright': '1',
    'IsControlPostBackctl01$FooterCopyright$tosol': '1',
    '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
    '__VIEWSTATEGENERATOR': '37E773F2',
    'ctl01$lastClickedElementId': '',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    'ctl01_Primary_NavMenu_ClientState': '',
    # The search criteria themselves: postal code, distance, name filter.
    _QUERY_CTRL + '$txtPOSTALCODE0': '10001',
    _QUERY_CTRL + '$ddlDISTANCE0': '5',
    _QUERY_CTRL + '$txtName_TL0': '',
    '__ASYNCPOST': 'true',
    '': '',
}
# Submit the search form and print one record per therapist found on the
# returned (first) page of results.
#
# Reads the module-level ``headers``, ``params``, ``cookies`` and ``data``
# and increments the module-level ``count`` once per result row.
response = requests.post(
    'https://www.therapistlocator.net/tl/therapist-finder.aspx',
    headers=headers,
    params=params,
    cookies=cookies,
    data=data,
    timeout=30,  # never hang forever on an unresponsive server
)
# Fail with a clear HTTP error instead of an AttributeError further down.
response.raise_for_status()

# Use the decoded text. str(response.content) would hand the parser the
# repr of the bytes object ("b'...'" with backslash escapes), which mangles
# non-ASCII characters and quoted attribute values.
html = response.text
con = bs(html, 'lxml')

# An unexpected response may not contain the results wrapper at all.
wrapper = con.find('div', class_='QueryDisplayWrapper')
therapists = wrapper.find_all('div', class_='row') if wrapper is not None else []

for therapist in therapists:
    count += 1

    # Skip rows that carry no profile link instead of crashing on None.
    name_cell = therapist.find('div', class_='item name')
    profile_anchor = name_cell.find('a') if name_cell is not None else None
    if profile_anchor is None or not profile_anchor.get('href'):
        continue

    name = profile_anchor.text.strip()
    # Stray backslashes/apostrophes are stripped as before; with properly
    # decoded HTML this is normally a no-op.
    therapist_href = profile_anchor.get('href').replace('\\', '').replace("'", '')
    therapist_link = 'https://www.therapistlocator.net{}'.format(therapist_href)

    therapist_info = requests.get(therapist_link, timeout=30)
    if not therapist_info.ok:
        continue

    dataa = bs(therapist_info.text, 'lxml')
    panel_values = dataa.find_all('div', class_='PanelFieldValue')
    try:
        email = dataa.find('a', class_='PanelField').text.strip()
        address_cell = panel_values[0]
        location = address_cell.find('span').text.strip()
        # Text that follows the <br> inside the address block.
        loc1 = address_cell.find('br').next_sibling.strip()
        phone = panel_values[1].find('span').text.strip()
    except (AttributeError, IndexError, TypeError):
        # Best effort: a profile missing any of the expected fields is
        # skipped, but unrelated errors (e.g. Ctrl-C) are no longer hidden.
        continue

    # Put a space between the two address lines. Guard against an empty
    # second line: str.replace('', ' ') would insert a space between every
    # character of the address.
    if loc1:
        location = location.replace(loc1, ' {}'.format(loc1))

    print('\n*********** '+str(count)+' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))
Остальные страницы, судя по всему, имеют тот же URL, поэтому я не смог перебрать их в цикле.
На каждой странице 25 записей. Мне нужно получить их все.
Пример вывода для каждой записи:
Name: Marya B . Slater
Email: nycitytherapist@gmail.com
Phone: (646) 265-1555
Location: 360 W 34th St Apt 5P New York, NY 10001-2407