Я вижу, у вас записано EC.presence_of_element_located((By.ID, {'class': 'result-content'})), а должно быть EC.presence_of_element_located((By.CLASS_NAME, 'result-content')).
Далее, у вас может возникнуть проблема (в зависимости от того, где открыт браузер) из-за необходимости пропустить / щёлкнуть javascript-уведомление, которое спрашивает, согласны ли вы принять куки.
Но весь этот код кажется огромной работой, учитывая, что данные хранятся в формате json в тегах script
из html. Почему бы просто не использовать requests
, вытащить json, преобразовать в dataframe, а затем записать в csv?
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
def run_process(page_number):
    """Scrape one results page of jaap.nl Amsterdam listings.

    Parameters
    ----------
    page_number : int
        1-based page index appended to the listing URL as ``p{page_number}``.

    Returns
    -------
    pandas.DataFrame
        One row per property, flattened from the JSON embedded in the
        page's ``<script id="page-data">`` tag.

    Raises
    ------
    requests.HTTPError
        If the page request fails.
    ValueError
        If the expected ``page-data`` script tag is missing.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    # Fail fast on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', {'id': 'page-data'})
    if script_tag is None:
        # Explicit error beats the opaque AttributeError from `.text` on None.
        raise ValueError(f'page-data script tag not found on page {page_number}')
    jsonData = json.loads(script_tag.text)
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
    # (removed in pandas 2.0).
    return pd.json_normalize(jsonData['properties'])
if __name__ == '__main__':
    # Entry point: scrape pages 1-3, concatenate, write a timestamped CSV.
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    # Collect per-page frames and concatenate once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending inside the loop is quadratic.
    page_frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        page_frames.append(run_process(current_page))

    final_df = pd.concat(page_frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)

    elapsed_time = time() - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
Вывод:
Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds
и CSV-файл, который выглядит следующим образом:
app area detailsUrl expired houseTypeValue id latLng latLng.latitude latLng.longitude location.city location.street location.zipcode lotSize market numberOfRooms openHouseDate openHouseTimes openhouse photo price priceToShow showoffColor showoffCustomText showoffPhotoText spotlight status veiling
0 False 165 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 6899666 NaN 52.368420 4.833631 AMSTERDAM Hof van Versailles 61 1064NX 216 sale 4 None None False 10014EAAF8B8883668593EFAC9E5FF1C 595000.0 595000.0 None None None False Sale False
1 True 211 /te-koop/noord+holland/groot-amsterdam/amsterd... False Appartement 10585731 NaN 52.327550 4.889076 AMSTERDAM Beysterveld 35 1083KA Onbekend sale 4 None None False E4F9E5BC7BC90B5B92C7BD8D48B7A677 925000.0 925000.0 None None None False Sale False
2 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Dubbele bovenwoning 11731386 NaN 52.341890 4.896053 AMSTERDAM Uiterwaardenstraat 320 2 1079DC Onbekend sale 5 None None False AB9F45B2CD4AD7879C5A80F18092F9D4 750000.0 750000.0 None None None False SoldConditionally False
3 False 269 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 11840681 NaN 52.358266 4.875508 AMSTERDAM Korte van Eeghenstraat 4 1071ER 107 sale 9 None None False A3DF2B1D426B5E4D501503C5D0E66966 3100000.0 3100000.0 None None None False Sale False
4 False 100 /te-koop/noord+holland/groot-amsterdam/amsterd... False Tussenwoning 12152943 NaN 52.421245 4.899478 AMSTERDAM Pieter A v Heijningestraat 9 1035SV 83 sale 5 None None False 55C6F589523FA553D67A709776DD70DD 399000.0 399000.0 None None None False Sale False
5 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Bovenwoning 15796874 NaN NaN NaN AMSTERDAM Eerste Amstelvlietpad 20 1096GB Onbekend sale 3 None None False AE822B627ED096310B9ECBE7756340C8 1200000.0 1200000.0 None None None False Sale False
6 True 76 /te-koop/noord+holland/groot-amsterdam/amsterd... False Benedenwoning 10580650 NaN 52.346010 4.888799 AMSTERDAM Grevelingenstraat 18 HS 1078KP Onbekend sale 2 None None False 6FD1011D917E776DCF4DA836B5FFEE3E 550000.0 550000.0 None None None False SoldConditionally False
7 False 298 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9623182 NaN 52.330610 4.862902 AMSTERDAM Cannenburg 51 1081GW 651 sale 7 None None False 15FA170B99D4E2DEA03B6FC27E3B5B74 2495000.0 2495000.0 None None None False Sale False
8 False 270 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 15791215 NaN 52.347780 5.004530 AMSTERDAM Nico Jessekade 189 1087MR 200 sale 9 None None False 6EA5C0CDA0475DFC88A3A918A6B2909A 1549000.0 1549000.0 None None None False SoldConditionally False
9 False 201 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9617942 NaN 52.377391 4.764554 AMSTERDAM Osdorperweg 803 1067SW 1348 sale 6 None None False 4680429D99EC5AC47C950D57A77DF1EB 950000.0 950000.0 None None None False Sale False
ОБНОВЛЕНИЕ:
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
def run_process(page_number):
    """Scrape one listing page plus the detail page of every property on it.

    Parameters
    ----------
    page_number : int
        1-based page index appended to the listing URL as ``p{page_number}``.
        BUG FIX: the original body immediately overwrote this argument with
        ``page_number = 1``, so every call scraped page 1; that line is removed.

    Returns
    -------
    pandas.DataFrame
        The page's property frame left-joined (on row index) with one row
        of detail attributes scraped from each property's detail page.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
    # (removed in pandas 2.0).
    df = pd.json_normalize(jsonData['properties'])

    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']

    detail_rows = []  # one single-row DataFrame per property, indexed by idx
    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)
        for each in dfs:
            if each.isnull().all().all():
                continue  # skip tables that contain no data at all
            each = each.dropna(axis=0, how='all')

            # Tables containing 'Voorziening' carry a value + distance pair
            # per attribute and need special reshaping; detect them first.
            specialCase = False
            for col in list(each.columns):
                if each[col].dtypes == 'object':
                    if each[col].str.contains('Voorziening').any():
                        specialCase = True
                        break

            if specialCase:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                cols1 = list(each.iloc[2:, 0])
                each = each.iloc[2:, :]
                # Fuse value and distance with a sentinel, split back below
                # into `<col>` and `<col>.distance` columns.
                each[1] = each[1] + '---' + each[2]
                each = each.iloc[:, -2]
                each.index = cols1
                each = each.to_frame().T
                propRow = each
                propRow.index = [0]
                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(
                        propRow[col].str.split('---', expand=True).rename(
                            columns={0: col, 1: col + '.distance'}),
                        left_index=True, right_index=True)
                propRow = temp_df
            else:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                # Transpose so the first row becomes column headers prefixed
                # with the table's own name (temp_df.index[0]).
                temp_df = each.T
                cols = [temp_df.index[0] + '_' + colName for colName in list(temp_df.iloc[0, :])]
                propRow = temp_df.iloc[-1, :]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]
            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)
        propDetails.index = [idx]
        detail_rows.append(propDetails)

    # pd.concat replaces DataFrame.append (deprecated 1.4, removed 2.0) and
    # avoids quadratic growth inside the loop.
    allPropDetails = pd.concat(detail_rows, sort=True) if detail_rows else pd.DataFrame()
    df = df.merge(allPropDetails, how='left', left_index=True, right_index=True)
    return df
if __name__ == '__main__':
    # Entry point: scrape pages 1-3 (listing + detail data), write one CSV.
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    # Collect per-page frames and concatenate once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending inside the loop is quadratic.
    page_frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        page_frames.append(run_process(current_page))

    final_df = pd.concat(page_frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)

    elapsed_time = time() - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')