Страница загружает контент через XHR-запрос на другой URL, поэтому вы его не заметили. Вы можете имитировать POST-запрос XHR, который выполняет страница, и изменить отправляемое в нём JSON-тело. Если вы измените значение `size`, вы получите больше результатов.
import requests
# Elasticsearch query body that the BFI Player page posts via XHR.
# "size" controls how many hits come back in one response; the filter
# restricts the search to documents whose "pillar.raw" is "rentals".
data = {
    "size": 1480,
    "from": 0,
    "sort": "sort_title",
    "aggregations": {
        "genre": {"terms": {"field": "genre.raw", "size": 10}},
        "captions": {"terms": {"field": "captions"}},
        "decade": {
            "terms": {"field": "decade.raw", "order": {"_term": "asc"}, "size": 20}
        },
        "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
        "english": {"terms": {"field": "english"}},
        "audio_desc": {"terms": {"field": "audio_desc"}},
        "colour": {"terms": {"field": "colour"}},
        "mono": {"terms": {"field": "mono"}},
        "fiction": {"terms": {"field": "fiction"}},
    },
    "min_score": 0.5,
    "query": {
        "bool": {
            "must": {"match_all": {}},
            "must_not": [],
            "should": [],
            "filter": {"term": {"pillar.raw": "rentals"}},
        }
    },
}

response = requests.post(
    'https://search-es.player.bfi.org.uk/prod-films/_search',
    json=data,
    timeout=30,
)
# Fail with a clear HTTP error instead of a KeyError on an error payload.
response.raise_for_status()
r = response.json()

# Print each film's title followed by its absolute player URL.
for film in r['hits']['hits']:
    print(film['_source']['title'], 'https://player.bfi.org.uk' + film['_source']['url'])
Фактическое количество результатов для rentals указано в JSON-ответе, в `r['hits']['total']`, поэтому вы можете выполнить первоначальный запрос со значением `size`, заведомо большим, чем ожидаемое количество, проверить, нужен ли ещё один запрос, а затем собрать все оставшиеся результаты, изменив значения `from` и `size`.
import requests
import pandas as pd
# Page size requested by the first search call; also the threshold compared
# against the reported total to decide whether a second call is needed.
initial_count = 10000
# Accumulates one [title, link] row per film; filled by add_results().
results = []
def add_results(r):
    """Append one ``[title, link]`` row per hit in *r* to the global ``results``.

    Args:
        r: Decoded JSON search response; must contain ``r['hits']['hits']``,
            a list of hits whose ``'_source'`` has ``'title'`` and ``'url'``.

    Returns:
        None. The module-level ``results`` list is extended in place.

    Raises:
        KeyError: If the response or a hit lacks one of the expected keys.
    """
    for film in r['hits']['hits']:
        source = film['_source']
        # 'url' is site-relative, so prefix the player host to make it absolute.
        results.append([source['title'], 'https://player.bfi.org.uk' + source['url']])
# Search endpoint the page's XHR posts to; used by both requests below.
search_url = 'https://search-es.player.bfi.org.uk/prod-films/_search'

# One session for both requests so the underlying connection can be reused.
with requests.Session() as s:
    # Same query body the page sends, with "size" set to the initial page
    # size and filtered to documents whose "pillar.raw" is "rentals".
    data = {
        "size": initial_count,
        "from": 0,
        "sort": "sort_title",
        "aggregations": {
            "genre": {"terms": {"field": "genre.raw", "size": 10}},
            "captions": {"terms": {"field": "captions"}},
            "decade": {
                "terms": {"field": "decade.raw", "order": {"_term": "asc"}, "size": 20}
            },
            "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
            "english": {"terms": {"field": "english"}},
            "audio_desc": {"terms": {"field": "audio_desc"}},
            "colour": {"terms": {"field": "colour"}},
            "mono": {"terms": {"field": "mono"}},
            "fiction": {"terms": {"field": "fiction"}},
        },
        "min_score": 0.5,
        "query": {
            "bool": {
                "must": {"match_all": {}},
                "must_not": [],
                "should": [],
                "filter": {"term": {"pillar.raw": "rentals"}},
            }
        },
    }

    response = s.post(search_url, json=data, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    r = response.json()

    # Older Elasticsearch versions report hits.total as a bare number; 7.x+
    # report an object such as {"value": N, "relation": "eq"}. Accept both.
    total = r['hits']['total']
    if isinstance(total, dict):
        total = total['value']
    total_results = int(total)
    add_results(r)

    # If the first page did not cover everything, fetch the remainder in one
    # follow-up request that starts where the first page ended.
    # NOTE(review): Elasticsearch rejects from + size beyond
    # index.max_result_window (10000 by default), so this follow-up may fail
    # unless the index raises that limit - confirm, or switch to
    # search_after / scroll pagination.
    if total_results > initial_count:
        data['size'] = total_results - initial_count
        data['from'] = initial_count
        response = s.post(search_url, json=data, timeout=30)
        response.raise_for_status()
        r = response.json()
        add_results(r)
# Tabulate the collected [title, link] rows and preview the first five.
df = pd.DataFrame(data=results, columns=["Title", "Link"])
print(df.head(n=5))