When I run this scraper, it dies at random with a memory error (see the error message below). Tracking memory usage with objgraph gives no clues, since memory stays constant. My gut feeling is that the problem comes from the repeated pandas.to_sql() calls, given that the last print I can get out is right before the to_sql call (see this GitHub issue).
Error message:

    Error in `/home/user/Projects/scrape_env/bin/python': free(): invalid next size (fast): 0x0000000002ae7340
    Aborted
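To at least see which Python line is active when the process dies, I could enable faulthandler at the top of the script (a sketch using only the standard library; it dumps the Python-level traceback on fatal signals such as the SIGABRT raised by glibc's abort()):

    import faulthandler

    # Dump the Python traceback to stderr on SIGSEGV, SIGABRT, SIGBUS, etc.,
    # so the native crash can be tied back to a Python-level call site.
    faulthandler.enable()

The full script: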
# coding: utf8
import pandas as pd
import time
import datetime
import traceback
from bs4 import BeautifulSoup
import requests
import random

from utils import get_engine


def requesturl(url):
    """Fetch a URL with a small random delay and return the parsed HTML."""
    try:
        time.sleep(random.randint(1, 5))
        resp = requests.get(url)
        if 'please refer to the MWS Subscription API' in resp.text:
            print(resp.text)
            print('Banned!! Sleeping... 30s', url)
            time.sleep(random.randint(10, 30))
            return requesturl(url)  # retry once the cooldown has passed
    except Exception:
        traceback.print_exc()
        return BeautifulSoup('', "lxml")  # empty soup on a failed request
    return BeautifulSoup(resp.text, "lxml")


def filter_already_scraped(url_list):
    """Drop URLs already in the temp table and resume the position counter."""
    try:
        already_scraped = pd.read_sql(
            'select PageURL from AmazonKindleUnlimitedScrape_temp', get_engine())
        position = pd.read_sql(
            "select max(Position) from AmazonKindleUnlimitedScrape_temp",
            get_engine()).values[0][0]
    except Exception:
        # Table does not exist yet: nothing scraped, start from scratch
        already_scraped = pd.DataFrame(columns=['PageURL'])
        position = None
    if not position:
        position = 0
    url_list = [url for url in url_list
                if url not in already_scraped['PageURL'].tolist()]
    return url_list, position


def get_max_pages(url):
    resp = requesturl(url)
    return int(resp.find('span', {'class': 'pagnDisabled'}).text.strip())


def create_urls(start_url):
    max_pages = get_max_pages(start_url)
    url_list = []
    for page_num in range(1, max_pages + 1):
        url_list.append(start_url + "&page=" + str(page_num))
    return url_list


def scrape_product_info(serp_element, position, url):
    product_info = dict.fromkeys(['ASIN', 'Title', 'Author'])
    product_info['ProductURL'] = \
        serp_element.find('a', {'class': 'a-link-normal a-text-normal'})['href'].split('/ref=')[0]
    product_info['ASIN'] = product_info['ProductURL'].split('/dp/')[1].split('/')[0]
    product_info['ProductURL'] = 'https://www.amazon.co.uk/dp/' + product_info['ASIN']
    product_info['PageURL'] = url
    product_info['Title'] = serp_element.find(
        'h2', {'class': 'a-size-medium s-inline s-access-title a-text-normal'}).text.strip()
    product_info['Author'] = serp_element.find_all(
        'a', {'class': 'a-link-normal a-text-normal'})[1].text.split('|')[0]
    if '£' in product_info['Author']:
        # The second link was a price, so the author sits in a secondary span
        product_info['Author'] = serp_element.find_all(
            'span', {'class': 'a-size-small a-color-secondary'})[3].text.split('|')[0]
    product_info['Price'] = serp_element.find_all(
        'span', {'class': 'a-size-base a-color-price s-price a-text-bold'})[1].text.strip()
    product_info['Position'] = position
    if serp_element.find('span', {'class': 's-icon s-icon-kindle-unlimited'}):
        product_info['isKindleUnlimited'] = True
    else:
        product_info['isKindleUnlimited'] = False
    product_info['isPrime'] = True
    product_info['timestamp'] = datetime.datetime.now()
    product_info['Date'] = product_info['timestamp'].date()
    return product_info


def scrape_serp(urls, position):
    for url in urls:
        resp = requesturl(url)
        main_results_group = resp.find('div', {'id': 'mainResults'})
        serp_group = main_results_group.find_all('li', {'class': 's-result-item'})
        serp_data = []
        for serp_element in serp_group:
            position += 1  # advance here so the counter persists across elements
            product_info = scrape_product_info(serp_element, position, url)
            serp_data.append(product_info)
        pd.DataFrame(serp_data).to_sql('AmazonKindleUnlimitedScrape_temp',
                                       get_engine(), if_exists='append', index=False)


def prime_titles_scrape():
    start_url = 'https://www.amazon.co.uk/b/ref=pr_br_rw_lp_mn?node=12730785031&storeType=ebooks&pageType=kindle'
    url_list = create_urls(start_url)
    url_list, position = filter_already_scraped(url_list)
    scrape_serp(url_list, position)


prime_titles_scrape()
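To isolate whether the repeated to_sql() calls alone can corrupt the heap, the pattern could be stress-tested on its own, away from requests and BeautifulSoup (a sketch; the table name and row shape here are made up, only the get_engine helper is the same one used above):

    import pandas as pd
    from utils import get_engine

    def stress_to_sql(iterations=1000):
        # Append a small dummy frame over and over, mirroring the scraper's
        # one-engine-per-call pattern; if this alone aborts with the same
        # free() error, the problem is in the to_sql/driver path.
        for i in range(iterations):
            df = pd.DataFrame({'Position': list(range(50)),
                               'PageURL': ['https://example.com'] * 50})
            df.to_sql('to_sql_stress_test', get_engine(),
                      if_exists='append', index=False)

    stress_to_sql()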
objgraph printout:
Memory usage: 120680 (kb)
dict 29589
function 29464
list 17002
tuple 12175
Tag 7609
weakref 6167
NavigableString 5138
cell 3961
getset_descriptor 3611
builtin_function_or_method 3346
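For reference, the printout above comes from a helper along these lines (a sketch assuming objgraph plus the standard resource module; on Linux, ru_maxrss is reported in kilobytes):

    import resource
    import objgraph

    def print_memory():
        # Peak resident set size of this process, in kb on Linux
        usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print('Memory usage: %s (kb)' % usage)
        # Counts of the most common live object types on the heap
        objgraph.show_most_common_types(limit=10)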