Я создал скрипт для сбора продуктов и информации о них со многих веб-сайтов (~120). Он делает то, чего я хочу достичь, но через некоторое время (в основном около 70 страниц) выдаёт «MemoryError» и «RuntimeError: can't start new thread». Я пытался искать решения: например, вызывать .clear() для моих списков или использовать sys.getsizeof() для обнаружения утечек памяти, но пока безуспешно. Есть ли у вас идеи, в чём может быть проблема?
Подробное сообщение об ошибке:
Traceback (most recent call last):
File "C:\EGYÉB\PYTHON\PyCharm\helpers\pydev\pydevd.py", line 1741, in <module>
main()
File "C:\EGYÉB\PYTHON\PyCharm\helpers\pydev\pydevd.py", line 1735, in main
globals = debugger.run(setup['file'], None, None, is_module)
File "C:\EGYÉB\PYTHON\PyCharm\helpers\pydev\pydevd.py", line 1135, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\EGYÉB\PYTHON\PyCharm\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/EGYÉB/PYTHON/Projects/WebScraping/Selenium_scraping.py", line 63, in <module>
soup1 = BeautifulSoup(driver.page_source, 'html.parser')
File "C:\EGYÉB\PYTHON\Projects\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "C:\EGYÉB\PYTHON\Projects\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\EGYÉB\PYTHON\Projects\venv\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "C:\EGYÉB\PYTHON\Projects\venv\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 423, in _request
data = utils.load_json(data.strip())
File "C:\EGYÉB\PYTHON\Projects\venv\lib\site-packages\selenium\webdriver\remote\utils.py", line 37, in load_json
return json.loads(s)
File "C:\EGYÉB\PYTHON\Python Core\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\EGYÉB\PYTHON\Python Core\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\EGYÉB\PYTHON\Python Core\lib\json\decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
MemoryError
Traceback (most recent call last):
File "C:\EGYÉB\PYTHON\PyCharm\helpers\pydev\_pydevd_bundle\pydevd_comm.py", line 1505, in do_it
t.start()
File "C:\EGYÉB\PYTHON\Python Core\lib\threading.py", line 847, in start
_start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
Код:
from selenium import webdriver
from bs4 import BeautifulSoup
from itertools import count
import pandas as pd
import os
import csv
import time
import re
# Scrape product name / price / link / specs from AliExpress store pages with
# Selenium + BeautifulSoup, flushing results to CSV in batches so the
# accumulator lists never grow without bound (the reported MemoryError came
# from unbounded growth of parse trees and result lists).
os.chdir('C:\...')

# Batch accumulators; flushed to CSV every ~500 products by _flush_to_csv().
price = []
prod_name = []
href_link = []
specs = []
item_specs1 = []
item_specs2 = []

# Compile the cleanup pattern once instead of on every product page.
_CLEAN_RE = re.compile('[^a-zA-Z0-9 \n.:Ł]')


def _flush_to_csv(path):
    """Append the accumulated (name, price, link, specs) rows to *path*,
    clear the accumulators so memory stays bounded, and return the number
    of rows written."""
    rows = list(zip(prod_name, price, href_link, specs))
    # NOTE: original code had `open('........csv'), 'a', newline='')` — a
    # misplaced parenthesis that was a SyntaxError; fixed here.  The `with`
    # statement closes the file, so no explicit f.close() is needed.
    with open(path, 'a', newline='') as f:
        csv.writer(f).writerows(rows)
    price.clear()
    prod_name.clear()
    href_link.clear()
    specs.clear()
    return len(rows)


url1 = 'https://login.aliexpress.com/'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url1)
time.sleep(3)

# Log in inside the embedded login iframe, then return to the main document.
driver.switch_to.frame('alibaba-login-box')
driver.find_element_by_id('fm-login-id').send_keys('..........')
driver.find_element_by_id('fm-login-password').send_keys('.........')
driver.find_element_by_id('fm-login-submit').click()
time.sleep(3)
driver.switch_to.default_content()

# Each row of the input CSV names one store to crawl (column 'Link').
df = pd.read_csv('........csv', header=0)
for index, row in df.iterrows():
    page_nr = 1
    url = 'https://www.aliexpress.com/store/{}'.format(row['Link']) + '/search/{}'.format(page_nr) + '.html'
    driver.get(url)
    time.sleep(2)

    # Walk the store's result pages until the "next" button is disabled.
    for page_number in count(start=1):
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for div_b in soup.find_all('div', {'class': 'cost'}):
            price.append(div_b.text + 'Ł')
        for pr_name in soup.find_all('div', {'class': 'detail'}):
            for pr_h in pr_name.find_all('h3'):
                for pr_title in pr_h.find_all('a'):
                    prod_name_t = pr_title.get('title').strip()
                    prod_name_l = pr_title.get('href').strip()
                    href_link.append(prod_name_l + 'Ł')
                    prod_name.append(prod_name_t + 'Ł')

        # Visit every product page of the gallery to collect its spec table.
        links = [link.get_attribute('href') for link in driver.find_elements_by_xpath(
            "//div[@id='node-gallery']/div[5]/div/div/ul/li/div[2]/h3/a")]
        for link in links:
            driver.get(link)
            time.sleep(2)
            soup1 = BeautifulSoup(driver.page_source, 'html.parser')
            for item1 in soup1.find_all('span', {'class': 'propery-title'}):
                item_specs1.append(item1.text)
            for item2 in soup1.find_all('span', {'class': 'propery-des'}):
                item_specs2.append(item2.text + 'Ł')
            # FIX: the original line had an unbalanced ')' here (SyntaxError).
            item_specs = list(zip(item_specs1, item_specs2))
            item_specs_join = ''.join(str(item_specs))
            specs.append(_CLEAN_RE.sub('', item_specs_join))
            item_specs1.clear()
            item_specs2.clear()
            # Destroy the parse tree and drop the reference: .clear() only
            # empties the tag's children; .decompose() breaks the internal
            # cross-references so the whole tree can actually be collected.
            soup1.decompose()
            del soup1
            driver.back()
        links.clear()

        # Flush in batches so the accumulator lists never grow unbounded.
        if len(prod_name) > 500:
            _flush_to_csv('........csv')

        # Decide whether a next page exists BEFORE releasing the tree.
        last_page = bool(soup.find_all(
            'span', {'class': 'ui-pagination-next ui-pagination-disabled'}))
        soup.decompose()
        del soup
        if last_page:
            print("Last page reached!")
            break
        try:
            driver.find_element_by_class_name('ui-pagination-next').click()
            time.sleep(1)
        except Exception:
            # Best-effort: any failure to paginate ends this store's crawl.
            break

# NOTE(review): the original appeared to call driver.quit() inside the
# per-store loop, which would kill the browser session after the first store;
# quitting once, after all stores are processed, is presumably the intent.
driver.quit()

# Write whatever remains in the final partial batch (original printed the
# row count before writing; same value is reported here).
print(_flush_to_csv('.......csv'))