Машина: Raspberry Pi 3B+
Система: Raspbian GNU/Linux 10 (Buster)
Версия Python: Python 3.7.3
Я написал код, который загружает главы романа с интернет-сайта, защищённого Cloudflare. Поскольку загрузка 400 страниц с задержкой в 5 секунд на каждую занимала более 1,5 часа, я переписал код с использованием модуля threading. Это сработало: с четырьмя потоками мне удалось сократить время до 16 минут.
Но время от времени возникала ошибка сегментации. Кроме того, когда я увеличивал количество потоков, она происходила чаще и раньше в процессе загрузки, а с шестью потоками возникала почти наверняка.
Первое, что я попробовал (нашёл это где-то на форумах Raspberry Pi), — увеличить размер стека с помощью ulimit -s (stack size). Это не помогло. У меня даже сложилось ощущение, что после увеличения ошибка стала возникать чаще.
После этого я нашёл здесь модуль faulthandler. Я включил его с помощью
import faulthandler
faulthandler.enable()
PYTHONFAULTHANDLER = 1
, и он выдал следующее, как только произошла ошибка (core — это функция, определённая в скрипте):
Fatal Python error: Segmentation fault
Thread 0x6f6ff460 (most recent call first):
File "new Tensura download.py", line 89 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Current thread 0x700ff460 (most recent call first):
File "/usr/lib/python3/dist-packages/OpenSSL/SSL.py", line 1914 in do_handshake
File "/usr/lib/python3/dist-packages/urllib3/contrib/pyopenssl.py", line 453 in wrap_socket
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 344 in ssl_wrap_socket
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 344 in connect
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 841 in _validate_conn
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 343 in _make_request
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 600 in urlopen
File "/home/pi/.local/lib/python3.7/site-packages/requests/adapters.py", line 449 in send
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 646 in send
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 533 in request
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 121 in request
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 546 in get
File "new Tensura download.py", line 49 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x70aff460 (most recent call first):
File "/usr/lib/python3/dist-packages/bs4/builder/_lxml.py", line 287 in feed
File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 342 in _feed
File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 281 in __init__
File "new Tensura download.py", line 50 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x714ff460 (most recent call first):
File "/usr/lib/python3/dist-packages/OpenSSL/SSL.py", line 773 in load_verify_locations
File "/usr/lib/python3/dist-packages/urllib3/contrib/pyopenssl.py", line 428 in load_verify_locations
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 321 in ssl_wrap_socket
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 344 in connect
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 841 in _validate_conn
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 343 in _make_request
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 600 in urlopen
File "/home/pi/.local/lib/python3.7/site-packages/requests/adapters.py", line 449 in send
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 646 in send
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 533 in request
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 121 in request
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 219 in solve_cf_challenge
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 129 in request
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 546 in get
File "new Tensura download.py", line 49 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x71eff460 (most recent call first):
File "new Tensura download.py", line 253 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x728ff460 (most recent call first):
File "/usr/lib/python3.7/selectors.py", line 415 in select
File "/usr/lib/python3.7/subprocess.py", line 1681 in _communicate
File "/usr/lib/python3.7/subprocess.py", line 939 in communicate
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 322 in solve_challenge
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 204 in solve_cf_challenge
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 129 in request
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 237 in solve_cf_challenge
File "/home/pi/.local/lib/python3.7/site-packages/cfscrape/__init__.py", line 129 in request
File "/home/pi/.local/lib/python3.7/site-packages/requests/sessions.py", line 546 in get
File "new Tensura download.py", line 49 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x7329c460 (most recent call first):
File "/usr/lib/python3/dist-packages/bs4/builder/_lxml.py", line 287 in feed
File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 342 in _feed
File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 281 in __init__
File "new Tensura download.py", line 50 in core
File "/usr/lib/python3.7/threading.py", line 865 in run
File "/usr/lib/python3.7/threading.py", line 917 in _bootstrap_inner
File "/usr/lib/python3.7/threading.py", line 885 in _bootstrap
Thread 0x76fe5ad0 (most recent call first):
File "/usr/lib/python3.7/threading.py", line 1048 in _wait_for_tstate_lock
File "/usr/lib/python3.7/threading.py", line 1032 in join
File "/usr/lib/python3.7/threading.py", line 1281 in _shutdown
bash: linia 1: 18565 Naruszenie ochrony pamięci /usr/bin/python3 "new Tensura download.py"
Я вижу, что ошибка вызвана модулем threading, но понятия не имею, как это исправить.
Если у вас есть какие-либо подсказки о том, как это устранить, я буду благодарен.
Вот весь код, если кто-то захочет его посмотреть; запуск потоков находится в самом конце (обратите внимание, что для работы функции core необходимы консольные программы enscript и ps2pdf, а модуль cfscrape — это обёртка над requests для сайтов, защищённых Cloudflare):
#improvment from 1h 40min to 1h 10 min with new version
#with 2 threads this goes down to 36 min
#with 4 down to 16 min (if i remember correctly)
#with 5 down to 13 min 40 s
#with 6 down to 13 min (segmentation faults occur)
#handling segmentation fault
import faulthandler
faulthandler.enable()
PYTHONFAULTHANDLER = 1
import cfscrape
from bs4 import BeautifulSoup
import threading
import subprocess
from time import time
#------time tracking class------
class time_check():
    """Minimal stopwatch: remembers when it was created and reports elapsed time."""

    def __init__(self):
        """Start time tracking at the moment of construction."""
        self.start_t = time()

    def check(self):
        """Print the time elapsed since construction as hours, minutes and seconds.

        The hour field wraps at 24, exactly as in the original arithmetic.
        """
        elapsed = int(time() - self.start_t)
        # Whole-second integer arithmetic avoids the float rounding of the
        # original nested divisions while giving the same fields.
        total_min, t_sec = divmod(elapsed, 60)
        total_hr, t_min = divmod(total_min, 60)
        t_hr = total_hr % 24
        print(f'Operation took {t_hr} hours {t_min} minutes {t_sec} seconds.')
#----------------------------creating scraper------------------------
# Progress markers ('scrap s' / 'scrap e') bracket the download of the index page.
print('scrap s')
# cfscrape session (requests-style API) used to talk to the Cloudflare-protected site.
scrap = cfscrape.create_scraper()
index_response = scrap.get(
    'https://www.readlightnovel.org/tensei-shitara-slime-datta-ken-wn',
    headers={"User-Agent": "XZ"},
)
source = index_response.content
print('scrap e')
# Parsed index page; the chapter links are extracted from it further down.
soup = BeautifulSoup(source, 'lxml')
#---------------------char table-----------------------------
# Whitelist of characters that core() lets through; anything else is replaced there.
# There is more than one kind of space: code point 160 (no-break space), 32 (regular
# space) and 12288 (ideographic space). They are written as escapes so that they stay
# distinguishable.
# NOTE(review): in the pasted source the three space literals looked identical and one
# entry was an empty string (possibly an invisible character lost in transit) -- confirm
# both against the original file.
# The curly/corner quote entries are different code points, not duplicates.
chars = ['\t', '\n', '\r', '\xa0', ' ', '\u3000', '', '「', '“', '」', '”', '『', '』']
# The backslash is doubled: a bare "\|" is an invalid escape sequence that only works by
# accident (and warns on newer Pythons); the resulting characters are unchanged.
chars.extend('''QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm-_=+,<<《〈≪.≫〉》>>[{]};:'"/?`‘’~1234567890!!@#$%^&**※()\\|…–—−ー・→°✦✧''')
#-----------------------getting more links from main site------------------------------------
# Progress markers ('link s' / 'link e') bracket the link extraction.
print('link s')
# Only the first four 'chapter-chs' lists of the index page are used.
table = soup.find_all('ul', class_='chapter-chs')[:4]
# links: the <a> elements themselves; link: their href targets, in page order.
links = [anchor for chapter_list in table for anchor in chapter_list.find_all('a')]
link = [anchor['href'] for anchor in links]
print('link e')
#--------------------core------------------------------
# Downloads chapters, re-flows their text (line breaks around sentences and dialogue,
# character substitution, wrapping) and saves each one as .txt, .ps and .pdf.

# The three kinds of space handled here: regular (32), no-break (160), ideographic (12288).
# NOTE(review): the pasted source showed these as visually identical literals; they are
# spelled with escapes so the distinction cannot get lost -- confirm against the site.
_SPACES = (' ', '\xa0', '\u3000')
# "space dot space" and "dot space" patterns that end a sentence outside dialogue.
_SPACED_DOT = frozenset(s + '.' + s for s in _SPACES)
_DOT_SPACE = frozenset('.' + s for s in _SPACES)
# Characters that open / close dialogue, thoughts and similar quoted passages.
# NOTE(review): '<' was listed twice in the pasted source, which suggests one entry was the
# full-width form; the full-width angle brackets are included on that assumption.
_DIALOG_OPEN = frozenset(['「', '(', '<', '\uff1c', '“', '《', '≪'])
_DIALOG_CLOSE = frozenset(['」', ')', '>', '\uff1e', '”', '》', '≫'])
# NOTE(review): '!' was listed twice as well; the second is assumed to be full-width '!'.
_EXCLAMATIONS = frozenset(['!', '\uff01', '?'])
# Maximum index considered when looking for a place to wrap a line.
_LINE_WIDTH = 91
# Substitutions applied by the character changer. Where the original if/elif chain listed
# a character twice, the first (winning) branch is the one kept.
# NOTE(review): the '!' -> '!' and '*' -> '*' branches of the pasted source were no-ops,
# so their keys are assumed to have been the full-width forms.
_REPLACEMENTS = {
    '△': '$',
    '’': "'", '‘': "'",
    '「': '[',
    '」': ']',
    '〉': '>', '\uff1e': '>',
    '〈': '<', '\uff1c': '<',
    '“': '"', '”': '"',
    '\uff0a': '*',
    '※': '--',
    '…': '...',
    '『': '<<', '《': '<<', '≪': '<<',
    '』': '>>', '》': '>>', '≫': '>>',
    '–': '-', '—': '-', 'ー': '-', '・': '-', '−': '-',
    '→': '->',
    '\xa0': ' ', '\u3000': ' ',
    '\t': ' ',
    '\uff01': '!',
    'é': 'e',
    'Ō': 'Oo',
    '✦': '*', '✧': '*',
    '°': ' degrees ',
}


def _fetch_chapter(scraper, url, num):
    """Download one chapter; return (title, text), or None if the page has no 'desc' div."""
    source = scraper.get(url, headers={"User-Agent": "XZ"}).content
    soup = BeautifulSoup(source, 'lxml')
    #---------------------some deleting of useless data---------------------------
    for trin in soup.find_all('div', class_='trinity-player-iframe-wrapper'):
        trin.replace_with('')
    for scr in soup.find_all('script'):
        scr.replace_with('')
    # NOTE(review): the next two calls search for *tags* named 'text-center add11' and
    # 'text-center'; these look like CSS class names, in which case nothing is matched.
    # Left untouched because removing by class could also delete wanted text -- confirm.
    for scr in soup.find_all('text-center add11'):
        scr.replace_with('')
    for scr in soup.find_all('text-center'):
        scr.replace_with('')
    for div in soup.find_all('div', 'hidden'):
        div.replace_with('')
    #----------getting title for chapter---------------------------
    try:
        title = soup.find('div', class_='block-title').h1.text
    except AttributeError:
        # No title block or no <h1> in it: fall back to the position in this worker's list.
        title = f'v6 Tensei Shitara Slime Datta Ken (WN) - Chapter {num}'
    #-------------------getting wanted part of the site--------------------
    body = soup.find('div', class_='desc')
    if body is None:
        return None
    return title, body.text


def _break_sentences(text):
    """Insert line breaks after sentence ends and around dialogue.

    A three-character window slides over the text. After every step window[0] has
    already been written (or deliberately dropped) and window[1:] is still pending,
    which lets a branch rewrite the pending characters, mostly to add a newline.
    """
    window = '   '  # must be three wide: the branches index window[2]
    in_dialog = False
    pieces = []
    for ch in text:
        window = window[1:] + ch
        if window in _SPACED_DOT and not in_dialog:  # line end after dot v1
            window = ' \n '
            pieces.append('.')
        elif window[1:] in _DOT_SPACE and not in_dialog:  # line end after dot v2
            pieces.append(window[0])
            window = window[0] + '.\n'
        elif ch in _DIALOG_OPEN and not in_dialog:  # beginning of dialogue/thought
            in_dialog = True
            pieces.append(window[0:2])
            window = window[2] + '\n' + ch
        elif ch in _DIALOG_CLOSE:  # end of the above
            in_dialog = False
            pieces.append(window[0:2])
            window = window[2] + ch + '\n'
        elif ch in _EXCLAMATIONS and not in_dialog:  # breaker for ! and ?
            pieces.append(window[0:2])
            window = window[1] + ch + '\n'
        else:  # every other character
            pieces.append(window[0])
    # Flush the two characters still pending in the window; without this the last two
    # characters of every chapter were silently lost.
    pieces.append(window[1:])
    return ''.join(pieces)


def _transliterate(text, allowed):
    """Replace special characters with plain equivalents; unknown ones become a backslash."""
    pieces = []
    for ch in text:
        if ch in _REPLACEMENTS:
            # Checked before the whitelist so that characters with an explicit replacement
            # (e.g. the triangle or accented letters) are translated instead of rejected.
            pieces.append(_REPLACEMENTS[ch])
        elif ch in allowed:
            pieces.append(ch)
        else:
            pieces.append('\\')
    return ''.join(pieces)


def _wrap_lines(text, width=_LINE_WIDTH):
    """Split over-long lines at the closest space at or before index *width*."""
    # Cut the text into pieces that each start with the newline that introduced them.
    lines, current = [], ''
    for ch in text:
        if ch == '\n':
            lines.append(current)
            current = ''
        current += ch
    lines.append(current)  # keep the text after the final newline (it used to be dropped)
    idx = 0
    while idx < len(lines):  # the list grows while it is scanned, so no plain for-loop
        line = lines[idx]
        if len(line) > width:
            for pos in range(width, -1, -1):
                if line[pos] in _SPACES:  # closest space, searching backwards
                    cut = pos
                    break
            else:  # no space at all: hard cut
                cut = width - 1
            if cut == 0:  # never produce an empty first part
                cut = 1
            lines[idx] = line[:cut] + '\n'
            lines.insert(idx + 1, line[cut:])  # remainder is re-checked on the next pass
        idx += 1
    return ''.join(lines)


def core(links, thr_id='?'):
    """Download, re-flow and convert every chapter in *links*.

    links  -- iterable of chapter URLs handled by this worker
    thr_id -- label printed in progress messages

    For each chapter a text file is written to v7/, converted to PostScript in psv7/
    with enscript and then to PDF in pdfv7/ with ps2pdf. The elapsed time is printed
    once all links have been processed.
    """
    ttime = time_check()
    # One scraper per worker: a requests-style session is not guaranteed to be safe to
    # share between threads, and this function is run from several threads at once.
    scraper = cfscrape.create_scraper()
    allowed = set(chars)  # built once; membership is tested for every character
    for num, url in enumerate(links):
        chapter = _fetch_chapter(scraper, url, num)
        if chapter is None:
            print(thr_id, url, 'skipped: no chapter text found')
            continue
        title, desc = chapter
        desc = _break_sentences(desc)
        desc = _transliterate(desc, allowed)
        desc = _wrap_lines(desc)
        #-------------title adder------------------
        desc = title + '\n' + desc
        #----------------file creator------------------------
        # A '/' in the title would be taken as a directory separator in the paths below.
        file = f"v7 {title.replace('/', '-')}"
        with open(f'v7/{file}.txt', 'w') as handle:
            handle.write(desc)
        #----------------------------ps creation--------------------------
        n_file_ps = f'{file}.ps'
        # run() drains the pipes while waiting; Popen(...PIPE) followed by wait() can
        # block forever once the child has filled a pipe.
        subprocess.run(['enscript', '-p', f'psv7/{n_file_ps}', f'v7/{file}.txt', f'--header={file}'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        #---------------------------pdf creation----------------------------------
        n_file_pdf = f'{file}.pdf'
        subprocess.run(['ps2pdf', f'psv7/{n_file_ps}', f'pdfv7/{n_file_pdf}'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print(thr_id, n_file_pdf, 'done')
    ttime.check()
#-----------------------------folders--------------------------------
# Make sure the three output folders exist and hold no files from an earlier run.
# Everything happens synchronously: the previous 'rm' subprocesses were started but never
# waited for, so they could still be deleting while the worker threads were already
# writing new files with the same 'v7' prefix.
from pathlib import Path  # imported here because only this section needs it

for fold in ['psv7', 'pdfv7', 'v7']:
    out_dir = Path(fold)
    out_dir.mkdir(exist_ok=True)  # creating new folders
    for stale in out_dir.glob('v7*'):  # deleting existing files
        if stale.is_file():
            stale.unlink()
#---------------------threads creation------------------------------------------
#for fixing segmentation error i used terminal command(8192 default) "ulimit -s 16284" <---no, i changed it back
thr_times = 7 #6 sometimes throws up segmentation fault (like 5 but more often)(4 also does that, but more chaotically)
# One worker per id; worker x gets every thr_times-th link starting at offset x, so the
# links are split (almost) equally, and x doubles as the id shown in progress messages.
threads = {
    str(x): threading.Thread(target=core, name=f'Tensura thr.{x}', args=([link[x::thr_times], x]))
    for x in range(thr_times)
}
for worker in threads.values():
    worker.start()
# core([link[80]])
Правка:
Хорошо, поскольку никто не ответил, я сделал обходной путь.
Если вы не можете перехватить ошибку сегментации в самом скрипте, просто создайте другой скрипт, который запускает первый и следит за тем, работает ли он. Довольно изящно, да?
Проблема в том, что если скрипт будет убит ближе к концу, ему придётся начинать заново; зато вы можете отойти от клавиатуры, и если ошибка сегментации — единственная проблема, этот скрипт доведёт дело до конца (если не произойдёт что-то ещё).
import subprocess
import time
import psutil
from fnctime import time_check
def kill(proc_pid):
    """Kill the process with PID *proc_pid* and, before it, all of its descendants."""
    parent = psutil.Process(proc_pid)
    descendants = parent.children(recursive=True)
    for child in descendants:
        child.kill()
    parent.kill()
# Watchdog: run the downloader and start it again whenever it dies from a segmentation
# fault or a UnicodeDecodeError; stop after the first run that ends any other way.
import signal  # imported here because only the watchdog loop needs it

ttime = time_check()
command = 'cd /home/pi/scripts/"Tensura Beautiful soup" && python3.7 "new Tensura download.py"'
# Exit statuses that mean "killed by SIGSEGV": the negative signal number when the child
# itself is reported, 128 + signal number when a shell reports it.
segfault_statuses = (-signal.SIGSEGV, 128 + signal.SIGSEGV)
p0 = None
try:
    while True:
        #begin process; its stdout is inherited, so its output shows up directly
        p0 = subprocess.Popen(command, stderr=subprocess.PIPE, text=True, shell=True)
        # communicate() waits for the exit while draining stderr. Polling without reading
        # could leave the child blocked on a full pipe, e.g. during a long faulthandler dump.
        a, b = p0.communicate()
        # The exit status is checked as well as the text: the shell's message is localised
        # and faulthandler writes a whole traceback, so an exact string match is unreliable.
        if p0.returncode in segfault_statuses or 'Segmentation fault' in b:
            print('--Segmentation fault--')
            print('\nStart it again')
            continue #go again
        if 'UnicodeDecodeError' in b:
            print('Unicode Decode Error')
            print('\nStart it again')
            continue
        print(f'--b: {b} --')
        print('--all done--')
        break #break whole loop
except KeyboardInterrupt: #if ctrl^C was clicked
    if p0 is None:
        print('already dead')
    else:
        try:
            kill(p0.pid) #kill the process
        except psutil.NoSuchProcess:
            print('already dead') #unless it's dead already
ttime.check()
Надеюсь, я хоть как-то помог жить с этой проблемой, не решая её.