I am trying to run a Cython file in parallel. Here is the skeleton of my code:
from libcpp.map cimport map
from libcpp.set cimport set
from libcpp.string cimport string
from libcpp.vector cimport vector

import gc
import math

import joblib
from joblib import Parallel, delayed
from tqdm import tqdm

def build_DF_single(lo, hi, map[long, set[string]] authors_id_map, map[long, set[string]] authors_org_map,
                    map[long, set[string]] fos_name_map, map[long, set[string]] publisher_map,
                    map[long, set[string]] year_map, map[long, set[long]] reference_map,
                    map[string, double] authors_id_prob, map[string, double] authors_org_prob,
                    map[string, double] fos_name_prob, map[string, double] publisher_prob,
                    map[string, double] year_prob, map[string, set[long]] authors_id_co,
                    map[string, set[long]] authors_org_co, map[string, set[long]] fos_name_co,
                    map[string, set[long]] publisher_co, map[string, set[long]] year_co,
                    map[long, vector[double]] doc2vec_map):
    # `lines` is the module-level list read in below
    for i in tqdm(range(lo, hi)):
        line = lines[i]
        # Data cleaning on <line>

def mmap(name):
    # Re-open a joblib pickle; mmap_mode="r" memory-maps any NumPy arrays inside it
    d = joblib.load("mmap/" + name + ".mmap", mmap_mode="r")
    gc.collect()
    return d
authors_id_prob = mmap("authors_id_prob")
authors_org_prob = mmap("authors_org_prob")
fos_name_prob = mmap("fos_name_prob")
publisher_prob = mmap("publisher_prob")
year_prob = mmap("year_prob")
authors_id_co = mmap("authors_id_co")
authors_org_co = mmap("authors_org_co")
fos_name_co = mmap("fos_name_co")
publisher_co = mmap("publisher_co")
year_co = mmap("year_co")
doc2vec_map = mmap("doc2vec_map")
with open("file", "r") as f:
    lines = f.readlines()  # Pretty large as well

batch_size = int(math.ceil(len(lines) / n_cpu))
results = Parallel(n_jobs=n_cpu, prefer="threads", max_nbytes=None)(delayed(build_DF_single)(
    (i * batch_size), min((i + 1) * batch_size, len(lines)),
    authors_id_map, authors_org_map, fos_name_map, publisher_map, year_map, reference_map,
    authors_id_prob, authors_org_prob, fos_name_prob, publisher_prob, year_prob,
    authors_id_co, authors_org_co, fos_name_co, publisher_co, year_co, doc2vec_map
) for i in range(n_cpu))
where authors_id_map, authors_org_map, fos_name_map, publisher_map, year_map, reference_map, authors_id_prob, authors_org_prob, fos_name_prob, publisher_prob, year_prob, authors_id_co, authors_org_co, fos_name_co, publisher_co, year_co and doc2vec_map are all very large C++ maps. Since I don't want them copied into every worker process, I make memory maps for them instead (a sketch of the dump step is included after the traceback below). However, when my code reaches the Parallel() part, I get the following error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "buildDF.pyx", line 473, in buildDF.build_DF
    results = Parallel(n_jobs = n_cpu, require="sharedmem", prefer="threads", max_nbytes=None)(delayed(build_DF_single)(
  File "/home/zhangji/.local/lib/python2.7/site-packages/joblib/parallel.py", line 1004, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/zhangji/.local/lib/python2.7/site-packages/joblib/parallel.py", line 808, in dispatch_one_batch
    islice = list(itertools.islice(iterator, big_batch_size))
  File "buildDF.pyx", line 475, in genexpr
    authors_id_map, authors_org_map, fos_name_map, publisher_map, year_map, reference_map, authors_id_prob, authors_org_prob, fos_name_prob, publisher_prob, year_prob, authors_id_co, authors_org_co, fos_name_co, publisher_co, year_co, doc2vec_map
  File "stringsource", line 207, in map.to_py.__pyx_convert_map_to_py_std_3a__3a_string____double
MemoryError
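For reference, the .mmap files are created up front with joblib.dump. A minimal sketch of that step (the dict contents here are placeholders, not my real data):

import joblib

# One-time dump: pickle each large structure to disk so it can later be
# re-opened with joblib.load(..., mmap_mode="r") in the mmap() helper above.
authors_id_prob = {"author_key": 0.5}  # placeholder contents
joblib.dump(authors_id_prob, "mmap/authors_id_prob.mmap")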
Can anyone tell me what is going on? What is "stringsource"?
Thanks!