При попытке обучить данные doc2ve c в наборе данных SQuAD 2.0:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])
я столкнулся с этой ошибкой:
Python, TypeError: unhashable type: 'list'
Я пытался преобразовать список в кортеж вроде этого, но это не сработало:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
tuples = tuple([x for x in tqdm(train_tagged.values)])
model_dbow.build_vocab(tuples)
Некоторые части кода:
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_clean, test_size=0.2, random_state=42)
def tokenize_text(text):
tokens = []
for sent in nltk.sent_tokenize(text):
for word in nltk.word_tokenize(sent):
if len(word) < 2:
continue
tokens.append(word.lower())
return tokens
train_tagged = df_clean.apply(
lambda r: TaggedDocument(words=tokenize_text(r['Context']), tags=[[r.Question], [r.Answer]]), axis=1)
test_tagged = df_clean.apply(
lambda r: TaggedDocument(words=tokenize_text(r['Context']), tags=[[r.Question], [r.Answer]]), axis=1)
import multiprocessing
cores = multiprocessing.cpu_count()
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
tuples = tuple([x for x in tqdm(train_tagged.values)])
model_dbow.build_vocab(tuples)
Полная трассировка стека:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-41-9cbc59d6600f> in <module>
4 model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
5 tuples = tuple([x for x in tqdm(train_tagged.values)])
----> 6 model_dbow.build_vocab(tuples)
~/anaconda3/lib/python3.7/site-packages/gensim/models/doc2vec.py in build_vocab(self, documents, corpus_file, update, progress_per, keep_raw_vocab, trim_rule, **kwargs)
1182 total_words, corpus_count = self.vocabulary.scan_vocab(
1183 documents=documents, corpus_file=corpus_file, docvecs=self.docvecs,
-> 1184 progress_per=progress_per, trim_rule=trim_rule
1185 )
1186 self.corpus_count = corpus_count
~/anaconda3/lib/python3.7/site-packages/gensim/models/doc2vec.py in scan_vocab(self, documents, corpus_file, docvecs, progress_per, trim_rule)
1379 documents = TaggedLineDocument(corpus_file)
1380
-> 1381 total_words, corpus_count = self._scan_vocab(documents, docvecs, progress_per, trim_rule)
1382
1383 logger.info(
~/anaconda3/lib/python3.7/site-packages/gensim/models/doc2vec.py in _scan_vocab(self, documents, docvecs, progress_per, trim_rule)
1326
1327 for tag in document.tags:
-> 1328 _note_doctag(tag, document_length, docvecs)
1329
1330 for word in document.words:
~/anaconda3/lib/python3.7/site-packages/gensim/models/doc2vec.py in _note_doctag(key, document_length, docvecs)
1253 docvecs.max_rawint = max(docvecs.max_rawint, key)
1254 else:
-> 1255 if key in docvecs.doctags:
1256 docvecs.doctags[key] = docvecs.doctags[key].repeat(document_length)
1257 else:
TypeError: unhashable type: 'list'