это мой код.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim.models.keyedvectors as word2vec
gvc='D:/download/freebase-rdf-latest.gz'
model=word2vec.KeyedVectors.load_word2vec_format(gvc,binary=True,limit=200000)
Я использую gensim для word2vec на google freebase.Этот код хорошо работает для файла freebase 2GiG (Freebase Deleted Triples). Но он выдает ошибку на 22 ГБGiG (Freebase Triples).
Ошибка
ValueError Traceback (most recent call last)
<ipython-input-54-d97d5ae347db> in <module>()
7 #gvc='D:/download/freebase-vectors-skipgram1000-en.bin.gz'
8 gvc='D:/download/freebase-rdf-latest.gz'
----> 9 model=word2vec.KeyedVectors.load_word2vec_format(gvc,binary=True,limit=200000)
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
1436 return _load_word2vec_format(
1437 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1438 limit=limit, datatype=datatype)
1439
1440 def get_keras_embedding(self, train_embeddings=False):
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
171 with utils.smart_open(fname) as fin:
172 header = utils.to_unicode(fin.readline(), encoding=encoding)
--> 173 vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
174 if limit:
175 vocab_size = min(vocab_size, limit)
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\utils_any2vec.py in <genexpr>(.0)
171 with utils.smart_open(fname) as fin:
172 header = utils.to_unicode(fin.readline(), encoding=encoding)
--> 173 vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
174 if limit:
175 vocab_size = min(vocab_size, limit)
ValueError: invalid literal for int() with base 10: '<http://rdf.freebase.com/ns/american_football.football_player.footballdb_id>'