Как исправить ошибку "invalid literal for int() with base 10: '<http://rdf.freebase.com/ns/american_football.football_player.footballdb_id>'" (недопустимый литерал для int() с основанием 10) - PullRequest
0 голосов
/ 12 февраля 2019

Вот мой код:

# Reproduction script: tries to load a Freebase dump with gensim's word2vec loader.
import warnings
# Ignore UserWarning messages attributed to the 'gensim' module; the filter is
# registered before gensim itself is imported below.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim.models.keyedvectors as word2vec

# Path to the Freebase dump handed to the loader (compressed, judging by the .gz suffix).
# NOTE(review): the traceback shows the first token of this file's first line is an
# RDF URI ('<http://rdf.freebase.com/ns/...>'), not an integer, so this looks like a
# raw RDF triples dump rather than a word2vec vectors file -- confirm which file is intended.
gvc='D:/download/freebase-rdf-latest.gz'
# load_word2vec_format reads the first line as a "<vocab_size> <vector_size>" header and
# converts both tokens with int() (see the gensim frames in the traceback); that
# conversion is where the ValueError is raised. limit=200000 caps vocab_size.
model=word2vec.KeyedVectors.load_word2vec_format(gvc,binary=True,limit=200000)    

Я использую gensim для word2vec на данных Google Freebase. Этот код хорошо работает для файла Freebase размером 2 ГБ (Freebase Deleted Triples), но выдаёт ошибку на файле размером 22 ГБ (Freebase Triples).

Ошибка:

ValueError                                Traceback (most recent call last)
<ipython-input-54-d97d5ae347db> in <module>()
      7 #gvc='D:/download/freebase-vectors-skipgram1000-en.bin.gz'
      8 gvc='D:/download/freebase-rdf-latest.gz'
----> 9 model=word2vec.KeyedVectors.load_word2vec_format(gvc,binary=True,limit=200000)

C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
   1436         return _load_word2vec_format(
   1437             cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1438             limit=limit, datatype=datatype)
   1439 
   1440     def get_keras_embedding(self, train_embeddings=False):

C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
    171     with utils.smart_open(fname) as fin:
    172         header = utils.to_unicode(fin.readline(), encoding=encoding)
--> 173         vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
    174         if limit:
    175             vocab_size = min(vocab_size, limit)

C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\utils_any2vec.py in <genexpr>(.0)
    171     with utils.smart_open(fname) as fin:
    172         header = utils.to_unicode(fin.readline(), encoding=encoding)
--> 173         vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
    174         if limit:
    175             vocab_size = min(vocab_size, limit)

ValueError: invalid literal for int() with base 10: '<http://rdf.freebase.com/ns/american_football.football_player.footballdb_id>'
...