Решение:
import spacy

# Load the English model whose vector table is inspected below.
nlp = spacy.load('en_core_web_md')
text = u"When Sebastian Thrun started working on self-driving cars at "
doc = nlp(text)

# Table mapping a hash key to its row index in the vector array.
key2row = nlp.vocab.vectors.key2row

# For every token: the row of its vector, or None when it has no vector.
ids = [
    # .get() instead of [] so that a token which reports a vector but whose
    # norm hash is not a key yields None rather than raising KeyError.
    key2row.get(token.norm) if token.has_vector else None
    for token in doc
]

print(list(doc))
print(ids)
#>> [When, Sebastian, Thrun, started, working, on, self, -, driving, cars, at]
#>> [71, 19994, None, 369, 422, 19, 587, 32, 1169, 1153, 41]
Разберём по шагам:
# Step-by-step look at the objects used to turn a word into a vector row.
# Lines starting with "# >>" are the outputs recorded for the expression
# directly above them.

# The vocabulary object; indexing it with a string gives an entry for that
# string (a lexeme, per the spaCy docs), not a hash directly.
nlp.vocab
# >> <spacy.vocab.Vocab at 0x12bcdce48>
nlp.vocab['hello'].norm  # the entry's .norm attribute is an integer hash
# >> 5983625672228268878
# The array holding the word vectors; the recorded shape is (rows, width).
nlp.vocab.vectors.data.shape
# >> (20000, 300)
# A dict mapping hash -> row index in that array
nlp.vocab.vectors.key2row
# >> {12646065887601541794: 0,
# >> 2593208677638477497: 1,
# >> ...}
# So, to get the integer row id of 'earth' and then its vector:
i = nlp.vocab.vectors.key2row[nlp.vocab['earth'].norm]
nlp.vocab.vectors.data[i]
# Note that every entry has a hash but may not have a vector
# (hence no entry in .key2row, and the lookup above would raise KeyError).
nlp.vocab['Thrun'].has_vector
# >> False