Я не могу создать собственную модель с помощью spaCy. Я использовал код из официальной документации spaCy, раздел «Обновление распознавателя именованных сущностей». Код тот же самый, я лишь заменил обучающие данные своими. При тестировании модели токены создаются, а сущности — нет. Буду благодарен за любую помощь.
# Training data: (text, {'entities': [(start_char, end_char, label), ...]}).
# Offsets are character indices into the text, with an exclusive end, so
# text[start_char:end_char] must be exactly the entity string. In particular
# the span has to include the trailing "." of "Inc.".
# The texts are written with implicit string concatenation; the line breaks of
# the original listing are replaced by single spaces.
TRAIN_DATA = [
    (
        'This Agreement was signed between Vibhav Verma and '
        'N.C. Buildwell Inc. It is a very reputable company.',
        {
            # text[51:70] == 'N.C. Buildwell Inc.'
            'entities': [(51, 70, 'ORG')]
        },
    ),
    (
        'This company is also associated with V.V. Telephone Company and '
        'L.S. Legal Services',
        {
            # text[37:59] == 'V.V. Telephone Company'
            # text[64:83] == 'L.S. Legal Services'
            'entities': [(37, 59, 'ORG'), (64, 83, 'ORG')]
        },
    ),
]
def _find_misaligned_entities(nlp, train_data):
    """Return the annotated spans that do not fall on token boundaries.

    Each text is tokenized with ``nlp.make_doc`` and every
    ``(start, end, label)`` annotation is checked with ``Doc.char_span``,
    which returns ``None`` when the character offsets cannot be mapped onto
    whole tokens.

    Returns a list of ``(text, start, end, label)`` tuples, empty when all
    spans are usable.
    """
    misaligned = []
    for text, annotations in train_data:
        doc = nlp.make_doc(text)  # tokenization only, no pipeline components
        for start, end, label in annotations.get('entities', []):
            if doc.char_span(start, end) is None:
                misaligned.append((text, start, end, label))
    return misaligned


def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    model: name or path of an existing spaCy model to load; when None a
        blank English pipeline is created instead.
    output_dir: accepted for interface compatibility; not used in this
        function body.
    n_iter: number of passes over TRAIN_DATA.

    Prints the losses after every pass and, after training, the entities and
    tokens predicted for each training text. Emits a warning for every
    annotation whose character offsets do not match token boundaries.
    """
    import warnings

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # Report annotations that cannot be mapped onto tokens before any training
    # happens; such spans give the recognizer nothing usable to learn from.
    for text, start, end, label in _find_misaligned_entities(nlp, TRAIN_DATA):
        warnings.warn(
            "Entity span (%d, %d, %r) = %r does not align with token "
            "boundaries; check the character offsets"
            % (start, end, label, text[start:end]))

    # add labels (a missing 'entities' key is treated as "no entities")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is also called when an existing model
        # was loaded -- confirm whether that re-initializes its weights in the
        # installed spaCy version.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  # shuffles the module-level list in place
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
'''
Output:
# The entities are empty
Entities []
Tokens [(u'This', u'', 2), (u'Agreement', u'', 2), (u'was', u'', 2),
(u'signed', u'', 2), (u'between', u'', 2), (u'Vibhav', u'', 2), (u'Verma',
u'', 2), (u'and', u'', 2), (u'N.C.', u'',
2), (u'Buildwell', u'', 2), (u'Inc.', u'', 2), (u'It', u'', 2), (u'is', u'',
2), (u'a', u'', 2), (u'very', u'', 2), (u'reputable', u'', 2), (u'company',
u'', 2), (u'.', u'', 2)]
# The entities are empty
Entities []
Tokens [(u'This', u'', 2), (u'company', u'', 2), (u'is', u'', 2), (u'also',
u'', 2), (u'associated', u'', 2), (u'with', u'', 2), (u'V.V.', u'', 2),
(u'Telephone', u'', 2), (u'Company',
u'', 2), (u'and', u'', 2), (u'L.S.', u'', 2), (u'Legal', u'', 2),
(u'Services', u'', 2)]
'''