Python - получение пустых «сущностей» при создании пользовательской модели в Spacy NER - PullRequest
0 голосов
/ 21 сентября 2018

Я не могу создать собственную модель, используя Spacy.Я использовал код, приведенный в официальной документации Spacy в разделе «Обновление распознавателя именованных сущностей».Я использовал тот же код, я только что изменил данные обучения вместе с моими.При тестировании модели создаются токены, но не сущности.Любая помощь будет оценена.

# training data
TRAIN_DATA = [
('This Agreement was signed between Vibhav Verma and N.C. Buildwell Inc. It 
is a very reputable company.', {
    'entities': [(51, 69, 'ORG')]
}),
('This company is also associated with V.V. Telephone Company and L.S. Legal 
Services', {
    'entities': [(37, 62, 'ORG'), (68, 86, 'ORG')]
})
]

def main(model=None, output_dir = None, n_iter=100):

"""Load the model, set up the pipeline and train the entity recognizer."""

if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

# add labels
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

'''
Output:
# The entities are empty
Entities []
Tokens [(u'This', u'', 2), (u'Agreement', u'', 2), (u'was', u'', 2), 
(u'signed', u'', 2), (u'between', u'', 2), (u'Vibhav', u'', 2), (u'Verma', 
u'', 2), (u'and', u'', 2), (u'N.C.', u'',
2), (u'Buildwell', u'', 2), (u'Inc.', u'', 2), (u'It', u'', 2), (u'is', u'',  
2), (u'a', u'', 2), (u'very', u'', 2), (u'reputable', u'', 2), (u'company', 
u'', 2), (u'.', u'', 2)]
# The entities are empty
Entities [] 
Tokens [(u'This', u'', 2), (u'company', u'', 2), (u'is', u'', 2), (u'also', 
u'', 2), (u'associated', u'', 2), (u'with', u'', 2), (u'V.V.', u'', 2), 
(u'Telephone', u'', 2), (u'Company',
u'', 2), (u'and', u'', 2), (u'L.S.', u'', 2), (u'Legal', u'', 2), 
(u'Services', u'', 2)]

'''
...