Я тренируюсь в модели Spacy по заказу NER (именованного объекта). Я следовал инструкциям в ссылках https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718 и https://spacy.io/usage/training#ner
В соответствии с приведенными примерами, каждое слово имеет метку для сущности. Я закончил это успешно, когда я тренировался, используя одно слово для каждого ярлыка. Но в моем сценарии мне нужно обучить его более чем одному слову или предложению для сущности. Иногда это может быть 7 слов или более, например: «Произошла частичная потеря 30 BBL» или « Общие потери = 50 баррелей "... et c. Однако, когда я сделал, что код выдаёт мне следующую ошибку:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-45-a9c9ec92bab3> in <module>
110 n_iter= 30 #("Number of training iterations", "option", "n", int))
111
--> 112 train_test(model, new_model_name, output_dir, n_iter)
<ipython-input-45-a9c9ec92bab3> in train_test(model, new_model_name, output_dir, n_iter)
74 texts, annotations = zip(*batch)
75 nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
---> 76 losses=losses)
77 print('Losses', losses)
78
~\AppData\Local\Continuum\anaconda3\lib\site-packages\spacy\language.py in update(self, docs, golds, drop, sgd, losses, component_cfg)
513 kwargs = component_cfg.get(name, {})
514 kwargs.setdefault("drop", drop)
--> 515 proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
516 for key, (W, dW) in grads.items():
517 sgd(W, dW, key=key)
nn_parser.pyx in spacy.syntax.nn_parser.Parser.update()
nn_parser.pyx in spacy.syntax.nn_parser.Parser._init_gold_batch()
ner.pyx in spacy.syntax.ner.BiluoPushDown.preprocess_gold()
ner.pyx in spacy.syntax.ner.BiluoPushDown.has_gold()
TypeError: object of type 'NoneType' has no len()
мой python код:
#!/usr/bin/env python
# coding: utf8
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# New entity labels
# Specify the new entity labels which you want to add here
LABELSS = ["MY_CUSTOM_ENTITY" , "U-Tag"]
# Loading training data
with open ('C:\\Users\\NER\\\ner_corpus_spacy_format_data.json', 'rb') as fp:
TRAIN_DATA = pickle.load(fp)
@plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
new_model_name=("New model name for model meta.", "option", "nm", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
def train_test(model=None, new_model_name='AnyName43', output_dir=None, n_iter=30):
"""Setting up the pipeline and entity recognizer, and training the new entity."""
if model is not None:
nlp = spacy.load(model) # load existing spacy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank('en') # create blank Language class
print("Created blank 'en' model")
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
ner.add_label(ent[2])
#for i in LABELSS:
#ner.add_label(i) # Add new entity labels to entity recognizer
if model is None:
optimizer = nlp.begin_training()
else:
optimizer = nlp.entity.create_optimizer()
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
losses=losses)
print('Losses', losses)
# Test the trained model
test_text = 'This is the text that has the instance of my custom entity. I am not using actual data since it is confidential, it can be something like: Had total loss of 60 BBLs or total losses = 85 BBLs. I have dataframe which consists of thousands of records.'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
print(ent.label_, ent.text)
# Save model
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.meta['name'] = new_model_name # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# Test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
for ent in doc2.ents:
print(ent.label_, ent.text)
#if __name__ == '__main__':
#plac.call(train_test)
model= None #"en_core_web_sm" #("Model name. Defaults to blank 'en' model.", "option", "m", str),
new_model_name= "MyModelName" #("New model name for model meta.", "option", "nm", str),
output_dir= 'C:\\Users\\NER\\TRAIN_TEST_OUTPUT' #("Optional output directory", "option", "o", Path),
n_iter= 30 #("Number of training iterations", "option", "n", int))
train_test(model, new_model_name, output_dir, n_iter)
Ценю вашу помощь, так как я застрял в этой точке, и я не удалось найти решение по inte rnet.