The spaCy en_core_web_sm-2.2.0 model is not trained on words such as KOYAL, KOYA, etc. One way to get the model to predict such words is to update the en_core_web_sm-2.2.0 model on your own annotated examples.
You can read more about this here.
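First, a quick way to reproduce the problem (a minimal sketch; it assumes en_core_web_sm-2.2.0 is installed, and the sample sentence is made up):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("KOYAL opened a new office last week")
# KOYAL is typically missed or mislabeled by the stock model
print([(ent.text, ent.label_) for ent in doc.ents])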
The code to update the model should look something like this:
import random

import spacy
from spacy.gold import GoldParse  # spaCy v2.x API
from cytoolz import partition_all

# load the pretrained model that will be updated
nlp = spacy.load("en_core_web_sm")
# training data (end offsets are exclusive, matching Span.end_char)
TRAIN_DATA = [
    ("Where is ICICI bank located", {"entities": [(9, 19, "ORG")]}),
    ("I like Thodupuzha and Pala", {"entities": [(7, 17, "LOC"), (22, 26, "LOC")]}),
    ("Thodupuzha is a tourist place", {"entities": [(0, 10, "LOC")]}),
    ("Pala is famous for mangoes", {"entities": [(0, 4, "LOC")]}),
    ("ICICI bank is one of the largest banks in the world", {"entities": [(0, 10, "ORG")]}),
    ("ICICI bank has a branch in Thodupuzha", {"entities": [(0, 10, "ORG"), (27, 37, "LOC")]}),
]
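# (added sanity check, not in the original answer) each slice should print
# exactly the entity text if the offsets above are correct
for text, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        print(label, repr(text[start:end]))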
# preparing the revision data: re-annotate the model's own predictions so the
# update does not erase what the model already knows (pseudo-rehearsal)
revision_data = []
for doc in nlp.pipe(list(zip(*TRAIN_DATA))[0]):
    tags = [w.tag_ for w in doc]
    heads = [w.head.i for w in doc]
    deps = [w.dep_ for w in doc]
    entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    revision_data.append(
        (doc, GoldParse(doc, tags=tags, heads=heads, deps=deps, entities=entities))
    )
# preparing the fine-tune data from the new annotated examples
fine_tune_data = []
for raw_text, entity_offsets in TRAIN_DATA:
    doc = nlp.make_doc(raw_text)
    gold = GoldParse(doc, entities=entity_offsets["entities"])
    fine_tune_data.append((doc, gold))
# training the model
n_epoch = 10
batch_size = 2
for i in range(n_epoch):
    examples = revision_data + fine_tune_data
    losses = {}
    random.shuffle(examples)
    for batch in partition_all(batch_size, examples):
        docs, golds = zip(*batch)
        nlp.update(docs, golds, drop=0.0, losses=losses)
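    # (added, optional) print losses once per epoch to check convergence
    print("epoch", i, "ner loss", losses.get("ner"))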
# finding entities with the updated model
sentence = "ICICI bank has opened a branch in Pala"  # any test sentence
doc = nlp(sentence)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print(entities)
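If you want to keep the updated model, it can be saved with nlp.to_disk and loaded back with spacy.load; a minimal sketch (the directory name is just a placeholder):

# save and reload the updated pipeline
output_dir = "updated_en_core_web_sm"  # placeholder directory name
nlp.to_disk(output_dir)
nlp_updated = spacy.load(output_dir)
print([(ent.text, ent.label_) for ent in nlp_updated("I like Thodupuzha").ents])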