Похоже, что PunktSentenceTokenizer просто игнорирует заданные сокращения и коллокации, когда следующее слово начинается с заглавной буквы. Я составил небольшой пример, чтобы продемонстрировать это поведение; в реальности у меня документ большего размера, и я также пробовал обучать токенизатор и т. д.
import docx
import nltk
import pickle
# Minimal reproduction: ask the Punkt sentence tokenizer to explain each
# potential sentence-break decision it makes around "P.O. Box" / "P. O. Box".

# Load the pre-trained English Punkt model from the NLTK data store.
punkt_tk = nltk.data.load('tokenizers/punkt/english.pickle')

# Register "P.O." and its spaced variants as known abbreviations.
# NOTE(review): Punkt appears to store abbreviation types lower-cased and
# without the trailing period. A type containing a space ('p. o') presumably
# never matches a whitespace-delimited token -- confirm against the nltk source.
for abbrev in ('p.o', 'p. o', 'o'):
    punkt_tk._params.abbrev_types.add(abbrev)

# Register "<abbreviation> Box" pairs as collocations.
# NOTE(review): these keys keep the trailing period on the first element.
# The tokenizer's second pass may look collocations up WITHOUT that period
# (i.e. ('p.o', 'box'), ('o', 'box')), in which case these entries would only
# affect the "Collocation?" line of the debug report -- verify before relying
# on them to suppress a sentence break.
for collocation in (('p.o.', 'box'), ('p. o.', 'box'), ('o.', 'box')):
    punkt_tk._params.collocations.add(collocation)

txt = 'its registered office address at P.O. Box 111 and having its registered office address at P. O. Box: 222'

# debug_decisions yields one record per potential sentence boundary;
# format_debug_decision renders each record as a human-readable report.
d = punkt_tk.debug_decisions(txt)
for x in d:
    print(nltk.tokenize.punkt.format_debug_decision(x))
Результат выполнения:
Text: 'P.O. Box' (at offset 36)
Sentence break? True (abbreviation + orthographic heuristic)
Collocation? True
'p.o.':
known abbreviation: True
is initial: False
'box':
known sentence starter: False
orthographic heuristic suggests is a sentence starter? True
orthographic contexts in training: {'UNK-LC', 'MID-LC'}
Text: 'P. O.' (at offset 91)
Sentence break? False (initial + special orthographic heuristic)
Collocation? False
'p.':
known abbreviation: True
is initial: True
'o.':
known sentence starter: False
orthographic heuristic suggests is a sentence starter? unknown
orthographic contexts in training: set()
Text: 'O. Box:' (at offset 94)
Sentence break? None (default decision)
Collocation? True
'o.':
known abbreviation: True
is initial: True
'box':
known sentence starter: False
orthographic heuristic suggests is a sentence starter? True
orthographic contexts in training: {'UNK-LC', 'MID-LC'}
Ожидается, что нежелательного разрыва предложения не будет.