@SidharthMacherla направил меня на правильный путь (NLTK и токенизация), хотя его решение не решает проблему многословных выражений, которые к тому же могут перекрываться.
Короче говоря, лучший метод, который я нашёл, заключается в создании подкласса MWETokenizer
из NLTK и добавлении функции для подсчёта многословных выражений с использованием util.Trie:
import re, regex, timeit
from nltk.tokenize import MWETokenizer
from nltk.util import Trie
class FreqMWETokenizer(MWETokenizer):
    """An MWETokenizer that can also count multi-word expressions.

    In addition to the inherited tokenizer behaviour, :meth:`freqs` walks the
    token list and tallies every known multi-word expression it encounters,
    including expressions that overlap one another.
    """

    def __init__(self, mwes=None, separator="_"):
        super().__init__(mwes, separator)

    def freqs(self, text):
        """Count how often each known multi-word expression occurs in ``text``.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A frequency dictionary with multi-words merged together as keys
        :rtype: dict

        :Example:

        >>> tokenizer = FreqMWETokenizer([ mw.split() for mw in ['multilayer ceramic', 'multilayer ceramic capacitor', 'ceramic capacitor']], separator=' ')
        >>> tokenizer.freqs("Gimme that multilayer ceramic capacitor please!".split())
        {'multilayer ceramic': 1, 'multilayer ceramic capacitor': 1, 'ceramic capacitor': 1}
        """
        counts = {}
        lexicon = self._mwes
        joiner = self._separator
        total = len(text)

        def record(first, last):
            # Tally the expression spanning text[first:last].
            key = joiner.join(text[first:last])
            counts[key] = counts.get(key, 0) + 1

        # Every token position is tried as the start of an expression, which
        # is what allows overlapping expressions to be counted.
        for start in range(total):
            if text[start] not in lexicon:
                continue

            # Descend the trie one token at a time from this start position.
            node = lexicon
            end = start
            while end < total and text[end] in node:
                if Trie.LEAF in node:
                    # A shorter expression ends here; a longer one may follow.
                    record(start, end)
                node = node[text[end]]
                end += 1

            # The walk stopped; the node reached last may itself be a
            # complete expression.
            if Trie.LEAF in node:
                record(start, end)

        return counts
>>> tokenizer = FreqMWETokenizer([ mw.split() for mw in ['multilayer ceramic', 'multilayer ceramic capacitor', 'ceramic capacitor']], separator=' ')
>>> tokenizer.freqs("Gimme that multilayer ceramic capacitor please!".split())
{'multilayer ceramic': 1, 'multilayer ceramic capacitor': 1, 'ceramic capacitor': 1}
Вот набор тестов с замерами скорости:
Для подсчёта 10 000 многословных терминов в 10 млн символов потребовалось 2 секунды с FreqMWETokenizer, 4 секунды с MWETokenizer (он также даёт полную токенизацию, но перекрытия не учитываются), 150 секунд с простым методом подсчёта и 1000 секунд с одним большим регулярным выражением. Попытка обработать 100 тыс. терминов в 100 млн символов остаётся выполнимой только для токенизаторов, но не для простого подсчёта или регулярных выражений.
Для тестирования используйте два больших файла с примерами: https://mega.nz/file/PsVVWSzA#5-OHy-L7SO6fzsByiJzeBnAbtJKRVy95YFdjeF_7yxA
def freqtokenizer(thissampledict, thissampletext):
    """Count multi-word terms in a text using ``FreqMWETokenizer.freqs``.

    It captures overlapping multi-words.

    Before tokenizing, every run of characters other than whitespace, word
    characters, ``/``, ``-`` and ``'`` is replaced by a single space, and the
    text is then split on whitespace.

    :param thissampledict: iterable of terms, each a whitespace-separated string
    :param thissampletext: the text to search, as one string
    :return: dict mapping each term found (words joined by one space) to its count

    Timings reported by the author:

    counting 1000 terms in 1000000 characters took 0.3222855870008061 seconds. found 0 terms from the list.
    counting 10000 terms in 10000000 characters took 2.5309120759993675 seconds. found 21 terms from the list.
    counting 100000 terms in 29467534 characters took 10.57763242800138 seconds. found 956 terms from the list.
    counting 743274 terms in 29467534 characters took 25.613067482998304 seconds. found 10411 terms from the list.
    """
    tokenizer = FreqMWETokenizer([mw.split() for mw in thissampledict], separator=' ')
    # Raw string: \s, \w, \/ and \- are regex escapes, not Python string
    # escapes; in a plain literal they trigger a SyntaxWarning on Python 3.12+.
    # Removes punctuation except /-'_ (the underscore is covered by \w).
    cleaned = re.sub(r"[^\s\w\/\-']+", ' ', thissampletext)
    cleaned = re.sub(' +', ' ', cleaned)
    return tokenizer.freqs(cleaned.split())
def nltkmethod(thissampledict, thissampletext):
    """Count multi-word terms by tokenizing with NLTK's ``MWETokenizer``.

    This function first produces a tokenization by means of MWETokenizer.
    This takes the biggest matching multi-word, no overlaps.
    They could be computed separately on the dictionary.

    The text is cleaned the same way as in the other tokenizer-based method:
    every run of characters other than whitespace, word characters, ``/``,
    ``-`` and ``'`` becomes a single space before splitting on whitespace.

    :param thissampledict: iterable of terms, each a whitespace-separated string
    :param thissampletext: the text to search, as one string
    :return: dict mapping each term found in the tokenization to its count

    Timings reported by the author:

    counting 1000 terms in 1000000 characters took 0.34804968100070255 seconds. found 0 terms from the list.
    counting 10000 terms in 10000000 characters took 3.9042628339993826 seconds. found 20 terms from the list.
    counting 100000 terms in 29467534 characters took 12.782784996001283 seconds. found 942 terms from the list.
    counting 743274 terms in 29467534 characters took 28.684293715999956 seconds. found 9964 terms from the list.
    """
    # FreqDist is not among the module-level imports, so import it here;
    # otherwise the call below raises NameError.
    from nltk.probability import FreqDist

    tokenizer = MWETokenizer([mw.split() for mw in thissampledict], separator=' ')
    # Raw string: \s, \w, \/ and \- are regex escapes, not Python string
    # escapes; in a plain literal they trigger a SyntaxWarning on Python 3.12+.
    # Removes punctuation except /-'_ (the underscore is covered by \w).
    cleaned = re.sub(r"[^\s\w\/\-']+", ' ', thissampletext)
    cleaned = re.sub(' +', ' ', cleaned)
    tokens = tokenizer.tokenize(cleaned.split())
    freqdist = FreqDist(tokens)

    # Keep only the tokens that are terms from the list; ordinary single
    # words produced by the tokenizer are dropped.
    wanted = set(thissampledict)
    return {token: count for token, count in freqdist.items() if token in wanted}
def countmethod(thissampledict, thissampletext):
    """Count each term with a plain ``str.count`` over the whole text.

    Every term in ``thissampledict`` gets an entry in the result, including
    terms that occur zero times.

    :param thissampledict: iterable of term strings
    :param thissampletext: the text to search, as one string
    :return: dict mapping each term to its ``str.count`` result in the text

    Timings reported by the author:

    counting 1000 in 1000000 took 0.9351876619912218 seconds.
    counting 10000 in 10000000 took 91.92642056700424 seconds.
    counting 100000 in 29467534 took 3185.7411157219904 seconds.
    """
    return {
        candidate: thissampletext.count(candidate)
        for candidate in thissampledict
    }
def regexmethod(thissampledict, thissampletext):
    """Count terms with one large alternation regex.

    All terms are combined into a single pattern in which each term must be
    delimited by word boundaries on both sides. Matches are non-overlapping,
    and when several terms could match at the same position the one listed
    first wins.

    :param thissampledict: iterable of term strings, matched literally
    :param thissampletext: the text to search, as one string
    :return: dict mapping each term that matched at least once to its count;
        terms that never match are absent

    Timings reported by the author:

    counting 1000 terms in 1000000 characters took 2.298602456023218 seconds.
    counting 10000 terms in 10000000 characters took 395.46084802100086 seconds.
    counting 100000: impossible
    """
    termfreqdic = {}

    # Escape each term so that regex metacharacters in it are matched
    # literally; skip empty terms, which would match at every word boundary.
    alternatives = [re.escape(term) for term in thissampledict if term]
    if not alternatives:
        # An empty alternation would match the empty string everywhere.
        return termfreqdic

    # Wrapping the whole alternation in one group puts a word boundary on
    # both sides of every term, including the last one.
    termregex = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')
    for m in termregex.finditer(thissampletext):
        found = m.group(0)
        termfreqdic[found] = termfreqdic.get(found, 0) + 1
    return termfreqdic
def timing():
    """Benchmark the four counting methods on growing inputs and print the results.

    Reads the sample text (lower-cased) and the newline-separated term list
    from the current directory, then for four sizes — 1000, 10000, 100000 and
    1000000 terms against 10**6 to 10**9 characters, each capped by what the
    files actually contain — runs every method and prints the elapsed time
    together with the number of terms that had a non-zero count.

    for testing, find the two large sample files at
    https://mega.nz/file/PsVVWSzA#5-OHy-L7SO6fzsByiJzeBnAbtJKRVy95YFdjeF_7yxA
    """
    # Context managers make sure both files are closed after reading.
    with open("G06K0019000000.txt") as textfile:
        sampletext = textfile.read().lower()
    with open("manyterms.lower.txt") as termfile:
        sampledict = termfile.read().strip().split('\n')
    print(len(sampletext),'characters',len(sampledict),'terms')

    for i in range(4):
        # The slices depend only on the size step, so build them once per
        # step and keep the copying out of the timed region.
        thissampledict = sampledict[:1000*10**i]
        thissampletext = sampletext[:1000000*10**i]
        for f in [freqtokenizer, nltkmethod, countmethod, regexmethod]:
            start = timeit.default_timer()
            termfreqdic = f(thissampledict, thissampletext)
            seconds = timeit.default_timer() - start
            # Only terms with a non-zero count are reported as found.
            found = sum(1 for v in termfreqdic.values() if v)
            print('{f} counting {terms} terms in {characters} characters took {seconds} seconds. found {termfreqdic} terms from the list.'.format(f=f, terms=len(thissampledict), characters=len(thissampletext), seconds=seconds, termfreqdic=found))
# Top-level call: the benchmark runs whenever this file is executed or imported.
timing()