Вы можете использовать re.findall, чтобы разбить текст на токены, отфильтровать их и сгруппировать в предложения, а затем обойти структуру ключевых и связанных с ними слов, чтобы получить итоговые значения:
import re, string
from collections import namedtuple
import itertools
# Load the stop-word list (one entry per line) and the text to analyse.
# NOTE(review): both reads use the same placeholder name 'filename.txt';
# point each at the real file before running.
# Context managers make sure both file handles are closed.
with open('filename.txt') as stop_file:
    stop_words = [i.strip('\n') for i in stop_file]
with open('filename.txt') as text_file:
    text = text_file.read()

# Token categories, checked in insertion order; anything matching neither
# category is tagged 'word'.
grammar = {'punctuation': string.punctuation, 'stopword': stop_words}
token = namedtuple('token', ['name', 'value'])


def _token_name(value):
    """Return the first grammar category containing the lowercased value, else 'word'."""
    lowered = value.lower()
    for name, members in grammar.items():
        if lowered in members:
            return name
    return 'word'


# Raw string so the regex escapes are not treated as (invalid) string escapes.
# The last alternative ',:' only matches the literal two-character sequence
# ",:", so lone commas and colons are never emitted as tokens; '?' is not in
# the pattern either.
tokenized_file = [
    token(_token_name(i), i)
    for i in re.findall(r'\w+|\!|\-|\.|;|,:', text)
]

# Drop stop words; punctuation and ordinary words are kept.
filtered_file = [i for i in tokenized_file if i.name != 'stopword']

# Split the stream into alternating runs: tokens that are not sentence
# terminators, then tokens that are ('!', '.', '?').
grouped_data = [
    list(b)
    for _, b in itertools.groupby(filtered_file, key=lambda x: x.value not in '!.?')
]

# Pair each even-indexed run with the first token of the run that follows it.
# This assumes the stream starts with a non-terminator run. If the text does
# not end with a terminator there is no following run, so nothing is appended
# instead of raising IndexError.
text_with_sentences = ' '.join(
    ' '.join(c.value for c in grouped_data[i])
    + (grouped_data[i + 1][0].value if i + 1 < len(grouped_data) else '')
    for i in range(0, len(grouped_data), 2)
)
Теперь значение text_with_sentences:
'Mississippi worth reading. commonplace river contrary ways remarkable. Considering Missouri main branch longest river - -. seems safe crookedest river part journey uses cover ground crow fly six seventy - five. discharges water St. Lawrence twenty - five Rhine thirty - eight Thames. river vast drainage - basin draws water supply twenty - eight States Territories ; Delaware Atlantic seaboard country Idaho Pacific slope - - spread forty - five degrees longitude. Mississippi receives carries Gulf water fifty - subordinate rivers navigable steamboats hundreds navigable flats keels. area drainage - basin combined areas England Wales Scotland Ireland France Spain Portugal Germany Austria Italy Turkey ; almost wide region fertile ; Mississippi valley proper exceptionally.'
Чтобы подсчитать, сколько раз встречаются ключевые слова, можно использовать collections.Counter:
import collections
# Count every lowercased run of word characters and hyphens in the text.
# Raw string so '\w' and '\-' reach the regex engine without being treated
# as (invalid) string escapes.
counts = collections.Counter(map(str.lower, re.findall(r'[\w\-]+', text)))

# Each entry pairs a keyword with the list of words associated with it.
structure = [['river', ['ground', 'journey', 'longitude', 'main', 'world--four', 'contrary', 'cover']], ['mississippi', ['area', 'steamboats', 'germany', 'reading', 'france', 'proper']]]

# Replace every keyword and associated word with its count (0 when absent).
new_structure = [
    {'keyword': counts.get(a, 0), 'associated': {i: counts.get(i, 0) for i in b}}
    for a, b in structure
]
Вывод:
[{'associated': {'cover': 1, 'longitude': 1, 'journey': 1, 'contrary': 1, 'main': 1, 'world--four': 1, 'ground': 1}, 'keyword': 4}, {'associated': {'area': 1, 'france': 1, 'germany': 1, 'proper': 1, 'reading': 1, 'steamboats': 1}, 'keyword': 3}]
Без использования каких-либо модулей можно обойтись методом str.split:
# Split on whitespace and peel one trailing punctuation character off each
# chunk, so 'river,' becomes ['river', ','].
# The len(i) > 1 guard keeps a chunk that is a single punctuation character
# whole, rather than splitting it into an empty-string "word" plus the mark.
words = [
    [i[:-1], i[-1]] if len(i) > 1 and i[-1] in string.punctuation else [i]
    for i in text.split()
]

# Flatten the pieces and drop stop words (compared case-insensitively).
new_words = [i for b in words for i in b if i.lower() not in stop_words]
def find_groups(d, _pivot='.'):
    """Yield the items of *d* joined into sentences, split at *_pivot*.

    Items are collected until an item equal to ``_pivot`` is seen; the
    collected items are then joined with single spaces, the pivot is
    appended, and the result is yielded.

    Args:
        d: Iterable of string tokens.
        _pivot: Token that ends a sentence (default ``'.'``).

    Yields:
        One string per sentence. Items left after the last pivot are
        yielded as a final string without a trailing pivot, so they are
        not silently lost.
    """
    current = []
    for i in d:
        if i == _pivot:
            # Append the actual pivot, not a hard-coded '.'.
            yield ' '.join(current) + _pivot
            current = []
        else:
            current.append(i)
    if current:
        yield ' '.join(current)
# Show the sentences rebuilt from the filtered words.
print(list(find_groups(new_words)))

# Tally how often each filtered word occurs, ignoring case.
counts = {}
for word in new_words:
    key = word.lower()
    counts[key] = counts.get(key, 0) + 1

# Each entry pairs a keyword with the list of words associated with it.
structure = [['river', ['ground', 'journey', 'longitude', 'main', 'world--four', 'contrary', 'cover']], ['mississippi', ['area', 'steamboats', 'germany', 'reading', 'france', 'proper']]]

# Replace every keyword and associated word with its tally (0 when absent).
new_structure = [
    {
        'keyword': counts.get(keyword, 0),
        'associated': {related: counts.get(related, 0) for related in related_words},
    }
    for keyword, related_words in structure
]
Вывод:
['Mississippi worth reading.', 'commonplace river , contrary ways remarkable.', 'Considering Missouri main branch , longest river world--four.', 'seems safe crookedest river , part journey uses cover ground crow fly six seventy-five.', 'discharges water St.', 'Lawrence , twenty-five Rhine , thirty-eight Thames.', 'river vast drainage-basin : draws water supply twenty-eight States Territories ; Delaware , Atlantic seaboard , country Idaho Pacific slope--a spread forty-five degrees longitude.', 'Mississippi receives carries Gulf water fifty-four subordinate rivers navigable steamboats , hundreds navigable flats keels.', 'area drainage-basin combined areas England , Wales , Scotland , Ireland , France , Spain , Portugal , Germany , Austria , Italy , Turkey ; almost wide region fertile ; Mississippi valley , proper , exceptionally.']
[{'associated': {'cover': 1, 'longitude': 1, 'journey': 1, 'contrary': 1, 'main': 1, 'world--four': 1, 'ground': 1}, 'keyword': 4}, {'associated': {'area': 1, 'france': 1, 'germany': 1, 'proper': 1, 'reading': 1, 'steamboats': 1}, 'keyword': 3}]