Код заимствован из pytudes Питера Норвига и выполняет сегментацию слов. Попробуйте:
import re
import math
import random
import matplotlib.pyplot as plt
from collections import Counter
from itertools import permutations
from typing import List, Tuple, Set, Dict, Callable
# IPython/Jupyter shell escape (not plain Python): download words.txt from the dwyl/english-words repository.
# NOTE(review): the corpus opened further down is 'big.txt', not the 'words.txt' fetched here — confirm which file is intended.
!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
Word = str  # Words are represented as plain strings.
cat = ''.join  # Joins an iterable of strings into one string with no separator.


def tokens(text) -> List[Word]:
    """Lowercase `text` and return each maximal run of the letters a-z, in order of appearance."""
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]
# Read the whole corpus up front; the context manager closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
with open('big.txt') as corpus_file:
    TEXT = corpus_file.read()
WORDS = tokens(TEXT)  # Every lowercase word token of the corpus, in order.
class ProbabilityFunction:
    """Mixin that makes a mapping of counts callable as a probability function.

    The host class must supply `values()` and item lookup (`self[outcome]`),
    as `collections.Counter` does.
    """

    def __call__(self, outcome):
        """The probability of `outcome`: its count divided by the sum of all counts.

        The sum is computed on the first call and cached in `_total`, so counts
        changed after that first call are not reflected in the denominator.
        Raises ZeroDivisionError when the counts sum to zero.

        The cache deliberately avoids the attribute name `total`:
        `collections.Counter` has a `total()` method since Python 3.10, so a
        `hasattr(self, 'total')` check is always true on a Counter subclass and
        the division would then be attempted against a bound method.
        """
        if not hasattr(self, '_total'):
            self._total = sum(self.values())
        return self[outcome] / self._total
class Bag(Counter, ProbabilityFunction):
    """A multiset of words: Counter storage combined with the ProbabilityFunction call interface."""


# Bag built from the corpus tokens; calls such as Pword(w) go through ProbabilityFunction.__call__.
Pword = Bag(WORDS)
def Pwords(words: List[Word]) -> float:
    """Return the product of Pword(w) over `words`, treating each word as independent of the others."""
    word_probabilities = (Pword(word) for word in words)
    return Π(word_probabilities)
def Π(nums) -> float:
    """Return the product of all values in `nums` (the multiplicative analogue of `sum`).

    An empty iterable yields 1, the starting value of the running product.
    """
    product = 1
    for factor in nums:
        product *= factor
    return product
def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return a list of all (first, rest) pairs with start <= len(first) <= end.

    `first + rest` always equals `text`. The length of `first` is additionally
    capped at len(text), so the last possible pair is (text, ''). The list is
    ordered by increasing len(first) and is empty when `start` exceeds the cap.
    """
    longest_first = min(len(text), end)
    return [(text[:cut], text[cut:])
            for cut in range(start, longest_first + 1)]
def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of text.

    Every way of splitting a non-empty first word off the front of the text
    (as produced by `splits(text, 1)`) is combined with the best segmentation
    of the remainder, and the candidate with the highest `Pwords` score is
    returned. `max` keeps the earliest maximum, so ties go to the shortest
    first word. Empty (falsy) input yields [].

    The best segmentation of each suffix is memoized for the duration of one
    call. Without the memo the recursion re-segments the same suffix once for
    every way of splitting the text before it, which grows exponentially with
    the length of the text. The memo is local to the call, so it never
    outlives a change to the word probabilities.
    """
    best_by_suffix: Dict[str, List[Word]] = {}

    def best(suffix):
        # Best segmentation of `suffix`; the returned list is shared with the memo.
        if not suffix:
            return []
        if suffix not in best_by_suffix:
            # `[first] + ...` builds a new list, so memo entries are never mutated.
            candidates = ([first] + best(rest)
                          for (first, rest) in splits(suffix, 1))
            best_by_suffix[suffix] = max(candidates, key=Pwords)
        return best_by_suffix[suffix]

    # Copy so the caller receives a list that is independent of the memo.
    return list(best(text))
# Sample run-together inputs; each one is lowercased before being segmented.
strings = ['thatCreation', 'happeningso', 'comebecause']
[segment(lowered) for lowered in map(str.lower, strings)]
--2020-08-04 18:48:06-- https://raw.githubusercontent.com/dwyl/english-words/master/words.txt Разрешение raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Подключение к raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... подключено. HTTP-запрос отправлен, ожидание ответа... 200 OK Длина: 4863005 (4,6M) [text/plain] Сохранение в: 'words.txt.2'
words.txt.2 100%[===================>] 4,64M 162 КБ/с за 25 с
2020-08-04 18:48:31 (192 КБ/с) - 'words.txt.2' сохранено [4863005/4863005]
[['that', 'creation'], ['happening', 'so'], ['come', 'because']]