Я пытаюсь удалить стоп-слова из моего вывода с помощью функции черного списка, но указанные слова не удаляются в выводе, представленном ниже.
Например: слово «the» в выводе отображается как № 13.
Если есть простой способ использовать функцию string.replace, это также подойдет , но если бы я мог просто изменить слова в черном списке, это было бы идеально.
import string
# Open the input file
s = open('AnnualLetter_2017.txt', 'r').read()
for line in s:
line = line.translate(str.maketrans("","", string.punctuation))
punctuation_list = ['?',',','.','–','•','#'] # non exhaustive
for punctuation in punctuation_list:
s = s.replace(punctuation, "")
# Blacklist of words to be filtered out
blacklist = ["the", "at", "can", "by", "to", "are", "they", "if", "have", "about", "for", "this", "of", "and", "a", "in", "is", "that", "we", "be", "with", "will", "it", "we", "on", "was", "has", "their", "has", "you", "were", "as","an", "but", "image"]
for word in blacklist:
s = s.replace(" " + word + " ", " ")
# count characters
num_chars = len(s)
# count lines
num_lines = s.count('\n')
words = s
words = s.split()
d = {}
for w in words:
w = w.lower()
if w in d: # seen w before?
d[w] += 1
else:
d[w] = 1
num_words = sum(d[w] for w in d)
lst = [(d[w], w) for w in d]
lst.sort()
lst.reverse()
print('Your input file has characters = ' + str(num_chars))
print('Your input file has num_lines = ' + str(num_lines))
print('Your input file has num_words = ' + str(num_words))
print('\n The 35 most frequent words are \n')
i = 1
for count, word in lst[:35]:
print('%2s. %4s %s' % (i, count, word))
i += 1
My output file as follows:
Your input file has characters = 26656
Your input file has num_lines = 203
Your input file has num_words = 4069
The 35 most frequent words are
1. 44 bill:
2. 39 melinda:
3. 36 i
4. 32 more
5. 32 children
6. 31 our
7. 30 when
8. 28 women
9. 27 world
10. 27 one
11. 27 image
12. 25 your
13. 25 the
14. 25 health
15. 24 it’s
16. 24 but
17. 23 from
18. 21 us
19. 20 we’re
20. 20 lives
21. 20 it
22. 20 her
23. 20 get
24. 19 global
25. 18 work
26. 18 who
27. 18 these
28. 18 had
29. 17 we
30. 17 that’s
31. 17 out
32. 17 how
33. 17 deaths
34. 16 than
35. 16 polio