Вот два способа подсчета слов в текстовом файле.
from re import split
def process_line(words, word_dict):
    """Add the occurrences of every item in *words* to *word_dict*.

    Args:
        words: Iterable of hashable items (words) to tally.
        word_dict: Mapping of word -> running count; updated in place.

    Returns:
        None. The tally is accumulated directly in *word_dict*.
    """
    for token in words:
        # Unseen words start from zero, so a single expression covers both
        # the "first occurrence" and the "seen before" cases.
        word_dict[token] = word_dict.get(token, 0) + 1
def process_dict(word_dict):
    """Turn a word -> count mapping into an ascending list of pairs.

    Args:
        word_dict: Mapping of word -> count.

    Returns:
        A new list of ``(count, word)`` tuples sorted in ascending order,
        i.e. by count first and by word among equal counts.
    """
    return sorted((count, word) for word, count in word_dict.items())
def format_print(input_list, reverse, word_num):
    """Print a word-frequency table to standard output.

    Args:
        input_list: List of ``(count, word)`` tuples to display.
        reverse: When truthy, *input_list* is sorted in place in descending
            order before printing; otherwise it is printed in the order given.
        word_num: Number shown in the "Unique Words" banner.

    Returns:
        None. The table is written with ``print``.
    """
    if reverse:
        # Sorts the caller's list in place (kept for backward compatibility).
        input_list.sort(reverse=True)

    width = 35
    rule = "-" * width
    # Every output line gets its own print() call. Passing several pieces to
    # a single call makes print() join them with spaces, which pushes the
    # banner, the header and the rule one column to the right of the rows.
    print()
    print(("[Unique Words: " + str(word_num) + "]").center(width, "="))
    print(rule)
    print("%-16s %s %16s" % ("Word", "|", "Count"))
    print(rule)
    for count, word in input_list:
        print("%-16s %s %16d" % (word, "|", count))
def word_count(_file, max_to_min=False):
    """Count the words in a text file and print a frequency table.

    A word is a run of ASCII letters and apostrophes; the text is lower-cased
    before counting, so the tally is case-insensitive.

    Args:
        _file: Path of the text file to read.
        max_to_min: When truthy, the table lists the highest counts first;
            otherwise it is printed in ascending order.

    Returns:
        None. The table is printed by ``format_print``.
    """
    word_dict = {}
    # Plain "r" is used instead of "rU": the "U" flag was removed in
    # Python 3.11 and text mode already applies universal newlines.
    # The with-block also closes the file if processing raises.
    with open(_file, "r") as txt:
        for line in txt:
            # Splitting on runs of anything that is not a letter or an
            # apostrophe leaves empty strings at the edges; filter(None, ...)
            # drops them, so blank lines contribute nothing and need no
            # separate check.
            words = filter(None, split("[^a-zA-Z']+", line.lower()))
            process_line(words, word_dict)
    final_list = process_dict(word_dict)
    format_print(final_list, max_to_min, len(word_dict))
# Example run: print the frequency table for Test.txt, highest counts first.
word_count("C:\\your_path_here\\Test.txt", max_to_min=True)
#########################################################
from collections import Counter
import re
def openfile(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Read-only mode: "r+" would demand write permission although nothing is
    # written, so read-only files could not be processed. The with-block
    # guarantees the handle is closed even if read() raises.
    with open(filename, "r") as fh:
        return fh.read()
def removegarbage(str):
    """Normalize text for word counting.

    Every run of one or more non-word characters (anything other than
    letters, digits and the underscore) is collapsed into a single space,
    and the result is lower-cased.

    Args:
        str: The text to clean. (The parameter name shadows the built-in
            ``str`` inside this function; it is kept for compatibility.)

    Returns:
        The cleaned, lower-cased text.
    """
    return re.sub(r'\W+', ' ', str).lower()
def getwordbins(words):
    """Tally how often each item occurs in *words*.

    Args:
        words: Iterable of hashable items.

    Returns:
        A ``Counter`` mapping each distinct item to its number of occurrences.
    """
    tally = Counter()
    # A generator is handed to update() so the items are always counted one
    # by one, whatever kind of iterable *words* is.
    tally.update(word for word in words)
    return tally
def main(filename, topwords):
    """Print the most frequent words of a text file, one per line.

    The file is read with ``openfile``, cleaned with ``removegarbage``,
    split into words and tallied with ``getwordbins``.

    Args:
        filename: Path of the text file to analyse.
        topwords: Maximum number of entries to print, passed to
            ``Counter.most_common``.

    Returns:
        None. Each word and its count are written with ``print``.
    """
    txt = removegarbage(openfile(filename))
    # split() without an argument discards empty strings. split(' ') would
    # produce "" whenever the cleaned text starts or ends with a space (for
    # example a file ending in a newline) or is empty, and that "" would
    # then be counted and printed as if it were a word.
    words = txt.split()
    bins = getwordbins(words)
    for key, value in bins.most_common(topwords):
        print(key, value)
# Example run: print up to 500 of the most frequent words in Test.txt.
main(filename='C:\\your_path_here\\Test.txt', topwords=500)
Вот способ сравнения двух текстовых файлов и сохранения общих элементов.
# Collect the lines that occur in both input files and save them.
with open('C:\\your_path_here\\text1.txt', 'r') as file1, \
        open('C:\\your_path_here\\text2.txt', 'r') as file2:
    # Lines are compared without their trailing newline; otherwise a final
    # line that lacks one would never match the same text in the other file.
    same = {line.rstrip('\n') for line in file1}.intersection(
        line.rstrip('\n') for line in file2)
# Blank lines are not reported.
same.discard('')
with open('C:\\your_path_here\\some_output_file.txt', 'w') as file_out:
    # Sorted because set iteration order is arbitrary; every entry is
    # newline-terminated so that no two entries run together.
    file_out.writelines(line + '\n' for line in sorted(same))
# For differences, use the code below:
# Collect the lines that occur in exactly one of the two input files.
with open('C:\\your_path_here\\text1.txt', 'r') as file1, \
        open('C:\\your_path_here\\text2.txt', 'r') as file2:
    # Lines are compared without their trailing newline; otherwise a final
    # line that lacks one would be reported as different from the same text
    # in the other file.
    # NOTE: despite its name, `same` holds the lines that differ here; the
    # name is kept in case later code refers to it.
    same = {line.rstrip('\n') for line in file1}.symmetric_difference(
        line.rstrip('\n') for line in file2)
# Blank lines are not reported.
same.discard('')
with open('C:\\your_path_here\\some_output_file.txt', 'w') as file_out:
    # Sorted because set iteration order is arbitrary; every entry is
    # newline-terminated so that no two entries run together.
    file_out.writelines(line + '\n' for line in sorted(same))