Переключите всё на генераторы, и это должно работать:
# Rank the most frequent words in a text file and export them to CSV.
# Every stage below is a generator, so words stream through Counter
# without the whole file ever being held in memory.

import csv
import string
from collections import Counter

# Load text
filename = 'finnish_text.txt'

# Translation table that deletes every punctuation character in one pass.
table = str.maketrans('', '', string.punctuation)

# Auto-close when done
with open(filename, 'r') as file:
    # Lowercase each line and split on whitespace -> iterable of word lists.
    # BUG FIX: the original wrote `text.lower().split()` while iterating
    # `for line in file`; `text` is undefined and raised NameError.
    word_iterables = (line.lower().split() for line in file)
    # Flatten and remove punctuation from each word.
    stripped = (w.translate(table) for it in word_iterables for w in it)
    # Counting must happen inside the `with` block: the generators read
    # from the open file lazily, so the file must still be open here.
    counter = Counter(stripped)

# Ranked word count; change the argument to return a different amount.
most_occur = counter.most_common(100)

# Export (word, count) rows as a CSV file.
with open('word_rank.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for x in most_occur:
        writer.writerow(x)
При использовании генераторов (круглые скобки вместо квадратных скобок) все слова обрабатываются лениво, а не загружаются в память сразу.
Если вам нужен самый эффективный из возможных способов, вот вариант, который я написал:
# Rank the most frequent words in a text file and export them to CSV,
# pushing every per-word step into C-implemented iterators
# (map, itertools.chain) so no Python-level loop touches the words.

import csv
import itertools
import operator
import string
from collections import Counter

# Source text to analyse.
filename = 'finnish_text.txt'

# Translation table that strips every punctuation character.
strip_punct = str.maketrans('', '', string.punctuation)

# The file is closed automatically when the with-block exits.
with open(filename, 'r') as file:
    # Lowercase each line as it is read.
    lowered = map(str.lower, file)
    # Split each line into its words: [[word, word], [word, word], ...]
    per_line_words = map(str.split, lowered)
    # Flatten into one stream: [word, word, word, word, ...]
    word_stream = itertools.chain.from_iterable(per_line_words)
    # Remove punctuation from every word.
    cleaned = map(operator.methodcaller("translate", strip_punct), word_stream)
    # Consume the lazy pipeline while the file is still open.
    counter = Counter(cleaned)

# Top-ranked words; adjust the argument to change how many are kept.
most_occur = counter.most_common(100)

# Write the (word, count) pairs out as CSV rows.
with open('word_rank.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file, delimiter=',')
    for row in most_occur:
        writer.writerow(row)
Он целиком опирается на итераторы, реализованные на C (map и itertools).