Question

поэтому у меня есть некоторый код, который просматривает 10 файлов в каталоге.Каждый файл имеет, возможно, тысячи строк.Затем код отфильтровывает некоторые слова из этих файлов построчно.Я понимаю, что это может занять некоторое время, но мой код может быть улучшен, чтобы ускорить этот процесс.Я делаю где-то ошибку кодирования, которая вызывает узкое место?Любая помощь или совет будет принята с благодарностью!Вот мой код:

import os

def remove_stop_words(string, stopwords_list):
    string_to_list = string.split()
    x = (' '.join(i for i in string_to_list if i.lower() not in (x.lower() for x in stopwords_list)))
    x = x+'\n'
    return x

def get_stop_words_list(stopwords_path):
    with open(stopwords_path, 'r') as f:
        stopwords = f.read().split()
    return stopwords

def main():
    input_location = 'C:/Users/User/Desktop/mini_mouse'
    output_location = 'C:/Users/User/Desktop/test/'
    stop_words_path = 'C:/Users/User/Desktop/NLTK-stop-word-list.txt'
    stopwords = get_stop_words_list(stop_words_path)
    #print(stopwords)

    for root, dirs, files in os.walk(input_location):
        for name in files:
            file_path = os.path.join(root, name) # joins the new path of the file to the current file in order to access the file
            with open(file_path, 'r') as f: # open the file
                for line in f: # read file line by line
                    x = remove_stop_words(line,stopwords)
                    new_file_path = os.path.join(output_location, name) + '_filtered' # creates a new file of the file that is currenlty being filtered of stopwords
                    with open(new_file_path, 'a') as output_file: # opens output file
                        output_file.write(x) # writes the newly filtered text to the new output file



if __name__ == "__main__":
    main()

Daniel Scott · Answer 1 · 04 марта 2019

Вот решение для записи файла за файлом, а не построчно:

for root, dirs, files in os.walk(input_location):
    for name in files:
        file_path = os.path.join(root, name) # joins the new path of the file to the current file in order to access the file

        filestring = ''
        with open(file_path, 'r') as f: # open the file
            for line in f: # read file line by line
                x = remove_stop_words(line,stopwords)
                filestring+=x
                filestring+='\n' #Create new line

        new_file_path = os.path.join(output_location, name) + '_filtered' # creates a new file of the file that is currenlty being filtered of stopwords
        with open(new_file_path, 'a') as output_file: # opens output file
            output_file.write(filestring) # writes the newly filtered text to the new output file

Есть ли причина, по которой он работает медленно из-за того, как он закодирован?питон

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Есть ли причина, по которой он работает медленно из-за того, как он закодирован?питон

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы