Question

У меня небольшой опыт программирования, но я начал изучать Python и хотел бы создать функцию для подсчёта наиболее часто встречающихся слов в тексте. Я уверен, что моя версия — не лучший способ это сделать, но она работает:

 import os

 punctuation = "~!@#$%^&*()_-=+[{]}\\|'\";:,<.>/?"

 def remove_punctuation(text):
     """Return ``text`` without the characters listed in ``punctuation``."""

     # Keep every character that is not a punctuation mark and glue the
     # survivors back together in their original order.
     kept = (symbol for symbol in text if symbol not in punctuation)
     return "".join(kept)

 with open(r'New Text Document.txt') as f:

     # Lower-case the whole file, strip punctuation, then split on whitespace.
     text = f.read().lower()
     t = remove_punctuation(text).split()
     # Map every word to the number of times it occurs in the text.
     dictionary = {}
     for word in t:
         dictionary[word] = dictionary.get(word, 0) + 1

 print(dictionary)

 def top_five(d):
     """Return a dict with the (up to) five most frequent words of ``d``.

     d: dictionary mapping each word to its number of occurrences.

     Five selection passes are made over ``d``. Each pass picks the word
     with the highest count that was not picked by an earlier pass (on a
     tie the word met first wins). Words whose count is not greater than 0
     are never picked, and when fewer than five words qualify the result
     simply holds fewer entries.
     """

     top = {}
     value1 = 0
     value2 = 0
     value3 = 0
     value4 = 0
     value5 = 0

     # Every pass starts from an empty pick so that a pass which finds
     # nothing contributes nothing instead of leaving its name unbound.

     # Pass 1: the most frequent word overall.
     top1 = {}
     for key in d:
         if value1 < d[key] and key not in top:
             value1 = d[key]
             top1 = {key: value1}
     top.update(top1)

     # Pass 2: the most frequent word among those not yet picked.
     top2 = {}
     for key in d:
         if value2 < d[key] and key not in top:
             value2 = d[key]
             top2 = {key: value2}
     top.update(top2)

     # Pass 3.
     top3 = {}
     for key in d:
         if value3 < d[key] and key not in top:
             value3 = d[key]
             top3 = {key: value3}
     top.update(top3)

     # Pass 4.
     top4 = {}
     for key in d:
         if value4 < d[key] and key not in top:
             value4 = d[key]
             top4 = {key: value4}
     top.update(top4)

     # Pass 5: store value5 here (the count of the fifth word), not value4.
     top5 = {}
     for key in d:
         if value5 < d[key] and key not in top:
             value5 = d[key]
             top5 = {key: value5}
     top.update(top5)
     return top

 # Show the result of top_five for the word counts collected above.
 print(top_five(dictionary))

Приведенный выше код даст следующий вывод:

{'word1': freq1, 'word2': freq2, 'word3': freq3, 'word4': freq4, 'word5': freq5}

Несмотря на то, что это результат, который я хочу, я попытался упростить свою функцию и позволить пользователю выбрать, для скольких слов он должен считать частоту:

 def top_five(d,n):

     """Attempt to generalise top_five to the ``n`` most frequent words.

     d: dictionary of word counts (NOTE(review): never read below; the
        loops use the module-level ``dictionary`` instead).
     n: number of words to select.

     NOTE(review): as written this function does not work; see the notes
     on the individual lines.
     """
     top = {}
     values = {}
     # Pre-create the keys "value1".."valueN" (best count found per pass) ...
     for i in range(1,n+1):
         values["value"+str(i)]=0
     # ... and the keys "top1".."topN" (selected word per pass).
     for i in range(1,n+1):
         top["top"+str(i)]=0

     for i in range(1,n+1):
         for key in dictionary :
             # NOTE(review): the keys of ``top`` are "top1".."topN", not
             # words, so ``key not in top`` does not exclude words that
             # were already selected by an earlier pass.
             if values["value"+str(i)] < dictionary[key] and key not in top:
                 values["value"+str(i)] = dictionary[key]
                 # Stores a one-item dict as the value, i.e. a dict nested
                 # inside ``top``.
                 top["top"+str(i)] = {key:values["value"+str(i)]}
             else:
                 continue
         # NOTE(review): ``top1`` is not defined in this function; unless a
         # module-level ``top1`` exists this line raises NameError.
         top.update(top1)
     print(top)
     return top

Этот код создаст словарь с ключами value1, value2 и т. д., который я мог бы использовать в своём цикле, и другой словарь с ключами top1, top2 и т. д., но он не будет работать, поскольку условие `and key not in top` не сработает.

top["top"+str(i)] = {key:values["value"+str(i)]}

Эта строка создаст словарь внутри словаря. Я застрял на этом, так как не смог найти способ сделать словарь top полезным или перебирать имена переменных внутри цикла. Я читал, что следует использовать списки или словари и что перебор имён переменных — не очень хорошая идея, но я не понимаю, почему это так, и не могу придумать, как применить списки или словари в моём цикле for.

Как я уже сказал, я знаю, что это может быть не лучшим подходом к созданию такого рода функций, но мой вопрос: как я могу упростить ту функцию, которую я уже написал, и заставить цикл работать?

Спасибо!

cdlane · Answer 1 · 31 декабря 2018

Понимая, что вы хотите реализовать собственную пузырьковую сортировку вместо использования встроенной сортировки Python и хотите сами подсчитывать слова вместо использования Counter, давайте подтянем ваш код, чтобы использовать преимущества идиом Python и немного уменьшить нагрузку на вашу бедную черепаху:

from turtle import Screen, Turtle
from collections import defaultdict

PUNCTUATION = "~`!@#$%^&*()_-=+[{]}\\|'\";:,<.>/?"

def remove_punctuation(text):
    """ Returns the given text with all PUNCTUATION characters stripped """

    # Collect the characters worth keeping, then join them in one go.
    survivors = [symbol for symbol in text if symbol not in PUNCTUATION]

    return "".join(survivors)

def count_words(filename):
    """ Returns a dictionary of words and word count from "file" """

    counts = defaultdict(int)  # missing words start at 0, so += 1 just works

    with open(filename) as file:
        contents = file.read()

    # Punctuation is stripped first, then the text is lower-cased and split
    # on whitespace to obtain the individual words.
    for token in remove_punctuation(contents).lower().split():
        counts[token] += 1

    return counts

def dict_sort(d, reverse=False):
    """
    Sort given dictionary "d" values (& keys) in ascending (default)
    or descending (reverse = True) order using a bubble sort
    d: dictionary to sort (it is not modified)
    reverse: False for ascending values, True for descending values
    Outputs tuple of: list of keys, list of values
    Entries with equal values keep their original relative order
    Recommended format for output: k, v = dict_sort(d)
    """

    key_list = list(d.keys())
    value_list = list(d.values())

    # After each pass the element at "unsorted_end" is in its final place,
    # so the next pass can stop one position earlier.
    for unsorted_end in range(len(value_list) - 1, 0, -1):
        for i in range(unsorted_end):
            if reverse:
                # descending: a smaller value must not precede a larger one
                out_of_order = value_list[i] < value_list[i+1]
            else:
                # ascending: a larger value must not precede a smaller one
                out_of_order = value_list[i] > value_list[i+1]

            if out_of_order:
                # swap values and keys together so the pairs stay aligned
                value_list[i], value_list[i+1] = value_list[i+1], value_list[i]
                key_list[i], key_list[i+1] = key_list[i+1], key_list[i]

    return key_list, value_list

def word_freq():
    """ Ask the user how many words to plot on graph (integer from 1 to 10) """

    n_freq = None

    # Keep asking until the answer parses as an integer inside 1..10.
    while n_freq is None:
        try:
            candidate = int(input("How many of the most frequent words would you like to display?\n"))
        except ValueError:
            candidate = None

        if candidate is not None and 1 <= candidate <= 10:
            n_freq = candidate
        else:
            print("Please input an integer between 1 and 10:")

    return n_freq

def graph_word_freq(n, f, w):
    """
    Draw bar chart of most frequent words in text
    n: number of words to plot (between 1 and 10); if fewer words are
       available than requested, only the available ones are drawn
    f: word frequency list (largest frequency expected first, f[0] sets
       the scale of the chart)
    w: word list, parallel to f
    """

    # Never try to draw more bars than there are entries; the original
    # indexed f[n] and w[i] past the end for short texts.
    n = min(n, len(f), len(w))

    window = Screen()
    window.bgcolor("honeydew")
    window.title("Most Frequent Words")

    # Small counts use a fixed coordinate system; large ones scale with f[0].
    if f[0] < 960:
        width = 60
        spacing = 20
        y = 500
        y_pos = -480
        x_pos = - (30 + 40 * (n - 1))
    else:
        width = 100
        spacing = 40
        y = f[0] / 2 + 20
        y_pos = -f[0] / 2
        x_pos = - (50 + 70 * (n - 1))

    window.setworldcoordinates(-y, -y, y, y)
    tortoise = Turtle(visible=False)
    tortoise.speed('fastest')  # because I have no patience

    tortoise.penup()
    tortoise.setposition(x_pos, y_pos)

    # Colour thresholds are based on the first frequency that is NOT plotted
    # (f[n]); when every entry is plotted the last entry is used instead.
    floor = f[min(max(n, 0), len(f) - 1)]
    low = (f[0] - floor) / 3
    high = (f[0] - floor) / 1.5

    for i in range(n):
        if f[i] < low:
            tortoise.color("SeaGreen", "ForestGreen")
        elif f[i] < high:
            tortoise.color("orange", "gold")
        else:
            tortoise.color("coral3", "IndianRed")

        tortoise.left(90)

        tortoise.begin_fill()

        # up the left side, across the top (writing the count half way),
        # and back down the right side
        tortoise.forward(f[i])
        tortoise.right(90)

        tortoise.forward(1/2 * width)
        tortoise.write(f[i], align='center')
        tortoise.forward(1/2 * width)

        tortoise.right(90)
        tortoise.forward(f[i])

        tortoise.end_fill()

        # drop below the bar to write the word centred under it
        tortoise.forward(20)
        tortoise.right(90)

        tortoise.forward(1/2 * width)
        tortoise.write(w[i], align='center')
        tortoise.backward(1/2 * width)

        # return to the baseline and move on to the next bar
        tortoise.right(90)
        tortoise.forward(20)
        tortoise.right(90)
        tortoise.forward(spacing)

    window.exitonclick()

# Count the words of the input file (word -> number of occurrences).
dictionary = count_words("New Text Document.txt")

# Split the counts into parallel lists of words and values, ordered by
# dict_sort with reverse=True.
words, values = dict_sort(dictionary, reverse=True)

# Ask the user how many words to plot (word_freq accepts 1 to 10).
n_freq = word_freq()

# Draw the bar chart for the first n_freq entries of the two lists.
graph_word_freq(n_freq, values, words)

Bogdan Prădatu · Answer 2 · 27 декабря 2018

Я обновил свой код в соответствии с рекомендацией Бармара:

def remove_punctuation(text):
    """Remove punctuation characters from the given text.

    text: string to clean
    Returns a copy of "text" without any of the punctuation characters
    listed below; all other characters keep their original order.
    """
    punctuation = "~`!@#$%^&*()_-=+[{]}\\|'\";:,<.>/?"
    # join() builds the result in one step instead of growing a string
    # character by character.
    return "".join(char for char in text if char not in punctuation)

def count_words(file):
    """Returns a dictionary of words and word count from "file" """
    with open(file) as f:
        # strip punctuation, lower-case, then split on whitespace
        words = remove_punctuation(f.read()).lower().split()
    dictionary = {}
    for word in words:
        # first sighting starts at 0, every sighting adds one
        dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary

def dict_sort(d, reverse = False):
    """Sort given dictionary "d" by value in ascending (default)
        or descending (reverse = True) order using a bubble sort
        d: dictionary to sort (it is not modified)
        reverse: any truthy value selects descending order
        Outputs tuple of: list of keys, list of values and dictionary
        (the dictionary is rebuilt in sorted order)
        Recommended format for output: a,b,c = dict_sort(d)"""
    key_list = list(d)
    value_list = [d[key] for key in key_list]
    last = len(value_list) - 1
    # The pass counter is deliberately unnamed so it cannot be clobbered by
    # the inner index, as happened when both loops used "i".
    for _ in range(last):
        for i in range(last):
            if reverse:
                out_of_order = value_list[i] < value_list[i+1]
            else:
                out_of_order = value_list[i] > value_list[i+1]
            if out_of_order:
                # swap values and keys together so the pairs stay aligned
                value_list[i],value_list[i+1] = value_list[i+1],value_list[i]
                key_list[i],key_list[i+1] = key_list[i+1],key_list[i]
    sorted_dict = dict(zip(key_list, value_list))
    return key_list,value_list,sorted_dict

def word_freq():
    """Ask how many words to plot on graph (an integer from 1 to 10)"""
    retry_message = "Please input an integer between 1 and 10:"
    while True:
        try:
            n_freq = int(input("How many of the most frequent words would you like to display?\n"))
        except ValueError:
            # not a number at all: complain and ask again
            print(retry_message)
            continue
        if 1 <= n_freq <= 10:
            return n_freq
        # a number, but outside the accepted range
        print(retry_message)

def graph_word_freq(n,f,w):                     #create function to draw chart
    """Draw bar chart of most frequent words in text
        n: number of words to plot (between 1 and 10); if fewer words are
           available than requested, only the available ones are drawn
        f: word frequency list (largest frequency expected first, f[0]
           sets the scale of the chart)
        w: word list, parallel to f"""

    import turtle                                       #import turtle module

    #never draw more bars than there are entries; the original indexed
    #f[n] and w[i] past the end for short texts
    n = min(n, len(f), len(w))

    window = turtle.Screen()                            #create screen
    window.bgcolor("honeydew")                          #define screen color
    window.title("Most Frequent Words")                 #set window title
    if f[0] < 960:
        y = 500
        y_pos = -480
        width = 60
        spacing = 20
        x_pos = -(30+40*(n-1))
    else:
        width = 100
        spacing = 40
        y = f[0]/2+20
        y_pos = -f[0]/2
        x_pos = -(50+70*(n-1))

    turtle.setworldcoordinates(-y,-y,y,y)
    tortoise = turtle.Turtle()                          #create turtle
    tortoise.hideturtle()                               #hide turtle stamp
    tortoise.penup()                                    #raise turtle pen
    tortoise.setposition(x_pos,y_pos)                   #position turtle
    tortoise.pendown()                                  #put turtle pen down
    tortoise.speed(5)                                   #set drawing speed

    #colour thresholds are based on the first frequency that is NOT plotted
    #(f[n]); when every entry is plotted the last entry is used instead
    floor = f[min(max(n, 0), len(f)-1)]
    low = (f[0]-floor)/3
    high = (f[0]-floor)/1.5

    for i in range(n):
        if abs(f[i]) < low:
            tortoise.color("SeaGreen","ForestGreen")    #set turtle color
        elif abs(f[i]) < high:
            tortoise.color("orange","gold")             #set turtle color
        else:
            tortoise.color("coral3","IndianRed")        #set turtle color

        tortoise.begin_fill()                           #begin drawing shapes
        tortoise.left(90)
        tortoise.forward(f[i])                          #draw bar height
        tortoise.right(90)
        tortoise.forward(1/3*width)                     #prepare for text
        if f[i] >= 0:
            tortoise.write(f[i])                        #write value
        else:
            #negative bar: write the value 15 units to the turtle's right,
            #with the pen up, then come back
            tortoise.penup()
            tortoise.right(90)
            tortoise.forward(15)
            tortoise.write(f[i])
            tortoise.forward(-15)
            tortoise.left(90)
            tortoise.pendown()
        tortoise.forward(2/3*width)                     #bar width
        tortoise.right(90)
        tortoise.forward(f[i])
        tortoise.left(90)
        tortoise.penup()
        tortoise.right(90)
        tortoise.forward(25)
        tortoise.left(90)
        tortoise.forward(-2/3*width)
        tortoise.write(w[i])                            #write word
        tortoise.forward(2/3*width)
        tortoise.left(90)
        tortoise.forward(25)
        tortoise.right(90)
        tortoise.forward(spacing)                       #spacing
        tortoise.pendown()
        tortoise.end_fill()                             #stop drawing shapes
    turtle.exitonclick()

# Count the words of the input file (word -> number of occurrences).
dictionary = count_words("New Text Document.txt")

# Sort by count with reverse = True; "dictionary" is rebound to the
# sorted copy returned by dict_sort.
words,values,dictionary = dict_sort(dictionary, reverse = True)

# Ask the user how many words to plot (word_freq accepts 1 to 10).
n_freq = word_freq()

# Draw the bar chart for the first n_freq entries of the two lists.
graph_word_freq(n_freq,values,words)

Теперь работает как задумано. Спасибо, приятель!

Итерация имени переменной внутри цикла

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Итерация имени переменной внутри цикла

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Похожие темы