Ускорьте работу программы на Python по кратчайшим путям - PullRequest
0 голосов
/ 11 ноября 2019

Этот код вводит взвешенный крайний список, начальные слова положительных и отрицательных настроений и целевое слово. Программа вычисляет сумму весов по кратчайшим путям от начальных слов до цели и от целевых до начальных слов, поскольку она генерирует 9 выходных значений.

Программа очень медленная. Запуск больших файлов Edgelist занимает дни, а не минуты или секунды. Требуется увеличить скорость в 100 и более раз.

Как ускорить эту программу?

from tkinter import Tk, X, Y, TOP, BOTTOM, LEFT, RIGHT, BOTH, END
from tkinter import filedialog, messagebox
from tkinter.ttk import Frame, Button, Entry, Label, Progressbar
import os, glob, time
import pandas as pd

root = Tk()
root.geometry("600x400+300+300")

def read_edge_list(filename):
    edges = {}
    words = set()

    with open(filename) as fp:
        lines = fp.readlines()

        for line in lines:
            token = line.split()

            if len(token) != 3:
                continue

            word1 = token[0]
            word2 = token[1]
            freq = token[2]

            words = words | {word1, word2}

            if not word1 in edges.keys():
                edges[word1] = {}

            if not word2 in edges[word1]:
                edges[word1][word2] = {}

            edges[word1][word2] = freq

    return edges, words

def read_sentiment(filename):
    with open(filename, encoding='utf-8-sig') as fp:
        lines = fp.readlines()

    words = {line.strip() for line in lines}

    return words

def read_target_word():
    word = input("Please input target word: ")
    return word

def run_shortest_path_algorithm(edges, positive, negative, target):
    positivedict = {}
    negativedict = {}

    for source in positive:
        dist1 = dijkstra(edges, source, target)
        dist2 = dijkstra(edges, target, source)
        if dist1 and dist2:
            positivedict[source] = dist1 + dist2

    for source in negative:
        dist1 = dijkstra(edges, source, target)
        dist2 = dijkstra(edges, target, source)
        if dist1 and dist2:
            negativedict[source] = dist1 + dist2

    return positivedict, negativedict

def calculate_statistics_summary(positivedict, negativedict, positivewords, negativewords):
    numpositive = len(positivedict)
    numnegative = len(negativedict)

    actualnumpositive = len(positivewords)
    actualnumnegative = len(negativewords)

    sumpositive = sum(positivedict.values())
    sumnegative = sum(negativedict.values())

    if actualnumpositive == 0:
        s1 = 0
    else:
        s1 = sumpositive / actualnumpositive

    if actualnumnegative == 0:
        s2 = 0
    else:
        s2 = sumnegative / actualnumnegative

    if numnegative == 0:
        s3 = 0
    else:
        s3 = s1 * numpositive / numnegative

    if s2 == 0:
        s4 = 0
    else:
        s4 = s3 / s2

    if numpositive == 0:
        s5 = 0
    else:
        s5 = sumpositive / numpositive

    if numnegative == 0:
        s6 = 0
    else:
        s6 = sumnegative / numnegative

    if numnegative == 0:
        s7 = 0
    else:
        s7 = s5 * numpositive / numnegative

    if s6 == 0:
        s8 = 0
    else:
        s8 = s7 / s6

    s9 = s3 - s2

    return [s1, s2, s3, s4, s5, s6, s7, s8, s9]

def write_output_file():
    pass

def dijkstra(graph, start, end):
    shortest_paths = {start: (None, 0)}
    current_node = start
    visited = set()

    while current_node != end:
        visited.add(current_node)

        if current_node not in graph:
            destinations = []
        else:
            destinations = graph[current_node].keys()

        weight_to_current_node = shortest_paths[current_node][1]

        for next_node in destinations:
            weight = int(graph[current_node][next_node]) + weight_to_current_node
            if next_node not in shortest_paths:
                shortest_paths[next_node] = (current_node, weight)
            else:
                current_shortest_weight = shortest_paths[next_node][1]
                if current_shortest_weight > weight:
                    shortest_paths[next_node] = (current_node, weight)

        next_destinations = {node: shortest_paths[node] for node in
            shortest_paths if node not in visited}
        if not next_destinations:
            return None

        current_node = min(next_destinations, key=lambda k: next_destinations[k][1])

    #path = []
    #while current_node is not None:
        #path.append(current_node)
        #next_node = shortest_paths[current_node][0]
        #current_node = next_node

    #path = path[::-1]
    #return path

    return shortest_paths[end][1]

class SentimentWindow(Frame):
    def __init__(self):
        super().__init__()

        self.initUI()

        self.initPositiveDir = None
        self.initNegativeDir = None
        self.initSaveDir = None

        self.summary = pd.DataFrame(columns=['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9'])

    def initUI(self):
        self.master.title("Sentiment")
        self.pack(fill=BOTH, expand=True, padx=15, pady=15)

        frmEdges = Frame(self)
        frmEdges.pack(fill=X, expand=True)

        lblEdges = Label(frmEdges, text="Select the directory of edge list.")
        lblEdges.pack(expand=True, fill=X, side=TOP, pady=2)

        frmEdgesPath = Frame(frmEdges)
        frmEdgesPath.pack(expand=True, fill=X, side=BOTTOM, pady=2)

        self.entEdgesPath = Entry(frmEdgesPath, width=60)
        self.entEdgesPath.pack(expand=True, fill=X, side=LEFT)

        btnEdgesPath = Button(frmEdgesPath, width=20, text="Load Edges", command=self.loadEdges)
        btnEdgesPath.pack(expand=True, side=RIGHT)

        frmPositive = Frame(self)
        frmPositive.pack(fill=X, expand=True)

        lblPositive = Label(frmPositive, text="Select the positive file.")
        lblPositive.pack(expand=True, fill=X, side=TOP, pady=2)

        frmPositivePath = Frame(frmPositive)
        frmPositivePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)

        self.entPositivePath = Entry(frmPositivePath, width=60)
        self.entPositivePath.pack(expand=True, fill=X, side=LEFT)

        btnPositivePath = Button(frmPositivePath, width=20, text="Load Positive", command=self.loadPositive)
        btnPositivePath.pack(expand=True, side=RIGHT)

        frmNegative = Frame(self)
        frmNegative.pack(fill=X, expand=True)

        lblNegative = Label(frmNegative, text="Select the negative file.")
        lblNegative.pack(expand=True, fill=X, side=TOP, pady=2)

        frmNegativePath = Frame(frmNegative)
        frmNegativePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)

        self.entNegativePath = Entry(frmNegativePath, width=60)
        self.entNegativePath.pack(expand=True, fill=X, side=LEFT)

        btnNegativePath = Button(frmNegativePath, width=20, text="Load Negative", command=self.loadNegative)
        btnNegativePath.pack(expand=True, side=RIGHT)

        frmTarget = Frame(self)
        frmTarget.pack(fill=X, expand=True)

        lblTarget = Label(frmTarget, text="Input the target word.")
        lblTarget.pack(expand=True, fill=X, side=TOP, pady=2)

        self.entTarget = Entry(frmTarget)
        self.entTarget.pack(fill=X, expand=True, pady=2)

        frmRun = Frame(self)
        frmRun.pack(fill=X, expand=True, pady=20)

        self.proRun = Progressbar(frmRun, value=0)
        self.proRun.pack(fill=X, expand=True, side=LEFT)

        btnRun = Button(frmRun, text = "Run", width=20, command=self.run)
        btnRun.pack(side=RIGHT, padx=20)


    def loadEdges(self):
        edgesFolderName = filedialog.askdirectory()

        if edgesFolderName:
            self.entEdgesPath.delete(0, END)
            self.entEdgesPath.insert(0, edgesFolderName)

    def loadPositive(self):
        if self.initPositiveDir is None:
            self.initPositiveDir = "/"

        positiveFileName = filedialog.askopenfilename(initialdir=self.initPositiveDir,
                title="Open Positive File", filetypes=(("Text file", "*.txt"),))

        if positiveFileName:
            self.initPositiveDir = positiveFileName
            self.entPositivePath.delete(0, END)
            self.entPositivePath.insert(0, positiveFileName)

    def loadNegative(self):
        if self.initNegativeDir is None:
            self.initNegativeDir = "/"

        negativeFileName = filedialog.askopenfilename(initialdir=self.initNegativeDir,
                title="Open Positive File", filetypes=(("Text file", "*.txt"),))

        if negativeFileName:
            self.initNegativeDir = negativeFileName
            self.entNegativePath.delete(0, END)
            self.entNegativePath.insert(0, negativeFileName)

    def run(self):
        edgesFolderName = self.entEdgesPath.get()
        if not os.path.isdir(edgesFolderName):
            messagebox.showerror("Invalid Path", "The directory of edge list is invalid.")
            return

        positiveFileName = self.entPositivePath.get()
        if not os.path.isfile(positiveFileName):
            messagebox.showerror("Invalid Path", "The positive filename is invalid.")
            return

        negativeFileName = self.entNegativePath.get()
        if not os.path.isfile(negativeFileName):
            messagebox.showerror("Invalid Path", "The negative filename is invalid.")
            return

        targetWord = self.entTarget.get()
        if targetWord is None or len(targetWord) <= 0:
            messagebox.showerror("No Target", "Please input the target word.")

        os.chdir(edgesFolderName)
        edgefiles = glob.glob("*.pr")

        if len(edgefiles) <= 0:
            messagebox.showerror("No Edge File", "Cannot find the edge files.")

        positivewords = read_sentiment(positiveFileName)

        negativewords = read_sentiment(negativeFileName)

        self.summary.drop(self.summary.index, inplace=True)
        self.proRun["value"] = 0.0
        self.proRun.update()
        root.config(cursor="wait")
        root.update()
        time.sleep(0.300)

        for index, edgefile in enumerate(edgefiles):
            edges, words = read_edge_list(edgefile)

            if targetWord not in words:
                messagebox.showerror("Invalid Target", "Target does not exist in " + edgefile)
            else:
                possiblepositive = positivewords & words
                possiblenegative = negativewords & words

                positivedict, negativedict = \
                    run_shortest_path_algorithm(edges, possiblepositive, possiblenegative, targetWord)

                statistics_summary = calculate_statistics_summary(positivedict, negativedict,
                    positivewords, negativewords)
                self.summary.loc[edgefile] = statistics_summary
            self.proRun["value"] = 100 * (index + 1) / len(edgefiles)
            self.proRun.update()

        root.config(cursor="")

        if self.summary.shape[0] > 0:
            self.summary.loc['mean'] = self.summary.mean()
            self.summary.loc['std'] = self.summary.std()

            if self.initSaveDir is None:
                self.initSaveDir = "/"

            outputFile = filedialog.asksaveasfilename(initialdir=self.initSaveDir,
                                                      title="Save Summary File", filetypes=(("Text file", "*.txt"),))
            self.initSaveDir = outputFile

            if outputFile:
                with open(outputFile, 'w') as outfp:
                    self.summary.to_string(outfp)

app = SentimentWindow()
root.mainloop()

Вот небольшие списки фронтов, которые запускаются в считанные минуты: https://drive.google.com/file/d/1zDOSMFz0AooXrs9WJ0noC3oD9cWg_562/view?usp=sharing

Вот большой файл, который будет работать в течение нескольких дней! https://drive.google.com/file/d/18NR_bPjb9OU03n7MO08GwELrK7gqXEKE/view?usp=sharing

Вот файл отрицательного начального числа: https://docs.google.com/document/d/1Y0eFolLWjqoHiFnHD7TOS-9z5h1xxmvUiENS1TEv9yU/edit?usp=sharing

Файл положительного начального значения: https://docs.google.com/document/d/1FAct8O-rRN6qsdTU3praW6hy2ckMf1s1mA9K2gy7WYI/edit?usp=sharing

Установите целевое слово на: bp.

Вот код в файле: https://docs.google.com/document/d/1erSpyXxy3eMehBCiYJudf7tnQgIneT9H7Ot2_wGYBBI/edit?usp=sharing

...