Python feature extraction and classifiers for email spam detection
20 June 2020

Hi, I want to do feature extraction and run classifiers for email spam detection. I have attempted the feature extraction, which I think is okay, but I would prefer it as a separate function. However, I ran into problems because I could not pass the dictionary data on. Could someone please help with the feature extraction so that I can then look at the classifiers? See the code below. I am new to Python, so apologies in advance.

import os
import string
from collections import Counter
from os import listdir  # return all files and folders in the directory

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import DictVectorizer
import nltk
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
def importLingspamDataset(directory):
    allEmails = []  # for storing the emails once read
    fileNames = []
    for file in listdir(directory):
        with open(directory + '/' + file, "r") as f:  # open the file read-only and close it automatically
            fileNames.append(file)
            allEmails.append(f.read()[20:])  # append the email body, skipping the first 20 characters (not lines)
    return allEmails, fileNames


def removePunctuation(emails):
    cleanedEmails = [email.translate(str.maketrans('', '', string.punctuation)) for email in emails]  # strip punctuation
    removeNumbers = [email.translate(str.maketrans('', '', string.digits)) for email in cleanedEmails]  # strip digits
    return removeNumbers


def removeStopwords(cleanedEmails):
    removeWords = set(stopwords.words('english'))  # the set of stopwords to be removed
    filteredEmails = []
    for email in cleanedEmails:  # for each email
        # keep only the words that are not stopwords and rejoin them into one string
        filteredEmails.append(' '.join(word for word in email.split() if word.lower() not in removeWords))
    return filteredEmails


def lemmatizeEmails(cleanedEmails):
    lemma = WordNetLemmatizer()  # used for returning each word to its root form
    # lemmatize each word in each email and rejoin into one string per email
    lemmaEmails = [' '.join(lemma.lemmatize(word) for word in email.split()) for email in cleanedEmails]
    return lemmaEmails


def cleanAllEmails(emails):
    cleanPunc = removePunctuation(emails)
    cleanStop = removeStopwords(cleanPunc)
    cleanLemma = lemmatizeEmails(cleanStop)
    return cleanLemma

def createDictionary(emails):
    allWords = ' '.join(emails)  # join all cleaned emails into one string
    allWords = allWords.split()  # split it back into individual words
    dictionary = Counter(allWords)  # count how often each word appears
    for item in list(dictionary):  # iterate over a copy of the keys so entries can be deleted safely
        if len(item) == 1:
            del dictionary[item]  # drop single-character "words"
    dic = dict(dictionary.most_common(10))  # the ten most common words, e.g. for the word cloud below
    #print(dic)
    #word_cloud = WordCloud(width=1600, height=1600, background_color='white',
    # min_font_size=12).generate_from_frequencies(dic)
    #plt.imshow(word_cloud)
    #plt.axis("off")
    #plt.margins(x=0, y=0)
    #plt.show()
    #word_cloud.to_file('test1.png')
    vectorizer = DictVectorizer()
    # DictVectorizer expects an iterable of dicts, one dict per sample; wrapping the single
    # corpus-wide dictionary in a list avoids the error, but it only produces one feature row
    # for the whole corpus - see the per-email sketch after this function
    x = vectorizer.fit_transform([dictionary]).toarray()
    print(x)
    print(vectorizer.get_feature_names())

    return dictionary
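Since the question asks for the feature extraction as a separate function, here is a minimal sketch of one way it could look, using the Counter already imported above: it builds one word-count dict per email (the shape DictVectorizer expects, which is likely why passing a single dictionary failed) and keeps only the most common words of the training set so the test emails can later be vectorized against the same vocabulary. The name extractFeatures and the keepTop parameter are placeholders of mine, not part of the original code.

def extractFeatures(cleanedEmails, vocabulary=None, keepTop=3000):
    # one word-count dict per email; DictVectorizer wants a list of such dicts
    perEmailCounts = [Counter(email.split()) for email in cleanedEmails]
    if vocabulary is None:
        # build the vocabulary from the most common words across all (training) emails
        allCounts = Counter()
        for counts in perEmailCounts:
            allCounts.update(counts)
        vocabulary = {word for word, _ in allCounts.most_common(keepTop) if len(word) > 1}
    # keep only vocabulary words so training and test emails share the same feature space
    features = [{word: count for word, count in counts.items() if word in vocabulary}
                for counts in perEmailCounts]
    return features, vocabulary

Fitting DictVectorizer on the list returned for the training emails, and calling transform (not fit_transform) on the list returned for the test emails, keeps the feature columns aligned between the two sets.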

def classifyLingspamDataset(fileNames):
    classifications = []
    for name in fileNames:  # in the Lingspam corpus, spam messages have "spmsg" in the filename
        classifications.append("spmsg" in name)
    return classifications
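For the classifier step that comes next, a rough sketch of how the labels from classifyLingspamDataset and the vectorized features could be handed to one of the classifiers already imported at the top. GaussianNB is used purely as an example, and the helper name trainAndEvaluate is mine:

def trainAndEvaluate(X_train, y_train, X_test, y_test):
    classifier = GaussianNB()          # any of the other imported classifiers would slot in the same way
    classifier.fit(X_train, y_train)   # learn from the training emails and their spam/ham labels
    accuracy = classifier.score(X_test, y_test)  # fraction of test emails classified correctly
    return classifier, accuracy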

Lingspam dataset

trainingDataLingspam, trainingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/train-mails") # extract the training emails from the dataset
testingDataLingspam, testingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/test-mails") # extract the testing emails from the dataset

trainingDataLingspamClean = cleanAllEmails(trainingDataLingspam)
#testingDataLingspamClean = cleanAllEmails(testingDataLingspam)

#trainClassifyLingspam = classifyLingspamDataset(trainingLingspamFilename)  # labels come from the filenames, not the email bodies
#testClassifyLingspam = classifyLingspamDataset(testingLingspamFilename)

trainDictionary = createDictionary(trainingDataLingspamClean)

# createDictionary (testingDataLingspamClean)

...
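For completeness, a sketch of how the commented-out lines above might be wired together, assuming the extractFeatures and trainAndEvaluate helpers sketched earlier. Two things to note: classifyLingspamDataset looks for "spmsg" in the name, so it should be given the filename lists rather than the email bodies, and the DictVectorizer has to be fitted on the training features only and reused via transform on the test features so both sides get the same columns.

testingDataLingspamClean = cleanAllEmails(testingDataLingspam)

trainLabels = classifyLingspamDataset(trainingLingspamFilename)  # True for spam filenames
testLabels = classifyLingspamDataset(testingLingspamFilename)

trainFeatures, vocabulary = extractFeatures(trainingDataLingspamClean)
testFeatures, _ = extractFeatures(testingDataLingspamClean, vocabulary=vocabulary)

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(trainFeatures).toarray()  # fit on the training emails only
X_test = vectorizer.transform(testFeatures).toarray()        # reuse the same feature columns

model, accuracy = trainAndEvaluate(X_train, trainLabels, X_test, testLabels)
print("Accuracy:", accuracy)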