Привет, я хочу выполнить извлечение функций и классификаторы для обнаружения спама в электронной почте. Я попытался выполнить извлечение функций, что, на мой взгляд, нормально, но я бы предпочел это как отдельную функцию. Однако у меня были проблемы с тем, что я не мог передать данные словаря. Пожалуйста, не мог бы кто-нибудь помочь с извлечением признаков, чтобы я мог затем посмотреть классификаторы. См. Код ниже. Я новичок в python, поэтому извиняюсь.
import os
import string
from collections import Counter
from os import listdir # return all files and folders in the directory
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import DictVectorizer
import nltk
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
def importLingspamDataset(dir):
    """Read every file in *dir* and return (email bodies, filenames).

    The first 20 characters of each file are skipped (per the original
    note: 20 characters, not lines — presumably a fixed header prefix;
    TODO confirm against the Lingspam file format).

    Args:
        dir: path to a directory containing one email per file.

    Returns:
        tuple of (list of email contents, list of corresponding filenames).
    """
    allEmails = []  # email bodies, in directory-listing order
    fileNames = []
    for fname in listdir(dir):
        # `with` guarantees the handle is closed even if read() raises
        # (the original used open()/close(), leaking on error)
        with open(dir + '/' + fname, "r") as f:
            fileNames.append(fname)
            allEmails.append(f.read()[20:])
    return allEmails, fileNames
def removePunctuation(emails):
    """Delete all punctuation and digit characters from each email string."""
    # one combined translation table deletes both character classes in a
    # single C-level pass per email
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    return [email.translate(strip_table) for email in emails]
def removeStopwords(cleanedEmails):
    """Remove English stopwords from every email.

    Fixes the original logic, which iterated the stopword set and asked
    whether each stopword was missing from that same set (never true),
    so nothing was ever filtered and stray words would have been
    *appended* to the email list rather than removed from the texts.

    Args:
        cleanedEmails: list of email strings (already punctuation-free).

    Returns:
        list of email strings with stopwords removed, whitespace-normalized.
    """
    removeWords = set(stopwords.words('english'))  # O(1) membership tests
    return [
        ' '.join(word for word in email.split() if word.lower() not in removeWords)
        for email in cleanedEmails
    ]
def lemmatizeEmails(cleanedEmails):
    """Lemmatize every word of every email.

    The original passed each *entire email string* to lemmatize(), which
    treats its argument as a single word — effectively a no-op for any
    multi-word text. Lemmatization must be applied per token.

    Args:
        cleanedEmails: list of email strings.

    Returns:
        list of email strings with each word reduced to its lemma.
    """
    lemma = WordNetLemmatizer()
    return [
        ' '.join(lemma.lemmatize(word) for word in email.split())
        for email in cleanedEmails
    ]
def cleanAllEmails(cleanedEmails):
    """Full cleaning pipeline: strip punctuation/digits, drop stopwords, lemmatize."""
    emails = removePunctuation(cleanedEmails)
    emails = removeStopwords(emails)
    emails = lemmatizeEmails(emails)
    return emails
def createDictionary(email):
    """Build a word-frequency dictionary over all emails and vectorize it.

    Joins every cleaned email into one token stream, counts word
    frequencies, drops single-character tokens, then feeds the counts to
    a DictVectorizer.

    Note: DictVectorizer.fit_transform expects an *iterable of dicts*
    (one dict per sample), not a single mapping — this was the original
    "can't pass the dictionary" failure; wrapping the Counter in a
    one-element list fixes it.

    Args:
        email: list of cleaned email strings.

    Returns:
        Counter mapping word -> frequency (single-char tokens removed).
    """
    allWords = ' '.join(email).split()
    dictionary = Counter(allWords)
    # iterate over the *set* of tokens so each key is examined once
    # (the original re-deleted the same key for every duplicate token)
    for item in set(allWords):
        if len(item) == 1:
            del dictionary[item]
    vectorizer = DictVectorizer()
    # fit once and reuse the result (original called fit_transform twice)
    x = vectorizer.fit_transform([dictionary]).toarray()
    print(x)
    print(vectorizer.get_feature_names())
    return dictionary
def classifyLingspamDataset(email):
    """Return a bool label per filename: True when the name contains 'spmsg'."""
    return ["spmsg" in name for name in email]
# Lingspam dataset
# --- Script entry: Lingspam pipeline ---
# Loads raw train/test emails from disk, cleans the training set, and
# builds its word-frequency dictionary. The commented-out lines mirror
# the same steps for the test set and for label extraction.
trainingDataLingspam, trainingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/train-mails") # extract the training emails from the dataset
testingDataLingspam, testingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/test-mails") # extract the testing emails from the dataset
trainingDataLingspamClean = cleanAllEmails(trainingDataLingspam)
#testingDataLingspamClean = cleanAllEmails(testingDataLingspam)
#trainClassifyLingspam = classifyLingspamDataset(trainingDataLingspam)
#testClassifyLingspam = classifyLingspamDataset(testingDataLingspam)
trainDictionary = createDictionary(trainingDataLingspamClean)
# createDictionary (testingDataLingspamClean)