Это мои полные коды для most_inforrative_feature_for_binary_classification и Naive Bayes, которые я пробовал.
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
"""
Identify most important features if given a vectorizer and binary classifier. Set n to the number
of weighted features you would like to show.
"""
counter = 1
def read_counter():
return loads(open("counter.json", "r").read()) + 1 if path.exists("counter.json") else 0
def write_counter():
with open("counter.json", "w") as f:
f.write(dumps(counter))
counter = 1
counter = read_counter()
atexit.register(write_counter)
if counter == 6:
counter = 0
# additional stopwords to be remove
# Open a file and read it into memory
file = open('..\stopwords.txt')
additional_stopwords = file.read()
additional_stopwords = additional_stopwords.split()
class_labels = classifier.classes_
feature_names = vectorizer.get_feature_names()
feature_names = [word for word in feature_names if word not in additional_stopwords]
topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
# class_labels = category
# coef = co-effecient
# feat = most informative feature
if counter == 1:
for coef, feat in topn_class1:
print(class_labels[0], coef, feat)
print(class_labels)
print()
for coef, feat in reversed(topn_class2):
print(class_labels[1], coef, feat)
elif counter == 2:
for coef, feat in topn_class1:
print(class_labels[1], coef, feat)
print()
for coef, feat in reversed(topn_class2):
print(class_labels[2], coef, feat)
elif counter == 3:
for coef, feat in topn_class1:
print(class_labels[2], coef, feat)
print()
for coef, feat in reversed(topn_class2):
print(class_labels[3], coef, feat)
elif counter == 4:
for coef, feat in topn_class1:
print(class_labels[3], coef, feat)
print()
for coef, feat in reversed(topn_class2):
print(class_labels[4], coef, feat)
elif counter == 5:
for coef, feat in topn_class1:
print(class_labels[4], coef, feat)
print()
for coef, feat in reversed(topn_class2):
print(class_labels[5], coef, feat)
else:
print("=== PLEASE RUN PROGRAM AGAIN TO VIEW THE CO-EFFICIENT FOR THE CHOSEN MODEL ===")
'''=========================================================================================='''
'''MEAN ACCURACY FOR 4 MODELS(NAIVE BAYES / LOGISTICS REGRESSION / LINEAR SVC / RANDOM FOREST)'''
'''=========================================================================================='''
# n_estimators is the number of trees to be used in the fores
def mean_acc():
models = [
RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
LinearSVC(),
MultinomialNB(),
LogisticRegression(random_state=0)]
CV = 6
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
model_name = model.__class__.__name__
accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(cv_df.groupby('model_name').accuracy.mean())
'''=========================================================================================='''
'''NAIVE BAYES CLASSIFIER'''
'''=========================================================================================='''
def NB_func():
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['cat_id'], random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf_NB = MultinomialNB().fit(X_train_tfidf, y_train)
print(clf_NB)
# save the model to disk
filename = '../dataanalysis/models/Naive_Bayes.sav'
pickle.dump(clf_NB, open(filename, 'wb'))
print()
'''Print the prediction of the category from the unknown document'''
# For now its not accurate due to insufficient sample data
# print("NAIVE BAYES CLASSIFIER: ", clf_NB.predict(count_vect.transform([""])))
print()
print("===============================================")
print("================= NAIVE BAYES =================")
print("===============================================")
most_informative_feature_for_binary_classification(tfidf, clf_NB, n=10)
Вывод при запуске NB_func ():
При первом запуске программы:
3 -7.730506816516725 abato
3 -7.730506816516725 abdome abdome
3 -7.730506816516725 abdome erythema
3 -7.730506816516725 abdome erythema redness
3 -7.730506816516725 abdomen distended complaint
3 -7.730506816516725 abdomen ex drain
3 -7.730506816516725 abdomen hbs
3 -7.730506816516725 abdomen hbs kidneys
3 -7.730506816516725 abdomen insitu
3 -7.730506816516725 abdomen insitu pinkish
4 -7.254153151633855 assist function toileting
4 -7.331276619904304 bnoxl
4 -7.343069108877106 assist incontinence
4 -7.384156232690568 breath utcome ineffective
4 -7.387014055512514 arora plansfollow
4 -7.4053139680029565 agitated change
4 -7.410079385200252 ambulate using aid
4 -7.443599738543319 accompanied bedside complain
4 -7.44802940166431 alert comfortable afebrile
4 -7.455925447446086 absence urinary symptoms
Второй раз при запуске программы:
4 -7.730945941543211 abato
4 -7.730945941543211 abdome abdome
4 -7.730945941543211 abdome erythema
4 -7.730945941543211 abdome erythema redness
4 -7.730945941543211 abdomen distended complaint
4 -7.730945941543211 abdomen ex drain
4 -7.730945941543211 abdomen hbs
4 -7.730945941543211 abdomen hbs kidneys
4 -7.730945941543211 abdomen insitu
4 -7.730945941543211 abdomen insitu pinkish
5 -7.254592276660341 assist function toileting
5 -7.331715744930789 bnoxl discharge previously
5 -7.3435082339035915 assist incontinence
5 -7.384595357717053 breathing
5 -7.387453180539 arora
5 -7.405753093029442 agitated change
5 -7.410518510226737 ambulate using
5 -7.444038863569804 accompanied bedside complain
5 -7.448468526690795 alert comfortable
5 -7.456364572472571 absence urinary symptoms
В третий раз запустить программу:
5 -7.730506816516725 abato
5 -7.730506816516725 abdome abdome
5 -7.730506816516725 abdome erythema
5 -7.730506816516725 abdome erythema redness
5 -7.730506816516725 abdomen distended complaint
5 -7.730506816516725 abdomen ex drain
5 -7.730506816516725 abdomen hbs
5 -7.730506816516725 abdomen hbs kidneys
5 -7.730506816516725 abdomen insitu
5 -7.730506816516725 abdomen insitu pinkish
6 -7.254153151633855 assist incontinence urine
6 -7.331276619904304 board
6 -7.343069108877106 assist indwelling
6 -7.384156232690568 breathing
6 -7.387014055512514 arora plansfollow watch
6 -7.4053139680029565 agitated change
6 -7.410079385200252 ambulate using aid
6 -7.443599738543319 accompanied bedside complain
6 -7.44802940166431 alert comfortable afebrile
6 -7.455925447446086 absence urinary symptoms
В четвертый раз запустить программу:
6 -7.730492001996121 abato
6 -7.730492001996121 abdome abdome
6 -7.730492001996121 abdome erythema
6 -7.730492001996121 abdome erythema redness
6 -7.730492001996121 abdomen distended complaint
6 -7.730492001996121 abdomen ex drain
6 -7.730492001996121 abdomen hbs
6 -7.730492001996121 abdomen hbs kidneys
6 -7.730492001996121 abdomen insitu
6 -7.730492001996121 abdomen insitu pinkish
7 -7.25347933696348 assist incontinence urine
7 -7.3312618053837 board
7 -7.342671383051713 assist indwelling
7 -7.383811392545885 breathing decreased shortness
7 -7.3865695243404845 arora plansfollow watch
7 -7.405088464279118 agitated behaviour aggression
7 -7.4098105414391995 ambulating
7 -7.443278461891396 accompanied bedside
7 -7.4477507335224775 alert conscious
7 -7.455910632925482 absence urinary
Пятый раз запустить программу:
7 -7.730945941543211 abato
7 -7.730945941543211 abdome abdome
7 -7.730945941543211 abdome erythema
7 -7.730945941543211 abdome erythema redness
7 -7.730945941543211 abdomen distended complaint
7 -7.730945941543211 abdomen ex drain
7 -7.730945941543211 abdomen hbs
7 -7.730945941543211 abdomen hbs kidneys
7 -7.730945941543211 abdomen insitu
7 -7.730945941543211 abdomen insitu pinkish
8 -7.254592276660341 assist incontinence urine
8 -7.331715744930789 board reassessment indicated
8 -7.3435082339035915 assist indwelling
8 -7.384595357717053 breathing free respiratory
8 -7.387453180539 aroun
8 -7.405753093029442 agitated behaviour aggression
8 -7.410518510226737 ambulate prior admission
8 -7.444038863569804 accompanied bedside
8 -7.448468526690795 alert afebrile complains
8 -7.456364572472571 absence urinary
Проблема, с которой я столкнулся, заключается в том, почему я всегда получаю одинаковые результаты при каждом запуске своей программы.Классы имеют одинаковые результаты вместо другого.