Predict.proba в Наив Байес - PullRequest
       107

Predict.proba в Наив Байес

0 голосов
/ 11 октября 2018

У меня вопрос о том, как преобразовать результат предиката. Проба в Наивном Байесе в проценты.Я уже попробовал, но не получилось.Я хочу, чтобы результат стал как 50%, 100%.Это пример моего кода

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import itertools
plt.style.use('ggplot')

class bayesNaive:
    def __init__(self, dataTrainInput):
        self.data = pd.read_csv(dataTrainInput, delimiter=";", encoding="latin-1")
def encoderLabel(self):
    self.lb = LabelEncoder()
    df = pd.DataFrame(self.data,
                      columns=["laboratory_registration_id", "Albumin","Asam Urat", "Basofil", "Basofil Absolut","BE", "Berat Jenis", "Besi (Fe/iron)", "Eosinofil", "Eosinofil Absolut", "Eritrosit","Ferritin", "Free T4", "Glukosa Darah 2 jam PP", "Glukosa Darah Puasa","Glukosa Darah Sewaktu", "Hb-A1c", "Hematokrit", "Hemoglobin", "Kalium (K)","Klorida (Cl)", "Kolesterol HDL", "Kolesterol LDL", "Kolesterol Total", "Kreatinin","Leukosit", "Limfosit", "Limfosit Absolut", "MCH", "MCHC", "MCV", "Monosit","Monosit Absolut", "MPV", "Natrium (Na)", "Neutrofil Absolut", "Neutrofil Segmen","O2 Saturasi", "pCO2", "PDW", "pH", "pO2", "RDW-CV", "RDW-SD", "T CO2", "TIBC","T3 Total", "T4 Total", "Trigliserida", "Trombosit", "Troponin T", "TSH", "Ureum", "Age", "Gender", "Disease"])
    data1 = self.data["Bakteri"]
    data2 = self.data["Bilirubin"]
    data3 = self.data["Blood"]
    data5 = self.data["Epitel"]
    data6 = self.data["Eritrosit Urin"]
    data7 = self.data["Faktor Rheumatoid (RF)"]
    data8 = self.data["Glukosa"]
    data9 = self.data["HBsAg"]
    data10 = self.data["Kejernihan"]
    data11 = self.data["Keton"]
    data12 = self.data["Kristal"]
    data13 = self.data["Leukosit Urin"]
    data14 = self.data["Nitrit"]
    data15 = self.data["Protein"]
    data16 = self.data["Silinder"]
    data17 = self.data["Urobilinogen"]
    data18 = self.data["Warna"]

    x1 = self.lb.fit_transform(data1)
    x2 = self.lb.fit_transform(data2)
    x3 = self.lb.fit_transform(data3)
    x5 = self.lb.fit_transform(data5)
    x6 = self.lb.fit_transform(data6)
    x7 = self.lb.fit_transform(data7)
    x8 = self.lb.fit_transform(data8)
    x9 = self.lb.fit_transform(data9)
    x10 = self.lb.fit_transform(data10)
    x11 = self.lb.fit_transform(data11)
    x12 = self.lb.fit_transform(data12)
    x13 = self.lb.fit_transform(data13)
    x14 = self.lb.fit_transform(data14)
    x15 = self.lb.fit_transform(data15)
    x16 = self.lb.fit_transform(data16)
    x17 = self.lb.fit_transform(data17)
    x18 = self.lb.fit_transform(data18)

    df1 = pd.DataFrame(x1, columns=['Bakteri'])
    df2 = pd.DataFrame(x2, columns=['Bilirubin'])
    df3 = pd.DataFrame(x3, columns=['Blood'])
    df5 = pd.DataFrame(x5, columns=['Epitel'])
    df6 = pd.DataFrame(x6, columns=['Eritrosit Urin'])
    df7 = pd.DataFrame(x7, columns=['Faktor Rheumatoid (RF)'])
    df8 = pd.DataFrame(x8, columns=['Glukosa'])
    df9 = pd.DataFrame(x9, columns=['HBsAg'])
    df10 = pd.DataFrame(x10, columns=['Kejernihan'])
    df11 = pd.DataFrame(x11, columns=['Keton'])
    df12 = pd.DataFrame(x12, columns=['Kristal'])
    df13 = pd.DataFrame(x13, columns=['Leukosit Urin'])
    df14 = pd.DataFrame(x14, columns=['Nitrit'])
    df15 = pd.DataFrame(x15, columns=['Protein'])
    df16 = pd.DataFrame(x16, columns=['Silinder'])
    df17 = pd.DataFrame(x17, columns=['Urobilinogen'])
    df18 = pd.DataFrame(x18, columns=['Warna'])

    return pd.concat([df1, df2, df3, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df], axis=1)

def split_label(self):
    res = self.encoderLabel()
    X = res.iloc[:, :-1].values
    Y = res['Disease'].values
    return X, Y

def test_split(self):
    X, Y = self.split_label()
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)
    return X_train, X_test, Y_train, Y_test

def Classify_lab(self):
    try:
        X_train, X_test, Y_train, Y_test = self.test_split()
        # print(X_test.shape)
        model = GaussianNB()
        model_train = model.fit(X_train, Y_train)

        model_score = model_train.score(X_test, Y_test)
        accuracy = round(model_score * 100, 2)
        predicted = model_train.predict(X_test)
        Y_prob = model_train.predict_proba(X_test)
        report=classification_report(Y_test, predicted)
        conf_m = confusion_matrix(Y_test, predicted)
        # "precision: tp/(tp+fp)"
        # "recall: tp/(tp+fn)"
        # "f1-score: (2xprecisionxrecall)/(precision+recall)"

        # print(test.shape)
        test.columns = ["Bakteri", "Bilirubin", "Blood", "Epitel","Eritrosit Urin","Faktor Rheumatoid (RF)", "Glukosa", "HBsAg", "Kejernihan", "Keton", "Kristal", "Leukosit Urin", "Nitrit", "Protein", "Silinder", "Urobilinogen", "Warna","laboratory_registration_id", "Albumin", "Asam Urat", "Basofil", "Basofil Absolut","BE", "Berat Jenis", "Besi (Fe/iron)", "Eosinofil", "Eosinofil Absolut", "Eritrosit","Ferritin", "Free T4", "Glukosa Darah 2 jam PP", "Glukosa Darah Puasa","Glukosa Darah Sewaktu", "Hb-A1c", "Hematokrit", "Hemoglobin", "Kalium (K)","Klorida (Cl)", "Kolesterol HDL", "Kolesterol LDL", "Kolesterol Total", "Kreatinin","Leukosit", "Limfosit", "Limfosit Absolut", "MCH", "MCHC", "MCV", "Monosit","Monosit Absolut", "MPV", "Natrium (Na)", "Neutrofil Absolut", "Neutrofil Segmen","O2 Saturasi", "pCO2", "PDW", "pH", "pO2", "RDW-CV", "RDW-SD", "T CO2", "TIBC","T3 Total", "T4 Total", "Trigliserida", "Trombosit", "Troponin T", "TSH", "Ureum","Age", "Gender"]

        labels = pd.DataFrame(predicted)
        # print(Y_prob)
        print("bulatan")
        # Y_prob[:, 1:3] = np.around(Y_prob[:,1:3],decimals=1)
        probability = pd.DataFrame(Y_prob)
        labels.columns = ["Disease"]
        probability.columns = ["Diabetes mellitus", "Ginjal","Jantung", "Thalassemia"]

        result = pd.concat([test, labels, probability], axis=1)
        # Z = self.decodeLabel(X_test)
        # print("Ini", Z)
        # exit()
        return [result, str(accuracy), report, conf_m]
    except ValueError as v:
        print(v)
    except FileNotFoundError:
        print('File not found')


if __name__ == '__main__':
    dataTrainInput = "perc4.csv"
    obj = bayesNaive(dataTrainInput)
    print(obj.Classify_lab())

В Classify_lab () я предсказываю вероятность других целей

 Y_prob = model_train.predict_proba(X_test)

Затем я создаю фрейм данных

probability = pd.DataFrame(Y_prob)

Когда я запускаю вероятность, я получу этот результат

     Diabetes mellitus      ...          Thalassemia
0         1.000000e+00      ...         0.000000e+00
1         5.693959e-28      ...         0.000000e+00
2        1.610343e-182      ...         0.000000e+00
3         4.347851e-04      ...         9.949097e-01
4         2.611833e-24      ...         0.000000e+00
5         6.281686e-32      ...         1.000000e+00
6         1.000000e+00      ...         3.472943e-99
7        2.580374e-132      ...         0.000000e+00
8         1.457926e-26      ...         1.000000e+00
9         1.000000e+00      ...         0.000000e+00
10        4.993317e-33      ...         1.000000e+00

И если я запусту вероятность в HTML, я получу этот результат

Результат

Большое спасибо ...

1 Ответ

0 голосов
/ 25 октября 2018

Общая вероятность NB равна 1. В ваших случаях вы можете видеть, где бы ни было 1,0, для оставшихся столбцов вероятность равна 0 или очень близка к 0. Вы можете просто извлечь максимум из возвращенного массива и умножить на 100.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...