Извините за любые очевидные ошибки — я совсем новичок. Я разбил набор данных на обучающую и тестовую выборки и успешно применил наивный байесовский классификатор, получив точность 0,8888 (см. код ниже). Теперь я хочу применить уже обученную модель ко второму набору данных — те же признаки и метки, но неизвестный результат. Как мне это сделать?
import pandas as pd
import numpy as np
# Train a Gaussian Naive Bayes classifier on train_predictions.csv and report
# its accuracy on a held-out 30% split.
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Widen pandas' console output so the frames printed below are easier to read.
for option, value in (
    ("display.width", 400),
    ("display.max_columns", 20),
    ("display.max_rows", 200),
):
    pd.set_option(option, value)

testdf = pd.read_csv("train_predictions.csv")

# Show the dtype pandas inferred for every column.
print(testdf.dtypes)

# Replace the values of ID and THAL with integer codes. The same encoder
# object is re-fitted for each column, so afterwards it only remembers THAL.
le = preprocessing.LabelEncoder()
for column in ("ID", "THAL"):
    testdf[column] = le.fit_transform(testdf[column])
print(testdf.head())

# Features are every column except the row identifier and the target.
excluded = ("ID", "HEART DZ")
cols = [col for col in testdf.columns if col not in excluded]
data = testdf[cols]
target = testdf["HEART DZ"]
print(data.head())

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.30, random_state=10
)

# Fit on the training part, then score the predictions for the held-out part.
gnb = GaussianNB()
gnb.fit(data_train, target_train)
pred = gnb.predict(data_test)
accuracy = accuracy_score(target_test, pred, normalize=True)
print("Naive-Bayes accuracy : ", accuracy)
ОБНОВЛЕННЫЙ КОД:
# Train Gaussian Naive Bayes on the labelled CSV, then predict HEART DZ for the
# rows of a second CSV that carries the same feature columns.
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

testdf = pd.read_csv("train_predictions.csv")
predictdf = pd.read_csv("export_dataframe.csv")

# change output settings
pd.set_option("display.width", 400)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 200)

# ID is excluded from the features below, so each frame can use its own
# throwaway encoder for it.
testdf["ID"] = preprocessing.LabelEncoder().fit_transform(testdf["ID"])
predictdf["ID"] = preprocessing.LabelEncoder().fit_transform(predictdf["ID"])

# THAL is a model feature: learn the category -> integer mapping from the
# training data only and reuse it for the new data. Re-fitting the encoder on
# predictdf could assign a different integer to the same category.
# transform() raises ValueError if predictdf contains a THAL value that never
# occurred in the training data.
le = preprocessing.LabelEncoder()
testdf["THAL"] = le.fit_transform(testdf["THAL"])
predictdf["THAL"] = le.transform(predictdf["THAL"])

# ID is not relevant to model, HEART DZ will be our target (drop them)
cols = [col for col in testdf.columns if col not in ["ID", "HEART DZ"]]
data = testdf[cols]
target = testdf["HEART DZ"]

# Select the prediction features with the training column list so both frames
# expose exactly the same columns in the same order.
pred_data = predictdf[cols]
# The outcome is what we are trying to predict, so the column may be missing
# from the new file; .get() then yields None instead of raising KeyError.
pred_target = predictdf.get("HEART DZ")

# split dataset (no random_state, so the split differs from run to run)
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.30
)

# Gaussian Naive Bayes
gnb = GaussianNB()
# `pred` holds the predictions for the held-out split; pass it with
# target_test to accuracy_score() to check the model quality.
pred = gnb.fit(data_train, target_train).predict(data_test)

# predict() needs a 2-D (rows x features) input with the same columns used in
# fit(): pass the filtered feature frame itself, not the whole frame wrapped
# in a list.
predictions = gnb.predict(pred_data)
print(predictions)
ОБНОВЛЕННЫЙ КОД 2:
# Train Gaussian Naive Bayes on features from train_predictions.csv with
# labels from train_labels.csv, then print a prediction for every row of
# export_dataframe.csv.
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

testdf = pd.read_csv("train_predictions.csv")
testlabelsdf = pd.read_csv("train_labels.csv")
predictdf = pd.read_csv("export_dataframe.csv")

# ID is excluded from the features below, so each frame can use its own
# throwaway encoder for it.
testdf["ID"] = preprocessing.LabelEncoder().fit_transform(testdf["ID"])
predictdf["ID"] = preprocessing.LabelEncoder().fit_transform(predictdf["ID"])

# THAL is a model feature: fit the encoder on the training data only and
# reuse that mapping for the new data, so a given category always becomes the
# same integer. transform() raises ValueError if predictdf contains a THAL
# value that never occurred in the training data.
le = preprocessing.LabelEncoder()
testdf["THAL"] = le.fit_transform(testdf["THAL"])
predictdf["THAL"] = le.transform(predictdf["THAL"])

# ID is not relevant to model, HEART DZ will be our target
cols = [col for col in testdf.columns if col not in ["ID"]]
data = testdf[cols]
# NOTE(review): assumes train_labels.csv rows are in the same order as
# train_predictions.csv rows - confirm (e.g. by comparing the ID columns).
target = testlabelsdf["HEART DZ"]

# split dataset (test_size is left at the library default)
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=10
)

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(data_train, target_train)

# Accuracy on the held-out split; kept in `ac` for inspection.
target_pred = gnb.predict(data_test)
ac = accuracy_score(target_test, target_pred, normalize=True)

# The model was fitted without the ID column, so the new data must be reduced
# to the same feature columns before predicting.
yNew = gnb.predict(predictdf[cols])
for label in yNew:
    print("Predicted: ", label)