Я пытаюсь воспроизвести этот туториал: https://github.com/USC-Melady/Benchmarking_DL_MIMICIII. В нём запускается модель SuperLearner, которая с помощью перекрёстной проверки комбинирует несколько алгоритмов машинного обучения. Проблема в том, что обученная модель SuperLearner нигде не сохраняется.
Вот скрипт, который использует SuperLearner:
from SuPyLearner.supylearner.core import *
from sklearn import datasets, svm, linear_model, neighbors, svm, ensemble, neural_network
from sklearn import tree
import numpy as np
from argparse import ArgumentParser
import os
from pygam import LogisticGAM, LinearGAM
from pyearth import Earth
from sklearn.metrics import roc_auc_score, mean_squared_error
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import re
from sklearn.externals import joblib
import pickle
def load_data(datapath, foldpath, taskname, labelnum, subset, sltype):
    """Load the feature matrix, the target vector and the CV folds for a task.

    Args:
        datapath: Folder containing the input CSV file. The second-to-last
            path component is parsed for the value inserted into the
            ``imputed-normed-ep_1_<value>.npz`` file name (the text before
            its first underscore, with trailing 'h'/'r'/'s' characters
            stripped).
        foldpath: Folder containing ``imputed-normed-ep_1_<value>.npz`` and
            ``5-folds.npz``.
        taskname: One of 'mor', 'icd9' or 'los'.
        labelnum: Index used to select the label column (for 'mor' and
            'icd9') and to select the entry of the fold array.
        subset: One of 'all', 'cv' or 'mv'; selects the CSV file suffix.
        sltype: 'sl1' reads ``input_sapsiisubscores*.csv``, 'sl2' reads
            ``input*.csv``.

    Returns:
        A tuple ``(inputarray, outputarray, folds)``.

    Raises:
        ValueError: If ``taskname``, ``subset`` or ``sltype`` is not one of
            the supported values.
    """
    input_stems = {'sl1': 'input_sapsiisubscores', 'sl2': 'input'}
    subset_tails = {'all': '.csv', 'cv': '_cv.csv', 'mv': '_mv.csv'}
    # 'los' reuses the 'mor' folds, exactly as the original code did.
    fold_keys = {
        'mor': 'folds_ep_mor',
        'icd9': 'folds_ep_icd9',
        'los': 'folds_ep_mor',
    }

    # Validate everything before touching the disk so that a typo fails
    # immediately with a clear message instead of an UnboundLocalError.
    if sltype not in input_stems:
        raise ValueError(
            "sltype must be one of {0}, got {1!r}".format(sorted(input_stems), sltype))
    if subset not in subset_tails:
        raise ValueError(
            "subset must be one of {0}, got {1!r}".format(sorted(subset_tails), subset))
    if taskname not in fold_keys:
        raise ValueError(
            "taskname must be one of {0}, got {1!r}".format(sorted(fold_keys), taskname))

    inputname = input_stems[sltype] + subset_tails[subset]
    inputarray = np.genfromtxt(os.path.join(datapath, inputname), delimiter=',')

    # NOTE: rstrip('hrs') strips a trailing run of the characters h, r, s,
    # not the literal suffix 'hrs'.
    hrs = datapath.rstrip('/').split('/')[-2].split('_')[0].rstrip('hrs')
    labels_path = os.path.join(foldpath, 'imputed-normed-ep_1_%s.npz' % hrs)
    # The context manager closes the .npz archive once the array is read.
    with np.load(labels_path) as outputfile:
        if taskname == 'mor':
            outputarray = outputfile['adm_labels_all'][:, labelnum]
            print(labelnum)
            print(max(outputarray))
        elif taskname == 'icd9':
            outputarray = outputfile['y_icd9'][:, labelnum]
        else:  # taskname == 'los'
            # NOTE(review): presumably converts minutes to hours - confirm.
            outputarray = outputfile['y_los'] / 60.0

    with np.load(os.path.join(foldpath, '5-folds.npz')) as foldfile:
        folds = foldfile[fold_keys[taskname]][labelnum][0]

    return inputarray, outputarray, folds
def get_algolib_classification():
    """Return the candidate learners used for classification tasks.

    Returns:
        A tuple ``(estimators, names)`` of two equally long lists: freshly
        constructed scikit-learn estimators with default parameters, and
        the label reported for each of them.
    """
    # Each entry pairs a label with the estimator it refers to.
    # The original list also carried disabled (commented-out) candidates:
    # LassoLarsIC, LinearGAM, Earth, BayesianRidge, DecisionTreeClassifier,
    # bart, a second GradientBoostingClassifier, XGBClassifier and
    # LGBMClassifier.
    # NOTE(review): ElasticNet is a regression estimator although this is
    # the classification library - kept as in the original.
    candidates = [
        ('SL.glm', linear_model.LogisticRegression()),
        ('SL.glmnet', linear_model.ElasticNet()),
        ('SL.gbm', ensemble.GradientBoostingClassifier()),
        ('SL.nnet', neural_network.MLPClassifier()),
        ('SL.ipredbagg', ensemble.BaggingClassifier()),
        ('SL.randomForest', ensemble.RandomForestClassifier()),
    ]
    estimators = [estimator for _, estimator in candidates]
    names = [name for name, _ in candidates]
    return estimators, names
def get_algolib_regression():
    """Return the candidate learners used for regression tasks.

    Returns:
        A tuple ``(estimators, names)`` of two equally long lists: freshly
        constructed scikit-learn estimators with default parameters, and
        the label reported for each of them.
    """
    # Each entry pairs a label with the estimator it refers to.
    # The original list also carried disabled (commented-out) candidates:
    # LassoLarsIC, LinearGAM, Earth, BayesianRidge, DecisionTreeClassifier,
    # bart, GradientBoostingClassifier, XGBClassifier and LGBMClassifier.
    candidates = [
        ('SL.glm', linear_model.LinearRegression()),
        ('SL.glmnet', linear_model.ElasticNet()),
        ('SL.gbm', ensemble.GradientBoostingRegressor()),
        ('SL.nnet', neural_network.MLPRegressor()),
        ('SL.ipredbagg', ensemble.BaggingRegressor()),
        ('SL.randomForest', ensemble.RandomForestRegressor()),
    ]
    estimators = [estimator for _, estimator in candidates]
    names = [name for name, _ in candidates]
    return estimators, names
def main():
    """Cross-validate a SuperLearner from the command line and save the results.

    Positional command-line arguments: ``datapath``, ``foldpath``,
    ``taskname``, ``labelnum``, ``subset`` (all/cv/mv) and, optionally,
    ``sltype`` (sl1/sl2, defaults to sl2).

    Side effects:
        * prints the parsed arguments and the per-fold metric values with
          their mean and standard deviation;
        * writes ``pyslresults-<taskname>-<labelnum>-<subset>-<sltype>.npz``
          into ``datapath``;
        * calls ``slObj.save`` with
          ``<datapath>/SLModels/<taskname>/SuperLearner.pkl``.
    """
    parser = ArgumentParser()
    parser.add_argument('datapath',
                        help='path of data folder')
    parser.add_argument('foldpath',
                        help='path of fold file')
    parser.add_argument('taskname',
                        help='name of task, like mor/icd9/los')
    parser.add_argument('labelnum', type=int,
                        help='number of label used for current task')
    # `choices` replaces the former assert statements: asserts are stripped
    # under `python -O` and give no usage message.
    parser.add_argument('subset', choices=['all', 'cv', 'mv'],
                        help='choose to use full dataset or only cv/mv, value must be all/cv/mv')
    # nargs='?' is required for the default of a positional argument to
    # ever take effect.
    parser.add_argument('sltype', nargs='?', default='sl2', choices=['sl1', 'sl2'],
                        help='type of superlearner, sl1 uses sapsii scores, sl2 uses features')
    args = parser.parse_args()

    datapath = args.datapath
    foldpath = args.foldpath
    taskname = args.taskname
    labelnum = args.labelnum
    subset = args.subset
    sltype = args.sltype

    print("datapath : "+str(datapath))
    print("foldpath : "+str(foldpath))
    print("taskname : "+str(taskname))
    print("labelnum : "+str(labelnum))
    print("subset : "+str(subset))
    print("sltype : "+str(sltype))

    X, y, folds = load_data(datapath, foldpath, taskname, labelnum, subset, sltype)

    # 'los' is the only regression task; every other task name is treated
    # as classification.
    if taskname == 'los':
        algolib, algonames = get_algolib_regression()
        sl = SuperLearner(algolib, algonames, loss='L2', coef_method='NNLS', K=10)
        metricname, metric = 'mses', mean_squared_error
    else:
        algolib, algonames = get_algolib_classification()
        sl = SuperLearner(algolib, algonames, loss='nloglik', K=10)
        metricname, metric = 'aurocs', roc_auc_score

    # NOTE(review): this relies on a cv_superlearner that returns six
    # values (including the fitted learner); its definition is not in this
    # file - confirm against the SuPyLearner version in use. The last two
    # returned values are not used here.
    risk_cv, y_pred_cv, y_true_cv, slObj, _, _ = cv_superlearner(
        sl, X, y, K=5, fixed_folds=folds)

    # Only the last prediction column of each fold is scored.
    metric_results = [
        metric(y_true, y_pred[:, -1].flatten())
        for y_pred, y_true in zip(y_pred_cv, y_true_cv)
    ]
    print(metric_results)
    print(np.mean(metric_results))
    print(np.std(metric_results))

    results_name = 'pyslresults-{0}-{1}-{2}-{3}.npz'.format(taskname, labelnum, subset, sltype)
    # The metric array is stored under 'aurocs' or 'mses' depending on the task.
    np.savez(os.path.join(datapath, results_name),
             risk_cv=risk_cv, y_pred_cv=y_pred_cv, y_true_cv=y_true_cv,
             **{metricname: metric_results})

    model_dir = os.path.join(datapath, 'SLModels', taskname)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    # NOTE(review): `save` is not defined in this file. If it pickles the
    # whole object, it will fail with "TypeError: cannot serialize
    # '_io.TextIOWrapper' object" whenever the learner still holds an open
    # file handle - confirm and drop/close that attribute before pickling.
    slObj.save(os.path.join(model_dir, 'SuperLearner.pkl'))
# Run the command-line workflow only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Моя проблема в том, что когда я пытаюсь сохранить экземпляр класса SuperLearner (объект slObj), я получаю следующую ошибку:
slObj.save(datapath+'/SLModels/'+taskname+'/SuperLearner.pkl')
TypeError: cannot serialize '_io.TextIOWrapper' object
Моя цель — использовать обученную модель в другом скрипте. Может ли кто-нибудь помочь мне решить эту проблему или подсказать альтернативный способ сохранить модель?