Пример кадра данных:
new_host split sequence expression
FALSE train AQVPYGVS 0.039267878
FALSE train ASVPYGVSI 0.039267878
FALSE train STNLYGSGR 0.261456561
FALSE valid NLYGSGLVR 0.265188519
FALSE valid SLGPSNLYG 0.419680588
FALSE valid ATSLGTTNG 0.145710993
Я пытаюсь рассчитать конформные прогнозы моей модели ( Модель регрессии PLS ), это основано на вычислении интервалов прогнозирования (цель - это выражение моих последовательностей) на основе конформной функции моих данных калибровки. Основывая свой алгоритм на следующем:
введите описание изображения здесь
В основном у меня есть:
- Разделил мои наборы данных
- Подгоняет мою модель под данные обучения
- Определил мою функцию соответствия как абсолютная ошибка между предсказанными и истинными метками
- Применил функцию соответствия к моему набору калибровочных данных
- Рассчитал мои язвы соответствия с шага 4
Теперь мне нужно определить предсказание интервалы, основанные на уровне значимости. У меня возникают проблемы при вычислении интервалов в моем наборе данных. Я продолжаю сталкиваться с различными numpy ошибками, и я не уверен, как действовать в этом случае, найденном в моем последнем методе класса под названием contal_predictions . Где я рассчитываю интервалы прогнозов.
Ниже приведен фрагмент моего кода, и я надеюсь, что он не слишком расплывчатый, при необходимости я могу предоставить дополнительную информацию.
def data_split(df):
train = df.loc[df['split'] == 'train']
valid = df.loc[df['split'] == 'valid']
X_test = valid.iloc[:,:-1]
y_test = valid.iloc[:,-1]
X_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]
X_train, X_cal, y_train, y_cal = train_test_split(X_train, y_train, test_size =0.2)
print("Data has been split")
print("X_train and y_train shape: "+ str(X_train.shape) + str(y_train.shape))
print("X_cal and y_cal shape: "+ str(X_cal.shape) + str(y_cal.shape))
print('{} instances, {} features, {} classes'.format(y_train.size,
X_train.shape[1],
np.unique(y_train).size))
return X_test, y_test, X_train, y_train, X_cal, y_cal
My NonConformistClass
:
class NonConformist():
def __init__(self, model):
self.model = model
def underlying_fit (self, X_train, y_train):
'''
Train underlying model on proper training data
@Params
X_train: has shape (n_train, n_features)
y_train: has shape (n_train)
'''
self.model.fit(X_train,y_train)
print("Model has been fitted")
def calibration_predictions(self, X_cal):
'''
Obtain predictions from the underlying model using X_cal data.
Returns an output of predicted real values as numpy.array of shape (n_test)
@params
X_cal: numpy array has shape (n_train, n_features)
'''
calibration_predictions = self.model.predict(X_cal)
print("Calibration Predictions Established")
return calibration_predictions
def test_predictions(self, X_test):
'''
Obtain predictions from the underlying model using X_test data.
Returns an output of predicted real values as numpy.array of shape (n_test)
@params
X_test: numpy array has shape (n_train, n_features)
'''
test_predictions = self.model.predict(X_test)
print("Calibration Predictions Established")
return test_predictions
def calibration_scores(calibration_predictions, y_cal):
'''
Calculates absolute error nonconformity for calibration set.
For each correct output in ``y``, nonconformity is defined as
math::
| y_i (predicted labels) - y^_i (true labels)|
@params
true_labels is a numpy array of (true) labels
predictions is a numpy array of predicted labels'''
true_labels = np.array(y_cal)
calibration_scores = np.abs(calibration_predictions - true_labels)
calibration_scores = np.sort(calibration_scores)[::-1] #sort in descending order
print("Calibration Scores Obtained")
return calibration_scores
def partial_inverse(self, calibration_scores, significance):
'''
This function is the partial inverse of the nonconformity function (calibration_scores) in order to
calculate the prediction intervals where:
apply_inverse(...)[0] is subtracted from the prediction of the
underlying model to create the lower boundary of the
prediction interval
apply_inverse(...)[1] is added to the prediction of the
underlying model to create the upper boundary of the
prediction interval
@params
Significance is a float between 0-1 (i.e. 0.05)
'''
border = int(np.floor(significance * (calibration_scores.size + 1))) - 1
border = min(max(border, 0), calibration_scores.size - 1)
return np.vstack([calibration_scores[border], calibration_scores[border]])
def conformal_predictions(self, X_test, calibration_scores, significance, test_predictions):
"""This function creates the prediction intervals based from a set of test examples.
This takes the predictions for each test pattern with the underlying model
and applies the conformity function to each prediction, resulting in
a final prediction interval for each test pattern.
Predicts the output of each test pattern using the underlying model,
and applies the (partial) inverse nonconformity function to each
prediction, resulting in a prediction interval for each test pattern.
@params
----------
X_test: consists of a numpy array of shape [n_samples, n_features]
significance level : is a float between 0 and 1; determimned as
the maximum allowed error rate of predictions.
Returns
-------
p : numpy array of shape [n_samples, 2] or [n_samples, 2, 99]
If significance is ``None``, then p contains the interval (minimum
and maximum boundaries) for each test pattern, and each significance
level (0.01, 0.02, ..., 0.99).
If significance value is a float between
0 and 1, then p contains the prediction intervals (minimum and
maximum boundaries) for the set of test patterns at the chosen
significance level.
"""
n_test = X_test.shape[0]
prediction = self.model.predict(x)
norm = np.ones(n_test)
if significance:
intervals = np.zeros((x.shape[0], 2)) #creates empty 2D numpy array
err_dist = self.err_func.partial_inverse(calibration_scores, significance)
err_dist = np.hstack([err_dist] * n_test)
err_dist *= norm
intervals[:, 0] = test_predictions - err_dist[0, :]
intervals[:, 1] = test_predictions + err_dist[1, :]
return intervals
else:
significance = np.arange(0.01, 1.0, 0.01)
intervals = np.zeros((x.shape[0], 2, significance.size))
for i, s in enumerate(significance):
err_dist = self.err_func.apply_inverse(nc, s)
err_dist = np.hstack([err_dist] * n_test)
err_dist *= norm
intervals[:, 0, i] = prediction - err_dist[0, :]
intervals[:, 1, i] = prediction + err_dist[0, :]
return intervals