I found that I had two dimension errors. Since they occurred simultaneously, I could not easily track them down. I am posting the answer here in case it is useful to someone someday.
1. From the documentation
Adapting the example from the documentation helped me track down the dimension errors.
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate, cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
cv_results = cross_validate(lasso, X, y, cv=3)
sorted(cv_results.keys())
print( cv_results['test_score'] )
[0.33150734 0.08022311 0.03531764]
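Here sorted(cv_results.keys()) returns ['fit_time', 'score_time', 'test_score'] (the same keys appear in the output in section 3 below); the per-fold scores printed above come from the 'test_score' entry.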
Note that cross_validate requires the first dimensions of X and y (the number of samples) to match:
print( X.shape, y.shape )
(150, 10) (150,)
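For reference, x_data and y_data come from the question and are not shown here. Judging by the shapes reported in the tracebacks below ((2,) vs. (10,) on the full data, and (2,) vs. (8,) on a 4/5 training split with cv=5), they can be assumed to hold 10 samples with 2 features. A stand-in with arbitrary values, only so the snippets below can be reproduced (hypothetical data, not from the question):
import numpy as np
from scipy.optimize import curve_fit   # needed for the curve_fit calls below

rng = np.random.RandomState(0)
x_data = rng.rand(10, 2)   # (n_samples, n_features), the layout cross_validate expects
y_data = rng.rand(10)      # (n_samples,)
print( x_data.shape, y_data.shape )    # (10, 2) (10,)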
Note that with these dimensions the naive way of calling curve_fit raises an error:
def lincomb( X, a, b ):
    x1 = X[0]
    x2 = X[1]
    return a*x1*x2 + b

popt, pcov = curve_fit( lincomb, x_data, y_data )
print( popt )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-6-dedaa241e377> in <module>
4 return a*x1*x2 + b
5
----> 6 popt, pcov = curve_fit( lincomb, x_data, y_data )
7 print( popt )
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, jac, **kwargs)
754 # Remove full_output from kwargs, otherwise we're passing it in twice.
755 return_full = kwargs.pop('full_output', False)
--> 756 res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
757 popt, pcov, infodict, errmsg, ier = res
758 cost = np.sum(infodict['fvec'] ** 2)
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in leastsq(func, x0, args, Dfun, full_output, col_deriv, ftol, xtol, gtol, maxfev, epsfcn, factor, diag)
381 if not isinstance(args, tuple):
382 args = (args,)
--> 383 shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
384 m = shape[0]
385
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in _check_func(checker, argname, thefunc, x0, args, numinputs, output_shape)
24 def _check_func(checker, argname, thefunc, x0, args, numinputs,
25 output_shape=None):
---> 26 res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
27 if (output_shape is not None) and (shape(res) != output_shape):
28 if (output_shape[0] != 1):
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in func_wrapped(params)
456 if transform is None:
457 def func_wrapped(params):
--> 458 return func(xdata, *params) - ydata
459 elif transform.ndim == 1:
460 def func_wrapped(params):
ValueError: operands could not be broadcast together with shapes (2,) (10,)
curve_fit passes xdata to the model function unchanged, and lincomb unpacks its first axis (x1 = X[0], x2 = X[1]); xdata must therefore have shape (2, n_samples), i.e. the transpose of the (n_samples, 2) layout that cross_validate expects. The problem is solved by transposing x_data in the call to curve_fit:
popt, pcov = curve_fit( lincomb, x_data.T, y_data )
print( popt )
[-0.17857143 -1.57142857]
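As a quick check (with the stand-in data above; the fitted values themselves will differ from the ones shown), the model now returns one prediction per sample:
y_pred = lincomb( x_data.T, *popt )
print( y_pred.shape )   # (10,), same as y_data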
2. The class
Using the new dimensions of x_data with cross_validate (and the class defined in the question) leads to a different error:
from sklearn.model_selection import cross_validate

class LinComb:
    def __init__( self, a=None, b=None ):
        self.a = a
        self.b = b

    def _lincomb_background( self, X, a, b ):
        x1 = X[0]
        x2 = X[1]
        return a*x1*x2 + b

    def predict( self, X ):
        return self._lincomb_background( X, self.a, self.b )

    def fit( self, X, y ):
        from scipy.optimize import curve_fit
        popt, pcov = curve_fit( self._lincomb_background, X, y )
        self.a = popt[0]
        self.b = popt[1]
        return self

    def get_params( self, deep=False ):
        return { 'a':self.a, 'b':self.b }

    def set_params( self, **parameters ):
        for parameter, value in parameters.items():
            setattr( self, parameter, value )
        return self

cross_validate( LinComb(), x_data, y_data, cv=5, scoring='neg_mean_squared_error' )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-e0ff8bb83213> in <module>
----> 1 cross_validate( LinComb(), x_data, y_data, cv=5, scoring='neg_mean_squared_error' )
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
229 return_times=True, return_estimator=return_estimator,
230 error_score=error_score)
--> 231 for train, test in cv.split(X, y, groups))
232
233 zipped_scores = list(zip(*scores))
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
512 estimator.fit(X_train, **fit_params)
513 else:
--> 514 estimator.fit(X_train, y_train, **fit_params)
515
516 except Exception as e:
<ipython-input-9-ff88060f1729> in fit(self, X, y)
15 def fit( self, X, y ):
16 from scipy.optimize import curve_fit
---> 17 popt, pcov = curve_fit( self._lincomb_background, X, y )
18 self.a = popt[0]
19 self.b = popt[1]
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, jac, **kwargs)
754 # Remove full_output from kwargs, otherwise we're passing it in twice.
755 return_full = kwargs.pop('full_output', False)
--> 756 res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
757 popt, pcov, infodict, errmsg, ier = res
758 cost = np.sum(infodict['fvec'] ** 2)
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in leastsq(func, x0, args, Dfun, full_output, col_deriv, ftol, xtol, gtol, maxfev, epsfcn, factor, diag)
381 if not isinstance(args, tuple):
382 args = (args,)
--> 383 shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
384 m = shape[0]
385
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in _check_func(checker, argname, thefunc, x0, args, numinputs, output_shape)
24 def _check_func(checker, argname, thefunc, x0, args, numinputs,
25 output_shape=None):
---> 26 res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
27 if (output_shape is not None) and (shape(res) != output_shape):
28 if (output_shape[0] != 1):
/usr/local/lib/python3.6/dist-packages/scipy/optimize/minpack.py in func_wrapped(params)
456 if transform is None:
457 def func_wrapped(params):
--> 458 return func(xdata, *params) - ydata
459 elif transform.ndim == 1:
460 def func_wrapped(params):
ValueError: operands could not be broadcast together with shapes (2,) (8,)
3. The dimension error inside the class
This error comes from curve_fit, not from cross_validate, and must be fixed inside the class, in both methods that call the model _lincomb_background(), i.e. in fit() as well as in predict(). The modified class:
class LinComb:
    def __init__( self, a=None, b=None ):
        self.a = a
        self.b = b

    def _lincomb_background( self, X, a, b ):
        x1 = X[0]
        x2 = X[1]
        return a*x1*x2 + b

    def predict( self, X ):
        return self._lincomb_background( X.T, self.a, self.b )   # Call with transposed X!

    def fit( self, X, y ):
        from scipy.optimize import curve_fit
        popt, pcov = curve_fit( self._lincomb_background, X.T, y )   # Call with transposed X!
        self.a = popt[0]
        self.b = popt[1]
        return self

    def get_params( self, deep=False ):
        return { 'a':self.a, 'b':self.b }

    def set_params( self, **parameters ):
        for parameter, value in parameters.items():
            setattr( self, parameter, value )
        return self
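Before cross-validating, a quick smoke test of the modified class on the full stand-in data set confirms that fit() and predict() now agree on shapes:
model = LinComb().fit( x_data, y_data )
print( model.a, model.b )                # fitted parameters
print( model.predict( x_data ).shape )   # (10,), one prediction per sample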
With these two modified calls, cross_validate works as expected:
cross_validate( LinComb(), x_data, y_data, cv=5, scoring='neg_mean_squared_error' )
{'fit_time': array([0.00105524, 0.00051618, 0.0004158 , 0.00040078, 0.00039887]),
 'score_time': array([0.00158715, 0.0001812 , 0.00017715, 0.00017595, 0.00017548]),
 'test_score': array([-12.89, -0.29918379, -3.82378685, -2.7725908 , -2.72051908])}
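Note that the scores are negative because scoring='neg_mean_squared_error' returns the negated MSE, so larger values (closer to zero) are better.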
4. Summary
a) First check that the dimensions are correct for cross_validate()
b) Then adjust the dimensions inside the class, in the call to curve_fit()
c) Finally, adjust the dimensions inside the class, in predict()
A short sketch showing how to also retrieve the per-fold parameters follows below.
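If you also want to inspect the parameters fitted on each fold, cross_validate can return the fitted estimators; the return_estimator flag appears in the cross_validate signature in the traceback above, so this is a minor extension of the same call:
cv_results = cross_validate( LinComb(), x_data, y_data, cv=5,
                             scoring='neg_mean_squared_error',
                             return_estimator=True )
for est in cv_results['estimator']:
    print( est.a, est.b )   # parameters fitted on each training fold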