Я получаю ValueError ниже, когда пытаюсь отправить свой конвейер в грейдер. И я не уверен, где я должен сбрить 12500 строк данных.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
Мне было поручено создать модель, которая сочетает в себе business_features домов престарелых с результатами их опросов цикла 1, а также время между опросом цикла 1 и цикла 2 для прогнозирования общего балла цикла 2.
Это мой код, который я использую для выполнения вышеуказанной задачи.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
def __init__(self, t1_col, t2_col):
self.t1_col = t1_col
self.t2_col = t2_col
def fit(self, X, y=None):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
self.col_1 = X[self.t1_col].apply(pd.to_datetime)
self.col_2 = X[self.t2_col].apply(pd.to_datetime)
return self
def transform(self, X):
difference_list = []
difference = self.col_1 - self.col_2
for obj in difference:
difference_list.append(obj.total_seconds())
return np.array(difference_list).reshape(-1,1)
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)
# Using a custom column selecter transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming
# and combining the business features, cycle_1 features as well as time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
('features', FeatureUnion([
('business', business_features),
('survey', cycle_1_features),
('time', time_feature),
])),
('forest', RandomForestRegressor()),
])
# Fitting my pipeline produces no error
survey_model.fit(data, cycle_2_score.astype(int))
# Calling the predict function and passing it into the grader raises a ValueError
grader.score.ml__survey_model(survey_model.predict)
Подогнанный конвейер выглядит следующим образом
Pipeline(memory=None,
steps=[('features',
FeatureUnion(n_jobs=None,
transformer_list=[('business',
FeatureUnion(n_jobs=None,
transformer_list=[('simple',
Pipeline(memory=None,
steps=[('cst',
ColumnSelectTransformer(columns=['BEDCERT',
'RESTOT',
'INHOSP',
'CCRC_FACIL',
'SFF',
'CHOW_LAST_12MOS',
'SPRINKLER_STATUS',
'EXP_TOTAL',
'ADJ_TOTAL'])),
('imputer',
SimpleImpute...
transformer_weights=None, verbose=False)),
('forest',
RandomForestRegressor(bootstrap=True, criterion='mse',
max_depth=None, max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
Некоторый дополнительный контекст: я строю эту модель, чтобы ее метод прогнозирования был передан в пользовательский грейдер для проекта. Грейдер передает список словарей в метод предиката илиgnast_proba моего оценщика, а не в DataFrame. Это означает, что модель должна работать с обоими типами данных. По этой причине мне нужно предоставить собственный ColumnSelectTransformer, чтобы использовать вместо него собственный ColumnTransformer scikit-learn.
Ниже приведен дополнительный код, связанный с бизнес-функциями, и ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and returns the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
return X[self.columns].values
simple_features = Pipeline([
('cst', ColumnSelectTransformer(simple_cols)),
('imputer', SimpleImputer(strategy='mean')),
])
owner_onehot = Pipeline([
('cst', ColumnSelectTransformer(['OWNERSHIP'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
])
cert_onehot = Pipeline([
('cst', ColumnSelectTransformer(['CERTIFICATION'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
])
categorical_features = FeatureUnion([
('owner_onehot', owner_onehot),
('cert_onehot', cert_onehot),
])
business_features = FeatureUnion([
('simple', simple_features),
('categorical', categorical_features)
])
Наконец, ниже приведенполная ошибка
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
96
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
91
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.