I built the following pipeline to preprocess the data before feeding it to a classifier. The input DataFrame contains 100 million rows and 10 columns.
from sklearn import base, pipeline, preprocessing

class ColumnSelector(base.BaseEstimator, base.TransformerMixin):
    """Selects a fixed subset of DataFrame columns."""

    def __init__(self, column_names):
        self._column_names = column_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, self._column_names]
_categorical_pipeline = pipeline.Pipeline([
    ("selector", ColumnSelector(_categorical_columns)),
    ("one_hot_encoder", preprocessing.OneHotEncoder()),
    ("feature_crosses", preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
])

_numeric_pipeline = pipeline.Pipeline([
    ("selector", ColumnSelector(_numeric_columns)),
    ("standard_scaler", preprocessing.StandardScaler()),
])

preprocessing_pipeline = pipeline.FeatureUnion([
    ("categorical_pipeline", _categorical_pipeline),
    ("numeric_pipeline", _numeric_pipeline),
])
Z = preprocessing_pipeline.fit_transform(training_df.drop(target_attribute, axis=1))
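For reference, a quick inspection of Z (a minimal sketch; the comments reflect what I observe on my data) shows that FeatureUnion hands back a scipy CSR matrix whose index arrays have already been promoted to int64:

import scipy.sparse as sp

print(sp.issparse(Z))   # True: the one-hot branch makes the whole union sparse
print(Z.getformat())    # 'csr' on my machine (FeatureUnion hstacks the sparse blocks)
print(Z.shape)          # (100000000, 10000)
print(Z.indices.dtype)  # int64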
The resulting sparse matrix Z has 100 million rows and 10,000 columns. I have tried several classifiers (including SGDClassifier and RandomForestClassifier), and each time I get an error. Here is the traceback for SGDClassifier.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
in <module>
----> 1 sgd_clf.fit(Z, training_df[target_attribute])
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
741 loss=self.loss, learning_rate=self.learning_rate,
742 coef_init=coef_init, intercept_init=intercept_init,
--> 743 sample_weight=sample_weight)
744
745
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
568
569 X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C",
--> 570 accept_large_sparse=False)
571 n_samples, n_features = X.shape
572
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
754 ensure_min_features=ensure_min_features,
755 warn_on_dtype=warn_on_dtype,
--> 756 estimator=estimator)
757 if multi_output:
758 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
515 dtype=dtype, copy=copy,
516 force_all_finite=force_all_finite,
--> 517 accept_large_sparse=accept_large_sparse)
518 else:
519 # If np.array(..) gives ComplexWarning, then we convert the warning
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
313
314 # Indices dtype validation
--> 315 _check_large_sparse(spmatrix, accept_large_sparse)
316
317 if accept_sparse is False:
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in _check_large_sparse(X, accept_large_sparse)
631 raise ValueError("Only sparse matrices with 32-bit integer"
632 " indices are accepted. Got %s indices."
--> 633 % indices_datatype)
634
635
ValueError: Only sparse matrices with 32-bit integer indices are accepted. Got int64 indices.
Here is the traceback for RandomForestClassifier.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-3c5aa87f477e> in <module>
----> 1 random_forest_clf.fit(Z, training_df[target_attribute])
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
331 t, self, X, y, sample_weight, i, len(trees),
332 verbose=self.verbose, class_weight=self.class_weight)
--> 333 for i, t in enumerate(trees))
334
335 # Collect newly grown trees
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight)
117 curr_sample_weight *= compute_sample_weight('balanced', y, indices)
118
--> 119 tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
120 else:
121 tree.fit(X, y, sample_weight=sample_weight, check_input=False)
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
799 sample_weight=sample_weight,
800 check_input=check_input,
--> 801 X_idx_sorted=X_idx_sorted)
802 return self
803
~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
364 min_impurity_split)
365
--> 366 builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
367
368 if self.n_outputs_ == 1:
sklearn/tree/_tree.pyx in sklearn.tree._tree.DepthFirstTreeBuilder.build()
sklearn/tree/_tree.pyx in sklearn.tree._tree.DepthFirstTreeBuilder.build()
sklearn/tree/_tree.pyx in sklearn.tree._tree.TreeBuilder._check_input()
ValueError: No support for np.int64 index based sparse matrices
The length of Z.indices is 3377440896, which is greater than 2**31 - 1, so I understand why 64-bit integers would be needed to index the array. But then I don't understand why support for 64-bit integer indices is a problem in the first place. Is there a workaround?
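For concreteness, the arithmetic behind that claim (a minimal check against the int32 limit):

import numpy as np

print(len(Z.indices))          # 3377440896 stored column indices
print(np.iinfo(np.int32).max)  # 2147483647 == 2**31 - 1
# 3377440896 > 2147483647, so scipy must store Z.indices as int64,
# and check_X_y(..., accept_large_sparse=False) then rejects the matrix.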