How to fit Scikit-learn classifiers with very large sparse matrices?
0 votes
/ 22 January 2019

I have built the following pipeline to preprocess the data before passing it to a classifier. The input DataFrame has 100 million rows and 10 columns.

from sklearn import base, pipeline, preprocessing


class ColumnSelector(base.BaseEstimator, base.TransformerMixin):
    """Select a fixed subset of DataFrame columns."""

    def __init__(self, column_names):
        # stored under the same name as the __init__ parameter so that
        # get_params/clone work as sklearn expects
        self.column_names = column_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, self.column_names]


# One-hot encode the categorical columns, then add pairwise interaction crosses.
_categorical_pipeline = pipeline.Pipeline([
    ("selector", ColumnSelector(_categorical_columns)),
    ("one_hot_encoder", preprocessing.OneHotEncoder()),
    ("feature_crosses", preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])

# Standardise the numeric columns.
_numeric_pipeline = pipeline.Pipeline([
    ("selector", ColumnSelector(_numeric_columns)),
    ("standard_scaler", preprocessing.StandardScaler())
])

preprocessing_pipeline = pipeline.FeatureUnion([
    ("categorical_pipeline", _categorical_pipeline),
    ("numeric_pipeline", _numeric_pipeline)
])

Z = preprocessing_pipeline.fit_transform(training_df.drop(target_attribute, axis=1))
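
For reference, with interaction_only=True and include_bias=False, PolynomialFeatures emits n + n*(n-1)/2 output columns for n inputs, which is presumably where most of the ~10,000 columns mentioned below come from once the categoricals are one-hot encoded. A toy check (illustrative data, not my real columns):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# 4 one-hot columns expand to 4 + 4*3/2 = 10 interaction features
demo = np.eye(4)
crossed = PolynomialFeatures(degree=2, interaction_only=True,
                             include_bias=False).fit_transform(demo)
print(crossed.shape)  # (4, 10)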

The resulting sparse matrix Z has 100 million rows and 10,000 columns. I have tried several classifiers (including SGDClassifier and RandomForestClassifier), and each time I get an error. Here is the traceback for SGDClassifier.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-...> in <module>
----> 1 sgd_clf.fit(Z, training_df[target_attribute])

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
    741                          loss=self.loss, learning_rate=self.learning_rate,
    742                          coef_init=coef_init, intercept_init=intercept_init,
--> 743                          sample_weight=sample_weight)
    744 
    745 

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    568 
    569         X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C",
--> 570                          accept_large_sparse=False)
    571         n_samples, n_features = X.shape
    572 

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    754                     ensure_min_features=ensure_min_features,
    755                     warn_on_dtype=warn_on_dtype,
--> 756                     estimator=estimator)
    757     if multi_output:
    758         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    515                                       dtype=dtype, copy=copy,
    516                                       force_all_finite=force_all_finite,
--> 517                                       accept_large_sparse=accept_large_sparse)
    518     else:
    519         # If np.array(..) gives ComplexWarning, then we convert the warning

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
    313 
    314     # Indices dtype validation
--> 315     _check_large_sparse(spmatrix, accept_large_sparse)
    316 
    317     if accept_sparse is False:

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/utils/validation.py in _check_large_sparse(X, accept_large_sparse)
    631                 raise ValueError("Only sparse matrices with 32-bit integer"
    632                                  " indices are accepted. Got %s indices."
--> 633                                  % indices_datatype)
    634 
    635 

ValueError: Only sparse matrices with 32-bit integer indices are accepted. Got int64 indices.
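
The int64 indices are easy to confirm directly on Z: scipy switches the CSR index arrays to 64-bit integers once the number of stored values (or a dimension) no longer fits in an int32. A quick check on the matrix built above:

import numpy as np

print(Z.nnz)                   # number of stored values
print(Z.indices.dtype)         # int64 once the 32-bit range is exceeded
print(np.iinfo(np.int32).max)  # 2147483647 == 2**31 - 1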

Here is the traceback for RandomForestClassifier.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-3c5aa87f477e> in <module>
----> 1 random_forest_clf.fit(Z, training_df[target_attribute])

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    331                     t, self, X, y, sample_weight, i, len(trees),
    332                     verbose=self.verbose, class_weight=self.class_weight)
--> 333                 for i, t in enumerate(trees))
    334 
    335             # Collect newly grown trees

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight)
    117             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    118 
--> 119         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    120     else:
    121         tree.fit(X, y, sample_weight=sample_weight, check_input=False)

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    799             sample_weight=sample_weight,
    800             check_input=check_input,
--> 801             X_idx_sorted=X_idx_sorted)
    802         return self
    803 

~/.conda/envs/sec-data-analytics/lib/python3.7/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    364                                            min_impurity_split)
    365 
--> 366         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    367 
    368         if self.n_outputs_ == 1:

sklearn/tree/_tree.pyx in sklearn.tree._tree.DepthFirstTreeBuilder.build()

sklearn/tree/_tree.pyx in sklearn.tree._tree.DepthFirstTreeBuilder.build()

sklearn/tree/_tree.pyx in sklearn.tree._tree.TreeBuilder._check_input()

ValueError: No support for np.int64 index based sparse matrices

The length of Z.indices is 3377440896, which is greater than 2**31 - 1, so I understand why 64-bit integers are needed to index the array. But then I don't understand why support for 64-bit integer indices is a problem in the first place. Is there a workaround?
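
The only workaround I can think of is to avoid handing the full matrix to fit at once and instead stream row chunks through an incremental learner such as SGDClassifier.partial_fit: each chunk holds far fewer than 2**31 stored values, so its index arrays can be downcast to the int32 that scikit-learn accepts. A minimal sketch (to_int32_csr and batch_size are illustrative, not tested on the real data):

import numpy as np
from sklearn.linear_model import SGDClassifier

def to_int32_csr(chunk):
    # Illustrative helper: once a chunk's nnz fits in 32 bits, its CSR
    # index arrays can safely be downcast to the int32 sklearn requires.
    chunk = chunk.tocsr()
    chunk.indices = chunk.indices.astype(np.int32)
    chunk.indptr = chunk.indptr.astype(np.int32)
    return chunk

y = training_df[target_attribute].values
classes = np.unique(y)
sgd_clf = SGDClassifier()

batch_size = 1_000_000  # illustrative; keep each chunk's nnz under 2**31 - 1
for start in range(0, Z.shape[0], batch_size):
    stop = start + batch_size
    sgd_clf.partial_fit(to_int32_csr(Z[start:stop]), y[start:stop],
                        classes=classes)

This would not help RandomForestClassifier, which has no partial_fit; as the tracebacks show, the 32-bit limit is enforced in scikit-learn's validation and compiled tree code (_check_large_sparse and _tree.pyx), not in scipy itself.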

...