Я хочу использовать метод CV для настройки гиперпараметров моего случайного лесного классификатора. На этом этапе я был бы рад просто настроить идеальное количество деревьев, которые я должен вырастить (n_estimators).
Моя входная переменная - это строка текста (независимая переменная) и метка (dep var). Где я запутался, где вступает в игру TfidfVectorizer?
Я искал образец кода, но пока не нашел ничего полезного.
from sklearn.model_selection import cross_validate
pipeline_cv = Pipeline([
('bow', TfidfVectorizer(analyzer=text_process)),
('tfidf', TfidfTransformer()),
('classifier', RandomForestClassifier()),
])
parameters = {
'n_estimators' : [10,50,200],
'random_state' : [108],
'min_samples_leaf' : [2,3,5],
'min_samples_split' : [2,3,5]
}
clf = cross_validate(pipeline_cv, parameters)
clf.fit(text_train, value_train)
Я получил следующую ошибку. Я не уверен, что вызвало KeyError.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-367-e5e6e92c059d> in <module>
14 }
15
---> 16 clf = cross_validate(pipeline_cv, parameters)
17 clf.fit(text_train, value_train)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
229 return_times=True, return_estimator=return_estimator,
230 error_score=error_score)
--> 231 for train, test in cv.split(X, y, groups))
232
233 zipped_scores = list(zip(*scores))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
503 start_time = time.time()
504
--> 505 X_train, y_train = _safe_split(estimator, X, y, train)
506 X_test, y_test = _safe_split(estimator, X, y, test, train)
507
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
199 X_subset = X[np.ix_(indices, train_indices)]
200 else:
--> 201 X_subset = safe_indexing(X, indices)
202
203 if y is not None:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
221 return X[indices]
222 else:
--> 223 return [X[idx] for idx in indices]
224
225
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\__init__.py in <listcomp>(.0)
221 return X[indices]
222 else:
--> 223 return [X[idx] for idx in indices]
224
225
KeyError: 2