I am working on a multi-label text classification problem (90 target labels in total). The data distribution is long-tailed and the classes are imbalanced. I am working with a sample of 100k records, using a One-vs-Rest (OVR) strategy. Since the dataset is huge, I am trying the partial_fit method. I learned that there were some problems with this earlier, and a similar question was asked back in 2017. I tried partial_fit and found that the same problem still exists, or maybe I am doing it wrong.
Scikit-learn version: 0.22.2.post1
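For context, mlb, TEXT_FEAT, TARGET_LABEL and df_sample_xs are used in the code below but never defined in the post; this is a minimal sketch of the assumed setup (the column names and file path are hypothetical):

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

TEXT_FEAT = 'text'       # hypothetical name of the text column
TARGET_LABEL = 'labels'  # hypothetical label column: comma-separated label strings

# hypothetical path to the 100k-record sample
df_sample_xs = pd.read_csv('sample_100k.csv')

# Fit the binarizer once on the full label set so that every minibatch
# is encoded against the same 90 indicator columns.
mlb = MultiLabelBinarizer()
mlb.fit(df_sample_xs[TARGET_LABEL].str.split(','))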
Code
import itertools
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score, precision_score
from sklearn.multiclass import OneVsRestClassifier


def stream_documents(data=None):
    """Iterate over documents of the dataset.

    Documents are represented as dictionaries.
    """
    for index, row in data.iterrows():
        tmp_dict = dict()
        tmp_dict['text'] = row[TEXT_FEAT]
        tmp_dict['target'] = row[TARGET_LABEL]
        yield tmp_dict


def get_minibatch(doc_iter, size, mlb):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.
    """
    data = [(doc['text'], doc['target'])
            for doc in itertools.islice(doc_iter, size)]
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*data)
    y = pd.Series(data=y)
    # binarize the comma-separated label strings into a 90-column indicator matrix
    y_encoded = mlb.transform(y.str.split(','))
    # print("Y SHAPE : ", np.asarray(y_encoded, dtype=int).shape)
    return X_text, np.asarray(y_encoded, dtype=int)


def iter_minibatches(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size, mlb)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size, mlb)


def progress(cls_name, stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%20s classifier : \t" % cls_name
    s += "%(n_train)6d train docs " % stats
    s += "%(n_test)6d test docs " % test_stats
    s += "Acc: %(accuracy).3f " % stats
    s += "f1: %(f1).3f " % stats
    s += "P: %(p).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18)
data_stream = stream_documents(data=df_sample_xs)  # X, y

partial_fit_classifiers = {
    'SGD': OneVsRestClassifier(SGDClassifier(max_iter=1000, tol=1e-3)),
    'Logistic': OneVsRestClassifier(LogisticRegression(solver='lbfgs',
                                                       max_iter=500)),
}

# test data statistics
test_stats = {'n_test': 0}

# First we hold out a number of examples to estimate accuracy
n_test_documents = 1000
tick = time.time()
X_test_text, y_test = get_minibatch(data_stream, 1000, mlb)
parsing_time = time.time() - tick

tick = time.time()
X_test = vectorizer.transform(X_test_text)
vectorizing_time = time.time() - tick
test_stats['n_test'] += len(y_test)
print("Test set is %d documents" % (len(y_test)))

cls_stats = {}
for cls_name in partial_fit_classifiers:
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0,
             'accuracy_history': [(0, 0)],
             'f1': 0.0,
             'f1_history': [(0, 0)],
             'p': 0.0,
             'p_history': [(0, 0)],
             't0': time.time(),
             'runtime_history': [(0, 0)],
             'total_fit_time': 0.0}
    cls_stats[cls_name] = stats

# discards the next n_test_documents docs from the stream
get_minibatch(data_stream, n_test_documents, mlb)

minibatch_size = 2000
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
total_vect_time = 0.0

# Main loop: iterate on mini-batches of examples
for i, (X_train_text, y_train) in enumerate(minibatch_iterators):
    tick = time.time()
    X_train = vectorizer.transform(X_train_text)
    total_vect_time += time.time() - tick
    # print(X_train.shape, y_train.shape)
    for cls_name, cls in partial_fit_classifiers.items():
        tick = time.time()
        print(cls_name)
        # update estimator with examples in the current mini-batch
        # cls.partial_fit(X_train, y_train, classes=all_classes)
        cls.partial_fit(X_train, y_train,
                        classes=mlb.transform(df_sample_xs[TARGET_LABEL].str.split(',')))
        # accumulate test accuracy stats
        cls_stats[cls_name]['total_fit_time'] += time.time() - tick
        cls_stats[cls_name]['n_train'] += X_train.shape[0]
        cls_stats[cls_name]['n_train_pos'] += sum(y_train)  # per-class positive counts
        tick = time.time()
        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
        # multilabel targets need an explicit average= for f1/precision;
        # the default average='binary' raises a ValueError here
        cls_stats[cls_name]['f1'] = f1_score(y_test, cls.predict(X_test),
                                             average='micro')
        cls_stats[cls_name]['p'] = precision_score(y_test, cls.predict(X_test),
                                                   average='micro')
        cls_stats[cls_name]['prediction_time'] = time.time() - tick
        acc_history = (cls_stats[cls_name]['accuracy'],
                       cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['accuracy_history'].append(acc_history)
        f1_history = (cls_stats[cls_name]['f1'],
                      cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['f1_history'].append(f1_history)
        p_history = (cls_stats[cls_name]['p'],
                     cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['p_history'].append(p_history)
        run_history = (cls_stats[cls_name]['accuracy'],
                       cls_stats[cls_name]['f1'],
                       cls_stats[cls_name]['p'],
                       total_vect_time + cls_stats[cls_name]['total_fit_time'])
        cls_stats[cls_name]['runtime_history'].append(run_history)
        if i % 3 == 0:
            print(progress(cls_name, cls_stats[cls_name]))
    if i % 3 == 0:
        print('\n')
Error
SGD
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-87-cf38c633c6aa> in <module>
31 # update estimator with examples in the current mini-batch
32 # cls.partial_fit(X_train, y_train, classes=all_classes)
---> 33 cls.partial_fit(X_train, y_train, classes=mlb.transform(df_sample_xs[TARGET_LABEL].str.split(',')))
34 # accumulate test accuracy stats
35 cls_stats[cls_name]['total_fit_time'] += time.time() - tick
/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/multiclass.py in partial_fit(self, X, y, classes)
287 self.classes_))
288
--> 289 Y = self.label_binarizer_.transform(y)
290 Y = Y.tocsc()
291 columns = (col.toarray().ravel() for col in Y.T)
/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/preprocessing/_label.py in transform(self, y)
478 y_is_multilabel = type_of_target(y).startswith('multilabel')
479 if y_is_multilabel and not self.y_type_.startswith('multilabel'):
--> 480 raise ValueError("The object was not fitted with multilabel"
481 " input.")
482
ValueError: The object was not fitted with multilabel input.
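Judging from the traceback, OneVsRestClassifier.partial_fit fits its internal LabelBinarizer on classes_ (a flat array of class labels derived from the classes argument) and then calls transform(y) on the multilabel indicator matrix; the check at _label.py line 480 rejects exactly that combination. A minimal sketch that reproduces the same ValueError outside of OneVsRestClassifier:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(sparse_output=True)
lb.fit(np.arange(3))  # fitted on flat class labels -> y_type_ is 'multiclass'

Y = np.array([[1, 0, 1],
              [0, 1, 0]])  # multilabel indicator rows, like mlb.transform output

lb.transform(Y)  # ValueError: The object was not fitted with multilabel input.

So passing classes=mlb.transform(...) does not seem to help: on the first call the binarizer is still fitted on a flat array of class indices rather than on multilabel input, which appears to be the same limitation as in the 2017 question mentioned above.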