Out-of-core learning for a multi-label text classification problem
0 votes
/ April 21, 2020

I am working on a multi-label text classification problem (90 target labels in total). The label distribution is long-tailed and heavily imbalanced. I am working with a sample of 100k records, using the One-vs-Rest (OVR) strategy. Since the dataset is huge, I am trying the partial_fit method. I learned that there were some issues with this earlier, and a similar question was asked back in 2017. I tried partial_fit and found that the same problem still exists, or perhaps I am doing it wrong.

Scikit-learn version: 0.22.2.post1

Code
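For context, the code below relies on a MultiLabelBinarizer mlb fitted once on the full label set, and on the TEXT_FEAT / TARGET_LABEL column constants. A minimal sketch of that setup (the literal column names here are placeholders, not my real ones):

from sklearn.preprocessing import MultiLabelBinarizer

TEXT_FEAT = 'text'       # placeholder: column with the raw document text
TARGET_LABEL = 'labels'  # placeholder: column with comma-separated label strings

# Fit on the full label set once, so every minibatch is encoded into the
# same 90-column indicator matrix.
mlb = MultiLabelBinarizer()
mlb.fit(df_sample_xs[TARGET_LABEL].str.split(','))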

import itertools
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score, precision_score
from sklearn.multiclass import OneVsRestClassifier

def stream_documents(data=None):
    """Iterate over documents of the dataset.

    Documents are represented as dictionaries.
    """

    for _, row in data.iterrows():
      tmp_dict = dict()
      tmp_dict['text'] = row[TEXT_FEAT]
      tmp_dict['target'] = row[TARGET_LABEL]
      yield tmp_dict

def get_minibatch(doc_iter, size, mlb):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.
    """
    data = [(doc['text'], doc['target'])
            for doc in itertools.islice(doc_iter, size)]
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*data)
    y = pd.Series(data=y)
    y_encoded = mlb.transform(y.str.split(','))
#     print("Y SHAPE : ",np.asarray(y_encoded,dtype=int).shape)
    return X_text, np.asarray(y_encoded,dtype=int)

def iter_minibatches(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size, mlb)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size, mlb)

def progress(cls_name, stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%20s classifier : \t" % cls_name
    s += "%(n_train)6d train docs " % stats
    s += "%(n_test)6d test docs  " % test_stats
    s += "Acc: %(accuracy).3f " % stats
    s += "f1: %(f1).3f " % stats
    s += "P: %(p).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s

vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18)
data_stream = stream_documents(data=df_sample_xs) # X, y
partial_fit_classifiers = {
    'SGD': OneVsRestClassifier(SGDClassifier(max_iter=1000, tol=1e-3)),
    # NOTE: LogisticRegression has no partial_fit, so the OneVsRestClassifier
    # wrapper will not expose partial_fit for it either
    'Logistic': OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=500))
}

# test data statistics
test_stats = {'n_test': 0}

# First we hold out a number of examples to estimate accuracy
n_test_documents = 1000
tick = time.time()
X_test_text, y_test = get_minibatch(data_stream, n_test_documents, mlb)
parsing_time = time.time() - tick
tick = time.time()
X_test = vectorizer.transform(X_test_text)
vectorizing_time = time.time() - tick
test_stats['n_test'] += len(y_test)

print("Test set is %d documents" % (len(y_test)))

cls_stats = {}

for cls_name in partial_fit_classifiers:
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 
             'accuracy_history': [(0, 0)],
             'f1': 0.0,
             'f1_history': [(0,0)],
             'p': 0.0,
             'p_history': [(0,0)],
             't0': time.time(),
             'runtime_history': [(0, 0)],
             'total_fit_time': 0.0}
    cls_stats[cls_name] = stats

# pull (and discard) one more batch of documents from the stream
get_minibatch(data_stream, n_test_documents, mlb)
minibatch_size = 2000
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
total_vect_time = 0.0

# Main loop: iterate on mini-batches of examples
for i, (X_train_text, y_train) in enumerate(minibatch_iterators):
    tick = time.time()
    X_train = vectorizer.transform(X_train_text)
    total_vect_time += time.time() - tick
#     print(X_train.shape,y_train.shape)

    for cls_name, cls in partial_fit_classifiers.items():
        tick = time.time()
        print(cls_name)
        # update estimator with examples in the current mini-batch
        # cls.partial_fit(X_train, y_train, classes=all_classes)
        cls.partial_fit(X_train, y_train, classes=mlb.transform(df_sample_xs[TARGET_LABEL].str.split(',')))
        # accumulate test accuracy stats
        cls_stats[cls_name]['total_fit_time'] += time.time() - tick
        cls_stats[cls_name]['n_train'] += X_train.shape[0]
        # y_train is a 2-D indicator matrix, so sum over all entries
        cls_stats[cls_name]['n_train_pos'] += int(y_train.sum())
        tick = time.time()
        y_pred = cls.predict(X_test)
        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
        # multilabel targets need an explicit averaging mode for f1/precision
        cls_stats[cls_name]['f1'] = f1_score(y_test, y_pred, average='micro')
        cls_stats[cls_name]['p'] = precision_score(y_test, y_pred, average='micro')
        cls_stats[cls_name]['prediction_time'] = time.time() - tick
        acc_history = (cls_stats[cls_name]['accuracy'],cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['accuracy_history'].append(acc_history)
        f1_history = (cls_stats[cls_name]['f1'],cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['f1_history'].append(f1_history)
        p_history = (cls_stats[cls_name]['p'],cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['p_history'].append(p_history)

        run_history = (cls_stats[cls_name]['accuracy'],
                       cls_stats[cls_name]['f1'],
                       cls_stats[cls_name]['p'],
                       total_vect_time + cls_stats[cls_name]['total_fit_time'])
        cls_stats[cls_name]['runtime_history'].append(run_history)
        if i % 3 == 0:
            print(progress(cls_name, cls_stats[cls_name]))
    if i % 3 == 0:
        print('\n')

Error

SGD
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-87-cf38c633c6aa> in <module>
     31         # update estimator with examples in the current mini-batch
     32         # cls.partial_fit(X_train, y_train, classes=all_classes)
---> 33         cls.partial_fit(X_train, y_train, classes=mlb.transform(df_sample_xs[TARGET_LABEL].str.split(',')))
     34         # accumulate test accuracy stats
     35         cls_stats[cls_name]['total_fit_time'] += time.time() - tick

/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
    114 
    115         # lambda, but not partial, allows help() to work with update_wrapper
--> 116         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
    117         # update the docstring of the returned function
    118         update_wrapper(out, self.fn)

/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/multiclass.py in partial_fit(self, X, y, classes)
    287                                                              self.classes_))
    288 
--> 289         Y = self.label_binarizer_.transform(y)
    290         Y = Y.tocsc()
    291         columns = (col.toarray().ravel() for col in Y.T)

/opt/virtual_env/py3/lib/python3.6/site-packages/sklearn/preprocessing/_label.py in transform(self, y)
    478         y_is_multilabel = type_of_target(y).startswith('multilabel')
    479         if y_is_multilabel and not self.y_type_.startswith('multilabel'):
--> 480             raise ValueError("The object was not fitted with multilabel"
    481                              " input.")
    482 

ValueError: The object was not fitted with multilabel input.
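As far as I can tell from sklearn/multiclass.py, the failure happens because OneVsRestClassifier.partial_fit fits its internal LabelBinarizer on the 1-D array of classes on the first call, so the binarizer is fitted as multiclass and then rejects a 2-D multilabel indicator y; passing the binarized matrix as classes= does not change that. The workaround I am experimenting with is to bypass OneVsRestClassifier and train one incremental binary SGDClassifier per label myself. A minimal sketch, reusing mlb, vectorizer, data_stream and iter_minibatches from above (label_models and y_pred are my own names):

import numpy as np
from sklearn.linear_model import SGDClassifier

# one independent binary incremental learner per label
label_models = [SGDClassifier(max_iter=1000, tol=1e-3)
                for _ in range(len(mlb.classes_))]

for X_train_text, y_train in iter_minibatches(data_stream, minibatch_size):
    X_train = vectorizer.transform(X_train_text)
    for j, model in enumerate(label_models):
        # column j of the indicator matrix is a plain binary target,
        # so classes=[0, 1] is valid on every partial_fit call
        model.partial_fit(X_train, y_train[:, j], classes=[0, 1])

# stack per-label predictions back into an indicator matrix for scoring
y_pred = np.column_stack([model.predict(X_test) for model in label_models])

As far as I understand, this is what OneVsRestClassifier does internally at fit time anyway, minus the label-binarizer bookkeeping that breaks under partial_fit, but I would still like to know whether the wrapper itself is supposed to support this.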