Как передать несколько текстовых столбцов в Logisti c Регрессия для классификации по нескольким меткам - PullRequest
0 голосов
/ 23 февраля 2020

Я пытаюсь назначить двоичные значения для 10 ярлыков, используя 3 функции, заголовок статьи, резюме статьи и идентификатор того, кто создал ярлыки. Я застрял при попытке создать модель, которая может принимать все 3 поля в качестве входных данных. В настоящее время это работает, только если я пропущу только одно поле. Я знаю, что, вероятно, что-то напутал с tfidvectorizer, но я не могу понять это. Любая помощь будет оценена. Я получаю сообщение об ошибке (полное отслеживание ниже) обычно составляет

ValueError: Found input variables with inconsistent numbers of samples: [3, 75897].

снимок экрана с фреймом данных

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import string

df = pd.read_csv('../data/homework_clean.csv')

emotion_cols = ['emotion_0', 'emotion_1', 'emotion_2', 'emotion_3', 'emotion_4', 'emotion_5', 'emotion_6', 'emotion_7', 'emotion_8', 'emotion_9']


def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^\w\s]+', '')
df['summary'] = df['summary'].str.lower()
df['summary'] = df['summary'].str.replace(r'[^\w\s]+', '')


stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)

df['headline'] = df['headline'].apply(removeStopWords)
df['summary'] = df['summary'].apply(removeStopWords)

stemmer = SnowballStemmer('english')

df['headline'] = df['headline'].apply(stemming)
df['summary'] = df['summary'].apply(stemming)

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state = 42, test_size = .2, shuffle = True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion


vectorizer = FeatureUnion([
    ('headline', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2')),
    ('summary', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2'))])


x_train = train[['headline', 'summary', 'worker_id']]
y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)

x_test = test[['headline', 'summary', 'worker_id']]
y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)

# IF I only use one feature it works fine.
# x_train = train['headline']
# y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)

# x_test = test['headline']
# y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)


from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.multiclass import OneVsRestClassifier

OneVsRest_pipeline = Pipeline(steps = [
    ('featureunion', vectorizer),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])

OneVsRest_pipeline.fit(x_train, y_train)
predictions = OneVsRest_pipeline.predict(x_test)
prediction_prob = OneVsRest_pipeline.predict_proba(x_test)

Полная трассировка

ValueError                                Traceback (most recent call last)
<ipython-input-27-6394288c65f8> in <module>
      4 ])
      5 
----> 6 OneVsRest_pipeline.fit(x_train, y_train)
      7 predictions = OneVsRest_pipeline.predict(x_test)
      8 prediction_prob = OneVsRest_pipeline.predict_proba(x_test)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    354                                  self._log_message(len(self.steps) - 1)):
    355             if self._final_estimator != 'passthrough':
--> 356                 self._final_estimator.fit(Xt, y, **fit_params)
    357         return self
    358 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in fit(self, X, y)
    214                 "not %s" % self.label_binarizer_.classes_[i],
    215                 self.label_binarizer_.classes_[i]])
--> 216             for i, column in enumerate(columns))
    217 
    218         return self

C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
    919             # remaining jobs.
    920             self._iterating = False
--> 921             if self.dispatch_one_batch(iterator):
    922                 self._iterating = self._original_iterator is not None
    923 

C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in _fit_binary(estimator, X, y, classes)
     78     else:
     79         estimator = clone(estimator)
---> 80         estimator.fit(X, y)
     81     return estimator
     82 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1530 
   1531         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1532                          accept_large_sparse=solver != 'liblinear')
   1533         check_classification_targets(y)
   1534         self.classes_ = np.unique(y)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    727         y = y.astype(np.float64)
    728 
--> 729     check_consistent_length(X, y)
    730 
    731     return X, y

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    203     if len(uniques) > 1:
    204         raise ValueError("Found input variables with inconsistent numbers of"
--> 205                          " samples: %r" % [int(l) for l in lengths])
    206 
    207 
...