Я пытаюсь назначить двоичные значения для 10 ярлыков, используя 3 функции, заголовок статьи, резюме статьи и идентификатор того, кто создал ярлыки. Я застрял при попытке создать модель, которая может принимать все 3 поля в качестве входных данных. В настоящее время это работает, только если я пропущу только одно поле. Я знаю, что, вероятно, что-то напутал с tfidvectorizer, но я не могу понять это. Любая помощь будет оценена. Я получаю сообщение об ошибке (полное отслеживание ниже) обычно составляет
ValueError: Found input variables with inconsistent numbers of samples: [3, 75897].
снимок экрана с фреймом данных
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import string
df = pd.read_csv('../data/homework_clean.csv')
emotion_cols = ['emotion_0', 'emotion_1', 'emotion_2', 'emotion_3', 'emotion_4', 'emotion_5', 'emotion_6', 'emotion_7', 'emotion_8', 'emotion_9']
def removeStopWords(sentence):
global re_stop_words
return re_stop_words.sub(" ", sentence)
def stemming(sentence):
stemSentence = ""
for word in sentence.split():
stem = stemmer.stem(word)
stemSentence += stem
stemSentence += " "
stemSentence = stemSentence.strip()
return stemSentence
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^\w\s]+', '')
df['summary'] = df['summary'].str.lower()
df['summary'] = df['summary'].str.replace(r'[^\w\s]+', '')
stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
df['headline'] = df['headline'].apply(removeStopWords)
df['summary'] = df['summary'].apply(removeStopWords)
stemmer = SnowballStemmer('english')
df['headline'] = df['headline'].apply(stemming)
df['summary'] = df['summary'].apply(stemming)
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state = 42, test_size = .2, shuffle = True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
vectorizer = FeatureUnion([
('headline', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2')),
('summary', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2'))])
x_train = train[['headline', 'summary', 'worker_id']]
y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
x_test = test[['headline', 'summary', 'worker_id']]
y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
# IF I only use one feature it works fine.
# x_train = train['headline']
# y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
# x_test = test['headline']
# y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.multiclass import OneVsRestClassifier
OneVsRest_pipeline = Pipeline(steps = [
('featureunion', vectorizer),
('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
OneVsRest_pipeline.fit(x_train, y_train)
predictions = OneVsRest_pipeline.predict(x_test)
prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
Полная трассировка
ValueError Traceback (most recent call last)
<ipython-input-27-6394288c65f8> in <module>
4 ])
----> 6 OneVsRest_pipeline.fit(x_train, y_train)
7 predictions = OneVsRest_pipeline.predict(x_test)
8 prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
354 self._log_message(len(self.steps) - 1)):
355 if self._final_estimator != 'passthrough':
--> 356 self._final_estimator.fit(Xt, y, **fit_params)
357 return self
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in fit(self, X, y)
214 "not %s" % self.label_binarizer_.classes_[i],
215 self.label_binarizer_.classes_[i]])
--> 216 for i, column in enumerate(columns))
218 return self
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in _fit_binary(estimator, X, y, classes)
78 else:
79 estimator = clone(estimator)
---> 80 estimator.fit(X, y)
81 return estimator
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1532 accept_large_sparse=solver != 'liblinear')
1533 check_classification_targets(y)
1534 self.classes_ = np.unique(y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
727 y = y.astype(np.float64)
--> 729 check_consistent_length(X, y)
731 return X, y
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
203 if len(uniques) > 1:
204 raise ValueError("Found input variables with inconsistent numbers of"
--> 205 " samples: %r" % [int(l) for l in lengths])