Я изменил некоторый код Python
из github
для запуска регрессии logisti c на подмножестве данных жалоб потребителей, используя следующий код, части векторизации и классификации текста работают гладко. Но мне интересно, возможно ли также включить нетекстовые, двоичные числовые индикаторы, такие как timely_response
и consumer_disputed.
в качестве функций (наряду с текстовыми векторами)? Однако, когда я сделал это, Python
возвращает ошибку, говорящую, что у меня есть input variables with inconsistent numbers of samples
.
%% load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
df = df[pd.notnull(df['consumer_complaint_narrative'])]
df['product'].value_counts()
%% cleaning text
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
"""
text: a string
return: modified initial string
"""
text = BeautifulSoup(text, "lxml").text # HTML decoding
text = text.lower() # lowercase text
text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
return text
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)
%% include only text as features
X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)
%% fit and test with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
logreg = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
my_tags = ['Debt collection','Mortgage','Credit reporting','Credit card','Bank account or service','Consumer Loan','Student loan','Payday loan','Money transfers','Other financial service','Prepaid card']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
%% including binary numerical indicators as additional features
new_X = df[['consumer_complaint_narrative', 'timely_response', 'consumer_disputed.']]
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)
%% fit and test again
logreg = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
, который возвращает следующее сообщение об ошибке
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-455c8fd83ba4> in <module>
8 ('clf', LogisticRegression(n_jobs=1, C=1e5)),
9 ])
---> 10 logreg.fit(X_train, y_train)
11
12 y_pred = logreg.predict(X_test)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
--> 267 self._final_estimator.fit(Xt, y, **fit_params)
268 return self
269
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1286
1287 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1288 accept_large_sparse=solver != 'liblinear')
1289 check_classification_targets(y)
1290 self.classes_ = np.unique(y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
764 y = y.astype(np.float64)
765
--> 766 check_consistent_length(X, y)
767
768 return X, y
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
233 if len(uniques) > 1:
234 raise ValueError("Found input variables with inconsistent numbers of"
--> 235 " samples: %r" % [int(l) for l in lengths])
236
237
ValueError: Found input variables with inconsistent numbers of samples: [3, 529]
Буду очень признателен, если кто-нибудь сможет пролить на это свет.