Ниже приведен пользовательский преобразователь, который принимает текстовые данные и возвращает матрицу объектов.
class Doc2VecFeatures(BaseEstimator):
def __init__(self,dm=0,model_dbow=None,data_tagged_full=None, vector_size=100, negative=5, hs=0, min_count=2, sample = 0):
self.model_dbow = None
self.data_tagged_full=None
self.dm = dm
self.vector_size = vector_size
self.negative = negative
self.hs = hs
self.min_count = min_count
self.sample = sample
def tokenize_text(self,text):
tokens = []
for sent in nltk.sent_tokenize(text):
for word in nltk.word_tokenize(sent):
if len(word) < 2:
continue
tokens.append(word.lower())
return tokens
def vec_for_learning(self,model, tagged_docs):
sents = tagged_docs.values
targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
return regressors
def fit(self, text_data, y=None, **kwargs):
terms_path = r'C:\Users\puboggavarapu\Desktop\work\projects\aws_comprehend\sharefile\Terms\Machine Learning - Standard & Non standard Terms_description.xlsx'
terms_df = pd.read_excel(terms_path)
full_text_data = pd.DataFrame(pd.concat([text_data,terms_df[terms_df['Comment'].notnull()]['Comment'],terms_df[terms_df['Description of term'].notnull()]['Description of term']],axis=0))
full_text_data.columns = ['Text']
full_text_data = full_text_data.reset_index()
full_text_data['index'] = np.arange(full_text_data.shape[0])
self.data_tagged_full = full_text_data.apply(lambda r: TaggedDocument(words=self.tokenize_text(r['Text']), tags=[r['index']]), axis=1)
self.model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample = 0)
self.model_dbow.build_vocab(self.data_tagged_full)
for epoch in range(1):
print('iteration {0}'.format(epoch))
self.model_dbow.train(self.data_tagged_full,
total_examples=self.model_dbow.corpus_count,
epochs=self.model_dbow.iter)
# decrease the learning rate
self.model_dbow.alpha -= 0.0002
# fix the learning rate, no decay
self.model_dbow.min_alpha = self.model_dbow.alpha
return self
def transform(self,text_data):
X_data_doc2vec = self.vec_for_learning(self.model_dbow, self.data_tagged_full)
description_similiarity_features = X_data_doc2vec[-33:]
X_data_doc2vec = X_data_doc2vec[:len(X_data_doc2vec)-33]
comment_similiarity_features = X_data_doc2vec[-33:]
X_data_doc2vec = X_data_doc2vec[:len(X_data_doc2vec)-33]
comment_similiarity_features = cosine_similarity(X_data_doc2vec,comment_similiarity_features)
description_similiarity_features = cosine_similarity(X_data_doc2vec,description_similiarity_features)
X_data_doc2vec_cosine_similiarity = np.hstack((comment_similiarity_features,description_similiarity_features))
X = pd.DataFrame(X_data_doc2vec_cosine_similiarity, index=text_data.index)
return X
#return np.matrix(X_data_doc2vec_cosine_similiarity)
def fit_transform(self,text_data,y=None):
self.fit(text_data)
return self.transform(text_data)
Когда я использую метод fit_transform этого класса для данных, я получаю желаемый вывод.
doc2vec = Doc2VecFeatures()
data = doc2vec.fit_transform(preprocessed_data['Text'],preprocessed_data['label'])
data.shape
(8110, 66)
Однако, когда я передаю это в конвейер для gridsearch с перекрестной проверкой, я получаю ошибку «ValueError: Форма переданных значений (6488, 66), индексы подразумевают (1622, 66)»
#Pipeline where tfidf is followed by model selector
pipeline = Pipeline([
('doc2vec', Doc2VecFeatures()),
('clf', RandomForestClassifier()),
])
#Models and Model parameters for clf_switcher
# parameters for tfidf and classifier
parameters = [
{
'doc2vec__vector_size': (10,20)
}
]
gscv = GridSearchCV(pipeline,parameters, n_jobs=12, return_train_score=False, verbose=3,scoring='accuracy')
gscv.fit(preprocessed_data['Text'],preprocessed_data['label'])
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 3 out of 10 | elapsed: 2.2min remaining: 5.2min
[Parallel(n_jobs=12)]: Done 7 out of 10 | elapsed: 2.2min remaining: 57.1s
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1681, in create_block_manager_from_blocks
mgr = BlockManager(blocks, axes)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 143, in __init__
self._verify_integrity()
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 345, in _verify_integrity
construction_error(tot_items, block.shape[1:], self.axes)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1719, in construction_error
"Shape of passed values is {0}, indices imply {1}".format(passed, implied)
ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 544, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 591, in _score
scores = scorer(estimator, X_test, y_test)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
*args, **kwargs)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 205, in _score
y_pred = method_caller(estimator, "predict", X)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 52, in _cached_call
return getattr(estimator, method)(*args, **kwargs)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py", line 116, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py", line 419, in predict
Xt = transform.transform(Xt)
File "<ipython-input-13-5792b7ffb56f>", line 59, in transform
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py", line 440, in __init__
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\construction.py", line 213, in init_ndarray
return create_block_manager_from_blocks(block_values, [columns, index])
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1688, in create_block_manager_from_blocks
construction_error(tot_items, blocks[0].shape[1:], axes, e)
File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1719, in construction_error
"Shape of passed values is {0}, indices imply {1}".format(passed, implied)
ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-23-fb0ff9325ff1> in <module>
----> 1 gscv.fit(preprocessed_data['Text'],preprocessed_data['label'])
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1149 def _run_search(self, evaluate_candidates):
1150 """Search all candidates in param_grid"""
-> 1151 evaluate_candidates(ParameterGrid(self.param_grid))
1152
1153
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
687 for parameters, (train, test)
688 in product(candidate_params,
--> 689 cv.split(X, y, groups)))
690
691 if len(out) < 1:
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
932
933 with self._backend.retrieval_context():
--> 934 self.retrieve()
935 # Make sure that we get a last message telling us we are done
936 elapsed_time = time.time() - self._start_time
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)