ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66) when using a custom transformer with GridSearchCV and a Pipeline
26 April 2020

Below is a custom transformer that takes text data and returns a feature matrix.
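
For completeness, these are the imports the code relies on (assuming gensim for Doc2Vec/TaggedDocument and scikit-learn for the estimator base class, pipeline and grid search shown further down):

import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.base import BaseEstimator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV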

class Doc2VecFeatures(BaseEstimator):
    def __init__(self,dm=0,model_dbow=None,data_tagged_full=None, vector_size=100, negative=5, hs=0, min_count=2, sample = 0):
        self.model_dbow = None
        self.data_tagged_full=None
        self.dm = dm
        self.vector_size = vector_size
        self.negative = negative
        self.hs = hs
        self.min_count = min_count
        self.sample = sample

    def tokenize_text(self,text):
        tokens = []
        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                if len(word) < 2:
                    continue
                tokens.append(word.lower())
        return tokens

    def vec_for_learning(self,model, tagged_docs):
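        # Infer a Doc2Vec vector for each tagged document; only the inferred vectors (regressors) are returned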
        sents = tagged_docs.values
        targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
        return regressors


    def fit(self, text_data, y=None, **kwargs):
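        # Read the terms spreadsheet, append its non-null 'Comment' and 'Description of term' rows to the
        # incoming text, tag every row, and train a DBOW Doc2Vec model on the combined corpus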
        terms_path = r'C:\Users\puboggavarapu\Desktop\work\projects\aws_comprehend\sharefile\Terms\Machine Learning - Standard & Non standard Terms_description.xlsx'
        terms_df = pd.read_excel(terms_path)
        full_text_data = pd.DataFrame(pd.concat([text_data,terms_df[terms_df['Comment'].notnull()]['Comment'],terms_df[terms_df['Description of term'].notnull()]['Description of term']],axis=0))
        full_text_data.columns = ['Text']
        full_text_data = full_text_data.reset_index()
        full_text_data['index'] = np.arange(full_text_data.shape[0])
        self.data_tagged_full = full_text_data.apply(lambda r: TaggedDocument(words=self.tokenize_text(r['Text']), tags=[r['index']]), axis=1)
        self.model_dbow = Doc2Vec(dm=self.dm, vector_size=self.vector_size, negative=self.negative, hs=self.hs, min_count=self.min_count, sample=self.sample)
        self.model_dbow.build_vocab(self.data_tagged_full)

        for epoch in range(1):
            print('iteration {0}'.format(epoch))
            self.model_dbow.train(self.data_tagged_full,
                        total_examples=self.model_dbow.corpus_count,
                        epochs=self.model_dbow.iter)
            # decrease the learning rate
            self.model_dbow.alpha -= 0.0002
            # fix the learning rate, no decay
            self.model_dbow.min_alpha = self.model_dbow.alpha
        return self

    def transform(self,text_data):
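        # NOTE: text_data is not used here; vectors are re-inferred for the tagged documents built in fit(),
        # the last 33 + 33 rows are split off as term vectors, and cosine similarities against them become the 66 features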
        X_data_doc2vec = self.vec_for_learning(self.model_dbow, self.data_tagged_full)
        description_similiarity_features = X_data_doc2vec[-33:]
        X_data_doc2vec = X_data_doc2vec[:len(X_data_doc2vec)-33]
        comment_similiarity_features = X_data_doc2vec[-33:]
        X_data_doc2vec = X_data_doc2vec[:len(X_data_doc2vec)-33]
        comment_similiarity_features = cosine_similarity(X_data_doc2vec,comment_similiarity_features)
        description_similiarity_features = cosine_similarity(X_data_doc2vec,description_similiarity_features)
        X_data_doc2vec_cosine_similiarity = np.hstack((comment_similiarity_features,description_similiarity_features))

        X = pd.DataFrame(X_data_doc2vec_cosine_similiarity, index=text_data.index)
        return X
        #return np.matrix(X_data_doc2vec_cosine_similiarity)

    def fit_transform(self,text_data,y=None):
        self.fit(text_data)
        return self.transform(text_data)

When I call this class's fit_transform method on my data, I get the desired output.

doc2vec = Doc2VecFeatures()
data = doc2vec.fit_transform(preprocessed_data['Text'],preprocessed_data['label'])
data.shape
(8110, 66)

However, when I put this into a pipeline for grid search with cross-validation, I get the error "ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)". (For reference, 6488 + 1622 = 8110, the full dataset size, and 1622 matches the size of one test fold in the 5-fold split.)

# Pipeline: Doc2Vec features followed by a RandomForestClassifier
pipeline = Pipeline([
    ('doc2vec', Doc2VecFeatures()),
    ('clf', RandomForestClassifier()),
])

# Parameter grid for the doc2vec step
parameters = [
    {
        'doc2vec__vector_size': (10,20)
    }
]

gscv = GridSearchCV(pipeline, parameters, n_jobs=12, return_train_score=False, verbose=3, scoring='accuracy')


gscv.fit(preprocessed_data['Text'],preprocessed_data['label'])
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  10 | elapsed:  2.2min remaining:  5.2min
[Parallel(n_jobs=12)]: Done   7 out of  10 | elapsed:  2.2min remaining:   57.1s
---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1681, in create_block_manager_from_blocks
    mgr = BlockManager(blocks, axes)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 143, in __init__
    self._verify_integrity()
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 345, in _verify_integrity
    construction_error(tot_items, block.shape[1:], self.axes)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1719, in construction_error
    "Shape of passed values is {0}, indices imply {1}".format(passed, implied)
ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
    r = call_item()
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\_parallel_backends.py", line 567, in __call__
    return self.func(*args, **kwargs)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py", line 225, in __call__
    for func, args, kwargs in self.items]
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
    for func, args, kwargs in self.items]
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 544, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 591, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    *args, **kwargs)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 205, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\metrics\_scorer.py", line 52, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py", line 116, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py", line 419, in predict
    Xt = transform.transform(Xt)
  File "<ipython-input-13-5792b7ffb56f>", line 59, in transform
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py", line 440, in __init__
    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\construction.py", line 213, in init_ndarray
    return create_block_manager_from_blocks(block_values, [columns, index])
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1688, in create_block_manager_from_blocks
    construction_error(tot_items, blocks[0].shape[1:], axes, e)
  File "c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py", line 1719, in construction_error
    "Shape of passed values is {0}, indices imply {1}".format(passed, implied)
ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-23-fb0ff9325ff1> in <module>
----> 1 gscv.fit(preprocessed_data['Text'],preprocessed_data['label'])

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    708                 return results
    709 
--> 710             self._run_search(evaluate_candidates)
    711 
    712         # For multi-metric evaluation, store the best_index_, best_params_ and

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1149     def _run_search(self, evaluate_candidates):
   1150         """Search all candidates in param_grid"""
-> 1151         evaluate_candidates(ParameterGrid(self.param_grid))
   1152 
   1153 

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    687                                for parameters, (train, test)
    688                                in product(candidate_params,
--> 689                                           cv.split(X, y, groups)))
    690 
    691                 if len(out) < 1:

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
    932 
    933             with self._backend.retrieval_context():
--> 934                 self.retrieve()
    935             # Make sure that we get a last message telling us we are done
    936             elapsed_time = time.time() - self._start_time

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\parallel.py in retrieve(self)
    831             try:
    832                 if getattr(self._backend, 'supports_timeout', False):
--> 833                     self._output.extend(job.get(timeout=self.timeout))
    834                 else:
    835                     self._output.extend(job.get())

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
    519         AsyncResults.get from multiprocessing."""
    520         try:
--> 521             return future.result(timeout=timeout)
    522         except LokyTimeoutError:
    523             raise TimeoutError()

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\concurrent\futures\_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

c:\users\puboggavarapu\appdata\local\programs\python\python37\lib\concurrent\futures\_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

ValueError: Shape of passed values is (6488, 66), indices imply (1622, 66)
...