Problem using the Google Universal Sentence Encoder in a scikit-learn pipeline (TypeError: can't pickle _thread.RLock objects)
0 votes
/ January 7, 2020

I am using the Google Universal Sentence Encoder inside a scikit-learn pipeline, and I run into the following error: TypeError: can't pickle _thread.RLock objects. I think the cause is the deep copy of the TensorFlow object that holds the Universal Sentence Encoder. Here are the relevant parts of the code and the error:

import tensorflow_hub as hub

module = hub.Module(href)  # TF1-style hub API; href is defined elsewhere, and this object is later passed to UnivEmbedding
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print("module %s loaded" % module_url)

def embed(input):
  return model(input)
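# For context (not in my original snippet): USE v4 maps each sentence to a
# 512-dimensional vector, so for example:
#   embed(["hello world"])   # expected to return a tensor of shape (1, 512)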

from sklearn.base import BaseEstimator, TransformerMixin

class UnivEmbedding(BaseEstimator, TransformerMixin):
  '''
  Universal Sentence Encoder embedding step for a pipeline.
  NOTE: to clone or save the class, set module=None,
  e.g. pipe_svm_clf.set_params(embed__module=None)
  '''
  # Class constructor
  def __init__(self, module, use_light=True, verbose=False):
    self.module = module
    self.use_light = use_light
    self.verbose = verbose

  # Nothing to fit; return self
  def fit(self, X, y=None):
    return self

  # Turn the input texts into embedding vectors
  def transform(self, X, y=None):
    return embed(X)  # universal_embedding(self.module, X, self.use_light, self.verbose)

  def fit_transform(self, X, y=None):
    if self.verbose: print(self.module)
    return embed(X)  # universal_embedding(self.module, X, self.use_light, self.verbose)

  def get_params(self, deep=True):
    return {"module": self.module, "use_light": self.use_light, "verbose": self.verbose}

  def set_params(self, **parameters):
    for parameter, value in parameters.items():
        setattr(self, parameter, value)
    return self
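For what it's worth, the module=None note in the docstring points at the underlying constraint: everything returned by get_params() has to be clonable/picklable. A stripped-down variant that exposes only the module URL as a constructor parameter and loads the model lazily would look roughly like this (a sketch only, reusing the imports above; UnivEmbeddingLazy is a hypothetical name, not part of my actual code, and I have not verified that it avoids the error):

class UnivEmbeddingLazy(BaseEstimator, TransformerMixin):
  '''
  Same role as UnivEmbedding, but get_params() only exposes the URL string,
  so clone()/deepcopy never touch the loaded TF Hub model.
  '''
  def __init__(self, module_url="https://tfhub.dev/google/universal-sentence-encoder/4"):
    self.module_url = module_url
    self._model = None  # not an __init__ parameter, so BaseEstimator.get_params() ignores it

  def _load(self):
    # Load the TF Hub model on first use and cache it on the instance
    if self._model is None:
      self._model = hub.load(self.module_url)
    return self._model

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    return self._load()(X)  # same call as embed(X) above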

clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(
        bootstrap=True, ccp_alpha=0.0, class_weight=None,
        criterion='gini', max_depth=2, max_features=None,
        max_leaf_nodes=2, max_samples=None,
        min_impurity_decrease=0.0, min_impurity_split=None,
        min_samples_leaf=2, min_samples_split=3,
        min_weight_fraction_leaf=0.0, n_estimators=200,
        n_jobs=None, oob_score=False, random_state=0, verbose=0,
        warm_start=False),
    passthrough=False)

... ...

for train_index, test_index in cv.split(messages, y_real):

  univ_emb= UnivEmbedding(module, use_light=False, verbose=False)

  X_train= np.array(messages)[train_index]
  y_train= y_real[train_index]
  X_test= np.array(messages)[test_index]
  y_test= y_real[test_index]

  pipe_rnd_clf= Pipeline(
      [("embed", univ_emb) ,
      ("rnd_clf", clone(rnd_clf))])


  estimators = [
          ('rnd', pipe_rnd_clf),
          ('ada', pipe_ada_rnd_clf),
          ('bag', pipe_rnd_clf_bag)
  ]

  clf = StackingClassifier(
      estimators=estimators,
      final_estimator=RandomForestClassifier(
          bootstrap=True, ccp_alpha=0.0, class_weight=None,
          criterion='gini', max_depth=2, max_features=None,
          max_leaf_nodes=2, max_samples=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=2, min_samples_split=3,
          min_weight_fraction_leaf=0.0, n_estimators=200,
          n_jobs=None, oob_score=False, random_state=0, verbose=0,
          warm_start=False),
      passthrough=False)

  print(clf.named_estimators)

  clf.fit(X_train, y_train)

I get the following error:

---------------------------------------------------------------------------
Empty                                     Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    796             try:
--> 797                 tasks = self._ready_batches.get(block=False)
    798             except queue.Empty:

22 frames
/usr/lib/python3.6/queue.py in get(self, block, timeout)
    160                 if not self._qsize():
--> 161                     raise Empty
    162             elif timeout is None:

Empty: 

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-30-6a97c8d96f75> in <module>()
     45   print(clf.named_estimators)
     46 
---> 47   clf.fit(X_train, y_train)
     48 
     49   y_pred_test = clf.predict(X_test)

/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    411         self._le = LabelEncoder().fit(y)
    412         self.classes_ = self._le.classes_
--> 413         return super().fit(X, self._le.transform(y), sample_weight)
    414 
    415     @if_delegate_has_method(delegate='final_estimator_')

/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    139         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
    140             delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight)
--> 141             for est in all_estimators if est != 'drop'
    142         )
    143 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    806                 big_batch_size = batch_size * n_jobs
    807 
--> 808                 islice = list(itertools.islice(iterator, big_batch_size))
    809                 if len(islice) == 0:
    810                     return False

/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/_stacking.py in <genexpr>(.0)
    139         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
    140             delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight)
--> 141             for est in all_estimators if est != 'drop'
    142         )
    143 

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     69     new_object_params = estimator.get_params(deep=False)
     70     for name, param in new_object_params.items():
---> 71         new_object_params[name] = clone(param, safe=False)
     72     new_object = klass(**new_object_params)
     73     params_set = new_object.get_params(deep=False)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     69     new_object_params = estimator.get_params(deep=False)
     70     for name, param in new_object_params.items():
---> 71         new_object_params[name] = clone(param, safe=False)
     72     new_object = klass(**new_object_params)
     73     params_set = new_object.get_params(deep=False)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:
---> 62             return copy.deepcopy(estimator)
     63         else:
     64             raise TypeError("Cannot clone object '%s' (type %s): "

/usr/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
    178                     y = x
    179                 else:
--> 180                     y = _reconstruct(x, memo, *rv)
    181 
    182     # If is its own copy, don't memoize.

/usr/lib/python3.6/copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
    278     if state is not None:
    279         if deep:
--> 280             state = deepcopy(state, memo)
    281         if hasattr(y, '__setstate__'):
    282             y.__setstate__(state)

/usr/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
    148     copier = _deepcopy_dispatch.get(cls)
    149     if copier:
--> 150         y = copier(x, memo)
    151     else:
    152         try:

/usr/lib/python3.6/copy.py in _deepcopy_dict(x, memo, deepcopy)
    238     memo[id(x)] = y
    239     for key, value in x.items():
--> 240         y[deepcopy(key, memo)] = deepcopy(value, memo)
    241     return y
    242 d[dict] = _deepcopy_dict

/usr/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
    178                     y = x
    179                 else:
--> 180                     y = _reconstruct(x, memo, *rv)
    181 
    182     # If is its own copy, don't memoize.

/usr/lib/python3.6/copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
    278     if state is not None:
    279         if deep:
--> 280             state = deepcopy(state, memo)
    281         if hasattr(y, '__setstate__'):
    282             y.__setstate__(state)

/usr/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
    148     copier = _deepcopy_dispatch.get(cls)
    149     if copier:
--> 150         y = copier(x, memo)
    151     else:
    152         try:

/usr/lib/python3.6/copy.py in _deepcopy_dict(x, memo, deepcopy)
    238     memo[id(x)] = y
    239     for key, value in x.items():
--> 240         y[deepcopy(key, memo)] = deepcopy(value, memo)
    241     return y
    242 d[dict] = _deepcopy_dict

/usr/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
    167                     reductor = getattr(x, "__reduce_ex__", None)
    168                     if reductor:
--> 169                         rv = reductor(4)
    170                     else:
    171                         reductor = getattr(x, "__reduce__", None)

TypeError: can't pickle _thread.RLock objects

I believe the problem is the call to:

from sklearn.base import clone
clone(univ_emb)
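If my reading is right, the failure should be reproducible without sklearn at all, because clone() falls back to copy.deepcopy for parameters that are not estimators themselves, which is what the last frames of the traceback show. A two-line check, as a sketch (I have not run it in isolation):

import copy
copy.deepcopy(module)  # expected to raise the same TypeError: can't pickle _thread.RLock objects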

Does anyone have any ideas? Thanks
