Свяжите слой с самим собой - PullRequest
0 голосов
/ 02 августа 2020

Я учусь работать с категориальными данными, следуя примерам кода той же старой сущности, встраиваемой Абхишеком Тхакуром . Он использует Visual Studio One в качестве редактора python, а я использую блокнот Google Colab. Код тот же, но при конкатенации возникает ошибка. Согласно документации тензора:

layers.Concatenate(axis = along which to concatenate)

В примере кода используется:

x=layers.Concatenate()(outputs)

без упоминания оси, с которой предполагается конкатенация. Используя код как есть, я получаю сообщение об ошибке:

---------------------------------------------------------------------------
      ValueError                                Traceback (most recent call last)
       <ipython-input-15-8d7152b44077> in <module>()
       ----> 1 run(0)

        5 frames
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/layers/merge.py in        build(self, input_shape)
491     # Used purely for shape validation.
492     if not isinstance(input_shape[0], tuple) or len(input_shape) < 2:
--> 493       raise ValueError('A `Concatenate` layer should be called '
494                        'on a list of at least 2 inputs')
495     if all(shape is None for shape in input_shape):

ValueError: A `Concatenate` layer should be called on a list of at least 2 inputs

Я получаю ту же ошибку, даже когда добавляю ось аргумента = 1.

Вот мой пример кода:

import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import Model 
from tensorflow.keras.models import load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

def create_model(data, catcols):
  #init lit of inputs for embedding
  inputs = []

  #init list of outputs for embeddings
  outputs = []

  #loop over all categorical colums

  for c in catcols:
      #find the number of unique values in the column
      num_unique_values = int(data[c].nunique())
      #simple dimension of embedding calculator
      #min size is half of the number of unique values
      #max size is 50. max size depends on the number of unique
      #categories too. 
      embed_dim = int(min(np.ceil((num_unique_values)/2), 50))

      #simple keras input layer with size 1
      inp = layers.Input(shape=(1, ))

      #add embedding layer to raw input 
      #embedding size is always 1 more than uique values in input 
      out = layers.Embedding(num_unique_values + 1, embed_dim, name = c)(inp)

      #1-d spatial dropout is the standard for embedding layers 
      #you can use it in NLP tasks too 
      out = layers.SpatialDropout1D(0.3)(out)

      #reshape the input to the dimension of embedding
      #this becomes our output layer for current feature
      out = layers.Reshape(target_shape=(embed_dim, ))(out)

      #add input to input list
      inputs.append(inp)

      #add out to output list
      outputs.append(out)

      #concatenate all output layers
      x = layers.Concatenate(axis=1)(outputs)

      #add a batchnorm layer
      x = layers.BatchNormalization()(x)

      #add layers with dropout
      x= layers.Dense(300, acivation="relu")(x)
      x= layers.Dropout(0.3)(x)
      x= layers.BatchNormalization()(x)

      x = layers.Dense(300, activation="relu")(x)
      x = layers.Dropout(0.3)(x)
      x = layers.BatchNormalization()(x)

      y = layers.Dense(2, activation="softmax")(x)

      #create final model
      model = Model(inputs=inputs, outputs=y)

      model.compile(loss='binary_crossentropy', optimizer='adam')
      
      return model_selection



from sklearn import metrics, preprocessing
def run(fold):
  df = pd.read_csv('/content/drive/My Drive/train_folds.csv')

  features = [f for f in df.columns if f not in ("id", "target", "kfold")]

  for col in features:
    df.loc[:, col] = df[col].astype(str).fillna("NONE") #fill the nan values with nan. The function "fillna" was created to fill only one column;
                                                      # at a time it is not tested against a list of columns. 
    for feat in features:
      lbl_enc = preprocessing.LabelEncoder() #create the label encoder
      df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values) #label encode all columns

     #split the data
     df_train = df[df.kfold != fold].reset_index(drop=True) #get the training data. Remember that we created a column name kfold. if kfold = fold,
                                                        #then we train all the other folds but the argument fold. 

     df_valid = df[df.kfold == fold].reset_index(drop=True) #get the validation data. The argument fold is reserved for validation

     #create the model
    model = create_model(df, features)

    #our features are lists of lists
    xtrain = [df_train[features].values[:, k] for k in range(len(features))]

    xvalid = [df_valid[features].values[:, k] for k in range(len(features))]

    #retrieve target values
    ytain = df_train.target.values
    yvalid = df_valid.target.values

    #convert target columsn to categories 
    #binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    #fit the model
    model.fit(xtrain, ytrain_cat, validation_data=(xvalid, yvalid_cat), verbose=1, batch_size=1024, epochs=3)

    #generation validation predictions
    valid_preds = model.predict(xvalid)[:, 1]

    #metrics
    print(metrics.roc_auc_score(yvalid, valid_preds))

    #save memory space by clearing session
    K.clear_session()

Я запустил код:

run(0)
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...