Я учусь работать с категориальными данными, следуя примерам кода той же старой сущности, встраиваемой Абхишеком Тхакуром . Он использует Visual Studio One в качестве редактора python, а я использую блокнот Google Colab. Код тот же, но при конкатенации возникает ошибка. Согласно документации тензора:
layers.Concatenate(axis = along which to concatenate)
В примере кода используется:
x=layers.Concatenate()(outputs)
без упоминания оси, с которой предполагается конкатенация. Используя код как есть, я получаю сообщение об ошибке:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-15-8d7152b44077> in <module>()
----> 1 run(0)
5 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/layers/merge.py in build(self, input_shape)
491 # Used purely for shape validation.
492 if not isinstance(input_shape[0], tuple) or len(input_shape) < 2:
--> 493 raise ValueError('A `Concatenate` layer should be called '
494 'on a list of at least 2 inputs')
495 if all(shape is None for shape in input_shape):
ValueError: A `Concatenate` layer should be called on a list of at least 2 inputs
Я получаю ту же ошибку, даже когда добавляю ось аргумента = 1.
Вот мой пример кода:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
def create_model(data, catcols):
#init lit of inputs for embedding
inputs = []
#init list of outputs for embeddings
outputs = []
#loop over all categorical colums
for c in catcols:
#find the number of unique values in the column
num_unique_values = int(data[c].nunique())
#simple dimension of embedding calculator
#min size is half of the number of unique values
#max size is 50. max size depends on the number of unique
#categories too.
embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
#simple keras input layer with size 1
inp = layers.Input(shape=(1, ))
#add embedding layer to raw input
#embedding size is always 1 more than uique values in input
out = layers.Embedding(num_unique_values + 1, embed_dim, name = c)(inp)
#1-d spatial dropout is the standard for embedding layers
#you can use it in NLP tasks too
out = layers.SpatialDropout1D(0.3)(out)
#reshape the input to the dimension of embedding
#this becomes our output layer for current feature
out = layers.Reshape(target_shape=(embed_dim, ))(out)
#add input to input list
inputs.append(inp)
#add out to output list
outputs.append(out)
#concatenate all output layers
x = layers.Concatenate(axis=1)(outputs)
#add a batchnorm layer
x = layers.BatchNormalization()(x)
#add layers with dropout
x= layers.Dense(300, acivation="relu")(x)
x= layers.Dropout(0.3)(x)
x= layers.BatchNormalization()(x)
x = layers.Dense(300, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.BatchNormalization()(x)
y = layers.Dense(2, activation="softmax")(x)
#create final model
model = Model(inputs=inputs, outputs=y)
model.compile(loss='binary_crossentropy', optimizer='adam')
return model_selection
from sklearn import metrics, preprocessing
def run(fold):
df = pd.read_csv('/content/drive/My Drive/train_folds.csv')
features = [f for f in df.columns if f not in ("id", "target", "kfold")]
for col in features:
df.loc[:, col] = df[col].astype(str).fillna("NONE") #fill the nan values with nan. The function "fillna" was created to fill only one column;
# at a time it is not tested against a list of columns.
for feat in features:
lbl_enc = preprocessing.LabelEncoder() #create the label encoder
df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values) #label encode all columns
#split the data
df_train = df[df.kfold != fold].reset_index(drop=True) #get the training data. Remember that we created a column name kfold. if kfold = fold,
#then we train all the other folds but the argument fold.
df_valid = df[df.kfold == fold].reset_index(drop=True) #get the validation data. The argument fold is reserved for validation
#create the model
model = create_model(df, features)
#our features are lists of lists
xtrain = [df_train[features].values[:, k] for k in range(len(features))]
xvalid = [df_valid[features].values[:, k] for k in range(len(features))]
#retrieve target values
ytain = df_train.target.values
yvalid = df_valid.target.values
#convert target columsn to categories
#binarization
ytrain_cat = utils.to_categorical(ytrain)
yvalid_cat = utils.to_categorical(yvalid)
#fit the model
model.fit(xtrain, ytrain_cat, validation_data=(xvalid, yvalid_cat), verbose=1, batch_size=1024, epochs=3)
#generation validation predictions
valid_preds = model.predict(xvalid)[:, 1]
#metrics
print(metrics.roc_auc_score(yvalid, valid_preds))
#save memory space by clearing session
K.clear_session()
Я запустил код:
run(0)