The following code is taken from a Kaggle kernel and slightly adapted to reduce the computational load.
You can download the data here: https://www.kaggle.com/aquatic/entity-embedding-neural-net/data
![enter image description here](https://i.stack.imgur.com/Lt8oD.png)
The code is as follows:
```python
import numpy as np
import pandas as pd
from tqdm import tqdm

# random seeds for the stochastic parts of the neural network
import tensorflow as tf
np.random.seed(10)
tf.random.set_seed(15)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import StratifiedKFold

# Data loading & preprocessing
df_train = pd.read_csv('train.csv', index_col=[0])
df_test = pd.read_csv('test.csv', index_col=[0])

df_train = df_train.sample(frac=0.1)
df_test = df_test.sample(frac=0.1)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

X_train, y_train = df_train.iloc[:, 1:], df_train.target
X_test = df_test

cols_use = [c for c in X_train.columns if not c.startswith('ps_calc_')]
X_train = X_train[cols_use]
X_test = X_test[cols_use]

col_vals_dict = {c: list(X_train[c].unique()) for c in X_train.columns if c.endswith('_cat')}

embed_cols = []
for c in col_vals_dict:
    if len(col_vals_dict[c]) > 2:
        embed_cols.append(c)
        print(c + ': %d values' % len(col_vals_dict[c]))  # look at value counts to know the embedding dimensions
print('\n')
```
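Note that `embed_cols` is computed from a 10% sample of the data, so a level that is rare in the full dataset can disappear from a column and push it under the `> 2` threshold, while `build_embedding_network` below hard-codes 13 embedding inputs. A minimal sanity check (the `expected_embed_cols` list simply restates the 13 columns the network declares; it is not part of the original kernel):

```python
# Hypothetical sanity check, not in the original kernel: compare the
# data-driven embed_cols against the 13 columns the network hard-codes.
expected_embed_cols = [
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
    'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_09_cat',
    'ps_car_10_cat', 'ps_car_11_cat',
]
missing = sorted(set(expected_embed_cols) - set(embed_cols))
if missing:
    print('columns dropped by the >2-values filter:', missing)
```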
```python
def build_embedding_network():
    inputs = []
    embeddings = []

    input_ps_ind_02_cat = Input(shape=(1,))
    embedding = Embedding(5, 3, input_length=1)(input_ps_ind_02_cat)
    embedding = Reshape(target_shape=(3,))(embedding)
    inputs.append(input_ps_ind_02_cat)
    embeddings.append(embedding)

    input_ps_ind_04_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_ind_04_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_ind_04_cat)
    embeddings.append(embedding)

    input_ps_ind_05_cat = Input(shape=(1,))
    embedding = Embedding(8, 5, input_length=1)(input_ps_ind_05_cat)
    embedding = Reshape(target_shape=(5,))(embedding)
    inputs.append(input_ps_ind_05_cat)
    embeddings.append(embedding)

    input_ps_car_01_cat = Input(shape=(1,))
    embedding = Embedding(13, 7, input_length=1)(input_ps_car_01_cat)
    embedding = Reshape(target_shape=(7,))(embedding)
    inputs.append(input_ps_car_01_cat)
    embeddings.append(embedding)

    input_ps_car_02_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_02_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_car_02_cat)
    embeddings.append(embedding)

    input_ps_car_03_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_03_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_car_03_cat)
    embeddings.append(embedding)

    input_ps_car_04_cat = Input(shape=(1,))
    embedding = Embedding(10, 5, input_length=1)(input_ps_car_04_cat)
    embedding = Reshape(target_shape=(5,))(embedding)
    inputs.append(input_ps_car_04_cat)
    embeddings.append(embedding)

    input_ps_car_05_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_05_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_car_05_cat)
    embeddings.append(embedding)

    input_ps_car_06_cat = Input(shape=(1,))
    embedding = Embedding(18, 8, input_length=1)(input_ps_car_06_cat)
    embedding = Reshape(target_shape=(8,))(embedding)
    inputs.append(input_ps_car_06_cat)
    embeddings.append(embedding)

    input_ps_car_07_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_07_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_car_07_cat)
    embeddings.append(embedding)

    input_ps_car_09_cat = Input(shape=(1,))
    embedding = Embedding(6, 3, input_length=1)(input_ps_car_09_cat)
    embedding = Reshape(target_shape=(3,))(embedding)
    inputs.append(input_ps_car_09_cat)
    embeddings.append(embedding)

    input_ps_car_10_cat = Input(shape=(1,))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_10_cat)
    embedding = Reshape(target_shape=(2,))(embedding)
    inputs.append(input_ps_car_10_cat)
    embeddings.append(embedding)

    input_ps_car_11_cat = Input(shape=(1,))
    embedding = Embedding(104, 10, input_length=1)(input_ps_car_11_cat)
    embedding = Reshape(target_shape=(10,))(embedding)
    inputs.append(input_ps_car_11_cat)
    embeddings.append(embedding)

    input_numeric = Input(shape=(24,))
    embedding_numeric = Dense(16)(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate()(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, output)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model
```
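The function therefore builds a model with 14 inputs: 13 single-value categorical inputs routed through `Embedding` layers plus one 24-dimensional numeric input. A quick way to confirm the expected input count (an assumed check, not in the kernel):

```python
# Assumed check, not part of the original kernel.
model = build_embedding_network()
print(len(model.inputs))  # 14: 13 embedding inputs + 1 numeric input
```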
```python
# converting data to list format to match the network structure
def preproc(X_train, X_val, X_test):
    input_list_train = []
    input_list_val = []
    input_list_test = []

    # the cols to be embedded: rescaling to range [0, # values)
    for c in tqdm(embed_cols):
        raw_vals = np.unique(X_train[c])
        print(f'raw_vals is {raw_vals} and c is {c}/n')
        val_map = {}
        for i in tqdm(range(len(raw_vals))):
            val_map[raw_vals[i]] = i
        print(f'i is {i} and raw_vals[i] is {raw_vals[i]} and val_map is {val_map} \n')
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)

    # the rest of the columns
    other_cols = [c for c in X_train.columns if c not in embed_cols]
    input_list_train.append(X_train[other_cols].values)
    input_list_val.append(X_val[other_cols].values)
    input_list_test.append(X_test[other_cols].values)

    return input_list_train, input_list_val, input_list_test
```
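The "rescaling" in `preproc` simply replaces each raw category value with its index among the values seen in the training fold, so every `Embedding` layer receives consecutive integers starting at 0. A toy illustration on made-up data:

```python
# Toy illustration (hypothetical data, not from the kernel):
s = pd.Series([-1, 1, 0, 1])
raw_vals = np.unique(s)                           # array([-1,  0,  1])
val_map = {v: i for i, v in enumerate(raw_vals)}  # {-1: 0, 0: 1, 1: 2}
print(s.map(val_map).values)                      # [0 2 1 2]
```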
```python
# gini scoring function from kernel at:
# https://www.kaggle.com/tezdhar/faster-gini-calculation
def ginic(actual, pred):
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedc(a, p):
    return ginic(a, p) / ginic(a, a)
```
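As a quick sanity check on toy arrays (not part of the kernel), a prediction that ranks all positives above all negatives should give a normalized Gini of 1.0, and the inverted ranking -1.0:

```python
# Sanity check with made-up labels and scores:
a = np.array([0, 0, 1, 1])
print(gini_normalizedc(a, np.array([0.1, 0.2, 0.8, 0.9])))  # 1.0, perfect ranking
print(gini_normalizedc(a, np.array([0.9, 0.8, 0.2, 0.1])))  # -1.0, inverted ranking
```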
```python
# network training
K = 3
runs_per_fold = 1
n_epochs = 15

cv_ginis = []
full_val_preds = np.zeros(np.shape(X_train)[0])
y_preds = np.zeros((np.shape(X_test)[0], K))

kfold = StratifiedKFold(n_splits=K, random_state=231, shuffle=True)

for i, (f_ind, outf_ind) in tqdm(enumerate(kfold.split(X_train, y_train))):
    X_train_f, X_val_f = X_train.loc[f_ind].copy(), X_train.loc[outf_ind].copy()
    y_train_f, y_val_f = y_train[f_ind], y_train[outf_ind]
    X_test_f = X_test.copy()

    # upsampling adapted from kernel:
    # https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283
    pos = pd.Series(y_train_f == 1)
    # Add positive examples
    X_train_f = pd.concat([X_train_f, X_train_f.loc[pos]], axis=0)
    y_train_f = pd.concat([y_train_f, y_train_f.loc[pos]], axis=0)
    # Shuffle data
    idx = np.arange(len(X_train_f))
    np.random.shuffle(idx)
    X_train_f = X_train_f.iloc[idx]
    y_train_f = y_train_f.iloc[idx]

    # preprocessing
    proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(X_train_f, X_val_f, X_test_f)

    # track oof prediction for cv scores
    val_preds = 0
    for j in tqdm(range(runs_per_fold)):
        NN = build_embedding_network()
        NN.fit(proc_X_train_f, y_train_f.values, epochs=n_epochs, batch_size=4096, verbose=0)
        val_preds += NN.predict(proc_X_val_f)[:, 0] / runs_per_fold
        y_preds[:, i] += NN.predict(proc_X_test_f)[:, 0] / runs_per_fold

    full_val_preds[outf_ind] += val_preds
    cv_gini = gini_normalizedc(y_val_f.values, val_preds)
    cv_ginis.append(cv_gini)
    print('\nFold %i prediction cv gini: %.5f\n' % (i, cv_gini))
```
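Because `build_embedding_network` hard-codes its 13 embedding inputs while `embed_cols` (and therefore the length of the lists returned by `preproc`) depends on the sampled data, a guard placed right after the `preproc` call would surface any mismatch explicitly instead of failing inside Keras. A minimal sketch, not part of the original code:

```python
# Hypothetical guard: fail early if preproc produced a different number
# of arrays than the model declares as inputs.
NN = build_embedding_network()
assert len(proc_X_train_f) == len(NN.inputs), (
    f'preproc built {len(proc_X_train_f)} input arrays, '
    f'but the model expects {len(NN.inputs)}'
)
```

Running the code as written produces the following console output and traceback: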
```
raw_vals is [-1 1 2 3 4] and c is ps_ind_02_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
i is 4 and raw_vals[i] is 4 and val_map is {-1: 0, 1: 1, 2: 2, 3: 3, 4: 4}
raw_vals is [-1 0 1] and c is ps_ind_04_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]
i is 2 and raw_vals[i] is 1 and val_map is {-1: 0, 0: 1, 1: 2}
raw_vals is [-1 0 1 2 3 4 5 6] and c is ps_ind_05_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<?, ?it/s]
i is 7 and raw_vals[i] is 6 and val_map is {-1: 0, 0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7}
raw_vals is [-1 0 1 2 3 4 5 6 7 8 9 10 11] and c is ps_car_01_cat/n
100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 13157.81it/s]
i is 12 and raw_vals[i] is 11 and val_map is {-1: 0, 0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12}
raw_vals is [-1 0 1] and c is ps_car_03_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]
i is 2 and raw_vals[i] is 1 and val_map is {-1: 0, 0: 1, 1: 2}
raw_vals is [0 1 2 3 4 5 6 7 8 9] and c is ps_car_04_cat/n
100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]
i is 9 and raw_vals[i] is 9 and val_map is {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
raw_vals is [-1 0 1] and c is ps_car_05_cat/n
100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3003.80it/s]
i is 2 and raw_vals[i] is 1 and val_map is {-1: 0, 0: 1, 1: 2}
raw_vals is [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] and c is ps_car_06_cat/n
100%|██████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<?, ?it/s]
i is 17 and raw_vals[i] is 17 and val_map is {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17}
raw_vals is [-1 0 1] and c is ps_car_07_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]
i is 2 and raw_vals[i] is 1 and val_map is {-1: 0, 0: 1, 1: 2}
 75%|██████████████████████████████████████████████████████████████▎ | 9/12 [00:00<00:00, 82.04it/s]
raw_vals is [-1 0 1 2 3 4] and c is ps_car_09_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<?, ?it/s]
i is 5 and raw_vals[i] is 4 and val_map is {-1: 0, 0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
raw_vals is [0 1 2] and c is ps_car_10_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]
i is 2 and raw_vals[i] is 2 and val_map is {0: 0, 1: 1, 2: 2}
raw_vals is [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104] and c is ps_car_11_cat/n
100%|████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<?, ?it/s]
i is 103 and raw_vals[i] is 104 and val_map is {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 33: 32, 34: 33, 35: 34, 36: 35, 37: 36, 38: 37, 39: 38, 40: 39, 41: 40, 42: 41, 43: 42, 44: 43, 45: 44, 46: 45, 47: 46, 48: 47, 49: 48, 50: 49, 51: 50, 52: 51, 53: 52, 54: 53, 55: 54, 56: 55, 57: 56, 58: 57, 59: 58, 60: 59, 61: 60, 62: 61, 63: 62, 64: 63, 65: 64, 66: 65, 67: 66, 68: 67, 69: 68, 70: 69, 71: 70, 72: 71, 73: 72, 74: 73, 75: 74, 76: 75, 77: 76, 78: 77, 79: 78, 80: 79, 81: 80, 82: 81, 83: 82, 84: 83, 85: 84, 86: 85, 87: 86, 88: 87, 89: 88, 90: 89, 91: 90, 92: 91, 93: 92, 94: 93, 95: 94, 96: 95, 97: 96, 98: 97, 99: 98, 100: 99, 101: 100, 102: 101, 103: 102, 104: 103}
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 79.67it/s]
  0%| | 0/1 [00:03<?, ?it/s]
0it [00:03, ?it/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-91-94fa2bc6e556> in <module>
     29
     30         NN = build_embedding_network()
---> 31         NN.fit(proc_X_train_f, y_train_f.values, epochs=n_epochs, batch_size=4096, verbose=0)
     32
     33         val_preds += NN.predict(proc_X_val_f)[:,0] / runs_per_fold

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    726         max_queue_size=max_queue_size,
    727         workers=workers,
--> 728         use_multiprocessing=use_multiprocessing)
    729
    730   def evaluate(self,

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    222           validation_data=validation_data,
    223           validation_steps=validation_steps,
--> 224           distribution_strategy=strategy)
    225
    226         total_samples = _get_total_number_of_samples(training_data_adapter)

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    545         max_queue_size=max_queue_size,
    546         workers=workers,
--> 547         use_multiprocessing=use_multiprocessing)
    548     val_adapter = None
    549     if validation_data:

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in _process_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    592         batch_size=batch_size,
    593         check_steps=False,
--> 594         steps=steps)
    595     adapter = adapter_cls(
    596         x,

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle, extract_tensors_from_dataset)
   2470           feed_input_shapes,
   2471           check_batch_axis=False,  # Don't enforce the batch size.
-> 2472           exception_prefix='input')
   2473
   2474     # Get typespecs for the input data and sanitize it if necessary.

~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
    529                        'Expected to see ' + str(len(names)) + ' array(s), '
    530                        'but instead got the following list of ' +
--> 531                        str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
    532     elif len(names) > 1:
    533       raise ValueError('Error when checking model ' + exception_prefix +

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 14 array(s), but instead got the following list of 13 arrays: [array([[1],
       [1],
       [1],
       ...,
       [1],
       [2],
       [2]], dtype=int64), array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64), a...
```