Hyperopt для нескольких подмножеств фрейма данных - PullRequest
0 голосов
/ 20 января 2020

У меня есть следующий фрейм данных:

# Demo frame: four (Names, Add) pairs repeated four times (16 rows) plus two
# columns of uniform random observations.
data = pd.DataFrame(
    dict(
        Names=['Joe', 'John', 'Jasper', 'Jez'] * 4,
        Add=['Lo', 'Po', 'Fa', 'It'] * 4,
        Ob1=np.random.rand(16),
        Ob2=np.random.rand(16),
    )
)

Я хочу запускать hyperopt отдельно для нескольких комбинаций значений столбцов «Names» и «Add». Для этого я разбил фрейм данных на подмножества с помощью следующего кода:

 # Collect every distinct (Names, Add) pair that occurs in the frame.
combinations = data[['Names', 'Add']].drop_duplicates()
comboList = list(zip(combinations['Names'], combinations['Add']))
comboList  # bare expression: in a notebook cell this displays the list of pairs

[('Joe', 'Lo'), ('John', 'Po'), ('Jasper', 'Fa'), ('Jez', 'It')]
 In [14]:
# Walk over each (Names, Add) pair and pull out the matching rows.
for i, combo in enumerate(comboList):
    print(combo)
    names_match = data['Names'] == combo[0]
    add_match = data['Add'] == combo[1]
    subset = data[names_match & add_match]
    # run multiple functions on subset here
    print(subset.head())

Используя описанную выше логику, я написал следующие функции:

import sys
import time

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras import optimizers
from keras.layers import Dense, Activation, Flatten
from keras.layers import LSTM
from keras.models import Sequential
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

def trim_dataset(mat, batch_size):
    """Drop trailing rows so the row count is a multiple of ``batch_size``.

    Args:
        mat: Array-like exposing ``.shape`` and slicing along its first axis.
        batch_size: Batch size the number of rows must be divisible by.

    Returns:
        ``mat`` itself when its row count is already divisible by
        ``batch_size``; otherwise a slice of ``mat`` without the last
        ``mat.shape[0] % batch_size`` rows.
    """
    no_of_rows_drop = mat.shape[0] % batch_size
    if no_of_rows_drop > 0:
        return mat[:-no_of_rows_drop]
    return mat


def build_timeseries(mat, y_col_index, time_steps):
    """Turn a 2-D matrix into sliding-window inputs and next-step targets.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``mat``; its target
    is the value in column ``y_col_index`` of the row right after the window.

    Args:
        mat: 2-D array indexed as ``mat[row, col]``.
        y_col_index: Column of ``mat`` used as the prediction target.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(x, y)`` where ``x`` has shape
        ``(len(mat) - time_steps, time_steps, mat.shape[1])`` and ``y`` has
        shape ``(len(mat) - time_steps,)``.
    """
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - time_steps
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, time_steps, dim_1))
    y = np.zeros((x.shape[0],))

    for i in tqdm(range(dim_0)):
        x[i] = mat[i:time_steps + i]
        y[i] = mat[time_steps + i, y_col_index]
    return x, y

def data(df, batch_size, time_steps, train_size=883):
    """Split, scale and window ``df`` into train/test time-series arrays.

    The frame is split in order (no shuffling), min-max scaled with a scaler
    fitted on the training part only, converted to sliding windows whose
    target is column 0, and trimmed so every array length is a multiple of
    ``batch_size``.

    Args:
        df: Feature frame or array; column 0 is used as the target.
        batch_size: Batch size the returned arrays must be divisible by.
        time_steps: Window length passed to ``build_timeseries``.
        train_size: Value forwarded to ``train_test_split`` as ``train_size``.
            Defaults to 883, the value previously hard-coded here.

    Returns:
        Tuple ``(x_train_ts, y_train_ts, x_test_ts, y_test_ts)``.
    """
    x_train, x_test = train_test_split(df, train_size=train_size, shuffle=False)

    # scale the train and test dataset
    min_max_scaler = MinMaxScaler()
    x_train = min_max_scaler.fit_transform(x_train)
    x_test = min_max_scaler.transform(x_test)

    x_train_ts, y_train_ts = build_timeseries(x_train, 0, time_steps)
    x_test_ts, y_test_ts = build_timeseries(x_test, 0, time_steps)
    x_train_ts = trim_dataset(x_train_ts, batch_size)
    y_train_ts = trim_dataset(y_train_ts, batch_size)
    x_test_ts = trim_dataset(x_test_ts, batch_size)
    y_test_ts = trim_dataset(y_test_ts, batch_size)

    # NOTE(review): the original statement here was truncated to its tail
    # (`str(np.isnan(x_test).any())))`); the message text is a reconstruction.
    print("Are any NaNs present in train/test matrices? {} {}".format(
        str(np.isnan(x_train).any()), str(np.isnan(x_test).any())))
    return x_train_ts, y_train_ts, x_test_ts, y_test_ts

# Wall-clock timestamp captured when this statement runs.
stime = time.time()

def print_time(text, stime):
    """Print the time elapsed since ``stime`` as whole minutes plus seconds.

    Args:
        text: Prefix printed before the elapsed time.
        stime: Start timestamp as returned by ``time.time()``.
    """
    seconds = time.time() - stime
    # Split into whole minutes and the leftover seconds; the previous code
    # printed the fractional total in minutes next to the seconds remainder.
    minutes, remainder = divmod(seconds, 60)
    print(text + " " + str(int(minutes)) + " minutes : " + str(np.round(remainder)) + " seconds")

def get_readable_ctime():
    """Return the current local time formatted as ``DD-MM-YYYY HH_MM_SS``."""
    return time.strftime("%d-%m-%Y %H_%M_%S")

from keras.callbacks import Callback

class LogMetrics(Callback):
    """Keras callback that copies the sampled hyper-parameters into the epoch logs.

    At the end of every epoch, the value of each search-space key is written
    into ``logs`` together with the combination number.
    """

    def __init__(self, search_params, param, comb_no):
        """Store the search space, the sampled values and the combination id.

        Args:
            search_params: Mapping whose keys name the hyper-parameters to log.
            param: Mapping from those keys to the values sampled for this trial.
            comb_no: Identifier stored under ``"combination_number"``.
        """
        # Let the base Callback set up its own state before adding ours.
        super().__init__()
        self.param = param
        self.self_params = search_params
        self.comb_no = comb_no

    def on_epoch_end(self, epoch, logs=None):
        """Add the hyper-parameter values and combination number to ``logs``."""
        if logs is None:
            # Nothing to annotate when no logs dict is supplied.
            return
        for key in self.self_params.keys():
            logs[key] = self.param[key]
        logs["combination_number"] = self.comb_no

# Hyperopt search space sampled once per trial.  The nested ``hp.choice``
# entries make the second LSTM layer and the extra dense layer optional, each
# carrying its own settings only when selected.
search_space = dict(
    batch_size=hp.choice('bs', [10]),
    time_steps=hp.choice('ts', [1]),
    lstm1_nodes=hp.choice('units_lsmt1', [100, 150, 200, 250, 300]),
    lstm1_dropouts=hp.uniform('dos_lstm1', 0, 1),
    lstm_layers=hp.choice('num_layers_lstm', [
        dict(layers='one'),
        dict(
            layers='two',
            lstm2_nodes=hp.choice('units_lstm2', [50, 100, 150]),
            lstm2_dropouts=hp.uniform('dos_lstm2', 0, 1),
        ),
    ]),
    dense_layers=hp.choice('num_layers_dense', [
        dict(layers='one'),
        dict(
            layers='two',
            dense2_nodes=hp.choice('units_dense', [10, 20, 30, 40]),
        ),
    ]),
    lr=hp.uniform('lr', 0, 1),
    epochs=hp.choice('epochs', [50, 100, 150, 200, 300, 500, 1000, 2000]),
    optimizer=hp.choice('optmz', ["rms"]),
)

      # Hyperopt objective: trains one LSTM per (Names, Add) pair for a given sample.
def create_model_hypopt(params):
    """Train an LSTM on every combination in ``comboList`` with ``params``.

    For each ``(Names, Add)`` pair, the matching rows are selected, turned
    into time-series arrays, and used to fit a freshly built model.

    Args:
        params: One sample drawn from ``search_space``.

    Returns:
        Dict with ``'loss'`` (lowest validation loss over the epochs),
        ``'status'`` and ``'model'``. Because the loop overwrites these on
        every iteration, the values describe the LAST combination only.

    Raises:
        ValueError: If ``comboList`` is empty, so no model could be trained.
    """
    print("Trying params:", params)
    batch_size = params["batch_size"]
    time_steps = params["time_steps"]

    # The original rebuilt comboList from `Roaming_Map`, which is not defined
    # anywhere in this file; the module-level comboList is used instead.
    if not comboList:
        raise ValueError("comboList is empty: no combination to train on")

    # NOTE(review): `data` names both the DataFrame and the preprocessing
    # function in this file; one of them must be renamed for this to run.
    for i, combo in enumerate(comboList):
        print("As above for the country & opco - ", i, combo)
        # The column is 'Add' (capitalised) in the frame built above.
        subset = data[(data['Names'] == combo[0]) & (data['Add'] == combo[1])]
        subset = subset[["Ob1", "Ob2"]]

        x_train_ts, y_train_ts, x_test_ts, y_test_ts = data(subset, batch_size, time_steps)

        lstm_model = Sequential()
        lstm_model.add(LSTM(params["lstm1_nodes"],
                            batch_input_shape=(batch_size, time_steps, x_train_ts.shape[2]),
                            dropout=params["lstm1_dropouts"],
                            recurrent_dropout=params["lstm1_dropouts"],
                            stateful=True, return_sequences=True,
                            kernel_initializer='random_uniform'))
        # The first layer returns sequences, so it is followed either by a
        # second LSTM or by Flatten before the dense head.
        if params["lstm_layers"]["layers"] == "two":
            lstm_model.add(LSTM(params["lstm_layers"]["lstm2_nodes"],
                                dropout=params["lstm_layers"]["lstm2_dropouts"]))
        else:
            lstm_model.add(Flatten())

        if params["dense_layers"]["layers"] == 'two':
            lstm_model.add(Dense(params["dense_layers"]["dense2_nodes"], activation='relu'))

        lstm_model.add(Dense(1, activation='sigmoid'))

        lr = params["lr"]
        epochs = params["epochs"]

        if params["optimizer"] == 'rms':
            optimizer = optimizers.RMSprop(lr=lr)
        else:
            optimizer = optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)

        lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)  # binary_crossentropy
        # Keras documents validation_data as a tuple (x_val, y_val).
        history = lstm_model.fit(x_train_ts, y_train_ts, epochs=epochs, verbose=2,
                                 batch_size=batch_size,
                                 validation_data=(x_test_ts, y_test_ts),
                                 callbacks=[LogMetrics(search_space, params, -1)])
        # Lowest validation loss seen over the training epochs.
        val_error = np.amin(history.history['val_loss'])

        print('Best validation error of epoch number', epochs, 'for the combination:', combo, val_error)

    return {'loss': val_error, 'status': STATUS_OK, 'model': lstm_model}  # if accuracy use '-' sign

# Run the hyper-parameter search and keep every trial's result in `trials`.
trials = Trials()
best = fmin(create_model_hypopt,
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)

print_time("program completed in", stime)

# Trial results ordered from lowest to highest loss.  The original then
# appended this list to itself, which only created a self-referencing entry,
# so that statement was dropped.
best_trials = sorted(trials.results, key=lambda x: x['loss'])

Но этот код сначала прогоняет все комбинации Names и Add с одним и тем же набором гиперпараметров и лишь затем переходит к следующему набору гиперпараметров. Я же хочу, чтобы он перебрал все возможные гиперпараметры для одной комбинации Names и Add, а затем перешёл к следующей комбинации.

1 Ответ

0 голосов
/ 22 января 2020

Я решил эту проблему, вынеся цикл (loop) за пределы функции create_model_hypopt:

# One independent hyperopt search per combination, so every hyper-parameter
# sample is tried for one combination before moving on to the next.
all_losses = []  # one list of trial losses per combination, in comboList order
for i, combo in enumerate(comboList):
    print(combo)

    # NOTE(review): `data` names both the DataFrame and the preprocessing
    # function in this file; one of them must be renamed for this to run.
    subset = data[(data['country'] == combo[0]) & (data['type'] == combo[1])]
    subset = subset[["Data"]]

    x_train_ts, y_train_ts, x_test_ts, y_test_ts = data(subset, 10, 1)

    # A fresh Trials object keeps each combination's search history separate.
    trials = Trials()
    best = fmin(create_model_hypopt,
                space=search_space,
                algo=tpe.suggest,
                max_evals=1,
                trials=trials)

    # The original appended `loss` to itself; collect it in a separate list.
    loss = trials.losses()
    all_losses.append(loss)
...