I am using Ray Tune to find the optimal hyperparameter values for this model:
import os
from sklearn.utils import shuffle  # assuming sklearn's shuffle
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, concatenate,
                                     Bidirectional, LSTM, Dropout, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from ray import tune

# load_data_train, load_data_test, Att_q and
# quaternion_mean_multiplicative_error are defined elsewhere in my script.

class BroadModel(tune.Trainable):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    def build_model(self, config):
        global convB2, drop2, convA2, poolA, poolB
        window_size = 200
        self.x_gyro, self.x_acc, x_mag, q = load_data_train()
        self.Att_quat = Att_q(q)
        self.x_gyro_t, self.x_acc_t, x_mag_t, q_t = load_data_test()
        self.Att_quat_t = Att_q(q_t)
        self.x_gyro, self.x_acc, self.Att_quat = shuffle(
            self.x_gyro, self.x_acc, self.Att_quat)

        x1 = Input((window_size, 3), name='x1')
        x2 = Input((window_size, 3), name='x2')
        convA1 = Conv1D(config["Conv1DA"], 11, padding='same', activation='relu')(x1)
        for i in range(config["Conv1DAn"]):
            if i > 0:
                convA2 = Conv1D(config[f'Conv1DAn_{i}'], 11, padding='same', activation='relu')(convA1)
        poolA = MaxPooling1D(3)(convA1)

        convB1 = Conv1D(config["Conv1DB"], 11, padding='same', activation='relu')(x2)
        for i in range(config["Conv1DBn"]):
            if i > 0:
                convB2 = Conv1D(config[f'Conv1DBn_{i}'], 11, padding='same', activation='relu')(convB1)
        poolB = MaxPooling1D(3)(convB1)

        AB = concatenate([poolA, poolB])
        lstm1 = Bidirectional(LSTM(config["LSTM1"], return_sequences=True))(AB)
        drop1 = Dropout(config['dropout'])(lstm1)
        for i in range(config['LSTMn']):
            if i > 0:
                lstm2 = Bidirectional(LSTM(config[f'LSTMn_{i}'], return_sequences=True))(drop1)
                drop1 = Dropout(config['dropout'])(lstm2)
        lstm2 = Bidirectional(LSTM(config['LSTMn_l']))(drop1)
        drop2 = Dropout(config['dropout'])(lstm2)
        y1_pred = Dense(4, kernel_regularizer='l2')(drop2)
        model = Model(inputs=[x1, x2], outputs=[y1_pred])
        return model

    def setup(self, config):
        model = self.build_model(config)
        model.compile(
            optimizer=Adam(learning_rate=config['lr']),
            loss=quaternion_mean_multiplicative_error,
            metrics=[quaternion_mean_multiplicative_error],
        )
        self.model = model
        return model
But whenever I scale up my network, either by increasing the size of each layer from 50 to 100 or more, or by increasing the number of training iterations from 10-20 to more than 40, I get errors such as:
> Failure # 1 (occurred at 2022-09-05_12-04-07)
> ray::ResourceTrainable.train() (pid=35719, ip=192.168.91.120, repr=<ray.tune.trainable.util.BroadModel object at 0x7f478f107c40>)
>   File "/home/ssrc/asq/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
>     result = self.step()
>   File "ray_test.py", line 258, in step
>     self.model.fit(
> AttributeError: 'BroadModel' object has no attribute 'model'
This is the tuning code:
if __name__ == "__main__":
    import ray
    from ray import air, tune
    from ray.tune.schedulers import PopulationBasedTraining

    pbt = PopulationBasedTraining(
        perturbation_interval=600,
        hyperparam_mutations={
            "dropout": tune.uniform(0.1, 0.5),
            "lr": tune.uniform(1e-5, 1e-3),
            "Conv1DA": tune.randint(10, 15),
            "Conv1DAn": tune.choice([0, 1, 2, 3]),
            "Conv1DAn_1": tune.randint(10, 15),
            "Conv1DAn_2": tune.randint(10, 15),
            "Conv1DAn_3": tune.randint(10, 15),
            "Conv1DB": tune.randint(10, 15),
            "Conv1DBn": tune.choice([0, 1, 2, 3]),
            "Conv1DBn_1": tune.randint(10, 15),
            "Conv1DBn_2": tune.randint(10, 15),
            "Conv1DBn_3": tune.randint(10, 15),
            "LSTM1": tune.randint(10, 15),
            "LSTMn": tune.choice([0, 1, 2, 3]),
            "LSTMn_1": tune.randint(10, 15),
            "LSTMn_2": tune.randint(10, 15),
            "LSTMn_3": tune.randint(10, 15),
            "LSTMn_l": tune.randint(10, 15),
        },
    )
    resources_per_trial = {"cpu": 10, "gpu": 0}
    tuner = tune.Tuner(
        tune.with_resources(BroadModel, resources=resources_per_trial),
        run_config=air.RunConfig(
            name="BroadPBT" + timestr,
            stop={"training_iteration": 50},
        ),
        tune_config=tune.TuneConfig(
            reuse_actors=True,
            scheduler=pbt,
            metric="loss",
            mode="min",
            num_samples=2,
        ),
        param_space={
            "finish_fast": False,
            "batch_size": 1000,
            "epochs": 200,
            "dropout": tune.uniform(0.1, 0.5),
            "lr": tune.uniform(1e-5, 1e-3),
            "Conv1DA": tune.randint(10, 15),
            "Conv1DAn": tune.choice([0, 1, 2, 3]),
            "Conv1DAn_1": tune.randint(10, 15),
            "Conv1DAn_2": tune.randint(10, 15),
            "Conv1DAn_3": tune.randint(10, 15),
            "Conv1DB": tune.randint(10, 15),
            "Conv1DBn": tune.choice([0, 1, 2, 3]),
            "Conv1DBn_1": tune.randint(10, 15),
            "Conv1DBn_2": tune.randint(10, 15),
            "Conv1DBn_3": tune.randint(10, 15),
            "LSTM1": tune.randint(10, 15),
            "LSTMn": tune.choice([0, 1, 2, 3]),
            "LSTMn_1": tune.randint(10, 15),
            "LSTMn_2": tune.randint(10, 15),
            "LSTMn_3": tune.randint(10, 15),
            "LSTMn_l": tune.randint(10, 15),
        },
    )
    #tune.run(resources_per_trial={'gpu': 1}, tuner)
    tuner.fit()
What should I do? As I mentioned above, if I keep the number of training iterations below 20, I don't get any errors.
You have `reuse_actors=True` in your `TuneConfig`. When actors are reused, Ray Tune does not create a fresh `Trainable` and call `setup()` for each new trial; instead it calls `reset_config()` on the existing actor, which your class does not implement. That is most likely why `self.model` is missing when `step()` runs. Add this method to your class:
def reset_config(self, new_config):
    self.config = new_config
    self.setup(new_config)  # rebuild and recompile the model for the new config
    return True
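For context, here is a minimal sketch of how the three methods fit together in a `tune.Trainable`. The `step()` body is only an assumption reconstructed from your traceback (your real one is in ray_test.py); `build_model()`, the data attributes, and `quaternion_mean_multiplicative_error` are your own code from the question:

from ray import tune
from tensorflow.keras.optimizers import Adam

class BroadModel(tune.Trainable):
    def _compile(self, config):
        # Build and compile the Keras model; build_model() is yours.
        self.model = self.build_model(config)
        self.model.compile(
            optimizer=Adam(learning_rate=config["lr"]),
            loss=quaternion_mean_multiplicative_error,
            metrics=[quaternion_mean_multiplicative_error],
        )

    def setup(self, config):
        # Runs once, when the actor is first created.
        self._compile(config)

    def reset_config(self, new_config):
        # Runs instead of setup() when the actor is reused for a new trial.
        self.config = new_config
        self._compile(new_config)
        return True  # tell Tune the in-place reset succeeded

    def step(self):
        # One training_iteration; the returned dict feeds metric="loss".
        # Hypothetical body, adapt to your actual training code.
        history = self.model.fit(
            [self.x_gyro, self.x_acc], self.Att_quat,
            batch_size=self.config["batch_size"],
            verbose=0,
        )
        return {"loss": history.history["loss"][-1]}

Alternatively, if you would rather not maintain `reset_config()`, setting `reuse_actors=False` makes Tune start a fresh actor (and call `setup()`) for every trial, at the cost of slower trial startup.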