Bayesian Optimization Hyperband

Bayesian optimization Hyperband (BOHB) combines Bayesian optimization’s guided search with Hyperband’s efficient resource allocation. It replaces Bayesian optimization’s GP surrogate with a tree-structured Parzen estimator (TPE), which uses kernel density estimation to model the probability density of high-performing HP configurations relative to that of poorly performing ones and proposes new configurations where that ratio is large. BOHB also evaluates multiple HP configurations in parallel, making it highly efficient at navigating complex HP spaces.
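
To make the density-ratio idea concrete, the following minimal sketch (an illustration only, not hpbandster’s implementation, and with made-up numbers) splits previously evaluated configurations into “good” and “bad” sets by their losses, fits a Gaussian kernel density estimate to each, and proposes the candidate where the ratio of the two densities is largest:

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
configs = rng.uniform(0.0, 8.0, size=(100, 2))  # previously evaluated HP configurations
losses = np.sum((configs - 4.0) ** 2, axis=1)   # their (toy) validation losses

gamma = 0.25                                    # fraction of configurations treated as "good"
cutoff = np.quantile(losses, gamma)
dens_good = gaussian_kde(configs[losses <= cutoff].T)  # l(x): density of good configurations
dens_bad = gaussian_kde(configs[losses > cutoff].T)    # g(x): density of the rest

candidates = rng.uniform(0.0, 8.0, size=(1000, 2))
scores = dens_good(candidates.T) / (dens_bad(candidates.T) + 1e-12)
print("Next configuration to try:", candidates[np.argmax(scores)])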

The following code offers a complete example. It uses the hpbandster package to tune the hyperparameters of a neural network that models the Branin function, launches the BOHB workers as separate processes, and then minimizes the trained network with differential evolution:

from imp_functions_bohb_workers import *  # provides branin, train, predict, and run_hpbandster

import argparse
import logging
import os
import pickle
import subprocess
import time

import numpy as np
from scipy.io import savemat
from scipy.optimize import differential_evolution
from scipy.stats import qmc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB

# Keep hpbandster's logging quiet; switch to logging.DEBUG for detailed output
# logging.basicConfig(level=logging.DEBUG)
logging.getLogger('hpbandster').setLevel(logging.WARNING)

argparser = argparse.ArgumentParser()
argparser.add_argument("--train_size", type=int, default=10)
argparser.add_argument("--runs", type=int, default=1)
argparser.add_argument('--n_workers', type=int, default=10)
args = argparser.parse_args()

# BOHB settings: number of iterations and the minimum/maximum budget handed to the workers
iteration = 40
min_b = 5
max_b = 30

print("###############################################", flush=True)
print("Training size: {}".format(args.train_size), flush=True)
print("Parallel worker size: {}".format(args.n_workers), flush=True)
print("###############################################", flush=True)

# Lower and upper bounds of the two input variables
ub = np.array([8.0, 8.0])
lb = np.array([0.0, 0.0])

bounds = [(lb[i], ub[i]) for i in range(len(lb))]

# Generate the training inputs from a Halton sequence and evaluate the Branin function
num_dim = 2
samples = args.train_size
sampler_x_train = qmc.Halton(d=num_dim, scramble=False)
sample_x_train = sampler_x_train.random(n=samples)
x = qmc.scale(sample_x_train, lb, ub)
y = branin(x)

# Split the training data into training and cross-validation data
x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size=0.2)

# Arrays to store per-run results: optimum location/value, selected hyperparameters, and errors
x_opt = np.zeros((args.runs, num_dim))
f_opt = np.zeros((args.runs, 1))
epoch = np.zeros((args.runs, 1))
activation = np.zeros((args.runs, 1))
layer = np.zeros((args.runs, 1))
neuron = np.zeros((args.runs, 1))
training_nrmse = np.zeros((args.runs, 1))
loss = np.zeros((args.runs, 1))

store_times = []

# Serialize training data to pass to workers
data = {
    'x_train': x_train,
    'y_train': y_train,
    'x_cv': x_cv,
    'y_cv': y_cv
}
with open('train_data.pkl', 'wb') as f:
    pickle.dump(data, f)

# Running the optimization loop
for idx in range(args.runs):
    print("\nIteration: {}".format(idx + 1), flush=True)

    # Start a nameserver
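    # The nameserver is how the BOHB master and the spawned worker processes find each other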
    NS = hpns.NameServer(run_id='bohb', host='127.0.0.1', port=None)
    NS.start()

    # Start worker processes using subprocess
    worker_processes = []
    for i in range(args.n_workers):
        # Launch a worker process using imp_functions_bohb_workers.py, located two levels up from the current directory
        worker_cmd = f'python ../../imp_functions_bohb_workers.py --run_id bohb --host 127.0.0.1 --worker {i}'
        worker_processes.append(subprocess.Popen(worker_cmd, shell=True))

    # Give workers some time to start up and register
    time.sleep(5)

    tic = time.time()

    # Run hpbandster
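    # run_hpbandster comes from the worker module; it is expected to define the search space,
    # run a BOHB optimizer against the nameserver started above, and return an hpbandster
    # Result object (a sketch of the worker module is given after this listing)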
    res = run_hpbandster(x_train, y_train, x_cv, y_cv, iteration, min_b, max_b)

    # Extract the best configuration
    best_config = res.get_id2config_mapping()[res.get_incumbent_id()]['config']
    best_loss = res.get_runs_by_id(res.get_incumbent_id())[-1].loss
    best_loss = np.array(best_loss)
    print(f"Best objective (loss for training + CV): {best_loss}", flush=True)
    print(f"Best hyperparameters: {best_config}", flush=True)

    opt_params = {
        "num_epochs": best_config["epoch"],
        "activation": best_config["act"],
        "num_hidden_layers": best_config["layer"],
        "num_neurons": best_config["neuron"]
    }

    # Train the final model with the best hyperparameters; train() also returns the input/output scalers
    model, x_transform, y_transform = train(x_train, y_train, opt_params)

    # Transform the data
    x_train = x_transform.transform(x_train)
    # Predict at training data
    y_pred = model(x_train)
    # Transform back to original scale
    x_train = x_transform.inverse_transform(x_train)
    y_pred = y_transform.inverse_transform(y_pred)

    # NRMSE: root-mean-square error normalized by the range of the training targets
    training_loss = np.sqrt(mean_squared_error(y_train, y_pred)) / np.ptp(y_train)

    print("Training NRMSE: {}".format(training_loss), flush=True)

    # Minimize the NN model
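    # predict (from the worker module) presumably applies the scalers and evaluates the
    # trained network, so the search runs directly over the original bounds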
    result = differential_evolution(predict, bounds, mutation=0.5, recombination=0.9,
                                    args=(x_transform, y_transform, model), polish=True, disp=False)

    print("Optimal x: {}".format(result.x), flush=True)
    print("Optimal f: {}".format(result.fun), flush=True)

    toc = time.time()
    print(f"Elapsed time : {toc - tic} seconds")

    epoch[idx, 0] = opt_params['num_epochs']
    activation[idx, 0] = opt_params['activation']
    layer[idx, 0] = opt_params['num_hidden_layers']
    neuron[idx, 0] = opt_params['num_neurons']

    x_opt[idx, 0] = result.x[0]
    x_opt[idx, 1] = result.x[1]
    f_opt[idx, 0] = result.fun
    training_nrmse[idx, 0] = training_loss
    loss[idx, 0] = best_loss

    # Record the wall-clock time of this run
    times_bohb = toc - tic
    store_times.append(times_bohb)

    data = {
        "x": x_opt,
        "fun": f_opt,
        "training_nrmse": training_nrmse,
        "time": store_times,
        "activation": activation,
        "epoch": epoch,
        "layer": layer,
        "neuron": neuron,
        "loss" : loss,
    }

    savemat("result.mat", data)

    # Shut down the nameserver
    NS.shutdown()

    # Terminate worker processes
    for p in worker_processes:
        p.terminate()
        p.wait()
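
The driver above relies on a companion module, imp_functions_bohb_workers.py, which supplies branin, train, predict, and run_hpbandster and also serves as the worker script launched by the subprocess command. That module is not shown here, so the following is only a rough sketch of how its BOHB-specific parts could look with hpbandster and ConfigSpace. The hyperparameter names ('epoch', 'act', 'layer', 'neuron') and the command-line flags are taken from the driver; the value ranges, the integer encoding of the activation, the use of scikit-learn's MLPRegressor in place of the unshown train routine, and the use of the budget as a cap on training iterations are all assumptions:

# Hypothetical sketch of the BOHB-specific parts of imp_functions_bohb_workers.py.
# branin, train, and predict from the original module are omitted.
import argparse
import pickle

import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from hpbandster.core.worker import Worker
from hpbandster.optimizers import BOHB
from sklearn.neural_network import MLPRegressor


def get_configspace():
    # Hyperparameter names match those read from best_config in the driver.
    # 'act' is an integer code so the driver can store it in a numeric array
    # (0: relu, 1: tanh, 2: logistic -- an assumed encoding).
    cs = CS.ConfigurationSpace()
    cs.add_hyperparameters([
        CSH.UniformIntegerHyperparameter('epoch', lower=50, upper=500),
        CSH.UniformIntegerHyperparameter('act', lower=0, upper=2),
        CSH.UniformIntegerHyperparameter('layer', lower=1, upper=4),
        CSH.UniformIntegerHyperparameter('neuron', lower=4, upper=64),
    ])
    return cs


class NNWorker(Worker):
    """Trains a small regression network and reports the cross-validation MSE as the loss."""

    def __init__(self, x_train, y_train, x_cv, y_cv, **kwargs):
        super().__init__(**kwargs)
        self.x_train, self.y_train = x_train, y_train
        self.x_cv, self.y_cv = x_cv, y_cv

    def compute(self, config, budget, **kwargs):
        # Assumption: the budget handed out by BOHB caps the number of training iterations
        activation = ['relu', 'tanh', 'logistic'][config['act']]
        model = MLPRegressor(
            hidden_layer_sizes=(config['neuron'],) * config['layer'],
            activation=activation,
            max_iter=int(min(config['epoch'], 20 * budget)),
        )
        model.fit(self.x_train, self.y_train.ravel())
        cv_mse = float(np.mean((model.predict(self.x_cv) - self.y_cv.ravel()) ** 2))
        return {'loss': cv_mse, 'info': {'budget': budget}}


def run_hpbandster(x_train, y_train, x_cv, y_cv, n_iterations, min_budget, max_budget):
    # Runs the BOHB master against the nameserver and the workers started by the driver,
    # and returns the hpbandster Result object the driver inspects. The data arguments
    # match the driver's call but are not needed here, because each worker loads the
    # pickled data itself.
    bohb = BOHB(configspace=get_configspace(), run_id='bohb', nameserver='127.0.0.1',
                min_budget=min_budget, max_budget=max_budget)
    res = bohb.run(n_iterations=n_iterations)
    bohb.shutdown(shutdown_workers=False)  # the driver terminates the worker processes itself
    return res


if __name__ == '__main__':
    # Worker entry point, launched by the driver as
    # python imp_functions_bohb_workers.py --run_id bohb --host 127.0.0.1 --worker <i>
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_id', type=str, default='bohb')
    parser.add_argument('--host', type=str, default='127.0.0.1')
    parser.add_argument('--worker', type=int, default=0)
    args = parser.parse_args()

    # The driver pickles the training/cross-validation split before starting the workers
    with open('train_data.pkl', 'rb') as f:
        data = pickle.load(f)

    w = NNWorker(data['x_train'], data['y_train'], data['x_cv'], data['y_cv'],
                 run_id=args.run_id, host=args.host, nameserver='127.0.0.1',
                 id=args.worker)
    w.run(background=False)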