Hyperband

Hyperband is a resource-efficient HPO technique that uses the successive halving algorithm (SHA) to allocate resources dynamically among HP configurations. It starts by evaluating many configurations with a small resource budget and progressively eliminates the lower-performing ones, allocating more resources to the promising configurations that survive. To hedge against discarding slow starters too early, Hyperband repeats this procedure over several SHA brackets that trade off the number of configurations against the budget each one receives. Hyperband offers scalability and parallelization, but its reliance on random sampling can lead to suboptimal HPs in large search spaces.
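
To make the mechanism concrete, the snippet below is a minimal sketch of a single successive-halving bracket, not the hpbandster implementation used later; sample_config and evaluate(config, budget) are hypothetical user-supplied callbacks, and Hyperband itself launches several such brackets with different trade-offs between n_configs and min_budget:

import numpy as np

def successive_halving(sample_config, evaluate, n_configs=27, min_budget=1, eta=3):
    # One SHA bracket: sample many configurations, evaluate them on a small
    # budget, keep the best 1/eta fraction, and give the survivors eta times
    # more budget at the next rung.
    configs = [sample_config() for _ in range(n_configs)]
    budget = min_budget
    while len(configs) > 1:
        losses = [evaluate(c, budget) for c in configs]   # cheap evaluations first
        n_keep = max(1, len(configs) // eta)              # survivors for the next rung
        best = np.argsort(losses)[:n_keep]
        configs = [configs[i] for i in best]
        budget *= eta                                     # promoted configs get more resources
    return configs[0]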

The following piece of code offers a complete example built on the hpbandster implementation of Hyperband, with the configuration evaluations distributed over parallel worker processes:

# Helper module providing branin, train, predict, run_hpbandster and the HB worker class
from imp_functions_hb_workers import *
from scipy.optimize import differential_evolution

import argparse
import os
import subprocess
import time
import pickle
import numpy as np
from scipy.io import savemat
from scipy.stats import qmc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import logging
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import HyperBand

# Keep hpbandster's logging quiet; switch to logging.DEBUG for verbose output
# logging.basicConfig(level=logging.DEBUG)
logging.getLogger('hpbandster').setLevel(logging.WARNING)

argparser = argparse.ArgumentParser()
argparser.add_argument("--train_size", type=int, default=10)
argparser.add_argument("--runs", type=int, default=1)
argparser.add_argument('--n_workers', type=int, default=10)
args = argparser.parse_args()

# Hyperband settings: number of optimizer iterations and the minimum and
# maximum budget handed to each configuration
iteration = 40
min_b = 5
max_b = 30

print("###############################################", flush=True)
print("Training size: {}".format(args.train_size), flush=True)
print("Parallel worker size: {}".format(args.n_workers), flush=True)
print("###############################################", flush=True)

# Bounds
ub = np.array([8.0, 8.0])
lb = np.array([0.0, 0.0])

bounds = [(lb[i], ub[i]) for i in np.arange(len(lb))]

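# Training data: scaled Halton samples evaluated on the Branin test function
# (branin is provided by the imported helper module)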
num_dim = 2
samples = args.train_size
sampler_x_train = qmc.Halton(d=num_dim, scramble=False)
sample_x_train = sampler_x_train.random(n=samples)
x = qmc.scale(sample_x_train, lb, ub)
y = branin(x)

# Split the training data into training and cross-validation data
x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size=0.2)

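# Preallocate result arrays: one row per optimization run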
x_opt = np.zeros((args.runs, num_dim))
f_opt = np.zeros((args.runs, 1))
epoch = np.zeros((args.runs, 1))
activation = np.zeros((args.runs, 1))
layer = np.zeros((args.runs, 1))
neuron = np.zeros((args.runs, 1))
training_nrmse = np.zeros((args.runs, 1))
loss = np.zeros((args.runs, 1))

store_times = []

# Serialize training data to pass to workers
data = {
    'x_train': x_train,
    'y_train': y_train,
    'x_cv': x_cv,
    'y_cv': y_cv
}
with open('train_data.pkl', 'wb') as f:
    pickle.dump(data, f)

# Running the optimization loop
for idx in range(args.runs):
    print("\nIteration: {}".format(idx + 1), flush=True)

    # Start a nameserver
    NS = hpns.NameServer(run_id='hb', host='127.0.0.1', port=None)
    NS.start()

    # Start worker processes using subprocess
    worker_processes = []
    for i in range(args.n_workers):
        # Launch the worker script, which is located two directory levels above the current one
        worker_cmd = f'python ../../imp_functions_hb_workers.py --run_id hb --host 127.0.0.1 --worker {i}'
        worker_processes.append(subprocess.Popen(worker_cmd, shell=True))

    # Give workers some time to start up and register
    time.sleep(5)

    tic = time.time()

    # Run the Hyperband optimization through the run_hpbandster helper from the worker module
    res = run_hpbandster(x_train, y_train, x_cv, y_cv, iteration, min_b, max_b)

    # Extract the best configuration
    best_config = res.get_id2config_mapping()[res.get_incumbent_id()]['config']
    best_loss = res.get_runs_by_id(res.get_incumbent_id())[-1]['loss']
    best_loss = np.array(best_loss)
    print(f"Best objective (loss for training + CV): {best_loss}", flush=True)
    print(f"Best hyperparameters: {best_config}", flush=True)

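    # Collect the best HPs so the final model can be retrained with them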
    opt_params = {
        "num_epochs": best_config["epoch"],
        "activation": best_config["act"],
        "num_hidden_layers": best_config["layer"],
        "num_neurons": best_config["neuron"]
    }

    # Get the model
    model, x_transform, y_transform = train(x_train, y_train, opt_params)

    # Transform the data
    x_train = x_transform.transform(x_train)
    # Predict at training data
    y_pred = model(x_train)
    # Transform back to original scale
    x_train = x_transform.inverse_transform(x_train)
    y_pred = y_transform.inverse_transform(y_pred)

    # NRMSE: RMSE normalized by the range of the training targets
    training_loss = np.sqrt(mean_squared_error(y_train, y_pred)) / np.ptp(y_train)

    print("Training NRMSE: {}".format(training_loss), flush=True)

    # Minimize the NN model
    result = differential_evolution(predict, bounds, mutation=0.5, recombination=0.9,
                                    args=(x_transform, y_transform, model), polish=True, disp=False)

    print("Optimal x: {}".format(result.x), flush=True)
    print("Optimal f: {}".format(result.fun), flush=True)

    toc = time.time()
    print(f"Elapsed time : {toc - tic} seconds")

    epoch[idx, 0] = opt_params['num_epochs']
    activation[idx, 0] = opt_params['activation']
    layer[idx, 0] = opt_params['num_hidden_layers']
    neuron[idx, 0] = opt_params['num_neurons']

    x_opt[idx, 0] = result.x[0]
    x_opt[idx, 1] = result.x[1]
    f_opt[idx, 0] = result.fun
    training_nrmse[idx, 0] = training_loss
    loss[idx, 0] = best_loss

    times_hb = toc - tic

    store_times.append(times_hb)

    data = {
        "x": x_opt,
        "fun": f_opt,
        "training_nrmse": training_nrmse,
        "time": store_times,
        "activation": activation,
        "epoch": epoch,
        "layer": layer,
        "neuron": neuron,
        "loss" : loss,
    }

    savemat("result.mat", data)

    # Shutdown the optimizer and nameserver
    NS.shutdown()

    # Terminate worker processes
    for p in worker_processes:
        p.terminate()
        p.wait()
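
The script above depends on a helper module, imp_functions_hb_workers.py, which supplies branin, train, predict, and run_hpbandster through the star import and is also launched once per parallel worker. That module is not reproduced here; the following is only a rough sketch of what its Hyperband-specific parts might look like, assuming the hyperparameter names epoch, act, layer, and neuron read from best_config above, the standard hpbandster Worker and ConfigSpace APIs, and a hypothetical train_and_score helper (the value ranges are illustrative, and branin, train, and predict are omitted):

# Sketch of a possible imp_functions_hb_workers.py (not the original module)
import argparse
import pickle

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from hpbandster.core.worker import Worker
from hpbandster.optimizers import HyperBand


class SurrogateWorker(Worker):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the training data serialized by the driver script
        with open('train_data.pkl', 'rb') as f:
            self.data = pickle.load(f)

    def compute(self, config, budget, **kwargs):
        # Train a surrogate for this config, spending at most `budget` units of
        # training effort, and report the combined training + CV loss to Hyperband
        loss = train_and_score(self.data, config, budget)  # hypothetical helper
        return {'loss': float(loss), 'info': {}}

    @staticmethod
    def get_configspace():
        # Hyperparameter names match those read from best_config in the driver;
        # the ranges below are illustrative only
        cs = CS.ConfigurationSpace()
        cs.add_hyperparameters([
            CSH.UniformIntegerHyperparameter('epoch', lower=50, upper=500),
            CSH.CategoricalHyperparameter('act', choices=[0, 1, 2]),
            CSH.UniformIntegerHyperparameter('layer', lower=1, upper=5),
            CSH.UniformIntegerHyperparameter('neuron', lower=4, upper=64),
        ])
        return cs


def run_hpbandster(x_train, y_train, x_cv, y_cv, n_iterations, min_budget, max_budget):
    # Presumed driver-side wrapper; the data arguments are unused here because
    # the workers read train_data.pkl themselves
    hb = HyperBand(configspace=SurrogateWorker.get_configspace(), run_id='hb',
                   nameserver='127.0.0.1', min_budget=min_budget, max_budget=max_budget)
    res = hb.run(n_iterations=n_iterations)
    hb.shutdown(shutdown_workers=True)
    return res


if __name__ == '__main__':
    # Entry point used by the driver, which launches one process per worker
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_id', type=str, default='hb')
    parser.add_argument('--host', type=str, default='127.0.0.1')
    parser.add_argument('--worker', type=int, default=0)
    args = parser.parse_args()

    w = SurrogateWorker(run_id=args.run_id, nameserver=args.host, id=args.worker)
    w.run(background=False)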