==================== Random Search ====================

Random search samples hyperparameter (HP) configurations at random from user-specified distributions. For large HP spaces it is more efficient than grid search: with the same computational budget it covers more distinct values of each hyperparameter, which reduces computation time and increases the chance of discovering a near-optimal configuration. However, it does not use previous results to guide future samples, so it remains less sample-efficient than model-based methods. The following piece of code offers an example::

    import argparse
    import time

    import numpy as np
    from scipy.io import savemat
    from scipy.optimize import differential_evolution
    from scipy.stats import qmc
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    # Problem-specific helpers: branin, objective, train and predict
    from imp_functions import *

    # Parse the command-line arguments
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--train_size", type=int, default=100)
    argparser.add_argument("--test_size", type=int, default=50)
    argparser.add_argument("--runs", type=int, default=1)
    args = argparser.parse_args()

    train_size = args.train_size
    test_size = args.test_size  # parsed but not used in this snippet
    runs = args.runs

    print("###############################################", flush=True)
    print("Training size: {}".format(train_size), flush=True)
    print("###############################################", flush=True)

    ########################### Create architectures for random search
    # 256 candidates, matching the 4*4*4*4 = 256 architectures of the grid search
    pool = 256
    architectures = np.zeros((pool, 4))
    for i in np.arange(pool):
        w = np.round(np.random.uniform(low=1000, high=10000, size=1))  # epochs
        x = np.round(np.random.uniform(low=4, high=16, size=1))        # neurons
        y = np.round(np.random.uniform(low=1, high=4, size=1))         # hidden layers
        z = np.round(np.random.uniform(low=1, high=4, size=1))         # activation id
        architectures[i] = np.hstack((w, x, y, z))
    architectures = architectures.astype(int)

    ########################### Create the dataset
    # Bounds of the design space
    ub = np.array([8.0, 8.0])
    lb = np.array([0.0, 0.0])
    bounds = []
    for i in np.arange(len(lb)):
        bounds.append((lb[i], ub[i]))

    u_bounds = [8.0, 8.0]
    l_bounds = [0.0, 0.0]
    num_dim = 2
    samples = train_size

    # Halton sampling of the training inputs, scaled to the bounds
    sampler_x_train = qmc.Halton(d=num_dim, scramble=False)
    sample_x_train = sampler_x_train.random(n=samples)
    x_mean = qmc.scale(sample_x_train, l_bounds, u_bounds)
    y_mean = branin(x_mean)

    # Hold out 20% of the data for cross-validation
    x_train, x_cv, y_train, y_cv = train_test_split(x_mean, y_mean, test_size=0.2)

    x_opt = np.zeros((runs, num_dim))
    f_opt = np.zeros((runs, 1))
    epoch = np.zeros((runs, 1))
    activation = np.zeros((runs, 1))
    layer = np.zeros((runs, 1))
    neuron = np.zeros((runs, 1))
    training_nrmse = np.zeros((runs, 1))
    loss_training_cv = np.zeros((runs, 1))
    store_times = []

    ########################### Running random search multiple times
    for idx in range(runs):
        print("\nIteration: {}".format(idx + 1), flush=True)
        tic = time.time()

        loss = np.inf
        for architecture in architectures:
            parameters = {}
            parameters['epochs'] = architecture[0]
            parameters['neurons'] = architecture[1]
            parameters['num_hidden_layers'] = architecture[2]
            parameters['activation'] = architecture[3]

            # Evaluate the candidate hyperparameters
            obj = objective(x_train, y_train, x_cv, y_cv, parameters)

            # Keep the configuration if it improves on the best objective so far
            if obj < loss:
                loss = obj
                opt_params = parameters

        best_loss = np.array(loss)
        print(f"Best objective (loss for training + CV): {best_loss}", flush=True)
        print("Optimal hyperparameters: {}".format(opt_params), flush=True)

        # Train the model with the optimal hyperparameters
        model, x_transform, y_transform = train(x_train, y_train, opt_params)

        # Transform the data
        x_train = x_transform.transform(x_train)

        # Predict at the training data
        y_pred = model(x_train)

        # Transform back to the original scale
        x_train = x_transform.inverse_transform(x_train)
        y_pred = y_transform.inverse_transform(y_pred)

        # RMSE normalised by the range of the training targets
        training_loss = np.sqrt(mean_squared_error(y_train, y_pred)) / np.ptp(y_train)
        print("Training NRMSE: {}".format(training_loss), flush=True)

        ########################### Minimize the NN model
        result = differential_evolution(predict, bounds, mutation=0.5,
                                        recombination=0.9,
                                        args=(x_transform, y_transform, model),
                                        polish=True, disp=False)
        print("Optimal x: {}".format(result.x), flush=True)
        print("Optimal f: {}".format(result.fun), flush=True)

        toc = time.time()
        print(f"Elapsed time: {toc - tic} seconds", flush=True)

        # Store the optimal hyperparameters, not the last candidate tried
        epoch[idx, 0] = opt_params['epochs']
        activation[idx, 0] = opt_params['activation']
        layer[idx, 0] = opt_params['num_hidden_layers']
        neuron[idx, 0] = opt_params['neurons']
        x_opt[idx, 0] = result.x[0]
        x_opt[idx, 1] = result.x[1]
        f_opt[idx, 0] = result.fun
        training_nrmse[idx, 0] = training_loss
        loss_training_cv[idx, 0] = best_loss

        # Accumulate the wall-clock time across runs
        times_rs = toc - tic
        if idx == 0:
            store_times.append(times_rs)
        else:
            times_rs += store_times[idx - 1]
            store_times.append(times_rs)

        data = {
            "x": x_opt,
            "fun": f_opt,
            "training_nrmse": training_nrmse,
            "time": store_times,
            "activation": activation,
            "epoch": epoch,
            "layer": layer,
            "neuron": neuron,
            "loss": loss_training_cv,
        }
        savemat("result.mat", data)
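Random search is also available off the shelf in scikit-learn. The sketch below is a minimal, hypothetical alternative that does not use the imp_functions helpers above: it tunes an MLPRegressor with RandomizedSearchCV, and the toy data, parameter names, and ranges are illustrative assumptions rather than the configuration used in this section::

    import numpy as np
    from scipy.stats import randint
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.neural_network import MLPRegressor

    # Toy data standing in for the Halton/Branin samples above (illustrative only)
    rng = np.random.default_rng(0)
    X = rng.uniform(0.0, 8.0, size=(100, 2))
    y = np.sin(X[:, 0]) + np.cos(X[:, 1])

    # Distributions to sample from; ranges are assumptions, not those used above
    param_distributions = {
        "hidden_layer_sizes": [(n,) * d for n in (4, 8, 12, 16) for d in (1, 2, 3, 4)],
        "activation": ["relu", "tanh", "logistic"],
        "max_iter": randint(1000, 10000),
    }

    search = RandomizedSearchCV(
        MLPRegressor(solver="adam"),
        param_distributions=param_distributions,
        n_iter=50,  # number of random configurations to try
        cv=5,       # 5-fold cross-validation
        scoring="neg_root_mean_squared_error",
        random_state=0,
    )
    search.fit(X, y)
    print(search.best_params_)
    print(-search.best_score_)  # best cross-validated RMSE

Unlike the hand-written loop above, RandomizedSearchCV draws a fresh configuration per trial from the given distributions and handles the cross-validation bookkeeping internally.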
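The results written by savemat can be inspected later with scipy.io.loadmat, for example::

    from scipy.io import loadmat

    result = loadmat("result.mat")
    print(result["x"])     # optimum found in each run
    print(result["fun"])   # objective value at each optimum
    print(result["time"])  # cumulative wall-clock time per run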