Adaptive Sampling

This notebook implements adaptive sampling, which explores under-sampled parts of the design space while exploiting the knowledge gained from previous simulation runs. The aim is to reduce the number of samples required to train an accurate surrogate model. We use LOLA-Voronoi sampling, which is model-agnostic and therefore compatible with any surrogate model type; here it is applied with a Gaussian Process model.

import warnings

import numpy as np
import seaborn as sns
from besos import eppy_funcs as ef, sampling
from besos.evaluator import EvaluatorEP
from besos.problem import EPProblem
from dask.distributed import Client
from matplotlib import pyplot as plt
from scipy.interpolate import griddata
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.manifold import MDS
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

from parameter_sets import parameter_set
from sampling import adaptive_sampler_lv
client = Client(threads_per_worker=1)
client

Client: 16 workers, 16 cores, 68.72 GB memory

Define sampler settings

n_samples_init = 20  # initial set of samples, collected using standard low-discrepancy (Latin hypercube) sampling
no_iter = 10  # number of iterations of the adaptive sampler to run
n = 4  # number of samples added per iteration
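
These three settings fix the total simulation budget: the initial set plus the adaptive additions.

print(f"Total simulations: {n_samples_init + no_iter * n}")  # 20 + 10 * 4 = 60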

Generate data set

This generates an example model and the initial set of sampling data (see this example for details).

parameters = parameter_set(7)
problem = EPProblem(parameters, ["Electricity:Facility"])
building = ef.get_building()
inputs = sampling.dist_sampler(sampling.lhs, problem, n_samples_init)
evaluator = EvaluatorEP(problem, building)
outputs = evaluator.df_apply(inputs, processes=4)
results = inputs.join(outputs)
results.head()
   Conductivity  Thickness  U-Factor  Solar Heat Gain Coefficient  ElectricEquipment     Lights  Window to Wall Ratio  Electricity:Facility
0      0.068922   0.278260  0.589089                     0.808344          11.278208  12.836307              0.114060          1.994176e+09
1      0.165364   0.199395  2.800824                     0.907124          10.950859  10.328810              0.502100          1.845950e+09
2      0.159571   0.249230  0.290241                     0.980416          14.532254  11.521821              0.884315          2.152789e+09
3      0.083254   0.112736  1.924484                     0.297179          14.368944  12.232532              0.804670          2.119206e+09
4      0.028942   0.203402  4.931487                     0.079449          11.817406  14.989872              0.679059          2.103819e+09

Initial training of the surrogate model

Here we use a Gaussian Process surrogate model; see here for details.

train_in, test_in, train_out, test_out = train_test_split(
    inputs, outputs, test_size=0.2
)
hyperparameters = {
    "kernel": [
        None,
        1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
        1.0 * RationalQuadratic(length_scale=1.0, alpha=0.5),
        # ConstantKernel(0.1, (0.01, 10.0))*(DotProduct(sigma_0=1.0, sigma_0_bounds=(0.1, 10.0))**2),
        1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
    ]
}

folds = 3

gp = GaussianProcessRegressor(normalize_y=True)

# Note: the `iid` argument was removed from GridSearchCV in scikit-learn 0.24
clf = GridSearchCV(gp, hyperparameters, cv=folds)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    # Fit on the training split only, so the held-out test set gives an
    # unbiased performance estimate (fitting on all inputs leaks the test data)
    clf.fit(train_in, train_out)

print(f"The best performing model R^2 score on the validation set: {clf.best_score_}")
print(f"The best model parameters: {clf.best_params_}")
print(
    f"The best performing model R^2 score on a separate test set: {clf.best_estimator_.score(test_in, test_out)}"
)
reg = clf.best_estimator_
The best performing model R^2 score on the validation set: 0.8932778938504393
The best model parameters: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
The best performing model R^2 score on a separate test set: 1.0
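
To see how the candidate kernels compare, the cross-validation results stored by GridSearchCV can be listed (an optional inspection step using the standard cv_results_ attribute):

for params, score in zip(clf.cv_results_["params"], clf.cv_results_["mean_test_score"]):
    print(f"{score:.3f}  {params['kernel']}")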

LOLA-Voronoi sampling

Here we run our implementation of LOLA-Voronoi sampling. New designs to be simulated are picked around previously simulated designs with a high hybrid score \(H\):

\(H = V + E\)

where \(V\) incentivizes exploration and \(E\) incentivizes exploitation. \(V\) is the size of a sample's Voronoi cell, which approximates the local sample density (large cells indicate under-sampled regions), and \(E\) is the error of a local linear approximation of the output in the neighbourhood of a sample, which estimates how strongly the gradient varies there.
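
The adaptive_sampler_lv class used below wraps all of these steps. For intuition only, here is a minimal sketch of how such a hybrid score could be computed, assuming a Monte Carlo estimate of the Voronoi cell sizes and a least-squares local linear fit. This is an illustration, not the besos implementation:

import numpy as np


def hybrid_scores(P, y, n_mc=10000, rng=None):
    """Illustrative H = V + E scores for samples P (n x d) with outputs y."""
    rng = np.random.default_rng(rng)
    n, d = P.shape
    k = 2 * d  # neighbourhood size; 2d neighbours is a common choice

    # V: Monte Carlo estimate of relative Voronoi cell sizes -- the fraction of
    # random points whose nearest sample is P[i] approximates the size of cell i
    mc = rng.uniform(P.min(axis=0), P.max(axis=0), size=(n_mc, d))
    nearest = np.argmin(((mc[:, None, :] - P[None, :, :]) ** 2).sum(axis=-1), axis=1)
    V = np.bincount(nearest, minlength=n) / n_mc

    # E: residual of a local linear fit around each sample, a proxy for how
    # nonlinear (strongly varying) the response is in that neighbourhood
    E = np.empty(n)
    for i in range(n):
        nbrs = np.argsort(np.linalg.norm(P - P[i], axis=1))[1 : k + 1]
        A = np.hstack([P[nbrs], np.ones((k, 1))])
        coef, *_ = np.linalg.lstsq(A, y[nbrs], rcond=None)
        E[i] = np.abs(y[nbrs] - A @ coef).sum()

    # Normalize both terms so exploration and exploitation carry equal weight
    return V / V.sum() + E / E.sum()

New designs would then be simulated near the samples with the highest scores, the surrogate refitted, and the scores recomputed.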

AS = adaptive_sampler_lv(
    train_in.values,
    train_out.values,
    n,
    problem,
    evaluator,
    reg,
    test_in,
    test_out,
    verbose=False,
)
AS.run(no_iter)  # run the number of adaptive iterations defined above
[Output: a progress bar for each of the 10 adaptive iterations (4 simulations per batch). scikit-learn occasionally emits a ConvergenceWarning from the GP's L-BFGS hyperparameter optimizer, suggesting increasing max_iter or scaling the data.]
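
After the run, AS.P holds every design simulated by the sampler (it is used below for the projections) and AS.score tracks the surrogate's test score per iteration. A quick check of the gathered data:

print(AS.P.shape)  # (total number of simulated designs, 7 inputs)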

Visualization

First we plot how the surrogate's \(R^2\) score evolves as samples are added. We then visualize where the adaptive sampler places new designs by reducing the input dimensionality to 2 using multi-dimensional scaling (MDS).

plt.plot(range(n_samples_init, n_samples_init + no_iter * n, n), AS.score[:-1], "-o")
plt.ylabel("R^2 score")
plt.xlabel("No. of samples")
../../_images/FitGPAdaptive_14_1.png
# Manifold model: reduce the 7-D inputs to 2-D with multi-dimensional scaling
scaler_mds = StandardScaler()
p_norm = scaler_mds.fit_transform(AS.P)
model = MDS(n_components=2, random_state=1)
out3 = model.fit_transform(p_norm)

# Grey: initial low-discrepancy samples; red: adaptively added samples
plt.scatter(out3[:n_samples_init, 0], out3[:n_samples_init, 1], color="grey")
plt.scatter(out3[n_samples_init:, 0], out3[n_samples_init:, 1], color="r")
plt.axis("equal")
../../_images/FitGPAdaptive_15_1.png
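
MDS is only an approximate embedding of the seven input dimensions. The fitted model's stress_ attribute gives a rough sense of how much distance distortion the 2-D projection introduces (lower is better):

print(f"MDS stress: {model.stress_:.2f}")
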
# Density of the adaptively added samples. Keyword arguments (and fill=True
# instead of the deprecated shade=True) avoid the seaborn FutureWarning.
ax = sns.kdeplot(x=out3[n_samples_init:, 0], y=out3[n_samples_init:, 1], fill=True)
plt.ylim([-4, 4])
plt.xlim([-4, 4])
../../_images/FitGPAdaptive_16_2.png
x = out3[:, 0]
y = out3[:, 1]
z = reg.predict(AS.P)

# x, y, z are already 1-D numpy arrays, so no element-by-element conversion is needed
X, Y, Z = x, y, z

# create x-y points to be used in heatmap
xi = np.linspace(X.min(), X.max(), 1000)
yi = np.linspace(Y.min(), Y.max(), 1000)

# Z is a matrix of x-y values
zi = griddata((X, Y), Z, (xi[None, :], yi[:, None]), method="cubic")

# Limit the colorbar range by masking values outside the range of interest.
# Cubic interpolation leaves NaNs outside the convex hull of the samples, so
# the comparisons are wrapped in errstate to silence NaN RuntimeWarnings.
zmin = 1e9
zmax = 3e9
with np.errstate(invalid="ignore"):
    zi[(zi < zmin) | (zi > zmax)] = np.nan

# Create the contour plot
plt.figure(figsize=(10, 10))
CS = plt.contourf(xi, yi, zi, 10, cmap=plt.cm.rainbow, vmax=zmax, vmin=zmin, alpha=0.8)
plt.colorbar()
plt.scatter(out3[:n_samples_init, 0], out3[:n_samples_init, 1], color="grey")
plt.scatter(out3[n_samples_init:, 0], out3[n_samples_init:, 1], color="r")
# ax = sns.kdeplot(out3[18:,0], out3[18:,1], shade=True)


plt.ylim([-4, 4])
plt.xlim([-4, 4])
plt.grid()
../../_images/FitGPAdaptive_17_1.png