70 KiB
70 KiB
<html>
<head>
</head>
</html>
Hyperparameter Tuning (Supplementary Notebook)¶
This notebook explores a grid search with repeated k-fold cross validation scheme for tuning the hyperparameters of the LightGBM model used in forecasting the M5 dataset. In general, the techniques used below can be also be adapted for other forecasting models, whether they be classical statistical models or machine learning methods.
Prepared by: Sebastian C. Ibañez
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
plt.rcParams['figure.figsize'] = [15, 5]
np.set_printoptions(precision = 6, suppress = True)
### CREATE X,Y ####
def create_xy(series, window_size, prediction_horizon, shuffle = False):
x = []
y = []
for i in range(0, len(series)):
if len(series[(i + window_size):(i + window_size + prediction_horizon)]) < prediction_horizon:
break
x.append(series[i:(i + window_size)])
y.append(series[(i + window_size):(i + window_size + prediction_horizon)])
x = np.array(x)
y = np.array(y)
return x,y
date_list = [d.strftime('%Y-%m-%d') for d in pd.date_range(start = '2011-01-29', end = '2016-04-24')]
df_sales = pd.read_csv('../data/m5/sales_train_validation.csv')
df_sales.rename(columns = dict(zip(df_sales.columns[6:], date_list)), inplace = True)
df_sales_total_by_store = df_sales.groupby(['store_id']).sum()
series = df_sales_total_by_store.iloc[0]
### HYPERPARAMETERS ###
window_size = 365
prediction_horizon = 1
### TRAIN VAL SPLIT ### (include shuffling later)
test_size = 28
split_time = len(series) - test_size
train_series = series[:split_time]
test_series = series[split_time - window_size:]
train_x, train_y = create_xy(train_series, window_size, prediction_horizon)
test_x, test_y = create_xy(test_series, window_size, prediction_horizon)
train_y = train_y.flatten()
test_y = test_y.flatten()
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-1-f8af59ced7b0> in <module>
2 import pandas as pd
3 import matplotlib.pyplot as plt
----> 4 import lightgbm as lgb
5 from sklearn.model_selection import cross_val_score
6 from sklearn.model_selection import RepeatedKFold
~/opt/anaconda3/envs/atsa/lib/python3.7/site-packages/lightgbm/__init__.py in <module>
6 from __future__ import absolute_import
7
----> 8 from .basic import Booster, Dataset
9 from .callback import (early_stopping, print_evaluation, record_evaluation,
10 reset_parameter)
~/opt/anaconda3/envs/atsa/lib/python3.7/site-packages/lightgbm/basic.py in <module>
41
42
---> 43 _LIB = _load_lib()
44
45
~/opt/anaconda3/envs/atsa/lib/python3.7/site-packages/lightgbm/basic.py in _load_lib()
32 if len(lib_path) == 0:
33 return None
---> 34 lib = ctypes.cdll.LoadLibrary(lib_path[0])
35 lib.LGBM_GetLastError.restype = ctypes.c_char_p
36 callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
~/opt/anaconda3/envs/atsa/lib/python3.7/ctypes/__init__.py in LoadLibrary(self, name)
440
441 def LoadLibrary(self, name):
--> 442 return self._dlltype(name)
443
444 cdll = LibraryLoader(CDLL)
~/opt/anaconda3/envs/atsa/lib/python3.7/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error)
362
363 if handle is None:
--> 364 self._handle = _dlopen(self._name, mode)
365 else:
366 self._handle = handle
OSError: dlopen(/Users/prince.javier/opt/anaconda3/envs/atsa/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
Referenced from: /Users/prince.javier/opt/anaconda3/envs/atsa/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so
Reason: image not found
model = lgb.LGBMRegressor(first_metric_only = True)
model.fit(train_x, train_y,
eval_metric = 'l1',
eval_set = [(test_x, test_y)],
early_stopping_rounds = 10,
verbose = 0)
forecast = model.predict(test_x)
s1_naive = series[-29:-1].to_numpy()
s7_naive = series[-35:-7].to_numpy()
s30_naive = series[-56:-28].to_numpy()
s365_naive = series[-364:-336].to_numpy()
print(' Naive MAE: %.4f' % (np.mean(np.abs(s1_naive - test_y))))
print(' s7-Naive MAE: %.4f' % (np.mean(np.abs(s7_naive - test_y))))
print(' s30-Naive MAE: %.4f' % (np.mean(np.abs(s30_naive - test_y))))
print('s365-Naive MAE: %.4f' % (np.mean(np.abs(s365_naive - test_y))))
print(' LightGBM MAE: %.4f' % (np.mean(np.abs(forecast - test_y))))
series[-test_size:].plot(marker = 'o', linestyle = '--')
plt.plot(forecast, marker = 'o', linestyle = '--')
plt.show()
Naive MAE: 698.0000
s7-Naive MAE: 372.2857
s30-Naive MAE: 330.8214
s365-Naive MAE: 247.9286
LightGBM MAE: 208.9156
Hyperparameter Tuning¶
Number of Trees¶
trees = [10, 50, 100, 500, 1000, 2000]
results = []
names = []
for i in trees:
params = {
'n_estimators': i,
}
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(lgb.LGBMRegressor(**params), train_x, train_y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
results.append(scores)
names.append(i)
print('%3d --- MAE: %.3f (%.3f)' % (i, np.mean(scores), np.std(scores)))
plt.boxplot(results, labels = names, showmeans = True)
plt.show()
10 --- MAE: -385.639 (31.537)
50 --- MAE: -283.540 (21.324)
100 --- MAE: -285.091 (21.551)
500 --- MAE: -288.237 (21.803)
1000 --- MAE: -288.672 (21.822)
2000 --- MAE: -288.734 (21.844)
Tree Depth¶
results = []
names = []
for i in range(1, 11):
params = {
'max_depth': i,
'num_leaves': 2**i,
}
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(lgb.LGBMRegressor(**params), train_x, train_y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
results.append(scores)
names.append(i)
print('%3d --- MAE: %.3f (%.3f)' % (i, np.mean(scores), np.std(scores)))
plt.boxplot(results, labels = names, showmeans = True)
plt.show()
1 --- MAE: -292.235 (24.008)
2 --- MAE: -284.534 (20.113)
3 --- MAE: -278.984 (20.519)
4 --- MAE: -276.622 (21.169)
5 --- MAE: -280.497 (21.999)
6 --- MAE: -283.877 (21.382)
7 --- MAE: -284.667 (21.436)
8 --- MAE: -288.216 (21.230)
9 --- MAE: -287.373 (21.043)
10 --- MAE: -288.503 (20.929)
Learning Rate¶
rates = [0.0001, 0.001, 0.01, 0.1, 1.0]
results = []
names = []
for i in rates:
params = {
'learning_rate': i,
}
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(lgb.LGBMRegressor(**params), train_x, train_y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
results.append(scores)
names.append(i)
print('%.3f --- MAE: %.3f (%.3f)' % (i, np.mean(scores), np.std(scores)))
plt.boxplot(results, labels = names, showmeans = True)
plt.show()
0.000 --- MAE: -748.947 (39.458)
0.001 --- MAE: -695.973 (38.578)
0.010 --- MAE: -391.718 (30.680)
0.100 --- MAE: -285.091 (21.551)
1.000 --- MAE: -437.648 (23.715)
Boosting Type¶
gbdt
: Gradient Boosting Decision Tree (GDBT).dart
: Dropouts meet Multiple Additive Regression Trees (DART).goss
: Gradient-based One-Side Sampling (GOSS).
types = ['gbdt', 'dart', 'goss']
results = []
names = []
for i in types:
params = {
'boosting_type': i,
}
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(lgb.LGBMRegressor(**params), train_x, train_y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
results.append(scores)
names.append(i)
print('%s --- MAE: %.3f (%.3f)' % (i, np.mean(scores), np.std(scores)))
plt.boxplot(results, labels = names, showmeans = True)
plt.show()
gbdt --- MAE: -285.091 (21.551)
dart --- MAE: -481.225 (33.006)
goss --- MAE: -291.263 (21.742)
More than one variable…¶
types = ['gbdt', 'dart', 'goss']
results = []
for i in types:
for j in range(1, 11):
params = {
'n_estimators': 2000,
'learning_rate': 0.1,
'max_depth': j,
'num_leaves': 2**j,
'boosting_type': i,
}
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(lgb.LGBMRegressor(**params), train_x, train_y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
results.append(scores)
print('%s %d --- MAE: %.3f (%.3f)' % (i, j, np.mean(scores), np.std(scores)))
gbdt 1 --- MAE: -287.860 (21.319)
gbdt 2 --- MAE: -298.625 (20.007)
gbdt 3 --- MAE: -288.742 (21.788)
gbdt 4 --- MAE: -284.235 (20.843)
gbdt 5 --- MAE: -285.302 (21.250)
gbdt 6 --- MAE: -288.334 (21.073)
gbdt 7 --- MAE: -288.306 (21.382)
gbdt 8 --- MAE: -290.893 (21.207)
/opt/conda/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py:688: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
warnings.warn(
gbdt 9 --- MAE: -291.163 (22.049)
gbdt 10 --- MAE: -291.799 (21.347)
dart 1 --- MAE: -277.039 (21.229)
dart 2 --- MAE: -278.634 (19.236)
dart 3 --- MAE: -274.650 (21.226)
dart 4 --- MAE: -274.975 (22.260)
dart 5 --- MAE: -276.816 (22.355)
dart 6 --- MAE: -278.424 (22.700)
dart 7 --- MAE: -280.250 (22.238)
dart 8 --- MAE: -281.548 (23.202)
dart 9 --- MAE: -282.776 (23.881)
dart 10 --- MAE: -283.613 (23.785)
goss 1 --- MAE: -301.392 (22.057)
goss 2 --- MAE: -305.135 (17.874)
goss 3 --- MAE: -297.664 (18.621)
goss 4 --- MAE: -297.698 (19.858)
goss 5 --- MAE: -296.022 (18.470)
goss 6 --- MAE: -295.540 (20.386)
goss 7 --- MAE: -290.380 (19.665)
goss 8 --- MAE: -295.496 (20.014)
goss 9 --- MAE: -292.827 (20.048)
goss 10 --- MAE: -293.961 (22.087)
Final Model¶
params = {
'n_estimators': 2000,
'max_depth': 4,
'num_leaves': 2**4,
'learning_rate': 0.1,
'boosting_type': 'dart'
}
model = lgb.LGBMRegressor(first_metric_only = True, **params)
model.fit(train_x, train_y,
eval_metric = 'l1',
eval_set = [(test_x, test_y)],
#early_stopping_rounds = 10,
verbose = 0)
forecast = model.predict(test_x)
s1_naive = series[-29:-1].to_numpy()
s7_naive = series[-35:-7].to_numpy()
s30_naive = series[-56:-28].to_numpy()
s365_naive = series[-364:-336].to_numpy()
print(' Naive MAE: %.4f' % (np.mean(np.abs(s1_naive - test_y))))
print(' s7-Naive MAE: %.4f' % (np.mean(np.abs(s7_naive - test_y))))
print(' s30-Naive MAE: %.4f' % (np.mean(np.abs(s30_naive - test_y))))
print('s365-Naive MAE: %.4f' % (np.mean(np.abs(s365_naive - test_y))))
print(' LightGBM MAE: %.4f' % (np.mean(np.abs(forecast - test_y))))
series[-test_size:].plot(marker = 'o', linestyle = '--')
plt.plot(forecast, marker = 'o', linestyle = '--')
plt.show()
Naive MAE: 698.0000
s7-Naive MAE: 372.2857
s30-Naive MAE: 330.8214
s365-Naive MAE: 247.9286
LightGBM MAE: 200.5037
By students of PhD in Data Science Batch 2023 at the Asian Institute of Management
© Copyright 2020.