import os
import pickle
import time
from collections import namedtuple
from datetime import datetime
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union

import numpy as np
import pandas as pd
import sklearn.utils.multiclass as multiclass
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
from sklearn.model_selection import BaseCrossValidator

from nyaggle.environment import requires_catboost, requires_lightgbm, requires_xgboost
from nyaggle.experiment.auto_prep import autoprep_gbdt
from nyaggle.experiment.experiment import Experiment
from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
from nyaggle.feature_store import load_features
from nyaggle.util import plot_importance, is_gbdt_instance
from nyaggle.validation.cross_validate import cross_validate
from nyaggle.validation.split import check_cv

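# Result bundle returned by run_experiment: `metrics` holds the per-fold
# validation scores followed by the overall score; `time` is the training
# time in seconds.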
ExperimentResult = namedtuple('ExperimentResult',
                              [
                                  'oof_prediction',
                                  'test_prediction',
                                  'metrics',
                                  'models',
                                  'importance',
                                  'time',
                                  'submission_df'
                              ])


def run_experiment(model_params: Dict[str, Any],
                   X_train: pd.DataFrame, y: pd.Series,
                   X_test: Optional[pd.DataFrame] = None,
                   logging_directory: str = 'output/{time}',
                   if_exists: str = 'error',
                   eval_func: Optional[Callable] = None,
                   algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                   fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
                   cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   categorical_feature: Optional[List[str]] = None,
                   sample_submission: Optional[pd.DataFrame] = None,
                   submission_filename: Optional[str] = None,
                   type_of_target: str = 'auto',
                   feature_list: Optional[List[Union[int, str]]] = None,
                   feature_directory: Optional[str] = None,
                   with_auto_hpo: bool = False,
                   with_auto_prep: bool = False,
                   with_mlflow: bool = False
                   ):
"""
|
|
Evaluate metrics by cross-validation and stores result
|
|
(log, oof prediction, test prediction, feature importance plot and submission file)
|
|
under the directory specified.
|
|
|
|
One of the following estimators are used (automatically dispatched by ``type_of_target(y)`` and ``gbdt_type``).
|
|
|
|
* LGBMClassifier
|
|
* LGBMRegressor
|
|
* CatBoostClassifier
|
|
* CatBoostRegressor
|
|
|
|
The output files are laid out as follows:
|
|
|
|
.. code-block:: none
|
|
|
|
<logging_directory>/
|
|
log.txt <== Logging file
|
|
importance.png <== Feature importance plot generated by nyaggle.util.plot_importance
|
|
oof_prediction.npy <== Out of fold prediction in numpy array format
|
|
test_prediction.npy <== Test prediction in numpy array format
|
|
submission.csv <== Submission csv file
|
|
metrics.txt <== Metrics
|
|
params.txt <== Parameters
|
|
models/
|
|
fold1 <== The trained model in fold 1
|
|
...
|
|
|
|

    Args:
        model_params:
            Parameters passed to the constructor of the classifier/regressor object (e.g. LGBMRegressor).
        X_train:
            Training data. Categorical features should be cast to pandas categorical type or encoded to integer.
        y:
            Target variable.
        X_test:
            Test data (optional). If specified, prediction on the test data is performed using an ensemble
            of the fold models.
        logging_directory:
            Path to the directory where the output of the experiment is stored.
        if_exists:
            How to behave if the logging directory already exists.

            - error: Raise a ValueError.
            - replace: Delete the logging directory before logging.
            - append: Append to the existing experiment.
            - rename: Rename the current directory by adding a "_1", "_2", ... suffix.
        fit_params:
            Parameters passed to the fit method of the estimator. If a dict is passed, the same parameters
            (except ``eval_set``) are passed in each fold. If a callable is passed, the return value of
            ``fit_params(fold_id, train_index, test_index)`` is used in each fold (see the second example
            at the end of this docstring).
        eval_func:
            Function used for logging and for calculating the returned scores.
            This parameter isn't passed to the GBDT library, so you should set objective and eval_metric
            separately if needed.
            If ``eval_func`` is None, ``roc_auc_score``, ``log_loss`` or ``mean_squared_error`` is used,
            depending on the type of the target variable.
        algorithm_type:
            Type of gradient boosting library used: "lgbm" (lightgbm), "cat" (catboost) or "xgb" (xgboost).
            Alternatively, a scikit-learn estimator class can be passed.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (an instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a "Group" cv instance
            (e.g., ``GroupKFold``; see the second example at the end of this docstring).
        sample_submission:
            A sample dataframe aligned with the test data (in Kaggle, it is usually provided as
            sample_submission.csv). The submission file is created with the same schema as this dataframe.
        submission_filename:
            The name of the submission file created under the logging directory. If ``None``, the basename
            of the logging directory is used as the filename.
        categorical_feature:
            List of categorical column names. If ``None``, categorical columns are automatically determined
            by dtype.
        type_of_target:
            The type of the target variable. If ``auto``, the type is inferred by
            ``sklearn.utils.multiclass.type_of_target``. Otherwise, ``binary``, ``continuous``, or
            ``multiclass`` are supported.
        feature_list:
            The list of feature ids saved through the nyaggle.feature_store module.
        feature_directory:
            The directory where the features are stored. Only used if feature_list is not empty.
        with_auto_prep:
            If True, the input datasets are copied and automatic preprocessing is performed on them.
            For example, if ``algorithm_type = 'cat'``, all missing values in categorical features are filled.
        with_auto_hpo:
            If True, model parameters are automatically tuned using optuna (only available with lightgbm).
        with_mlflow:
            If True, `mlflow tracking <https://www.mlflow.org/docs/latest/tracking.html>`_ is used.
            One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
            Note that all output files are stored both under the logging directory and in
            mlflow's directory (``mlruns`` by default).

    :return:
        Namedtuple with the following members

        * oof_prediction:
            numpy array, shape (len(X_train),); predicted values on out-of-fold validation data.
        * test_prediction:
            numpy array, shape (len(X_test),); predicted values on the test data. ``None`` if X_test is ``None``.
        * metrics:
            list of float, shape (nfolds+1). ``metrics[i]`` denotes the validation score in the i-th fold,
            and ``metrics[-1]`` is the overall score.
        * models:
            list of objects, shape (nfolds). The trained model in each fold.
        * importance:
            list of pd.DataFrame; feature importance in each fold (type="gain").
        * time:
            Training time in seconds.
        * submission_df:
            The dataframe saved as submission.csv.
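
    Example:
        A minimal sketch of a binary classification run (the dataset and the single
        model parameter below are illustrative, not recommendations):

        .. code-block:: python

            import pandas as pd
            from sklearn.datasets import load_breast_cancer
            from nyaggle.experiment import run_experiment

            data = load_breast_cancer()
            X = pd.DataFrame(data.data, columns=data.feature_names)
            y = pd.Series(data.target, name='target')

            # 5-fold CV with LightGBM; artifacts are written under output/<timestamp>
            result = run_experiment({'max_depth': 8}, X, y)
            print(result.metrics[-1])  # overall CV score (AUC for binary targets)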
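
        Continuing the sketch above, per-fold ``fit_params`` combined with a
        group-aware splitter (the group labels and all parameter values here
        are illustrative):

        .. code-block:: python

            import numpy as np
            from sklearn.model_selection import GroupKFold

            groups = pd.Series(np.arange(len(X)) % 10)  # illustrative group labels

            def per_fold_fit_params(fold_id, train_index, test_index):
                # The returned dict is passed to fit() in this fold, which allows
                # fold-dependent arguments such as per-sample weights.
                return {'sample_weight': np.ones(len(train_index))}

            result = run_experiment({'max_depth': 8}, X, y,
                                    fit_params=per_fold_fit_params,
                                    cv=GroupKFold(5),
                                    groups=groups)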
"""
|
|
start_time = time.time()
|
|
cv = check_cv(cv, y)
|
|
|
|
    if feature_list:
        # Restore features saved via nyaggle.feature_store and re-split them
        # into train/test parts.
        X = pd.concat([X_train, X_test]) if X_test is not None else X_train
        X.reset_index(drop=True, inplace=True)
        X = load_features(X, feature_list, directory=feature_directory)
        ntrain = len(X_train)
        X_train, X_test = X.iloc[:ntrain, :], X.iloc[ntrain:, :].reset_index(drop=True)

    _check_input(X_train, y, X_test)

    if categorical_feature is None:
        categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)
    model_type, eval_func, cat_param_name = _dispatch_models(algorithm_type, type_of_target, eval_func)

    if with_auto_prep:
        assert algorithm_type in ('cat', 'xgb', 'lgbm'), "with_auto_prep is only supported for gbdt"
        X_train, X_test = autoprep_gbdt(algorithm_type, X_train, X_test, categorical_feature)

    logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))

    with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
        exp.log('Algorithm: {}'.format(algorithm_type))
        exp.log('Experiment: {}'.format(logging_directory))
        exp.log('Params: {}'.format(model_params))
        exp.log('Features: {}'.format(list(X_train.columns)))
        exp.log_param('algorithm_type', algorithm_type)
        exp.log_param('num_features', X_train.shape[1])
        exp.log_param('fit_params', fit_params)
        exp.log_param('model_params', model_params)
        if feature_list is not None:
            exp.log_param('features', feature_list)

        if with_auto_hpo:
            assert algorithm_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
            model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
                                                    type_of_target=type_of_target)
            exp.log_param('model_params_tuned', model_params)

        exp.log('Categorical: {}'.format(categorical_feature))

        models = [model_type(**model_params) for _ in range(cv.get_n_splits())]

        if fit_params is None:
            fit_params = {}
        if cat_param_name is not None and not callable(fit_params) and cat_param_name not in fit_params:
            # Inject the library-specific fit parameter that carries the list of
            # categorical feature names (e.g. categorical_feature for lightgbm).
            fit_params[cat_param_name] = categorical_feature

        exp.log_params(fit_params)

        result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
                                logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params,
                                type_of_target=type_of_target)

        # save oof
        exp.log_numpy('oof_prediction', result.oof_prediction)
        exp.log_numpy('test_prediction', result.test_prediction)

        for i in range(cv.get_n_splits()):
            exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
        exp.log_metric('Overall', result.scores[-1])

        # save importance plot
        if result.importance:
            importance = pd.concat(result.importance)
            plot_file_path = os.path.join(logging_directory, 'importance.png')
            plot_importance(importance, plot_file_path)
            exp.log_artifact(plot_file_path)

        # save trained model
        for i, model in enumerate(models):
            _save_model(model, logging_directory, i + 1, exp)

        # save submission.csv
        submit_df = None
        if X_test is not None:
            submit_df = _make_submission_df(result.test_prediction, type_of_target, y, sample_submission)
            exp.log_dataframe(submission_filename or os.path.basename(logging_directory), submit_df, 'csv')

        elapsed_time = time.time() - start_time

        return ExperimentResult(result.oof_prediction, result.test_prediction,
                                result.scores, models, result.importance, elapsed_time, submit_df)


def _make_submission_df(test_prediction: np.ndarray, type_of_target: str, y: pd.Series,
                        sample_submission: Optional[pd.DataFrame] = None):
    if sample_submission is not None:
        # Keep the schema of the provided sample submission and overwrite only
        # the prediction columns.
        submit_df = sample_submission.copy()

        if type_of_target == 'multiclass':
            n_id_cols = submit_df.shape[1] - test_prediction.shape[1]
            for i, _ in enumerate(sorted(y.unique())):
                submit_df.iloc[:, n_id_cols + i] = test_prediction[:, i]
        else:
            n_id_cols = submit_df.shape[1] - 1
            submit_df.iloc[:, n_id_cols] = test_prediction
    else:
        submit_df = pd.DataFrame()
        id_col_name = y.index.name or 'id'
        tgt_col_name = y.name or 'target'

        submit_df[id_col_name] = np.arange(len(test_prediction))

        if type_of_target == 'multiclass':
            # One column per class, named by the sorted class labels.
            for i, label in enumerate(sorted(y.unique())):
                submit_df[label] = test_prediction[:, i]
        else:
            submit_df[tgt_col_name] = test_prediction

    return submit_df


def _dispatch_eval_func(target_type: str, custom_eval: Optional[Callable] = None):
    default_eval_func = {
        'binary': roc_auc_score,
        'multiclass': log_loss,
        'continuous': mean_squared_error
    }
    return custom_eval if custom_eval is not None else default_eval_func[target_type]


def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str):
    is_regression = type_of_target == 'continuous'

    if algorithm_type == 'lgbm':
        requires_lightgbm()
        from lightgbm import LGBMClassifier, LGBMRegressor
        return LGBMRegressor if is_regression else LGBMClassifier
    elif algorithm_type == 'cat':
        requires_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor
        return CatBoostRegressor if is_regression else CatBoostClassifier
    else:
        # Check the algorithm name before checking the xgboost installation so
        # that an unknown name raises a clear error.
        assert algorithm_type == 'xgb', "algorithm_type must be one of 'lgbm', 'cat' or 'xgb'"
        requires_xgboost()
        from xgboost import XGBClassifier, XGBRegressor
        return XGBRegressor if is_regression else XGBClassifier


def _dispatch_models(algorithm_type: Union[str, Type[BaseEstimator]],
                     target_type: str, custom_eval: Optional[Callable] = None):
    if not isinstance(algorithm_type, str):
        assert issubclass(algorithm_type, BaseEstimator), "algorithm_type should be str or subclass of BaseEstimator"
        return algorithm_type, _dispatch_eval_func(target_type, custom_eval), None

    # Name of the fit parameter that carries categorical feature names in each library.
    cat_features = {
        'lgbm': 'categorical_feature',
        'cat': 'cat_features',
        'xgb': None
    }

    gbdt_class = _dispatch_gbdt_class(algorithm_type, target_type)
    eval_func = _dispatch_eval_func(target_type, custom_eval)

    return gbdt_class, eval_func, cat_features[algorithm_type]


def _save_model(model: BaseEstimator, logging_directory: str, fold: int, exp: Experiment):
    model_dir = os.path.join(logging_directory, 'models')
    os.makedirs(model_dir, exist_ok=True)
    path = os.path.join(model_dir, 'fold{}'.format(fold))

    if is_gbdt_instance(model, 'lgbm'):
        model.booster_.save_model(path)
    elif is_gbdt_instance(model, ('xgb', 'cat')):
        model.save_model(path)
    else:
        # Fall back to pickle for arbitrary scikit-learn estimators.
        with open(path, "wb") as f:
            pickle.dump(model, f)

    exp.log_artifact(path)


def _check_input(X_train: pd.DataFrame, y: pd.Series,
                 X_test: Optional[pd.DataFrame] = None):
    assert len(X_train) == len(y), "length of X_train and y are different. len(X_train) = {}, len(y) = {}".format(
        len(X_train), len(y)
    )

    if X_test is not None:
        assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"