diff --git a/nyaggle/experiment/gbdt.py b/nyaggle/experiment/gbdt.py index 864ae9c..829c28b 100644 --- a/nyaggle/experiment/gbdt.py +++ b/nyaggle/experiment/gbdt.py @@ -3,7 +3,7 @@ import os import time from collections import namedtuple from datetime import datetime -from typing import Any, Callable, Dict, Iterable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import optuna.integration.lightgbm as optuna_lgb @@ -12,6 +12,7 @@ import sklearn.utils.multiclass as multiclass from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor from more_itertools import first_true +from pandas.api.types import is_integer_dtype, is_categorical from sklearn.model_selection import BaseCrossValidator from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss @@ -119,6 +120,7 @@ def experiment_gbdt(model_params: Dict[str, Any], submission_filename: Optional[str] = None, type_of_target: str = 'auto', tuning_time_budget: Optional[int] = None, + with_auto_prep: bool = True, with_mlflow: bool = False, mlflow_experiment_id: Optional[Union[int, str]] = None, mlflow_run_name: Optional[str] = None, @@ -196,6 +198,9 @@ def experiment_gbdt(model_params: Dict[str, Any], tuning_time_budget: If not ``None``, model parameters will be automatically updated using optuna with the specified time budgets in seconds (only available in lightgbm). + with_auto_prep: + If True, the input datasets will be copied and automatic preprocessing will be performed on them. + For example, if ``gbdt_type = 'cat'``, all missing values in categorical features will be filled. with_mlflow: If True, `mlflow tracking `_ is used. One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow. 
@@ -231,6 +236,8 @@ def experiment_gbdt(model_params: Dict[str, Any], cv = check_cv(cv, y) _check_input(X_train, y, X_test) + if with_auto_prep: + X_train, X_test = autoprep(X_train, X_test, categorical_feature, gbdt_type) logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S')) @@ -366,3 +373,34 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series, if X_test is not None: assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test" + + +def autoprep(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame], + categorical_feature: Optional[List[str]] = None, + gbdt_type: str = 'lgbm') -> Tuple[pd.DataFrame, pd.DataFrame]: + if categorical_feature is None: + categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']] + + if gbdt_type == 'cat' and len(categorical_feature) > 0: + X_train, X_test = X_train.copy(), None if X_test is None else X_test.copy() + X_all = X_train if X_test is None else pd.concat([X_train, X_test]) + + # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features + for c in categorical_feature: + if is_integer_dtype(X_train[c].dtype): + fillval = X_all[c].min() - 1 + else: + unique_values = X_all[c].unique() + fillval = 'na' + while fillval in unique_values: + fillval += '-' + if is_categorical(X_train[c]): + X_train[c] = X_train[c].cat.add_categories(fillval).fillna(fillval) + if X_test is not None: + X_test[c] = X_test[c].cat.add_categories(fillval).fillna(fillval) + else: + X_train[c].fillna(fillval, inplace=True) + if X_test is not None: + X_test[c].fillna(fillval, inplace=True) + + return X_train, X_test \ No newline at end of file diff --git a/nyaggle/testing/util.py b/nyaggle/testing/util.py index f20d91e..c5d00b3 100644 --- a/nyaggle/testing/util.py +++ b/nyaggle/testing/util.py @@ -31,7 +31,7 @@ def make_classification_df(n_samples: int = 1024, for i in range(n_cat_features): X['cat_{}'.format(i)] = \ - 
pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype(str).astype('category') + pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category') return X, y diff --git a/tests/experiment/test_gbdt.py b/tests/experiment/test_gbdt.py index 448ab5d..10a47aa 100644 --- a/tests/experiment/test_gbdt.py +++ b/tests/experiment/test_gbdt.py @@ -376,19 +376,3 @@ def test_experiment_sample_submission_multiclass(): log_loss_default = log_loss(y_test, np.full((len(y_test), 5), 0.2), labels=[0, 1, 2, 3, 4]) assert log_loss_trianed < log_loss_default - -def test_find_best_parameter(): - params = { - 'objective': 'binary', - 'metrics': 'auc', - 'n_estimators': 1000 - } - X, y = make_classification_df(2048, class_sep=0.7) - X_train, X_test, y_train, y_test = train_test_split(X, y) - - best_params = find_best_lgbm_parameter(params, X_train, y_train, cv=5) - - result_base = experiment_gbdt(params, X_train, y_train, eval_func=roc_auc_score) - result_opt = experiment_gbdt(best_params, X_train, y_train) - - assert result_opt.metrics[-1] >= result_base.metrics[-1]