handle missing values in categorical columns

pull/20/head
nyanp 2020-01-23 00:17:25 +09:00
parent 9bda8654be
commit 627620eff2
3 changed files with 40 additions and 18 deletions

View File

@@ -3,7 +3,7 @@ import os
 import time
 from collections import namedtuple
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import optuna.integration.lightgbm as optuna_lgb
@@ -12,6 +12,7 @@ import sklearn.utils.multiclass as multiclass
 from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
 from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor
 from more_itertools import first_true
+from pandas.api.types import is_integer_dtype, is_categorical
 from sklearn.model_selection import BaseCrossValidator
 from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
@@ -119,6 +120,7 @@ def experiment_gbdt(model_params: Dict[str, Any],
                     submission_filename: Optional[str] = None,
                     type_of_target: str = 'auto',
                     tuning_time_budget: Optional[int] = None,
+                    with_auto_prep: bool = True,
                     with_mlflow: bool = False,
                     mlflow_experiment_id: Optional[Union[int, str]] = None,
                     mlflow_run_name: Optional[str] = None,
@@ -196,6 +198,9 @@ def experiment_gbdt(model_params: Dict[str, Any],
         tuning_time_budget:
             If not ``None``, model parameters will be automatically updated using optuna with the specified time
             budget in seconds (only available in lightgbm).
+        with_auto_prep:
+            If True, the input datasets will be copied and automatic preprocessing will be performed on the copies.
+            For example, if ``gbdt_type = 'cat'``, all missing values in categorical features will be filled.
         with_mlflow:
             If True, `mlflow tracking <https://www.mlflow.org/docs/latest/tracking.html>`_ is used.
             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
@@ -231,6 +236,8 @@ def experiment_gbdt(model_params: Dict[str, Any],
     cv = check_cv(cv, y)
 
     _check_input(X_train, y, X_test)
+    if with_auto_prep:
+        X_train, X_test = autoprep(X_train, X_test, categorical_feature, gbdt_type)
 
     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))
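For reference, a minimal end-to-end sketch, not part of this commit, of how a caller exercises the new flag. It assumes nyaggle's public entry points (experiment_gbdt and make_classification_df, both of which appear elsewhere in this diff); the CatBoost parameter dict is a placeholder.

from sklearn.model_selection import train_test_split

from nyaggle.experiment import experiment_gbdt
from nyaggle.testing import make_classification_df

# make_classification_df now emits categorical columns that keep NaN
# (see the testing util change below), so CatBoost would reject them
# without the automatic preprocessing.
X, y = make_classification_df(n_samples=1024, n_cat_features=2)
X_train, X_test, y_train, y_test = train_test_split(X, y)

result = experiment_gbdt({'depth': 4},        # placeholder CatBoost hyperparameters
                         X_train, y_train,
                         X_test=X_test,
                         gbdt_type='cat',     # use CatBoost
                         with_auto_prep=True) # default: fill NaN in categorical features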
@@ -366,3 +373,34 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series,
     if X_test is not None:
         assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
+
+
+def autoprep(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
+             categorical_feature: Optional[List[str]] = None,
+             gbdt_type: str = 'lgbm') -> Tuple[pd.DataFrame, pd.DataFrame]:
+    if categorical_feature is None:
+        categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
+
+    if gbdt_type == 'cat' and len(categorical_feature) > 0:
+        # copy both inputs so the caller's frames are never mutated
+        X_train = X_train.copy()
+        if X_test is not None:
+            X_test = X_test.copy()
+        X_all = X_train if X_test is None else pd.concat([X_train, X_test])
+
+        # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
+        for c in categorical_feature:
+            if is_integer_dtype(X_train[c].dtype):
+                # integer categories: fill with a value below the observed minimum
+                fillval = X_all[c].min() - 1
+            else:
+                # string categories: extend 'na' with '-' until it collides with no existing value
+                unique_values = X_all[c].unique()
+                fillval = 'na'
+                while fillval in unique_values:
+                    fillval += '-'
+
+            if is_categorical(X_train[c]):
+                # pandas categorical: the fill value must be registered as a category before fillna
+                X_train[c] = X_train[c].cat.add_categories(fillval).fillna(fillval)
+                if X_test is not None:
+                    X_test[c] = X_test[c].cat.add_categories(fillval).fillna(fillval)
+            else:
+                X_train[c].fillna(fillval, inplace=True)
+                if X_test is not None:
+                    X_test[c].fillna(fillval, inplace=True)
+
+    return X_train, X_test
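To make the fill-value logic concrete, a small sketch, not part of the commit, tracing autoprep (assumed in scope as defined above) on a toy frame with made-up column names:

import numpy as np
import pandas as pd

X_tr = pd.DataFrame({'city': pd.Series(['tokyo', None, 'osaka'], dtype='category'),
                     'zip':  [100, 200, 300]})         # integer, complete in train
X_te = pd.DataFrame({'city': pd.Series(['na', 'osaka', None], dtype='category'),
                     'zip':  [200.0, np.nan, 300.0]})  # NaN only in test

X_tr2, X_te2 = autoprep(X_tr, X_te, categorical_feature=['city', 'zip'], gbdt_type='cat')

# 'na' already occurs in X_te['city'], so the string fill value becomes 'na-'
print(X_tr2['city'].tolist())  # ['tokyo', 'na-', 'osaka']
# integer branch: the fill value is X_all['zip'].min() - 1 == 99.0
print(X_te2['zip'].tolist())   # [200.0, 99.0, 300.0]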

View File

@@ -31,7 +31,7 @@ def make_classification_df(n_samples: int = 1024,
     for i in range(n_cat_features):
         X['cat_{}'.format(i)] = \
-            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype(str).astype('category')
+            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category')
 
     return X, y
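The reason for dropping .astype(str): casting through str first turns None into the literal category 'None', so the generated test data never actually contained missing values. Casting directly to 'category' keeps NaN, which is exactly what the new auto-preprocessing is meant to exercise. A quick illustration, not part of the diff:

import pandas as pd

s = pd.Series(['A', None, 'B'])

via_str = s.astype(str).astype('category')
direct = s.astype('category')

print(list(via_str.cat.categories))  # ['A', 'B', 'None'] -- None became a real category
print(via_str.isnull().any())        # False
print(list(direct.cat.categories))   # ['A', 'B']
print(direct.isnull().any())         # True -- NaN preserved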

View File

@@ -376,19 +376,3 @@ def test_experiment_sample_submission_multiclass():
     log_loss_default = log_loss(y_test, np.full((len(y_test), 5), 0.2), labels=[0, 1, 2, 3, 4])
 
     assert log_loss_trianed < log_loss_default
-
-
-def test_find_best_parameter():
-    params = {
-        'objective': 'binary',
-        'metrics': 'auc',
-        'n_estimators': 1000
-    }
-    X, y = make_classification_df(2048, class_sep=0.7)
-    X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-    best_params = find_best_lgbm_parameter(params, X_train, y_train, cv=5)
-
-    result_base = experiment_gbdt(params, X_train, y_train, eval_func=roc_auc_score)
-    result_opt = experiment_gbdt(best_params, X_train, y_train)
-
-    assert result_opt.metrics[-1] >= result_base.metrics[-1]