support xgboost

2020-01-30 23:33:09 +09:00 · 2020-01-30 23:33:09 +09:00 · 7265b1aa76
parent 9de453bdbe
commit 7265b1aa76
6 changed files with 131 additions and 33 deletions
--- a/nyaggle/experiment/gbdt.py
+++ b/nyaggle/experiment/gbdt.py
@ -11,10 +11,12 @@ import pandas as pd
 import sklearn.utils.multiclass as multiclass
 from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
 from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor
+from xgboost import XGBModel, XGBClassifier, XGBRegressor
 from more_itertools import first_true
 from pandas.api.types import is_integer_dtype, is_categorical
 from sklearn.model_selection import BaseCrossValidator
 from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
+from sklearn.preprocessing import LabelEncoder

 from nyaggle.experiment.experiment import Experiment
 from nyaggle.feature_store import load_features
@ -24,6 +26,7 @@ from nyaggle.validation.split import check_cv

 GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'metrics', 'models', 'importance', 'time',
                                      'submission_df'])
+GBDTModel = Union[CatBoost, LGBMModel, XGBModel]


 def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
@ -256,6 +259,9 @@ def experiment_gbdt(model_params: Dict[str, Any],

    _check_input(X_train, y, X_test)

+    if categorical_feature is None:
+        categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
+
    if with_auto_prep:
        X_train, X_test = autoprep_gbdt(X_train, X_test, categorical_feature, gbdt_type)

@ -275,13 +281,11 @@ def experiment_gbdt(model_params: Dict[str, Any],
            exp.log_param('features', feature_list)

        if tuning_time_budget is not None:
-            assert gbdt_type == 'lgbm', 'auto-tuning with catboost is not supported'
+            assert gbdt_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
            model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
                                                    time_budget=tuning_time_budget, type_of_target=type_of_target)
            exp.log_param('model_params_tuned', model_params)

-        if categorical_feature is None:
-            categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
        exp.log('Categorical: {}'.format(categorical_feature))

        if type_of_target == 'auto':
@ -363,6 +367,9 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
        ('binary', 'cat', CatBoostClassifier, roc_auc_score, 'cat_features'),
        ('multiclass', 'cat', CatBoostClassifier, log_loss, 'cat_features'),
        ('continuous', 'cat', CatBoostRegressor, mean_squared_error, 'cat_features'),
+        ('binary', 'xgb', XGBClassifier, roc_auc_score, None),
+        ('multiclass', 'xgb', XGBClassifier, log_loss, None),
+        ('continuous', 'xgb', XGBRegressor, mean_squared_error, None),
    ]
    found = first_true(gbdt_table, pred=lambda x: x[0] == target_type and x[1] == gbdt_type)
    if found is None:
@ -375,17 +382,17 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
    return model, eval_func, cat_param


-def _save_model(gbdt_type: str, model: Union[CatBoost, LGBMModel], logging_directory: str, fold: int, exp: Experiment):
+def _save_model(gbdt_type: str, model: GBDTModel, logging_directory: str, fold: int, exp: Experiment):
+    """
+    Save the model of one fold and log the saved file as an experiment artifact.
+
+    The file is written to ``<logging_directory>/models/fold<fold>``; the ``models`` directory is created
+    if it does not exist.
+
+    Args:
+        gbdt_type:
+            ``'lgbm'`` takes the LightGBM path (saved through ``model.booster_``). Any other value takes the
+            XGBoost/CatBoost path (saved through ``model.save_model``).
+        model:
+            The model to save. Its type is asserted against the branch selected by ``gbdt_type``.
+        logging_directory:
+            Directory under which the ``models`` sub-directory is placed.
+        fold:
+            Fold index, used in the file name.
+        exp:
+            Experiment that receives the saved path through ``log_artifact``.
+    """
    model_dir = os.path.join(logging_directory, 'models')
    os.makedirs(model_dir, exist_ok=True)
    path = os.path.join(model_dir, 'fold{}'.format(fold))

-    if gbdt_type == 'cat':
-        assert isinstance(model, CatBoost)
-        model.save_model(path)
-    else:
+    if gbdt_type == 'lgbm':
        assert isinstance(model, LGBMModel)
        model.booster_.save_model(path)
+    else:
+        assert isinstance(model, (XGBModel, CatBoost))
+        model.save_model(path)

    exp.log_artifact(path)

@ -400,6 +407,25 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series,
        assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"


+def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series], sall: pd.Series):
+    """
+    Replace missing values in a column by a sentinel value that does not occur in the data.
+
+    For integer columns the sentinel is ``sall.min() - 1``. Otherwise it is the string ``'na'``, extended
+    with ``'-'`` until it differs from every value in ``sall``.
+
+    Args:
+        strain:
+            Training column.
+        stest:
+            Test column, or None when there is no test data.
+        sall:
+            Column whose values the sentinel must avoid (``autoprep_gbdt`` passes the matching column
+            of ``X_all``).
+    Returns:
+        Tuple of (filled training column, filled test column or None).
+    """
+    if is_integer_dtype(strain.dtype):
+        fillval = sall.min() - 1
+    else:
+        unique_values = sall.unique()
+        fillval = 'na'
+        while fillval in unique_values:
+            fillval += '-'
+
+    def _fill(s: pd.Series) -> pd.Series:
+        # Checked per series (train and test may not share a dtype), and through the dtype rather
+        # than the deprecated ``is_categorical`` helper.
+        if isinstance(s.dtype, pd.CategoricalDtype):
+            # add_categories raises ValueError for a category that already exists. That happens when
+            # the sentinel is a declared but unused category, which unique() does not report.
+            if fillval not in s.cat.categories:
+                s = s.cat.add_categories(fillval)
+        return s.fillna(fillval)
+
+    return _fill(strain), (None if stest is None else _fill(stest))
+
+
 def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
                  categorical_feature: Optional[List[str]] = None,
                  gbdt_type: str = 'lgbm') -> Tuple[pd.DataFrame, pd.DataFrame]:
@ -412,20 +438,22 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],

        # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
        for c in categorical_feature:
-            if is_integer_dtype(X_train[c].dtype):
-                fillval = X_all[c].min() - 1
+            if X_test is not None:
+                X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c], X_all[c])
            else:
-                unique_values = X_all[c].unique()
-                fillval = 'na'
-                while fillval in unique_values:
-                    fillval += '-'
-            if is_categorical(X_train[c]):
-                X_train[c] = X_train[c].cat.add_categories(fillval).fillna(fillval)
-                if X_test is not None:
-                    X_test[c] = X_test[c].cat.add_categories(fillval).fillna(fillval)
-            else:
-                X_train[c].fillna(fillval, inplace=True)
-                if X_test is not None:
-                    X_test[c].fillna(fillval, inplace=True)
+                X_train[c], _ = _fill_na_by_unique_value(X_train[c], None, X_all[c])
+
+    if gbdt_type == 'xgb' and len(categorical_feature) > 0:
+        assert X_test is not None, "X_test is required for XGBoost with categorical variables"
+        X_train = X_train.copy()
+        X_test = X_test.copy()
+        X_all = pd.concat([X_train, X_test])
+
+        for c in categorical_feature:
+            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c], X_all[c])
+            # Fit on train and test together: transform() raises ValueError for labels seen only in X_test.
+            le = LabelEncoder().fit(pd.concat([X_train[c], X_test[c]]))
+            X_train[c] = le.transform(X_train[c])
+            X_test[c] = le.transform(X_test[c])

    return X_train, X_test
--- a/nyaggle/hyper_parameters/parameters.py
+++ b/nyaggle/hyper_parameters/parameters.py
@ -27,23 +27,23 @@ def _return(parameter: Union[List[Dict], Dict], with_metadata: bool) -> Union[Li
        return parameter['parameters']


-def _get_table(gbdt_type: str = 'lgb'):
-    if gbdt_type == 'lgb':
+def _get_table(gbdt_type: str = 'lgbm'):
+    """
+    Select the parameter table (``params_lgb``, ``params_cat`` or ``params_xgb``) for a GBDT library.
+
+    Args:
+        gbdt_type:
+            ``lgbm``, ``cat`` or ``xgb``. The former spelling ``lgb`` is still accepted as an alias of ``lgbm``.
+    Raises:
+        ValueError: If ``gbdt_type`` is none of the supported names.
+    """
+    # 'lgb' was the only accepted LightGBM key before the rename to 'lgbm'; keep existing callers working.
+    if gbdt_type in ('lgbm', 'lgb'):
        return params_lgb
    elif gbdt_type == 'cat':
        return params_cat
    elif gbdt_type == 'xgb':
        return params_xgb
-    raise ValueError('gbdt type should be one of (lgb, cat, xgb)')
+    raise ValueError('gbdt type should be one of (lgbm, cat, xgb)')


-def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> List[Dict]:
+def list_hyperparams(gbdt_type: str = 'lgbm', with_metadata: bool = False) -> List[Dict]:
    """
    List all hyperparameters

    Args:
        gbdt_type:
-            The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
+            The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
        with_metadata:
            When set to True, parameters are wrapped by metadata dictionary which contains information about
            source URL, competition name etc.
@ -53,7 +53,7 @@ def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> Lis
    return _return(_get_table(gbdt_type), with_metadata)


-def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool = False) -> Dict:
+def get_hyperparam_byname(name: str, gbdt_type: str = 'lgbm', with_metadata: bool = False) -> Dict:
    """
    Get a hyperparameter by parameter name

@ -61,7 +61,7 @@ def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool
        name:
            The name of parameter (e.g. "ieee-2019-10th").
        gbdt_type:
-            The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
+            The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
        with_metadata:
            When set to True, parameters are wrapped by metadata dictionary which contains information about
            source URL, competition name etc.
--- a/nyaggle/validation/cross_validate.py
+++ b/nyaggle/validation/cross_validate.py
@ -10,6 +10,7 @@ import sklearn.utils.multiclass as multiclass
 from category_encoders.utils import convert_input, convert_input_vector
 from catboost import CatBoost
 from lightgbm import LGBMModel
+from xgboost import XGBModel
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import BaseCrossValidator
 from nyaggle.validation.split import check_cv
@ -155,7 +156,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
        else:
            fit_params_fold = copy.copy(fit_params)

-        if isinstance(estimator[n], (LGBMModel, CatBoost)):
+        if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
            if early_stopping:
                if 'eval_set' not in fit_params_fold:
                    fit_params_fold['eval_set'] = [(valid_x, valid_y)]
@ -175,7 +176,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
        if on_each_fold is not None:
            on_each_fold(n, estimator[n], train_x, train_y)

-        if isinstance(estimator[n], (LGBMModel, CatBoost)):
+        if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
            importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))

        if eval_func is not None:
@ -200,7 +201,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
    return CVResult(oof, predicted, scores, importance)


-def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[str],
+def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel, XGBModel], features: List[str],
                         importance_type: str) -> pd.DataFrame:
    df = pd.DataFrame()

@ -208,6 +209,8 @@ def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[

    if isinstance(gbdt_model, CatBoost):
        df['importance'] = gbdt_model.get_feature_importance()
+    elif isinstance(gbdt_model, XGBModel):
+        df['importance'] = gbdt_model.feature_importances_
    elif isinstance(gbdt_model, LGBMModel):
        df['importance'] = gbdt_model.booster_.feature_importance(importance_type=importance_type)

--- a/requirements.txt
+++ b/requirements.txt
@ -11,3 +11,4 @@ seaborn
 sklearn
 tqdm
 transformers
+xgboost
--- a/setup.py
+++ b/setup.py
@ -40,7 +40,8 @@ setup(
        'seaborn',
        'sklearn',
        'tqdm',
-        'transformers>=2.3.0'
+        'transformers>=2.3.0',
+        'xgboost'
    ],
    author='nyanp',
    author_email='Noumi.Taiga@gmail.com',
--- a/tests/experiment/test_gbdt.py
+++ b/tests/experiment/test_gbdt.py
@ -148,6 +148,71 @@ def test_experiment_cat_multiclass():
        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))


+def test_experiment_xgb_classifier():
+    """Binary target with ``gbdt_type='xgb'``: predictions are not binarized, AUC >= 0.9, output files exist."""
+    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
+                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
+
+    # The xgboost scikit-learn estimators set the number of trees with ``n_estimators``;
+    # ``num_boost_round`` is an argument of ``xgboost.train`` and is not an estimator parameter.
+    # NOTE(review): assumes experiment_gbdt forwards these params to the estimator constructor - confirm.
+    params = {
+        'max_depth': 8,
+        'n_estimators': 100
+    }
+
+    with get_temp_directory() as temp_path:
+        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, gbdt_type='xgb',
+                                 submission_filename='submission.csv')
+
+        assert len(np.unique(result.oof_prediction)) > 5  # making sure prediction is not binarized
+        assert len(np.unique(result.test_prediction)) > 5
+        assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
+        assert roc_auc_score(y_test, result.test_prediction) >= 0.9
+        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
+
+        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+
+
+def test_experiment_xgb_regressor():
+    """Continuous target with ``gbdt_type='xgb'``: the last metric equals the MSE of the out-of-fold prediction."""
+    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
+                              random_state=0, id_column='user_id')
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
+
+    # The xgboost scikit-learn estimators set the number of trees with ``n_estimators``;
+    # ``num_boost_round`` is an argument of ``xgboost.train`` and is not an estimator parameter.
+    # NOTE(review): assumes experiment_gbdt forwards these params to the estimator constructor - confirm.
+    params = {
+        'max_depth': 8,
+        'n_estimators': 100
+    }
+
+    with get_temp_directory() as temp_path:
+        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb')
+
+        assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
+        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+
+
+def test_experiment_xgb_multiclass():
+    """Five-class target with ``gbdt_type='xgb'``: predictions have one column per class, output files exist."""
+    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2, n_classes=5,
+                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
+
+    # The xgboost scikit-learn estimators set the number of trees with ``n_estimators``;
+    # ``num_boost_round`` is an argument of ``xgboost.train`` and is not an estimator parameter.
+    # NOTE(review): assumes experiment_gbdt forwards these params to the estimator constructor - confirm.
+    params = {
+        'max_depth': 8,
+        'n_estimators': 100
+    }
+
+    with get_temp_directory() as temp_path:
+        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb',
+                                 type_of_target='multiclass', submission_filename='submission.csv')
+
+        assert result.oof_prediction.shape == (len(y_train), 5)
+        assert result.test_prediction.shape == (len(y_test), 5)
+
+        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
+
+        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+
+
 def test_experiment_cat_custom_eval():
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')