support xgboost

pull/21/head
nyanp 2020-01-30 23:33:09 +09:00
parent 9de453bdbe
commit 7265b1aa76
6 changed files with 131 additions and 33 deletions

View File

@ -11,10 +11,12 @@ import pandas as pd
import sklearn.utils.multiclass as multiclass
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor
from xgboost import XGBModel, XGBClassifier, XGBRegressor
from more_itertools import first_true
from pandas.api.types import is_integer_dtype, is_categorical
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder
from nyaggle.experiment.experiment import Experiment
from nyaggle.feature_store import load_features
@ -24,6 +26,7 @@ from nyaggle.validation.split import check_cv
GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'metrics', 'models', 'importance', 'time',
'submission_df'])
GBDTModel = Union[CatBoost, LGBMModel, XGBModel]
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
@ -256,6 +259,9 @@ def experiment_gbdt(model_params: Dict[str, Any],
_check_input(X_train, y, X_test)
if categorical_feature is None:
categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
if with_auto_prep:
X_train, X_test = autoprep_gbdt(X_train, X_test, categorical_feature, gbdt_type)
@ -275,13 +281,11 @@ def experiment_gbdt(model_params: Dict[str, Any],
exp.log_param('features', feature_list)
if tuning_time_budget is not None:
assert gbdt_type == 'lgbm', 'auto-tuning with catboost is not supported'
assert gbdt_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
time_budget=tuning_time_budget, type_of_target=type_of_target)
exp.log_param('model_params_tuned', model_params)
if categorical_feature is None:
categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
exp.log('Categorical: {}'.format(categorical_feature))
if type_of_target == 'auto':
@ -363,6 +367,9 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
('binary', 'cat', CatBoostClassifier, roc_auc_score, 'cat_features'),
('multiclass', 'cat', CatBoostClassifier, log_loss, 'cat_features'),
('continuous', 'cat', CatBoostRegressor, mean_squared_error, 'cat_features'),
('binary', 'xgb', XGBClassifier, roc_auc_score, None),
('multiclass', 'xgb', XGBClassifier, log_loss, None),
('continuous', 'xgb', XGBRegressor, mean_squared_error, None),
]
found = first_true(gbdt_table, pred=lambda x: x[0] == target_type and x[1] == gbdt_type)
if found is None:
@ -375,17 +382,17 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
return model, eval_func, cat_param
def _save_model(gbdt_type: str, model: GBDTModel, logging_directory: str, fold: int, exp: Experiment):
    """Write one fold's fitted model to disk and register it as an experiment artifact.

    Args:
        gbdt_type:
            Library identifier. ``'lgbm'`` selects the LightGBM code path; any other value is
            handled through the model's own ``save_model`` method (CatBoost and XGBoost).
        model:
            The fitted estimator for this fold.
        logging_directory:
            Root directory of the experiment; the file is written to ``<logging_directory>/models``.
        fold:
            Fold index, used to build the file name ``fold<fold>``.
        exp:
            Experiment that receives the saved file through ``log_artifact``.
    """
    model_dir = os.path.join(logging_directory, 'models')
    os.makedirs(model_dir, exist_ok=True)
    path = os.path.join(model_dir, 'fold{}'.format(fold))

    if gbdt_type == 'lgbm':
        # The LightGBM scikit-learn wrapper is saved through its underlying booster.
        assert isinstance(model, LGBMModel)
        model.booster_.save_model(path)
    else:
        # XGBoost and CatBoost estimators are both saved via ``save_model(path)`` on the estimator.
        assert isinstance(model, (XGBModel, CatBoost))
        model.save_model(path)

    exp.log_artifact(path)
@ -400,6 +407,25 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series,
assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series], sall: pd.Series):
if is_integer_dtype(strain.dtype):
fillval = sall.min() - 1
else:
unique_values = sall.unique()
fillval = 'na'
while fillval in unique_values:
fillval += '-'
if is_categorical(strain):
strain = strain.cat.add_categories(fillval).fillna(fillval)
if stest is not None:
stest = stest.cat.add_categories(fillval).fillna(fillval)
else:
strain = strain.fillna(fillval)
if stest is not None:
stest = stest.fillna(fillval)
return strain, stest
def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
categorical_feature: Optional[List[str]] = None,
gbdt_type: str = 'lgbm') -> Tuple[pd.DataFrame, pd.DataFrame]:
@ -412,20 +438,22 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
# https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
for c in categorical_feature:
if is_integer_dtype(X_train[c].dtype):
fillval = X_all[c].min() - 1
if X_test is not None:
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c], X_all[c])
else:
unique_values = X_all[c].unique()
fillval = 'na'
while fillval in unique_values:
fillval += '-'
if is_categorical(X_train[c]):
X_train[c] = X_train[c].cat.add_categories(fillval).fillna(fillval)
if X_test is not None:
X_test[c] = X_test[c].cat.add_categories(fillval).fillna(fillval)
else:
X_train[c].fillna(fillval, inplace=True)
if X_test is not None:
X_test[c].fillna(fillval, inplace=True)
X_train[c], _ = _fill_na_by_unique_value(X_train[c], None, X_all[c])
if gbdt_type == 'xgb' and len(categorical_feature) > 0:
assert X_test is not None, "X_test is required for XGBoost with categorical variables"
X_train = X_train.copy()
X_test = X_test.copy()
X_all = pd.concat([X_train, X_test]).copy()
for c in categorical_feature:
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c],
X_test[c] if X_test is not None else None, X_all[c])
le = LabelEncoder()
X_train[c] = le.fit_transform(X_train[c])
X_test[c] = le.transform(X_test[c])
return X_train, X_test

View File

@ -27,23 +27,23 @@ def _return(parameter: Union[List[Dict], Dict], with_metadata: bool) -> Union[Li
return parameter['parameters']
def _get_table(gbdt_type: str = 'lgb'):
if gbdt_type == 'lgb':
def _get_table(gbdt_type: str = 'lgbm'):
if gbdt_type == 'lgbm':
return params_lgb
elif gbdt_type == 'cat':
return params_cat
elif gbdt_type == 'xgb':
return params_xgb
raise ValueError('gbdt type should be one of (lgb, cat, xgb)')
raise ValueError('gbdt type should be one of (lgbm, cat, xgb)')
def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> List[Dict]:
def list_hyperparams(gbdt_type: str = 'lgbm', with_metadata: bool = False) -> List[Dict]:
"""
List all hyperparameters
Args:
gbdt_type:
The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
with_metadata:
When set to True, parameters are wrapped by metadata dictionary which contains information about
source URL, competition name etc.
@ -53,7 +53,7 @@ def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> Lis
return _return(_get_table(gbdt_type), with_metadata)
def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool = False) -> Dict:
def get_hyperparam_byname(name: str, gbdt_type: str = 'lgbm', with_metadata: bool = False) -> Dict:
"""
Get a hyperparameter by parameter name
@ -61,7 +61,7 @@ def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool
name:
The name of parameter (e.g. "ieee-2019-10th").
gbdt_type:
The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
with_metadata:
When set to True, parameters are wrapped by metadata dictionary which contains information about
source URL, competition name etc.

View File

@ -10,6 +10,7 @@ import sklearn.utils.multiclass as multiclass
from category_encoders.utils import convert_input, convert_input_vector
from catboost import CatBoost
from lightgbm import LGBMModel
from xgboost import XGBModel
from sklearn.base import BaseEstimator
from sklearn.model_selection import BaseCrossValidator
from nyaggle.validation.split import check_cv
@ -155,7 +156,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
else:
fit_params_fold = copy.copy(fit_params)
if isinstance(estimator[n], (LGBMModel, CatBoost)):
if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
if early_stopping:
if 'eval_set' not in fit_params_fold:
fit_params_fold['eval_set'] = [(valid_x, valid_y)]
@ -175,7 +176,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
if on_each_fold is not None:
on_each_fold(n, estimator[n], train_x, train_y)
if isinstance(estimator[n], (LGBMModel, CatBoost)):
if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))
if eval_func is not None:
@ -200,7 +201,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
return CVResult(oof, predicted, scores, importance)
def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[str],
def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel, XGBModel], features: List[str],
importance_type: str) -> pd.DataFrame:
df = pd.DataFrame()
@ -208,6 +209,8 @@ def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[
if isinstance(gbdt_model, CatBoost):
df['importance'] = gbdt_model.get_feature_importance()
elif isinstance(gbdt_model, XGBModel):
df['importance'] = gbdt_model.feature_importances_
elif isinstance(gbdt_model, LGBMModel):
df['importance'] = gbdt_model.booster_.feature_importance(importance_type=importance_type)

View File

@ -11,3 +11,4 @@ seaborn
sklearn
tqdm
transformers
xgboost

View File

@ -40,7 +40,8 @@ setup(
'seaborn',
'sklearn',
'tqdm',
'transformers>=2.3.0'
'transformers>=2.3.0',
'xgboost'
],
author='nyanp',
author_email='Noumi.Taiga@gmail.com',

View File

@ -148,6 +148,71 @@ def test_experiment_cat_multiclass():
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
def test_experiment_xgb_classifier():
    """Binary classification with ``gbdt_type='xgb'``: checks prediction quality and output files."""
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    # The XGBoost scikit-learn estimators take the number of boosting rounds as
    # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train``.
    # NOTE(review): assumes experiment_gbdt forwards model_params to the estimator constructor.
    params = {
        'max_depth': 8,
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, gbdt_type='xgb',
                                 submission_filename='submission.csv')

        assert len(np.unique(result.oof_prediction)) > 5  # making sure prediction is not binarized
        assert len(np.unique(result.test_prediction)) > 5
        assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
        assert roc_auc_score(y_test, result.test_prediction) >= 0.9

        # Files are checked inside the ``with`` block, while the temporary directory still exists.
        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
def test_experiment_xgb_regressor():
    """Regression with ``gbdt_type='xgb'``: the last reported metric must equal the out-of-fold MSE."""
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    # The XGBoost scikit-learn estimators take the number of boosting rounds as
    # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train``.
    # NOTE(review): assumes experiment_gbdt forwards model_params to the estimator constructor.
    params = {
        'max_depth': 8,
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb')

        assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
def test_experiment_xgb_multiclass():
    """Five-class classification with ``gbdt_type='xgb'``: checks prediction shapes and submission columns."""
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2, n_classes=5,
                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    # The XGBoost scikit-learn estimators take the number of boosting rounds as
    # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train``.
    # NOTE(review): assumes experiment_gbdt forwards model_params to the estimator constructor.
    params = {
        'max_depth': 8,
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb',
                                 type_of_target='multiclass', submission_filename='submission.csv')

        # One probability column per class.
        assert result.oof_prediction.shape == (len(y_train), 5)
        assert result.test_prediction.shape == (len(y_test), 5)

        # Files are checked inside the ``with`` block, while the temporary directory still exists.
        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
def test_experiment_cat_custom_eval():
X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
random_state=0, id_column='user_id')