support xgboost
parent
9de453bdbe
commit
7265b1aa76
|
@ -11,10 +11,12 @@ import pandas as pd
|
|||
import sklearn.utils.multiclass as multiclass
|
||||
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
|
||||
from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor
|
||||
from xgboost import XGBModel, XGBClassifier, XGBRegressor
|
||||
from more_itertools import first_true
|
||||
from pandas.api.types import is_integer_dtype, is_categorical
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from nyaggle.experiment.experiment import Experiment
|
||||
from nyaggle.feature_store import load_features
|
||||
|
@ -24,6 +26,7 @@ from nyaggle.validation.split import check_cv
|
|||
|
||||
GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'metrics', 'models', 'importance', 'time',
|
||||
'submission_df'])
|
||||
GBDTModel = Union[CatBoost, LGBMModel, XGBModel]
|
||||
|
||||
|
||||
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
|
||||
|
@ -256,6 +259,9 @@ def experiment_gbdt(model_params: Dict[str, Any],
|
|||
|
||||
_check_input(X_train, y, X_test)
|
||||
|
||||
if categorical_feature is None:
|
||||
categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
|
||||
|
||||
if with_auto_prep:
|
||||
X_train, X_test = autoprep_gbdt(X_train, X_test, categorical_feature, gbdt_type)
|
||||
|
||||
|
@ -275,13 +281,11 @@ def experiment_gbdt(model_params: Dict[str, Any],
|
|||
exp.log_param('features', feature_list)
|
||||
|
||||
if tuning_time_budget is not None:
|
||||
assert gbdt_type == 'lgbm', 'auto-tuning with catboost is not supported'
|
||||
assert gbdt_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
|
||||
model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
|
||||
time_budget=tuning_time_budget, type_of_target=type_of_target)
|
||||
exp.log_param('model_params_tuned', model_params)
|
||||
|
||||
if categorical_feature is None:
|
||||
categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
|
||||
exp.log('Categorical: {}'.format(categorical_feature))
|
||||
|
||||
if type_of_target == 'auto':
|
||||
|
@ -363,6 +367,9 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
|
|||
('binary', 'cat', CatBoostClassifier, roc_auc_score, 'cat_features'),
|
||||
('multiclass', 'cat', CatBoostClassifier, log_loss, 'cat_features'),
|
||||
('continuous', 'cat', CatBoostRegressor, mean_squared_error, 'cat_features'),
|
||||
('binary', 'xgb', XGBClassifier, roc_auc_score, None),
|
||||
('multiclass', 'xgb', XGBClassifier, log_loss, None),
|
||||
('continuous', 'xgb', XGBRegressor, mean_squared_error, None),
|
||||
]
|
||||
found = first_true(gbdt_table, pred=lambda x: x[0] == target_type and x[1] == gbdt_type)
|
||||
if found is None:
|
||||
|
@ -375,17 +382,17 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
|
|||
return model, eval_func, cat_param
|
||||
|
||||
|
||||
def _save_model(gbdt_type: str, model: Union[CatBoost, LGBMModel], logging_directory: str, fold: int, exp: Experiment):
|
||||
def _save_model(gbdt_type: str, model: GBDTModel, logging_directory: str, fold: int, exp: Experiment):
|
||||
model_dir = os.path.join(logging_directory, 'models')
|
||||
os.makedirs(model_dir, exist_ok=True)
|
||||
path = os.path.join(model_dir, 'fold{}'.format(fold))
|
||||
|
||||
if gbdt_type == 'cat':
|
||||
assert isinstance(model, CatBoost)
|
||||
model.save_model(path)
|
||||
else:
|
||||
if gbdt_type == 'lgbm':
|
||||
assert isinstance(model, LGBMModel)
|
||||
model.booster_.save_model(path)
|
||||
else:
|
||||
assert isinstance(model, (XGBModel, CatBoost))
|
||||
model.save_model(path)
|
||||
|
||||
exp.log_artifact(path)
|
||||
|
||||
|
@ -400,6 +407,25 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series,
|
|||
assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
|
||||
|
||||
|
||||
def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series], sall: pd.Series):
|
||||
if is_integer_dtype(strain.dtype):
|
||||
fillval = sall.min() - 1
|
||||
else:
|
||||
unique_values = sall.unique()
|
||||
fillval = 'na'
|
||||
while fillval in unique_values:
|
||||
fillval += '-'
|
||||
if is_categorical(strain):
|
||||
strain = strain.cat.add_categories(fillval).fillna(fillval)
|
||||
if stest is not None:
|
||||
stest = stest.cat.add_categories(fillval).fillna(fillval)
|
||||
else:
|
||||
strain = strain.fillna(fillval)
|
||||
if stest is not None:
|
||||
stest = stest.fillna(fillval)
|
||||
return strain, stest
|
||||
|
||||
|
||||
def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
|
||||
categorical_feature: Optional[List[str]] = None,
|
||||
gbdt_type: str = 'lgbm') -> Tuple[pd.DataFrame, pd.DataFrame]:
|
||||
|
@ -412,20 +438,22 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
|
|||
|
||||
# https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
|
||||
for c in categorical_feature:
|
||||
if is_integer_dtype(X_train[c].dtype):
|
||||
fillval = X_all[c].min() - 1
|
||||
if X_test is not None:
|
||||
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c], X_all[c])
|
||||
else:
|
||||
unique_values = X_all[c].unique()
|
||||
fillval = 'na'
|
||||
while fillval in unique_values:
|
||||
fillval += '-'
|
||||
if is_categorical(X_train[c]):
|
||||
X_train[c] = X_train[c].cat.add_categories(fillval).fillna(fillval)
|
||||
if X_test is not None:
|
||||
X_test[c] = X_test[c].cat.add_categories(fillval).fillna(fillval)
|
||||
else:
|
||||
X_train[c].fillna(fillval, inplace=True)
|
||||
if X_test is not None:
|
||||
X_test[c].fillna(fillval, inplace=True)
|
||||
X_train[c], _ = _fill_na_by_unique_value(X_train[c], None, X_all[c])
|
||||
|
||||
if gbdt_type == 'xgb' and len(categorical_feature) > 0:
|
||||
assert X_test is not None, "X_test is required for XGBoost with categorical variables"
|
||||
X_train = X_train.copy()
|
||||
X_test = X_test.copy()
|
||||
X_all = pd.concat([X_train, X_test]).copy()
|
||||
|
||||
for c in categorical_feature:
|
||||
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c],
|
||||
X_test[c] if X_test is not None else None, X_all[c])
|
||||
le = LabelEncoder()
|
||||
X_train[c] = le.fit_transform(X_train[c])
|
||||
X_test[c] = le.transform(X_test[c])
|
||||
|
||||
return X_train, X_test
|
||||
|
|
|
@ -27,23 +27,23 @@ def _return(parameter: Union[List[Dict], Dict], with_metadata: bool) -> Union[Li
|
|||
return parameter['parameters']
|
||||
|
||||
|
||||
def _get_table(gbdt_type: str = 'lgb'):
|
||||
if gbdt_type == 'lgb':
|
||||
def _get_table(gbdt_type: str = 'lgbm'):
|
||||
if gbdt_type == 'lgbm':
|
||||
return params_lgb
|
||||
elif gbdt_type == 'cat':
|
||||
return params_cat
|
||||
elif gbdt_type == 'xgb':
|
||||
return params_xgb
|
||||
raise ValueError('gbdt type should be one of (lgb, cat, xgb)')
|
||||
raise ValueError('gbdt type should be one of (lgbm, cat, xgb)')
|
||||
|
||||
|
||||
def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> List[Dict]:
|
||||
def list_hyperparams(gbdt_type: str = 'lgbm', with_metadata: bool = False) -> List[Dict]:
|
||||
"""
|
||||
List all hyperparameters
|
||||
|
||||
Args:
|
||||
gbdt_type:
|
||||
The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
|
||||
The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
|
||||
with_metadata:
|
||||
When set to True, parameters are wrapped by metadata dictionary which contains information about
|
||||
source URL, competition name etc.
|
||||
|
@ -53,7 +53,7 @@ def list_hyperparams(gbdt_type: str = 'lgb', with_metadata: bool = False) -> Lis
|
|||
return _return(_get_table(gbdt_type), with_metadata)
|
||||
|
||||
|
||||
def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool = False) -> Dict:
|
||||
def get_hyperparam_byname(name: str, gbdt_type: str = 'lgbm', with_metadata: bool = False) -> Dict:
|
||||
"""
|
||||
Get a hyperparameter by parameter name
|
||||
|
||||
|
@ -61,7 +61,7 @@ def get_hyperparam_byname(name: str, gbdt_type: str = 'lgb', with_metadata: bool
|
|||
name:
|
||||
The name of parameter (e.g. "ieee-2019-10th").
|
||||
gbdt_type:
|
||||
The type of gbdt library. ``lgb``, ``cat``, ``xgb`` can be used.
|
||||
The type of gbdt library. ``lgbm``, ``cat``, ``xgb`` can be used.
|
||||
with_metadata:
|
||||
When set to True, parameters are wrapped by metadata dictionary which contains information about
|
||||
source URL, competition name etc.
|
||||
|
|
|
@ -10,6 +10,7 @@ import sklearn.utils.multiclass as multiclass
|
|||
from category_encoders.utils import convert_input, convert_input_vector
|
||||
from catboost import CatBoost
|
||||
from lightgbm import LGBMModel
|
||||
from xgboost import XGBModel
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
from nyaggle.validation.split import check_cv
|
||||
|
@ -155,7 +156,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
|
|||
else:
|
||||
fit_params_fold = copy.copy(fit_params)
|
||||
|
||||
if isinstance(estimator[n], (LGBMModel, CatBoost)):
|
||||
if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
|
||||
if early_stopping:
|
||||
if 'eval_set' not in fit_params_fold:
|
||||
fit_params_fold['eval_set'] = [(valid_x, valid_y)]
|
||||
|
@ -175,7 +176,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
|
|||
if on_each_fold is not None:
|
||||
on_each_fold(n, estimator[n], train_x, train_y)
|
||||
|
||||
if isinstance(estimator[n], (LGBMModel, CatBoost)):
|
||||
if isinstance(estimator[n], (LGBMModel, CatBoost, XGBModel)):
|
||||
importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))
|
||||
|
||||
if eval_func is not None:
|
||||
|
@ -200,7 +201,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
|
|||
return CVResult(oof, predicted, scores, importance)
|
||||
|
||||
|
||||
def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[str],
|
||||
def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel, XGBModel], features: List[str],
|
||||
importance_type: str) -> pd.DataFrame:
|
||||
df = pd.DataFrame()
|
||||
|
||||
|
@ -208,6 +209,8 @@ def _get_gbdt_importance(gbdt_model: Union[CatBoost, LGBMModel], features: List[
|
|||
|
||||
if isinstance(gbdt_model, CatBoost):
|
||||
df['importance'] = gbdt_model.get_feature_importance()
|
||||
elif isinstance(gbdt_model, XGBModel):
|
||||
df['importance'] = gbdt_model.feature_importances_
|
||||
elif isinstance(gbdt_model, LGBMModel):
|
||||
df['importance'] = gbdt_model.booster_.feature_importance(importance_type=importance_type)
|
||||
|
||||
|
|
|
@ -11,3 +11,4 @@ seaborn
|
|||
sklearn
|
||||
tqdm
|
||||
transformers
|
||||
xgboost
|
||||
|
|
3
setup.py
3
setup.py
|
@ -40,7 +40,8 @@ setup(
|
|||
'seaborn',
|
||||
'sklearn',
|
||||
'tqdm',
|
||||
'transformers>=2.3.0'
|
||||
'transformers>=2.3.0',
|
||||
'xgboost'
|
||||
],
|
||||
author='nyanp',
|
||||
author_email='Noumi.Taiga@gmail.com',
|
||||
|
|
|
@ -148,6 +148,71 @@ def test_experiment_cat_multiclass():
|
|||
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
||||
|
||||
def test_experiment_xgb_classifier():
    """Run ``experiment_gbdt`` with ``gbdt_type='xgb'`` on a generated classification dataset.

    Checks that predictions are continuous scores, that ROC-AUC on out-of-fold and test
    predictions is at least 0.9, and that the expected artifacts are written.
    """
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        # The scikit-learn wrapper of XGBoost controls the number of boosting rounds through
        # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train`` only.
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, gbdt_type='xgb',
                                 submission_filename='submission.csv')

        assert len(np.unique(result.oof_prediction)) > 5  # making sure prediction is not binarized
        assert len(np.unique(result.test_prediction)) > 5
        assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
        assert roc_auc_score(y_test, result.test_prediction) >= 0.9
        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
||||
|
||||
def test_experiment_xgb_regressor():
    """Run ``experiment_gbdt`` with ``gbdt_type='xgb'`` on a generated regression dataset.

    Checks that the last reported metric equals the mean squared error of the out-of-fold
    predictions and that the expected artifacts are written.
    """
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        # The scikit-learn wrapper of XGBoost controls the number of boosting rounds through
        # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train`` only.
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb')

        assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
||||
|
||||
def test_experiment_xgb_multiclass():
    """Run ``experiment_gbdt`` with ``gbdt_type='xgb'`` and ``type_of_target='multiclass'``.

    Uses a generated 5-class dataset and checks that out-of-fold and test predictions have
    one column per class, that the submission file has an ``id`` column plus one column per
    class, and that the expected artifacts are written.
    """
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2, n_classes=5,
                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        # The scikit-learn wrapper of XGBoost controls the number of boosting rounds through
        # ``n_estimators``; ``num_boost_round`` is an argument of ``xgboost.train`` only.
        'n_estimators': 100
    }

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path, gbdt_type='xgb',
                                 type_of_target='multiclass', submission_filename='submission.csv')

        assert result.oof_prediction.shape == (len(y_train), 5)
        assert result.test_prediction.shape == (len(y_test), 5)

        assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
||||
|
||||
def test_experiment_cat_custom_eval():
|
||||
X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
|
||||
random_state=0, id_column='user_id')
|
||||
|
|
Loading…
Reference in New Issue