refactoring
parent
5a6f553de6
commit
4468563cd6
22
README.md
22
README.md
|
@ -28,8 +28,8 @@ $pip install nyaggle
|
|||
## Examples
|
||||
|
||||
### Experiment Logging
|
||||
`experiment_gbdt()` is an high-level API for cross validation using
|
||||
gradient boosting algorithm. It outputs parameters, metrics, out of fold predictions, test predictions,
|
||||
`run_experiment()` is a high-level API for running experiments with cross validation.
|
||||
It outputs parameters, metrics, out of fold predictions, test predictions,
|
||||
feature importance and submission.csv under the specified directory.
|
||||
|
||||
It can be combined with mlflow tracking.
|
||||
|
@ -48,10 +48,10 @@ params = {
|
|||
'max_depth': 8
|
||||
}
|
||||
|
||||
result = experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test)
|
||||
result = run_experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test)
|
||||
|
||||
# You can get the outputs that are needed in data science competitions with a single API call
|
||||
|
||||
|
@ -67,11 +67,11 @@ print(result.submission_df) # The output dataframe saved as submission.csv
|
|||
|
||||
|
||||
# You can use it with mlflow and track your experiments through mlflow-ui
|
||||
result = experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test,
|
||||
with_mlflow=True)
|
||||
result = run_experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test,
|
||||
with_mlflow=True)
|
||||
```
|
||||
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ import argparse
|
|||
import pandas as pd
|
||||
|
||||
from sklearn.metrics import log_loss
|
||||
from nyaggle.experiment import experiment
|
||||
from nyaggle.experiment import run_experiment
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -24,8 +24,8 @@ if __name__ == "__main__":
|
|||
'task_type': 'GPU' if args.gpu else 'CPU'
|
||||
}
|
||||
|
||||
result = experiment(cat_params, X_train, y_train, X_test, logging_directory='bnp-paribas-{time}',
|
||||
eval_func=log_loss,
|
||||
algorithm_type='cat',
|
||||
sample_submission=pd.read_csv('sample_submission.csv'),
|
||||
with_mlflow=True)
|
||||
result = run_experiment(cat_params, X_train, y_train, X_test, logging_directory='bnp-paribas-{time}',
|
||||
eval_func=log_loss,
|
||||
algorithm_type='cat',
|
||||
sample_submission=pd.read_csv('sample_submission.csv'),
|
||||
with_mlflow=True)
|
||||
|
|
|
@ -2,7 +2,7 @@ import pandas as pd
|
|||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import GroupKFold
|
||||
|
||||
from nyaggle.experiment import experiment
|
||||
from nyaggle.experiment import run_experiment
|
||||
from nyaggle.feature.category_encoder import TargetEncoder
|
||||
|
||||
lgb_params = {
|
||||
|
@ -39,13 +39,13 @@ X_train, y_train = transform(te, X_train, y_train)
|
|||
X_test, _ = transform(te, X_test, None)
|
||||
|
||||
# generated submission.csv scores 11.61445 in private LB (35th)
|
||||
experiment(logging_directory='baseline_kaggledays_tokyo',
|
||||
model_params=lgb_params,
|
||||
X_train=X_train,
|
||||
y=y_train,
|
||||
X_test=X_test,
|
||||
eval_func=mean_squared_error,
|
||||
type_of_target='continuous',
|
||||
overwrite=True,
|
||||
with_auto_hpo=True,
|
||||
sample_submission=pd.read_csv('sample_submission.csv'))
|
||||
run_experiment(logging_directory='baseline_kaggledays_tokyo',
|
||||
model_params=lgb_params,
|
||||
X_train=X_train,
|
||||
y=y_train,
|
||||
X_test=X_test,
|
||||
eval_func=mean_squared_error,
|
||||
type_of_target='continuous',
|
||||
overwrite=True,
|
||||
with_auto_hpo=True,
|
||||
sample_submission=pd.read_csv('sample_submission.csv'))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pandas as pd
|
||||
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from nyaggle.experiment import experiment
|
||||
from nyaggle.experiment import run_experiment
|
||||
|
||||
|
||||
meta = pd.read_csv('training_set_metadata.csv')
|
||||
|
@ -18,9 +18,9 @@ lgb_param_extra = {
|
|||
|
||||
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
|
||||
|
||||
result_extra = experiment(lgb_param_extra,
|
||||
meta_extra.drop('target', axis=1),
|
||||
meta_extra['target'],
|
||||
logging_directory='plasticc-{time}',
|
||||
cv=skf,
|
||||
type_of_target='multiclass')
|
||||
result_extra = run_experiment(lgb_param_extra,
|
||||
meta_extra.drop('target', axis=1),
|
||||
meta_extra['target'],
|
||||
logging_directory='plasticc-{time}',
|
||||
cv=skf,
|
||||
type_of_target='multiclass')
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from nyaggle.experiment import experiment
|
||||
from nyaggle.experiment import run_experiment
|
||||
|
||||
|
||||
csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
|
||||
|
@ -20,11 +20,11 @@ params = {
|
|||
'reg_alpha': 0.1
|
||||
}
|
||||
|
||||
result = experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test,
|
||||
result = run_experiment(params,
|
||||
X_train,
|
||||
y_train,
|
||||
X_test,
|
||||
'./wine-quality-{time}',
|
||||
type_of_target='continuous',
|
||||
with_mlflow=True,
|
||||
with_auto_hpo=True)
|
||||
type_of_target='continuous',
|
||||
with_mlflow=True,
|
||||
with_auto_hpo=True)
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
from nyaggle.experiment.experiment import Experiment, add_leaderboard_score
|
||||
from nyaggle.experiment.averaging import average_results
|
||||
from nyaggle.experiment.gbdt import autoprep_gbdt, experiment, find_best_lgbm_parameter
|
||||
from nyaggle.experiment.run import autoprep_gbdt, run_experiment, find_best_lgbm_parameter
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
from typing import List, Optional, Tuple, Type, Union
|
||||
|
||||
import pandas as pd
|
||||
from catboost import CatBoost
|
||||
from lightgbm import LGBMModel
|
||||
from pandas.api.types import is_integer_dtype, is_categorical
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from xgboost import XGBModel
|
||||
|
||||
GBDTModel = Union[CatBoost, LGBMModel, XGBModel]
|
||||
|
||||
|
||||
def autoprep_gbdt(model: Type[GBDTModel], X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
                  categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocess categorical columns so that the given GBDT model class can consume them.

    Args:
        model:
            Model class (not an instance); the treatment is chosen with ``issubclass`` against
            ``LGBMModel``, ``CatBoost`` and ``XGBModel``.
        X_train:
            Training data. It is copied before any column is modified.
        X_test:
            Test data, or ``None``. For CatBoost a one-row dummy taken from ``X_train`` is used when it is
            ``None``; for LightGBM/XGBoost it must be given whenever there is a column left to treat.
        categorical_feature_to_treat:
            Names of the columns to treat. If ``None``, every column whose dtype name is ``object`` or
            ``category`` is selected.

    Returns:
        Tuple of (X_train, X_test) after preprocessing. When nothing has to be treated, the inputs are
        returned as they were passed in.

    Raises:
        AssertionError:
            If ``X_test`` is ``None`` while a LightGBM/XGBoost model has categorical columns to encode.
    """
    # Imported locally because this module's import header does not bring numpy into scope;
    # without it the label-encoding branch below fails with NameError on ``np``.
    import numpy as np

    if categorical_feature_to_treat is None:
        categorical_feature_to_treat = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]

    # LightGBM:
    # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
    # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
    #
    # CatBoost:
    # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
    # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
    #
    # XGBoost:
    # All categorical column should be encoded beforehand.

    if issubclass(model, LGBMModel):
        # LightGBM can handle categorical dtype natively
        categorical_feature_to_treat = [c for c in categorical_feature_to_treat if not is_categorical(X_train[c])]

    if issubclass(model, CatBoost) and len(categorical_feature_to_treat) > 0:
        X_train = X_train.copy()
        X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])

    if issubclass(model, (LGBMModel, XGBModel)) and len(categorical_feature_to_treat) > 0:
        # NOTE: the message mentions only XGBoost, but this check also applies to LightGBM.
        assert X_test is not None, "X_test is required for XGBoost with categorical variables"
        X_train = X_train.copy()
        X_test = X_test.copy()

        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
            # Fit a single encoder on train and test together so both share one label mapping.
            le = LabelEncoder()
            concat = np.concatenate([X_train[c].values, X_test[c].values])
            concat = le.fit_transform(concat)
            X_train[c] = concat[:len(X_train)]
            X_test[c] = concat[len(X_train):]

    return X_train, X_test
|
||||
|
||||
|
||||
def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
|
||||
if is_categorical(strain):
|
||||
return strain.cat.codes, stest.cat.codes
|
||||
elif is_integer_dtype(strain.dtype):
|
||||
fillval = min(strain.min(), stest.min()) - 1
|
||||
return strain.fillna(fillval), stest.fillna(fillval)
|
||||
else:
|
||||
return strain.astype(str), stest.astype(str)
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
import copy
|
||||
from typing import Dict, Iterable, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import optuna.integration.lightgbm as optuna_lgb
|
||||
import sklearn.utils.multiclass as multiclass
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
|
||||
from nyaggle.validation.split import check_cv
|
||||
|
||||
|
||||
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Tune LightGBM hyperparameters with optuna's LightGBM integration.

    Only the first split produced by ``cv`` is used: its training part is fitted and its
    held-out part is passed as the validation set.

    Args:
        base_param:
            Starting parameters. The dict itself is never modified; missing settings
            (early stopping, number of iterations, objective, metric, verbosity) are filled in on a copy.
        X:
            Training data.
        y:
            Target
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
        groups:
            Group labels for the samples, forwarded to ``cv.split``.
        time_budget:
            Time budget for tuning (in seconds).
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.

    Returns:
        A copy of ``base_param`` updated with the best parameters found
    """
    splitter = check_cv(cv, y)
    target_kind = multiclass.type_of_target(y) if type_of_target == 'auto' else type_of_target

    # Take the first fold only; the remaining folds are not consumed.
    fit_idx, holdout_idx = next(splitter.split(X, y, groups))
    fit_set = optuna_lgb.Dataset(X.iloc[fit_idx], y.iloc[fit_idx])
    holdout_set = optuna_lgb.Dataset(X.iloc[holdout_idx], y.iloc[holdout_idx])

    tuning_param = copy.deepcopy(base_param)
    tuning_param.setdefault('early_stopping_rounds', 100)

    # Respect any spelling of the iteration count the caller already supplied.
    iteration_aliases = ('num_iterations', 'num_iteration',
                         'num_trees', 'num_tree',
                         'num_rounds', 'num_round')
    if all(alias not in tuning_param for alias in iteration_aliases):
        tuning_param['num_iterations'] = tuning_param.get('n_estimators', 10000)

    if 'objective' not in tuning_param:
        objective_by_target = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        tuning_param['objective'] = objective_by_target[target_kind]

    if 'metric' not in tuning_param and 'objective' in tuning_param:
        # Pair each family of objective names with the metric used when none was given.
        metric_by_objective = (
            (('regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
              'root_mean_squared_error', 'rmse'), 'l2'),
            (('regression_l1', 'l1', 'mean_absolute_error', 'mae'), 'l1'),
            (('binary',), 'binary_logloss'),
            (('multiclass',), 'multi_logloss'),
        )
        for objective_names, default_metric in metric_by_objective:
            if tuning_param['objective'] in objective_names:
                tuning_param['metric'] = default_metric

    if 'verbose' not in tuning_param and 'verbosity' not in tuning_param:
        tuning_param['verbosity'] = -1

    # The two containers are passed to the tuner, which is expected to fill them in.
    best_params, tuning_history = dict(), list()
    optuna_lgb.train(tuning_param, fit_set, valid_sets=[holdout_set], verbose_eval=0,
                     best_params=best_params, tuning_history=tuning_history, time_budget=time_budget)

    tuned = copy.deepcopy(base_param)
    tuned.update(best_params)
    return tuned
|
|
@ -1,26 +1,24 @@
|
|||
import copy
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from collections import namedtuple
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
|
||||
|
||||
import numpy as np
|
||||
import optuna.integration.lightgbm as optuna_lgb
|
||||
import pandas as pd
|
||||
import sklearn.utils.multiclass as multiclass
|
||||
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
|
||||
from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor
|
||||
from xgboost import XGBModel, XGBClassifier, XGBRegressor
|
||||
from more_itertools import first_true
|
||||
from pandas.api.types import is_integer_dtype, is_categorical
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
from xgboost import XGBModel, XGBClassifier, XGBRegressor
|
||||
|
||||
from nyaggle.experiment.auto_prep import autoprep_gbdt
|
||||
from nyaggle.experiment.experiment import Experiment
|
||||
from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter
|
||||
from nyaggle.feature_store import load_features
|
||||
from nyaggle.util import plot_importance
|
||||
from nyaggle.validation.cross_validate import cross_validate
|
||||
|
@ -39,104 +37,26 @@ ExperimentResult = namedtuple('LGBResult',
|
|||
GBDTModel = Union[CatBoost, LGBMModel, XGBModel]
|
||||
|
||||
|
||||
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
|
||||
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
|
||||
groups: Optional[pd.Series] = None,
|
||||
time_budget: Optional[int] = None,
|
||||
type_of_target: str = 'auto') -> Dict:
|
||||
"""
|
||||
Search hyperparameter for lightgbm using optuna.
|
||||
|
||||
Args:
|
||||
base_param:
|
||||
Base parameters passed to lgb.train.
|
||||
X:
|
||||
Training data.
|
||||
y:
|
||||
Target
|
||||
cv:
|
||||
int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
|
||||
groups:
|
||||
Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
|
||||
time_budget:
|
||||
Time budget for tuning (in seconds).
|
||||
type_of_target:
|
||||
The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
|
||||
Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
|
||||
|
||||
Returns:
|
||||
The best parameters found
|
||||
"""
|
||||
cv = check_cv(cv, y)
|
||||
|
||||
if type_of_target == 'auto':
|
||||
type_of_target = multiclass.type_of_target(y)
|
||||
|
||||
train_index, test_index = next(cv.split(X, y, groups))
|
||||
|
||||
dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
|
||||
dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])
|
||||
|
||||
params = copy.deepcopy(base_param)
|
||||
if 'early_stopping_rounds' not in params:
|
||||
params['early_stopping_rounds'] = 100
|
||||
|
||||
if not any([p in params for p in ('num_iterations', 'num_iteration',
|
||||
'num_trees', 'num_tree',
|
||||
'num_rounds', 'num_round')]):
|
||||
params['num_iterations'] = params.get('n_estimators', 10000)
|
||||
|
||||
if 'objective' not in params:
|
||||
tot_to_objective = {
|
||||
'binary': 'binary',
|
||||
'continuous': 'regression',
|
||||
'multiclass': 'multiclass'
|
||||
}
|
||||
params['objective'] = tot_to_objective[type_of_target]
|
||||
|
||||
if 'metric' not in params and 'objective' in params:
|
||||
if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
|
||||
'root_mean_squared_error', 'rmse']:
|
||||
params['metric'] = 'l2'
|
||||
if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
|
||||
params['metric'] = 'l1'
|
||||
if params['objective'] in ['binary']:
|
||||
params['metric'] = 'binary_logloss'
|
||||
if params['objective'] in ['multiclass']:
|
||||
params['metric'] = 'multi_logloss'
|
||||
|
||||
if not any([p in params for p in ('verbose', 'verbosity')]):
|
||||
params['verbosity'] = -1
|
||||
|
||||
best_params, tuning_history = dict(), list()
|
||||
optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
|
||||
best_params=best_params, tuning_history=tuning_history, time_budget=time_budget)
|
||||
|
||||
result_param = copy.deepcopy(base_param)
|
||||
result_param.update(best_params)
|
||||
return result_param
|
||||
|
||||
|
||||
def experiment(model_params: Dict[str, Any],
|
||||
X_train: pd.DataFrame, y: pd.Series,
|
||||
X_test: Optional[pd.DataFrame] = None,
|
||||
logging_directory: str = 'output/{time}',
|
||||
overwrite: bool = False,
|
||||
eval_func: Optional[Callable] = None,
|
||||
algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
|
||||
fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
|
||||
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
|
||||
groups: Optional[pd.Series] = None,
|
||||
categorical_feature: Optional[List[str]] = None,
|
||||
sample_submission: Optional[pd.DataFrame] = None,
|
||||
submission_filename: Optional[str] = None,
|
||||
type_of_target: str = 'auto',
|
||||
feature_list: Optional[List[Union[int, str]]] = None,
|
||||
feature_directory: Optional[str] = None,
|
||||
with_auto_hpo: bool = False,
|
||||
with_auto_prep: bool = False,
|
||||
with_mlflow: bool = False
|
||||
):
|
||||
def run_experiment(model_params: Dict[str, Any],
|
||||
X_train: pd.DataFrame, y: pd.Series,
|
||||
X_test: Optional[pd.DataFrame] = None,
|
||||
logging_directory: str = 'output/{time}',
|
||||
overwrite: bool = False,
|
||||
eval_func: Optional[Callable] = None,
|
||||
algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
|
||||
fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
|
||||
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
|
||||
groups: Optional[pd.Series] = None,
|
||||
categorical_feature: Optional[List[str]] = None,
|
||||
sample_submission: Optional[pd.DataFrame] = None,
|
||||
submission_filename: Optional[str] = None,
|
||||
type_of_target: str = 'auto',
|
||||
feature_list: Optional[List[Union[int, str]]] = None,
|
||||
feature_directory: Optional[str] = None,
|
||||
with_auto_hpo: bool = False,
|
||||
with_auto_prep: bool = False,
|
||||
with_mlflow: bool = False
|
||||
):
|
||||
"""
|
||||
Evaluate metrics by cross-validation and stores result
|
||||
(log, oof prediction, test prediction, feature importance plot and submission file)
|
||||
|
@ -410,55 +330,3 @@ def _check_input(X_train: pd.DataFrame, y: pd.Series,
|
|||
|
||||
if X_test is not None:
|
||||
assert list(X_train.columns) == list(X_test.columns), "columns are different between X_train and X_test"
|
||||
|
||||
|
||||
def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
|
||||
if is_categorical(strain):
|
||||
return strain.cat.codes, stest.cat.codes
|
||||
elif is_integer_dtype(strain.dtype):
|
||||
fillval = min(strain.min(), stest.min()) - 1
|
||||
return strain.fillna(fillval), stest.fillna(fillval)
|
||||
else:
|
||||
return strain.astype(str), stest.astype(str)
|
||||
|
||||
|
||||
def autoprep_gbdt(model: Type[GBDTModel], X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
|
||||
categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
||||
if categorical_feature_to_treat is None:
|
||||
categorical_feature_to_treat = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]
|
||||
|
||||
# LightGBM:
|
||||
# Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
|
||||
# https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
|
||||
#
|
||||
# CatBoost:
|
||||
# int, float, bool or str is acceptable for categorical columns. NaN should be filled.
|
||||
# https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
|
||||
#
|
||||
# XGBoost:
|
||||
# All categorical column should be encoded beforehand.
|
||||
|
||||
if issubclass(model, LGBMModel):
|
||||
# LightGBM can handle categorical dtype natively
|
||||
categorical_feature_to_treat = [c for c in categorical_feature_to_treat if not is_categorical(X_train[c])]
|
||||
|
||||
if issubclass(model, CatBoost) and len(categorical_feature_to_treat) > 0:
|
||||
X_train = X_train.copy()
|
||||
X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy() # dummy
|
||||
for c in categorical_feature_to_treat:
|
||||
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
|
||||
|
||||
if issubclass(model, (LGBMModel, XGBModel)) and len(categorical_feature_to_treat) > 0:
|
||||
assert X_test is not None, "X_test is required for XGBoost with categorical variables"
|
||||
X_train = X_train.copy()
|
||||
X_test = X_test.copy()
|
||||
|
||||
for c in categorical_feature_to_treat:
|
||||
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
|
||||
le = LabelEncoder()
|
||||
concat = np.concatenate([X_train[c].values, X_test[c].values])
|
||||
concat = le.fit_transform(concat)
|
||||
X_train[c] = concat[:len(X_train)]
|
||||
X_test[c] = concat[len(X_train):]
|
||||
|
||||
return X_train, X_test
|
|
@ -4,7 +4,7 @@ import tempfile
|
|||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from nyaggle.experiment import average_results, experiment
|
||||
from nyaggle.experiment import average_results, run_experiment
|
||||
from nyaggle.testing import make_classification_df
|
||||
|
||||
|
||||
|
@ -22,8 +22,8 @@ def test_averaging():
|
|||
with tempfile.TemporaryDirectory() as temp_path:
|
||||
for i in range(3):
|
||||
params['seed'] = i
|
||||
ret_single = experiment(params, X_train, y_train, X_test,
|
||||
os.path.join(temp_path, 'seed{}'.format(i)))
|
||||
ret_single = run_experiment(params, X_train, y_train, X_test,
|
||||
os.path.join(temp_path, 'seed{}'.format(i)))
|
||||
|
||||
df = average_results([
|
||||
os.path.join(temp_path, 'seed{}'.format(i)) for i in range(3)
|
||||
|
|
|
@ -11,7 +11,7 @@ from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_err
|
|||
from sklearn.model_selection import GroupKFold, KFold, train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
|
||||
from nyaggle.experiment import experiment
|
||||
from nyaggle.experiment import run_experiment
|
||||
from nyaggle.feature_store import save_feature
|
||||
from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory
|
||||
|
||||
|
@ -33,7 +33,7 @@ def test_experiment_lgb_classifier():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -55,7 +55,7 @@ def test_experiment_lgb_regressor():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -76,7 +76,7 @@ def test_experiment_lgb_multiclass():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path)
|
||||
|
||||
assert len(np.unique(result.oof_prediction[:, 0])) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction[:, 0])) > 5
|
||||
|
@ -98,8 +98,8 @@ def test_experiment_cat_classifier():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, algorithm_type='cat',
|
||||
submission_filename='submission.csv', with_auto_prep=True)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, algorithm_type='cat',
|
||||
submission_filename='submission.csv', with_auto_prep=True)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -122,7 +122,7 @@ def test_experiment_cat_regressor():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')
|
||||
|
||||
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
|
||||
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
@ -140,8 +140,8 @@ def test_experiment_cat_multiclass():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat',
|
||||
type_of_target='multiclass', submission_filename='submission.csv', with_auto_prep=True)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat',
|
||||
type_of_target='multiclass', submission_filename='submission.csv', with_auto_prep=True)
|
||||
|
||||
assert result.oof_prediction.shape == (len(y_train), 5)
|
||||
assert result.test_prediction.shape == (len(y_test), 5)
|
||||
|
@ -163,8 +163,8 @@ def test_experiment_xgb_classifier():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, algorithm_type='xgb',
|
||||
submission_filename='submission.csv', with_auto_prep=True)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score, algorithm_type='xgb',
|
||||
submission_filename='submission.csv', with_auto_prep=True)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -187,7 +187,7 @@ def test_experiment_xgb_regressor():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)
|
||||
|
||||
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
|
||||
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
@ -205,9 +205,9 @@ def test_experiment_xgb_multiclass():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb',
|
||||
type_of_target='multiclass', submission_filename='submission.csv',
|
||||
with_auto_prep=True)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb',
|
||||
type_of_target='multiclass', submission_filename='submission.csv',
|
||||
with_auto_prep=True)
|
||||
|
||||
assert result.oof_prediction.shape == (len(y_train), 5)
|
||||
assert result.test_prediction.shape == (len(y_test), 5)
|
||||
|
@ -228,8 +228,8 @@ def test_experiment_sklearn_classifier():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score,
|
||||
algorithm_type=LogisticRegression, with_auto_prep=False)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, eval_func=roc_auc_score,
|
||||
algorithm_type=LogisticRegression, with_auto_prep=False)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -250,8 +250,8 @@ def test_experiment_sklearn_regressor():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, with_auto_prep=False,
|
||||
algorithm_type=LinearRegression)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, with_auto_prep=False,
|
||||
algorithm_type=LinearRegression)
|
||||
|
||||
assert len(np.unique(result.oof_prediction)) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction)) > 5
|
||||
|
@ -271,8 +271,8 @@ def test_experiment_sklearn_multiclass():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, algorithm_type=KNeighborsClassifier,
|
||||
with_auto_prep=False)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type=KNeighborsClassifier,
|
||||
with_auto_prep=False)
|
||||
|
||||
assert len(np.unique(result.oof_prediction[:, 0])) > 5 # making sure prediction is not binarized
|
||||
assert len(np.unique(result.test_prediction[:, 0])) > 5
|
||||
|
@ -295,8 +295,8 @@ def test_experiment_cat_custom_eval():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path,
|
||||
algorithm_type='cat', eval_func=mean_absolute_error)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path,
|
||||
algorithm_type='cat', eval_func=mean_absolute_error)
|
||||
|
||||
assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
|
||||
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
|
||||
|
@ -314,7 +314,7 @@ def test_experiment_without_test_data():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, None, temp_path)
|
||||
result = run_experiment(params, X_train, y_train, None, temp_path)
|
||||
|
||||
assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
|
||||
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
|
||||
|
@ -333,11 +333,11 @@ def test_experiment_fit_params():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result1 = experiment(params, X_train, y_train, X_test,
|
||||
temp_path, fit_params={'early_stopping_rounds': None})
|
||||
result1 = run_experiment(params, X_train, y_train, X_test,
|
||||
temp_path, fit_params={'early_stopping_rounds': None})
|
||||
with get_temp_directory() as temp_path:
|
||||
result2 = experiment(params, X_train, y_train, X_test,
|
||||
temp_path, fit_params={'early_stopping_rounds': 5})
|
||||
result2 = run_experiment(params, X_train, y_train, X_test,
|
||||
temp_path, fit_params={'early_stopping_rounds': 5})
|
||||
|
||||
assert result1.models[-1].booster_.num_trees() == params['n_estimators']
|
||||
assert result2.models[-1].booster_.num_trees() < params['n_estimators']
|
||||
|
@ -355,7 +355,7 @@ def test_experiment_mlflow():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)
|
||||
run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)
|
||||
|
||||
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
|
||||
|
||||
|
@ -380,13 +380,13 @@ def test_experiment_already_exists():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment(params, X_train, y_train, None, temp_path, overwrite=True)
|
||||
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
|
||||
|
||||
# result is overwritten by default
|
||||
experiment(params, X_train, y_train, None, temp_path, overwrite=True)
|
||||
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
experiment(params, X_train, y_train, None, temp_path, overwrite=False)
|
||||
run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
|
||||
|
||||
|
||||
def test_submission_filename():
|
||||
|
@ -401,7 +401,7 @@ def test_submission_filename():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment(params, X_train, y_train, X_test, temp_path, submission_filename='sub.csv')
|
||||
run_experiment(params, X_train, y_train, X_test, temp_path, submission_filename='sub.csv')
|
||||
|
||||
df = pd.read_csv(os.path.join(temp_path, 'sub.csv'))
|
||||
assert list(df.columns) == ['id', 'target']
|
||||
|
@ -419,7 +419,7 @@ def test_experiment_manual_cv_kfold():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, None, temp_path, cv=KFold(4))
|
||||
result = run_experiment(params, X_train, y_train, None, temp_path, cv=KFold(4))
|
||||
assert len(result.models) == 4
|
||||
assert len(result.metrics) == 4 + 1
|
||||
|
||||
|
@ -436,7 +436,7 @@ def test_experiment_manual_cv_int():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, None, temp_path, cv=KFold(2))
|
||||
result = run_experiment(params, X_train, y_train, None, temp_path, cv=KFold(2))
|
||||
assert len(result.models) == 2
|
||||
assert len(result.metrics) == 2 + 1
|
||||
|
||||
|
@ -467,7 +467,7 @@ def test_experiment_manual_cv_group():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, cv=GroupKFold(2), groups=grp)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, cv=GroupKFold(2), groups=grp)
|
||||
assert result.metrics[-1] < 0.7
|
||||
|
||||
|
||||
|
@ -485,7 +485,7 @@ def test_experiment_sample_submission_binary():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)
|
||||
|
||||
assert list(result.submission_df.columns) == ['target_id_abc', 'target_value_abc']
|
||||
assert roc_auc_score(y_test, result.submission_df['target_value_abc']) > 0.8
|
||||
|
@ -506,7 +506,7 @@ def test_experiment_sample_submission_multiclass():
|
|||
}
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result = experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)
|
||||
result = run_experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)
|
||||
|
||||
assert list(result.submission_df.columns) == ['target_id_abc',
|
||||
'target_class_0',
|
||||
|
@ -539,11 +539,11 @@ def test_with_feature_attachment():
|
|||
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result_wo_feature = experiment(params, X_train, y_train, X_test, logging_directory=temp_path)
|
||||
result_wo_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path)
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
result_w_feature = experiment(params, X_train, y_train, X_test, logging_directory=temp_path,
|
||||
feature_list=[0, 1, 2, 3], feature_directory=temp_feature_path)
|
||||
result_w_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path,
|
||||
feature_list=[0, 1, 2, 3], feature_directory=temp_feature_path)
|
||||
|
||||
assert result_w_feature.metrics[-1] > result_wo_feature.metrics[-1]
|
||||
|
||||
|
@ -560,8 +560,8 @@ def test_with_long_params():
|
|||
|
||||
with get_temp_directory() as temp_path:
|
||||
# just to make sure the experiment finishes
|
||||
experiment(params, X_train, y_train, X_test,
|
||||
logging_directory=temp_path, with_mlflow=True)
|
||||
run_experiment(params, X_train, y_train, X_test,
|
||||
logging_directory=temp_path, with_mlflow=True)
|
||||
|
||||
|
||||
def test_with_rare_categories():
|
||||
|
@ -602,9 +602,9 @@ def test_with_rare_categories():
|
|||
|
||||
for algorithm in ('cat', 'xgb', 'lgbm'):
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment(params[algorithm], X_train, y_train, X_test, algorithm_type=algorithm,
|
||||
logging_directory=temp_path, with_mlflow=True, with_auto_prep=True,
|
||||
categorical_feature=['x0', 'x1', 'x2', 'x3'])
|
||||
run_experiment(params[algorithm], X_train, y_train, X_test, algorithm_type=algorithm,
|
||||
logging_directory=temp_path, with_mlflow=True, with_auto_prep=True,
|
||||
categorical_feature=['x0', 'x1', 'x2', 'x3'])
|
||||
|
||||
|
||||
def test_inherit_outer_scope_run():
|
||||
|
@ -618,7 +618,7 @@ def test_inherit_outer_scope_run():
|
|||
X, y = make_classification_df()
|
||||
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment(params, X, y, with_mlflow=True, logging_directory=temp_path)
|
||||
run_experiment(params, X, y, with_mlflow=True, logging_directory=temp_path)
|
||||
|
||||
assert mlflow.active_run() is not None # still valid
|
||||
|
||||
|
|
Loading…
Reference in New Issue