From 174cc19b538ea00ceaf99eda9a0e47ea014fe0c8 Mon Sep 17 00:00:00 2001 From: Taiga Noumi Date: Wed, 19 Feb 2020 20:05:46 +0900 Subject: [PATCH] change params/metrics to json format, replace overwrite to if_exists --- .../kaggle-days-tokyo/kaggle_days_tokyo.py | 2 +- nyaggle/experiment/experiment.py | 80 +++++++++++++++---- nyaggle/experiment/run.py | 12 ++- tests/experiment/test_experiment.py | 15 ++-- tests/experiment/test_run.py | 48 ++++++----- 5 files changed, 107 insertions(+), 50 deletions(-) diff --git a/examples/kaggle-days-tokyo/kaggle_days_tokyo.py b/examples/kaggle-days-tokyo/kaggle_days_tokyo.py index e570abe..4742425 100644 --- a/examples/kaggle-days-tokyo/kaggle_days_tokyo.py +++ b/examples/kaggle-days-tokyo/kaggle_days_tokyo.py @@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo', X_test=X_test, eval_func=mean_squared_error, type_of_target='continuous', - overwrite=True, + if_exists='replace', with_auto_hpo=True, sample_submission=pd.read_csv('sample_submission.csv')) diff --git a/nyaggle/experiment/experiment.py b/nyaggle/experiment/experiment.py index d92ee9a..6214443 100644 --- a/nyaggle/experiment/experiment.py +++ b/nyaggle/experiment/experiment.py @@ -1,4 +1,5 @@ import json +import numbers import os import shutil import uuid @@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit): return param +def _check_directory(directory: str, if_exists: str) -> str: + if os.path.exists(directory): + if if_exists == 'error': + raise ValueError('directory {} already exists.'.format(directory)) + elif if_exists == 'replace': + warnings.warn( + 'directory {} already exists. It will be replaced by the new result'.format(directory)) + shutil.rmtree(directory, ignore_errors=True) + elif if_exists == 'rename': + postfix_index = 1 + + while os.path.exists(directory + '_' + str(postfix_index)): + postfix_index += 1 + + directory += '_' + str(postfix_index) + warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory)) + return directory + + class Experiment(object): """Minimal experiment logger for Kaggle @@ -42,8 +62,6 @@ class Experiment(object): Args: logging_directory: Path to directory where output is stored. - overwrite: - If True, contents in ``logging_directory`` will be overwritten. custom_logger: A custom logger to be used instead of default logger. with_mlflow: @@ -51,17 +69,24 @@ class Experiment(object): One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow. Note that all output files are located both ``logging_directory`` and mlflow's directory (``mlruns`` by default). + if_exists: + How to behave if the logging directory already exists. + - error: Raise a ValueError. + - replace: Delete logging directory before logging. + - append: Append to exisitng experiment. + - rename: Rename current directory by adding "_1", "_2"... 
prefix """ def __init__(self, logging_directory: str, - overwrite: bool = False, custom_logger: Optional[Logger] = None, with_mlflow: bool = False, mlflow_run_id: Optional[str] = None, - logging_mode: str = 'w' + if_exists: str = 'error' ): - os.makedirs(logging_directory, exist_ok=overwrite) + logging_directory = _check_directory(logging_directory, if_exists) + os.makedirs(logging_directory, exist_ok=True) + self.logging_directory = logging_directory self.with_mlflow = with_mlflow @@ -75,8 +100,8 @@ class Experiment(object): self.logger.setLevel(DEBUG) self.is_custom = False self.metrics_path = os.path.join(logging_directory, 'metrics.txt') - self.metrics = open(self.metrics_path, mode=logging_mode) - self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode) + self.metrics = self._load_dict('metrics.json') + self.params = self._load_dict('params.json') self.inherit_existing_run = False if self.with_mlflow: @@ -98,15 +123,13 @@ class Experiment(object): def continue_from(cls, logging_directory: str): params = { 'logging_directory': logging_directory, - 'overwrite': True, - 'logging_mode': 'a' + 'if_exists': 'append' } mlflow_path = os.path.join(logging_directory, 'mlflow.json') if os.path.exists(mlflow_path): with open(mlflow_path, 'r') as f: mlflow_metadata = json.load(f) - params['with_mlflow'] = True params['mlflow_run_id'] = mlflow_metadata['run_id'] @@ -132,12 +155,29 @@ class Experiment(object): with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f: json.dump(mlflow_metadata, f, indent=4) + def _load_dict(self, filename: str) -> Dict: + try: + path = os.path.join(self.logging_directory, filename) + with open(path, 'r') as f: + return json.load(f) + except IOError: + self.logger.warning('failed to load file: {}'.format(filename)) + return {} + + def _save_dict(self, obj: Dict, filename: str): + try: + path = os.path.join(self.logging_directory, filename) + with open(path, 'w') as f: + json.dump(obj, f) + except IOError: + self.logger.warning('failed to save file: {}'.format(filename)) + def stop(self): """ Stop current experiment. """ - self.metrics.close() - self.params.close() + self._save_dict(self.metrics, 'metrics.json') + self._save_dict(self.params, 'params.json') if not self.is_custom: for h in self.logger.handlers: @@ -146,7 +186,8 @@ class Experiment(object): if self.with_mlflow: import mlflow mlflow.log_artifact(self.log_path) - mlflow.log_artifact(self.metrics_path) + mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json')) + mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json')) if not self.inherit_existing_run: mlflow.end_run() @@ -182,6 +223,9 @@ class Experiment(object): """ self.logger.info(text) + def _sanitize(self, v): + return v if isinstance(v, numbers.Number) else str(v) + def log_param(self, key, value): """ Logs a key-value pair for the experiment. @@ -190,8 +234,9 @@ class Experiment(object): key: parameter name value: parameter value """ - self.params.write('{},{}\n'.format(key, value)) - self.params.flush() + key = self._sanitize(key) + value = self._sanitize(value) + self.params[key] = value if self.with_mlflow: import mlflow @@ -219,8 +264,9 @@ class Experiment(object): score: Metric value. 
""" - self.metrics.write('{},{}\n'.format(name, score)) - self.metrics.flush() + name = self._sanitize(name) + score = self._sanitize(score) + self.metrics[name] = score if self.with_mlflow: import mlflow diff --git a/nyaggle/experiment/run.py b/nyaggle/experiment/run.py index 4aaf51f..ad08307 100644 --- a/nyaggle/experiment/run.py +++ b/nyaggle/experiment/run.py @@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any], X_train: pd.DataFrame, y: pd.Series, X_test: Optional[pd.DataFrame] = None, logging_directory: str = 'output/{time}', - overwrite: bool = False, + if_exists: str = 'error', eval_func: Optional[Callable] = None, algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm', fit_params: Optional[Union[Dict[str, Any], Callable]] = None, @@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any], Test data (Optional). If specified, prediction on the test data is performed using ensemble of models. logging_directory: Path to directory where output of experiment is stored. - overwrite: - If True, contents in ``logging_directory`` will be overwritten. + if_exists: + How to behave if the logging directory already exists. + - error: Raise a ValueError. + - replace: Delete logging directory before logging. + - append: Append to exisitng experiment. + - rename: Rename current directory by adding "_1", "_2"... prefix fit_params: Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except eval_set passed for each fold. If callable is passed, @@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any], logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S')) - with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp: + with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp: exp.log('Algorithm: {}'.format(algorithm_type)) exp.log('Experiment: {}'.format(logging_directory)) exp.log('Params: {}'.format(model_params)) diff --git a/tests/experiment/test_experiment.py b/tests/experiment/test_experiment.py index d725eb5..c901f64 100644 --- a/tests/experiment/test_experiment.py +++ b/tests/experiment/test_experiment.py @@ -1,4 +1,6 @@ +import json import os + from nyaggle.experiment import Experiment from nyaggle.testing import get_temp_directory @@ -12,13 +14,7 @@ def test_experiment_continue(): with Experiment.continue_from(logging_dir) as e: e.log_metric('LB', 0.95) - metric_file = os.path.join(logging_dir, 'metrics.txt') - - with open(metric_file, 'r') as f: - lines = [line.split(',') for line in f.readlines()] - - assert lines[0][0] == 'CV' - assert lines[1][0] == 'LB' + metric_file = os.path.join(logging_dir, 'metrics.json') import mlflow @@ -26,3 +22,8 @@ def test_experiment_continue(): data = client.get_run(mlflow.active_run().info.run_id).data assert data.metrics['CV'] == 0.97 assert data.metrics['LB'] == 0.95 + + with open(metric_file, 'r') as f: + obj = json.load(f) + assert obj['CV'] == 0.97 + assert obj['LB'] == 0.95 diff --git a/tests/experiment/test_run.py b/tests/experiment/test_run.py index cce0782..7c561ac 100644 --- a/tests/experiment/test_run.py +++ b/tests/experiment/test_run.py @@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory -def _check_file_exists(directory, files): +def _check_file_exists(directory, submission_filename=None, with_mlflow=False): + files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 
'params.json'] + if submission_filename: + files.append(submission_filename) + if with_mlflow: + files.append('mlflow.json') + for f in files: assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f) @@ -40,7 +46,7 @@ def test_experiment_lgb_classifier(): assert roc_auc_score(y_train, result.oof_prediction) >= 0.9 assert roc_auc_score(y_test, result.test_prediction) >= 0.9 - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_lgb_regressor(): @@ -61,7 +67,7 @@ def test_experiment_lgb_regressor(): assert len(np.unique(result.test_prediction)) > 5 assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1] - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_lgb_multiclass(): @@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass(): assert result.oof_prediction.shape == (len(y_train), 5) assert result.test_prediction.shape == (len(y_test), 5) - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_cat_classifier(): @@ -107,7 +113,7 @@ def test_experiment_cat_classifier(): assert roc_auc_score(y_test, result.test_prediction) >= 0.9 assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt'] - _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_cat_regressor(): @@ -125,7 +131,7 @@ def test_experiment_cat_regressor(): result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat') assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1] - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_cat_multiclass(): @@ -148,7 +154,7 @@ def test_experiment_cat_multiclass(): assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4'] - _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path, submission_filename='submission.csv') def test_experiment_xgb_classifier(): @@ -172,7 +178,7 @@ def test_experiment_xgb_classifier(): assert roc_auc_score(y_test, result.test_prediction) >= 0.9 assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt'] - _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path, submission_filename='submission.csv') def test_experiment_xgb_regressor(): @@ -190,7 +196,7 @@ def test_experiment_xgb_regressor(): result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True) assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1] - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_xgb_multiclass(): @@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass(): assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4'] - _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + 
_check_file_exists(temp_path, submission_filename='submission.csv') def test_experiment_sklearn_classifier(): @@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier(): assert roc_auc_score(y_train, result.oof_prediction) >= 0.8 assert roc_auc_score(y_test, result.test_prediction) >= 0.8 - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_sklearn_regressor(): @@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor(): assert len(np.unique(result.test_prediction)) > 5 assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1] - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_sklearn_multiclass(): @@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass(): assert result.oof_prediction.shape == (len(y_train), 5) assert result.test_prediction.shape == (len(y_test), 5) - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_cat_custom_eval(): @@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval(): algorithm_type='cat', eval_func=mean_absolute_error) assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1] - _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_without_test_data(): @@ -317,7 +323,7 @@ def test_experiment_without_test_data(): result = run_experiment(params, X_train, y_train, None, temp_path) assert roc_auc_score(y_train, result.oof_prediction) >= 0.9 - _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt')) + _check_file_exists(temp_path) def test_experiment_fit_params(): @@ -357,7 +363,7 @@ def test_experiment_mlflow(): with get_temp_directory() as temp_path: run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True) - _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json')) + _check_file_exists(temp_path, with_mlflow=True) # test if output files are also stored in the mlflow artifact uri with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f: @@ -365,7 +371,7 @@ def test_experiment_mlflow(): p = unquote(urlparse(mlflow_meta['artifact_uri']).path) if os.name == 'nt' and p.startswith("/"): p = p[1:] - _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt')) + _check_file_exists(p, with_mlflow=False) def test_experiment_already_exists(): @@ -380,13 +386,13 @@ def test_experiment_already_exists(): } with get_temp_directory() as temp_path: - run_experiment(params, X_train, y_train, None, temp_path, overwrite=True) + run_experiment(params, X_train, y_train, None, temp_path) - # result is overwrited by default - run_experiment(params, X_train, y_train, None, temp_path, overwrite=True) + # result is not overwrited by default + run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace') with pytest.raises(Exception): - run_experiment(params, X_train, y_train, None, temp_path, overwrite=False) + run_experiment(params, X_train, y_train, None, temp_path) def test_submission_filename():
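A minimal usage sketch of the behaviour introduced by this patch: the ``if_exists`` modes, the JSON-format ``params.json``/``metrics.json`` files, and ``Experiment.continue_from`` re-opening a directory in append mode. The ``demo`` path and the logged keys and values are illustrative only.

import json
import os

from nyaggle.experiment import Experiment

# First run: creates ./demo and, on exit, dumps params/metrics as single JSON
# objects (params.json / metrics.json) instead of the old CSV-style .txt files.
# A "failed to load file" warning is expected here because the JSON files do
# not exist yet (_load_dict falls back to an empty dict).
with Experiment('demo', if_exists='error') as exp:
    exp.log_param('algorithm', 'lgbm')
    exp.log_metric('CV', 0.97)

# continue_from() re-opens the same directory with if_exists='append', so the
# previously logged values are loaded from the JSON files and merged.
with Experiment.continue_from('demo') as exp:
    exp.log_metric('LB', 0.95)

with open(os.path.join('demo', 'metrics.json')) as f:
    print(json.load(f))  # {'CV': 0.97, 'LB': 0.95}

# Re-using the directory directly: 'replace' deletes it first, 'rename' moves
# on to demo_1, and the default 'error' raises ValueError.
with Experiment('demo', if_exists='rename') as exp:
    exp.log_metric('CV', 0.98)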