change params/metrics to JSON format, replace overwrite with if_exists

pull/47/head
Taiga Noumi 2020-02-19 20:05:46 +09:00 committed by nyanp
parent 72092fb1ab
commit 174cc19b53
5 changed files with 107 additions and 50 deletions


@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo',
X_test=X_test,
eval_func=mean_squared_error,
type_of_target='continuous',
overwrite=True,
if_exists='replace',
with_auto_hpo=True,
sample_submission=pd.read_csv('sample_submission.csv'))
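
For anyone updating existing scripts, the old boolean maps onto the new option roughly as follows (a summary of the behaviours documented later in this commit, not an exhaustive specification):

# Rough migration guide from `overwrite` to `if_exists`:
#   overwrite=True   ->  if_exists='replace'   # delete the existing directory, then log
#   overwrite=False  ->  if_exists='error'     # raise ValueError if the directory exists
# Newly available policies with no old equivalent:
#   if_exists='append'   # keep logging into the existing experiment
#   if_exists='rename'   # log into '<logging_directory>_1', '_2', ...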


@ -1,4 +1,5 @@
import json
import numbers
import os
import shutil
import uuid
@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit):
return param
def _check_directory(directory: str, if_exists: str) -> str:
if os.path.exists(directory):
if if_exists == 'error':
raise ValueError('directory {} already exists.'.format(directory))
elif if_exists == 'replace':
warnings.warn(
'directory {} already exists. It will be replaced by the new result'.format(directory))
shutil.rmtree(directory, ignore_errors=True)
elif if_exists == 'rename':
postfix_index = 1
while os.path.exists(directory + '_' + str(postfix_index)):
postfix_index += 1
directory += '_' + str(postfix_index)
warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
return directory
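
A small, self-contained sketch of what each policy does when the directory already exists. It imports the private helper directly, so the module path (nyaggle.experiment.experiment) and the temporary layout are assumptions made for illustration only:

import os
import tempfile

from nyaggle.experiment.experiment import _check_directory  # private helper; import path assumed

with tempfile.TemporaryDirectory() as root:
    existing = os.path.join(root, 'exp')
    os.makedirs(existing)

    print(_check_directory(existing, 'append'))   # same path is returned, contents are kept
    print(_check_directory(existing, 'rename'))   # returns '.../exp_1' and emits a warning
    print(_check_directory(existing, 'replace'))  # deletes '.../exp', then returns the same path
    # _check_directory(existing, 'error')         # would raise ValueError while the directory exists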
class Experiment(object):
"""Minimal experiment logger for Kaggle
@ -42,8 +62,6 @@ class Experiment(object):
Args:
logging_directory:
Path to directory where output is stored.
overwrite:
If True, contents in ``logging_directory`` will be overwritten.
custom_logger:
A custom logger to be used instead of default logger.
with_mlflow:
@ -51,17 +69,24 @@ class Experiment(object):
One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
Note that all output files are located both ``logging_directory`` and
mlflow's directory (``mlruns`` by default).
if_exists:
How to behave if the logging directory already exists.
- error: Raise a ValueError.
- replace: Delete logging directory before logging.
- append: Append to existing experiment.
- rename: Rename the current directory by adding a "_1", "_2", ... suffix.
"""
def __init__(self,
logging_directory: str,
overwrite: bool = False,
custom_logger: Optional[Logger] = None,
with_mlflow: bool = False,
mlflow_run_id: Optional[str] = None,
logging_mode: str = 'w'
if_exists: str = 'error'
):
os.makedirs(logging_directory, exist_ok=overwrite)
logging_directory = _check_directory(logging_directory, if_exists)
os.makedirs(logging_directory, exist_ok=True)
self.logging_directory = logging_directory
self.with_mlflow = with_mlflow
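
In practice the new argument is passed straight to the constructor. A minimal sketch, where the directory name is a placeholder (on a fresh directory the initial 'failed to load file' warnings from _load_dict are expected and harmless):

from nyaggle.experiment import Experiment

# Re-running this block keeps writing into the same directory because of 'append';
# with the default if_exists='error' a second run would raise ValueError instead.
with Experiment('output/demo', if_exists='append') as exp:
    exp.log_param('model', 'lgbm')
    exp.log_metric('cv_auc', 0.912)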
@ -75,8 +100,8 @@ class Experiment(object):
self.logger.setLevel(DEBUG)
self.is_custom = False
self.metrics_path = os.path.join(logging_directory, 'metrics.txt')
self.metrics = open(self.metrics_path, mode=logging_mode)
self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode)
self.metrics = self._load_dict('metrics.json')
self.params = self._load_dict('params.json')
self.inherit_existing_run = False
if self.with_mlflow:
@ -98,15 +123,13 @@ class Experiment(object):
def continue_from(cls, logging_directory: str):
params = {
'logging_directory': logging_directory,
'overwrite': True,
'logging_mode': 'a'
'if_exists': 'append'
}
mlflow_path = os.path.join(logging_directory, 'mlflow.json')
if os.path.exists(mlflow_path):
with open(mlflow_path, 'r') as f:
mlflow_metadata = json.load(f)
params['with_mlflow'] = True
params['mlflow_run_id'] = mlflow_metadata['run_id']
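
Resuming is now expressed internally as if_exists='append'. A sketch of the public entry point, mirroring the updated test further below (the directory name is a placeholder):

from nyaggle.experiment import Experiment

# first run
with Experiment('output/exp001') as exp:
    exp.log_metric('CV', 0.97)

# later: reopen the same experiment; the stored metrics are loaded and extended
with Experiment.continue_from('output/exp001') as exp:
    exp.log_metric('LB', 0.95)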
@ -132,12 +155,29 @@ class Experiment(object):
with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
json.dump(mlflow_metadata, f, indent=4)
def _load_dict(self, filename: str) -> Dict:
try:
path = os.path.join(self.logging_directory, filename)
with open(path, 'r') as f:
return json.load(f)
except IOError:
self.logger.warning('failed to load file: {}'.format(filename))
return {}
def _save_dict(self, obj: Dict, filename: str):
try:
path = os.path.join(self.logging_directory, filename)
with open(path, 'w') as f:
json.dump(obj, f)
except IOError:
self.logger.warning('failed to save file: {}'.format(filename))
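
These helpers give stop() (below) a plain-JSON on-disk format in place of the old comma-separated text files. A sketch of reading an experiment's output back, continuing the directory name used in the sketch above:

import json
import os

logging_directory = 'output/exp001'  # placeholder
with open(os.path.join(logging_directory, 'metrics.json')) as f:
    metrics = json.load(f)   # e.g. {"CV": 0.97, "LB": 0.95}
with open(os.path.join(logging_directory, 'params.json')) as f:
    params = json.load(f)    # {} here, since the runs above logged no parameters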
def stop(self):
"""
Stop current experiment.
"""
self.metrics.close()
self.params.close()
self._save_dict(self.metrics, 'metrics.json')
self._save_dict(self.params, 'params.json')
if not self.is_custom:
for h in self.logger.handlers:
@ -146,7 +186,8 @@ class Experiment(object):
if self.with_mlflow:
import mlflow
mlflow.log_artifact(self.log_path)
mlflow.log_artifact(self.metrics_path)
mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))
if not self.inherit_existing_run:
mlflow.end_run()
@ -182,6 +223,9 @@ class Experiment(object):
"""
self.logger.info(text)
def _sanitize(self, v):
return v if isinstance(v, numbers.Number) else str(v)
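
The sanitizer keeps numbers as-is and stringifies everything else, which keeps both dictionaries JSON-serializable. A small sketch of its effect (values chosen only for illustration):

import numbers

def _sanitize(v):
    # mirrors Experiment._sanitize above
    return v if isinstance(v, numbers.Number) else str(v)

_sanitize(0.5)        # -> 0.5          (numbers pass through unchanged)
_sanitize('lgbm')     # -> 'lgbm'       (already a string)
_sanitize([64, 64])   # -> '[64, 64]'   (other values become their str())
_sanitize(None)       # -> 'None'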
def log_param(self, key, value):
"""
Logs a key-value pair for the experiment.
@ -190,8 +234,9 @@ class Experiment(object):
key: parameter name
value: parameter value
"""
self.params.write('{},{}\n'.format(key, value))
self.params.flush()
key = self._sanitize(key)
value = self._sanitize(value)
self.params[key] = value
if self.with_mlflow:
import mlflow
@ -219,8 +264,9 @@ class Experiment(object):
score:
Metric value.
"""
self.metrics.write('{},{}\n'.format(name, score))
self.metrics.flush()
name = self._sanitize(name)
score = self._sanitize(score)
self.metrics[name] = score
if self.with_mlflow:
import mlflow
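
One behavioural consequence of storing metrics and params in dictionaries: repeated calls with the same key now overwrite the earlier value instead of appending another line, as the old text files did. A sketch (the directory name is a placeholder; mlflow disabled):

from nyaggle.experiment import Experiment

with Experiment('output/demo2', if_exists='replace') as exp:
    exp.log_metric('CV', 0.95)
    exp.log_metric('CV', 0.97)   # same key: the dictionary entry is replaced

# metrics.json now contains {"CV": 0.97}; the old metrics.txt would have kept both lines.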


@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any],
X_train: pd.DataFrame, y: pd.Series,
X_test: Optional[pd.DataFrame] = None,
logging_directory: str = 'output/{time}',
overwrite: bool = False,
if_exists: str = 'error',
eval_func: Optional[Callable] = None,
algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any],
Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
logging_directory:
Path to directory where output of experiment is stored.
overwrite:
If True, contents in ``logging_directory`` will be overwritten.
if_exists:
How to behave if the logging directory already exists.
- error: Raise a ValueError.
- replace: Delete logging directory before logging.
- append: Append to existing experiment.
- rename: Rename the current directory by adding a "_1", "_2", ... suffix.
fit_params:
Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except
eval_set passed for each fold. If callable is passed,
@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any],
logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))
with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp:
with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
exp.log('Algorithm: {}'.format(algorithm_type))
exp.log('Experiment: {}'.format(logging_directory))
exp.log('Params: {}'.format(model_params))
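
A runnable sketch of a call against the updated signature, using synthetic data. The parameter values, column names, and output directory are illustrative, and LightGBM is assumed to be installed since algorithm_type defaults to 'lgbm':

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from nyaggle.experiment import run_experiment

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=['f{}'.format(i) for i in range(10)])
y = pd.Series(y)

result = run_experiment({'objective': 'regression', 'max_depth': 3},
                        X, y,
                        logging_directory='output/demo_run',
                        if_exists='replace',           # was overwrite=True before this commit
                        eval_func=mean_squared_error,
                        type_of_target='continuous')
print(result.metrics[-1])  # overall out-of-fold score (cf. the assertions in the tests below)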


@ -1,4 +1,6 @@
import json
import os
from nyaggle.experiment import Experiment
from nyaggle.testing import get_temp_directory
@ -12,13 +14,7 @@ def test_experiment_continue():
with Experiment.continue_from(logging_dir) as e:
e.log_metric('LB', 0.95)
metric_file = os.path.join(logging_dir, 'metrics.txt')
with open(metric_file, 'r') as f:
lines = [line.split(',') for line in f.readlines()]
assert lines[0][0] == 'CV'
assert lines[1][0] == 'LB'
metric_file = os.path.join(logging_dir, 'metrics.json')
import mlflow
@ -26,3 +22,8 @@ def test_experiment_continue():
data = client.get_run(mlflow.active_run().info.run_id).data
assert data.metrics['CV'] == 0.97
assert data.metrics['LB'] == 0.95
with open(metric_file, 'r') as f:
obj = json.load(f)
assert obj['CV'] == 0.97
assert obj['LB'] == 0.95


@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature
from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory
def _check_file_exists(directory, files):
def _check_file_exists(directory, submission_filename=None, with_mlflow=False):
files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 'params.json']
if submission_filename:
files.append(submission_filename)
if with_mlflow:
files.append('mlflow.json')
for f in files:
assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f)
@ -40,7 +46,7 @@ def test_experiment_lgb_classifier():
assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_lgb_regressor():
@ -61,7 +67,7 @@ def test_experiment_lgb_regressor():
assert len(np.unique(result.test_prediction)) > 5
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_lgb_multiclass():
@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass():
assert result.oof_prediction.shape == (len(y_train), 5)
assert result.test_prediction.shape == (len(y_test), 5)
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_classifier():
@ -107,7 +113,7 @@ def test_experiment_cat_classifier():
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_regressor():
@ -125,7 +131,7 @@ def test_experiment_cat_regressor():
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_multiclass():
@ -148,7 +154,7 @@ def test_experiment_cat_multiclass():
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_xgb_classifier():
@ -172,7 +178,7 @@ def test_experiment_xgb_classifier():
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_xgb_regressor():
@ -190,7 +196,7 @@ def test_experiment_xgb_regressor():
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_xgb_multiclass():
@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass():
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_sklearn_classifier():
@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier():
assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
assert roc_auc_score(y_test, result.test_prediction) >= 0.8
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_sklearn_regressor():
@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor():
assert len(np.unique(result.test_prediction)) > 5
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_sklearn_multiclass():
@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass():
assert result.oof_prediction.shape == (len(y_train), 5)
assert result.test_prediction.shape == (len(y_test), 5)
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_custom_eval():
@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval():
algorithm_type='cat', eval_func=mean_absolute_error)
assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_without_test_data():
@ -317,7 +323,7 @@ def test_experiment_without_test_data():
result = run_experiment(params, X_train, y_train, None, temp_path)
assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_fit_params():
@ -357,7 +363,7 @@ def test_experiment_mlflow():
with get_temp_directory() as temp_path:
run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
_check_file_exists(temp_path, with_mlflow=True)
# test if output files are also stored in the mlflow artifact uri
with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:
@ -365,7 +371,7 @@ def test_experiment_mlflow():
p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
if os.name == 'nt' and p.startswith("/"):
p = p[1:]
_check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
_check_file_exists(p, with_mlflow=False)
def test_experiment_already_exists():
@ -380,13 +386,13 @@ def test_experiment_already_exists():
}
with get_temp_directory() as temp_path:
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
run_experiment(params, X_train, y_train, None, temp_path)
# result is overwritten by default
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
# result is not overwritten by default
run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace')
with pytest.raises(Exception):
run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
run_experiment(params, X_train, y_train, None, temp_path)
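
The test covers 'error' (the default) and 'replace'; a sketch of how 'rename' would behave in the same block, reusing the names and imports already in scope in this test (not part of the diff):

# output is written into a '_1' sibling directory instead of raising, with a warning
run_experiment(params, X_train, y_train, None, temp_path, if_exists='rename')
assert os.path.exists(temp_path + '_1')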
def test_submission_filename():