change params/metrics to json format, replace overwrite to if_exists

pull/47/head
Taiga Noumi 2020-02-19 20:05:46 +09:00 committed by nyanp
parent 72092fb1ab
commit 174cc19b53
5 changed files with 107 additions and 50 deletions

View File

@@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo',
                X_test=X_test,
                eval_func=mean_squared_error,
                type_of_target='continuous',
-               overwrite=True,
+               if_exists='replace',
                with_auto_hpo=True,
                sample_submission=pd.read_csv('sample_submission.csv'))

View File

@@ -1,4 +1,5 @@
 import json
+import numbers
 import os
 import shutil
 import uuid
@@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit):
     return param


+def _check_directory(directory: str, if_exists: str) -> str:
+    if os.path.exists(directory):
+        if if_exists == 'error':
+            raise ValueError('directory {} already exists.'.format(directory))
+        elif if_exists == 'replace':
+            warnings.warn(
+                'directory {} already exists. It will be replaced by the new result'.format(directory))
+            shutil.rmtree(directory, ignore_errors=True)
+        elif if_exists == 'rename':
+            postfix_index = 1
+            while os.path.exists(directory + '_' + str(postfix_index)):
+                postfix_index += 1
+            directory += '_' + str(postfix_index)
+            warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
+    return directory
+
+
 class Experiment(object):
     """Minimal experiment logger for Kaggle
@@ -42,8 +62,6 @@ class Experiment(object):
     Args:
         logging_directory:
             Path to directory where output is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
         custom_logger:
             A custom logger to be used instead of default logger.
         with_mlflow:
@@ -51,17 +69,24 @@ class Experiment(object):
             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
             Note that all output files are located both ``logging_directory`` and
             mlflow's directory (``mlruns`` by default).
+        if_exists:
+            How to behave if the logging directory already exists.
+
+            - error: Raise a ValueError.
+            - replace: Delete the logging directory before logging.
+            - append: Append to the existing experiment.
+            - rename: Rename the current directory by adding a "_1", "_2"... suffix.
     """

     def __init__(self,
                  logging_directory: str,
-                 overwrite: bool = False,
                  custom_logger: Optional[Logger] = None,
                  with_mlflow: bool = False,
                  mlflow_run_id: Optional[str] = None,
-                 logging_mode: str = 'w'
+                 if_exists: str = 'error'
                  ):
-        os.makedirs(logging_directory, exist_ok=overwrite)
+        logging_directory = _check_directory(logging_directory, if_exists)
+        os.makedirs(logging_directory, exist_ok=True)
         self.logging_directory = logging_directory
         self.with_mlflow = with_mlflow
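
With this constructor change, the collision policy is chosen per run instead of through the old overwrite/logging_mode pair. A minimal sketch using only the public API shown in this diff (paths and metric names are placeholders):

import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'my_experiment')

with Experiment(logdir) as exp:                      # first run creates the directory
    exp.log_metric('CV', 0.97)

try:
    Experiment(logdir)                               # default if_exists='error' refuses to reuse it
except ValueError as e:
    print(e)

renamed = Experiment(logdir, if_exists='rename')     # logs to my_experiment_1 instead
print(renamed.logging_directory)
renamed.stop()

replaced = Experiment(logdir, if_exists='replace')   # deletes my_experiment and starts over
replaced.stop()
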
@@ -75,8 +100,8 @@ class Experiment(object):
             self.logger.setLevel(DEBUG)
             self.is_custom = False
         self.metrics_path = os.path.join(logging_directory, 'metrics.txt')
-        self.metrics = open(self.metrics_path, mode=logging_mode)
-        self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode)
+        self.metrics = self._load_dict('metrics.json')
+        self.params = self._load_dict('params.json')
         self.inherit_existing_run = False

         if self.with_mlflow:
@@ -98,15 +123,13 @@ class Experiment(object):
     def continue_from(cls, logging_directory: str):
         params = {
             'logging_directory': logging_directory,
-            'overwrite': True,
-            'logging_mode': 'a'
+            'if_exists': 'append'
         }

         mlflow_path = os.path.join(logging_directory, 'mlflow.json')
         if os.path.exists(mlflow_path):
             with open(mlflow_path, 'r') as f:
                 mlflow_metadata = json.load(f)
             params['with_mlflow'] = True
             params['mlflow_run_id'] = mlflow_metadata['run_id']
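
continue_from now simply forwards if_exists='append', so a resumed run reuses the existing directory and keeps adding to the same params/metrics dictionaries; this mirrors the updated test_experiment_continue later in this commit. A minimal sketch:

import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'exp')

with Experiment(logdir) as exp:
    exp.log_metric('CV', 0.97)

# Resume the finished experiment; metrics.json ends up with both entries.
with Experiment.continue_from(logdir) as exp:
    exp.log_metric('LB', 0.95)
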
@@ -132,12 +155,29 @@ class Experiment(object):
             with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
                 json.dump(mlflow_metadata, f, indent=4)

+    def _load_dict(self, filename: str) -> Dict:
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'r') as f:
+                return json.load(f)
+        except IOError:
+            self.logger.warning('failed to load file: {}'.format(filename))
+            return {}
+
+    def _save_dict(self, obj: Dict, filename: str):
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'w') as f:
+                json.dump(obj, f)
+        except IOError:
+            self.logger.warning('failed to save file: {}'.format(filename))
+
     def stop(self):
         """
         Stop current experiment.
         """
-        self.metrics.close()
-        self.params.close()
+        self._save_dict(self.metrics, 'metrics.json')
+        self._save_dict(self.params, 'params.json')

         if not self.is_custom:
             for h in self.logger.handlers:
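
Because stop() now serializes the params and metrics dictionaries with _save_dict, the results are plain JSON files that can be read back with json.load, which is what the updated test_experiment_continue asserts. A minimal sketch (names and values are placeholders):

import json
import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'exp')

with Experiment(logdir) as exp:
    exp.log_param('algorithm_type', 'lgbm')
    exp.log_metric('CV', 0.97)

with open(os.path.join(logdir, 'params.json')) as f:
    print(json.load(f))    # {'algorithm_type': 'lgbm'}
with open(os.path.join(logdir, 'metrics.json')) as f:
    print(json.load(f))    # {'CV': 0.97}
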
@@ -146,7 +186,8 @@ class Experiment(object):
         if self.with_mlflow:
             import mlflow

             mlflow.log_artifact(self.log_path)
-            mlflow.log_artifact(self.metrics_path)
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))

             if not self.inherit_existing_run:
                 mlflow.end_run()
@@ -182,6 +223,9 @@ class Experiment(object):
         """
         self.logger.info(text)

+    def _sanitize(self, v):
+        return v if isinstance(v, numbers.Number) else str(v)
+
     def log_param(self, key, value):
         """
         Logs a key-value pair for the experiment.
@@ -190,8 +234,9 @@ class Experiment(object):
             key: parameter name
             value: parameter value
         """
-        self.params.write('{},{}\n'.format(key, value))
-        self.params.flush()
+        key = self._sanitize(key)
+        value = self._sanitize(value)
+        self.params[key] = value

         if self.with_mlflow:
             import mlflow
@@ -219,8 +264,9 @@ class Experiment(object):
             score:
                 Metric value.
         """
-        self.metrics.write('{},{}\n'.format(name, score))
-        self.metrics.flush()
+        name = self._sanitize(name)
+        score = self._sanitize(score)
+        self.metrics[name] = score

         if self.with_mlflow:
             import mlflow
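
_sanitize keeps numbers as-is and coerces everything else to str, so arbitrary keys and values (lists, estimators, None) go into params.json and metrics.json without breaking json.dump. A sketch of the helper's behavior, written as a free function for illustration:

import numbers

def _sanitize(v):
    # same one-liner the commit adds as Experiment._sanitize
    return v if isinstance(v, numbers.Number) else str(v)

print(_sanitize(0.97))          # 0.97         (numbers pass through untouched)
print(_sanitize([256, 512]))    # '[256, 512]' (everything else becomes a string)
print(_sanitize(None))          # 'None'
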

View File

@@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any],
                    X_train: pd.DataFrame, y: pd.Series,
                    X_test: Optional[pd.DataFrame] = None,
                    logging_directory: str = 'output/{time}',
-                   overwrite: bool = False,
+                   if_exists: str = 'error',
                    eval_func: Optional[Callable] = None,
                    algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                    fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
@@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any],
             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
         logging_directory:
             Path to directory where output of experiment is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
+        if_exists:
+            How to behave if the logging directory already exists.
+
+            - error: Raise a ValueError.
+            - replace: Delete the logging directory before logging.
+            - append: Append to the existing experiment.
+            - rename: Rename the current directory by adding a "_1", "_2"... suffix.
         fit_params:
             Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except
             eval_set passed for each fold. If callable is passed,
@@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any],
     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))

-    with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp:
+    with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
         exp.log('Algorithm: {}'.format(algorithm_type))
         exp.log('Experiment: {}'.format(logging_directory))
         exp.log('Params: {}'.format(model_params))
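
At the run_experiment level the new argument is simply forwarded to Experiment, so callers pick the policy per call. A minimal end-to-end sketch on synthetic data, assuming lightgbm is installed for the default algorithm_type; the directory, column names, and parameter values are placeholders:

import pandas as pd
from sklearn.datasets import make_regression

from nyaggle.experiment import run_experiment

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=['col{}'.format(i) for i in range(10)])
y = pd.Series(y)

params = {'objective': 'regression', 'max_depth': 4}

# The first call creates output/demo; rerunning with the default if_exists='error' would raise,
# while 'rename' sends a second run to output/demo_1 and 'replace' wipes the old result.
result = run_experiment(params, X, y, logging_directory='output/demo',
                        type_of_target='continuous', if_exists='rename')
print(result.metrics[-1])   # overall CV score, as asserted in the tests later in this commit
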

View File

@@ -1,4 +1,6 @@
+import json
 import os

 from nyaggle.experiment import Experiment
 from nyaggle.testing import get_temp_directory
@@ -12,13 +14,7 @@ def test_experiment_continue():
         with Experiment.continue_from(logging_dir) as e:
             e.log_metric('LB', 0.95)

-        metric_file = os.path.join(logging_dir, 'metrics.txt')
-
-        with open(metric_file, 'r') as f:
-            lines = [line.split(',') for line in f.readlines()]
-            assert lines[0][0] == 'CV'
-            assert lines[1][0] == 'LB'
+        metric_file = os.path.join(logging_dir, 'metrics.json')

         import mlflow
@@ -26,3 +22,8 @@ def test_experiment_continue():
         data = client.get_run(mlflow.active_run().info.run_id).data
         assert data.metrics['CV'] == 0.97
         assert data.metrics['LB'] == 0.95
+
+        with open(metric_file, 'r') as f:
+            obj = json.load(f)
+            assert obj['CV'] == 0.97
+            assert obj['LB'] == 0.95

View File

@@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature
 from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory


-def _check_file_exists(directory, files):
+def _check_file_exists(directory, submission_filename=None, with_mlflow=False):
+    files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 'params.json']
+    if submission_filename:
+        files.append(submission_filename)
+    if with_mlflow:
+        files.append('mlflow.json')
     for f in files:
         assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f)
@@ -40,7 +46,7 @@ def test_experiment_lgb_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_regressor():

@@ -61,7 +67,7 @@ def test_experiment_lgb_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_multiclass():

@@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_classifier():

@@ -107,7 +113,7 @@ def test_experiment_cat_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_regressor():

@@ -125,7 +131,7 @@ def test_experiment_cat_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_multiclass():

@@ -148,7 +154,7 @@ def test_experiment_cat_multiclass():
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_classifier():

@@ -172,7 +178,7 @@ def test_experiment_xgb_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_regressor():

@@ -190,7 +196,7 @@ def test_experiment_xgb_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_xgb_multiclass():

@@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass():
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_sklearn_classifier():

@@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
         assert roc_auc_score(y_test, result.test_prediction) >= 0.8

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_regressor():

@@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_multiclass():

@@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_custom_eval():

@@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval():
                                 algorithm_type='cat', eval_func=mean_absolute_error)

         assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_without_test_data():

@@ -317,7 +323,7 @@ def test_experiment_without_test_data():
         result = run_experiment(params, X_train, y_train, None, temp_path)

         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_fit_params():

@@ -357,7 +363,7 @@ def test_experiment_mlflow():
     with get_temp_directory() as temp_path:
         run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
+        _check_file_exists(temp_path, with_mlflow=True)

         # test if output files are also stored in the mlflow artifact uri
         with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:

@@ -365,7 +371,7 @@ def test_experiment_mlflow():
             p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
             if os.name == 'nt' and p.startswith("/"):
                 p = p[1:]
-            _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
+            _check_file_exists(p, with_mlflow=False)


 def test_experiment_already_exists():
@@ -380,13 +386,13 @@ def test_experiment_already_exists():
     }

     with get_temp_directory() as temp_path:
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        run_experiment(params, X_train, y_train, None, temp_path)

-        # result is overwrited by default
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        # result is not overwritten by default
+        run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace')

         with pytest.raises(Exception):
-            run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
+            run_experiment(params, X_train, y_train, None, temp_path)


 def test_submission_filename():