change params/metrics to json format, replace overwrite to if_exists

pull/47/head
Taiga Noumi 2020-02-19 20:05:46 +09:00 committed by nyanp
parent 72092fb1ab
commit 174cc19b53
5 changed files with 107 additions and 50 deletions

View File

@@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo',
                X_test=X_test,
                eval_func=mean_squared_error,
                type_of_target='continuous',
-               overwrite=True,
+               if_exists='replace',
                with_auto_hpo=True,
                sample_submission=pd.read_csv('sample_submission.csv'))

View File

@@ -1,4 +1,5 @@
 import json
+import numbers
 import os
 import shutil
 import uuid
@@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit):
     return param


+def _check_directory(directory: str, if_exists: str) -> str:
+    if os.path.exists(directory):
+        if if_exists == 'error':
+            raise ValueError('directory {} already exists.'.format(directory))
+        elif if_exists == 'replace':
+            warnings.warn(
+                'directory {} already exists. It will be replaced by the new result'.format(directory))
+            shutil.rmtree(directory, ignore_errors=True)
+        elif if_exists == 'rename':
+            postfix_index = 1
+            while os.path.exists(directory + '_' + str(postfix_index)):
+                postfix_index += 1
+            directory += '_' + str(postfix_index)
+            warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
+    return directory
+
+
 class Experiment(object):
     """Minimal experiment logger for Kaggle
@@ -42,8 +62,6 @@ class Experiment(object):
     Args:
         logging_directory:
             Path to directory where output is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
         custom_logger:
             A custom logger to be used instead of default logger.
         with_mlflow:
@@ -51,17 +69,24 @@ class Experiment(object):
             One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
             Note that all output files are located both ``logging_directory`` and
             mlflow's directory (``mlruns`` by default).
+        if_exists:
+            How to behave if the logging directory already exists.
+
+            - error: Raise a ValueError.
+            - replace: Delete the logging directory before logging.
+            - append: Append to the existing experiment.
+            - rename: Rename the current directory by adding a "_1", "_2"... suffix.
     """

     def __init__(self,
                  logging_directory: str,
-                 overwrite: bool = False,
                  custom_logger: Optional[Logger] = None,
                  with_mlflow: bool = False,
                  mlflow_run_id: Optional[str] = None,
-                 logging_mode: str = 'w'
+                 if_exists: str = 'error'
                  ):
-        os.makedirs(logging_directory, exist_ok=overwrite)
+        logging_directory = _check_directory(logging_directory, if_exists)
+        os.makedirs(logging_directory, exist_ok=True)
         self.logging_directory = logging_directory
         self.with_mlflow = with_mlflow
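
With this constructor change, the collision policy is chosen per run instead of through the old overwrite/logging_mode pair. A minimal sketch using only the public API shown in this diff (paths and metric names are placeholders):

import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'my_experiment')

with Experiment(logdir) as exp:                      # first run creates the directory
    exp.log_metric('CV', 0.97)

try:
    Experiment(logdir)                               # default if_exists='error' refuses to reuse it
except ValueError as e:
    print(e)

renamed = Experiment(logdir, if_exists='rename')     # logs to my_experiment_1 instead
print(renamed.logging_directory)
renamed.stop()

replaced = Experiment(logdir, if_exists='replace')   # deletes my_experiment and starts over
replaced.stop()
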
@@ -75,8 +100,8 @@ class Experiment(object):
             self.logger.setLevel(DEBUG)
             self.is_custom = False
         self.metrics_path = os.path.join(logging_directory, 'metrics.txt')
-        self.metrics = open(self.metrics_path, mode=logging_mode)
-        self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode)
+        self.metrics = self._load_dict('metrics.json')
+        self.params = self._load_dict('params.json')
         self.inherit_existing_run = False

         if self.with_mlflow:
@@ -98,15 +123,13 @@ class Experiment(object):
     def continue_from(cls, logging_directory: str):
         params = {
             'logging_directory': logging_directory,
-            'overwrite': True,
-            'logging_mode': 'a'
+            'if_exists': 'append'
         }

         mlflow_path = os.path.join(logging_directory, 'mlflow.json')
         if os.path.exists(mlflow_path):
             with open(mlflow_path, 'r') as f:
                 mlflow_metadata = json.load(f)
             params['with_mlflow'] = True
             params['mlflow_run_id'] = mlflow_metadata['run_id']
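
continue_from now simply forwards if_exists='append', so a resumed run reuses the existing directory and keeps adding to the same params/metrics dictionaries; this mirrors the updated test_experiment_continue later in this commit. A minimal sketch:

import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'exp')

with Experiment(logdir) as exp:
    exp.log_metric('CV', 0.97)

# Resume the finished experiment; metrics.json ends up with both entries.
with Experiment.continue_from(logdir) as exp:
    exp.log_metric('LB', 0.95)
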
@@ -132,12 +155,29 @@ class Experiment(object):
             with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
                 json.dump(mlflow_metadata, f, indent=4)

+    def _load_dict(self, filename: str) -> Dict:
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'r') as f:
+                return json.load(f)
+        except IOError:
+            self.logger.warning('failed to load file: {}'.format(filename))
+            return {}
+
+    def _save_dict(self, obj: Dict, filename: str):
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'w') as f:
+                json.dump(obj, f)
+        except IOError:
+            self.logger.warning('failed to save file: {}'.format(filename))
+
     def stop(self):
         """
         Stop current experiment.
         """
-        self.metrics.close()
-        self.params.close()
+        self._save_dict(self.metrics, 'metrics.json')
+        self._save_dict(self.params, 'params.json')

         if not self.is_custom:
             for h in self.logger.handlers:
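
Because stop() now serializes the params and metrics dictionaries with _save_dict, the results are plain JSON files that can be read back with json.load, which is what the updated test_experiment_continue asserts. A minimal sketch (names and values are placeholders):

import json
import os
import tempfile

from nyaggle.experiment import Experiment

logdir = os.path.join(tempfile.mkdtemp(), 'exp')

with Experiment(logdir) as exp:
    exp.log_param('algorithm_type', 'lgbm')
    exp.log_metric('CV', 0.97)

with open(os.path.join(logdir, 'params.json')) as f:
    print(json.load(f))    # {'algorithm_type': 'lgbm'}
with open(os.path.join(logdir, 'metrics.json')) as f:
    print(json.load(f))    # {'CV': 0.97}
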
@@ -146,7 +186,8 @@ class Experiment(object):
         if self.with_mlflow:
             import mlflow

             mlflow.log_artifact(self.log_path)
-            mlflow.log_artifact(self.metrics_path)
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))

             if not self.inherit_existing_run:
                 mlflow.end_run()
@@ -182,6 +223,9 @@ class Experiment(object):
         """
         self.logger.info(text)

+    def _sanitize(self, v):
+        return v if isinstance(v, numbers.Number) else str(v)
+
     def log_param(self, key, value):
         """
         Logs a key-value pair for the experiment.
@@ -190,8 +234,9 @@ class Experiment(object):
             key: parameter name
             value: parameter value
         """
-        self.params.write('{},{}\n'.format(key, value))
-        self.params.flush()
+        key = self._sanitize(key)
+        value = self._sanitize(value)
+        self.params[key] = value

         if self.with_mlflow:
             import mlflow
@@ -219,8 +264,9 @@ class Experiment(object):
             score:
                 Metric value.
         """
-        self.metrics.write('{},{}\n'.format(name, score))
-        self.metrics.flush()
+        name = self._sanitize(name)
+        score = self._sanitize(score)
+        self.metrics[name] = score

         if self.with_mlflow:
             import mlflow
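
_sanitize keeps numbers as-is and coerces everything else to str, so arbitrary keys and values (lists, estimators, None) go into params.json and metrics.json without breaking json.dump. A sketch of the helper's behavior, written as a free function for illustration:

import numbers

def _sanitize(v):
    # same one-liner the commit adds as Experiment._sanitize
    return v if isinstance(v, numbers.Number) else str(v)

print(_sanitize(0.97))          # 0.97         (numbers pass through untouched)
print(_sanitize([256, 512]))    # '[256, 512]' (everything else becomes a string)
print(_sanitize(None))          # 'None'
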

View File

@@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any],
                    X_train: pd.DataFrame, y: pd.Series,
                    X_test: Optional[pd.DataFrame] = None,
                    logging_directory: str = 'output/{time}',
-                   overwrite: bool = False,
+                   if_exists: str = 'error',
                    eval_func: Optional[Callable] = None,
                    algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                    fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
@@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any],
             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
         logging_directory:
             Path to directory where output of experiment is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
+        if_exists:
+            How to behave if the logging directory already exists.
+
+            - error: Raise a ValueError.
+            - replace: Delete the logging directory before logging.
+            - append: Append to the existing experiment.
+            - rename: Rename the current directory by adding a "_1", "_2"... suffix.
         fit_params:
             Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except
             eval_set passed for each fold. If callable is passed,
@@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any],
     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))

-    with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp:
+    with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
         exp.log('Algorithm: {}'.format(algorithm_type))
         exp.log('Experiment: {}'.format(logging_directory))
         exp.log('Params: {}'.format(model_params))
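
At the run_experiment level the new argument is simply forwarded to Experiment, so callers pick the policy per call. A minimal end-to-end sketch on synthetic data, assuming lightgbm is installed for the default algorithm_type; the directory, column names, and parameter values are placeholders:

import pandas as pd
from sklearn.datasets import make_regression

from nyaggle.experiment import run_experiment

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=['col{}'.format(i) for i in range(10)])
y = pd.Series(y)

params = {'objective': 'regression', 'max_depth': 4}

# The first call creates output/demo; rerunning with the default if_exists='error' would raise,
# while 'rename' sends a second run to output/demo_1 and 'replace' wipes the old result.
result = run_experiment(params, X, y, logging_directory='output/demo',
                        type_of_target='continuous', if_exists='rename')
print(result.metrics[-1])   # overall CV score, as asserted in the tests later in this commit
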

View File

@@ -1,4 +1,6 @@
+import json
 import os

 from nyaggle.experiment import Experiment
 from nyaggle.testing import get_temp_directory
@@ -12,13 +14,7 @@ def test_experiment_continue():
         with Experiment.continue_from(logging_dir) as e:
             e.log_metric('LB', 0.95)

-        metric_file = os.path.join(logging_dir, 'metrics.txt')
-
-        with open(metric_file, 'r') as f:
-            lines = [line.split(',') for line in f.readlines()]
-            assert lines[0][0] == 'CV'
-            assert lines[1][0] == 'LB'
+        metric_file = os.path.join(logging_dir, 'metrics.json')

         import mlflow
@@ -26,3 +22,8 @@ def test_experiment_continue():
         data = client.get_run(mlflow.active_run().info.run_id).data
         assert data.metrics['CV'] == 0.97
         assert data.metrics['LB'] == 0.95
+
+        with open(metric_file, 'r') as f:
+            obj = json.load(f)
+            assert obj['CV'] == 0.97
+            assert obj['LB'] == 0.95

View File

@@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature
 from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory


-def _check_file_exists(directory, files):
+def _check_file_exists(directory, submission_filename=None, with_mlflow=False):
+    files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 'params.json']
+    if submission_filename:
+        files.append(submission_filename)
+    if with_mlflow:
+        files.append('mlflow.json')
     for f in files:
         assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f)
@@ -40,7 +46,7 @@ def test_experiment_lgb_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_regressor():

@@ -61,7 +67,7 @@ def test_experiment_lgb_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_multiclass():

@@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_classifier():

@@ -107,7 +113,7 @@ def test_experiment_cat_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_regressor():

@@ -125,7 +131,7 @@ def test_experiment_cat_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_multiclass():

@@ -148,7 +154,7 @@ def test_experiment_cat_multiclass():
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_classifier():

@@ -172,7 +178,7 @@ def test_experiment_xgb_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_regressor():

@@ -190,7 +196,7 @@ def test_experiment_xgb_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_xgb_multiclass():

@@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass():
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_sklearn_classifier():

@@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
         assert roc_auc_score(y_test, result.test_prediction) >= 0.8

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_regressor():

@@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_multiclass():

@@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_custom_eval():

@@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval():
                                 algorithm_type='cat', eval_func=mean_absolute_error)

         assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_without_test_data():

@@ -317,7 +323,7 @@ def test_experiment_without_test_data():
         result = run_experiment(params, X_train, y_train, None, temp_path)

         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_fit_params():

@@ -357,7 +363,7 @@ def test_experiment_mlflow():
     with get_temp_directory() as temp_path:
         run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
+        _check_file_exists(temp_path, with_mlflow=True)

         # test if output files are also stored in the mlflow artifact uri
         with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:

@@ -365,7 +371,7 @@ def test_experiment_mlflow():
             p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
             if os.name == 'nt' and p.startswith("/"):
                 p = p[1:]
-            _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
+            _check_file_exists(p, with_mlflow=False)


 def test_experiment_already_exists():
@@ -380,13 +386,13 @@ def test_experiment_already_exists():
     }

     with get_temp_directory() as temp_path:
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        run_experiment(params, X_train, y_train, None, temp_path)

-        # result is overwrited by default
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        # result is not overwritten by default
+        run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace')

         with pytest.raises(Exception):
-            run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
+            run_experiment(params, X_train, y_train, None, temp_path)


 def test_submission_filename():