change params/metrics to JSON format, replace overwrite with if_exists

pull/47/head
Taiga Noumi 2020-02-19 20:05:46 +09:00 committed by nyanp
parent 72092fb1ab
commit 174cc19b53
5 changed files with 107 additions and 50 deletions


@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo',
X_test=X_test,
eval_func=mean_squared_error,
type_of_target='continuous',
overwrite=True,
if_exists='replace',
with_auto_hpo=True,
sample_submission=pd.read_csv('sample_submission.csv'))
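
For anyone updating existing scripts, the old boolean maps onto the new option roughly as follows (a summary of the behaviours documented later in this commit, not an exhaustive specification):

# Rough migration guide from `overwrite` to `if_exists`:
#   overwrite=True   ->  if_exists='replace'   # delete the existing directory, then log
#   overwrite=False  ->  if_exists='error'     # raise ValueError if the directory exists
# Newly available policies with no old equivalent:
#   if_exists='append'   # keep logging into the existing experiment
#   if_exists='rename'   # log into '<logging_directory>_1', '_2', ...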


@ -1,4 +1,5 @@
import json
import numbers
import os
import shutil
import uuid
@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit):
return param
def _check_directory(directory: str, if_exists: str) -> str:
if os.path.exists(directory):
if if_exists == 'error':
raise ValueError('directory {} already exists.'.format(directory))
elif if_exists == 'replace':
warnings.warn(
'directory {} already exists. It will be replaced by the new result'.format(directory))
shutil.rmtree(directory, ignore_errors=True)
elif if_exists == 'rename':
postfix_index = 1
while os.path.exists(directory + '_' + str(postfix_index)):
postfix_index += 1
directory += '_' + str(postfix_index)
warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
return directory
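
A small, self-contained sketch of what each policy does when the directory already exists. It imports the private helper directly, so the module path (nyaggle.experiment.experiment) and the temporary layout are assumptions made for illustration only:

import os
import tempfile

from nyaggle.experiment.experiment import _check_directory  # private helper; import path assumed

with tempfile.TemporaryDirectory() as root:
    existing = os.path.join(root, 'exp')
    os.makedirs(existing)

    print(_check_directory(existing, 'append'))   # same path is returned, contents are kept
    print(_check_directory(existing, 'rename'))   # returns '.../exp_1' and emits a warning
    print(_check_directory(existing, 'replace'))  # deletes '.../exp', then returns the same path
    # _check_directory(existing, 'error')         # would raise ValueError while the directory exists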
class Experiment(object):
"""Minimal experiment logger for Kaggle
@ -42,8 +62,6 @@ class Experiment(object):
Args:
logging_directory:
Path to directory where output is stored.
overwrite:
If True, contents in ``logging_directory`` will be overwritten.
custom_logger:
A custom logger to be used instead of default logger.
with_mlflow:
@ -51,17 +69,24 @@ class Experiment(object):
One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
Note that all output files are located both ``logging_directory`` and
mlflow's directory (``mlruns`` by default).
if_exists:
How to behave if the logging directory already exists.
- error: Raise a ValueError.
- replace: Delete logging directory before logging.
- append: Append to existing experiment.
- rename: Rename the current directory by adding a "_1", "_2", ... suffix.
"""
def __init__(self,
logging_directory: str,
overwrite: bool = False,
custom_logger: Optional[Logger] = None,
with_mlflow: bool = False,
mlflow_run_id: Optional[str] = None,
logging_mode: str = 'w'
if_exists: str = 'error'
):
os.makedirs(logging_directory, exist_ok=overwrite)
logging_directory = _check_directory(logging_directory, if_exists)
os.makedirs(logging_directory, exist_ok=True)
self.logging_directory = logging_directory
self.with_mlflow = with_mlflow
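
In practice the new argument is passed straight to the constructor. A minimal sketch, where the directory name is a placeholder (on a fresh directory the initial 'failed to load file' warnings from _load_dict are expected and harmless):

from nyaggle.experiment import Experiment

# Re-running this block keeps writing into the same directory because of 'append';
# with the default if_exists='error' a second run would raise ValueError instead.
with Experiment('output/demo', if_exists='append') as exp:
    exp.log_param('model', 'lgbm')
    exp.log_metric('cv_auc', 0.912)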
@ -75,8 +100,8 @@ class Experiment(object):
self.logger.setLevel(DEBUG)
self.is_custom = False
self.metrics_path = os.path.join(logging_directory, 'metrics.txt')
self.metrics = open(self.metrics_path, mode=logging_mode)
self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode)
self.metrics = self._load_dict('metrics.json')
self.params = self._load_dict('params.json')
self.inherit_existing_run = False
if self.with_mlflow:
@ -98,15 +123,13 @@ class Experiment(object):
def continue_from(cls, logging_directory: str):
params = {
'logging_directory': logging_directory,
'overwrite': True,
'logging_mode': 'a'
'if_exists': 'append'
}
mlflow_path = os.path.join(logging_directory, 'mlflow.json')
if os.path.exists(mlflow_path):
with open(mlflow_path, 'r') as f:
mlflow_metadata = json.load(f)
params['with_mlflow'] = True
params['mlflow_run_id'] = mlflow_metadata['run_id']
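
Resuming is now expressed internally as if_exists='append'. A sketch of the public entry point, mirroring the updated test further below (the directory name is a placeholder):

from nyaggle.experiment import Experiment

# first run
with Experiment('output/exp001') as exp:
    exp.log_metric('CV', 0.97)

# later: reopen the same experiment; the stored metrics are loaded and extended
with Experiment.continue_from('output/exp001') as exp:
    exp.log_metric('LB', 0.95)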
@ -132,12 +155,29 @@ class Experiment(object):
with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
json.dump(mlflow_metadata, f, indent=4)
def _load_dict(self, filename: str) -> Dict:
try:
path = os.path.join(self.logging_directory, filename)
with open(path, 'r') as f:
return json.load(f)
except IOError:
self.logger.warning('failed to load file: {}'.format(filename))
return {}
def _save_dict(self, obj: Dict, filename: str):
try:
path = os.path.join(self.logging_directory, filename)
with open(path, 'w') as f:
json.dump(obj, f)
except IOError:
self.logger.warning('failed to save file: {}'.format(filename))
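
These helpers give stop() (below) a plain-JSON on-disk format in place of the old comma-separated text files. A sketch of reading an experiment's output back, continuing the directory name used in the sketch above:

import json
import os

logging_directory = 'output/exp001'  # placeholder
with open(os.path.join(logging_directory, 'metrics.json')) as f:
    metrics = json.load(f)   # e.g. {"CV": 0.97, "LB": 0.95}
with open(os.path.join(logging_directory, 'params.json')) as f:
    params = json.load(f)    # {} here, since the runs above logged no parameters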
def stop(self):
"""
Stop current experiment.
"""
self.metrics.close()
self.params.close()
self._save_dict(self.metrics, 'metrics.json')
self._save_dict(self.params, 'params.json')
if not self.is_custom:
for h in self.logger.handlers:
@ -146,7 +186,8 @@ class Experiment(object):
if self.with_mlflow:
import mlflow
mlflow.log_artifact(self.log_path)
mlflow.log_artifact(self.metrics_path)
mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))
if not self.inherit_existing_run:
mlflow.end_run()
@ -182,6 +223,9 @@ class Experiment(object):
"""
self.logger.info(text)
def _sanitize(self, v):
return v if isinstance(v, numbers.Number) else str(v)
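
The sanitizer keeps numbers as-is and stringifies everything else, which keeps both dictionaries JSON-serializable. A small sketch of its effect (values chosen only for illustration):

import numbers

def _sanitize(v):
    # mirrors Experiment._sanitize above
    return v if isinstance(v, numbers.Number) else str(v)

_sanitize(0.5)        # -> 0.5          (numbers pass through unchanged)
_sanitize('lgbm')     # -> 'lgbm'       (already a string)
_sanitize([64, 64])   # -> '[64, 64]'   (other values become their str())
_sanitize(None)       # -> 'None'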
def log_param(self, key, value):
"""
Logs a key-value pair for the experiment.
@ -190,8 +234,9 @@ class Experiment(object):
key: parameter name
value: parameter value
"""
self.params.write('{},{}\n'.format(key, value))
self.params.flush()
key = self._sanitize(key)
value = self._sanitize(value)
self.params[key] = value
if self.with_mlflow:
import mlflow
@ -219,8 +264,9 @@ class Experiment(object):
score:
Metric value.
"""
self.metrics.write('{},{}\n'.format(name, score))
self.metrics.flush()
name = self._sanitize(name)
score = self._sanitize(score)
self.metrics[name] = score
if self.with_mlflow:
import mlflow
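
One behavioural consequence of storing metrics and params in dictionaries: repeated calls with the same key now overwrite the earlier value instead of appending another line, as the old text files did. A sketch (the directory name is a placeholder; mlflow disabled):

from nyaggle.experiment import Experiment

with Experiment('output/demo2', if_exists='replace') as exp:
    exp.log_metric('CV', 0.95)
    exp.log_metric('CV', 0.97)   # same key: the dictionary entry is replaced

# metrics.json now contains {"CV": 0.97}; the old metrics.txt would have kept both lines.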


@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any],
X_train: pd.DataFrame, y: pd.Series,
X_test: Optional[pd.DataFrame] = None,
logging_directory: str = 'output/{time}',
overwrite: bool = False,
if_exists: str = 'error',
eval_func: Optional[Callable] = None,
algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any],
Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
logging_directory:
Path to directory where output of experiment is stored.
overwrite:
If True, contents in ``logging_directory`` will be overwritten.
if_exists:
How to behave if the logging directory already exists.
- error: Raise a ValueError.
- replace: Delete logging directory before logging.
- append: Append to existing experiment.
- rename: Rename the current directory by adding a "_1", "_2", ... suffix.
fit_params:
Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except
eval_set passed for each fold. If callable is passed,
@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any],
logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))
with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp:
with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
exp.log('Algorithm: {}'.format(algorithm_type))
exp.log('Experiment: {}'.format(logging_directory))
exp.log('Params: {}'.format(model_params))
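
A runnable sketch of a call against the updated signature, using synthetic data. The parameter values, column names, and output directory are illustrative, and LightGBM is assumed to be installed since algorithm_type defaults to 'lgbm':

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from nyaggle.experiment import run_experiment

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=['f{}'.format(i) for i in range(10)])
y = pd.Series(y)

result = run_experiment({'objective': 'regression', 'max_depth': 3},
                        X, y,
                        logging_directory='output/demo_run',
                        if_exists='replace',           # was overwrite=True before this commit
                        eval_func=mean_squared_error,
                        type_of_target='continuous')
print(result.metrics[-1])  # overall out-of-fold score (cf. the assertions in the tests below)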


@ -1,4 +1,6 @@
import json
import os
from nyaggle.experiment import Experiment
from nyaggle.testing import get_temp_directory
@ -12,13 +14,7 @@ def test_experiment_continue():
with Experiment.continue_from(logging_dir) as e:
e.log_metric('LB', 0.95)
metric_file = os.path.join(logging_dir, 'metrics.txt')
with open(metric_file, 'r') as f:
lines = [line.split(',') for line in f.readlines()]
assert lines[0][0] == 'CV'
assert lines[1][0] == 'LB'
metric_file = os.path.join(logging_dir, 'metrics.json')
import mlflow
@ -26,3 +22,8 @@ def test_experiment_continue():
data = client.get_run(mlflow.active_run().info.run_id).data
assert data.metrics['CV'] == 0.97
assert data.metrics['LB'] == 0.95
with open(metric_file, 'r') as f:
obj = json.load(f)
assert obj['CV'] == 0.97
assert obj['LB'] == 0.95


@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature
from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory
def _check_file_exists(directory, files):
def _check_file_exists(directory, submission_filename=None, with_mlflow=False):
files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 'params.json']
if submission_filename:
files.append(submission_filename)
if with_mlflow:
files.append('mlflow.json')
for f in files:
assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f)
@ -40,7 +46,7 @@ def test_experiment_lgb_classifier():
assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_lgb_regressor():
@ -61,7 +67,7 @@ def test_experiment_lgb_regressor():
assert len(np.unique(result.test_prediction)) > 5
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_lgb_multiclass():
@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass():
assert result.oof_prediction.shape == (len(y_train), 5)
assert result.test_prediction.shape == (len(y_test), 5)
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_classifier():
@ -107,7 +113,7 @@ def test_experiment_cat_classifier():
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_regressor():
@ -125,7 +131,7 @@ def test_experiment_cat_regressor():
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_multiclass():
@ -148,7 +154,7 @@ def test_experiment_cat_multiclass():
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_xgb_classifier():
@ -172,7 +178,7 @@ def test_experiment_xgb_classifier():
assert roc_auc_score(y_test, result.test_prediction) >= 0.9
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_xgb_regressor():
@ -190,7 +196,7 @@ def test_experiment_xgb_regressor():
result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_xgb_multiclass():
@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass():
assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path, submission_filename='submission.csv')
def test_experiment_sklearn_classifier():
@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier():
assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
assert roc_auc_score(y_test, result.test_prediction) >= 0.8
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_sklearn_regressor():
@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor():
assert len(np.unique(result.test_prediction)) > 5
assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_sklearn_multiclass():
@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass():
assert result.oof_prediction.shape == (len(y_train), 5)
assert result.test_prediction.shape == (len(y_test), 5)
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_cat_custom_eval():
@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval():
algorithm_type='cat', eval_func=mean_absolute_error)
assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
_check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_without_test_data():
@ -317,7 +323,7 @@ def test_experiment_without_test_data():
result = run_experiment(params, X_train, y_train, None, temp_path)
assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
_check_file_exists(temp_path)
def test_experiment_fit_params():
@ -357,7 +363,7 @@ def test_experiment_mlflow():
with get_temp_directory() as temp_path:
run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)
_check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
_check_file_exists(temp_path, with_mlflow=True)
# test if output files are also stored in the mlflow artifact uri
with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:
@ -365,7 +371,7 @@ def test_experiment_mlflow():
p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
if os.name == 'nt' and p.startswith("/"):
p = p[1:]
_check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
_check_file_exists(p, with_mlflow=False)
def test_experiment_already_exists():
@ -380,13 +386,13 @@ def test_experiment_already_exists():
}
with get_temp_directory() as temp_path:
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
run_experiment(params, X_train, y_train, None, temp_path)
# result is overwritten by default
run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
# result is not overwritten by default
run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace')
with pytest.raises(Exception):
run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
run_experiment(params, X_train, y_train, None, temp_path)
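
The test covers 'error' (the default) and 'replace'; a sketch of how 'rename' would behave in the same block, reusing the names and imports already in scope in this test (not part of the diff):

# output is written into a '_1' sibling directory instead of raising, with a warning
run_experiment(params, X_train, y_train, None, temp_path, if_exists='rename')
assert os.path.exists(temp_path + '_1')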
def test_submission_filename():