change params/metrics to json format, replace overwrite to if_exists
parent 72092fb1ab
commit 174cc19b53
@@ -46,6 +46,6 @@ run_experiment(logging_directory='baseline_kaggledays_tokyo',
                X_test=X_test,
                eval_func=mean_squared_error,
                type_of_target='continuous',
-               overwrite=True,
+               if_exists='replace',
                with_auto_hpo=True,
                sample_submission=pd.read_csv('sample_submission.csv'))
@@ -1,4 +1,5 @@
+import json
 import numbers
 import os
 import shutil
 import uuid
@@ -22,6 +23,25 @@ def _sanitize_mlflow_param(param, limit):
     return param


+def _check_directory(directory: str, if_exists: str) -> str:
+    if os.path.exists(directory):
+        if if_exists == 'error':
+            raise ValueError('directory {} already exists.'.format(directory))
+        elif if_exists == 'replace':
+            warnings.warn(
+                'directory {} already exists. It will be replaced by the new result'.format(directory))
+            shutil.rmtree(directory, ignore_errors=True)
+        elif if_exists == 'rename':
+            postfix_index = 1
+
+            while os.path.exists(directory + '_' + str(postfix_index)):
+                postfix_index += 1
+
+            directory += '_' + str(postfix_index)
+            warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory))
+    return directory
+
+
 class Experiment(object):
     """Minimal experiment logger for Kaggle
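As an aside, here is a self-contained sketch of the 'rename' search the helper above performs; the directory names are made up for the example:

    import os
    import tempfile

    root = tempfile.mkdtemp()
    directory = os.path.join(root, 'experiment')
    os.makedirs(directory)              # pretend a previous run already created it
    os.makedirs(directory + '_1')       # ...and one renamed copy exists too

    # same loop as in _check_directory(directory, 'rename')
    postfix_index = 1
    while os.path.exists(directory + '_' + str(postfix_index)):
        postfix_index += 1
    print(directory + '_' + str(postfix_index))  # -> .../experiment_2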
@@ -42,8 +62,6 @@ class Experiment(object):
     Args:
         logging_directory:
             Path to directory where output is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
         custom_logger:
             A custom logger to be used instead of default logger.
         with_mlflow:
@@ -51,17 +69,24 @@ class Experiment(object):
            One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
            Note that all output files are located both ``logging_directory`` and
            mlflow's directory (``mlruns`` by default).
+        if_exists:
+            How to behave if the logging directory already exists.
+
+            - error: Raise a ValueError.
+            - replace: Delete logging directory before logging.
+            - append: Append to existing experiment.
+            - rename: Rename current directory by adding "_1", "_2"... suffix
     """

     def __init__(self,
                  logging_directory: str,
-                 overwrite: bool = False,
                  custom_logger: Optional[Logger] = None,
                  with_mlflow: bool = False,
                  mlflow_run_id: Optional[str] = None,
-                 logging_mode: str = 'w'
+                 if_exists: str = 'error'
                  ):
-        os.makedirs(logging_directory, exist_ok=overwrite)
+        logging_directory = _check_directory(logging_directory, if_exists)
+        os.makedirs(logging_directory, exist_ok=True)

         self.logging_directory = logging_directory
         self.with_mlflow = with_mlflow
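A hedged usage sketch of the new constructor argument; the output path is a placeholder and 'error' stays the default policy:

    from nyaggle.experiment import Experiment

    # 'replace' wipes an existing directory, 'append' reopens it,
    # and 'rename' redirects the run to output_1, output_2, ...
    with Experiment('output', if_exists='rename') as exp:
        exp.log_param('learning_rate', 0.1)
        exp.log_metric('CV', 0.97)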
@@ -75,8 +100,8 @@ class Experiment(object):
             self.logger.setLevel(DEBUG)
             self.is_custom = False
-        self.metrics_path = os.path.join(logging_directory, 'metrics.txt')
-        self.metrics = open(self.metrics_path, mode=logging_mode)
-        self.params = open(os.path.join(logging_directory, 'params.txt'), mode=logging_mode)
+        self.metrics = self._load_dict('metrics.json')
+        self.params = self._load_dict('params.json')
         self.inherit_existing_run = False

         if self.with_mlflow:
@@ -98,15 +123,13 @@ class Experiment(object):
     def continue_from(cls, logging_directory: str):
         params = {
             'logging_directory': logging_directory,
-            'overwrite': True,
-            'logging_mode': 'a'
+            'if_exists': 'append'
         }

         mlflow_path = os.path.join(logging_directory, 'mlflow.json')
         if os.path.exists(mlflow_path):
             with open(mlflow_path, 'r') as f:
                 mlflow_metadata = json.load(f)

             params['with_mlflow'] = True
             params['mlflow_run_id'] = mlflow_metadata['run_id']
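Since resuming now maps to if_exists='append', picking up an earlier run looks like this (the directory name is a placeholder):

    from nyaggle.experiment import Experiment

    # reopens the existing directory and keeps appending to metrics.json/params.json
    with Experiment.continue_from('output') as exp:
        exp.log_metric('LB', 0.95)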
@@ -132,12 +155,29 @@ class Experiment(object):
         with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f:
             json.dump(mlflow_metadata, f, indent=4)

+    def _load_dict(self, filename: str) -> Dict:
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'r') as f:
+                return json.load(f)
+        except IOError:
+            self.logger.warning('failed to load file: {}'.format(filename))
+            return {}
+
+    def _save_dict(self, obj: Dict, filename: str):
+        try:
+            path = os.path.join(self.logging_directory, filename)
+            with open(path, 'w') as f:
+                json.dump(obj, f)
+        except IOError:
+            self.logger.warning('failed to save file: {}'.format(filename))
+
     def stop(self):
         """
         Stop current experiment.
         """
-        self.metrics.close()
-        self.params.close()
+        self._save_dict(self.metrics, 'metrics.json')
+        self._save_dict(self.params, 'params.json')

         if not self.is_custom:
             for h in self.logger.handlers:
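Because metrics and params are now plain dicts dumped to JSON when the experiment stops, reading results back needs only the standard library; the path and metric name below are placeholders:

    import json
    import os

    logging_directory = 'output'
    with open(os.path.join(logging_directory, 'metrics.json'), 'r') as f:
        metrics = json.load(f)
    print(metrics.get('CV'))  # e.g. a cross-validation score logged during the run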
@@ -146,7 +186,8 @@ class Experiment(object):
         if self.with_mlflow:
             import mlflow
             mlflow.log_artifact(self.log_path)
-            mlflow.log_artifact(self.metrics_path)
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json'))
+            mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json'))
             if not self.inherit_existing_run:
                 mlflow.end_run()

@@ -182,6 +223,9 @@ class Experiment(object):
         """
         self.logger.info(text)

+    def _sanitize(self, v):
+        return v if isinstance(v, numbers.Number) else str(v)
+
     def log_param(self, key, value):
         """
         Logs a key-value pair for the experiment.
@@ -190,8 +234,9 @@ class Experiment(object):
             key: parameter name
             value: parameter value
         """
-        self.params.write('{},{}\n'.format(key, value))
-        self.params.flush()
+        key = self._sanitize(key)
+        value = self._sanitize(value)
+        self.params[key] = value

         if self.with_mlflow:
             import mlflow
@@ -219,8 +264,9 @@ class Experiment(object):
             score:
                 Metric value.
         """
-        self.metrics.write('{},{}\n'.format(name, score))
-        self.metrics.flush()
+        name = self._sanitize(name)
+        score = self._sanitize(score)
+        self.metrics[name] = score

         if self.with_mlflow:
             import mlflow
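The new _sanitize step keeps both dicts JSON-serializable by stringifying anything that is not a number; a tiny standalone sketch of the same idea:

    import numbers

    def _sanitize(v):
        # numbers pass through untouched; everything else is stored as its string form
        return v if isinstance(v, numbers.Number) else str(v)

    params = {}
    params[_sanitize('n_estimators')] = _sanitize(100)   # stays an int
    params[_sanitize('objective')] = _sanitize(None)     # stored as 'None'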
@@ -37,7 +37,7 @@ def run_experiment(model_params: Dict[str, Any],
                    X_train: pd.DataFrame, y: pd.Series,
                    X_test: Optional[pd.DataFrame] = None,
                    logging_directory: str = 'output/{time}',
-                   overwrite: bool = False,
+                   if_exists: str = 'error',
                    eval_func: Optional[Callable] = None,
                    algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                    fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
@@ -92,8 +92,12 @@ def run_experiment(model_params: Dict[str, Any],
             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
         logging_directory:
             Path to directory where output of experiment is stored.
-        overwrite:
-            If True, contents in ``logging_directory`` will be overwritten.
+        if_exists:
+            How to behave if the logging directory already exists.
+            - error: Raise a ValueError.
+            - replace: Delete logging directory before logging.
+            - append: Append to existing experiment.
+            - rename: Rename current directory by adding "_1", "_2"... suffix
         fit_params:
             Parameters passed to the fit method of the estimator. If dict is passed, the same parameter except
             eval_set passed for each fold. If callable is passed,
@@ -182,7 +186,7 @@ def run_experiment(model_params: Dict[str, Any],

     logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))

-    with Experiment(logging_directory, overwrite, with_mlflow=with_mlflow) as exp:
+    with Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow) as exp:
         exp.log('Algorithm: {}'.format(algorithm_type))
         exp.log('Experiment: {}'.format(logging_directory))
         exp.log('Params: {}'.format(model_params))
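A hedged end-to-end sketch of the migrated call; the data, the parameter values, and the assumption that run_experiment is importable from nyaggle.experiment are illustrative only:

    import pandas as pd
    from sklearn.datasets import make_regression
    from sklearn.metrics import mean_squared_error
    from nyaggle.experiment import run_experiment

    X, y = make_regression(n_samples=200, n_features=5, random_state=0)
    X = pd.DataFrame(X, columns=['f{}'.format(i) for i in range(5)])
    y = pd.Series(y)

    # if_exists='replace' takes over the role of the removed overwrite=True
    run_experiment({'objective': 'regression'}, X, y,
                   logging_directory='output/{time}',
                   if_exists='replace',
                   eval_func=mean_squared_error,
                   type_of_target='continuous')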
@@ -1,4 +1,6 @@
+import json
 import os

 from nyaggle.experiment import Experiment
 from nyaggle.testing import get_temp_directory
@@ -12,13 +14,7 @@ def test_experiment_continue():
         with Experiment.continue_from(logging_dir) as e:
             e.log_metric('LB', 0.95)

-        metric_file = os.path.join(logging_dir, 'metrics.txt')
-
-        with open(metric_file, 'r') as f:
-            lines = [line.split(',') for line in f.readlines()]
-
-        assert lines[0][0] == 'CV'
-        assert lines[1][0] == 'LB'
+        metric_file = os.path.join(logging_dir, 'metrics.json')

         import mlflow

@@ -26,3 +22,8 @@ def test_experiment_continue():
         data = client.get_run(mlflow.active_run().info.run_id).data
         assert data.metrics['CV'] == 0.97
         assert data.metrics['LB'] == 0.95
+
+        with open(metric_file, 'r') as f:
+            obj = json.load(f)
+            assert obj['CV'] == 0.97
+            assert obj['LB'] == 0.95
@@ -16,7 +16,13 @@ from nyaggle.feature_store import save_feature
 from nyaggle.testing import make_classification_df, make_regression_df, get_temp_directory


-def _check_file_exists(directory, files):
+def _check_file_exists(directory, submission_filename=None, with_mlflow=False):
+    files = ['oof_prediction.npy', 'test_prediction.npy', 'metrics.json', 'params.json']
+    if submission_filename:
+        files.append(submission_filename)
+    if with_mlflow:
+        files.append('mlflow.json')
+
     for f in files:
         assert os.path.exists(os.path.join(directory, f)), 'File not found: {}'.format(f)

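With the new helper signature the core artifacts are implied and optional ones are opted in per test; the combined form below is only an illustration reusing the helper and temp_path defined in this test module:

    # core files only: oof_prediction.npy, test_prediction.npy, metrics.json, params.json
    _check_file_exists(temp_path)

    # additionally require the submission csv and the mlflow metadata file
    _check_file_exists(temp_path, submission_filename='submission.csv', with_mlflow=True)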
@@ -40,7 +46,7 @@ def test_experiment_lgb_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_regressor():
@@ -61,7 +67,7 @@ def test_experiment_lgb_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_lgb_multiclass():
@@ -83,7 +89,7 @@ def test_experiment_lgb_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_classifier():
@@ -107,7 +113,7 @@ def test_experiment_cat_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_regressor():
@@ -125,7 +131,7 @@ def test_experiment_cat_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='cat')

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_multiclass():
@@ -148,7 +154,7 @@ def test_experiment_cat_multiclass():

         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_classifier():
@@ -172,7 +178,7 @@ def test_experiment_xgb_classifier():
         assert roc_auc_score(y_test, result.test_prediction) >= 0.9
         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', 'tgt']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_xgb_regressor():
@@ -190,7 +196,7 @@ def test_experiment_xgb_regressor():
         result = run_experiment(params, X_train, y_train, X_test, temp_path, algorithm_type='xgb', with_auto_prep=True)

         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_xgb_multiclass():
@@ -214,7 +220,7 @@ def test_experiment_xgb_multiclass():

         assert list(pd.read_csv(os.path.join(temp_path, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

-        _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path, submission_filename='submission.csv')


 def test_experiment_sklearn_classifier():
@@ -236,7 +242,7 @@ def test_experiment_sklearn_classifier():
         assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
         assert roc_auc_score(y_test, result.test_prediction) >= 0.8

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_regressor():
@@ -257,7 +263,7 @@ def test_experiment_sklearn_regressor():
         assert len(np.unique(result.test_prediction)) > 5
         assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_sklearn_multiclass():
@@ -279,7 +285,7 @@ def test_experiment_sklearn_multiclass():
         assert result.oof_prediction.shape == (len(y_train), 5)
         assert result.test_prediction.shape == (len(y_test), 5)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_cat_custom_eval():
@@ -299,7 +305,7 @@ def test_experiment_cat_custom_eval():
                                 algorithm_type='cat', eval_func=mean_absolute_error)

         assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_without_test_data():
@@ -317,7 +323,7 @@ def test_experiment_without_test_data():
         result = run_experiment(params, X_train, y_train, None, temp_path)

         assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
+        _check_file_exists(temp_path)


 def test_experiment_fit_params():
@@ -357,7 +363,7 @@ def test_experiment_mlflow():
     with get_temp_directory() as temp_path:
         run_experiment(params, X_train, y_train, None, temp_path, with_mlflow=True)

-        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))
+        _check_file_exists(temp_path, with_mlflow=True)

         # test if output files are also stored in the mlflow artifact uri
         with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:
@@ -365,7 +371,7 @@ def test_experiment_mlflow():
             p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
             if os.name == 'nt' and p.startswith("/"):
                 p = p[1:]
-            _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
+            _check_file_exists(p, with_mlflow=False)


 def test_experiment_already_exists():
@@ -380,13 +386,13 @@ def test_experiment_already_exists():
     }

     with get_temp_directory() as temp_path:
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        run_experiment(params, X_train, y_train, None, temp_path)

-        # result is overwritten by default
-        run_experiment(params, X_train, y_train, None, temp_path, overwrite=True)
+        # result is not overwritten by default
+        run_experiment(params, X_train, y_train, None, temp_path, if_exists='replace')

         with pytest.raises(Exception):
-            run_experiment(params, X_train, y_train, None, temp_path, overwrite=False)
+            run_experiment(params, X_train, y_train, None, temp_path)


 def test_submission_filename():