fix code style by code inspection
parent 18606d7251
commit 9b314cf655

@@ -51,11 +51,6 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages. See the documentation for
 # a list of builtin themes.
 #
 html_theme = 'alabaster'
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".

@@ -45,6 +45,6 @@ experiment_gbdt(logging_directory='baseline_kaggledays_tokyo',
                 X_train=X_train,
                 y=y_train,
                 X_test=X_test,
-                eval=mean_squared_error,
+                eval_func=mean_squared_error,
                 type_of_target='continuous',
                 overwrite=True)

@@ -190,24 +190,25 @@ class Experiment(object):
         import mlflow
         mlflow.log_artifact(path + '.npy')
 
-    def log_dataframe(self, name: str, df: pd.DataFrame, format: str = 'feather'):
+    def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'):
         """
         Log a pandas dataframe under the logging directory.
 
         Args:
             name:
-                Name of the file. A .f or .csv extension will be appended to the file name if it does not already have one.
+                Name of the file. A .f or .csv extension will be appended to the file name if it does not already
+                have one.
             df:
                 A dataframe to be saved.
-            format:
+            file_format:
                 A format of output file. ``csv`` and ``feather`` are supported.
         """
         path = os.path.join(self.logging_directory, name)
-        if format == 'feather':
+        if file_format == 'feather':
             if not path.endswith('.f'):
                 path += '.f'
             df.to_feather(path)
-        elif format == 'csv':
+        elif file_format == 'csv':
             if not path.endswith('.csv'):
                 path += '.csv'
             df.to_csv(path, index=False)
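
Note: the rename of the keyword argument from format to file_format is an API
change for callers that pass it by keyword. A minimal sketch of a hypothetical
caller (the Experiment(logging_directory=...) construction is assumed from the
self.logging_directory attribute above; it is not shown in this diff):

    import pandas as pd
    from nyaggle.experiment import Experiment

    exp = Experiment(logging_directory='logs/demo')   # assumed constructor
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})

    # before this commit: exp.log_dataframe('train', df, format='csv')
    exp.log_dataframe('train', df, file_format='csv')  # writes logs/demo/train.csv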

@@ -22,7 +22,7 @@ GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'scor
 def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_column: str,
                     X_train: pd.DataFrame, y: pd.Series,
                     X_test: Optional[pd.DataFrame] = None,
-                    eval: Optional[Callable] = None,
+                    eval_func: Optional[Callable] = None,
                     gbdt_type: str = 'lgbm',
                     fit_params: Optional[Dict[str, Any]] = None,
                     cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
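
The eval -> eval_func rename applied throughout this commit removes a parameter
name that shadows Python's builtin eval(), which is the kind of issue the
"code inspection" in the commit message flags. A standalone sketch (not nyaggle
code) of the hazard and the fix:

    from sklearn.metrics import mean_squared_error

    def overall_score(y_true, y_pred, eval=mean_squared_error):
        # Inside this body, `eval` is the metric function; builtins.eval is
        # shadowed and unreachable without going through the builtins module.
        return eval(y_true, y_pred)

    def overall_score_fixed(y_true, y_pred, eval_func=mean_squared_error):
        # With the rename, the builtin stays visible and the intent is clear.
        return eval_func(y_true, y_pred)

    print(overall_score_fixed([1.0, 2.0], [1.0, 3.0]))  # 0.5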

@@ -78,7 +78,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
             Target
         X_test:
             Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
-        eval:
+        eval_func:
             Function used for logging and calculation of returning scores.
             This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
         gbdt_type:

@@ -122,7 +122,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
             numpy array, shape (len(X_test),) Predicted value on test data. ``None`` if X_test is ``None``
         * scores:
             list of float, shape(nfolds+1) ``scores[i]`` denotes validation score in i-th fold.
-            ``scores[-1]`` is overall score. `None` if eval is not specified
+            ``scores[-1]`` is overall score. `None` if eval_func is not specified
         * models:
             list of objects, shape(nfolds) Trained models for each folds.
         * importance:

@@ -155,7 +155,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
 
     if type_of_target == 'auto':
         type_of_target = multiclass.type_of_target(y)
-    model, eval, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval)
+    model, eval_func, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval_func)
     models = [model(**model_params) for _ in range(cv.get_n_splits())]
 
     if fit_params is None:

@@ -164,7 +164,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
         fit_params[cat_param_name] = categorical_feature
 
     result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
-                            logger=exp.get_logger(), eval=eval, fit_params=fit_params)
+                            logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params)
 
     for i in range(cv.get_n_splits()):
         exp.log_metric('Fold {}'.format(i + 1), result.scores[i])

@@ -192,7 +192,8 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
 
     elapsed_time = time.time() - start_time
 
-    return GBDTResult(result.oof_prediction, result.test_prediction, result.scores, models, importance, elapsed_time)
+    return GBDTResult(result.oof_prediction, result.test_prediction,
+                      result.scores, models, importance, elapsed_time)
 
 
 def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Callable] = None):

@@ -206,11 +207,11 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
     if found is None:
         raise RuntimeError('Not supported gbdt_type ({}) or type_of_target ({}).'.format(gbdt_type, target_type))
 
-    model, eval, cat_param = found[2], found[3], found[4]
+    model, eval_func, cat_param = found[2], found[3], found[4]
     if custom_eval is not None:
-        eval = custom_eval
+        eval_func = custom_eval
 
-    return model, eval, cat_param
+    return model, eval_func, cat_param
 
 
 def _save_model(gbdt_type: str, model: Union[CatBoost, LGBMModel], logging_directory: str, fold: int):

@@ -44,7 +44,7 @@ class BertSentenceVectorizer(BaseFeaturizer):
     def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
                  text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
                  use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
-                 model = None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
+                 model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
         if tokenizer is not None:
             assert model is not None
             self.tokenizer = tokenizer

@@ -173,4 +173,3 @@ class BertSentenceVectorizer(BaseFeaturizer):
             Ignored
         """
         return self._process(X, self._fit_transform_one)
-

@@ -1 +1 @@
-from nyaggle.testing.util import *
+from nyaggle.testing.util import *

@@ -36,10 +36,13 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
     >>> })
     >>> plot_importance(importance, 'importance.png')
     """
-    sorted = importance.groupby('feature')['importance'].mean().reset_index().sort_values(by='importance', ascending=False)
+    importance = importance.groupby('feature')['importance']\
+        .mean()\
+        .reset_index()\
+        .sort_values(by='importance', ascending=False)
 
-    if len(sorted) > top_n:
-        sorted = sorted.iloc[:top_n, :]
+    if len(importance) > top_n:
+        importance = importance.iloc[:top_n, :]
 
     if figsize is None:
         figsize = (10, 16)

@@ -48,7 +51,7 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
     title = 'Feature Importance'
 
     plt.figure(figsize=figsize)
-    sns.barplot(x="importance", y="feature", data=sorted)
+    sns.barplot(x="importance", y="feature", data=importance)
     plt.title(title)
     plt.tight_layout()
     plt.savefig(path)
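
Besides wrapping the long line, this pair of hunks renames the local variable:
sorted shadows the builtin sorted(). The aggregation itself is unchanged; a
minimal sketch of what the chain computes, on made-up data:

    import pandas as pd

    importance = pd.DataFrame({'feature': ['a', 'b', 'a', 'b'],
                               'importance': [1.0, 4.0, 3.0, 2.0]})

    # Mean importance per feature, largest first: b (3.0) before a (2.0).
    agg = importance.groupby('feature')['importance'] \
        .mean() \
        .reset_index() \
        .sort_values(by='importance', ascending=False)
    print(agg)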

@@ -1,5 +1,5 @@
 from collections import namedtuple
-from typing import Dict, Optional
+from typing import Optional
 
 import numpy as np
 import pandas as pd

@@ -76,7 +76,7 @@ def adversarial_validate(X_train: pd.DataFrame,
     else:
         nfolds_evaluate = None
     result = cross_validate(estimator, concat, y, None, cv=5, predict_proba=True,
-                            eval=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
+                            eval_func=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
                             nfolds_evaluate=nfolds_evaluate)
 
     importance = pd.concat(result.importance)

@@ -22,7 +22,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
                    X_test: Union[pd.DataFrame, np.ndarray] = None,
                    cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                    groups: Optional[pd.Series] = None,
-                   predict_proba: bool = False, eval: Optional[Callable] = None, logger: Optional[Logger] = None,
+                   predict_proba: bool = False, eval_func: Optional[Callable] = None, logger: Optional[Logger] = None,
                    on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
                    fit_params: Optional[Dict] = None,
                    importance_type: str = 'gain',

@@ -51,7 +51,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
             Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
         predict_proba:
             If true, call ``predict_proba`` instead of ``predict`` for calculating prediction for test data.
-        eval:
+        eval_func:
             Function used for logging and returning scores
         logger:
             logger

@@ -60,7 +60,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
         fit_params:
             Parameters passed to the fit method of the estimator
         importance_type:
-            The type of feature importance to be used to calculate result. Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
+            The type of feature importance to be used to calculate result.
+            Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
         nfolds_evaluate:
             If not ``None``, and ``nfolds_evaluate`` < ``nfolds``, only ``nfolds_evaluate`` folds are evaluated.
             For example, if ``nfolds = 5`` and ``nfolds_evaluate = 2``, only the first 2 folds out of 5 are evaluated.

@@ -95,7 +96,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
     >>>                     y=y[:3],
     >>>                     X_test=X[3:, :],
     >>>                     cv=3,
-    >>>                     eval=mean_squared_error)
+    >>>                     eval_func=mean_squared_error)
     >>> print(pred_oof)
     [-101.1123267 , 26.79300693, 17.72635528]
     >>> print(pred_test)

@@ -121,14 +122,15 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
     if logger is None:
         logger = getLogger(__name__)
 
-    def _predict(model: BaseEstimator, x: pd.DataFrame, predict_proba: bool):
-        if predict_proba:
+    def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
+        if _predict_proba:
             return model.predict_proba(x)[:, 1]
         else:
             return model.predict(x)
 
     oof = np.zeros(len(X_train))
     evaluated = np.full(len(X_train), False)
     test = None
     if X_test is not None:
         test = np.zeros((len(X_test), cv.get_n_splits()))
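
Here the inner helper's parameter shadowed the predict_proba argument of the
enclosing cross_validate, so the commit prefixes it with an underscore. A
standalone sketch (not nyaggle code) of the shadowing the rename removes:

    def cross_validate_demo(predict_proba: bool):
        # Before: def _predict(x, predict_proba) hid the outer flag entirely.
        def _predict(x, _predict_proba: bool):
            return ('proba' if _predict_proba else 'label', x)

        # The call site now states explicitly which flag is forwarded.
        return _predict([0.1, 0.9], predict_proba)

    print(cross_validate_demo(True))  # ('proba', [0.1, 0.9])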

@@ -174,8 +176,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
         if isinstance(estimator[n], (LGBMModel, CatBoost)):
             importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))
 
-        if eval is not None:
-            score = eval(valid_y, oof[valid_idx])
+        if eval_func is not None:
+            score = eval_func(valid_y, oof[valid_idx])
             scores.append(score)
             logger.info('Fold {} score: {}'.format(n, score))

@@ -183,8 +185,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
         eta_all.append(elapsed)
         logger.debug('{:.3f} sec / fold'.format(elapsed))
 
-    if eval is not None:
-        score = eval(y.loc[evaluated], oof[evaluated])
+    if eval_func is not None:
+        score = eval_func(y.loc[evaluated], oof[evaluated])
         scores.append(score)
         logger.info('Overall score: {}'.format(score))

setup.py

@@ -10,6 +10,7 @@ def get_long_description():
         long_description = f.read()
     return long_description
 
+
 def get_version():
     version_filepath = path.join(path.dirname(__file__), 'nyaggle', 'version.py')
     with open(version_filepath) as f:

@@ -17,6 +18,7 @@ def get_version():
         if line.startswith('__version__'):
             return line.strip().split()[-1][1:-1]
 
+
 setup(
     name='nyaggle',
     packages=['nyaggle'],

@@ -50,5 +52,5 @@ setup(
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7'
-    ],  # Package (project) classifiers; any value listed at https://pypi.org/classifiers/ may be specified.
+    ]
 )

@@ -22,11 +22,13 @@ def _check_file_exists(directory, files):
 
 @contextmanager
 def _get_temp_directory() -> str:
+    path = None
     try:
         path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
         yield path
     finally:
-        shutil.rmtree(path, ignore_errors=True)
+        if path:
+            shutil.rmtree(path, ignore_errors=True)
 
 
 def test_experiment_lgb_classifier():
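
The added path = None plus the if path: guard hardens cleanup: if an exception
were raised before path is assigned inside try, the finally block would have
failed with an UnboundLocalError when calling shutil.rmtree(path). A condensed,
runnable sketch of the same pattern (directory creation added for the demo):

    import os
    import shutil
    import tempfile
    import uuid
    from contextlib import contextmanager

    @contextmanager
    def temp_directory():
        path = None
        try:
            path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
            yield path
        finally:
            if path:
                shutil.rmtree(path, ignore_errors=True)

    with temp_directory() as d:   # the helper only reserves a name
        os.makedirs(d)            # the demo creates and then discards it
        open(os.path.join(d, 'scores.txt'), 'w').close()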

@@ -125,7 +127,7 @@ def test_experiment_cat_custom_eval():
 
     with _get_temp_directory() as temp_path:
         result = experiment_gbdt(temp_path, params, 'user_id',
-                                 X_train, y_train, X_test, gbdt_type='cat', eval=mean_absolute_error)
+                                 X_train, y_train, X_test, gbdt_type='cat', eval_func=mean_absolute_error)
 
         assert mean_absolute_error(y_train, result.oof_prediction) == result.scores[-1]
         _check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'scores.txt'))

@@ -15,7 +15,7 @@ def test_cv_sklean_binary():
 
     model = RidgeClassifier(alpha=1.0)
 
-    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score)
+    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)
 
     assert len(scores) == 5 + 1
     assert scores[-1] >= 0.85  # overall auc

@@ -29,7 +29,7 @@ def test_cv_sklean_regression():
 
     model = Ridge(alpha=1.0)
 
-    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=r2_score)
+    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)
 
     print(scores)
     assert len(scores) == 5 + 1

@@ -45,7 +45,7 @@ def test_cv_lgbm():
     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
 
     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
-                                                             eval=roc_auc_score,
+                                                             eval_func=roc_auc_score,
                                                              fit_params={'early_stopping_rounds': 200})
 
     print(scores)

@@ -66,7 +66,7 @@ def test_cv_lgbm_df():
     models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
 
     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
-                                                             eval=roc_auc_score)
+                                                             eval_func=roc_auc_score)
 
     print(scores)
     assert len(scores) == 5 + 1

@@ -87,7 +87,7 @@ def test_cv_cat_df():
     models = [CatBoostClassifier(n_estimators=300) for _ in range(5)]
 
     pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
-                                                             eval=roc_auc_score,
+                                                             eval_func=roc_auc_score,
                                                              fit_params={'cat_features': ['cat_0']})
 
     print(scores)

@@ -114,7 +114,7 @@ def test_cv_partial_evaluate():
         nonlocal n
         n += 1
 
-    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score,
+    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score,
                                                     nfolds_evaluate=2, on_each_fold=_fold_count)
 
     assert len(scores) == 2 + 1