fix code style by code inspection

pull/12/head
nyanp 2020-01-13 23:47:45 +09:00
parent 18606d7251
commit 9b314cf655
12 changed files with 53 additions and 48 deletions

View File

@ -51,11 +51,6 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

View File

@ -45,6 +45,6 @@ experiment_gbdt(logging_directory='baseline_kaggledays_tokyo',
X_train=X_train,
y=y_train,
X_test=X_test,
eval=mean_squared_error,
eval_func=mean_squared_error,
type_of_target='continuous',
overwrite=True)

View File

@ -190,24 +190,25 @@ class Experiment(object):
import mlflow
mlflow.log_artifact(path + '.npy')
def log_dataframe(self, name: str, df: pd.DataFrame, format: str = 'feather'):
def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'):
"""
Log a pandas dataframe under the logging directory.
Args:
name:
Name of the file. A .f or .csv extension will be appended to the file name if it does not already have one.
Name of the file. A .f or .csv extension will be appended to the file name if it does not already
have one.
df:
A dataframe to be saved.
format:
file_format:
A format of output file. ``csv`` and ``feather`` are supported.
"""
path = os.path.join(self.logging_directory, name)
if format == 'feather':
if file_format == 'feather':
if not path.endswith('.f'):
path += '.f'
df.to_feather(path)
elif format == 'csv':
elif file_format == 'csv':
if not path.endswith('.csv'):
path += '.csv'
df.to_csv(path, index=False)
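
For reference, a minimal usage sketch of the renamed keyword (not part of the commit). The import path and the Experiment constructor call are assumptions; only logging_directory and the log_dataframe signature are visible in this hunk.

import pandas as pd
from nyaggle.experiment import Experiment  # import path is an assumption

df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
exp = Experiment(logging_directory='logs/example')  # constructor arguments are an assumption
exp.log_dataframe('train_sample', df, file_format='feather')  # written as train_sample.f
exp.log_dataframe('train_sample', df, file_format='csv')      # written as train_sample.csv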

View File

@ -22,7 +22,7 @@ GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'scor
def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_column: str,
X_train: pd.DataFrame, y: pd.Series,
X_test: Optional[pd.DataFrame] = None,
eval: Optional[Callable] = None,
eval_func: Optional[Callable] = None,
gbdt_type: str = 'lgbm',
fit_params: Optional[Dict[str, Any]] = None,
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
@ -78,7 +78,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
Target
X_test:
Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
eval:
eval_func:
Function used for logging and calculation of returning scores.
This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
gbdt_type:
@ -122,7 +122,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
numpy array, shape (len(X_test),) Predicted value on test data. ``None`` if X_test is ``None``
* scores:
list of float, shape(nfolds+1) ``scores[i]`` denotes validation score in i-th fold.
``scores[-1]`` is overall score. `None` if eval is not specified
``scores[-1]`` is overall score. `None` if eval_func is not specified
* models:
list of objects, shape(nfolds) Trained models for each folds.
* importance:
@ -155,7 +155,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
if type_of_target == 'auto':
type_of_target = multiclass.type_of_target(y)
model, eval, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval)
model, eval_func, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval_func)
models = [model(**model_params) for _ in range(cv.get_n_splits())]
if fit_params is None:
@ -164,7 +164,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
fit_params[cat_param_name] = categorical_feature
result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
logger=exp.get_logger(), eval=eval, fit_params=fit_params)
logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params)
for i in range(cv.get_n_splits()):
exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
@ -192,7 +192,8 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
elapsed_time = time.time() - start_time
return GBDTResult(result.oof_prediction, result.test_prediction, result.scores, models, importance, elapsed_time)
return GBDTResult(result.oof_prediction, result.test_prediction,
result.scores, models, importance, elapsed_time)
def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Callable] = None):
@ -206,11 +207,11 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
if found is None:
raise RuntimeError('Not supported gbdt_type ({}) or type_of_target ({}).'.format(gbdt_type, target_type))
model, eval, cat_param = found[2], found[3], found[4]
model, eval_func, cat_param = found[2], found[3], found[4]
if custom_eval is not None:
eval = custom_eval
eval_func = custom_eval
return model, eval, cat_param
return model, eval_func, cat_param
def _save_model(gbdt_type: str, model: Union[CatBoost, LGBMModel], logging_directory: str, fold: int):
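
A hedged sketch of the renamed eval_func parameter in a full experiment_gbdt call, mirroring the kaggledays example earlier in this commit. The import path, model_params, and the id_column value are placeholders/assumptions, and X_train, y_train, X_test are assumed to be pre-built pandas objects.

from sklearn.metrics import mean_squared_error
from nyaggle.experiment import experiment_gbdt  # import path is an assumption

# model_params is a placeholder LightGBM config and 'id' a hypothetical id column.
model_params = {'n_estimators': 1000, 'learning_rate': 0.05}

result = experiment_gbdt(logging_directory='baseline_example',
                         model_params=model_params,
                         id_column='id',
                         X_train=X_train,
                         y=y_train,
                         X_test=X_test,
                         eval_func=mean_squared_error,  # renamed from eval in this commit
                         type_of_target='continuous',
                         overwrite=True)

print(result.scores[-1])             # overall score computed with eval_func on the OOF prediction
submission = result.test_prediction  # ensemble prediction on X_test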

View File

@ -44,7 +44,7 @@ class BertSentenceVectorizer(BaseFeaturizer):
def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
model = None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
if tokenizer is not None:
assert model is not None
self.tokenizer = tokenizer
@ -173,4 +173,3 @@ class BertSentenceVectorizer(BaseFeaturizer):
Ignored
"""
return self._process(X, self._fit_transform_one)
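
A hedged usage sketch for the reformatted constructor. The import path and the fit_transform entry point are assumptions inferred from the _fit_transform_one call at the end of this hunk; transformers and torch must be installed.

import pandas as pd
from nyaggle.feature.nlp import BertSentenceVectorizer  # import path is an assumption

df = pd.DataFrame({'title': ['a cat sat on the mat', 'dogs are great'],
                   'price': [10, 20]})
bv = BertSentenceVectorizer(text_columns=['title'], pooling_strategy='reduce_mean',
                            use_cuda=False)
df_vec = bv.fit_transform(df)  # 'title' is expected to become embedding columns named title_0, title_1, ...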

View File

@ -1 +1 @@
from nyaggle.testing.util import *
from nyaggle.testing.util import *

View File

@ -36,10 +36,13 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
>>> })
>>> plot_importance(importance, 'importance.png')
"""
sorted = importance.groupby('feature')['importance'].mean().reset_index().sort_values(by='importance', ascending=False)
importance = importance.groupby('feature')['importance']\
.mean()\
.reset_index()\
.sort_values(by='importance', ascending=False)
if len(sorted) > top_n:
sorted = sorted.iloc[:top_n, :]
if len(importance) > top_n:
importance = importance.iloc[:top_n, :]
if figsize is None:
figsize = (10, 16)
@ -48,7 +51,7 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
title = 'Feature Importance'
plt.figure(figsize=figsize)
sns.barplot(x="importance", y="feature", data=sorted)
sns.barplot(x="importance", y="feature", data=importance)
plt.title(title)
plt.tight_layout()
plt.savefig(path)
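
A sketch of the input plot_importance expects, consistent with the groupby('feature')['importance'].mean() logic above; the import path is an assumption.

import pandas as pd
from nyaggle.util import plot_importance  # import path is an assumption

# One row per (fold, feature); the function averages per feature, sorts, and
# keeps the top_n rows before drawing the seaborn bar plot saved to path.
importance = pd.DataFrame({
    'feature': ['age', 'income', 'age', 'income'],
    'importance': [120.0, 80.0, 100.0, 90.0]
})
plot_importance(importance, 'importance.png', top_n=50)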

View File

@ -1,5 +1,5 @@
from collections import namedtuple
from typing import Dict, Optional
from typing import Optional
import numpy as np
import pandas as pd
@ -76,7 +76,7 @@ def adversarial_validate(X_train: pd.DataFrame,
else:
nfolds_evaluate = None
result = cross_validate(estimator, concat, y, None, cv=5, predict_proba=True,
eval=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
eval_func=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
nfolds_evaluate=nfolds_evaluate)
importance = pd.concat(result.importance)
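
A heavily hedged sketch of how adversarial_validate is typically called. Only X_train appears in the visible signature, so the X_test argument, the import path, and the shape of the return value are assumptions based on the roc_auc_score and importance handling in this hunk.

from nyaggle.validation import adversarial_validate  # import path is an assumption

result = adversarial_validate(X_train, X_test)  # the X_test argument is an assumption
# result is expected to carry the adversarial AUC (roc_auc_score above) and the
# concatenated per-fold feature importance; an AUC near 0.5 means train and test
# are hard to distinguish, while drifting features rise to the top of the importance table.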

View File

@ -22,7 +22,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
X_test: Union[pd.DataFrame, np.ndarray] = None,
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
groups: Optional[pd.Series] = None,
predict_proba: bool = False, eval: Optional[Callable] = None, logger: Optional[Logger] = None,
predict_proba: bool = False, eval_func: Optional[Callable] = None, logger: Optional[Logger] = None,
on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
fit_params: Optional[Dict] = None,
importance_type: str = 'gain',
@ -51,7 +51,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
Group labels for the samples. Only used in conjunction with a Group cv instance (e.g., ``GroupKFold``).
predict_proba:
If true, call ``predict_proba`` instead of ``predict`` for calculating prediction for test data.
eval:
eval_func:
Function used for logging and returning scores
logger:
logger
@ -60,7 +60,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
fit_params:
Parameters passed to the fit method of the estimator
importance_type:
The type of feature importance to be used to calculate result. Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
The type of feature importance to be used to calculate result.
Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
nfolds_evaluate:
If not ``None``, and ``nfolds_evaluate`` < ``nfolds``, only ``nfolds_evaluate`` folds are evaluated.
For example, if ``nfolds = 5`` and ``nfolds_evaluate = 2``, only the first 2 folds out of 5 are evaluated.
@ -95,7 +96,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
>>> y=y[:3],
>>> X_test=X[3:, :],
>>> cv=3,
>>> eval=mean_squared_error)
>>> eval_func=mean_squared_error)
>>> print(pred_oof)
[-101.1123267 , 26.79300693, 17.72635528]
>>> print(pred_test)
@ -121,14 +122,15 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
if logger is None:
logger = getLogger(__name__)
def _predict(model: BaseEstimator, x: pd.DataFrame, predict_proba: bool):
if predict_proba:
def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
if _predict_proba:
return model.predict_proba(x)[:, 1]
else:
return model.predict(x)
oof = np.zeros(len(X_train))
evaluated = np.full(len(X_train), False)
test = None
if X_test is not None:
test = np.zeros((len(X_test), cv.get_n_splits()))
@ -174,8 +176,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
if isinstance(estimator[n], (LGBMModel, CatBoost)):
importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))
if eval is not None:
score = eval(valid_y, oof[valid_idx])
if eval_func is not None:
score = eval_func(valid_y, oof[valid_idx])
scores.append(score)
logger.info('Fold {} score: {}'.format(n, score))
@ -183,8 +185,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
eta_all.append(elapsed)
logger.debug('{:.3f} sec / fold'.format(elapsed))
if eval is not None:
score = eval(y.loc[evaluated], oof[evaluated])
if eval_func is not None:
score = eval_func(y.loc[evaluated], oof[evaluated])
scores.append(score)
logger.info('Overall score: {}'.format(score))
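
A sketch mirroring the test code later in this commit: one estimator per fold and the renamed eval_func keyword, with scores of length nfolds + 1 whose last entry is the overall OOF score. The import path is an assumption, and X_train, y_train, X_test are assumed to be pre-built pandas/numpy objects.

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from nyaggle.validation import cross_validate  # import path is an assumption

models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                          eval_func=roc_auc_score)
assert len(scores) == 5 + 1  # one score per fold plus the overall OOF score at scores[-1]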

View File

@ -10,6 +10,7 @@ def get_long_description():
long_description = f.read()
return long_description
def get_version():
version_filepath = path.join(path.dirname(__file__), 'nyaggle', 'version.py')
with open(version_filepath) as f:
@ -17,6 +18,7 @@ def get_version():
if line.startswith('__version__'):
return line.strip().split()[-1][1:-1]
setup(
name='nyaggle',
packages=['nyaggle'],
@ -50,5 +52,5 @@ setup(
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7'
], # Package (project) classification. Anything listed at https://pypi.org/classifiers/ can be specified.
]
)
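
To illustrate the parsing in get_version() above: the version is the last whitespace-separated token of the __version__ line with its surrounding quotes stripped. A tiny sketch, with a placeholder version number, not the package's real version:

line = "__version__ = '0.0.0'"            # hypothetical contents of nyaggle/version.py
print(line.strip().split()[-1][1:-1])     # -> 0.0.0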

View File

@ -22,11 +22,13 @@ def _check_file_exists(directory, files):
@contextmanager
def _get_temp_directory() -> str:
path = None
try:
path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
yield path
finally:
shutil.rmtree(path, ignore_errors=True)
if path:
shutil.rmtree(path, ignore_errors=True)
def test_experiment_lgb_classifier():
@ -125,7 +127,7 @@ def test_experiment_cat_custom_eval():
with _get_temp_directory() as temp_path:
result = experiment_gbdt(temp_path, params, 'user_id',
X_train, y_train, X_test, gbdt_type='cat', eval=mean_absolute_error)
X_train, y_train, X_test, gbdt_type='cat', eval_func=mean_absolute_error)
assert mean_absolute_error(y_train, result.oof_prediction) == result.scores[-1]
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'scores.txt'))

View File

@ -15,7 +15,7 @@ def test_cv_sklean_binary():
model = RidgeClassifier(alpha=1.0)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)
assert len(scores) == 5 + 1
assert scores[-1] >= 0.85 # overall auc
@ -29,7 +29,7 @@ def test_cv_sklean_regression():
model = Ridge(alpha=1.0)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=r2_score)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)
print(scores)
assert len(scores) == 5 + 1
@ -45,7 +45,7 @@ def test_cv_lgbm():
models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score,
eval_func=roc_auc_score,
fit_params={'early_stopping_rounds': 200})
print(scores)
@ -66,7 +66,7 @@ def test_cv_lgbm_df():
models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score)
eval_func=roc_auc_score)
print(scores)
assert len(scores) == 5 + 1
@ -87,7 +87,7 @@ def test_cv_cat_df():
models = [CatBoostClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score,
eval_func=roc_auc_score,
fit_params={'cat_features': ['cat_0']})
print(scores)
@ -114,7 +114,7 @@ def test_cv_partial_evaluate():
nonlocal n
n += 1
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score,
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score,
nfolds_evaluate=2, on_each_fold=_fold_count)
assert len(scores) == 2 + 1