fix code style by code inspection

pull/12/head
nyanp 2020-01-13 23:47:45 +09:00
parent 18606d7251
commit 9b314cf655
12 changed files with 53 additions and 48 deletions

View File

@ -51,11 +51,6 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

View File

@ -45,6 +45,6 @@ experiment_gbdt(logging_directory='baseline_kaggledays_tokyo',
X_train=X_train,
y=y_train,
X_test=X_test,
eval=mean_squared_error,
eval_func=mean_squared_error,
type_of_target='continuous',
overwrite=True)

View File

@ -190,24 +190,25 @@ class Experiment(object):
import mlflow
mlflow.log_artifact(path + '.npy')
def log_dataframe(self, name: str, df: pd.DataFrame, format: str = 'feather'):
def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'):
"""
Log a pandas dataframe under the logging directory.
Args:
name:
Name of the file. A .f or .csv extension will be appended to the file name if it does not already have one.
Name of the file. A .f or .csv extension will be appended to the file name if it does not already
have one.
df:
A dataframe to be saved.
format:
file_format:
A format of output file. ``csv`` and ``feather`` are supported.
"""
path = os.path.join(self.logging_directory, name)
if format == 'feather':
if file_format == 'feather':
if not path.endswith('.f'):
path += '.f'
df.to_feather(path)
elif format == 'csv':
elif file_format == 'csv':
if not path.endswith('.csv'):
path += '.csv'
df.to_csv(path, index=False)
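
For reference, a minimal usage sketch of the renamed keyword (not part of the commit). The import path and the Experiment constructor call are assumptions; only logging_directory and the log_dataframe signature are visible in this hunk.

import pandas as pd
from nyaggle.experiment import Experiment  # import path is an assumption

df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
exp = Experiment(logging_directory='logs/example')  # constructor arguments are an assumption
exp.log_dataframe('train_sample', df, file_format='feather')  # written as train_sample.f
exp.log_dataframe('train_sample', df, file_format='csv')      # written as train_sample.csv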

View File

@ -22,7 +22,7 @@ GBDTResult = namedtuple('LGBResult', ['oof_prediction', 'test_prediction', 'scor
def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_column: str,
X_train: pd.DataFrame, y: pd.Series,
X_test: Optional[pd.DataFrame] = None,
eval: Optional[Callable] = None,
eval_func: Optional[Callable] = None,
gbdt_type: str = 'lgbm',
fit_params: Optional[Dict[str, Any]] = None,
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
@ -78,7 +78,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
Target
X_test:
Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
eval:
eval_func:
Function used for logging and calculation of returning scores.
This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
gbdt_type:
@ -122,7 +122,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
numpy array, shape (len(X_test),) Predicted value on test data. ``None`` if X_test is ``None``
* scores:
list of float, shape(nfolds+1) ``scores[i]`` denotes validation score in i-th fold.
``scores[-1]`` is overall score. `None` if eval is not specified
``scores[-1]`` is overall score. `None` if eval_func is not specified
* models:
list of objects, shape(nfolds) Trained models for each folds.
* importance:
@ -155,7 +155,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
if type_of_target == 'auto':
type_of_target = multiclass.type_of_target(y)
model, eval, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval)
model, eval_func, cat_param_name = _dispatch_gbdt(gbdt_type, type_of_target, eval_func)
models = [model(**model_params) for _ in range(cv.get_n_splits())]
if fit_params is None:
@ -164,7 +164,7 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
fit_params[cat_param_name] = categorical_feature
result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
logger=exp.get_logger(), eval=eval, fit_params=fit_params)
logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params)
for i in range(cv.get_n_splits()):
exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
@ -192,7 +192,8 @@ def experiment_gbdt(logging_directory: str, model_params: Dict[str, Any], id_col
elapsed_time = time.time() - start_time
return GBDTResult(result.oof_prediction, result.test_prediction, result.scores, models, importance, elapsed_time)
return GBDTResult(result.oof_prediction, result.test_prediction,
result.scores, models, importance, elapsed_time)
def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Callable] = None):
@ -206,11 +207,11 @@ def _dispatch_gbdt(gbdt_type: str, target_type: str, custom_eval: Optional[Calla
if found is None:
raise RuntimeError('Not supported gbdt_type ({}) or type_of_target ({}).'.format(gbdt_type, target_type))
model, eval, cat_param = found[2], found[3], found[4]
model, eval_func, cat_param = found[2], found[3], found[4]
if custom_eval is not None:
eval = custom_eval
eval_func = custom_eval
return model, eval, cat_param
return model, eval_func, cat_param
def _save_model(gbdt_type: str, model: Union[CatBoost, LGBMModel], logging_directory: str, fold: int):
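
A hedged sketch of the renamed eval_func parameter in a full experiment_gbdt call, mirroring the kaggledays example earlier in this commit. The import path, model_params, and the id_column value are placeholders/assumptions, and X_train, y_train, X_test are assumed to be pre-built pandas objects.

from sklearn.metrics import mean_squared_error
from nyaggle.experiment import experiment_gbdt  # import path is an assumption

# model_params is a placeholder LightGBM config and 'id' a hypothetical id column.
model_params = {'n_estimators': 1000, 'learning_rate': 0.05}

result = experiment_gbdt(logging_directory='baseline_example',
                         model_params=model_params,
                         id_column='id',
                         X_train=X_train,
                         y=y_train,
                         X_test=X_test,
                         eval_func=mean_squared_error,  # renamed from eval in this commit
                         type_of_target='continuous',
                         overwrite=True)

print(result.scores[-1])             # overall score computed with eval_func on the OOF prediction
submission = result.test_prediction  # ensemble prediction on X_test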

View File

@ -44,7 +44,7 @@ class BertSentenceVectorizer(BaseFeaturizer):
def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
model = None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
if tokenizer is not None:
assert model is not None
self.tokenizer = tokenizer
@ -173,4 +173,3 @@ class BertSentenceVectorizer(BaseFeaturizer):
Ignored
"""
return self._process(X, self._fit_transform_one)
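
A hedged usage sketch for the reformatted constructor. The import path and the fit_transform entry point are assumptions inferred from the _fit_transform_one call at the end of this hunk; transformers and torch must be installed.

import pandas as pd
from nyaggle.feature.nlp import BertSentenceVectorizer  # import path is an assumption

df = pd.DataFrame({'title': ['a cat sat on the mat', 'dogs are great'],
                   'price': [10, 20]})
bv = BertSentenceVectorizer(text_columns=['title'], pooling_strategy='reduce_mean',
                            use_cuda=False)
df_vec = bv.fit_transform(df)  # 'title' is expected to become embedding columns named title_0, title_1, ...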

View File

@ -1 +1 @@
from nyaggle.testing.util import *
from nyaggle.testing.util import *

View File

@ -36,10 +36,13 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
>>> })
>>> plot_importance(importance, 'importance.png')
"""
sorted = importance.groupby('feature')['importance'].mean().reset_index().sort_values(by='importance', ascending=False)
importance = importance.groupby('feature')['importance']\
.mean()\
.reset_index()\
.sort_values(by='importance', ascending=False)
if len(sorted) > top_n:
sorted = sorted.iloc[:top_n, :]
if len(importance) > top_n:
importance = importance.iloc[:top_n, :]
if figsize is None:
figsize = (10, 16)
@ -48,7 +51,7 @@ def plot_importance(importance: pd.DataFrame, path: str, top_n: int = 100, figsi
title = 'Feature Importance'
plt.figure(figsize=figsize)
sns.barplot(x="importance", y="feature", data=sorted)
sns.barplot(x="importance", y="feature", data=importance)
plt.title(title)
plt.tight_layout()
plt.savefig(path)
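
A sketch of the input plot_importance expects, consistent with the groupby('feature')['importance'].mean() logic above; the import path is an assumption.

import pandas as pd
from nyaggle.util import plot_importance  # import path is an assumption

# One row per (fold, feature); the function averages per feature, sorts, and
# keeps the top_n rows before drawing the seaborn bar plot saved to path.
importance = pd.DataFrame({
    'feature': ['age', 'income', 'age', 'income'],
    'importance': [120.0, 80.0, 100.0, 90.0]
})
plot_importance(importance, 'importance.png', top_n=50)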

View File

@ -1,5 +1,5 @@
from collections import namedtuple
from typing import Dict, Optional
from typing import Optional
import numpy as np
import pandas as pd
@ -76,7 +76,7 @@ def adversarial_validate(X_train: pd.DataFrame,
else:
nfolds_evaluate = None
result = cross_validate(estimator, concat, y, None, cv=5, predict_proba=True,
eval=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
eval_func=roc_auc_score, fit_params={'verbose': -1}, importance_type=importance_type,
nfolds_evaluate=nfolds_evaluate)
importance = pd.concat(result.importance)
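
A heavily hedged sketch of how adversarial_validate is typically called. Only X_train appears in the visible signature, so the X_test argument, the import path, and the shape of the return value are assumptions based on the roc_auc_score and importance handling in this hunk.

from nyaggle.validation import adversarial_validate  # import path is an assumption

result = adversarial_validate(X_train, X_test)  # the X_test argument is an assumption
# result is expected to carry the adversarial AUC (roc_auc_score above) and the
# concatenated per-fold feature importance; an AUC near 0.5 means train and test
# are hard to distinguish, while drifting features rise to the top of the importance table.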

View File

@ -22,7 +22,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
X_test: Union[pd.DataFrame, np.ndarray] = None,
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
groups: Optional[pd.Series] = None,
predict_proba: bool = False, eval: Optional[Callable] = None, logger: Optional[Logger] = None,
predict_proba: bool = False, eval_func: Optional[Callable] = None, logger: Optional[Logger] = None,
on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
fit_params: Optional[Dict] = None,
importance_type: str = 'gain',
@ -51,7 +51,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
Group labels for the samples. Only used in conjunction with a Group cv instance (e.g., ``GroupKFold``).
predict_proba:
If true, call ``predict_proba`` instead of ``predict`` for calculating prediction for test data.
eval:
eval_func:
Function used for logging and returning scores
logger:
logger
@ -60,7 +60,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
fit_params:
Parameters passed to the fit method of the estimator
importance_type:
The type of feature importance to be used to calculate result. Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
The type of feature importance to be used to calculate result.
Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
nfolds_evaluate:
If not ``None``, and ``nfolds_evaluate`` < ``nfolds``, only ``nfolds_evaluate`` folds are evaluated.
For example, if ``nfolds = 5`` and ``nfolds_evaluate = 2``, only the first 2 folds out of 5 are evaluated.
@ -95,7 +96,7 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
>>> y=y[:3],
>>> X_test=X[3:, :],
>>> cv=3,
>>> eval=mean_squared_error)
>>> eval_func=mean_squared_error)
>>> print(pred_oof)
[-101.1123267 , 26.79300693, 17.72635528]
>>> print(pred_test)
@ -121,14 +122,15 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
if logger is None:
logger = getLogger(__name__)
def _predict(model: BaseEstimator, x: pd.DataFrame, predict_proba: bool):
if predict_proba:
def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
if _predict_proba:
return model.predict_proba(x)[:, 1]
else:
return model.predict(x)
oof = np.zeros(len(X_train))
evaluated = np.full(len(X_train), False)
test = None
if X_test is not None:
test = np.zeros((len(X_test), cv.get_n_splits()))
@ -174,8 +176,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
if isinstance(estimator[n], (LGBMModel, CatBoost)):
importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))
if eval is not None:
score = eval(valid_y, oof[valid_idx])
if eval_func is not None:
score = eval_func(valid_y, oof[valid_idx])
scores.append(score)
logger.info('Fold {} score: {}'.format(n, score))
@ -183,8 +185,8 @@ def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
eta_all.append(elapsed)
logger.debug('{:.3f} sec / fold'.format(elapsed))
if eval is not None:
score = eval(y.loc[evaluated], oof[evaluated])
if eval_func is not None:
score = eval_func(y.loc[evaluated], oof[evaluated])
scores.append(score)
logger.info('Overall score: {}'.format(score))
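
A sketch mirroring the test code later in this commit: one estimator per fold and the renamed eval_func keyword, with scores of length nfolds + 1 whose last entry is the overall OOF score. The import path is an assumption, and X_train, y_train, X_test are assumed to be pre-built pandas/numpy objects.

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from nyaggle.validation import cross_validate  # import path is an assumption

models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                          eval_func=roc_auc_score)
assert len(scores) == 5 + 1  # one score per fold plus the overall OOF score at scores[-1]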

View File

@ -10,6 +10,7 @@ def get_long_description():
long_description = f.read()
return long_description
def get_version():
version_filepath = path.join(path.dirname(__file__), 'nyaggle', 'version.py')
with open(version_filepath) as f:
@ -17,6 +18,7 @@ def get_version():
if line.startswith('__version__'):
return line.strip().split()[-1][1:-1]
setup(
name='nyaggle',
packages=['nyaggle'],
@ -50,5 +52,5 @@ setup(
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7'
], # Package (project) classification. Anything listed at https://pypi.org/classifiers/ can be specified.
]
)
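
To illustrate the parsing in get_version() above: the version is the last whitespace-separated token of the __version__ line with its surrounding quotes stripped. A tiny sketch, with a placeholder version number, not the package's real version:

line = "__version__ = '0.0.0'"            # hypothetical contents of nyaggle/version.py
print(line.strip().split()[-1][1:-1])     # -> 0.0.0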

View File

@ -22,11 +22,13 @@ def _check_file_exists(directory, files):
@contextmanager
def _get_temp_directory() -> str:
path = None
try:
path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
yield path
finally:
shutil.rmtree(path, ignore_errors=True)
if path:
shutil.rmtree(path, ignore_errors=True)
def test_experiment_lgb_classifier():
@ -125,7 +127,7 @@ def test_experiment_cat_custom_eval():
with _get_temp_directory() as temp_path:
result = experiment_gbdt(temp_path, params, 'user_id',
X_train, y_train, X_test, gbdt_type='cat', eval=mean_absolute_error)
X_train, y_train, X_test, gbdt_type='cat', eval_func=mean_absolute_error)
assert mean_absolute_error(y_train, result.oof_prediction) == result.scores[-1]
_check_file_exists(temp_path, ('submission.csv', 'oof_prediction.npy', 'test_prediction.npy', 'scores.txt'))

View File

@ -15,7 +15,7 @@ def test_cv_sklean_binary():
model = RidgeClassifier(alpha=1.0)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)
assert len(scores) == 5 + 1
assert scores[-1] >= 0.85 # overall auc
@ -29,7 +29,7 @@ def test_cv_sklean_regression():
model = Ridge(alpha=1.0)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=r2_score)
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)
print(scores)
assert len(scores) == 5 + 1
@ -45,7 +45,7 @@ def test_cv_lgbm():
models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score,
eval_func=roc_auc_score,
fit_params={'early_stopping_rounds': 200})
print(scores)
@ -66,7 +66,7 @@ def test_cv_lgbm_df():
models = [LGBMClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score)
eval_func=roc_auc_score)
print(scores)
assert len(scores) == 5 + 1
@ -87,7 +87,7 @@ def test_cv_cat_df():
models = [CatBoostClassifier(n_estimators=300) for _ in range(5)]
pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
eval=roc_auc_score,
eval_func=roc_auc_score,
fit_params={'cat_features': ['cat_0']})
print(scores)
@ -114,7 +114,7 @@ def test_cv_partial_evaluate():
nonlocal n
n += 1
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval=roc_auc_score,
pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score,
nfolds_evaluate=2, on_each_fold=_fold_count)
assert len(scores) == 2 + 1