reformat files

pull/72/head
nyanp 2020-05-18 23:04:03 +09:00
parent 6f64d1dd25
commit 345f1b3746
18 changed files with 50 additions and 53 deletions

View File

@ -1,5 +1,4 @@
from collections import namedtuple
from typing import Callable, Iterable, List, Union, Optional, Tuple
from typing import Callable, List, Optional, Tuple
import numpy as np
import pandas as pd
@ -105,6 +104,7 @@ def averaging_opt(test_predictions: List[np.ndarray],
* score:
float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``.
"""
def _minimize(weights):
prediction = np.zeros_like(oof_predictions[0])
for weight, oof in zip(weights, oof_predictions):

View File

@ -94,4 +94,3 @@ def stacking(test_predictions: List[np.ndarray],
score = result.scores[-1] if result.scores else None
return EnsembleResult(result.test_prediction, result.oof_prediction, score)

View File

@ -56,4 +56,3 @@ def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> T
return strain.fillna(fillval), stest.fillna(fillval)
else:
return strain.astype(str), stest.astype(str)

View File

@ -1,8 +1,8 @@
import copy
from typing import Dict, Iterable, Optional, Union
import pandas as pd
import optuna.integration.lightgbm as optuna_lgb
import pandas as pd
import sklearn.utils.multiclass as multiclass
from sklearn.model_selection import BaseCrossValidator

View File

@ -5,7 +5,6 @@ from collections import namedtuple
from datetime import datetime
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
import numpy as np
import pandas as pd
import sklearn.utils.multiclass as multiclass
from sklearn.base import BaseEstimator

View File

@ -33,6 +33,7 @@ class KFoldEncoderWrapper(BaseFeaturizer):
If True, `transform` and `fit_transform` return the same type as X.
If False, these APIs always return a numpy array, similar to sklearn's API.
"""
def __init__(self, base_transformer: BaseEstimator,
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, return_same_type: bool = True,
groups: Optional[pd.Series] = None):
@ -168,6 +169,7 @@ class TargetEncoder(KFoldEncoderWrapper):
If True, ``transform`` and ``fit_transform`` return the same type as X.
If False, these APIs always return a numpy array, similar to sklearn's API.
"""
def __init__(self, cv: Optional[Union[Iterable, BaseCrossValidator]] = None,
groups: Optional[pd.Series] = None,
cols: List[str] = None,

View File

@ -1,10 +1,10 @@
import functools
import os
import pyarrow
import warnings
from typing import List, Optional, Union
import pandas as pd
import pyarrow
from tqdm import tqdm
@ -171,6 +171,7 @@ def cached_feature(feature_name: Union[int, str], directory: str = './features/'
"called"
>>> x = make_feature_x(...) # load from file in the second time
"""
def _decorator(fun):
@functools.wraps(fun)
def _decorated_fun(*args, **kwargs):

View File

@ -1,11 +1,11 @@
from typing import Optional, Tuple
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100,
def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100,
figsize: Optional[Tuple[int, int]] = None,
title: Optional[str] = None):
"""
@ -37,9 +37,9 @@ def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n:
>>> })
>>> plot_importance(importance, 'importance.png')
"""
importance = importance.groupby('feature')['importance']\
.mean()\
.reset_index()\
importance = importance.groupby('feature')['importance'] \
.mean() \
.reset_index() \
.sort_values(by='importance', ascending=False)
if len(importance) > top_n:

View File

@ -19,8 +19,8 @@ def adversarial_validate(X_train: pd.DataFrame,
X_test: pd.DataFrame,
importance_type: str = 'gain',
estimator: Optional[BaseEstimator] = None,
cat_cols = None,
cv = None) -> ADVResult:
cat_cols=None,
cv=None) -> ADVResult:
"""
Perform adversarial validation between X_train and X_test.
@ -63,7 +63,7 @@ def adversarial_validate(X_train: pd.DataFrame,
col_9 170.6438643
"""
concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True)
y = np.array([1]*len(X_train) + [0]*len(X_test))
y = np.array([1] * len(X_train) + [0] * len(X_test))
if estimator is None:
requires_lightgbm()

View File

@ -11,10 +11,10 @@ import sklearn.utils.multiclass as multiclass
from category_encoders.utils import convert_input, convert_input_vector
from sklearn.base import BaseEstimator
from sklearn.model_selection import BaseCrossValidator
from nyaggle.util.traits import is_gbdt_instance
from nyaggle.validation.split import check_cv
CVResult = namedtuple('CVResult', ['oof_prediction', 'test_prediction', 'scores', 'importance'])

View File

@ -70,6 +70,7 @@ class Take(BaseCrossValidator):
>>> folds.get_n_splits()
3
"""
def __init__(self, n: int, base_validator: BaseCrossValidator):
self.base_validator = base_validator
self.n = n
@ -118,6 +119,7 @@ class Skip(BaseCrossValidator):
>>> folds.get_n_splits()
2
"""
def __init__(self, n: int, base_validator: BaseCrossValidator):
self.base_validator = base_validator
self.n = n
@ -156,9 +158,10 @@ class Nth(BaseCrossValidator):
>>> folds.get_n_splits()
1
"""
def __init__(self, n: int, base_validator: BaseCrossValidator):
assert n > 0, "n is 1-origin and should be greater than 0"
self.base_validator = Take(1, Skip(n-1, base_validator))
self.base_validator = Take(1, Skip(n - 1, base_validator))
self.n = n
def get_n_splits(self, X=None, y=None, groups=None):
@ -245,7 +248,8 @@ class TimeSeriesSplit(BaseCrossValidator):
assert train_interval[1], "train_interval[1] should not be None"
assert test_interval[0], "test_interval[0] should not be None"
assert (not train_interval[0]) or (train_interval[0] <= train_interval[1]), "train_interval[0] < train_interval[1]"
assert (not train_interval[0]) or (
train_interval[0] <= train_interval[1]), "train_interval[0] < train_interval[1]"
assert (not test_interval[1]) or (test_interval[0] <= test_interval[1]), "test_interval[0] < test_interval[1]"
self.times.append((train_interval, test_interval))
@ -440,7 +444,7 @@ class StratifiedGroupKFold(_BaseKFold):
else:
freq_loss = freq_loss / denom
losses = ratio_loss + freq_loss
#-------
# -------
splitx = np.argmin(losses)
split_freq[splitx] = cand_freq[splitx]
split_ratios[splitx] = cand_ratio[splitx]

View File

@ -1,7 +1,8 @@
from setuptools import find_packages, setup
from codecs import open
from os import path
from setuptools import find_packages, setup
def get_long_description():
here = path.abspath(path.dirname(__file__))

View File

@ -1,7 +1,8 @@
import os
import tempfile
import shutil
import tempfile
import uuid
import pytest

View File

@ -1,15 +1,14 @@
import numpy as np
import scipy.stats as stats
from numpy.testing import assert_array_almost_equal
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.utils.multiclass import type_of_target
from sklearn.svm import SVC, SVR
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.utils.multiclass import type_of_target
from nyaggle.testing import make_classification_df, make_regression_df
from nyaggle.ensemble import averaging, averaging_opt
from nyaggle.testing import make_classification_df, make_regression_df
from nyaggle.validation import cross_validate
@ -40,7 +39,7 @@ def test_averaging():
result = averaging(test)
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
assert result.score is None
assert result.oof_prediction is None
@ -53,8 +52,8 @@ def test_averaging_with_oof():
result = averaging(test, oof, y_train)
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
assert result.score is None
@ -66,8 +65,8 @@ def test_averaging_regression():
result = averaging(test, oof, y_train)
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
assert result.score is None
@ -79,8 +78,8 @@ def test_averaging_multiclass():
result = averaging(test, oof, y_train)
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
assert result.score is None
@ -103,8 +102,8 @@ def test_weight_averaging():
result = averaging(test, oof, y_train, weights=[0.2, 0.4, 0.3])
assert_array_almost_equal(0.2*test[0]+0.4*test[1]+0.3*test[2], result.test_prediction)
assert_array_almost_equal(0.2*oof[0]+0.4*oof[1]+0.3*oof[2], result.oof_prediction)
assert_array_almost_equal(0.2 * test[0] + 0.4 * test[1] + 0.3 * test[2], result.test_prediction)
assert_array_almost_equal(0.2 * oof[0] + 0.4 * oof[1] + 0.3 * oof[2], result.oof_prediction)
assert result.score is None
@ -118,7 +117,7 @@ def test_rank_averaging():
test_rank = [stats.rankdata(t) / len(X_test) for t in test]
assert_array_almost_equal((test_rank[0]+test_rank[1]+test_rank[2])/3, result.test_prediction)
assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction)
assert result.score is None
@ -133,8 +132,8 @@ def test_rank_averaging_with_oof():
oof_rank = [stats.rankdata(o) / len(X_train) for o in oof]
test_rank = [stats.rankdata(t) / len(X_test) for t in test]
assert_array_almost_equal((test_rank[0]+test_rank[1]+test_rank[2])/3, result.test_prediction)
assert_array_almost_equal((oof_rank[0]+oof_rank[1]+oof_rank[2])/3, result.oof_prediction)
assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction)
assert_array_almost_equal((oof_rank[0] + oof_rank[1] + oof_rank[2]) / 3, result.oof_prediction)
assert result.score is None

View File

@ -1,15 +1,12 @@
import numpy as np
import scipy.stats as stats
from numpy.testing import assert_array_almost_equal
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.utils.multiclass import type_of_target
from nyaggle.testing import make_classification_df, make_regression_df
from nyaggle.ensemble import stacking
from nyaggle.testing import make_classification_df
from nyaggle.validation import cross_validate
@ -43,5 +40,3 @@ def test_stacking():
result = stacking(test, oof, y_train, eval_func=roc_auc_score)
assert roc_auc_score(y_train, result.oof_prediction) > worst_base_roc

View File

@ -61,7 +61,7 @@ def test_target_encoder_fit_transform():
})
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, pd.Series([None]*4)]).astype(float)
y = pd.concat([y_train, pd.Series([None] * 4)]).astype(float)
ce1 = TargetEncoder(cols=['x'])
ce1.fit(X_train, y_train)

View File

@ -89,7 +89,7 @@ def test_bert_en_svd_multicol():
ret = bert.fit_transform(X)
assert ret.shape[0] == 6
assert ret.shape[1] == 2*768 + 1
assert ret.shape[1] == 2 * 768 + 1
ret.drop('id', axis=1, inplace=True)
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
@ -112,5 +112,3 @@ def test_bert_jp():
ret.drop('id', axis=1, inplace=True)
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)

View File

@ -1,12 +1,11 @@
import os
import pytest
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
import nyaggle.feature_store as fs
from nyaggle.testing import get_temp_directory