reformat files
parent
6f64d1dd25
commit
345f1b3746
|
@ -1,5 +1,4 @@
|
|||
from collections import namedtuple
|
||||
from typing import Callable, Iterable, List, Union, Optional, Tuple
|
||||
from typing import Callable, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
@ -105,6 +104,7 @@ def averaging_opt(test_predictions: List[np.ndarray],
|
|||
* score:
|
||||
float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``.
|
||||
"""
|
||||
|
||||
def _minimize(weights):
|
||||
prediction = np.zeros_like(oof_predictions[0])
|
||||
for weight, oof in zip(weights, oof_predictions):
|
||||
|
|
|
@ -94,4 +94,3 @@ def stacking(test_predictions: List[np.ndarray],
|
|||
score = result.scores[-1] if result.scores else None
|
||||
|
||||
return EnsembleResult(result.test_prediction, result.oof_prediction, score)
|
||||
|
||||
|
|
|
@ -56,4 +56,3 @@ def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> T
|
|||
return strain.fillna(fillval), stest.fillna(fillval)
|
||||
else:
|
||||
return strain.astype(str), stest.astype(str)
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import copy
|
||||
from typing import Dict, Iterable, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import optuna.integration.lightgbm as optuna_lgb
|
||||
import pandas as pd
|
||||
import sklearn.utils.multiclass as multiclass
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ from collections import namedtuple
|
|||
from datetime import datetime
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import sklearn.utils.multiclass as multiclass
|
||||
from sklearn.base import BaseEstimator
|
||||
|
|
|
@ -33,6 +33,7 @@ class KFoldEncoderWrapper(BaseFeaturizer):
|
|||
If True, `transform` and `fit_transform` return the same type as X.
|
||||
If False, these APIs always return a numpy array, similar to sklearn's API.
|
||||
"""
|
||||
|
||||
def __init__(self, base_transformer: BaseEstimator,
|
||||
cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, return_same_type: bool = True,
|
||||
groups: Optional[pd.Series] = None):
|
||||
|
@ -168,6 +169,7 @@ class TargetEncoder(KFoldEncoderWrapper):
|
|||
If True, ``transform`` and ``fit_transform`` return the same type as X.
|
||||
If False, these APIs always return a numpy array, similar to sklearn's API.
|
||||
"""
|
||||
|
||||
def __init__(self, cv: Optional[Union[Iterable, BaseCrossValidator]] = None,
|
||||
groups: Optional[pd.Series] = None,
|
||||
cols: List[str] = None,
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import functools
|
||||
import os
|
||||
import pyarrow
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
|
@ -171,6 +171,7 @@ def cached_feature(feature_name: Union[int, str], directory: str = './features/'
|
|||
"called"
|
||||
>>> x = make_feature_x(...) # load from file in the second time
|
||||
"""
|
||||
|
||||
def _decorator(fun):
|
||||
@functools.wraps(fun)
|
||||
def _decorated_fun(*args, **kwargs):
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from typing import Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
|
||||
|
||||
def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100,
|
||||
def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n: int = 100,
|
||||
figsize: Optional[Tuple[int, int]] = None,
|
||||
title: Optional[str] = None):
|
||||
"""
|
||||
|
@ -37,9 +37,9 @@ def plot_importance(importance: pd.DataFrame, path: Optional[str] = None, top_n:
|
|||
>>> })
|
||||
>>> plot_importance(importance, 'importance.png')
|
||||
"""
|
||||
importance = importance.groupby('feature')['importance']\
|
||||
.mean()\
|
||||
.reset_index()\
|
||||
importance = importance.groupby('feature')['importance'] \
|
||||
.mean() \
|
||||
.reset_index() \
|
||||
.sort_values(by='importance', ascending=False)
|
||||
|
||||
if len(importance) > top_n:
|
||||
|
|
|
@ -19,8 +19,8 @@ def adversarial_validate(X_train: pd.DataFrame,
|
|||
X_test: pd.DataFrame,
|
||||
importance_type: str = 'gain',
|
||||
estimator: Optional[BaseEstimator] = None,
|
||||
cat_cols = None,
|
||||
cv = None) -> ADVResult:
|
||||
cat_cols=None,
|
||||
cv=None) -> ADVResult:
|
||||
"""
|
||||
Perform adversarial validation between X_train and X_test.
|
||||
|
||||
|
@ -63,7 +63,7 @@ def adversarial_validate(X_train: pd.DataFrame,
|
|||
col_9 170.6438643
|
||||
"""
|
||||
concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True)
|
||||
y = np.array([1]*len(X_train) + [0]*len(X_test))
|
||||
y = np.array([1] * len(X_train) + [0] * len(X_test))
|
||||
|
||||
if estimator is None:
|
||||
requires_lightgbm()
|
||||
|
|
|
@ -11,10 +11,10 @@ import sklearn.utils.multiclass as multiclass
|
|||
from category_encoders.utils import convert_input, convert_input_vector
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.model_selection import BaseCrossValidator
|
||||
|
||||
from nyaggle.util.traits import is_gbdt_instance
|
||||
from nyaggle.validation.split import check_cv
|
||||
|
||||
|
||||
CVResult = namedtuple('CVResult', ['oof_prediction', 'test_prediction', 'scores', 'importance'])
|
||||
|
||||
|
||||
|
|
|
@ -70,6 +70,7 @@ class Take(BaseCrossValidator):
|
|||
>>> folds.get_n_splits()
|
||||
3
|
||||
"""
|
||||
|
||||
def __init__(self, n: int, base_validator: BaseCrossValidator):
|
||||
self.base_validator = base_validator
|
||||
self.n = n
|
||||
|
@ -118,6 +119,7 @@ class Skip(BaseCrossValidator):
|
|||
>>> folds.get_n_splits()
|
||||
2
|
||||
"""
|
||||
|
||||
def __init__(self, n: int, base_validator: BaseCrossValidator):
|
||||
self.base_validator = base_validator
|
||||
self.n = n
|
||||
|
@ -156,9 +158,10 @@ class Nth(BaseCrossValidator):
|
|||
>>> folds.get_n_splits()
|
||||
1
|
||||
"""
|
||||
|
||||
def __init__(self, n: int, base_validator: BaseCrossValidator):
|
||||
assert n > 0, "n is 1-origin and should be greater than 0"
|
||||
self.base_validator = Take(1, Skip(n-1, base_validator))
|
||||
self.base_validator = Take(1, Skip(n - 1, base_validator))
|
||||
self.n = n
|
||||
|
||||
def get_n_splits(self, X=None, y=None, groups=None):
|
||||
|
@ -245,7 +248,8 @@ class TimeSeriesSplit(BaseCrossValidator):
|
|||
assert train_interval[1], "train_interval[1] should not be None"
|
||||
assert test_interval[0], "test_interval[0] should not be None"
|
||||
|
||||
assert (not train_interval[0]) or (train_interval[0] <= train_interval[1]), "train_interval[0] < train_interval[1]"
|
||||
assert (not train_interval[0]) or (
|
||||
train_interval[0] <= train_interval[1]), "train_interval[0] < train_interval[1]"
|
||||
assert (not test_interval[1]) or (test_interval[0] <= test_interval[1]), "test_interval[0] < test_interval[1]"
|
||||
|
||||
self.times.append((train_interval, test_interval))
|
||||
|
@ -440,7 +444,7 @@ class StratifiedGroupKFold(_BaseKFold):
|
|||
else:
|
||||
freq_loss = freq_loss / denom
|
||||
losses = ratio_loss + freq_loss
|
||||
#-------
|
||||
# -------
|
||||
splitx = np.argmin(losses)
|
||||
split_freq[splitx] = cand_freq[splitx]
|
||||
split_ratios[splitx] = cand_ratio[splitx]
|
||||
|
|
3
setup.py
3
setup.py
|
@ -1,7 +1,8 @@
|
|||
from setuptools import find_packages, setup
|
||||
from codecs import open
|
||||
from os import path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
def get_long_description():
|
||||
here = path.abspath(path.dirname(__file__))
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
import numpy as np
|
||||
import scipy.stats as stats
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||
from sklearn.linear_model import Ridge, LogisticRegression
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.metrics import roc_auc_score, mean_squared_error
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from nyaggle.testing import make_classification_df, make_regression_df
|
||||
from nyaggle.ensemble import averaging, averaging_opt
|
||||
from nyaggle.testing import make_classification_df, make_regression_df
|
||||
from nyaggle.validation import cross_validate
|
||||
|
||||
|
||||
|
@ -40,7 +39,7 @@ def test_averaging():
|
|||
|
||||
result = averaging(test)
|
||||
|
||||
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
|
||||
assert result.score is None
|
||||
assert result.oof_prediction is None
|
||||
|
||||
|
@ -53,8 +52,8 @@ def test_averaging_with_oof():
|
|||
|
||||
result = averaging(test, oof, y_train)
|
||||
|
||||
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
|
||||
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
@ -66,8 +65,8 @@ def test_averaging_regression():
|
|||
|
||||
result = averaging(test, oof, y_train)
|
||||
|
||||
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
|
||||
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
@ -79,8 +78,8 @@ def test_averaging_multiclass():
|
|||
|
||||
result = averaging(test, oof, y_train)
|
||||
|
||||
assert_array_almost_equal((test[0]+test[1]+test[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0]+oof[1]+oof[2])/3, result.oof_prediction)
|
||||
assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
|
||||
assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
@ -103,8 +102,8 @@ def test_weight_averaging():
|
|||
|
||||
result = averaging(test, oof, y_train, weights=[0.2, 0.4, 0.3])
|
||||
|
||||
assert_array_almost_equal(0.2*test[0]+0.4*test[1]+0.3*test[2], result.test_prediction)
|
||||
assert_array_almost_equal(0.2*oof[0]+0.4*oof[1]+0.3*oof[2], result.oof_prediction)
|
||||
assert_array_almost_equal(0.2 * test[0] + 0.4 * test[1] + 0.3 * test[2], result.test_prediction)
|
||||
assert_array_almost_equal(0.2 * oof[0] + 0.4 * oof[1] + 0.3 * oof[2], result.oof_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
@ -118,7 +117,7 @@ def test_rank_averaging():
|
|||
|
||||
test_rank = [stats.rankdata(t) / len(X_test) for t in test]
|
||||
|
||||
assert_array_almost_equal((test_rank[0]+test_rank[1]+test_rank[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
@ -133,8 +132,8 @@ def test_rank_averaging_with_oof():
|
|||
oof_rank = [stats.rankdata(o) / len(X_train) for o in oof]
|
||||
test_rank = [stats.rankdata(t) / len(X_test) for t in test]
|
||||
|
||||
assert_array_almost_equal((test_rank[0]+test_rank[1]+test_rank[2])/3, result.test_prediction)
|
||||
assert_array_almost_equal((oof_rank[0]+oof_rank[1]+oof_rank[2])/3, result.oof_prediction)
|
||||
assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction)
|
||||
assert_array_almost_equal((oof_rank[0] + oof_rank[1] + oof_rank[2]) / 3, result.oof_prediction)
|
||||
assert result.score is None
|
||||
|
||||
|
||||
|
|
|
@ -1,15 +1,12 @@
|
|||
import numpy as np
|
||||
import scipy.stats as stats
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||
from sklearn.linear_model import Ridge, LogisticRegression
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.metrics import roc_auc_score, mean_squared_error
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from nyaggle.testing import make_classification_df, make_regression_df
|
||||
from nyaggle.ensemble import stacking
|
||||
from nyaggle.testing import make_classification_df
|
||||
from nyaggle.validation import cross_validate
|
||||
|
||||
|
||||
|
@ -43,5 +40,3 @@ def test_stacking():
|
|||
result = stacking(test, oof, y_train, eval_func=roc_auc_score)
|
||||
|
||||
assert roc_auc_score(y_train, result.oof_prediction) > worst_base_roc
|
||||
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ def test_target_encoder_fit_transform():
|
|||
})
|
||||
|
||||
X = pd.concat([X_train, X_test])
|
||||
y = pd.concat([y_train, pd.Series([None]*4)]).astype(float)
|
||||
y = pd.concat([y_train, pd.Series([None] * 4)]).astype(float)
|
||||
|
||||
ce1 = TargetEncoder(cols=['x'])
|
||||
ce1.fit(X_train, y_train)
|
||||
|
|
|
@ -89,7 +89,7 @@ def test_bert_en_svd_multicol():
|
|||
ret = bert.fit_transform(X)
|
||||
|
||||
assert ret.shape[0] == 6
|
||||
assert ret.shape[1] == 2*768 + 1
|
||||
assert ret.shape[1] == 2 * 768 + 1
|
||||
|
||||
ret.drop('id', axis=1, inplace=True)
|
||||
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values, decimal=3)
|
||||
|
@ -112,5 +112,3 @@ def test_bert_jp():
|
|||
ret.drop('id', axis=1, inplace=True)
|
||||
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[4, :].values)
|
||||
npt.assert_almost_equal(ret.iloc[0, :].values, ret.iloc[5, :].values)
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
import os
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pandas.testing import assert_frame_equal
|
||||
|
||||
import nyaggle.feature_store as fs
|
||||
|
||||
from nyaggle.testing import get_temp_directory
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue