Merge pull request #81 from wakamezake/features/groupby

Add an aggregation function to make pandas.groupby useful
pull/87/head
nyanp 2020-09-07 12:02:13 +09:00 committed by GitHub
commit e934f34c68
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 154 additions and 0 deletions

View File

@ -9,3 +9,8 @@ nyaggle.feature
.. automodule:: nyaggle.feature.nlp
:members:
:imported-members:
.. automodule:: nyaggle.feature.groupby
:members:
:imported-members:

View File

@ -0,0 +1,112 @@
"""
Modified work:
-----------------------------------------------------------------------------
Copyright (c) 2020 Kota Yuhara (@wakamezake)
-----------------------------------------------------------------------------
Original work of aggregation:
https://github.com/pfnet-research/xfeat/blob/master/xfeat/helper.py
-----------------------------------------------------------------------------
MIT License
Copyright (c) 2020 Preferred Networks, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-----------------------------------------------------------------------------
"""
from types import LambdaType, FunctionType
from typing import List, Callable, Union
import pandas as pd
from pandas.core.common import get_callable_name
def is_lambda_function(obj):
    """Tell whether ``obj`` is an anonymous (``lambda``) function.

    Example:
        >>> import numpy as np
        >>> def custom_function(x): return np.sum(x)
        >>> is_lambda_function(lambda x: np.sum(x))
        True
        >>> is_lambda_function(np.sum)
        False
        >>> is_lambda_function(custom_function)
        False
    """
    # types.LambdaType and types.FunctionType are the same class, so the type
    # test alone cannot separate a lambda from a ``def`` function.
    if not isinstance(obj, FunctionType):
        return False
    # Only the compiler-assigned name identifies a lambda.
    return obj.__name__ == "<lambda>"
def _resolve_agg_method_name(agg_method):
    """Validate one aggregation method and return its column-name label."""
    if isinstance(agg_method, str):
        return agg_method
    # Accept any callable rather than only ``types.FunctionType``: built-ins
    # such as ``max`` and the dispatcher objects newer NumPy uses for
    # ``numpy.sum``/``numpy.mean`` are callables but not FunctionType
    # instances. Classes (e.g. ``int``) stay rejected: calling one builds an
    # object instead of reducing a group.
    if not callable(agg_method) or isinstance(agg_method, type):
        raise ValueError('Supported types are: {} or {}.'
                         ' Got {} instead.'.format(str, Callable,
                                                   type(agg_method)))
    name = get_callable_name(agg_method)
    # Checking the resolved label (not the object) also catches lambdas
    # wrapped in ``functools.partial``; every lambda would yield the same
    # "<lambda>" column label.
    if name == "<lambda>":
        raise ValueError('Not supported lambda function.')
    return name


def aggregation(
        input_df: pd.DataFrame,
        group_key: str,
        group_values: List[str],
        agg_methods: List[Union[str, Callable]],
):
    """Aggregate values after grouping table rows by a given key.

    For every combination of aggregation method and value column, the
    per-group aggregate is computed and left-joined back onto a copy of
    ``input_df`` as a new column named ``agg_{method}_{column}_by_{key}``.

    Args:
        input_df:
            Input data frame. It is not modified; a copy is extended.
        group_key:
            Used to determine the groups for the groupby.
        group_values:
            Used to aggregate values for the groupby.
        agg_methods:
            List of function or function names,
            e.g. ['mean', 'max', 'min', numpy.mean].
            Do not use a lambda function because the name attribute of the
            lambda function cannot generate a unique string of column names
            in <lambda>.

    Returns:
        Tuple of output dataframe and new column names.

    Raises:
        ValueError:
            If an entry of ``agg_methods`` is a lambda, a class, or neither
            a string nor a callable.
    """
    # Materialise once so a one-shot iterable survives validation.
    agg_methods = list(agg_methods)
    # Validate every method up front so a bad entry fails before any
    # groupby work is done.
    method_names = [_resolve_agg_method_name(method) for method in agg_methods]

    new_df = input_df.copy()
    new_cols = []
    for agg_method, agg_method_name in zip(agg_methods, method_names):
        for col in group_values:
            new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)
            df_agg = (
                input_df[[col, group_key]]
                .groupby(group_key)[[col]]
                .agg(agg_method)
            )
            df_agg.columns = [new_col]
            new_cols.append(new_col)
            # df_agg is indexed by the group key, so join the key column of
            # the left frame against the right frame's index.
            new_df = new_df.merge(
                df_agg, how="left", right_index=True, left_on=group_key
            )
    return new_df, new_cols

View File

@ -0,0 +1,37 @@
import numpy as np
import pandas as pd
import pytest
from sklearn import datasets
from nyaggle.feature.groupby import aggregation
@pytest.fixture
def iris_dataframe():
    """Provide the iris data as a frame plus the group key and value columns.

    Returns a tuple ``(df, group_key, group_values)``: the four measurement
    columns followed by a ``species`` column, the name of the grouping
    column, and the list of measurement column names.
    """
    dataset = datasets.load_iris()
    feature_names = ['sl', 'sw', 'pl', 'pw']
    label_name = 'species'
    # Append the target vector as one extra column to the measurements.
    table = np.column_stack([dataset.data, dataset.target])
    frame = pd.DataFrame(table, columns=feature_names + [label_name])
    return frame, label_name, feature_names
def custom_function(x):
    """Named (non-lambda) aggregation helper: return ``np.sum`` of ``x``."""
    total = np.sum(x)
    return total
def test_return_type_by_aggregation(iris_dataframe):
    """aggregation() returns a DataFrame and a list of column names."""
    frame, key, value_cols = iris_dataframe
    # One method given by name, one NumPy function, one user-defined function.
    methods = ["max", np.sum, custom_function]
    result = aggregation(frame, key, value_cols, methods)
    aggregated, created = result
    assert isinstance(aggregated, pd.DataFrame)
    assert isinstance(created, list)
@pytest.mark.parametrize('agg_method', [[int], [lambda x: np.max(x)]])
def test_assert_by_aggregation(iris_dataframe, agg_method):
    """aggregation() raises ValueError for a class or a lambda method."""
    frame, key, value_cols = iris_dataframe
    with pytest.raises(ValueError):
        aggregation(frame, key, value_cols, agg_method)