Merge pull request #81 from wakamezake/features/groupby
Add an aggregation function to make pandas.groupby useful
pull/87/head
commit
e934f34c68
|
@ -9,3 +9,8 @@ nyaggle.feature
|
|||
.. automodule:: nyaggle.feature.nlp
|
||||
:members:
|
||||
:imported-members:
|
||||
|
||||
|
||||
.. automodule:: nyaggle.feature.groupby
|
||||
:members:
|
||||
:imported-members:
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
"""
|
||||
Modified work:
|
||||
-----------------------------------------------------------------------------
|
||||
Copyright (c) 2020 Kota Yuhara (@wakamezake)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Original work of aggregation:
|
||||
https://github.com/pfnet-research/xfeat/blob/master/xfeat/helper.py
|
||||
-----------------------------------------------------------------------------
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 Preferred Networks, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
-----------------------------------------------------------------------------
|
||||
"""
|
||||
|
||||
from types import LambdaType, FunctionType
|
||||
from typing import List, Callable, Union
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.common import get_callable_name
|
||||
|
||||
|
||||
def is_lambda_function(obj):
    """Tell whether ``obj`` is an anonymous (``lambda``) function.

    Example:
        >>> import numpy as np
        >>> def custom_function(x): return np.sum(x)
        >>> is_lambda_function(lambda x: np.sum(x))
        True
        >>> is_lambda_function(np.sum)
        False
        >>> is_lambda_function(custom_function)
        False
    """
    # types.LambdaType is only an alias of types.FunctionType, so the type
    # check alone cannot separate lambdas from ``def`` functions; the
    # "<lambda>" name is what tells the two apart.
    if not isinstance(obj, LambdaType):
        return False
    return obj.__name__ == "<lambda>"
|
||||
|
||||
|
||||
def aggregation(
    input_df: pd.DataFrame,
    group_key: str,
    group_values: List[str],
    agg_methods: List[Union[str, Callable]],
):
    """Aggregate values after grouping table rows by a given key.

    For every combination of aggregation method and value column, the values
    are aggregated per group and merged back onto a copy of ``input_df`` as a
    new column named ``agg_{method}_{column}_by_{group_key}``.

    Args:
        input_df:
            Input data frame. It is not modified.
        group_key:
            Used to determine the groups for the groupby.
        group_values:
            Used to aggregate values for the groupby.
        agg_methods:
            List of function or function names,
            e.g. ['mean', 'max', 'min', numpy.mean].
            Any named callable is accepted, including builtins such as ``sum``.
            Do not use a lambda function because the name attribute of the lambda function
            cannot generate a unique string of column names in <lambda>.
            The methods should have distinct names, because the generated
            column names are derived from them.
    Returns:
        Tuple of output dataframe and new column names.
    Raises:
        ValueError:
            If an entry of ``agg_methods`` is a lambda function, a class
            (e.g. ``int``), or neither a string nor a callable.
    """
    # Validate everything up front so no work is done for a bad request.
    for agg_method in agg_methods:
        if isinstance(agg_method, str):
            continue
        if is_lambda_function(agg_method):
            raise ValueError('Not supported lambda function.')
        # Accept every callable rather than only ``types.FunctionType``:
        # builtins and other callables implemented in C are not FunctionType
        # instances although they are valid aggregators. Classes are callable
        # too, but they are constructors, not aggregation functions.
        if isinstance(agg_method, type) or not callable(agg_method):
            raise ValueError('Supported types are: {} or {}.'
                             ' Got {} instead.'.format(str, Callable, type(agg_method)))

    new_df = input_df.copy()
    new_cols = []

    for agg_method in agg_methods:
        # The method name does not depend on the column, so resolve it once.
        if isinstance(agg_method, str):
            agg_method_name = agg_method
        else:
            agg_method_name = get_callable_name(agg_method)

        for col in group_values:
            new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)

            # One value per group, indexed by the group key.
            df_agg = (
                input_df[[col] + [group_key]].groupby(group_key)[[col]].agg(
                    agg_method)
            )
            df_agg.columns = [new_col]
            new_cols.append(new_col)

            # Broadcast the per-group value back to every row of that group.
            new_df = new_df.merge(
                df_agg, how="left", right_index=True, left_on=group_key
            )

    return new_df, new_cols
|
|
@ -0,0 +1,37 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from sklearn import datasets
|
||||
|
||||
from nyaggle.feature.groupby import aggregation
|
||||
|
||||
|
||||
@pytest.fixture
def iris_dataframe():
    """Provide the iris data as a frame plus the group key and value columns.

    Returns:
        Tuple of (data frame, name of the key column, list of value columns).
    """
    iris = datasets.load_iris()
    value_columns = ['sl', 'sw', 'pl', 'pw']
    key_column = 'species'
    # Append the target labels to the feature matrix as one extra column.
    table = np.column_stack([iris.data, iris.target])
    df = pd.DataFrame(table)
    df.columns = value_columns + [key_column]
    return df, key_column, value_columns
|
||||
|
||||
|
||||
def custom_function(x):
    """Named (non-lambda) aggregation function used by the tests: sum of ``x``."""
    total = np.sum(x)
    return total
|
||||
|
||||
|
||||
def test_return_type_by_aggregation(iris_dataframe):
    """aggregation() returns a data frame together with a list of column names."""
    frame, key, value_columns = iris_dataframe
    # Mix a method name, a NumPy function and a plain Python function.
    methods = ["max", np.sum, custom_function]
    result = aggregation(frame, key, value_columns, methods)
    new_df, new_cols = result
    assert isinstance(new_df, pd.DataFrame)
    assert isinstance(new_cols, list)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('agg_method', [[int], [lambda x: np.max(x)]])
def test_assert_by_aggregation(iris_dataframe, agg_method):
    """A class or a lambda in the method list makes aggregation() raise ValueError."""
    frame, key, value_columns = iris_dataframe
    with pytest.raises(ValueError):
        aggregation(frame, key, value_columns, agg_method)
|
Loading…
Reference in New Issue