fix categorical variable handling

pull/28/head
nyanp 2020-02-03 23:55:51 +09:00
parent 8dc0c4c78d
commit 935f5fa497
2 changed files with 20 additions and 11 deletions

View File

@@ -398,8 +398,8 @@ def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]):
while fillval in unique_values:
fillval += '-'
if is_categorical(strain):
strain = strain.cat.add_categories(fillval).fillna(fillval)
stest = stest.cat.add_categories(fillval).fillna(fillval)
strain = strain.cat.codes
stest = stest.cat.codes
else:
strain = strain.fillna(fillval)
stest = stest.fillna(fillval)
@@ -432,7 +432,7 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
if gbdt_type == 'cat' and len(categorical_feature_to_treat) > 0:
X_train = X_train.copy()
X_test = X_test if X_test is not None else X_train.iloc[:1, :].copy() # dummy
X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy() # dummy
for c in categorical_feature_to_treat:
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])

View File

@@ -501,13 +501,11 @@ def test_with_rare_categories():
'x0': [None]*100,
'x1': np.random.choice([np.inf, -np.inf], size=100),
'x2': ['nan'] + [None]*99,
'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D'], size=50)])
'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D', 'na'], size=50)])
})
y = pd.Series(np.random.choice([0, 1], size=100), name='y')
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.5)
params = {
'lgbm': {
'objective': 'binary',
@@ -523,11 +521,22 @@ def test_with_rare_categories():
}
}
for algorithm in ('cat', 'xgb', 'lgbm'):
with get_temp_directory() as temp_path:
experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
logging_directory=temp_path, with_mlflow=True,
categorical_feature=['x0', 'x1', 'x2', 'x3'])
for cat_cast in (True, False):
X_ = X.copy()
y_ = y.copy()
if cat_cast:
for c in X.columns:
X_[c] = X_[c].astype('category')
X_ = X_.iloc[:50, :]
y_ = y_.iloc[:50]
X_train, X_test, y_train, y_test = train_test_split(X_, y_, shuffle=False, test_size=0.5)
for algorithm in ('cat', 'xgb', 'lgbm'):
with get_temp_directory() as temp_path:
experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
logging_directory=temp_path, with_mlflow=True,
categorical_feature=['x0', 'x1', 'x2', 'x3'])
def test_inherit_outer_scope_run():