fix categorical variable handling
parent
8dc0c4c78d
commit
935f5fa497
|
@ -398,8 +398,8 @@ def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]):
|
|||
while fillval in unique_values:
|
||||
fillval += '-'
|
||||
if is_categorical(strain):
|
||||
strain = strain.cat.add_categories(fillval).fillna(fillval)
|
||||
stest = stest.cat.add_categories(fillval).fillna(fillval)
|
||||
strain = strain.cat.codes
|
||||
stest = stest.cat.codes
|
||||
else:
|
||||
strain = strain.fillna(fillval)
|
||||
stest = stest.fillna(fillval)
|
||||
|
@ -432,7 +432,7 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
|
|||
|
||||
if gbdt_type == 'cat' and len(categorical_feature_to_treat) > 0:
|
||||
X_train = X_train.copy()
|
||||
X_test = X_test if X_test is not None else X_train.iloc[:1, :].copy() # dummy
|
||||
X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy() # dummy
|
||||
for c in categorical_feature_to_treat:
|
||||
X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
|
||||
|
||||
|
|
|
@ -501,13 +501,11 @@ def test_with_rare_categories():
|
|||
'x0': [None]*100,
|
||||
'x1': np.random.choice([np.inf, -np.inf], size=100),
|
||||
'x2': ['nan'] + [None]*99,
|
||||
'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D'], size=50)])
|
||||
'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D', 'na'], size=50)])
|
||||
})
|
||||
|
||||
y = pd.Series(np.random.choice([0, 1], size=100), name='y')
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.5)
|
||||
|
||||
params = {
|
||||
'lgbm': {
|
||||
'objective': 'binary',
|
||||
|
@ -523,11 +521,22 @@ def test_with_rare_categories():
|
|||
}
|
||||
}
|
||||
|
||||
for algorithm in ('cat', 'xgb', 'lgbm'):
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
|
||||
logging_directory=temp_path, with_mlflow=True,
|
||||
categorical_feature=['x0', 'x1', 'x2', 'x3'])
|
||||
for cat_cast in (True, False):
|
||||
X_ = X.copy()
|
||||
y_ = y.copy()
|
||||
if cat_cast:
|
||||
for c in X.columns:
|
||||
X_[c] = X_[c].astype('category')
|
||||
X_ = X_.iloc[:50, :]
|
||||
y_ = y_.iloc[:50]
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_, y_, shuffle=False, test_size=0.5)
|
||||
|
||||
for algorithm in ('cat', 'xgb', 'lgbm'):
|
||||
with get_temp_directory() as temp_path:
|
||||
experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
|
||||
logging_directory=temp_path, with_mlflow=True,
|
||||
categorical_feature=['x0', 'x1', 'x2', 'x3'])
|
||||
|
||||
|
||||
def test_inherit_outer_scope_run():
|
||||
|
|
Loading…
Reference in New Issue