fix categorical variable handling

2020-02-03 23:55:51 +09:00 · 2020-02-03 23:55:51 +09:00 · 935f5fa497
parent 8dc0c4c78d
commit 935f5fa497
2 changed files with 20 additions and 11 deletions
--- a/nyaggle/experiment/gbdt.py
+++ b/nyaggle/experiment/gbdt.py
@@ -398,8 +398,8 @@ def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]):
        while fillval in unique_values:
            fillval += '-'
    if is_categorical(strain):
-        strain = strain.cat.add_categories(fillval).fillna(fillval)
-        stest = stest.cat.add_categories(fillval).fillna(fillval)
+        strain = strain.cat.codes
+        stest = stest.cat.codes
    else:
        strain = strain.fillna(fillval)
        stest = stest.fillna(fillval)
@@ -432,7 +432,7 @@ def autoprep_gbdt(X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],

    if gbdt_type == 'cat' and len(categorical_feature_to_treat) > 0:
        X_train = X_train.copy()
-        X_test = X_test if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
+        X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])

--- a/tests/experiment/test_gbdt.py
+++ b/tests/experiment/test_gbdt.py
@@ -501,13 +501,11 @@ def test_with_rare_categories():
        'x0': [None]*100,
        'x1': np.random.choice([np.inf, -np.inf], size=100),
        'x2': ['nan'] + [None]*99,
-        'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D'], size=50)])
+        'x3': np.concatenate([np.random.choice(['A', 'B'], size=50), np.random.choice(['C', 'D', 'na'], size=50)])
    })

    y = pd.Series(np.random.choice([0, 1], size=100), name='y')

-    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.5)
-
    params = {
        'lgbm': {
            'objective': 'binary',
@@ -523,11 +521,22 @@ def test_with_rare_categories():
        }
    }

-    for algorithm in ('cat', 'xgb', 'lgbm'):
-        with get_temp_directory() as temp_path:
-            experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
-                            logging_directory=temp_path, with_mlflow=True,
-                            categorical_feature=['x0', 'x1', 'x2', 'x3'])
+    for cat_cast in (True, False):
+        X_ = X.copy()
+        y_ = y.copy()
+        if cat_cast:
+            for c in X.columns:
+                X_[c] = X_[c].astype('category')
+            X_ = X_.iloc[:50, :]
+            y_ = y_.iloc[:50]
+
+        X_train, X_test, y_train, y_test = train_test_split(X_, y_, shuffle=False, test_size=0.5)
+
+        for algorithm in ('cat', 'xgb', 'lgbm'):
+            with get_temp_directory() as temp_path:
+                experiment_gbdt(params[algorithm], X_train, y_train, X_test, gbdt_type=algorithm,
+                                logging_directory=temp_path, with_mlflow=True,
+                                categorical_feature=['x0', 'x1', 'x2', 'x3'])


 def test_inherit_outer_scope_run():