Update comments and KNN example

master
Taylor Smith 2018-07-19 11:48:06 -05:00
parent 083f6b679f
commit 13fa08cf6f
3 changed files with 15 additions and 3 deletions


@@ -5,22 +5,27 @@ from __future__ import absolute_import
from packtml.clustering import KNNClassifier
from packtml.utils.plotting import add_decision_boundary_to_axis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import sys
# #############################################################################
# Create a classification sub-dataset using iris
iris = load_iris()
X = iris.data[:, :2]
X = iris.data[:, :2] # just use the first two dimensions
y = iris.target
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# #############################################################################
# Fit a k-nearest neighbor model and get predictions
k=10
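
The hunk ends at k=10; the rest of the example (not shown in this diff) fits the classifier on the scaled training split and evaluates it. As a rough, self-contained sketch of that downstream step, with scikit-learn's KNeighborsClassifier standing in because the KNNClassifier constructor signature does not appear in this hunk:

# Sketch only: scikit-learn's KNeighborsClassifier stands in for
# packtml's KNNClassifier, whose constructor is not shown in this diff.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data[:, :2], iris.target  # first two features, as in the example

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale the same way the updated example does: fit on train, transform test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
print("Test accuracy: %.3f" % accuracy_score(y_test, clf.predict(X_test)))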

Binary file not shown (22 KiB before, 21 KiB after).


@@ -80,7 +80,8 @@ class KNNClassifier(BaseSimpleEstimator):
# Compute the pairwise distances between each observation in
# the dataset and the training data. This can be relatively expensive
# for very large datasets!!
dists = euclidean_distances(X, self.X)
train = self.X
dists = euclidean_distances(X, train)
# Arg sort to find the shortest distance for each row. This sorts
# elements in each row (independent of other rows) to determine the
@@ -93,7 +94,13 @@ class KNNClassifier(BaseSimpleEstimator):
nearest = np.argsort(dists, axis=1)
# We only care about the top K, really, so get sorted and then truncate
# I.e:
# array([[1, 2, 1],
#        ...
#        [0, 0, 0]])
predicted_labels = self.y[nearest][:, :self.k]
# We want the most common along the rows as the predictions
# I.e:
# array([1, ..., 0])
return mode(predicted_labels, axis=-1)[0].ravel()
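
Taken together, the comments describe the full prediction pipeline: pairwise distances, a per-row argsort, truncation to the k nearest neighbors, and a row-wise mode over their labels. A tiny standalone sketch of that same sequence on toy data (all names below are illustrative, not from the library):

import numpy as np
from scipy.stats import mode
from sklearn.metrics.pairwise import euclidean_distances

# Toy "training" set: four points, two classes
X_train = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
y_train = np.array([0, 0, 1, 1])
k = 3

# Two query points to classify
X_query = np.array([[0.2, 0.5], [5.1, 5.4]])

# Mirror the steps above: distances -> argsort -> top-k labels -> row-wise mode
dists = euclidean_distances(X_query, X_train)   # shape (n_query, n_train)
nearest = np.argsort(dists, axis=1)             # neighbor indices, closest first
top_k_labels = y_train[nearest][:, :k]          # labels of the k closest neighbors
preds = mode(top_k_labels, axis=-1)[0].ravel()  # most common label per row
print(preds)                                    # [0 1]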