# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith <taylor.smith@alkaline-ml.com>
#
# A simple multilayer perceptron classifier. If you find yourself struggling
# to follow the derivation of the back-propagation, check out this great
# refresher on scalar & matrix calculus + differential equations:
# http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html
from __future__ import absolute_import, division

from sklearn.utils.validation import check_X_y, check_random_state
from sklearn.utils.multiclass import check_classification_targets

import numpy as np

from ..base import BaseSimpleEstimator
from .base import NeuralMixin, tanh

__all__ = [
    'NeuralNetClassifier'
]

try:
    xrange
except NameError:  # py3
    xrange = range


def _calculate_loss(truth, preds, weights, l2):
    """Compute the log loss.

    Calculate the log loss between the true class labels and the predictions
    generated by the softmax layer in our neural network.

    Parameters
    ----------
    truth : np.ndarray, shape=(n_samples,)
        The true class labels.

    preds : np.ndarray, shape=(n_samples, n_classes)
        The predicted class probabilities.

    weights : list
        The list of weight matrices. Used for computing the loss
        with the L2 regularization.

    l2 : float
        The regularization parameter.
    """
    # get the log probs of the prediction for the true class labels
    n_samples = truth.shape[0]
    logprobs = -np.log(preds[range(n_samples), truth])

    # compute the sum of log probs
    sum_logprobs = logprobs.sum()

    # add the L2 regularization term
    sum_logprobs += l2 / 2. * sum(np.square(W).sum() for W in weights)
    return 1. / n_samples * sum_logprobs
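
# In formula form, the function above computes
#
#     L = (1 / n_samples) * ( -sum_i log(preds[i, truth[i]])
#                             + (l2 / 2) * sum_W ||W||_F^2 )
#
# As a small sanity check: with truth = np.array([0, 1]),
# preds = np.array([[0.9, 0.1], [0.2, 0.8]]) and l2 = 0., the loss is
# (-np.log(0.9) - np.log(0.8)) / 2, roughly 0.164.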


def softmax(X):
    """Apply the softmax function.

    The softmax function squashes each row of real values (one row per
    sample) into a vector of the same length whose elements sum to 1 and
    are bound in (0, 1), i.e., a vector of class probabilities.

    Parameters
    ----------
    X : np.ndarray, shape=(n_samples, n_features)
        The matrix over which to apply softmax along the rows.
    """
    # first compute the exponential. This is a step that would take place
    # in the sigmoid (logistic) function as well. We can already begin to see
    # where this is going to resemble logistic regression...
    X_exp = np.exp(X)
    return X_exp / np.sum(X_exp, axis=1, keepdims=True)
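
# For example, softmax(np.array([[1., 2., 3.]])) is roughly
# [[0.090, 0.245, 0.665]]: each row is exponentiated and then normalized to
# sum to 1. Note that np.exp can overflow for very large inputs; a common
# (behavior-preserving) stabilization is to subtract each row's max before
# exponentiating, which this simple implementation omits.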


class NeuralNetClassifier(BaseSimpleEstimator, NeuralMixin):
    """A neural network classifier.

    Create a multi-layer perceptron classifier. Note that this is a very
    simple implementation of an MLP with only fully-connected layers and
    very few tunable parameters. It is designed for readability. For more
    optimized neural network code, look into TensorFlow, Keras or other
    libraries.

    This implementation of a neural net uses the tanh activation function
    *only* (with a softmax output layer), and does not stop early on
    convergence; it will always run for ``n_iter`` iterations. There are
    many other parameters that would typically be tunable in a network, for
    instance dropout, momentum or adaptive learning rates. The majority of
    these parameters are left out of this implementation to keep it simple.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The training array. Should be a numpy array or array-like structure
        with only finite values.

    y : array-like, shape=(n_samples,)
        The target vector.

    hidden : iterable, optional (default=(25,))
        An iterable indicating the number of units per hidden layer.

    n_iter : int, optional (default=10)
        The number of iterations (passes over the training data) to perform.

    learning_rate : float, optional (default=0.001)
        The rate at which we descend the gradient.

    regularization : float, optional (default=0.01)
        The L2 regularization parameter applied to the weights in the loss
        and in the gradient updates.

    random_state : int, None or RandomState, optional (default=42)
        The random state for initializing the weights matrices.
    """

    def __init__(self, X, y, hidden=(25,), n_iter=10, learning_rate=0.001,
                 regularization=0.01, random_state=42):

        self.hidden = hidden
        self.random_state = random_state
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.regularization = regularization

        # initialize weights, biases, etc.
        X, y, weights, biases = self._init_weights_biases(
            X, y, hidden, random_state, last_dim=None)

        # we can keep track of the loss for each iter
        train_loss = []

        # for each iteration, feed X through the network, compute the loss,
        # and back-propagate the error to correct the weights.
        for _ in xrange(n_iter):
            # feed X forward through the hidden layers to get the output of
            # the network
            out, layer_results = self._forward_step(X, weights, biases)

            # compute the loss on the output
            loss = _calculate_loss(truth=y, preds=out, weights=weights,
                                   l2=self.regularization)
            train_loss.append(loss)

            # now back-propagate to correct the weights and biases via
            # gradient descent
            self._back_propagate(y, out, layer_results, weights,
                                 biases, learning_rate,
                                 self.regularization)

        # save the weights, biases and loss as instance attributes
        self.weights = weights
        self.biases = biases
        self.train_loss = train_loss
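
        # Note that each pass through the loop above is one full-batch
        # gradient descent step: the whole training set is fed forward, the
        # loss is recorded, and a single update is applied to every weight
        # matrix and bias vector.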

    @staticmethod
    def _init_weights_biases(X, y, hidden, random_state, last_dim=None):
        # make sure dims all match in X, y and that we have appropriate
        # classification targets
        X, y = check_X_y(X, y, copy=False)
        check_classification_targets(y)

        random_state = check_random_state(random_state)

        # initialize the weights and biases. For each layer, we create a new
        # matrix of dimensions [last_layer_col_dim, new_col_dim]. This ensures
        # we can compute matrix products across the layers and that the
        # dimensions all match up. The biases will each be a vector of ones
        # in this example, though in other networks they can be initialized
        # differently.
        weights = []
        biases = []

        # if last_dim is undefined, use the column shape of the input data.
        # this argument is used to simplify the initialization of weights/
        # biases in the transfer learning class...
        if last_dim is None:
            last_dim = X.shape[1]

        for layer_size in hidden:
            # initialize to extremely small values
            w = random_state.rand(last_dim, layer_size) * 0.01
            b = np.ones(layer_size)
            last_dim = layer_size

            weights.append(w)
            biases.append(b)

        # we need to add one more layer (the output layer) that is the size of
        # the expected output probabilities. We'll apply the softmax function
        # to the output of this layer.
        n_outputs = np.unique(y).shape[0]
        weights.append(random_state.rand(last_dim, n_outputs))
        biases.append(np.ones(n_outputs))

        return X, y, weights, biases
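
    # To illustrate the shapes produced above: with X of shape
    # (n_samples, 4), y containing 3 distinct classes and hidden=(25,), the
    # weights list holds matrices of shapes (4, 25) and (25, 3), and the
    # biases list holds vectors of shapes (25,) and (3,).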

    @staticmethod
    def _forward_step(X, weights, biases):
        # track the intermediate products
        intermediate_results = [X]

        # progress through all the layers EXCEPT the very last one.
        for w, b in zip(weights[:-1], biases[:-1]):

            # apply the activation function to the product of X and the
            # weights (after adding the bias vector)
            X = tanh(X.dot(w) + b)

            # append this layer result
            intermediate_results.append(X)

        # we handle the very last layer a bit differently, since it's our
        # output layer. First compute the product...
        X = X.dot(weights[-1]) + biases[-1]

        # then rather than apply the activation function (tanh), we apply
        # the softmax, which is essentially generalized logistic regression.
        return softmax(X), intermediate_results
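
    # For a single hidden layer, the forward pass above amounts to
    #
    #     a1  = tanh(X.dot(W1) + b1)
    #     out = softmax(a1.dot(W2) + b2)
    #
    # and intermediate_results is [X, a1], which back-propagation reuses.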

    @staticmethod
    def _back_propagate(truth, probas, layer_results, weights,
                        biases, learning_rate, l2):
        # the probabilities are our first delta. Subtract 1 from the
        # TRUE labels' probabilities in the predictions
        n_samples = truth.shape[0]

        # subtract 1 from the true indices. initial deltas are: (y_hat - y)
        probas[range(n_samples), truth] -= 1.

        # iterate back through the layers computing the deltas (derivatives)
        last_delta = probas
        for next_weights, next_biases, layer_res in \
                zip(weights[::-1], biases[::-1], layer_results[::-1]):

            # the gradient for this layer is the intermediate layer result
            # (transposed) multiplied by the previous delta
            d_W = layer_res.T.dot(last_delta)

            # column sums of the (just-computed) delta are the derivative
            # of the biases
            d_b = np.sum(last_delta, axis=0)

            # set the next delta for the next iter
            last_delta = last_delta.dot(next_weights.T) * \
                (1. - np.power(layer_res, 2.))

            # update the weights gradient with the L2 regularization term
            d_W += l2 * next_weights

            # update the weights in this layer. The learning rate governs how
            # quickly we descend the gradient
            next_weights += -learning_rate * d_W
            next_biases += -learning_rate * d_b
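
    # In equation form, for each layer (walking backward from the output,
    # with ``a`` the activation that fed into the layer on the forward pass
    # and ``delta`` the error signal arriving at it), the updates above are:
    #
    #     dW         = a.T.dot(delta) + l2 * W
    #     db         = delta.sum(axis=0)
    #     delta_prev = delta.dot(W.T) * (1. - a ** 2)  # tanh'(z) = 1 - tanh(z)**2
    #
    #     W -= learning_rate * dW
    #     b -= learning_rate * db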

    def predict(self, X):
        # compute the probabilities and then get the argmax across classes
        probas = self.predict_proba(X)

        # we want the argmaxes of each row
        return np.argmax(probas, axis=1)

    def predict_proba(self, X):
        # simply compute a forward step (we don't care about idx 1 of the
        # tuple, which is just the intermediate products)
        return self._forward_step(X, self.weights, self.biases)[0]

    def export_weights_and_biases(self, output_layer=True):
        w, b = self.weights, self.biases
        if output_layer:
            return w, b
        return w[:-1], b[:-1]
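

# A minimal usage sketch, assuming this module is importable from wherever
# the package exposes it (the import path is omitted because it depends on
# the package layout; the snippet below is illustrative only):
#
#     from sklearn.datasets import load_iris
#
#     iris = load_iris()
#     clf = NeuralNetClassifier(iris.data, iris.target, hidden=(25,),
#                               n_iter=10, learning_rate=0.001,
#                               random_state=42)
#
#     preds = clf.predict(iris.data)           # class labels, shape (150,)
#     probas = clf.predict_proba(iris.data)    # probabilities, shape (150, 3)
#     clf.train_loss                           # one loss value per iteration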