# Hands-on-Supervised-Machine.../packtml/neural_net/mlp.py
# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith <taylor.smith@alkaline-ml.com>
#
# A simple multilayer perceptron classifier. If you find yourself struggling
# to follow the derivation of the back-propagation, check out this great
# refresher on scalar & matrix calculus + partial derivatives.
# http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html
from __future__ import absolute_import, division

from sklearn.utils.validation import check_X_y, check_random_state
from sklearn.utils.multiclass import check_classification_targets
import numpy as np

from ..base import BaseSimpleEstimator
from .base import NeuralMixin, tanh

__all__ = [
    'NeuralNetClassifier'
]

try:
    xrange
except NameError:  # py3
    xrange = range


def _calculate_loss(truth, preds, weights, l2):
    """Compute the log loss.

    Calculate the log loss between the true class labels and the predictions
    generated by the softmax layer in our neural network.

    Parameters
    ----------
    truth : np.ndarray, shape=(n_samples,)
        The true labels

    preds : np.ndarray, shape=(n_samples, n_classes)
        The predicted class probabilities

    weights : list
        The list of weights matrices. Used for computing the loss
        with the L2 regularization.

    l2 : float
        The regularization parameter
    """
    # get the log probs of the prediction for the true class labels
    n_samples = truth.shape[0]
    logprobs = -np.log(preds[range(n_samples), truth])

    # compute the sum of log probs
    sum_logprobs = logprobs.sum()

    # add the L2 regularization term
    sum_logprobs += l2 / 2. * sum(np.square(W).sum() for W in weights)
    return 1. / n_samples * sum_logprobs


def softmax(X):
    """Apply the softmax function.

    The softmax function squashes each row of K real values into a
    K-dimensional vector whose elements sum to 1 and are bound in (0, 1).

    Parameters
    ----------
    X : np.ndarray, shape=(n_samples, n_features)
        The matrix over which to apply softmax along the rows.
    """
    # first compute the exponential. This is a step that would take place
    # in the sigmoid (logistic) function as well. We can already begin to see
    # where this is going to resemble logistic regression...
    X_exp = np.exp(X)
    return X_exp / np.sum(X_exp, axis=1, keepdims=True)
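
# Note that np.exp can overflow for large activations. A common numerically
# stable variant of the softmax above (not used here; shown only as a sketch)
# subtracts the row-wise max before exponentiating, which leaves the result
# mathematically unchanged:
#
#     X_shift = X - X.max(axis=1, keepdims=True)
#     return np.exp(X_shift) / np.sum(np.exp(X_shift), axis=1, keepdims=True)
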
class NeuralNetClassifier(BaseSimpleEstimator, NeuralMixin):
    """A neural network classifier.

    Create a multi-layer perceptron classifier. Note that this is a very
    simple implementation of an MLP with only fully-connected layers and
    very few tunable parameters. It is designed for readability. For more
    optimized neural network code, look into TensorFlow, Keras or other
    libraries.

    This implementation of a neural net uses the tanh activation function
    *only*, and does not allow early convergence. It will continue for
    ``n_iter`` iterations. There are many other parameters that would
    typically be tunable in a network, for instance dropout, regularization,
    learning rate, etc. The majority of these parameters are left out of
    this implementation to keep it simple.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The training array. Should be a numpy array or array-like structure
        with only finite values.

    y : array-like, shape=(n_samples,)
        The target vector.

    hidden : iterable, optional (default=(25,))
        An iterable indicating the number of units per hidden layer.

    n_iter : int, optional (default=10)
        The default number of iterations to perform.

    learning_rate : float, optional (default=0.001)
        The rate at which we descend the gradient.

    regularization : float, optional (default=0.01)
        The L2 regularization penalty applied to the weights when computing
        the loss and the gradients.

    random_state : int, None or RandomState, optional (default=42)
        The random state for initializing the weights matrices.
    """
    def __init__(self, X, y, hidden=(25,), n_iter=10, learning_rate=0.001,
                 regularization=0.01, random_state=42):
        self.hidden = hidden
        self.random_state = random_state
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.regularization = regularization

        # initialize weights, biases, etc.
        X, y, weights, biases = self._init_weights_biases(
            X, y, hidden, random_state, last_dim=None)

        # we can keep track of the loss for each iter
        train_loss = []

        # for each iteration, feed X through the network, compute the loss,
        # and back-propagate the error to correct the weights.
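        # (Each pass is one full-batch gradient descent step; roughly,
        #  W <- W - learning_rate * dL/dW for every weight matrix W, and
        #  likewise for the biases.)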
        for _ in xrange(n_iter):
            # compute the product of X on the hidden layers (the output of
            # the network)
            out, layer_results = self._forward_step(X, weights, biases)

            # compute the loss on the output
            loss = _calculate_loss(truth=y, preds=out, weights=weights,
                                   l2=self.regularization)
            train_loss.append(loss)

            # now back-propagate to correct the weights and biases via
            # gradient descent
            self._back_propagate(y, out, layer_results, weights,
                                 biases, learning_rate,
                                 self.regularization)

        # save the weights, biases and loss as instance attributes
        self.weights = weights
        self.biases = biases
        self.train_loss = train_loss

    @staticmethod
    def _init_weights_biases(X, y, hidden, random_state, last_dim=None):
        # make sure dims all match in X, y and that we have appropriate
        # classification targets
        X, y = check_X_y(X, y, copy=False)
        check_classification_targets(y)
        random_state = check_random_state(random_state)

        # initialize the weights and biases. For each layer, we create a new
        # matrix of dimensions [last_layer_col_dim, new_col_dim]. This ensures
        # we can compute matrix products across the layers and that the
        # dimensions all match up. The biases will each be a vector of ones
        # in this example, though in other networks they can be initialized
        # differently
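        # For example (illustrative only): if X has 4 columns, hidden=(25,)
        # and y contains 3 unique classes, the weight matrices end up with
        # shapes (4, 25) and (25, 3), and the biases with shapes (25,) and
        # (3,).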
        weights = []
        biases = []

        # if last dim is undefined, use the column shape of the input data.
        # this argument is used to simplify the initialization of weights/
        # biases in the transfer learning class...
        if last_dim is None:
            last_dim = X.shape[1]

        for layer_size in hidden:
            # initialize to extremely small values
            w = random_state.rand(last_dim, layer_size) * 0.01
            b = np.ones(layer_size)
            last_dim = layer_size

            weights.append(w)
            biases.append(b)

        # we need to add one more layer (the output layer) that is the size of
        # the expected output probabilities. We'll apply the softmax function
        # to the output of this layer.
        n_outputs = np.unique(y).shape[0]
        weights.append(random_state.rand(last_dim, n_outputs))
        biases.append(np.ones(n_outputs))

        return X, y, weights, biases

    @staticmethod
    def _forward_step(X, weights, biases):
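        # As a sketch of the math computed below: for each hidden layer i,
        #     a_i = tanh(a_{i-1} W_i + b_i),  with a_0 = X,
        # and for the final (output) layer,
        #     y_hat = softmax(a_last W_out + b_out)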
        # track the intermediate products
        intermediate_results = [X]

        # progress through all the layers EXCEPT the very last one.
        for w, b in zip(weights[:-1], biases[:-1]):
            # apply the activation function to the product of X and the
            # weights (after adding the bias vector)
            X = tanh(X.dot(w) + b)

            # append this layer result
            intermediate_results.append(X)

        # we handle the very last layer a bit differently, since it's our
        # output layer. First compute the product...
        X = X.dot(weights[-1]) + biases[-1]

        # then rather than apply the activation function (tanh), we apply
        # the softmax, which is essentially generalized logistic regression.
        return softmax(X), intermediate_results

    @staticmethod
    def _back_propagate(truth, probas, layer_results, weights,
                        biases, learning_rate, l2):
        # the probabilities are our first delta. Subtract 1 from the
        # TRUE labels' probabilities in the predictions
        n_samples = truth.shape[0]

        # subtract 1 from the true indices. initial deltas are: (y_hat - y)
        probas[range(n_samples), truth] -= 1.

        # iterate back through the layers computing the deltas (derivatives)
        last_delta = probas
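        # The recurrences computed in the loop below (a sketch), where "a" is
        # the activation that fed into a layer and "delta" is the error
        # flowing backwards through it:
        #     d_W   = a.T.dot(delta) + l2 * W
        #     d_b   = delta.sum(axis=0)
        #     delta = delta.dot(W.T) * (1 - a ** 2)    # tanh derivative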
        for next_weights, next_biases, layer_res in \
                zip(weights[::-1], biases[::-1], layer_results[::-1]):
            # the gradient for this layer is equivalent to the previous delta
            # multiplied by the intermediate layer result
            d_W = layer_res.T.dot(last_delta)

            # column sums of the (just-computed) delta is the derivative
            # of the biases
            d_b = np.sum(last_delta, axis=0)

            # set the next delta for the next iter
            last_delta = last_delta.dot(next_weights.T) * \
                (1. - np.power(layer_res, 2.))

            # update the weights gradient with the L2 regularization term
            d_W += l2 * next_weights

            # update the weights in this layer. The learning rate governs how
            # quickly we descend the gradient
            next_weights += -learning_rate * d_W
            next_biases += -learning_rate * d_b

    def predict(self, X):
        # compute the probabilities and then get the argmax for each class
        probas = self.predict_proba(X)

        # we want the argmaxes of each row
        return np.argmax(probas, axis=1)

    def predict_proba(self, X):
        # simply compute a forward step (we don't care about idx 1 of the
        # tuple, which is just the intermediate products)
        return self._forward_step(X, self.weights, self.biases)[0]

    def export_weights_and_biases(self, output_layer=True):
        w, b = self.weights, self.biases
        if output_layer:
            return w, b
        return w[:-1], b[:-1]
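

# A minimal usage sketch, assuming the packtml package is importable and
# using sklearn's make_classification purely as an illustrative data source:
#
#     from sklearn.datasets import make_classification
#     from packtml.neural_net import NeuralNetClassifier
#
#     X, y = make_classification(n_samples=200, n_features=10, n_classes=2,
#                                random_state=42)
#     clf = NeuralNetClassifier(X, y, hidden=(25,), n_iter=50,
#                               learning_rate=0.001, random_state=42)
#     preds = clf.predict(X)
#     print("Training accuracy: %.3f" % (preds == y).mean())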