# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith <taylor.smith@alkaline-ml.com>
#
# A simple multilayer perceptron classifier. If you find yourself struggling
# to follow the derivation of the back-propagation, check out this great
# refresher on scalar & matrix calculus + differential equations:
# http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html
from __future__ import absolute_import, division

from sklearn.utils.validation import check_X_y, check_random_state
from sklearn.utils.multiclass import check_classification_targets

import numpy as np

from ..base import BaseSimpleEstimator
from .base import NeuralMixin, tanh

__all__ = [
    'NeuralNetClassifier'
]

try:
    xrange
except NameError:  # py3
    xrange = range


def _calculate_loss(truth, preds, weights, l2):
    """Compute the log loss.

    Calculate the log loss between the true class labels and the predictions
    generated by the softmax layer in our neural network.

    Parameters
    ----------
    truth : np.ndarray, shape=(n_samples,)
        The true class labels.

    preds : np.ndarray, shape=(n_samples, n_classes)
        The predicted class probabilities.

    weights : list
        The list of weight matrices. Used for computing the loss
        with the L2 regularization.

    l2 : float
        The regularization parameter.
    """
    # get the log probs of the prediction for the true class labels
    n_samples = truth.shape[0]
    logprobs = -np.log(preds[range(n_samples), truth])

    # compute the sum of log probs
    sum_logprobs = logprobs.sum()

    # add the L2 regularization term
    sum_logprobs += l2 / 2. * sum(np.square(W).sum() for W in weights)
    return 1. / n_samples * sum_logprobs
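
# In formula form, the function above computes
#
#     L = (1 / n_samples) * ( -sum_i log(preds[i, truth[i]])
#                             + (l2 / 2) * sum_W ||W||_F^2 )
#
# As a small sanity check: with truth = np.array([0, 1]),
# preds = np.array([[0.9, 0.1], [0.2, 0.8]]) and l2 = 0., the loss is
# (-np.log(0.9) - np.log(0.8)) / 2, roughly 0.164.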


def softmax(X):
    """Apply the softmax function.

    The softmax function squashes each row of real values (one row per
    sample) into a vector of the same length whose elements sum to 1 and
    are bound in (0, 1), i.e., a vector of class probabilities.

    Parameters
    ----------
    X : np.ndarray, shape=(n_samples, n_features)
        The matrix over which to apply softmax along the rows.
    """
    # first compute the exponential. This is a step that would take place
    # in the sigmoid (logistic) function as well. We can already begin to see
    # where this is going to resemble logistic regression...
    X_exp = np.exp(X)
    return X_exp / np.sum(X_exp, axis=1, keepdims=True)
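
# For example, softmax(np.array([[1., 2., 3.]])) is roughly
# [[0.090, 0.245, 0.665]]: each row is exponentiated and then normalized to
# sum to 1. Note that np.exp can overflow for very large inputs; a common
# (behavior-preserving) stabilization is to subtract each row's max before
# exponentiating, which this simple implementation omits.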


class NeuralNetClassifier(BaseSimpleEstimator, NeuralMixin):
    """A neural network classifier.

    Create a multi-layer perceptron classifier. Note that this is a very
    simple implementation of an MLP with only fully-connected layers and
    very few tunable parameters. It is designed for readability. For more
    optimized neural network code, look into TensorFlow, Keras or other
    libraries.

    This implementation of a neural net uses the tanh activation function
    *only* (with a softmax output layer), and does not stop early on
    convergence; it will always run for ``n_iter`` iterations. There are
    many other parameters that would typically be tunable in a network, for
    instance dropout, momentum or adaptive learning rates. The majority of
    these parameters are left out of this implementation to keep it simple.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The training array. Should be a numpy array or array-like structure
        with only finite values.

    y : array-like, shape=(n_samples,)
        The target vector.

    hidden : iterable, optional (default=(25,))
        An iterable indicating the number of units per hidden layer.

    n_iter : int, optional (default=10)
        The number of iterations (passes over the training data) to perform.

    learning_rate : float, optional (default=0.001)
        The rate at which we descend the gradient.

    regularization : float, optional (default=0.01)
        The L2 regularization parameter applied to the weights in the loss
        and in the gradient updates.

    random_state : int, None or RandomState, optional (default=42)
        The random state for initializing the weights matrices.
    """

    def __init__(self, X, y, hidden=(25,), n_iter=10, learning_rate=0.001,
                 regularization=0.01, random_state=42):

        self.hidden = hidden
        self.random_state = random_state
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.regularization = regularization

        # initialize weights, biases, etc.
        X, y, weights, biases = self._init_weights_biases(
            X, y, hidden, random_state, last_dim=None)

        # we can keep track of the loss for each iter
        train_loss = []

        # for each iteration, feed X through the network, compute the loss,
        # and back-propagate the error to correct the weights.
        for _ in xrange(n_iter):
            # feed X forward through the hidden layers to get the output of
            # the network
            out, layer_results = self._forward_step(X, weights, biases)

            # compute the loss on the output
            loss = _calculate_loss(truth=y, preds=out, weights=weights,
                                   l2=self.regularization)
            train_loss.append(loss)

            # now back-propagate to correct the weights and biases via
            # gradient descent
            self._back_propagate(y, out, layer_results, weights,
                                 biases, learning_rate,
                                 self.regularization)

        # save the weights, biases and loss as instance attributes
        self.weights = weights
        self.biases = biases
        self.train_loss = train_loss
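
        # Note that each pass through the loop above is one full-batch
        # gradient descent step: the whole training set is fed forward, the
        # loss is recorded, and a single update is applied to every weight
        # matrix and bias vector.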

    @staticmethod
    def _init_weights_biases(X, y, hidden, random_state, last_dim=None):
        # make sure dims all match in X, y and that we have appropriate
        # classification targets
        X, y = check_X_y(X, y, copy=False)
        check_classification_targets(y)

        random_state = check_random_state(random_state)

        # initialize the weights and biases. For each layer, we create a new
        # matrix of dimensions [last_layer_col_dim, new_col_dim]. This ensures
        # we can compute matrix products across the layers and that the
        # dimensions all match up. The biases will each be a vector of ones
        # in this example, though in other networks they can be initialized
        # differently.
        weights = []
        biases = []

        # if last_dim is undefined, use the column shape of the input data.
        # this argument is used to simplify the initialization of weights/
        # biases in the transfer learning class...
        if last_dim is None:
            last_dim = X.shape[1]

        for layer_size in hidden:
            # initialize to extremely small values
            w = random_state.rand(last_dim, layer_size) * 0.01
            b = np.ones(layer_size)
            last_dim = layer_size

            weights.append(w)
            biases.append(b)

        # we need to add one more layer (the output layer) that is the size of
        # the expected output probabilities. We'll apply the softmax function
        # to the output of this layer.
        n_outputs = np.unique(y).shape[0]
        weights.append(random_state.rand(last_dim, n_outputs))
        biases.append(np.ones(n_outputs))

        return X, y, weights, biases
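
    # To illustrate the shapes produced above: with X of shape
    # (n_samples, 4), y containing 3 distinct classes and hidden=(25,), the
    # weights list holds matrices of shapes (4, 25) and (25, 3), and the
    # biases list holds vectors of shapes (25,) and (3,).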

    @staticmethod
    def _forward_step(X, weights, biases):
        # track the intermediate products
        intermediate_results = [X]

        # progress through all the layers EXCEPT the very last one.
        for w, b in zip(weights[:-1], biases[:-1]):

            # apply the activation function to the product of X and the
            # weights (after adding the bias vector)
            X = tanh(X.dot(w) + b)

            # append this layer result
            intermediate_results.append(X)

        # we handle the very last layer a bit differently, since it's our
        # output layer. First compute the product...
        X = X.dot(weights[-1]) + biases[-1]

        # then rather than apply the activation function (tanh), we apply
        # the softmax, which is essentially generalized logistic regression.
        return softmax(X), intermediate_results
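
    # For a single hidden layer, the forward pass above amounts to
    #
    #     a1  = tanh(X.dot(W1) + b1)
    #     out = softmax(a1.dot(W2) + b2)
    #
    # and intermediate_results is [X, a1], which back-propagation reuses.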

    @staticmethod
    def _back_propagate(truth, probas, layer_results, weights,
                        biases, learning_rate, l2):
        # the probabilities are our first delta. Subtract 1 from the
        # TRUE labels' probabilities in the predictions
        n_samples = truth.shape[0]

        # subtract 1 from the true indices. initial deltas are: (y_hat - y)
        probas[range(n_samples), truth] -= 1.

        # iterate back through the layers computing the deltas (derivatives)
        last_delta = probas
        for next_weights, next_biases, layer_res in \
                zip(weights[::-1], biases[::-1], layer_results[::-1]):

            # the gradient for this layer is the intermediate layer result
            # (transposed) multiplied by the previous delta
            d_W = layer_res.T.dot(last_delta)

            # column sums of the (just-computed) delta are the derivative
            # of the biases
            d_b = np.sum(last_delta, axis=0)

            # set the next delta for the next iter
            last_delta = last_delta.dot(next_weights.T) * \
                (1. - np.power(layer_res, 2.))

            # update the weights gradient with the L2 regularization term
            d_W += l2 * next_weights

            # update the weights in this layer. The learning rate governs how
            # quickly we descend the gradient
            next_weights += -learning_rate * d_W
            next_biases += -learning_rate * d_b
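
    # In equation form, for each layer (walking backward from the output,
    # with ``a`` the activation that fed into the layer on the forward pass
    # and ``delta`` the error signal arriving at it), the updates above are:
    #
    #     dW         = a.T.dot(delta) + l2 * W
    #     db         = delta.sum(axis=0)
    #     delta_prev = delta.dot(W.T) * (1. - a ** 2)  # tanh'(z) = 1 - tanh(z)**2
    #
    #     W -= learning_rate * dW
    #     b -= learning_rate * db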

    def predict(self, X):
        # compute the probabilities and then get the argmax across classes
        probas = self.predict_proba(X)

        # we want the argmaxes of each row
        return np.argmax(probas, axis=1)

    def predict_proba(self, X):
        # simply compute a forward step (we don't care about idx 1 of the
        # tuple, which is just the intermediate products)
        return self._forward_step(X, self.weights, self.biases)[0]

    def export_weights_and_biases(self, output_layer=True):
        w, b = self.weights, self.biases
        if output_layer:
            return w, b
        return w[:-1], b[:-1]
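

# A minimal usage sketch, assuming this module is importable from wherever
# the package exposes it (the import path is omitted because it depends on
# the package layout; the snippet below is illustrative only):
#
#     from sklearn.datasets import load_iris
#
#     iris = load_iris()
#     clf = NeuralNetClassifier(iris.data, iris.target, hidden=(25,),
#                               n_iter=10, learning_rate=0.001,
#                               random_state=42)
#
#     preds = clf.predict(iris.data)           # class labels, shape (150,)
#     probas = clf.predict_proba(iris.data)    # probabilities, shape (150, 3)
#     clf.train_loss                           # one loss value per iteration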