# Hands-on-Supervised-Machine.../packtml/recommendation/itemitem.py
#
# 143 lines
# 4.8 KiB
# Python

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from sklearn.utils.validation import check_array
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from packtml.recommendation.base import RecommenderMixin
from packtml.base import BaseSimpleEstimator
# Public API of this module.
__all__ = ['ItemItemRecommender']

# Python 3 has no ``xrange`` builtin: merely referencing the name raises
# NameError there, in which case it is aliased to ``range``.
try:
    xrange
except NameError:  # py3
    xrange = range
class ItemItemRecommender(BaseSimpleEstimator, RecommenderMixin):
    """Item-to-item recommendation system using cosine similarity.

    A collaborative filtering recommender algorithm that computes the cosine
    similarity between each item and generates recommendations for users'
    highly rated items by returning similar items.

    Parameters
    ----------
    R : array-like, shape=(n_users, n_items)
        The ratings matrix. This must be an explicit ratings matrix where
        0 indicates an item that a user has not yet rated.

    k : int, optional (default=10)
        The number of largest similarities to retain in each row of the
        item-item similarity matrix; every other entry of the row is set to
        zero. An item's similarity with itself is not treated specially, so
        it competes for one of the ``k`` slots. Must be at least 1. If ``k``
        is greater than or equal to the number of items, nothing is zeroed.

    Attributes
    ----------
    k : int
        The ``k`` value the recommender was constructed with.

    similarity : np.ndarray, shape=(n_items, n_items)
        The similarity matrix.

    Raises
    ------
    ValueError
        If ``k`` is less than 1.

    Notes
    -----
    This implementation is very rudimentary and does not allow tuning of
    hyper-parameters apart from ``k``. No similarity metrics apart from cosine
    similarity may be used. It is largely written to optimize readability. For
    a very highly optimized version, try the "implicit" library.
    """

    def __init__(self, R, k=10):
        # Reject non-positive k up front: a negative k would make the
        # ``[:, k:]`` slice in _compute_sim select from the wrong end of each
        # row, and k=0 would zero the entire similarity matrix. Both would
        # silently produce a matrix that is not a "top k" matrix.
        if k < 1:
            raise ValueError("k must be a positive integer, but got k=%r"
                             % (k,))

        # check the array, but don't copy if not needed
        R = check_array(R, dtype=np.float32, copy=False)  # type: np.ndarray

        # save the hyper-parameter for later use
        self.k = k
        self.similarity = self._compute_sim(R, k)

    def _compute_sim(self, R, k):
        """Compute the item-item similarity matrix truncated to the top k.

        Parameters
        ----------
        R : np.ndarray, shape=(n_users, n_items)
            The validated ratings matrix.

        k : int
            The number of largest similarities to keep in each row.

        Returns
        -------
        sim : np.ndarray, shape=(n_items, n_items)
            The cosine similarity between every pair of items, where in each
            row all but the ``k`` largest values have been set to zero.
        """
        # R is (n_users, n_items), so transposing makes each row an ITEM and
        # the pairwise similarity is computed between items
        sim = cosine_similarity(R.T)

        # Sort each row descending (hence the negation) and take the column
        # indices from position k onward: these are the entries that are NOT
        # among the row's top k.
        not_top_k = np.argsort(-sim, axis=1)[:, k:]  # (n_items, n_items - k)

        if not_top_k.shape[1]:  # only if there are cols (k < n_items)
            # A column vector of row numbers broadcasts against the 2-D
            # column-index array, pairing every index with its own row.
            row_indices = np.arange(not_top_k.shape[0])[:, np.newaxis]
            sim[row_indices, not_top_k] = 0.

        return sim

    def recommend_for_user(self, R, user, n=10,
                           filter_previously_seen=False,
                           return_scores=True, **kwargs):
        """Generate predictions for a single user.

        Parameters
        ----------
        R : array-like, shape=(n_users, n_items)
            The test ratings matrix. This must be an explicit ratings matrix
            where 0 indicates an item that a user has not yet rated.

        user : int
            The user index for whom to generate predictions.

        n : int or None, optional (default=10)
            The number of recommendations to return. Default is 10. For all,
            set to None.

        filter_previously_seen : bool, optional (default=False)
            Whether to filter out previously-rated items.

        return_scores : bool, optional (default=True)
            Whether to return the computed scores for the recommended items.

        **kwargs : keyword args
            Ignored. Present to match super signature.

        Returns
        -------
        items : np.ndarray
            The top ``n`` items recommended for the user.

        recommendations (optional) : np.ndarray
            The corresponding scores for the top ``n`` items for the
            user. Only returned if ``return_scores`` is True.
        """
        # check the array and get the user vector
        R = check_array(R, dtype=np.float32, copy=False)
        user_vector = R[user, :]

        # score every item: dot product between the user's ratings and the
        # similarity matrix
        recommendations = user_vector.dot(self.similarity)  # shape=(n_items,)

        # Track the original item index of every score so that the indices
        # stay correct after rated items are (optionally) dropped.
        item_indices = np.arange(recommendations.shape[0])
        if filter_previously_seen:
            rated_mask = user_vector != 0.
            recommendations = recommendations[~rated_mask]
            item_indices = item_indices[~rated_mask]

        # arg sort descending (highest scores first); slicing with n=None
        # keeps every item
        order = np.argsort(-recommendations)[:n]
        items = item_indices[order]

        if return_scores:
            return items, recommendations[order]
        return items

    def predict(self, R):
        """Generate predictions for the test set.

        Computes the predicted product of users' rated vectors on the
        pre-computed similarity matrix.

        Parameters
        ----------
        R : array-like, shape=(n_users, n_items)
            The test ratings matrix.

        Returns
        -------
        np.ndarray, shape=(n_users, n_items)
            The product of ``R`` and the similarity matrix.
        """
        R = check_array(R, dtype=np.float32, copy=False)  # type: np.ndarray

        # compute the product R*sim
        return R.dot(self.similarity)