Dimensionality reduction

Movielens-20m is a dataset consisting in 20 million ratings and 465,000 tag applications applied to 27.000 movies by 138.000 users. We use the Singular Value Decomposition algorithm to build a recommender system for movies.

import os

import numpy as np
import pandas as pd
import scipy.linalg.interpolative as sli

from lightonml.datasets import movielens20m
from lightonml.encoding.base import SeparatedBitPlanEncoder, MixingBitPlanDecoder
from lightonml.random_projections.opu import OPURandomMapping
from lightonopu.opu import OPU
ratings, id_to_movie = movielens20m(id_to_movie=True)
df = pd.DataFrame(ratings[1:], columns=ratings[0])
df_m = pd.DataFrame(id_to_movie)
n_movies = len(df.movieId.unique())
n_users = len(df.userId.unique())
print('n users: {} n_movies: {}'.format(n_movies, n_users))

# create the user-item ranking matrix
df = df.pivot(index='movieId', columns='userId', values='rating')
n users: 26744 n_movies: 138493
ratings = df.values
print('Size of the ratings matrix: {}'.format(ratings.shape))
del df
Size of the ratings matrix: (26744, 138493)
# demeaning ignoring nans along users
ratings -= np.nanmean(ratings, axis=0, keepdims=True)
# set nans to zero after demeaning
ratings[np.isnan(ratings)] = 0

Try SVD on original data

try:
    start = time()
    u, s, v = np.linalg.svd(ratings)
    svd_original = time() - start
except MemoryError:
    print('SVD requires too much memory.')
SVD requires too much memory.

Trying to perform SVD on the original data fails because of the high memory requirement of the algorithm.

Use randomized SVD instead

Randomized SVD consists in reducing the dimensionality of the data through random projections before performing SVD. The randomized version of the algorithm reduces the memory requirements and also decreases the computational complexity from \(O(kmn)\) to \(O(mn \log(k) + (m + n)k^2)\).

We follow the algorithm in Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions, Halko et al., 2009.

def interp_dec(A, k):
    idx, proj = sli.interp_decomp(A, k)
    X = np.hstack([np.eye(k), proj])[:, np.argsort(idx)]
    return idx[:k], X


def randomize(x, k, thresh=0):
    #n_bits = 1
    #encoder = SeparatedBitPlanEncoder(n_bits=n_bits)
    opu = OPU(600, 200)
    mapping = OPURandomMapping(opu, n_components=k, disable_pbar=True)
    #decoder = MixingBitPlanDecoder(n_bits=n_bits)
    x = (x > thresh).astype('uint8')
    #x = encoder.fit_transform(x)
    mapping.fit(x)
    y = mapping.transform(x, n_samples_by_pass=3000)
    #y = decoder.fit_transform(y)
    return y


def svd(x_proj, x, k):
    x_proj = np.dot(x, np.dot(x.T, x_proj))
    J, X = interp_dec(x_proj.T, k)
    Q1, R1 = np.linalg.qr(x[J, :])
    U, s, Vt = np.linalg.svd(np.dot(X.T, Q1))
    V = np.dot(Vt[:k, :], R1)
    return U[:, :k], s, V


def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indices = np.argsort(-similarity)+1
    return sort_indices[:top_n]
k = 100
start = time()
ratings_proj = randomize(ratings, k)
rp_time = time() - start
OPU: random projections of an array of size (26744,138493)
c = 100
start = time()
u, s, v = svd(ratings_proj, ratings, c)
svd_time = time() - start
reconstruction = np.dot(u * s, v)
del ratings_proj
print('Total time: {:.2f}'.format(rp_time + svd_time))
print('RMSE: {:.4f}'.format(np.sqrt(np.mean((reconstruction-ratings)**2))))
Total time: 125.35
RMSE: 0.1186
# keep only important singular values (90% of energy)
energy = 0
for i, el in enumerate(s):
    energy += el
    if energy > (s**2).sum()*0.9:
        break
k = i
movie_id = 1
top_n = 2
sliced = u[:, :k]
indices = top_cosine_similarity(sliced, movie_id, top_n)
print('Query: {}, {}'.format(df_m.loc[0].title, df_m.loc[0].genres))

for idx in indices[1:]:
    print('Recommended: {}, {}'.format(df_m.loc[idx-1].title,
                                       df_m.loc[idx-1].genres))
Query: Toy Story (1995), Adventure|Animation|Children|Comedy|Fantasy
Recommended: Toy Story 2 (1999), Adventure|Animation|Children|Comedy|Fantasy