Deep Learning for Content-Based Filtering

import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
from recsysNN_utils import *
pd.set_option("display.precision", 1)
# Load the data set, then derive the configuration used by the rest of the file.
(
    item_train,
    user_train,
    y_train,
    item_features,
    user_features,
    item_vecs,
    movie_dict,
    user_to_genre,
) = load_data()

# Column offsets into the user / item arrays.
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start

# Number of columns actually fed to the networks: everything from the
# training start column onwards (user id, rating count and ave rating are
# dropped for users; the movie id is dropped for items).
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

scaledata = True  # applies the standard scaler to the data if true
print(f"Number of training vectors: {len(item_train)}")
Number of training vectors: 58187
# Show the first rows of the raw (unscaled) user training data.
# pprint_train comes from recsysNN_utils; presumably maxcount caps the number of rows shown — confirm.
pprint_train(user_train, user_features, uvs,  u_s, maxcount=5)
[user id] [rating count] [rating ave] Act ion Adve nture Anim ation Chil dren Com edy Crime Docum entary Drama Fan tasy Hor ror Mys tery Rom ance Sci -Fi Thri ller
2 16 4.1 3.9 5.0 0.0 0.0 4.0 4.2 4.0 4.0 0.0 3.0 4.0 0.0 4.2 3.9
2 16 4.1 3.9 5.0 0.0 0.0 4.0 4.2 4.0 4.0 0.0 3.0 4.0 0.0 4.2 3.9
2 16 4.1 3.9 5.0 0.0 0.0 4.0 4.2 4.0 4.0 0.0 3.0 4.0 0.0 4.2 3.9
2 16 4.1 3.9 5.0 0.0 0.0 4.0 4.2 4.0 4.0 0.0 3.0 4.0 0.0 4.2 3.9
2 16 4.1 3.9 5.0 0.0 0.0 4.0 4.2 4.0 4.0 0.0 3.0 4.0 0.0 4.2 3.9
# Show the first rows of the raw (unscaled) item/movie training data.
# user=False presumably switches the helper to the item column layout — confirm in recsysNN_utils.
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)
[movie id] year ave rating Act ion Adve nture Anim ation Chil dren Com edy Crime Docum entary Drama Fan tasy Hor ror Mys tery Rom ance Sci -Fi Thri ller
6874 2003 4.0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
6874 2003 4.0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
6874 2003 4.0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
8798 2004 3.8 1 0 0 0 0 0 0 0 0 0 0 0 0 0
8798 2004 3.8 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# Peek at the first five target values (the ratings the model is trained to predict).
print(f"y_train[:5]: {y_train[:5]}")
y_train[:5]: [4.  4.  4.  3.5 3.5]
# Scale the training data.
# Each StandardScaler must be fitted before it can transform: calling
# transform() on an unfitted scaler raises NotFittedError. The fitted scalers
# are kept at module level because later code reuses them to scale new
# user/item vectors and to undo the scaling.
if scaledata:
    # Keep references to the unscaled arrays for the round-trip check below
    # (transform returns new arrays, so these are not modified).
    item_train_save = item_train
    user_train_save = user_train

    scalerItem = StandardScaler()
    scalerItem.fit(item_train)
    item_train = scalerItem.transform(item_train)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train)
    user_train = scalerUser.transform(user_train)

    # Sanity check: inverse-transforming the scaled data recovers the originals.
    print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train)))
# Split items, users and targets 80/20 into train and test sets.
# Every array is split with the same settings and the same random_state, so
# the three splits use identical seeding.
(item_train, item_test), (user_train, user_test), (y_train, y_test) = (
    train_test_split(data, train_size=0.80, shuffle=True, random_state=1)
    for data in (item_train, user_train, y_train)
)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test  data shape: {item_test.shape}")
movie/item training data shape: (46549, 17)
movie/item test  data shape: (11638, 17)

The scaled, shuffled data now has a mean of zero.

# Show the first rows of the user training data again, now after scaling and the shuffled split.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
[user id] [rating count] [rating ave] Act ion Adve nture Anim ation Chil dren Com edy Crime Docum entary Drama Fan tasy Hor ror Mys tery Rom ance Sci -Fi Thri ller
1 0 0.6 0.7 0.6 0.6 0.7 0.7 0.5 0.7 0.2 0.3 0.3 0.5 0.5 0.8 0.5
0 0 1.6 1.5 1.7 0.9 1.0 1.4 0.8 -1.2 1.2 1.2 1.6 0.9 1.4 1.2 1.0
0 0 0.8 0.6 0.7 0.5 0.6 0.6 0.3 -1.2 0.7 0.8 0.9 0.6 0.2 0.6 0.6
1 0 -0.1 0.2 -0.1 0.3 0.7 0.3 0.2 1.0 -0.5 -0.7 -2.1 0.5 0.7 0.3 0.0
-1 0 -1.3 -0.8 -0.8 0.1 -0.1 -1.1 -0.9 -1.2 -1.5 -0.6 -0.5 -0.6 -0.9 -0.4 -0.9
# Map the target ratings onto [-1, 1]. The scaler is fitted on the training
# targets only and then applied to both the training and the test targets.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train.reshape(-1, 1))
ynorm_train, ynorm_test = (
    scaler.transform(targets.reshape(-1, 1)) for targets in (y_train, y_test)
)
print(ynorm_train.shape, ynorm_test.shape)
(46549, 1) (11638, 1)

Neural Network for content-based filtering

num_outputs = 32  # size of the user and item vectors produced by the two networks

# User network: maps the user features to a num_outputs-dimensional vector.
# The final Dense(num_outputs) layer and the closing bracket were missing;
# the recorded model summary (output shape (None, 32), 40864 parameters for
# 14 inputs) matches exactly this 256 -> 128 -> 32 stack.
# NOTE(review): the final layer uses the default (linear) activation — confirm
# against the original notebook.
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

# Item network: same architecture, applied to the item features
# (41376 parameters for 16 inputs in the recorded summary).
item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network
# (shape is passed as a 1-tuple; a bare int is not a shape tuple)
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
# (tf.keras.Model is referenced through the already-imported tf module; the
# bare name Model is not imported by this file)
model = tf.keras.Model([input_user, input_item], output)

# print the layer/parameter overview
model.summary()

Model: "model"
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 14)]         0           []                               
 input_2 (InputLayer)           [(None, 16)]         0           []                               
 sequential (Sequential)        (None, 32)           40864       ['input_1[0][0]']                
 sequential_1 (Sequential)      (None, 32)           41376       ['input_2[0][0]']                
 tf.math.l2_normalize (TFOpLamb  (None, 32)          0           ['sequential[0][0]']             
 tf.math.l2_normalize_1 (TFOpLa  (None, 32)          0           ['sequential_1[0][0]']           
 dot (Dot)                      (None, 1)            0           ['tf.math.l2_normalize[0][0]',   
Total params: 82,240
Trainable params: 82,240
Non-trainable params: 0

We’ll use a mean squared error loss and an Adam optimizer.

# Mean squared error loss with an Adam optimizer.
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
# The loss and optimizer must be attached to the model before training;
# without compile() the subsequent model.fit() call cannot run.
model.compile(optimizer=opt, loss=cost_fn)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
Num GPUs Available:  1
# Train for 30 epochs on the scaled targets. The leading columns before u_s / i_s
# are sliced off so only the training feature columns are fed to the two inputs.
model.fit([user_train[:, u_s:], item_train[:, i_s:]], ynorm_train, epochs=30)
Epoch 1/30
1455/1455 [==============================] - 21s 11ms/step - loss: 0.1249
Epoch 2/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1183
Epoch 3/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1167
Epoch 4/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1148
Epoch 5/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1131
Epoch 6/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1110
Epoch 7/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1099
Epoch 8/30
1455/1455 [==============================] - 15s 11ms/step - loss: 0.1090
Epoch 9/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1081
Epoch 10/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1075
Epoch 11/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1066
Epoch 12/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1065
Epoch 13/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1058
Epoch 14/30
1455/1455 [==============================] - 15s 11ms/step - loss: 0.1051
Epoch 15/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1045
Epoch 16/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1039
Epoch 17/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1033
Epoch 18/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1030
Epoch 19/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1023
Epoch 20/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1020
Epoch 21/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1014
Epoch 22/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1010
Epoch 23/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1005
Epoch 24/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1001
Epoch 25/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0999
Epoch 26/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0993
Epoch 27/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0990
Epoch 28/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0988
Epoch 29/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0983
Epoch 30/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0980

<keras.callbacks.History at 0x2430f6b67f0>

Evaluate the model to determine the loss on the test data. It is comparable to the training loss, indicating that the model has not substantially overfit the training data.

# Compute the loss on the held-out test split, using the same column slicing as in training.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)
364/364 [==============================] - 2s 5ms/step - loss: 0.1045


Predictions for a new user

First, we’ll create a new user and have the model suggest movies for that user. After you have tried this example on the example user content, feel free to change the user content to match your own preferences and see what the model suggests. Note that ratings are between 0.5 and 5.0, inclusive, in half-step increments.

# --- Profile of the hypothetical new user -------------------------------
# Identification and summary columns.
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0

# Per-genre ratings for this user.
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1

# Single-row matrix: id, rating count, rating average, then the genre ratings
# in the order listed above.
user_vec = np.array([[
    new_user_id,
    new_rating_count,
    new_rating_ave,
    new_action,
    new_adventure,
    new_animation,
    new_childrens,
    new_comedy,
    new_crime,
    new_documentary,
    new_drama,
    new_fantasy,
    new_horror,
    new_mystery,
    new_romance,
    new_scifi,
    new_thriller,
]])

Let’s look at the top-rated movies for the new user. Recall that the user vector had genres that favored Comedy and Romance. Below, we’ll use a set of movie/item vectors, item_vecs, that has a vector for each movie in the training/test set. This is matched with the user vector above, and the scaled vectors are used to predict ratings for all the movies for our new user.

# generate and replicate the user vector to match the number of movies in the data set.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale the vectors and make predictions for all movies. Return results sorted by rating.
# The target scaler and the user/item feature scalers are passed in along with the
# scaledata flag (helper from recsysNN_utils; presumably it uses them to scale the
# inputs and unscale the predictions — confirm).
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs,  item_vecs, model, u_s, i_s, 
                                                                       scaler, scalerUser, scalerItem, scaledata=scaledata)

# show the top 10 entries of the sorted predictions
print_pred_movies(sorted_ypu, sorted_user, sorted_items, movie_dict, maxcount = 10)
59/59 [==============================] - 0s 4ms/step
y_p movie id rating avetitle genres
4.84793 76293 3.31818Date Night (2010) Action|Comedy|Romance
4.82386 69406 3.5 Proposal, The (2009) Comedy|Romance
4.8197 58047 3.42857Definitely, Maybe (2008) Comedy|Drama|Romance
4.81538 62155 3.35 Nick and Norah's Infinite Playlist (2008) Comedy|Drama|Romance
4.80771 99007 3.5 Warm Bodies (2013) Comedy|Horror|Romance
4.80431 86882 3.56 Midnight in Paris (2011) Comedy|Fantasy|Romance
4.80238 56949 3.3 27 Dresses (2008) Comedy|Romance
4.79867 54004 3.45455I Now Pronounce You Chuck and Larry (2007)Comedy|Romance
4.77714 5377 3.71591About a Boy (2002) Comedy|Drama|Romance
4.77429 5992 3.7 Hours, The (2002) Drama|Romance

Predictions for an existing user.

Let’s look at the predictions for “user 36”, one of the users in the data set. We can compare the predicted ratings with the user’s actual ratings. Note that movies with multiple genres show up multiple times in the training data. For example, ‘The Time Machine’ has three genres: Adventure, Action, and Sci-Fi.

uid =  36 
# form a set of user vectors. This is the same vector, transformed and repeated.
# The scaled training rows are inverse-transformed first, so the helper receives
# unscaled user data; y_vecs is presumably this user's actual ratings — confirm
# in recsysNN_utils.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train), item_vecs, user_to_genre)

# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, 
                                                                      scalerUser, scalerItem, scaledata=scaledata)
# reorder y_vecs with the same index order so its rows line up with the sorted predictions
sorted_y = y_vecs[sorted_index]

# print sorted predictions
print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user, sorted_items, item_features, ivs, uvs, movie_dict, maxcount = 10)
59/59 [==============================] - 0s 4ms/step
y_p y user user genre ave movie rating avetitle genres
3.23.0 36 3.00 2.86Time Machine, The (2002)Adventure
3.13.0 36 3.00 2.86Time Machine, The (2002)Action
3.03.0 36 3.00 2.86Time Machine, The (2002)Sci-Fi
2.01.0 36 1.50 4.00Beautiful Mind, A (2001)Drama
1.91.5 36 1.75 3.52Road to Perdition (2002)Crime
1.92.0 36 1.75 3.52Gangs of New York (2002)Crime
1.81.0 36 1.00 4.00Beautiful Mind, A (2001)Romance
1.61.5 36 1.50 3.52Road to Perdition (2002)Drama
1.62.0 36 1.50 3.52Gangs of New York (2002)Drama
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors

    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : squared Euclidean distance, sum((a - b)**2)
    """
    # asarray lets plain sequences be passed as well as ndarrays
    diff = np.asarray(a) - np.asarray(b)
    d = np.sum(np.square(diff))
    return d
# Item-only model: reuses the trained item network to turn item features
# into normalized item vectors.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))   # input layer (shape given as a 1-tuple)
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
# tf.keras.Model is referenced through the already-imported tf module; the
# bare name Model is not imported by this file.
model_m = tf.keras.Model(input_item_m, vm_m)
# print the layer/parameter overview
model_m.summary()
Model: "model_1"
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 16)]              0         
 sequential_1 (Sequential)   (None, 32)                41376     
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
Total params: 41,376
Trainable params: 41,376
Non-trainable params: 0
# Scale the item vectors with the fitted item scaler, then run them (minus the
# leading columns before i_s) through the item-only model to get one feature
# vector per row of item_vecs.
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")
59/59 [==============================] - 0s 2ms/step
size of all predicted movie feature vectors: (1883, 32)
count = 50  # number of leading item_vecs rows to list in the table
dim = len(vms)
dist = np.zeros((dim,dim))

# squared distance between every pair of predicted movie feature vectors
for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal

# For each listed row, find the row with the smallest distance to it (the
# masked diagonal keeps a row from matching itself).
disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(min(count, dim)):  # never index past the available rows
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    genre1,_  = get_item_genre(item_vecs[i,:], ivs, item_features)
    genre2,_  = get_item_genre(item_vecs[min_idx,:], ivs, item_features)

    # the closing parenthesis of append() was missing
    disp.append([movie_dict[movie1_id]['title'], genre1,
                 movie_dict[movie2_id]['title'], genre2])
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
# trailing expression so a notebook cell renders the HTML table
table
movie1 genres movie2 genres
Save the Last Dance (2001) Drama John Q (2002) Drama
Save the Last Dance (2001) Romance Wedding Planner, The (2001) Romance
Wedding Planner, The (2001) Comedy Spy Kids (2001) Comedy
Wedding Planner, The (2001) Romance Sweetest Thing, The (2002) Romance
Hannibal (2001) Horror Resident Evil: Apocalypse (2004)Horror
Hannibal (2001) Thriller Sum of All Fears, The (2002) Thriller
Saving Silverman (Evil Woman) (2001) Comedy Cats & Dogs (2001) Comedy
Saving Silverman (Evil Woman) (2001) Romance Save the Last Dance (2001) Romance
Down to Earth (2001) Comedy Joe Dirt (2001) Comedy
Down to Earth (2001) Fantasy Haunted Mansion, The (2003) Fantasy
Down to Earth (2001) Romance Joe Dirt (2001) Romance
Mexican, The (2001) Action Knight's Tale, A (2001) Action
Mexican, The (2001) Comedy Knight's Tale, A (2001) Comedy
15 Minutes (2001) Thriller Panic Room (2002) Thriller
Heartbreakers (2001) Comedy Animal, The (2001) Comedy
Heartbreakers (2001) Crime Stepford Wives, The (2004) Thriller
Heartbreakers (2001) Romance Bewitched (2005) Romance
Spy Kids (2001) Action Lara Croft: Tomb Raider (2001) Action
Spy Kids (2001) AdventureLara Croft: Tomb Raider (2001) Adventure
Spy Kids (2001) Children Princess Diaries, The (2001) Children
Spy Kids (2001) Comedy Wedding Planner, The (2001) Comedy
Along Came a Spider (2001) Action Swordfish (2001) Action
Along Came a Spider (2001) Crime Swordfish (2001) Crime
Along Came a Spider (2001) Mystery Ring, The (2002) Mystery
Along Came a Spider (2001) Thriller Signs (2002) Thriller
Blow (2001) Crime Training Day (2001) Crime
Blow (2001) Drama Training Day (2001) Drama
Bridget Jones's Diary (2001) Comedy Super Troopers (2001) Comedy
Bridget Jones's Diary (2001) Drama Others, The (2001) Drama
Bridget Jones's Diary (2001) Romance Punch-Drunk Love (2002) Romance
Joe Dirt (2001) AdventureBulletproof Monk (2003) Adventure
Joe Dirt (2001) Comedy Dr. Dolittle 2 (2001) Comedy
Joe Dirt (2001) Mystery Grudge, The (2004) Mystery
Joe Dirt (2001) Romance Down to Earth (2001) Romance
Crocodile Dundee in Los Angeles (2001)Comedy Heartbreakers (2001) Comedy
Crocodile Dundee in Los Angeles (2001)Drama Scary Movie 4 (2006) Horror
Mummy Returns, The (2001) Action Swordfish (2001) Action
Mummy Returns, The (2001) AdventureRundown, The (2003) Adventure
Mummy Returns, The (2001) Comedy American Pie 2 (2001) Comedy
Mummy Returns, The (2001) Thriller Fast and the Furious, The (2001)Thriller
Knight's Tale, A (2001) Action Mexican, The (2001) Action
Knight's Tale, A (2001) Comedy Mexican, The (2001) Comedy
Knight's Tale, A (2001) Romance Monster's Ball (2001) Romance
Shrek (2001) AdventureMonsters, Inc. (2001) Adventure
Shrek (2001) AnimationMonsters, Inc. (2001) Animation
Shrek (2001) Children Monsters, Inc. (2001) Children
Shrek (2001) Comedy Monsters, Inc. (2001) Comedy
Shrek (2001) Fantasy Monsters, Inc. (2001) Fantasy
Shrek (2001) Romance Monsoon Wedding (2001) Romance
Animal, The (2001) Comedy Heartbreakers (2001) Comedy