import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model  # Model is used directly below
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
from recsysNN_utils import *
pd.set_option("display.precision", 1)
# Load Data, set configuration variables
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre = load_data()
num_user_features = user_train.shape[1] - 3 # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1 # remove movie id at train time
uvs = 3 # user genre vector start
ivs = 3 # item genre vector start
u_s = 3 # start of columns to use in training, user
i_s = 1 # start of columns to use in training, items
scaledata = True # apply StandardScaler to the data if True
print(f"Number of training vectors: {len(item_train)}")
Number of training vectors: 58187
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
[user id] | [rating count] | [rating ave] | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Horror | Mystery | Romance | Sci-Fi | Thriller |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 16 | 4.1 | 3.9 | 5.0 | 0.0 | 0.0 | 4.0 | 4.2 | 4.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 4.2 | 3.9 |
2 | 16 | 4.1 | 3.9 | 5.0 | 0.0 | 0.0 | 4.0 | 4.2 | 4.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 4.2 | 3.9 |
2 | 16 | 4.1 | 3.9 | 5.0 | 0.0 | 0.0 | 4.0 | 4.2 | 4.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 4.2 | 3.9 |
2 | 16 | 4.1 | 3.9 | 5.0 | 0.0 | 0.0 | 4.0 | 4.2 | 4.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 4.2 | 3.9 |
2 | 16 | 4.1 | 3.9 | 5.0 | 0.0 | 0.0 | 4.0 | 4.2 | 4.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 4.2 | 3.9 |
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)
[movie id] | year | ave rating | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Horror | Mystery | Romance | Sci-Fi | Thriller |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6874 | 2003 | 4.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6874 | 2003 | 4.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6874 | 2003 | 4.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
8798 | 2004 | 3.8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8798 | 2004 | 3.8 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
print(f"y_train[:5]: {y_train[:5]}")
y_train[:5]: [4. 4. 4. 3.5 3.5]
# scale training data
if scaledata:
    item_train_save = item_train
    user_train_save = user_train

    scalerItem = StandardScaler()
    scalerItem.fit(item_train)
    item_train = scalerItem.transform(item_train)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train)
    user_train = scalerUser.transform(user_train)

    print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train)))
True
True
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_train, train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")
movie/item training data shape: (46549, 17)
movie/item test data shape: (11638, 17)
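Because each call to train_test_split above uses shuffle=True with the same random_state, the three splits stay row-aligned. scikit-learn can also split several arrays in a single call, which makes that alignment explicit; an equivalent alternative would be:
# one call splits all three arrays with the same shuffle, guaranteeing alignment
# item_train, item_test, user_train, user_test, y_train, y_test = train_test_split(
#     item_train, user_train, y_train, train_size=0.80, shuffle=True, random_state=1)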
The scaled, shuffled data now has a mean of approximately zero in each feature column.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
[user id] | [rating count] | [rating ave] | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Horror | Mystery | Romance | Sci-Fi | Thriller |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0 | 0.6 | 0.7 | 0.6 | 0.6 | 0.7 | 0.7 | 0.5 | 0.7 | 0.2 | 0.3 | 0.3 | 0.5 | 0.5 | 0.8 | 0.5 |
0 | 0 | 1.6 | 1.5 | 1.7 | 0.9 | 1.0 | 1.4 | 0.8 | -1.2 | 1.2 | 1.2 | 1.6 | 0.9 | 1.4 | 1.2 | 1.0 |
0 | 0 | 0.8 | 0.6 | 0.7 | 0.5 | 0.6 | 0.6 | 0.3 | -1.2 | 0.7 | 0.8 | 0.9 | 0.6 | 0.2 | 0.6 | 0.6 |
1 | 0 | -0.1 | 0.2 | -0.1 | 0.3 | 0.7 | 0.3 | 0.2 | 1.0 | -0.5 | -0.7 | -2.1 | 0.5 | 0.7 | 0.3 | 0.0 |
-1 | 0 | -1.3 | -0.8 | -0.8 | 0.1 | -0.1 | -1.1 | -0.9 | -1.2 | -1.5 | -0.6 | -0.5 | -0.6 | -0.9 | -0.4 | -0.9 |
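As a quick sanity check (not part of the original lab), we can confirm the zero-mean claim by recombining the splits:
# the scaler was fit on the full set, so train + test together are ~zero-mean
print(np.allclose(np.vstack([user_train, user_test]).mean(axis=0), 0, atol=1e-6))  # True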
scaler = MinMaxScaler((-1, 1))
scaler.fit(y_train.reshape(-1, 1))
ynorm_train = scaler.transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)
(46549, 1) (11638, 1)
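Since ratings run from 0.5 to 5.0, the (-1, 1) MinMaxScaler maps that range linearly onto [-1, 1]. A quick illustration, assuming y_train spans the full rating range:
# endpoints and midpoint of the rating scale after min-max scaling
print(scaler.transform(np.array([[0.5], [2.75], [5.0]])))  # ~[[-1.], [0.], [1.]]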
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(num_outputs),
])
item_NN = tf.keras.models.Sequential([
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(num_outputs),
])
# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)
# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)
# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])
# specify the inputs and output of the model
model = Model([input_user, input_item], output)
model.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 14)] 0 []
input_2 (InputLayer) [(None, 16)] 0 []
sequential (Sequential) (None, 32) 40864 ['input_1[0][0]']
sequential_1 (Sequential) (None, 32) 41376 ['input_2[0][0]']
tf.math.l2_normalize (TFOpLambda)    (None, 32)    0        ['sequential[0][0]']
tf.math.l2_normalize_1 (TFOpLambda)  (None, 32)    0        ['sequential_1[0][0]']
dot (Dot)                            (None, 1)     0        ['tf.math.l2_normalize[0][0]', 'tf.math.l2_normalize_1[0][0]']
==================================================================================================
Total params: 82,240
Trainable params: 82,240
Non-trainable params: 0
__________________________________________________________________________________________________
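Because both embeddings are L2-normalized before the Dot layer, the model's raw output is the cosine similarity between the user and item vectors, which lies in [-1, 1] and matches the min-max scaled target range. A quick standalone check of that identity:
# the dot product of unit vectors equals the cosine similarity of the raw vectors
a = np.array([1.0, 2.0, 3.0]); b = np.array([2.0, 0.5, 1.0])
cos_sim = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
dot_of_unit = (a / np.linalg.norm(a)) @ (b / np.linalg.norm(b))
print(np.isclose(cos_sim, dot_of_unit))  # True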
We’ll use a mean squared error loss and an Adam optimizer.
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
loss=cost_fn)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
Num GPUs Available: 1
tf.random.set_seed(1)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], ynorm_train, epochs=30)
Epoch 1/30
1455/1455 [==============================] - 21s 11ms/step - loss: 0.1249
Epoch 2/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1183
Epoch 3/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1167
Epoch 4/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1148
Epoch 5/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1131
Epoch 6/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1110
Epoch 7/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1099
Epoch 8/30
1455/1455 [==============================] - 15s 11ms/step - loss: 0.1090
Epoch 9/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1081
Epoch 10/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1075
Epoch 11/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1066
Epoch 12/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1065
Epoch 13/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1058
Epoch 14/30
1455/1455 [==============================] - 15s 11ms/step - loss: 0.1051
Epoch 15/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1045
Epoch 16/30
1455/1455 [==============================] - 15s 10ms/step - loss: 0.1039
Epoch 17/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1033
Epoch 18/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1030
Epoch 19/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1023
Epoch 20/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1020
Epoch 21/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1014
Epoch 22/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1010
Epoch 23/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1005
Epoch 24/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.1001
Epoch 25/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0999
Epoch 26/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0993
Epoch 27/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0990
Epoch 28/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0988
Epoch 29/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0983
Epoch 30/30
1455/1455 [==============================] - 16s 11ms/step - loss: 0.0980
<keras.callbacks.History at 0x2430f6b67f0>
Evaluate the model to determine the loss on the test data. It is comparable to the training loss, indicating the model has not substantially overfit the training data.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)
364/364 [==============================] - 2s 5ms/step - loss: 0.1045
0.10449469089508057
First, we’ll create a new user and have the model suggest movies for that user. After trying the example user below, feel free to change the user content to match your own preferences and see what the model suggests. Note that ratings are between 0.5 and 5.0, inclusive, in half-step increments.
new_user_id = 5000
new_rating_ave = 1.0
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1
new_rating_count = 3
user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
new_action, new_adventure, new_animation, new_childrens,
new_comedy, new_crime, new_documentary,
new_drama, new_fantasy, new_horror, new_mystery,
new_romance, new_scifi, new_thriller]])
Let’s look at the top predicted movies for the new user. Recall that the user vector favored Comedy, Romance, and Sci-Fi. Below, we’ll use item_vecs, a set of movie/item vectors with one vector for each movie in the training/test set. Each is paired with the user vector above, the pairs are scaled, and ratings are predicted for every movie for our new user.
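The gen_user_vecs helper lives in recsysNN_utils; a minimal sketch of what it plausibly does, judging from how it is used below (an assumption, not the actual implementation):
def gen_user_vecs_sketch(user_vec, num_items):
    # tile the single user row so there is one copy to pair with each item vector
    return np.tile(user_vec, (num_items, 1))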
# generate and replicate the user vector to match the number of movies in the data set
user_vecs = gen_user_vecs(user_vec,len(item_vecs))
# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s,
scaler, scalerUser, scalerItem, scaledata=scaledata)
print_pred_movies(sorted_ypu, sorted_user, sorted_items, movie_dict, maxcount = 10)
59/59 [==============================] - 0s 4ms/step
y_p | movie id | rating ave | title | genres |
---|---|---|---|---|
4.84793 | 76293 | 3.31818 | Date Night (2010) | Action|Comedy|Romance |
4.82386 | 69406 | 3.5 | Proposal, The (2009) | Comedy|Romance |
4.8197 | 58047 | 3.42857 | Definitely, Maybe (2008) | Comedy|Drama|Romance |
4.81538 | 62155 | 3.35 | Nick and Norah's Infinite Playlist (2008) | Comedy|Drama|Romance |
4.80771 | 99007 | 3.5 | Warm Bodies (2013) | Comedy|Horror|Romance |
4.80431 | 86882 | 3.56 | Midnight in Paris (2011) | Comedy|Fantasy|Romance |
4.80238 | 56949 | 3.3 | 27 Dresses (2008) | Comedy|Romance |
4.79867 | 54004 | 3.45455 | I Now Pronounce You Chuck and Larry (2007) | Comedy|Romance |
4.77714 | 5377 | 3.71591 | About a Boy (2002) | Comedy|Drama|Romance |
4.77429 | 5992 | 3.7 | Hours, The (2002) | Drama|Romance |
Let’s look at the predictions for user 36, one of the users in the data set. We can compare the model’s predicted ratings with the user’s actual ratings. Note that movies with multiple genres show up multiple times in the training data; for example, ‘The Time Machine’ has three genres: Adventure, Action, and Sci-Fi.
uid = 36
# form a set of user vectors. This is the same vector, transformed and repeated.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train), item_vecs, user_to_genre)
# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler,
scalerUser, scalerItem, scaledata=scaledata)
sorted_y = y_vecs[sorted_index]
#print sorted predictions
print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user, sorted_items, item_features, ivs, uvs, movie_dict, maxcount = 10)
59/59 [==============================] - 0s 4ms/step
y_p | y | user | user genre ave | movie rating ave | title | genres |
---|---|---|---|---|---|---|
3.2 | 3.0 | 36 | 3.00 | 2.86 | Time Machine, The (2002) | Adventure |
3.1 | 3.0 | 36 | 3.00 | 2.86 | Time Machine, The (2002) | Action |
3.0 | 3.0 | 36 | 3.00 | 2.86 | Time Machine, The (2002) | Sci-Fi |
2.0 | 1.0 | 36 | 1.50 | 4.00 | Beautiful Mind, A (2001) | Drama |
1.9 | 1.5 | 36 | 1.75 | 3.52 | Road to Perdition (2002) | Crime |
1.9 | 2.0 | 36 | 1.75 | 3.52 | Gangs of New York (2002) | Crime |
1.8 | 1.0 | 36 | 1.00 | 4.00 | Beautiful Mind, A (2001) | Romance |
1.6 | 1.5 | 36 | 1.50 | 3.52 | Road to Perdition (2002) | Drama |
1.6 | 2.0 | 36 | 1.50 | 3.52 | Gangs of New York (2002) | Drama |
def sq_dist(a, b):
    """
    Returns the squared distance between two vectors.
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float): squared distance
    """
    d = np.sum(np.square(a - b))
    return d
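Because the movie embeddings we will compare are L2-normalized, squared distance and cosine similarity induce the same ranking: for unit vectors, ||a - b||^2 = 2 - 2 a.b. A quick numeric check of that identity:
# verify the unit-vector identity with two random unit vectors
rng = np.random.default_rng(0)
a = rng.standard_normal(32); a /= np.linalg.norm(a)
b = rng.standard_normal(32); b /= np.linalg.norm(b)
print(np.isclose(sq_dist(a, b), 2 - 2 * np.dot(a, b)))  # True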
input_item_m = tf.keras.layers.Input(shape=(num_item_features,)) # input layer
vm_m = item_NN(input_item_m) # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1) # incorporate normalization as was done in the original model
model_m = Model(input_item_m, vm_m)
model_m.summary()
Model: "model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_3 (InputLayer) [(None, 16)] 0
sequential_1 (Sequential) (None, 32) 41376
tf.math.l2_normalize_2 (TFOpLambda)  (None, 32)  0
=================================================================
Total params: 41,376
Trainable params: 41,376
Non-trainable params: 0
_________________________________________________________________
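Note that model_m wraps the already-trained item_NN rather than copying it, so the weights are shared and no retraining is needed. A quick check (assuming, as the summary above suggests, that the Functional API registers the Sequential as a layer of model_m):
# the Sequential inside model_m is the very same object as the trained item_NN
print(any(layer is item_NN for layer in model_m.layers))  # True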
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")
59/59 [==============================] - 0s 2ms/step
size of all predicted movie feature vectors: (1883, 32)
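The next cell builds the full pairwise distance matrix with a double loop over all 1883 movies. For larger catalogs the same matrix can be computed in one vectorized step via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b; an optional alternative, equal up to floating-point error:
# vectorized pairwise squared distances; matches the loop-built dist below
sq_norms = np.sum(vms**2, axis=1)
dist_vectorized = sq_norms[:, None] + sq_norms[None, :] - 2.0 * (vms @ vms.T)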
count = 50
dim = len(vms)
dist = np.zeros((dim,dim))
for i in range(dim):
for j in range(dim):
dist[i,j] = sq_dist(vms[i, :], vms[j, :])
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0])) # mask the diagonal
disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
min_idx = np.argmin(m_dist[i])
movie1_id = int(item_vecs[i,0])
movie2_id = int(item_vecs[min_idx,0])
genre1,_ = get_item_genre(item_vecs[i,:], ivs, item_features)
genre2,_ = get_item_genre(item_vecs[min_idx,:], ivs, item_features)
disp.append( [movie_dict[movie1_id]['title'], genre1,
movie_dict[movie2_id]['title'], genre2]
)
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
table
movie1 | genres | movie2 | genres |
---|---|---|---|
Save the Last Dance (2001) | Drama | John Q (2002) | Drama |
Save the Last Dance (2001) | Romance | Wedding Planner, The (2001) | Romance |
Wedding Planner, The (2001) | Comedy | Spy Kids (2001) | Comedy |
Wedding Planner, The (2001) | Romance | Sweetest Thing, The (2002) | Romance |
Hannibal (2001) | Horror | Resident Evil: Apocalypse (2004) | Horror |
Hannibal (2001) | Thriller | Sum of All Fears, The (2002) | Thriller |
Saving Silverman (Evil Woman) (2001) | Comedy | Cats & Dogs (2001) | Comedy |
Saving Silverman (Evil Woman) (2001) | Romance | Save the Last Dance (2001) | Romance |
Down to Earth (2001) | Comedy | Joe Dirt (2001) | Comedy |
Down to Earth (2001) | Fantasy | Haunted Mansion, The (2003) | Fantasy |
Down to Earth (2001) | Romance | Joe Dirt (2001) | Romance |
Mexican, The (2001) | Action | Knight's Tale, A (2001) | Action |
Mexican, The (2001) | Comedy | Knight's Tale, A (2001) | Comedy |
15 Minutes (2001) | Thriller | Panic Room (2002) | Thriller |
Heartbreakers (2001) | Comedy | Animal, The (2001) | Comedy |
Heartbreakers (2001) | Crime | Stepford Wives, The (2004) | Thriller |
Heartbreakers (2001) | Romance | Bewitched (2005) | Romance |
Spy Kids (2001) | Action | Lara Croft: Tomb Raider (2001) | Action |
Spy Kids (2001) | Adventure | Lara Croft: Tomb Raider (2001) | Adventure |
Spy Kids (2001) | Children | Princess Diaries, The (2001) | Children |
Spy Kids (2001) | Comedy | Wedding Planner, The (2001) | Comedy |
Along Came a Spider (2001) | Action | Swordfish (2001) | Action |
Along Came a Spider (2001) | Crime | Swordfish (2001) | Crime |
Along Came a Spider (2001) | Mystery | Ring, The (2002) | Mystery |
Along Came a Spider (2001) | Thriller | Signs (2002) | Thriller |
Blow (2001) | Crime | Training Day (2001) | Crime |
Blow (2001) | Drama | Training Day (2001) | Drama |
Bridget Jones's Diary (2001) | Comedy | Super Troopers (2001) | Comedy |
Bridget Jones's Diary (2001) | Drama | Others, The (2001) | Drama |
Bridget Jones's Diary (2001) | Romance | Punch-Drunk Love (2002) | Romance |
Joe Dirt (2001) | Adventure | Bulletproof Monk (2003) | Adventure |
Joe Dirt (2001) | Comedy | Dr. Dolittle 2 (2001) | Comedy |
Joe Dirt (2001) | Mystery | Grudge, The (2004) | Mystery |
Joe Dirt (2001) | Romance | Down to Earth (2001) | Romance |
Crocodile Dundee in Los Angeles (2001) | Comedy | Heartbreakers (2001) | Comedy |
Crocodile Dundee in Los Angeles (2001) | Drama | Scary Movie 4 (2006) | Horror |
Mummy Returns, The (2001) | Action | Swordfish (2001) | Action |
Mummy Returns, The (2001) | Adventure | Rundown, The (2003) | Adventure |
Mummy Returns, The (2001) | Comedy | American Pie 2 (2001) | Comedy |
Mummy Returns, The (2001) | Thriller | Fast and the Furious, The (2001) | Thriller |
Knight's Tale, A (2001) | Action | Mexican, The (2001) | Action |
Knight's Tale, A (2001) | Comedy | Mexican, The (2001) | Comedy |
Knight's Tale, A (2001) | Romance | Monster's Ball (2001) | Romance |
Shrek (2001) | Adventure | Monsters, Inc. (2001) | Adventure |
Shrek (2001) | Animation | Monsters, Inc. (2001) | Animation |
Shrek (2001) | Children | Monsters, Inc. (2001) | Children |
Shrek (2001) | Comedy | Monsters, Inc. (2001) | Comedy |
Shrek (2001) | Fantasy | Monsters, Inc. (2001) | Fantasy |
Shrek (2001) | Romance | Monsoon Wedding (2001) | Romance |
Animal, The (2001) | Comedy | Heartbreakers (2001) | Comedy |