# The code was rewritten for TensorFlow 2.x (running the TF1-style graph through tf.compat.v1).
# TensorFlow implementation of a collaborative filtering recommendation system,
# and of a softmax model for recommendations.
# Google colab url:
# https://colab.research.google.com/github/google/eng-edu/blob/master/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems
from __future__ import print_function
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
#tf.disable_v2_behavior()
import altair as alt
#alt.renderers.enable('vegascope')
import gspread
#tf.logging.set_verbosity(tf.logging.ERROR)
# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Returns a filtered DataFrame, by applying function to key."""
  return df[function(df[key])]

def flatten_cols(df):
  """Flattens a MultiIndex column index into single, space-joined column names."""
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols
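# A minimal sketch of the two helpers above, on a small throwaway frame (the
# column names 'a' and 'b' are made up purely for illustration):
_demo = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
print(_demo.mask('a', lambda s: s > 1))  # keeps only the rows where a > 1
_agg = _demo.groupby('a', as_index=False).agg({'b': ['count', 'mean']})
print(_agg.flatten_cols())  # MultiIndex columns become 'b count', 'b mean'
del _demo, _agg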
# Load each data file of the MovieLens 100k dataset.
user_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')
rating_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=rating_cols, encoding='latin-1')
# The movie file contains a binary feature for each genre.
genre_cols = ["genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
              "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
              "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genre_cols
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movie_cols, encoding='latin-1')
print(movies.head())
# Since the ids start at 1, we shift them to start at 0.
users['user_id'] = users['user_id'].apply(lambda x: x - 1)
movies['movie_id'] = movies['movie_id'].apply(lambda x: x - 1)
movies['year'] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings['movie_id'] = ratings['movie_id'].apply(lambda x: x - 1)
ratings['user_id'] = ratings['user_id'].apply(lambda x: x - 1)
# The ratings are shifted as well, so they range over 0..4 instead of 1..5.
ratings['rating'] = ratings['rating'].apply(lambda x: x - 1)
# Compute the number of movies to which each genre is assigned.
genre_occurences = movies[genre_cols].sum().to_dict()
print(genre_occurences)
# Since some movies can belong to more than one genre, we create two different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.
def mark_genre(movies, genres):
  def sample_genre(gs):
    # gs is the tuple of binary genre indicators of one movie; e.g. for the
    # movie with id 1, gs is (0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).
    active = [genre for genre, g in zip(genres, gs) if g == 1]
    if len(active) == 0:
      active = ['Other']
    return np.random.choice(active)
  def all_genre(gs):
    active = [genre for genre, g in zip(genres, gs) if g == 1]
    if len(active) == 0:
      active = ['Other']
    return '-'.join(active)
  movies['genre'] = [sample_genre(gs) for gs in zip(*[movies[genre] for genre in genres])]
  movies['all_genres'] = [all_genre(gs) for gs in zip(*[movies[genre] for genre in genres])]

mark_genre(movies, genre_cols)
# Create one merged DataFrame containing all the MovieLens data.
movielens = ratings.merge(movies, on='movie_id').merge(users, on='user_id')
def split_dataframe(df, holdout_fraction=0.1):
  """Splits a DataFrame into training and test sets.
  Args:
    df: a DataFrame.
    holdout_fraction: the fraction of rows to place in the test set.
  Returns:
    train: the DataFrame of remaining rows.
    test: the DataFrame of sampled rows.
  """
  test = df.sample(frac=holdout_fraction, replace=False)
  train = df[~df.index.isin(test.index)]
  return train, test
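# Quick sanity check of the split, on the ratings frame loaded above:
_train_df, _test_df = split_dataframe(ratings)
print('split: %d train rows, %d test rows' % (len(_train_df), len(_test_df)))
del _train_df, _test_df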
#Before we dive into model building, let's inspect our MovieLens dataset. It is usually helpful to understand the statistics of the dataset.
#Users
#We start by printing some basic statistics describing the numeric user features.
# Altair visualization code.
# The following functions are used to generate interactive Altair charts.
# We will display histograms of the data, sliced by a given attribute.
occupation_filter = alt.selection_multi(fields=["occupation"])
occupation_chart = alt.Chart().mark_bar().encode(
    x="count()",
    y=alt.Y("occupation:N"),
    color=alt.condition(
        occupation_filter,
        alt.Color("occupation:N", scale=alt.Scale(scheme='category20')),
        alt.value("lightgray")),
).properties(width=300, height=300, selection=occupation_filter)
def filtered_hist(field, label, filter):
  """Creates a layered chart of histograms.
  The first layer (light gray) contains the histogram of the full data, and the
  second contains the histogram of the filtered data.
  Args:
    field: the field for which to generate the histogram.
    label: String label of the histogram.
    filter: an alt.Selection object to be used to filter the data.
  """
  base = alt.Chart().mark_bar().encode(
      x=alt.X(field, bin=alt.Bin(maxbins=10), title=label),
      y="count()",
  ).properties(
      width=300,
  )
  return alt.layer(
      base.transform_filter(filter),
      base.encode(color=alt.value('lightgray'), opacity=alt.value(.7)),
  ).resolve_scale(y='independent')
users_ratings = (
    ratings
    .groupby('user_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols()
    .merge(users, on='user_id'))
#users_ratings_occup = users_ratings.groupby('occupation',as_index=False).agg({'rating count': ['count']})
print(users_ratings[users_ratings['occupation'].isin(['administrator'])])
# Create a chart for the count, and one for the mean.
alt.hconcat(
    filtered_hist('rating count', '# ratings / user', occupation_filter),
    filtered_hist('rating mean', 'mean user rating', occupation_filter),
    occupation_chart,
    data=users_ratings)
movies_ratings = movies.merge(
    ratings
    .groupby('movie_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols(),
    on='movie_id')
genre_filter = alt.selection_multi(fields=['genre'])
genre_chart = alt.Chart().mark_bar().encode(
    x="count()",
    y=alt.Y('genre'),
    color=alt.condition(
        genre_filter,
        alt.Color("genre:N"),
        alt.value('lightgray'))
).properties(height=300, selection=genre_filter)
alt.hconcat(
    filtered_hist('rating count', '# ratings / movie', genre_filter),
    filtered_hist('rating mean', 'mean movie rating', genre_filter),
    genre_chart,
    data=movies_ratings)
'''The rating matrix can be very large and, in general, most of the entries are
unobserved, since a given user will only rate a small subset of movies. For
efficient representation, we will use a tf.SparseTensor. A SparseTensor uses
three tensors to represent the matrix: tf.SparseTensor(indices, values,
dense_shape) represents a tensor where a value A_ij = a is encoded by setting
indices[k] = [i, j] and values[k] = a. The last tensor, dense_shape, specifies
the shape of the full underlying matrix. For example, the ratings

  user_id  movie_id  rating
        0         0     5.0
        0         1     3.0
        1         3     1.0

are represented as

  SparseTensor(
      indices=[[0, 0], [0, 1], [1, 3]],
      values=[5.0, 3.0, 1.0],
      dense_shape=[2, 4])
'''
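# The toy example from the comment above, built explicitly (illustrative
# values only, not real ratings):
_toy = tf.sparse.SparseTensor(indices=[[0, 0], [0, 1], [1, 3]],
                              values=[5.0, 3.0, 1.0],
                              dense_shape=[2, 4])
del _toy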
def build_rating_sparse_tensor(rating_df):
  """Returns a tf.SparseTensor of shape [num_users, num_movies] holding the ratings."""
  indices = rating_df[['user_id', 'movie_id']].values
  # Cast to float32 so the values match the dtype of the embedding dot products.
  values = rating_df['rating'].values.astype(np.float32)
  dense_shape = [users.shape[0], movies.shape[0]]
  return tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)

def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
  """MSE over the observed entries: materializes the full prediction matrix
  U V^T, then gathers the entries corresponding to the observed pairs."""
  predictions = tf.gather_nd(
      tf.matmul(user_embeddings, movie_embeddings, transpose_b=True),
      indices=sparse_ratings.indices)
  loss = tf.compat.v1.losses.mean_squared_error(sparse_ratings.values, predictions)
  return loss
'''
Note: One approach is to compute the full prediction matrix UV^T, then gather
the entries corresponding to the observed pairs. The memory cost of this
approach is O(NM). For the MovieLens dataset, this is fine, as the dense N x M
matrix is small enough to fit in memory (N = 943, M = 1682).
Another approach (the alternate solution below, whose redefinition replaces the
function above) is to only gather the embeddings of the observed pairs, then
compute their dot products. The memory cost is O(|Omega| d), where d is the
embedding dimension. In our case, |Omega| = 10^5 and the embedding dimension is
on the order of 10, so the memory cost of both methods is comparable. But when
the number of users or movies is much larger, the first approach becomes
infeasible. See the quick check after the function below.
'''
def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
  """MSE over the observed entries: gathers only the embeddings of the
  observed pairs and computes their dot products."""
  predictions = tf.reduce_sum(
      tf.gather(user_embeddings, sparse_ratings.indices[:, 0]) *
      tf.gather(movie_embeddings, sparse_ratings.indices[:, 1]), axis=1)
  loss = tf.compat.v1.losses.mean_squared_error(sparse_ratings.values, predictions)
  return loss
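# Back-of-the-envelope check of the two memory costs discussed above; d = 10
# is an assumed embedding dimension, chosen only for this comparison:
_N, _M, _d = users.shape[0], movies.shape[0], 10
print('dense N*M: %d entries vs gathered |Omega|*d: %d entries'
      % (_N * _M, len(ratings) * _d))
del _N, _M, _d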
class CFModel(object):
  """Simple class that represents a collaborative filtering model."""
  def __init__(self, embedding_vars, loss, metrics=None):
    """Initializes a CFModel.
    Args:
      embedding_vars: a dictionary of tf.Variables.
      loss: a float Tensor, the loss to optimize.
      metrics: optional list of dictionaries of Tensors to monitor during training.
    """
    self._embedding_vars = embedding_vars
    self._loss = loss
    self._metrics = metrics
    self._embeddings = {k: None for k in embedding_vars}
    self._session = None

  @property
  def embeddings(self):
    """The trained embeddings, as a dictionary of numpy arrays."""
    return self._embeddings

  def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
            optimizer=tf.compat.v1.train.GradientDescentOptimizer):
    """Trains the model, and optionally plots the metric curves."""
    with self._loss.graph.as_default():
      opt = optimizer(learning_rate)
      train_op = opt.minimize(self._loss)
      local_init_op = tf.group(tf.compat.v1.variables_initializer(opt.variables()),
                               tf.compat.v1.local_variables_initializer())
      if self._session is None:
        self._session = tf.compat.v1.Session()
        with self._session.as_default():
          self._session.run(tf.compat.v1.global_variables_initializer())
          self._session.run(tf.compat.v1.tables_initializer())
          tf.compat.v1.train.start_queue_runners()
    with self._session.as_default():
      local_init_op.run()
      iterations = []
      metrics = self._metrics or ({},)
      # Iterate over `metrics`, not `self._metrics`, so that a model built
      # with metrics=None can still train.
      metrics_vals = [collections.defaultdict(list) for _ in metrics]
      # Train, and periodically record the metric values.
      for i in range(num_iterations + 1):
        _, results = self._session.run((train_op, metrics))
        if (i % 10 == 0) or i == num_iterations:
          print('\r iteration %d: ' % i + ', '.join(
              ['%s=%f' % (k, v) for r in results for k, v in r.items()]), end='')
          iterations.append(i)
          for metrics_val, result in zip(metrics_vals, results):
            for k, v in result.items():
              metrics_val[k].append(v)
      for k, v in self._embedding_vars.items():
        self._embeddings[k] = v.eval()
      if plot_results:
        num_subplots = len(metrics) + 1
        fig = plt.figure()
        fig.set_size_inches(num_subplots * 10, 8)
        for i, metrics_val in enumerate(metrics_vals):
          ax = fig.add_subplot(1, num_subplots, i + 1)
          for k, v in metrics_val.items():
            ax.plot(iterations, v, label=k)
          ax.set_xlim([1, num_iterations])
          ax.legend()
      return results
def build_model(ratings, embedding_dim=3, init_stddev=1.):
  """Builds a plain matrix factorization CFModel.
  Args:
    ratings: the DataFrame of ratings.
    embedding_dim: the dimension of the embedding vectors.
    init_stddev: float, the standard deviation of the random initial embeddings.
  Returns:
    a CFModel.
  """
  # Split the dataset into train and test sets.
  train, test = split_dataframe(ratings)
  # Sparse tensor representations of the train and test sets.
  a_train = build_rating_sparse_tensor(train)
  a_test = build_rating_sparse_tensor(test)
  # Initialize the embedding variables using a normal distribution.
  U = tf.Variable(tf.random.normal([a_train.dense_shape[0], embedding_dim], stddev=init_stddev))
  V = tf.Variable(tf.random.normal([a_train.dense_shape[1], embedding_dim], stddev=init_stddev))
  train_loss = sparse_mean_square_error(a_train, U, V)
  test_loss = sparse_mean_square_error(a_test, U, V)
  metrics = {'train_error': train_loss, 'test_error': test_loss}
  embeddings = {'user_id': U, 'movie_id': V}
  return CFModel(embeddings, train_loss, [metrics])

model = build_model(ratings, embedding_dim=30, init_stddev=0.5)
model.train(num_iterations=100, learning_rate=10.)
DOT = 'dot'
COSINE = 'cosine'
def compute_score(query_embedding, item_embeddings, measure=DOT):
  """Computes the scores of the candidates given a query.
  Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding
      of item i.
    measure: a string specifying the similarity measure to be used. Can be
      either DOT or COSINE.
  Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
  """
  u = query_embedding
  V = item_embeddings
  if measure == COSINE:
    u = u / np.linalg.norm(u)
    V = V / np.linalg.norm(V, axis=1, keepdims=True)
  scores = u.dot(V.T)
  return scores
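# Tiny illustration of the two measures on made-up vectors (hypothetical data):
_q = np.array([1.0, 0.0])
_items = np.array([[2.0, 0.0], [0.0, 1.0]])
print(compute_score(_q, _items, DOT))     # -> [2. 0.]
print(compute_score(_q, _items, COSINE))  # -> [1. 0.]
del _q, _items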
# Equipped with this function, we can compute recommendations, where the query
# embedding can be either a user embedding or a movie embedding.
def user_recommendation(user_id, model, measure=DOT, exclude_rated=False, k=6):
  scores = compute_score(model.embeddings['user_id'][user_id],
                         model.embeddings['movie_id'], measure)
  score_key = 'score ' + measure
  df = pd.DataFrame({score_key: list(scores),
                     'movie_id': movies['movie_id'],
                     'title': movies['title'],
                     'genres': movies['all_genres']})
  if exclude_rated:
    # Filter out movies the user has already rated.
    rated_movie_ids = ratings[ratings['user_id'] == user_id]['movie_id'].values
    df = df[df.movie_id.apply(lambda x: x not in rated_movie_ids)]
  display.display(df.sort_values(score_key, ascending=False).head(k))
def movie_neighbors(movie_title, model, measure=DOT, k=6):
  query_movie_ids = movies[movies['title'].str.contains(movie_title)].index.values
  titles = movies.iloc[query_movie_ids]['title'].values
  if len(query_movie_ids) == 0:
    raise ValueError('Found no movie with title: %s' % movie_title)
  print('Nearest neighbors of: %s.' % titles[0])
  if len(titles) > 1:
    print('[Found more than one matching movie. Other candidates: {}]'.format(', '.join(titles[1:])))
  movie_id = query_movie_ids[0]
  score_key = 'score ' + measure
  scores = compute_score(model.embeddings['movie_id'][movie_id],
                         model.embeddings['movie_id'], measure)
  df = pd.DataFrame({score_key: scores,
                     'title': movies['title'],
                     'genres': movies['all_genres']})
  display.display(df.sort_values(score_key, ascending=False).head(k))
user_recommendation(940, model, exclude_rated=True)
movie_neighbors("Aladdin", model, DOT)
movie_neighbors("Aladdin", model, COSINE)
def movie_embedding_norm(models):
  """Visualizes the norm and number of ratings of the movie embeddings.
  Args:
    models: a single CFModel, or a list of CFModels.
  """
  if not isinstance(models, list):
    models = [models]
  df = pd.DataFrame({
      'title': movies['title'],
      'genre': movies['genre'],
      'num_ratings': movies_ratings['rating count'],
  })
  charts = []
  brush = alt.selection_interval()
  for i, model in enumerate(models):
    norm_key = 'norm' + str(i)
    df[norm_key] = np.linalg.norm(model.embeddings["movie_id"], axis=1)
    nearest = alt.selection(
        type='single', encodings=['x', 'y'], on='mouseover', nearest=True,
        empty='none')
    base = alt.Chart().mark_circle().encode(
        x='num_ratings',
        y=norm_key,
        color=alt.condition(brush, alt.value('#4c78a8'), alt.value('lightgray'))
    ).properties(
        selection=nearest).add_selection(brush)
    text = alt.Chart().mark_text(align='center', dx=5, dy=-5).encode(
        x='num_ratings', y=norm_key,
        text=alt.condition(nearest, 'title', alt.value('')))
    charts.append(alt.layer(base, text))
  return alt.hconcat(*charts, data=df)
def visualize_movie_embeddings(data, x, y):
  nearest = alt.selection(
      type='single', encodings=['x', 'y'], on='mouseover', nearest=True,
      empty='none')
  base = alt.Chart().mark_circle().encode(
      x=x,
      y=y,
      color=alt.condition(genre_filter, "genre", alt.value("whitesmoke")),
  ).properties(
      width=600,
      height=600,
      selection=nearest)
  text = alt.Chart().mark_text(align='left', dx=5, dy=-5).encode(
      x=x,
      y=y,
      text=alt.condition(nearest, 'title', alt.value('')))
  return alt.hconcat(alt.layer(base, text), genre_chart, data=data)
def tsne_movie_embeddings(model):
  """Visualizes the movie embeddings, projected using t-SNE with cosine distance.
  Args:
    model: a CFModel object.
  """
  tsne = sklearn.manifold.TSNE(
      n_components=2, perplexity=40, metric='cosine', early_exaggeration=10.0,
      init='pca', verbose=True, n_iter=400)
  print('Running t-SNE...')
  V_proj = tsne.fit_transform(model.embeddings["movie_id"])
  movies.loc[:, 'x'] = V_proj[:, 0]
  movies.loc[:, 'y'] = V_proj[:, 1]
  return visualize_movie_embeddings(movies, 'x', 'y')
movie_embedding_norm(model)
model_lowinit = build_model(ratings, embedding_dim=30, init_stddev=0.05)
model_lowinit.train(num_iterations=100, learning_rate=10.)
movie_neighbors("Aladdin", model_lowinit, DOT)
movie_neighbors("Aladdin", model_lowinit, COSINE)
movie_embedding_norm([model, model_lowinit])
def gravity(U, V):
  """Creates a gravity loss given two embedding matrices.
  gravity(U, V) = 1/(N*M) * sum_{i,j} <u_i, v_j>^2, computed cheaply via the
  identity sum_{i,j} <u_i, v_j>^2 = sum((U^T U) * (V^T V)); it pushes the
  prediction of every pair towards zero, including the unobserved ones.
  """
  return 1. / tf.cast(tf.shape(U)[0] * tf.shape(V)[0], tf.float32) * tf.reduce_sum(
      tf.matmul(U, U, transpose_a=True) * tf.matmul(V, V, transpose_a=True))
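# Numeric check of the identity used in gravity(), on small random numpy
# matrices (independent of the TF graph):
_Un, _Vn = np.random.randn(4, 3), np.random.randn(5, 3)
assert np.isclose(np.sum((_Un @ _Vn.T) ** 2),
                  np.sum((_Un.T @ _Un) * (_Vn.T @ _Vn)))
del _Un, _Vn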
def build_regularized_model(ratings_df, embedding_dim=3, regularization_coeff=.1,
                            gravity_coeff=1., init_stddev=0.1):
  """Builds a matrix factorization CFModel with L2 and gravity regularization."""
  # Split the ratings DataFrame into train and test sets.
  train, test = split_dataframe(ratings_df)
  train_sparse_tensor = build_rating_sparse_tensor(train)
  test_sparse_tensor = build_rating_sparse_tensor(test)
  U = tf.Variable(tf.random.normal([train_sparse_tensor.dense_shape[0], embedding_dim], stddev=init_stddev))
  V = tf.Variable(tf.random.normal([train_sparse_tensor.dense_shape[1], embedding_dim], stddev=init_stddev))
  error_train = sparse_mean_square_error(train_sparse_tensor, U, V)
  error_test = sparse_mean_square_error(test_sparse_tensor, U, V)
  gravity_loss = gravity_coeff * gravity(U, V)
  # L2 regularization: the average squared norm of the embeddings.
  regularization_loss = regularization_coeff * (
      tf.reduce_sum(U * U) / tf.cast(tf.shape(U)[0], tf.float32) +
      tf.reduce_sum(V * V) / tf.cast(tf.shape(V)[0], tf.float32))
  total_loss = error_train + gravity_loss + regularization_loss
  losses = {'train_error_observed': error_train, 'test_error_observed': error_test}
  loss_components = {
      'observed_loss': error_train,
      'regularization_loss': regularization_loss,
      'gravity_loss': gravity_loss,
  }
  embeddings = {'user_id': U, 'movie_id': V}
  return CFModel(embeddings, total_loss, [losses, loss_components])
reg_model = build_regularized_model(ratings, regularization_coeff=0.1, gravity_coeff=1.0, embedding_dim=35,
init_stddev=0.05)
reg_model.train(num_iterations=200, learning_rate=20.)
user_recommendation(940, reg_model, DOT, exclude_rated=True, k=10)
movie_embedding_norm([model, model_lowinit, reg_model])
tsne_movie_embeddings(model_lowinit)
tsne_movie_embeddings(reg_model)