-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.py
102 lines (76 loc) · 5.04 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import time
import numpy as np
from Factors_Update_Functions import Update_user_biases,Update_user_factors,Update_movie_biases,Update_movie_factors,update_feature_factors,calc_metrics
from Helper_Functions import plot_likelihood, plot_rmse ,setup_logging, setup_experiment_folder ,save_model,Load_training_data,Load_idx_maps,Load_test_data
# saving plots and training logs
experiment_name = "32_million_trials" # Customize this name
experiment_folder = setup_experiment_folder(experiment_name)
logger = setup_logging(experiment_folder)
# by running Data_preprocessing, this data will be created in the data folder
data_folder = "TRAIN_TEST_DATA"
users_train,movies_train,movies_train_idxes,users_train_idxes,movies_genres_array = Load_training_data(data_folder)
users_test,movies_test,users_test_idxes,movies_test_idxes = Load_test_data(data_folder)
user_idx_map, movie_idx_map ,idx_to_user,idx_to_movie,genre_to_idx,genre_to_movies = Load_idx_maps(data_folder)
print("Training data is loaded")
# users + bias update only
n_users = len(user_idx_map) ; n_movies = len(movie_idx_map)
K_factors = 30 ; n_Epochs = 100 ; lambda_reg = 1 ; gamma = 0.01 ; taw = 10 ; std = np.sqrt(K_factors)
# Initializing Latent Factors
users_factors = np.random.normal(loc=0, scale = 1/std, size=(n_users ,K_factors))
movies_factors = np.random.normal(loc=0, scale = 1/std, size=(n_movies,K_factors))
genre_factors = np.random.normal(loc=0, scale = 1/std, size=(19,K_factors))
#2 - Initialize Bias Terms:
user_bias = np.random.randn(n_users)
item_bias = np.random.randn(n_movies)
train_loss = []
valid_loss = []
train_rmse = []
valid_rmse = []
# MAIN LOOP ----> in this code, users without data will still be random, not updated to zeros like biases, should the biases be randomized? idk
print("Training Started ...")
# Tracking the time taken for different updates
for EPOCH in range(n_Epochs):
epoch_start_time = time.time() # Start time for the epoch
# Timing for user updates
user_update_time = 0
start_time = time.time() # Start timing the user updates
# Perform the updates on users
for current_user_idx, (movie_indices, rating) in zip(users_train_idxes, users_train):
Update_user_biases(current_user_idx, movie_indices, rating , item_bias, user_bias, lambda_reg, gamma, users_factors, movies_factors)
Update_user_factors(current_user_idx, movie_indices, rating, movies_factors, users_factors, user_bias, item_bias, lambda_reg, taw, K_factors)
# Accumulate the time taken for these updates
user_update_time += time.time() - start_time
# Timing for movie updates
movie_update_time = 0
start_time = time.time() # Start timing the movie updates
# Perform the updates on movies
for current_movie_idx,(user_indices, rating) in zip(movies_train_idxes,movies_train):
Update_movie_biases( current_movie_idx,user_indices, rating , user_bias, item_bias, lambda_reg, gamma, users_factors, movies_factors)
Update_movie_factors( current_movie_idx,user_indices, rating , users_factors, movies_factors, user_bias, item_bias, lambda_reg, taw, K_factors)
# Update_movie_factors_with_features(current_movie_idx, user_indices, rating, users_factors, movies_factors, users_biases, movies_biases, lambda_reg, taw, K_factors, movies_genres_array, genre_factors)
movie_update_time += time.time() - start_time
# genre_update_time = 0
# start_time = time.time() # Start timing the genre updates
# Perform the updates on features
# for Gdx,associated_movies in enumerate(genre_to_movies):
# update_feature_factors(Gdx,associated_movies ,movies_factors, genre_factors, taw)
# genre_update_time += time.time() - start_time
# calculate the likelihood
training_loss,trmse = calc_metrics(users_train,users_train_idxes ,users_factors,movies_factors, user_bias, item_bias, lambda_reg, gamma,taw)
validation_loss,vrmse = calc_metrics(users_test,users_test_idxes ,users_factors,movies_factors,user_bias,item_bias,lambda_reg,gamma,taw)
train_loss.append(training_loss)
valid_loss.append(validation_loss)
train_rmse.append(trmse)
valid_rmse.append(vrmse)
# End time for the epoch
epoch_time = time.time() - epoch_start_time
# Now log information & Print the time taken for each part
logger.info(f"Epoch {EPOCH + 1}/{n_Epochs} :Train RMSE = {trmse:.4f} --Valid RMSE = {vrmse:.4f} |Train loss = {training_loss/1000000:.1f} million -- validation likelihood {validation_loss/1000000:.1f} million")
logger.info(f" - User Updates Time: {user_update_time:.2f} seconds")
logger.info(f" - Movie Updates Time: {movie_update_time:.2f} seconds")
# logger.info(f" - Features Updates Time: {genre_update_time:.2f} seconds")
logger.info(f" - Total Epoch Time: {epoch_time:.2f} seconds\n")
save_model(experiment_folder,movies_factors,users_factors,user_bias,item_bias)
# PLOTTING THE RESULTS
plot_likelihood(x_axis= n_Epochs , y_axis= [train_loss,valid_loss],experiment_folder=experiment_folder )
plot_rmse(x_axis= n_Epochs , y_axis= [train_rmse,valid_rmse] ,experiment_folder=experiment_folder )