Baseline: Probabilistic Matrix Factorization (PMF)
```
import pandas as pd
pd.set_option('display.max_colwidth', None)
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
```
```
ratings = pd.read_csv('initial_Train.csv')
ratings['interactions'] = 1
```
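The input file is assumed to be a MovieLens-style ratings table (judging from the column names used below: userId, movieId, rating, timestamp); the constant interactions column recasts the explicit ratings as implicit 0/1 feedback. A quick look at what was loaded:
```
print(ratings.shape)
ratings.head()
```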
```
df = ratings.sort_values(by=['timestamp'])
print("Rating Distribution")
print(df.groupby(['rating'])['rating'].count())
names = ['userId', 'movieId', 'rating', 'timestamp', 'interactions']
# chronological 70/30 split: the data is sorted by timestamp and shuffling is disabled
train_, test_ = train_test_split(df.values, test_size=0.3, shuffle=False)
train_ = pd.DataFrame(train_, columns=names)
# train_.to_csv('initial_train_data.csv')
test_ = pd.DataFrame(test_, columns=names)
```
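Because the split above is made on timestamp-sorted data with shuffling disabled, the test set is strictly later in time than the training set. A minimal sanity check (assuming the chronological split above):
```
# With a timestamp-sorted, unshuffled split, the latest training interaction
# should not come after the earliest test interaction.
assert train_['timestamp'].max() <= test_['timestamp'].min()
```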
```
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
train_['user_id_idx'] = le_user.fit_transform(train_['userId'].values)
train_['item_id_idx'] = le_item.fit_transform(train_['movieId'].values)
train_user_ids = train_['userId'].unique()
train_item_ids = train_['movieId'].unique()
# keep only test interactions whose user and movie were seen during training,
# otherwise LabelEncoder.transform would fail on unseen labels
test_ = test_[(test_['userId'].isin(train_user_ids)) & (test_['movieId'].isin(train_item_ids))].copy()
test_['user_id_idx'] = le_user.transform(test_['userId'].values)
test_['item_id_idx'] = le_item.transform(test_['movieId'].values)
```
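The filtering above is necessary because scikit-learn's `LabelEncoder.transform` raises a `ValueError` for labels it did not see during `fit`, so test users and movies that never appear in the training split have to be dropped first. A minimal sketch with hypothetical IDs:
```
from sklearn import preprocessing as pp

enc = pp.LabelEncoder()
enc.fit([10, 42, 7])           # raw IDs seen during training
print(enc.transform([7, 42]))  # -> [0 2], contiguous 0-based indices
# enc.transform([99])          # would raise ValueError: unseen label
```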
```
rating_matrix = train_.pivot(index='user_id_idx', columns='item_id_idx', values='interactions')
n_users, n_movies = rating_matrix.shape
```
```
# unobserved user/item pairs become 0 and are treated as negatives by the loss below
rating_matrix = rating_matrix.fillna(0).astype(float)
rating_matrix = torch.FloatTensor(rating_matrix.values)
```
```
latent_vectors = 64
# small random starting factors (scaled by 0.01); requires_grad=True makes them
# trainable leaf tensors for the optimizer below
user_features = torch.randn(n_users, latent_vectors, requires_grad=True)
user_features.data.mul_(0.01)
movie_features = torch.randn(n_movies, latent_vectors, requires_grad=True)
movie_features.data.mul_(0.01)
```
```
class PMFLoss(torch.nn.Module):
    def __init__(self, lam_u=0.3, lam_v=0.3):
        super().__init__()
        self.lam_u = lam_u
        self.lam_v = lam_v

    def forward(self, matrix, u_features, v_features):
        # with the 0-filled interaction matrix above, every entry passes this mask,
        # so unobserved pairs are treated as negative feedback
        non_zero_mask = (matrix != -1).type(torch.FloatTensor)
        predicted = torch.sigmoid(torch.mm(u_features, v_features.t()))
        diff = (matrix - predicted) ** 2
        prediction_error = torch.sum(diff * non_zero_mask)
        u_regularization = self.lam_u * torch.sum(u_features.norm(dim=1))
        v_regularization = self.lam_v * torch.sum(v_features.norm(dim=1))
        return prediction_error + u_regularization + v_regularization
```
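Written out, the objective implemented above (a sketch reconstructed from the code; $U$ and $V$ are the user and item factor matrices, $\sigma$ the sigmoid, $R$ the 0/1 interaction matrix, and the sum runs over all entries because missing interactions were filled with 0) is

$$
\mathcal{L}(U, V) \;=\; \sum_{i,j} \bigl(R_{ij} - \sigma(U_i V_j^{\top})\bigr)^2 \;+\; \lambda_u \sum_i \lVert U_i \rVert_2 \;+\; \lambda_v \sum_j \lVert V_j \rVert_2 .
$$

Note that the regularisers use the (un-squared) row norms, exactly as `norm(dim=1)` is summed in the code.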
```
criterion = PMFLoss()
loss = criterion(rating_matrix, user_features, movie_features)
```
```
# re-initialise the latent factors before the actual training run
latent_vectors = 64
user_features = torch.randn(n_users, latent_vectors, requires_grad=True)
user_features.data.mul_(0.01)
movie_features = torch.randn(n_movies, latent_vectors, requires_grad=True)
movie_features.data.mul_(0.01)

loss_ = []
step_ = []
pmferror = PMFLoss(lam_u=0.05, lam_v=0.05)
optimizer = torch.optim.Adam([user_features, movie_features], lr=0.01)

for step in range(3000):
    optimizer.zero_grad()
    loss = pmferror(rating_matrix, user_features, movie_features)
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        loss_.append(loss.item())  # store a plain float rather than the graph-holding tensor
        step_.append(step)
        print(f"Step {step}, {loss:.3f}")
```
```
def convert_to_sparse_tensor(dok_mtrx):
    # convert a scipy DOK matrix to a torch sparse COO tensor
    dok_mtrx_coo = dok_mtrx.tocoo().astype(np.float32)
    values = dok_mtrx_coo.data
    indices = np.vstack((dok_mtrx_coo.row, dok_mtrx_coo.col))
    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = dok_mtrx_coo.shape
    return torch.sparse_coo_tensor(i, v, torch.Size(shape))
```
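A quick usage sketch (assuming the cell above has already been run in the same session):
```
# Build a tiny 2x3 interaction matrix and convert it to a torch sparse tensor.
toy = sp.dok_matrix((2, 3), dtype=np.float32)
toy[0, 1] = 1.0
toy[1, 2] = 1.0
print(convert_to_sparse_tensor(toy).to_dense())
```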
```
from sklearn import metrics
from sklearn.preprocessing import label_binarize
def get_metrics(user_Embed_wts, item_Embed_wts, n_users, n_items, train_data, test_data, K):
    # work on detached CPU copies so the numpy/sklearn calls below never touch autograd
    user_Embed_wts = user_Embed_wts.detach().cpu()
    item_Embed_wts = item_Embed_wts.detach().cpu()
    # dense 0/1 matrix of held-out (test) interactions
    test = sp.dok_matrix((n_users, n_items), dtype=np.float32)
    test[test_data['user_id_idx'], test_data['item_id_idx']] = 1.0
    test = test.toarray()
    user_Embedding = nn.Embedding(user_Embed_wts.size()[0], user_Embed_wts.size()[1], _weight=user_Embed_wts)
    item_Embedding = nn.Embedding(item_Embed_wts.size()[0], item_Embed_wts.size()[1], _weight=item_Embed_wts)
    test_user_ids = torch.LongTensor(test_data['user_id_idx'].unique())
    # relevance score of every item for every user
    relevance_score = torch.matmul(user_Embed_wts, torch.transpose(item_Embed_wts, 0, 1))
    itms = test_data['item_id_idx'].unique()
    usrs = test_data['user_id_idx'].unique()
    # mask out items already seen in training: multiplying the 0/1 training matrix by -inf
    # gives -inf on seen pairs and nan (0 * -inf) on unseen pairs; nan_to_num restores the 0s
    R = sp.dok_matrix((n_users, n_items), dtype=np.float32)
    R[train_data['user_id_idx'], train_data['item_id_idx']] = 1.0
    R_tensor = convert_to_sparse_tensor(R)
    R_tensor_dense = R_tensor.to_dense()
    R_tensor_dense = R_tensor_dense * (-np.inf)
    R_tensor_dense = torch.nan_to_num(R_tensor_dense, nan=0.0)
    relevance_score = relevance_score + R_tensor_dense
    # top-K recommendations per user
    topk_relevance_score = torch.topk(relevance_score, K).values
    topk_relevance_indices = torch.topk(relevance_score, K).indices.cpu()
    topk_relevance_indices_df = pd.DataFrame(topk_relevance_indices.numpy(), columns=['top_indx_' + str(x + 1) for x in range(K)])
    topk_relevance_indices_df['user_ID'] = topk_relevance_indices_df.index
    topk_relevance_indices_df['top_rlvnt_itm'] = topk_relevance_indices_df[['top_indx_' + str(x + 1) for x in range(K)]].values.tolist()
    topk_relevance_indices_df = topk_relevance_indices_df[['user_ID', 'top_rlvnt_itm']]
    # items each user actually interacted with in the test period
    test_interacted_items = test_data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
    metrics_df = pd.merge(test_interacted_items, topk_relevance_indices_df, how='left', left_on='user_id_idx', right_on='user_ID')
    metrics_df['intrsctn_itm'] = [list(set(a).intersection(b)) for a, b in zip(metrics_df.item_id_idx, metrics_df.top_rlvnt_itm)]
    metrics_df['recall'] = metrics_df.apply(lambda x: len(x['intrsctn_itm']) / len(x['item_id_idx']), axis=1)
    metrics_df['precision'] = metrics_df.apply(lambda x: len(x['intrsctn_itm']) / K, axis=1)

    def get_hit_list(item_id_idx, top_rlvnt_itm):
        return [1 if x in set(item_id_idx) else 0 for x in top_rlvnt_itm]

    metrics_df['hit_list'] = metrics_df.apply(lambda x: get_hit_list(x['item_id_idx'], x['top_rlvnt_itm']), axis=1)
    usert = test_data['user_id_idx'].unique()

    def get_auc(test, relevance_score, usert):
        relevance_score = relevance_score.cpu()
        auc = []
        for i in usert:
            auc.append(metrics.roc_auc_score(test[i], relevance_score[i]))
        return auc

    def get_dcg_idcg(item_id_idx, hit_list):
        idcg = sum([1 / np.log1p(idx + 1) for idx in range(min(len(item_id_idx), len(hit_list)))])
        dcg = sum([hit / np.log1p(idx + 1) for idx, hit in enumerate(hit_list)])
        return dcg / idcg

    def get_cumsum(hit_list):
        return np.cumsum(hit_list)

    def get_map(item_id_idx, hit_list, hit_list_cumsum):
        return sum([hit_cumsum * hit / (idx + 1) for idx, (hit, hit_cumsum) in enumerate(zip(hit_list, hit_list_cumsum))]) / len(item_id_idx)

    metrics_df['ndcg'] = metrics_df.apply(lambda x: get_dcg_idcg(x['item_id_idx'], x['hit_list']), axis=1)
    metrics_df['hit_list_cumsum'] = metrics_df.apply(lambda x: get_cumsum(x['hit_list']), axis=1)
    metrics_df['map'] = metrics_df.apply(lambda x: get_map(x['item_id_idx'], x['hit_list'], x['hit_list_cumsum']), axis=1)
    a_m = get_auc(test, relevance_score, usert)
    return a_m, metrics_df['recall'], metrics_df['precision'], metrics_df['ndcg'], metrics_df['map']
```
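For reference, the per-user quantities computed above are (with $\mathrm{Rel}_u$ the user's test items, $\mathrm{Rec}_u$ the top-$K$ recommendations, and $h_k \in \{0,1\}$ the hit indicator at rank $k$; the logs are natural logs because `np.log1p` is used):

$$
\mathrm{Recall@}K = \frac{|\mathrm{Rel}_u \cap \mathrm{Rec}_u|}{|\mathrm{Rel}_u|}, \qquad
\mathrm{Precision@}K = \frac{|\mathrm{Rel}_u \cap \mathrm{Rec}_u|}{K},
$$

$$
\mathrm{NDCG@}K = \frac{\sum_{k=1}^{K} h_k / \ln(k+1)}{\sum_{k=1}^{\min(|\mathrm{Rel}_u|,\,K)} 1 / \ln(k+1)}, \qquad
\mathrm{AP@}K = \frac{1}{|\mathrm{Rel}_u|} \sum_{k=1}^{K} h_k \cdot \frac{\sum_{j \le k} h_j}{k},
$$

and the AUC is scikit-learn's `roc_auc_score` of the full relevance row against the user's 0/1 test vector.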
```
train_ = train_.sort_values(by = ['user_id_idx'])
train_ = train_.reset_index(drop = True)
user_id_idx_end = train_.groupby('user_id_idx').tail(1).index
test_ = test_.sort_values(by = ['user_id_idx'])
```
```
user_id_idx_dict = dict(zip(train_['userId'], train_['user_id_idx']))
```
```
import json
import math
import copy
begin = 0  # row index of the first interaction in the current chunk of users
number_of_users = 1000
end = number_of_users - 1  # index (into user_id_idx_end) of the last user in the chunk
auc_ = []
ndcg_ = []
map_ = []
recall_ = []
precision_ = []
while begin != len(train_):
    temp = user_id_idx_end[end]  # row index of the last interaction of the last user in this chunk
    print('\n')
    print('begin: ', begin)
    print('\n')
    print('temp: ', temp)
    print('\n')
    print('number_of_users: ', number_of_users)
    train = copy.deepcopy(train_.iloc[begin:temp + 1])
    # re-index the chunk's users so their indices start at 0
    train['user_id_idx'] = train['user_id_idx'] - train['user_id_idx'].iloc[0]
    test_user_dic = dict(zip(train['userId'].unique(), train['user_id_idx'].unique()))
    test = copy.deepcopy(test_[test_['userId'].isin(test_user_dic.keys())])
    test['user_id_idx'] = test['userId'].map(test_user_dic)
    # -----------------------------------------------
    b = user_id_idx_dict[train.iloc[0]['userId']]   # global index of the chunk's first user
    e = user_id_idx_dict[train.iloc[-1]['userId']]  # global index of the chunk's last user
    auc, recall, precision, ndcg, map_k = get_metrics(user_features[b:e + 1], movie_features, number_of_users, n_movies, train, test, K=5)
    print(b)
    print(e)
    auc_.extend(auc)
    ndcg_.extend(ndcg)
    precision_.extend(precision)
    map_.extend(map_k)
    recall_.extend(recall)
    # -----------------------------------------------
    begin = temp + 1
    if (end + number_of_users) <= len(user_id_idx_end) - 1:
        end += number_of_users
    else:
        print('-------------------------------------------')
        number_of_users = len(user_id_idx_end) - end - 1
        end = len(user_id_idx_end) - 1
```
```
print("auc: ", numpy.array(auc_).mean())
print("test_topK_recall: ", numpy.array(recall_).mean())
print("test_topK_precision: ", numpy.array(precision_).mean())
print("test_topK_ndcg: ", numpy.array(ndcg_).mean())
print("test_topK_map: ", numpy.array(map_).mean())
```
```
user_features = user_features.detach()
movie_features = movie_features.detach()
auc ,test_topK_recall, test_topK_precision, test_topK_ndcg, test_topK_map = get_metrics(user_features, movie_features, n_users, n_movies, train_, test_, K=5)
print("auc: ", auc)
print("test_topK_recall: ", test_topK_recall)
print("test_topK_precision: ", test_topK_precision)
print("test_topK_ndcg: ", test_topK_ndcg)
print("test_topK_map: ", test_topK_map)
```
```
plt.plot(step_, loss_, label='Total Training Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.legend()
plt.show()
```