import torch
import pandas as pd
pd.set_option('display.max_colwidth', None)
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import numpy as np
import random
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
------
torch.__version__
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
------
df = pd.read_csv("initial_Train.csv")
------
df = df.sort_values(by=['timestamp'])
train_, test_ = train_test_split(df.values, test_size=0.3, shuffle=False)
train_ = pd.DataFrame(train_, columns=df.columns)
test_ = pd.DataFrame(test_, columns=df.columns)
print("Train Size : ", len(train_))
print("Test Size : ", len(test_))
import pickle
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
train_['user_id_idx'] = le_user.fit_transform(train_['userId'].values)
train_['item_id_idx'] = le_item.fit_transform(train_['movieId'].values)
output_user = open('user_encoder.pkl', 'wb')
output_item = open('item_encoder.pkl', 'wb')
pickle.dump(le_user, output_user)
pickle.dump(le_item, output_item)
output_item.close()
output_user.close()
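# The saved encoders can be reloaded later for inference, e.g. (illustrative sketch):
# with open('user_encoder.pkl', 'rb') as f:
#     le_user = pickle.load(f)
# with open('item_encoder.pkl', 'rb') as f:
#     le_item = pickle.load(f)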
train_user_ids = train_['userId'].unique()
train_item_ids = train_['movieId'].unique()
test_ = test_[(test_['userId'].isin(train_user_ids)) & (test_['movieId'].isin(train_item_ids))].copy()
print(len(test_))
print(len(train_))
test_['user_id_idx'] = le_user.transform(test_['userId'].values)
test_['item_id_idx'] = le_item.transform(test_['movieId'].values)
n_users = train_['user_id_idx'].nunique()
n_items = train_['item_id_idx'].nunique()
print("Number of Unique Users : ", n_users)
print("Number of unique Items : ", n_items)
print("SUM : ",n_users + n_items )
n_users_ = test_['user_id_idx'].nunique()
n_items_ = test_['item_id_idx'].nunique()
print("Number of Unique Users : ", n_users_)
print("Number of unique Items : ", n_items_)
print("SUM : ",n_users_ + n_items_ )
latent_dim = 64
n_layers = 3
item_eb = nn.Embedding(num_embeddings=n_items, embedding_dim=latent_dim)
nn.init.normal_(item_eb.weight, std=0.1)
torch.save(item_eb.weight, 'initial_item_Embed0.pt')
```
def convert_to_sparse_tensor(dok_mtrx):
    # Convert a scipy dok matrix into a torch sparse FloatTensor on the target device
    dok_mtrx_coo = dok_mtrx.tocoo().astype(np.float32)
    values = dok_mtrx_coo.data
    indices = np.vstack((dok_mtrx_coo.row, dok_mtrx_coo.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = dok_mtrx_coo.shape

    dok_mtrx_sparse_tensor = torch.sparse.FloatTensor(i, v, torch.Size(shape))
    return dok_mtrx_sparse_tensor.to(device)
```
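As a quick illustration of `convert_to_sparse_tensor` (a hypothetical 3-user / 4-item toy matrix, not part of the pipeline):
```
# Toy example: 3 users x 4 items with two observed interactions
toy = sp.dok_matrix((3, 4), dtype=np.float32)
toy[0, 1] = 1.0
toy[2, 3] = 1.0
print(convert_to_sparse_tensor(toy).to_dense())
```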
```
from sklearn import metrics
from sklearn.preprocessing import label_binarize
def get_metrics(user_Embed_wts, item_Embed_wts, n_users, n_items, train_data, test_data, K):
    # Dense 0/1 matrix of test interactions, used for the per-user AUC
    test = sp.dok_matrix((n_users, n_items), dtype=np.float32)
    test[test_data['user_id_idx'], test_data['item_id_idx']] = 1.0
    test = test.toarray()

    user_Embedding = nn.Embedding(user_Embed_wts.size()[0], user_Embed_wts.size()[1], _weight=user_Embed_wts)
    item_Embedding = nn.Embedding(item_Embed_wts.size()[0], item_Embed_wts.size()[1], _weight=item_Embed_wts)

    test_user_ids = torch.LongTensor(test_data['user_id_idx'].unique())

    # Relevance score of every item for every user
    relevance_score = torch.matmul(user_Embed_wts.to(device), torch.transpose(item_Embed_wts, 0, 1).to(device))

    itms = test_data['item_id_idx'].unique()
    usrs = test_data['user_id_idx'].unique()

    # Mask out items already seen in training by pushing their scores to -inf
    R = sp.dok_matrix((n_users, n_items), dtype=np.float32)
    R[train_data['user_id_idx'], train_data['item_id_idx']] = 1.0
    R_tensor = convert_to_sparse_tensor(R)
    R_tensor_dense = R_tensor.to_dense()
    R_tensor_dense = R_tensor_dense * (-np.inf)
    R_tensor_dense = torch.nan_to_num(R_tensor_dense, nan=0.0)
    relevance_score = relevance_score.to(device) + R_tensor_dense.to(device)

    # Top-K recommended items per user
    topk_relevance_score = torch.topk(relevance_score, K).values
    topk_relevance_indices = torch.topk(relevance_score, K).indices
    # move the top-K indices to the CPU before building the DataFrame
    topk_relevance_indices = topk_relevance_indices.cpu()
    topk_relevance_indices_df = pd.DataFrame(topk_relevance_indices.numpy(), columns=['top_indx_' + str(x + 1) for x in range(K)])
    topk_relevance_indices_df['user_ID'] = topk_relevance_indices_df.index
    topk_relevance_indices_df['top_rlvnt_itm'] = topk_relevance_indices_df[['top_indx_' + str(x + 1) for x in range(K)]].values.tolist()
    topk_relevance_indices_df = topk_relevance_indices_df[['user_ID', 'top_rlvnt_itm']]

    # Join the recommendations with the items each user actually interacted with in the test set
    test_interacted_items = test_data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
    metrics_df = pd.merge(test_interacted_items, topk_relevance_indices_df, how='left', left_on='user_id_idx', right_on='user_ID')
    metrics_df['intrsctn_itm'] = [list(set(a).intersection(b)) for a, b in zip(metrics_df.item_id_idx, metrics_df.top_rlvnt_itm)]

    metrics_df['recall'] = metrics_df.apply(lambda x: len(x['intrsctn_itm']) / len(x['item_id_idx']), axis=1)
    metrics_df['precision'] = metrics_df.apply(lambda x: len(x['intrsctn_itm']) / K, axis=1)

    def get_hit_list(item_id_idx, top_rlvnt_itm):
        return [1 if x in set(item_id_idx) else 0 for x in top_rlvnt_itm]

    metrics_df['hit_list'] = metrics_df.apply(lambda x: get_hit_list(x['item_id_idx'], x['top_rlvnt_itm']), axis=1)

    usert = test_data['user_id_idx'].unique()

    def get_auc(test, relevance_score, usert):
        relevance_score = relevance_score.cpu()
        auc = []
        for i in usert:
            a = metrics.roc_auc_score(test[i], relevance_score[i])
            auc.append(a)
        return np.mean(auc)

    def get_dcg_idcg(item_id_idx, hit_list):
        idcg = sum([1 / np.log1p(idx + 1) for idx in range(min(len(item_id_idx), len(hit_list)))])
        dcg = sum([hit / np.log1p(idx + 1) for idx, hit in enumerate(hit_list)])
        return dcg / idcg

    def get_cumsum(hit_list):
        return np.cumsum(hit_list)

    def get_map(item_id_idx, hit_list, hit_list_cumsum):
        return sum([hit_cumsum * hit / (idx + 1) for idx, (hit, hit_cumsum) in enumerate(zip(hit_list, hit_list_cumsum))]) / len(item_id_idx)

    metrics_df['ndcg'] = metrics_df.apply(lambda x: get_dcg_idcg(x['item_id_idx'], x['hit_list']), axis=1)
    metrics_df['hit_list_cumsum'] = metrics_df.apply(lambda x: get_cumsum(x['hit_list']), axis=1)
    metrics_df['map'] = metrics_df.apply(lambda x: get_map(x['item_id_idx'], x['hit_list'], x['hit_list_cumsum']), axis=1)

    a_m = get_auc(test, relevance_score, usert)
    return a_m, metrics_df['recall'].mean(), metrics_df['precision'].mean(), metrics_df['ndcg'].mean(), metrics_df['map'].mean()
```
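As a sanity check, the routine can be called on randomly initialized embeddings before any training; the numbers are meaningless but the call shape is the same one used in the training loop below (a sketch, assuming `train_`/`test_` from the cells above):
```
rand_user = torch.randn(n_users, latent_dim)
rand_item = torch.randn(n_items, latent_dim)
auc0, rec0, prec0, ndcg0, map0 = get_metrics(rand_user, rand_item, n_users, n_items, train_, test_, K=5)
print(auc0, rec0, prec0, ndcg0, map0)
```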
```
class LightGCN(nn.Module):
    def __init__(self, data, n_users, n_items, n_layers, latent_dim):
        super(LightGCN, self).__init__()
        self.data = data
        self.n_users = n_users
        self.n_items = n_items
        self.n_layers = n_layers
        self.latent_dim = latent_dim
        self.init_embedding()
        self.norm_adj_mat_sparse_tensor = self.get_A_tilda()

    def init_embedding(self):
        # User embeddings are trained from scratch; item embeddings are loaded from the saved initial weights
        self.embedding_user = nn.Embedding(
            num_embeddings=self.n_users, embedding_dim=self.latent_dim)
        self.embedding_item = torch.load('initial_item_Embed0.pt')
        nn.init.normal_(self.embedding_user.weight, std=0.1)
        self.E0 = torch.cat((self.embedding_user.weight.to(device), self.embedding_item.to(device)), 0)
        self.E0 = nn.Parameter(self.E0).to(device)

    def get_A_tilda(self):
        # Build the symmetrically normalized adjacency D^(-1/2) A D^(-1/2) of the user-item bipartite graph
        R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)
        R[self.data['user_id_idx'], self.data['item_id_idx']] = 1.0

        adj_mat = sp.dok_matrix(
            (self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32
        )
        adj_mat = adj_mat.tolil()
        R = R.tolil()
        adj_mat[: self.n_users, self.n_users:] = R
        adj_mat[self.n_users:, : self.n_users] = R.T
        adj_mat = adj_mat.todok()

        rowsum = np.array(adj_mat.sum(1))
        d_inv = np.power(rowsum + 1e-9, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.0
        d_mat_inv = sp.diags(d_inv)
        norm_adj_mat = d_mat_inv.dot(adj_mat)
        norm_adj_mat = norm_adj_mat.dot(d_mat_inv)

        norm_adj_mat_coo = norm_adj_mat.tocoo().astype(np.float32)
        values = norm_adj_mat_coo.data
        indices = np.vstack((norm_adj_mat_coo.row, norm_adj_mat_coo.col))
        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = norm_adj_mat_coo.shape
        norm_adj_mat_sparse_tensor = torch.sparse.FloatTensor(i, v, torch.Size(shape))
        return norm_adj_mat_sparse_tensor.to(device)

    def propagate_through_layers(self):
        # LightGCN propagation: E^(k+1) = A_tilda @ E^(k); final embeddings are the mean over all layers
        all_layer_embedding = [self.E0]
        E_lyr = self.E0
        for layer in range(self.n_layers):
            E_lyr = torch.sparse.mm(self.norm_adj_mat_sparse_tensor, E_lyr)
            all_layer_embedding.append(E_lyr)

        all_layer_embedding = torch.stack(all_layer_embedding)
        mean_layer_embedding = torch.mean(all_layer_embedding, axis=0)

        final_user_Embed, final_item_Embed = torch.split(mean_layer_embedding, [self.n_users, self.n_items])
        initial_user_Embed, initial_item_Embed = torch.split(self.E0, [self.n_users, self.n_items])
        return final_user_Embed, final_item_Embed, initial_user_Embed, initial_item_Embed

    def forward(self, users, pos_items, neg_items):
        final_user_Embed, final_item_Embed, initial_user_Embed, initial_item_Embed = self.propagate_through_layers()
        users_emb, pos_emb, neg_emb = final_user_Embed[users], final_item_Embed[pos_items], final_item_Embed[neg_items]
        userEmb0, posEmb0, negEmb0 = initial_user_Embed[users], initial_item_Embed[pos_items], initial_item_Embed[neg_items]
        return users_emb, pos_emb, neg_emb, userEmb0, posEmb0, negEmb0
```
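A minimal instantiation sketch (illustrative only; the training loop below builds one model per user block rather than a single model over all users):
```
# Assumes 'initial_item_Embed0.pt' was saved above and train_ carries the re-indexed id columns
model_full = LightGCN(train_, n_users, n_items, n_layers, latent_dim)
u = torch.LongTensor([0]).to(device)
pos = torch.LongTensor([0]).to(device)
neg = torch.LongTensor([1]).to(device)
print([t.shape for t in model_full(u, pos, neg)])
```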
```
train_ = train_.sort_values(by = ['user_id_idx'])
train_ = train_.reset_index(drop = True)
test_ = test_.sort_values(by = ['user_id_idx'])
```
```
def bpr_loss(users, users_emb, pos_emb, neg_emb, userEmb0, posEmb0, negEmb0):
    # L2 regularization on the layer-0 embeddings of the sampled triples
    reg_loss = (1 / 2) * (userEmb0.norm().pow(2) +
                          posEmb0.norm().pow(2) +
                          negEmb0.norm().pow(2)) / float(len(users))

    pos_scores = torch.mul(users_emb, pos_emb)
    pos_scores = torch.sum(pos_scores, dim=1)
    neg_scores = torch.mul(users_emb, neg_emb)
    neg_scores = torch.sum(neg_scores, dim=1)

    # BPR loss: -log sigmoid(pos - neg), written as softplus(neg - pos)
    loss = torch.mean(torch.nn.functional.softplus(neg_scores - pos_scores))

    return loss, reg_loss
```
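The loss is the standard BPR objective, `softplus(neg_score - pos_score) = -log sigmoid(pos_score - neg_score)`, plus an L2 term on the layer-0 embeddings. A toy check on random tensors (hypothetical shapes):
```
# 4 users with 8-dimensional embeddings, all drawn at random
dummy = [torch.randn(4, 8) for _ in range(6)]
mf, reg = bpr_loss(torch.arange(4), *dummy)
print(mf.item(), reg.item())
```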
# index (in train_) of the last interaction row of each user
user_id_idx_end = train_.groupby('user_id_idx').tail(1).index
```
def data_loader(data, batch_size, n_usr, n_itm):
    # List of items each user has interacted with
    interected_items_df = data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()

    def sample_neg(x):
        # Sample an item the user has never interacted with
        while True:
            neg_id = random.randint(0, n_itm - 1)
            if neg_id not in x:
                return neg_id

    indices = [x for x in range(n_usr)]

    if n_usr < batch_size:
        users = [random.choice(indices) for _ in range(batch_size)]
    else:
        users = random.sample(indices, batch_size)
    users.sort()

    users_df = pd.DataFrame(users, columns=['users'])
    interected_items_df = pd.merge(interected_items_df, users_df, how='right', left_on='user_id_idx', right_on='users')

    pos_items = interected_items_df['item_id_idx'].apply(lambda x: random.choice(x)).values
    neg_items = interected_items_df['item_id_idx'].apply(lambda x: sample_neg(x)).values

    users = np.array(list(users))
    users = torch.from_numpy(users)
    pos_items = np.array(list(pos_items))
    pos_items = torch.from_numpy(pos_items)
    neg_items = np.array(list(neg_items))
    neg_items = torch.from_numpy(neg_items)

    return users.type(torch.long), pos_items.type(torch.long), neg_items.type(torch.long)
```
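For illustration, a single sampled batch of (user, positive item, negative item) triples, drawn from the full training frame with a small batch size:
```
u_b, pos_b, neg_b = data_loader(train_, 8, n_users, n_items)
print(u_b.shape, pos_b.shape, neg_b.shape)
print(u_b[:5], pos_b[:5], neg_b[:5])
```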
```
EPOCHS = 10
BATCH_SIZE = 1024
DECAY = 0.0001
K = 5
```
```
from sklearn.utils import column_or_1d
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
class MyLabelEncoder(LabelEncoder):
    def fit(self, y):
        y = column_or_1d(y, warn=True)
        self.classes_ = pd.Series(y).unique()
        # shuffle the classes so every run assigns users to index blocks in a different order
        self.classes_ = shuffle(self.classes_)
        print(self.classes_)
        return self

    def transform(self, y):
        # sklearn's transform assumes sorted classes for numeric labels, so map explicitly to positions in the shuffled order
        mapping = {c: idx for idx, c in enumerate(self.classes_)}
        return np.array([mapping[v] for v in column_or_1d(y, warn=True)])
```
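A tiny illustration of the shuffled encoder on hypothetical IDs; unlike the standard `LabelEncoder`, the assigned indices follow the shuffled class order rather than the sorted one:
```
enc = MyLabelEncoder()
enc.fit(np.array([10, 20, 30, 20]))        # prints the shuffled classes_
print(enc.transform(np.array([10, 20, 30])))
```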
```
import json
import math
import copy

a = []
r = []
p = []
n = []
m = []
time_ = []

for s in range(10):
    # Re-initialize the shared item embeddings and re-shuffle the user ordering for this run
    item_eb = nn.Embedding(num_embeddings=n_items, embedding_dim=latent_dim)
    nn.init.normal_(item_eb.weight, std=0.1)
    torch.save(item_eb.weight, 'initial_item_Embed0.pt')

    le = MyLabelEncoder()
    le.fit(train_['userId'].values)
    train_['user_id_idx'] = le.transform(train_['userId'])
    test_['user_id_idx'] = le.transform(test_['userId'])
    train_ = train_.sort_values(by=['user_id_idx'])
    train_ = train_.reset_index(drop=True)
    user_id_idx_end = train_.groupby('user_id_idx').tail(1).index

    st = time.time()
    l = time.time()
    for t in range(1):
        i = 0
        user_e = []
        item_e = 0
        begin = 0  # index of the first user in the current block
        number_of_users = 6583  # users per block
        end = number_of_users - 1  # index of the last user in the current block
        # if t == 1:
        #     train_.iloc[:]['user_id_idx'] -= train_.iloc[-1]['user_id_idx']
        #     train_.loc[train_['user_id_idx'] < 0, 'user_id_idx'] *= -1
        #     train_ = train_.sort_values(by=['user_id_idx'])
        #     train_ = train_.reset_index(drop=True)
        #     user_id_idx_end = train_.groupby('user_id_idx').tail(1).index
        while begin != len(train_):
            loss_list_epoch = []
            MF_loss_list_epoch = []
            reg_loss_list_epoch = []
            recall_list = []
            precision_list = []
            ndcg_list = []
            map_list = []
            train_time_list = []
            eval_time_list = []

            temp = user_id_idx_end[end]
            print(i)
            print('\n')
            print('begin: ', begin)
            print('\n')
            print('temp: ', temp)
            print('\n')
            print('number_of_users: ', number_of_users)

            # Slice out the current block of users and re-base its user indices to start at 0
            train = copy.deepcopy(train_.iloc[begin:temp + 1])
            train['user_id_idx'] = train['user_id_idx'] - train['user_id_idx'].iloc[0]

            # Remember which original userId maps to which block-local index
            test_user_dic = dict(zip(train['userId'].unique(), train['user_id_idx'].unique()))
            # save dictionary as pkl file
            with open('onehot_user_dic+' + str(i) + '.pkl', 'wb') as f:
                pickle.dump(test_user_dic, f)

            test = copy.deepcopy(test_[test_['userId'].isin(test_user_dic.keys())])
            test['user_id_idx'] = test['userId'].map(test_user_dic)

            lightGCN = LightGCN(train, number_of_users, n_items, n_layers, latent_dim)
            begin = temp + 1
            for epoch in tqdm(range(EPOCHS)):
                n_batch = int(len(train) / BATCH_SIZE)
                final_loss_list = []
                MF_loss_list = []
                reg_loss_list = []
                best_ndcg = -1

                train_start_time = time.time()
                model = lightGCN.to(device)
                optimizer = torch.optim.Adam(lightGCN.parameters(), lr=0.005)
                model.train()

                for batch_idx in range(n_batch):
                    # stop training batches once the per-run time budget (in seconds) is used up
                    if time.time() - l >= 362.8118:
                        print("--------------------------------------------break--------------------------------------------------")
                        break
                    optimizer.zero_grad()

                    users, pos_items, neg_items = data_loader(train, BATCH_SIZE, number_of_users, n_items)
                    users_emb, pos_emb, neg_emb, userEmb0, posEmb0, negEmb0 = lightGCN.forward(users.to(device), pos_items.to(device), neg_items.to(device))

                    mf_loss, reg_loss = bpr_loss(users, users_emb, pos_emb, neg_emb, userEmb0, posEmb0, negEmb0)
                    reg_loss = DECAY * reg_loss
                    final_loss = mf_loss + reg_loss
                    if batch_idx % 100 == 0:
                        print("final_loss: ", final_loss)

                    final_loss.backward()
                    optimizer.step()

                    final_loss_list.append(final_loss.item())
                    MF_loss_list.append(mf_loss.item())
                    reg_loss_list.append(reg_loss.item())

                train_end_time = time.time()
                train_time = train_end_time - train_start_time

                model.eval()
                with torch.no_grad():
                    final_user_Embed, final_item_Embed, initial_user_Embed, initial_item_Embed = lightGCN.propagate_through_layers()
                    auc, test_topK_recall, test_topK_precision, test_topK_ndcg, test_topK_map = get_metrics(final_user_Embed.to(device), final_item_Embed.to(device), number_of_users, n_items, train, test, K)

                if test_topK_ndcg > best_ndcg:
                    best_ndcg = test_topK_ndcg
                    # torch.save(final_user_Embed, 'final_user_Embed' + str(i) + '.pt')
                    torch.save(final_item_Embed, 'final_item_Embed' + str(i) + '.pt')
                    # torch.save(initial_user_Embed, 'initial_user_Embed' + str(i) + '.pt')
                    # carry the updated layer-0 item embeddings over to the next block's model
                    torch.save(initial_item_Embed, 'initial_item_Embed0.pt')
                    # torch.save(lightGCN.state_dict(), 'pre_LightGCN_20M.pth')
                    user_final_embedding = final_user_Embed
                    item_final_embedding = final_item_Embed

                eval_time = time.time() - train_end_time

                loss_list_epoch.append(round(np.mean(final_loss_list), 4))
                MF_loss_list_epoch.append(round(np.mean(MF_loss_list), 4))
                reg_loss_list_epoch.append(round(np.mean(reg_loss_list), 4))
                recall_list.append(round(test_topK_recall, 4))
                precision_list.append(round(test_topK_precision, 4))
                ndcg_list.append(round(test_topK_ndcg, 4))
                map_list.append(round(test_topK_map, 4))
            if (end + number_of_users) <= len(user_id_idx_end) - 1:
                temp = user_id_idx_end[end]  # index of the last user
                end += number_of_users
            else:
                print('-------------------------------------------')
                number_of_users = len(user_id_idx_end) - end - 1
                temp = user_id_idx_end[-1]
                end = len(user_id_idx_end) - 1

            # user_final_embedding = torch.flip(user_final_embedding, [0])
            user_e.append(user_final_embedding)
            item_e = item_final_embedding
            i += 1

            print("Last Epoch's Test Data Recall -> ", recall_list[-1])
            print("Last Epoch's Test Data Precision -> ", precision_list[-1])
            print("Last Epoch's Test Data F1 score -> ", (2 * precision_list[-1] * recall_list[-1]) / (precision_list[-1] + recall_list[-1]))
            print("Last Epoch's Test Data NDCG -> ", ndcg_list[-1])
            print("Last Epoch's Test Data MAP -> ", map_list[-1])
            print("Last Epoch's Train Data Loss -> ", loss_list_epoch[-1])
            print("---------------------------------------------------------------------------------------------------------------------------")
    # user_e.reverse()
    user_e = torch.cat(user_e, dim=0)
    # user_e = torch.flip(user_e, [0])

    # Evaluate the stitched-together user embeddings from all blocks against the full test split
    auc, test_topK_recall, test_topK_precision, test_topK_ndcg, test_topK_map = get_metrics(user_e.to('cpu'), item_e.to('cpu'), n_users, n_items, train_, test_, K=5)

    a.append(round(auc, 4))
    r.append(round(test_topK_recall, 4))
    p.append(round(test_topK_precision, 4))
    n.append(round(test_topK_ndcg, 4))
    m.append(round(test_topK_map, 4))
    time_.append(time.time() - st)
    print(n)
```
```
a = np.array(a)
r = np.array(r)
p = np.array(p)
n = np.array(n)
t = np.array(time_)
m = np.array(m)
a_mean = np.mean(a)
t_mean = np.mean(t)
r_mean = np.mean(r)
p_mean = np.mean(p)
n_mean = np.mean(n)
m_mean = np.mean(m)
a_std = np.std(a)
r_std = np.std(r)
p_std = np.std(p)
n_std = np.std(n)
m_std = np.std(m)
print("Time: ", t_mean)
print("AUC: ", a_mean, a_std)
print("Recall: ", r_mean, r_std)
print("Precision: ", p_mean, p_std)
print("NDCG: ", n_mean, n_std)
print("MAP: ", m_mean, m_std)
```
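Optionally, the per-run results can be collected into a small summary table (a sketch over the lists built above):
```
summary = pd.DataFrame({'auc': a, 'recall': r, 'precision': p, 'ndcg': n, 'map': m, 'time_s': t})
print(summary.describe().loc[['mean', 'std']])
```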
```
# user_e was already concatenated into a single tensor inside the training loop above
auc, test_topK_recall, test_topK_precision, test_topK_ndcg, test_topK_map = get_metrics(user_e.to('cpu'), item_e.to('cpu'), n_users, n_items, train_, test_, K=5)
print("auc: ", auc)
print("test_topK_recall: ", test_topK_recall)
print("test_topK_precision: ", test_topK_precision)
print("test_topK_ndcg: ", test_topK_ndcg)
print("test_topK_map: ", test_topK_map)
```