# SRC
## graphsage.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
# import torch.nn.functional as F
from torch.autograd import Variable
# import random
"""
GraphSAGE implementations
Paper: Inductive Representation Learning on Large Graphs
Source: https://github.com/williamleif/graphsage-simple/
"""
class GraphSage(nn.Module):
"""
Vanilla GraphSAGE Model
Code partially from https://github.com/williamleif/graphsage-simple/
"""
def __init__(self, num_classes, enc):
super(GraphSage, self).__init__()
self.enc = enc
self.xent = nn.CrossEntropyLoss()
self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes):
embeds = self.enc(nodes)
scores = self.weight.mm(embeds)
return scores.t()
def to_prob(self, nodes):
pos_scores = torch.sigmoid(self.forward(nodes))
return pos_scores
def loss(self, nodes, labels):
scores = self.forward(nodes)
return self.xent(scores, labels.squeeze())
class GCN(nn.Module):
"""
Vanilla GCN Model
Code partially from https://github.com/williamleif/graphsage-simple/
"""
def __init__(self, num_classes, enc):
super(GCN, self).__init__()
self.enc = enc
self.xent = nn.CrossEntropyLoss()
self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes):
embeds = self.enc(nodes)
scores = self.weight.mm(embeds)
return scores.t()
def to_prob(self, nodes):
pos_scores = torch.sigmoid(self.forward(nodes))
return pos_scores
def loss(self, nodes, labels):
scores = self.forward(nodes)
return self.xent(scores, labels.squeeze())
```
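A minimal usage sketch (not part of the repo): `GraphSage` only needs an encoder object that exposes `embed_dim` and returns an `(embed_dim, batch)` embedding matrix, so a toy encoder is enough to check the expected shapes. `ToyEncoder` and all numbers below are hypothetical placeholders.
```python
import torch
import torch.nn as nn
from src.graphsage import GraphSage

class ToyEncoder(nn.Module):
	"""Hypothetical stand-in for the real GraphSAGE encoder."""
	def __init__(self, num_nodes, embed_dim):
		super().__init__()
		self.embed_dim = embed_dim
		self.emb = nn.Embedding(num_nodes, embed_dim)
	def forward(self, nodes):
		# GraphSage.forward expects an (embed_dim, batch) matrix
		return self.emb(torch.LongTensor(nodes)).t()

enc = ToyEncoder(num_nodes=10, embed_dim=16)
model = GraphSage(num_classes=2, enc=enc)
nodes = [0, 3, 7]
labels = torch.LongTensor([0, 1, 0])
print(model.to_prob(nodes).shape)   # torch.Size([3, 2])
print(model.loss(nodes, labels))    # scalar cross-entropy loss
```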
## layers.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.autograd import Variable
from operator import itemgetter
import math
"""
PC-GNN Layers
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
Modified from https://github.com/YingtongDou/CARE-GNN
"""
class InterAgg(nn.Module):
def __init__(self, features, feature_dim, embed_dim,
train_pos, adj_lists, intraggs, inter='GNN', cuda=True):
"""
Initialize the inter-relation aggregator
:param features: the input node features or embeddings for all nodes
:param feature_dim: the input dimension
:param embed_dim: the embed dimension
:param train_pos: positive samples in training set
:param adj_lists: a list of adjacency lists for each single-relation graph
:param intraggs: the intra-relation aggregators used by each single-relation graph
:param inter: NOT used in this version, the aggregator type: 'Att', 'Weight', 'Mean', 'GNN'
:param cuda: whether to use GPU
"""
super(InterAgg, self).__init__()
self.features = features
self.dropout = 0.6
self.adj_lists = adj_lists
self.intra_agg1 = intraggs[0]
self.intra_agg2 = intraggs[1]
self.intra_agg3 = intraggs[2]
self.embed_dim = embed_dim
self.feat_dim = feature_dim
self.inter = inter
self.cuda = cuda
self.intra_agg1.cuda = cuda
self.intra_agg2.cuda = cuda
self.intra_agg3.cuda = cuda
self.train_pos = train_pos
# initial filtering thresholds
self.thresholds = [0.5, 0.5, 0.5]
# parameter used to transform node embeddings before inter-relation aggregation
self.weight = nn.Parameter(torch.FloatTensor(self.embed_dim*len(intraggs)+self.feat_dim, self.embed_dim))
init.xavier_uniform_(self.weight)
# label predictor for similarity measure
self.label_clf = nn.Linear(self.feat_dim, 2)
# initialize the parameter logs
self.weights_log = []
self.thresholds_log = [self.thresholds]
self.relation_score_log = []
def forward(self, nodes, labels, train_flag=True):
"""
:param nodes: a list of batch node ids
:param labels: a list of batch node labels
:param train_flag: indicates whether in training or testing mode
:return combined: the embeddings of a batch of input node features
:return center_scores: the label-aware scores of batch nodes
"""
# extract 1-hop neighbor ids from adj lists of each single-relation graph
to_neighs = []
for adj_list in self.adj_lists:
to_neighs.append([set(adj_list[int(node)]) for node in nodes])
# find unique nodes and their neighbors used in current batch
unique_nodes = set.union(set.union(*to_neighs[0]), set.union(*to_neighs[1]),
set.union(*to_neighs[2], set(nodes)))
# calculate label-aware scores
if self.cuda:
batch_features = self.features(torch.cuda.LongTensor(list(unique_nodes)))
pos_features = self.features(torch.cuda.LongTensor(list(self.train_pos)))
else:
batch_features = self.features(torch.LongTensor(list(unique_nodes)))
pos_features = self.features(torch.LongTensor(list(self.train_pos)))
batch_scores = self.label_clf(batch_features)
pos_scores = self.label_clf(pos_features)
id_mapping = {node_id: index for node_id, index in zip(unique_nodes, range(len(unique_nodes)))}
# the label-aware scores for current batch of nodes
center_scores = batch_scores[itemgetter(*nodes)(id_mapping), :]
# get neighbor node id list for each batch node and relation
r1_list = [list(to_neigh) for to_neigh in to_neighs[0]]
r2_list = [list(to_neigh) for to_neigh in to_neighs[1]]
r3_list = [list(to_neigh) for to_neigh in to_neighs[2]]
# assign label-aware scores to neighbor nodes for each batch node and relation
r1_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r1_list]
r2_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r2_list]
r3_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r3_list]
# count the number of neighbors kept for aggregation for each batch node and relation
r1_sample_num_list = [math.ceil(len(neighs) * self.thresholds[0]) for neighs in r1_list]
r2_sample_num_list = [math.ceil(len(neighs) * self.thresholds[1]) for neighs in r2_list]
r3_sample_num_list = [math.ceil(len(neighs) * self.thresholds[2]) for neighs in r3_list]
# intra-aggregation steps for each relation
# Eq. (8) in the paper
r1_feats, r1_scores = self.intra_agg1.forward(nodes, labels, r1_list, center_scores, r1_scores, pos_scores, r1_sample_num_list, train_flag)
r2_feats, r2_scores = self.intra_agg2.forward(nodes, labels, r2_list, center_scores, r2_scores, pos_scores, r2_sample_num_list, train_flag)
r3_feats, r3_scores = self.intra_agg3.forward(nodes, labels, r3_list, center_scores, r3_scores, pos_scores, r3_sample_num_list, train_flag)
# get features or embeddings for batch nodes
if self.cuda and isinstance(nodes, list):
index = torch.LongTensor(nodes).cuda()
else:
index = torch.LongTensor(nodes)
self_feats = self.features(index)
# number of nodes in a batch
n = len(nodes)
# concat the intra-aggregated embeddings from each relation
# Eq. (9) in the paper
cat_feats = torch.cat((self_feats, r1_feats, r2_feats, r3_feats), dim=1)
combined = F.relu(cat_feats.mm(self.weight).t())
return combined, center_scores
class IntraAgg(nn.Module):
def __init__(self, features, feat_dim, embed_dim, train_pos, rho, cuda=False):
"""
Initialize the intra-relation aggregator
:param features: the input node features or embeddings for all nodes
:param feat_dim: the input dimension
:param embed_dim: the embed dimension
:param train_pos: positive samples in training set
		:param rho: the ratio of oversampled neighbors for the minority class
:param cuda: whether to use GPU
"""
super(IntraAgg, self).__init__()
self.features = features
self.cuda = cuda
self.feat_dim = feat_dim
self.embed_dim = embed_dim
self.train_pos = train_pos
self.rho = rho
self.weight = nn.Parameter(torch.FloatTensor(2*self.feat_dim, self.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes, batch_labels, to_neighs_list, batch_scores, neigh_scores, pos_scores, sample_list, train_flag):
"""
Code partially from https://github.com/williamleif/graphsage-simple/
:param nodes: list of nodes in a batch
:param to_neighs_list: neighbor node id list for each batch node in one relation
:param batch_scores: the label-aware scores of batch nodes
		:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
		:param pos_scores: the label-aware scores of the minority (positive) nodes in the training set
:param train_flag: indicates whether in training or testing mode
:param sample_list: the number of neighbors kept for each batch node in one relation
		:return to_feats: the aggregated embeddings of the batch nodes' neighbors in one relation
		:return samp_scores: the distance scores of the selected neighbors for each batch node after filtering
"""
		# filter neighbors under the given relation in train mode
if train_flag:
samp_neighs, samp_scores = choose_step_neighs(batch_scores, batch_labels, neigh_scores, to_neighs_list, pos_scores, self.train_pos, sample_list, self.rho)
else:
samp_neighs, samp_scores = choose_step_test(batch_scores, neigh_scores, to_neighs_list, sample_list)
# find the unique nodes among batch nodes and the filtered neighbors
unique_nodes_list = list(set.union(*samp_neighs))
unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}
# intra-relation aggregation only with sampled neighbors
mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes)))
column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
row_indices = [i for i in range(len(samp_neighs)) for _ in range(len(samp_neighs[i]))]
mask[row_indices, column_indices] = 1
if self.cuda:
mask = mask.cuda()
num_neigh = mask.sum(1, keepdim=True)
mask = mask.div(num_neigh) # mean aggregator
if self.cuda:
self_feats = self.features(torch.LongTensor(nodes).cuda())
embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
else:
self_feats = self.features(torch.LongTensor(nodes))
embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
agg_feats = mask.mm(embed_matrix) # single relation aggregator
cat_feats = torch.cat((self_feats, agg_feats), dim=1) # concat with last layer
to_feats = F.relu(cat_feats.mm(self.weight))
return to_feats, samp_scores
def choose_step_neighs(center_scores, center_labels, neigh_scores, neighs_list, minor_scores, minor_list, sample_list, sample_rate):
"""
Choose step for neighborhood sampling
:param center_scores: the label-aware scores of batch nodes
:param center_labels: the label of batch nodes
	:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
:param neighs_list: neighbor node id list for each batch node in one relation
:param minor_scores: the label-aware scores for nodes of minority class in one relation
	:param minor_list: the minority (positive) node id list from the training set
:param sample_list: the number of neighbors kept for each batch node in one relation
	:param sample_rate: the ratio of oversampled neighbors for the minority class
"""
samp_neighs = []
samp_score_diff = []
for idx, center_score in enumerate(center_scores):
center_score = center_scores[idx][0]
neigh_score = neigh_scores[idx][:, 0].view(-1, 1)
center_score_neigh = center_score.repeat(neigh_score.size()[0], 1)
neighs_indices = neighs_list[idx]
num_sample = sample_list[idx]
# compute the L1-distance of batch nodes and their neighbors
score_diff_neigh = torch.abs(center_score_neigh - neigh_score).squeeze()
sorted_score_diff_neigh, sorted_neigh_indices = torch.sort(score_diff_neigh, dim=0, descending=False)
selected_neigh_indices = sorted_neigh_indices.tolist()
# top-p sampling according to distance ranking
if len(neigh_scores[idx]) > num_sample + 1:
selected_neighs = [neighs_indices[n] for n in selected_neigh_indices[:num_sample]]
selected_score_diff = sorted_score_diff_neigh.tolist()[:num_sample]
else:
selected_neighs = neighs_indices
selected_score_diff = score_diff_neigh.tolist()
if isinstance(selected_score_diff, float):
selected_score_diff = [selected_score_diff]
if center_labels[idx] == 1:
num_oversample = int(num_sample * sample_rate)
center_score_minor = center_score.repeat(minor_scores.size()[0], 1)
score_diff_minor = torch.abs(center_score_minor - minor_scores[:, 0].view(-1, 1)).squeeze()
sorted_score_diff_minor, sorted_minor_indices = torch.sort(score_diff_minor, dim=0, descending=False)
selected_minor_indices = sorted_minor_indices.tolist()
selected_neighs.extend([minor_list[n] for n in selected_minor_indices[:num_oversample]])
selected_score_diff.extend(sorted_score_diff_minor.tolist()[:num_oversample])
samp_neighs.append(set(selected_neighs))
samp_score_diff.append(selected_score_diff)
return samp_neighs, samp_score_diff
def choose_step_test(center_scores, neigh_scores, neighs_list, sample_list):
"""
	Filter neighbors according to the label predictor result with adaptive thresholds
:param center_scores: the label-aware scores of batch nodes
	:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
:param neighs_list: neighbor node id list for each batch node in one relation
:param sample_list: the number of neighbors kept for each batch node in one relation
	:return samp_neighs: the selected neighbor indices for each batch node
	:return samp_scores: the distance scores of the selected neighbors for each batch node after filtering
"""
samp_neighs = []
samp_scores = []
for idx, center_score in enumerate(center_scores):
center_score = center_scores[idx][0]
neigh_score = neigh_scores[idx][:, 0].view(-1, 1)
center_score = center_score.repeat(neigh_score.size()[0], 1)
neighs_indices = neighs_list[idx]
num_sample = sample_list[idx]
# compute the L1-distance of batch nodes and their neighbors
score_diff = torch.abs(center_score - neigh_score).squeeze()
sorted_scores, sorted_indices = torch.sort(score_diff, dim=0, descending=False)
selected_indices = sorted_indices.tolist()
# top-p sampling according to distance ranking and thresholds
if len(neigh_scores[idx]) > num_sample + 1:
selected_neighs = [neighs_indices[n] for n in selected_indices[:num_sample]]
selected_scores = sorted_scores.tolist()[:num_sample]
else:
selected_neighs = neighs_indices
selected_scores = score_diff.tolist()
if isinstance(selected_scores, float):
selected_scores = [selected_scores]
samp_neighs.append(set(selected_neighs))
samp_scores.append(selected_scores)
return samp_neighs, samp_scores
```
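`IntraAgg.forward` does its mean aggregation with a dense row-normalized mask instead of explicit loops: rows index batch nodes, columns index the unique sampled neighbors, and `mask.mm(embed_matrix)` yields each node's mean neighbor embedding. A standalone sketch of just that step on made-up neighbor sets and features (all values are hypothetical):
```python
import torch

# Hypothetical sampled neighbor sets for a batch of 3 nodes (after the "choose" step)
samp_neighs = [{1, 2}, {2, 3, 4}, {0}]
unique_nodes_list = list(set.union(*samp_neighs))      # e.g. [0, 1, 2, 3, 4]
unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

# build the (batch, unique-neighbor) membership mask
mask = torch.zeros(len(samp_neighs), len(unique_nodes))
col = [unique_nodes[n] for neighs in samp_neighs for n in neighs]
row = [i for i, neighs in enumerate(samp_neighs) for _ in neighs]
mask[row, col] = 1
mask = mask / mask.sum(1, keepdim=True)                # row-normalize -> mean aggregator

embed_matrix = torch.randn(len(unique_nodes_list), 8)  # fake 8-dim features of the unique neighbors
agg_feats = mask.mm(embed_matrix)                      # (3, 8): mean neighbor embedding per batch node
print(agg_feats.shape)
```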
## model_handler.py
```python!
import time, datetime
import os
import random
import argparse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from src.utils import test_pcgnn, load_data, pos_neg_split, normalize, pick_step
from src.model import PCALayer
from src.layers import InterAgg, IntraAgg
from src.graphsage import *
# explicit imports for names otherwise pulled in only via the star import above
import torch
import torch.nn as nn
from torch.autograd import Variable
"""
Training PC-GNN
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
"""
class ModelHandler(object):
def __init__(self, config):
args = argparse.Namespace(**config)
# load graph, feature, and label
[homo, relation1, relation2, relation3], feat_data, labels = load_data(args.data_name, prefix=args.data_dir)
# train_test split
np.random.seed(args.seed)
random.seed(args.seed)
if args.data_name == 'yelp':
index = list(range(len(labels)))
idx_train, idx_rest, y_train, y_rest = train_test_split(index, labels, stratify=labels, train_size=args.train_ratio,
random_state=2, shuffle=True)
idx_valid, idx_test, y_valid, y_test = train_test_split(idx_rest, y_rest, stratify=y_rest, test_size=args.test_ratio,
random_state=2, shuffle=True)
		print(f'Run on {args.data_name}, positive/total num: {np.sum(labels)}/{len(labels)}, train num {len(y_train)}, '+
f'valid num {len(y_valid)}, test num {len(y_test)}, test positive num {np.sum(y_test)}')
print(f"Classification threshold: {args.thres}")
print(f"Feature dimension: {feat_data.shape[1]}")
# split pos neg sets for under-sampling
train_pos, train_neg = pos_neg_split(idx_train, y_train)
feat_data = normalize(feat_data)
args.cuda = not args.no_cuda and torch.cuda.is_available()
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_id
adj_lists = [relation1, relation2, relation3]
print(f'Model: {args.model}, multi-relation aggregator: {args.multi_relation}, emb_size: {args.emb_size}.')
self.args = args
self.dataset = {'feat_data': feat_data, 'labels': labels, 'adj_lists': adj_lists, 'homo': homo,
'idx_train': idx_train, 'idx_valid': idx_valid, 'idx_test': idx_test,
'y_train': y_train, 'y_valid': y_valid, 'y_test': y_test,
'train_pos': train_pos, 'train_neg': train_neg}
def train(self):
args = self.args
feat_data, adj_lists = self.dataset['feat_data'], self.dataset['adj_lists']
idx_train, y_train = self.dataset['idx_train'], self.dataset['y_train']
idx_valid, y_valid, idx_test, y_test = self.dataset['idx_valid'], self.dataset['y_valid'], self.dataset['idx_test'], self.dataset['y_test']
# initialize model input
features = nn.Embedding(feat_data.shape[0], feat_data.shape[1])
features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
if args.cuda:
features.cuda()
# build one-layer models
if args.model == 'PCGNN':
intra1 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
intra2 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
intra3 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
inter1 = InterAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'],
adj_lists, [intra1, intra2, intra3], inter=args.multi_relation, cuda=args.cuda)
if args.model == 'PCGNN':
gnn_model = PCALayer(2, inter1, args.alpha)
if args.cuda:
gnn_model.cuda()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gnn_model.parameters()), lr=args.lr, weight_decay=args.weight_decay)
timestamp = time.time()
timestamp = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H-%M-%S')
dir_saver = args.save_dir+timestamp
path_saver = os.path.join(dir_saver, '{}_{}.pkl'.format(args.data_name, args.model))
precision_best, ep_best = 0, -1
# train the model
for epoch in range(args.num_epochs):
sampled_idx_train = pick_step(idx_train, y_train, self.dataset['homo'], size=len(self.dataset['train_pos'])*2)
random.shuffle(sampled_idx_train)
num_batches = int(len(sampled_idx_train) / args.batch_size) + 1
			epoch_loss = 0.0
epoch_time = 0
# mini-batch training
for batch in range(num_batches):
start_time = time.time()
i_start = batch * args.batch_size
i_end = min((batch + 1) * args.batch_size, len(sampled_idx_train))
batch_nodes = sampled_idx_train[i_start:i_end]
batch_label = self.dataset['labels'][np.array(batch_nodes)]
optimizer.zero_grad()
if args.cuda:
loss = gnn_model.loss(batch_nodes, Variable(torch.cuda.LongTensor(batch_label)))
else:
loss = gnn_model.loss(batch_nodes, Variable(torch.LongTensor(batch_label)))
loss.backward()
optimizer.step()
end_time = time.time()
epoch_time += end_time - start_time
				epoch_loss += loss.item()
			print(f'Epoch: {epoch}, loss: {epoch_loss / num_batches}, time: {epoch_time}s')
			# validate the model every $valid_epochs$ epochs
if epoch % args.valid_epochs == 0:
print("Valid at epoch {}".format(epoch))
tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp = test_pcgnn(idx_valid, y_valid, gnn_model, args.batch_size, args.thres, test = False, top = args.top)
precision_val = sorted_tp/(sorted_tp+sorted_fp)
if precision_val > precision_best:
precision_best, ep_best = precision_val, epoch
if not os.path.exists(dir_saver):
os.makedirs(dir_saver)
print(" sorted_tn, sorted_fp, sorted_fn, sorted_tp: ", sorted_tn, sorted_fp, sorted_fn, sorted_tp)
print("tn, fp, fn, tp: ",tn, fp, fn, tp)
print(' Saving model ... at epoch:', epoch)
# torch.save(gnn_model.state_dict(), path_saver)
torch.save(gnn_model, path_saver)
print("Restore model from epoch {}".format(ep_best))
print("Model path: {}".format(path_saver))
# gnn_model.load_state_dict(torch.load(path_saver))
gnn_model = torch.load(path_saver)
f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = test_pcgnn(idx_test, y_test, gnn_model, args.batch_size, args.thres, test = True, top = args.top)
return f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
```
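`ModelHandler` reads its settings through `argparse.Namespace(**config)`, so any dict with the keys accessed above works. A hedged sketch of driving it directly from Python; the key names mirror the `args.*` reads in this file, but every value is a made-up placeholder, and the YelpChi `.mat` file and adjacency pickles must already sit under `data/`:
```python
from src.model_handler import ModelHandler

# placeholder values only; the keys mirror the args.* reads in ModelHandler
config = {
	'data_name': 'yelp', 'data_dir': 'data/', 'seed': 42,
	'train_ratio': 0.4, 'test_ratio': 0.67, 'thres': 0.5, 'top': 1000,
	'model': 'PCGNN', 'multi_relation': 'GNN', 'emb_size': 64,
	'rho': 0.5, 'alpha': 2, 'lr': 0.01, 'weight_decay': 0.001,
	'batch_size': 1024, 'num_epochs': 30, 'valid_epochs': 5,
	'no_cuda': True, 'cuda_id': '0', 'save_dir': 'pytorch_models/',
}

handler = ModelHandler(config)   # expects the YelpChi data files under data/
results = handler.train()        # returns the same tuple that main.py unpacks
```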
## model.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
"""
PC-GNN Model
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
Modified from https://github.com/YingtongDou/CARE-GNN
"""
class PCALayer(nn.Module):
"""
One Pick-Choose-Aggregate layer
"""
def __init__(self, num_classes, inter1, lambda_1):
"""
Initialize the PC-GNN model
:param num_classes: number of classes (2 in our paper)
		:param inter1: the inter-relation aggregator that outputs the final embedding
"""
super(PCALayer, self).__init__()
self.inter1 = inter1
self.xent = nn.CrossEntropyLoss()
# the parameter to transform the final embedding
self.weight = nn.Parameter(torch.FloatTensor(num_classes, inter1.embed_dim))
init.xavier_uniform_(self.weight)
self.lambda_1 = lambda_1
self.epsilon = 0.1
def forward(self, nodes, labels, train_flag=True):
embeds1, label_scores = self.inter1(nodes, labels, train_flag)
scores = self.weight.mm(embeds1)
return scores.t(), label_scores
def to_prob(self, nodes, labels, train_flag=True):
gnn_logits, label_logits = self.forward(nodes, labels, train_flag)
gnn_scores = torch.sigmoid(gnn_logits)
label_scores = torch.sigmoid(label_logits)
return gnn_scores, label_scores
# def loss(self, nodes, labels, train_flag=True):
# gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
# # Simi loss, Eq. (7) in the paper
# label_loss = self.xent(label_scores, labels.squeeze())
# # GNN loss, Eq. (10) in the paper
# gnn_loss = self.xent(gnn_scores, labels.squeeze())
# # the loss function of PC-GNN, Eq. (11) in the paper
# final_loss = gnn_loss + self.lambda_1 * label_loss
# return final_loss
def loss(self, nodes, labels, train_flag=True):
gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
# Simi loss, Eq. (7) in the paper
label_loss = self.xent(label_scores, labels.squeeze())
# GNN loss, Eq. (10) in the paper
predictions = torch.argmax(gnn_scores, dim=1)
predicted_as_label_1_indices = torch.nonzero(predictions == 1).squeeze()
gnn_scores_predicted_as_label_1 = gnn_scores[predicted_as_label_1_indices]
labels_predicted_as_label_1 = labels[predicted_as_label_1_indices]
gnn_loss = self.xent(gnn_scores_predicted_as_label_1, labels_predicted_as_label_1.squeeze())
# the loss function of PC-GNN, Eq. (11) in the paper
final_loss = gnn_loss + self.lambda_1 * label_loss
return final_loss
```
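The `loss` above differs from the commented-out original: the GNN cross-entropy term is evaluated only on nodes currently predicted as class 1, then combined with the label-predictor loss weighted by `lambda_1`. A toy re-run of that filtering on synthetic logits (values are made up; `reshape(-1)` is used here instead of `squeeze()` so the index tensor stays 1-D even when only one node is predicted positive):
```python
import torch
import torch.nn as nn

xent = nn.CrossEntropyLoss()
gnn_scores = torch.tensor([[ 2.0, -1.0],
                           [ 0.1,  0.3],
                           [-0.5,  1.2],
                           [ 1.0,  0.0],
                           [-0.2,  0.9]])   # fake logits for 5 nodes
labels = torch.tensor([0, 1, 1, 0, 0])

predictions = torch.argmax(gnn_scores, dim=1)            # current hard predictions
pos_idx = torch.nonzero(predictions == 1).reshape(-1)    # nodes predicted as class 1
gnn_loss = xent(gnn_scores[pos_idx], labels[pos_idx])    # loss restricted to predicted positives
print(pos_idx, gnn_loss)
```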
## utils.py
```python!
import pickle
import random
import numpy as np
import scipy.sparse as sp
from scipy.io import loadmat
import copy as cp
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, average_precision_score, confusion_matrix
from collections import defaultdict
"""
Utility functions to handle data and evaluate model.
"""
def load_data(data, prefix='data/'):
"""
	Load graph, features, and labels for the given dataset name (this version loads YelpChi)
	:returns: the homo and single-relation graphs, features, labels
"""
data_file = loadmat(prefix + 'YelpChi.mat')
labels = data_file['label'].flatten()
feat_data = data_file['features'].todense().A
# load the preprocessed adj_lists
with open(prefix + 'yelp_homo_adjlists.pickle', 'rb') as file:
homo = pickle.load(file)
file.close()
with open(prefix + 'yelp_rur_adjlists.pickle', 'rb') as file:
relation1 = pickle.load(file)
file.close()
with open(prefix + 'yelp_rtr_adjlists.pickle', 'rb') as file:
relation2 = pickle.load(file)
file.close()
with open(prefix + 'yelp_rsr_adjlists.pickle', 'rb') as file:
relation3 = pickle.load(file)
file.close()
return [homo, relation1, relation2, relation3], feat_data, labels
def normalize(mx):
"""
Row-normalize sparse matrix
Code from https://github.com/williamleif/graphsage-simple/
"""
rowsum = np.array(mx.sum(1)) + 0.01
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
mx = r_mat_inv.dot(mx)
return mx
def sparse_to_adjlist(sp_matrix, filename):
"""
Transfer sparse matrix to adjacency list
:param sp_matrix: the sparse matrix
:param filename: the filename of adjlist
"""
# add self loop
homo_adj = sp_matrix + sp.eye(sp_matrix.shape[0])
# create adj_list
adj_lists = defaultdict(set)
edges = homo_adj.nonzero()
for index, node in enumerate(edges[0]):
adj_lists[node].add(edges[1][index])
adj_lists[edges[1][index]].add(node)
with open(filename, 'wb') as file:
pickle.dump(adj_lists, file)
file.close()
def pos_neg_split(nodes, labels):
"""
Find positive and negative nodes given a list of nodes and their labels
:param nodes: a list of nodes
:param labels: a list of node labels
	:returns: the split positive and negative nodes
"""
pos_nodes = []
neg_nodes = cp.deepcopy(nodes)
aux_nodes = cp.deepcopy(nodes)
for idx, label in enumerate(labels):
if label == 1:
pos_nodes.append(aux_nodes[idx])
neg_nodes.remove(aux_nodes[idx])
return pos_nodes, neg_nodes
def pick_step(idx_train, y_train, adj_list, size):
degree_train = [len(adj_list[node]) for node in idx_train]
lf_train = (y_train.sum()-len(y_train))*y_train + len(y_train)
smp_prob = np.array(degree_train) / lf_train
return random.choices(idx_train, weights=smp_prob, k=size)
def prob2pred(y_prob, thres=0.5):
"""
Convert probability to predicted results according to given threshold
:param y_prob: numpy array of probability in [0, 1]
:param thres: binary classification threshold, default 0.5
:returns: the predicted result with the same shape as y_prob
"""
y_pred = np.zeros_like(y_prob, dtype=np.int32)
y_pred[y_prob >= thres] = 1
y_pred[y_prob < thres] = 0
return y_pred
def test_pcgnn(test_cases, labels, model, batch_size, thres=0.5, test = False, top = 6000):
"""
Test the performance of PC-GNN and its variants
	:param test_cases: a list of testing nodes
	:param labels: a list of testing node labels
	:param model: the GNN model
	:param batch_size: the number of nodes in a batch
	:returns: the confusion-matrix counts (and, in test mode, F1 and AUC) of the GNN module, overall and within the top-ranked nodes
"""
test_batch_num = int(len(test_cases) / batch_size) + 1
f1_gnn = 0.0
acc_gnn = 0.0
recall_gnn = 0.0
f1_label1 = 0.0
acc_label1 = 0.00
recall_label1 = 0.0
gnn_pred_list = []
gnn_prob_list = []
label_list1 = []
for iteration in range(test_batch_num):
i_start = iteration * batch_size
i_end = min((iteration + 1) * batch_size, len(test_cases))
batch_nodes = test_cases[i_start:i_end]
batch_label = labels[i_start:i_end]
gnn_prob, label_prob1 = model.to_prob(batch_nodes, batch_label, train_flag=False)
gnn_prob_arr = gnn_prob.data.cpu().numpy()[:, 1]
gnn_pred = prob2pred(gnn_prob_arr, thres)
f1_label1 += f1_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1), average="macro")
acc_label1 += accuracy_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1))
recall_label1 += recall_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1), average="macro")
gnn_pred_list.extend(gnn_pred.tolist())
gnn_prob_list.extend(gnn_prob_arr.tolist())
label_list1.extend(label_prob1.data.cpu().numpy()[:, 1].tolist())
auc_gnn = roc_auc_score(labels, np.array(gnn_prob_list))
# ap_gnn = average_precision_score(labels, np.array(gnn_prob_list))
auc_label1 = roc_auc_score(labels, np.array(label_list1))
ap_label1 = average_precision_score(labels, np.array(label_list1))
f1_binary_1_gnn = f1_score(labels, np.array(gnn_pred_list), pos_label=1, average='binary')
f1_binary_0_gnn = f1_score(labels, np.array(gnn_pred_list), pos_label=0, average='binary')
# f1_micro_gnn = f1_score(labels, np.array(gnn_pred_list), average='micro')
f1_macro_gnn = f1_score(labels, np.array(gnn_pred_list), average='macro')
conf_gnn = confusion_matrix(labels, np.array(gnn_pred_list))
tn, fp, fn, tp = conf_gnn.ravel()
# gmean_gnn = conf_gmean(conf_gnn)
print("________________________________________________________________")
print(f" GNN TP: {tp}\tTN: {tn}\tFN: {fn}\tFP: {fp}")
print(f" GNN F1-binary-1: {f1_binary_1_gnn:.4f}\tF1-binary-0: {f1_binary_0_gnn:.4f}"+
f"\tF1-macro: {f1_macro_gnn:.4f}\tAUC: {auc_gnn:.4f}")
print(f"Label1 F1: {f1_label1 / test_batch_num:.4f}\tAccuracy: {acc_label1 / test_batch_num:.4f}"+
f"\tRecall: {recall_label1 / test_batch_num:.4f}\tAUC: {auc_label1:.4f}\tAP: {ap_label1:.4f}")
print("________________________________________________________________")
sorted_gnn_prob_list = list(gnn_prob_list)
sorted_labels_list = list(labels)
combined = list(zip(sorted_gnn_prob_list, sorted_labels_list))
sorted_combined = sorted(combined, key=lambda x: x[0], reverse=True)
sorted_gnn_prob_list, sorted_labels_list = zip(*sorted_combined)
sorted_conf_gnn = confusion_matrix(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
sorted_tn, sorted_fp, sorted_fn, sorted_tp = sorted_conf_gnn.ravel()
sorted_auc_gnn = roc_auc_score(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
if test:
return f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
else:
return tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp
def conf_gmean(conf):
tn, fp, fn, tp = conf.ravel()
return (tp*tn/((tp+fn)*(tn+fp)))**0.5
```
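`pick_step` is the "pick" step of the paper: each training node is sampled with probability proportional to its degree divided by its label frequency, so minority (fraud) nodes are drawn far more often than their raw share. A small numeric sketch that re-derives those weights on made-up degrees and labels:
```python
import numpy as np
import random

y_train = np.array([0, 0, 0, 0, 1])        # 4 benign, 1 fraud (hypothetical)
degree_train = np.array([3, 5, 2, 4, 3])   # hypothetical node degrees

# label frequency: num_positives for fraud nodes, total size for benign nodes
lf_train = (y_train.sum() - len(y_train)) * y_train + len(y_train)
print(lf_train)                            # [5 5 5 5 1]
smp_prob = degree_train / lf_train
print(smp_prob)                            # fraud node weight 3.0 vs benign weights <= 1.0

idx_train = [10, 11, 12, 13, 14]           # hypothetical node ids
print(random.choices(idx_train, weights=smp_prob, k=4))
```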
# main.py
```python!
import argparse
import yaml
import torch
# import time
import numpy as np
# from collections import defaultdict, OrderedDict
from src.model_handler import ModelHandler
################################################################################
# Main #
################################################################################
def set_random_seed(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
def main(config):
print_config(config)
set_random_seed(config['seed'])
model = ModelHandler(config)
f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = model.train()
print("Testing result----------------------------------")
print("f1_macro_gnn: {}".format(f1_macro_gnn))
print("f1_binary_1_gnn: {}".format(f1_binary_1_gnn))
print("f1_binary_0_gnn: {}".format(f1_binary_0_gnn))
print("auc_gnn: {}".format(auc_gnn))
print("------------------------------------------------")
print("tp: {}".format(tp),"fn: {}".format(fn))
print("fp: {}".format(fp),"tn: {}".format(tn))
print("Precision: {}".format(tp/(tp+fp)))
print("Recall: {}".format( tp/(tp+fn)))
print("For @",config['top'],"-------------------------")
print("sorted_tp: {}".format(sorted_tp),"sorted_fn: {}".format(sorted_fn))
print("sorted_fp: {}".format(sorted_fp),"sorted_tn: {}".format(sorted_tn))
print("sorted_auc_gnn: {}".format(sorted_auc_gnn))
print("sorted_Precision: {}".format(sorted_tp/(sorted_tp+sorted_fp)))
print("sorted_Recall: {}".format( sorted_tp/(sorted_tp+sorted_fn)))
################################################################################
# ArgParse and Helper Functions #
################################################################################
def get_config(config_path="config.yml"):
with open(config_path, "r") as setting:
config = yaml.load(setting, Loader=yaml.FullLoader)
return config
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
parser.add_argument('--multi_run', action='store_true', help='flag: multi run')
args = vars(parser.parse_args())
return args
def print_config(config):
print("**************** MODEL CONFIGURATION ****************")
for key in sorted(config.keys()):
val = config[key]
keystr = "{}".format(key) + (" " * (24 - len(key)))
print("{} --> {}".format(keystr, val))
print("**************** MODEL CONFIGURATION ****************")
################################################################################
# Module Command-line Behavior #
################################################################################
if __name__ == '__main__':
cfg = get_args()
config = get_config(cfg['config'])
main(config)
```
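`main.py` only resolves `--config` to a YAML file and hands the parsed dict to `ModelHandler`. A self-contained sketch of that loading step, writing a tiny hypothetical YAML and parsing it the same way `get_config` does (keys and values are abbreviated placeholders, not the project's shipped config):
```python
import yaml

# a hypothetical, abbreviated config in the flat key: value form main.py expects
cfg_text = """
data_name: yelp
data_dir: data/
seed: 42
model: PCGNN
emb_size: 64
batch_size: 1024
"""

with open("example_config.yml", "w") as f:
	f.write(cfg_text)

with open("example_config.yml", "r") as setting:
	config = yaml.load(setting, Loader=yaml.FullLoader)   # same call as get_config()

print(config["model"], config["emb_size"])
# a real run (python main.py --config <file>) needs every key that ModelHandler reads
```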
# test_model.py
```python!
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
import pickle
from scipy.io import loadmat
path_saver = 'pytorch_models\\2023-09-24 23-02-39\\yelp_PCGNN.pkl'
data_name = "Amazon"
thres = 0.5
top = 200
batch_size = 1024
def prob2pred(y_prob, thres=0.5):
"""
Convert probability to predicted results according to given threshold
:param y_prob: numpy array of probability in [0, 1]
:param thres: binary classification threshold, default 0.5
:returns: the predicted result with the same shape as y_prob
"""
y_pred = np.zeros_like(y_prob, dtype=np.int32)
y_pred[y_prob >= thres] = 1
y_pred[y_prob < thres] = 0
return y_pred
def test_pcgnn(test_cases, labels, model, batch_size, thres=0.5, top = 6000):
"""
Test the performance of PC-GNN and its variants
	:param test_cases: a list of testing nodes
	:param labels: a list of testing node labels
	:param model: the GNN model
	:param batch_size: the number of nodes in a batch
	:returns: the confusion-matrix counts of the GNN module, overall and within the top-ranked nodes, plus the top-ranked AUC
"""
test_batch_num = int(len(test_cases) / batch_size) + 1
gnn_pred_list = []
gnn_prob_list = []
label_list1 = []
for iteration in range(test_batch_num):
i_start = iteration * batch_size
i_end = min((iteration + 1) * batch_size, len(test_cases))
batch_nodes = test_cases[i_start:i_end]
batch_label = labels[i_start:i_end]
gnn_prob, label_prob1 = model.to_prob(batch_nodes, batch_label, train_flag=False)
gnn_prob_arr = gnn_prob.data.cpu().numpy()[:, 1]
gnn_pred = prob2pred(gnn_prob_arr, thres)
gnn_pred_list.extend(gnn_pred.tolist())
gnn_prob_list.extend(gnn_prob_arr.tolist())
label_list1.extend(label_prob1.data.cpu().numpy()[:, 1].tolist())
conf_gnn = confusion_matrix(labels, np.array(gnn_pred_list))
tn, fp, fn, tp = conf_gnn.ravel()
# gmean_gnn = conf_gmean(conf_gnn)
sorted_gnn_prob_list = list(gnn_prob_list)
sorted_labels_list = list(labels)
combined = list(zip(sorted_gnn_prob_list, sorted_labels_list))
sorted_combined = sorted(combined, key=lambda x: x[0], reverse=True)
sorted_gnn_prob_list, sorted_labels_list = zip(*sorted_combined)
sorted_conf_gnn = confusion_matrix(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
sorted_tn, sorted_fp, sorted_fn, sorted_tp = sorted_conf_gnn.ravel()
sorted_auc_gnn = roc_auc_score(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
return tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
def conf_gmean(conf):
tn, fp, fn, tp = conf.ravel()
return (tp*tn/((tp+fn)*(tn+fp)))**0.5
def load_data(prefix='data/'):
	"""
	Load graph, features, and labels for the Amazon dataset
	:returns: the homo and single-relation graphs, features, labels
"""
data_file = loadmat(prefix + 'Amazon.mat')
labels = data_file['label'].flatten()
feat_data = data_file['features'].todense().A
# load the preprocessed adj_lists
with open(prefix + 'amz_homo_adjlists.pickle', 'rb') as file:
homo = pickle.load(file)
file.close()
with open(prefix + 'amz_upu_adjlists.pickle', 'rb') as file:
relation1 = pickle.load(file)
file.close()
with open(prefix + 'amz_usu_adjlists.pickle', 'rb') as file:
relation2 = pickle.load(file)
file.close()
with open(prefix + 'amz_uvu_adjlists.pickle', 'rb') as file:
relation3 = pickle.load(file)
return [homo, relation1, relation2, relation3], feat_data, labels
[homo, relation1, relation2, relation3], feat_data, labels = load_data(prefix='./data/')
index = list(range(len(labels)))
print(f'Run on {data_name}, positive/total num: {np.sum(labels)}/{len(labels)}')
print(f"Classification threshold: {thres}")
print(f"Feature dimension: {feat_data.shape[1]}")
gnn_model = torch.load(path_saver)
tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = test_pcgnn(index, labels, gnn_model, batch_size = 1024, thres = 0.5, top = top)
print("------------------------------------------------")
print("tp: {}".format(tp),"fn: {}".format(fn))
print("fp: {}".format(fp),"tn: {}".format(tn))
print("Precision: {}".format(tp/(tp+fp)))
print("Recall: {}".format( tp/(tp+fn)))
print("For @2000-------------------------")
print("sorted_tp: {}".format(sorted_tp),"sorted_fn: {}".format(sorted_fn))
print("sorted_fp: {}".format(sorted_fp),"sorted_tn: {}".format(sorted_tn))
print("sorted_auc_gnn: {}".format(sorted_auc_gnn))
print("sorted_Precision: {}".format(sorted_tp/(sorted_tp+sorted_fp)))
print("sorted_Recall: {}".format( sorted_tp/(sorted_tp+sorted_fn)))
```
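The "sorted" metrics printed above are precision and recall over the `top` highest-scoring nodes, i.e. what a reviewer would see after inspecting only the model's most suspicious cases. A tiny sketch of that top-k confusion computation on made-up probabilities:
```python
import numpy as np
from sklearn.metrics import confusion_matrix

probs  = np.array([0.95, 0.80, 0.70, 0.40, 0.30, 0.10])   # fake fraud probabilities
labels = np.array([1,    1,    0,    1,    0,    0   ])   # fake ground truth
top = 3

order = np.argsort(-probs)                                 # sort nodes by descending score
top_labels = labels[order][:top]
top_preds = (probs[order][:top] >= 0.5).astype(int)        # same thresholding as prob2pred

tn, fp, fn, tp = confusion_matrix(top_labels, top_preds, labels=[0, 1]).ravel()
print(f"precision@{top}: {tp / (tp + fp):.2f}, recall within top-{top}: {tp / (tp + fn):.2f}")
```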