# SRC
## graphsage.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
# import torch.nn.functional as F
from torch.autograd import Variable
# import random
"""
GraphSAGE implementations
Paper: Inductive Representation Learning on Large Graphs
Source: https://github.com/williamleif/graphsage-simple/
"""
class GraphSage(nn.Module):
"""
Vanilla GraphSAGE Model
Code partially from https://github.com/williamleif/graphsage-simple/
"""
def __init__(self, num_classes, enc):
super(GraphSage, self).__init__()
self.enc = enc
self.xent = nn.CrossEntropyLoss()
self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes):
embeds = self.enc(nodes)
scores = self.weight.mm(embeds)
return scores.t()
def to_prob(self, nodes):
pos_scores = torch.sigmoid(self.forward(nodes))
return pos_scores
def loss(self, nodes, labels):
scores = self.forward(nodes)
return self.xent(scores, labels.squeeze())
class GCN(nn.Module):
"""
Vanilla GCN Model
Code partially from https://github.com/williamleif/graphsage-simple/
"""
def __init__(self, num_classes, enc):
super(GCN, self).__init__()
self.enc = enc
self.xent = nn.CrossEntropyLoss()
self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes):
embeds = self.enc(nodes)
scores = self.weight.mm(embeds)
return scores.t()
def to_prob(self, nodes):
pos_scores = torch.sigmoid(self.forward(nodes))
return pos_scores
def loss(self, nodes, labels):
scores = self.forward(nodes)
return self.xent(scores, labels.squeeze())
```
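A minimal usage sketch (not part of the repo): `GraphSage` only needs an encoder object that exposes `embed_dim` and returns an `(embed_dim, batch)` embedding matrix, so a toy encoder is enough to check the expected shapes. `ToyEncoder` and all numbers below are hypothetical placeholders.
```python
import torch
import torch.nn as nn
from src.graphsage import GraphSage

class ToyEncoder(nn.Module):
	"""Hypothetical stand-in for the real GraphSAGE encoder."""
	def __init__(self, num_nodes, embed_dim):
		super().__init__()
		self.embed_dim = embed_dim
		self.emb = nn.Embedding(num_nodes, embed_dim)
	def forward(self, nodes):
		# GraphSage.forward expects an (embed_dim, batch) matrix
		return self.emb(torch.LongTensor(nodes)).t()

enc = ToyEncoder(num_nodes=10, embed_dim=16)
model = GraphSage(num_classes=2, enc=enc)
nodes = [0, 3, 7]
labels = torch.LongTensor([0, 1, 0])
print(model.to_prob(nodes).shape)   # torch.Size([3, 2])
print(model.loss(nodes, labels))    # scalar cross-entropy loss
```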
## layers.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.autograd import Variable
from operator import itemgetter
import math
"""
PC-GNN Layers
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
Modified from https://github.com/YingtongDou/CARE-GNN
"""
class InterAgg(nn.Module):
def __init__(self, features, feature_dim, embed_dim,
train_pos, adj_lists, intraggs, inter='GNN', cuda=True):
"""
Initialize the inter-relation aggregator
:param features: the input node features or embeddings for all nodes
:param feature_dim: the input dimension
:param embed_dim: the embed dimension
:param train_pos: positive samples in training set
:param adj_lists: a list of adjacency lists for each single-relation graph
:param intraggs: the intra-relation aggregators used by each single-relation graph
:param inter: NOT used in this version, the aggregator type: 'Att', 'Weight', 'Mean', 'GNN'
:param cuda: whether to use GPU
"""
super(InterAgg, self).__init__()
self.features = features
self.dropout = 0.6
self.adj_lists = adj_lists
self.intra_agg1 = intraggs[0]
self.intra_agg2 = intraggs[1]
self.intra_agg3 = intraggs[2]
self.embed_dim = embed_dim
self.feat_dim = feature_dim
self.inter = inter
self.cuda = cuda
self.intra_agg1.cuda = cuda
self.intra_agg2.cuda = cuda
self.intra_agg3.cuda = cuda
self.train_pos = train_pos
# initial filtering thresholds
self.thresholds = [0.5, 0.5, 0.5]
# parameter used to transform node embeddings before inter-relation aggregation
self.weight = nn.Parameter(torch.FloatTensor(self.embed_dim*len(intraggs)+self.feat_dim, self.embed_dim))
init.xavier_uniform_(self.weight)
# label predictor for similarity measure
self.label_clf = nn.Linear(self.feat_dim, 2)
# initialize the parameter logs
self.weights_log = []
self.thresholds_log = [self.thresholds]
self.relation_score_log = []
def forward(self, nodes, labels, train_flag=True):
"""
:param nodes: a list of batch node ids
:param labels: a list of batch node labels
:param train_flag: indicates whether in training or testing mode
:return combined: the embeddings of a batch of input node features
:return center_scores: the label-aware scores of batch nodes
"""
# extract 1-hop neighbor ids from adj lists of each single-relation graph
to_neighs = []
for adj_list in self.adj_lists:
to_neighs.append([set(adj_list[int(node)]) for node in nodes])
# find unique nodes and their neighbors used in current batch
unique_nodes = set.union(set.union(*to_neighs[0]), set.union(*to_neighs[1]),
set.union(*to_neighs[2], set(nodes)))
# calculate label-aware scores
if self.cuda:
batch_features = self.features(torch.cuda.LongTensor(list(unique_nodes)))
pos_features = self.features(torch.cuda.LongTensor(list(self.train_pos)))
else:
batch_features = self.features(torch.LongTensor(list(unique_nodes)))
pos_features = self.features(torch.LongTensor(list(self.train_pos)))
batch_scores = self.label_clf(batch_features)
pos_scores = self.label_clf(pos_features)
id_mapping = {node_id: index for node_id, index in zip(unique_nodes, range(len(unique_nodes)))}
# the label-aware scores for current batch of nodes
center_scores = batch_scores[itemgetter(*nodes)(id_mapping), :]
# get neighbor node id list for each batch node and relation
r1_list = [list(to_neigh) for to_neigh in to_neighs[0]]
r2_list = [list(to_neigh) for to_neigh in to_neighs[1]]
r3_list = [list(to_neigh) for to_neigh in to_neighs[2]]
# assign label-aware scores to neighbor nodes for each batch node and relation
r1_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r1_list]
r2_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r2_list]
r3_scores = [batch_scores[itemgetter(*to_neigh)(id_mapping), :].view(-1, 2) for to_neigh in r3_list]
# count the number of neighbors kept for aggregation for each batch node and relation
r1_sample_num_list = [math.ceil(len(neighs) * self.thresholds[0]) for neighs in r1_list]
r2_sample_num_list = [math.ceil(len(neighs) * self.thresholds[1]) for neighs in r2_list]
r3_sample_num_list = [math.ceil(len(neighs) * self.thresholds[2]) for neighs in r3_list]
# intra-aggregation steps for each relation
# Eq. (8) in the paper
r1_feats, r1_scores = self.intra_agg1.forward(nodes, labels, r1_list, center_scores, r1_scores, pos_scores, r1_sample_num_list, train_flag)
r2_feats, r2_scores = self.intra_agg2.forward(nodes, labels, r2_list, center_scores, r2_scores, pos_scores, r2_sample_num_list, train_flag)
r3_feats, r3_scores = self.intra_agg3.forward(nodes, labels, r3_list, center_scores, r3_scores, pos_scores, r3_sample_num_list, train_flag)
# get features or embeddings for batch nodes
if self.cuda and isinstance(nodes, list):
index = torch.LongTensor(nodes).cuda()
else:
index = torch.LongTensor(nodes)
self_feats = self.features(index)
# number of nodes in a batch
n = len(nodes)
# concat the intra-aggregated embeddings from each relation
# Eq. (9) in the paper
cat_feats = torch.cat((self_feats, r1_feats, r2_feats, r3_feats), dim=1)
combined = F.relu(cat_feats.mm(self.weight).t())
return combined, center_scores
class IntraAgg(nn.Module):
def __init__(self, features, feat_dim, embed_dim, train_pos, rho, cuda=False):
"""
Initialize the intra-relation aggregator
:param features: the input node features or embeddings for all nodes
:param feat_dim: the input dimension
:param embed_dim: the embed dimension
:param train_pos: positive samples in training set
		:param rho: the ratio of oversampled neighbors for the minority class
:param cuda: whether to use GPU
"""
super(IntraAgg, self).__init__()
self.features = features
self.cuda = cuda
self.feat_dim = feat_dim
self.embed_dim = embed_dim
self.train_pos = train_pos
self.rho = rho
self.weight = nn.Parameter(torch.FloatTensor(2*self.feat_dim, self.embed_dim))
init.xavier_uniform_(self.weight)
def forward(self, nodes, batch_labels, to_neighs_list, batch_scores, neigh_scores, pos_scores, sample_list, train_flag):
"""
Code partially from https://github.com/williamleif/graphsage-simple/
:param nodes: list of nodes in a batch
:param to_neighs_list: neighbor node id list for each batch node in one relation
:param batch_scores: the label-aware scores of batch nodes
		:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
		:param pos_scores: the label-aware scores of the minority (positive) nodes in the training set
:param train_flag: indicates whether in training or testing mode
:param sample_list: the number of neighbors kept for each batch node in one relation
		:return to_feats: the aggregated embeddings of the batch nodes' neighbors in one relation
		:return samp_scores: the distance scores of the selected neighbors for each batch node after filtering
"""
		# filter neighbors under the given relation in train mode
if train_flag:
samp_neighs, samp_scores = choose_step_neighs(batch_scores, batch_labels, neigh_scores, to_neighs_list, pos_scores, self.train_pos, sample_list, self.rho)
else:
samp_neighs, samp_scores = choose_step_test(batch_scores, neigh_scores, to_neighs_list, sample_list)
# find the unique nodes among batch nodes and the filtered neighbors
unique_nodes_list = list(set.union(*samp_neighs))
unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}
# intra-relation aggregation only with sampled neighbors
mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes)))
column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
row_indices = [i for i in range(len(samp_neighs)) for _ in range(len(samp_neighs[i]))]
mask[row_indices, column_indices] = 1
if self.cuda:
mask = mask.cuda()
num_neigh = mask.sum(1, keepdim=True)
mask = mask.div(num_neigh) # mean aggregator
if self.cuda:
self_feats = self.features(torch.LongTensor(nodes).cuda())
embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
else:
self_feats = self.features(torch.LongTensor(nodes))
embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
agg_feats = mask.mm(embed_matrix) # single relation aggregator
cat_feats = torch.cat((self_feats, agg_feats), dim=1) # concat with last layer
to_feats = F.relu(cat_feats.mm(self.weight))
return to_feats, samp_scores
def choose_step_neighs(center_scores, center_labels, neigh_scores, neighs_list, minor_scores, minor_list, sample_list, sample_rate):
"""
Choose step for neighborhood sampling
:param center_scores: the label-aware scores of batch nodes
:param center_labels: the label of batch nodes
	:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
:param neighs_list: neighbor node id list for each batch node in one relation
:param minor_scores: the label-aware scores for nodes of minority class in one relation
	:param minor_list: the minority (positive) node id list from the training set
:param sample_list: the number of neighbors kept for each batch node in one relation
	:param sample_rate: the ratio of oversampled neighbors for the minority class
"""
samp_neighs = []
samp_score_diff = []
for idx, center_score in enumerate(center_scores):
center_score = center_scores[idx][0]
neigh_score = neigh_scores[idx][:, 0].view(-1, 1)
center_score_neigh = center_score.repeat(neigh_score.size()[0], 1)
neighs_indices = neighs_list[idx]
num_sample = sample_list[idx]
# compute the L1-distance of batch nodes and their neighbors
score_diff_neigh = torch.abs(center_score_neigh - neigh_score).squeeze()
sorted_score_diff_neigh, sorted_neigh_indices = torch.sort(score_diff_neigh, dim=0, descending=False)
selected_neigh_indices = sorted_neigh_indices.tolist()
# top-p sampling according to distance ranking
if len(neigh_scores[idx]) > num_sample + 1:
selected_neighs = [neighs_indices[n] for n in selected_neigh_indices[:num_sample]]
selected_score_diff = sorted_score_diff_neigh.tolist()[:num_sample]
else:
selected_neighs = neighs_indices
selected_score_diff = score_diff_neigh.tolist()
if isinstance(selected_score_diff, float):
selected_score_diff = [selected_score_diff]
if center_labels[idx] == 1:
num_oversample = int(num_sample * sample_rate)
center_score_minor = center_score.repeat(minor_scores.size()[0], 1)
score_diff_minor = torch.abs(center_score_minor - minor_scores[:, 0].view(-1, 1)).squeeze()
sorted_score_diff_minor, sorted_minor_indices = torch.sort(score_diff_minor, dim=0, descending=False)
selected_minor_indices = sorted_minor_indices.tolist()
selected_neighs.extend([minor_list[n] for n in selected_minor_indices[:num_oversample]])
selected_score_diff.extend(sorted_score_diff_minor.tolist()[:num_oversample])
samp_neighs.append(set(selected_neighs))
samp_score_diff.append(selected_score_diff)
return samp_neighs, samp_score_diff
def choose_step_test(center_scores, neigh_scores, neighs_list, sample_list):
"""
	Filter neighbors according to the label predictor result with adaptive thresholds
:param center_scores: the label-aware scores of batch nodes
	:param neigh_scores: the label-aware scores of the 1-hop neighbors of each batch node in one relation
:param neighs_list: neighbor node id list for each batch node in one relation
:param sample_list: the number of neighbors kept for each batch node in one relation
	:return samp_neighs: the selected neighbor indices for each batch node
	:return samp_scores: the distance scores of the selected neighbors for each batch node after filtering
"""
samp_neighs = []
samp_scores = []
for idx, center_score in enumerate(center_scores):
center_score = center_scores[idx][0]
neigh_score = neigh_scores[idx][:, 0].view(-1, 1)
center_score = center_score.repeat(neigh_score.size()[0], 1)
neighs_indices = neighs_list[idx]
num_sample = sample_list[idx]
# compute the L1-distance of batch nodes and their neighbors
score_diff = torch.abs(center_score - neigh_score).squeeze()
sorted_scores, sorted_indices = torch.sort(score_diff, dim=0, descending=False)
selected_indices = sorted_indices.tolist()
# top-p sampling according to distance ranking and thresholds
if len(neigh_scores[idx]) > num_sample + 1:
selected_neighs = [neighs_indices[n] for n in selected_indices[:num_sample]]
selected_scores = sorted_scores.tolist()[:num_sample]
else:
selected_neighs = neighs_indices
selected_scores = score_diff.tolist()
if isinstance(selected_scores, float):
selected_scores = [selected_scores]
samp_neighs.append(set(selected_neighs))
samp_scores.append(selected_scores)
return samp_neighs, samp_scores
```
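`IntraAgg.forward` does its mean aggregation with a dense row-normalized mask instead of explicit loops: rows index batch nodes, columns index the unique sampled neighbors, and `mask.mm(embed_matrix)` yields each node's mean neighbor embedding. A standalone sketch of just that step on made-up neighbor sets and features (all values are hypothetical):
```python
import torch

# Hypothetical sampled neighbor sets for a batch of 3 nodes (after the "choose" step)
samp_neighs = [{1, 2}, {2, 3, 4}, {0}]
unique_nodes_list = list(set.union(*samp_neighs))      # e.g. [0, 1, 2, 3, 4]
unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

# build the (batch, unique-neighbor) membership mask
mask = torch.zeros(len(samp_neighs), len(unique_nodes))
col = [unique_nodes[n] for neighs in samp_neighs for n in neighs]
row = [i for i, neighs in enumerate(samp_neighs) for _ in neighs]
mask[row, col] = 1
mask = mask / mask.sum(1, keepdim=True)                # row-normalize -> mean aggregator

embed_matrix = torch.randn(len(unique_nodes_list), 8)  # fake 8-dim features of the unique neighbors
agg_feats = mask.mm(embed_matrix)                      # (3, 8): mean neighbor embedding per batch node
print(agg_feats.shape)
```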
## model_handler.py
```python!
import time, datetime
import os
import random
import argparse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from src.utils import test_pcgnn, load_data, pos_neg_split, normalize, pick_step
from src.model import PCALayer
from src.layers import InterAgg, IntraAgg
from src.graphsage import *
# explicit imports for names otherwise pulled in only via the star import above
import torch
import torch.nn as nn
from torch.autograd import Variable
"""
Training PC-GNN
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
"""
class ModelHandler(object):
def __init__(self, config):
args = argparse.Namespace(**config)
# load graph, feature, and label
[homo, relation1, relation2, relation3], feat_data, labels = load_data(args.data_name, prefix=args.data_dir)
# train_test split
np.random.seed(args.seed)
random.seed(args.seed)
if args.data_name == 'yelp':
index = list(range(len(labels)))
idx_train, idx_rest, y_train, y_rest = train_test_split(index, labels, stratify=labels, train_size=args.train_ratio,
random_state=2, shuffle=True)
idx_valid, idx_test, y_valid, y_test = train_test_split(idx_rest, y_rest, stratify=y_rest, test_size=args.test_ratio,
random_state=2, shuffle=True)
		print(f'Run on {args.data_name}, positive/total num: {np.sum(labels)}/{len(labels)}, train num {len(y_train)}, '+
f'valid num {len(y_valid)}, test num {len(y_test)}, test positive num {np.sum(y_test)}')
print(f"Classification threshold: {args.thres}")
print(f"Feature dimension: {feat_data.shape[1]}")
# split pos neg sets for under-sampling
train_pos, train_neg = pos_neg_split(idx_train, y_train)
feat_data = normalize(feat_data)
args.cuda = not args.no_cuda and torch.cuda.is_available()
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_id
adj_lists = [relation1, relation2, relation3]
print(f'Model: {args.model}, multi-relation aggregator: {args.multi_relation}, emb_size: {args.emb_size}.')
self.args = args
self.dataset = {'feat_data': feat_data, 'labels': labels, 'adj_lists': adj_lists, 'homo': homo,
'idx_train': idx_train, 'idx_valid': idx_valid, 'idx_test': idx_test,
'y_train': y_train, 'y_valid': y_valid, 'y_test': y_test,
'train_pos': train_pos, 'train_neg': train_neg}
def train(self):
args = self.args
feat_data, adj_lists = self.dataset['feat_data'], self.dataset['adj_lists']
idx_train, y_train = self.dataset['idx_train'], self.dataset['y_train']
idx_valid, y_valid, idx_test, y_test = self.dataset['idx_valid'], self.dataset['y_valid'], self.dataset['idx_test'], self.dataset['y_test']
# initialize model input
features = nn.Embedding(feat_data.shape[0], feat_data.shape[1])
features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
if args.cuda:
features.cuda()
# build one-layer models
if args.model == 'PCGNN':
intra1 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
intra2 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
intra3 = IntraAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'], args.rho, cuda=args.cuda)
inter1 = InterAgg(features, feat_data.shape[1], args.emb_size, self.dataset['train_pos'],
adj_lists, [intra1, intra2, intra3], inter=args.multi_relation, cuda=args.cuda)
if args.model == 'PCGNN':
gnn_model = PCALayer(2, inter1, args.alpha)
if args.cuda:
gnn_model.cuda()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gnn_model.parameters()), lr=args.lr, weight_decay=args.weight_decay)
timestamp = time.time()
timestamp = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H-%M-%S')
dir_saver = args.save_dir+timestamp
path_saver = os.path.join(dir_saver, '{}_{}.pkl'.format(args.data_name, args.model))
precision_best, ep_best = 0, -1
# train the model
for epoch in range(args.num_epochs):
sampled_idx_train = pick_step(idx_train, y_train, self.dataset['homo'], size=len(self.dataset['train_pos'])*2)
random.shuffle(sampled_idx_train)
num_batches = int(len(sampled_idx_train) / args.batch_size) + 1
			epoch_loss = 0.0
epoch_time = 0
# mini-batch training
for batch in range(num_batches):
start_time = time.time()
i_start = batch * args.batch_size
i_end = min((batch + 1) * args.batch_size, len(sampled_idx_train))
batch_nodes = sampled_idx_train[i_start:i_end]
batch_label = self.dataset['labels'][np.array(batch_nodes)]
optimizer.zero_grad()
if args.cuda:
loss = gnn_model.loss(batch_nodes, Variable(torch.cuda.LongTensor(batch_label)))
else:
loss = gnn_model.loss(batch_nodes, Variable(torch.LongTensor(batch_label)))
loss.backward()
optimizer.step()
end_time = time.time()
epoch_time += end_time - start_time
				epoch_loss += loss.item()
			print(f'Epoch: {epoch}, loss: {epoch_loss / num_batches}, time: {epoch_time}s')
			# validate the model every $valid_epochs$ epochs
if epoch % args.valid_epochs == 0:
print("Valid at epoch {}".format(epoch))
tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp = test_pcgnn(idx_valid, y_valid, gnn_model, args.batch_size, args.thres, test = False, top = args.top)
precision_val = sorted_tp/(sorted_tp+sorted_fp)
if precision_val > precision_best:
precision_best, ep_best = precision_val, epoch
if not os.path.exists(dir_saver):
os.makedirs(dir_saver)
print(" sorted_tn, sorted_fp, sorted_fn, sorted_tp: ", sorted_tn, sorted_fp, sorted_fn, sorted_tp)
print("tn, fp, fn, tp: ",tn, fp, fn, tp)
print(' Saving model ... at epoch:', epoch)
# torch.save(gnn_model.state_dict(), path_saver)
torch.save(gnn_model, path_saver)
print("Restore model from epoch {}".format(ep_best))
print("Model path: {}".format(path_saver))
# gnn_model.load_state_dict(torch.load(path_saver))
gnn_model = torch.load(path_saver)
f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = test_pcgnn(idx_test, y_test, gnn_model, args.batch_size, args.thres, test = True, top = args.top)
return f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
```
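`ModelHandler` reads its settings through `argparse.Namespace(**config)`, so any dict with the keys accessed above works. A hedged sketch of driving it directly from Python; the key names mirror the `args.*` reads in this file, but every value is a made-up placeholder, and the YelpChi `.mat` file and adjacency pickles must already sit under `data/`:
```python
from src.model_handler import ModelHandler

# placeholder values only; the keys mirror the args.* reads in ModelHandler
config = {
	'data_name': 'yelp', 'data_dir': 'data/', 'seed': 42,
	'train_ratio': 0.4, 'test_ratio': 0.67, 'thres': 0.5, 'top': 1000,
	'model': 'PCGNN', 'multi_relation': 'GNN', 'emb_size': 64,
	'rho': 0.5, 'alpha': 2, 'lr': 0.01, 'weight_decay': 0.001,
	'batch_size': 1024, 'num_epochs': 30, 'valid_epochs': 5,
	'no_cuda': True, 'cuda_id': '0', 'save_dir': 'pytorch_models/',
}

handler = ModelHandler(config)   # expects the YelpChi data files under data/
results = handler.train()        # returns the same tuple that main.py unpacks
```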
## model.py
```python!
import torch
import torch.nn as nn
from torch.nn import init
"""
PC-GNN Model
Paper: Pick and Choose: A GNN-based Imbalanced Learning Approach for Fraud Detection
Modified from https://github.com/YingtongDou/CARE-GNN
"""
class PCALayer(nn.Module):
"""
One Pick-Choose-Aggregate layer
"""
def __init__(self, num_classes, inter1, lambda_1):
"""
Initialize the PC-GNN model
:param num_classes: number of classes (2 in our paper)
		:param inter1: the inter-relation aggregator that outputs the final embedding
"""
super(PCALayer, self).__init__()
self.inter1 = inter1
self.xent = nn.CrossEntropyLoss()
# the parameter to transform the final embedding
self.weight = nn.Parameter(torch.FloatTensor(num_classes, inter1.embed_dim))
init.xavier_uniform_(self.weight)
self.lambda_1 = lambda_1
self.epsilon = 0.1
def forward(self, nodes, labels, train_flag=True):
embeds1, label_scores = self.inter1(nodes, labels, train_flag)
scores = self.weight.mm(embeds1)
return scores.t(), label_scores
def to_prob(self, nodes, labels, train_flag=True):
gnn_logits, label_logits = self.forward(nodes, labels, train_flag)
gnn_scores = torch.sigmoid(gnn_logits)
label_scores = torch.sigmoid(label_logits)
return gnn_scores, label_scores
# def loss(self, nodes, labels, train_flag=True):
# gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
# # Simi loss, Eq. (7) in the paper
# label_loss = self.xent(label_scores, labels.squeeze())
# # GNN loss, Eq. (10) in the paper
# gnn_loss = self.xent(gnn_scores, labels.squeeze())
# # the loss function of PC-GNN, Eq. (11) in the paper
# final_loss = gnn_loss + self.lambda_1 * label_loss
# return final_loss
def loss(self, nodes, labels, train_flag=True):
gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
# Simi loss, Eq. (7) in the paper
label_loss = self.xent(label_scores, labels.squeeze())
# GNN loss, Eq. (10) in the paper
predictions = torch.argmax(gnn_scores, dim=1)
predicted_as_label_1_indices = torch.nonzero(predictions == 1).squeeze()
gnn_scores_predicted_as_label_1 = gnn_scores[predicted_as_label_1_indices]
labels_predicted_as_label_1 = labels[predicted_as_label_1_indices]
gnn_loss = self.xent(gnn_scores_predicted_as_label_1, labels_predicted_as_label_1.squeeze())
# the loss function of PC-GNN, Eq. (11) in the paper
final_loss = gnn_loss + self.lambda_1 * label_loss
return final_loss
```
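The `loss` above differs from the commented-out original: the GNN cross-entropy term is evaluated only on nodes currently predicted as class 1, then combined with the label-predictor loss weighted by `lambda_1`. A toy re-run of that filtering on synthetic logits (values are made up; `reshape(-1)` is used here instead of `squeeze()` so the index tensor stays 1-D even when only one node is predicted positive):
```python
import torch
import torch.nn as nn

xent = nn.CrossEntropyLoss()
gnn_scores = torch.tensor([[ 2.0, -1.0],
                           [ 0.1,  0.3],
                           [-0.5,  1.2],
                           [ 1.0,  0.0],
                           [-0.2,  0.9]])   # fake logits for 5 nodes
labels = torch.tensor([0, 1, 1, 0, 0])

predictions = torch.argmax(gnn_scores, dim=1)            # current hard predictions
pos_idx = torch.nonzero(predictions == 1).reshape(-1)    # nodes predicted as class 1
gnn_loss = xent(gnn_scores[pos_idx], labels[pos_idx])    # loss restricted to predicted positives
print(pos_idx, gnn_loss)
```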
## utils.py
```python!
import pickle
import random
import numpy as np
import scipy.sparse as sp
from scipy.io import loadmat
import copy as cp
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, average_precision_score, confusion_matrix
from collections import defaultdict
"""
Utility functions to handle data and evaluate model.
"""
def load_data(data, prefix='data/'):
"""
	Load graph, features, and labels for the given dataset name (this version loads YelpChi)
	:returns: the homo and single-relation graphs, features, labels
"""
data_file = loadmat(prefix + 'YelpChi.mat')
labels = data_file['label'].flatten()
feat_data = data_file['features'].todense().A
# load the preprocessed adj_lists
with open(prefix + 'yelp_homo_adjlists.pickle', 'rb') as file:
homo = pickle.load(file)
file.close()
with open(prefix + 'yelp_rur_adjlists.pickle', 'rb') as file:
relation1 = pickle.load(file)
file.close()
with open(prefix + 'yelp_rtr_adjlists.pickle', 'rb') as file:
relation2 = pickle.load(file)
file.close()
with open(prefix + 'yelp_rsr_adjlists.pickle', 'rb') as file:
relation3 = pickle.load(file)
file.close()
return [homo, relation1, relation2, relation3], feat_data, labels
def normalize(mx):
"""
Row-normalize sparse matrix
Code from https://github.com/williamleif/graphsage-simple/
"""
rowsum = np.array(mx.sum(1)) + 0.01
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
mx = r_mat_inv.dot(mx)
return mx
def sparse_to_adjlist(sp_matrix, filename):
"""
Transfer sparse matrix to adjacency list
:param sp_matrix: the sparse matrix
:param filename: the filename of adjlist
"""
# add self loop
homo_adj = sp_matrix + sp.eye(sp_matrix.shape[0])
# create adj_list
adj_lists = defaultdict(set)
edges = homo_adj.nonzero()
for index, node in enumerate(edges[0]):
adj_lists[node].add(edges[1][index])
adj_lists[edges[1][index]].add(node)
with open(filename, 'wb') as file:
pickle.dump(adj_lists, file)
file.close()
def pos_neg_split(nodes, labels):
"""
Find positive and negative nodes given a list of nodes and their labels
:param nodes: a list of nodes
:param labels: a list of node labels
	:returns: the split positive and negative nodes
"""
pos_nodes = []
neg_nodes = cp.deepcopy(nodes)
aux_nodes = cp.deepcopy(nodes)
for idx, label in enumerate(labels):
if label == 1:
pos_nodes.append(aux_nodes[idx])
neg_nodes.remove(aux_nodes[idx])
return pos_nodes, neg_nodes
def pick_step(idx_train, y_train, adj_list, size):
degree_train = [len(adj_list[node]) for node in idx_train]
lf_train = (y_train.sum()-len(y_train))*y_train + len(y_train)
smp_prob = np.array(degree_train) / lf_train
return random.choices(idx_train, weights=smp_prob, k=size)
def prob2pred(y_prob, thres=0.5):
"""
Convert probability to predicted results according to given threshold
:param y_prob: numpy array of probability in [0, 1]
:param thres: binary classification threshold, default 0.5
:returns: the predicted result with the same shape as y_prob
"""
y_pred = np.zeros_like(y_prob, dtype=np.int32)
y_pred[y_prob >= thres] = 1
y_pred[y_prob < thres] = 0
return y_pred
def test_pcgnn(test_cases, labels, model, batch_size, thres=0.5, test = False, top = 6000):
"""
Test the performance of PC-GNN and its variants
	:param test_cases: a list of testing nodes
	:param labels: a list of testing node labels
	:param model: the GNN model
	:param batch_size: the number of nodes in a batch
	:returns: the confusion-matrix counts (and, in test mode, F1 and AUC) of the GNN module, overall and within the top-ranked nodes
"""
test_batch_num = int(len(test_cases) / batch_size) + 1
f1_gnn = 0.0
acc_gnn = 0.0
recall_gnn = 0.0
f1_label1 = 0.0
acc_label1 = 0.00
recall_label1 = 0.0
gnn_pred_list = []
gnn_prob_list = []
label_list1 = []
for iteration in range(test_batch_num):
i_start = iteration * batch_size
i_end = min((iteration + 1) * batch_size, len(test_cases))
batch_nodes = test_cases[i_start:i_end]
batch_label = labels[i_start:i_end]
gnn_prob, label_prob1 = model.to_prob(batch_nodes, batch_label, train_flag=False)
gnn_prob_arr = gnn_prob.data.cpu().numpy()[:, 1]
gnn_pred = prob2pred(gnn_prob_arr, thres)
f1_label1 += f1_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1), average="macro")
acc_label1 += accuracy_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1))
recall_label1 += recall_score(batch_label, label_prob1.data.cpu().numpy().argmax(axis=1), average="macro")
gnn_pred_list.extend(gnn_pred.tolist())
gnn_prob_list.extend(gnn_prob_arr.tolist())
label_list1.extend(label_prob1.data.cpu().numpy()[:, 1].tolist())
auc_gnn = roc_auc_score(labels, np.array(gnn_prob_list))
# ap_gnn = average_precision_score(labels, np.array(gnn_prob_list))
auc_label1 = roc_auc_score(labels, np.array(label_list1))
ap_label1 = average_precision_score(labels, np.array(label_list1))
f1_binary_1_gnn = f1_score(labels, np.array(gnn_pred_list), pos_label=1, average='binary')
f1_binary_0_gnn = f1_score(labels, np.array(gnn_pred_list), pos_label=0, average='binary')
# f1_micro_gnn = f1_score(labels, np.array(gnn_pred_list), average='micro')
f1_macro_gnn = f1_score(labels, np.array(gnn_pred_list), average='macro')
conf_gnn = confusion_matrix(labels, np.array(gnn_pred_list))
tn, fp, fn, tp = conf_gnn.ravel()
# gmean_gnn = conf_gmean(conf_gnn)
print("________________________________________________________________")
print(f" GNN TP: {tp}\tTN: {tn}\tFN: {fn}\tFP: {fp}")
print(f" GNN F1-binary-1: {f1_binary_1_gnn:.4f}\tF1-binary-0: {f1_binary_0_gnn:.4f}"+
f"\tF1-macro: {f1_macro_gnn:.4f}\tAUC: {auc_gnn:.4f}")
print(f"Label1 F1: {f1_label1 / test_batch_num:.4f}\tAccuracy: {acc_label1 / test_batch_num:.4f}"+
f"\tRecall: {recall_label1 / test_batch_num:.4f}\tAUC: {auc_label1:.4f}\tAP: {ap_label1:.4f}")
print("________________________________________________________________")
sorted_gnn_prob_list = list(gnn_prob_list)
sorted_labels_list = list(labels)
combined = list(zip(sorted_gnn_prob_list, sorted_labels_list))
sorted_combined = sorted(combined, key=lambda x: x[0], reverse=True)
sorted_gnn_prob_list, sorted_labels_list = zip(*sorted_combined)
sorted_conf_gnn = confusion_matrix(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
sorted_tn, sorted_fp, sorted_fn, sorted_tp = sorted_conf_gnn.ravel()
sorted_auc_gnn = roc_auc_score(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
if test:
return f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
else:
return tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp
def conf_gmean(conf):
tn, fp, fn, tp = conf.ravel()
return (tp*tn/((tp+fn)*(tn+fp)))**0.5
```
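`pick_step` is the "pick" step of the paper: each training node is sampled with probability proportional to its degree divided by its label frequency, so minority (fraud) nodes are drawn far more often than their raw share. A small numeric sketch that re-derives those weights on made-up degrees and labels:
```python
import numpy as np
import random

y_train = np.array([0, 0, 0, 0, 1])        # 4 benign, 1 fraud (hypothetical)
degree_train = np.array([3, 5, 2, 4, 3])   # hypothetical node degrees

# label frequency: num_positives for fraud nodes, total size for benign nodes
lf_train = (y_train.sum() - len(y_train)) * y_train + len(y_train)
print(lf_train)                            # [5 5 5 5 1]
smp_prob = degree_train / lf_train
print(smp_prob)                            # fraud node weight 3.0 vs benign weights <= 1.0

idx_train = [10, 11, 12, 13, 14]           # hypothetical node ids
print(random.choices(idx_train, weights=smp_prob, k=4))
```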
# main.py
```python!
import argparse
import yaml
import torch
# import time
import numpy as np
# from collections import defaultdict, OrderedDict
from src.model_handler import ModelHandler
################################################################################
# Main #
################################################################################
def set_random_seed(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
def main(config):
print_config(config)
set_random_seed(config['seed'])
model = ModelHandler(config)
f1_macro_gnn, f1_binary_1_gnn, f1_binary_0_gnn, auc_gnn, tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = model.train()
print("Testing result----------------------------------")
print("f1_macro_gnn: {}".format(f1_macro_gnn))
print("f1_binary_1_gnn: {}".format(f1_binary_1_gnn))
print("f1_binary_0_gnn: {}".format(f1_binary_0_gnn))
print("auc_gnn: {}".format(auc_gnn))
print("------------------------------------------------")
print("tp: {}".format(tp),"fn: {}".format(fn))
print("fp: {}".format(fp),"tn: {}".format(tn))
print("Precision: {}".format(tp/(tp+fp)))
print("Recall: {}".format( tp/(tp+fn)))
print("For @",config['top'],"-------------------------")
print("sorted_tp: {}".format(sorted_tp),"sorted_fn: {}".format(sorted_fn))
print("sorted_fp: {}".format(sorted_fp),"sorted_tn: {}".format(sorted_tn))
print("sorted_auc_gnn: {}".format(sorted_auc_gnn))
print("sorted_Precision: {}".format(sorted_tp/(sorted_tp+sorted_fp)))
print("sorted_Recall: {}".format( sorted_tp/(sorted_tp+sorted_fn)))
################################################################################
# ArgParse and Helper Functions #
################################################################################
def get_config(config_path="config.yml"):
with open(config_path, "r") as setting:
config = yaml.load(setting, Loader=yaml.FullLoader)
return config
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
parser.add_argument('--multi_run', action='store_true', help='flag: multi run')
args = vars(parser.parse_args())
return args
def print_config(config):
print("**************** MODEL CONFIGURATION ****************")
for key in sorted(config.keys()):
val = config[key]
keystr = "{}".format(key) + (" " * (24 - len(key)))
print("{} --> {}".format(keystr, val))
print("**************** MODEL CONFIGURATION ****************")
################################################################################
# Module Command-line Behavior #
################################################################################
if __name__ == '__main__':
cfg = get_args()
config = get_config(cfg['config'])
main(config)
```
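`main.py` only resolves `--config` to a YAML file and hands the parsed dict to `ModelHandler`. A self-contained sketch of that loading step, writing a tiny hypothetical YAML and parsing it the same way `get_config` does (keys and values are abbreviated placeholders, not the project's shipped config):
```python
import yaml

# a hypothetical, abbreviated config in the flat key: value form main.py expects
cfg_text = """
data_name: yelp
data_dir: data/
seed: 42
model: PCGNN
emb_size: 64
batch_size: 1024
"""

with open("example_config.yml", "w") as f:
	f.write(cfg_text)

with open("example_config.yml", "r") as setting:
	config = yaml.load(setting, Loader=yaml.FullLoader)   # same call as get_config()

print(config["model"], config["emb_size"])
# a real run (python main.py --config <file>) needs every key that ModelHandler reads
```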
# test_model.py
```python!
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
import pickle
from scipy.io import loadmat
path_saver = 'pytorch_models\\2023-09-24 23-02-39\\yelp_PCGNN.pkl'
data_name = "Amazon"
thres = 0.5
top = 200
batch_size = 1024
def prob2pred(y_prob, thres=0.5):
"""
Convert probability to predicted results according to given threshold
:param y_prob: numpy array of probability in [0, 1]
:param thres: binary classification threshold, default 0.5
:returns: the predicted result with the same shape as y_prob
"""
y_pred = np.zeros_like(y_prob, dtype=np.int32)
y_pred[y_prob >= thres] = 1
y_pred[y_prob < thres] = 0
return y_pred
def test_pcgnn(test_cases, labels, model, batch_size, thres=0.5, top = 6000):
"""
Test the performance of PC-GNN and its variants
	:param test_cases: a list of testing nodes
	:param labels: a list of testing node labels
	:param model: the GNN model
	:param batch_size: the number of nodes in a batch
	:returns: the confusion-matrix counts of the GNN module, overall and within the top-ranked nodes, plus the top-ranked AUC
"""
test_batch_num = int(len(test_cases) / batch_size) + 1
gnn_pred_list = []
gnn_prob_list = []
label_list1 = []
for iteration in range(test_batch_num):
i_start = iteration * batch_size
i_end = min((iteration + 1) * batch_size, len(test_cases))
batch_nodes = test_cases[i_start:i_end]
batch_label = labels[i_start:i_end]
gnn_prob, label_prob1 = model.to_prob(batch_nodes, batch_label, train_flag=False)
gnn_prob_arr = gnn_prob.data.cpu().numpy()[:, 1]
gnn_pred = prob2pred(gnn_prob_arr, thres)
gnn_pred_list.extend(gnn_pred.tolist())
gnn_prob_list.extend(gnn_prob_arr.tolist())
label_list1.extend(label_prob1.data.cpu().numpy()[:, 1].tolist())
conf_gnn = confusion_matrix(labels, np.array(gnn_pred_list))
tn, fp, fn, tp = conf_gnn.ravel()
# gmean_gnn = conf_gmean(conf_gnn)
sorted_gnn_prob_list = list(gnn_prob_list)
sorted_labels_list = list(labels)
combined = list(zip(sorted_gnn_prob_list, sorted_labels_list))
sorted_combined = sorted(combined, key=lambda x: x[0], reverse=True)
sorted_gnn_prob_list, sorted_labels_list = zip(*sorted_combined)
sorted_conf_gnn = confusion_matrix(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
sorted_tn, sorted_fp, sorted_fn, sorted_tp = sorted_conf_gnn.ravel()
sorted_auc_gnn = roc_auc_score(sorted_labels_list[:top], prob2pred(np.array(sorted_gnn_prob_list[:top]), thres))
return tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn
def conf_gmean(conf):
tn, fp, fn, tp = conf.ravel()
return (tp*tn/((tp+fn)*(tn+fp)))**0.5
def load_data(prefix='data/'):
	"""
	Load graph, features, and labels for the Amazon dataset
	:returns: the homo and single-relation graphs, features, labels
"""
data_file = loadmat(prefix + 'Amazon.mat')
labels = data_file['label'].flatten()
feat_data = data_file['features'].todense().A
# load the preprocessed adj_lists
with open(prefix + 'amz_homo_adjlists.pickle', 'rb') as file:
homo = pickle.load(file)
file.close()
with open(prefix + 'amz_upu_adjlists.pickle', 'rb') as file:
relation1 = pickle.load(file)
file.close()
with open(prefix + 'amz_usu_adjlists.pickle', 'rb') as file:
relation2 = pickle.load(file)
file.close()
with open(prefix + 'amz_uvu_adjlists.pickle', 'rb') as file:
relation3 = pickle.load(file)
return [homo, relation1, relation2, relation3], feat_data, labels
[homo, relation1, relation2, relation3], feat_data, labels = load_data(prefix='./data/')
index = list(range(len(labels)))
print(f'Run on {data_name}, positive/total num: {np.sum(labels)}/{len(labels)}')
print(f"Classification threshold: {thres}")
print(f"Feature dimension: {feat_data.shape[1]}")
gnn_model = torch.load(path_saver)
tn, fp, fn, tp, sorted_tn, sorted_fp, sorted_fn, sorted_tp, sorted_auc_gnn = test_pcgnn(index, labels, gnn_model, batch_size = 1024, thres = 0.5, top = top)
print("------------------------------------------------")
print("tp: {}".format(tp),"fn: {}".format(fn))
print("fp: {}".format(fp),"tn: {}".format(tn))
print("Precision: {}".format(tp/(tp+fp)))
print("Recall: {}".format( tp/(tp+fn)))
print("For @2000-------------------------")
print("sorted_tp: {}".format(sorted_tp),"sorted_fn: {}".format(sorted_fn))
print("sorted_fp: {}".format(sorted_fp),"sorted_tn: {}".format(sorted_tn))
print("sorted_auc_gnn: {}".format(sorted_auc_gnn))
print("sorted_Precision: {}".format(sorted_tp/(sorted_tp+sorted_fp)))
print("sorted_Recall: {}".format( sorted_tp/(sorted_tp+sorted_fn)))
```
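The "sorted" metrics printed above are precision and recall over the `top` highest-scoring nodes, i.e. what a reviewer would see after inspecting only the model's most suspicious cases. A tiny sketch of that top-k confusion computation on made-up probabilities:
```python
import numpy as np
from sklearn.metrics import confusion_matrix

probs  = np.array([0.95, 0.80, 0.70, 0.40, 0.30, 0.10])   # fake fraud probabilities
labels = np.array([1,    1,    0,    1,    0,    0   ])   # fake ground truth
top = 3

order = np.argsort(-probs)                                 # sort nodes by descending score
top_labels = labels[order][:top]
top_preds = (probs[order][:top] >= 0.5).astype(int)        # same thresholding as prob2pred

tn, fp, fn, tp = confusion_matrix(top_labels, top_preds, labels=[0, 1]).ravel()
print(f"precision@{top}: {tp / (tp + fp):.2f}, recall within top-{top}: {tp / (tp + fn):.2f}")
```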