-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
121 lines (98 loc) · 4.1 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import torch
import numpy as np
import time
from config import *
from scipy.sparse import *
class HCSPMM_dataset(torch.nn.Module):
    """
    Load a graph edge list and expose it as CSR arrays plus synthetic data.

    The constructor reads the edges from ``path`` (a comma-separated text
    file or a ``.npz`` archive), builds the CSR representation with SciPy,
    and then generates random node features, constant labels and prefix
    train/val/test masks. Features, labels, masks and ``degrees`` are moved
    to the GPU with ``.cuda()``, so a CUDA device is required.

    Attributes set by the constructor:
        nodes: set of node ids seen in the txt file (left empty for .npz).
        num_nodes, num_edges: graph size.
        num_features, num_classes: copies of ``dim`` and ``num_class``.
        edge_index: ``np.ndarray`` of shape (2, num_edges), rows = (src, dst).
        avg_degree, avg_edgeSpan: simple statistics over the edge list.
        column_index, row_pointers: CSR ``indices`` / ``indptr`` as IntTensor.
        degrees: sqrt of the per-row degree after it is mapped through the
            externally defined ``func``.
        x, y: node features (random normal) and labels (all ones).
        train_mask, val_mask, test_mask: boolean prefix masks.
    """

    def __init__(self, path, dim, num_class, load_from_txt=True, verbose=False):
        """
        Build the dataset from the graph file at ``path``.

        Args:
            path: graph file; txt edge list when ``load_from_txt`` is true,
                otherwise a ``.npz`` archive.
            dim: number of feature columns generated per node.
            num_class: stored as ``num_classes``.
            load_from_txt: choose the txt loader (True) or the npz loader.
            verbose: print loading timings and graph statistics.

        Raises:
            ValueError: the npz path does not end in ``.npz``, or the txt
                file is malformed or contains no edges.
        """
        super().__init__()

        self.nodes = set()

        self.load_from_txt = load_from_txt
        self.num_nodes = 0
        self.num_features = dim
        self.num_classes = num_class
        self.edge_index = None

        self.reorder_flag = False
        self.verbose_flag = verbose

        self.avg_degree = -1
        self.avg_edgeSpan = -1

        self.init_edges(path)
        self.init_embedding(dim)
        self.init_labels(num_class)

        # Fractions of nodes flagged in each mask. Every mask marks a prefix
        # of the node range, so the three masks overlap.
        train = 1
        val = 0.3
        test = 0.1

        self.train_mask = self._prefix_mask(self.num_nodes, train).cuda()
        self.val_mask = self._prefix_mask(self.num_nodes, val).cuda()
        self.test_mask = self._prefix_mask(self.num_nodes, test).cuda()

    @staticmethod
    def _prefix_mask(num_nodes, fraction):
        """Return a CPU bool tensor whose first int(num_nodes * fraction) entries are True."""
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[: int(num_nodes * fraction)] = True
        return mask

    @staticmethod
    def _read_txt_edges(path):
        """
        Parse a ``dst,src`` text edge list into 0-based (src_li, dst_li) lists.

        Ids in the file are 1-based and are shifted down by one. Blank lines
        are skipped; any other malformed line raises ValueError.
        """
        src_li = []
        dst_li = []
        # The context manager guarantees the handle is closed even when a
        # malformed line raises.
        with open(path, "r") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                # The first column is the destination, the second the source.
                dst, src = line.split(",")
                src_li.append(int(src) - 1)
                dst_li.append(int(dst) - 1)
        return src_li, dst_li

    @staticmethod
    def _load_npz_edges(path):
        """Read ``src_li``, ``dst_li`` and ``num_nodes`` (as a plain int) from a .npz archive."""
        with np.load(path) as graph_obj:
            src_li = graph_obj['src_li']
            dst_li = graph_obj['dst_li']
            # np.load hands back an array; downstream code needs a real int
            # for list repetition and tensor shapes.
            num_nodes = int(np.asarray(graph_obj['num_nodes']).item())
        return src_li, dst_li, num_nodes

    def init_edges(self, path):
        """
        Load the edge list from ``path`` and build the CSR arrays.

        Sets ``num_nodes``, ``num_edges``, ``edge_index``, ``avg_degree``,
        ``avg_edgeSpan``, ``column_index``, ``row_pointers`` and ``degrees``.
        Called from __init__.
        """
        # loading from a txt graph file
        if self.load_from_txt:
            start = time.perf_counter()
            src_li, dst_li = self._read_txt_edges(path)
            if not src_li:
                raise ValueError("graph file contains no edges: {}".format(path))

            self.nodes.update(src_li)
            self.nodes.update(dst_li)

            self.num_edges = len(src_li)
            # Node ids are assumed dense, so the largest id fixes the count.
            self.num_nodes = max(self.nodes) + 1
            self.edge_index = np.stack([src_li, dst_li])

            dur = time.perf_counter() - start
            if self.verbose_flag:
                print("# Loading (txt) {:.3f}s ".format(dur))

        # loading from a .npz graph file
        else:
            if not path.endswith('.npz'):
                raise ValueError("graph file must be a .npz file")

            start = time.perf_counter()
            src_li, dst_li, self.num_nodes = self._load_npz_edges(path)

            self.num_edges = len(src_li)
            self.edge_index = np.stack([src_li, dst_li])

            dur = time.perf_counter() - start
            if self.verbose_flag:
                print("# Loading (npz)(s): {:.3f}".format(dur))

        self.avg_degree = self.num_edges / self.num_nodes
        # Mean absolute difference between the endpoint ids of each edge.
        self.avg_edgeSpan = np.mean(np.abs(np.subtract(src_li, dst_li)))

        if self.verbose_flag:
            print('# nodes: {}'.format(self.num_nodes))
            print("# avg_degree: {:.2f}".format(self.avg_degree))
            print("# avg_edgeSpan: {}".format(int(self.avg_edgeSpan)))

        # Build graph CSR.
        val = [1] * self.num_edges
        start = time.perf_counter()
        scipy_coo = coo_matrix((val, self.edge_index), shape=(self.num_nodes, self.num_nodes))
        scipy_csr = scipy_coo.tocsr()
        build_csr = time.perf_counter() - start

        if self.verbose_flag:
            print("# Build CSR (s): {:.3f}".format(build_csr))

        self.column_index = torch.IntTensor(scipy_csr.indices)
        self.row_pointers = torch.IntTensor(scipy_csr.indptr)

        # Get degrees array.
        degrees = (self.row_pointers[1:] - self.row_pointers[:-1]).tolist()
        # NOTE(review): `func` is not defined in this file; it presumably
        # arrives through `from config import *` -- confirm what it does.
        self.degrees = torch.sqrt(torch.FloatTensor(list(map(func, degrees)))).cuda()

    def init_embedding(self, dim):
        '''
        Generate node embedding for nodes.

        Fills ``self.x`` with standard-normal values of shape
        (num_nodes, dim) on the GPU.
        Called from __init__.
        '''
        self.x = torch.randn(self.num_nodes, dim).cuda()

    def init_labels(self, num_class):
        '''
        Generate the node label.

        Every node gets label 1 (int64, on the GPU); ``num_class`` is not
        used here.
        Called from __init__.
        '''
        self.y = torch.ones(self.num_nodes).long().cuda()