# cluster.py
from __future__ import division
from utils.datasets import *
from utils.parse_config import parse_data_cfg
from utils.utils import *
import sys
import argparse
from torch.utils.data import DataLoader
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.cluster import AgglomerativeClustering
from matplotlib.pylab import show, cm, axis
from matplotlib.colors import LinearSegmentedColormap
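
# Fix the seeds so runs are reproducible (e.g. the spring layout's random initial positions)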
seed = 578912
random.seed(seed)
np.random.seed(seed)

def _get_adjacency_matrix(train_path, num_classes, co_occurance_out_file):
    if os.path.exists(co_occurance_out_file):
        objects_adjacency_matrix = np.genfromtxt(co_occurance_out_file, delimiter=',', dtype=float)
    else:
        dir_name = os.path.dirname(train_path)
        print("Extracting the co-occurrence matrix from the training dataset")
        objects_adjacency_matrix = np.zeros((num_classes, num_classes))
        # Map each image path in the training list to its label file path
        f = open(train_path, "r")
        train_paths = f.readlines()
        train_paths = [
            os.path.join(dir_name, path.rstrip().replace("./", "")
                         .replace("images", "labels").replace(".png", ".txt").replace(".jpg", ".txt"))
            for path in train_paths
        ]
        # Loop over the label file of each image
        for filename in train_paths:
            if filename.endswith(".txt"):
                # Collect the class ids of the objects inside the image
                objects = []
                with open(filename, "r") as a_file:
                    for line in a_file:
                        stripped_line = line.split(" ")
                        objects.append(int(stripped_line[0]))
                # Count every co-occurring pair of classes (symmetrically)
                for obj1 in objects:
                    for obj2 in objects:
                        objects_adjacency_matrix[obj1, obj2] += 1
                        objects_adjacency_matrix[obj2, obj1] += 1
        np.savetxt(co_occurance_out_file, objects_adjacency_matrix, delimiter=",", fmt='%d')
    return objects_adjacency_matrix
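
# Note: each label file is assumed to be in YOLO format, one object per line with the
# class id first, e.g. (hypothetical line): "23 0.77 0.49 0.34 0.46"
# (class x_center y_center width height, normalized); only the class id is used here.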

def _get_common_classes(adjacency_matrix):
    # Binarize the counts: a pair of classes is considered "connected"
    # if it co-occurs at least 100 times
    temp = np.copy(adjacency_matrix)
    temp[temp < 100] = 0
    temp[temp >= 100] = 1
    # A class is "common" if it is connected to at least
    # opt.common_classes_thres other classes (opt is the global CLI args)
    common_classes = []
    for i in range(len(temp)):
        if np.sum(temp[i]) >= opt.common_classes_thres:
            common_classes.append(i)
    return common_classes
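
# Example (hypothetical numbers): with --common_classes_thres 10, a class is marked
# common if it co-occurs at least 100 times with 10 or more of the other classes;
# a ubiquitous class such as 'person' is the typical candidate on COCO.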

def _remove_common_classes(objects_adjacency_matrix, common_classes, num_classes):
    # Build the mapping from the compacted class indices (after removing
    # the common classes) back to the original class indices
    classes_idx_dict = {}
    j = 0
    for i in range(objects_adjacency_matrix.shape[0]):
        if i not in common_classes:
            classes_idx_dict[j] = i
            j += 1
    # Remove the rows and columns of the common classes
    r_objects_matrix = [[objects_adjacency_matrix[i, j] for j in range(len(objects_adjacency_matrix)) if j not in common_classes]
                        for i in range(len(objects_adjacency_matrix)) if i not in common_classes]
    r_objects_matrix = np.asarray(r_objects_matrix)
    # Zero out the diagonal (self co-occurrence carries no pairing information)
    r_objects_matrix = [[r_objects_matrix[i, j] if i != j else 0 for i in range(len(r_objects_matrix))] for j in range(len(r_objects_matrix))]
    r_objects_matrix = np.asarray(r_objects_matrix)
    return classes_idx_dict, r_objects_matrix
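
# Example: if common_classes == [1], then classes_idx_dict == {0: 0, 1: 2, 2: 3, ...},
# so cluster members can later be translated back to their original class ids.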

def _get_probability_adjacency_matrix(adjacency_matrix):
    # Normalize each column by its sum (+1 to avoid division by zero),
    # i.e. prob[i, j] approximates P(class i | class j)
    prob = np.copy(adjacency_matrix)
    for i in range(len(adjacency_matrix)):
        for j in range(len(adjacency_matrix)):
            prob[i, j] = adjacency_matrix[i, j] / (np.sum(adjacency_matrix[:, j]) + 1)
    return prob
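
# Worked example (hypothetical counts): a column [8, 0, 1] sums to 9, so it is
# scaled by 1 / (9 + 1), yielding probabilities [0.8, 0.0, 0.1].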

def _cluster(positions, num_of_clusters):
    # Map the row index used by the clustering back to the node key
    classes_dict = {}
    for i, key in enumerate(positions.keys()):
        classes_dict[i] = key
    positions = [p for p in positions.values()]
    # Agglomerative (hierarchical) clustering on the 2-D node positions
    ag_clusters = AgglomerativeClustering(n_clusters=num_of_clusters).fit(positions)
    labels = ag_clusters.labels_
    # Group the node keys by their cluster label
    clusters = []
    for c in range(num_of_clusters):
        clusters.append([classes_dict[i] for i in range(len(labels)) if labels[i] == c])
    return clusters
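
# Note: AgglomerativeClustering defaults to Ward linkage with Euclidean distance,
# so classes that the spring layout placed close together end up in the same cluster.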

def _create_graph(adjacency_matrix, pruning_threshold, iterations):
    '''create_graph takes an adjacency matrix and the pruning threshold.
    It returns the positions of the nodes in the graph
    and the adjacency matrix in sparse format.
    '''
    G = nx.Graph()
    # Add the class indices as graph nodes
    for i in range(len(adjacency_matrix)):
        G.add_node(i)
    # Add a weighted edge between two nodes if the corresponding value
    # in the adjacency matrix is greater than the pruning threshold
    for i, row in enumerate(adjacency_matrix):
        for j, w in enumerate(row):
            if i == j:
                continue
            if adjacency_matrix[i, j] > pruning_threshold:
                G.add_edge(i, j, weight=w)
    # Convert the graph to a sparse matrix to use in plotting
    s_matrix = nx.to_scipy_sparse_matrix(G)
    # Spread the graph according to the edge weights to get the node positions
    n_positions = nx.spring_layout(G, iterations=iterations)
    return n_positions, s_matrix
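
# Note: spring_layout uses edge weights as attraction strengths, so strongly
# co-occurring classes are pulled closer together in the 2-D embedding.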

def _evaluate_clustering(valid_path, common_classes, clusters_list):
    dataset = LoadImagesAndLabels(valid_path, 416, 1, rect=False, single_cls=False, pad=0.5)
    dloader = DataLoader(dataset, batch_size=1, collate_fn=dataset.collate_fn)  # Batch size has to be 1 here
    neglected_objects = 0
    all_objects = 1  # initialized to 1 so the later division never hits zero
    for batch_i, (_, targets, _, _) in enumerate(tqdm(dloader, desc="Evaluating Clusters")):
        ts = targets[:, 1].tolist()
        extras = 0
        # Count how many objects in this image fall into each cluster
        cluster_cnt = np.zeros(len(clusters_list))
        for t in ts:
            if t in common_classes:
                extras += 1
                continue
            for i, cluster in enumerate(clusters_list):
                if t in cluster:
                    cluster_cnt[i] += 1
                    break
        # Objects outside the image's dominant cluster are counted as neglected
        dominant_clus = [np.argmax(cluster_cnt)]
        neglected_objects += np.sum(cluster_cnt) - np.sum(cluster_cnt[dominant_clus])
        all_objects += np.sum(cluster_cnt) + extras
    return neglected_objects, all_objects
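
# The returned pair gives the separability error: the fraction of objects that do
# not belong to the dominant cluster of their own image, i.e. the branch miss rate
# referred to in the evaluation step below.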

def _plot_graph(s_adjmatrix, positions, clusters, classes_ids, common_classes, classes_names, fig_name):
    fig = plt.figure()
    graph = nx.Graph(s_adjmatrix)
    # Map each node to its cluster id so nodes are colored per cluster
    cluster_map = {node: i for i, cluster in enumerate(clusters) for node in cluster}
    colors = [cluster_map[i] for i in range(len(graph.nodes()))]
    # Retrieve the node labels (class names), skipping the common classes
    labels = {}
    j = 0
    for i in range(len(classes_ids)):
        if i not in common_classes:
            labels[j] = classes_names[i]
            j += 1
    nx.draw_networkx_nodes(graph, pos=positions, node_size=700, node_color=colors, cmap=cm.Set3)
    nx.draw_networkx_labels(graph, pos=positions, labels=labels, font_size=12)
    plt.savefig(fig_name)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--common_classes_thres", type=int, default=55, help="Minimum number of frequently co-occurring classes for a class to be treated as common")
    parser.add_argument("--num_clusters", type=int, default=4, help="Number of object clusters")
    parser.add_argument("--data", type=str, default="data/coco2014.data", help="Path to data config file")
    parser.add_argument("--pruning_threshold", type=float, default=0.05, help="Pruning threshold for edge weights")
    # argparse turns the hyphen into an underscore, so this is read as opt.co_occurance_out
    parser.add_argument("--co-occurance_out", type=str, default="co-occurence_adjacency_matrix.csv", help="Path of the co-occurrence matrix CSV; reused if it already exists from a previous run")
    parser.add_argument('--eval', action='store_true', help='Evaluate the clustering')
    opt = parser.parse_args()

    data_config = parse_data_cfg(opt.data)
    train_path = data_config["train"]
    valid_path = data_config["valid"]
    num_classes = int(data_config["classes"])
    classes_names = load_classes(data_config['names'])
    num_clusters = opt.num_clusters
    dir_name = os.path.dirname(train_path)

    ##### Compute the frequency-based adjacency matrix #####
    adjacency_matrix = _get_adjacency_matrix(train_path=train_path,
                                             num_classes=num_classes,
                                             co_occurance_out_file=opt.co_occurance_out)
    chosen_classes = [0, 2, 3, 5, 7, 9, 10, 11, 12, 18, 17, 19, 26, 22, 41, 42, 43, 44,
                      45, 48, 57, 63, 62, 64, 65, 66, 68, 69, 71, 73]
    print("Chosen subset:", len(chosen_classes))
    # chosen_classes = np.arange(80)
    classes_names = np.asarray(classes_names)
    # Restrict the class names and the adjacency matrix to the chosen subset
    if len(chosen_classes) < num_classes:
        classes_names = classes_names[chosen_classes]
        adjacency_matrix = adjacency_matrix[chosen_classes, :]
        adjacency_matrix = adjacency_matrix[:, chosen_classes]
    # Get the common classes
    common_classes = _get_common_classes(adjacency_matrix)
    print("Number of common_classes = ", len(common_classes), classes_names[common_classes])
    classes_idx_dict, adjacency_matrix = _remove_common_classes(adjacency_matrix,
                                                                common_classes=common_classes,
                                                                num_classes=num_classes)
    # Convert the frequency-based adjacency matrix to a probability-based one
    prob_adjacency_matrix = _get_probability_adjacency_matrix(adjacency_matrix)
    # Convert the probability-based adjacency matrix to a graph
    n_positions, s_matrix = _create_graph(prob_adjacency_matrix, pruning_threshold=opt.pruning_threshold, iterations=200)
    # Cluster the classes based on their positions in the graph
    clusters = _cluster(n_positions, num_clusters)
    # Plot the graph with the clustered classes
    if not os.path.exists("output"):
        os.makedirs("output")
    if not os.path.exists("output/clustering"):
        os.makedirs("output/clustering")
    fig_name = "output/clustering/cluster_" + str(opt.num_clusters) + "_" + str(opt.common_classes_thres)
    _plot_graph(s_matrix, n_positions, clusters, chosen_classes, common_classes, classes_names, fig_name + ".png")
    ## Map the clusters back to the original class ids, add the common classes
    ## to every cluster, and save the clusters file (one comma-separated line per cluster)
    for idx, cluster in enumerate(clusters):
        clusters[idx] = [classes_idx_dict[i] for i in cluster]
        clusters[idx].extend(common_classes)
    with open(fig_name + ".data", 'w') as f:
        for cluster in clusters:
            for i, obj in enumerate(cluster):
                if i == 0:
                    f.write("%s" % (obj))
                else:
                    f.write(",%s" % (obj))
            f.write("\n")
    ## Evaluate the clustering by checking the branch miss rate
    if opt.eval:
        neglected, all_obj = _evaluate_clustering(valid_path, common_classes, clusters)
        print(f'Separability Error = {neglected / all_obj * 100}%')
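
# Example usage (assuming a YOLOv3-style data config and label layout, as in the
# utils modules imported above):
#   python cluster.py --data data/coco2014.data --num_clusters 4 \
#       --common_classes_thres 55 --eval
# This writes output/clustering/cluster_4_55.png and cluster_4_55.data, where each
# line of the .data file lists the class ids of one cluster.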