# Code base reused from https://github.com/benedekrozemberczki/graph2vec
import ast
import glob
import hashlib
import itertools
import json
import os
from copy import deepcopy
from itertools import combinations
from time import time

import networkx as nx
import pandas as pd
class IndexGenerator:
    """Hands out consecutive integer ids, starting from 1."""
    def __init__(self):
        self.idx = 1
    def next(self):
        self.idx += 1
        return self.idx - 1
def add_unique_word(node_label_map):
    # Mint a fresh label above every existing id (the +10 gap presumably keeps
    # it clear of ids the generator may still produce) and register it.
    next_value = max(int(x) for x in node_label_map.values()) + 10
    node_label_map[next_value] = next_value
    return next_value
# Reserved node labels (placeholders; reassigned in encode_neighborhoods).
RELABELED_ROOT_LABEL = -1
RELABELED_LABEL_TO_IGNORE = -1
LOGS_FILENAME = "logs"
node_label_map = {}
node_label_generator = IndexGenerator()
label_tags = {}
mutually_exclusive_pairs = {}
word_counts = {}  # label -> number of occurrences across all graphs
def get_hash(s):
    hash_object = hashlib.md5(s.encode())
    return hash_object.hexdigest()
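# Illustration (doctest-style): the MD5 hex digest of the input string.
#   >>> get_hash("a")
#   '0cc175b9c0f1b6a831c399e269772661'
# Currently only referenced by the commented-out compression variant in
# multiply_label below.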
def get_node_label(g, node):
    # The label at WL iteration 0, i.e. the relabeled raw node label.
    return g.nodes[node]['neighborhood_label'][0]
def multiply_label(num_times, label):
    # Emit `num_times` distinct copies of a label so that repeated features
    # carry proportionally more weight in the corpus.
    label_list = []
    prev_feature = label  # only used by the hashed variant below
    for it in range(int(num_times)):
        label_list.append(label + "_bis_" + str(it))
        '''
        compressed_feature = get_hash(prev_feature)
        label_list.append(compressed_feature)
        prev_feature = compressed_feature
        '''
    return label_list
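# Illustration (doctest-style):
#   >>> multiply_label(2, "x")
#   ['x_bis_0', 'x_bis_1']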
def update_word_counts(label, word_counts):
    word_counts[label] = word_counts.get(label, 0) + 1
def initial_relabel(g, node_label_attr_name='Label'):
    global node_label_map
    global node_label_generator
    global word_counts
    if isinstance(g, str):  # called with a file name: read, relabel, write back
        opfname = g + '.tmpg'
        g = nx.read_gexf(g)
    else:
        opfname = None
    # Node ids from the GEXF files are strings ('0' is the root) and are used
    # as-is throughout; no integer relabeling is applied.
    for node in g.nodes():
        g.nodes[node]['neighborhood_label'] = {}
    for node in g.nodes():
        try:
            node_id = g.nodes[node][node_label_attr_name]
        except KeyError:
            # No node label named by 'node_label_attr_name' is present.
            assert False, "node {} has no '{}' attribute".format(node, node_label_attr_name)
        if node_id not in node_label_map:
            node_label = node_label_generator.next()
            node_label_map[node_id] = node_label  # insert the new label into the label map
            g.nodes[node]['neighborhood_label'][0] = str(node_label)
        else:
            g.nodes[node]['neighborhood_label'][0] = str(node_label_map[node_id])
    for node in g.nodes():
        update_word_counts(g.nodes[node]['neighborhood_label'][0], word_counts)
    if opfname:
        nx.write_gexf(g, opfname)
    else:
        return g
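# After initial_relabel, every node carries neighborhood_label = {0: '<id>'},
# where '<id>' is a corpus-wide integer id shared by all nodes whose raw
# 'Label' attribute is identical.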
def wl_relabel(g, it, debug=False):
    global node_label_map
    global node_label_generator
    global word_counts
    if isinstance(g, str):  # called with a file name: read back the intermediate graph
        opfname = g + '.tmpg'
        g = nx.read_gexf(opfname)
        new_g = deepcopy(g)
        for n in g.nodes():
            # write_gexf serializes the dict attribute as a string; parse it back.
            new_g.nodes[n]['neighborhood_label'] = ast.literal_eval(g.nodes[n]['neighborhood_label'])
        g = new_g
    else:
        opfname = None
    prev_iter = it - 1
    for node in g.nodes():
        neighbors = list(nx.all_neighbors(g, node))
        neighborhood_label = sorted(g.nodes[nei]['neighborhood_label'][prev_iter] for nei in neighbors)
        new_node_features = [g.nodes[node]['neighborhood_label'][prev_iter]] + neighborhood_label
        new_node_features = "_".join(new_node_features)
        if new_node_features in node_label_map:
            next_label = node_label_map[new_node_features]
        else:
            next_label = str(node_label_generator.next())
            node_label_map[new_node_features] = next_label
        g.nodes[node]['neighborhood_label'][it] = next_label
        update_word_counts(next_label, word_counts)
    if opfname:
        nx.write_gexf(g, opfname)
    else:
        return g
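# Worked example (illustrative labels): a node labeled '3' at iteration 0 whose
# neighbors are labeled '1' and '5' gets the compound feature '3_1_5' (own
# label first, neighbor labels sorted) at iteration 1; the compound is mapped
# to a fresh integer id on first sight and reused afterwards, exactly as in
# Weisfeiler-Lehman relabeling.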
def add_label_tag(label, tag):
    global label_tags
    # A label must always map to the same legend tag.
    assert label not in label_tags or tag == label_tags[label]
    label_tags[label] = tag
    return label_tags[label]
def write_labels_to_file(label, legend_tag, fh_vocabulary, fh_vocabulary_legend):
print(label, file=fh_vocabulary)
label_legend = add_label_tag(label, legend_tag)
print(label_legend, file=fh_vocabulary_legend)
def dump_sg2vec_str(fname, max_h, g, vocabulary_params):
    global mutually_exclusive_pairs
    global RELABELED_ROOT_LABEL, RELABELED_LABEL_TO_IGNORE
    global word_counts
    global node_label_map
    opfname = fname + '.g2v' + str(max_h)
    fh_vocabulary = open(opfname, 'w')
    fh_vocabulary_legend = open(opfname + ".legend", 'w')
    # Add the labels constructed around each node.
    # NOTE: each tree appears twice in the input corpus, so a word counted at
    # most twice occurs in a single tree and is treated as unique below.
    has_non_unique_words = False
    unique_word = None
    for n, d in g.nodes(data=True):
        # Neighborhood labels.
        if vocabulary_params["neighborhoods"]:
            for i in d['neighborhood_label'].keys():  # TODO (this is redundant).
                if i not in vocabulary_params["wlk_sizes"]:
                    continue
                if vocabulary_params["remove_unique_words"] and word_counts[d['neighborhood_label'][i]] <= 2:
                    unique_word = d['neighborhood_label'][i]
                    continue
                if i == 0:  # individual nodes; skip the (shared) root label
                    if d['neighborhood_label'][i] != str(RELABELED_ROOT_LABEL):
                        write_labels_to_file(d['neighborhood_label'][i], "single_node", fh_vocabulary, fh_vocabulary_legend)
                        has_non_unique_words = True
                else:
                    write_labels_to_file(d['neighborhood_label'][i], "neighborhood_" + str(i), fh_vocabulary, fh_vocabulary_legend)
                    has_non_unique_words = True
        # Structural labels.
        if vocabulary_params["structure"]:
            for i in d['structure_label'].keys():
                # Single-node structure labels were deleted upstream.
                assert i != 0
                if vocabulary_params["remove_unique_words"] and word_counts[d['structure_label'][i]] <= 2:
                    unique_word = d['structure_label'][i]
                    continue
                write_labels_to_file(d['structure_label'][i], "structure_" + str(i), fh_vocabulary, fh_vocabulary_legend)
                has_non_unique_words = True
# Repeat labels for individual nodes
if vocabulary_params["individual_nodes"]:
if (str(d['neighborhood_label'][0]) != str(RELABELED_ROOT_LABEL) and
str(d['neighborhood_label'][0]) != str(RELABELED_LABEL_TO_IGNORE)):
if vocabulary_params["remove_unique_words"] and word_counts[d['neighborhood_label'][0]] <= 2:
unique_word = d['neighborhood_label'][0]
else:
has_non_unique_words = True
for label_copy in multiply_label(vocabulary_params["individual_nodes"], d['neighborhood_label'][0]):
write_labels_to_file(label_copy, "single_node_copy", fh_vocabulary, fh_vocabulary_legend)
        # Non-adjacent path-pair labels: exclude the root, the current node, and its ancestor.
if vocabulary_params["non_adjacent_pairs"]:
for label in d['nonadjacent_path_pair_labels']:
if vocabulary_params["remove_unique_words"] and word_counts[label] <= 2: # remove unique words
unique_word = label
continue
write_labels_to_file(label, "co_occurence", fh_vocabulary, fh_vocabulary_legend)
has_non_unique_words = True
for label_copy in multiply_label(vocabulary_params["non_adjacent_pairs"]-1, label):
write_labels_to_file(label_copy, "co_occurence_copy", fh_vocabulary, fh_vocabulary_legend)
        # Direct edge labels: if no neighborhoods are considered, then add all the direct edges. Else, add only
        # the edges between nodes with node degree > 1 (the others are covered by the neighborhoods).
if vocabulary_params["direct_edges"]:
for label in d['direct_edge_labels']:
if vocabulary_params["remove_unique_words"] and word_counts[label] <= 2: # remove unique words
unique_word = label
continue
if not(vocabulary_params["neighborhoods"] and 1 in vocabulary_params["wlk_sizes"] and g.degree(n) == 1):
write_labels_to_file(label, "co_occurence", fh_vocabulary, fh_vocabulary_legend)
has_non_unique_words = True
for label_copy in multiply_label(vocabulary_params["direct_edges"]-1, label):
write_labels_to_file(label_copy, "co_occurence_copy", fh_vocabulary, fh_vocabulary_legend)
        # Add root-child relations.
if vocabulary_params["root_child_relations"]:
for label in g.nodes['0']['root_child_relations']:
if vocabulary_params["remove_unique_words"] and word_counts[label] <= 2:
unique_word = label
continue
write_labels_to_file(label, "root_child_relations", fh_vocabulary, fh_vocabulary_legend)
has_non_unique_words = True
for label_copy in multiply_label(vocabulary_params["root_child_relations"]-1, label):
write_labels_to_file(label_copy, "root_child_relations_copy", fh_vocabulary, fh_vocabulary_legend)
# Add mutually exclusive pairs.
if vocabulary_params["mutually_exclusive_pairs"]:
for label in mutually_exclusive_pairs[g]:
if vocabulary_params["remove_unique_words"] and word_counts[label] <= 2: # remove unique words
unique_word = label
continue
write_labels_to_file(label, "mutually_exclusive", fh_vocabulary, fh_vocabulary_legend)
has_non_unique_words = True
for label_copy in multiply_label(vocabulary_params["mutually_exclusive_pairs"]-1, label):
write_labels_to_file(label_copy, "mutually_exclusive_copy", fh_vocabulary, fh_vocabulary_legend)
    label_legend_filename = os.path.join(os.path.dirname(fname), "label_legend.csv")
    pd.DataFrame.from_dict(data=label_tags, orient='index').to_csv(label_legend_filename, header=False)
    if vocabulary_params["remove_unique_words"] and not has_non_unique_words:
        # Every word in this tree was unique: emit one placeholder word so the
        # document is not empty.
        if not unique_word:
            unique_word = add_unique_word(node_label_map)
        write_labels_to_file(unique_word, "unique_word", fh_vocabulary, fh_vocabulary_legend)
    fh_vocabulary.close()
    fh_vocabulary_legend.close()
    if os.path.isfile(fname + '.tmpg'):
        os.remove(fname + '.tmpg')
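# dump_sg2vec_str writes one word per line to <fname>.g2v<max_h> and the
# matching tag per line to <fname>.g2v<max_h>.legend; each such file is
# presumably consumed downstream as one document per graph.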
def print_node_labels(g):
print(g.nodes())
print([g.nodes[node] for node in g.nodes()])
def encode_neighborhoods(graphs, node_label_attr_name, max_h, root_label=0, label_to_ignore=None):
global RELABELED_ROOT_LABEL, RELABELED_LABEL_TO_IGNORE
t0 = time()
graphs = [initial_relabel(g,node_label_attr_name) for g in graphs]
print('initial relabeling done in {} sec'.format(round(time() - t0, 2)))
RELABELED_ROOT_LABEL = int(node_label_map[root_label])
if label_to_ignore:
RELABELED_LABEL_TO_IGNORE = int(node_label_map[label_to_ignore])
    for it in range(1, max_h + 1):  # WL iterations 1..max_h
        print("##### iteration", it, "#####")
        t0 = time()
        for g in graphs:
            wl_relabel(g, it)  # mutates g in place in the in-memory mode
        print('WL iteration {} done in {} sec.'.format(it, round(time() - t0, 2)))
return graphs
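# After encode_neighborhoods, g.nodes[n]['neighborhood_label'] maps each WL
# iteration i in 0..max_h to the node's string label at that iteration.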
def wlk_relabel_and_dump_memory_version(fnames, max_h, vocabulary_params, node_label_attr_name='Label'):
global node_label_generator
global mutually_exclusive_pairs
global RELABELED_LABEL_TO_IGNORE
global word_counts
t0 = time()
graphs = [nx.read_gexf(fname) for fname in fnames]
print('loaded all graphs in {} sec'.format(round(time() - t0, 2)))
# Encode neighborhoods.
print("Encoding neighborhoods....")
graphs = encode_neighborhoods(graphs, node_label_attr_name, max_h,
root_label=vocabulary_params["root_label"], label_to_ignore=vocabulary_params["ignore_label"])
    # Encode the graph structures. This is equivalent to encoding the neighborhoods
    # of unlabeled graphs (or graphs with the same label on all nodes); the root is labeled differently.
print("Encoding tree structures....")
non_labeled_graphs = deepcopy(graphs)
root_relabel = node_label_generator.next()
regular_node_relabel = node_label_generator.next()
for g in non_labeled_graphs:
for n in g.nodes():
if str(n) == "0": # root
g.nodes[n][node_label_attr_name] = root_relabel
else:
g.nodes[n][node_label_attr_name] = regular_node_relabel
non_labeled_graphs = encode_neighborhoods(non_labeled_graphs, node_label_attr_name, max_h)
# Add the new labels as structure labels in the old trees.
for g1, g2 in zip(graphs, non_labeled_graphs):
for n1, n2 in zip(g1.nodes(), g2.nodes()):
g1.nodes[n1]['structure_label'] = g2.nodes[n2]['neighborhood_label']
# Delete the structure encoding for single nodes.
del g1.nodes[n1]['structure_label'][0]
    # Encode root-child node relations.
    print("Encoding root-child relations....")
    for g in graphs:
        g.nodes['0']['root_child_relations'] = []
        root_label = get_node_label(g, "0")
        for node in nx.all_neighbors(g, '0'):
            child_label = get_node_label(g, node)
            if str(child_label) == str(RELABELED_LABEL_TO_IGNORE):
                continue
            feature = "_".join([root_label, child_label, "root-child"])
            g.nodes['0']['root_child_relations'].append(feature)
            update_word_counts(feature, word_counts)
    # Encode pairwise relations on the same path.
    print("Encoding co-occurrence....")
    for g in graphs:
        for node in g.nodes():
            g.nodes[node]['nonadjacent_path_pair_labels'] = []
            g.nodes[node]['direct_edge_labels'] = []
            if str(node) == "0":  # root
                continue
            node_label = get_node_label(g, node)
            if str(node_label) == str(RELABELED_LABEL_TO_IGNORE):
                continue
            path_to_root = list(nx.all_simple_paths(g, source='0', target=node))
            assert len(path_to_root) == 1  # in a tree there is exactly one root-to-node path
            path_to_root = path_to_root[0]
            assert path_to_root[-1] == node
            # Add reachable non-neighbouring pairs, excluding the root and direct edges.
            for reachable_node in path_to_root[1:-2]:  # exclude the root, the current node, and its ancestor; the ancestor is added through the direct edges.
                reachable_node_label = get_node_label(g, reachable_node)
                if str(reachable_node_label) == str(RELABELED_LABEL_TO_IGNORE):
                    continue
                feature = "_".join([reachable_node_label, node_label, "co-occurence"])
                g.nodes[node]['nonadjacent_path_pair_labels'].append(feature)
                update_word_counts(feature, word_counts)
            # Add the direct (ancestor, node) edge, excluding root-child edges; whether
            # it is finally written depends on the vocabulary settings (see dump_sg2vec_str).
            if len(path_to_root) < 2:
                continue
            ancestor = path_to_root[-2]
            if str(ancestor) == "0":  # root
                continue
            ancestor_label = get_node_label(g, ancestor)
            if str(ancestor_label) == str(RELABELED_LABEL_TO_IGNORE):
                continue
            feature = "_".join([ancestor_label, node_label, "co-occurence"])
            g.nodes[node]['direct_edge_labels'].append(feature)
            update_word_counts(feature, word_counts)
    longest_path = 0
    # Encode pairwise relations on different paths.
    print("Encoding mutual exclusivity....")
    for g in graphs:
        features = set()
        mutually_exclusive_pairs[g] = []
        # Leaves are degree-1 nodes other than the root (node ids are strings).
        leaves = (v for v, d in g.degree() if d == 1 and v != '0')
        list_all_paths = [path for leaf in leaves for path in nx.all_simple_paths(g, '0', leaf)]
        if list_all_paths:
            longest_path = max(longest_path, max(len(path) for path in list_all_paths))
        # For each pair of root-leaf paths, extract the disjoint branches.
        for path_a, path_b in combinations(list_all_paths, 2):
            branch_a = list(set(path_a) - set(path_b))
            branch_b = list(set(path_b) - set(path_a))
            for pair in itertools.product(branch_a, branch_b):
                if str(get_node_label(g, pair[0])) == str(RELABELED_LABEL_TO_IGNORE) or str(get_node_label(g, pair[1])) == str(RELABELED_LABEL_TO_IGNORE):
                    continue
                # Add the pair (pair[0], pair[1]) to the vocabulary. The order doesn't matter, so sort by node label.
                feature_1 = get_node_label(g, pair[0])
                feature_2 = get_node_label(g, pair[1])
                feature = "_".join(sorted([feature_1, feature_2]) + ["mutually_exclusive"])
                features.add(feature)
        for feature in features:
            mutually_exclusive_pairs[g].append(feature)
            update_word_counts(feature, word_counts)
t0 = time()
# Write subgraph labels to file.
for fname, g in zip(fnames, graphs):
dump_sg2vec_str(fname, max_h, g, vocabulary_params)
print('dumped sg2vec sentences in {} sec.'.format(round(time() - t0, 2)))
dir_path = os.path.dirname(fnames[0])
with open(os.path.join(dir_path, "label_tags.json"), 'w') as convert_file:
convert_file.write(json.dumps(label_tags))
    # Estimate the vocabulary size from the last processed graph: every node
    # attribute other than 'Label' holds a collection of words.
    vocabulary_size = 0
    for n, d in g.nodes(data=True):
        for key in d:
            if key.lower() != "label":
                vocabulary_size += len(d[key])
    vocabulary_size = vocabulary_size // 2  # the number of trees is doubled
    with open(os.path.join(dir_path, LOGS_FILENAME), 'w') as logs_file:
        logs_file.write("Vocabulary parameters: " + json.dumps(vocabulary_params))
        logs_file.write("\n\n")
        logs_file.write("Number of trees: " + str(len(fnames) // 2))  # the number of trees is doubled
        logs_file.write("\n")
        logs_file.write("Longest root-leaf path: " + str(longest_path) + ".")
        logs_file.write("\n")
        logs_file.write("Vocabulary size: " + str(vocabulary_size) + " words.")
        logs_file.write("\n")
        emb_size = 32
        if 1000 < vocabulary_size <= 10000:
            emb_size = 64
        elif vocabulary_size > 10000:
            emb_size = 128
        logs_file.write("Recommended embedding size: " + str(emb_size) + ".")
def main():
    ip_folder = '../data/kdd_datasets/ptc'
    max_h = 3
    # Illustrative defaults for the required vocabulary configuration; every
    # key below is read by the pipeline. Tune per experiment.
    vocabulary_params = {
        "neighborhoods": True, "wlk_sizes": list(range(max_h + 1)),
        "structure": False, "individual_nodes": 0, "non_adjacent_pairs": 0,
        "direct_edges": 0, "root_child_relations": 0,
        "mutually_exclusive_pairs": 0, "remove_unique_words": False,
        # Assumes the root's 'Label' value is 0, matching encode_neighborhoods' default.
        "root_label": 0, "ignore_label": None,
    }
    all_files = sorted(glob.glob(os.path.join(ip_folder, '*gexf')))
    print('loaded {} files in total'.format(len(all_files)))
    wlk_relabel_and_dump_memory_version(all_files, max_h, vocabulary_params)
if __name__ == '__main__':
main()
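# Usage sketch (paths illustrative): put the GEXF trees in
# ../data/kdd_datasets/ptc and run `python make_graph2vec_corpus.py`; each
# <name>.gexf then gets a <name>.g2v3 corpus file and a matching .legend
# written next to it.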