import os
import json
import numpy as np
import matplotlib.pyplot as plt
import initiate.config as config
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from subprocess import call
from collections import Counter
from visualization.plot_heatmap import plot_correction_heatmap
#from visualization.plot_driver_number_effect import random_choose_drivers, generate_drive_accuracy_matrix, plot_driver_number_effect
import seaborn as sns
'''
Tasks:
1. Create correction heatmap function (done)
2. Create graph function for the driver-number case study:
   x: number of drivers
   y: average accuracy
'''
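# Pipeline overview: load per-driver JSON data, build one feature vector per
# driver-day (link-level distances plus daily travel times), train a Random
# Forest classifier to identify the driver, and plot accuracy / correction figures.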
def random_choose_drivers(Data_dict, driver_num):
    '''
    Generate data by randomly selecting a certain number of drivers.
    :param Data_dict: the dict of driver data, keyed by driver id
    :param driver_num: the number of drivers to randomly select
    :return: dict with the selected drivers' data
    '''
    selected_driver_list = random.sample(list(Data_dict.keys()), driver_num)
out_Data_dict = {}
for s in selected_driver_list:
out_Data_dict[s] = Data_dict[s]
return out_Data_dict
def generate_drive_accuracy_matrix(dict_org, num_of_exp_per_driverNum=100):
    '''
    Generate drive_accuracy_matrix by repeatedly running the model with different numbers of drivers.
    :param dict_org: the original dict of driver data, keyed by driver id
    :param num_of_exp_per_driverNum: the number of experiments per driver number
    :return: drive_acc_mat with the data needed for plotting
    '''
drive_acc_mat = np.array([])
for i in range(2, 12):
pcorr_list = RandomForest(
dict_org, experiment_num=num_of_exp_per_driverNum, random_select_driver=True, driver_num=i)
local_drive_acc_mat = np.hstack(
(np.ones((num_of_exp_per_driverNum, 1))*i, np.array(pcorr_list).reshape(-1, 1)))
#print('local_drive_acc_mat.shape', local_drive_acc_mat.shape)
if drive_acc_mat.shape[0] == 0:
drive_acc_mat = np.array(local_drive_acc_mat)
else:
drive_acc_mat = np.vstack((drive_acc_mat, local_drive_acc_mat))
return drive_acc_mat
def plot_driver_number_effect(drive_acc_mat):
    '''
    Plot the effect of changing the number of drivers on average accuracy.
    :param drive_acc_mat: matrix whose rows are (driver number, accuracy) pairs
    '''
sns.set_context('talk')
plt.figure(figsize=(10, 10))
ax = sns.lineplot(x=drive_acc_mat[:, 0], y=drive_acc_mat[:, 1])
plt.xticks(range(2, 12))
    plt.xlabel('Number of Classes (Drivers)')
    plt.ylabel('Accuracy')
    plt.title(
        'Accuracy with an increasing number of drivers (100 experiments each)', fontweight='bold')
save_figure_path = os.path.join(
config.figures_folder, config.driver_number_casestudy_folder)
if not os.path.exists(save_figure_path):
os.mkdir(save_figure_path)
plt.savefig(os.path.join(save_figure_path, "casestudy" + ".png"))
plt.close()
def load_json_file(folder, file_name):
    file_path = os.path.join(folder, file_name)
    with open(file_path, "r") as json_file:
        data_dict = json.load(json_file)
    return data_dict
def driving_date_figure(driving_time_dict):
"""
Show How many days each driver drives their vehicle. Used to
determine train/test data ratio
:return:
"""
driver_num = len(driving_time_dict.keys())
driver_idx = 0
driving_date_matrix = None
for driver_id in driving_time_dict.keys():
local_driving_dict = driving_time_dict[driver_id]
for time_string in local_driving_dict.keys():
if driving_date_matrix is None:
driving_date_matrix = np.zeros((driver_num, 30))
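            # Assumes time_string ends with the day of month (e.g. "YYYY-MM-DD")
            # and that all dates fall within a single 30-day window.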
date_idx = int(time_string.split("-")[-1]) - 1
driving_date_matrix[driver_idx, date_idx] = 1
driver_idx += 1
# plt.figure(figsize=[6, 7])
plt.imshow(driving_date_matrix, cmap="gray_r")
plt.colorbar()
plt.title("Driving date display")
plt.xlabel("Day in a month")
plt.ylabel("Driver sequence")
save_figure_path = os.path.join(
config.figures_folder, config.Learn2classify_test_folder)
if not os.path.exists(save_figure_path):
os.mkdir(save_figure_path)
plt.savefig(os.path.join(save_figure_path, "driving date test" + ".png"))
plt.close()
def data_process_for_classify(link_level_dict, driving_time_dict):
"""
Construct each driver's daily driving data as a vector for SVD classify
:param link_level_dict:
:return:
"""
# construct total_link_set
total_link_set = []
for driver_id in link_level_dict.keys():
local_driving_dict = link_level_dict[driver_id]
for time_string in local_driving_dict.keys():
local_driver_link_dict = local_driving_dict[time_string]
for link_string in local_driver_link_dict.keys():
local_link = link_string
if local_link not in total_link_set:
total_link_set.append(link_string)
# construct rd_dict
RD_dict = {}
for driver_id in link_level_dict.keys():
local_driving_dict = link_level_dict[driver_id]
if driver_id not in RD_dict.keys():
RD_dict[driver_id] = {}
for date in local_driving_dict.keys():
local_driver_link_dict = local_driving_dict[date]
if date not in RD_dict[driver_id].keys():
RD_dict[driver_id][date] = []
for link_set_string in total_link_set:
if link_set_string in local_driver_link_dict.keys():
distance = local_driver_link_dict[link_set_string]['distance']
RD_dict[driver_id][date].append(distance)
else:
RD_dict[driver_id][date].append(0.0)
for driver_id in driving_time_dict.keys():
if driver_id not in RD_dict.keys():
            print(
                "Error! driver id not in RD_dict: driving_time_dict does not match link_level_dict!")
            exit()
local_driving_dict = driving_time_dict[driver_id]
svd_driver_dict = RD_dict[driver_id]
for date in local_driving_dict.keys():
local_driving_time_list = local_driving_dict[date]
            # some drivers may have driving time but no link info
            # ************* this issue needs further discussion *******************
if date not in svd_driver_dict.keys():
# print(date)
# print(driver_id)
# print("Error! driver date not in RD_dict. driving_time_dict not match with link_level_dict!")
# exit()
zero_distance_in_all_link_set = [0.0] * len(total_link_set)
RD_dict[driver_id][date] = zero_distance_in_all_link_set
for travel_time in local_driving_time_list:
RD_dict[driver_id][date].append(travel_time)
return RD_dict
def RandomForest(RD_dict_org, experiment_num=100, plot_correction_flag=True, random_select_driver=False, driver_num=None, plot_heatmap=False):
"""
:param RD_dict_org:
:param experiment_num:
:param plot_correction_flag:
:return:
"""
    if not random_select_driver:
# Construct driver_id--label dict and driver_num
driver_num = len(RD_dict_org.keys())
driver_id2label_dict = {}
count = 0
for driver_id in RD_dict_org.keys():
driver_id2label_dict[driver_id] = count
count += 1
# Construct SVD matrix
data_list = []
label_list = []
for driver_id in RD_dict_org.keys():
local_driving_dict = RD_dict_org[driver_id]
for date in local_driving_dict.keys():
sample_driving_data = local_driving_dict[date]
data_list.append(sample_driving_data)
label_list.append(driver_id2label_dict[driver_id])
        # keep this name consistent with the SVD classifier
svd_matrix = (np.array(data_list)).transpose()
RD_dict = RD_dict_org
else:
pass
'''
PCA: reduce dimension
Sigma = V D V^t
D: (…, M) array, The eigenvalues, each repeated according to its multiplicity
V: (…, M, M) array, The normalized (unit “length”) eigenvectors
'''
#Sigma = np.cov(svd_matrix.T, rowvar=False)
#D, V = np.linalg.eig(Sigma)
#print('D:', D.shape)
#print('V:', V.shape)
#sorted_D = np.sort(D)
#plot_list = sorted_D[::-1][100:200]
#print('Non zero Eigen value:', sum([1 for v in sorted_D if v > 0.1]))
#x = [i for i in range(0, len(plot_list))]
#plt.scatter(x, plot_list, color='k', s=10)
#plt.ylabel('Eig Value')
# plt.xlabel('k')
# save_figure_path = os.path.join(
# config.figures_folder, config.Learn2classify_test_folder)
# if not os.path.exists(save_figure_path):
# os.mkdir(save_figure_path)
# plt.savefig(os.path.join(save_figure_path,
# "Random Forest PCA" + ".png"))
# plt.close()
################################################
# Transforming to PCA space
################################################
#k = 200
#projecting_v = V[:, np.argsort(D)[::-1][:k]]
#print('projecting_v.shape:', projecting_v.shape)
#svd_matrix = svd_matrix.T.dot(projecting_v).T
################################################
# Random Forest
pcorr_list = []
correction_table = np.zeros((driver_num, experiment_num))
correction = np.zeros((driver_num, experiment_num))
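    # correction_table[d, i] is 1 if driver d's test sample was classified
    # correctly in experiment i; correction[d, i] stores the predicted label.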
for iteration in range(experiment_num):
        if random_select_driver:
RD_dict = random_choose_drivers(RD_dict_org, driver_num)
print(RD_dict.keys())
# Construct driver_id--label dict and driver_num
driver_num = len(RD_dict.keys())
driver_id2label_dict = {}
count = 0
for driver_id in RD_dict.keys():
driver_id2label_dict[driver_id] = count
count += 1
# Construct SVD matrix
data_list = []
label_list = []
for driver_id in RD_dict.keys():
local_driving_dict = RD_dict[driver_id]
for date in local_driving_dict.keys():
sample_driving_data = local_driving_dict[date]
data_list.append(sample_driving_data)
label_list.append(driver_id2label_dict[driver_id])
            # keep this name consistent with the SVD classifier
svd_matrix = (np.array(data_list)).transpose()
        # divide svd_matrix into a train matrix and a test matrix
        # (this is repeated experiment_num times)
svd_matrix_train, svd_matrix_test, label_train, label_test = \
divide_data_2_train_test(RD_dict, svd_matrix, label_list)
# Learn Random Forest
classifier = learn_RandomForest(
RD_dict, svd_matrix_train.transpose(), label_train)
# Classify test set
predicted_label = predict_RandomForest(
classifier, svd_matrix_test.transpose())
# Calculate correct percentage
pcorr = pcorrect(predicted_label, label_test)
pcorr_list.append(pcorr)
# construct correction table
tmp = []
correction_tmp = []
        for i in range(len(predicted_label)):
            tmp.append(1 if predicted_label[i] == label_test[i] else 0)
            correction_tmp.append(predicted_label[i])
correction_table[:, iteration] = tmp
correction[:, iteration] = correction_tmp
    # export a tree image for each of the first 10 trees in the (last) Random Forest
for tree_id in range(10):
estimator = classifier.estimators_[tree_id]
# Export the last classifier as dot file
export_graphviz(estimator, out_file='tree/tree'+str(tree_id)+'.dot',
rounded=True, proportion=False,
precision=3, filled=True)
        # Exporting to .png may take a long time
# Convert to png using system command (requires Graphviz)
# call(['dot', '-Tpng', 'tree/tree'+str(tree_id)+'.dot', '-o',
# 'tree/tree'+str(tree_id)+'.png', '-Gdpi=300'])
    # save the correction table as a .csv file
    save_correction_csv(correction)
# plot correction table
    if plot_correction_flag:
plot_correction_table(correction_table)
plot_each_driver_accuracy(RD_dict, correction_table)
# plot accuracy figure
mean_accuracy = plot_accuracy_figure(pcorr_list, experiment_num)
if plot_heatmap:
# plot heatmap
plot_correction_heatmap(RD_dict, correction, mean_accuracy)
return pcorr_list
def plot_each_driver_accuracy(RD_dict, correction_table):
plt.figure(figsize=[14, 4])
X = range(len(RD_dict.keys()))
Xticks = RD_dict.keys()
y = np.mean(correction_table, axis=1)
plt.plot(X, y, 'k-')
plt.xticks(ticks=X, labels=Xticks)
plt.title("Driver with accuracy")
plt.xlabel("Driver ID")
plt.ylabel("mean accuracy")
for i, n in enumerate([len(RD_dict[k].keys()) for k in RD_dict.keys()]):
plt.annotate(n, (X[i], y[i]))
save_figure_path = os.path.join(
config.figures_folder, config.Learn2classify_test_folder)
if not os.path.exists(save_figure_path):
os.mkdir(save_figure_path)
plt.savefig(os.path.join(save_figure_path,
"Random Forest Driver Accuracy" + ".png"))
plt.close()
def save_correction_csv(correction):
    np.savetxt("correction table RD.csv", correction, delimiter=",",
               fmt='%10.5f')
def plot_accuracy_figure(pcorr_list, experiment_num):
mean_accuracy = [sum(pcorr_list) / len(pcorr_list)] * \
(len(pcorr_list) + 10)
print("mean accuracy", mean_accuracy[0])
x = [i for i in np.arange(0, experiment_num, 1)]
x1 = [i for i in np.arange(-5, experiment_num+5, 1)]
f1 = plt.plot(x, pcorr_list, color='k')
f2 = plt.plot(x1, mean_accuracy, 'b--')
    plt.legend(f2, ["mean accuracy"])
plt.title("Accuracy in {} times experiment, # of test data: {}".format(
experiment_num, len(pcorr_list)))
# plt.grid(linewidth=0.3)
plt.xlabel("experiment idx")
plt.ylabel("accuracy")
# plt.show()
save_figure_path = os.path.join(
config.figures_folder, config.Learn2classify_test_folder)
if not os.path.exists(save_figure_path):
os.mkdir(save_figure_path)
plt.savefig(os.path.join(save_figure_path,
"Random Forest 11 test data" + ".png"))
plt.close()
return np.mean(mean_accuracy)
def plot_correction_table(correction_table):
plt.figure(figsize=[14, 4])
plt.imshow(correction_table, cmap='gray', aspect='auto')
    plt.colorbar(ticks=range(2), label="classified correctly or not")
    plt.title("1: correct   0: incorrect")
plt.xlabel("Experiment idx")
plt.ylabel("Driver idx")
save_figure_path = os.path.join(
config.figures_folder, config.Learn2classify_test_folder)
if not os.path.exists(save_figure_path):
os.mkdir(save_figure_path)
plt.savefig(os.path.join(save_figure_path,
"Random Forest correction table" + ".png"))
plt.close()
def divide_data_2_train_test(RD_dict, svd_matrix, label_list):
    '''
    Pick one sample at random from each driver's data as test data;
    the remaining samples form the training set.
    '''
    # split svd_matrix into a train matrix and a test matrix
driver_num = len(RD_dict.keys())
test_idx = [] # store the index of test data
svd_matrix_test = None
for i in range(driver_num):
data_idx = [j for j, x in enumerate(label_list) if x == i]
start_idx = data_idx[0]
end_idx = data_idx[-1]
idx_list = np.arange(start_idx, end_idx + 1, 1)
# randomly choose one data as test sample
chosen_test_idx = random.choice(idx_list)
test_idx.append(chosen_test_idx)
dim = svd_matrix[:, 0].shape[0]
local_data = np.reshape(svd_matrix[:, chosen_test_idx], (dim, 1))
if svd_matrix_test is None:
svd_matrix_test = local_data
else:
svd_matrix_test = np.hstack((svd_matrix_test, local_data))
svd_matrix_train = np.delete(svd_matrix, test_idx, 1)
label_train = label_list[:]
label_test = []
for x in test_idx[::-1]:
label_test.append(label_list[x])
label_train = label_train[:x] + label_train[x + 1:]
label_test = label_test[::-1]
return svd_matrix_train, svd_matrix_test, label_train, label_test
def learn_RandomForest(RD_dict, svd_matrix, label_list):
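    # Hyperparameters below (n_estimators=35, max_depth=21) appear to be
    # hand-tuned for this dataset; adjust them if the data changes.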
classifier = RandomForestClassifier(n_estimators=35,
criterion='gini',
max_depth=21,
min_samples_split=2,
random_state=0)
classifier.fit(svd_matrix, label_list)
return classifier
def predict_RandomForest(classifier, test_matrix):
predicted_label = classifier.predict(test_matrix)
return predicted_label
def pcorrect(predicted_label, test_label):
    count = 0
    for i in range(len(predicted_label)):
        if predicted_label[i] == test_label[i]:
            count += 1
    percent = count / len(predicted_label)
return percent
def RDclassify(folder, plot_for_driver_num=False):
# load the travel distance and frequency data
# travel_distance_frequency_dict = load_json_file(folder, config.grid_travel_info)
# print("Now plot the travel distance and frequency figures...")
# # generate_grid_travel_figures(travel_distance_frequency_dict)
#
# # load the driving time data
all_driving_time_dict = load_json_file(folder, config.driving_time_info)
# print(driving_time_dict['10125'].keys())
# load the link-level data
all_link_level_dict = load_json_file(folder, config.link_level)
# print(link_level_dict['10125'].keys())
driving_time_dict = {}
link_level_dict = {}
    # Presumably the number of driving days per driver; only drivers with more than 10 days are kept below:
    # Counter({6: 24, 10: 24, 9: 20, 8: 19, 2: 16, 4: 16, 3: 14, 0: 12, 1: 10, 7: 9, 5: 8})
for key in all_driving_time_dict.keys():
if len(all_driving_time_dict[key].keys()) > 10:
driving_time_dict[key] = all_driving_time_dict[key]
link_level_dict[key] = all_link_level_dict[key]
# generate driving date figure
# driving_date_figure(driving_time_dict)
    if not plot_for_driver_num:
RD_dict = data_process_for_classify(
link_level_dict, driving_time_dict)
        pcorr_list = RandomForest(
            RD_dict, experiment_num=1000, plot_heatmap=True)
else:
RD_dict_org = data_process_for_classify(
all_link_level_dict, all_driving_time_dict)
drive_acc_mat = generate_drive_accuracy_matrix(
RD_dict_org, num_of_exp_per_driverNum=100)
plot_driver_number_effect(drive_acc_mat)
if __name__ == '__main__':
RDclassify("ann_arbor", plot_for_driver_num=True)