-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
85 lines (64 loc) · 2.67 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
from copy import deepcopy
import fastdup
from sklearn.metrics.pairwise import cosine_similarity
from definitions import *
from utils import *
from config import *
from process_cc_duplicates import do_process_cc_duplicates
from process_similarity_threshold import do_process_similarity_threshold
from process_outliers import do_process_outliers
from export_dataset_splits import do_save_dataset_splits
# Load precomputed fastdup feature embeddings for every Fitzpatrick17k image.
file_list, feats = fastdup.load_binary_feature(filename=FASTDUP_FEATURES, d=FEAT_DIM)
assert len(file_list) == TOTAL_FITZPATRICK17K_IMAGES
assert feats.shape == (TOTAL_FITZPATRICK17K_IMAGES, FEAT_DIM)
# Convert the list of file paths to a pandas DataFrame.
file_list_df = filelist_to_df(file_list)
# Dictionary mapping filename (without the .jpg) to absolute filepath.
# This is useful for looking up the filepath of a given filename.
# https://stackoverflow.com/a/17426500
filename_to_path_dict = dict(zip(file_list_df.filename, file_list_df.filepath))
file_list_filtered = deepcopy(file_list)
# Pairwise cosine similarity between all embeddings (N x N, symmetric matrix).
# Fixed typo: variable was previously named "pairswise_similarities".
pairwise_similarities = cosine_similarity(feats)
assert check_symmetric(pairwise_similarities)
# For each image, find its maximum similarity to any OTHER image: overwrite the
# diagonal (self-similarity) with -1 before taking the column-wise max so an
# image never matches itself.
pairwise_similarities_nondiag = deepcopy(pairwise_similarities)
np.fill_diagonal(pairwise_similarities_nondiag, -1)
max_similarities = np.max(pairwise_similarities_nondiag, axis=0)
assert max_similarities.shape == (TOTAL_FITZPATRICK17K_IMAGES,)
# Optionally drop images whose nearest-neighbor similarity exceeds a threshold.
if USE_SIMILARITY_THRESHOLD:
    file_list_filtered = do_process_similarity_threshold(
        max_similarities, file_list_filtered
    )
# Collect filenames flagged for removal by each enabled cleaning stage.
filenames_to_be_removed = []
if REMOVE_CC_DUPLICATES_FASTDUP_CLEANLAB:
    filenames_to_be_removed.extend(
        do_process_cc_duplicates(path_map=filename_to_path_dict)
    )
if REMOVE_OUTLIERS:
    filenames_to_be_removed.extend(do_process_outliers())
# Build a set so the membership test in the filtering comprehension below is
# O(1) per image instead of O(m) against a list (was accidentally O(n*m));
# duplicate entries in the removal list collapse for free.
filepaths_to_be_removed = {
    filename_to_path_dict[filename] for filename in filenames_to_be_removed
}
file_list_filtered = [
    filepath
    for filepath in file_list_filtered
    if filepath not in filepaths_to_be_removed
]
# NOTE(review): the disease list is snapshotted BEFORE the FST filter below,
# so the "unique diseases" count printed later does not reflect FST-based
# removal — confirm this is intentional.
file_list_filtered_diseases = [
    get_disease_name(filepath) for filepath in file_list_filtered
]
if REMOVE_MISSING_FST:
    # Drop images whose Fitzpatrick skin type label is "0" (missing annotation
    # — presumed; verify against the label scheme in utils/config).
    file_list_filtered = [
        filepath for filepath in file_list_filtered if get_fst_label(filepath) != "0"
    ]
# Report the size of the cleaned dataset and optionally persist the splits.
n_images = len(file_list_filtered)
n_diseases = len(set(file_list_filtered_diseases))
print(
    f"Number of images after removing duplicates and outliers: {n_images}"
)
print(
    f"Number of unique diseases after removing duplicates and outliers: {n_diseases}"
)
if SAVE_FILTERED_FILE_LISTS:
    do_save_dataset_splits(file_list_filtered)
    print("Saved filtered file lists.")