diff --git a/MixMatch_OOD_main.py b/MixMatch_OOD_main.py new file mode 100644 index 0000000..d7af06b --- /dev/null +++ b/MixMatch_OOD_main.py @@ -0,0 +1,684 @@ +from fastai.vision import * +from fastai.callbacks import CSVLogger +from numbers import Integral +import torch +import logging +import sys +from torchvision.utils import save_image +import numpy as np +#from utilities.InBreastDataset import InBreastDataset +from utilities.run_context import RunContext +import utilities.cli as cli +import torchvision +#from utilities.albumentations_manager import get_albumentations + + +import mlflow +import os +import shutil +import time +import datetime +import matplotlib.pyplot as plt +import imageio +from skimage import transform + + +class MultiTransformLabelList(LabelList): + def __getitem__(self, idxs: Union[int, np.ndarray]) -> 'LabelList': + """ + Create K transformed images for the unlabeled data + :param idxs: + :return: + """ + "return a single (x, y) if `idxs` is an integer or a new `LabelList` object if `idxs` is a range." + global args + #print("MULTITRANSFORM LIST") + idxs = try_int(idxs) + if isinstance(idxs, Integral): + if self.item is None: + #CALLED EVEN FOR UNLABELED DATA, Y IS USED! + x, y = self.x[idxs], self.y[idxs] + else: + x, y = self.item, 0 + if self.tfms or self.tfmargs: + #THIS IS DONE FOR UNLABELED DATA + x = [x.apply_tfms(self.tfms, **self.tfmargs) for _ in range(args.K_transforms)] + if hasattr(self, 'tfms_y') and self.tfm_y and self.item is None: + #IS NOT CALLED FOR UNLABELED DATA + y = y.apply_tfms(self.tfms_y, **{**self.tfmargs_y, 'do_resolve': False}) + if y is None: y = 0 + return x, y + else: + return self.new(self.x[idxs], self.y[idxs]) + + + +def MixmatchCollate(batch): + """ + # I'll also need to change the default collate function to accomodate multiple augments + :param batch: + :return: + """ + batch = to_data(batch) + if isinstance(batch[0][0], list): + batch = [[torch.stack(s[0]), s[1]] for s in batch] + return torch.utils.data.dataloader.default_collate(batch) + + + + + +class MixupLoss(nn.Module): + """ + Implements the mixup loss + """ + + def forward(self, preds, target, unsort=None, ramp=None, bs=None): + """ + Ramp, unsort and bs is None when doing validation + :param preds: + :param target: + :param unsort: + :param ramp: + :param bs: + :return: + """ + global args + + if(args.balanced==5): + return self.forward_balanced_cross_entropy(preds, target, unsort, ramp, bs) + else: + return self.forward_original(preds, target, unsort, ramp, bs) + + def forward_cross_entropy(self, preds, target, unsort=None, ramp=None, bs=None): + + global args + if unsort is None: + return F.cross_entropy(preds, target) + + calculate_cross_entropy = nn.CrossEntropyLoss() + preds = preds[unsort] + preds_l = preds[:bs] + preds_ul = preds[bs:] + # calculate log of softmax, to ensure correct usage of cross entropy + # one column per class, one batch per row + + preds_ul = torch.softmax(preds_ul, dim=1) + # TARGETS CANNOT BE 1-K ONE HOT VECTOR + (highest_values, highest_classes) = torch.max(target[:bs], 1) + + highest_classes = highest_classes.long() + + loss_x = calculate_cross_entropy(preds_l, highest_classes) + # loss_x = -(preds_l * target[:bs]).sum(dim=1).mean() + loss_u = F.mse_loss(preds_ul, target[bs:]) + self.loss_x = loss_x.item() + self.loss_u = loss_u.item() + return loss_x + args.lambda_unsupervised * ramp * loss_u + + + def forward_original(self, preds, target, unsort=None, ramp=None, num_labeled=None): + global args + """ + Implements the forward pass of 
the loss function + :param preds: predictions of the model + :param target: ground truth targets + :param unsort: ? + :param ramp: ramp weight + :param num_labeled: + :return: + """ + if unsort is None: + #used for evaluation + return F.cross_entropy(preds,target) + preds = preds[unsort] + #labeled and unlabeled observations were packed in the same array + preds_l = preds[:num_labeled] + preds_ul = preds[num_labeled:] + #apply logarithm to softmax of output, to ensure the correct usage of cross entropy + preds_l = torch.log_softmax(preds_l,dim=1) + preds_ul = torch.softmax(preds_ul,dim=1) + loss_x = -(preds_l * target[:num_labeled]).sum(dim=1).mean() + loss_u = F.mse_loss(preds_ul, target[num_labeled:]) + self.loss_x = loss_x.item() + self.loss_u = loss_u.item() + return loss_x + args.lambda_unsupervised * ramp * loss_u + + def forward_balanced(self, preds, target, unsort=None, ramp=None, bs=None): + """ + Balanced forward implementation + :param preds: + :param target: + :param unsort: + :param ramp: + :param bs: + :return: + """ + global args + if unsort is None: + return F.cross_entropy(preds, target) + # target contains mixed up targets!! not just 0s and 1s + preds = preds[unsort] + preds_l = preds[:bs] + preds_ul = preds[bs:] + # calculate log of softmax, to ensure correct usage of cross entropy + # one column per class, one batch per row + preds_l = torch.log_softmax(preds_l, dim=1) + + # get the weights for the labeled observations + weights_labeled = self.get_weights_observations(target[:bs]) + preds_ul = torch.softmax(preds_ul, dim=1) + # get the weights for the unlabeled observations + weights_unlabeled = self.get_weights_observations(target[bs:]) + loss_x = -(weights_labeled * preds_l * target[:bs]).sum(dim=1).mean() + loss_u = F.mse_loss(weights_unlabeled * preds_ul, weights_unlabeled * target[bs:]) + self.loss_x = loss_x.item() + self.loss_u = loss_u.item() + return loss_x + args.lambda_unsupervised * ramp * loss_u + + def forward_balanced_cross_entropy(self, preds, target, unsort=None, ramp=None, bs=None): + global args, class_weights + if unsort is None: + return F.cross_entropy(preds, target) + weights_unlabeled = self.get_weights_observations(target[bs:]).float() + calculate_cross_entropy = nn.CrossEntropyLoss(weight = class_weights.float()) + preds = preds[unsort] + preds_l = preds[:bs] + preds_ul = preds[bs:] + # calculate log of softmax, to ensure correct usage of cross entropy + # one column per class, one batch per row + preds_ul = torch.softmax(preds_ul, dim=1) + # TARGETS CANNOT BE 1-K ONE HOT VECTOR + (highest_values, highest_classes) = torch.max(target[:bs], 1) + highest_classes = highest_classes.long() + loss_x = calculate_cross_entropy(preds_l, highest_classes) + loss_u = F.mse_loss(weights_unlabeled * preds_ul, weights_unlabeled * target[bs:]) + self.loss_x = loss_x.item() + self.loss_u = loss_u.item() + return loss_x + args.lambda_unsupervised * ramp * loss_u + + def get_weights_observations(self, array_predictions): + global class_weights + # class_weights = torch.tensor([0.2, 0.2, 0.2, 0.2, 0.2]) + # each column is a class, each row an observation + num_classes = array_predictions.shape[1] + num_observations = array_predictions.shape[0] + (highest_values, highest_classes) = torch.max(array_predictions, 1) + # turn the highest_classes array a column vector + highest_classes_col = highest_classes.view(-1, 1) + # highest classes for all the observations (rows) and classes (columns) + highest_classes_all = highest_classes_col.repeat(1, num_classes) + # 
print("highest classes all") + # print(highest_classes_all) + # scores all + scores_all = class_weights[highest_classes_all] + scores_all.to(device="cuda:0") + return scores_all + + + +class MixMatchImageList(ImageList): + + + """ + Custom ImageList with filter function + """ + def filter_train(self, num_items, seed = 23488): + """ + Takes a number of observations as labeled, assumes that the evaluation observations are in the test folder + :param num_items: + :param seed: The seed is fixed for reproducibility + :return: return the filtering function by itself + """ + global args + path_unlabeled = args.path_unlabeled + if (args.path_unlabeled == ""): + path_unlabeled = args.path_labeled + #this means that a customized unlabeled dataset is not to be used, just pick the rest of the labelled data as unlabelled + if(path_unlabeled == args.path_labeled): + train_idxs = np.array([i for i, observation in enumerate(self.items) if Path(observation).parts[-3] != "test"]) + else: + # IGNORE THE DATA ALREADY IN THE UNLABELED DATASET + dataset_unlabeled = torchvision.datasets.ImageFolder(path_unlabeled + "/train/") + list_file_names_unlabeled = dataset_unlabeled.imgs + for i in range(0, len(list_file_names_unlabeled)): + #delete root of path + list_file_names_unlabeled[i] = list_file_names_unlabeled[i][0].replace(path_unlabeled, "") + list_train = [] + #add to train if is not in the unlabeled dataset + for i, observation in enumerate(self.items): + path_1 = str(Path(observation)) + sub_str = args.path_labeled + path_2 = path_1.replace(sub_str, "") + path_2 = path_2.replace("train/", "") + is_path_in_unlabeled = path_2 in list_file_names_unlabeled + #add the observation to the train list, if is not in the unlabeled dataset + if( not "test" in path_2 and not is_path_in_unlabeled): + list_train += [i] + #store the train idxs c + train_idxs = np.array(list_train) + logger.info("Customized number of unlabeled observations " + str(len(list_file_names_unlabeled))) + + valid_idxs = np.array([i for i, observation in enumerate(self.items) if Path(observation).parts[-3] == "test"]) + # for reproducibility + np.random.seed(seed) + # keep the number of items desired, 500 by default + keep_idxs = np.random.choice(train_idxs, num_items, replace=False) + + logger.info("Number of labeled observations: " + str(len(keep_idxs))) + logger.info("First labeled id: " + str(keep_idxs[0])) + logger.info("Number of validation observations: " + str(len(valid_idxs))) + logger.info("Number of training observations " + str(len(train_idxs))) + self.items = np.array([o for i, o in enumerate(self.items) if i in np.concatenate([keep_idxs, valid_idxs])]) + return self + +class PartialTrainer(LearnerCallback): + def on_epoch_end(self, epoch, last_metrics, smooth_loss, last_loss, **kwargs): + train_loss = float(smooth_loss) + val_loss = float(last_metrics[0]) + val_accuracy = float(last_metrics[1]) + mlflow.log_metric(key= 'train_loss', value=train_loss, step=epoch)#last_loss + mlflow.log_metric(key= 'val_loss', value=val_loss, step=epoch)#last_metric #1 + mlflow.log_metric(key= 'val_accuracy', value=val_accuracy, step=epoch) + +class MixMatchTrainer(LearnerCallback): + """ + Mix match trainer functions + """ + + def on_train_begin(self, **kwargs): + """ + Callback used when the trainer is beginning, inits variables + :param kwargs: + :return: + """ + global data_labeled + self.l_dl = iter(data_labeled.train_dl) + #metrics recorder + self.smoothL, self.smoothUL = SmoothenValue(0.98), SmoothenValue(0.98) + #metrics to be displayed in the 
table + self.it = 0 + + def mixup(self, a_x, a_y, b_x, b_y): + """ + Mixup augments data by mixing labels and pseudo labels and its observations + :param a_x: + :param a_y: + :param b_x: + :param b_y: + :param alpha: + :return: + """ + global args + alpha = args.alpha_mix + l = np.random.beta(alpha, alpha) + l = max(l, 1 - l) + x = l * a_x + (1 - l) * b_x + y = l * a_y + (1 - l) * b_y + return x, y + + def sharpen(self, p): + global args + """ + Sharpens the distribution output, to encourage confidence + :param p: + :param T: + :return: + """ + T = args.T_sharpening + u = p ** (1 / T) + return u / u.sum(dim=1, keepdim=True) + + def on_batch_begin(self, train, last_input, last_target, **kwargs): + """ + Called on batch training at the begining + :param train: + :param last_input: + :param last_target: + :param kwargs: + :return: + """ + global data_labeled, args + if not train: return + try: + x_l, y_l = next(self.l_dl) + except: + self.l_dl = iter(data_labeled.train_dl) + x_l, y_l = next(self.l_dl) + x_ul = last_input + with torch.no_grad(): + #calculates the pseudo sharpened labels + ul_labels = self.sharpen( + torch.softmax(torch.stack([self.learn.model(x_ul[:, i]) for i in range(x_ul.shape[1])], dim=1), + dim=2).mean(dim=1)) + #create torch array of unlabeled data + x_ul = torch.cat([x for x in x_ul]) + + #WE CAN CALCULATE HERE THE CONFIDENCE COEFFICIENT + + ul_labels = torch.cat([y.unsqueeze(0).expand(args.K_transforms, -1) for y in ul_labels]) + + l_labels = torch.eye(data_labeled.c).cuda()[y_l] + + w_x = torch.cat([x_l, x_ul]) + w_y = torch.cat([l_labels, ul_labels]) + idxs = torch.randperm(w_x.shape[0]) + #create mixed input and targets + mixed_input, mixed_target = self.mixup(w_x, w_y, w_x[idxs], w_y[idxs]) + bn_idxs = torch.randperm(mixed_input.shape[0]) + unsort = [0] * len(bn_idxs) + for i, j in enumerate(bn_idxs): unsort[j] = i + mixed_input = mixed_input[bn_idxs] + + ramp = self.it / args.rampup_coefficient if self.it < args.rampup_coefficient else 1.0 + return {"last_input": mixed_input, "last_target": (mixed_target, unsort, ramp, x_l.shape[0])} + + def on_batch_end(self, train, **kwargs): + """ + Add the metrics at the end of the batch training + :param train: + :param kwargs: + :return: + """ + if not train: return + self.smoothL.add_value(self.learn.loss_func.loss_x) + self.smoothUL.add_value(self.learn.loss_func.loss_u) + self.it += 1 + + """def on_epoch_end(self, last_metrics, **kwargs): + Avoid adding weird stuff on metrics table + When the epoch ends, add the accmulated metric values + :param last_metrics: + :param kwargs: + :return: + + return add_metrics(last_metrics, [self.smoothL.smooth, self.smoothUL.smooth]) + """ + + def on_epoch_end(self, epoch, last_metrics, smooth_loss, last_loss, **kwargs): + train_loss = float(smooth_loss) + val_loss = float(last_metrics[0]) + val_accuracy = float(last_metrics[1]) + mlflow.log_metric(key= 'train_loss', value=train_loss, step=epoch)#last_loss + mlflow.log_metric(key= 'val_loss', value=val_loss, step=epoch)#last_metric #1 + mlflow.log_metric(key= 'val_accuracy', value=val_accuracy, step=epoch) + + +def get_dataset_stats(args): + #note: these are just used as a placeholder, the actual standardization stats are calculated on per batch basis when the data is read + if(args.norm_stats.strip() == "MNIST"): + # stats for MNIST + meanDatasetComplete = [0.1307, 0.1307, 0.1307] + stdDatasetComplete = [0.3081, 0.3081, 0.3081] + + return (meanDatasetComplete, stdDatasetComplete) + +def calculate_weights(list_labels): + """ + Calculate 
the class weights according to the number of observations + :param list_labels: + :return: + """ + global logger, args + array_labels = np.array(list_labels) + logger.info("Using balanced loss: " + str(args.balanced)) + list_classes = np.unique(array_labels) + weight_classes = np.zeros(len(list_classes)) + for curr_class in list_classes: + + number_observations_class = len(array_labels[array_labels == curr_class]) + logger.info("Number observations " + str(number_observations_class) + " for class " + str(curr_class)) + weight_classes[curr_class] = 1 / number_observations_class + + weight_classes = weight_classes / weight_classes.sum() + logger.info("Weights to use: " + str(weight_classes)) + weight_classes_tensor = torch.tensor(weight_classes, device ="cuda:0" ) + return weight_classes_tensor + +def get_datasets(): + """ + Get datasets (FAST AI data bunches ) for labeled, unlabeled and validation + :return: data_labeled (limited labeled data), data_unlabeled , data_full (complete labeled dataset) + """ + global args, data_labeled, logger, class_weights + path_labeled = args.path_labeled + path_unlabeled = args.path_unlabeled + if (args.path_unlabeled == ""): + path_unlabeled = path_labeled + #get dataset mean and std + norm_stats = get_dataset_stats(args) + logger.info("Loading labeled data from: " + path_labeled) + logger.info("Loading unlabeled data from: " + path_unlabeled) + # Create two databunch objects for the labeled and unlabled images. A fastai databunch is a container for train, validation, and + # test dataloaders which automatically processes transforms and puts the data on the gpu. + # https://docs.fast.ai/vision.transform.html + + #COMPUTE BATCH NORMALIZATION STATS FOR LABELED DATA + data_labeled = (MixMatchImageList.from_folder(path_labeled) + .filter_train(args.number_labeled) # Use 500 labeled images for traning + .split_by_folder(valid="test") # test on all 10000 images in test set + .label_from_folder() + .transform(get_transforms(do_flip = True, flip_vert = True, max_zoom=1, max_warp=None, p_affine=0, p_lighting = 0), + size=args.size_image) + # On windows, must set num_workers=0. Otherwise, remove the argument for a potential performance improvement + .databunch(bs=args.batch_size, num_workers=args.workers) + .normalize(norm_stats)) + train_set = set(data_labeled.train_ds.x.items) + #logging the labeled inputs to artifacts/inputs/labelled + labeled_array_list = [] + for labeled in train_set: + mlflow.log_artifact(labeled, artifact_path='inputs/labelled') + image = imageio.imread(labeled) + labeled_array_list.append(image) + labeled_shape = image.shape + labeled_array = np.array(labeled_array_list)/255. + if len(labeled_array.shape) < 4: #for grayscale data we copy the last chanel three times + norm_stats_labeled = (list(np.mean(labeled_array[:,:,:,np.newaxis], axis=(0,1,2)))*3, list(np.std(labeled_array[:,:,:,np.newaxis], axis=(0,1,2)))*3) + else: + norm_stats_labeled = (list(np.mean(labeled_array, axis=(0,1,2))), list(np.std(labeled_array, axis=(0,1,2)))) + + #CREATE DATA BUNCH WITH BATCH STATS FOR LABELED DATA + data_labeled = (MixMatchImageList.from_folder(path_labeled) + .filter_train(args.number_labeled) # Use 500 labeled images for traning + .split_by_folder(valid="test") # test on all 10000 images in test set + .label_from_folder() + .transform(get_transforms(do_flip = True, flip_vert = True, max_zoom=1, max_warp=None, p_affine=0, p_lighting = 0), + size=args.size_image) + # On windows, must set num_workers=0. 
Otherwise, remove the argument for a potential performance improvement + .databunch(bs=args.batch_size, num_workers=args.workers) + .normalize(norm_stats_labeled)) + # normalize_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False) + train_set = set(data_labeled.train_ds.x.items) + + #get the list of labels for the dataset + list_labels = data_labeled.train_ds.y.items + #calculate the class weights + class_weights = calculate_weights(list_labels) + # load the unlabeled data + #filter picks the labeled images not contained in the unlabeled dataset, in the case of SSDL + #the test set is in the unlabeled folder + + src = (ImageList.from_folder(path_unlabeled) + .filter_by_func(lambda x: x not in train_set) + .split_by_folder(valid="test") + ) + unlabeled_array_list = [] + #logging iod and ood unlabelled data + for class_id in os.listdir(path_unlabeled+'/train'): + for unlabelled in os.listdir(path_unlabeled+'/train'+'/'+class_id): + if 'ood' in unlabelled: + mlflow.log_artifact(path_unlabeled+'/train'+'/'+class_id+'/'+unlabelled, artifact_path='inputs/unlabelled/ood/'+class_id) + image = imageio.imread(path_unlabeled+'/train'+'/'+class_id+'/'+unlabelled) + image = transform.resize(image, (args.size_image, args.size_image, 3),preserve_range=True) + else: + #mlflow.log_artifact(path_unlabeled+'/train'+'/'+class_id+'/'+unlabelled, artifact_path='inputs/unlabelled/iod/'+class_id) + mlflow.log_artifact(path_unlabeled+'/train'+'/'+class_id+'/'+unlabelled, artifact_path='inputs/unlabelled/iod/'+class_id) + image = imageio.imread(path_unlabeled+'/train'+'/'+class_id+'/'+unlabelled) + image = transform.resize(image, (args.size_image, args.size_image, 3), preserve_range=True) + unlabeled_array_list.append(image) + + unlabeled_array = np.array(unlabeled_array_list)/255. 
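# --- Illustrative aside (not part of the original diff): a minimal sketch of the
# per-channel normalisation-statistics computation applied to the labeled stack
# above and the unlabeled stack just below. The helper name `stack_norm_stats`
# is assumed for illustration only.
import numpy as np

def stack_norm_stats(images: np.ndarray):
    """images: (N, H, W) grayscale or (N, H, W, 3) colour array scaled to [0, 1]."""
    if images.ndim < 4:
        # grayscale: a single mean/std, repeated for the three channels the model expects
        return [float(images.mean())] * 3, [float(images.std())] * 3
    # colour: reduce over batch and spatial axes, keep the channel axis
    return list(images.mean(axis=(0, 1, 2))), list(images.std(axis=(0, 1, 2)))

# usage sketch: stack_norm_stats(np.random.rand(16, 32, 32, 3)) -> ([~0.5]*3, [~0.29]*3)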
+ print('##################################') + print(unlabeled_array.shape) + print('##################################') + if len(unlabeled_array.shape) < 4: #for grayscale data we copy the last chanel three times + norm_stats_unlabeled = (list(np.mean(unlabeled_array[:,:,:,np.newaxis], axis=(0,1,2)))*3, list(np.std(unlabeled_array[:,:,:,np.newaxis], axis=(0,1,2)))*3) + else: + norm_stats_unlabeled = (list(np.mean(unlabeled_array, axis=(0,1,2))), list(np.std(unlabeled_array, axis=(0,1,2)))) + mlflow.log_param(key="norm_stats_labeled", value=str(norm_stats_labeled)) + mlflow.log_param(key="norm_stats_unlabeled", value=str(norm_stats_unlabeled)) + + #AUGMENT THE DATA + src.train._label_list = MultiTransformLabelList + # https://docs.fast.ai/vision.transform.html + # data not in the train_set and splitted by test folder is used as unlabeled + data_unlabeled = (src.label_from_folder() + .transform(get_transforms(do_flip = True, flip_vert = True, max_zoom=1, max_warp=None, p_affine=0, p_lighting = 0), size=args.size_image) + .databunch(bs=args.batch_size, collate_fn=MixmatchCollate, num_workers=10) + .normalize(norm_stats_unlabeled)) + + + # Databunch with all 50k images labeled, for baseline + data_full = (ImageList.from_folder(path_labeled) + .split_by_folder(valid="test") + .label_from_folder() + .transform(get_transforms(do_flip = True, flip_vert = True, max_zoom=1, max_warp=None, p_affine=0, p_lighting = 0), + size=args.size_image) + .databunch(bs=args.batch_size, num_workers=args.workers) + .normalize(norm_stats)) + return (data_labeled, data_unlabeled, data_full) + + + + +def train_mix_match(): + """ + Train the mix match model + :param path_labeled: + :param path_unlabeled: + :param number_epochs: + :param learning_rate: + :param mode: + :return: + """ + global data_labeled, is_colab, logger, args + learning_rate = args.lr + number_epochs = args.epochs + logger = logging.getLogger('main') + (data_labeled, data_unlabeled, data_full)= get_datasets() + + #start_nf the initial number of features + """ + Wide ResNet with num_groups and a width of k. + Each group contains N blocks. start_nf the initial number of features. Dropout of drop_p is applied in between the two convolutions in each block. The expected input channel size is fixed at 3. + Structure: initial convolution -> num_groups x N blocks -> final layers of regularization and pooli + """ + if(args.model == "wide_resnet"): + model = models.WideResNet(num_groups=3,N=4,num_classes=args.num_classes,k = 2,start_nf=args.size_image) + elif(args.model == "densenet"): + model = models.densenet121(num_classes=args.num_classes) + elif(args.model == "squeezenet"): + model = models.squeezenet1_1(num_classes=args.num_classes) + elif(args.model.strip() == "alexnet"): + logger.info("Using alexnet") + model = models.alexnet(num_classes=args.num_classes) + + if (args.mode.strip() == "fully_supervised"): + logger.info("Training fully supervised model") + # Edit: We can find the answer ‘Note that metrics are always calculated on the validation set.’ on this page: https://docs.fast.ai/training.html 42. 
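# --- Illustrative aside (not part of the original diff): the semi-supervised
# objective that MixupLoss.forward_original above implements, shown in isolation
# before the Learner for the "ssdl" mode is built below. `lambda_u` stands in for
# args.lambda_unsupervised and `ramp` for the warm-up factor computed in
# MixMatchTrainer.on_batch_begin; the function name is illustrative only.
import torch
import torch.nn.functional as F

def mixmatch_loss(preds_l, targets_l, preds_u, targets_u, lambda_u, ramp):
    # labeled term: cross-entropy between soft (mixed-up) targets and log-probabilities
    loss_x = -(torch.log_softmax(preds_l, dim=1) * targets_l).sum(dim=1).mean()
    # unlabeled term: MSE between predicted and guessed label distributions
    loss_u = F.mse_loss(torch.softmax(preds_u, dim=1), targets_u)
    return loss_x + lambda_u * ramp * loss_u

# ramp-up sketch, as in on_batch_begin: ramp = min(it / rampup_coefficient, 1.0)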
+ if (is_colab): + learn = Learner(data_full, model, metrics=[accuracy]) + else: #, callback_fns = [CSVLogger] + learn = Learner(data_full, model, metrics=[accuracy], callback_fns = [CSVLogger]) + + + + if (args.mode.strip() == "partial_supervised"): + logger.info("Training supervised model with a limited set of labeled data") + if(is_colab): + #uses loss_func=FlattenedLoss of CrossEntropyLoss() + learn = Learner(data_labeled, model, metrics=[accuracy]) + else: + if(args.balanced == 5): + logger.info("Using balanced cross entropy") + calculate_cross_entropy = nn.CrossEntropyLoss(weight=class_weights.float()) + learn = Learner(data_labeled, model, metrics=[accuracy], callback_fns = [PartialTrainer, CSVLogger], loss_func = calculate_cross_entropy) + else: + learn = Learner(data_labeled, model, metrics=[accuracy], callback_fns=[PartialTrainer, CSVLogger]) + + + #learn.fit_one_cycle(number_epochs, learning_rate, wd=args.weight_decay) + + """ + fit[source][test] + fit(epochs:int, lr:Union[float, Collection[float], slice]=slice(None, 0.003, None), wd:Floats=None, callbacks:Collection[Callback]=None) + Fit the model on this learner with lr learning rate, wd weight decay for epochs with callbacks. + """ + + if (args.mode.strip() == "ssdl"): + logger.info("Training semi supervised model with limited set of labeled data") + # https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin + if(is_colab): + learn = Learner(data_unlabeled, model, loss_func=MixupLoss(), callback_fns=[MixMatchTrainer], metrics=[accuracy]) + else: + learn = Learner(data_unlabeled, model, loss_func=MixupLoss(), callback_fns=[MixMatchTrainer, CSVLogger], + metrics=[accuracy]) + + #train the model + learn.fit_one_cycle(number_epochs, learning_rate, wd=args.weight_decay) + #if it is not colab, write the csv to harddrive + if(not is_colab): + logged_frame = learn.csv_logger.read_logged_file() + + + +def main_colab(): + global args, logger, is_colab + is_colab = True + dateInfo = "{date:%Y-%m-%d_%H_%M_%S}".format(date=datetime.now()) + logging.basicConfig(filename="log_" + dateInfo + ".txt", level=logging.INFO, format='%(message)s') + logger = logging.getLogger('main') + #Get the default arguments + args = create_parser().parse_args(args=[]) + #args.balanced = False + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.info("Arguments: " + str(args)) + train_mix_match() + +if __name__ == '__main__': + global args, counter, context, logger, is_colab + is_colab = False + args = cli.parse_commandline_args() + print("Balanced loss: ") + #args.balanced = False + print(args.balanced) + + print("Rampup coefficient: ", args.rampup_coefficient) + + logger = logging.getLogger('main') + logger.info("Learning rate " + str(args.lr)) + + + #mlflow logging + _, batch_info = args.path_unlabeled.rsplit('/',1) + batch, batch_num, batch_stats = batch_info.split('_', 2) + num_labeled = str(args.number_labeled) + _, _, num_unlabeled, _, _, ood_perc_pp = batch_stats.split('_') + + experiment_name = args.dataset+'-'+num_labeled+'-'+ood_perc_pp + run_name = batch+'_'+batch_num + mlflow.set_experiment(experiment_name=experiment_name) #create the experiment + if args.exp_creator == "Yes": + quit() + mlflow.start_run(run_name=run_name) #start the mlflow run for logging + + mlflow.log_params(params=vars(args)) #log 
all parameters in one go using log_batch + mlflow.log_param(key='batch number', value=batch+' '+batch_num) + mlflow.log_param(key='batch stats', value = batch_stats) + + train_mix_match() + mlflow.end_run() diff --git a/ood_experiment_at_scale_script.sh b/ood_experiment_at_scale_script.sh new file mode 100644 index 0000000..4b368d2 --- /dev/null +++ b/ood_experiment_at_scale_script.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# +#EXPERIMENT CONFIGS +#DATA +#DLIDS are the "download ids" of the gdrive for each data set +export CUDA_VISIBLE_DEVICES=1 +declare -A DATA_DLIDS=( ["MNIST"]="10pULG3xRIkl5tDo6VJHUK6NHIguMT7Jq" ["FASHIONMNIST"]="11AJ-OEgtj7XDeLHPVEyWuvHzFIIadj9_" ["CIFAR10"]="1O6uarg54CwtZ3h_B6YzD1KW9nbQ-E7Xl" ["TINYIMAGENET"]="10i1FV1SgXxMWfgTEpAXr0qe7q7s6Ko6e" ["SVHN"]="1wgTQJOtGxWPLNKPMuDc7vzYejYnWJsVr" ["SVHN-different"]="10HbYSMt3CbHeieUBqO675eNpAef-UY3_" ["GaussianNoise"]="1GXljou_EJGcdfVsfVJiVMVo7-RKNu106" ["SALTANDPEPPER"]="1iBwKyR7M4_ca2Ti7xW5FJ-BNdqEDG-vK" ["FASHIONPRODUCT"]="1zN1BF1u1SJl81JpH6hexYgrVvivaGxa5") +#FPATHS are the file paths for the data sets in the local working directory when doing experiments +declare -A DATA_FPATHS=(["MNIST"]="data/MNIST/" ["FASHIONMNIST"]="data/FASHIONMNIST/" ["CIFAR10"]="data/CIFAR10/" ["TINYIMAGENET"]="data/TINYIMAGENET/" ["SVHN-different"]="data/SVHN-different/" ["GaussianNoise"]="data/GaussianNoise/" ["SALTANDPEPPER"]="data/SALTANDPEPPER/" ["SVHN"]="data/SVHN/" ["FASHIONPRODUCT"]="data/FASHIONPRODUCT/") + +BASE_DATA="CIFAR10" #the base data that is used for the iod data. around it which the experiment is centered +DIFFERENT_DATA="MNIST" #array of datasets used for the ood setting "different" -> contrasting data sets + +OOD_PERC_PP_LIST=(0 50 100) #the ood percentage in percentage points +NUM_UNLABELED=3000 #should be 3000 +MIN_CLASS_ID=0 #the lowest class id for the classes in the data set +MAX_CLASS_ID=9 #the highest class id for the classes of the data set +NUM_CLASSES_IN_DIST=5 #the number of classes to select for the in dist class +NUM_LABELED_LIST=(60 100 150) +OOD_TYPE="different" #can be "half-half" (ood samples come from same dataset but are a subset of the classes) or "different" (ood samples come from different dataset) +#NOTE: seed for shuf of iod class permuations is given by batch id on run level +# +#RUN CONFIGS (a run is an iteration of teh experiment with one of the random data batches) +BATCHES=(0 1 2 3 4 5 6 7 8 9) #the batch id used for the different runs of the experiment +# +#MIX MATCH ALGO CONFIGS +MODEL="wide_resnet" +DATASET="CIFAR10-BASELINE" +RESULTS_FILE="stats_OOD_4_SSDL.csv" +WORKERS="1" +EPOCHS="50" #should be 50 +BATCH_SIZE="16" #should be 16 +LR="0.0002" +WEIGHT_DECAY="0.0001" +K_TRANSFORMS="2" +T_SHARPENING="0.25" +ALPHA_MIX="0.75" +MODE="ssdl" +BALANCED="5" #int -1 no bal, 5 bal +GAMMA_US="25" #the gamma for the unsupervised loss +IMG_SIZE="32" +NORM_STATS="MNIST" #is not used +# +LOG_FOLDER="logs" +SAVE_WEIGHTS="FALSE" +WEIGHTS_PATH="" +RAMPUP_COEFFICIENT="3000" +# +N=10 #number of parallel processes +#DOWNLOAD DATA +#steps +#create data dir +mkdir data +#cd insto data dir +cd data + +#do it for base data +##downlaod file +###get file id +FILEID="${DATA_DLIDS["${BASE_DATA}"]}" +###download +wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O 
${BASE_DATA}.zip && rm -rf /tmp/cookies.txt +##unzip +unzip ${BASE_DATA}.zip +##remove zip +rm ${BASE_DATA}.zip + +#do it for different data +###get file id +FILEID="${DATA_DLIDS["${DIFFERENT_DATA}"]}" +###download +wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${DIFFERENT_DATA}.zip && rm -rf /tmp/cookies.txt +##unzip +unzip ${DIFFERENT_DATA}.zip +##remove zip +rm ${DIFFERENT_DATA}.zip +#when done cd .. back to work dir +cd .. + +#EXPERIMENTS +PATH_BASE="${DATA_FPATHS["${BASE_DATA}"]}" #path to the dataset that will be used for the experiments and construction of the train and val sets. assumes at the end of path exists another directory /all/ which contains subdirectories with id name of class and then each containing the samples of this class +PATH_DIFFERENT="${DATA_FPATHS["${DIFFERENT_DATA}"]}" + +train_batch(){ + local batch_id=$1 + local OOD_PERC=$2 + local OOD_PERC_PP=$3 + local NUM_UNLABELED=$4 + local IDS=$5 + local LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data + + local PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id" + local PATH_UNLABELED="$PATH_BASE/unlabeled/batch_${batch_id}_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}" + + python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT +} + + + +if [ $OOD_TYPE = "half-half" ] +then + for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data + do + for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data + do + OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #go from percentage points to decimals for OOD percentage, i.e. e.g. 
66 -> 0.66, for the second python command + #first iteration over batches creates the data + IDS_LIST=() #keep track of the class ids per batch for running mixmatch later + for batch_id in ${BATCHES[@]} + do + IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) # --random-source=<(echo $batch_id)) #randomly select class ids for in-dist data, random seed is provided by batch_id for reproducability + IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training + LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data + + python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES + + python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_BASE/batches_unlabeled_out_dist/batch_$batch_id" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id" + + done + + #the second iteration over batches runs mixmatch in parallel + for ((i=0;i<${#BATCHES[@]};++i)) + do + ((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files + train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "${IDS_LIST[i]}"& + done + wait + #) + #do clean up prior to the next experiment + rm -r $PATH_BASE/unlabeled/ + rm -r $PATH_BASE/batches_labeled_in_dist/ + rm -r $PATH_BASE/batches_unlabeled_out_dist/ + done + done +elif [[ $OOD_TYPE -eq "different" ]] +then + #add loop over the different datasets -> no loop as I split experiments into invidual data sets + + for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data + do + for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data + do + OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #go from percentage points to decimals for OOD percentage, i.e. e.g. 
66 -> 0.66, for the second python command + IDS_LIST=() #keep track of the class ids per batch for running mixmatch later + for batch_id in ${BATCHES[@]} + do + IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) # --random-source=<(echo $batch_id)) #randomly select class ids for in-dist data, random seed is provided by batch_id for reproducability + IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training + LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data#the subset of the classes in the dataset that are used as in dist data + python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES + + python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_DIFFERENT" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id" + #path_ood here has one subfolder with all the unlabelled images + + done + + #the second iteration over batches runs mixmatch in parallel + PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id" + PATH_UNLABELED="$PATH_BASE/unlabeled/batch_0_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}" + python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT --exp_creator "Yes" + for ((i=0;i<${#BATCHES[@]};++i)) + do + ((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files + train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "$LIST_IN_DIST_CLASSES"& + done + wait + + #do clean up prior to the next experiment + rm -r $PATH_BASE/unlabeled/ + rm -r $PATH_BASE/batches_labeled_in_dist/ + rm -r $PATH_BASE/batches_unlabeled_out_dist/ + done + done + + + +else + echo "No valid OOD_TYPE was specified. 
Choose 'same' or 'different'" +fi +#final cleanup -> delete all data +rm -r data/ +#dynamic resizing of images for different setting -> when they do not exactly match -> defined in mixmatch diff --git a/results_analysis_3.py b/results_analysis_3.py new file mode 100644 index 0000000..f2c3707 --- /dev/null +++ b/results_analysis_3.py @@ -0,0 +1,41 @@ +from mlflow.tracking.client import MlflowClient +import mlflow +import numpy as np + + +start_experiment=3 +end_experiment=9 +metric='val_accuracy' +num_epochs=50 + +#range(start_experiment,end_experiment+1) +experiment_ids=[1,2,3] + + +means=[] +stds=[] +for experiment_id in experiment_ids: + experiment_id = str(experiment_id) + experiment_name = mlflow.get_experiment(experiment_id).name + print(experiment_name) + results = mlflow.search_runs(experiment_ids=[experiment_id]) + + max_accuracies=[] + for run_id in (results['run_id']): + val_accuracies=MlflowClient().get_metric_history(run_id, metric) + max_accuracy=0. + for epoch in range(num_epochs): + accuracy=val_accuracies[epoch].value + if val_accuracies[epoch].value > max_accuracy: + max_accuracy=val_accuracies[epoch].value + max_accuracies.append(max_accuracy) + mean=np.mean(max_accuracies) + std=np.std(max_accuracies) + print('Mean:{} Std:{}'.format(mean, std)) + means.append(mean) + stds.append(std) + +#just use format + +print("& 0 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${}".format(means[0], stds[0], means[1], stds[1], means[2], stds[2],r"\\")) +#60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std, 60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std diff --git a/results_analysis_6.py b/results_analysis_6.py new file mode 100644 index 0000000..0926cfa --- /dev/null +++ b/results_analysis_6.py @@ -0,0 +1,40 @@ +from mlflow.tracking.client import MlflowClient +import mlflow +import numpy as np + + +start_experiment=3 +end_experiment=9 +metric='val_accuracy' +num_epochs=50 + +experiment_ids=[1,2,3,4,5,6] + + +means=[] +stds=[] +for experiment_id in experiment_ids: + experiment_id = str(experiment_id) + experiment_name = mlflow.get_experiment(experiment_id).name + print(experiment_name) + results = mlflow.search_runs(experiment_ids=[experiment_id]) + + max_accuracies=[] + for run_id in (results['run_id']): + val_accuracies=MlflowClient().get_metric_history(run_id, metric) + max_accuracy=0. + for epoch in range(num_epochs): + accuracy=val_accuracies[epoch].value + if val_accuracies[epoch].value > max_accuracy: + max_accuracy=val_accuracies[epoch].value + max_accuracies.append(max_accuracy) + mean=np.mean(max_accuracies) + std=np.std(max_accuracies) + print('Mean:{} Std:{}'.format(mean, std)) + means.append(mean) + stds.append(std) + +#just use format + +print("& 50 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${} \n & & & 100 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${}".format(means[0], stds[0], means[2], stds[2], means[4], stds[4],r"\\", means[1], stds[1], means[3], stds[3], means[5], stds[5], r" \\")) +#60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std, 60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std diff --git a/utilities/cli.py b/utilities/cli.py new file mode 100644 index 0000000..b53da52 --- /dev/null +++ b/utilities/cli.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. 
To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.` + +import re +import argparse +import logging + + + +DEFAULT_PATH = "/media/Data/saul/Datasets/Inbreast_folder_per_class" + +NUMBER_LABELED_OBSERVATIONS = 150 +BATCH_SIZE = 8 + +LAMBDA_DEFAULT = 25 +# Modified from +K_DEFAULT = 2 +T_DEFAULT = 0.25 +ALPHA_DEFAULT = 0.75 +LR_DEFAULT = 2e-6 +WEIGHT_DECAY_DEFAULT = 1e-4 +DEFAULT_RESULTS_FILE = "Stats.csv" +LOG = logging.getLogger('main') + +__all__ = ['parse_cmd_args', 'parse_dict_args'] + + +def create_parser(): + parser = argparse.ArgumentParser(description='PyTorch Mix Match Training') + + parser.add_argument('--exp_creator', type=str, default="No", + help='Whether to use script to create experiment') + + parser.add_argument('--dataset', type=str, default="No data set specified", + help='Name of the dataset used for the experiments') + + parser.add_argument('--path_labeled', type=str, default=DEFAULT_PATH, + help='The directory with the labeled data') + parser.add_argument('--path_unlabeled', type=str, default="", + help='The directory with the unlabeled data') + + parser.add_argument('--results_file_name', type=str, default=DEFAULT_RESULTS_FILE, + help='Name of results file') + parser.add_argument('-j', '--workers', default=10, type=int, metavar='N', + help='number of data loading workers (default: 4)') + parser.add_argument('--epochs', default=300, type=int, metavar='N', + help='number of total epochs to run') + + parser.add_argument('-b', '--batch_size', default=BATCH_SIZE, type=int, + metavar='N', help='mini-batch size (default: 256)') + parser.add_argument('--lr', '--learning-rate', default=LR_DEFAULT, type=float, + metavar='LR', help='learning rate') + + parser.add_argument('--weight_decay', default=WEIGHT_DECAY_DEFAULT, type=float, + metavar='W', help='weight decay (default: 1e-4)') + + parser.add_argument('--K_transforms', default=K_DEFAULT, type=int, metavar='K', help = 'Number of simple transformations') + parser.add_argument('--T_sharpening', default=T_DEFAULT, type=float, metavar='T', + help='Sharpening coefficient') + + parser.add_argument('--alpha_mix', default=ALPHA_DEFAULT, type=float, metavar='A', + help='Mix alpha coefficient') + + parser.add_argument('--mode', default="fully_supervised", type=str, + help='Modes: fully_supervised, partial_supervised, ssdl') + #int -1 no bal, 5 bal + parser.add_argument('--balanced', default=-1, type=int, + help='Balance the cross entropy loss') + + parser.add_argument('--lambda_unsupervised', default=LAMBDA_DEFAULT, type=float, + help='Unsupervised learning coefficient') + + parser.add_argument('--number_labeled', default=NUMBER_LABELED_OBSERVATIONS, type=int, metavar='A', + help='Number of labeled observations') + + parser.add_argument('--model', default="densenet", type=str, metavar='A', + help='Model to use') + + parser.add_argument('--num_classes', default=5, type=int, + help='Number of classes') + + parser.add_argument('--size_image', default=32, type=int, + help='Image input size') + + parser.add_argument('--log_folder', type=str, default="logs", + help='logging folder') + + + + parser.add_argument('--norm_stats', type=str, default="MNIST", + help='mean std values for dataset: MNIST and COVID provided') + + parser.add_argument('--save_weights', default=False, type=bool, + help='Save the weights of the last model found in training') + + parser.add_argument('--weights_path_name', type=str, default="", + help='path 
to store weights') + + parser.add_argument('--rampup_coefficient', default=3000, type=int, + help='Rampup coefficient for the unsupervised term') + + return parser + + +def parse_commandline_args(): + return create_parser().parse_args() + + +def parse_dict_args(**kwargs): + def to_cmdline_kwarg(key, value): + if len(key) == 1: + key = "-{}".format(key) + else: + key = "--{}".format(re.sub(r"_", "-", key)) + value = str(value) + return key, value + + kwargs_pairs = (to_cmdline_kwarg(key, value) + for key, value in kwargs.items()) + cmdline_args = list(sum(kwargs_pairs, ())) + + logging.info("Using these command line args: %s", " ".join(cmdline_args)) + + return create_parser().parse_args(cmdline_args) + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def str2epochs(v): + try: + if len(v) == 0: + epochs = [] + else: + epochs = [int(string) for string in v.split(",")] + except: + raise argparse.ArgumentTypeError( + 'Expected comma-separated list of integers, got "{}"'.format(v)) + if not all(0 < epoch1 < epoch2 for epoch1, epoch2 in zip(epochs[:-1], epochs[1:])): + raise argparse.ArgumentTypeError( + 'Expected the epochs to be listed in increasing order') + return epochs diff --git a/utilities/dataset_distance_measurer.py b/utilities/dataset_distance_measurer.py new file mode 100644 index 0000000..6d6a897 --- /dev/null +++ b/utilities/dataset_distance_measurer.py @@ -0,0 +1,538 @@ +import torch +import torchvision.models as models +from fastai.vision import * +import pingouin as pg +from fastai.callbacks import CSVLogger +from numbers import Integral +import torch +import logging +import sys +from torchvision.utils import save_image +import numpy as np +import pandas as pd +import scipy +from PIL import Image +import torchvision.models.vgg as models2 +import torchvision.models as models3 +from scipy.stats import entropy +from scipy.spatial import distance + +import torchvision.transforms as transforms +import torchvision +from scipy.stats import mannwhitneyu + + + +def pytorch_feature_extractor(): + input = torch.rand(1, 3, 50, 50) + vgg16 = models3.resnet152(pretrained=True) + print(vgg16) + output = vgg16[:-1](input) + print(output) + + +def calculate_Minowski_feature_space_stats(databunch1, databunch2, model, batch_size=100, p=2, num_batches=10): + run_results = [] + for i in range(0, num_batches): + dist_i = calculate_Minowski_feature_space(databunch1, databunch2, model, batch_size, p) + run_results += [dist_i] + run_results_np = np.array(run_results) + mean_results = run_results_np.mean() + std_results = run_results_np.std() + return (mean_results, std_results) + + +def calculate_pdf_dist_stats(databunch1, databunch2, model, batch_size=80, + distance_func=distance.jensenshannon, num_batches=10): + """ + + :param databunch1: + :param databunch2: + :param model: + :param batch_size: + :param distance_func: + :param num_batches: + :return: + """ + run_results = [] + for i in range(0, num_batches): + dist_i = calculate_pdf_dist(databunch1, databunch2, model, batch_size, distance_func) + run_results += [dist_i] + run_results_np = np.array(run_results) + mean_results = run_results_np.mean() + std_results = run_results_np.std() + return (mean_results, std_results) + + + +def calculate_pdf_dist(databunch1, databunch2, model, batch_size=80, distance_func=distance.jensenshannon): + # just get the number of dimensions + + 
tensorbunch1 = databunch_to_tensor(databunch1) + tensorbunch2 = databunch_to_tensor(databunch2) + + feature_extractor = get_feature_extractor(model) + batch_tensors1 = tensorbunch1[0:batch_size, :, :, :] + # get number of features + features_bunch1 = feature_extractor(batch_tensors1) + num_features = features_bunch1.shape[1] + print("Calculating pdf distance for for feature space of dimensions: ", num_features) + js_dist_dims = [] + # calculate distance of histograms for given + for i in range(0, num_features): + js_dist_dims += [ + calculate_distance_hists(tensorbunch1, tensorbunch2, feature_extractor, dimension=i, batch_size=batch_size, + distance_func=distance_func)] + js_dist_sum = sum(js_dist_dims) + return js_dist_sum + + +def calculate_distance_hists(tensorbunch1, tensorbunch2, feature_extractor, dimension, batch_size=20, + distance_func=distance.jensenshannon): + # random pick of batch observations + total_number_obs_1 = tensorbunch1.shape[0] + total_number_obs_2 = tensorbunch2.shape[0] + batch_indices_1 = generate_rand_bin_array(batch_size, total_number_obs_1) + batch_indices_2 = generate_rand_bin_array(batch_size, total_number_obs_2) + # create the batch of tensors to get its features + batch_tensors1 = tensorbunch1[batch_indices_1, :, :, :] + batch_tensors2 = tensorbunch2[batch_indices_2, :, :, :] + + # get the features from the selected batch + features_bunch1 = feature_extractor(batch_tensors1) + features_bunch2 = feature_extractor(batch_tensors2) + # get the values of a specific dimension + values_dimension_bunch1 = features_bunch1[:, dimension].cpu().detach().numpy() + values_dimension_bunch2 = features_bunch2[:, dimension].cpu().detach().numpy() + # calculate the histograms + (hist1, bucks1) = np.histogram(values_dimension_bunch1, bins=15, range=None, normed=None, weights=None, + density=None) + #ensure that the histograms have the same meaning, by using the same buckets + (hist2, bucks2) = np.histogram(values_dimension_bunch2, bins=bucks1, range=None, normed=None, weights=None, + density=None) + # normalize the histograms + hist1 = np.array(hist1) / sum(hist1) + hist2 = np.array(hist2) / sum(hist2) + js_dist = distance_func(hist1.tolist(), hist2.tolist()) + return js_dist + + +# databunch1 is the smallest +def calculate_Minowski_feature_space(databunch1, databunch2, model, batch_size = 100, p = 2): + print("Calculating Minowski distance of two samples of two datasets, p: ", p) + feature_extractor = get_feature_extractor(model) + tensorbunch1 = databunch_to_tensor(databunch1) + tensorbunch2 = databunch_to_tensor(databunch2) + # get the randomized batch indices + total_number_obs_1 = tensorbunch1.shape[0] + total_number_obs_2 = tensorbunch2.shape[0] + batch_indices_1 = generate_rand_bin_array(batch_size, total_number_obs_1) + batch_indices_2 = generate_rand_bin_array(batch_size, total_number_obs_2) + + # total number of observations of the smallest databunch + total_observations_bunch1 = tensorbunch1.shape[0] + # pick random observations for the batch + batch_tensors1 = tensorbunch1[batch_indices_1, :, :, :].to(device="cuda:0") + batch_tensors2 = tensorbunch2[batch_indices_2, :, :, :].to(device="cuda:0") + # extract its features + features_bunch1 = feature_extractor(batch_tensors1) + features_bunch2 = feature_extractor(batch_tensors2) + + sum_mses = [] + # one to all distance accumulation + for i in range(0, batch_size): + mse_i = calculate_Minowski_observation_min(features_bunch1[i], features_bunch2, p) + sum_mses += [mse_i.item()] + sum_mses_np = np.array(sum_mses) + # 
delete features to prevent gpu memory overflow + del features_bunch1 + del features_bunch2 + torch.cuda.empty_cache() + mse_mean_all_batch = sum_mses_np.mean() + # take one batch + return mse_mean_all_batch + + +def calculate_Minowski_observation(observation, tensorbunch, p=2): + # vectorize all the images in tensorbunch + # if it receives an image + # tensorbunch_vec = img[:].view(-1, tensorbunch.shape[1]*tensorbunch.shape[2]*tensorbunch.shape[3]) + observation_vec = observation.view(-1) + difference_bunch = tensorbunch - observation_vec + # for all observations in the bunch, calculate its euclidian proximity + # L2 Norm over columns + minowski_distances = torch.norm(difference_bunch, p, 1) + # choose mse or min? + minowski_distance = minowski_distances.sum() / len(minowski_distances) + return minowski_distance + + +def calculate_Minowski_observation_min(observation, tensorbunch, p = 2): + # vectorize all the images in tensorbunch + # if it receives an image + # tensorbunch_vec = img[:].view(-1, tensorbunch.shape[1]*tensorbunch.shape[2]*tensorbunch.shape[3]) + observation_vec = observation.view(-1) + # difference between bunch of tensors and the current observation to analyze + difference_bunch = tensorbunch - observation_vec + # for all observations in the bunch, calculate its euclidian proximity + # L2 Norm over columns + minowski_distances = torch.norm(difference_bunch, p, 1) + # choose mse or min + min_dist = minowski_distances.min() + return min_dist + + +def databunch_to_tensor(databunch1): + # tensor of tensor + tensor_bunch = torch.zeros(len(databunch1.train_ds), databunch1.train_ds[0][0].shape[0], + databunch1.train_ds[0][0].shape[1], databunch1.train_ds[0][0].shape[2], device="cuda:0") + for i in range(0, len(databunch1.train_ds)): + tensor_bunch[i, :, :, :] = databunch1.train_ds[i][0].data.to(device="cuda:0") + + return tensor_bunch + + +def get_feature_extractor(model): + global key + + path = untar_data(URLs.MNIST_SAMPLE) + data = ImageDataBunch.from_folder(path) + # save learner to reload it as a pytorch model + learner = Learner(data, model, metrics=[accuracy]) + learner.export('/media/Data/user/Code_Projects/OOD4SSDL/utilities/model/final_model_' + key + ".pk") + torch_dict = torch.load('/media/Data/user/Code_Projects/OOD4SSDL/utilities/model/final_model_' + key + ".pk") + # get the model + model_loaded = torch_dict["model"] + # put it on gpu! 
+ model_loaded = model_loaded.to(device="cuda:0") + # usually the last set of layers act as classifier, therefore we discard it + feature_extractor = model_loaded.features[:-1] + return feature_extractor + + +def dataset_distance_tester_pdf_half(path_bunch1 = "/media/Data/user/Datasets/CIFAR10_HALF/CIFAR10_60_50/", path_bunch2 = "/media/Data/user/Datasets/CIFAR10_HALF/CIFAR10_60_50/",ood_perc = 50, num_unlabeled = 3000, name_ood_dataset = "half", num_batches=1, size_image = 28, distance_func = distance.jensenshannon): + """ + Testing + :return: + """ + global key + key = "pdf" + print("Computing distance for dataset: ", name_ood_dataset) + model = models.WideResNet(num_groups=3, N=4, num_classes=10, k=2, start_nf=64) + dists_reference = [] + dists_bunch1_bunch2 = [] + dists_substracted = [] + for i in range(0, num_batches): + path_mnist_half_in_dist = path_bunch1 + "/batch" + str(i) + "/artifacts/inputs/labelled/" + print("INDIST: ", path_mnist_half_in_dist) + path_mnist_half_out_dist = path_bunch2 + "/batch" + str(i) + "/artifacts/inputs/unlabelled/" + print("outDIST: ", path_mnist_half_out_dist) + databunch1 = (ImageList.from_folder(path_mnist_half_in_dist) + .split_none() + .label_from_folder() + .transform(size=size_image) + .databunch()) + databunch2 = (ImageList.from_folder(path_mnist_half_out_dist) + .split_none() + .label_from_folder() + .transform(size=size_image) + .databunch()) + + + (dist_ref_i, std_ref) = calculate_pdf_dist_stats(databunch1, databunch1, model, batch_size=50, + distance_func=distance_func, num_batches=3) + dists_reference += [dist_ref_i] + print("Distance to itself (reference): ", dist_ref_i, " for batch: ", i) + (dist_between_bunches_i, dist_between_bunches_std) = calculate_pdf_dist_stats(databunch1, databunch2, model, batch_size=50, + distance_func=distance_func, num_batches=3) + dists_bunch1_bunch2 += [dist_between_bunches_i] + print("Distance between bunches: ", dist_between_bunches_i, " for batch:", i) + dists_substracted += [abs(dist_between_bunches_i - dist_ref_i)] + + + dist_between_bunches = np.mean(dists_substracted) + print("Distance between bunches: ", dist_between_bunches) + stat, p_value = scipy.stats.wilcoxon(dists_reference, dists_bunch1_bunch2, correction=True) + # means + dists_reference += [np.array(dists_reference).mean()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).mean()] + dists_substracted += [np.array(dists_substracted).mean()] + # stds are the last row + dists_reference += [np.array(dists_reference).std()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).std()] + dists_substracted += [np.array(dists_substracted).std()] + + header3 = 'Distance_substracted with p ' + str(p_value) + + dict_csv = {'Reference': dists_reference, + 'Distance': dists_bunch1_bunch2, + header3: dists_substracted + + } + dataframe = pd.DataFrame(dict_csv, columns=['Reference', 'Distance', header3]) + dataframe.to_csv( + '/media/Data/user/Code_Projects/OOD4SSDL/utilities/csv_distances_reports/' + name_ood_dataset + "ood_perc_" + str( + ood_perc) + '.csv', index=False, header=True) + + return dist_between_bunches + + +def dataset_distance_tester_pdf(path_bunch1 = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", path_bunch2 = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled/batch_",ood_perc = 100, num_unlabeled = 3000, name_ood_dataset = "in_dist", num_batches=10, size_image = 28, distance_func = distance.jensenshannon): + """ + Testing + :return: + """ + + global key + key = "pdf" + 
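# --- Illustrative aside (not part of the original diff): the per-dimension
# histogram distance that calculate_pdf_dist / calculate_distance_hists compute,
# reduced to plain NumPy/SciPy for two 1-D feature samples. The helper name
# `histogram_js_distance` is assumed for illustration only.
import numpy as np
from scipy.spatial import distance

def histogram_js_distance(values_a, values_b, bins=15):
    # shared bin edges so both histograms describe the same support
    hist_a, edges = np.histogram(values_a, bins=bins)
    hist_b, _ = np.histogram(values_b, bins=edges)
    # normalise counts into probability mass functions
    hist_a = hist_a / hist_a.sum()
    hist_b = hist_b / hist_b.sum()
    return distance.jensenshannon(hist_a, hist_b)

# usage sketch: histogram_js_distance(np.random.randn(80), np.random.randn(80) + 1.0)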
print("Computing distance for dataset: ", name_ood_dataset) + model = models.WideResNet(num_groups=3, N=4, num_classes=10, k=2, start_nf=64) + dists_reference = [] + dists_bunch1_bunch2 = [] + dists_substracted = [] + for i in range(0, num_batches): + path_mnist_half_in_dist = path_bunch1 + "/batch_" + str(i) + path_mnist_half_out_dist = path_bunch2 + str(i) + "/batch_" + str(i) + "_num_unlabeled_" + str(num_unlabeled) + "_ood_perc_" + str(ood_perc) + print("IN DIST PATH ", path_mnist_half_in_dist) + print("OUT DIST PATH ", path_mnist_half_out_dist) + + + + databunch1 = (ImageList.from_folder(path_mnist_half_in_dist) + .split_none() + .label_from_folder() + .transform(size=size_image) + .databunch()) + databunch2 = (ImageList.from_folder(path_mnist_half_out_dist) + .split_none() + + .label_from_folder() + .transform(size=size_image) + .databunch()) + + + (dist_ref_i, std_ref) = calculate_pdf_dist_stats(databunch1, databunch1, model, batch_size=80, + distance_func=distance_func, num_batches=3) + dists_reference += [dist_ref_i] + print("Distance to itself (reference): ", dist_ref_i, " for batch: ", i) + (dist_between_bunches_i, dist_between_bunches_std) = calculate_pdf_dist_stats(databunch1, databunch2, model, batch_size=80, + distance_func=distance_func, num_batches=3) + dists_bunch1_bunch2 += [dist_between_bunches_i] + print("Distance between bunches: ", dist_between_bunches_i, " for batch:", i) + dists_substracted += [abs(dist_between_bunches_i - dist_ref_i)] + + + + + dist_between_bunches = np.mean(dists_substracted) + print("Distance between bunches: ", dist_between_bunches) + stat, p_value = scipy.stats.wilcoxon(dists_reference, dists_bunch1_bunch2, correction = True) + #means + dists_reference += [np.array(dists_reference).mean()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).mean()] + dists_substracted += [np.array(dists_substracted).mean()] + # stds are the last row + dists_reference += [np.array(dists_reference).std()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).std()] + dists_substracted += [np.array(dists_substracted).std()] + + header3 = 'Distance_substracted with p ' + str(p_value) + + dict_csv = {'Reference': dists_reference, + 'Distance': dists_bunch1_bunch2, + header3: dists_substracted + + } + dataframe = pd.DataFrame(dict_csv, columns=['Reference', 'Distance', header3]) + dataframe.to_csv('/media/Data/user/Code_Projects/OOD4SSDL/utilities/csv_distances_reports/' + name_ood_dataset + "ood_perc_" + str(ood_perc) + '.csv', index=False, header=True) + + return dist_between_bunches + + + + #calculate distance + + dist2 = calculate_Minowski_feature_space_stats(databunch1, databunch2, model, batch_size=80, p=2, num_batches=3) + print("Distance MNIST in dist to MNIST out dist : ", dist2) + reference2 = calculate_Minowski_feature_space_stats(databunch1, databunch1, model, batch_size=80, p=2, num_batches=3) + print("Distance MNIST in dist to MNIST out dist (second): ", reference2) + + + +def dataset_distance_tester(path_bunch1 = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", path_bunch2 = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled/batch_",ood_perc = 100, num_unlabeled = 3000, name_ood_dataset = "in_dist", num_batches=10, size_image = 28, p = 2): + """ + Testing + :return: + """ + global key + key = "minkowski" + + + print("Computing distance for dataset: ", name_ood_dataset, " p: ", p, " ood: ", ood_perc) + model = models.WideResNet(num_groups=3, N=4, num_classes=10, k=2, start_nf=64) + dists_reference 
= [] + dists_bunch1_bunch2 = [] + dists_substracted = [] + for i in range(0, num_batches): + path_mnist_half_in_dist = path_bunch1 + str(i) + path_mnist_half_out_dist = path_bunch2 + str(i) + "/batch_" + str(i) + "_num_unlabeled_" + str(num_unlabeled) + "_ood_perc_" + str(ood_perc) + + databunch1 = (ImageList.from_folder(path_mnist_half_in_dist) + .split_none() + .label_from_folder() + .transform(size=size_image) + .databunch()) + databunch2 = (ImageList.from_folder(path_mnist_half_out_dist) + .split_none() + + .label_from_folder() + .transform(size=size_image) + .databunch()) + + #databunch1 = ImageDataBunch.from_folder(path_mnist_half_in_dist, ignore_empty=True) + #databunch2 = ImageDataBunch.from_folder(path_mnist_half_out_dist, ignore_empty=True) + (dist_ref_i, std_ref) = calculate_Minowski_feature_space_stats(databunch1, databunch1, model, batch_size=80, p=p, num_batches=3) + dists_reference += [dist_ref_i] + print("Distance to itself (reference): ", dist_ref_i, " for batch: ", i) + (dist_between_bunches_i, dist_between_bunches_std) = calculate_Minowski_feature_space_stats(databunch1, databunch2, model, batch_size=80, p=p, num_batches=3) + dists_bunch1_bunch2 += [dist_between_bunches_i] + print("Distance between bunches: ", dist_between_bunches_i, " for batch:", i) + dists_substracted += [abs(dist_between_bunches_i - dist_ref_i)] + + + + + dist_between_bunches = np.mean(dists_substracted) + print("Distance between bunches: ", dist_between_bunches) + stat, p_value = scipy.stats.wilcoxon(dists_reference, dists_bunch1_bunch2, correction = True) + #means + dists_reference += [np.array(dists_reference).mean()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).mean()] + dists_substracted += [np.array(dists_substracted).mean()] + # stds are the last row + dists_reference += [np.array(dists_reference).std()] + dists_bunch1_bunch2 += [np.array(dists_bunch1_bunch2).std()] + dists_substracted += [np.array(dists_substracted).std()] + + header3 = 'Distance_substracted with p ' + str(p_value) + + dict_csv = {'Reference': dists_reference, + 'Distance': dists_bunch1_bunch2, + header3: dists_substracted + + } + dataframe = pd.DataFrame(dict_csv, columns=['Reference', 'Distance', header3]) + dataframe.to_csv('/media/Data/user/Code_Projects/OOD4SSDL/utilities/csv_distances_reports/' + name_ood_dataset + "_p_" + str(p)+ "_OOD_perc_" + str(ood_perc) +'.csv', index=False, header=True) + + return dist_between_bunches + + + + #calculate distance + + dist2 = calculate_Minowski_feature_space_stats(databunch1, databunch2, model, batch_size=80, p=2, num_batches=3) + print("Distance MNIST in dist to MNIST out dist : ", dist2) + reference2 = calculate_Minowski_feature_space_stats(databunch1, databunch1, model, batch_size=80, p=2, num_batches=3) + print("Distance MNIST in dist to MNIST out dist (second): ", reference2) + + +def run_tests_minowski(p = 1, ood_perc = 50): + print("Calculating distance for IMAGENET TINY dataset") + dataset_distance_tester( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_IMAGENET/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet" + str(p), num_batches=10, p=p) + + print("Calculating distance for Gaussian dataset") + dataset_distance_tester( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + 
path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_Gaussian/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian", num_batches=10, p=p) + #salt and pepper + print("Calculating distance for Salt and pepper dataset") + dataset_distance_tester( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_Gaussian/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper", num_batches=10, p = p) + + print("Calculating distance for SVHN dataset") + dataset_distance_tester( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN" + str(p), num_batches=10, p=p) + + + + + + + +def run_tests_pdf(distance_str = "js", ood_perc = 50): + """ + + :param distance: distance_str + :return: + """ + print("DISTANCE STR: ", distance_str) + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + + print("Calculating " + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + print("Calculating " + distance_str + " distance for Salt and pepper dataset") + dataset_distance_tester_pdf( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_pdf_" + distance_str, num_batches=10, + distance_func=distance_func) + print("Calculating " + distance_str + " distance for SVHN dataset") + + + dataset_distance_tester_pdf( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN__pdf_" + distance_str, num_batches=10, distance_func=distance_func) + + print("Calculating " + distance_str + " distance for IMAGENET dataset") + dataset_distance_tester_pdf( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_IMAGENET/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="IMAGENET_pdf_" + distance_str, num_batches=10, + distance_func=distance_func) + + print("Calculating " + distance_str + " distance for HALF dataset") + dataset_distance_tester_pdf( + path_bunch1="/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_", + path_bunch2="/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_out_dist/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="half_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + + + + + + + + + + + +def generate_rand_bin_array(num_ones, length_array): + arr = 
np.zeros(length_array) + arr[:num_ones] = 1 + np.random.shuffle(arr) + bool_array = torch.tensor(array(arr.tolist(), dtype=bool)) + return bool_array + + +#run_tests_minowski(p = 1, ood_perc = 50) diff --git a/utilities/dataset_partitioner.py b/utilities/dataset_partitioner.py new file mode 100644 index 0000000..e821ec4 --- /dev/null +++ b/utilities/dataset_partitioner.py @@ -0,0 +1,519 @@ + +import torchvision +from shutil import copy2 +from sklearn.model_selection import train_test_split +from PIL import Image +import matplotlib +import re +import argparse +import logging +matplotlib.use('Agg') +import os +import random +# import copy +import ntpath +#OOD flag +OOD_LABEL = -1 +import numpy as np +import shutil +from torch.utils.data import DataLoader +from torch.utils.data.dataset import Dataset +from torch.utils.data.sampler import Sampler, SubsetRandomSampler +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler +import torch + +def create_parser(): + """ + Parser for the data partitioner + :return: + """ + parser = argparse.ArgumentParser(description='Dataset partitioner') + parser.add_argument('--mode', type=str, default="unlabeled_partitioner", + help='Options: 1. train_partitioner 2. unlabeled_partitioner ') + parser.add_argument('--batch_id_num', default=0, type=int, help='batch id number') + + #Options for the labeled/unlabeled data partitioner + parser.add_argument('--path_iod', type=str, default="", help='The directory with the IOD data') + parser.add_argument('--path_ood', type=str, default="", help='The directory with the OOD data') + parser.add_argument('--path_dest', type=str, default="", help='The destination directory') + parser.add_argument('--ood_perc', default=1.0, type=float, help='From 0 to 1') + parser.add_argument('--num_unlabeled', default=100, type=int, help='Number of unlabeled observations') + #Options for the train/test/ood partitioner + parser.add_argument('--path_base', type=str, default="", help='Base directory') + parser.add_argument('--list_in_dist_classes', type=str, default="", help='The List of in distribution classes') + parser.add_argument('--eval_perc', default=0.25, type=float, help='From 0 to 1') + return parser + +def parse_commandline_args(): + """ + Create the parser + :return: + """ + return create_parser().parse_args() + +def create_train_test_folder_partitions_ood_undersampled(datasetpath_base, percentage_evaluation=0.25, random_state=42, batch=0, create_dirs = True, classes_in_dist = []): + """ + Train and test partitioner + :param datasetpath_base: + :param percentage_used_labeled_observations: The percentage of the labeled observations to use from the 1 - percentage_evaluation + :param num_batches: total number of batches + :param create_dirs: + :param percentage_evaluation: test percentage of data + :return: + """ + #for the same batch, same result + random.seed(random_state + batch) + datasetpath_test = datasetpath_base + "/batches_labeled_undersampled_in_dist/batch_" + str(batch) + "/test/" + datasetpath_train = datasetpath_base + "/batches_labeled_undersampled_in_dist/batch_" + str(batch) + "/train/" + datasetpath_ood = datasetpath_base + "/batches_unlabeled_undersampled_out_dist/batch_" + str(batch) + "/" + + datasetpath_all = datasetpath_base + "/all" + print("All data path: ", datasetpath_all) + #read dataset + dataset = torchvision.datasets.ImageFolder(datasetpath_all) + #get filenames + list_file_names_and_labels = dataset.imgs + labels_temp = dataset.targets + list_file_names = [] + list_labels = [] + 
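# Editor's note: the loop below builds parallel lists of file paths and labels, plus a
# combined (file_path, label) list, so that a single shuffle in undersample_list keeps
# each file aligned with its label while the classes are undersampled.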
list_files_labels = [] + # list of file names and labels + for i in range(0, len(list_file_names_and_labels)): + file_name_path = list_file_names_and_labels[i][0] + list_file_names += [file_name_path] + list_labels += [labels_temp[i]] + #for random swapping + list_files_labels += [(file_name_path, labels_temp[i])] + #total number of classes + num_classes = len(np.unique(np.array(labels_temp))) + print("Total number of classes detected: ", num_classes) + #if no custom in distribution classes were chosen, take them all + if(classes_in_dist == []): + print("No out of distribution data required") + for i in range(0, num_classes): classes_in_dist += [i] + + #get the number of observations for the less represented class + array_labels = np.array(list_labels) + observations_under_class = len(array_labels[array_labels == 0]) + array_labels = np.array(list_labels) + #find the under represented class + for i in range(0, num_classes): + num_obs_class = len(array_labels[array_labels == i]) + if(num_obs_class < observations_under_class): + observations_under_class = num_obs_class + print("Under represented class with the following number of observations ", observations_under_class) + + + if (create_dirs): + # create the directories + try: + for i in classes_in_dist: + print("Creating test path: ", datasetpath_test + "/" + str(i)) + print("Creating training path: ", datasetpath_train + "/" + str(i)) + os.makedirs(datasetpath_test + "/" + str(i)) + os.makedirs(datasetpath_train + "/" + str(i)) + except: + print("Rewritting directories...") + + try: + for j in range(0, num_classes): + if(not j in in_dist_classes_list): + print("Creating OOD path: ", datasetpath_ood + "/" + str(j)) + os.makedirs(datasetpath_ood + "/" + str(j)) + except: + print("Rewritting directories...") + # test and train splitter for unlabeled and labeled data split + #for the same batch number, same results + + for curr_class in range(0, num_classes): + #get all the labels of the + + (list_files_class, list_labels_class) = undersample_list(observations_under_class, list_files_labels, curr_class) + print("Undersampled list of files size ", len(list_files_class), " for class ", curr_class) + print("Undersampled list of labels size ", len(list_labels_class), " for class ", curr_class) + + X_train, X_test, y_train, y_test = train_test_split(list_files_class, list_labels_class, test_size=percentage_evaluation, + random_state=random_state + batch) + print("Creating trainig partitioned folders...", len(X_train)) + for i in range(0, len(X_train)): + path_src = X_train[i] + # extract the file name + file_name = ntpath.basename(path_src) + label = y_train[i] + #if the label is among the in distribution selected label, copy it there + if(label in classes_in_dist): + #In distribution data + path_dest = datasetpath_train + str(label) + "/" + file_name + else: + #out distribution data + path_dest = datasetpath_ood + str(label) + "/" + file_name + + copy2(path_src, path_dest) + + print("Creating test partitioned folders...", len(X_test)) + for i in range(0, len(X_test)): + label = y_test[i] + # if the label is among the in distribution selected label, copy it there + if (label in classes_in_dist): + path_src = X_test[i] + file_name = ntpath.basename(path_src) + path_dest = datasetpath_test + str(y_test[i]) + "/" + file_name + copy2(path_src, path_dest) + +def undersample_list(observations_under_class, list_files_labels, curr_class): + random.shuffle(list_files_labels) + #load only the labels + array_labels = np.array([element[1] for element in 
list_files_labels]) + + + list_file_indices_class = (array_labels == curr_class) + list_files_class_selected = [] + list_labels_class_selected = [] + # get the file names of the class + number_added = 0 + # undersample the folders to the lowest num of observations per class + for index in range(0, len(list_file_indices_class)): + if (list_file_indices_class[index] and number_added < observations_under_class): + list_files_class_selected += [list_files_labels[index][0]] + list_labels_class_selected += [list_files_labels[index][1]] + number_added += 1 + + + print("Number of labels undersampled ", len(list_labels_class_selected)) + print("First file taken: ") + print(list_files_class_selected[0]) + return (list_files_class_selected, list_labels_class_selected) + +def create_train_test_folder_partitions_ood(datasetpath_base, percentage_evaluation=0.25, random_state=42, batch=0, create_dirs = True, classes_in_dist = []): + """ + Train and test partitioner + :param datasetpath_base: + :param percentage_used_labeled_observations: The percentage of the labeled observations to use from the 1 - percentage_evaluation + :param num_batches: total number of batches + :param create_dirs: + :param percentage_evaluation: test percentage of data + :return: + """ + + datasetpath_test = datasetpath_base + "/batches_labeled_in_dist/batch_" + str(batch) + "/test/" + datasetpath_train = datasetpath_base + "/batches_labeled_in_dist/batch_" + str(batch) + "/train/" + datasetpath_ood = datasetpath_base + "/batches_unlabeled_out_dist/batch_" + str(batch) + "/" + + datasetpath_all = datasetpath_base + "/all/" + print("All data path: ", datasetpath_all) + #read dataset + dataset = torchvision.datasets.ImageFolder(datasetpath_all) + #get filenames + list_file_names_and_labels = dataset.imgs + labels_temp = dataset.targets + list_file_names = [] + list_labels = [] + # list of file names and labels + for i in range(0, len(list_file_names_and_labels)): + file_name_path = list_file_names_and_labels[i][0] + list_file_names += [file_name_path] + list_labels += [labels_temp[i]] + #total number of classes + num_classes = len(np.unique(np.array(labels_temp))) + print("Total number of classes detected: ", num_classes) + #if no custom in distribution classes were chosen, take them all + if(classes_in_dist == []): + for i in range(0, num_classes): classes_in_dist += [i] + + + + if (create_dirs): + # create the directories + + + try: + for i in classes_in_dist: + print("Creating test path: ", datasetpath_test + "/" + str(i)) + print("Creating training path: ", datasetpath_train + "/" + str(i)) + os.makedirs(datasetpath_test + "/" + str(i)) + os.makedirs(datasetpath_train + "/" + str(i)) + except: + print("Rewritting directories...") + + try: + for j in range(0, num_classes): + if(not j in in_dist_classes_list): + print("Creating OOD path: ", datasetpath_ood + "/" + str(j)) + os.makedirs(datasetpath_ood + "/" + str(j)) + except: + print("Rewritting directories...") + # test and train splitter for unlabeled and labeled data split + #for the same batch number, same results + X_train, X_test, y_train, y_test = train_test_split(list_file_names, list_labels, test_size=percentage_evaluation, + random_state=random_state + batch) + print("Creating trainig partitioned folders...", len(X_train)) + for i in range(0, len(X_train)): + path_src = X_train[i] + # extract the file name + file_name = ntpath.basename(path_src) + label = y_train[i] + #if the label is among the in distribution selected label, copy it there + if(label in 
classes_in_dist): + #In distribution data + path_dest = datasetpath_train + str(label) + "/" + file_name + # print("COPY TO: " + path_dest) + else: + #out distribution data + path_dest = datasetpath_ood + str(label) + "/" + file_name + + copy2(path_src, path_dest) + + print("Creating test partitioned folders...", len(X_test)) + for i in range(0, len(X_test)): + label = y_test[i] + # if the label is among the in distribution selected label, copy it there + if (label in classes_in_dist): + path_src = X_test[i] + file_name = ntpath.basename(path_src) + path_dest = datasetpath_test + str(y_test[i]) + "/" + file_name + copy2(path_src, path_dest) + +def create_train_test_folder_partitions_simple(datasetpath_base, percentage_evaluation=0.25, random_state=42, batch=0, + create_dirs = True): + """ + Train and test partitioner + :param datasetpath_base: + :param percentage_used_labeled_observations: The percentage of the labeled observations to use from the 1 - percentage_evaluation + :param num_batches: total number of batches + :param create_dirs: + :param percentage_evaluation: test percentage of data + :return: + """ + datasetpath_test = datasetpath_base + "/batch_" + str(batch) + "/test/" + datasetpath_train = datasetpath_base + "/batch_" + str(batch) + "/train/" + datasetpath_all = datasetpath_base + "/all" + print("datasetpath_all") + print(datasetpath_all) + dataset = torchvision.datasets.ImageFolder(datasetpath_all) + list_file_names_and_labels = dataset.imgs + labels_temp = dataset.targets + list_file_names = [] + list_labels = [] + # list of file names and labels + for i in range(0, len(list_file_names_and_labels)): + file_name_path = list_file_names_and_labels[i][0] + list_file_names += [file_name_path] + list_labels += [labels_temp[i]] + + if (create_dirs): + # create the directories + print("Trying to create dir") + print(datasetpath_test) + os.makedirs(datasetpath_test) + print(datasetpath_test) + os.makedirs(datasetpath_train) + for i in range(0, 6): + os.makedirs(datasetpath_test + "/" + str(i)) + os.makedirs(datasetpath_train + "/" + str(i)) + + # test and train splitter for unlabeled and labeled data split + + X_train, X_test, y_train, y_test = train_test_split(list_file_names, list_labels, test_size=percentage_evaluation, + random_state=random_state) + print("Creating trainig partitioned folders...", len(X_train)) + for i in range(0, len(X_train)): + path_src = X_train[i] + # extract the file name + file_name = ntpath.basename(path_src) + path_dest = datasetpath_train + str(y_train[i]) + "/" + file_name + copy2(path_src, path_dest) + + print("Creating test partitioned folders...", len(X_test)) + for i in range(0, len(X_test)): + path_src = X_test[i] + file_name = ntpath.basename(path_src) + path_dest = datasetpath_test + str(y_test[i]) + "/" + file_name + copy2(path_src, path_dest) + +def create_folder_partitions_unlabeled_ood(iod_dataset_path, ood_dataset_path, dest_unlabeled_path_base, + total_unlabeled_obs=1000, ood_percentage=0.5, random_state=42, batch=0, + create_dirs=True): + """ + Create the folder partitions for unlabeled data repository, preserving the folder structure of train data + This MUST BE EXECUTED AFTER the training batches have been built + The OOD data is randomly copied among the training subfolders, given the folder structure used in the MixMatch FAST AI implementation + :param iod_dataset_path: + :param ood_dataset_path: + :param dest_unlabeled_path_base: We create the train folder, as the test folder is just copied from IOD folder (test is always In 
Distribution) + :param total_unlabeled_obs: + :param ood_percentage: percentage of out of distribution data + :param random_state: seed + :param batch: batch number id for the folder + :param create_dirs: create the necessary directories + :return: + """ + # read the data from the in distribution dataset train batch (the selected observations for unlabeled data will be deleted from there) + dataset = torchvision.datasets.ImageFolder(iod_dataset_path) + dataset_ood = torchvision.datasets.ImageFolder(ood_dataset_path) + # read the data path + list_file_names_in_dist_data = dataset.imgs + list_file_names_out_dist_data = dataset_ood.imgs + + in_dist_classes_list_all = os.listdir(iod_dataset_path) + print("NEW LABELS TEMP ", in_dist_classes_list_all) + + labels_temp_in_dist = dataset.targets + # init variables + list_in_dist_data = [] + list_out_dist_data = [] + # total number of iod observations + number_iod = int((1 - ood_percentage) * total_unlabeled_obs) + number_ood = int(ood_percentage * total_unlabeled_obs) + print("Reading and shuffling data...") + # list of file names and labels of in distribution data + #in_dist_classes_list_all = list(np.unique(np.array(labels_temp_in_dist))) + print("Total number of classes detected: ", len(in_dist_classes_list_all)) + print("List of in distribution classes: ", in_dist_classes_list_all) + #copy file name and labels to list in data + for i in range(0, len(list_file_names_in_dist_data)): + file_name_path = list_file_names_in_dist_data[i][0] + label_index = labels_temp_in_dist[i] + #we need to use the actual folder name and not the label index reported by pytorch ImageFolder + list_in_dist_data += [(file_name_path, in_dist_classes_list_all[label_index])] + + # list of files and labeles out distribution data + for i in range(0, len(list_file_names_out_dist_data)): + file_name_path = list_file_names_out_dist_data[i][0] + list_out_dist_data += [(file_name_path, OOD_LABEL)] + # shuffle the list and select the percentage of ood and iod data + random.seed(random_state + batch) + selected_iod_data = random.sample(list_in_dist_data, number_iod) + selected_ood_data = random.sample(list_out_dist_data, number_ood) + print("Number of selected iod observations") + print(len(selected_iod_data)) + print("Number of selected ood observations") + print(len(selected_ood_data)) + dest_unlabeled_path_batch = dest_unlabeled_path_base + "/batch_" + str(batch) + "_num_unlabeled_" + str( + total_unlabeled_obs) + "_ood_perc_" + str(int(100 * ood_percentage)) + "/" + if (create_dirs): + # create the directories + try: + print("Trying to create directories: ") + print(dest_unlabeled_path_batch) + os.makedirs(dest_unlabeled_path_batch) + except: + print("Could not create dir, already exists") + # copy the iid observations + print("Copying IOD data...") + for file_label in selected_iod_data: + path_src = file_label[0] + label = file_label[1] + final_dest = dest_unlabeled_path_batch + "/train/" + str(label) + "/" + try: + os.makedirs(final_dest) + except: + a = 0 + copy2(path_src, final_dest) + + # copy the ood observations + print("Copying OOD data randomly in the training folders...") + for file_label in selected_ood_data: + path_src = file_label[0] + random_label = random.sample(in_dist_classes_list_all, 1)[0] + file_name = os.path.basename(path_src) + _, file_extension = os.path.splitext(path_src) + try: + os.makedirs(dest_unlabeled_path_batch + "/train/" + str(random_label)) + except: + a = 0 + final_dest = dest_unlabeled_path_batch + "/train/" + str(random_label) + "/ood_" + 
file_name + file_extension + copy2(path_src, final_dest) + print("Copying the test folder to the unlabeled data destination... ") + iod_test_path = iod_dataset_path.replace("/train","") + "/test/" + print("From: ", iod_test_path) + print("To: ", dest_unlabeled_path_batch + "/test/") + shutil.copytree(iod_test_path, dest_unlabeled_path_batch + "/test/") + print("A total of ", len(selected_ood_data), " OOD observations were randomly added to the IOD train subfolders!") + return (number_iod, number_ood) + + +def unit_test_data_partitioner(): + """ + Tester of the partitioner + :return: + """ + DEFAULT_PATH_MNIST_1_5 = "/media/Data/user/Datasets/MNIST/OOD_datasets/Out_Distribution_Dataset/all" + DEFAULT_PATH_MNIST_0_4 = "/media/Data/user/Datasets/MNIST/OOD_datasets/In_Distribution_Datasets/In_Distribution_Dataset_1/batch_0" + ood_dataset_path = DEFAULT_PATH_MNIST_1_5 + in_dist_dataset_path = DEFAULT_PATH_MNIST_0_4 + "/train/" + dest_unlabeled_path_base = "/media/Data/user/Datasets/MNIST/OOD_datasets/Out_Distribution_Dataset/unlabeled" + create_folder_partitions_unlabeled_ood(in_dist_dataset_path, ood_dataset_path, dest_unlabeled_path_base, ood_percentage=1, total_unlabeled_obs=10000) + +def unit_test_partitioner_training_test(): + """ + Test for training and test partitioner + :return: + """ + DEFAULT_PATH_MNIST_0_4 = "/media/Data/user/Datasets/MNIST/OOD_datasets/In_Distribution_Datasets/In_Distribution_Dataset_1/batch_0" + + random_state_base = 42 + datasetpath_base = DEFAULT_PATH_MNIST_0_4 + for i in range(0, 2): + random_state_base += 1 + create_train_test_folder_partitions(datasetpath_base, percentage_evaluation=0.25, + random_state=random_state_base, batch=i) + + + + + +def get_mean_and_std(dataset): + """ + Compute the mean and std value of dataset. 
+ :param dataset: + :return: + """ + data_loader = torch.utils.data.DataLoader(dataset, num_workers= 5, pin_memory=True, batch_size =1) + + #init the mean and std + mean = torch.zeros(3) + std = torch.zeros(3) + print('==> Computing mean and std..') + k = 1 + for inputs, targets in data_loader: + #mean and std from the image + #print("Processing image: ", k) + for i in range(3): + mean[i] += inputs[:,i,:,:].mean() + std[i] += inputs[:,i,:,:].std() + k += 1 + + #normalize + mean.div_(len(dataset)) + std.div_(len(dataset)) + print("mean: " + str(mean)) + print("std: " + str(std)) + return mean, std + + +if __name__ == '__main__': + global args, is_colab + is_colab = False + args = parse_commandline_args() + + #use the arguments from cli + #Labeled/unlabeled data partitioner + ood_dataset_path = args.path_ood + in_dist_dataset_path = args.path_iod + dest_unlabeled_path_base = args.path_dest + if(args.mode.strip() == "unlabeled_partitioner"): + create_folder_partitions_unlabeled_ood(in_dist_dataset_path, ood_dataset_path, dest_unlabeled_path_base, ood_percentage=args.ood_perc, total_unlabeled_obs=args.num_unlabeled, batch = args.batch_id_num) + elif(args.mode.strip() == "train_partitioner_balanced"): + print("Train partitioner balanced") + create_train_test_folder_partitions_ood_undersampled(args.path_base, percentage_evaluation=args.eval_perc, random_state=42 + args.batch_id_num, batch=args.batch_id_num, create_dirs=True) + #Train/Test data partitioner + elif(args.mode.strip() == "train_partitioner"): + in_dist_classes_str = args.list_in_dist_classes + #assumes a string with the format '0, 0, 0, 11, 0, 0, 0, 0, 0, 19, 0, 9, 0, 0, 0, 0, 0, 0, 11' + if(in_dist_classes_str != ""): + in_dist_classes_list = [int(s) for s in in_dist_classes_str.split(',')] + else: + in_dist_classes_list = [] + + print("List in distribution classes ", in_dist_classes_list) + create_train_test_folder_partitions_ood(args.path_base, percentage_evaluation=args.eval_perc, random_state=42, batch=args.batch_id_num, create_dirs=True, classes_in_dist=in_dist_classes_list) diff --git a/utilities/noisy_ood_generator.py b/utilities/noisy_ood_generator.py new file mode 100644 index 0000000..5e69350 --- /dev/null +++ b/utilities/noisy_ood_generator.py @@ -0,0 +1,78 @@ +import numpy as np +import cv2 +import random + + +def create_gaussian_noise_images(path="", number_images = 20000, is_rgb = False): + """ + Create gaussian noise images + :param path: + :param number_images: + :param is_rgb: + :return: + """ + dimensions = 224 + mean = 0 + var = 10 + sigma = var ** 0.5 + + image_size = (dimensions, dimensions) + print("Writing images to: ", path ) + print("Number of images: ", number_images) + if(is_rgb): + image_size = (dimensions, dimensions, 3) + + for i in range(0, number_images): + noisy_image = np.random.normal(mean, sigma, image_size) + cv2.normalize(noisy_image, noisy_image, 0, 255, cv2.NORM_MINMAX, dtype=-1) + noisy_image = noisy_image.astype(np.uint8) + file_path_name = path + "/gaussian_" + str(i) + ".png" + cv2.imwrite(file_path_name, noisy_image) + print("Images written!") + +def sp_noise(image, prob = 0.05): + ''' + Add salt and pepper noise to image + prob: Probability of the noise + ''' + output = np.zeros(image.shape, np.uint8) + thres = 1 - prob + for i in range(image.shape[0]): + for j in range(image.shape[1]): + rdn = random.random() + if rdn < prob: + output[i][j] = 0 + elif rdn > thres: + output[i][j] = 255 + else: + output[i][j] = image[i][j] + return output + +def create_salt_pepper_noise_images(path="", 
number_images = 20000, is_rgb = False): + """ + Create salt and pepper noise images + :param path: + :param number_images: + :param is_rgb: + :return: + """ + dimensions = 224 + + image_size = (dimensions, dimensions) + print("Writing images with S&P noise to: ", path ) + print("Number of images: ", number_images) + if(is_rgb): + image_size = (dimensions, dimensions, 3) + + for i in range(0, number_images): + black_image = np.zeros(image_size) + noisy_image = sp_noise(black_image) + cv2.normalize(noisy_image, noisy_image, 0, 255, cv2.NORM_MINMAX, dtype=-1) + noisy_image = noisy_image.astype(np.uint8) + file_path_name = path + "/gaussian_" + str(i) + ".png" + cv2.imwrite(file_path_name, noisy_image) + print("Images written!") + + +#create_gaussian_noise_images(path="/media/Data/user/Datasets/GaussianNoiseImages/", number_images=20000) +create_salt_pepper_noise_images(path="/media/Data/user/Datasets/SaltAndPepper/", number_images=20000) diff --git a/utilities/run_context.py b/utilities/run_context.py new file mode 100644 index 0000000..d6b8f8e --- /dev/null +++ b/utilities/run_context.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. + +from datetime import datetime +from collections import defaultdict +import threading +import time +import logging +import os +import sys +from pandas import DataFrame +from collections import defaultdict +from csv import writer + +class TrainLog: + """Saves training logs in Pandas csvs""" + + INCREMENTAL_UPDATE_TIME = 300 + + def __init__(self, directory, name): + self.log_file_path = "{}/{}.csv".format(directory, name) + self._log = defaultdict(dict) + self._log_lock = threading.RLock() + self._last_update_time = time.time() - self.INCREMENTAL_UPDATE_TIME + + def record_single(self, step, column, value): + self._record(step, {column: value}) + + def record(self, step, col_val_dict): + self._record(step, col_val_dict) + + def save(self): + df = self._as_dataframe() + #df.to_msgpack(self.log_file_path, compress='zlib') + df.to_csv(self.log_file_path) + + def _record(self, step, col_val_dict): + with self._log_lock: + self._log[step].update(col_val_dict) + #if time.time() - self._last_update_time >= self.INCREMENTAL_UPDATE_TIME: + self._last_update_time = time.time() + self.save() + + def _as_dataframe(self): + with self._log_lock: + return DataFrame.from_dict(self._log, orient='index') + + +class RunContext: + """Creates directories and files for the run""" + def __init__(self, logging, args): + name_log_folder = args.log_folder + self.dateInfo = "{date:%Y-%m-%d_%H_%M_%S}".format(date=datetime.now()) + self.result_dir = ("../../{root}/" + self.dateInfo + "/").format(root = name_log_folder) + #transient dir contains the checkpoints, information logs and training logs + self.transient_dir = self.result_dir + "/logs/" + os.makedirs(self.result_dir) + os.makedirs(self.transient_dir) + logging.basicConfig(filename=self.transient_dir + "log_" + self.dateInfo + ".txt", level=logging.INFO, format='%(message)s') + self.LOG = logging.getLogger('main') + self.init_logger() + #creating log in log dir + self.LOG.info("Creating directories for execution: ") + self.LOG.info(self.result_dir) + self.LOG.info(self.transient_dir) + self.write_args_log(args) + + + def 
write_args_log(self, args): + self.LOG.info("List of parameters") + self.LOG.info(str(args)) + + def write_run_log(self, run_log_pandas, name_results_log): + name_run_log = self.transient_dir + "run_log_" + self.dateInfo + ".csv" + self.LOG.info("Writing run log to : " + name_run_log) + run_log_pandas.to_csv(name_run_log) + maximum_validation_acc = run_log_pandas['accuracy'].max() + minimum_train_loss = run_log_pandas['train_loss'].min() + self.LOG.info("Maximum accuracy yielded: " + str(maximum_validation_acc)) + self.LOG.info("Minimum training loss: " + str(minimum_train_loss)) + name_results_log = "../../summaries/" + name_results_log + new_row = [name_run_log, minimum_train_loss, maximum_validation_acc] + with open(name_results_log, 'a+', newline='') as write_obj: + # Create a writer object from csv module + csv_writer = writer(write_obj) + # Add contents of list as last row in the csv file + csv_writer.writerow(new_row) + self.LOG.info("Stats file written in: " + name_results_log) + write_obj.close() + + + def init_logger(self): + """ + Sets logging details + :return: + """ + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + handler.setFormatter(formatter) + self.LOG.addHandler(handler) + + def get_logger(self): + return self.LOG + + def create_train_log(self, name): + return TrainLog(self.transient_dir, name) + + def create_results_all_log(self,name, directory = "../logs/summary"): + return TrainLog(directory, name) diff --git a/utilities/scripts_CIFAR_10_distances.py b/utilities/scripts_CIFAR_10_distances.py new file mode 100644 index 0000000..512b4be --- /dev/null +++ b/utilities/scripts_CIFAR_10_distances.py @@ -0,0 +1,106 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester +#distance measurer for CIFAR 10 + +def run_tests_minkowski_cifar10_no_half(p = 1, ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/CIFAR10/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/CIFAR10" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ",path_base_unlabelled ) + + + + #Gaussian distance + print("Calculating p" + str(p) + " distance for Gaussian dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_p_" + str(p), num_batches=10, p = p) + + #Salt and pepper + print("Calculating p" + str(p) + " distance for Salt and pepper dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_p_" + str(p), num_batches=10, + p = p) + + #SVHN + print("Calculating p" + str(p) + " distance for SVHN dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN__p_" + str(p), num_batches = 10, p = p) + + #Imagenet + print("Calculating p" + str(p) + " distance for Imagenet dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + 
num_unlabeled=3000, name_ood_dataset="Imagenet_p_" + str(p), num_batches=10, p = p) + + +def run_tests_pdf_cifar10_no_half(distance_str = "js", ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/CIFAR10/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/CIFAR10" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ", path_base_unlabelled) + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #Gaussian distance + print("Calculating " + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + + #Salt and pepper + print("Calculating " + distance_str + " distance for Salt and pepper dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_pdf_" + distance_str, num_batches=10, + distance_func=distance_func) + + #SVHN + print("Calculating " + distance_str + " distance for SVHN dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN__pdf_" + distance_str, num_batches=10, distance_func=distance_func) + + #Imagenet + print("Calculating " + distance_str + " distance for Imagenet dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + +def run_cifar10_tests_pdf(): + run_tests_pdf_cifar10_no_half(distance_str="js", ood_perc=50) + run_tests_pdf_cifar10_no_half(distance_str="js", ood_perc=100) + run_tests_pdf_cifar10_no_half(distance_str="cosine", ood_perc=50) + run_tests_pdf_cifar10_no_half(distance_str="cosine", ood_perc=100) + +def run_cifar10_tests_p(): + run_tests_minkowski_cifar10_no_half(p = 1, ood_perc=50) + run_tests_minkowski_cifar10_no_half(p = 1, ood_perc=100) + run_tests_minkowski_cifar10_no_half(p = 2, ood_perc=50) + run_tests_minkowski_cifar10_no_half(p = 2, ood_perc=100) + + +run_cifar10_tests_pdf() diff --git a/utilities/scripts_FASHIONMNIST_distances.py b/utilities/scripts_FASHIONMNIST_distances.py new file mode 100644 index 0000000..d7b4b21 --- /dev/null +++ b/utilities/scripts_FASHIONMNIST_distances.py @@ -0,0 +1,107 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester +#distance measurer for CIFAR 10 + +def run_tests_minkowski_fashionmnist_no_half(p = 1, ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/FASHIONMNIST" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ",path_base_unlabelled ) + + + + #Gaussian distance + print("Calculating p" + str(p) + " distance for Gaussian dataset") + 
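# Editor's note: as in the CIFAR-10 script above, each call below passes a path prefix
# ending in ".../batch_"; dataset_distance_tester appends the batch index to the labelled
# prefix, and for the unlabelled bunch builds a suffix of the form
#   <prefix><i>/batch_<i>_num_unlabeled_<num_unlabeled>_ood_perc_<ood_perc>
# so the per-batch folders must already exist with exactly that naming.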
dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_p_" + str(p), num_batches=10, p = p) + + #Salt and pepper + print("Calculating p" + str(p) + " distance for Salt and pepper dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_p_" + str(p), num_batches=10, + p = p) + + #FASHION PRODUCT + print("Calculating p" + str(p) + " distance for FASHION PRODUCT dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_FASHIONPRODUCT/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="FASHIONPRODUCT__p_" + str(p), num_batches = 10, p = p) + + #Imagenet + print("Calculating p" + str(p) + " distance for Imagenet dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet_p_" + str(p), num_batches=10, p = p) + + +def run_tests_pdf_fashionmnist_no_half(distance_str = "js", ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/FASHIONMNIST" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ", path_base_unlabelled) + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #Gaussian distance + print("Calculating " + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + + #Salt and pepper + print("Calculating " + distance_str + " distance for Salt and pepper dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_pdf_" + distance_str, num_batches=10, + distance_func=distance_func) + + #SVHN + print("Calculating " + distance_str + " distance for FASHION PRODUCT dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_FASHIONPRODUCT/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="FASHIONPRODUCT__pdf_" + distance_str, num_batches=10, distance_func=distance_func) + + #Imagenet + print("Calculating " + distance_str + " distance for Imagenet dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + +def run_fashionmnist_tests_pdf(): + run_tests_pdf_fashionmnist_no_half(distance_str="js", ood_perc=50) + run_tests_pdf_fashionmnist_no_half(distance_str="js", ood_perc=100) + run_tests_pdf_fashionmnist_no_half(distance_str="cosine", ood_perc=50) + 
run_tests_pdf_fashionmnist_no_half(distance_str="cosine", ood_perc=100) + +def run_fashionmnist_tests_p(): + run_tests_minkowski_fashionmnist_no_half(p = 1, ood_perc=50) + run_tests_minkowski_fashionmnist_no_half(p = 1, ood_perc=100) + run_tests_minkowski_fashionmnist_no_half(p = 2, ood_perc=50) + run_tests_minkowski_fashionmnist_no_half(p = 2, ood_perc=100) + + + +run_fashionmnist_tests_pdf() diff --git a/utilities/scripts_HALF_distances_cifar10.py b/utilities/scripts_HALF_distances_cifar10.py new file mode 100644 index 0000000..ac8ac6a --- /dev/null +++ b/utilities/scripts_HALF_distances_cifar10.py @@ -0,0 +1,63 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester + +#distance measurer for CIFAR 10 + + + + + + +def run_tests_pdf_cifar10_half(distance_str = "js", ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/CIFAR10/batches_labeled_in_dist_60" + path_base_unlabelled = "/media/Data/user/Datasets/CIFAR10/batches_unlabeled_HALF_60" + + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #HALF + print("Calculating " + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_pdf_" + distance_str, num_batches=9, distance_func = distance_func) + + +def run_tests_minkowski_cifar10_half(p = 1, ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/CIFAR10/batches_labeled_in_dist_60" + path_base_unlabelled = "/media/Data/user/Datasets/CIFAR10/batches_unlabeled_HALF_60" + + + + #HALF + print("Calculating p" + str(p) + " distance for half dataset") + + dataset_distance_tester( + path_bunch1=path_labelled+ "/batch_", + path_bunch2=path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_p_" + str(p), num_batches=9, p=p) + +def run_cifar10_tests_p(ood_perc = 50): + run_tests_minkowski_cifar10_half(p=1, ood_perc = ood_perc) + run_tests_minkowski_cifar10_half(p=2, ood_perc = ood_perc) + +def run_cifar10_tests_pdf(ood_perc = 50): + run_tests_pdf_cifar10_half(distance_str="js", ood_perc = ood_perc) + run_tests_pdf_cifar10_half(distance_str="cosine", ood_perc = ood_perc) + + + + +run_cifar10_tests_p(ood_perc = 50) +run_cifar10_tests_pdf(ood_perc = 50) diff --git a/utilities/scripts_HALF_distances_fashionmnist.py b/utilities/scripts_HALF_distances_fashionmnist.py new file mode 100644 index 0000000..636b034 --- /dev/null +++ b/utilities/scripts_HALF_distances_fashionmnist.py @@ -0,0 +1,63 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester + +#distance measurer for CIFAR 10 + + + + + + +def run_tests_pdf_FASHIONMNIST_half(distance_str = "js", ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_labeled_in_dist_60" + path_base_unlabelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_unlabeled_HALF_60" + + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #HALF + print("Calculating 
" + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + + +def run_tests_minkowski_FASHIONMNIST_half(p = 1, ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_labeled_in_dist_60" + path_base_unlabelled = "/media/Data/user/Datasets/FASHIONMNIST/batches_unlabeled_HALF_60" + + + + #HALF + print("Calculating p" + str(p) + " distance for half dataset") + + dataset_distance_tester( + path_bunch1=path_labelled+ "/batch_", + path_bunch2=path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_p_" + str(p), num_batches=10, p=p) + +def run_FASHIONMNIST_tests_p(ood_perc = 100): + run_tests_minkowski_FASHIONMNIST_half(p=1, ood_perc = ood_perc) + run_tests_minkowski_FASHIONMNIST_half(p=2, ood_perc = ood_perc) + +def run_FASHIONMNIST_tests_pdf(ood_perc = 100): + run_tests_pdf_FASHIONMNIST_half(distance_str="js", ood_perc = ood_perc) + #run_tests_pdf_FASHIONMNIST_half(distance_str="cosine", ood_perc = ood_perc) + + + + +#run_FASHIONMNIST_tests_p(ood_perc = 50) +run_FASHIONMNIST_tests_pdf(ood_perc = 50) diff --git a/utilities/scripts_HALF_distances_mnist.py b/utilities/scripts_HALF_distances_mnist.py new file mode 100644 index 0000000..3692380 --- /dev/null +++ b/utilities/scripts_HALF_distances_mnist.py @@ -0,0 +1,63 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester + +#distance measurer for CIFAR 10 + + + + + + +def run_tests_pdf_mnist_half(distance_str = "js", ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist" + path_base_unlabelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_HALF" + + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #HALF + print("Calculating " + distance_str + " distance for MNIST HALF dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_pdf_" + distance_str, num_batches=9, distance_func = distance_func) + + +def run_tests_minkowski_mnist_half(p = 1, ood_perc = 100): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist" + path_base_unlabelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_HALF" + + + + #HALF + print("Calculating p" + str(p) + " distance for MNIST HALF dataset") + + dataset_distance_tester( + path_bunch1=path_labelled+ "/batch_", + path_bunch2=path_base_unlabelled + "/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="HALF_OOD_p_" + str(p), num_batches=9, p=p) + +def run_mnist_tests_p(ood_perc = 50): + run_tests_minkowski_mnist_half(p=1, ood_perc = ood_perc) + run_tests_minkowski_mnist_half(p=2, ood_perc = ood_perc) + +def run_mnist_tests_pdf(ood_perc = 50): + run_tests_pdf_mnist_half(distance_str="js", ood_perc = ood_perc) + run_tests_pdf_mnist_half(distance_str="cosine", ood_perc = ood_perc) + + 
+ + +run_mnist_tests_p(ood_perc = 50) +run_mnist_tests_pdf(ood_perc = 50) diff --git a/utilities/scripts_MNIST_distances.py b/utilities/scripts_MNIST_distances.py new file mode 100644 index 0000000..4374939 --- /dev/null +++ b/utilities/scripts_MNIST_distances.py @@ -0,0 +1,106 @@ +from scipy.spatial import distance +from dataset_distance_measurer import dataset_distance_tester_pdf +from dataset_distance_measurer import dataset_distance_tester +#distance measurer for CIFAR 10 + +def run_tests_minkowski_MNIST_medium_complete_no_half(p = 1, ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/MNIST_medium_complete" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ",path_base_unlabelled ) + + + + #Gaussian distance + print("Calculating p" + str(p) + " distance for Gaussian dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_p_" + str(p), num_batches=10, p = p) + + #Salt and pepper + print("Calculating p" + str(p) + " distance for Salt and pepper dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_p_" + str(p), num_batches=10, + p = p) + + #SVHN + print("Calculating p" + str(p) + " distance for SVHN dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN__p_" + str(p), num_batches = 10, p = p) + + #Imagenet + print("Calculating p" + str(p) + " distance for Imagenet dataset") + dataset_distance_tester( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet_p_" + str(p), num_batches=10, p = p) + + +def run_tests_pdf_MNIST_medium_complete_no_half(distance_str = "js", ood_perc = 50): + """ + :param distance: distance_str + :return: + """ + path_labelled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_" + path_base_unlabelled = "/media/Data/user/Datasets/MNIST_medium_complete" + print("Path labelled: ", path_labelled) + print("Path unlabelled: ", path_base_unlabelled) + if(distance_str == "js"): + distance_func = distance.jensenshannon + elif(distance_str == "cosine"): + distance_func = distance.cosine + + #Gaussian distance + print("Calculating " + distance_str + " distance for Gaussian dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_Gaussian/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Gaussian_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + + #Salt and pepper + print("Calculating " + distance_str + " distance for Salt and pepper dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled +"/batches_unlabeled_SaltAndPepper/batch_", + ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SaltAndPepper_pdf_" + distance_str, num_batches=10, + distance_func=distance_func) + + #SVHN + print("Calculating " + distance_str + " distance for 
SVHN dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2= path_base_unlabelled + "/batches_unlabeled_SVHN/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="SVHN__pdf_" + distance_str, num_batches=10, distance_func=distance_func) + + #Imagenet + print("Calculating " + distance_str + " distance for Imagenet dataset") + dataset_distance_tester_pdf( + path_bunch1=path_labelled, + path_bunch2=path_base_unlabelled+"/batches_unlabeled_IMAGENET/batch_", ood_perc=ood_perc, + num_unlabeled=3000, name_ood_dataset="Imagenet_pdf_" + distance_str, num_batches=10, distance_func = distance_func) + +def run_MNIST_medium_complete_tests_pdf(): + run_tests_pdf_MNIST_medium_complete_no_half(distance_str="js", ood_perc=50) + run_tests_pdf_MNIST_medium_complete_no_half(distance_str="js", ood_perc=100) + run_tests_pdf_MNIST_medium_complete_no_half(distance_str="cosine", ood_perc=50) + run_tests_pdf_MNIST_medium_complete_no_half(distance_str="cosine", ood_perc=100) + + +def run_MNIST_medium_complete_tests_p(): + run_tests_minkowski_MNIST_medium_complete_no_half(p = 1, ood_perc=50) + run_tests_minkowski_MNIST_medium_complete_no_half(p = 1, ood_perc=100) + run_tests_minkowski_MNIST_medium_complete_no_half(p = 2, ood_perc=50) + run_tests_minkowski_MNIST_medium_complete_no_half(p = 2, ood_perc=100) + +run_MNIST_medium_complete_tests_pdf() diff --git a/utilities/test_generator.py b/utilities/test_generator.py new file mode 100644 index 0000000..dc4d1d1 --- /dev/null +++ b/utilities/test_generator.py @@ -0,0 +1,234 @@ + +from shutil import copy2 +import matplotlib +import re +import argparse +import logging +import random +import cli + +from dataset_partitioner import create_parser + + +def create_training_test_datasets_shell_script(script_name_path, num_batches, args_parser, name_program_py): + """ + Generic creator of traing/test/ood partitioner script + :param dataset_all_path: + :param dataset_name: + :param script_name: + :param num_batches: + :param args: + :return: + """ + file = open(script_name_path, 'w') + #create a line for excecuting the program with a different batch + for i in range(0, num_batches): + #randomly take 5 of the 10 MNIST classes for in dist data (used to train the model) + list_in_dist_classes = random.sample([0,1,2,3,4,5,6,7,8,9], 5) + list_in_dist_classes_str = str(list_in_dist_classes).replace("[", "").replace("]", "").replace(",", "$") + print(list_in_dist_classes_str) + args_parser.list_in_dist_classes = list_in_dist_classes_str + #the new batch id + args_parser.batch_id_num = i + print("args_parser.batch_id_num ", args_parser.batch_id_num ) + print(args_parser) + args_str_no_pp = str(args_parser) + args_str = create_args_string(args_str_no_pp, name_program_py) + if(i < num_batches - 1): + file.write(args_str + "\n") + else: + file.write(args_str) + file.close() + +def create_mixmatch_fully_supervised_shell_script(script_name_path, num_batches, args_parser, name_program_py): + """ + Generic creator of traing/test/ood partitioner script + :param dataset_all_path: + :param dataset_name: + :param script_name: + :param num_batches: + :param args: + :return: + """ + file = open(script_name_path, 'w') + name_program_py += " " + #create a line for excecuting the program with a different batch + for i in range(0, num_batches): + #the new batch id + temp_str = args_parser.path_labeled + #avoid multiple number concats + args_parser.path_labeled += str(i) + args_str_no_pp = str(args_parser) + args_str = create_args_string(args_str_no_pp, 
+        print("Adding to script: ", args_str)
+        args_parser.path_labeled = temp_str
+        if(i < num_batches - 1):
+            file.write(args_str + "\n")
+        else:
+            file.write(args_str)
+    file.close()
+
+
+def create_mixmatch_ssdl_shell_script(script_name_path, num_batches, args_parser, name_program_py, ood_perc, num_unlabeled, use_unlabeled_external):
+    """
+    Writes a shell script that runs semi-supervised (SSDL) MixMatch training once per batch.
+    :param script_name_path: path of the .sh file to create
+    :param num_batches: number of batches (one script line per batch)
+    :param args_parser: argparse Namespace with the training arguments
+    :param name_program_py: path of the Python program to invoke
+    :param ood_perc: percentage of out-of-distribution data in the unlabeled batches
+    :param num_unlabeled: number of unlabeled observations per batch
+    :param use_unlabeled_external: whether to point path_unlabeled to the external unlabeled batches
+    :return:
+    """
+    args_parser.mode = "ssdl"
+    file = open(script_name_path, 'w')
+    name_program_py += " "
+    #create a line for executing the program with a different batch
+    for i in range(0, num_batches):
+        #remember the base paths; the batch details are appended only for this iteration
+        temp_str = args_parser.path_labeled
+        temp_str2 = args_parser.path_unlabeled
+        args_parser.path_labeled += str(i)
+        #append the unlabeled batch details (size and OOD percentage) to the unlabeled path
+        if(use_unlabeled_external):
+            args_parser.path_unlabeled += str(i) + "/batch_" + str(i) + "_num_unlabeled_" + str(num_unlabeled) + "_ood_perc_" + str(int(100 * ood_perc))
+        args_str_no_pp = str(args_parser)
+        args_str = create_args_string(args_str_no_pp, name_program_py)
+        print("Adding to script: ", args_str)
+        args_parser.path_labeled = temp_str
+        args_parser.path_unlabeled = temp_str2
+        if(i < num_batches - 1):
+            file.write(args_str + "\n")
+        else:
+            file.write(args_str)
+    file.close()
+
+
+def create_partitioner_unlabeled_script_MNIST(num_unlabeled, ood_perc, num_batches):
+    """
+    Writes the shell script that builds the unlabeled MNIST batches by mixing in-distribution and OOD data.
+    """
+    script_name_path = "../shell_scripts/unlabeled_ood_partitioner_ood4ssdl_MNIST_" + str(num_batches) + "_num_unlabeled_" + str(num_unlabeled) + "_ood_perc_" + str(ood_perc) + ".sh"
+    args_parser = create_parser().parse_args([])
+    args_parser.mode = "unlabeled_partitioner"
+    name_program_py = "../utilities/dataset_partitioner.py "
+    args_parser.path_ood = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled_out_dist/batch_"
+    #base destination path
+    args_parser.path_dest = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled/batch_"
+    #OOD percentage
+    args_parser.ood_perc = ood_perc
+    args_parser.path_iod = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_"
+    #number of unlabeled observations
+    args_parser.num_unlabeled = num_unlabeled
+
+    file = open(script_name_path, 'w')
+
+    for curr_batch in range(0, num_batches):
+        args_parser.batch_id_num = curr_batch
+
+        #remember the base paths; the batch index is appended only for this iteration
+        temp_str_dest = args_parser.path_dest
+        args_parser.path_dest += str(curr_batch)
+        temp_str_ood = args_parser.path_ood
+        args_parser.path_ood += str(curr_batch)
+        temp_str_iod = args_parser.path_iod
+        args_parser.path_iod += str(curr_batch) + "/train"
+        #print("PATH IOD ", args_parser.path_iod)
+        args_str_no_pp = str(args_parser)
+        args_str = create_args_string(args_str_no_pp, name_program_py)
+        args_parser.path_dest = temp_str_dest
+        args_parser.path_ood = temp_str_ood
+        args_parser.path_iod = temp_str_iod
+
+        print("Adding to script: ", args_str)
+        if (curr_batch < num_batches - 1):
+            file.write(args_str + "\n")
+        else:
+            file.write(args_str)
+
+    file.close()
+
+def create_partitioner_trainer_script_MNIST():
+    """
+    Writes the training/test partitioner shell script for MNIST.
+    :return:
+    """
+    script_name_path = "../shell_scripts/training_test_ood_partitioner_ood4ssdl_MNIST.sh"
+    args_parser = create_parser().parse_args([])
+    args_parser.mode = "train_partitioner"
+    args_parser.path_base = "/media/Data/user/Datasets/MNIST_medium_complete/"
+    args_parser.eval_perc = 0.25
+    num_batches = 10
+    #the train/test partitioner is also driven by dataset_partitioner.py (same program as the unlabeled partitioner above)
+    name_program_py = "../utilities/dataset_partitioner.py "
+    create_training_test_datasets_shell_script(script_name_path, num_batches, args_parser, name_program_py)
+
+def create_args_string(args_namespace_str, program_name):
+    """
+    Turns the repr of an argparse Namespace into a "python <program> --arg value ..." command line
+    suitable for writing into an .sh file.
+    :param args_namespace_str: str(Namespace(...)) holding the program arguments
+    :param program_name: path of the Python program to invoke
+    :return: the command line string
+    """
+    #equals signs must be eliminated
+    args_namespace_str = args_namespace_str.replace("=", " ")
+    args_namespace_str = args_namespace_str.replace("Namespace(", "--").replace(")", "")
+    args_namespace_str = args_namespace_str.replace(", ", " --").replace("$", ",")
+    args_str = "python " + program_name + args_namespace_str
+    return args_str
+
+def create_test_scripts_MNIST_fully_supervised():
+    """
+    Creates the shell script for fully supervised MixMatch training on the labeled MNIST batches.
+    :return:
+    """
+    script_name_path = "../shell_scripts/mixmatch_train_fully_supervised_MNIST.sh"
+    args_parser = cli.create_parser().parse_args([])
+    args_parser.model = "wide_resnet"
+    args_parser.path_labeled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_"
+    args_parser.results_file_name = "stats_fully_supervised_MNIST.csv"
+    args_parser.lr = 2e-4
+    args_parser.weight_decay = 1e-4
+    args_parser.mode = "fully_supervised"
+    args_parser.num_classes = 5
+    args_parser.size_image = 28
+    args_parser.batch_size = 32
+    args_parser.log_folder = "logs_MNIST_supervised"
+    args_parser.norm_stats = "MNIST"
+    args_parser.epochs = 30
+    num_batches = 10
+    name_program_py = "../MixMatch_OOD_main.py"
+    create_mixmatch_fully_supervised_shell_script(script_name_path, num_batches, args_parser, name_program_py)
+
+
+def create_test_scripts_MNIST_semi_supervised(use_unlabeled_external, num_unlabeled, ood_perc):
+    """
+    Creates the shell script for semi-supervised (SSDL) MixMatch training on the MNIST batches.
+    :param use_unlabeled_external: whether to use the external unlabeled batches
+    :param num_unlabeled: number of unlabeled observations per batch
+    :param ood_perc: percentage of out-of-distribution data in the unlabeled batches
+    :return:
+    """
+    script_name_path = "../shell_scripts/mixmatch_train_semi_supervised_MNIST_use_external_" + str(use_unlabeled_external) + "_num_unlabeled_" + str(num_unlabeled) + "_ood_perc_" + str(ood_perc) + ".sh"
+    args_parser = cli.create_parser().parse_args([])
+    args_parser.model = "wide_resnet"
+    args_parser.path_labeled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_labeled_in_dist/batch_"
+    args_parser.path_unlabeled = ""
+    if(use_unlabeled_external):
+        args_parser.path_unlabeled = "/media/Data/user/Datasets/MNIST_medium_complete/batches_unlabeled/batch_"
+
+    args_parser.results_file_name = "stats_semi_supervised_MNIST.csv"
+    args_parser.lr = 2e-4
+    args_parser.weight_decay = 1e-4
+    args_parser.mode = "ssdl"
+    args_parser.num_classes = 5
+    args_parser.size_image = 28
+    args_parser.batch_size = 32
+    args_parser.log_folder = "logs_MNIST_semi_supervised"
+    args_parser.norm_stats = "MNIST"
+    args_parser.epochs = 30
+    num_batches = 10
+    name_program_py = "../MixMatch_OOD_main.py"
+    create_mixmatch_ssdl_shell_script(script_name_path, num_batches, args_parser, name_program_py, ood_perc, num_unlabeled, use_unlabeled_external)
+
+def test_create_test_scripts_MNIST_semi_supervised():
+    create_test_scripts_MNIST_semi_supervised(use_unlabeled_external = True, num_unlabeled = 15000, ood_perc = 0.5)
+
+def test_create_unlabeled_script_MNIST():
+    create_partitioner_unlabeled_script_MNIST(num_unlabeled = 15000, ood_perc = 0.5, num_batches = 10)
+
+test_create_test_scripts_MNIST_semi_supervised()
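+
+# A minimal, illustrative sketch (not wired into the scripts above): it shows the shape of the
+# command line that create_args_string() builds from an argparse Namespace. The Namespace fields
+# below are placeholders chosen for the example, not the project's real CLI arguments.
+def _example_create_args_string():
+    import argparse
+    example_args = argparse.Namespace(batch_size=32, lr=2e-4, mode="ssdl")
+    # Namespace's repr lists attributes alphabetically, so str(example_args) is
+    # "Namespace(batch_size=32, lr=0.0002, mode='ssdl')"
+    command = create_args_string(str(example_args), "../MixMatch_OOD_main.py ")
+    # command == "python ../MixMatch_OOD_main.py --batch_size 32 --lr 0.0002 --mode 'ssdl'"
+    return command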