base scripts paper

luisoala · Jun 13, 2020 · c45f3b4 · c45f3b4
commit c45f3b4
Show file tree

Hide file tree

Showing 16 changed files with 3,090 additions and 0 deletions.
diff --git a/MixMatch_OOD_main.py b/MixMatch_OOD_main.py
diff --git a/ood_experiment_at_scale_script.sh b/ood_experiment_at_scale_script.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+#
+#EXPERIMENT CONFIGS
+#DATA
+#DLIDS are the "download ids" of the gdrive for each data set
+export CUDA_VISIBLE_DEVICES=1
+declare -A DATA_DLIDS=( ["MNIST"]="10pULG3xRIkl5tDo6VJHUK6NHIguMT7Jq" ["FASHIONMNIST"]="11AJ-OEgtj7XDeLHPVEyWuvHzFIIadj9_" ["CIFAR10"]="1O6uarg54CwtZ3h_B6YzD1KW9nbQ-E7Xl" ["TINYIMAGENET"]="10i1FV1SgXxMWfgTEpAXr0qe7q7s6Ko6e" ["SVHN"]="1wgTQJOtGxWPLNKPMuDc7vzYejYnWJsVr" ["SVHN-different"]="10HbYSMt3CbHeieUBqO675eNpAef-UY3_" ["GaussianNoise"]="1GXljou_EJGcdfVsfVJiVMVo7-RKNu106" ["SALTANDPEPPER"]="1iBwKyR7M4_ca2Ti7xW5FJ-BNdqEDG-vK" ["FASHIONPRODUCT"]="1zN1BF1u1SJl81JpH6hexYgrVvivaGxa5")
+#FPATHS are the file paths for the data sets in the local working directory when doing experiments
+declare -A DATA_FPATHS=(["MNIST"]="data/MNIST/" ["FASHIONMNIST"]="data/FASHIONMNIST/" ["CIFAR10"]="data/CIFAR10/" ["TINYIMAGENET"]="data/TINYIMAGENET/" ["SVHN-different"]="data/SVHN-different/" ["GaussianNoise"]="data/GaussianNoise/" ["SALTANDPEPPER"]="data/SALTANDPEPPER/" ["SVHN"]="data/SVHN/" ["FASHIONPRODUCT"]="data/FASHIONPRODUCT/")
+
+BASE_DATA="CIFAR10" #the base data that is used for the iod data. around it which the experiment is centered
+DIFFERENT_DATA="MNIST" #array of datasets used for the ood setting "different" -> contrasting data sets
+
+OOD_PERC_PP_LIST=(0 50 100) #the ood percentage in percentage points
+NUM_UNLABELED=3000 #should be 3000
+MIN_CLASS_ID=0 #the lowest class id for the classes in the data set
+MAX_CLASS_ID=9 #the highest class id for the classes of the data set
+NUM_CLASSES_IN_DIST=5 #the number of classes to select for the in dist class
+NUM_LABELED_LIST=(60 100 150)
+OOD_TYPE="different" #can be "half-half" (ood samples come from same dataset but are a subset of the classes) or "different" (ood samples come from different dataset)
+#NOTE: seed for shuf of iod class permuations is given by batch id on run level
+#
+#RUN CONFIGS (a run is an iteration of teh experiment with one of the random data batches)
+BATCHES=(0 1 2 3 4 5 6 7 8 9) #the batch id used for the different runs of the experiment
+#
+#MIX MATCH ALGO CONFIGS
+MODEL="wide_resnet"
+DATASET="CIFAR10-BASELINE"
+RESULTS_FILE="stats_OOD_4_SSDL.csv"
+WORKERS="1"
+EPOCHS="50" #should be 50
+BATCH_SIZE="16" #should be 16
+LR="0.0002"
+WEIGHT_DECAY="0.0001"
+K_TRANSFORMS="2"
+T_SHARPENING="0.25"
+ALPHA_MIX="0.75"
+MODE="ssdl"
+BALANCED="5" #int -1 no bal, 5 bal
+GAMMA_US="25" #the gamma for the unsupervised loss
+IMG_SIZE="32"
+NORM_STATS="MNIST" #is not used
+#
+LOG_FOLDER="logs"
+SAVE_WEIGHTS="FALSE"
+WEIGHTS_PATH=""
+RAMPUP_COEFFICIENT="3000"
+#
+N=10 #number of parallel processes
+#DOWNLOAD DATA
+#steps
+#create data dir
+mkdir data
+#cd insto data dir
+cd data
+
+#do it for base data
+##downlaod file
+###get file id
+FILEID="${DATA_DLIDS["${BASE_DATA}"]}"
+###download
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${BASE_DATA}.zip && rm -rf /tmp/cookies.txt
+##unzip
+unzip ${BASE_DATA}.zip
+##remove zip
+rm ${BASE_DATA}.zip
+
+#do it for different data
+###get file id
+FILEID="${DATA_DLIDS["${DIFFERENT_DATA}"]}"
+###download
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${DIFFERENT_DATA}.zip && rm -rf /tmp/cookies.txt
+##unzip
+unzip ${DIFFERENT_DATA}.zip
+##remove zip
+rm ${DIFFERENT_DATA}.zip
+#when done cd .. back to work dir
+cd ..
+
+#EXPERIMENTS
+PATH_BASE="${DATA_FPATHS["${BASE_DATA}"]}" #path to the dataset that will be used for the experiments and construction of the train and val sets. assumes at the end of path exists another directory /all/ which contains subdirectories with id name of class and then each containing the samples of this class
+PATH_DIFFERENT="${DATA_FPATHS["${DIFFERENT_DATA}"]}"
+
+train_batch(){
+	local batch_id=$1
+	local OOD_PERC=$2
+	local OOD_PERC_PP=$3
+	local NUM_UNLABELED=$4
+	local IDS=$5
+	local LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data
+
+	local PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
+	local PATH_UNLABELED="$PATH_BASE/unlabeled/batch_${batch_id}_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}"
+
+	python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED  --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr  $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT
+}
+
+
+
+if [ $OOD_TYPE = "half-half" ]
+then
+	for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data
+	do
+		for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data
+		do
+			OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #go from percentage points to decimals for OOD percentage, i.e. e.g. 66 -> 0.66, for the second python command
+			#first iteration over batches creates the data
+			IDS_LIST=() #keep track of the class ids per batch for running mixmatch later
+			for batch_id in ${BATCHES[@]}
+			do
+				IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) # --random-source=<(echo $batch_id)) #randomly select class ids for in-dist data, random seed is provided by batch_id for reproducability
+				IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training
+				LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data
+
+				python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES
+
+				python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_BASE/batches_unlabeled_out_dist/batch_$batch_id" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id"
+
+			done
+
+			#the second iteration over batches runs mixmatch in parallel
+			for ((i=0;i<${#BATCHES[@]};++i))
+			do
+				((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files
+				train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "${IDS_LIST[i]}"&
+			done
+			wait
+			#)
+			#do clean up prior to the next experiment
+			rm -r $PATH_BASE/unlabeled/
+			rm -r $PATH_BASE/batches_labeled_in_dist/
+			rm -r $PATH_BASE/batches_unlabeled_out_dist/
+		done
+	done
+elif [[ $OOD_TYPE -eq "different" ]]
+then
+	#add loop over the different datasets -> no loop as I split experiments into invidual data sets
+
+	for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data
+	do
+		for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data
+		do
+			OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #go from percentage points to decimals for OOD percentage, i.e. e.g. 66 -> 0.66, for the second python command
+			IDS_LIST=() #keep track of the class ids per batch for running mixmatch later
+			for batch_id in ${BATCHES[@]}
+			do
+				IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) # --random-source=<(echo $batch_id)) #randomly select class ids for in-dist data, random seed is provided by batch_id for reproducability
+				IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training
+				LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data#the subset of the classes in the dataset that are used as in dist data
+				python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES
+
+				python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_DIFFERENT" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id"
+				#path_ood here has one subfolder with all the unlabelled images
+
+			done
+
+			#the second iteration over batches runs mixmatch in parallel
+			PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
+			PATH_UNLABELED="$PATH_BASE/unlabeled/batch_0_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}"
+			python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED  --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr  $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT --exp_creator "Yes"
+			for ((i=0;i<${#BATCHES[@]};++i))
+			do
+				((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files
+				train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "$LIST_IN_DIST_CLASSES"&
+			done
+			wait
+
+			#do clean up prior to the next experiment
+			rm -r $PATH_BASE/unlabeled/
+			rm -r $PATH_BASE/batches_labeled_in_dist/
+			rm -r $PATH_BASE/batches_unlabeled_out_dist/
+		done
+	done
+
+
+
+else
+	echo "No valid OOD_TYPE was specified. Choose 'same' or 'different'"
+fi
+#final cleanup -> delete all data
+rm -r data/
+#dynamic resizing of images for different setting -> when they do not exactly match -> defined in mixmatch
diff --git a/results_analysis_3.py b/results_analysis_3.py
@@ -0,0 +1,41 @@
+from mlflow.tracking.client import MlflowClient
+import mlflow
+import numpy as np
+
+
+start_experiment=3
+end_experiment=9
+metric='val_accuracy'
+num_epochs=50
+
+#range(start_experiment,end_experiment+1)
+experiment_ids=[1,2,3]
+
+
+means=[]
+stds=[]
+for experiment_id in experiment_ids:
+	experiment_id = str(experiment_id)
+	experiment_name = mlflow.get_experiment(experiment_id).name
+	print(experiment_name)
+	results = mlflow.search_runs(experiment_ids=[experiment_id])
+
+	max_accuracies=[]
+	for run_id in (results['run_id']):
+		val_accuracies=MlflowClient().get_metric_history(run_id, metric)
+		max_accuracy=0.
+		for epoch in range(num_epochs):
+			accuracy=val_accuracies[epoch].value
+			if val_accuracies[epoch].value > max_accuracy:
+				max_accuracy=val_accuracies[epoch].value
+		max_accuracies.append(max_accuracy)
+	mean=np.mean(max_accuracies)
+	std=np.std(max_accuracies)
+	print('Mean:{} Std:{}'.format(mean, std))
+	means.append(mean)
+	stds.append(std)
+
+#just use format
+
+print("& 0 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${}".format(means[0], stds[0], means[1], stds[1], means[2], stds[2],r"\\"))
+#60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std, 60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std
diff --git a/results_analysis_6.py b/results_analysis_6.py
@@ -0,0 +1,40 @@
+from mlflow.tracking.client import MlflowClient
+import mlflow
+import numpy as np
+
+
+start_experiment=3
+end_experiment=9
+metric='val_accuracy'
+num_epochs=50
+
+experiment_ids=[1,2,3,4,5,6]
+
+
+means=[]
+stds=[]
+for experiment_id in experiment_ids:
+	experiment_id = str(experiment_id)
+	experiment_name = mlflow.get_experiment(experiment_id).name
+	print(experiment_name)
+	results = mlflow.search_runs(experiment_ids=[experiment_id])
+
+	max_accuracies=[]
+	for run_id in (results['run_id']):
+		val_accuracies=MlflowClient().get_metric_history(run_id, metric)
+		max_accuracy=0.
+		for epoch in range(num_epochs):
+			accuracy=val_accuracies[epoch].value
+			if val_accuracies[epoch].value > max_accuracy:
+				max_accuracy=val_accuracies[epoch].value
+		max_accuracies.append(max_accuracy)
+	mean=np.mean(max_accuracies)
+	std=np.std(max_accuracies)
+	print('Mean:{} Std:{}'.format(mean, std))
+	means.append(mean)
+	stds.append(std)
+
+#just use format
+
+print("& 50 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${} \n & & & 100 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${}".format(means[0], stds[0], means[2], stds[2], means[4], stds[4],r"\\",  means[1], stds[1], means[3], stds[3], means[5], stds[5], r" \\"))
+#60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std, 60 mean, 60 std, 100 mean, 100 std, 150 mean, 150 std