Commit c45f3b4: base scripts paper
luisoala committed Jun 13, 2020 (0 parents)
Showing 16 changed files with 3,090 additions and 0 deletions.
684 changes: 684 additions & 0 deletions MixMatch_OOD_main.py

Large diffs are not rendered by default.

182 changes: 182 additions & 0 deletions ood_experiment_at_scale_script.sh
@@ -0,0 +1,182 @@
#!/bin/bash
#
#EXPERIMENT CONFIGS
#DATA
#DLIDS are the Google Drive download ids for each data set
export CUDA_VISIBLE_DEVICES=1
declare -A DATA_DLIDS=( ["MNIST"]="10pULG3xRIkl5tDo6VJHUK6NHIguMT7Jq" ["FASHIONMNIST"]="11AJ-OEgtj7XDeLHPVEyWuvHzFIIadj9_" ["CIFAR10"]="1O6uarg54CwtZ3h_B6YzD1KW9nbQ-E7Xl" ["TINYIMAGENET"]="10i1FV1SgXxMWfgTEpAXr0qe7q7s6Ko6e" ["SVHN"]="1wgTQJOtGxWPLNKPMuDc7vzYejYnWJsVr" ["SVHN-different"]="10HbYSMt3CbHeieUBqO675eNpAef-UY3_" ["GaussianNoise"]="1GXljou_EJGcdfVsfVJiVMVo7-RKNu106" ["SALTANDPEPPER"]="1iBwKyR7M4_ca2Ti7xW5FJ-BNdqEDG-vK" ["FASHIONPRODUCT"]="1zN1BF1u1SJl81JpH6hexYgrVvivaGxa5")
#FPATHS are the file paths for the data sets in the local working directory when doing experiments
declare -A DATA_FPATHS=(["MNIST"]="data/MNIST/" ["FASHIONMNIST"]="data/FASHIONMNIST/" ["CIFAR10"]="data/CIFAR10/" ["TINYIMAGENET"]="data/TINYIMAGENET/" ["SVHN-different"]="data/SVHN-different/" ["GaussianNoise"]="data/GaussianNoise/" ["SALTANDPEPPER"]="data/SALTANDPEPPER/" ["SVHN"]="data/SVHN/" ["FASHIONPRODUCT"]="data/FASHIONPRODUCT/")
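#to add a data set, register its gdrive id in DATA_DLIDS and its local path in DATA_FPATHS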

BASE_DATA="CIFAR10" #the base data set from which the in-distribution (iod) data is drawn; the experiment is centered around it
DIFFERENT_DATA="MNIST" #data set used as the contrasting source of ood samples in the "different" setting

OOD_PERC_PP_LIST=(0 50 100) #the ood percentage in percentage points
NUM_UNLABELED=3000 #should be 3000
MIN_CLASS_ID=0 #the lowest class id for the classes in the data set
MAX_CLASS_ID=9 #the highest class id for the classes of the data set
NUM_CLASSES_IN_DIST=5 #the number of classes to select as in-distribution classes
NUM_LABELED_LIST=(60 100 150)
OOD_TYPE="different" #can be "half-half" (ood samples come from same dataset but are a subset of the classes) or "different" (ood samples come from different dataset)
#NOTE: seed for shuf of iod class permutations is given by batch id on run level
#
#RUN CONFIGS (a run is an iteration of the experiment with one of the random data batches)
BATCHES=(0 1 2 3 4 5 6 7 8 9) #the batch ids used for the different runs of the experiment
#
#MIX MATCH ALGO CONFIGS
MODEL="wide_resnet"
DATASET="CIFAR10-BASELINE"
RESULTS_FILE="stats_OOD_4_SSDL.csv"
WORKERS="1"
EPOCHS="50" #should be 50
BATCH_SIZE="16" #should be 16
LR="0.0002"
WEIGHT_DECAY="0.0001"
K_TRANSFORMS="2"
T_SHARPENING="0.25"
ALPHA_MIX="0.75"
MODE="ssdl"
BALANCED="5" #int: -1 = no class balancing, 5 = balanced
GAMMA_US="25" #the gamma for the unsupervised loss
IMG_SIZE="32"
NORM_STATS="MNIST" #is not used
#
LOG_FOLDER="logs"
SAVE_WEIGHTS="FALSE"
WEIGHTS_PATH=""
RAMPUP_COEFFICIENT="3000"
#
N=10 #number of parallel processes
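#experiment grid: each NUM_LABELED in NUM_LABELED_LIST x each ood percentage in
#OOD_PERC_PP_LIST, with every cell repeated over all BATCHES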
#DOWNLOAD DATA
#steps
#create data dir
mkdir data
#cd into data dir
cd data

#do it for base data
##download file
###get file id
FILEID="${DATA_DLIDS["${BASE_DATA}"]}"
###download
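###NOTE: large files on Google Drive sit behind a confirmation page; the inner wget
###saves the session cookies and scrapes the confirm token, which the outer wget then
###passes along with the file id to fetch the actual download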
wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${BASE_DATA}.zip && rm -rf /tmp/cookies.txt
##unzip
unzip ${BASE_DATA}.zip
##remove zip
rm ${BASE_DATA}.zip

#do it for different data
###get file id
FILEID="${DATA_DLIDS["${DIFFERENT_DATA}"]}"
###download
wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${DIFFERENT_DATA}.zip && rm -rf /tmp/cookies.txt
##unzip
unzip ${DIFFERENT_DATA}.zip
##remove zip
rm ${DIFFERENT_DATA}.zip
#when done cd .. back to work dir
cd ..

#EXPERIMENTS
PATH_BASE="${DATA_FPATHS["${BASE_DATA}"]}" #path to the data set used for the experiments and for building the train and val sets; assumes a subdirectory /all/ at the end of the path, containing one subdirectory per class id, each holding that class's samples
PATH_DIFFERENT="${DATA_FPATHS["${DIFFERENT_DATA}"]}"

train_batch(){
local batch_id=$1
local OOD_PERC=$2
local OOD_PERC_PP=$3
local NUM_UNLABELED=$4
local IDS=$5
local LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data
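#e.g. IDS="0 2 4 6 8" becomes LIST_IN_DIST_CLASSES="0,2,4,6,8"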

local PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
local PATH_UNLABELED="$PATH_BASE/unlabeled/batch_${batch_id}_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}"

python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT
}
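#example invocation with hypothetical values: one run on batch 0 with 50% ood among
#3000 unlabeled samples and classes 0 2 4 6 8 as in-distribution:
#train_batch 0 .50 50 3000 "0 2 4 6 8"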



if [ "$OOD_TYPE" = "half-half" ]
then
for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data
do
for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data
do
OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #convert percentage points to a decimal fraction for the unlabeled_partitioner call, e.g. 66 -> .66 (bc omits the leading zero)
#first iteration over batches creates the data
IDS_LIST=() #keep track of the class ids per batch for running mixmatch later
for batch_id in ${BATCHES[@]}
do
IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) #randomly select class ids for in-dist data; deterministic seeding via --random-source=<(echo $batch_id) for reproducibility is currently disabled
IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training
LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data

python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES

python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_BASE/batches_unlabeled_out_dist/batch_$batch_id" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id"

done

#the second iteration over batches runs mixmatch in parallel
for ((i=0;i<${#BATCHES[@]};++i))
do
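#throttle to at most N parallel jobs: each time the counter j wraps to 0, wait for
#the previous group of background runs to finish before launching the next one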
((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files
train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "${IDS_LIST[i]}"&
done
wait
#do clean up prior to the next experiment
rm -r $PATH_BASE/unlabeled/
rm -r $PATH_BASE/batches_labeled_in_dist/
rm -r $PATH_BASE/batches_unlabeled_out_dist/
done
done
elif [[ "$OOD_TYPE" == "different" ]]
then
#no loop over the different ood data sets here, since the experiments are split into individual data sets

for NUM_LABELED in ${NUM_LABELED_LIST[@]} #axis 1 loop, i.e. portion of labeled data
do
for OOD_PERC_PP in ${OOD_PERC_PP_LIST[@]} #axis 2 loop, i.e. ood portion in unlabeled data
do
OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01") #convert percentage points to a decimal fraction for the unlabeled_partitioner call, e.g. 66 -> .66 (bc omits the leading zero)
IDS_LIST=() #keep track of the class ids per batch for running mixmatch later
for batch_id in ${BATCHES[@]}
do
IDS=$(shuf -i $MIN_CLASS_ID-$MAX_CLASS_ID -n $NUM_CLASSES_IN_DIST) #randomly select class ids for in-dist data; deterministic seeding via --random-source=<(echo $batch_id) for reproducibility is currently disabled
IDS_LIST+=("$IDS") #add class ids for this batch to the class ids list that mixmatch can use later for training
LIST_IN_DIST_CLASSES=$(echo $IDS | tr ' ' ,) #the subset of the classes in the dataset that are used as in dist data
python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num $batch_id --list_in_dist_classes $LIST_IN_DIST_CLASSES

python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$PATH_DIFFERENT" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id"
#path_ood here has one subfolder with all the unlabelled images

done

#one serial run with --exp_creator "Yes" first, presumably so the experiment entry exists before the parallel runs
#(note: PATH_LABELED reuses $batch_id left over from the loop above, and PATH_UNLABELED is pinned to batch 0)
PATH_LABELED="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
PATH_UNLABELED="$PATH_BASE/unlabeled/batch_0_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}"
python MixMatch_OOD_main.py --dataset $DATASET --path_labeled $PATH_LABELED --path_unlabeled $PATH_UNLABELED --results_file_name $RESULTS_FILE --workers $WORKERS --epochs $EPOCHS --batch_size $BATCH_SIZE --lr $LR --weight_decay $WEIGHT_DECAY --K_transforms $K_TRANSFORMS --T_sharpening $T_SHARPENING --alpha_mix $ALPHA_MIX --mode $MODE --balanced $BALANCED --lambda_unsupervised $GAMMA_US --number_labeled $NUM_LABELED --model $MODEL --num_classes $NUM_CLASSES_IN_DIST --size_image $IMG_SIZE --log_folder $LOG_FOLDER --norm_stats $NORM_STATS --save_weights $SAVE_WEIGHTS --weights_path_name "$WEIGHTS_PATH" --rampup_coefficient $RAMPUP_COEFFICIENT --exp_creator "Yes"
#the second iteration over batches runs mixmatch in parallel
for ((i=0;i<${#BATCHES[@]};++i))
do
((j=j%N)); sleep 3; ((j++==0)) && wait #short sleep to prevent conflicts in creating the documentation files
train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "${IDS_LIST[i]}"&
done
wait

#do clean up prior to the next experiment
rm -r $PATH_BASE/unlabeled/
rm -r $PATH_BASE/batches_labeled_in_dist/
rm -r $PATH_BASE/batches_unlabeled_out_dist/
done
done



else
echo "No valid OOD_TYPE was specified. Choose 'half-half' or 'different'"
fi
#final cleanup -> delete all data
rm -r data/
#NOTE: dynamic resizing of images for the "different" setting (when image sizes do not exactly match) is handled inside mixmatch
41 changes: 41 additions & 0 deletions results_analysis_3.py
@@ -0,0 +1,41 @@
from mlflow.tracking.client import MlflowClient
import mlflow
import numpy as np


start_experiment=3
end_experiment=9
metric='val_accuracy'
num_epochs=50

#alternatively: experiment_ids = range(start_experiment, end_experiment + 1)
experiment_ids=[1,2,3]


means=[]
stds=[]
for experiment_id in experiment_ids:
    experiment_id = str(experiment_id)
    experiment_name = mlflow.get_experiment(experiment_id).name
    print(experiment_name)
    results = mlflow.search_runs(experiment_ids=[experiment_id])

    #best validation accuracy per run, then mean/std over the runs of the experiment
    max_accuracies = []
    for run_id in results['run_id']:
        val_accuracies = MlflowClient().get_metric_history(run_id, metric)
        max_accuracy = 0.
        for epoch in range(num_epochs):
            accuracy = val_accuracies[epoch].value
            if accuracy > max_accuracy:
                max_accuracy = accuracy
        max_accuracies.append(max_accuracy)
    mean = np.mean(max_accuracies)
    std = np.std(max_accuracies)
    print('Mean:{} Std:{}'.format(mean, std))
    means.append(mean)
    stds.append(std)

#print one LaTeX table row (a raw string keeps \pm and the trailing \\ intact)
print(r"& 0 & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}$ & ${:.3f}\pm{:.3f}${}".format(means[0], stds[0], means[1], stds[1], means[2], stds[2], r"\\"))
#column order: 60 labeled (mean, std), 100 labeled (mean, std), 150 labeled (mean, std)
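#output shape with placeholder numbers: & 0 & $0.812\pm0.023$ & $0.845\pm0.019$ & $0.871\pm0.015$\\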
40 changes: 40 additions & 0 deletions results_analysis_6.py
@@ -0,0 +1,40 @@
from mlflow.tracking.client import MlflowClient
import mlflow
import numpy as np


start_experiment=3
end_experiment=9
metric='val_accuracy'
num_epochs=50

experiment_ids=[1,2,3,4,5,6]


means=[]
stds=[]
for experiment_id in experiment_ids:
    experiment_id = str(experiment_id)
    experiment_name = mlflow.get_experiment(experiment_id).name
    print(experiment_name)
    results = mlflow.search_runs(experiment_ids=[experiment_id])

    max_accuracies = []
    for run_id in results['run_id']:
        val_accuracies = MlflowClient().get_metric_history(run_id, metric)
        max_accuracy = 0.
        for epoch in range(num_epochs):
            accuracy = val_accuracies[epoch].value
            if accuracy > max_accuracy:
                max_accuracy = accuracy
        max_accuracies.append(max_accuracy)
    mean = np.mean(max_accuracies)
    std = np.std(max_accuracies)
    print('Mean:{} Std:{}'.format(mean, std))
    means.append(mean)
    stds.append(std)

#print two LaTeX table rows; row 1 uses experiments 1, 3, 5 and row 2 uses experiments 2, 4, 6
print("& 50 & ${:.3f}\\pm{:.3f}$ & ${:.3f}\\pm{:.3f}$ & ${:.3f}\\pm{:.3f}${} \n & & & 100 & ${:.3f}\\pm{:.3f}$ & ${:.3f}\\pm{:.3f}$ & ${:.3f}\\pm{:.3f}${}".format(means[0], stds[0], means[2], stds[2], means[4], stds[4], r"\\", means[1], stds[1], means[3], stds[3], means[5], stds[5], r" \\"))
#per row: 60 labeled (mean, std), 100 labeled (mean, std), 150 labeled (mean, std)