From 45de58441102fc87b3909116f4283f12c24aeee5 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Fri, 5 Oct 2018 14:41:45 -0400
Subject: [PATCH 01/36] relocate dataset file lists

---
 .../costar_block_stacking_v0.2_success_only_test_files.txt        | 0
 .../costar_block_stacking_v0.2_success_only_train_files.txt       | 0
 .../costar_block_stacking_v0.2_success_only_val_files.txt         | 0
 .../costar_block_stacking_v0.4_success_only_test_files.txt        | 0
 .../costar_block_stacking_v0.4_success_only_train_files.txt       | 0
 .../costar_block_stacking_v0.4_success_only_val_files.txt         | 0
 ...ned_block_plush_stacking_v0.4_success_only_corrupted_files.txt | 0
 ...combined_block_plush_stacking_v0.4_success_only_test_files.txt | 0
 ...ombined_block_plush_stacking_v0.4_success_only_train_files.txt | 0
 ..._combined_block_plush_stacking_v0.4_success_only_val_files.txt | 0
 .../costar_plush_block_stacking_v0.4_success_only_test_files.txt  | 0
 .../costar_plush_block_stacking_v0.4_success_only_train_files.txt | 0
 .../costar_plush_block_stacking_v0.4_success_only_val_files.txt   | 0
 13 files changed, 0 insertions(+), 0 deletions(-)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.2_success_only_test_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.2_success_only_train_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.2_success_only_val_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.4_success_only_test_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.4_success_only_train_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_block_stacking_v0.4_success_only_val_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_combined_block_plush_stacking_v0.4_success_only_corrupted_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_combined_block_plush_stacking_v0.4_success_only_test_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_combined_block_plush_stacking_v0.4_success_only_train_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_combined_block_plush_stacking_v0.4_success_only_val_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_plush_block_stacking_v0.4_success_only_test_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_plush_block_stacking_v0.4_success_only_train_files.txt (100%)
 rename costar_hyper/{ => dataset}/costar_plush_block_stacking_v0.4_success_only_val_files.txt (100%)

diff --git a/costar_hyper/costar_block_stacking_v0.2_success_only_test_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.2_success_only_test_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.2_success_only_test_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.2_success_only_test_files.txt
diff --git a/costar_hyper/costar_block_stacking_v0.2_success_only_train_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.2_success_only_train_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.2_success_only_train_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.2_success_only_train_files.txt
diff --git a/costar_hyper/costar_block_stacking_v0.2_success_only_val_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.2_success_only_val_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.2_success_only_val_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.2_success_only_val_files.txt
diff --git a/costar_hyper/costar_block_stacking_v0.4_success_only_test_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.4_success_only_test_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.4_success_only_test_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.4_success_only_test_files.txt
diff --git a/costar_hyper/costar_block_stacking_v0.4_success_only_train_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.4_success_only_train_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.4_success_only_train_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.4_success_only_train_files.txt
diff --git a/costar_hyper/costar_block_stacking_v0.4_success_only_val_files.txt b/costar_hyper/dataset/costar_block_stacking_v0.4_success_only_val_files.txt
similarity index 100%
rename from costar_hyper/costar_block_stacking_v0.4_success_only_val_files.txt
rename to costar_hyper/dataset/costar_block_stacking_v0.4_success_only_val_files.txt
diff --git a/costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_corrupted_files.txt b/costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_corrupted_files.txt
similarity index 100%
rename from costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_corrupted_files.txt
rename to costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_corrupted_files.txt
diff --git a/costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_test_files.txt b/costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_test_files.txt
similarity index 100%
rename from costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_test_files.txt
rename to costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_test_files.txt
diff --git a/costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_train_files.txt b/costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_train_files.txt
similarity index 100%
rename from costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_train_files.txt
rename to costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_train_files.txt
diff --git a/costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_val_files.txt b/costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_val_files.txt
similarity index 100%
rename from costar_hyper/costar_combined_block_plush_stacking_v0.4_success_only_val_files.txt
rename to costar_hyper/dataset/costar_combined_block_plush_stacking_v0.4_success_only_val_files.txt
diff --git a/costar_hyper/costar_plush_block_stacking_v0.4_success_only_test_files.txt b/costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_test_files.txt
similarity index 100%
rename from costar_hyper/costar_plush_block_stacking_v0.4_success_only_test_files.txt
rename to costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_test_files.txt
diff --git a/costar_hyper/costar_plush_block_stacking_v0.4_success_only_train_files.txt b/costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_train_files.txt
similarity index 100%
rename from costar_hyper/costar_plush_block_stacking_v0.4_success_only_train_files.txt
rename to costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_train_files.txt
diff --git a/costar_hyper/costar_plush_block_stacking_v0.4_success_only_val_files.txt b/costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_val_files.txt
similarity index 100%
rename from costar_hyper/costar_plush_block_stacking_v0.4_success_only_val_files.txt
rename to costar_hyper/dataset/costar_plush_block_stacking_v0.4_success_only_val_files.txt

From 1c35ca6e1c490a2fd21f823e89377be1104dd9b3 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Fri, 5 Oct 2018 14:42:18 -0400
Subject: [PATCH 02/36] Costar plan and costar hyper readme improvements

---
 Readme.md              |  56 +++++++++++++++++----
 costar_hyper/README.md | 111 ++++++++++++++++++++++-------------------
 2 files changed, 105 insertions(+), 62 deletions(-)

diff --git a/Readme.md b/Readme.md
index 2fba1b14f..7773b2e21 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,25 +1,55 @@
-# CoSTAR Task Planner (CTP)
+# CoSTAR Plan
 
 [![Build Status](https://travis-ci.com/cpaxton/costar_plan.svg?token=13PmLzWGjzrfxQvEyWp1&branch=master)](https://travis-ci.com/cpaxton/costar_plan)
 
+CoSTAR Plan is for deep learning with robots, divided into two main parts, the CoSTAR Task Planner (CTP) library and CoSTAR Hyper. A summary is below.
+
+## CoSTAR Task Planner (CTP)
+
+### Associated Paper
+
+Visual Robot Task Planning
+
+## [CoSTAR Hyper](costar_hyper/README.md)
+
+### Associated Paper
+
+Training Frankenstein's Creature To Stack: HyperTree Architecture Search
+
+### Supported Datasets
+
+  - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset) ([code readme](costar_hyper/README.md))
+  - Cornell Grasping Dataset ([code readme](costar_hyper/README.md))
+  - Google Brain Grasping Dataset ([code readme](costar_hyper/README.md))
+
+
+
+# CoSTAR Task Planner (CTP)
+
+
 The CoSTAR Planner is part of the larger [CoSTAR project](https://github.com/cpaxton/costar_stack/). It integrates some learning from demonstration and task planning capabilities into the larger CoSTAR framework in different ways.
 
 [![Visual Task Planning](https://img.youtube.com/vi/Rk4EDL4B7zQ/0.jpg)](https://youtu.be/Rk4EDL4B7zQ "Visual Task Planning")
 
 Specifically it is a project for creating task and motion planning algorithms that use machine learning to solve challenging problems in a variety of domains. This code provides a testbed for complex task and motion planning search algorithms. The goal is to describe example problems where actor must move around in the world and plan complex interactions with other actors or the environment that correspond to high-level symbolic states. Among these is our Visual Task Planning project, in which robots learn representations of their world and use these to imagine possible futures, then use these for planning.
-
-[![CoSTAR Real Robot Data Collection](https://img.youtube.com/vi/LMqEcoYbrLM/0.jpg)](https://youtu.be/LMqEcoYbrLM "CoSTAR Real Robot Data Collection")
-
 To run deep learning examples, you will need TensorFlow and Keras, plus a number of Python packages. To run robot experiments, you'll need a simulator (Gazebo or PyBullet), and ROS Indigo or Kinetic. Other versions of ROS may work but have not been tested. If you want to stick to the toy examples, you do not need to use this as a ROS package.
 
 *About this repository:* CTP is a _single-repository_ project. As such, all the custom code you need should be in one place: here. There are exceptions, such as the [CoSTAR Stack](https://github.com/cpaxton/costar_stack/) for real robot execution, but these are generally not necessary. The minimal installation of CTP is just to install the `costar_models` package as a normal [python package](https://github.com/cpaxton/costar_plan/tree/master/costar_models/python) ignoring everything else.
 
-Datasets:
+# CTP Datasets
+
+  - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset)
   - [PyBullet Block Stacking](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/simdata.tar.gz)
   - [Sample Husky Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/husky_data.tar.gz)
-  - [CoSTAR Real Robot Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/sample_real_ur5_robot_data.tar.gz)
 
-Contents:
+## Deprecated Datsets
+
+[![Classic CoSTAR Real Robot Data Collection](https://img.youtube.com/vi/LMqEcoYbrLM/0.jpg)](https://youtu.be/LMqEcoYbrLM "Classic CoSTAR Real Robot Data Collection")
+  - [Classic CoSTAR Real Robot Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/sample_real_ur5_robot_data.tar.gz)
+
+The original collection run
+
+# Contents
   - [0. Introduction](docs/introduction.md)
   - [1. Installation Guide](docs/install.md)
     - [1.1 Docker Instructions](docs/docker_instructions.md)
@@ -28,7 +58,7 @@ Contents:
     - [2.1 Software Design](docs/design.md): high-level notes
   - [3. Machine Learning Models](docs/learning.md): using the command line tool
     - [3.1 Data collection](docs/collect_data.md): data collection with a real or simulated robot
-    - [3.2 MARCC instructions](docs/marcc.md): learning models using JHU's MARCC cluste
+    - [3.2 MARCC instructions](docs/marcc.md): learning models using JHU's MARCC cluster
     - [3.3 Generative Adversarial Models](docs/learning_gan.md)
     - [3.4 SLURM Utilities](docs/slurm_utils.md): tools for using slurm on MARCC
   - [4. Creating and training a custom task](docs/task_learning.md): overview of task representations
@@ -45,7 +75,7 @@ Contents:
     - [7.2 The Real TOM](docs/tom_real_robot.md): details about parts of the system for running on the real TOM
   - [8. CoSTAR Robot](docs/costar_real_robot.md): execution with a standard UR5
 
-Package/folder layout:
+# Package/folder layout
   - [CoSTAR Simulation](costar_simulation/Readme.md): Gazebo simulation and ROS execution
   - [CoSTAR Task Plan](costar_task_plan/Readme.md): the high-level python planning library
   - [CoSTAR Gazebo Plugins](costar_gazebo_plugins/Readme.md): assorted plugins for integration
@@ -61,7 +91,11 @@ Package/folder layout:
   - Others are temporary packages for various projects
 
 Many of these sections are a work in progress; if you have any questions shoot me an email (`cpaxton@jhu.edu`).
-## Contact
 
-This code is maintained by Chris Paxton (cpaxton@jhu.edu).
+# Contact
+
+This code is maintained by:
+
+ - Chris Paxton (cpaxton@jhu.edu).
+ - Andrew Hundt (ATHundt@gmail.com)
 
diff --git a/costar_hyper/README.md b/costar_hyper/README.md
index 59538d22e..e78a9fa8c 100644
--- a/costar_hyper/README.md
+++ b/costar_hyper/README.md
@@ -99,41 +99,6 @@ plt.show()
 
 some of those fields will vary for different use cases.
 
-## Google Brain Grasp Dataset APIs
-
-<img width="1511" alt="2017-12-16 surface relative transforms correct" src="https://user-images.githubusercontent.com/55744/34134058-5846b59e-e426-11e7-92d6-699883199255.png">
-This version should be ready to use when generating data real training.
-
-Plus now there is a flag to draw a circle at the location of the gripper as stored in the dataset:
-![102_grasp_0_rgb_success_1](https://user-images.githubusercontent.com/55744/34133964-ccf57caa-e425-11e7-8ab1-6bba459a5408.gif)
-
-A new feature is writing out depth image gifs:
-![102_grasp_0_depth_success_1](https://user-images.githubusercontent.com/55744/34133966-d0951f28-e425-11e7-85d1-aa2706a4ba05.gif)
-
-Image data can be resized:
-
-![102_grasp_1_rgb_success_1](https://user-images.githubusercontent.com/55744/34430739-3adbd65c-ec36-11e7-84b5-3c3712949914.gif)
-
-The blue circle is a visualization, not actual input, which marks the gripper stored in the dataset pose information.
-
-Color augmentation is also available:
-
-![102_grasp_2_rgb_success_1](https://user-images.githubusercontent.com/55744/34698561-ba2bd61e-f4a6-11e7-88d9-5091aed500fe.gif)
-![102_grasp_3_rgb_success_1](https://user-images.githubusercontent.com/55744/34698564-bef2fba0-f4a6-11e7-9547-06b4410d86aa.gif)
-
-### How to view the vrep dataset visualization
-
-1. copy the .ttt file and the .so file (.dylib on mac) into the `costar_google_brainrobotdata/vrep` folder.
-2. Run vrep with -s file pointing to the example:
-
-```
-./vrep.sh -s ~/src/costar_ws/src/costar_plan/costar_google_brainrobotdata/vrep/kukaRemoteApiCommandServerExample.ttt
-```
-
-4. vrep should load and start the simulation
-5. make sure the folder holding `vrep_grasp.py` is on your PYTHONPATH
-6. cd to `~/src/costar_ws/src/costar_plan/costar_google_brainrobotdata/`, or wherever you put the repository
-7. run `export CUDA_VISIBLE_DEVICES="" && python2 vrep_grasp.py`
 
 ## Hyperparameter search
 
@@ -182,21 +147,7 @@ export CUDA_VISIBLE_DEVICES="0" && python2 costar_block_stacking_train_ranked_re
 
 You may wish to use the `--learning_rate_schedule triangular` flag for one run and then the `--learning_rate_schedule triangular2 --load_weights path/to/previous_best_weights.h5` for a second run. These learning rate schedules use the [keras_contrib](github.com/keras-team/keras-contrib) cyclical learning rate callback, see [Cyclical learning rate repo](https://github.com/bckenstler/CLR) for a detailed description and paper links.
 
-### Google Brain Grasping Dataset
-
-To run the search execute the following command
-
-```
-export CUDA_VISIBLE_DEVICES="0" && python2 google_grasp_hyperopt.py --run_name single_prediction_all_transforms
-```
-
-Generating a hyperparameter search results summary for google brain grasping dataset classification:
-
-```
-python hyperopt_rank.py --log_dir hyperopt_logs_google_brain_classification --sort_by val_acc
-```
-
-### Cornell Dataset
+## Cornell Dataset
 
 These are instructions for training on the [cornell grasping dataset](http://pr.cs.cornell.edu/grasping/rect_data/data.php).
 
@@ -324,4 +275,62 @@ Here is the command to actually run k-fold training:
 export CUDA_VISIBLE_DEVICES="0" && python cornell_grasp_train_classification.py  --run_name 2018-04-08-21-04-19_s2c2hw4 --pipeline_stage k_fold
 ```
 
-After it finishes running there should be a file created named `*summary.json` with your final results.
\ No newline at end of file
+After it finishes running there should be a file created named `*summary.json` with your final results.
+
+
+## Google Brain Grasp Dataset APIs
+
+Note: The [Google Brain Grasping Dataset](https://sites.google.com/site/brainrobotdata/home/grasping-dataset) has several important limitations which must be considered before trying it out:
+- There is no validation or test dataset with novel objects
+- There is no robot model available, and the robot is not commercially available
+- Data is collected at 1Hz and may not be well synchronized w.r.t. time.
+- The robot may move vast distances and change directions completely between frames.
+
+<img width="1511" alt="2017-12-16 surface relative transforms correct" src="https://user-images.githubusercontent.com/55744/34134058-5846b59e-e426-11e7-92d6-699883199255.png">
+This version should be ready to use when generating data for real training.
+
+Plus now there is a flag to draw a circle at the location of the gripper as stored in the dataset:
+![102_grasp_0_rgb_success_1](https://user-images.githubusercontent.com/55744/34133964-ccf57caa-e425-11e7-8ab1-6bba459a5408.gif)
+
+A new feature is writing out depth image gifs:
+![102_grasp_0_depth_success_1](https://user-images.githubusercontent.com/55744/34133966-d0951f28-e425-11e7-85d1-aa2706a4ba05.gif)
+
+Image data can be resized:
+
+![102_grasp_1_rgb_success_1](https://user-images.githubusercontent.com/55744/34430739-3adbd65c-ec36-11e7-84b5-3c3712949914.gif)
+
+The blue circle is a visualization, not actual input, which marks the gripper stored in the dataset pose information. You can see the time synchronization issue in these frames.
+
+Color augmentation is also available:
+
+![102_grasp_2_rgb_success_1](https://user-images.githubusercontent.com/55744/34698561-ba2bd61e-f4a6-11e7-88d9-5091aed500fe.gif)
+![102_grasp_3_rgb_success_1](https://user-images.githubusercontent.com/55744/34698564-bef2fba0-f4a6-11e7-9547-06b4410d86aa.gif)
+
+### How to view the vrep dataset visualization
+
+1. copy the .ttt file and the .so file (.dylib on mac) into the `costar_google_brainrobotdata/vrep` folder.
+2. Run vrep with -s file pointing to the example:
+
+```
+./vrep.sh -s ~/src/costar_ws/src/costar_plan/costar_google_brainrobotdata/vrep/kukaRemoteApiCommandServerExample.ttt
+```
+
+3. vrep should load and start the simulation
+4. make sure the folder holding `vrep_grasp.py` is on your PYTHONPATH
+5. cd to `~/src/costar_ws/src/costar_plan/costar_google_brainrobotdata/`, or wherever you put the repository
+6. run `export CUDA_VISIBLE_DEVICES="" && python2 vrep_grasp.py`
+
+
+### Google Brain Grasping Dataset
+
+To run the search execute the following command
+
+```
+export CUDA_VISIBLE_DEVICES="0" && python2 google_grasp_hyperopt.py --run_name single_prediction_all_transforms
+```
+
+Generating a hyperparameter search results summary for google brain grasping dataset classification:
+
+```
+python hyperopt_rank.py --log_dir hyperopt_logs_google_brain_classification --sort_by val_acc
+```
\ No newline at end of file

From e0e53664eeba766fc04c7b46a982c3deb58f781a Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Fri, 5 Oct 2018 14:48:03 -0400
Subject: [PATCH 03/36] readme.md add website link

---
 Readme.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/Readme.md b/Readme.md
index 7773b2e21..d5a0c6ee9 100644
--- a/Readme.md
+++ b/Readme.md
@@ -14,14 +14,17 @@ Visual Robot Task Planning
 
 ### Associated Paper
 
-Training Frankenstein's Creature To Stack: HyperTree Architecture Search
+[Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://sites.google.com/view/hypertree-renas/home)
+
+[![Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://img.youtube.com/vi/1MV7slHnMX0/0.jpg)](https://youtu.be/1MV7slHnMX0 "Training Frankenstein's Creature To Stack: HyperTree Architecture Search")
 
 ### Supported Datasets
 
-  - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset) ([code readme](costar_hyper/README.md))
-  - Cornell Grasping Dataset ([code readme](costar_hyper/README.md))
-  - Google Brain Grasping Dataset ([code readme](costar_hyper/README.md))
+Details are in the [costar hyper readme](costar_hyper/README.md).
 
+  - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset)
+  - Cornell Grasping Dataset
+  - Google Brain Grasping Dataset
 
 
 # CoSTAR Task Planner (CTP)
@@ -37,17 +40,15 @@ To run deep learning examples, you will need TensorFlow and Keras, plus a number
 *About this repository:* CTP is a _single-repository_ project. As such, all the custom code you need should be in one place: here. There are exceptions, such as the [CoSTAR Stack](https://github.com/cpaxton/costar_stack/) for real robot execution, but these are generally not necessary. The minimal installation of CTP is just to install the `costar_models` package as a normal [python package](https://github.com/cpaxton/costar_plan/tree/master/costar_models/python) ignoring everything else.
 
 # CTP Datasets
-
-  - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset)
   - [PyBullet Block Stacking](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/simdata.tar.gz)
   - [Sample Husky Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/husky_data.tar.gz)
 
 ## Deprecated Datsets
 
-[![Classic CoSTAR Real Robot Data Collection](https://img.youtube.com/vi/LMqEcoYbrLM/0.jpg)](https://youtu.be/LMqEcoYbrLM "Classic CoSTAR Real Robot Data Collection")
   - [Classic CoSTAR Real Robot Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/sample_real_ur5_robot_data.tar.gz)
+     - We recommend using the full [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset) over this early version.
+     - [![Classic CoSTAR Real Robot Data Collection](https://img.youtube.com/vi/LMqEcoYbrLM/0.jpg)](https://youtu.be/LMqEcoYbrLM "Classic CoSTAR Real Robot Data Collection")
 
-The original collection run
 
 # Contents
   - [0. Introduction](docs/introduction.md)

From 78cf2f203cd654a314ab37c1607470e2dd4e4249 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Fri, 5 Oct 2018 14:55:28 -0400
Subject: [PATCH 04/36] readme cleanup and links

---
 Readme.md | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/Readme.md b/Readme.md
index d5a0c6ee9..da676b8d2 100644
--- a/Readme.md
+++ b/Readme.md
@@ -6,15 +6,11 @@ CoSTAR Plan is for deep learning with robots, divided into two main parts, the C
 
 ## CoSTAR Task Planner (CTP)
 
-### Associated Paper
-
-Visual Robot Task Planning
+Code for the paper [Visual Robot Task Planning](https://arxiv.org/abs/1804.00062).
 
 ## [CoSTAR Hyper](costar_hyper/README.md)
 
-### Associated Paper
-
-[Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://sites.google.com/view/hypertree-renas/home)
+Code for the paper [Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://sites.google.com/view/hypertree-renas/home).
 
 [![Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://img.youtube.com/vi/1MV7slHnMX0/0.jpg)](https://youtu.be/1MV7slHnMX0 "Training Frankenstein's Creature To Stack: HyperTree Architecture Search")
 
@@ -40,14 +36,11 @@ To run deep learning examples, you will need TensorFlow and Keras, plus a number
 *About this repository:* CTP is a _single-repository_ project. As such, all the custom code you need should be in one place: here. There are exceptions, such as the [CoSTAR Stack](https://github.com/cpaxton/costar_stack/) for real robot execution, but these are generally not necessary. The minimal installation of CTP is just to install the `costar_models` package as a normal [python package](https://github.com/cpaxton/costar_plan/tree/master/costar_models/python) ignoring everything else.
 
 # CTP Datasets
-  - [PyBullet Block Stacking](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/simdata.tar.gz)
-  - [Sample Husky Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/husky_data.tar.gz)
-
-## Deprecated Datsets
 
-  - [Classic CoSTAR Real Robot Data](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/sample_real_ur5_robot_data.tar.gz)
-     - We recommend using the full [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset) over this early version.
-     - [![Classic CoSTAR Real Robot Data Collection](https://img.youtube.com/vi/LMqEcoYbrLM/0.jpg)](https://youtu.be/LMqEcoYbrLM "Classic CoSTAR Real Robot Data Collection")
+  - PyBullet Block Stacking [download tar.gz](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/simdata.tar.gz)
+  - Sample Husky Data [download tar.gz](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/husky_data.tar.gz)
+  - Classic CoSTAR Real Robot Data [download tar.gz](https://github.com/cpaxton/costar_plan/releases/download/v0.6.0/sample_real_ur5_robot_data.tar.gz)
+     - Early version, deprecated in favor of the full [CoSTAR Block Stacking Dataset](https://sites.google.com/site/costardataset).
 
 
 # Contents

From b9d3c00e86125cfbb4aece5428c8edee348a3e3b Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Fri, 5 Oct 2018 15:00:38 -0400
Subject: [PATCH 05/36] readme.md create & improve links

---
 Readme.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Readme.md b/Readme.md
index da676b8d2..9b2e7d15e 100644
--- a/Readme.md
+++ b/Readme.md
@@ -7,20 +7,20 @@ CoSTAR Plan is for deep learning with robots, divided into two main parts, the C
 ## CoSTAR Task Planner (CTP)
 
 Code for the paper [Visual Robot Task Planning](https://arxiv.org/abs/1804.00062).
+Details are
 
 ## [CoSTAR Hyper](costar_hyper/README.md)
 
 Code for the paper [Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://sites.google.com/view/hypertree-renas/home).
+Details are in the [costar hyper readme](costar_hyper/README.md).
 
-[![Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://img.youtube.com/vi/1MV7slHnMX0/0.jpg)](https://youtu.be/1MV7slHnMX0 "Training Frankenstein's Creature To Stack: HyperTree Architecture Search")
+[![Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://img.youtube.com/vi/1MV7slHnMX0/1.jpg)](https://youtu.be/1MV7slHnMX0 "Training Frankenstein's Creature To Stack: HyperTree Architecture Search")
 
 ### Supported Datasets
 
-Details are in the [costar hyper readme](costar_hyper/README.md).
-
   - [CoSTAR Block Stacking Dataset](sites.google.com/site/costardataset)
-  - Cornell Grasping Dataset
-  - Google Brain Grasping Dataset
+  - [Cornell Grasping Dataset](http://pr.cs.cornell.edu/grasping/rect_data/data.php)
+  - [Google Brain Grasping Dataset](https://sites.google.com/site/brainrobotdata/home/grasping-dataset)
 
 
 # CoSTAR Task Planner (CTP)

From bb2bb6ff16641441fecb9c3cc3275d8550d66004 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Sun, 7 Oct 2018 21:55:47 -0400
Subject: [PATCH 06/36] Slightly modify README to be more readable

---
 Readme.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/Readme.md b/Readme.md
index 9b2e7d15e..8dbdf8704 100644
--- a/Readme.md
+++ b/Readme.md
@@ -2,14 +2,13 @@
 
 [![Build Status](https://travis-ci.com/cpaxton/costar_plan.svg?token=13PmLzWGjzrfxQvEyWp1&branch=master)](https://travis-ci.com/cpaxton/costar_plan)
 
-CoSTAR Plan is for deep learning with robots, divided into two main parts, the CoSTAR Task Planner (CTP) library and CoSTAR Hyper. A summary is below.
+CoSTAR Plan is for deep learning with robots, divided into two main parts, the CoSTAR Task Planner (CTP) library and CoSTAR Hyper.
 
-## CoSTAR Task Planner (CTP)
+### CoSTAR Task Planner (CTP)
 
 Code for the paper [Visual Robot Task Planning](https://arxiv.org/abs/1804.00062).
-Details are
 
-## [CoSTAR Hyper](costar_hyper/README.md)
+### [CoSTAR Hyper](costar_hyper/README.md)
 
 Code for the paper [Training Frankenstein's Creature To Stack: HyperTree Architecture Search](https://sites.google.com/view/hypertree-renas/home).
 Details are in the [costar hyper readme](costar_hyper/README.md).
@@ -30,7 +29,10 @@ The CoSTAR Planner is part of the larger [CoSTAR project](https://github.com/cpa
 
 [![Visual Task Planning](https://img.youtube.com/vi/Rk4EDL4B7zQ/0.jpg)](https://youtu.be/Rk4EDL4B7zQ "Visual Task Planning")
 
-Specifically it is a project for creating task and motion planning algorithms that use machine learning to solve challenging problems in a variety of domains. This code provides a testbed for complex task and motion planning search algorithms. The goal is to describe example problems where actor must move around in the world and plan complex interactions with other actors or the environment that correspond to high-level symbolic states. Among these is our Visual Task Planning project, in which robots learn representations of their world and use these to imagine possible futures, then use these for planning.
+Specifically it is a project for creating task and motion planning algorithms that use machine learning to solve challenging problems in a variety of domains. This code provides a testbed for complex task and motion planning search algorithms. 
+
+The goal is to describe example problems where the actor must move around in the world and plan complex interactions with other actors or the environment that correspond to high-level symbolic states. Among these is our Visual Task Planning project, in which robots learn representations of their world and use these to imagine possible futures, then use these for planning.
+
 To run deep learning examples, you will need TensorFlow and Keras, plus a number of Python packages. To run robot experiments, you'll need a simulator (Gazebo or PyBullet), and ROS Indigo or Kinetic. Other versions of ROS may work but have not been tested. If you want to stick to the toy examples, you do not need to use this as a ROS package.
 
 *About this repository:* CTP is a _single-repository_ project. As such, all the custom code you need should be in one place: here. There are exceptions, such as the [CoSTAR Stack](https://github.com/cpaxton/costar_stack/) for real robot execution, but these are generally not necessary. The minimal installation of CTP is just to install the `costar_models` package as a normal [python package](https://github.com/cpaxton/costar_plan/tree/master/costar_models/python) ignoring everything else.

From 81e3eee6b3fee6f6a99a19f6e287e84a9af94ca8 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Sun, 7 Oct 2018 23:20:41 -0400
Subject: [PATCH 07/36] Add code to check action labels for consistency

---
 .../scripts/view_convert_dataset.py           | 32 ++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/ctp_integration/scripts/view_convert_dataset.py b/ctp_integration/scripts/view_convert_dataset.py
index 7ff58dc8a..da39c7cb2 100755
--- a/ctp_integration/scripts/view_convert_dataset.py
+++ b/ctp_integration/scripts/view_convert_dataset.py
@@ -208,6 +208,8 @@ def _parse_args():
                                 and inserts them directly into the hdf5 file.
                              """)
     parser.add_argument("--write", action='store_true', help='Actually write out the changes specified in preprocess_inplace, or label_correction.')
+    parser.add_argument("--action_label_check", action='store_true', default=False,
+                        help='check action labels for each goal step')
 
     return vars(parser.parse_args())
 
@@ -566,7 +568,11 @@ def main(args, root="root"):
                     example_folder_path, name = os.path.split(example_filename)
                     progress_bar.write(example_folder_path)
                     name = name.replace('.h5f', '')
-                    example_folder_path = os.path.join(example_folder_path, 'goal_images')
+                    if args['action_label_check']:
+                        action_label_check(data_labels_to_name)
+                        example_folder_path = os.path.join(example_folder_path, 'action_label_check')
+                    else:
+                        example_folder_path = os.path.join(example_folder_path, 'goal_images')
                     if not os.path.exists(example_folder_path):
                         os.makedirs(example_folder_path)
                     # extract the clear view image
@@ -839,6 +845,30 @@ def generate_gripper_action_label(data):
     return gripper_action_label, gripper_action_goal_idx
 
 
+def action_label_check(action_labels):
+    stored_action_labels = \
+       [b'place_green_on_yellow', b'move_to_home', b'place_blue_on_yellowred', b'place_yellow_on_red',
+        b'place_blue_on_red', b'grab_blue', b'place_red_on_blueyellow', b'place_green_on_redyellow', 
+        b'place_red_on_yellow', b'place_green_on_blueyellow', b'place_red_on_greenblue', b'place_blue_on_green',
+        b'place_blue_on_redgreen',b'place_yellow_on_greenblue', b'place_yellow_on_blue', b'place_blue_on_greenyellow',
+        b'place_blue_on_yellowgreen', b'place_blue_on_greenred', b'place_yellow_on_redgreen', b'grab_yellow', 
+        b'place_red_on_greenyellow', b'grab_green', b'place_red_on_green', b'place_yellow_on_bluered', 
+        b'place_yellow_on_green', b'place_green_on_blue', b'place_yellow_on_bluegreen', b'place_blue_on_redyellow', 
+        b'place_red_on_blue', b'place_red_on_yellowgreen', b'place_yellow_on_greenred', b'place_green_on_yellowblue',  
+        b'place_red_on_bluegreen', b'place_green_on_red', b'place_red_on_yellowblue', b'place_green_on_yellowred',
+        b'place_green_on_redblue', b'grab_red', b'place_yellow_on_redblue', b'place_green_on_bluered', b'place_blue_on_yellow']
+    
+    assert len(stored_action_labels) == len(action_labels)
+
+    if stored_action_labels != action_labels:
+        print("WARNING! Inconsistent action labels detected")
+        for i in range(len(action_labels)):
+            if stored_action_labels[i] != action_labels[i]:
+                print("Expected in {0}: {1}, get: {2}".format(i, stored_action_labels[i], action_labels[i]))
+        # raise ValueError("WARNING! Inconsistent action labels detected")
+
+
+
 if __name__ == "__main__":
     if tf is not None:
         tf.enable_eager_execution()

From a607c9102b8cfcfeb1fcdbf17e2119bb8b3f2591 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Sun, 7 Oct 2018 23:21:14 -0400
Subject: [PATCH 08/36] Modify deprecated argument usage

---
 ctp_integration/scripts/view_convert_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ctp_integration/scripts/view_convert_dataset.py b/ctp_integration/scripts/view_convert_dataset.py
index da39c7cb2..7a4aa60a1 100755
--- a/ctp_integration/scripts/view_convert_dataset.py
+++ b/ctp_integration/scripts/view_convert_dataset.py
@@ -196,7 +196,7 @@ def _parse_args():
     parser.add_argument("--rgb", action='store_true', default=True, help='process rgb data')
     parser.add_argument("--no-rgb", action='store_false', default=True, help='do not process rgb data', dest='rgb')
     parser.add_argument("--fps", type=int, default=10, help='framerate to process images in frames per second')
-    parser.add_argument("--matplotlib", type=bool, default=False,
+    parser.add_argument("--matplotlib", action='store_true', default=False,
                         help='preview data with matplotlib, slower but you can do pixel lookups')
     parser.add_argument("--print", type=str, default='',
                         help=('Comma separated list of data channels to convert to a list and print as a string.'

From 386e808ad1134ac840d9d3bca21b71a32998368f Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 9 Oct 2018 16:37:07 -0400
Subject: [PATCH 09/36] Implement failure and error set splitting

---
 .../costar_block_stacking_split_dataset.py    | 307 +++++++++++++-----
 1 file changed, 226 insertions(+), 81 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 68695fbcc..5e95197dd 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -34,8 +34,11 @@ def _parse_args():
     #                     default=False, help='skip grasp success cases')
     # parser.add_argument("--ignore_error", action='store_true', default=False,
     #                     help='skip attempts that are both failures and contain errors')
-    # parser.add_argument("--success_only", action='store_true', default=False,
-    #                     help='only visit stacking data labeled as successful')
+    parser.add_argument("--success_only", action='store_true', default=False,
+                        help='only visit stacking data labeled as successful')
+    parser.add_ardument("--split_all", action='store_true', default=False,
+                        help='split all datasets into success, failure, and error sets.'
+                             'requires train/val/test from success_only subset')
     parser.add_argument("--plush", action='store_true', default=False,
                         help='processing plush attempts')
     parser.add_argument("--train", type=str, default='',
@@ -81,25 +84,22 @@ def get_existing_filenames(path_to_file):
     return filenames
 
 
-def split_dataset(filenames, train_set, val_set, test_set):
+def split_dataset(filenames, train_set, val_set, test_set, val_len=64, test_len=64):
     """Split the input filenames into three sets.
-    If val_set and test_set are both zero, split the input 8:1:1
+    If val_set and test_set are empty, the sets will be of length val_len and test_len.
     If val_set and test_set have unequal length, match the two lengths
     Add additional files not in val or test sets into training set
-
     """
     if len(val_set) is 0 and len(test_set) is 0:
-        # from math import floor
-
-        # total_samples = len(filenames)
-        # ten_percent_samples = int(floor(total_samples / 10))
-        ten_percent_samples = 64
-
-        not_train_set = [filename for filename in filenames if filename not in train_set]
+        if len(train_set) != 0:
+            not_train_set = [filename for filename in filenames
+                             if filename not in train_set]
+        else:
+            not_train_set = filenames
 
-        val_set = not_train_set[0:ten_percent_samples]
-        test_set = not_train_set[ten_percent_samples:2*ten_percent_samples]
-        train_set += not_train_set[2*ten_percent_samples:]
+        val_set = not_train_set[0:val_len]
+        test_set = not_train_set[val_len:val_len+test_len]
+        train_set += not_train_set[val_len+test_len:]
 
     else:
         """
@@ -172,69 +172,8 @@ def output_file(path, plush, output_prefix, set_name, filenames):
     f.close()
 
 
-def pause():
-    _ = input("Press <Enter> to continue...")
-
-
-def compare_filenames(path, name1, name2):
-    """ Quick function meant to check if filenames are the same
-    Intended to be used in REPL only
-    from split_dataset import compare_filenames
-
-    costar_block_stacking_v0.3_success_only_train_files.txt
-    """
-
-    path = os.path.expanduser(path)
-    if os.path.isdir(path):
-        filenames = os.listdir(path)
-    else:
-        raise ValueError('Path entered is not a path: ' + path)
-    print('Read ' + str(len(filenames)) + ' filenames in the folder')
-
-    # Read files that are success, for now
-    filenames = [filename for filename in filenames if '.success.h5f' in filename]
-    print('Selecting ' + str(len(filenames)) + ' success files')
-    pause()
-
-    file1 = get_existing_filenames(os.path.join(path, name1))
-    file2 = get_existing_filenames(os.path.join(path, name2))
-
-    print(name1 + ": " + str(len(file1)) + "; "
-          + name2 + ": " + str(len(file2)))
-
-    same = []
-    diff = []
-    if len(file1) < len(file2):
-        file1, file2 = file2, file1
-
-    for filename in file1:
-        if filename in file2:
-            same.append(filename)
-        else:
-            diff.append(filename)
-    print("same: " + str(len(same)) + "; diff: " + str(len(diff)))
-
-    return same, diff
-
-
-def main(args, root='root'):
-    # Open and get existing training/validation/test files
-    # Save the filenames to separate lists
-    # Get all the .h5f filenames in the path, compare them to the lists
-    # In training, validation, or test set -> continue
-    # Not in any set -> add to test set
-
-    # handle no pre-existing files => randomize and split files into 3 sets 8:1:1
-
-    # Get the path to
-    path = os.path.expanduser(args['path'])
-    if os.path.isdir(path):
-        filenames = os.listdir(path)
-    else:
-        raise ValueError('Path entered is not a path: ' + path)
-    print('Read ' + str(len(filenames)) + ' filenames in the folder')
-
-    # Read files that are success, for now
+def split_success_only(args, filenames, path):
+    # Read files that are success
     filenames = [filename for filename in filenames if '.success.h5f' in filename]
     print('Selecting ' + str(len(filenames)) + ' success files')
     args['output_name'] += '_success_only'
@@ -307,9 +246,215 @@ def main(args, root='root'):
         raise Exception("Something is wrong!")
 
     # Write the output files
-    output_file(path, args['plush'], args['output_name'], 'train', train_set)
-    output_file(path, args['plush'], args['output_name'], 'val', val_set)
-    output_file(path, args['plush'], args['output_name'], 'test', test_set)
+    output_file(path, args['plush'], args['output_name'], 'success_only_train', train_set)
+    output_file(path, args['plush'], args['output_name'], 'success_only_val', val_set)
+    output_file(path, args['plush'], args['output_name'], 'success_only_test', test_set)
+
+
+def split_all(args, filenames, path):
+    # Get the success, failure, and error filenames with nonzero frames
+    success_filenames, failure_filenames, error_filenames = count_nonzero_files(filenames)
+    pause()  # DEBUG
+
+    # Calculate the percentage of success, failure and error
+    total_file_count = len(filenames)
+    if len(success_filenames) + len(failure_filenames) + \
+       len(error_filenames) != total_file_count:
+        raise Exception("The numbers don't add up!")
+    success_ratio = len(success_filenames) / total_file_count
+    failure_ratio = len(failure_filenames) / total_file_count
+    error_ratio = len(error_filenames) / total_file_count
+    print("Ratios: {0:.2f} success, {1:.2f} failure, {2:.2f} error".format(
+            success_ratio, failure_ratio, error_ratio))
+    pause()  # DEBUG
+
+    # Read the train/val set from success_only subset
+    if args['plush']:
+        default_name = 'costar_plush_block_stacking_v0.4_success_only_'
+    else:
+        default_name = 'costar_block_stacking_v0.4_success_only_'
+    # Read filenames for the previous training set
+    if not args['train']:
+        # Look for v0.4 success only train filenames
+        print('No train file is specified. Trying to open v0.4 success only...')
+        pre_existing_set_file = path + default_name + 'train_files.txt'
+    else:
+        pre_existing_set_file = path + args['train']
+
+    if not os.path.isfile(pre_existing_set_file):
+        raise ValueError(
+            'Pre-existing training file is not a file: ' +
+            pre_existing_set_file)
+    success_train_len = len(get_existing_filenames(pre_existing_set_file))
+
+    # Read filenames for the previous validation set
+    if not args['val']:
+        # Look for v0.4 success only val filenames
+        print('No val file is specified. Trying to open v0.4 success only...')
+        pre_existing_set_file = path + default_name + 'val_files.txt'
+    else:
+        pre_existing_set_file = path + args['val']
+
+    if not os.path.isfile(pre_existing_set_file):
+        raise ValueError(
+            'Pre-existing validating file is not a file: ' +
+            pre_existing_set_file)
+    success_val_len = len(get_existing_filenames(pre_existing_set_file))
+
+    # Read filenames for the previous test set
+    if not args['test']:
+        # Look for v0.4 success only test filenames
+        print('No test file is specified. Trying to open v0.4 success only...')
+        pre_existing_set_file = path + default_name + 'test_files.txt'
+    else:
+        pre_existing_set_file = path + args['test']
+
+    if not os.path.isfile(pre_existing_set_file):
+        raise ValueError(
+            'Pre-existing testing file is not a file: ' +
+            pre_existing_set_file)
+    success_test_len = len(get_existing_filenames(pre_existing_set_file))
+
+    # Calculate set size for failure and error, based on success_only subset
+    multiplier_failure = len(failure_filenames)/len(success_filenames)
+    failure_val_len, failure_test_len = \
+        int(round(success_val_len*multiplier_failure)), \
+        int(round(success_test_len*multiplier_failure))
+    failure_train_len = len(failure_filenames) - (failure_val_len + failure_test_len)
+    multiplier_error = len(error_filenames)/len(success_filenames)
+    error_val_len, error_test_len = \
+        int(round(success_val_len*multiplier_error)), \
+        int(round(success_test_len*multiplier_error))
+    error_train_len = len(error_filenames) - (error_val_len + error_test_len)
+    print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
+            success_train_len, success_val_len, success_test_len))
+    print("Length for failure sets: {0} train, {1} val, {2} test".format(
+            failure_train_len, failure_val_len, failure_test_len))
+    print("Length for error sets: {0} train, {1} val, {2} test".format(
+            error_train_len, error_val_len, error_test_len))
+    pause()
+    
+    # Split the dataset for failure and error
+    fail_train_set, fail_val_set, fail_test_set = \
+        split_dataset(failure_filenames, [], [], [])
+    err_train_set,  err_val_set,  err_test_set = \
+        split_dataset(error_filenames, [], [], [])
+
+    # Write the output files
+    output_file(path, args['plush'], args['output_name'], 'failure_only_train', fail_train_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_only_val', fail_val_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_only_test', fail_test_set)
+    output_file(path, args['plush'], args['output_name'], 'error_only_train', err_train_set)
+    output_file(path, args['plush'], args['output_name'], 'error_only_val', err_val_set)
+    output_file(path, args['plush'], args['output_name'], 'error_only_test', err_test_set)
+
+
+def count_nonzero_files(filenames):
+    '''
+    Open the files and check frame count. Skip files with 0 frame.
+
+    :param filenames: .h5f filenames in the folder
+    :return: Lists of success/failure/error filenames with nonzero frames
+    '''
+    import h5py  # Needs h5py to open the files and check frame count
+    # TODO: Write total frames into csv file as a new column
+
+    # Open the files to check frame count. Skip files with 0 frame.
+    error_filenames = []
+    failure_filenames = []
+    success_filenames = []
+    for filename in filenames:
+        with h5py.File(filename, 'r') as data:
+            total_frames = len(data['image'])
+            if total_frames == 0:  # Skip files with 0 frame
+                print('Skipping %s since it has 0 image frame' % filename)
+                continue
+
+            if 'error' in filename:
+                error_filenames += filename
+            elif 'failure' in filename:
+                failure_filenames += filename
+            # else:  # success
+            #     success_filenames += filename
+            elif 'success' in filename:
+                success_filenames += filename
+            else:  # BUG: Sanity check for debugging
+                raise Exception('Somthing is wrong!')
+
+    print("Counted {0} success files, {1} failure files, and {2} error files.".format(
+            len(success_filenames), len(failure_filenames), len(error_filenames)))
+
+    return success_filenames, failure_filenames, error_filenames
+
+
+def pause():
+    _ = input("Press <Enter> to continue...")
+
+
+def compare_filenames(path, name1, name2):
+    """ Quick function meant to check if filenames are the same
+    Intended to be used in REPL only
+    from split_dataset import compare_filenames
+
+    costar_block_stacking_v0.3_success_only_train_files.txt
+    """
+
+    path = os.path.expanduser(path)
+    if os.path.isdir(path):
+        filenames = os.listdir(path)
+    else:
+        raise ValueError('Path entered is not a path: ' + path)
+    print('Read ' + str(len(filenames)) + ' filenames in the folder')
+
+    # Read files that are success, for now
+    filenames = [filename for filename in filenames if '.success.h5f' in filename]
+    print('Selecting ' + str(len(filenames)) + ' success files')
+    pause()
+
+    file1 = get_existing_filenames(os.path.join(path, name1))
+    file2 = get_existing_filenames(os.path.join(path, name2))
+
+    print(name1 + ": " + str(len(file1)) + "; "
+          + name2 + ": " + str(len(file2)))
+
+    same = []
+    diff = []
+    if len(file1) < len(file2):
+        file1, file2 = file2, file1
+
+    for filename in file1:
+        if filename in file2:
+            same.append(filename)
+        else:
+            diff.append(filename)
+    print("same: " + str(len(same)) + "; diff: " + str(len(diff)))
+
+    return same, diff
+
+
+def main(args, root='root'):
+    # Open and get existing training/validation/test files
+    # Save the filenames to separate lists
+    # Get all the .h5f filenames in the path, compare them to the lists
+    # In training, validation, or test set -> continue
+    # Not in any set -> add to test set
+
+    # handle no pre-existing files => randomize and split files into 3 sets 8:1:1
+
+    # Get the path to
+    path = os.path.expanduser(args['path'])
+    if os.path.isdir(path):
+        filenames = os.listdir(path)
+    else:
+        raise ValueError('Path entered is not a path: ' + path)
+    print('Read ' + str(len(filenames)) + ' filenames in the folder')
+
+    if args['success_only'] and args['split_all']:
+        raise ValueError('success_only and split_all are mutually exclusive. Please choose just one.')
+    elif args['success_only']:
+        split_success_only(args, filenames, path)
+    elif args['split_all']:
+        split_all(args, filenames, path)
 
 
 if __name__ == '__main__':

From 0d0bf811ffe649fb98acf9444c08467049e7d169 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 9 Oct 2018 21:54:43 -0400
Subject: [PATCH 10/36] Implement dataset splitting for error only and failure
 only subset

---
 .../costar_block_stacking_split_dataset.py    | 118 ++++++++++++------
 1 file changed, 78 insertions(+), 40 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 5e95197dd..cc4583ec4 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -18,6 +18,7 @@
 import os
 
 
+
 def _parse_args():
     parser = argparse.ArgumentParser(description=
         'Splits dataset into train, validation and test sets.'
@@ -36,9 +37,9 @@ def _parse_args():
     #                     help='skip attempts that are both failures and contain errors')
     parser.add_argument("--success_only", action='store_true', default=False,
                         help='only visit stacking data labeled as successful')
-    parser.add_ardument("--split_all", action='store_true', default=False,
-                        help='split all datasets into success, failure, and error sets.'
-                             'requires train/val/test from success_only subset')
+    parser.add_argument("--split_all", action='store_true', default=False,
+                        help='Split all datasets into success, failure, and error sets. '
+                             'Requires train/val/test from success_only subset')
     parser.add_argument("--plush", action='store_true', default=False,
                         help='processing plush attempts')
     parser.add_argument("--train", type=str, default='',
@@ -80,7 +81,7 @@ def get_existing_filenames(path_to_file):
     f.close()
 
     print('Read ' + str(len(filenames)) + ' filenames from ' + path_to_file)
-    pause()  # DEBUG
+    # pause()  # DEBUG
     return filenames
 
 
@@ -155,7 +156,7 @@ def output_file(path, plush, output_prefix, set_name, filenames):
     print('Writing ' + path + output_filename)
 
     f = open(os.path.join(path, output_filename), 'w')
-    print(f)
+    # print(f)
 
     if plush:
         folder = 'blocks_with_plush_toy/'
@@ -176,7 +177,6 @@ def split_success_only(args, filenames, path):
     # Read files that are success
     filenames = [filename for filename in filenames if '.success.h5f' in filename]
     print('Selecting ' + str(len(filenames)) + ' success files')
-    args['output_name'] += '_success_only'
     pause()
 
     # Read filenames for the previous training set
@@ -253,19 +253,16 @@ def split_success_only(args, filenames, path):
 
 def split_all(args, filenames, path):
     # Get the success, failure, and error filenames with nonzero frames
-    success_filenames, failure_filenames, error_filenames = count_nonzero_files(filenames)
+    success_filenames, failure_filenames, error_filenames = count_nonzero_files(path, filenames)
     pause()  # DEBUG
 
     # Calculate the percentage of success, failure and error
-    total_file_count = len(filenames)
-    if len(success_filenames) + len(failure_filenames) + \
-       len(error_filenames) != total_file_count:
-        raise Exception("The numbers don't add up!")
+    total_file_count = len(success_filenames) + len(failure_filenames) + len(error_filenames)
     success_ratio = len(success_filenames) / total_file_count
     failure_ratio = len(failure_filenames) / total_file_count
     error_ratio = len(error_filenames) / total_file_count
-    print("Ratios: {0:.2f} success, {1:.2f} failure, {2:.2f} error".format(
-            success_ratio, failure_ratio, error_ratio))
+    print("Ratios: {:.2f}% success, {:.2f}% failure, {:.2f}% error".format(
+            success_ratio*100, failure_ratio*100, error_ratio*100))
     pause()  # DEBUG
 
     # Read the train/val set from success_only subset
@@ -317,14 +314,12 @@ def split_all(args, filenames, path):
 
     # Calculate set size for failure and error, based on success_only subset
     multiplier_failure = len(failure_filenames)/len(success_filenames)
-    failure_val_len, failure_test_len = \
-        int(round(success_val_len*multiplier_failure)), \
-        int(round(success_test_len*multiplier_failure))
+    failure_val_len = int(round(success_val_len*multiplier_failure))
+    failure_test_len = int(round(success_test_len*multiplier_failure))
     failure_train_len = len(failure_filenames) - (failure_val_len + failure_test_len)
     multiplier_error = len(error_filenames)/len(success_filenames)
-    error_val_len, error_test_len = \
-        int(round(success_val_len*multiplier_error)), \
-        int(round(success_test_len*multiplier_error))
+    error_val_len = int(round(success_val_len*multiplier_error))
+    error_test_len = int(round(success_test_len*multiplier_error))
     error_train_len = len(error_filenames) - (error_val_len + error_test_len)
     print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
             success_train_len, success_val_len, success_test_len))
@@ -334,11 +329,36 @@ def split_all(args, filenames, path):
             error_train_len, error_val_len, error_test_len))
     pause()
     
+    # Randomize the filenames
+    from random import shuffle
+    shuffle(failure_filenames)
+    shuffle(error_filenames)
+
     # Split the dataset for failure and error
     fail_train_set, fail_val_set, fail_test_set = \
-        split_dataset(failure_filenames, [], [], [])
+        split_dataset(failure_filenames, [], [], [], failure_val_len, failure_test_len)
     err_train_set,  err_val_set,  err_test_set = \
-        split_dataset(error_filenames, [], [], [])
+        split_dataset(error_filenames, [], [], [], error_val_len, error_test_len)
+
+    for i in fail_val_set:
+        if i in fail_train_set:
+            print("fail: val attempt in train set! %s" % i)
+            pause()
+    for i in fail_test_set:
+        if i in fail_train_set:
+            print("fail: test attempt in train set! %s" % i)
+            pause()
+
+    for i in err_val_set:
+        if i in err_train_set:
+            print("err: val attempt in train set! %s" % i)
+            pause()
+
+    for i in err_test_set:
+        if i in err_train_set:
+            print("err: test attempt in train set! %s" % i)
+            pause()
+    pause()
 
     # Write the output files
     output_file(path, args['plush'], args['output_name'], 'failure_only_train', fail_train_set)
@@ -349,7 +369,7 @@ def split_all(args, filenames, path):
     output_file(path, args['plush'], args['output_name'], 'error_only_test', err_test_set)
 
 
-def count_nonzero_files(filenames):
+def count_nonzero_files(path, filenames):
     '''
     Open the files and check frame count. Skip files with 0 frame.
 
@@ -357,32 +377,48 @@ def count_nonzero_files(filenames):
     :return: Lists of success/failure/error filenames with nonzero frames
     '''
     import h5py  # Needs h5py to open the files and check frame count
+    import sys
+    import traceback
     # TODO: Write total frames into csv file as a new column
 
     # Open the files to check frame count. Skip files with 0 frame.
     error_filenames = []
     failure_filenames = []
     success_filenames = []
+    skip_count = 0
+    i = 0
     for filename in filenames:
-        with h5py.File(filename, 'r') as data:
-            total_frames = len(data['image'])
-            if total_frames == 0:  # Skip files with 0 frame
-                print('Skipping %s since it has 0 image frame' % filename)
-                continue
+        # print(filename)
+        i += 1
+        try:
+            with h5py.File(os.path.join(path, filename), 'r') as data:
+                try:
+                    total_frames = len(data['image'])
+                except KeyError as e:
+                    print('Skipping %s for KeyError' % filename)
+                    continue
 
-            if 'error' in filename:
-                error_filenames += filename
-            elif 'failure' in filename:
-                failure_filenames += filename
-            # else:  # success
-            #     success_filenames += filename
-            elif 'success' in filename:
-                success_filenames += filename
-            else:  # BUG: Sanity check for debugging
-                raise Exception('Somthing is wrong!')
-
-    print("Counted {0} success files, {1} failure files, and {2} error files.".format(
+                if total_frames == 0:  # Skip files with 0 frame
+                    # print('Skipping %s since it has 0 image frame' % filename)
+                    skip_count += 1
+                    continue
+
+                if 'error' in filename:
+                    error_filenames += [filename]
+                elif 'failure' in filename:
+                    failure_filenames += [filename]
+                # else:  # success
+                #     success_filenames += filename
+                elif 'success' in filename:
+                    success_filenames += [filename]
+                else:  # BUG: Sanity check for debugging
+                    raise Exception('Somthing is wrong!')
+        except IOError as ex:
+            print('Skipping %s for IO error' % filename)
+
+    print("Counted {:d} success files, {:d} failure files, and {:d} error files.".format(
             len(success_filenames), len(failure_filenames), len(error_filenames)))
+    print("Skipped %d files since they have 0 image frame" % skip_count)
 
     return success_filenames, failure_filenames, error_filenames
 
@@ -447,7 +483,9 @@ def main(args, root='root'):
         filenames = os.listdir(path)
     else:
         raise ValueError('Path entered is not a path: ' + path)
-    print('Read ' + str(len(filenames)) + ' filenames in the folder')
+
+    filenames = [filename for filename in filenames if '.h5f' in filename]
+    print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')
 
     if args['success_only'] and args['split_all']:
         raise ValueError('success_only and split_all are mutually exclusive. Please choose just one.')

From 6c4fd0e407d23fd5c0a287dae1bc67a5c03603e2 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 9 Oct 2018 22:32:02 -0400
Subject: [PATCH 11/36] Error is also a type of failure

---
 .../costar_block_stacking_split_dataset.py    | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index cc4583ec4..d5a729fd7 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -261,6 +261,7 @@ def split_all(args, filenames, path):
     success_ratio = len(success_filenames) / total_file_count
     failure_ratio = len(failure_filenames) / total_file_count
     error_ratio = len(error_filenames) / total_file_count
+    print("Total: %d files" % total_file_count)
     print("Ratios: {:.2f}% success, {:.2f}% failure, {:.2f}% error".format(
             success_ratio*100, failure_ratio*100, error_ratio*100))
     pause()  # DEBUG
@@ -324,6 +325,10 @@ def split_all(args, filenames, path):
     print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
             success_train_len, success_val_len, success_test_len))
     print("Length for failure sets: {0} train, {1} val, {2} test".format(
+            failure_train_len + error_train_len, 
+            failure_val_len + error_val_len, 
+            failure_test_len + error_test_len))
+    print("Length for failure (no error) sets: {0} train, {1} val, {2} test".format(
             failure_train_len, failure_val_len, failure_test_len))
     print("Length for error sets: {0} train, {1} val, {2} test".format(
             error_train_len, error_val_len, error_test_len))
@@ -361,13 +366,20 @@ def split_all(args, filenames, path):
     pause()
 
     # Write the output files
-    output_file(path, args['plush'], args['output_name'], 'failure_only_train', fail_train_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_only_val', fail_val_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_only_test', fail_test_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_train', fail_train_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_val', fail_val_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_test', fail_test_set)
     output_file(path, args['plush'], args['output_name'], 'error_only_train', err_train_set)
     output_file(path, args['plush'], args['output_name'], 'error_only_val', err_val_set)
     output_file(path, args['plush'], args['output_name'], 'error_only_test', err_test_set)
 
+    # Error is also a type of failure!
+    fail_train_set += err_train_set
+    fail_val_set += err_val_set
+    fail_test_set += err_test_set
+    output_file(path, args['plush'], args['output_name'], 'failure_only_train', fail_train_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_only_val', fail_val_set)
+    output_file(path, args['plush'], args['output_name'], 'failure_only_test', fail_test_set)
 
 def count_nonzero_files(path, filenames):
     '''
@@ -407,12 +419,12 @@ def count_nonzero_files(path, filenames):
                     error_filenames += [filename]
                 elif 'failure' in filename:
                     failure_filenames += [filename]
-                # else:  # success
-                #     success_filenames += filename
-                elif 'success' in filename:
+                else:  # success
                     success_filenames += [filename]
-                else:  # BUG: Sanity check for debugging
-                    raise Exception('Somthing is wrong!')
+                # elif 'success' in filename:
+                #     success_filenames += [filename]
+                # else:  # BUG: Sanity check for debugging
+                #     raise Exception('Somthing is wrong!')
         except IOError as ex:
             print('Skipping %s for IO error' % filename)
 

From 87beadd7d7881022c8cd8292030109b52e67decd Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 9 Oct 2018 22:35:40 -0400
Subject: [PATCH 12/36] Modify output label

---
 costar_hyper/costar_block_stacking_split_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index d5a729fd7..838f83d87 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -262,7 +262,7 @@ def split_all(args, filenames, path):
     failure_ratio = len(failure_filenames) / total_file_count
     error_ratio = len(error_filenames) / total_file_count
     print("Total: %d files" % total_file_count)
-    print("Ratios: {:.2f}% success, {:.2f}% failure, {:.2f}% error".format(
+    print("Ratios: {:.2f}% success, {:.2f}% failure(no error), {:.2f}% error".format(
             success_ratio*100, failure_ratio*100, error_ratio*100))
     pause()  # DEBUG
 

From ba29f96bd9d148c3c817a55ee36e2ae0339fd980 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Wed, 10 Oct 2018 15:48:33 -0400
Subject: [PATCH 13/36] Modify output file names

---
 .../costar_block_stacking_split_dataset.py    | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 838f83d87..322b2c809 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -366,20 +366,21 @@ def split_all(args, filenames, path):
     pause()
 
     # Write the output files
-    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_train', fail_train_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_val', fail_val_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_no_error_only_test', fail_test_set)
-    output_file(path, args['plush'], args['output_name'], 'error_only_train', err_train_set)
-    output_file(path, args['plush'], args['output_name'], 'error_only_val', err_val_set)
-    output_file(path, args['plush'], args['output_name'], 'error_only_test', err_test_set)
+    output_file(path, args['plush'], args['output_name'], 'task_failure_only_train', fail_train_set)
+    output_file(path, args['plush'], args['output_name'], 'task_failure_only_val', fail_val_set)
+    output_file(path, args['plush'], args['output_name'], 'task_failure_only_test', fail_test_set)
+    output_file(path, args['plush'], args['output_name'], 'error_failure_only_train', err_train_set)
+    output_file(path, args['plush'], args['output_name'], 'error_failure_only_val', err_val_set)
+    output_file(path, args['plush'], args['output_name'], 'error_failure_only_test', err_test_set)
 
     # Error is also a type of failure!
     fail_train_set += err_train_set
     fail_val_set += err_val_set
     fail_test_set += err_test_set
-    output_file(path, args['plush'], args['output_name'], 'failure_only_train', fail_train_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_only_val', fail_val_set)
-    output_file(path, args['plush'], args['output_name'], 'failure_only_test', fail_test_set)
+    output_file(path, args['plush'], args['output_name'], 'all_failure_only_train', fail_train_set)
+    output_file(path, args['plush'], args['output_name'], 'all_failure_only_val', fail_val_set)
+    output_file(path, args['plush'], args['output_name'], 'all_failure_only_test', fail_test_set)
+
 
 def count_nonzero_files(path, filenames):
     '''
@@ -389,8 +390,6 @@ def count_nonzero_files(path, filenames):
     :return: Lists of success/failure/error filenames with nonzero frames
     '''
     import h5py  # Needs h5py to open the files and check frame count
-    import sys
-    import traceback
     # TODO: Write total frames into csv file as a new column
 
     # Open the files to check frame count. Skip files with 0 frame.

From 19dcafce6e7dca73dce462ad9371a38ac133a92d Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Fri, 12 Oct 2018 18:02:15 -0400
Subject: [PATCH 14/36] Increase readability on the help text. Parameterize the
 hard-coded variable.

---
 .../scripts/view_convert_dataset.py           | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/ctp_integration/scripts/view_convert_dataset.py b/ctp_integration/scripts/view_convert_dataset.py
index 7a4aa60a1..cd65bd4ff 100755
--- a/ctp_integration/scripts/view_convert_dataset.py
+++ b/ctp_integration/scripts/view_convert_dataset.py
@@ -209,7 +209,12 @@ def _parse_args():
                              """)
     parser.add_argument("--write", action='store_true', help='Actually write out the changes specified in preprocess_inplace, or label_correction.')
     parser.add_argument("--action_label_check", action='store_true', default=False,
-                        help='check action labels for each goal step')
+                        help='''To be used with flag --goal-to-jpeg. Check the action label strings in each file for consistency.
+                                1. Output the image frame at the goals for each file to folder action_label_check/ for manual inspection of
+                                   the action label and the actual action that the robot executes. The filename for the jpeg images contain
+                                   the action label that the robot was supposed to do at the time frame.
+                                2. Checks the action labels stored in each file for inconsistent ordering.
+                             ''')
 
     return vars(parser.parse_args())
 
@@ -845,18 +850,19 @@ def generate_gripper_action_label(data):
     return gripper_action_label, gripper_action_goal_idx
 
 
-def action_label_check(action_labels):
-    stored_action_labels = \
-       [b'place_green_on_yellow', b'move_to_home', b'place_blue_on_yellowred', b'place_yellow_on_red',
-        b'place_blue_on_red', b'grab_blue', b'place_red_on_blueyellow', b'place_green_on_redyellow', 
-        b'place_red_on_yellow', b'place_green_on_blueyellow', b'place_red_on_greenblue', b'place_blue_on_green',
-        b'place_blue_on_redgreen',b'place_yellow_on_greenblue', b'place_yellow_on_blue', b'place_blue_on_greenyellow',
-        b'place_blue_on_yellowgreen', b'place_blue_on_greenred', b'place_yellow_on_redgreen', b'grab_yellow', 
-        b'place_red_on_greenyellow', b'grab_green', b'place_red_on_green', b'place_yellow_on_bluered', 
-        b'place_yellow_on_green', b'place_green_on_blue', b'place_yellow_on_bluegreen', b'place_blue_on_redyellow', 
-        b'place_red_on_blue', b'place_red_on_yellowgreen', b'place_yellow_on_greenred', b'place_green_on_yellowblue',  
-        b'place_red_on_bluegreen', b'place_green_on_red', b'place_red_on_yellowblue', b'place_green_on_yellowred',
-        b'place_green_on_redblue', b'grab_red', b'place_yellow_on_redblue', b'place_green_on_bluered', b'place_blue_on_yellow']
+def action_label_check(action_labels, stored_action_labels=None):
+    if stored_action_labels is None:
+        stored_action_labels = [
+            b'place_green_on_yellow', b'move_to_home', b'place_blue_on_yellowred', b'place_yellow_on_red',
+            b'place_blue_on_red', b'grab_blue', b'place_red_on_blueyellow', b'place_green_on_redyellow', 
+            b'place_red_on_yellow', b'place_green_on_blueyellow', b'place_red_on_greenblue', b'place_blue_on_green',
+            b'place_blue_on_redgreen',b'place_yellow_on_greenblue', b'place_yellow_on_blue', b'place_blue_on_greenyellow',
+            b'place_blue_on_yellowgreen', b'place_blue_on_greenred', b'place_yellow_on_redgreen', b'grab_yellow', 
+            b'place_red_on_greenyellow', b'grab_green', b'place_red_on_green', b'place_yellow_on_bluered', 
+            b'place_yellow_on_green', b'place_green_on_blue', b'place_yellow_on_bluegreen', b'place_blue_on_redyellow', 
+            b'place_red_on_blue', b'place_red_on_yellowgreen', b'place_yellow_on_greenred', b'place_green_on_yellowblue',  
+            b'place_red_on_bluegreen', b'place_green_on_red', b'place_red_on_yellowblue', b'place_green_on_yellowred',
+            b'place_green_on_redblue', b'grab_red', b'place_yellow_on_redblue', b'place_green_on_bluered', b'place_blue_on_yellow']
     
     assert len(stored_action_labels) == len(action_labels)
 

From 0444fb18719d7708a99dcfc1b9b2712c3aaebef0 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Fri, 12 Oct 2018 19:35:49 -0400
Subject: [PATCH 15/36] WIP: Add bazilion comments; split_dataset behavior
 overhaul

---
 .../costar_block_stacking_split_dataset.py    | 348 ++++++++++--------
 1 file changed, 202 insertions(+), 146 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 322b2c809..5dd5b5ff9 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -1,40 +1,42 @@
 '''
 Splits dataset into train, validation, and test sets.
-Inherits existing validation and test sets.
-New files are added into training set.
-
-flags:
-path to dataset
-train file name (optional)
-val file name (optional)
-test file name (optional)
-output name
-success_only
-ignore_failure
-ignore_success
-ignore_error
+Inherits existing validation and test sets. New files are added into training set.
+
+To split the success_only subset or to add new files to the success_only subset, call:
+
+python costar_block_stacking_split_dataset.py --path /path/to/dataset/folder\
+    --success_only (--plush) (--train train/txt/filename)                   \
+    (--val val/txt/filename) (--test test/txt/filename)                     \
+    --output_name [filename prefix for the output train/val/test filenames]
+
+To split the whole dataset, i.e. split error files and failure files into train/val/test sets,
+call the following command after the success_only subset has been split:
+
+python costar_block_stacking_split_dataset.py --path /path/to/dataset/folder     \
+    --split_all (--plush) --train success_only/train/txt/filename                \
+    --val [success_only val txt filename] --test [success_only test txt filename]\
+    --output_name [filename prefix for the output train/val/test filenames]
+
+This will output task_failure_only, error_failure_only, and all_failure_only 
+train/val/test filenames as 9 separate txt files.
+
+Author: Chia-Hung "Rexxar" Lin (rexxarchl)
+Apache License 2.0 https://www.apache.org/licenses/LICENSE-2.0
 '''
 import argparse
 import os
 
 
-
 def _parse_args():
-    parser = argparse.ArgumentParser(description=
-        'Splits dataset into train, validation and test sets.'
-        'Inherits existing validation and test sets.'
-        'New files are added into training set.'
-        'If no pre-existing sets of files are indicated, randomize and split the files'
-        ' in the folder 8:1:1 for train/val/test.')
+    parser = argparse.ArgumentParser(
+        description='Splits dataset into train, validation and test sets. '
+                    'Inherits existing validation and test sets. '
+                    'New files are added into training set. '
+                    'If no pre-existing sets of files are indicated, randomize and split '
+                    'the files in the folder 8:1:1 for train/val/test.')
     parser.add_argument("--path", type=str,
                         default=os.path.join(os.path.expanduser("~"), '.costar', 'data'),
                         help='path to dataset folder containing many files')
-    # parser.add_argument("--ignore_failure", action='store_true',
-    #                     default=False, help='skip grasp failure cases')
-    # parser.add_argument("--ignore_success", action='store_true',
-    #                     default=False, help='skip grasp success cases')
-    # parser.add_argument("--ignore_error", action='store_true', default=False,
-    #                     help='skip attempts that are both failures and contain errors')
     parser.add_argument("--success_only", action='store_true', default=False,
                         help='only visit stacking data labeled as successful')
     parser.add_argument("--split_all", action='store_true', default=False,
@@ -53,18 +55,23 @@ def _parse_args():
                         'the file is expected to be in argument `path`')
     parser.add_argument("--output_name", type=str,
                         default='costar_block_stacking_dataset', help='output file name')
+    parser.add_argument("--val_len", type=int, default=None, 
+                        help='Expected val set length')
+    parser.add_argument("--test_len", type=int, default=None, 
+                        help='Expected test set length')
     return vars(parser.parse_args())
 
 
 def extract_filename_from_url(url):
+    '''Extract the string after the last '/' in the input `url`
+    '''
     filename = url[url.rfind("/")+1:]
     return filename
 
 
 def get_existing_filenames(path_to_file):
-    """Open the file indicated by the input, and output a list of the filenames in the file.
-
-    """
+    '''Open the file indicated by the input, and output a list of the filenames in the file.
+    '''
     f = open(path_to_file, 'r')
 
     filenames = []  # A list to store the filenames in the file
@@ -85,12 +92,33 @@ def get_existing_filenames(path_to_file):
     return filenames
 
 
-def split_dataset(filenames, train_set, val_set, test_set, val_len=64, test_len=64):
-    """Split the input filenames into three sets.
+def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_len=None):
+    '''Split the input filenames into three sets.
     If val_set and test_set are empty, the sets will be of length val_len and test_len.
-    If val_set and test_set have unequal length, match the two lengths
-    Add additional files not in val or test sets into training set
-    """
+    Otherwise each set is topped up to val_len/test_len; an already longer set raises RuntimeError.
+    Files not in val or test sets are added into training set.
+
+    :param filenames: The filenames to be split into three sets.
+    :param train_set: The filenames already in the train set.
+    :param val_set: The filenames already in the val set.
+    :param test_set: The filenames already in the test set.
+    :param val_len: The expected output val set length.
+    :param test_len: The expected output test set length.
+    :return train_set, val_set, test_set: train/val/test set filenames.
+    '''
+    if len(test_set) is 0 and test_len is None:
+        raise ValueError("split_dataset: test_set is empty and no test_len is specified!")
+    if len(val_set) is 0 and val_len is None:
+        raise ValueError("split_dataset: val_set is empty and no val_len is specified!")
+
+    # If we reach here without error, either the sets are non-empty, or
+    # test_len and val_len are not None
+    if test_len is None:
+        test_len = len(test_set)
+    if val_len is None:
+        val_len = len(val_set)
+
+    # No val set and test set is provided, create new val/test sets
     if len(val_set) is 0 and len(test_set) is 0:
         if len(train_set) != 0:
             not_train_set = [filename for filename in filenames
@@ -101,57 +129,62 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=64, test_len=
         val_set = not_train_set[0:val_len]
         test_set = not_train_set[val_len:val_len+test_len]
         train_set += not_train_set[val_len+test_len:]
-
     else:
-        """
-        for filename in filenames:
-            if (filename in train_set) \
-                or (filename in val_set) \
-                or (filename in test_set):
-                continue
-            else:
-                if filename is '':
-                    print("A filename is empty!")
-                    pause()
-                    continue
-
-                # Add the filenames to val and test until they are of equal length
-                if len(val_set) < len(test_set):
-                    val_set.append(filename)
-                elif len(val_set) > len(test_set):
-                    test_set.append(filename)
-
-                # Dump the rest into training set
-                else:
-                    train_set.append(filename)
-        """
-
         # Select filenames not in test or val set
         not_val_or_test_set = \
             [filename for filename in filenames if
              filename not in val_set and filename not in test_set]
 
-        # Check that val and test set are of the same size
-        val_test_len_diff = len(val_set) - len(test_set)
+        # Check if expected length and current length for val set are different
+        len_diff = val_len - len(val_set)
+        if len_diff > 0:
+            # Add additional files to val set
+            val_set += not_val_or_test_set[0:len_diff]
+            not_val_or_test_set = not_val_or_test_set[len_diff:]
+            print("Expected val set length: {}, current val set length: {}".format(
+                    val_len, len(val_set)))
+            print("Added %d files to val set." % len_diff)
+
+            print("Unusual behavior. Do you really want to add files to val set?")
+            pause()
+        elif len_diff < 0:
+            print("Expected val set length: {}, current val set length: {}".format(
+                    val_len, len(val_set)))
+            raise RuntimeError("Expected length is smaller than current length!")
+
+        # Do the same check for test set
+        len_diff = test_len - len(test_set)
+        if len_diff > 0:
+            # Add additional files to test set
+            test_set += not_val_or_test_set[0:len_diff]
+            not_val_or_test_set = not_val_or_test_set[len_diff:]
+            print("Expected test set length: {}, current test set length: {}".format(
+                    val_len, len(val_set)))
+            print("Added %d files to test set." % len_diff)
+
+            print("Unusual behavior. Do you really want to add files to test set?")
+            pause()
+        elif len_diff < 0:
+            print("Expected test set length: {}, current test set length: {}".format(
+                    val_len, len(val_set)))
+            raise RuntimeError("Expected length is smaller than current length!")
 
-        if val_test_len_diff == 0:
-            # val and test set same size => everything belongs to train set
-            train_set = not_val_or_test_set
-        else:
-            # add filenames to val and test until they are of equal length
-            if val_test_len_diff > 0:
-                # val set is longer
-                test_set.extend(not_val_or_test_set[0:val_test_len_diff])
-                train_set = not_val_or_test_set[val_test_len_diff:]
-            else:
-                # test set is longer
-                val_set.extend(not_val_or_test_set[0:-val_test_len_diff])
-                train_set = not_val_or_test_set[-val_test_len_diff:]
+        # Dump the rest of the files into train set
+        train_set = not_val_or_test_set
 
     return train_set, val_set, test_set
 
 
 def output_file(path, plush, output_prefix, set_name, filenames):
+    '''Output the filenames as a txt file. 
+    Automatically adds appropriate keras path for the filenames.
+
+    :param path: The path to store the output txt file.
+    :param plush: A bool stating whether the program is processing plush subset.
+    :param output_prefix: Prefix of the output txt file.
+    :param set_name: train/val/test, to be added to the output filename.
+    :param filenames: The filenames to be written in the txt file.
+    '''
     output_filename = output_prefix + '_' + set_name + '_files.txt'
     print('Writing ' + path + output_filename)
 
@@ -173,17 +206,31 @@ def output_file(path, plush, output_prefix, set_name, filenames):
     f.close()
 
 
-def split_success_only(args, filenames, path):
+def split_success_only(
+        filenames, path, plush, train_txt, val_txt, test_txt, output_name,
+        val_len=None, test_len=None):
+    '''Splits success files into success_only train/val/test txt files.
+
+    :param filenames: A list of .h5f filenames under the path.
+    :param path: Path to the folder with the .h5f files.
+    :param plush: A bool indicating whether the program is processing plush subset.
+    :param train_txt: Filename to a pre-existing train txt file.
+    :param val_txt: Filename to a pre-existing val txt file.
+    :param test_txt: Filename to a pre-existing test txt file.
+    :param val_len: Expected output val set length.
+    :param test_len: Expected output test set length.
+    :param output_name: Filename prefix to the output train/val/test txt files.
+    '''
     # Read files that are success
     filenames = [filename for filename in filenames if '.success.h5f' in filename]
     print('Selecting ' + str(len(filenames)) + ' success files')
     pause()
 
     # Read filenames for the previous training set
-    if not args['train']:
+    if not train_txt:
         train_set = []
     else:
-        pre_existing_set_file = path + args['train']
+        pre_existing_set_file = path + train_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
                 'Pre-existing training file is not a file: ' +
@@ -192,10 +239,10 @@ def split_success_only(args, filenames, path):
         train_set = get_existing_filenames(pre_existing_set_file)
 
     # Read filenames for the previous validation set
-    if not args['val']:
+    if not val_txt:
         val_set = []
     else:
-        pre_existing_set_file = path + args['val']
+        pre_existing_set_file = path + val_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
                 'Pre-existing validating file is not a file: ' +
@@ -204,10 +251,10 @@ def split_success_only(args, filenames, path):
         val_set = get_existing_filenames(pre_existing_set_file)
 
     # Read filenames for the previous test set
-    if not args['test']:
+    if not test_txt:
         test_set = []
     else:
-        pre_existing_set_file = path + args['test']
+        pre_existing_set_file = path + test_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
                 'Pre-existing testing file is not a file: ' +
@@ -226,7 +273,8 @@ def split_success_only(args, filenames, path):
     shuffle(filenames)
 
     # Split the dataset
-    train_set, val_set, test_set = split_dataset(filenames, train_set, val_set, test_set)
+    train_set, val_set, test_set = split_dataset(
+        filenames, train_set, val_set, test_set, val_len, test_len)
 
     for i in val_set:
         if i in train_set:
@@ -246,18 +294,39 @@ def split_success_only(args, filenames, path):
         raise Exception("Something is wrong!")
 
     # Write the output files
-    output_file(path, args['plush'], args['output_name'], 'success_only_train', train_set)
-    output_file(path, args['plush'], args['output_name'], 'success_only_val', val_set)
-    output_file(path, args['plush'], args['output_name'], 'success_only_test', test_set)
-
-
-def split_all(args, filenames, path):
+    output_file(path, plush, output_name, 'success_only_train', train_set)
+    output_file(path, plush, output_name, 'success_only_val', val_set)
+    output_file(path, plush, output_name, 'success_only_test', test_set)
+
+
+def split_all(
+        filenames, path, plush, train_txt, val_txt, test_txt, output_name,
+        val_len=None, test_len=None):
+    '''Splits failure files into all_failure_only, task_failure_only and
+    error_failure_only subsets.
+    1. Open all filenames with h5py to only count the files that contain images
+    2. Calculate success:failure:error ratios
+    3. Refer to pre-existing success_only train/val/test txt file counts and output
+       train/val/test txt files according to the calculated success:failure:error ratio.
+
+    :param filenames: A list of .h5f filenames under the path.
+    :param path: Path to the folder with the .h5f files.
+    :param plush: A bool indicating whether the program is processing plush subset.
+    :param train_txt: Filename to success_only train txt file.
+    :param val_txt: Filename to success_only val txt file.
+    :param test_txt: Filename to success_only test txt file.
+    :param val_len: Expected output val set length.
+    :param test_len: Expected output test set length.
+    :param output_name: Filename prefix to the output train/val/test txt files.
+    '''
     # Get the success, failure, and error filenames with nonzero frames
-    success_filenames, failure_filenames, error_filenames = count_nonzero_files(path, filenames)
+    success_filenames, failure_filenames, error_filenames = count_files_containing_images(
+                                                                path, filenames)
     pause()  # DEBUG
 
     # Calculate the percentage of success, failure and error
-    total_file_count = len(success_filenames) + len(failure_filenames) + len(error_filenames)
+    total_file_count = (
+        len(success_filenames) + len(failure_filenames) + len(error_filenames))
     success_ratio = len(success_filenames) / total_file_count
     failure_ratio = len(failure_filenames) / total_file_count
     error_ratio = len(error_filenames) / total_file_count
@@ -267,17 +336,17 @@ def split_all(args, filenames, path):
     pause()  # DEBUG
 
     # Read the train/val set from success_only subset
-    if args['plush']:
+    if plush:
         default_name = 'costar_plush_block_stacking_v0.4_success_only_'
     else:
         default_name = 'costar_block_stacking_v0.4_success_only_'
     # Read filenames for the previous training set
-    if not args['train']:
+    if not train_txt:
         # Look for v0.4 success only train filenames
         print('No train file is specified. Trying to open v0.4 success only...')
         pre_existing_set_file = path + default_name + 'train_files.txt'
     else:
-        pre_existing_set_file = path + args['train']
+        pre_existing_set_file = path + train_txt
 
     if not os.path.isfile(pre_existing_set_file):
         raise ValueError(
@@ -286,12 +355,12 @@ def split_all(args, filenames, path):
     success_train_len = len(get_existing_filenames(pre_existing_set_file))
 
     # Read filenames for the previous validation set
-    if not args['val']:
+    if not val_txt:
         # Look for v0.4 success only val filenames
         print('No val file is specified. Trying to open v0.4 success only...')
         pre_existing_set_file = path + default_name + 'val_files.txt'
     else:
-        pre_existing_set_file = path + args['val']
+        pre_existing_set_file = path + val_txt
 
     if not os.path.isfile(pre_existing_set_file):
         raise ValueError(
@@ -300,12 +369,12 @@ def split_all(args, filenames, path):
     success_val_len = len(get_existing_filenames(pre_existing_set_file))
 
     # Read filenames for the previous test set
-    if not args['test']:
+    if not test_txt:
         # Look for v0.4 success only train filenames
         print('No test file is specified. Trying to open v0.4 success only...')
         pre_existing_set_file = path + default_name + 'test_files.txt'
     else:
-        pre_existing_set_file = path + args['test']
+        pre_existing_set_file = path + test_txt
 
     if not os.path.isfile(pre_existing_set_file):
         raise ValueError(
@@ -325,8 +394,8 @@ def split_all(args, filenames, path):
     print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
             success_train_len, success_val_len, success_test_len))
     print("Length for failure sets: {0} train, {1} val, {2} test".format(
-            failure_train_len + error_train_len, 
-            failure_val_len + error_val_len, 
+            failure_train_len + error_train_len,
+            failure_val_len + error_val_len,
             failure_test_len + error_test_len))
     print("Length for failure (no error) sets: {0} train, {1} val, {2} test".format(
             failure_train_len, failure_val_len, failure_test_len))
@@ -366,25 +435,24 @@ def split_all(args, filenames, path):
     pause()
 
     # Write the output files
-    output_file(path, args['plush'], args['output_name'], 'task_failure_only_train', fail_train_set)
-    output_file(path, args['plush'], args['output_name'], 'task_failure_only_val', fail_val_set)
-    output_file(path, args['plush'], args['output_name'], 'task_failure_only_test', fail_test_set)
-    output_file(path, args['plush'], args['output_name'], 'error_failure_only_train', err_train_set)
-    output_file(path, args['plush'], args['output_name'], 'error_failure_only_val', err_val_set)
-    output_file(path, args['plush'], args['output_name'], 'error_failure_only_test', err_test_set)
-
-    # Error is also a type of failure!
+    output_file(path, plush, output_name, 'task_failure_only_train', fail_train_set)
+    output_file(path, plush, output_name, 'task_failure_only_val', fail_val_set)
+    output_file(path, plush, output_name, 'task_failure_only_test', fail_test_set)
+    output_file(path, plush, output_name, 'error_failure_only_train', err_train_set)
+    output_file(path, plush, output_name, 'error_failure_only_val', err_val_set)
+    output_file(path, plush, output_name, 'error_failure_only_test', err_test_set)
+
+    # Error is also a type of failure! Combine task failure and error failure subsets.
     fail_train_set += err_train_set
     fail_val_set += err_val_set
     fail_test_set += err_test_set
-    output_file(path, args['plush'], args['output_name'], 'all_failure_only_train', fail_train_set)
-    output_file(path, args['plush'], args['output_name'], 'all_failure_only_val', fail_val_set)
-    output_file(path, args['plush'], args['output_name'], 'all_failure_only_test', fail_test_set)
+    output_file(path, plush, output_name, 'all_failure_only_train', fail_train_set)
+    output_file(path, plush, output_name, 'all_failure_only_val', fail_val_set)
+    output_file(path, plush, output_name, 'all_failure_only_test', fail_test_set)
 
 
-def count_nonzero_files(path, filenames):
-    '''
-    Open the files and check frame count. Skip files with 0 frame.
+def count_files_containing_images(path, filenames):
+    '''Open the files and check frame count. Skip files with 0 frame.
 
     :param filenames: .h5f filenames in the folder
     :return: Lists of success/failure/error filenames with nonzero frames
@@ -418,12 +486,12 @@ def count_nonzero_files(path, filenames):
                     error_filenames += [filename]
                 elif 'failure' in filename:
                     failure_filenames += [filename]
-                else:  # success
+                elif 'success' in filename:
                     success_filenames += [filename]
-                # elif 'success' in filename:
-                #     success_filenames += [filename]
-                # else:  # BUG: Sanity check for debugging
-                #     raise Exception('Somthing is wrong!')
+                else:  # BUG: Sanity check for debugging
+                    raise Exception(
+                        'Somthing is wrong! The file does not contain `error`,'
+                        '`failure`, or `success` in the filename: %s' % filename)
         except IOError as ex:
             print('Skipping %s for IO error' % filename)
 
@@ -439,25 +507,17 @@ def pause():
 
 
 def compare_filenames(path, name1, name2):
-    """ Quick function meant to check if filenames are the same
-    Intended to be used in REPL only
-    from split_dataset import compare_filenames
-
-    costar_block_stacking_v0.3_success_only_train_files.txt
-    """
+    '''Check if filenames within two txt files are the same.
+    Example use: compare train and val files to make sure the filenames do not overlap.
+
+    :param path: Path containing two txt files to compare.
+    :param name1: Filename of a txt file to compare.
+    :param name2: Filename of a txt file to compare.
+    :return same, diff: Two lists containing the filenames that are the same or different
+                        across two txt files.
+    '''
 
     path = os.path.expanduser(path)
-    if os.path.isdir(path):
-        filenames = os.listdir(path)
-    else:
-        raise ValueError('Path entered is not a path: ' + path)
-    print('Read ' + str(len(filenames)) + ' filenames in the folder')
-
-    # Read files that are success, for now
-    filenames = [filename for filename in filenames if '.success.h5f' in filename]
-    print('Selecting ' + str(len(filenames)) + ' success files')
-    pause()
-
     file1 = get_existing_filenames(os.path.join(path, name1))
     file2 = get_existing_filenames(os.path.join(path, name2))
 
@@ -480,15 +540,6 @@ def compare_filenames(path, name1, name2):
 
 
 def main(args, root='root'):
-    # Open and get existing training/validation/test files
-    # Save the filenames to separate lists
-    # Get all the .h5f filenames in the path, compare them to the lists
-    # In training, validation, or test set -> continue
-    # Not in any set -> add to test set
-
-    # handle no pre-existing files => randomize and split files into 3 sets 8:1:1
-
-    # Get the path to
     path = os.path.expanduser(args['path'])
     if os.path.isdir(path):
         filenames = os.listdir(path)
@@ -499,11 +550,16 @@ def main(args, root='root'):
     print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')
 
     if args['success_only'] and args['split_all']:
-        raise ValueError('success_only and split_all are mutually exclusive. Please choose just one.')
+        raise ValueError('success_only and split_all are mutually exclusive. '
+                         'Please choose just one.')
     elif args['success_only']:
-        split_success_only(args, filenames, path)
+        split_success_only(
+            filenames, path, args['plush'], args['train'], args['val'],
+            args['test'], args['val_len'], args['test_len'], args['output_name'])
     elif args['split_all']:
-        split_all(args, filenames, path)
+        split_all(
+            filenames, path, args['plush'], args['train'], args['val'],
+            args['test'], args['val_len'], args['test_len'], args['output_name'])
 
 
 if __name__ == '__main__':

From 65e3f78c3d70d0be1f1f06ad70155f758e7911e4 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Sat, 13 Oct 2018 00:00:21 -0400
Subject: [PATCH 16/36] Fix a bug and add a bunch of sanity checks

---
 .../costar_block_stacking_split_dataset.py    | 79 ++++++++++++-------
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 5dd5b5ff9..f5414e5f8 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -150,7 +150,7 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
         elif len_diff < 0:
             print("Expected val set length: {}, current val set length: {}".format(
                     val_len, len(val_set)))
-            raise RuntimeError("Expected length is smaller than current length!")
+            raise RuntimeError("split_dataset: Expected val length is smaller than current length!")
 
         # Do the same check for test set
         len_diff = test_len - len(test_set)
@@ -167,7 +167,7 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
         elif len_diff < 0:
             print("Expected test set length: {}, current test set length: {}".format(
                     val_len, len(val_set)))
-            raise RuntimeError("Expected length is smaller than current length!")
+            raise RuntimeError("split_dataset: Expected test length is smaller than current length!")
 
         # Dump the rest of the files into train set
         train_set = not_val_or_test_set
@@ -233,7 +233,7 @@ def split_success_only(
         pre_existing_set_file = path + train_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
-                'Pre-existing training file is not a file: ' +
+                'split_success_only: Pre-existing training file is not a file: ' +
                 pre_existing_set_file)
 
         train_set = get_existing_filenames(pre_existing_set_file)
@@ -245,7 +245,7 @@ def split_success_only(
         pre_existing_set_file = path + val_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
-                'Pre-existing validating file is not a file: ' +
+                'split_success_only: Pre-existing validating file is not a file: ' +
                 pre_existing_set_file)
 
         val_set = get_existing_filenames(pre_existing_set_file)
@@ -257,7 +257,7 @@ def split_success_only(
         pre_existing_set_file = path + test_txt
         if not os.path.isfile(pre_existing_set_file):
             raise ValueError(
-                'Pre-existing testing file is not a file: ' +
+                'split_success_only: Pre-existing testing file is not a file: ' +
                 pre_existing_set_file)
 
         test_set = get_existing_filenames(pre_existing_set_file)
@@ -276,22 +276,26 @@ def split_success_only(
     train_set, val_set, test_set = split_dataset(
         filenames, train_set, val_set, test_set, val_len, test_len)
 
+    # Sanity check
     for i in val_set:
         if i in train_set:
-            print("val attempt in train set! %s" % i)
-            pause()
-
+            raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
+            # print("split_success_only: val attempt in train set! %s" % i)
     for i in test_set:
         if i in train_set:
-            print("test attempt in train set! %s" % i)
-            pause()
-
+            raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
+            # print("split_success_only: test attempt in train set! %s" % i)
+    for i in test_set:
+        if i in val_set:
+            raise RuntimeError("split_success_only: test attempt in val set! %s" % i)
+            # print("split_success_only: test attempt in val set! %s" % i)
     if (len(train_set) + len(val_set) + len(test_set)) != len(filenames):
         print("ERROR! lenth of train, val and test = %d, %d, %d"
               % (len(train_set), len(val_set), len(test_set)))
         print("Length of all files: %d" % len(filenames))
-        pause()
-        raise Exception("Something is wrong!")
+        raise RuntimeError("split_success_only: Numbers do not add up. Something is wrong!")
+    print("Split complete. Sanity check passed.")
+    pause()
 
     # Write the output files
     output_file(path, plush, output_name, 'success_only_train', train_set)
@@ -393,15 +397,15 @@ def split_all(
     error_train_len = len(error_filenames) - (error_val_len + error_test_len)
     print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
             success_train_len, success_val_len, success_test_len))
-    print("Length for failure sets: {0} train, {1} val, {2} test".format(
+    print("Length for all failure sets: {0} train, {1} val, {2} test".format(
             failure_train_len + error_train_len,
             failure_val_len + error_val_len,
             failure_test_len + error_test_len))
-    print("Length for failure (no error) sets: {0} train, {1} val, {2} test".format(
+    print("Length for task failure sets: {0} train, {1} val, {2} test".format(
             failure_train_len, failure_val_len, failure_test_len))
-    print("Length for error sets: {0} train, {1} val, {2} test".format(
+    print("Length for error failure sets: {0} train, {1} val, {2} test".format(
             error_train_len, error_val_len, error_test_len))
-    pause()
+    # pause()
     
     # Randomize the filenames
     from random import shuffle
@@ -414,24 +418,36 @@ def split_all(
     err_train_set,  err_val_set,  err_test_set = \
         split_dataset(error_filenames, [], [], [], error_val_len, error_test_len)
 
+    # Sanity check
     for i in fail_val_set:
         if i in fail_train_set:
-            print("fail: val attempt in train set! %s" % i)
-            pause()
+            raise RuntimeError("split_all: fail: val attempt in train set! %s" % i)
+            # print("split_all: fail: val attempt in train set! %s" % i)
     for i in fail_test_set:
         if i in fail_train_set:
-            print("fail: test attempt in train set! %s" % i)
-            pause()
-
+            raise RuntimeError("split_all: fail: test attempt in train set! %s" % i)
+            # print("split_all: fail: test attempt in train set! %s" % i)
     for i in err_val_set:
         if i in err_train_set:
-            print("err: val attempt in train set! %s" % i)
-            pause()
-
+            raise RuntimeError("split_all: err: val attempt in train set! %s" % i)
+            # print("split_all: err: val attempt in train set! %s" % i)
     for i in err_test_set:
         if i in err_train_set:
-            print("err: test attempt in train set! %s" % i)
-            pause()
+            raise RuntimeError("split_all: err: test attempt in train set! %s" % i)
+            # print("split_all: err: test attempt in train set! %s" % i)
+    for i in err_train_set:
+        if i in fail_train_set:
+            raise RuntimeError("split_all: err train set overlap with fail train set! %s" % i)
+            # print("split_all: err train set overlap with fail train set! %s" % i)
+    for i in err_val_set:
+        if i in fail_val_set:
+            raise RuntimeError("split_all: err val set overlap with fail val set! %s" % i)
+            # print("split_all: err val set overlap with fail val set! %s" % i)
+    for i in err_test_set:
+        if i in fail_test_set:
+            raise RuntimeError("split_all: err test set overlap with fail test set! %s" % i)
+            # print("split_all: err test set overlap with fail test set! %s" % i)
+    print("Split complete. Sanity check passed.")
     pause()
 
     # Write the output files
@@ -466,9 +482,12 @@ def count_files_containing_images(path, filenames):
     success_filenames = []
     skip_count = 0
     i = 0
+    print("Checking %d files. This can take some time." % len(filenames))
     for filename in filenames:
-        # print(filename)
         i += 1
+        if i % 100 == 0:
+            # TODO: incorporate tqdm progress bar
+            print("{} out of {} files checked".format(i, len(filenames)))
         try:
             with h5py.File(os.path.join(path, filename), 'r') as data:
                 try:
@@ -555,11 +574,11 @@ def main(args, root='root'):
     elif args['success_only']:
         split_success_only(
             filenames, path, args['plush'], args['train'], args['val'],
-            args['test'], args['val_len'], args['test_len'], args['output_name'])
+            args['test'], args['output_name'], args['val_len'], args['test_len'])
     elif args['split_all']:
         split_all(
             filenames, path, args['plush'], args['train'], args['val'],
-            args['test'], args['val_len'], args['test_len'], args['output_name'])
+            args['test'], args['output_name'], args['val_len'], args['test_len'])
 
 
 if __name__ == '__main__':

From cf83741a3a811343c57f6fd7c4cf38835598d6ed Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Sat, 13 Oct 2018 12:47:18 -0400
Subject: [PATCH 17/36] grasp_model.py -> hypertree_model.py

---
 costar_hyper/cornell_grasp_train.py           |  8 ++---
 .../costar_block_stacking_train_regression.py |  4 +--
 costar_hyper/grasp_loss.py                    |  2 +-
 costar_hyper/grasp_train.py                   | 30 ++++++++--------
 .../{grasp_model.py => hypertree_model.py}    |  2 +-
 .../scripts/costar_hyper_prediction.py        | 34 +++++++++----------
 6 files changed, 40 insertions(+), 40 deletions(-)
 rename costar_hyper/{grasp_model.py => hypertree_model.py} (99%)

diff --git a/costar_hyper/cornell_grasp_train.py b/costar_hyper/cornell_grasp_train.py
index defb6c790..db9ef6a7b 100755
--- a/costar_hyper/cornell_grasp_train.py
+++ b/costar_hyper/cornell_grasp_train.py
@@ -57,10 +57,10 @@ def tqdm(*args, **kwargs):
 from keras.callbacks import TensorBoard
 from keras.models import Model
 from keras.models import model_from_json
-from grasp_model import concat_images_with_tiled_vector_layer
-from grasp_model import top_block
-from grasp_model import create_tree_roots
-from grasp_model import choose_hypertree_model
+from hypertree_model import concat_images_with_tiled_vector_layer
+from hypertree_model import top_block
+from hypertree_model import create_tree_roots
+from hypertree_model import choose_hypertree_model
 from cornell_grasp_dataset_reader import parse_and_preprocess
 
 from callbacks import EvaluateInputGenerator
diff --git a/costar_hyper/costar_block_stacking_train_regression.py b/costar_hyper/costar_block_stacking_train_regression.py
index fa07e49e1..38eeb1e7f 100644
--- a/costar_hyper/costar_block_stacking_train_regression.py
+++ b/costar_hyper/costar_block_stacking_train_regression.py
@@ -113,7 +113,7 @@ def main(_):
                 # load_weights = './logs_cornell/2018-07-30-21-47-16_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3/2018-07-30-21-47-16_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3-epoch-016-val_loss-0.000-val_grasp_acc-0.273.h5'
                 # load_weights = './logs_cornell/2018-07-09-09-08-15_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3/2018-07-09-09-08-15_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3-epoch-115-val_loss-0.000-val_grasp_acc-0.258.h5'
                 # use these weights for both xyz and axis angle input data
-                # Be careful if loading the weights below, the correct vector input data and backwards compatibility code must be in place to avoid: 
+                # Be careful if loading the weights below, the correct vector input data and backwards compatibility code must be in place to avoid:
                 # "ValueError: You are trying to load a weight file containing 13 layers into a model with 11 layers."
                 # load_weights = './logs_cornell/2018-08-09-11-26-03_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3/2018-08-09-11-26-03_nasnet_mobile_semantic_translation_regression_model-_img_nasnet_mobile_vec_dense_trunk_vgg_conv_block-dataset_costar_block_stacking-grasp_goal_xyz_3-epoch-003-val_loss-0.000-val_grasp_acc-0.160.h5'
                 # weights below are trained with data augmentation, weights 2018-07-31-21-40-50 above are actual best so far for translation as of 2018-08-12
@@ -208,7 +208,7 @@ def main(_):
         print('EVAL on training data (well, a slightly hacky version) with 0 LR 0 dropout trainable False, no learning rate schedule')
         learning_rate = 0.000000000001
         hyperparams['dropout_rate'] = 0.000000000001
-        # TODO(ahundt) it seems set_trainable_layers in grasp_model.py has a bug?
+        # TODO(ahundt) it seems set_trainable_layers in hypertree_model.py has a bug?
         # hyperparams['trainable'] = 0.00000000001
         FLAGS.learning_rate_schedule = 'none'
     else:
diff --git a/costar_hyper/grasp_loss.py b/costar_hyper/grasp_loss.py
index d9a941eb9..a762909a8 100644
--- a/costar_hyper/grasp_loss.py
+++ b/costar_hyper/grasp_loss.py
@@ -1,5 +1,5 @@
 import tensorflow as tf
-from grasp_model import tile_vector_as_image_channels
+from hypertree_model import tile_vector_as_image_channels
 import keras
 from keras import backend as K
 from keras_contrib.losses import segmentation_losses
diff --git a/costar_hyper/grasp_train.py b/costar_hyper/grasp_train.py
index 5e648c919..1dce076e2 100644
--- a/costar_hyper/grasp_train.py
+++ b/costar_hyper/grasp_train.py
@@ -38,7 +38,7 @@
 from tensorflow.python.platform import flags
 
 import grasp_dataset
-import grasp_model
+import hypertree_model
 import grasp_loss
 import grasp_utilities
 import keras_workaround
@@ -207,7 +207,7 @@ def train(self, dataset=None,
               test_per_epoch=None,
               load_weights=None,
               save_weights=None,
-              make_model_fn=grasp_model.grasp_model_densenet,
+              make_model_fn=hypertree_model.grasp_model_densenet,
               imagenet_preprocessing=None,
               grasp_sequence_min_time_step=None,
               grasp_sequence_max_time_step=None,
@@ -267,7 +267,7 @@ def train(self, dataset=None,
         if save_weights is None:
             save_weights = FLAGS.save_weights
         if make_model_fn is None:
-            make_model_fn = grasp_model.grasp_model_densenet
+            make_model_fn = hypertree_model.grasp_model_densenet
         if imagenet_preprocessing is None:
             imagenet_preprocessing = FLAGS.imagenet_preprocessing
         if grasp_sequence_min_time_step is None:
@@ -557,7 +557,7 @@ def eval(self, dataset=None,
              batch_size=None,
              load_weights=None,
              save_weights=None,
-             make_model_fn=grasp_model.grasp_model_densenet,
+             make_model_fn=hypertree_model.grasp_model_densenet,
              imagenet_preprocessing=None,
              grasp_sequence_min_time_step=None,
              grasp_sequence_max_time_step=None,
@@ -603,7 +603,7 @@ def eval(self, dataset=None,
             if save_weights is None:
                 save_weights = FLAGS.save_weights
             if make_model_fn is None:
-                make_model_fn = grasp_model.grasp_model_densenet
+                make_model_fn = hypertree_model.grasp_model_densenet
             if imagenet_preprocessing is None:
                 imagenet_preprocessing = FLAGS.imagenet_preprocessing,
             if grasp_sequence_max_time_step is None:
@@ -727,7 +727,7 @@ def eval(self, dataset=None,
     def get_compiled_model(self, dataset=None,
                            batch_size=1,
                            load_weights=None,
-                           make_model_fn=grasp_model.grasp_model_densenet,
+                           make_model_fn=hypertree_model.grasp_model_densenet,
                            imagenet_preprocessing=None,
                            grasp_sequence_min_time_step=None,
                            grasp_sequence_max_time_step=None,
@@ -745,7 +745,7 @@ def get_compiled_model(self, dataset=None,
             if load_weights is None:
                 load_weights = FLAGS.load_weights
             if make_model_fn is None:
-                make_model_fn = grasp_model.grasp_model_densenet
+                make_model_fn = hypertree_model.grasp_model_densenet
             if imagenet_preprocessing is None:
                 imagenet_preprocessing = FLAGS.imagenet_preprocessing,
             if grasp_sequence_max_time_step is None:
@@ -884,7 +884,7 @@ def choose_make_model_fn(grasp_model_name=None, hyperparams=None):
 
         This lets us write custom code that sets up the model
         you asked for in the `--grasp_model` command line argument,
-        FLAGS.grasp_model. This means that when GraspTrain actually
+        FLAGS.grasp_model. This means that when GraspTrain actually
         creates the model they will all work in exactly the same way.
         The end result is GraspTrain doesn't need a bunch of if
         statements for every type of model, and the class can be more focused
@@ -910,18 +910,18 @@ class HyperparamCarrier:
         grasp_model_name = FLAGS.grasp_model
     if grasp_model_name == 'grasp_model_resnet':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_resnet(
+            return hypertree_model.grasp_model_resnet(
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_pretrained':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_pretrained(
+            return hypertree_model.grasp_model_pretrained(
                 growth_rate=FLAGS.densenet_growth_rate,
                 reduction=FLAGS.densenet_reduction_after_pretrained,
                 dense_blocks=FLAGS.densenet_dense_blocks,
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_densenet':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_densenet(
+            return hypertree_model.grasp_model_densenet(
                 growth_rate=FLAGS.densenet_growth_rate,
                 reduction=FLAGS.densenet_reduction,
                 dense_blocks=FLAGS.densenet_dense_blocks,
@@ -929,18 +929,18 @@ def make_model_fn(*a, **kw):
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_segmentation':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_segmentation(
+            return hypertree_model.grasp_model_segmentation(
                 growth_rate=FLAGS.densenet_growth_rate,
                 reduction=FLAGS.densenet_reduction,
                 dense_blocks=FLAGS.densenet_dense_blocks,
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_levine_2016_segmentation':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_levine_2016_segmentation(
+            return hypertree_model.grasp_model_levine_2016_segmentation(
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_levine_2016':
         def make_model_fn(*a, **kw):
-            return grasp_model.grasp_model_levine_2016(
+            return hypertree_model.grasp_model_levine_2016(
                 *a, **kw)
     elif grasp_model_name == 'grasp_model_hypertree':
         def make_model_fn(
@@ -976,7 +976,7 @@ def make_model_fn(
             kw.pop('batch_size', None)
             kw.pop('feature_combo_name', None)
             # TODO(ahundt) consider making image_model_weights shared vs separate configurable
-            return grasp_model.choose_hypertree_model(
+            return hypertree_model.choose_hypertree_model(
                 images=images,
                 vectors=vectors,
                 image_shapes=image_shapes,
diff --git a/costar_hyper/grasp_model.py b/costar_hyper/hypertree_model.py
similarity index 99%
rename from costar_hyper/grasp_model.py
rename to costar_hyper/hypertree_model.py
index 5311c06bf..0ec500b60 100644
--- a/costar_hyper/grasp_model.py
+++ b/costar_hyper/hypertree_model.py
@@ -273,7 +273,7 @@ def classifier_block(input_tensor, include_top=True, top='classification',
             print("    GlobalMaxPooling2D")
         x = GlobalMaxPooling2D()(x)
     else:
-        raise ValueError('grasp_model.py::classifier_block() unsupported top: ' + str(top))
+        raise ValueError('hypertree_model.py::classifier_block() unsupported top: ' + str(top))
     return x
 
 
diff --git a/ctp_integration/scripts/costar_hyper_prediction.py b/ctp_integration/scripts/costar_hyper_prediction.py
index 58dbce43b..cba2300bb 100755
--- a/ctp_integration/scripts/costar_hyper_prediction.py
+++ b/ctp_integration/scripts/costar_hyper_prediction.py
@@ -18,7 +18,7 @@
 import costar_hyper
 from costar_hyper import grasp_utilities
 from costar_hyper import cornell_grasp_train
-from costar_hyper import grasp_model
+from costar_hyper import hypertree_model
 from costar_hyper import block_stacking_reader
 from costar_hyper import grasp_metrics
 from threading import Lock
@@ -57,7 +57,7 @@ def tqdm(*args, **kwargs):
 flags.DEFINE_string('translation_problem_type', 'semantic_translation_regression', 'see problem_type parameter in other apis')
 flags.DEFINE_string('rotation_problem_type', 'semantic_rotation_regression', 'see problem_type parameter in other apis')
 flags.DEFINE_string('force_action', None, 'force predicting only a single action, accepts a string or integer id')
-flags.DEFINE_string('default_action', '5', 
+flags.DEFINE_string('default_action', '5',
     'default action if no action has been'
     ' received from ROS on the topic /costar/action_label_current.'
     ' The default 5 means grab_blue.')
@@ -65,8 +65,8 @@ def tqdm(*args, **kwargs):
 FLAGS = flags.FLAGS
 
 def extract_filename_from_url(url):
-    # note this is almost certainly insecure, 
-    # and the url has to exactly match a filename, 
+    # note this is almost certainly insecure,
+    # and the url has to exactly match a filename,
     # no extra string contents at the end
     filename = url[url.rfind("/")+1:]
     return filename
@@ -81,7 +81,7 @@ def get_file_from_url(url, extract=True, file_hash=None, cache_subdir='models'):
                 found_extension = extension
 
     path = keras.utils.get_file(filename, url, extract=extract, file_hash=file_hash, cache_subdir=cache_subdir)
-    if found_extension is not None: 
+    if found_extension is not None:
         # strip the file extension
         path = path.replace(found_extension, '')
 
@@ -219,7 +219,7 @@ def _initialize_hypertree_model_for_inference(
          loss, metrics, classes, success_only] = cornell_grasp_train.choose_features_and_metrics(feature_combo_name, problem_type)
 
 
-        model = grasp_model.choose_hypertree_model(
+        model = hypertree_model.choose_hypertree_model(
             image_shapes=image_shapes,
             vector_shapes=vector_shapes,
             top=top,
@@ -231,12 +231,12 @@ def _initialize_hypertree_model_for_inference(
             optimizer='sgd',
             loss=loss,
             metrics=metrics)
-        
+
         model.summary()
 
         is_file = os.path.isfile(load_weights)
         if not is_file:
-            raise RuntimeError('costar_hyper_prediction.py: Weights file does not exist: ' + load_weights) 
+            raise RuntimeError('costar_hyper_prediction.py: Weights file does not exist: ' + load_weights)
         print(problem_type + ' loading weights: ' + load_weights)
         model.load_weights(load_weights)
 
@@ -285,7 +285,7 @@ def _initialize_ros(self, robot_config, tf_buffer, tf_listener):
                 self.info_topic,
                 String,
                 self._clear_view_CB)
-        
+
         # we sleep for 1 second so that
         # the buffer can collect some transforms
         rospy.sleep(1)
@@ -350,7 +350,7 @@ def _current_label_Cb(self, msg):
                 # get possible labels, or None if not specified yet
                 labels = self.labels
             try:
-                # TODO(ahundt) incorporate data_features_to_extract, so we use the right encoding method 
+                # TODO(ahundt) incorporate data_features_to_extract, so we use the right encoding method
                 # encode the action
                 action_labels = [block_stacking_reader.encode_action(current_label, possible_actions=labels)]
             except ValueError as ve:
@@ -401,7 +401,7 @@ def _clearViewCb(self, msg):
         """
         with self.mutex:
             self.need_clear_view_rgb_img = True
-    
+
     def get_latest_transform(self, from_frame=None, to_frame=None, preferred_time=None, max_attempts=10, backup_timestamp_attempts=4):
         """
         # Arguments
@@ -410,7 +410,7 @@ def get_latest_transform(self, from_frame=None, to_frame=None, preferred_time=No
         backup_timestamp_attempts: the number attempts that should use a backup timestamp.
 
         # Returns
-        
+
         transform, time
         """
         if from_frame is None:
@@ -455,7 +455,7 @@ def get_latest_transform(self, from_frame=None, to_frame=None, preferred_time=No
                         'CostarHyperPosePredictor failed to use the rgb image rosmsg timestamp, '
                         'trying latest available time as backup. '
                         'Note: This message may print >1000x less often than the problem occurs.'
-                        ' We checked time t: ' + str(t) + 
+                        ' We checked time t: ' + str(t) +
                         ', and will now try the latest available: ' + str(latest_available_time_lookup) )
                     # try the backup timestamp even though it will be less accurate
                     t = latest_available_time_lookup
@@ -508,12 +508,12 @@ def __call__(self):
                 current_images=rgb_images)
 
         rotation_predictions = self.rotation_model.predict_on_batch(X)
-        rospy.loginfo_throttle(10.0, 
-            'encoded translation predictions: ' + str(translation_predictions) + 
+        rospy.loginfo_throttle(10.0,
+            'encoded translation predictions: ' + str(translation_predictions) +
             ' encoded rotation predictions: ' + str(rotation_predictions))
         tr_predictions = np.concatenate([translation_predictions[0], rotation_predictions[0]])
         prediction_xyz_qxyzw = grasp_metrics.decode_xyz_aaxyz_nsc_to_xyz_qxyzw(tr_predictions)
-        rospy.loginfo_throttle(10.0, 
+        rospy.loginfo_throttle(10.0,
             'decoded prediction_xyz_qxyzw: ' + str(prediction_xyz_qxyzw))
 
         # prediction_kdl = kdl.Frame(
@@ -559,7 +559,7 @@ def main(_):
         start_time = time.clock()
         prediction_xyz_qxyzw, prediction_input_data_time = predictor()
         tick_time = time.clock()
-        
+
         b_to_e = TransformStamped()
         b_to_e.header.stamp = prediction_input_data_time
         b_to_e.header.frame_id = predictor.base_link

From d0bbdefa70c71f0084d9723586cdeb948d580f05 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Sun, 14 Oct 2018 22:01:24 -0400
Subject: [PATCH 18/36] Minor code style changes

---
 costar_hyper/costar_block_stacking_split_dataset.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index f5414e5f8..e80c09185 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -25,6 +25,7 @@
 '''
 import argparse
 import os
+from random import shuffle
 
 
 def _parse_args():
@@ -131,9 +132,9 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
         train_set += not_train_set[val_len+test_len:]
     else:
         # Select filenames not in test or val set
-        not_val_or_test_set = \
-            [filename for filename in filenames if
-             filename not in val_set and filename not in test_set]
+        not_val_or_test_set = [
+            filename for filename in filenames if
+            filename not in val_set and filename not in test_set]
 
         # Check if expected length and current lenth for val set are different
         len_diff = val_len - len(val_set)
@@ -269,7 +270,6 @@ def split_success_only(
               'Output results will be adjusted to same size sets')
 
     # Randomize the filenames
-    from random import shuffle
     shuffle(filenames)
 
     # Split the dataset
@@ -408,7 +408,6 @@ def split_all(
     # pause()
     
     # Randomize the filenames
-    from random import shuffle
     shuffle(failure_filenames)
     shuffle(error_filenames)
 

From 20c8c0fe9eb3350516aea5ff652e45683457dcd1 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Mon, 15 Oct 2018 17:39:07 -0400
Subject: [PATCH 19/36] costar_block_stacking_split_dataset.py set random seed,
 write summary csv

---
 .../costar_block_stacking_split_dataset.py    | 61 +++++++++++--------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index e80c09185..2f8a07664 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -13,11 +13,11 @@
 call the following command after success_only subset is splitted:
 
 python costar_block_stacking_split_dataset.py --path /path/to/dataset/folder     \
-    --success_only (--plush) --train success_only/train/txt/filename             \
+    --split_all (--plush) --train success_only/train/txt/filename             \
     --val [success_only val txt filename] --test [success_only test txt filename]\
     --output_name [filename prefix for the output train/val/test filenames]
 
-This will output task_failure_only, error_failure_only, and all_failure_only 
+This will output task_failure_only, error_failure_only, and all_failure_only
 train/val/test filenames as 9 separate txt files.
 
 Author: Chia-Hung "Rexxar" Lin (rexxarchl)
@@ -25,7 +25,7 @@
 '''
 import argparse
 import os
-from random import shuffle
+import random
 
 
 def _parse_args():
@@ -56,10 +56,12 @@ def _parse_args():
                         'the file is expected to be in argument `path`')
     parser.add_argument("--output_name", type=str,
                         default='costar_block_stacking_dataset', help='output file name')
-    parser.add_argument("--val_len", type=int, default=None, 
+    parser.add_argument("--val_len", type=int, default=None,
                         help='Expected val set length')
-    parser.add_argument("--test_len", type=int, default=None, 
+    parser.add_argument("--test_len", type=int, default=None,
                         help='Expected test set length')
+    parser.add_argument("--seed", type=int, default=0,
+                        help='Numpy seed for reproducing the output lists')
     return vars(parser.parse_args())
 
 
@@ -177,7 +179,7 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
 
 
 def output_file(path, plush, output_prefix, set_name, filenames):
-    '''Output the filenames as a txt file. 
+    '''Output the filenames as a txt file.
     Automatically adds appropriate keras path for the filenames.
 
     :param path: The path to store the output txt file.
@@ -270,7 +272,7 @@ def split_success_only(
               'Output results will be adjusted to same size sets')
 
     # Randomize the filenames
-    shuffle(filenames)
+    random.shuffle(filenames)
 
     # Split the dataset
     train_set, val_set, test_set = split_dataset(
@@ -395,27 +397,32 @@ def split_all(
     error_val_len = int(round(success_val_len*multiplier_error))
     error_test_len = int(round(success_test_len*multiplier_error))
     error_train_len = len(error_filenames) - (error_val_len + error_test_len)
-    print("Successfully read success_only filenames: {0} train, {1} val, {2} test".format(
-            success_train_len, success_val_len, success_test_len))
-    print("Length for all failure sets: {0} train, {1} val, {2} test".format(
+    dataset_splits_csv = 'subset, train, val, test\n'
+    dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
+                            success_train_len, success_val_len, success_test_len)
+    dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
             failure_train_len + error_train_len,
             failure_val_len + error_val_len,
-            failure_test_len + error_test_len))
-    print("Length for task failure sets: {0} train, {1} val, {2} test".format(
-            failure_train_len, failure_val_len, failure_test_len))
-    print("Length for error failure sets: {0} train, {1} val, {2} test".format(
-            error_train_len, error_val_len, error_test_len))
+            failure_test_len + error_test_len)
+    dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
+            failure_train_len, failure_val_len, failure_test_len)
+    dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
+            error_train_len, error_val_len, error_test_len)
+    dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
+    print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
+
+    csv_path = os.path.join(path, dataset_splits_csv_filename)
     # pause()
-    
+
     # Randomize the filenames
-    shuffle(failure_filenames)
-    shuffle(error_filenames)
+    random.shuffle(failure_filenames)
+    random.shuffle(error_filenames)
 
     # Split the dataset for failure and error
-    fail_train_set, fail_val_set, fail_test_set = \
-        split_dataset(failure_filenames, [], [], [], failure_val_len, failure_test_len)
-    err_train_set,  err_val_set,  err_test_set = \
-        split_dataset(error_filenames, [], [], [], error_val_len, error_test_len)
+    fail_train_set, fail_val_set, fail_test_set = split_dataset(
+        failure_filenames, [], [], [], failure_val_len, failure_test_len)
+    err_train_set,  err_val_set,  err_test_set = split_dataset(
+        error_filenames, [], [], [], error_val_len, error_test_len)
 
     # Sanity check
     for i in fail_val_set:
@@ -448,6 +455,8 @@ def split_all(
             # print("split_all: err test set overlap with fail test set! %s" % i)
     print("Split complete. Sanity check passed.")
     pause()
+    with open(csv_path, 'w+') as file_object:
+        file_object.write(dataset_splits_csv)
 
     # Write the output files
     output_file(path, plush, output_name, 'task_failure_only_train', fail_train_set)
@@ -461,9 +470,9 @@ def split_all(
     fail_train_set += err_train_set
     fail_val_set += err_val_set
     fail_test_set += err_test_set
-    output_file(path, plush, output_name, 'all_failure_only_train', fail_train_set)
-    output_file(path, plush, output_name, 'all_failure_only_val', fail_val_set)
-    output_file(path, plush, output_name, 'all_failure_only_test', fail_test_set)
+    output_file(path, plush, output_name, 'task_and_error_failure_train', fail_train_set)
+    output_file(path, plush, output_name, 'task_and_error_failure_val', fail_val_set)
+    output_file(path, plush, output_name, 'task_and_error_failure_test', fail_test_set)
 
 
 def count_files_containing_images(path, filenames):
@@ -563,6 +572,8 @@ def main(args, root='root'):
         filenames = os.listdir(path)
     else:
         raise ValueError('Path entered is not a path: ' + path)
+    # set the random seed for reproducible random lists
+    random.seed(args['seed'])
 
     filenames = [filename for filename in filenames if '.h5f' in filename]
     print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')

From d6a0d401d7053f5bb09735439537402efc7bca98 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Mon, 15 Oct 2018 17:45:31 -0400
Subject: [PATCH 20/36] output_file: return the path of the written list file

---
 costar_hyper/costar_block_stacking_split_dataset.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 2f8a07664..4f48994d1 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -189,9 +189,10 @@ def output_file(path, plush, output_prefix, set_name, filenames):
     :param filenames: The filenames to be written in the txt file.
     '''
     output_filename = output_prefix + '_' + set_name + '_files.txt'
-    print('Writing ' + path + output_filename)
 
-    f = open(os.path.join(path, output_filename), 'w')
+    list_filename = os.path.join(path, output_filename)
+    print('Writing ' + list_filename)
+    f = open(list_filename, 'w')
     # print(f)
 
     if plush:
@@ -207,6 +208,7 @@ def output_file(path, plush, output_prefix, set_name, filenames):
         f.write(prefix_path + filename + '\n')
 
     f.close()
+    return list_filename
 
 
 def split_success_only(
@@ -397,7 +399,7 @@ def split_all(
     error_val_len = int(round(success_val_len*multiplier_error))
     error_test_len = int(round(success_test_len*multiplier_error))
     error_train_len = len(error_filenames) - (error_val_len + error_test_len)
-    dataset_splits_csv = 'subset, train, val, test\n'
+    dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
     dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
                             success_train_len, success_val_len, success_test_len)
     dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(

From f68cd257b182b3332adc706921537d2575374c7f Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Mon, 15 Oct 2018 21:22:03 -0400
Subject: [PATCH 21/36] WIP: Refactor dataset splitting to include default
 behaviour

---
 .../costar_block_stacking_split_dataset.py    | 158 +++++++++++-------
 1 file changed, 99 insertions(+), 59 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 4f48994d1..2f5e10f00 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -26,6 +26,11 @@
 import argparse
 import os
 import random
+try:
+    import h5py  # Needs h5py to open the files and check frame count
+except ImportError:
+    print("h5py is not available.")
+    h5py = None
 
 
 def _parse_args():
@@ -36,13 +41,19 @@ def _parse_args():
                     'If no pre-existing sets of files are indicated, randomize and split '
                     'the files in the folder 8:1:1 for train/val/test.')
     parser.add_argument("--path", type=str,
-                        default=os.path.join(os.path.expanduser("~"), '.costar', 'data'),
+                        default=os.path.join(
+                            os.path.expanduser("~"),
+                            '.keras/datasets/costar_block_stacking_dataset_v0.4/'),
                         help='path to dataset folder containing many files')
+    parser.add_argument("--dataset_path", type=str, default='/.keras/dataset/',
+                        help='The folder that is expected stores the dataset. '
+                             'Filenames in the output file will reference this path.')
+    parser.add_argument("--dataset_name", type=str, 
+                        default='costar_block_stacking_dataset_v0.4',
+                        help='Dataset name to store under dataset path.'
+                             'Filenames in the output file will reference this name.')
     parser.add_argument("--success_only", action='store_true', default=False,
                         help='only visit stacking data labeled as successful')
-    parser.add_argument("--split_all", action='store_true', default=False,
-                        help='Split all datasets into success, failure, and error sets. '
-                             'Requires train/val/test from success_only subset')
     parser.add_argument("--plush", action='store_true', default=False,
                         help='processing plush attempts')
     parser.add_argument("--train", type=str, default='',
@@ -61,7 +72,9 @@ def _parse_args():
     parser.add_argument("--test_len", type=int, default=None,
                         help='Expected test set length')
     parser.add_argument("--seed", type=int, default=0,
-                        help='Numpy seed for reproducing the output lists')
+                        help='Random seed for reproducing the output lists')
+    parser.add_argument("--write", type='store_true', default=False,
+                        help='Write to output files')
     return vars(parser.parse_args())
 
 
@@ -83,15 +96,13 @@ def get_existing_filenames(path_to_file):
         # Extract the file names and add them to the returning list
         filename = extract_filename_from_url(line)
         if not filename:
-            print("Empty line extracted.")
-            pause()
+            print("get_existing_filenames: Empty line extracted.")
             continue
         filenames.append(filename)
 
     f.close()
 
     print('Read ' + str(len(filenames)) + ' filenames from ' + path_to_file)
-    # pause()  # DEBUG
     return filenames
 
 
@@ -149,11 +160,11 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
             print("Added %d files to val set." % len_diff)
 
             print("Unusual behavior. Do you really want to add files to val set?")
-            pause()
         elif len_diff < 0:
             print("Expected val set length: {}, current val set length: {}".format(
                     val_len, len(val_set)))
-            raise RuntimeError("split_dataset: Expected val length is smaller than current length!")
+            raise RuntimeError(
+                "split_dataset: Expected val length is smaller than current length!")
 
         # Do the same check for test set
         len_diff = test_len - len(test_set)
@@ -166,11 +177,11 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
             print("Added %d files to test set." % len_diff)
 
             print("Unusual behavior. Do you really want to add files to test set?")
-            pause()
         elif len_diff < 0:
             print("Expected test set length: {}, current test set length: {}".format(
                     val_len, len(val_set)))
-            raise RuntimeError("split_dataset: Expected test length is smaller than current length!")
+            raise RuntimeError(
+                "split_dataset: Expected test length is smaller than current length!")
 
         # Dump the rest of the files into train set
         train_set = not_val_or_test_set
@@ -178,13 +189,14 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
     return train_set, val_set, test_set
 
 
-def output_file(path, plush, output_prefix, set_name, filenames):
+def output_file(path, plush, output_prefix, dataset_name, category_name, subset_name, filenames):
     '''Output the filenames as a txt file.
     Automatically adds appropriate keras path for the filenames.
 
     :param path: The path to store the output txt file.
     :param plush: A bool stating whether the program is processing plush subset.
     :param output_prefix: Prefix of the output txt file.
+    :param dataset_name: Dataset name to be placed after 
     :param set_name: train/val/test, to be added to the output filename.
     :param filenames: The filenames to be written in the txt file.
     '''
@@ -208,11 +220,13 @@ def output_file(path, plush, output_prefix, set_name, filenames):
         f.write(prefix_path + filename + '\n')
 
     f.close()
+
+
     return list_filename
 
 
 def split_success_only(
-        filenames, path, plush, train_txt, val_txt, test_txt, output_name,
+        filenames, path, plush, train_txt, val_txt, test_txt,
         val_len=None, test_len=None):
     '''Splits success files into success_only train/val/test txt files.
 
@@ -229,7 +243,6 @@ def split_success_only(
     # Read files that are success
     filenames = [filename for filename in filenames if '.success.h5f' in filename]
     print('Selecting ' + str(len(filenames)) + ' success files')
-    pause()
 
     # Read filenames for the previous training set
     if not train_txt:
@@ -299,7 +312,6 @@ def split_success_only(
         print("Length of all files: %d" % len(filenames))
         raise RuntimeError("split_success_only: Numbers do not add up. Something is wrong!")
     print("Split complete. Sanity check passed.")
-    pause()
 
     # Write the output files
     output_file(path, plush, output_name, 'success_only_train', train_set)
@@ -308,7 +320,7 @@ def split_success_only(
 
 
 def split_all(
-        filenames, path, plush, train_txt, val_txt, test_txt, output_name,
+        filenames, path, plush, train_txt, val_txt, test_txt,
         val_len=None, test_len=None):
     '''Splits failure files into all_failure_only, task_failure_only and
     error_failure_only subsets.
@@ -330,7 +342,6 @@ def split_all(
     # Get the success, failure, and error filenames with nonzero frames
     success_filenames, failure_filenames, error_filenames = count_files_containing_images(
                                                                 path, filenames)
-    pause()  # DEBUG
 
     # Calculate the percentage of success, failure and error
     total_file_count = (
@@ -341,7 +352,6 @@ def split_all(
     print("Total: %d files" % total_file_count)
     print("Ratios: {:.2f}% success, {:.2f}% failure(no error), {:.2f}% error".format(
             success_ratio*100, failure_ratio*100, error_ratio*100))
-    pause()  # DEBUG
 
     # Read the train/val set from success_only subset
     if plush:
@@ -399,22 +409,6 @@ def split_all(
     error_val_len = int(round(success_val_len*multiplier_error))
     error_test_len = int(round(success_test_len*multiplier_error))
     error_train_len = len(error_filenames) - (error_val_len + error_test_len)
-    dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
-    dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
-                            success_train_len, success_val_len, success_test_len)
-    dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
-            failure_train_len + error_train_len,
-            failure_val_len + error_val_len,
-            failure_test_len + error_test_len)
-    dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
-            failure_train_len, failure_val_len, failure_test_len)
-    dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
-            error_train_len, error_val_len, error_test_len)
-    dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
-    print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
-
-    csv_path = os.path.join(path, dataset_splits_csv_filename)
-    # pause()
 
     # Randomize the filenames
     random.shuffle(failure_filenames)
@@ -456,7 +450,7 @@ def split_all(
             raise RuntimeError("split_all: err test set overlap with fail test set! %s" % i)
             # print("split_all: err test set overlap with fail test set! %s" % i)
     print("Split complete. Sanity check passed.")
-    pause()
+
     with open(csv_path, 'w+') as file_object:
         file_object.write(dataset_splits_csv)
 
@@ -483,7 +477,6 @@ def count_files_containing_images(path, filenames):
     :param filenames: .h5f filenames in the folder
     :return: Lists of success/failure/error filenames with nonzero frames
     '''
-    import h5py  # Needs h5py to open the files and check frame count
     # TODO: Write total frames into csv file as a new column
 
     # Open the files to check frame count. Skip files with 0 frame.
@@ -531,10 +524,6 @@ def count_files_containing_images(path, filenames):
     return success_filenames, failure_filenames, error_filenames
 
 
-def pause():
-    _ = input("Press <Enter> to continue...")
-
-
 def compare_filenames(path, name1, name2):
     '''Check if filenames within two txt files are the same.
     Example use: compare train and val files to make sure the filenames do not overlap.
@@ -570,27 +559,78 @@ def compare_filenames(path, name1, name2):
 
 def main(args, root='root'):
     path = os.path.expanduser(args['path'])
-    if os.path.isdir(path):
-        filenames = os.listdir(path)
-    else:
+    if not os.path.isdir(path):
         raise ValueError('Path entered is not a path: ' + path)
-    # set the random seed for reproducible random lists
+
+    # Get the subfolders under this path
+    dir_list = [dir_name for dir_name in os.listdir(path)
+                if os.path.isdir(os.path.join(path, dir_name))]
+
+    # Set the random seed for reproducible random lists
     random.seed(args['seed'])
 
-    filenames = [filename for filename in filenames if '.h5f' in filename]
-    print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')
-
-    if args['success_only'] and args['split_all']:
-        raise ValueError('success_only and split_all are mutually exclusive. '
-                         'Please choose just one.')
-    elif args['success_only']:
-        split_success_only(
-            filenames, path, args['plush'], args['train'], args['val'],
-            args['test'], args['output_name'], args['val_len'], args['test_len'])
-    elif args['split_all']:
-        split_all(
-            filenames, path, args['plush'], args['train'], args['val'],
-            args['test'], args['output_name'], args['val_len'], args['test_len'])
+    for dir_name in dir_list:
+        dir_path = os.path.join(path, dir_name)
+
+        # Get all h5f files under this folder
+        filenames = [filename for filename in os.listdir(dir_path) if '.h5f' in filename]
+        if filenames:
+            print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')
+        else:
+            print('Skipping directory %s because it contains no h5f file.' % dir_name)
+            continue
+
+        # Split the dataset
+        if args['success_only']:
+            subsets = split_success_only(
+                filenames, dir_path, args['plush'], args['train'], args['val'],
+                args['test'], args['val_len'], args['test_len'])
+        else:
+            subsets = split_all(
+                filenames, dir_path, args['plush'], args['train'], args['val'],
+                args['test'], args['val_len'], args['test_len'])
+
+        # Output the files
+        # Modify here to add more categories
+        category_names = [
+            'success_only',
+            'task_and_error_failure',
+            'task_failure_only',
+            'error_failure_only']
+        subset_names = [
+            'train',
+            'val',
+            'test']
+        for i in range(len(subsets)):
+            category_name = category_names[i]
+            category_subsets = subsets[i]
+            # Output the files
+            # TODO: include dataset name
+            for j in range(len(category_subsets)):
+                subset_name = subset_names[j]
+                subset_filenames = category_subsets[j]
+                output_file(dir_path, dir_name, args['output_prefix'],
+                            args['dataset_name'], subset_name, subset_filenames)
+
+
+
+
+        dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
+        dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
+                                success_train_len, success_val_len, success_test_len)
+        dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
+                failure_train_len + error_train_len,
+                failure_val_len + error_val_len,
+                failure_test_len + error_test_len)
+        dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
+                failure_train_len, failure_val_len, failure_test_len)
+        dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
+                error_train_len, error_val_len, error_test_len)
+        dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
+        print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
+
+        csv_path = os.path.join(path, dataset_splits_csv_filename)
+        
 
 
 if __name__ == '__main__':

From be502c9cae8e8ea7c819a4aa222518ef08e90a23 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Mon, 15 Oct 2018 23:46:48 -0400
Subject: [PATCH 22/36] WIP: Finish refactoring the script. To be debugged.

---
 .../costar_block_stacking_split_dataset.py    | 597 ++++++++++--------
 1 file changed, 319 insertions(+), 278 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 2f5e10f00..84c005d16 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -48,23 +48,23 @@ def _parse_args():
     parser.add_argument("--dataset_path", type=str, default='/.keras/dataset/',
                         help='The folder that is expected stores the dataset. '
                              'Filenames in the output file will reference this path.')
-    parser.add_argument("--dataset_name", type=str, 
-                        default='costar_block_stacking_dataset_v0.4',
+    parser.add_argument("--dataset_name", type=str,
+                        default='costar_block_stacking_dataset',
                         help='Dataset name to store under dataset path.'
                              'Filenames in the output file will reference this name.')
+    # parser.add_argument("--dataset_version", type=str, default='v0.4',
+    #                     help='The current version of the dataset.')
     parser.add_argument("--success_only", action='store_true', default=False,
-                        help='only visit stacking data labeled as successful')
-    parser.add_argument("--plush", action='store_true', default=False,
-                        help='processing plush attempts')
-    parser.add_argument("--train", type=str, default='',
-                        help='pre-existing filenames for training. '
-                        'the file is expected to be in argument `path`')
-    parser.add_argument("--val", type=str, default='',
-                        help='pre-existing filenames for validation. '
-                        'the file is expected to be in argument `path`')
-    parser.add_argument("--test", type=str, default='',
-                        help='pre-existing filenames for testing. '
-                        'the file is expected to be in argument `path`')
+                        help='Only visit stacking data labeled as successful')
+    # parser.add_argument("--train", type=str, default='',
+    #                     help='Pre-existing filenames for training. '
+    #                     'the file is expected to be in argument `path`')
+    # parser.add_argument("--val", type=str, default='',
+    #                     help='Pre-existing filenames for validation. '
+    #                     'the file is expected to be in argument `path`')
+    # parser.add_argument("--test", type=str, default='',
+    #                     help='Pre-existing filenames for testing. '
+    #                     'the file is expected to be in argument `path`')
     parser.add_argument("--output_name", type=str,
                         default='costar_block_stacking_dataset', help='output file name')
     parser.add_argument("--val_len", type=int, default=None,
@@ -73,8 +73,13 @@ def _parse_args():
                         help='Expected test set length')
     parser.add_argument("--seed", type=int, default=0,
                         help='Random seed for reproducing the output lists')
-    parser.add_argument("--write", type='store_true', default=False,
+    parser.add_argument("--write", action='store_true', default=False,
                         help='Write to output files')
+    parser.add_argument("--existing_file_prefix", type=str, nargs='+',
+                        default=["costar_plush_block_stacking_dataset_v0.4",
+                                 "costar_block_stacking_dataset_v0.4"],
+                        help="Existing txt file prefixes to look for when opening "
+                             "train/val/test files.")
     return vars(parser.parse_args())
 
 
@@ -106,6 +111,43 @@ def get_existing_filenames(path_to_file):
     return filenames
 
 
+def output_file(dataset_path, dataset_name, dir_path, dir_name,
+                category_name, subset_name, subset_filenames, write):
+    '''Output the filenames as a txt file.
+    Automatically adds appropriate keras path for the filenames.
+
+    :param dataset_path: The path that the dataset is expected to be stored under,
+                         e.g. '~/.keras/datasets/'.
+    :param dataset_name: The folder that the dataset is expected to be stored under,
+                         e.g. 'costar_block_stacking_dataset_v0.4'.
+    :param dir_path: The path to the directory that the files are in.
+    :param dir_name: The name of the directory that the files are in.
+    :param category_name: success_only, task_failures_only, error_failures_only, etc.
+    :param subset_name: train/val/test, to be added to the output filename.
+    :param subset_filenames: The filenames to be written in the txt file. May be empty.
+    :param write: If True, write the txt file to output_path; otherwise only print.
+    :return output_path: The path to the output txt file.
+    '''
+    output_filename = "{0}_{1}_{2}_{3}_files.txt".format(
+                        dataset_name, dir_name, category_name, subset_name)
+    output_path = os.path.join(dir_path, dir_name, output_filename)
+    prefix_path = os.path.join(dataset_path, dataset_name, dir_name)
+    print("Output txt file: {}".format(output_path))
+    print("Length: {} files".format(len(subset_filenames)))
+    if subset_filenames:  # an empty subset has no example path to show
+        print("Example .h5f path in the txt file: {}".format(
+              os.path.join(prefix_path, subset_filenames[0])))
+
+    if write:
+        with open(output_path, 'w') as f:  # write where we report, not in the cwd
+            for filename in subset_filenames:
+                f.write(os.path.join(prefix_path, filename) + '\n')
+    else:
+        print("File not written. Use --write flag to actually output the file.")
+
+    return output_path
+
+
 def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_len=None):
     '''Split the input filenames into three sets.
     If val_set and test_set are empty, the sets will be of length val_len and test_len.
@@ -189,159 +231,197 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
     return train_set, val_set, test_set
 
 
-def output_file(path, plush, output_prefix, dataset_name, category_name, subset_name, filenames):
-    '''Output the filenames as a txt file.
-    Automatically adds appropriate keras path for the filenames.
-
-    :param path: The path to store the output txt file.
-    :param plush: A bool stating whether the program is processing plush subset.
-    :param output_prefix: Prefix of the output txt file.
-    :param dataset_name: Dataset name to be placed after 
-    :param set_name: train/val/test, to be added to the output filename.
-    :param filenames: The filenames to be written in the txt file.
+# def split_success_only(
+#         filenames, path, plush, train_txt, val_txt, test_txt,
+#         val_len=None, test_len=None):
+#     '''Splits success files into success_only train/val/test txt files.
+
+#     :param filenames: A list of .h5f filenames under the path.
+#     :param path: Path to the folder with the .h5f files.
+#     :param plush: A bool indicating whether the program is processing plush subset.
+#     :param train_txt: Filename to a pre-existing train txt file.
+#     :param val_txt: Filename to a pre-existing val txt file.
+#     :param test_txt: Filename to a pre-existing test txt file.
+#     :param val_len: Expected output val set length.
+#     :param test_len: Expected output test set length.
+#     :param output_name: Filename prefix to the output train/val/test txt files.
+#     '''
+#     # Read files that are success
+#     filenames = [filename for filename in filenames if '.success.h5f' in filename]
+#     print('Selecting ' + str(len(filenames)) + ' success files')
+
+#     # Read filenames for the previous training set
+#     if not train_txt:
+#         train_set = []
+#     else:
+#         pre_existing_set_file = path + train_txt
+#         if not os.path.isfile(pre_existing_set_file):
+#             raise ValueError(
+#                 'split_success_only: Pre-existing training file is not a file: ' +
+#                 pre_existing_set_file)
+
+#         train_set = get_existing_filenames(pre_existing_set_file)
+
+#     # Read filenames for the previous validation set
+#     if not val_txt:
+#         val_set = []
+#     else:
+#         pre_existing_set_file = path + val_txt
+#         if not os.path.isfile(pre_existing_set_file):
+#             raise ValueError(
+#                 'split_success_only: Pre-existing validating file is not a file: ' +
+#                 pre_existing_set_file)
+
+#         val_set = get_existing_filenames(pre_existing_set_file)
+
+#     # Read filenames for the previous test set
+#     if not test_txt:
+#         test_set = []
+#     else:
+#         pre_existing_set_file = path + test_txt
+#         if not os.path.isfile(pre_existing_set_file):
+#             raise ValueError(
+#                 'split_success_only: Pre-existing testing file is not a file: ' +
+#                 pre_existing_set_file)
+
+#         test_set = get_existing_filenames(pre_existing_set_file)
+
+#     # Inform the user that the length of val and test will be matched for output,
+#     # when the lengths of val and test are not equal
+#     if len(val_set) is not len(test_set):
+#         print('Validation set and testing set do not have the same length. '
+#               'Output results will be adjusted to same size sets')
+
+#     # Randomize the filenames
+#     random.shuffle(filenames)
+
+#     # Split the dataset
+#     train_set, val_set, test_set = split_dataset(
+#         filenames, train_set, val_set, test_set, val_len, test_len)
+
+#     # Sanity check
+#     for i in val_set:
+#         if i in train_set:
+#             raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
+#             # print("split_success_only: val attempt in train set! %s" % i)
+#     for i in test_set:
+#         if i in train_set:
+#             raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
+#             # print("split_success_only: test attempt in train set! %s" % i)
+#     for i in test_set:
+#         if i in val_set:
+#             raise RuntimeError("split_success_only: test attempt in val set! %s" % i)
+#             # print("split_success_only: test attempt in train set! %s" % i)
+#     if (len(train_set) + len(val_set) + len(test_set)) != len(filenames):
+#         print("ERROR! lenth of train, val and test = %d, %d, %d"
+#               % (len(train_set), len(val_set), len(test_set)))
+#         print("Length of all files: %d" % len(filenames))
+#         raise RuntimeError("split_success_only: Numbers do not add up. Something is wrong!")
+#     print("Split complete. Sanity check passed.")
+
+#     # Write the output files
+#     output_file(path, plush, output_name, 'success_only_train', train_set)
+#     output_file(path, plush, output_name, 'success_only_val', val_set)
+#     output_file(path, plush, output_name, 'success_only_test', test_set)
+
+def read_existing_files(
+        dir_path, dataset_name, dir_name, category_name, existing_file_prefix):
+    '''Try to open existing train/val/test txt files.
+    1. Try [dataset_name]_[dir_name]_[category_name]_[subset_name]_files.txt
+       e.g. costar_block_stacking_dataset_v0.4_blocks_only_success_only_train_files.txt
+    2. If that does not work, try the prefixes in existing_file_prefix
+       e.g. costar_plush_block_stacking_dataset_v0.4_success_only_train_files.txt
+
+    :return train_val_test_filenames: A list of 3 elements representing [train, val, test]
+                                      filenames. If no file is found for that subset, the
+                                      corresponding element will be None
     '''
-    output_filename = output_prefix + '_' + set_name + '_files.txt'
-
-    list_filename = os.path.join(path, output_filename)
-    print('Writing ' + list_filename)
-    f = open(list_filename, 'w')
-    # print(f)
-
-    if plush:
-        folder = 'blocks_with_plush_toy/'
-    else:
-        folder = 'blocks_only/'
-
-    prefix_path = \
-        "~/.keras/datasets/costar_block_stacking_dataset_v0.4/" + folder
-
-    for filename in filenames:
-        # print filename
-        f.write(prefix_path + filename + '\n')
-
-    f.close()
-
-
-    return list_filename
-
-
-def split_success_only(
-        filenames, path, plush, train_txt, val_txt, test_txt,
-        val_len=None, test_len=None):
-    '''Splits success files into success_only train/val/test txt files.
-
-    :param filenames: A list of .h5f filenames under the path.
-    :param path: Path to the folder with the .h5f files.
-    :param plush: A bool indicating whether the program is processing plush subset.
-    :param train_txt: Filename to a pre-existing train txt file.
-    :param val_txt: Filename to a pre-existing val txt file.
-    :param test_txt: Filename to a pre-existing test txt file.
-    :param val_len: Expected output val set length.
-    :param test_len: Expected output test set length.
-    :param output_name: Filename prefix to the output train/val/test txt files.
-    '''
-    # Read files that are success
-    filenames = [filename for filename in filenames if '.success.h5f' in filename]
-    print('Selecting ' + str(len(filenames)) + ' success files')
-
-    # Read filenames for the previous training set
-    if not train_txt:
-        train_set = []
-    else:
-        pre_existing_set_file = path + train_txt
-        if not os.path.isfile(pre_existing_set_file):
-            raise ValueError(
-                'split_success_only: Pre-existing training file is not a file: ' +
-                pre_existing_set_file)
-
-        train_set = get_existing_filenames(pre_existing_set_file)
-
-    # Read filenames for the previous validation set
-    if not val_txt:
-        val_set = []
-    else:
-        pre_existing_set_file = path + val_txt
-        if not os.path.isfile(pre_existing_set_file):
-            raise ValueError(
-                'split_success_only: Pre-existing validating file is not a file: ' +
-                pre_existing_set_file)
-
-        val_set = get_existing_filenames(pre_existing_set_file)
-
-    # Read filenames for the previous test set
-    if not test_txt:
-        test_set = []
-    else:
-        pre_existing_set_file = path + test_txt
-        if not os.path.isfile(pre_existing_set_file):
-            raise ValueError(
-                'split_success_only: Pre-existing testing file is not a file: ' +
-                pre_existing_set_file)
-
-        test_set = get_existing_filenames(pre_existing_set_file)
-
-    # Inform the user that the length of val and test will be matched for output,
-    # when the lengths of val and test are not equal
-    if len(val_set) is not len(test_set):
-        print('Validation set and testing set do not have the same length. '
-              'Output results will be adjusted to same size sets')
-
-    # Randomize the filenames
-    random.shuffle(filenames)
-
-    # Split the dataset
-    train_set, val_set, test_set = split_dataset(
-        filenames, train_set, val_set, test_set, val_len, test_len)
-
+    # Read the train/val/test set from dataset name and dir name
+    train_val_test_filenames = []
+    for subset_name in ['train', 'val', 'test']:
+        txt_filename = "{0}_{1}_{2}_{3}_files.txt".format(
+                        dataset_name, dir_name, category_name, subset_name)
+        txt_file_path = os.path.join(dir_path, txt_filename)
+        print("Trying %s..." % txt_file_path)
+        if os.path.isfile(txt_file_path):
+            print("Existing {} txt file found: {}".format(subset_name, txt_filename))
+            train_val_test_filenames.append(get_existing_filenames(txt_file_path))
+        else:
+            train_val_test_filenames.append(None)
+
+    # If no txt files are found, look in existing_file_prefix
+    if any(l is None for l in train_val_test_filenames):
+        for prefix in existing_file_prefix:
+            train_val_test_filenames = []
+            for subset_name in ['train', 'val', 'test']:
+                txt_filename = "{0}_{1}_{2}_files.txt".format(
+                                prefix, category_name, subset_name)
+                txt_file_path = os.path.join(dir_path, txt_filename)
+                print("Trying %s..." % txt_file_path)
+                if os.path.isfile(txt_file_path):
+                    print("Existing {} txt file found: {}".format(
+                           subset_name, txt_filename))
+                    train_val_test_filenames.append(get_existing_filenames(txt_file_path))
+                else:
+                    train_val_test_filenames.append(None)
+
+            # Successfully read some pre-existing train/val/test txt files
+            if any(l is not None for l in train_val_test_filenames):
+                break
+
+    return train_val_test_filenames
+
+
+def split_sanity_check(train_set, val_set, test_set, len_filenames):
     # Sanity check
     for i in val_set:
         if i in train_set:
-            raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
-            # print("split_success_only: val attempt in train set! %s" % i)
+            raise RuntimeError("split_sanity_check: val attempt in train set! %s" % i)
     for i in test_set:
         if i in train_set:
-            raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
-            # print("split_success_only: test attempt in train set! %s" % i)
+            raise RuntimeError("split_sanity_check: test attempt in train set! %s" % i)
     for i in test_set:
         if i in val_set:
-            raise RuntimeError("split_success_only: test attempt in val set! %s" % i)
-            # print("split_success_only: test attempt in train set! %s" % i)
-    if (len(train_set) + len(val_set) + len(test_set)) != len(filenames):
+            raise RuntimeError("split_sanity_check: test attempt in val set! %s" % i)
+    if (len(train_set) + len(val_set) + len(test_set)) != len_filenames:
         print("ERROR! lenth of train, val and test = %d, %d, %d"
               % (len(train_set), len(val_set), len(test_set)))
-        print("Length of all files: %d" % len(filenames))
-        raise RuntimeError("split_success_only: Numbers do not add up. Something is wrong!")
-    print("Split complete. Sanity check passed.")
-
-    # Write the output files
-    output_file(path, plush, output_name, 'success_only_train', train_set)
-    output_file(path, plush, output_name, 'success_only_val', val_set)
-    output_file(path, plush, output_name, 'success_only_test', test_set)
+        print("Length of all files: %d" % len_filenames)
+        raise RuntimeError("split_sanity_check: Numbers do not add up!!!")
+    print("Sanity check passed.")
 
 
 def split_all(
-        filenames, path, plush, train_txt, val_txt, test_txt,
+        filenames, dataset_name, dir_name, dir_path, success_only, existing_file_prefix,
         val_len=None, test_len=None):
-    '''Splits failure files into all_failure_only, task_failure_only and
-    error_failure_only subsets.
+    '''Splits all files into success_only, task_failure_only, error_failure_only, and
+    task_and_error_failure subsets.
     1. Open all filenames with h5py to only count the files that contain images
     2. Calculate success:failure:error ratios
-    3. Refer to pre-existing success_only train/val/test txt file counts and output
-       train/val/test txt files according to the calculated success:failure:error ratio.
+    3. Try to open success_only train/val/test txt file in existing_file_prefix for
+       length reference. Output train/val/test txt files according to the calculated
+       success:failure:error ratio. If no file is found, use val_len and test_len as
+       basis for creating success_only train/val/test txt files.
 
     :param filenames: A list of .h5f filenames under the path.
-    :param path: Path to the folder with the .h5f files.
-    :param plush: A bool indicating whether the program is processing plush subset.
-    :param train_txt: Filename to success_only train txt file.
-    :param val_txt: Filename to success_only val txt file.
-    :param test_txt: Filename to success_only test txt file.
+    :param dataset_name: The name of the dataset.
+    :param dir_name: The folder name for that the .h5f files are under.
+    :param dir_path: Path to the folder with the .h5f files.
+    :param success_only: A bool indicating whether the program should only process
+                         success_only files.
+    :param existing_file_prefix: Txt file prefixes for the program to look for in
+                                 the folder to open as success_only reference.
     :param val_len: Expected output val set length.
     :param test_len: Expected output test set length.
-    :param output_name: Filename prefix to the output train/val/test txt files.
+    :return: A list of 4 lists that contain 3 sublists.
+             The 4 lists are in the format of [success_only, task_and_error_failure,
+             task_failure_only, error_failure_only]
+             Each sublist contains [train, val, test] filenames.
     '''
     # Get the success, failure, and error filenames with nonzero frames
+    print("Checking h5f files in {}".format(dir_path))
     success_filenames, failure_filenames, error_filenames = count_files_containing_images(
-                                                                path, filenames)
+                                                                dir_path, filenames)
 
     # Calculate the percentage of success, failure and error
     total_file_count = (
@@ -353,62 +433,62 @@ def split_all(
     print("Ratios: {:.2f}% success, {:.2f}% failure(no error), {:.2f}% error".format(
             success_ratio*100, failure_ratio*100, error_ratio*100))
 
-    # Read the train/val set from success_only subset
-    if plush:
-        default_name = 'costar_plush_block_stacking_v0.4_success_only_'
-    else:
-        default_name = 'costar_block_stacking_v0.4_success_only_'
-    # Read filenames for the previous training set
-    if not train_txt:
-        # Look for v0.4 success only train filenames
-        print('No train file is specified. Trying to open v0.4 success only...')
-        pre_existing_set_file = path + default_name + 'train_files.txt'
-    else:
-        pre_existing_set_file = path + train_txt
-
-    if not os.path.isfile(pre_existing_set_file):
-        raise ValueError(
-            'Pre-existing training file is not a file: ' +
-            pre_existing_set_file)
-    success_train_len = len(get_existing_filenames(pre_existing_set_file))
-
-    # Read filenames for the previous validation set
-    if not val_txt:
-        # Look for v0.4 success only val filenames
-        print('No val file is specified. Trying to open v0.4 success only...')
-        pre_existing_set_file = path + default_name + 'val_files.txt'
-    else:
-        pre_existing_set_file = path + val_txt
-
-    if not os.path.isfile(pre_existing_set_file):
-        raise ValueError(
-            'Pre-existing validating file is not a file: ' +
-            pre_existing_set_file)
-    success_val_len = len(get_existing_filenames(pre_existing_set_file))
-
-    # Read filenames for the previous test set
-    if not test_txt:
-        # Look for v0.4 success only train filenames
-        print('No test file is specified. Trying to open v0.4 success only...')
-        pre_existing_set_file = path + default_name + 'test_files.txt'
-    else:
-        pre_existing_set_file = path + test_txt
+    # Process success_only files
+    train_val_test_filenames = read_existing_files(
+        dir_path, dataset_name, dir_name, 'success_only', existing_file_prefix)
+
+    # Extract the filenames into subsets; a subset with no txt file becomes []
+    success_train_set, success_val_set, success_test_set = [
+        l if l is not None else [] for l in train_val_test_filenames]
+    success_train_len, success_val_len, success_test_len = list(
+        map(len, [success_train_set, success_val_set, success_test_set]))
+    if success_val_len == 0:
+        success_val_len = val_len if val_len is not None else 64
+        print("No val set found. Output val length is {}.".format(success_val_len))
+    if success_test_len == 0:
+        success_test_len = test_len if test_len is not None else 64
+        print("No test set found. Output test length is {}.".format(success_test_len))
+
+    # Split the dataset
+    random.shuffle(success_filenames)
+    sucess_train_set, success_val_set, success_test_set = split_dataset(
+        success_filenames, success_train_set, success_val_set, success_test_set,
+        success_val_len, success_test_len)
 
-    if not os.path.isfile(pre_existing_set_file):
-        raise ValueError(
-            'Pre-existing testing file is not a file: ' +
-            pre_existing_set_file)
-    success_test_len = len(get_existing_filenames(pre_existing_set_file))
+    # Sanity check
+    print("success_only split complete.")
+    split_sanity_check(
+        sucess_train_set, success_val_set, success_test_set, len(success_filenames))
+
+    if success_only:
+        # If only sucess_only files should be processed, then return the result now
+        return [[sucess_train_set, success_val_set, success_test_set], [], [], []]
+
+    # Process .failure.error files
+    train_val_test_filenames = read_existing_files(
+        dir_path, dataset_name, dir_name, 'error_failure_only', existing_file_prefix)
+    err_train_set, err_val_set, err_test_set = [
+        l if l is not None else [] for l in train_val_test_filenames]
+    # error_train_len, error_val_len, error_test_len = list(
+    #     map(len, train_val_test_filenames))
+
+    # Process .failure files
+    train_val_test_filenames = read_existing_files(
+        dir_path, dataset_name, dir_name, 'task_failure_only', existing_file_prefix)
+    fail_train_set, fail_val_set, fail_test_set = [
+        l if l is not None else [] for l in train_val_test_filenames]
+    # failure_train_len, failure_val_len, failure_test_len = list(
+    #     map(len, train_val_test_filenames))
 
     # Calculate set size for failure and error, based on success_only subset
     multiplier_failure = len(failure_filenames)/len(success_filenames)
     failure_val_len = int(round(success_val_len*multiplier_failure))
     failure_test_len = int(round(success_test_len*multiplier_failure))
-    failure_train_len = len(failure_filenames) - (failure_val_len + failure_test_len)
+    # failure_train_len = len(failure_filenames) - (failure_val_len + failure_test_len)
     multiplier_error = len(error_filenames)/len(success_filenames)
     error_val_len = int(round(success_val_len*multiplier_error))
     error_test_len = int(round(success_test_len*multiplier_error))
-    error_train_len = len(error_filenames) - (error_val_len + error_test_len)
+    # error_train_len = len(error_filenames) - (error_val_len + error_test_len)
 
     # Randomize the filenames
     random.shuffle(failure_filenames)
@@ -416,59 +496,28 @@ def split_all(
 
     # Split the dataset for failure and error
     fail_train_set, fail_val_set, fail_test_set = split_dataset(
-        failure_filenames, [], [], [], failure_val_len, failure_test_len)
-    err_train_set,  err_val_set,  err_test_set = split_dataset(
-        error_filenames, [], [], [], error_val_len, error_test_len)
-
-    # Sanity check
-    for i in fail_val_set:
-        if i in fail_train_set:
-            raise RuntimeError("split_all: fail: val attempt in train set! %s" % i)
-            # print("split_all: fail: val attempt in train set! %s" % i)
-    for i in fail_test_set:
-        if i in fail_train_set:
-            raise RuntimeError("split_all: fail: test attempt in train set! %s" % i)
-            # print("split_all: fail: test attempt in train set! %s" % i)
-    for i in err_val_set:
-        if i in err_train_set:
-            raise RuntimeError("split_all: err: val attempt in train set! %s" % i)
-            # print("split_all: err: val attempt in train set! %s" % i)
-    for i in err_test_set:
-        if i in err_train_set:
-            raise RuntimeError("split_all: err: test attempt in train set! %s" % i)
-            # print("split_all: err: test attempt in train set! %s" % i)
-    for i in err_train_set:
-        if i in fail_train_set:
-            raise RuntimeError("split_all: err train set overlap with fail train set! %s" % i)
-            # print("split_all: err train set overlap with fail train set! %s" % i)
-    for i in err_val_set:
-        if i in fail_val_set:
-            raise RuntimeError("split_all: err val set overlap with fail val set! %s" % i)
-            # print("split_all: err val set overlap with fail val set! %s" % i)
-    for i in err_test_set:
-        if i in fail_test_set:
-            raise RuntimeError("split_all: err test set overlap with fail test set! %s" % i)
-            # print("split_all: err test set overlap with fail test set! %s" % i)
-    print("Split complete. Sanity check passed.")
-
-    with open(csv_path, 'w+') as file_object:
-        file_object.write(dataset_splits_csv)
-
-    # Write the output files
-    output_file(path, plush, output_name, 'task_failure_only_train', fail_train_set)
-    output_file(path, plush, output_name, 'task_failure_only_val', fail_val_set)
-    output_file(path, plush, output_name, 'task_failure_only_test', fail_test_set)
-    output_file(path, plush, output_name, 'error_failure_only_train', err_train_set)
-    output_file(path, plush, output_name, 'error_failure_only_val', err_val_set)
-    output_file(path, plush, output_name, 'error_failure_only_test', err_test_set)
+        failure_filenames, fail_train_set, fail_val_set, fail_test_set,
+        failure_val_len, failure_test_len)
+    print("task_failure_only split complete.")
+    split_sanity_check(
+        fail_train_set, fail_val_set, fail_test_set, len(failure_filenames))
+
+    err_train_set, err_val_set, err_test_set = split_dataset(
+        error_filenames, err_train_set, err_val_set, err_test_set,
+        error_val_len, error_test_len)
+    print("error_failure_only split complete.")
+    split_sanity_check(
+        err_train_set, err_val_set, err_test_set, len(error_filenames))
 
     # Error is also a type of failure! Combine task failure and error failure subsets.
-    fail_train_set += err_train_set
-    fail_val_set += err_val_set
-    fail_test_set += err_test_set
-    output_file(path, plush, output_name, 'task_and_error_failure_train', fail_train_set)
-    output_file(path, plush, output_name, 'task_and_error_failure_val', fail_val_set)
-    output_file(path, plush, output_name, 'task_and_error_failure_test', fail_test_set)
+    task_n_err_train_set = fail_train_set + err_train_set
+    task_n_err_val_set = fail_val_set + err_val_set
+    task_n_err_test_set = fail_test_set + err_test_set
+
+    return [[sucess_train_set, success_val_set, success_test_set],
+            [task_n_err_train_set, task_n_err_val_set, task_n_err_test_set],
+            [fail_train_set, fail_val_set, fail_test_set],
+            [err_train_set, err_val_set, err_test_set]]
 
 
 def count_files_containing_images(path, filenames):
@@ -581,14 +630,10 @@ def main(args, root='root'):
             continue
 
         # Split the dataset
-        if args['success_only']:
-            subsets = split_success_only(
-                filenames, dir_path, args['plush'], args['train'], args['val'],
-                args['test'], args['val_len'], args['test_len'])
-        else:
-            subsets = split_all(
-                filenames, dir_path, args['plush'], args['train'], args['val'],
-                args['test'], args['val_len'], args['test_len'])
+        subsets = split_all(
+                    filenames, args['dataset_name'], dir_name, dir_path,
+                    args['success_only'], args['existing_file_prefix'],
+                    args['val_len'], args['test_len'])
 
         # Output the files
         # Modify here to add more categories
@@ -597,40 +642,36 @@ def main(args, root='root'):
             'task_and_error_failure',
             'task_failure_only',
             'error_failure_only']
-        subset_names = [
-            'train',
-            'val',
-            'test']
+        subset_names = ['train','val','test']
         for i in range(len(subsets)):
             category_name = category_names[i]
             category_subsets = subsets[i]
             # Output the files
-            # TODO: include dataset name
             for j in range(len(category_subsets)):
                 subset_name = subset_names[j]
                 subset_filenames = category_subsets[j]
-                output_file(dir_path, dir_name, args['output_prefix'],
-                            args['dataset_name'], subset_name, subset_filenames)
-
-
-
-
-        dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
-        dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
-                                success_train_len, success_val_len, success_test_len)
-        dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
-                failure_train_len + error_train_len,
-                failure_val_len + error_val_len,
-                failure_test_len + error_test_len)
-        dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
-                failure_train_len, failure_val_len, failure_test_len)
-        dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
-                error_train_len, error_val_len, error_test_len)
-        dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
-        print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
-
-        csv_path = os.path.join(path, dataset_splits_csv_filename)
-        
+                output_file(args['dataset_path'], args['dataset_name'],
+                            dir_path, dir_name, category_name,
+                            subset_name, subset_filenames, args['write'])
+
+        # TODO: Implement csv output
+        # dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
+        # dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
+        #                         success_train_len, success_val_len, success_test_len)
+        # dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
+        #         failure_train_len + error_train_len,
+        #         failure_val_len + error_val_len,
+        #         failure_test_len + error_test_len)
+        # dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
+        #         failure_train_len, failure_val_len, failure_test_len)
+        # dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
+        #         error_train_len, error_val_len, error_test_len)
+        # dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
+        # print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
+
+        # csv_path = os.path.join(path, dataset_splits_csv_filename)
+        # with open(csv_path, 'w+') as file_object:
+        #     file_object.write(dataset_splits_csv)
 
 
 if __name__ == '__main__':

From 976fd1b1e3554bf738919c04152b9c884fefaa27 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Tue, 16 Oct 2018 15:13:07 -0400
Subject: [PATCH 23/36] hyperopt_plot.py dramatically improved plot output with
 averages

---
 costar_hyper/hyperopt_plot.py | 40 +++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/costar_hyper/hyperopt_plot.py b/costar_hyper/hyperopt_plot.py
index 5b5f9519a..364acf1c3 100644
--- a/costar_hyper/hyperopt_plot.py
+++ b/costar_hyper/hyperopt_plot.py
@@ -296,6 +296,8 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
                     values = values + [val]
                     split_values = split_values + [split_val]
                     name = row['basename'][:number_of_time_characters]
+                    # Uncomment below for separate train val test model names
+                    # name = name + ' ' + tvt
                     if 'epoch' in row:
                         # add an epoch field
                         name = name + epoch_name
@@ -308,6 +310,8 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
                     acc_limits = acc_limits + [acc_limit]
                     tvts = tvts + [tvt]
 
+    # Debug aid: print which split (train, val, or test) each entry belongs to
+    # print('tvts: ' + str(tvts))
     dictionary = {'name': names,
                   'error_distribution_limits': acc_range_limits,
                   'accuracy_range_value': split_values,
@@ -333,18 +337,32 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 rdf = create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 
 # key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Accuracy Range'), ('train_val_test', 'Train Val Test')]
-key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Error Distribution')]
+key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Error Distribution'), ('train_val_test', 'Dataset Split')]
 key_dimension_display_strs = [vt[1] for vt in key_dimensions]
-
-table = hv.Table(rdf, key_dimensions, 'accuracy_range_value')
-print('1.0 table created')
-table_bars = table.to.bars(key_dimension_display_strs, 'accuracy_range_value', ['train_val_test'])
-table_bars = table_bars.options(stack_index=1, width=1920, height=1080, xrotation=90, tools=['hover'])
-print('2.0 table bars')
-table_plot = renderer.get_plot(table_bars)
-print('3.0 table plot')
-plot_list = [[table_plot.state]]
-print('3.0 plot list')
+value_dimensions = [('accuracy_range_value', 'Cumulative Fraction of Examples'), ('avg_error', 'Average Error')]#, ('train_val_test', 'Dataset Split')]
+value_dimension_display_strs = [vt[1] for vt in value_dimensions]
+distribution_table = hv.Table(rdf, key_dimensions, value_dimensions)
+print('1.0 dist table created')
+distribution_table_bars = distribution_table.to.bars(key_dimension_display_strs, value_dimension_display_strs, ['train_val_test'])
+distribution_table_bars = distribution_table_bars.options(stack_index=1, width=1920, height=1080, xrotation=90, tools=['hover'], group_index=2, cmap='RdYlGn_r')
+# distribution_table_bars = distribution_table_bars.overlay('train_val_test')
+print('2.0 dist table bars')
+distribution_table_plot = renderer.get_plot(distribution_table_bars)
+
+print('3.0 dist table created')
+key_dimensions = [('name', 'Model')]
+key_dimension_display_strs = [vt[1] for vt in key_dimensions]
+value_dimensions = [('avg_error', 'Average Error'), ('train_val_test', 'Dataset Split')]
+value_dimension_display_strs = [vt[1] for vt in value_dimensions]
+avg_table_bars = hv.Table(rdf, key_dimensions, value_dimensions)
+print('4.0 avg table created')
+avg_table_bars = avg_table_bars.to.bars(key_dimension_display_strs, value_dimension_display_strs, ['train_val_test'])
+avg_table_bars = avg_table_bars.options(width=1920, height=640, xrotation=90, tools=['hover'])
+print('4.0 avg table bars')
+avg_table_plot = renderer.get_plot(avg_table_bars)
+print('5.0 table plot')
+plot_list = [[distribution_table_plot.state], [avg_table_plot.state]]
+print('6.0 plot list')
 # layout_child = layout(plot_list, sizing_mode='fixed')
 layout_child = layout(plot_list)
 curdoc().clear()

From 563ce982027b12334deac27a7fc577b10640bec9 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 15:55:01 -0400
Subject: [PATCH 24/36] WIP: Debug all functionalities. Working on output
 combined files

---
 .../costar_block_stacking_split_dataset.py    | 326 +++++++++---------
 1 file changed, 157 insertions(+), 169 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 84c005d16..f578b7715 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -26,11 +26,16 @@
 import argparse
 import os
 import random
+import h5py  # Needs h5py to open the files and check frame count
 try:
-    import h5py  # Needs h5py to open the files and check frame count
+    from tqdm import tqdm
 except ImportError:
-    print("h5py is not available.")
-    h5py = None
+    print("tqdm is not available. Progress bar functionalities will be disabled.")
+
+    def tqdm(*args, **kwargs):
+        if args:
+            return args[0]
+        return kwargs.get('iterable', None)
 
 
 def _parse_args():
@@ -49,7 +54,7 @@ def _parse_args():
                         help='The folder that is expected stores the dataset. '
                              'Filenames in the output file will reference this path.')
     parser.add_argument("--dataset_name", type=str,
-                        default='costar_block_stacking_dataset',
+                        default='costar_block_stacking_dataset_v0.4',
                         help='Dataset name to store under dataset path.'
                              'Filenames in the output file will reference this name.')
     # parser.add_argument("--dataset_version", type=str, default='v0.4',
@@ -76,8 +81,8 @@ def _parse_args():
     parser.add_argument("--write", action='store_true', default=False,
                         help='Write to output files')
     parser.add_argument("--existing_file_prefix", type=str, nargs='+',
-                        default=["costar_plush_block_stacking_dataset_v0.4",
-                                 "costar_block_stacking_dataset_v0.4"],
+                        default=["costar_plush_block_stacking_v0.4",
+                                 "costar_block_stacking_v0.4"],
                         help="Existing txt file prefixes to look for when opening "
                              "train/val/test files.")
     return vars(parser.parse_args())
@@ -101,13 +106,13 @@ def get_existing_filenames(path_to_file):
         # Extract the file names and add them to the returning list
         filename = extract_filename_from_url(line)
         if not filename:
-            print("get_existing_filenames: Empty line extracted.")
+            print(">>>get_existing_filenames: Empty line extracted.")
             continue
         filenames.append(filename)
 
     f.close()
 
-    print('Read ' + str(len(filenames)) + ' filenames from ' + path_to_file)
+    print('>>Read ' + str(len(filenames)) + ' filenames from ' + path_to_file)
     return filenames
 
 
@@ -130,24 +135,86 @@ def output_file(dataset_path, dataset_name, dir_path, dir_name,
     '''
     output_filename = "{0}_{1}_{2}_{3}_files.txt".format(
                         dataset_name, dir_name, category_name, subset_name)
-    output_path = os.path.join(dir_path, dir_name, output_filename)
+    output_path = os.path.join(dir_path, output_filename)
     prefix_path = os.path.join(dataset_path, dataset_name, dir_name)
     print("Output txt file: {}".format(output_path))
-    print("Length: {} files".format(len(subset_filenames)))
-    print("Example .h5f path in the txt file: %s".format(
+    print(">Length: {} files".format(len(subset_filenames)))
+    print(">Example .h5f path in the txt file: {}".format(
           os.path.join(prefix_path, subset_filenames[0])))
 
+    len_filenames = len(subset_filenames)
     if write:
-        with open(output_filename, 'w') as f:
-            for filename in subset_filenames:
+        with open(output_path, 'w') as f:
+            for i in range(len_filenames):
+                filename = subset_filenames[i]
+                linebreak = '\n' if i != len_filenames else ''
                 file_path = os.path.join(prefix_path, filename)
-                f.write(file_path + '\n')
+                f.write(file_path + linebreak)
+        print(">>Successfully saved as {}".format(output_filename))
     else:
-        print("File not written. Use --write flag to actually output the file.")
+        print(">>File not written. Use --write flag to actually output the file.")
 
     return output_path
 
 
+def output_csv(path, subsets):
+    '''Output split summary csv file
+
+    :param path: The path to store the csv file
+    :param subsets: The list of lists returned by split_all.
+                    A list of 4 lists, each containing 3 sublists.
+                    The 4 lists are in the format of [success_only,
+                    task_and_error_failure, task_failure_only,
+                    error_failure_only]
+                    Each sublist contains [train, val, test] filenames.
+    :return csv_path: The path to the output csv file.
+    '''
+    # TODO(rexxarchl): Implement csv output
+    success, _, task_fail, err_fail = subsets
+    success_train_len, success_val_len, success_test_len = map(len, success)
+    failure_train_len, failure_val_len, failure_test_len = map(len, task_fail)
+    error_train_len, error_val_len, error_test_len = map(len, err_fail)
+
+    dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
+    dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
+                            success_train_len, success_val_len, success_test_len)
+    dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
+            failure_train_len + error_train_len,
+            failure_val_len + error_val_len,
+            failure_test_len + error_test_len)
+    dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
+            failure_train_len, failure_val_len, failure_test_len)
+    dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
+            error_train_len, error_val_len, error_test_len)
+
+    dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
+    print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
+
+    csv_path = os.path.join(path, dataset_splits_csv_filename)
+    with open(csv_path, 'w+') as file_object:
+        file_object.write(dataset_splits_csv)
+
+    print('CSV file saved as %s' % csv_path)
+    return csv_path
+
+
+def output_combined_files(path, output_files_dict, category_names):
+    '''Output combined txt files and overall summary csv file.
+    :param path: The path in which to save the files.
+    :param output_files_dict: A dictionary of output files in each directory.
+                              Each key is a directory name; each value holds the
+                              txt file paths that were just written for it.
+    '''
+    # TODO(rexxarchl): implement this function
+    print(output_files_dict)
+
+
+    # train_val_test_filenames = [[]]
+    # for k, v in output_files_dict.items():
+
+    pass
+
+
 def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_len=None):
     '''Split the input filenames into three sets.
     If val_set and test_set are empty, the sets will be of length val_len and test_len.
@@ -162,6 +229,19 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
     :param test_len: The expected output test set length.
     :return train_set, val_set, test_set: train/val/test set filenames.
     '''
+    len_filenames = len(filenames)
+    len_all_sets = len(train_set) + len(val_set) + len(test_set)
+    files_added = len_filenames - len_all_sets
+    print("Total {} files, with {} files already in txt files.".format(
+        len_filenames, len_all_sets))
+    print("{} files added".format(files_added))
+    if files_added is 0:
+        print("No need to split. Returning original results.")
+        return train_set, val_set, test_set
+    if files_added < 0:
+        raise Exception("split_dataset: Total file count is smaller than combined "
+                        "length of train/val/test set!")
+
     if len(test_set) is 0 and test_len is None:
         raise ValueError("split_dataset: test_set is empty and no test_len is specified!")
     if len(val_set) is 0 and val_len is None:
@@ -231,99 +311,6 @@ def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_le
     return train_set, val_set, test_set
 
 
-# def split_success_only(
-#         filenames, path, plush, train_txt, val_txt, test_txt,
-#         val_len=None, test_len=None):
-#     '''Splits success files into success_only train/val/test txt files.
-
-#     :param filenames: A list of .h5f filenames under the path.
-#     :param path: Path to the folder with the .h5f files.
-#     :param plush: A bool indicating whether the program is processing plush subset.
-#     :param train_txt: Filename to a pre-existing train txt file.
-#     :param val_txt: Filename to a pre-existing val txt file.
-#     :param test_txt: Filename to a pre-existing test txt file.
-#     :param val_len: Expected output val set length.
-#     :param test_len: Expected output test set length.
-#     :param output_name: Filename prefix to the output train/val/test txt files.
-#     '''
-#     # Read files that are success
-#     filenames = [filename for filename in filenames if '.success.h5f' in filename]
-#     print('Selecting ' + str(len(filenames)) + ' success files')
-
-#     # Read filenames for the previous training set
-#     if not train_txt:
-#         train_set = []
-#     else:
-#         pre_existing_set_file = path + train_txt
-#         if not os.path.isfile(pre_existing_set_file):
-#             raise ValueError(
-#                 'split_success_only: Pre-existing training file is not a file: ' +
-#                 pre_existing_set_file)
-
-#         train_set = get_existing_filenames(pre_existing_set_file)
-
-#     # Read filenames for the previous validation set
-#     if not val_txt:
-#         val_set = []
-#     else:
-#         pre_existing_set_file = path + val_txt
-#         if not os.path.isfile(pre_existing_set_file):
-#             raise ValueError(
-#                 'split_success_only: Pre-existing validating file is not a file: ' +
-#                 pre_existing_set_file)
-
-#         val_set = get_existing_filenames(pre_existing_set_file)
-
-#     # Read filenames for the previous test set
-#     if not test_txt:
-#         test_set = []
-#     else:
-#         pre_existing_set_file = path + test_txt
-#         if not os.path.isfile(pre_existing_set_file):
-#             raise ValueError(
-#                 'split_success_only: Pre-existing testing file is not a file: ' +
-#                 pre_existing_set_file)
-
-#         test_set = get_existing_filenames(pre_existing_set_file)
-
-#     # Inform the user that the length of val and test will be matched for output,
-#     # when the lengths of val and test are not equal
-#     if len(val_set) is not len(test_set):
-#         print('Validation set and testing set do not have the same length. '
-#               'Output results will be adjusted to same size sets')
-
-#     # Randomize the filenames
-#     random.shuffle(filenames)
-
-#     # Split the dataset
-#     train_set, val_set, test_set = split_dataset(
-#         filenames, train_set, val_set, test_set, val_len, test_len)
-
-#     # Sanity check
-#     for i in val_set:
-#         if i in train_set:
-#             raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
-#             # print("split_success_only: val attempt in train set! %s" % i)
-#     for i in test_set:
-#         if i in train_set:
-#             raise RuntimeError("split_success_only: test attempt in train set! %s" % i)
-#             # print("split_success_only: test attempt in train set! %s" % i)
-#     for i in test_set:
-#         if i in val_set:
-#             raise RuntimeError("split_success_only: test attempt in val set! %s" % i)
-#             # print("split_success_only: test attempt in train set! %s" % i)
-#     if (len(train_set) + len(val_set) + len(test_set)) != len(filenames):
-#         print("ERROR! lenth of train, val and test = %d, %d, %d"
-#               % (len(train_set), len(val_set), len(test_set)))
-#         print("Length of all files: %d" % len(filenames))
-#         raise RuntimeError("split_success_only: Numbers do not add up. Something is wrong!")
-#     print("Split complete. Sanity check passed.")
-
-#     # Write the output files
-#     output_file(path, plush, output_name, 'success_only_train', train_set)
-#     output_file(path, plush, output_name, 'success_only_val', val_set)
-#     output_file(path, plush, output_name, 'success_only_test', test_set)
-
 def read_existing_files(
         dir_path, dataset_name, dir_name, category_name, existing_file_prefix):
     '''Try to open existing train/val/test txt files.
@@ -342,9 +329,9 @@ def read_existing_files(
         txt_filename = "{0}_{1}_{2}_{3}_files.txt".format(
                         dataset_name, dir_name, category_name, subset_name)
         txt_file_path = os.path.join(dir_path, txt_filename)
-        print("Trying %s..." % txt_file_path)
+        print(">Trying %s..." % txt_file_path)
         if os.path.isfile(txt_file_path):
-            print("Existing {} txt file found: {}".format(subset_name, txt_filename))
+            print(">>Existing {} txt file found: {}".format(subset_name, txt_filename))
             train_val_test_filenames.append(get_existing_filenames(txt_file_path))
         else:
             train_val_test_filenames.append(None)
@@ -357,9 +344,9 @@ def read_existing_files(
                 txt_filename = "{0}_{1}_{2}_files.txt".format(
                                 prefix, category_name, subset_name)
                 txt_file_path = os.path.join(dir_path, txt_filename)
-                print("Trying %s..." % txt_file_path)
+                print(">Trying %s..." % txt_file_path)
                 if os.path.isfile(txt_file_path):
-                    print("Existing {} txt file found: {}".format(
+                    print(">>Existing {} txt file found: {}".format(
                            subset_name, txt_filename))
                     train_val_test_filenames.append(get_existing_filenames(txt_file_path))
                 else:
@@ -419,7 +406,6 @@ def split_all(
              Each sublist contains [train, val, test] filenames.
     '''
     # Get the success, failure, and error filenames with nonzero frames
-    print("Checking h5f files in {}".format(dir_path))
     success_filenames, failure_filenames, error_filenames = count_files_containing_images(
                                                                 dir_path, filenames)
 
@@ -430,16 +416,18 @@ def split_all(
     failure_ratio = len(failure_filenames) / total_file_count
     error_ratio = len(error_filenames) / total_file_count
     print("Total: %d files" % total_file_count)
-    print("Ratios: {:.2f}% success, {:.2f}% failure(no error), {:.2f}% error".format(
+    print("Ratios: {:.2f}% success, {:.2f}% task_failure, {:.2f}% error_failure".format(
             success_ratio*100, failure_ratio*100, error_ratio*100))
 
     # Process success_only files
+    print("\nProcessing success files")
     train_val_test_filenames = read_existing_files(
         dir_path, dataset_name, dir_name, 'success_only', existing_file_prefix)
 
     # Extract the filenames into subsets
-    success_train_set, success_val_set, success_test_set = [
-        l if l is not None else [] for l in train_val_test_filenames]
+    train_val_test_filenames = [l if l is not None else [] 
+                                for l in train_val_test_filenames]
+    success_train_set, success_val_set, success_test_set = train_val_test_filenames
     success_train_len, success_val_len, success_test_len = list(
         map(len, train_val_test_filenames))
     if success_val_len == 0:
@@ -464,15 +452,8 @@ def split_all(
         # If only sucess_only files should be processed, then return the result now
         return [[sucess_train_set, success_val_set, success_test_set], [], [], []]
 
-    # Process .failure.error files
-    train_val_test_filenames = read_existing_files(
-        dir_path, dataset_name, dir_name, 'error_failure_only', existing_file_prefix)
-    err_train_set, err_val_set, err_test_set = [
-        l if l is not None else [] for l in train_val_test_filenames]
-    # error_train_len, error_val_len, error_test_len = list(
-    #     map(len, train_val_test_filenames))
-
     # Process .failure files
+    print("\nProcessing task failure files")
     train_val_test_filenames = read_existing_files(
         dir_path, dataset_name, dir_name, 'task_failure_only', existing_file_prefix)
     fail_train_set, fail_val_set, fail_test_set = [
@@ -480,21 +461,14 @@ def split_all(
     # failure_train_len, failure_val_len, failure_test_len = list(
     #     map(len, train_val_test_filenames))
 
-    # Calculate set size for failure and error, based on success_only subset
+    # Calculate set size for failure, based on success_only subset
     multiplier_failure = len(failure_filenames)/len(success_filenames)
     failure_val_len = int(round(success_val_len*multiplier_failure))
     failure_test_len = int(round(success_test_len*multiplier_failure))
     # failure_train_len = len(failure_filenames) - (failure_val_len + failure_test_len)
-    multiplier_error = len(error_filenames)/len(success_filenames)
-    error_val_len = int(round(success_val_len*multiplier_error))
-    error_test_len = int(round(success_test_len*multiplier_error))
-    # error_train_len = len(error_filenames) - (error_val_len + error_test_len)
 
-    # Randomize the filenames
+    # Split the dataset for failure set
     random.shuffle(failure_filenames)
-    random.shuffle(error_filenames)
-
-    # Split the dataset for failure and error
     fail_train_set, fail_val_set, fail_test_set = split_dataset(
         failure_filenames, fail_train_set, fail_val_set, fail_test_set,
         failure_val_len, failure_test_len)
@@ -502,6 +476,23 @@ def split_all(
     split_sanity_check(
         fail_train_set, fail_val_set, fail_test_set, len(failure_filenames))
 
+    # Process .failure.error files
+    print("\nProcessing error failure files")
+    train_val_test_filenames = read_existing_files(
+        dir_path, dataset_name, dir_name, 'error_failure_only', existing_file_prefix)
+    err_train_set, err_val_set, err_test_set = [
+        l if l is not None else [] for l in train_val_test_filenames]
+    # error_train_len, error_val_len, error_test_len = list(
+    #     map(len, train_val_test_filenames))
+
+    # Calculate set size for error, based on success_only subset
+    multiplier_error = len(error_filenames)/len(success_filenames)
+    error_val_len = int(round(success_val_len*multiplier_error))
+    error_test_len = int(round(success_test_len*multiplier_error))
+    # error_train_len = len(error_filenames) - (error_val_len + error_test_len)
+
+    # Split the dataset for error
+    random.shuffle(error_filenames)
     err_train_set, err_val_set, err_test_set = split_dataset(
         error_filenames, err_train_set, err_val_set, err_test_set,
         error_val_len, error_test_len)
@@ -526,26 +517,21 @@ def count_files_containing_images(path, filenames):
     :param filenames: .h5f filenames in the folder
     :return: Lists of success/failure/error filenames with nonzero frames
     '''
-    # TODO: Write total frames into csv file as a new column
-
+    # TODO(rexxarchl): Write total frames into csv file as a new column
     # Open the files to check frame count. Skip files with 0 frame.
     error_filenames = []
     failure_filenames = []
     success_filenames = []
     skip_count = 0
-    i = 0
-    print("Checking %d files. This can take some time." % len(filenames))
-    for filename in filenames:
-        i += 1
-        if i % 100 == 0:
-            # TODO: incorporate tqdm progress bar
-            print("{} out of {} files checked".format(i, len(filenames)))
+    print("Checking {} files in {}...".format(len(filenames), path))
+    progress_bar = tqdm(filenames)
+    for filename in progress_bar:
         try:
             with h5py.File(os.path.join(path, filename), 'r') as data:
                 try:
                     total_frames = len(data['image'])
                 except KeyError as e:
-                    print('Skipping %s for KeyError' % filename)
+                    progress_bar.write('Skipping %s for KeyError' % filename)
                     continue
 
                 if total_frames == 0:  # Skip files with 0 frame
@@ -564,7 +550,7 @@ def count_files_containing_images(path, filenames):
                         'Somthing is wrong! The file does not contain `error`,'
                         '`failure`, or `success` in the filename: %s' % filename)
         except IOError as ex:
-            print('Skipping %s for IO error' % filename)
+            progress_bar.write('Skipping %s for IOError' % filename)
 
     print("Counted {:d} success files, {:d} failure files, and {:d} error files.".format(
             len(success_filenames), len(failure_filenames), len(error_filenames)))
@@ -618,13 +604,17 @@ def main(args, root='root'):
     # Set the random seed for reproducible random lists
     random.seed(args['seed'])
 
+    output_files_dict = dict()
     for dir_name in dir_list:
+        print("---------------------")
+        print("Processing directory: %s" % dir_name)
         dir_path = os.path.join(path, dir_name)
 
         # Get all h5f files under this folder
         filenames = [filename for filename in os.listdir(dir_path) if '.h5f' in filename]
         if filenames:
-            print('Read ' + str(len(filenames)) + ' h5f filenames in the folder')
+            print('Read ' + str(len(filenames)) +
+                  ' h5f filenames in directory %s' % dir_name)
         else:
             print('Skipping directory %s because it contains no h5f file.' % dir_name)
             continue
@@ -634,15 +624,20 @@ def main(args, root='root'):
                     filenames, args['dataset_name'], dir_name, dir_path,
                     args['success_only'], args['existing_file_prefix'],
                     args['val_len'], args['test_len'])
+        print("All splits complete. \n")
 
         # Output the files
-        # Modify here to add more categories
+        # NOTE: Modify here to add more categories. Modify the split_all function too.
         category_names = [
             'success_only',
             'task_and_error_failure',
             'task_failure_only',
             'error_failure_only']
-        subset_names = ['train','val','test']
+        if len(category_names) != len(subsets):
+            raise Exception("Length of categories does not match the length of lists "
+                            "returned by split_all. Did you add more categories?")
+        dir_output_file_dict = dict()
+        subset_names = ['train', 'val', 'test']
         for i in range(len(subsets)):
             category_name = category_names[i]
             category_subsets = subsets[i]
@@ -650,28 +645,21 @@ def main(args, root='root'):
             for j in range(len(category_subsets)):
                 subset_name = subset_names[j]
                 subset_filenames = category_subsets[j]
-                output_file(args['dataset_path'], args['dataset_name'],
-                            dir_path, dir_name, category_name,
-                            subset_name, subset_filenames)
-
-        # TODO: Implement csv output
-        # dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
-        # dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
-        #                         success_train_len, success_val_len, success_test_len)
-        # dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
-        #         failure_train_len + error_train_len,
-        #         failure_val_len + error_val_len,
-        #         failure_test_len + error_test_len)
-        # dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
-        #         failure_train_len, failure_val_len, failure_test_len)
-        # dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
-        #         error_train_len, error_val_len, error_test_len)
-        # dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
-        # print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
-
-        # csv_path = os.path.join(path, dataset_splits_csv_filename)
-        # with open(csv_path, 'w+') as file_object:
-        #     file_object.write(dataset_splits_csv)
+
+                # Output the files and store the outputted file paths in the dictionary
+                dir_output_file_dict[category_name] = output_file(
+                        args['dataset_path'], args['dataset_name'],
+                        dir_path, dir_name, category_name,
+                        subset_name, subset_filenames, args['write'])
+        # Store the outputted file paths in this directory
+        output_files_dict[dir_name] = dir_output_file_dict
+
+        if args['write'] and not args['success_only']:
+            # Output csv file
+            output_csv(dir_path, subsets)
+
+    if args['write'] and not args['success_only']:
+        output_combined_files(path, output_files_dict, category_names)
 
 
 if __name__ == '__main__':

From 23189ff1edabe68757005a01cfcaa97f25fa9e2f Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Tue, 16 Oct 2018 17:00:01 -0400
Subject: [PATCH 25/36] hyperopt_plot.py provides a proper summary now

---
 costar_hyper/hyperopt_plot.py | 37 +++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/costar_hyper/hyperopt_plot.py b/costar_hyper/hyperopt_plot.py
index 364acf1c3..8835263f1 100644
--- a/costar_hyper/hyperopt_plot.py
+++ b/costar_hyper/hyperopt_plot.py
@@ -337,31 +337,44 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 rdf = create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 
 # key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Accuracy Range'), ('train_val_test', 'Train Val Test')]
-key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Error Distribution'), ('train_val_test', 'Dataset Split')]
+key_dimensions = [('name', 'Model'), ('error_distribution_limits', 'Error Range'), ('train_val_test', 'Dataset Split')]
 key_dimension_display_strs = [vt[1] for vt in key_dimensions]
-value_dimensions = [('accuracy_range_value', 'Cumulative Fraction of Examples'), ('avg_error', 'Average Error')]#, ('train_val_test', 'Dataset Split')]
+value_dimensions = [('accuracy_range_value', 'Error Distribution'), ('avg_error', 'Average Error')]#, ('train_val_test', 'Dataset Split')]
 value_dimension_display_strs = [vt[1] for vt in value_dimensions]
 distribution_table = hv.Table(rdf, key_dimensions, value_dimensions)
 print('1.0 dist table created')
-distribution_table_bars = distribution_table.to.bars(key_dimension_display_strs, value_dimension_display_strs, ['train_val_test'])
-distribution_table_bars = distribution_table_bars.options(stack_index=1, width=1920, height=1080, xrotation=90, tools=['hover'], group_index=2, cmap='RdYlGn_r')
+distribution_table_bars = distribution_table.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
+height = 240
+distribution_table_bars = distribution_table_bars.options(stack_index=1, width=1280, height=height, xrotation=90, tools=['hover'], group_index='train_val_test', cmap='RdYlGn_r', show_grid=True)
+# plot train, val, test separately, the + sign sticks the plots together
+distribution_table_bars = (
+    distribution_table_bars.select(train_val_test='train').relabel(group='Train').options(xaxis=None) +
+    distribution_table_bars.select(train_val_test='val', xaxis=None).relabel(group='Val').options(xaxis=None) +
+    distribution_table_bars.select(train_val_test='test').relabel(group='Test').options(height=height + 160))
+distribution_table_bars = distribution_table_bars.cols(1)
+# distribution_table_bars = distribution_table_bars.select(train_val_test='train')
+# distribution_table_bars = distribution_table_bars.grid('train_val_test')
 # distribution_table_bars = distribution_table_bars.overlay('train_val_test')
 print('2.0 dist table bars')
-distribution_table_plot = renderer.get_plot(distribution_table_bars)
+# distribution_table_plot = renderer.get_plot(distribution_table_bars)
 
 print('3.0 dist table created')
-key_dimensions = [('name', 'Model')]
+key_dimensions = [('name', 'Model'), ('train_val_test', 'Dataset Split')]
 key_dimension_display_strs = [vt[1] for vt in key_dimensions]
-value_dimensions = [('avg_error', 'Average Error'), ('train_val_test', 'Dataset Split')]
+value_dimensions = [('avg_error', 'Average Error')]
 value_dimension_display_strs = [vt[1] for vt in value_dimensions]
 avg_table_bars = hv.Table(rdf, key_dimensions, value_dimensions)
 print('4.0 avg table created')
-avg_table_bars = avg_table_bars.to.bars(key_dimension_display_strs, value_dimension_display_strs, ['train_val_test'])
-avg_table_bars = avg_table_bars.options(width=1920, height=640, xrotation=90, tools=['hover'])
+avg_table_bars = avg_table_bars.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
+avg_table_bars = avg_table_bars.options(width=1280, height=160, xrotation=90, tools=['hover'], group_index='train_val_test', xaxis=None)
 print('4.0 avg table bars')
-avg_table_plot = renderer.get_plot(avg_table_bars)
-print('5.0 table plot')
-plot_list = [[distribution_table_plot.state], [avg_table_plot.state]]
+# avg_table_plot = renderer.get_plot(avg_table_bars)
+# print('5.0 table plot')
+# plot_list = [[avg_table_plot.state], [distribution_table_plot.state]]
+table_bars = avg_table_bars + distribution_table_bars
+table_bars = table_bars.cols(1)
+table_plot = renderer.get_plot(table_bars)
+plot_list = [[table_plot.state]]
 print('6.0 plot list')
 # layout_child = layout(plot_list, sizing_mode='fixed')
 layout_child = layout(plot_list)

From e30b50794557284d28eced35b5355a2b7abc0eab Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Tue, 16 Oct 2018 17:21:39 -0400
Subject: [PATCH 26/36] hyperopt_plot.py better variable parameterization

---
 costar_hyper/hyperopt_plot.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/costar_hyper/hyperopt_plot.py b/costar_hyper/hyperopt_plot.py
index 8835263f1..f2963d634 100644
--- a/costar_hyper/hyperopt_plot.py
+++ b/costar_hyper/hyperopt_plot.py
@@ -345,7 +345,10 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 print('1.0 dist table created')
 distribution_table_bars = distribution_table.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
 height = 240
-distribution_table_bars = distribution_table_bars.options(stack_index=1, width=1280, height=height, xrotation=90, tools=['hover'], group_index='train_val_test', cmap='RdYlGn_r', show_grid=True)
+width = 1280
+# uncomment below if you want to plot tons of models
+# width = 12800
+distribution_table_bars = distribution_table_bars.options(stack_index=1, width=width, height=height, xrotation=90, tools=['hover'], group_index='train_val_test', cmap='RdYlGn_r', show_grid=True)
 # plot train, val, test separately, the + sign sticks the plots together
 distribution_table_bars = (
     distribution_table_bars.select(train_val_test='train').relabel(group='Train').options(xaxis=None) +
@@ -366,7 +369,7 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 avg_table_bars = hv.Table(rdf, key_dimensions, value_dimensions)
 print('4.0 avg table created')
 avg_table_bars = avg_table_bars.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
-avg_table_bars = avg_table_bars.options(width=1280, height=160, xrotation=90, tools=['hover'], group_index='train_val_test', xaxis=None)
+avg_table_bars = avg_table_bars.options(width=width, height=160, xrotation=90, tools=['hover'], group_index='train_val_test', xaxis=None)
 print('4.0 avg table bars')
 # avg_table_plot = renderer.get_plot(avg_table_bars)
 # print('5.0 table plot')

From 4f158fb97b4429e5fb26d6824312174117535cdd Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 17:52:40 -0400
Subject: [PATCH 27/36] Finish refactoring the split script

---
 .../costar_block_stacking_split_dataset.py    | 178 ++++++++++++++----
 1 file changed, 137 insertions(+), 41 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index f578b7715..a077d9d5c 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -1,24 +1,17 @@
 '''
-Splits dataset into train, validation, and test sets.
-Inherits existing validation and test sets. New files are added into training set.
+This script will walk into each folder in path, read and count h5f files with nonzero
+image frames, read pre-existing train/val/test txt files, and split all the h5f files
+in this directory into success_only, task_failure_only, error_failure_only, and
+task_and_error_failure txt files.
 
-To split the success_only subset or to add new files ot the success_only subset, call:
+Files are assumed to store in different folders in:
+    ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
+If the files are stored in a different path, use --path to designate the path.
 
-python costar_block_stacking_split_dataset.py --path /path/to/dataset/folder\
-    --success_only (--plush) (--train train/txt/filename)                   \
-    (--val val/txt/filename) (-test test/txt/filename/)                     \
-    --output_name [filename prefix for the output train/val/test filenames]
+To split the success_only subset or to add new files ot the success_only subset, use
+--success_only flag.
 
-To split all dataset, i.e. split error files and failure files into train/val/test sets,
-call the following command after success_only subset is splitted:
-
-python costar_block_stacking_split_dataset.py --path /path/to/dataset/folder     \
-    --split_all (--plush) --train success_only/train/txt/filename             \
-    --val [success_only val txt filename] --test [success_only test txt filename]\
-    --output_name [filename prefix for the output train/val/test filenames]
-
-This will output task_failure_only, error_failure_only, and all_failure_only
-train/val/test filenames as 9 separate txt files.
+Use --help to see all possible uses for this function.
 
 Author: Chia-Hung "Rexxar" Lin (rexxarchl)
 Apache License 2.0 https://www.apache.org/licenses/LICENSE-2.0
@@ -27,6 +20,9 @@
 import os
 import random
 import h5py  # Needs h5py to open the files and check frame count
+
+# Progress bars using https://github.com/tqdm/tqdm
+# Import tqdm without enforcing it as a dependency
 try:
     from tqdm import tqdm
 except ImportError:
@@ -157,7 +153,7 @@ def output_file(dataset_path, dataset_name, dir_path, dir_name,
     return output_path
 
 
-def output_csv(path, subsets):
+def output_csv(path, subsets, write):
     '''Output split summary csv file
 
     :param path: The path to store the csv file
@@ -167,6 +163,7 @@ def output_csv(path, subsets):
                     task_and_error_failure, task_failure_only,
                     error_failure_only]
                     Each sublist contains [train, val, test] filenames.
+    :param write: The flag to actually write the output files.
     :return csv_path: The path to the output csv file.
     '''
     # TODO(rexxarchl): Implement csv output
@@ -188,31 +185,124 @@ def output_csv(path, subsets):
             error_train_len, error_val_len, error_test_len)
 
     dataset_splits_csv_filename = 'costar_block_stacking_dataset_split_summary.csv'
-    print(dataset_splits_csv_filename + '\n' + dataset_splits_csv)
+    print('\n' + dataset_splits_csv_filename + '\n' + dataset_splits_csv)
 
     csv_path = os.path.join(path, dataset_splits_csv_filename)
-    with open(csv_path, 'w+') as file_object:
-        file_object.write(dataset_splits_csv)
+    if write:
+        with open(csv_path, 'w+') as file_object:
+            file_object.write(dataset_splits_csv)
+        print('CSV file saved as %s' % csv_path)
+    else:
+        print('Dry run. Use --write to actually output the CSV file.')
 
-    print('CSV file saved as %s' % csv_path)
     return csv_path
 
 
-def output_combined_files(path, output_files_dict, category_names):
+def output_combined_files(path, dataset_name, output_files_dict, category_names, write):
     '''Output combined txt files and overall summary csv file.
-    :param path: The path for thefiles to save in.
+    The format for output_files_dict looks like this:
+    output_files_dict = {
+        dir_name: {
+            category_name: [[train txt file], [val txt file], [test txt file]]
+        }
+    }
+    The program will first convert output_files_dict into this format
+    categorized_train_val_test_filenames = {
+        category_name: [[train txt file paths in all directories],
+                        [val txt file paths in all directories],
+                        [test txt file paths in all directories]]
+    }
+    Then merge the train/val/test txt files for each category in all folders.
+
+    :param path: The path for the files to save in.
+    :param dataset_name: The name of the dataset to write in the output filenames.
     :param output_files_dict: A dictionary of output files in each directory.
-                              Key is the name of the directory, and item is the txt file
-                              paths that we just outputted.
+                              Key is the name of the directory, and item is a dictionary
+                              of categories that contains the list of the
+                              train/val/test txt file paths that we just outputted.
+    :param write: The flag to actually write the output files.
     '''
-    # TODO(rexxarchl): implement this function
-    print(output_files_dict)
-
+    # Split the output names into categories so that, for example, success_only
+    # files go together. Further divide the files into lists that contain
+    # train/test/val txt filenames.
+    categorized_train_val_test_filenames = {category_name: [[], [], []]
+                                            for category_name in category_names}
+    for dir_name, category_dict in output_files_dict.items():
+        for category_name, paths in category_dict.items():
+            for i in range(len(paths)):
+                categorized_train_val_test_filenames[category_name][i].append(paths[i])
+
+    # Merge the train/val/test txt files for each category
+    subset_names = ['train', 'val', 'test']
+    summary_dict = {category_name: [] for category_name in category_names}
+    for (category_name,
+         train_val_test_file_paths) in categorized_train_val_test_filenames.items():
+        for i in range(len(train_val_test_file_paths)):
+            subset_name = subset_names[i]
+            output_filename = "{0}_combined_{1}_{2}_files.txt".format(
+                            dataset_name, category_name, subset_name)
+            output_file_path = os.path.join(path, output_filename)
+
+            # Write contents of all the files into the combined file
+            print('>Process combined file for {} {} files'.format(
+                    category_name, subset_name))
+            if write:
+                with open(output_file_path, 'w') as out_file:
+                    for txt_file_path in train_val_test_file_paths[i]:
+                        print('>>Opening txt file: {}'.format(
+                                extract_filename_from_url(txt_file_path)))
+                        with open(txt_file_path, 'r') as in_file:
+                            out_file.write(in_file.read())
+                print('>>Combined file saved as %s' % output_file_path)
+            else:
+                print('>>Dry run. Use --write to actually output the combined files')
+                for txt_file_path in train_val_test_file_paths[i]:
+                    print('>>>Reference txt file: {}'.format(
+                           extract_filename_from_url(txt_file_path)))
+
+            # Count the number of lines, i.e. files, in each txt file for use later in
+            # the summary section
+            size = 0
+            for txt_file_path in train_val_test_file_paths[i]:
+                try:
+                    with open(txt_file_path, 'r') as f:
+                        size += sum(1 for _ in f)
+                except FileNotFoundError:
+                    print('When counting for summary, file {} is not found. The '
+                          'summary below may be inaccurate.'.format(
+                              extract_filename_from_url(txt_file_path)))
+            summary_dict[category_name].append(size)
+
+    # Get the numbers for the summary
+    success_train_len, success_val_len, success_test_len = summary_dict['success_only']
+    (failure_train_len, failure_val_len,
+        failure_test_len) = summary_dict['task_failure_only']
+    error_train_len, error_val_len, error_test_len = summary_dict['error_failure_only']
+
+    # Output combined CSV file
+    dataset_splits_csv = 'subset, train_count, val_count, test_count\n'
+    dataset_splits_csv += "success_only, {0}, {1}, {2}\n".format(
+                            success_train_len, success_val_len, success_test_len)
+    dataset_splits_csv += "task_and_error_failure, {0}, {1}, {2}\n".format(
+            failure_train_len + error_train_len,
+            failure_val_len + error_val_len,
+            failure_test_len + error_test_len)
+    dataset_splits_csv += "task_failure_only, {0}, {1}, {2}\n".format(
+            failure_train_len, failure_val_len, failure_test_len)
+    dataset_splits_csv += "error_failure_only, {0}, {1}, {2}\n".format(
+            error_train_len, error_val_len, error_test_len)
 
-    # train_val_test_filenames = [[]]
-    # for k, v in output_files_dict.items():
+    dataset_splits_csv_filename = dataset_name + '_combined_summary.csv'
+    print('\n' + dataset_splits_csv_filename + '\n' + dataset_splits_csv)
 
-    pass
+    csv_path = os.path.join(path, dataset_splits_csv_filename)
+    if write:
+        with open(csv_path, 'w') as file_object:
+            file_object.write(dataset_splits_csv)
+        print('>CSV file saved as %s' % csv_path)
+    else:
+        print('>>Dry run. The CSV file will be saved as %s' % csv_path)
+        print('>>Use --write to actually output the CSV file.')
 
 
 def split_dataset(filenames, train_set, val_set, test_set, val_len=None, test_len=None):
@@ -425,7 +515,7 @@ def split_all(
         dir_path, dataset_name, dir_name, 'success_only', existing_file_prefix)
 
     # Extract the filenames into subsets
-    train_val_test_filenames = [l if l is not None else [] 
+    train_val_test_filenames = [l if l is not None else []
                                 for l in train_val_test_filenames]
     success_train_set, success_val_set, success_test_set = train_val_test_filenames
     success_train_len, success_val_len, success_test_len = list(
@@ -636,7 +726,7 @@ def main(args, root='root'):
         if len(category_names) != len(subsets):
             raise Exception("Length of categories does not match the length of lists "
                             "returned by split_all. Did you add more categories?")
-        dir_output_file_dict = dict()
+        dir_output_file_dict = {category_name: [] for category_name in category_names}
         subset_names = ['train', 'val', 'test']
         for i in range(len(subsets)):
             category_name = category_names[i]
@@ -647,19 +737,25 @@ def main(args, root='root'):
                 subset_filenames = category_subsets[j]
 
                 # Output the files and store the outputted file paths in the dictionary
-                dir_output_file_dict[category_name] = output_file(
+                dir_output_file_dict[category_name].append(
+                    output_file(
                         args['dataset_path'], args['dataset_name'],
                         dir_path, dir_name, category_name,
-                        subset_name, subset_filenames, args['write'])
+                        subset_name, subset_filenames, args['write']))
         # Store the outputted file paths in this directory
         output_files_dict[dir_name] = dir_output_file_dict
 
-        if args['write'] and not args['success_only']:
+        if not args['success_only']:
+            print('\nWriting csv file in directory %s' % dir_name)
             # Output csv file
-            output_csv(dir_path, subsets)
-
-    if args['write'] and not args['success_only']:
-        output_combined_files(path, output_files_dict, category_names)
+            output_csv(dir_path, subsets, args['write'])
+
+    if not args['success_only']:
+        print('---------------------')
+        print('Combining files')
+        # Write the combined txt files and summary csv file.
+        output_combined_files(path, args['dataset_name'], output_files_dict,
+                              category_names, args['write'])
 
 
 if __name__ == '__main__':

From 7466db6ff89f8a734ad180c1a39df37294b21cf1 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 17:55:33 -0400
Subject: [PATCH 28/36] Modify help description

---
 .../costar_block_stacking_split_dataset.py    | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index a077d9d5c..298bd2acc 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -36,11 +36,23 @@ def tqdm(*args, **kwargs):
 
 def _parse_args():
     parser = argparse.ArgumentParser(
-        description='Splits dataset into train, validation and test sets. '
-                    'Inherits existing validation and test sets. '
-                    'New files are added into training set. '
-                    'If no pre-existing sets of files are indicated, randomize and split '
-                    'the files in the folder 8:1:1 for train/val/test.')
+        description='''
+                    This script will walk into each folder in path, read and count h5f
+                    files with nonzero image frames, read pre-existing train/val/test
+                    txt files, and split all the h5f files in this directory into
+                    success_only, task_failure_only, error_failure_only, and
+                    task_and_error_failure txt files.
+
+                    Files are assumed to store in different folders in:
+                        ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
+                    If the files are stored in a different path, use --path to designate
+                    the path.
+
+                    To split the success_only subset or to add new files ot the
+                    success_only subset, use --success_only flag.
+
+                    Use --help to see all possible uses for this function.
+                    ''')
     parser.add_argument("--path", type=str,
                         default=os.path.join(
                             os.path.expanduser("~"),

From 8b94c4d648b16e6ca2806e90aa67e92d5100a59a Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 18:58:18 -0400
Subject: [PATCH 29/36] Update IA scripts for default values and metadata

---
 .../costar_block_stacking_ia_download.py      | 37 ++++++---
 .../costar_block_stacking_ia_upload.py        | 80 ++++++++++++++-----
 2 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_ia_download.py b/costar_hyper/costar_block_stacking_ia_download.py
index 91710b721..98dcaf154 100644
--- a/costar_hyper/costar_block_stacking_ia_download.py
+++ b/costar_hyper/costar_block_stacking_ia_download.py
@@ -2,25 +2,36 @@
 import argparse
 import os
 
+
 def _parse_args():
-    parser = argparse.ArgumentParser(description=\
-        'Downloads the dataset to `path` from the Internet Archive.')
-    parser.add_argument("--path", type=str, default=os.path.join(os.path.expanduser("~"), '.keras/datasets/costar_block_stacking_dataset_v0.4/'),
-                        help='The path to download the dataset to. Default is "~/.keras/datasets/costar_block_stacking_dataset_v0.4/"')
-    parser.add_argument("--execute", action='store_true', default=False, help='Use this flag to actually download the files from the internet archive')
+    parser = argparse.ArgumentParser(
+        description='Download the dataset from the Internet Archive.')
+    parser.add_argument(
+        "--path", type=str,
+        default=os.path.join(os.path.expanduser("~"),
+                             '.keras/datasets/costar_block_stacking_dataset_v0.4/'),
+        help='The path to download the dataset to. '
+             'Default is "~/.keras/datasets/costar_block_stacking_dataset_v0.4/"')
+    parser.add_argument(
+        "--execute", action='store_true', default=False,
+        help='Use this flag to actually download the files from the internet archive')
 
 
-def main(args, root = 'root'):
+def main(args, root='root'):
     item = internetarchive.Item('costar_block_stacking_dataset')
 
     r = item.download(
-            destdir = args['path'], # The directory to download files to
-            ignore_existing = True, # Skip files that already exist locally
-            checksum = True, # Skip files based on checksum
-            verbose = True, # Print progress to stdout
-            retries = 100, # Thenumber of times to retry on failed requests
-            # dryrun = args['execute']) # Set to true to print headers to stdout, and exit without uploading
-            dryrun = True)
+            destdir=args['path'],  # The directory to download files to
+            ignore_existing=True,  # Skip files that already exist locally
+            checksum=True,  # Skip files based on checksum
+            verbose=True,  # Print progress to stdout
+            retries=100,  # The number of times to retry on failed requests
+            # Set to true to print headers to stdout, and exit without uploading
+            # dryrun = args['execute'])
+            dryrun=True)
+
+    print(r)
+
 
 if __name__ == '__main__':
     args = _parse_args()
diff --git a/costar_hyper/costar_block_stacking_ia_upload.py b/costar_hyper/costar_block_stacking_ia_upload.py
index 6a08e6cac..96ce156f5 100644
--- a/costar_hyper/costar_block_stacking_ia_upload.py
+++ b/costar_hyper/costar_block_stacking_ia_upload.py
@@ -2,36 +2,76 @@
 import argparse
 import os
 
+
 def _parse_args():
-    parser = argparse.ArgumentParser(description=\
-        'Uploads the folder specified in `path` argument to the Internet Archive.')
-    parser.add_argument("--path", type=str, default=os.path.join(os.path.dirname(os.path.abspath(__file__)), ''), help='Path to dataset folder containing many files. Default is current path.')
-    parser.add_argument("--execute", action='store_true', default=False, help='Use this flag to actually upload the files to the internet archive')
-    
+    parser = argparse.ArgumentParser(
+        description='Upload the dataset to the Internet Archive.')
+    parser.add_argument(
+        "--path", type=str,
+        default=os.path.join(os.path.expanduser("~"),
+                             '.keras/datasets/costar_block_stacking_dataset_v0.4/'),
+        help='Path to dataset folder containing many files. Default is current path.')
+    parser.add_argument(
+        "--execute", action='store_true', default=False,
+        help='Use this flag to actually upload the files to the internet archive')
+
     return vars(parser.parse_args())
 
 
-def main(args, root = 'root'):
+def main(args, root='root'):
     item = internetarchive.get_item('costar_block_stacking_dataset')
-    
-    md = dict(collection='test_collection', title='The CoSTAR Block Stacking Dataset', mediatype='data', noindex='True')
-    
+
+    md = dict(
+        collection='test_collection',  # TODO(rexxarchl): change to Dataset Collection
+        title='The CoSTAR Block Stacking Dataset',
+        version='v0.4',  # Custom metadata field for the current version
+        contributor='Andrew Hundt, Varun Jain, Chris Paxton, Chunting Jiao, '
+                    'Chia-Hung Lin, and Gregory D. Hager',
+        creator='Andrew Hundt: athundt[at]gmail[dot]com',
+        credits='''
+                Andrew Hundt, Varun Jain, Chris Paxton, Chunting Jiao, Chia-Hung Lin, 
+                and Gregory D. Hager<br>
+                The Johns Hopkins University<br>
+                <a href="https://cirl.lcsr.jhu.edu/">Computational Interaction and 
+                Robotics Laboratory</a><br>
+                This material is based upon work supported by the National Science 
+                Foundation under NRI Grant Award No. 1637949.
+                ''',
+        date='2018-10-17',
+        description='''
+            Stack blocks like a champion! The CoSTAR Block Stacking Dataset includes a 
+            real robot trying to stack colored children's blocks more than 10,000 times 
+            in a scene with challenging lighting and a movable bin obstacle which must 
+            be avoided. This dataset is especially well suited to the benchmarking and 
+            comparison of deep learning algorithms.<br>
+            Visit the <a href='https://sites.google.com/site/costardataset'>website</a> 
+            for more info.<br>
+            <b>Cite: </b><a href='https://sites.google.com/view/hypertree-renas'>Training 
+            Frankenstein's Creature to Stack: HyperTree Architecture Search</a>
+            ''',
+        license='https://creativecommons.org/licenses/by/4.0/',
+        mediatype='data',  # data is the default media type
+        noindex='True')  # Set to true for the item to not be listed
+
     print(args)
 
     r = item.upload(
-            args['path'], 
-            metadata = md,
-            verify = True, # Verify local MD5 checksum matches remote MD5 checksum
-            checksum = True, # Skip files based on checksum
-            verbose = True, # Print progress to stdout
-            retries = 100, # Number of times to retry the given request
-            retries_sleep = 5, # Amount of time to sleep between `retries`
-            queue_derive = False, # Prevent an item from being derived to another format after upload
-            # debug = args['execute']) # Set to true to print headers to stdout, and exit without uploading
-            debug = True)
-            
+            args['path'],
+            metadata=md,
+            verify=True,  # Verify local MD5 checksum matches remote MD5 checksum
+            checksum=True,  # Skip files based on checksum
+            verbose=True,  # Print progress to stdout
+            retries=100,  # Number of times to retry the given request
+            retries_sleep=5,  # Amount of time to sleep between `retries`
+            # Prevent an item from being derived to another format after upload
+            queue_derive=False,
+            # Set to true to print headers to stdout, and exit without uploading
+            # debug = args['execute'])
+            debug=True)
+
     print(r)
 
+
 if __name__ == '__main__':
     args = _parse_args()
     main(args)

From ec377c4ae924cac9f2f39bc2ab925865a23ba7e8 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 19:17:50 -0400
Subject: [PATCH 30/36] Minor update for better readability

---
 .../costar_block_stacking_split_dataset.py    | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_split_dataset.py b/costar_hyper/costar_block_stacking_split_dataset.py
index 298bd2acc..4fcd56f8b 100644
--- a/costar_hyper/costar_block_stacking_split_dataset.py
+++ b/costar_hyper/costar_block_stacking_split_dataset.py
@@ -4,9 +4,11 @@
 in this directory into success_only, task_failure_only, error_failure_only, and
 task_and_error_failure txt files.
 
-Files are assumed to store in different folders in:
-    ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
-If the files are stored in a different path, use --path to designate the path.
+--path defaults to: ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
+We expect that folder will contain directories containing h5f files. This is done to
+split the dataset across various collection runs.
+Details can be found in the "folder structure" section of
+https://sites.google.com/site/costardataset/usage
 
 To split the success_only subset or to add new files ot the success_only subset, use
 --success_only flag.
@@ -43,10 +45,11 @@ def _parse_args():
                     success_only, task_failure_only, error_failure_only, and
                     task_and_error_failure txt files.
 
-                    Files are assumed to store in different folders in:
-                        ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
-                    If the files are stored in a different path, use --path to designate
-                    the path.
+                    Path defaults to ~/.keras/datasets/costar_block_stacking_dataset_v0.4/
+                    We expect that folder will contain directories containing h5f files.
+                    This is done to split the dataset across various collection runs.
+                    Details can be found in the "folder structure" section of
+                    https://sites.google.com/site/costardataset/usage
 
                     To split the success_only subset or to add new files ot the
                     success_only subset, use --success_only flag.
@@ -178,7 +181,6 @@ def output_csv(path, subsets, write):
     :param write: The flag to actually write the output files.
     :return csv_path: The path to the output csv file.
     '''
-    # TODO(rexxarchl): Implement csv output
     success, _, task_fail, err_fail = subsets
     success_train_len, success_val_len, success_test_len = map(len, success)
     failure_train_len, failure_val_len, failure_test_len = map(len, task_fail)
@@ -280,9 +282,13 @@ def output_combined_files(path, dataset_name, output_files_dict, category_names,
                     with open(txt_file_path, 'r') as f:
                         size += sum(1 for _ in f)
                 except FileNotFoundError:
-                    print('When counting for summary, file {} is not found. The '
-                          'summary below may be inaccurate.'.format(
-                              extract_filename_from_url(txt_file_path)))
+                    print('''
+                        A file was not found at the expected path when validating and
+                        summarizing the dataset. This problem is most likely caused by
+                        not running with the --write flag. Re-run the program with the
+                        --write flag. The summary below may be inaccurate.\n
+                        The problematic file is {}
+                        '''.format(extract_filename_from_url(txt_file_path)))
             summary_dict[category_name].append(size)
 
     # Get the numbers for the summary
@@ -633,7 +639,7 @@ def count_files_containing_images(path, filenames):
                 try:
                     total_frames = len(data['image'])
                 except KeyError as e:
-                    progress_bar.write('Skipping %s for KeyError' % filename)
+                    progress_bar.write('KeyError: Skipping %s' % filename)
                     continue
 
                 if total_frames == 0:  # Skip files with 0 frame
@@ -652,7 +658,7 @@ def count_files_containing_images(path, filenames):
                         'Somthing is wrong! The file does not contain `error`,'
                         '`failure`, or `success` in the filename: %s' % filename)
         except IOError as ex:
-            progress_bar.write('Skipping %s for IOError' % filename)
+            progress_bar.write('IOError: Skipping %s' % filename)
 
     print("Counted {:d} success files, {:d} failure files, and {:d} error files.".format(
             len(success_filenames), len(failure_filenames), len(error_filenames)))

From 64e59558bd10e0f2e0567f1fa94ab4d7beff6bc4 Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Tue, 16 Oct 2018 19:21:12 -0400
Subject: [PATCH 31/36] Add expand user for path

---
 costar_hyper/costar_block_stacking_ia_download.py | 4 +++-
 costar_hyper/costar_block_stacking_ia_upload.py   | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_ia_download.py b/costar_hyper/costar_block_stacking_ia_download.py
index 98dcaf154..3171243f9 100644
--- a/costar_hyper/costar_block_stacking_ia_download.py
+++ b/costar_hyper/costar_block_stacking_ia_download.py
@@ -20,8 +20,10 @@ def _parse_args():
 def main(args, root='root'):
     item = internetarchive.Item('costar_block_stacking_dataset')
 
+    path = os.path.expanduser(args['path'])
+
     r = item.download(
-            destdir=args['path'],  # The directory to download files to
+            destdir=path,  # The directory to download files to
             ignore_existing=True,  # Skip files that already exist locally
             checksum=True,  # Skip files based on checksum
             verbose=True,  # Print progress to stdout
diff --git a/costar_hyper/costar_block_stacking_ia_upload.py b/costar_hyper/costar_block_stacking_ia_upload.py
index 96ce156f5..c6fd38795 100644
--- a/costar_hyper/costar_block_stacking_ia_upload.py
+++ b/costar_hyper/costar_block_stacking_ia_upload.py
@@ -54,9 +54,10 @@ def main(args, root='root'):
         noindex='True')  # Set to true for the item to not be listed
 
     print(args)
+    path = os.path.expanduser(args['path'])
 
     r = item.upload(
-            args['path'],
+            path,
             metadata=md,
             verify=True,  # Verify local MD5 checksum matches remote MD5 checksum
             checksum=True,  # Skip files based on checksum

From 6496ccda6d69a6d9c7f4ff9b20bbd72b91bf2764 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Wed, 17 Oct 2018 15:04:10 -0400
Subject: [PATCH 32/36] Change email address

---
 costar_hyper/costar_block_stacking_ia_upload.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/costar_hyper/costar_block_stacking_ia_upload.py b/costar_hyper/costar_block_stacking_ia_upload.py
index c6fd38795..78040a86d 100644
--- a/costar_hyper/costar_block_stacking_ia_upload.py
+++ b/costar_hyper/costar_block_stacking_ia_upload.py
@@ -27,7 +27,7 @@ def main(args, root='root'):
         version='v0.4',  # Custom metadata field for the current version
         contributor='Andrew Hundt, Varun Jain, Chris Paxton, Chunting Jiao, '
                     'Chia-Hung Lin, and Gregory D. Hager',
-        creator='Andrew Hundt: athundt[at]gmail[dot]com',
+        creator='Andrew Hundt <ATHundt@gmail.com>',
         credits='''
                 Andrew Hundt, Varun Jain, Chris Paxton, Chunting Jiao, Chia-Hung Lin, 
                 and Gregory D. Hager<br>

From a1fa1728faa32af8f16f84f30c9037934d25502d Mon Sep 17 00:00:00 2001
From: "Chia-Hung \"Rexxar\" Lin" <rexxar.lin@gmail.com>
Date: Wed, 17 Oct 2018 15:10:26 -0400
Subject: [PATCH 33/36] Small changes to the code

---
 costar_hyper/costar_block_stacking_ia_download.py | 2 +-
 costar_hyper/costar_block_stacking_ia_upload.py   | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/costar_hyper/costar_block_stacking_ia_download.py b/costar_hyper/costar_block_stacking_ia_download.py
index 3171243f9..27ab6cae1 100644
--- a/costar_hyper/costar_block_stacking_ia_download.py
+++ b/costar_hyper/costar_block_stacking_ia_download.py
@@ -18,7 +18,7 @@ def _parse_args():
 
 
 def main(args, root='root'):
-    item = internetarchive.Item('costar_block_stacking_dataset')
+    item = internetarchive.get_item('costar_block_stacking_dataset')
 
     path = os.path.expanduser(args['path'])
 
diff --git a/costar_hyper/costar_block_stacking_ia_upload.py b/costar_hyper/costar_block_stacking_ia_upload.py
index 78040a86d..bd63ecced 100644
--- a/costar_hyper/costar_block_stacking_ia_upload.py
+++ b/costar_hyper/costar_block_stacking_ia_upload.py
@@ -22,7 +22,9 @@ def main(args, root='root'):
     item = internetarchive.get_item('costar_block_stacking_dataset')
 
     md = dict(
-        collection='test_collection',  # TODO(rexxarchl): change to Dataset Collection
+        # TODO(rexxarchl): change to Dataset Collection after proper testing
+        # collection='datasets',
+        collection='test_collection',
         title='The CoSTAR Block Stacking Dataset',
         version='v0.4',  # Custom metadata field for the current version
         contributor='Andrew Hundt, Varun Jain, Chris Paxton, Chunting Jiao, '

From 9de577cc2373fe79bce7523172fb6c6613ee53be Mon Sep 17 00:00:00 2001
From: Andrew Hundt <athundt@gmail.com>
Date: Wed, 17 Oct 2018 18:59:10 -0400
Subject: [PATCH 34/36] cornell_grasp_train.py fix user directory bug

---
 costar_hyper/cornell_grasp_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/costar_hyper/cornell_grasp_train.py b/costar_hyper/cornell_grasp_train.py
index defb6c790..9c0fd52d5 100755
--- a/costar_hyper/cornell_grasp_train.py
+++ b/costar_hyper/cornell_grasp_train.py
@@ -1310,7 +1310,7 @@ def load_dataset(
                 FLAGS.data_dir = os.path.expanduser('~/.keras/datasets/costar_block_stacking_dataset_v0.4/')
             # TODO(ahundt) make the data dir user configurable again for costar_block stacking
             # FLAGS.data_dir = os.path.expanduser('~/.keras/datasets/costar_block_stacking_dataset_v0.4/')
-            data_dir = FLAGS.data_dir
+            data_dir = os.path.expanduser(FLAGS.data_dir)
             costar_filename_base = FLAGS.costar_filename_base
 
             test_data_filename = os.path.join(data_dir, costar_filename_base + '_test_files.txt')

From 05f419ff5beff99d90a94b278f5a826a640573da Mon Sep 17 00:00:00 2001
From: Andrew Hundt <athundt@gmail.com>
Date: Wed, 17 Oct 2018 19:00:11 -0400
Subject: [PATCH 35/36] cornell_hyperopt.py do a new search on full grasp
 regression

---
 costar_hyper/cornell_hyperopt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/costar_hyper/cornell_hyperopt.py b/costar_hyper/cornell_hyperopt.py
index c96020637..9ca0c80e4 100644
--- a/costar_hyper/cornell_hyperopt.py
+++ b/costar_hyper/cornell_hyperopt.py
@@ -74,13 +74,13 @@ def main(_):
     # FLAGS.problem_type = 'classification'
     # FLAGS.dataset_name = 'cornell_grasping'
     FLAGS.dataset_name = 'costar_block_stacking'
-    # FLAGS.problem_type = 'semantic_grasp_regression'
+    FLAGS.problem_type = 'semantic_grasp_regression'
 
     ## CONFIGURE: Choose from one of the three problem types for ranking. 
     ## ---------------------------------------------------- 
     # When ranking translation use the following settings:
-    FLAGS.log_dir = 'hyperopt_logs_costar_translation_regression'
-    FLAGS.problem_type = 'semantic_translation_regression'
+    FLAGS.log_dir = '2018_10_hyperopt_logs_costar_grasp_regression'
+    # FLAGS.problem_type = 'semantic_translation_regression'
     # ----------------------------------------------------
     # When ranking rotation use the following settings:
     # FLAGS.log_dir = 'hyperopt_logs_costar_block_stacking_train_ranked_regression'

From 14b8a3539a640d1bc5d8f5873052bc34fd1daf50 Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Thu, 18 Oct 2018 00:25:12 -0400
Subject: [PATCH 36/36] hyperopt_plot.py configurable dimensions and model
 count

---
 costar_hyper/hyperopt_plot.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/costar_hyper/hyperopt_plot.py b/costar_hyper/hyperopt_plot.py
index f2963d634..1178e3ec4 100644
--- a/costar_hyper/hyperopt_plot.py
+++ b/costar_hyper/hyperopt_plot.py
@@ -128,9 +128,19 @@ def tqdm(*args, **kwargs):
 )
 flags.DEFINE_integer(
     'max_models_to_show',
-    24,
+    384,
     'Maximum number of models to display, 24 by default'
 )
+flags.DEFINE_integer(
+    'width',
+    6144,
+    'Width of figure in pixels, 1280 and 1920 are good options.'
+)
+flags.DEFINE_integer(
+    'height',
+    240,
+    'Height of each subfigure in pixels, 240 is a good option.'
+)
 
 
 FLAGS = flags.FLAGS
@@ -144,6 +154,9 @@ def tqdm(*args, **kwargs):
     csv_file = os.path.join(os.path.expanduser(FLAGS.log_dir), FLAGS.rank_csv)
 else:
     csv_file = os.path.expanduser(FLAGS.rank_csv)
+log_y = False
+width = FLAGS.width
+height = FLAGS.height
 # load the hyperparameter optimization ranking csv file created by hyperopt_rank.py
 dataframe = pandas.read_csv(csv_file, index_col=None, header=0)
 if problem_type == 'semantic_rotation_regression':
@@ -198,6 +211,7 @@ def tqdm(*args, **kwargs):
     ]
     units = 'mm'
     avg_error_suffix = 'cart_error'
+    log_y = True
 elif problem_type == 'semantic_grasp_regression':
     dataframe = dataframe.sort_values('val_grasp_acc', ascending=False)
     sort_by = 'val_grasp_acc'
@@ -344,8 +358,6 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 distribution_table = hv.Table(rdf, key_dimensions, value_dimensions)
 print('1.0 dist table created')
 distribution_table_bars = distribution_table.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
-height = 240
-width = 1280
 # uncomment below if you want to plot tons of models
 # width = 12800
 distribution_table_bars = distribution_table_bars.options(stack_index=1, width=width, height=height, xrotation=90, tools=['hover'], group_index='train_val_test', cmap='RdYlGn_r', show_grid=True)
@@ -365,11 +377,15 @@ def create_data_comparison_table(value_dimension_tuples_mm, units, problem_type)
 key_dimensions = [('name', 'Model'), ('train_val_test', 'Dataset Split')]
 key_dimension_display_strs = [vt[1] for vt in key_dimensions]
 value_dimensions = [('avg_error', 'Average Error')]
+if log_y:
+    value_dimensions = [('avg_error', 'Average Error (Log Scale)')]
 value_dimension_display_strs = [vt[1] for vt in value_dimensions]
 avg_table_bars = hv.Table(rdf, key_dimensions, value_dimensions)
 print('4.0 avg table created')
 avg_table_bars = avg_table_bars.to.bars(key_dimension_display_strs, value_dimension_display_strs, [])
-avg_table_bars = avg_table_bars.options(width=width, height=160, xrotation=90, tools=['hover'], group_index='train_val_test', xaxis=None)
+avg_table_bars = avg_table_bars.options(
+    width=width, height=height-80, xrotation=90, tools=['hover'], group_index='train_val_test',
+    xaxis=None, logy=log_y)
 print('4.0 avg table bars')
 # avg_table_plot = renderer.get_plot(avg_table_bars)
 # print('5.0 table plot')