diff --git a/POMO/TSP/TSPTrainer_Meta.py b/POMO/TSP/TSPTrainer_Meta.py
index 392bcf7..f41152d 100644
--- a/POMO/TSP/TSPTrainer_Meta.py
+++ b/POMO/TSP/TSPTrainer_Meta.py
@@ -95,8 +95,13 @@ def run(self):
            self.result_log.append('train_loss', epoch, train_loss)
            # Val
            dir, no_aug_score_list = "../../data/TSP/", []
-           # for val_path in ["tsp50_uniform.pkl", "tsp100_uniform.pkl", "tsp100_diagonal.pkl", "tsp150_uniform.pkl", "tsp200_uniform.pkl"]:
-           for val_path in ["tsp50_uniform.pkl", "tsp100_uniform.pkl", "tsp150_uniform.pkl", "tsp200_uniform.pkl"]:
+           if self.meta_params["data_type"] == "size":
+               paths = ["tsp50_uniform.pkl", "tsp100_uniform.pkl", "tsp200_uniform.pkl"]
+           elif self.meta_params["data_type"] == "distribution":
+               paths = ["tsp100_uniform.pkl", "tsp100_gaussian.pkl", "tsp100_cluster.pkl", "tsp100_diagonal.pkl", "tsp100_tsplib.pkl"]
+           elif self.meta_params["data_type"] == "size_distribution":
+               pass
+           for val_path in paths:
                if self.meta_params['meta_method'] in ['fomaml', 'reptile']:
                    no_aug_score = self._fast_val(copy.deepcopy(self.meta_model), path=os.path.join(dir, val_path), val_episodes=64, mode="eval")
                else:
@@ -155,14 +160,35 @@ def _train_one_epoch(self, epoch):
        self.meta_optimizer.zero_grad()
        score_AM = AverageMeter()
        loss_AM = AverageMeter()
-       batch_size = self.meta_params['meta_batch_size']
+
+       """
+       Curriculum learning:
+           for size: gradually increase the problem size
+           for distribution: gradually increase adversarial budgets (i.e., \epsilon)
+       """
+       if self.meta_params["data_type"] in ["size", "distribution"]:
+           self.min_n, self.max_n = self.task_set[0][0], self.task_set[-1][0]  # [20, 150] / [0, 130]
+           # start = self.min_n + int(epoch / self.meta_params['epochs'] * (self.max_n - self.min_n))  # linear
+           start = self.min_n + int(1 / 2 * (1 - math.cos(math.pi * min(epoch / self.meta_params['epochs'], 1))) * (self.max_n - self.min_n))  # cosine
+           end = min(start + 10, self.max_n)  # 10 is the size of the sliding window
+           if self.meta_params["curriculum"]: print(">> training task {}".format((start, end)))
+       elif self.meta_params["data_type"] == "size_distribution":
+           pass
        self._alpha_scheduler(epoch)
        fast_weights, val_loss, fomaml_grad = [], 0, []
        # sample a batch of tasks
        for i in range(self.meta_params['B']):
-           task_params = random.sample(self.task_set, 1)[0]
+           if self.meta_params["data_type"] == "size":
+               task_params = random.sample(range(start, end + 1), 1) if self.meta_params['curriculum'] else random.sample(self.task_set, 1)[0]
+               batch_size = self.meta_params['meta_batch_size'] if task_params[0] <= 100 else self.meta_params['meta_batch_size'] // 2
+           elif self.meta_params["data_type"] == "distribution":
+               task_params = random.sample(range(start, end + 1), 1) if self.meta_params['curriculum'] else random.sample(self.task_set, 1)[0]
+               batch_size = self.meta_params['meta_batch_size']
+           elif self.meta_params["data_type"] == "size_distribution":
+               pass
+
            if self.meta_params['meta_method'] in ['fomaml', 'reptile']:
                task_model = copy.deepcopy(self.meta_model)
                optimizer = Optimizer(task_model.parameters(), **self.optimizer_params['optimizer'])
@@ -179,7 +205,6 @@ def _train_one_epoch(self, epoch):
            # inner-loop optimization
            for step in range(self.meta_params['k']):
                data = self._get_data(batch_size, task_params)
-               data = self._generate_x_adv(data, eps=random.randint(10, 100)) if self.trainer_params['adv_train'] else data
                env_params = {'problem_size': data.size(1), 'pomo_size': data.size(1)}
                self.meta_model.train()
                if self.meta_params['meta_method'] in ['reptile', 'fomaml']:
@@ -189,7 +214,7 @@ def _train_one_epoch(self, epoch):
                score_AM.update(avg_score.item(), batch_size)
                loss_AM.update(avg_loss.item(), batch_size)
-           val_data = self._get_val_data(self.meta_params['val_batch_size'], task_params)
+           val_data = self._get_val_data(batch_size, task_params)
            if self.meta_params['meta_method'] == 'maml':
                val_loss = self._fast_val(fast_weight, data=val_data, mode="maml")
                val_loss /= self.meta_params['B']
@@ -304,6 +329,7 @@ def _train_one_batch_maml(self, fast_weight, data, env, optimizer=None):
            #     for ((name, param), grad) in zip(fast_weight.items(), gradients)
            # )
            optimizer.zero_grad()
+           # torch.autograd.grad(loss_mean, fast_weight.values(), create_graph=True)
            loss_mean.backward(retain_graph=True, create_graph=True)
            optimizer.step()
@@ -418,23 +444,28 @@ def _bootstrap(self, fast_weight, data):
    def _get_data(self, batch_size, task_params):
        if self.meta_params['data_type'] == 'distribution':
-           assert len(task_params) == 2
-           data = get_random_problems(batch_size, self.env_params['problem_size'], num_modes=task_params[0], cdist=task_params[-1], distribution='gaussian_mixture')
+           assert len(task_params) == 1
+           data = get_random_problems(batch_size, self.env_params['problem_size'], num_modes=0, cdist=0, distribution='uniform')
+           data = self._generate_x_adv(data, eps=task_params[0])
        elif self.meta_params['data_type'] == 'size':
            assert len(task_params) == 1
            data = get_random_problems(batch_size, task_params[0], num_modes=0, cdist=0, distribution='uniform')
        elif self.meta_params['data_type'] == "size_distribution":
-           assert len(task_params) == 3
-           data = get_random_problems(batch_size, problem_size=task_params[0], num_modes=task_params[1], cdist=task_params[-1], distribution='gaussian_mixture')
+           assert len(task_params) == 2
+           data = get_random_problems(batch_size, task_params[0], num_modes=0, cdist=0, distribution='uniform')
+           data = self._generate_x_adv(data, eps=task_params[1])
        else:
            raise NotImplementedError
        return data

    def _get_val_data(self, batch_size, task_params):
-       val_data = self._get_data(batch_size, task_params)
-       # val_path = "../../data/TSP/tsp150_uniform.pkl"
-       # val_data = torch.Tensor(load_dataset(val_path)[: batch_size])
+       if self.meta_params["data_type"] in ["size", "distribution"]:
+           start1, end1 = min(task_params[0] + 10, self.max_n), min(task_params[0] + 20, self.max_n)
+       elif self.meta_params["data_type"] == "size_distribution":
+           pass
+       val_size = random.sample(range(start1, end1 + 1), 1)[0]
+       val_data = self._get_data(batch_size, (val_size,))
        return val_data
@@ -454,6 +485,7 @@ def minmax(xy_):
            xy_ = (xy_ - xy_.min(dim=1, keepdims=True)[0]) / (xy_.max(dim=1, keepdims=True)[0] - xy_.min(dim=1, keepdims=True)[0])
            return xy_
+       if eps == 0: return data
        # generate x_adv
        print(">> Warning! Generating x_adv!")
        self.meta_model.eval()
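
For reference, the sliding-window schedule added in both trainers can be summarized by a small standalone sketch (the function name and the example numbers are illustrative; the window width of 10 and the cosine form mirror the code above):

import math

def curriculum_window(epoch, epochs, min_n, max_n, window=10):
    # Cosine schedule: the window's left edge moves from min_n to max_n,
    # slowly at the beginning/end of training and fastest in the middle.
    frac = 1 / 2 * (1 - math.cos(math.pi * min(epoch / epochs, 1)))
    start = min_n + int(frac * (max_n - min_n))
    end = min(start + window, max_n)
    return start, end

# With the "size" task range [20, 150] and the 50000 meta-epochs from train.py:
#   curriculum_window(0, 50000, 20, 150)      -> (20, 30)
#   curriculum_window(25000, 50000, 20, 150)  -> (85, 95)
#   curriculum_window(50000, 50000, 20, 150)  -> (150, 150)
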
Generating x_adv!") self.meta_model.eval() diff --git a/POMO/TSP/TSPTrainer_pomo.py b/POMO/TSP/TSPTrainer_pomo.py index 60bb3da..9c3c970 100644 --- a/POMO/TSP/TSPTrainer_pomo.py +++ b/POMO/TSP/TSPTrainer_pomo.py @@ -86,7 +86,13 @@ def run(self): self.result_log.append('train_loss', epoch, train_loss) # Val dir, no_aug_score_list = "../../data/TSP/", [] - for val_path in ["tsp50_uniform.pkl", "tsp100_uniform.pkl", "tsp100_diagonal.pkl", "tsp150_uniform.pkl", "tsp200_uniform.pkl"]: + if self.meta_params["data_type"] == "size": + paths = ["tsp50_uniform.pkl", "tsp100_uniform.pkl", "tsp200_uniform.pkl"] + elif self.meta_params["data_type"] == "distribution": + paths = ["tsp100_uniform.pkl", "tsp100_gaussian.pkl", "tsp100_cluster.pkl", "tsp100_diagonal.pkl", "tsp100_tsplib.pkl"] + elif self.meta_params["data_type"] == "size_distribution": + pass + for val_path in paths: no_aug_score = self._fast_val(self.meta_model, path=os.path.join(dir, val_path), val_episodes=64) no_aug_score_list.append(round(no_aug_score, 4)) self.result_log.append('val_score', epoch, no_aug_score_list[-1]) @@ -141,25 +147,34 @@ def _train_one_epoch(self, epoch): """ score_AM = AverageMeter() loss_AM = AverageMeter() - batch_size = self.meta_params['meta_batch_size'] + + # Curriculum learning + if self.meta_params["data_type"] in ["size", "distribution"]: + self.min_n, self.max_n = self.task_set[0][0], self.task_set[-1][0] # [20, 150] / [0, 130] + # start = self.min_n + int(epoch/self.meta_params['epochs'] * (self.max_n - self.min_n)) # linear + start = self.min_n + int(1 / 2 * (1 - math.cos(math.pi * min(epoch / self.meta_params['epochs'], 1))) * (self.max_n - self.min_n)) # cosine + end = min(start + 10, self.max_n) # 10 is the size of the sliding window + if self.meta_params["curriculum"]: print(">> training task {}".format((start, end))) + elif self.meta_params["data_type"] == "size_distribution": + pass # sample a batch of tasks for i in range(self.meta_params['B']): - task_params = random.sample(self.task_set, 1)[0] - for step in range(self.meta_params['k']): - # task_params = random.sample(self.task_set, 1)[0] + if self.meta_params["data_type"] == "size": + task_params = random.sample(range(start, end + 1), 1) if self.meta_params['curriculum'] else random.sample(self.task_set, 1)[0] + batch_size = self.meta_params['meta_batch_size'] if task_params[0] <= 100 else self.meta_params['meta_batch_size'] // 2 + elif self.meta_params["data_type"] == "distribution": + task_params = random.sample(range(start, end + 1), 1) if self.meta_params['curriculum'] else random.sample(self.task_set, 1)[0] + batch_size = self.meta_params['meta_batch_size'] + elif self.meta_params["data_type"] == "size_distribution": + pass data = self._get_data(batch_size, task_params) - data = self._generate_x_adv(data, eps=random.randint(10, 100)) if self.trainer_params['adv_train'] else data env_params = {'problem_size': data.size(1), 'pomo_size': data.size(1)} avg_score, avg_loss = self._train_one_batch(data, Env(**env_params)) score_AM.update(avg_score.item(), batch_size) loss_AM.update(avg_loss.item(), batch_size) - val_data = self._get_val_data(self.meta_params['val_batch_size'], task_params) - env_params = {'problem_size': val_data.size(1), 'pomo_size': val_data.size(1)} - self._train_one_batch(val_data, Env(**env_params)) - # Log Once, for each epoch self.logger.info('Meta Iteration {:3d}: Score: {:.4f}, Loss: {:.4f}'.format(epoch, score_AM.avg, loss_AM.avg)) @@ -234,25 +249,21 @@ def _fast_val(self, model, data=None, path=None, 
diff --git a/POMO/TSP/TSP_baseline.py b/POMO/TSP/TSP_baseline.py
new file mode 100644
index 0000000..a2ae178
--- /dev/null
+++ b/POMO/TSP/TSP_baseline.py
@@ -0,0 +1,485 @@
+import argparse
+import numpy as np
+import os, re, sys
+import time
+from datetime import timedelta
+from urllib.parse import urlparse
+from scipy.spatial import distance_matrix
+from subprocess import check_call, check_output, CalledProcessError
+import torch
+from torch.utils.data import Dataset
+from tqdm import tqdm
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, "..")  # for utils
+from utils.functions import check_extension, load_dataset, save_dataset, run_all_in_pool, move_to
+
+
+class TSPDataset(Dataset):
+
+    def __init__(self, data=None):
+        super(TSPDataset, self).__init__()
+
+        self.data = data
+        self.size = len(self.data)
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+
+def solve_gurobi(directory, name, loc, disable_cache=False, timeout=None, gap=None):
+    # Lazy import so we do not need to have gurobi installed to run this script
+    from TSP_gurobi import solve_euclidian_tsp as solve_euclidian_tsp_gurobi
+
+    try:
+        problem_filename = os.path.join(directory, "{}.gurobi{}{}.pkl".format(
+            name, "" if timeout is None else "t{}".format(timeout), "" if gap is None else "gap{}".format(gap)))
+
+        if os.path.isfile(problem_filename) and not disable_cache:
+            (cost, tour, duration) = load_dataset(problem_filename)
+        else:
+            start = time.time()
+
+            cost, tour = solve_euclidian_tsp_gurobi(loc, threads=1, timeout=timeout, gap=gap)
+            duration = time.time() - start  # Measure clock time
+            save_dataset((cost, tour, duration), problem_filename)
+
+        total_cost = calc_tsp_length(loc, tour)
+        assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect"
+        return total_cost, tour, duration
+
+    except Exception as e:
+        # Sometimes the solver cannot find a feasible solution.
+        # By letting it fail we do not get total results, but we can retry via the caching mechanism
+        print("Exception occurred")
+        print(e)
+        return None
+
+
+def solve_concorde_log(executable, directory, name, loc, disable_cache=False):
+
+    problem_filename = os.path.join(directory, "{}.tsp".format(name))
+    tour_filename = os.path.join(directory, "{}.tour".format(name))
+    output_filename = os.path.join(directory, "{}.concorde.pkl".format(name))
+    log_filename = os.path.join(directory, "{}.log".format(name))
+
+    # if True:
+    try:
+        # May have already been run
+        if os.path.isfile(output_filename) and not disable_cache:
+            tour, duration = load_dataset(output_filename)
+        else:
+            write_tsplib(problem_filename, loc, name=name)
+
+            with open(log_filename, 'w') as f:
+                start = time.time()
+                try:
+                    # Concorde is weird, will leave traces of solution in current directory so call from target dir
+                    check_call([executable, '-s', '1234', '-x', '-o',
+                                os.path.abspath(tour_filename), os.path.abspath(problem_filename)],
+                               stdout=f, stderr=f, cwd=directory)
+                except CalledProcessError as e:
+                    # Somehow Concorde returns 255
+                    assert e.returncode == 255
+                duration = time.time() - start
+
+            tour = read_concorde_tour(tour_filename)
+            save_dataset((tour, duration), output_filename)
+
+        return calc_tsp_length(loc, tour), tour, duration
+
+    except Exception as e:
+        print("Exception occurred")
+        print(e)
+        return None
+
+
+def get_lkh_executable(url="http://www.akira.ruc.dk/~keld/research/LKH-3/LKH-3.0.7.tgz"):
+
+    cwd = os.path.abspath(os.path.join("problems", "vrp", "lkh"))
+    os.makedirs(cwd, exist_ok=True)
+
+    file = os.path.join(cwd, os.path.split(urlparse(url).path)[-1])
+    filedir = os.path.splitext(file)[0]
+
+    if not os.path.isdir(filedir):
+        print("{} not found, downloading and compiling".format(filedir))
+
+        check_call(["wget", url], cwd=cwd)
+        assert os.path.isfile(file), "Download failed, {} does not exist".format(file)
+        check_call(["tar", "xvfz", file], cwd=cwd)
+
+        assert os.path.isdir(filedir), "Extracting failed, dir {} does not exist".format(filedir)
+        check_call("make", cwd=filedir)
+        os.remove(file)
+
+    executable = os.path.join(filedir, "LKH")
+    assert os.path.isfile(executable)
+    return os.path.abspath(executable)
+
+
+def solve_lkh_log(executable, directory, name, loc, runs=1, disable_cache=False):
+
+    problem_filename = os.path.join(directory, "{}.lkh{}.vrp".format(name, runs))
+    tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs))
+    output_filename = os.path.join(directory, "{}.lkh{}.pkl".format(name, runs))
+    param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs))
+    log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs))
+
+    try:
+        # May have already been run
+        if os.path.isfile(output_filename) and not disable_cache:
+            tour, duration = load_dataset(output_filename)
+        else:
+            write_tsplib(problem_filename, loc, name=name)
+
+            params = {"PROBLEM_FILE": problem_filename, "OUTPUT_TOUR_FILE": tour_filename, "RUNS": runs, "SEED": 1234}
+            write_lkh_par(param_filename, params)
+
+            with open(log_filename, 'w') as f:
+                start = time.time()
+                check_call([executable, param_filename], stdout=f, stderr=f)
+                duration = time.time() - start
+
+            tour = read_tsplib(tour_filename)
+            save_dataset((tour, duration), output_filename)
+
+        return calc_tsp_length(loc, tour), tour, duration
+
+    except Exception as e:
+        print("Exception occurred")
+        print(e)
+        return None
+
+
+def write_lkh_par(filename, parameters):
+    default_parameters = {  # Use none to include as flag instead of kv
+        "MAX_TRIALS": 10000,
+        "RUNS": 10,
+        "TRACE_LEVEL": 1,
+        "SEED": 0
+    }
+    with open(filename, 'w') as f:
+        for k, v in {**default_parameters, **parameters}.items():
+            if v is None:
+                f.write("{}\n".format(k))
+            else:
+                f.write("{} = {}\n".format(k, v))
+
+
+def write_tsplib(filename, loc, name="problem"):
+
+    with open(filename, 'w') as f:
+        f.write("\n".join([
+            "{} : {}".format(k, v)
+            for k, v in (
+                ("NAME", name),
+                ("TYPE", "TSP"),
+                ("DIMENSION", len(loc)),
+                ("EDGE_WEIGHT_TYPE", "EUC_2D"),
+            )
+        ]))
+        f.write("\n")
+        f.write("NODE_COORD_SECTION\n")
+        f.write("\n".join([
+            "{}\t{}\t{}".format(i + 1, int(x * 10000000 + 0.5), int(y * 10000000 + 0.5))  # tsplib does not take floats
+            for i, (x, y) in enumerate(loc)
+        ]))
+        f.write("\n")
+        f.write("EOF\n")
+
+
+def read_concorde_tour(filename):
+    with open(filename, 'r') as f:
+        n = None
+        tour = []
+        for line in f:
+            if n is None:
+                n = int(line)
+            else:
+                tour.extend([int(node) for node in line.rstrip().split(" ")])
+    assert len(tour) == n, "Unexpected tour length"
+    return tour
+
+
+def read_tsplib(filename):
+    with open(filename, 'r') as f:
+        tour = []
+        dimension = 0
+        started = False
+        for line in f:
+            if started:
+                loc = int(line)
+                if loc == -1:
+                    break
+                tour.append(loc)
+            if line.startswith("DIMENSION"):
+                dimension = int(line.split(" ")[-1])
+
+            if line.startswith("TOUR_SECTION"):
+                started = True
+
+    assert len(tour) == dimension
+    tour = np.array(tour).astype(int) - 1  # Subtract 1 as TSPLIB nodes are 1-indexed
+    return tour.tolist()
+
+
+def calc_tsp_length(loc, tour):
+    assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates"
+    assert len(tour) == len(loc)
+    sorted_locs = np.array(loc)[np.concatenate((tour, [tour[0]]))]
+    return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum()
+
+
+def _calc_insert_cost(D, prv, nxt, ins):
+    """
+    Calculates insertion costs of inserting ins between prv and nxt
+    :param D: distance matrix
+    :param prv: node before inserted node, can be vector
+    :param nxt: node after inserted node, can be vector
+    :param ins: node to insert
+    :return:
+    """
+    return (
+        D[prv, ins]
+        + D[ins, nxt]
+        - D[prv, nxt]
+    )
+
+
+def run_insertion(loc, method):
+    n = len(loc)
+    D = distance_matrix(loc, loc)
+
+    mask = np.zeros(n, dtype=bool)
+    tour = []  # np.empty((0, ), dtype=int)
+    for i in range(n):
+        feas = mask == 0
+        feas_ind = np.flatnonzero(mask == 0)
+        if method == 'random':
+            # Order of instance is random so do in order for deterministic results
+            a = i
+        elif method == 'nearest':
+            if i == 0:
+                a = 0  # order does not matter so first is random
+            else:
+                a = feas_ind[D[np.ix_(feas, ~feas)].min(1).argmin()]  # node nearest to any in tour
+        elif method == 'cheapest':
+            assert False, "Not yet implemented"  # try all and find cheapest insertion cost
+
+        elif method == 'farthest':
+            if i == 0:
+                a = D.max(1).argmax()  # Node with farthest distance to any other node
+            else:
+                a = feas_ind[D[np.ix_(feas, ~feas)].min(1).argmax()]  # node which has closest node in tour farthest
+        mask[a] = True
+
+        if len(tour) == 0:
+            tour = [a]
+        else:
+            # Find index with least insert cost
+            ind_insert = np.argmin(
+                _calc_insert_cost(
+                    D,
+                    tour,
+                    np.roll(tour, -1),
+                    a
+                )
+            )
+            tour.insert(ind_insert + 1, a)
+
+    cost = D[tour, np.roll(tour, -1)].sum()
+    return cost, tour
+
+
+def solve_insertion(directory, name, loc, method='random'):
+    start = time.time()
+    cost, tour = run_insertion(loc, method)
+    duration = time.time() - start
+    return cost, tour, duration
+
+
+def calc_batch_pdist(dataset):
+    diff = (dataset[:, :, None, :] - dataset[:, None, :, :])
+    return torch.matmul(diff[:, :, :, None, :], diff[:, :, :, :, None]).squeeze(-1).squeeze(-1).sqrt()
+
+
+def nearest_neighbour(dataset, start='first'):
+    dist = calc_batch_pdist(dataset)
+
+    batch_size, graph_size, _ = dataset.size()
+
+    total_dist = dataset.new(batch_size).zero_()
+
+    if not isinstance(start, torch.Tensor):
+        if start == 'random':
+            start = dataset.new().long().new(batch_size).zero_().random_(0, graph_size)
+        elif start == 'first':
+            start = dataset.new().long().new(batch_size).zero_()
+        elif start == 'center':
+            _, start = dist.mean(2).min(1)  # Minimum total distance to others
+        else:
+            assert False, "Unknown start: {}".format(start)
+
+    current = start
+    dist_to_startnode = torch.gather(dist, 2, current.view(-1, 1, 1).expand(batch_size, graph_size, 1)).squeeze(2)
+    tour = [current]
+
+    for i in range(graph_size - 1):
+        # Mask out the current node so it cannot be selected again
+        dist.scatter_(2, current.view(-1, 1, 1).expand(batch_size, graph_size, 1), np.inf)
+        nn_dist = torch.gather(dist, 1, current.view(-1, 1, 1).expand(batch_size, 1, graph_size)).squeeze(1)
+
+        min_nn_dist, current = nn_dist.min(1)
+        total_dist += min_nn_dist
+        tour.append(current)
+
+    total_dist += torch.gather(dist_to_startnode, 1, current.view(-1, 1)).squeeze(1)
+
+    return total_dist, torch.stack(tour, dim=1)
+
+
+def solve_all_nn(dataset_path, eval_batch_size=1024, no_cuda=False, dataset_n=None, progress_bar_mininterval=0.1):
+    import torch
+    from torch.utils.data import DataLoader
+    from TSProblemDef import get_random_problems
+
+    num_samples = dataset_n if dataset_n is not None else 1000000
+    data = get_random_problems(batch_size=num_samples, path=dataset_path)
+    dataloader = DataLoader(TSPDataset(data=data), batch_size=eval_batch_size)
+    device = torch.device("cuda:0" if torch.cuda.is_available() and not no_cuda else "cpu")
+    results = []
+    for batch in tqdm(dataloader, mininterval=progress_bar_mininterval):
+        start = time.time()
+        batch = move_to(batch, device)
+
+        lengths, tours = nearest_neighbour(batch)
+        # NOTE: assumes a TSP problem class providing get_costs(); it is not imported above
+        lengths_check, _ = TSP.get_costs(batch, tours)
+
+        assert (torch.abs(lengths - lengths_check.data) < 1e-5).all()
+
+        duration = time.time() - start
+        results.extend(
+            [(cost.item(), np.trim_zeros(pi.cpu().numpy(), 'b'), duration) for cost, pi in zip(lengths, tours)])
+
+    return results, eval_batch_size
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("method", type=str, default='concorde',
+                        choices=['nn', "gurobi", "gurobigap", "gurobit", "concorde", "lkh", "random_insertion", "nearest_insertion", "farthest_insertion"])
+    parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate")
+    parser.add_argument("-f", action='store_true', help="Set true to overwrite")
+    parser.add_argument("-o", default=None, help="Name of the results file to write")
+    parser.add_argument("--cpus", type=int, help="Number of CPUs to use, defaults to all cores")
+    parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA')
+    parser.add_argument('--disable_cache', action='store_true', help='Disable caching')
+    parser.add_argument('--max_calc_batch_size', type=int, default=1000, help='Size for subbatches')
+    parser.add_argument('--progress_bar_mininterval', type=float, default=0.1, help='Minimum interval')
+    parser.add_argument('-n', type=int, help="Number of instances to process")
+    parser.add_argument('--offset', type=int, help="Offset where to start processing")
+    parser.add_argument('--results_dir', default='results', help="Name of results directory")
+
+    opts = parser.parse_args()
+
+    assert opts.o is None or len(opts.datasets) == 1, "Cannot specify result filename with more than one dataset"
+
+    for dataset_path in opts.datasets:
+
+        assert os.path.isfile(check_extension(dataset_path)), "File does not exist!"
+
+        dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
+
+        if opts.o is None:
+            results_dir = os.path.join(opts.results_dir, "tsp", dataset_basename)
+            os.makedirs(results_dir, exist_ok=True)
+
+            out_file = os.path.join(results_dir, "{}{}{}-{}{}".format(
+                dataset_basename,
+                "offs{}".format(opts.offset) if opts.offset is not None else "",
+                "n{}".format(opts.n) if opts.n is not None else "",
+                opts.method, ext
+            ))
+        else:
+            out_file = opts.o
+
+        assert opts.f or not os.path.isfile(
+            out_file), "File already exists! Try running with -f option to overwrite."
+
+        match = re.match(r'^([a-z_]+)(\d*)$', opts.method)
+        assert match
+        method = match[1]
+        runs = 1 if match[2] == '' else int(match[2])
+
+        if method == "nn":
+            assert opts.offset is None, "Offset not supported for nearest neighbor"
+
+            eval_batch_size = opts.max_calc_batch_size
+
+            results, parallelism = solve_all_nn(
+                dataset_path, eval_batch_size, opts.no_cuda, opts.n,
+                opts.progress_bar_mininterval
+            )
+        elif method in ("gurobi", "gurobigap", "gurobit", "concorde", "lkh") or method[-9:] == 'insertion':
+
+            target_dir = os.path.join(results_dir, "{}-{}".format(
+                dataset_basename,
+                opts.method
+            ))
+            assert opts.f or not os.path.isdir(target_dir), \
+                "Target dir already exists! Try running with -f option to overwrite."
+
+            if not os.path.isdir(target_dir):
+                os.makedirs(target_dir)
+
+            # TSP contains single loc array rather than tuple
+            dataset = [(instance, ) for instance in load_dataset(dataset_path)]
+
+            if method == "concorde":
+                use_multiprocessing = False
+                executable = os.path.abspath(os.path.join('problems', 'tsp', 'concorde', 'concorde', 'TSP', 'concorde'))
+
+                def run_func(args):
+                    return solve_concorde_log(executable, *args, disable_cache=opts.disable_cache)
+
+            elif method == "lkh":
+                use_multiprocessing = False
+                executable = get_lkh_executable()
+
+                def run_func(args):
+                    return solve_lkh_log(executable, *args, runs=runs, disable_cache=opts.disable_cache)
+
+            elif method[:6] == "gurobi":
+                use_multiprocessing = True  # We run one thread per instance
+
+                def run_func(args):
+                    return solve_gurobi(*args, disable_cache=opts.disable_cache,
+                                        timeout=runs if method[6:] == "t" else None,
+                                        gap=float(runs) if method[6:] == "gap" else None)
+            else:
+                assert method[-9:] == "insertion"
+                use_multiprocessing = True
+
+                def run_func(args):
+                    return solve_insertion(*args, opts.method.split("_")[0])
+
+            results, parallelism = run_all_in_pool(
+                run_func,
+                target_dir, dataset, opts, use_multiprocessing=use_multiprocessing
+            )
+
+        else:
+            assert False, "Unknown method: {}".format(opts.method)
+
+        costs, tours, durations = zip(*results)
+        print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs))))
+        print("Average serial duration: {} +- {}".format(
+            np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations))))
+        print("Average parallel duration: {}".format(np.mean(durations) / parallelism))
+        print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism))))
+
+        save_dataset((results, parallelism), out_file)
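
Typical invocations of the new script might look like the following (the dataset paths are illustrative; a numeric method suffix encodes runs/timeout/gap for LKH/Gurobi, e.g. lkh10, though note the argparse choices list above only admits the bare method names):

python TSP_baseline.py concorde ../../data/TSP/tsp100_uniform.pkl -n 64 --cpus 8
python TSP_baseline.py farthest_insertion ../../data/TSP/tsp100_uniform.pkl -f
python TSP_baseline.py nn ../../data/TSP/tsp200_uniform.pkl --no_cuda
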
diff --git a/POMO/TSP/baselines.py b/POMO/TSP/TSP_gurobi.py
similarity index 100%
rename from POMO/TSP/baselines.py
rename to POMO/TSP/TSP_gurobi.py
diff --git a/POMO/TSP/TSProblemDef.py b/POMO/TSP/TSProblemDef.py
index 5e00b68..1c17d5a 100644
--- a/POMO/TSP/TSProblemDef.py
+++ b/POMO/TSP/TSProblemDef.py
@@ -10,12 +10,14 @@ def generate_task_set(meta_params):
    if meta_params['data_type'] == "distribution":
        # focus on the TSP100 with different distributions
-       task_set = [(m, l) for l in [1, 10, 20, 30, 50] for m in range(1, 1 + meta_params['num_task'] // 5)] + [(0, 0)]
+       # task_set = [(m, l) for l in [1, 10, 20, 30, 50] for m in range(1, 1 + meta_params['num_task'] // 5)] + [(0, 0)]
+       task_set = [(eps,) for eps in range(0, 0 + meta_params['num_task'] + 1)]
    elif meta_params['data_type'] == "size":
        # focus on uniform distribution with different sizes
-       task_set = [(n,) for n in range(10, 10 + 10 * meta_params['num_task'], 10)]
+       task_set = [(n,) for n in range(20, 20 + meta_params['num_task'] + 1)]
    elif meta_params['data_type'] == "size_distribution":
-       task_set = [(m, l) for l in [1, 10, 20, 30, 50] for m in range(1, 11)] + [(0, 0)]
-       task_set = [(n, m, l) for n in [25, 50, 75, 100, 125, 150] for (m, l) in task_set]
+       # task_set = [(m, l) for l in [1, 10, 20, 30, 50] for m in range(1, 11)] + [(0, 0)]
+       # task_set = [(n, m, l) for n in [25, 50, 75, 100, 125, 150] for (m, l) in task_set]
+       task_set = [(n, eps) for n in range(20, 20 + meta_params['num_task'] + 1) for eps in range(0, 0 + meta_params['num_task'] + 1, 10)]
    else:
        raise NotImplementedError
    print(">> Generating training task set: {} tasks with type {}".format(len(task_set), meta_params['data_type']))
@@ -43,6 +45,7 @@ def get_random_problems(batch_size, problem_size, num_modes=0, cdist=0, distribu
    if path is not None:
        with open(os.path.join(path, "tsp{}_{}.pkl".format(problem_size, distribution)), "wb") as f:
            pickle.dump(problems, f, pickle.HIGHEST_PROTOCOL)
+   problems = problems[: batch_size]

    # return tensor
    if not torch.is_tensor(problems):
diff --git a/POMO/TSP/test_n100.py b/POMO/TSP/test.py
similarity index 100%
rename from POMO/TSP/test_n100.py
rename to POMO/TSP/test.py
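
With this change the task sets become plain integer grids; a quick sanity check of what generate_task_set now produces for the num_task=130 setting used in train.py (derived directly from the comprehensions above):

num_task = 130
size_tasks = [(n,) for n in range(20, 20 + num_task + 1)]        # (20,), ..., (150,)
dist_tasks = [(eps,) for eps in range(0, num_task + 1)]          # (0,), ..., (130,)
mix_tasks = [(n, eps) for n in range(20, 20 + num_task + 1)
             for eps in range(0, num_task + 1, 10)]              # eps in {0, 10, ..., 130}
assert len(size_tasks) == 131 and len(dist_tasks) == 131 and len(mix_tasks) == 131 * 14
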
diff --git a/POMO/TSP/train_n100.py b/POMO/TSP/train.py
similarity index 79%
rename from POMO/TSP/train_n100.py
rename to POMO/TSP/train.py
index a7c0eaf..3fd1786 100644
--- a/POMO/TSP/train_n100.py
+++ b/POMO/TSP/train.py
@@ -17,8 +17,8 @@
 # parameters

 env_params = {
-    'problem_size': 50,
-    'pomo_size': 50,
+    'problem_size': 100,
+    'pomo_size': 100,
 }

 model_params = {
@@ -57,7 +57,6 @@
     'stop_criterion': 'epochs',  # epochs or time
     'train_episodes': 100000,  # number of instances per epoch
     'train_batch_size': 64,
-    'adv_train': False,
     'logging': {
         'model_save_interval': 5000,
         'img_save_interval': 5000,
@@ -76,19 +75,17 @@
         # 'epoch': 510,  # epoch version of pre-trained model to load.
     },
-    # For fomaml, k needs to be small (1 or 2), but the performance is still inferior.
-    # For reptile, performance is quite well, however, after several iteration, the improvement in inner-loop is trivial.
     'meta_params': {
         'enable': True,  # whether to use meta-learning or not
+        'curriculum': True,
         'meta_method': 'maml',  # choose from ['maml', 'fomaml', 'reptile']
         'bootstrap_steps': 0,
-        'data_type': 'size',  # choose from ["size", "distribution", "size_distribution"]
+        'data_type': 'distribution',  # choose from ["size", "distribution", "size_distribution"]
         'epochs': 50000,  # the number of meta-model updates: (250*100000) / (1*5*64)
         'B': 1,  # the number of tasks in a mini-batch
-        'k': 3,  # gradient decent steps in the inner-loop optimization of meta-learning method
-        'meta_batch_size': 64,  # the batch size of the inner-loop optimization
-        'val_batch_size': 64,
-        'num_task': 5,  # the number of tasks in the training task set
+        'k': 1,  # gradient descent steps in the inner-loop optimization of meta-learning method
+        'meta_batch_size': 64,  # will be divided by 2 if problem_size > 100
+        'num_task': 130,  # the number of tasks in the training task set: e.g., [20, 150] / [0, 130]
         'alpha': 0.99,  # params for the outer-loop optimization of reptile
         'alpha_decay': 0.999,  # params for the outer-loop optimization of reptile
     }
@@ -96,7 +93,7 @@

 logger_params = {
     'log_file': {
-        'desc': 'train_tsp_n50',
+        'desc': 'train_tsp',
         'filename': 'log.txt'
     }
 }
@@ -140,5 +137,24 @@ def _print_config():
     [logger.info(g_key + "{}".format(globals()[g_key])) for g_key in globals().keys() if g_key.endswith('params')]


+def check_mem(cuda_device):
+    devices_info = os.popen('"/usr/bin/nvidia-smi" --query-gpu=memory.total,memory.used --format=csv,nounits,noheader').read().strip().split("\n")
+    total, used = devices_info[int(cuda_device)].split(',')
+    return total, used
+
+
+def occupy_mem(cuda_device):
+    # Pre-allocate ~85% of the GPU's memory up-front: 256 * 1024 floats * 4 bytes
+    # = 1 MiB per unit of block_mem, then free the tensor so PyTorch's caching
+    # allocator keeps the reservation.
+    torch.cuda.set_device(cuda_device)
+    total, used = check_mem(cuda_device)
+    total = int(total)
+    used = int(used)
+    max_mem = int(total * 0.85)
+    block_mem = max_mem - used
+    x = torch.cuda.FloatTensor(256, 1024, block_mem)
+    del x
+
+
 if __name__ == "__main__":
+    if trainer_params["meta_params"]["data_type"] in ["size", "size_distribution"]:
+        occupy_mem(CUDA_DEVICE_NUM)
     main()
diff --git a/POMO/utils/functions.py b/POMO/utils/functions.py
index bbc55f0..aa928c7 100644
--- a/POMO/utils/functions.py
+++ b/POMO/utils/functions.py
@@ -48,6 +48,12 @@ def display_num_param(net):
     print('There are {} ({:.2f} million) parameters in this neural network'.format(nb_param, nb_param/1e6))


+def move_to(var, device):
+    if isinstance(var, dict):
+        return {k: move_to(v, device) for k, v in var.items()}
+    return var.to(device)
+
+
 def clip_grad_norms(param_groups, max_norm=math.inf):
     """
     Clips the norms for all param groups to max_norm and returns gradient norms before clipping
@@ -64,6 +70,37 @@ def clip_grad_norms(param_groups, max_norm=math.inf):
     return grad_norms, grad_norms_clipped


+def run_all_in_pool(func, directory, dataset, opts, use_multiprocessing=True):
+    # NOTE: requires Pool (multiprocessing), ThreadPool (multiprocessing.dummy) and tqdm at module level
+    # # Test
+    # res = func((directory, 'test', *dataset[0]))
+    # return [res]
+
+    num_cpus = os.cpu_count() if opts.cpus is None else opts.cpus
+
+    w = len(str(len(dataset) - 1))
+    offset = getattr(opts, 'offset', None)
+    if offset is None:
+        offset = 0
+    ds = dataset[offset:(offset + opts.n if opts.n is not None else len(dataset))]
+    pool_cls = (Pool if use_multiprocessing and num_cpus > 1 else ThreadPool)
+    with pool_cls(num_cpus) as pool:
+        results = list(tqdm(pool.imap(
+            func,
+            [
+                (
+                    directory,
+                    str(i + offset).zfill(w),
+                    *problem
+                )
+                for i, problem in enumerate(ds)
+            ]
+        ), total=len(ds), mininterval=opts.progress_bar_mininterval))
+
+    failed = [str(i + offset) for i, res in enumerate(results) if res is None]
+    assert len(failed) == 0, "Some instances failed: {}".format(" ".join(failed))
+    return results, num_cpus
+
+
 def show(x, y, label, title, xdes, ydes, path, x_scale="linear", dpi=300):
     plt.style.use('fast')  # bmh, fivethirtyeight, Solarize_Light2
     plt.figure(figsize=(8, 8))
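
A minimal usage sketch for the run_all_in_pool helper above (solve_one and the opts fields are stand-ins mirroring how TSP_baseline.py drives it; use_multiprocessing=False routes through the thread pool, which avoids pickling the locally defined function):

from argparse import Namespace

def solve_one(args):
    directory, name, instance = args   # (target_dir, zero-padded index, problem instance)
    tour = list(range(len(instance)))  # ... call a real solver here ...
    return 0.0, tour, 0.0              # (cost, tour, duration); returning None marks a failure

opts = Namespace(cpus=2, n=None, offset=None, progress_bar_mininterval=0.1)
dataset = [([(0.0, 0.0), (1.0, 0.0), (0.5, 1.0)],)] * 4  # 4 toy instances, tupled as in the script
results, parallelism = run_all_in_pool(solve_one, "results/tmp", dataset, opts,
                                       use_multiprocessing=False)
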