diff --git a/.gitignore b/.gitignore
index dcca880..31a28a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ __pycache__/
 # data & pretrain-model
 AM/
 result/
+backup/
+data/
 
 # private files
 utils_plot*
diff --git a/AM/adv_run.py b/AM/adv_run.py
deleted file mode 100755
index 2855ec3..0000000
--- a/AM/adv_run.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import random
-import json
-import pprint as pp
-from datetime import datetime
-import numpy as np
-
-import torch
-import torch.optim as optim
-
-from nets.critic_network import CriticNetwork
-from options import get_options
-from train import meta_train_epoch, validate, get_inner_model
-from reinforce_baselines import NoBaseline, ExponentialBaseline, CriticBaseline, RolloutBaseline, WarmupBaseline
-from nets.attention_model import AttentionModel
-from nets.pointer_network import PointerNetwork, CriticNetworkLSTM
-from utils import torch_load_cpu, load_problem, seed_everything, save_checkpoint
-from generate_dataset import generate_train_task
-
-
-def run(opts):
-    # hard-coded
-    opts.graph_size = 40  # for variation_type == size
-    opts.variation_type = "dist"
-    opts.baseline_every_Xepochs_for_META = 7
-    opts.val_dataset = "../data/size/tsp/tsp100_validation_seed4321.pkl"
-
-    # Pretty print the run args
-    pp.pprint(vars(opts))
-
-    # Set the random seed
-    seed_everything(opts.seed)
-
-    # Optionally configure tensorboard
-    tb_logger = None
-    # if not opts.no_tensorboard:
-    #     tb_logger = TbLogger(os.path.join(opts.log_dir, "{}_{}".format(opts.problem, opts.graph_size), opts.run_name))
-
-    os.makedirs(opts.save_dir)
-    # Save arguments so exact configuration can always be found
-    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
-        json.dump(vars(opts), f, indent=True)
-
-    # Set the device
-    opts.device = torch.device("cuda" if opts.use_cuda else "cpu")
-
-    # Figure out what's the problem
-    problem = load_problem(opts.problem)
-
-    # Load data from load_path
-    load_data = {}
-    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
-    load_path = opts.load_path if opts.load_path is not None else opts.resume
-    if load_path is not None:
-        print(' [*] Loading data from {}'.format(load_path))
-        load_data = torch_load_cpu(load_path)
-        if opts.resume:
-            epoch_resume = int(os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])
-            opts.epoch_start = epoch_resume + 1
-
-    # Initialize model
-    model_class = {
-        'attention': AttentionModel,
-        'pointer': PointerNetwork
-    }.get(opts.model, None)
-    assert model_class is not None, "Unknown model: {}".format(model_class)
-    model_meta = model_class(
-        opts.embedding_dim,
-        opts.hidden_dim,
-        problem,
-        n_encode_layers=opts.n_encode_layers,
-        mask_inner=True,
-        mask_logits=True,
-        normalization=opts.normalization,
-        tanh_clipping=opts.tanh_clipping,
-        checkpoint_encoder=opts.checkpoint_encoder,
-        shrink_size=opts.shrink_size
-    ).to(opts.device)
-
-    # if opts.use_cuda and torch.cuda.device_count() > 1:
-    #     model_meta = torch.nn.DataParallel(model_meta)
-
-    # Overwrite model parameters by parameters to load
-    model_ = get_inner_model(model_meta)
-    model_.load_state_dict({**model_.state_dict(), **load_data.get('model', {})})
-
-    # generate tasks based on task distribution.
-    tasks_list = generate_train_task(opts)
-    # for i in range(3):
-    #     task_prop = {'graph_size': opts.graph_size, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': 'none'}
-    #     tasks_list.append(task_prop)
-
-    baseline_dict, val_dict = {}, {}
-    print("{} tasks in task list: {}".format(len(tasks_list), tasks_list))
-
-    for task in tasks_list:
-        baseline = RolloutBaseline(model_meta, problem, opts, task=task)
-        baseline_dict[str(task)] = baseline
-        val_dataset = problem.make_dataset(num_samples=opts.val_size, distribution=opts.data_distribution, task=task)
-        val_dict[str(task)] = val_dataset
-
-    alpha = opts.alpha
-    start_time = datetime.now()
-    for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
-        if (datetime.now() - start_time).total_seconds() >= 24*60*60:
-            print(">> Time Out: 24hrs. Training finished {} epochs".format(epoch))
-            break
-        print(">> Epoch {}, alpha: {}".format(epoch, alpha))
-        for index_task, task in enumerate(tasks_list):
-            baseline = baseline_dict[str(task)]
-            val_dataset = val_dict[str(task)]
-            # eps = random.sample(np.arange(0, 50, 0.1).tolist(), 1)[0]
-            eps = random.sample([0, 0.1, 0.5, 1, 5, 10, 25, 50], 1)[0]
-            meta_train_epoch(model_meta, baseline, epoch, val_dataset, problem, tb_logger, opts, alpha, task, eps=eps)
-
-        alpha = alpha * opts.alpha_decay
-
-        if (opts.checkpoint_epochs != 0 and epoch % opts.checkpoint_epochs == 0) or epoch == opts.n_epochs - 1:
-            print('Saving model and state...')
-            save_checkpoint(model_meta, os.path.join(opts.save_dir, 'epoch-{}.pt'.format(epoch)))
-
-        # add validation here.
-        if opts.val_dataset is not None:
-            val_dataset = problem.make_dataset(filename=opts.val_dataset_path)
-            avg_reward = validate(model_meta, val_dataset, opts)
-            print(">> Epoch {} avg_cost on TSP100 validation set {}".format(epoch, avg_reward))
-
-
-if __name__ == "__main__":
-    run(get_options())
diff --git a/AM/compute_opt.py b/AM/compute_opt.py
deleted file mode 100644
index 15624b5..0000000
--- a/AM/compute_opt.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import pickle
-import argparse
-from problems.tsp.tsp_gurobi import solve_all_gurobi
-from utils import load_problem
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Compute (near-)opt solution.")
-
-    # Data
-    parser.add_argument('--problem', default='tsp', help="The problem to solve, default 'tsp'")
-    parser.add_argument('--path', type=str, default="../new_data/size/tsp", help='Dataset file to use for validation')
-    parser.add_argument('--offset', type=int, default=0, help='Offset where to start in dataset (default 0)')
-    parser.add_argument('--num_samples', type=int, default=10000, help='Number of samples to evaluate (default 10000)')
-
-    opts = parser.parse_args()
-
-    assert opts.problem == "tsp", "Unsupported currently!"
-
-    files = os.listdir(opts.path)
-    problem = load_problem(opts.problem)
-    for file in files:
-        if os.path.splitext(file)[-1][1:] not in ["pkl"]:
-            print("Unsupported file detected: {}".format(file))
-            continue
-        print(">> Solving dataset {}".format(file))
-        dataset = problem.make_dataset(filename=files, offset=0, num_samples=num_samples)
-        opt_sol = solve_all_gurobi(dataset)
-        # save results
-        name = os.path.basename(file)[:-4] + "_opt.pkl"
-        res_path = os.path.join(opts.path, name)
-        with open(res_path, 'wb') as f:
-            pickle.dump(opt_sol, f, pickle.HIGHEST_PROTOCOL)
diff --git a/AM/eval.py b/AM/eval.py
deleted file mode 100644
index 6b181ad..0000000
--- a/AM/eval.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import math
-import torch
-import os
-import argparse
-import numpy as np
-import itertools
-from tqdm import tqdm
-from utils import load_model, move_to
-from utils.data_utils import save_dataset
-from torch.utils.data import DataLoader
-import time
-from datetime import timedelta
-from utils.functions import parse_softmax_temperature
-mp = torch.multiprocessing.get_context('spawn')
-
-
-def get_best(sequences, cost, ids=None, batch_size=None):
-    """
-    Ids contains [0, 0, 0, 1, 1, 2, ..., n, n, n] if 3 solutions found for 0th instance, 2 for 1st, etc
-    :param sequences:
-    :param lengths:
-    :param ids:
-    :return: list with n sequences and list with n lengths of solutions
-    """
-    if ids is None:
-        idx = cost.argmin()
-        return sequences[idx:idx+1, ...], cost[idx:idx+1, ...]
-
-    splits = np.hstack([0, np.where(ids[:-1] != ids[1:])[0] + 1])
-    mincosts = np.minimum.reduceat(cost, splits)
-
-    group_lengths = np.diff(np.hstack([splits, len(ids)]))
-    all_argmin = np.flatnonzero(np.repeat(mincosts, group_lengths) == cost)
-    result = np.full(len(group_lengths) if batch_size is None else batch_size, -1, dtype=int)
-
-    result[ids[all_argmin[::-1]]] = all_argmin[::-1]
-
-    return [sequences[i] if i >= 0 else None for i in result], [cost[i] if i >= 0 else math.inf for i in result]
-
-
-def eval_dataset_mp(args):
-    (dataset_path, width, softmax_temp, opts, i, num_processes) = args
-
-    model, _ = load_model(opts.model)
-    val_size = opts.val_size // num_processes
-    dataset = model.problem.make_dataset(filename=dataset_path, num_samples=val_size, offset=opts.offset + val_size * i)
-    device = torch.device("cuda:{}".format(i))
-
-    return _eval_dataset(model, dataset, width, softmax_temp, opts, device)
-
-
-def eval_dataset(dataset_path, width, softmax_temp, opts):
-    # Even with multiprocessing, we load the model here since it contains the name where to write results
-    model, _ = load_model(opts.model)
-    use_cuda = torch.cuda.is_available() and not opts.no_cuda
-    if opts.multiprocessing:
-        assert use_cuda, "Can only do multiprocessing with cuda"
-        num_processes = torch.cuda.device_count()
-        assert opts.val_size % num_processes == 0
-
-        with mp.Pool(num_processes) as pool:
-            results = list(itertools.chain.from_iterable(pool.map(
-                eval_dataset_mp,
-                [(dataset_path, width, softmax_temp, opts, i, num_processes) for i in range(num_processes)]
-            )))
-
-    else:
-        device = torch.device("cuda:0" if use_cuda else "cpu")
-        dataset = model.problem.make_dataset(filename=dataset_path, num_samples=opts.val_size, offset=opts.offset)
-        results = _eval_dataset(model, dataset, width, softmax_temp, opts, device)
-
-    # This is parallelism, even if we use multiprocessing (we report as if we did not use multiprocessing, e.g.
1 GPU) - parallelism = opts.eval_batch_size - - costs, tours, durations = zip(*results) # Not really costs since they should be negative - - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - - dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1]) - model_name = "_".join(os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)[-2:]) - if opts.o is None: - results_dir = os.path.join(opts.results_dir, model.problem.NAME, dataset_basename) - os.makedirs(results_dir, exist_ok=True) - - out_file = os.path.join(results_dir, "{}-{}-{}{}-t{}-{}-{}{}".format( - dataset_basename, model_name, - opts.decode_strategy, - width if opts.decode_strategy != 'greedy' else '', - softmax_temp, opts.offset, opts.offset + len(costs), ext - )) - else: - out_file = opts.o - - assert opts.f or not os.path.isfile( - out_file), "File already exists! Try running with -f option to overwrite." - - save_dataset((results, parallelism), out_file) - - return costs, tours, durations - - -def _eval_dataset(model, dataset, width, softmax_temp, opts, device): - - model.to(device) - model.eval() - - model.set_decode_type( - "greedy" if opts.decode_strategy in ('bs', 'greedy') else "sampling", - temp=softmax_temp) - - dataloader = DataLoader(dataset, batch_size=opts.eval_batch_size) - - results = [] - for batch in tqdm(dataloader, disable=opts.no_progress_bar): - batch = move_to(batch, device) - - start = time.time() - with torch.no_grad(): - if opts.decode_strategy in ('sample', 'greedy'): - if opts.decode_strategy == 'greedy': - assert width == 0, "Do not set width when using greedy" - assert opts.eval_batch_size <= opts.max_calc_batch_size, \ - "eval_batch_size should be smaller than calc batch size" - batch_rep = 1 - iter_rep = 1 - elif width * opts.eval_batch_size > opts.max_calc_batch_size: - assert opts.eval_batch_size == 1 - assert width % opts.max_calc_batch_size == 0 - batch_rep = opts.max_calc_batch_size - iter_rep = width // opts.max_calc_batch_size - else: - batch_rep = width - iter_rep = 1 - assert batch_rep > 0 - # This returns (batch_size, iter_rep shape) - sequences, costs = model.sample_many(batch, batch_rep=batch_rep, iter_rep=iter_rep) - batch_size = len(costs) - ids = torch.arange(batch_size, dtype=torch.int64, device=costs.device) - else: - assert opts.decode_strategy == 'bs' - - cum_log_p, sequences, costs, ids, batch_size = model.beam_search( - batch, beam_size=width, - compress_mask=opts.compress_mask, - max_calc_batch_size=opts.max_calc_batch_size - ) - - if sequences is None: - sequences = [None] * batch_size - costs = [math.inf] * batch_size - else: - sequences, costs = get_best( - sequences.cpu().numpy(), costs.cpu().numpy(), - ids.cpu().numpy() if ids is not None else None, - batch_size - ) - duration = time.time() - start - for seq, cost in zip(sequences, costs): - if model.problem.NAME == "tsp": - seq = seq.tolist() # No need to trim as all are same length - elif model.problem.NAME in ("cvrp", "sdvrp"): - seq = np.trim_zeros(seq).tolist() + [0] # Add depot - elif model.problem.NAME in ("op", "pctsp"): - seq = np.trim_zeros(seq) # We have the convention to exclude the depot - else: - assert False, "Unkown problem: 
{}".format(model.problem.NAME) - # Note VRP only - results.append((cost, seq, duration)) - - return results - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument("-o", default=None, help="Name of the results file to write") - parser.add_argument('--val_size', type=int, default=10000, - help='Number of instances used for reporting validation performance') - parser.add_argument('--offset', type=int, default=0, - help='Offset where to start in dataset (default 0)') - parser.add_argument('--eval_batch_size', type=int, default=1024, - help="Batch size to use during (baseline) evaluation") - # parser.add_argument('--decode_type', type=str, default='greedy', - # help='Decode type, greedy or sampling') - parser.add_argument('--width', type=int, nargs='+', - help='Sizes of beam to use for beam search (or number of samples for sampling), ' - '0 to disable (default), -1 for infinite') - parser.add_argument('--decode_strategy', type=str, - help='Beam search (bs), Sampling (sample) or Greedy (greedy)') - parser.add_argument('--softmax_temperature', type=parse_softmax_temperature, default=1, - help="Softmax temperature (sampling or bs)") - parser.add_argument('--model', type=str) - parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA') - parser.add_argument('--no_progress_bar', action='store_true', help='Disable progress bar') - parser.add_argument('--compress_mask', action='store_true', help='Compress mask into long') - parser.add_argument('--max_calc_batch_size', type=int, default=10000, help='Size for subbatches') - parser.add_argument('--results_dir', default='results', help="Name of results directory") - parser.add_argument('--multiprocessing', action='store_true', - help='Use multiprocessing to parallelize over multiple GPUs') - - opts = parser.parse_args() - - assert opts.o is None or (len(opts.datasets) == 1 and len(opts.width) <= 1), \ - "Cannot specify result filename with more than one dataset or more than one width" - - widths = opts.width if opts.width is not None else [0] - - for width in widths: - for dataset_path in opts.datasets: - eval_dataset(dataset_path, width, opts.softmax_temperature, opts) diff --git a/AM/eval_fine_tuning.py b/AM/eval_fine_tuning.py deleted file mode 100644 index dc5c738..0000000 --- a/AM/eval_fine_tuning.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python - -import os -import json -import pprint as pp - -import torch -import torch.optim as optim -from nets.critic_network import CriticNetwork -from options import get_options -from train import train_epoch, validate, get_inner_model -from reinforce_baselines import NoBaseline, ExponentialBaseline, CriticBaseline, RolloutBaseline, WarmupBaseline -from nets.attention_model import AttentionModel -from nets.pointer_network import PointerNetwork, CriticNetworkLSTM -from utils import torch_load_cpu, load_problem, seed_everything -from train import clip_grad_norms, tune_and_test -from generate_dataset import generate_test_task -import copy -import pickle -import os - - -def run(opts): - # hard-coded - opts.graph_size = -1 # for variation_type == size - opts.load_path = "/data/yxwu/jianan/generalization-NCO/AM/outputs/tsp_40/run_20220812T202208/epoch-1000.pt" - opts.variation_type = "size" - opts.dir_path = '../new_data/{}/{}/'.format(opts.variation_type, opts.problem) - test_seed, 
fine_tune_seed = 2023, 2022 - opts.test_result_pickle_file = os.path.split(opts.load_path)[-1] - - # Pretty print the run args - pp.pprint(vars(opts)) - - # Set the random seed - seed_everything(opts.seed) - - # Optionally configure tensorboard - tb_logger = None - - # Set the device - opts.device = torch.device("cuda" if opts.use_cuda else "cpu") - print(opts.device) - - # Figure out what's the problem - problem = load_problem(opts.problem) - - # Load data from load_path - load_data = {} - assert opts.load_path is not None, "load path cannot be None!" - load_path = opts.load_path - if load_path is not None: - print(' [*] Loading data from {}'.format(load_path)) - load_data = torch_load_cpu(load_path) - - # Initialize model - model_class = { - 'attention': AttentionModel, - 'pointer': PointerNetwork - }.get(opts.model, None) - assert model_class is not None, "Unknown model: {}".format(model_class) - print("load") - model_meta = model_class( - opts.embedding_dim, - opts.hidden_dim, - problem, - n_encode_layers=opts.n_encode_layers, - mask_inner=True, - mask_logits=True, - normalization=opts.normalization, - tanh_clipping=opts.tanh_clipping, - checkpoint_encoder=opts.checkpoint_encoder, - shrink_size=opts.shrink_size - ).to(opts.device) - - if opts.use_cuda and torch.cuda.device_count() > 1: - model_meta = torch.nn.DataParallel(model_meta) - - model_ = get_inner_model(model_meta) - model_.load_state_dict({**model_.state_dict(), **load_data.get('model', {})}) - - tasks_list = generate_test_task(opts, test_seed, fine_tune_seed) - print("Task list: {}".format(tasks_list)) - - baseline_dict, val_dict, fine_tuning_dict = {}, {}, {} - for task in tasks_list: - baseline = RolloutBaseline(model_meta, problem, opts, task=task, update_baseline=False) - baseline_dict[str(task)] = baseline - print(">> Loading test/fine-tune dataset for task {}".format(task)) - val_dataset = problem.make_dataset(filename=opts.dir_path+task['test_dataset'], task=task) - val_dict[str(task)] = val_dataset - fine_tuning_dataset = problem.make_dataset(filename=opts.dir_path+task['fine_tuning_dataset'], task=task) - fine_tuning_dict[str(task)] = fine_tuning_dataset - - total_reward_tasks = 0 - dict_results_task_sample_iter_wise = {} - tune_sequence = [] - epoch = 99999 - - for task in tasks_list: - task_string = None - if opts.variation_type == 'dist': - task_string = task['num_modes'] - if opts.variation_type == 'scale': - task_string = str(task['low']) + '_' + str(task['high']) - if opts.variation_type == 'size': - task_string = task['graph_size'] - if opts.variation_type == 'cap_vrp': - task_string = task['vrp_capacity'] - if opts.variation_type == 'mix_dist_size': - task_string = task['num_modes'] - print(">> opts.variation_type:", opts.variation_type) - print(">> task:", task) - print(">> task_string:", task_string) - - baseline = baseline_dict[str(task)] - val_dataset = val_dict[str(task)] - fine_tuning_dataset = fine_tuning_dict[str(task)] - dict_results_task_sample_iter_wise[task_string] = {} - updated_reward = tune_and_test(task, model_meta, baseline, epoch, val_dataset, problem, tb_logger, opts, fine_tuning_dataset, dict_results_task_sample_iter_wise[task_string]) - total_reward_tasks += updated_reward - - if not os.path.exists("results_all/test"): - os.makedirs("results_all/test") - with open("results_all/test/TEST_" + opts.test_result_pickle_file, 'wb') as handle: - pickle.dump(dict_results_task_sample_iter_wise, handle, protocol=pickle.HIGHEST_PROTOCOL) - - print("EPOCH ID ", opts.load_path) - avg_rewards_val = 
total_reward_tasks/len(tasks_list) - print("Avg reward all tasks after fine tune ", ) - tune_sequence.append(avg_rewards_val) - - for index, x in enumerate(tune_sequence): - print(index, x.data) - - -if __name__ == "__main__": - run(get_options()) diff --git a/AM/generate_data.py b/AM/generate_data.py deleted file mode 100644 index f627e81..0000000 --- a/AM/generate_data.py +++ /dev/null @@ -1,167 +0,0 @@ -import argparse -import os -import numpy as np -from utils.data_utils import check_extension, save_dataset - - -def generate_tsp_data(dataset_size, tsp_size): - return np.random.uniform(size=(dataset_size, tsp_size, 2)).tolist() - - -def generate_vrp_data(dataset_size, vrp_size): - CAPACITIES = { - 10: 20., - 20: 30., - 50: 40., - 100: 50. - } - return list(zip( - np.random.uniform(size=(dataset_size, 2)).tolist(), # Depot location - np.random.uniform(size=(dataset_size, vrp_size, 2)).tolist(), # Node locations - np.random.randint(1, 10, size=(dataset_size, vrp_size)).tolist(), # Demand, uniform integer 1 ... 9 - np.full(dataset_size, CAPACITIES[vrp_size]).tolist() # Capacity, same for whole dataset - )) - - -def generate_op_data(dataset_size, op_size, prize_type='const'): - depot = np.random.uniform(size=(dataset_size, 2)) - loc = np.random.uniform(size=(dataset_size, op_size, 2)) - - # Methods taken from Fischetti et al. 1998 - if prize_type == 'const': - prize = np.ones((dataset_size, op_size)) - elif prize_type == 'unif': - prize = (1 + np.random.randint(0, 100, size=(dataset_size, op_size))) / 100. - else: # Based on distance to depot - assert prize_type == 'dist' - prize_ = np.linalg.norm(depot[:, None, :] - loc, axis=-1) - prize = (1 + (prize_ / prize_.max(axis=-1, keepdims=True) * 99).astype(int)) / 100. - - # Max length is approximately half of optimal TSP tour, such that half (a bit more) of the nodes can be visited - # which is maximally difficult as this has the largest number of possibilities - MAX_LENGTHS = { - 20: 2., - 50: 3., - 100: 4. - } - - return list(zip( - depot.tolist(), - loc.tolist(), - prize.tolist(), - np.full(dataset_size, MAX_LENGTHS[op_size]).tolist() # Capacity, same for whole dataset - )) - - -def generate_pctsp_data(dataset_size, pctsp_size, penalty_factor=3): - depot = np.random.uniform(size=(dataset_size, 2)) - loc = np.random.uniform(size=(dataset_size, pctsp_size, 2)) - - # For the penalty to make sense it should be not too large (in which case all nodes will be visited) nor too small - # so we want the objective term to be approximately equal to the length of the tour, which we estimate with half - # of the nodes by half of the tour length (which is very rough but similar to op) - # This means that the sum of penalties for all nodes will be approximately equal to the tour length (on average) - # The expected total (uniform) penalty of half of the nodes (since approx half will be visited by the constraint) - # is (n / 2) / 2 = n / 4 so divide by this means multiply by 4 / n, - # However instead of 4 we use penalty_factor (3 works well) so we can make them larger or smaller - MAX_LENGTHS = { - 20: 2., - 50: 3., - 100: 4. 
- } - penalty_max = MAX_LENGTHS[pctsp_size] * (penalty_factor) / float(pctsp_size) - penalty = np.random.uniform(size=(dataset_size, pctsp_size)) * penalty_max - - # Take uniform prizes - # Now expectation is 0.5 so expected total prize is n / 2, we want to force to visit approximately half of the nodes - # so the constraint will be that total prize >= (n / 2) / 2 = n / 4 - # equivalently, we divide all prizes by n / 4 and the total prize should be >= 1 - deterministic_prize = np.random.uniform(size=(dataset_size, pctsp_size)) * 4 / float(pctsp_size) - - # In the deterministic setting, the stochastic_prize is not used and the deterministic prize is known - # In the stochastic setting, the deterministic prize is the expected prize and is known up front but the - # stochastic prize is only revealed once the node is visited - # Stochastic prize is between (0, 2 * expected_prize) such that E(stochastic prize) = E(deterministic_prize) - stochastic_prize = np.random.uniform(size=(dataset_size, pctsp_size)) * deterministic_prize * 2 - - return list(zip( - depot.tolist(), - loc.tolist(), - penalty.tolist(), - deterministic_prize.tolist(), - stochastic_prize.tolist() - )) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--filename", help="Filename of the dataset to create (ignores datadir)") - parser.add_argument("--data_dir", default='data', help="Create datasets in data_dir/problem (default 'data')") - parser.add_argument("--name", type=str, required=True, help="Name to identify dataset") - parser.add_argument("--problem", type=str, default='all', - help="Problem, 'tsp', 'vrp', 'pctsp' or 'op_const', 'op_unif' or 'op_dist'" - " or 'all' to generate all") - parser.add_argument('--data_distribution', type=str, default='all', - help="Distributions to generate for problem, default 'all'.") - - parser.add_argument("--dataset_size", type=int, default=10000, help="Size of the dataset") - parser.add_argument('--graph_sizes', type=int, nargs='+', default=[20, 50, 100], - help="Sizes of problem instances (default 20, 50, 100)") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument('--seed', type=int, default=1234, help="Random seed") - - opts = parser.parse_args() - - assert opts.filename is None or (len(opts.problems) == 1 and len(opts.graph_sizes) == 1), \ - "Can only specify filename when generating a single dataset" - - distributions_per_problem = { - 'tsp': [None], - 'vrp': [None], - 'pctsp': [None], - 'op': ['const', 'unif', 'dist'] - } - if opts.problem == 'all': - problems = distributions_per_problem - else: - problems = { - opts.problem: - distributions_per_problem[opts.problem] - if opts.data_distribution == 'all' - else [opts.data_distribution] - } - - for problem, distributions in problems.items(): - for distribution in distributions or [None]: - for graph_size in opts.graph_sizes: - - datadir = os.path.join(opts.data_dir, problem) - os.makedirs(datadir, exist_ok=True) - - if opts.filename is None: - filename = os.path.join(datadir, "{}{}{}_{}_seed{}.pkl".format( - problem, - "_{}".format(distribution) if distribution is not None else "", - graph_size, opts.name, opts.seed)) - else: - filename = check_extension(opts.filename) - - assert opts.f or not os.path.isfile(check_extension(filename)), \ - "File already exists! Try running with -f option to overwrite." 
- - np.random.seed(opts.seed) - if problem == 'tsp': - dataset = generate_tsp_data(opts.dataset_size, graph_size) - elif problem == 'vrp': - dataset = generate_vrp_data( - opts.dataset_size, graph_size) - elif problem == 'pctsp': - dataset = generate_pctsp_data(opts.dataset_size, graph_size) - elif problem == "op": - dataset = generate_op_data(opts.dataset_size, graph_size, prize_type=distribution) - else: - assert False, "Unknown problem: {}".format(problem) - - print(dataset[0]) - - save_dataset(dataset, filename) diff --git a/AM/generate_dataset.py b/AM/generate_dataset.py deleted file mode 100644 index 5d0ffe4..0000000 --- a/AM/generate_dataset.py +++ /dev/null @@ -1,150 +0,0 @@ -from utils.functions import seed_everything, load_problem -from utils.data_utils import save_dataset - -def generate_train_task(opts): - tasks_list = [] - if opts.variation_type == 'size': - graph_sizes = [10, 20, 30, 50] - if opts.problem == "tsp": - pass - # if opts.graph_size_continuous: - # graph_sizes = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - elif opts.variation_type == 'scale': - scales = [[0, 1], [0, 2], [0, 4]] - for scale in scales: - task_prop = {'graph_size': opts.graph_size, 'low': scale[0], 'high': scale[1], 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/SCALE_{}_{}-{}_val_farthest_insertion.pkl".format(task_prop['graph_size'], int(task_prop['low']), int(task_prop['high'])) - tasks_list.append(task_prop) - elif opts.variation_type == 'dist': - for i in [1, 2, 5]: - num_modes = i - task_prop = {'graph_size': opts.graph_size, 'low': 0, 'high': 1, 'num_modes': num_modes, 'dist': 'gmm', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/GRID_{}_modes_{}_val_farthest_insertion.pkl".format(task_prop['graph_size'], task_prop['num_modes']) - tasks_list.append(task_prop) - elif opts.variation_type == 'mix_dist_size': - for i in [1, 2, 5]: - for cur_graph_size in [20, 30, 50]: - num_modes = i - task_prop = {'graph_size': cur_graph_size, 'low': 0, 'high': 1, 'num_modes': num_modes, 'dist': 'gmm', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/GRID_{}_modes_{}_val_farthest_insertion.pkl".format(task_prop['graph_size'], task_prop['num_modes']) - tasks_list.append(task_prop) - elif opts.variation_type == 'cap_vrp': - train_tasks = [int(tsk) for tsk in (opts.train_tasks).split('_')] - print("train_tasks ", train_tasks) - for i in train_tasks: - vrp_capacity = i - task_prop = {'graph_size': opts.graph_size, 'vrp_capacity': vrp_capacity, 'low': 0, 'high': 1, 'variation_type': opts.variation_type} - tasks_list.append(task_prop) - elif opts.variation_type == "adv": - for i in range(3): - task_prop = {'graph_size': opts.graph_size, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': 'adv'} - tasks_list.append(task_prop) - elif opts.variation_type == "size_uniform": - graph_sizes = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': 
opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - elif opts.variation_type == "size_two_cluster": - graph_sizes = [5, 6, 7, 8, 9, 91, 92, 93, 94, 95] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - elif opts.variation_type == "size_imbalanced": - graph_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 95] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - elif opts.variation_type == "size_increasing_order": - graph_sizes = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - elif opts.variation_type == "size_decreasing_order": - graph_sizes = [95, 85, 75, 65, 55, 45, 35, 25, 15, 5] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['insertion_heuristic_cost_file'] = "results_all/validation/gsize_{}_val_farthest_insertion.pkl".format(task_prop['graph_size']) - tasks_list.append(task_prop) - else: - print("Invalid task distribution: opts.variation_type!") - exit(0) - - return tasks_list - - -def generate_test_task(opts, test_seed=1234, fine_tune_seed=9999): - tasks_list = [] - if opts.variation_type == 'size': - graph_sizes = [10, 30, 50, 80, 100, 120, 150, 200] - for g_sizes in graph_sizes: - task_prop = {'graph_size': g_sizes, 'low': 0, 'high': 1, 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['test_dataset'] = "{}{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], "test", test_seed) - task_prop['fine_tuning_dataset'] = "{}{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], "fine_tuning", fine_tune_seed) - tasks_list.append(task_prop) - elif opts.variation_type == 'scale': - scales = [[0.0, 3.0], [0.0, 5.0], [0.0, 8.0], [0.0, 10.0]] - for scale in scales: - task_prop = {'graph_size': opts.graph_size, 'low': scale[0], 'high': scale[1], 'dist': 'uniform', 'variation_type': opts.variation_type} - task_prop['test_dataset'] = "{}__size_{}_scale_{}_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], task_prop['low'], task_prop['high'], "test", test_seed) - task_prop['fine_tuning_dataset'] = "{}__size_{}_scale_{}_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], task_prop['low'], task_prop['high'], "fine_tuning", fine_tune_seed) - tasks_list.append(task_prop) - elif opts.variation_type == 'dist': - for i in [3, 8]: - num_modes = i - task_prop = {'graph_size': opts.graph_size, 'low': 0, 'high': 1, 'num_modes': num_modes, 'dist': 'gmm', 'variation_type': opts.variation_type} - task_prop['test_dataset'] = 
"{}__size_{}_distribution_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(num_modes), "test", test_seed) - task_prop['fine_tuning_dataset'] = "{}__size_{}_distribution_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(num_modes), "fine_tuning", fine_tune_seed) - tasks_list.append(task_prop) - elif opts.variation_type == 'cap_vrp': - for i in [20, 50]: - vrp_capacity = i - task_prop = {'graph_size': opts.graph_size, 'vrp_capacity': vrp_capacity, 'low': 0, 'high': 1, 'variation_type': opts.variation_type} - task_prop['test_dataset'] = "{}__size_{}_cap_vrp_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(vrp_capacity), "test", test_seed) - task_prop['fine_tuning_dataset'] = "{}__size_{}_cap_vrp_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(vrp_capacity), "fine_tuning", fine_tune_seed) - tasks_list.append(task_prop) - elif opts.variation_type == 'mix_dist_size': - for i in [3, 5, 8]: - num_modes = i - task_prop = {'graph_size': opts.graph_size, 'low': 0, 'high': 1, 'num_modes': num_modes, 'dist': 'gmm', 'variation_type': opts.variation_type} - task_prop['test_dataset'] = "{}__size_{}_distribution_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(num_modes), "test", test_seed) - task_prop['fine_tuning_dataset'] = "{}__size_{}_distribution_{}_{}_seed{}.pkl".format(opts.problem, task_prop['graph_size'], str(num_modes), "fine_tuning", fine_tune_seed) - tasks_list.append(task_prop) - else: - print("Invalid task distribution: opts.variation_type!") - exit(0) - - return tasks_list - - -if __name__ == "__main__": - from options import get_options - opts = get_options() - # opts.seed = 2023 - # opts.val_size = 10000 - - opts.seed = 2022 - opts.fine_tune_size = 3000 - - opts.problem = "tsp" - opts.variation_type = "size" - opts.graph_size = 40 - - seed_everything(opts.seed) - problem = load_problem(opts.problem) - - tasks_list = generate_test_task(opts, test_seed=2023, fine_tune_seed=2022) - for task in tasks_list: - # test_set = problem.make_dataset(num_samples=opts.val_size, filename=None, distribution=None, task=task) - # save_dataset(test_set, "./new_data/size/tsp/" + task['test_dataset']) - # print(task['test_dataset'], len(test_set)) - test_set = problem.make_dataset(num_samples=opts.fine_tune_size, filename=None, distribution=None, task=task) - save_dataset(test_set, "./new_data/size/tsp/" + task['fine_tuning_dataset']) - print(task['fine_tuning_dataset'], len(test_set)) diff --git a/AM/nets/__init__.py b/AM/nets/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/nets/attention_model.py b/AM/nets/attention_model.py deleted file mode 100644 index d7eb0e4..0000000 --- a/AM/nets/attention_model.py +++ /dev/null @@ -1,534 +0,0 @@ -import torch -from torch import nn -from torch.utils.checkpoint import checkpoint -import math -from typing import NamedTuple -from utils.tensor_functions import compute_in_batches - -from nets.graph_encoder import GraphAttentionEncoder -from torch.nn import DataParallel -from utils.beam_search import CachedLookup -from utils.functions import sample_many - - -def set_decode_type(model, decode_type): - if isinstance(model, DataParallel): - model = model.module - model.set_decode_type(decode_type) - - -class AttentionModelFixed(NamedTuple): - """ - Context for AttentionModel decoder that is fixed during decoding so can be precomputed/cached - This class allows for efficient indexing of multiple Tensors at once - """ - node_embeddings: torch.Tensor - 
context_node_projected: torch.Tensor - glimpse_key: torch.Tensor - glimpse_val: torch.Tensor - logit_key: torch.Tensor - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) - return AttentionModelFixed( - node_embeddings=self.node_embeddings[key], - context_node_projected=self.context_node_projected[key], - glimpse_key=self.glimpse_key[:, key], # dim 0 are the heads - glimpse_val=self.glimpse_val[:, key], # dim 0 are the heads - logit_key=self.logit_key[key] - ) - - -class AttentionModel(nn.Module): - - def __init__(self, - embedding_dim, - hidden_dim, - problem, - n_encode_layers=2, - tanh_clipping=10., - mask_inner=True, - mask_logits=True, - normalization='batch', - n_heads=8, - checkpoint_encoder=False, - shrink_size=None): - super(AttentionModel, self).__init__() - - self.embedding_dim = embedding_dim - self.hidden_dim = hidden_dim - self.n_encode_layers = n_encode_layers - self.decode_type = None - self.temp = 1.0 - self.allow_partial = problem.NAME == 'sdvrp' - self.is_vrp = problem.NAME == 'cvrp' or problem.NAME == 'sdvrp' - self.is_orienteering = problem.NAME == 'op' - self.is_pctsp = problem.NAME == 'pctsp' - - self.tanh_clipping = tanh_clipping - - self.mask_inner = mask_inner - self.mask_logits = mask_logits - - self.problem = problem - self.n_heads = n_heads - self.checkpoint_encoder = checkpoint_encoder - self.shrink_size = shrink_size - - # Problem specific context parameters (placeholder and step context dimension) - if self.is_vrp or self.is_orienteering or self.is_pctsp: - # Embedding of last node + remaining_capacity / remaining length / remaining prize to collect - step_context_dim = embedding_dim + 1 - - if self.is_pctsp: - node_dim = 4 # x, y, expected_prize, penalty - else: - node_dim = 3 # x, y, demand / prize - - # Special embedding projection for depot node - self.init_embed_depot = nn.Linear(2, embedding_dim) - if self.is_vrp and self.allow_partial: # Need to include the demand if split delivery allowed - self.project_node_step = nn.Linear(1, 3 * embedding_dim, bias=False) - else: # TSP - assert problem.NAME == "tsp", "Unsupported problem: {}".format(problem.NAME) - step_context_dim = 2 * embedding_dim # Embedding of first and last node - node_dim = 2 # x, y - # Learned input symbols for first action - self.W_placeholder = nn.Parameter(torch.Tensor(2 * embedding_dim)) - self.W_placeholder.data.uniform_(-1, 1) # Placeholder should be in range of activations - - self.init_embed = nn.Linear(node_dim, embedding_dim) - - self.embedder = GraphAttentionEncoder( - n_heads=n_heads, - embed_dim=embedding_dim, - n_layers=self.n_encode_layers, - normalization=normalization - ) - - # For each node we compute (glimpse key, glimpse value, logit key) so 3 * embedding_dim - self.project_node_embeddings = nn.Linear(embedding_dim, 3 * embedding_dim, bias=False) - self.project_fixed_context = nn.Linear(embedding_dim, embedding_dim, bias=False) - self.project_step_context = nn.Linear(step_context_dim, embedding_dim, bias=False) - assert embedding_dim % n_heads == 0 - # Note n_heads * val_dim == embedding_dim so input to project_out is embedding_dim - self.project_out = nn.Linear(embedding_dim, embedding_dim, bias=False) - - def set_decode_type(self, decode_type, temp=None): - self.decode_type = decode_type - if temp is not None: # Do not change temperature if not provided - self.temp = temp - - def forward(self, input, return_pi=False, return_emb=False): - """ - :param input: (batch_size, graph_size, node_dim) input node features or dictionary with 
multiple tensors - :param return_pi: whether to return the output sequences, this is optional as it is not compatible with - using DataParallel as the results may be of different lengths on different GPUs - :return: - """ - self.input = input - if self.checkpoint_encoder and self.training: # Only checkpoint if we need gradients - embeddings, _ = checkpoint(self.embedder, self._init_embed(input)) - else: - embeddings, _ = self.embedder(self._init_embed(input)) - - _log_p, pi = self._inner(input, embeddings, return_emb) - - cost, mask = self.problem.get_costs(input, pi) - # Log likelyhood is calculated within the model since returning it per action does not work well with - # DataParallel since sequences can be of different lengths - ll = self._calc_log_likelihood(_log_p, pi, mask) - - if (return_emb): - return self.embd_graph_mean_to_return - - if return_pi: - return cost, ll, pi - - return cost, ll - - def beam_search(self, *args, **kwargs): - return self.problem.beam_search(*args, **kwargs, model=self) - - def precompute_fixed(self, input): - embeddings, _ = self.embedder(self._init_embed(input)) - # Use a CachedLookup such that if we repeatedly index this object with the same index we only need to do - # the lookup once... this is the case if all elements in the batch have maximum batch size - return CachedLookup(self._precompute(embeddings)) - - def propose_expansions(self, beam, fixed, expand_size=None, normalize=False, max_calc_batch_size=4096): - # First dim = batch_size * cur_beam_size - log_p_topk, ind_topk = compute_in_batches( - lambda b: self._get_log_p_topk(fixed[b.ids], b.state, k=expand_size, normalize=normalize), - max_calc_batch_size, beam, n=beam.size() - ) - - assert log_p_topk.size(1) == 1, "Can only have single step" - # This will broadcast, calculate log_p (score) of expansions - score_expand = beam.score[:, None] + log_p_topk[:, 0, :] - - # We flatten the action as we need to filter and this cannot be done in 2d - flat_action = ind_topk.view(-1) - flat_score = score_expand.view(-1) - flat_feas = flat_score > -1e10 # != -math.inf triggers - - # Parent is row idx of ind_topk, can be found by enumerating elements and dividing by number of columns - flat_parent = torch.arange(flat_action.size(-1), out=flat_action.new()) / ind_topk.size(-1) - - # Filter infeasible - feas_ind_2d = torch.nonzero(flat_feas) - - if len(feas_ind_2d) == 0: - # Too bad, no feasible expansions at all :( - return None, None, None - - feas_ind = feas_ind_2d[:, 0] - - return flat_parent[feas_ind], flat_action[feas_ind], flat_score[feas_ind] - - def _calc_log_likelihood(self, _log_p, a, mask): - - # Get log_p corresponding to selected actions - log_p = _log_p.gather(2, a.unsqueeze(-1)).squeeze(-1) - - # Optional: mask out actions irrelevant to objective so they do not get reinforced - if mask is not None: - log_p[mask] = 0 - - assert (log_p > -1000).data.all(), "Logprobs should not be -inf, check sampling procedure!" 
- - # Calculate log_likelihood - return log_p.sum(1) - - def _init_embed(self, input): - - if self.is_vrp or self.is_orienteering or self.is_pctsp: - if self.is_vrp: - features = ('demand',) - elif self.is_orienteering: - features = ('prize',) - else: - assert self.is_pctsp - features = ('deterministic_prize', 'penalty') - return torch.cat( - ( - self.init_embed_depot(input['depot'])[:, None, :], - self.init_embed(torch.cat(( - input['loc'], - *(input[feat][:, :, None] for feat in features) - ), -1)) - ), - 1 - ) - # TSP - return self.init_embed(input) - - def _inner(self, input, embeddings, return_emb=False): - - outputs = [] - sequences = [] - - state = self.problem.make_state(input) - - # Compute keys, values for the glimpse and keys for the logits once as they can be reused in every step - fixed = self._precompute(embeddings, return_emb=return_emb) - - batch_size = state.ids.size(0) - - # Perform decoding steps - i = 0 - while not (self.shrink_size is None and state.all_finished()): - - if self.shrink_size is not None: - unfinished = torch.nonzero(state.get_finished() == 0) - if len(unfinished) == 0: - break - unfinished = unfinished[:, 0] - # Check if we can shrink by at least shrink_size and if this leaves at least 16 - # (otherwise batch norm will not work well and it is inefficient anyway) - if 16 <= len(unfinished) <= state.ids.size(0) - self.shrink_size: - # Filter states - state = state[unfinished] - fixed = fixed[unfinished] - - log_p, mask = self._get_log_p(fixed, state) - - # Select the indices of the next nodes in the sequences, result (batch_size) long - selected = self._select_node(log_p.exp()[:, 0, :], mask[:, 0, :]) # Squeeze out steps dimension - - state = state.update(selected) - - # Now make log_p, selected desired output size by 'unshrinking' - if self.shrink_size is not None and state.ids.size(0) < batch_size: - log_p_, selected_ = log_p, selected - log_p = log_p_.new_zeros(batch_size, *log_p_.size()[1:]) - selected = selected_.new_zeros(batch_size) - - log_p[state.ids[:, 0]] = log_p_ - selected[state.ids[:, 0]] = selected_ - - # Collect output of step - outputs.append(log_p[:, 0, :]) - sequences.append(selected) - - i += 1 - - # Collected lists, return Tensor - return torch.stack(outputs, 1), torch.stack(sequences, 1) - - def sample_many(self, input, batch_rep=1, iter_rep=1): - """ - :param input: (batch_size, graph_size, node_dim) input node features - :return: - """ - # Bit ugly but we need to pass the embeddings as well. 
- # Making a tuple will not work with the problem.get_cost function - return sample_many( - lambda input: self._inner(*input), # Need to unpack tuple into arguments - lambda input, pi: self.problem.get_costs(input[0], pi), # Don't need embeddings as input to get_costs - (input, self.embedder(self._init_embed(input))[0]), # Pack input with embeddings (additional input) - batch_rep, iter_rep - ) - - def _select_node(self, probs, mask): - - assert (probs == probs).all(), "Probs should not contain any nans" - - if self.decode_type == "greedy": - _, selected = probs.max(1) - assert not mask.gather(1, selected.unsqueeze( - -1)).data.any(), "Decode greedy: infeasible action has maximum probability" - - elif self.decode_type == "sampling": - selected = probs.multinomial(1).squeeze(1) - - # Check if sampling went OK, can go wrong due to bug on GPU - # See https://discuss.pytorch.org/t/bad-behavior-of-multinomial-function/10232 - while mask.gather(1, selected.unsqueeze(-1)).data.any(): - print('Sampled bad values, resampling!') - selected = probs.multinomial(1).squeeze(1) - - else: - assert False, "Unknown decode type" - return selected - - def _precompute(self, embeddings, num_steps=1, return_emb=False): - - # The fixed context projection of the graph embedding is calculated only once for efficiency - graph_embed = embeddings.mean(1) - - if (return_emb == True): - self.embd_graph_mean_to_return = graph_embed - - # fixed context = (batch_size, 1, embed_dim) to make broadcastable with parallel timesteps - fixed_context = self.project_fixed_context(graph_embed)[:, None, :] - - # The projection of the node embeddings for the attention is calculated once up front - glimpse_key_fixed, glimpse_val_fixed, logit_key_fixed = \ - self.project_node_embeddings(embeddings[:, None, :, :]).chunk(3, dim=-1) - - # No need to rearrange key for logit as there is a single head - fixed_attention_node_data = ( - self._make_heads(glimpse_key_fixed, num_steps), - self._make_heads(glimpse_val_fixed, num_steps), - logit_key_fixed.contiguous() - ) - return AttentionModelFixed(embeddings, fixed_context, *fixed_attention_node_data) - - def _get_log_p_topk(self, fixed, state, k=None, normalize=True): - log_p, _ = self._get_log_p(fixed, state, normalize=normalize) - - # Return topk - if k is not None and k < log_p.size(-1): - return log_p.topk(k, -1) - - # Return all, note different from torch.topk this does not give error if less than k elements along dim - return ( - log_p, - torch.arange(log_p.size(-1), device=log_p.device, dtype=torch.int64).repeat(log_p.size(0), 1)[:, None, :] - ) - - def _get_log_p(self, fixed, state, normalize=True): - - # Compute query = context node embedding - query = fixed.context_node_projected + \ - self.project_step_context(self._get_parallel_step_context(fixed.node_embeddings, state)) - - # Compute keys and values for the nodes - glimpse_K, glimpse_V, logit_K = self._get_attention_node_data(fixed, state) - - # Compute the mask - mask = state.get_mask() - - # Compute logits (unnormalized log_p) - log_p, glimpse = self._one_to_many_logits(query, glimpse_K, glimpse_V, logit_K, mask) - - if normalize: - log_p1 = torch.log_softmax(log_p / self.temp, dim=-1) - - # used for debugging - if torch.isnan(log_p1).any(): - torch.set_printoptions(profile="full") - index = ((log_p1 != log_p1).nonzero(as_tuple=True)[0]).tolist() - index = set(index) - print(">> Occur nan problem on index: {}".format(index)) - torch.save( - { - 'input': self.input, - 'model': self.state_dict(), - 'log_p': log_p, - 'log_p1': log_p1, 
- 'mask': mask, - }, "./debug.pt") - - assert not torch.isnan(log_p1).any() - - return log_p1, mask - - def _get_parallel_step_context(self, embeddings, state, from_depot=False): - """ - Returns the context per step, optionally for multiple steps at once (for efficient evaluation of the model) - - :param embeddings: (batch_size, graph_size, embed_dim) - :param prev_a: (batch_size, num_steps) - :param first_a: Only used when num_steps = 1, action of first step or None if first step - :return: (batch_size, num_steps, context_dim) - """ - - current_node = state.get_current_node() - batch_size, num_steps = current_node.size() - - if self.is_vrp: - # Embedding of previous node + remaining capacity - if from_depot: - # 1st dimension is node idx, but we do not squeeze it since we want to insert step dimension - # i.e. we actually want embeddings[:, 0, :][:, None, :] which is equivalent - return torch.cat( - ( - embeddings[:, 0:1, :].expand(batch_size, num_steps, embeddings.size(-1)), - # used capacity is 0 after visiting depot - self.problem.VEHICLE_CAPACITY - torch.zeros_like(state.used_capacity[:, :, None]) - ), - -1 - ) - else: - return torch.cat( - ( - torch.gather( - embeddings, - 1, - current_node.contiguous() - .view(batch_size, num_steps, 1) - .expand(batch_size, num_steps, embeddings.size(-1)) - ).view(batch_size, num_steps, embeddings.size(-1)), - self.problem.VEHICLE_CAPACITY - state.used_capacity[:, :, None] - ), - -1 - ) - elif self.is_orienteering or self.is_pctsp: - return torch.cat( - ( - torch.gather( - embeddings, - 1, - current_node.contiguous() - .view(batch_size, num_steps, 1) - .expand(batch_size, num_steps, embeddings.size(-1)) - ).view(batch_size, num_steps, embeddings.size(-1)), - ( - state.get_remaining_length()[:, :, None] - if self.is_orienteering - else state.get_remaining_prize_to_collect()[:, :, None] - ) - ), - -1 - ) - else: # TSP - - if num_steps == 1: # We need to special case if we have only 1 step, may be the first or not - if state.i.item() == 0: - # First and only step, ignore prev_a (this is a placeholder) - return self.W_placeholder[None, None, :].expand(batch_size, 1, self.W_placeholder.size(-1)) - else: - return embeddings.gather( - 1, - torch.cat((state.first_a, current_node), 1)[:, :, None].expand(batch_size, 2, - embeddings.size(-1)) - ).view(batch_size, 1, -1) - # More than one step, assume always starting with first - embeddings_per_step = embeddings.gather( - 1, - current_node[:, 1:, None].expand(batch_size, num_steps - 1, embeddings.size(-1)) - ) - return torch.cat(( - # First step placeholder, cat in dim 1 (time steps) - self.W_placeholder[None, None, :].expand(batch_size, 1, self.W_placeholder.size(-1)), - # Second step, concatenate embedding of first with embedding of current/previous (in dim 2, context dim) - torch.cat(( - embeddings_per_step[:, 0:1, :].expand(batch_size, num_steps - 1, embeddings.size(-1)), - embeddings_per_step - ), 2) - ), 1) - - def _one_to_many_logits(self, query, glimpse_K, glimpse_V, logit_K, mask): - - batch_size, num_steps, embed_dim = query.size() - key_size = val_size = embed_dim // self.n_heads - - # Compute the glimpse, rearrange dimensions so the dimensions are (n_heads, batch_size, num_steps, 1, key_size) - glimpse_Q = query.view(batch_size, num_steps, self.n_heads, 1, key_size).permute(2, 0, 1, 3, 4) - - # Batch matrix multiplication to compute compatibilities (n_heads, batch_size, num_steps, graph_size) - compatibility = torch.matmul(glimpse_Q, glimpse_K.transpose(-2, -1)) / math.sqrt(glimpse_Q.size(-1)) - if 
self.mask_inner: - assert self.mask_logits, "Cannot mask inner without masking logits" - compatibility[mask[None, :, :, None, :].expand_as(compatibility)] = -math.inf - - # Batch matrix multiplication to compute heads (n_heads, batch_size, num_steps, val_size) - heads = torch.matmul(torch.softmax(compatibility, dim=-1), glimpse_V) - - # Project to get glimpse/updated context node embedding (batch_size, num_steps, embedding_dim) - glimpse = self.project_out( - heads.permute(1, 2, 3, 0, 4).contiguous().view(-1, num_steps, 1, self.n_heads * val_size)) - - # Now projecting the glimpse is not needed since this can be absorbed into project_out - # final_Q = self.project_glimpse(glimpse) - final_Q = glimpse - # Batch matrix multiplication to compute logits (batch_size, num_steps, graph_size) - # logits = 'compatibility' - logits = torch.matmul(final_Q, logit_K.transpose(-2, -1)).squeeze(-2) / math.sqrt(final_Q.size(-1)) - - # From the logits compute the probabilities by clipping, masking and softmax - if self.tanh_clipping > 0: - logits = torch.tanh(logits) * self.tanh_clipping - if self.mask_logits: - logits[mask] = -math.inf - - return logits, glimpse.squeeze(-2) - - def _get_attention_node_data(self, fixed, state): - - if self.is_vrp and self.allow_partial: - # Need to provide information of how much each node has already been served - # Clone demands as they are needed by the backprop whereas they are updated later - glimpse_key_step, glimpse_val_step, logit_key_step = \ - self.project_node_step(state.demands_with_depot[:, :, :, None].clone()).chunk(3, dim=-1) - - # Projection of concatenation is equivalent to addition of projections but this is more efficient - return ( - fixed.glimpse_key + self._make_heads(glimpse_key_step), - fixed.glimpse_val + self._make_heads(glimpse_val_step), - fixed.logit_key + logit_key_step, - ) - - # TSP or VRP without split delivery - return fixed.glimpse_key, fixed.glimpse_val, fixed.logit_key - - def _make_heads(self, v, num_steps=None): - assert num_steps is None or v.size(1) == 1 or v.size(1) == num_steps - - return ( - v.contiguous().view(v.size(0), v.size(1), v.size(2), self.n_heads, -1) - .expand(v.size(0), v.size(1) if num_steps is None else num_steps, v.size(2), self.n_heads, -1) - .permute(3, 0, 1, 2, 4) # (n_heads, batch_size, num_steps, graph_size, head_dim) - ) diff --git a/AM/nets/critic_network.py b/AM/nets/critic_network.py deleted file mode 100644 index 73d8926..0000000 --- a/AM/nets/critic_network.py +++ /dev/null @@ -1,40 +0,0 @@ -from torch import nn -from nets.graph_encoder import GraphAttentionEncoder - - -class CriticNetwork(nn.Module): - - def __init__( - self, - input_dim, - embedding_dim, - hidden_dim, - n_layers, - encoder_normalization - ): - super(CriticNetwork, self).__init__() - - self.hidden_dim = hidden_dim - - self.encoder = GraphAttentionEncoder( - node_dim=input_dim, - n_heads=8, - embed_dim=embedding_dim, - n_layers=n_layers, - normalization=encoder_normalization - ) - - self.value_head = nn.Sequential( - nn.Linear(embedding_dim, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, 1) - ) - - def forward(self, inputs): - """ - - :param inputs: (batch_size, graph_size, input_dim) - :return: - """ - _, graph_embeddings = self.encoder(inputs) - return self.value_head(graph_embeddings) diff --git a/AM/nets/graph_encoder.py b/AM/nets/graph_encoder.py deleted file mode 100644 index e3a4cf8..0000000 --- a/AM/nets/graph_encoder.py +++ /dev/null @@ -1,215 +0,0 @@ -import torch -import numpy as np -from torch import nn -import math - - 
-class SkipConnection(nn.Module): - - def __init__(self, module): - super(SkipConnection, self).__init__() - self.module = module - - def forward(self, input): - return input + self.module(input) - - -class MultiHeadAttention(nn.Module): - def __init__( - self, - n_heads, - input_dim, - embed_dim, - val_dim=None, - key_dim=None - ): - super(MultiHeadAttention, self).__init__() - - if val_dim is None: - val_dim = embed_dim // n_heads - if key_dim is None: - key_dim = val_dim - - self.n_heads = n_heads - self.input_dim = input_dim - self.embed_dim = embed_dim - self.val_dim = val_dim - self.key_dim = key_dim - - self.norm_factor = 1 / math.sqrt(key_dim) # See Attention is all you need - - self.W_query = nn.Parameter(torch.Tensor(n_heads, input_dim, key_dim)) - self.W_key = nn.Parameter(torch.Tensor(n_heads, input_dim, key_dim)) - self.W_val = nn.Parameter(torch.Tensor(n_heads, input_dim, val_dim)) - - self.W_out = nn.Parameter(torch.Tensor(n_heads, val_dim, embed_dim)) - - self.init_parameters() - - def init_parameters(self): - - for param in self.parameters(): - stdv = 1. / math.sqrt(param.size(-1)) - param.data.uniform_(-stdv, stdv) - - def forward(self, q, h=None, mask=None): - """ - - :param q: queries (batch_size, n_query, input_dim) - :param h: data (batch_size, graph_size, input_dim) - :param mask: mask (batch_size, n_query, graph_size) or viewable as that (i.e. can be 2 dim if n_query == 1) - Mask should contain 1 if attention is not possible (i.e. mask is negative adjacency) - :return: - """ - if h is None: - h = q # compute self-attention - - # h should be (batch_size, graph_size, input_dim) - batch_size, graph_size, input_dim = h.size() - n_query = q.size(1) - assert q.size(0) == batch_size - assert q.size(2) == input_dim - assert input_dim == self.input_dim, "Wrong embedding dimension of input" - - hflat = h.contiguous().view(-1, input_dim) - qflat = q.contiguous().view(-1, input_dim) - - # last dimension can be different for keys and values - shp = (self.n_heads, batch_size, graph_size, -1) - shp_q = (self.n_heads, batch_size, n_query, -1) - - # Calculate queries, (n_heads, n_query, graph_size, key/val_size) - Q = torch.matmul(qflat, self.W_query).view(shp_q) - # Calculate keys and values (n_heads, batch_size, graph_size, key/val_size) - K = torch.matmul(hflat, self.W_key).view(shp) - V = torch.matmul(hflat, self.W_val).view(shp) - - # Calculate compatibility (n_heads, batch_size, n_query, graph_size) - compatibility = self.norm_factor * torch.matmul(Q, K.transpose(2, 3)) - - # Optionally apply mask to prevent attention - if mask is not None: - mask = mask.view(1, batch_size, n_query, graph_size).expand_as(compatibility) - compatibility[mask] = -np.inf - - attn = torch.softmax(compatibility, dim=-1) - - # If there are nodes with no neighbours then softmax returns nan so we fix them to 0 - if mask is not None: - attnc = attn.clone() - attnc[mask] = 0 - attn = attnc - - heads = torch.matmul(attn, V) - - out = torch.mm( - heads.permute(1, 2, 0, 3).contiguous().view(-1, self.n_heads * self.val_dim), - self.W_out.view(-1, self.embed_dim) - ).view(batch_size, n_query, self.embed_dim) - - # Alternative: - # headst = heads.transpose(0, 1) # swap the dimensions for batch and heads to align it for the matmul - # # proj_h = torch.einsum('bhni,hij->bhnj', headst, self.W_out) - # projected_heads = torch.matmul(headst, self.W_out) - # out = torch.sum(projected_heads, dim=1) # sum across heads - - # Or: - # out = torch.einsum('hbni,hij->bnj', heads, self.W_out) - - return out - - -class 
Normalization(nn.Module): - - def __init__(self, embed_dim, normalization='batch'): - super(Normalization, self).__init__() - - normalizer_class = { - 'batch': nn.BatchNorm1d, - 'instance': nn.InstanceNorm1d - }.get(normalization, None) - - self.normalizer = normalizer_class(embed_dim, affine=True) - - # Normalization by default initializes affine parameters with bias 0 and weight unif(0,1) which is too large! - # self.init_parameters() - - def init_parameters(self): - - for name, param in self.named_parameters(): - stdv = 1. / math.sqrt(param.size(-1)) - param.data.uniform_(-stdv, stdv) - - def forward(self, input): - - if isinstance(self.normalizer, nn.BatchNorm1d): - return self.normalizer(input.view(-1, input.size(-1))).view(*input.size()) - elif isinstance(self.normalizer, nn.InstanceNorm1d): - return self.normalizer(input.permute(0, 2, 1)).permute(0, 2, 1) - else: - assert self.normalizer is None, "Unknown normalizer type" - return input - - -class MultiHeadAttentionLayer(nn.Sequential): - - def __init__( - self, - n_heads, - embed_dim, - feed_forward_hidden=512, - normalization='batch', - ): - super(MultiHeadAttentionLayer, self).__init__( - SkipConnection( - MultiHeadAttention( - n_heads, - input_dim=embed_dim, - embed_dim=embed_dim - ) - ), - Normalization(embed_dim, normalization), - SkipConnection( - nn.Sequential( - nn.Linear(embed_dim, feed_forward_hidden), - nn.ReLU(), - nn.Linear(feed_forward_hidden, embed_dim) - ) if feed_forward_hidden > 0 else nn.Linear(embed_dim, embed_dim) - ), - Normalization(embed_dim, normalization) - ) - - -class GraphAttentionEncoder(nn.Module): - def __init__( - self, - n_heads, - embed_dim, - n_layers, - node_dim=None, - normalization='batch', - feed_forward_hidden=512 - ): - super(GraphAttentionEncoder, self).__init__() - - # To map input to embedding space - self.init_embed = nn.Linear(node_dim, embed_dim) if node_dim is not None else None - - self.layers = nn.Sequential(*( - MultiHeadAttentionLayer(n_heads, embed_dim, feed_forward_hidden, normalization) - for _ in range(n_layers) - )) - - def forward(self, x, mask=None): - - assert mask is None, "TODO mask not yet supported!" - - # Batch multiply to get initial embeddings of nodes - h = self.init_embed(x.view(-1, x.size(-1))).view(*x.size()[:2], -1) if self.init_embed is not None else x - - h = self.layers(h) - - return ( - h, # (batch_size, graph_size, embed_dim) - h.mean(dim=1), # average to get embedding of graph, (batch_size, embed_dim) - ) diff --git a/AM/nets/pointer_network.py b/AM/nets/pointer_network.py deleted file mode 100644 index c04fa08..0000000 --- a/AM/nets/pointer_network.py +++ /dev/null @@ -1,353 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Variable -import math -import numpy as np - - -class Encoder(nn.Module): - """Maps a graph represented as an input sequence - to a hidden vector""" - def __init__(self, input_dim, hidden_dim): - super(Encoder, self).__init__() - self.hidden_dim = hidden_dim - self.lstm = nn.LSTM(input_dim, hidden_dim) - self.init_hx, self.init_cx = self.init_hidden(hidden_dim) - - def forward(self, x, hidden): - output, hidden = self.lstm(x, hidden) - return output, hidden - - def init_hidden(self, hidden_dim): - """Trainable initial hidden state""" - std = 1. 
/ math.sqrt(hidden_dim) - enc_init_hx = nn.Parameter(torch.FloatTensor(hidden_dim)) - enc_init_hx.data.uniform_(-std, std) - - enc_init_cx = nn.Parameter(torch.FloatTensor(hidden_dim)) - enc_init_cx.data.uniform_(-std, std) - return enc_init_hx, enc_init_cx - - -class Attention(nn.Module): - """A generic attention module for a decoder in seq2seq""" - def __init__(self, dim, use_tanh=False, C=10): - super(Attention, self).__init__() - self.use_tanh = use_tanh - self.project_query = nn.Linear(dim, dim) - self.project_ref = nn.Conv1d(dim, dim, 1, 1) - self.C = C # tanh exploration - self.tanh = nn.Tanh() - - self.v = nn.Parameter(torch.FloatTensor(dim)) - self.v.data.uniform_(-(1. / math.sqrt(dim)), 1. / math.sqrt(dim)) - - def forward(self, query, ref): - """ - Args: - query: is the hidden state of the decoder at the current - time step. batch x dim - ref: the set of hidden states from the encoder. - sourceL x batch x hidden_dim - """ - # ref is now [batch_size x hidden_dim x sourceL] - ref = ref.permute(1, 2, 0) - q = self.project_query(query).unsqueeze(2) # batch x dim x 1 - e = self.project_ref(ref) # batch_size x hidden_dim x sourceL - # expand the query by sourceL - # batch x dim x sourceL - expanded_q = q.repeat(1, 1, e.size(2)) - # batch x 1 x hidden_dim - v_view = self.v.unsqueeze(0).expand( - expanded_q.size(0), len(self.v)).unsqueeze(1) - # [batch_size x 1 x hidden_dim] * [batch_size x hidden_dim x sourceL] - u = torch.bmm(v_view, self.tanh(expanded_q + e)).squeeze(1) - if self.use_tanh: - logits = self.C * self.tanh(u) - else: - logits = u - return e, logits - - -class Decoder(nn.Module): - def __init__(self, - embedding_dim, - hidden_dim, - tanh_exploration, - use_tanh, - n_glimpses=1, - mask_glimpses=True, - mask_logits=True): - super(Decoder, self).__init__() - - self.embedding_dim = embedding_dim - self.hidden_dim = hidden_dim - self.n_glimpses = n_glimpses - self.mask_glimpses = mask_glimpses - self.mask_logits = mask_logits - self.use_tanh = use_tanh - self.tanh_exploration = tanh_exploration - self.decode_type = None # Needs to be set explicitly before use - - self.lstm = nn.LSTMCell(embedding_dim, hidden_dim) - self.pointer = Attention(hidden_dim, use_tanh=use_tanh, C=tanh_exploration) - self.glimpse = Attention(hidden_dim, use_tanh=False) - self.sm = nn.Softmax(dim=1) - - def update_mask(self, mask, selected): - return mask.clone().scatter_(1, selected.unsqueeze(-1), True) - - def recurrence(self, x, h_in, prev_mask, prev_idxs, step, context): - - logit_mask = self.update_mask(prev_mask, prev_idxs) if prev_idxs is not None else prev_mask - - logits, h_out = self.calc_logits(x, h_in, logit_mask, context, self.mask_glimpses, self.mask_logits) - - # Calculate log_softmax for better numerical stability - log_p = torch.log_softmax(logits, dim=1) - probs = log_p.exp() - - if not self.mask_logits: - # If self.mask_logits, this would be redundant, otherwise we must mask to make sure we don't resample - # Note that as a result the vector of probs may not sum to one (this is OK for .multinomial sampling) - # But practically by not masking the logits, a model is learned over all sequences (also infeasible) - # while only during sampling feasibility is enforced (a.k.a. by setting to 0. here) - probs[logit_mask] = 0. 
- # For consistency we should also mask out in log_p, but the values set to 0 will not be sampled and - # Therefore not be used by the reinforce estimator - - return h_out, log_p, probs, logit_mask - - def calc_logits(self, x, h_in, logit_mask, context, mask_glimpses=None, mask_logits=None): - - if mask_glimpses is None: - mask_glimpses = self.mask_glimpses - - if mask_logits is None: - mask_logits = self.mask_logits - - hy, cy = self.lstm(x, h_in) - g_l, h_out = hy, (hy, cy) - - for i in range(self.n_glimpses): - ref, logits = self.glimpse(g_l, context) - # For the glimpses, only mask before softmax so we have always an L1 norm 1 readout vector - if mask_glimpses: - logits[logit_mask] = -np.inf - # [batch_size x h_dim x sourceL] * [batch_size x sourceL x 1] = - # [batch_size x h_dim x 1] - g_l = torch.bmm(ref, self.sm(logits).unsqueeze(2)).squeeze(2) - _, logits = self.pointer(g_l, context) - - # Masking before softmax makes probs sum to one - if mask_logits: - logits[logit_mask] = -np.inf - - return logits, h_out - - def forward(self, decoder_input, embedded_inputs, hidden, context, eval_tours=None): - """ - Args: - decoder_input: The initial input to the decoder - size is [batch_size x embedding_dim]. Trainable parameter. - embedded_inputs: [sourceL x batch_size x embedding_dim] - hidden: the prev hidden state, size is [batch_size x hidden_dim]. - Initially this is set to (enc_h[-1], enc_c[-1]) - context: encoder outputs, [sourceL x batch_size x hidden_dim] - """ - - batch_size = context.size(1) - outputs = [] - selections = [] - steps = range(embedded_inputs.size(0)) - idxs = None - mask = Variable( - embedded_inputs.data.new().byte().new(embedded_inputs.size(1), embedded_inputs.size(0)).zero_(), - requires_grad=False - ) - - for i in steps: - hidden, log_p, probs, mask = self.recurrence(decoder_input, hidden, mask, idxs, i, context) - # select the next inputs for the decoder [batch_size x hidden_dim] - idxs = self.decode( - probs, - mask - ) if eval_tours is None else eval_tours[:, i] - - idxs = idxs.detach() # Otherwise pytorch complains it want's a reward, todo implement this more properly? - - # Gather input embedding of selected - decoder_input = torch.gather( - embedded_inputs, - 0, - idxs.contiguous().view(1, batch_size, 1).expand(1, batch_size, *embedded_inputs.size()[2:]) - ).squeeze(0) - - # use outs to point to next object - outputs.append(log_p) - selections.append(idxs) - return (torch.stack(outputs, 1), torch.stack(selections, 1)), hidden - - def decode(self, probs, mask): - if self.decode_type == "greedy": - _, idxs = probs.max(1) - assert not mask.gather(1, idxs.unsqueeze(-1)).data.any(), \ - "Decode greedy: infeasible action has maximum probability" - elif self.decode_type == "sampling": - idxs = probs.multinomial(1).squeeze(1) - # Check if sampling went OK, can go wrong due to bug on GPU - while mask.gather(1, idxs.unsqueeze(-1)).data.any(): - print(' [!] 
resampling due to race condition') - idxs = probs.multinomial().squeeze(1) - else: - assert False, "Unknown decode type" - - return idxs - - -class CriticNetworkLSTM(nn.Module): - """Useful as a baseline in REINFORCE updates""" - def __init__(self, - embedding_dim, - hidden_dim, - n_process_block_iters, - tanh_exploration, - use_tanh): - super(CriticNetworkLSTM, self).__init__() - - self.hidden_dim = hidden_dim - self.n_process_block_iters = n_process_block_iters - - self.encoder = Encoder(embedding_dim, hidden_dim) - - self.process_block = Attention(hidden_dim, use_tanh=use_tanh, C=tanh_exploration) - self.sm = nn.Softmax(dim=1) - self.decoder = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, 1) - ) - - def forward(self, inputs): - """ - Args: - inputs: [embedding_dim x batch_size x sourceL] of embedded inputs - """ - inputs = inputs.transpose(0, 1).contiguous() - - encoder_hx = self.encoder.init_hx.unsqueeze(0).repeat(inputs.size(1), 1).unsqueeze(0) - encoder_cx = self.encoder.init_cx.unsqueeze(0).repeat(inputs.size(1), 1).unsqueeze(0) - - # encoder forward pass - enc_outputs, (enc_h_t, enc_c_t) = self.encoder(inputs, (encoder_hx, encoder_cx)) - - # grab the hidden state and process it via the process block - process_block_state = enc_h_t[-1] - for i in range(self.n_process_block_iters): - ref, logits = self.process_block(process_block_state, enc_outputs) - process_block_state = torch.bmm(ref, self.sm(logits).unsqueeze(2)).squeeze(2) - # produce the final scalar output - out = self.decoder(process_block_state) - return out - - -class PointerNetwork(nn.Module): - - def __init__(self, - embedding_dim, - hidden_dim, - problem, - n_encode_layers=None, - tanh_clipping=10., - mask_inner=True, - mask_logits=True, - normalization=None, - **kwargs): - super(PointerNetwork, self).__init__() - - self.problem = problem - assert problem.NAME == "tsp", "Pointer Network only supported for TSP" - self.input_dim = 2 - - self.encoder = Encoder( - embedding_dim, - hidden_dim) - - self.decoder = Decoder( - embedding_dim, - hidden_dim, - tanh_exploration=tanh_clipping, - use_tanh=tanh_clipping > 0, - n_glimpses=1, - mask_glimpses=mask_inner, - mask_logits=mask_logits - ) - - # Trainable initial hidden states - std = 1. 
/ math.sqrt(embedding_dim) - self.decoder_in_0 = nn.Parameter(torch.FloatTensor(embedding_dim)) - self.decoder_in_0.data.uniform_(-std, std) - - self.embedding = nn.Parameter(torch.FloatTensor(self.input_dim, embedding_dim)) - self.embedding.data.uniform_(-std, std) - - def set_decode_type(self, decode_type): - self.decoder.decode_type = decode_type - - def forward(self, inputs, eval_tours=None, return_pi=False): - batch_size, graph_size, input_dim = inputs.size() - - embedded_inputs = torch.mm( - inputs.transpose(0, 1).contiguous().view(-1, input_dim), - self.embedding - ).view(graph_size, batch_size, -1) - - # query the actor net for the input indices - # making up the output, and the pointer attn - _log_p, pi = self._inner(embedded_inputs, eval_tours) - - cost, mask = self.problem.get_costs(inputs, pi) - # Log likelyhood is calculated within the model since returning it per action does not work well with - # DataParallel since sequences can be of different lengths - ll = self._calc_log_likelihood(_log_p, pi, mask) - if return_pi: - return cost, ll, pi - - return cost, ll - - def _calc_log_likelihood(self, _log_p, a, mask): - - # Get log_p corresponding to selected actions - log_p = _log_p.gather(2, a.unsqueeze(-1)).squeeze(-1) - - # Optional: mask out actions irrelevant to objective so they do not get reinforced - if mask is not None: - log_p[mask] = 0 - - assert (log_p > -1000).data.all(), "Logprobs should not be -inf, check sampling procedure!" - - # Calculate log_likelihood - return log_p.sum(1) - - def _inner(self, inputs, eval_tours=None): - - encoder_hx = encoder_cx = Variable( - torch.zeros(1, inputs.size(1), self.encoder.hidden_dim, out=inputs.data.new()), - requires_grad=False - ) - - # encoder forward pass - enc_h, (enc_h_t, enc_c_t) = self.encoder(inputs, (encoder_hx, encoder_cx)) - - dec_init_state = (enc_h_t[-1], enc_c_t[-1]) - - # repeat decoder_in_0 across batch - decoder_input = self.decoder_in_0.unsqueeze(0).repeat(inputs.size(1), 1) - - (pointer_probs, input_idxs), dec_hidden_t = self.decoder(decoder_input, - inputs, - dec_init_state, - enc_h, - eval_tours) - - return pointer_probs, input_idxs \ No newline at end of file diff --git a/AM/options.py b/AM/options.py deleted file mode 100644 index 6f86099..0000000 --- a/AM/options.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -import time -import argparse -import torch - - -def get_options(args=None): - parser = argparse.ArgumentParser( - description="Attention based model for solving the Travelling Salesman Problem with Reinforcement Learning") - - # Data - parser.add_argument('--problem', default='tsp', help="The problem to solve, default 'tsp'") - parser.add_argument('--graph_size', type=int, default=40, help="The size of the problem graph") - parser.add_argument('--batch_size', type=int, default=256, help='Number of instances per batch during training') - # parser.add_argument('--epoch_size', type=int, default=1280000, help='Number of instances per epoch during training') - parser.add_argument('--k_tune_steps', type=int, default=50, - help='Number of inner fine tuning steps during training. 
Set to 50( parameter applicable only to meta-training run_meta.py) ') - parser.add_argument('--alpha_decay', type=float, default=0.998, help='decaying reptile alpha') - parser.add_argument('--alpha', type=float, default=0.99, help='alpha') - parser.add_argument('--variation_type', type=str, default="size", choices=['size', 'dist', 'scale', 'mix_dist_size', 'cap_vrp'], help='type of the task distribution') - parser.add_argument('--baseline_every_Xepochs_for_META', type=int, default=40, - help='Controls frequency of baseline update. Set to 7 for meta-training. (need to set only for meta-training run_meta.py, for multi and scratch it is set to default value in options.py)') - parser.add_argument('--train_tasks', type=str, default=None, help='train_tasks for cvrp') # cvrp - parser.add_argument('--test_num_step_epochs', type=int, default=50, help='Fine_tuning steps test') - parser.add_argument('--val_size', type=int, default=10000, help='Number of instances used for reporting validation/test performance') - parser.add_argument('--fine_tune_size', type=int, default=3000, help='Number of instances used for fine-tuning') - parser.add_argument('--val_dataset', type=str, default=None, help='Dataset file to use for validation') - parser.add_argument('--shuffle', action='store_true', help='whether shuffle training tasks') - - # Model - parser.add_argument('--model', default='attention', help="Model, 'attention' (default) or 'pointer'") - parser.add_argument('--embedding_dim', type=int, default=128, help='Dimension of input embedding') - parser.add_argument('--hidden_dim', type=int, default=128, help='Dimension of hidden layers in Enc/Dec') - parser.add_argument('--n_encode_layers', type=int, default=3, - help='Number of layers in the encoder/critic network') - parser.add_argument('--tanh_clipping', type=float, default=10., - help='Clip the parameters to within +- this value using tanh. ' - 'Set to 0 to not perform any clipping.') - parser.add_argument('--normalization', default='batch', help="Normalization type, 'batch' (default) or 'instance'") - - # Training - parser.add_argument('--lr_model', type=float, default=1e-4, help="Set the learning rate for the actor network") - parser.add_argument('--lr_critic', type=float, default=1e-4, help="Set the learning rate for the critic network") - parser.add_argument('--lr_decay', type=float, default=1.0, help='Learning rate decay per epoch') - parser.add_argument('--eval_only', action='store_true', help='Set this value to only evaluate model') - parser.add_argument('--n_epochs', type=int, default=2000, help='The number of epochs to train') - parser.add_argument('--seed', type=int, default=1234, help='Random seed to use') - parser.add_argument('--max_grad_norm', type=float, default=1.0, - help='Maximum L2 norm for gradient clipping, default 1.0 (0 to disable clipping)') - parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA') - parser.add_argument('--exp_beta', type=float, default=0.8, - help='Exponential moving average baseline decay (default 0.8)') - parser.add_argument('--baseline', default=None, - help="Baseline to use: 'rollout', 'critic' or 'exponential'. Defaults to no baseline.") - parser.add_argument('--bl_alpha', type=float, default=0.05, - help='Significance in the t-test for updating rollout baseline') - parser.add_argument('--bl_warmup_epochs', type=int, default=0, - help='Number of epochs to warmup the baseline, default None means 1 for rollout (exponential ' - 'used for warmup phase), 0 otherwise. 
Can only be used with rollout baseline.') - parser.add_argument('--eval_batch_size', type=int, default=1024, - help="Batch size to use during (baseline) evaluation") - parser.add_argument('--checkpoint_encoder', action='store_true', - help='Set to decrease memory usage by checkpointing encoder') - parser.add_argument('--shrink_size', type=int, default=None, - help='Shrink the batch size if at least this many instances in the batch are finished' - ' to save memory (default None means no shrinking)') - parser.add_argument('--data_distribution', type=str, default=None, - help='Data distribution to use during training, defaults and options depend on problem.') - - # Misc - parser.add_argument('--log_step', type=int, default=50, help='Log info every log_step steps') - parser.add_argument('--log_dir', default='logs', help='Directory to write TensorBoard information to') - parser.add_argument('--run_name', default='run', help='Name to identify the run') - parser.add_argument('--output_dir', default='outputs', help='Directory to write output models to') - parser.add_argument('--epoch_start', type=int, default=0, - help='Start at epoch # (relevant for learning rate decay)') - parser.add_argument('--checkpoint_epochs', type=int, default=10, - help='Save checkpoint every n epochs (default 1), 0 to save no checkpoints') - parser.add_argument('--load_path', help='Path to load model parameters and optimizer state from') - parser.add_argument('--resume', help='Resume from previous checkpoint file') - parser.add_argument('--no_tensorboard', action='store_true', help='Disable logging TensorBoard files') - parser.add_argument('--no_progress_bar', action='store_true', help='Disable progress bar') - - opts = parser.parse_args(args) - - opts.use_cuda = torch.cuda.is_available() and not opts.no_cuda - opts.run_name = "{}_{}".format(opts.run_name, time.strftime("%Y%m%dT%H%M%S")) - opts.no_progress_bar = True - opts.no_tensorboard = True - opts.save_dir = os.path.join( - opts.output_dir, - "{}_{}".format(opts.problem, opts.graph_size), - opts.run_name - ) - if opts.bl_warmup_epochs is None: - opts.bl_warmup_epochs = 1 if opts.baseline == 'rollout' else 0 - assert (opts.bl_warmup_epochs == 0) or (opts.baseline == 'rollout') - # assert opts.epoch_size % opts.batch_size == 0, "Epoch size must be integer multiple of batch size!" 
- return opts diff --git a/AM/problems/__init__.py b/AM/problems/__init__.py deleted file mode 100644 index 1d8eddb..0000000 --- a/AM/problems/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from problems.tsp.problem_tsp import TSP -from problems.vrp.problem_vrp import CVRP, SDVRP -from problems.op.problem_op import OP -from problems.pctsp.problem_pctsp import PCTSPDet, PCTSPStoch \ No newline at end of file diff --git a/AM/problems/op/.gitignore b/AM/problems/op/.gitignore deleted file mode 100644 index 97cb344..0000000 --- a/AM/problems/op/.gitignore +++ /dev/null @@ -1 +0,0 @@ -compass/ \ No newline at end of file diff --git a/AM/problems/op/__init__.py b/AM/problems/op/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/op/install_compass.sh b/AM/problems/op/install_compass.sh deleted file mode 100644 index da96aaf..0000000 --- a/AM/problems/op/install_compass.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -git clone https://github.com/bcamath-ds/compass -cd compass -sudo apt-get install libtool m4 -sudo apt-get install libgsl0-dev libatlas-base-dev libbfd-dev libiberty-dev -sudo apt-get install libssl-dev -sudo apt-get install autoconf automake -autoheader -libtoolize -aclocal -automake --add-missing -autoconf -./configure -make -cd .. \ No newline at end of file diff --git a/AM/problems/op/op_baseline.py b/AM/problems/op/op_baseline.py deleted file mode 100644 index 67b960b..0000000 --- a/AM/problems/op/op_baseline.py +++ /dev/null @@ -1,394 +0,0 @@ -import argparse -import os -import numpy as np -from utils import run_all_in_pool -from utils.data_utils import check_extension, load_dataset, save_dataset -from subprocess import check_call, check_output -import tempfile -import time -from datetime import timedelta -from problems.op.opga.opevo import run_alg as run_opga_alg -from tqdm import tqdm -import re - -MAX_LENGTH_TOL = 1e-5 - - -# Run install_compass.sh to install -def solve_compass(executable, depot, loc, demand, capacity): - with tempfile.TemporaryDirectory() as tempdir: - problem_filename = os.path.join(tempdir, "problem.oplib") - output_filename = os.path.join(tempdir, "output.tour") - param_filename = os.path.join(tempdir, "params.par") - - starttime = time.time() - write_oplib(problem_filename, depot, loc, demand, capacity) - params = {"PROBLEM_FILE": problem_filename, "OUTPUT_TOUR_FILE": output_filename} - write_compass_par(param_filename, params) - output = check_output([executable, param_filename]) - result = read_oplib(output_filename, n=len(demand)) - duration = time.time() - starttime - return result, output, duration - - -def solve_compass_log(executable, directory, name, depot, loc, prize, max_length, disable_cache=False): - - problem_filename = os.path.join(directory, "{}.oplib".format(name)) - tour_filename = os.path.join(directory, "{}.tour".format(name)) - output_filename = os.path.join(directory, "{}.compass.pkl".format(name)) - log_filename = os.path.join(directory, "{}.log".format(name)) - - try: - # May have already been run - if os.path.isfile(output_filename) and not disable_cache: - tour, duration = load_dataset(output_filename) - else: - write_oplib(problem_filename, depot, loc, prize, max_length, name=name) - - with open(log_filename, 'w') as f: - start = time.time() - check_call([executable, '--op', '--op-ea4op', problem_filename, '-o', tour_filename], - stdout=f, stderr=f) - duration = time.time() - start - - tour = read_oplib(tour_filename, n=len(prize)) - if not calc_op_length(depot, loc, tour) <= max_length: - 
print("Warning: length exceeds max length:", calc_op_length(depot, loc, tour), max_length) - assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!" - save_dataset((tour, duration), output_filename) - - return -calc_op_total(prize, tour), tour, duration - - except Exception as e: - print("Exception occured") - print(e) - return None - - -def calc_op_total(prize, tour): - # Subtract 1 since vals index start with 0 while tour indexing starts with 1 as depot is 0 - assert (np.array(tour) > 0).all(), "Depot cannot be in tour" - assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates" - return np.array(prize)[np.array(tour) - 1].sum() - - -def calc_op_length(depot, loc, tour): - assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates" - loc_with_depot = np.vstack((np.array(depot)[None, :], np.array(loc))) - sorted_locs = loc_with_depot[np.concatenate(([0], tour, [0]))] - return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum() - - -def write_compass_par(filename, parameters): - default_parameters = { # Use none to include as flag instead of kv - "SPECIAL": None, - "MAX_TRIALS": 10000, - "RUNS": 10, - "TRACE_LEVEL": 1, - "SEED": 0 - } - with open(filename, 'w') as f: - for k, v in {**default_parameters, **parameters}.items(): - if v is None: - f.write("{}\n".format(k)) - else: - f.write("{} = {}\n".format(k, v)) - - -def read_oplib(filename, n): - with open(filename, 'r') as f: - tour = [] - dimension = 0 - started = False - for line in f: - if started: - loc = int(line) - if loc == -1: - break - tour.append(loc) - if line.startswith("DIMENSION"): - dimension = int(line.split(" ")[-1]) - - if line.startswith("NODE_SEQUENCE_SECTION"): - started = True - - assert len(tour) > 0, "Unexpected length" - tour = np.array(tour).astype(int) - 1 # Subtract 1 as depot is 1 and should be 0 - assert tour[0] == 0 # Tour should start with depot - assert tour[-1] != 0 # Tour should not end with depot - return tour[1:].tolist() - - -def write_oplib(filename, depot, loc, prize, max_length, name="problem"): - - with open(filename, 'w') as f: - f.write("\n".join([ - "{} : {}".format(k, v) - for k, v in ( - ("NAME", name), - ("TYPE", "OP"), - ("DIMENSION", len(loc) + 1), - ("COST_LIMIT", int(max_length * 10000000 + 0.5)), - ("EDGE_WEIGHT_TYPE", "EUC_2D"), - ) - ])) - f.write("\n") - f.write("NODE_COORD_SECTION\n") - f.write("\n".join([ - "{}\t{}\t{}".format(i + 1, int(x * 10000000 + 0.5), int(y * 10000000 + 0.5)) # oplib does not take floats - #"{}\t{}\t{}".format(i + 1, x, y) - for i, (x, y) in enumerate([depot] + loc) - ])) - f.write("\n") - f.write("NODE_SCORE_SECTION\n") - f.write("\n".join([ - "{}\t{}".format(i + 1, d) - for i, d in enumerate([0] + prize) - ])) - f.write("\n") - f.write("DEPOT_SECTION\n") - f.write("1\n") - f.write("-1\n") - f.write("EOF\n") - - -def solve_opga(directory, name, depot, loc, prize, max_length, disable_cache=False): - problem_filename = os.path.join(directory, "{}.opga.pkl".format(name)) - if os.path.isfile(problem_filename) and not disable_cache: - (prize, tour, duration) = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - prize, tour, duration = run_opga_alg( - [(*pos, p) for p, pos in zip([0, 0] + prize, [depot, depot] + loc)], - max_length, return_sol=True, verbose=False - ) - duration = time.time() - start # Measure clock time - save_dataset((prize, tour, duration), problem_filename) - - # First and last node are depot(s), so 
first node is 2 but should be 1 (as depot is 0) so subtract 1 - assert tour[0][3] == 0 - assert tour[-1][3] == 1 - return -prize, [i - 1 for x, y, p, i, t in tour[1:-1]], duration - - -def solve_gurobi(directory, name, depot, loc, prize, max_length, disable_cache=False, timeout=None, gap=None): - # Lazy import so we do not need to have gurobi installed to run this script - from problems.op.op_gurobi import solve_euclidian_op as solve_euclidian_op_gurobi - - try: - problem_filename = os.path.join(directory, "{}.gurobi{}{}.pkl".format( - name, "" if timeout is None else "t{}".format(timeout), "" if gap is None else "gap{}".format(gap))) - - if os.path.isfile(problem_filename) and not disable_cache: - (cost, tour, duration) = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - - cost, tour = solve_euclidian_op_gurobi( - depot, loc, prize, max_length, threads=1, timeout=timeout, gap=gap - ) - duration = time.time() - start # Measure clock time - save_dataset((cost, tour, duration), problem_filename) - - # First and last node are depot(s), so first node is 2 but should be 1 (as depot is 0) so subtract 1 - assert tour[0] == 0 - tour = tour[1:] - assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!" - total_cost = -calc_op_total(prize, tour) - assert abs(total_cost - cost) <= 1e-4, "Cost is incorrect" - return total_cost, tour, duration - - except Exception as e: - # For some stupid reason, sometimes OR tools cannot find a feasible solution? - # By letting it fail we do not get total results, but we dcan retry by the caching mechanism - print("Exception occured") - print(e) - return None - - -def solve_ortools(directory, name, depot, loc, prize, max_length, sec_local_search=0, disable_cache=False): - # Lazy import so we do not require ortools by default - from problems.op.op_ortools import solve_op_ortools - - try: - problem_filename = os.path.join(directory, "{}.ortools{}.pkl".format(name, sec_local_search)) - if os.path.isfile(problem_filename) and not disable_cache: - objval, tour, duration = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - objval, tour = solve_op_ortools(depot, loc, prize, max_length, sec_local_search=sec_local_search) - duration = time.time() - start - save_dataset((objval, tour, duration), problem_filename) - assert tour[0] == 0, "Tour must start with depot" - tour = tour[1:] - assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!" - assert abs(-calc_op_total(prize, tour) - objval) <= 1e-5, "Cost is incorrect" - return -calc_op_total(prize, tour), tour, duration - except Exception as e: - # For some stupid reason, sometimes OR tools cannot find a feasible solution? 
- # By letting it fail we do not get total results, but we dcan retry by the caching mechanism - print("Exception occured") - print(e) - return None - - -def run_all_tsiligirides( - dataset_path, sample, num_samples, eval_batch_size, max_calc_batch_size, no_cuda=False, dataset_n=None, - progress_bar_mininterval=0.1, seed=1234): - import torch - from torch.utils.data import DataLoader - from utils import move_to, sample_many - from problems.op.tsiligirides import op_tsiligirides - from problems.op.problem_op import OP - torch.manual_seed(seed) - - dataloader = DataLoader( - OP.make_dataset(filename=dataset_path, num_samples=dataset_n if dataset_n is not None else 1000000), - batch_size=eval_batch_size - ) - device = torch.device("cuda:0" if torch.cuda.is_available() and not no_cuda else "cpu") - results = [] - for batch in tqdm(dataloader, mininterval=progress_bar_mininterval): - start = time.time() - batch = move_to(batch, device) - - with torch.no_grad(): - if num_samples * eval_batch_size > max_calc_batch_size: - assert eval_batch_size == 1 - assert num_samples % max_calc_batch_size == 0 - batch_rep = max_calc_batch_size - iter_rep = num_samples // max_calc_batch_size - else: - batch_rep = num_samples - iter_rep = 1 - sequences, costs = sample_many( - lambda inp: (None, op_tsiligirides(inp, sample)), - OP.get_costs, - batch, batch_rep=batch_rep, iter_rep=iter_rep) - duration = time.time() - start - results.extend( - [(cost.item(), np.trim_zeros(pi.cpu().numpy(),'b'), duration) for cost, pi in zip(costs, sequences)]) - return results, eval_batch_size - - -if __name__ == "__main__": - executable = os.path.abspath(os.path.join('problems', 'op', 'compass', 'compass')) - - parser = argparse.ArgumentParser() - parser.add_argument("method", help="Name of the method to evaluate, 'compass', 'opga' or 'tsili'") - parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument("-o", default=None, help="Name of the results file to write") - parser.add_argument("--cpus", type=int, help="Number of CPUs to use, defaults to all cores") - parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA (only for Tsiligirides)') - parser.add_argument('--disable_cache', action='store_true', help='Disable caching') - parser.add_argument('--max_calc_batch_size', type=int, default=1000, help='Size for subbatches') - parser.add_argument('--progress_bar_mininterval', type=float, default=0.1, help='Minimum interval') - parser.add_argument('-n', type=int, help="Number of instances to process") - parser.add_argument('--offset', type=int, help="Offset where to start processing") - parser.add_argument('--results_dir', default='results', help="Name of results directory") - - opts = parser.parse_args() - - assert opts.o is None or len(opts.datasets) == 1, "Cannot specify result filename with more than one dataset" - - for dataset_path in opts.datasets: - - assert os.path.isfile(check_extension(dataset_path)), "File does not exist!" 
- - dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1]) - - if opts.o is None: - results_dir = os.path.join(opts.results_dir, "op", dataset_basename) - os.makedirs(results_dir, exist_ok=True) - - out_file = os.path.join(results_dir, "{}{}{}-{}{}".format( - dataset_basename, - "offs{}".format(opts.offset) if opts.offset is not None else "", - "n{}".format(opts.n) if opts.n is not None else "", - opts.method, ext - )) - else: - out_file = opts.o - - assert opts.f or not os.path.isfile( - out_file), "File already exists! Try running with -f option to overwrite." - - match = re.match(r'^([a-z]+)(\d*)$', opts.method) - assert match - method = match[1] - runs = 1 if match[2] == '' else int(match[2]) - - if method == "tsili" or method == "tsiligreedy": - assert opts.offset is None, "Offset not supported for Tsiligirides" - - if method == "tsiligreedy": - sample = False - num_samples = 1 - else: - sample = True - num_samples = runs - - eval_batch_size = max(1, opts.max_calc_batch_size // num_samples) - - results, parallelism = run_all_tsiligirides( - dataset_path, sample, num_samples, eval_batch_size, opts.max_calc_batch_size, opts.no_cuda, opts.n, - opts.progress_bar_mininterval - ) - elif method in ("compass", "opga", "gurobi", "gurobigap", "gurobit", "ortools"): - - target_dir = os.path.join(results_dir, "{}-{}".format( - dataset_basename, - opts.method - )) - assert opts.f or not os.path.isdir(target_dir), \ - "Target dir already exists! Try running with -f option to overwrite." - - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - - dataset = load_dataset(dataset_path) - - if method[:6] == "gurobi": - use_multiprocessing = True # We run one thread per instance - - def run_func(args): - return solve_gurobi(*args, disable_cache=opts.disable_cache, - timeout=runs if method[6:] == "t" else None, - gap=float(runs) if method[6:] == "gap" else None) - elif method == "compass": - use_multiprocessing = False - - def run_func(args): - return solve_compass_log(executable, *args, disable_cache=opts.disable_cache) - elif method == "opga": - use_multiprocessing = True - - def run_func(args): - return solve_opga(*args, disable_cache=opts.disable_cache) - else: - assert method == "ortools" - use_multiprocessing = True - - def run_func(args): - return solve_ortools(*args, sec_local_search=runs, disable_cache=opts.disable_cache) - - results, parallelism = run_all_in_pool( - run_func, - target_dir, dataset, opts, use_multiprocessing=use_multiprocessing - ) - - else: - assert False, "Unknown method: {}".format(opts.method) - - costs, tours, durations = zip(*results) # Not really costs since they should be negative - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - - save_dataset((results, parallelism), out_file) diff --git a/AM/problems/op/op_gurobi.py b/AM/problems/op/op_gurobi.py deleted file mode 100644 index 4cda064..0000000 --- a/AM/problems/op/op_gurobi.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/python - -# Copyright 2017, Gurobi Optimization, Inc. - -# Solve a traveling salesman problem on a set of -# points using lazy constraints. 
The base MIP model only includes -# 'degree-2' constraints, requiring each node to have exactly -# two incident edges. Solutions to this model may contain subtours - -# tours that don't visit every city. The lazy constraint callback -# adds new constraints to cut them off. - -from gurobipy import * - - -def solve_euclidian_op(depot, loc, prize, max_length, threads=0, timeout=None, gap=None): - """ - Solves the Euclidan op problem to optimality using the MIP formulation - with lazy subtour elimination constraint generation. - :param points: list of (x, y) coordinate - :return: - """ - - points = [depot] + loc - n = len(points) - - # Callback - use lazy constraints to eliminate sub-tours - - def subtourelim(model, where): - if where == GRB.Callback.MIPSOL: - # make a list of edges selected in the solution - vals = model.cbGetSolution(model._vars) - selected = tuplelist((i, j) for i, j in model._vars.keys() if vals[i, j] > 0.5) - # find the shortest cycle in the selected edge list - tour = subtour(selected) - if tour is not None: - # add subtour elimination constraint for every pair of cities in tour - # model.cbLazy(quicksum(model._vars[i, j] - # for i, j in itertools.combinations(tour, 2)) - # <= len(tour) - 1) - - model.cbLazy(quicksum(model._vars[i, j] - for i, j in itertools.combinations(tour, 2)) - <= quicksum(model._dvars[i] for i in tour) * (len(tour) - 1) / float(len(tour))) - - # Given a tuplelist of edges, find the shortest subtour - - def subtour(edges, exclude_depot=True): - unvisited = list(range(n)) - #cycle = range(n + 1) # initial length has 1 more city - cycle = None - while unvisited: # true if list is non-empty - thiscycle = [] - neighbors = unvisited - while neighbors: - current = neighbors[0] - thiscycle.append(current) - unvisited.remove(current) - neighbors = [j for i, j in edges.select(current, '*') if j in unvisited] - # If we do not yet have a cycle or this is the shorter cycle, keep this cycle - # Unless it contains the depot while we do not want the depot - if ( - (cycle is None or len(cycle) > len(thiscycle)) - and len(thiscycle) > 1 and not (0 in thiscycle and exclude_depot) - ): - cycle = thiscycle - return cycle - - # Dictionary of Euclidean distance between each pair of points - - dist = {(i,j) : - math.sqrt(sum((points[i][k]-points[j][k])**2 for k in range(2))) - for i in range(n) for j in range(i)} - - m = Model() - m.Params.outputFlag = False - - # Create variables - - vars = m.addVars(dist.keys(), vtype=GRB.BINARY, name='e') - for i,j in vars.keys(): - vars[j,i] = vars[i,j] # edge in opposite direction - - # Depot vars can be 2 - for i,j in vars.keys(): - if i == 0 or j == 0: - vars[i,j].vtype = GRB.INTEGER - vars[i,j].ub = 2 - - prize_dict = { - i + 1: -p # We need to maximize so negate - for i, p in enumerate(prize) - } - delta = m.addVars(range(1, n), obj=prize_dict, vtype=GRB.BINARY, name='delta') - - # Add degree-2 constraint (2 * delta for nodes which are not the depot) - m.addConstrs(vars.sum(i,'*') == (2 if i == 0 else 2 * delta[i]) for i in range(n)) - - # Length of tour constraint - m.addConstr(quicksum(var * dist[i, j] for (i, j), var in vars.items() if j < i) <= max_length) - - # Optimize model - - m._vars = vars - m._dvars = delta - m.Params.lazyConstraints = 1 - m.Params.threads = threads - if timeout: - m.Params.timeLimit = timeout - if gap: - m.Params.mipGap = gap * 0.01 # Percentage - m.optimize(subtourelim) - - vals = m.getAttr('x', vars) - selected = tuplelist((i,j) for i,j in vals.keys() if vals[i,j] > 0.5) - - tour = subtour(selected, 
exclude_depot=False) - assert tour[0] == 0, "Tour should start with depot" - - return m.objVal, tour \ No newline at end of file diff --git a/AM/problems/op/op_ortools.py b/AM/problems/op/op_ortools.py deleted file mode 100644 index 694339a..0000000 --- a/AM/problems/op/op_ortools.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python -# This Python file uses the following encoding: utf-8 -# Copyright 2015 Tin Arm Engineering AB -# Copyright 2018 Google LLC -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Capacitated Vehicle Routing Problem (CVRP). - - This is a sample using the routing library python wrapper to solve a CVRP - problem. - A description of the problem can be found here: - http://en.wikipedia.org/wiki/Vehicle_routing_problem. - - Distances are in meters. -""" - -from __future__ import print_function -from collections import namedtuple -from six.moves import xrange -from ortools.constraint_solver import pywrapcp -from ortools.constraint_solver import routing_enums_pb2 -import math - -########################### -# Problem Data Definition # -########################### -# Vehicle declaration -Vehicle = namedtuple('Vehicle', ['capacity']) - - -def float_to_scaled_int(v): - return int(v * 10000000 + 0.5) - - -class DataProblem(): - """Stores the data for the problem""" - - def __init__(self, depot, loc, prize, max_length): - """Initializes the data for the problem""" - # Locations in block unit - self._locations = [(float_to_scaled_int(l[0]), float_to_scaled_int(l[1])) for l in [depot] + loc] - - self._prizes = [float_to_scaled_int(v) for v in prize] - - self._max_length = float_to_scaled_int(max_length) - - @property - def vehicle(self): - """Gets a vehicle""" - return Vehicle() - - @property - def num_vehicles(self): - """Gets number of vehicles""" - return 1 - - @property - def locations(self): - """Gets locations""" - return self._locations - - @property - def num_locations(self): - """Gets number of locations""" - return len(self.locations) - - @property - def depot(self): - """Gets depot location index""" - return 0 - - @property - def prizes(self): - """Gets prizes at each location""" - return self._prizes - - @property - def max_length(self): - """Gets prizes at each location""" - return self._max_length - - -####################### -# Problem Constraints # -####################### -def euclidian_distance(position_1, position_2): - """Computes the Euclidian distance between two points""" - return int(math.sqrt((position_1[0] - position_2[0]) ** 2 + (position_1[1] - position_2[1]) ** 2) + 0.5) - - -class CreateDistanceEvaluator(object): # pylint: disable=too-few-public-methods - """Creates callback to return distance between points.""" - - def __init__(self, data): - """Initializes the distance matrix.""" - self._distances = {} - - # precompute distance between location to have distance callback in O(1) - for from_node in xrange(data.num_locations): - self._distances[from_node] = {} - for to_node in xrange(data.num_locations): - if from_node == to_node: - 
self._distances[from_node][to_node] = 0 - else: - self._distances[from_node][to_node] = ( - euclidian_distance(data.locations[from_node], - data.locations[to_node])) - - def distance_evaluator(self, from_node, to_node): - """Returns the manhattan distance between the two nodes""" - return self._distances[from_node][to_node] - - -class CreatePrizeEvaluator(object): # pylint: disable=too-few-public-methods - """Creates callback to get prizes at each location.""" - - def __init__(self, data): - """Initializes the prize array.""" - self._prizes = data.prizes - - def prize_evaluator(self, from_node, to_node): - """Returns the prize of the current node""" - del to_node - return self._prizes[from_node] - - -def add_capacity_constraints(routing, data, prize_evaluator): - """Adds capacity constraint""" - capacity = 'Capacity' - routing.AddDimension( - prize_evaluator, - 0, # null capacity slack - data.vehicle.capacity, - True, # start cumul to zero - capacity) - - -def add_distance_constraint(routing, distance_evaluator, maximum_distance): - """Add Global Span constraint""" - distance = "Distance" - routing.AddDimension( - distance_evaluator, - 0, # null slack - maximum_distance, # maximum distance per vehicle - True, # start cumul to zero - distance) - - -########### -# Printer # -########### -def print_solution(data, routing, assignment): - """Prints assignment on console""" - print('Objective: {}'.format(assignment.ObjectiveValue())) - total_distance = 0 - total_load = 0 - capacity_dimension = routing.GetDimensionOrDie('Capacity') - for vehicle_id in xrange(data.num_vehicles): - index = routing.Start(vehicle_id) - plan_output = 'Route for vehicle {}:\n'.format(vehicle_id) - distance = 0 - while not routing.IsEnd(index): - load_var = capacity_dimension.CumulVar(index) - plan_output += ' {} Load({}) -> '.format( - routing.IndexToNode(index), assignment.Value(load_var)) - previous_index = index - index = assignment.Value(routing.NextVar(index)) - distance += routing.GetArcCostForVehicle(previous_index, index, - vehicle_id) - load_var = capacity_dimension.CumulVar(index) - plan_output += ' {0} Load({1})\n'.format( - routing.IndexToNode(index), assignment.Value(load_var)) - plan_output += 'Distance of the route: {}m\n'.format(distance) - plan_output += 'Load of the route: {}\n'.format(assignment.Value(load_var)) - print(plan_output) - total_distance += distance - total_load += assignment.Value(load_var) - print('Total Distance of all routes: {}m'.format(total_distance)) - print('Total Load of all routes: {}'.format(total_load)) - - -def solve_op_ortools(depot, loc, prize, max_length, sec_local_search=0): - data = DataProblem(depot, loc, prize, max_length) - - # Create Routing Model - routing = pywrapcp.RoutingModel(data.num_locations, data.num_vehicles, - data.depot) - - # Define weight of each edge - distance_evaluator = CreateDistanceEvaluator(data).distance_evaluator - # routing.SetArcCostEvaluatorOfAllVehicles(distance_evaluator) - add_distance_constraint(routing, distance_evaluator, data.max_length) - # Add Capacity constraint - # prize_evaluator = CreatePrizeEvaluator(data).prize_evaluator - # add_capacity_constraints(routing, data, prize_evaluator) - # Add penalties for missed prizes - nodes = [routing.AddDisjunction([int(c + 1)], p) for c, p in enumerate(data.prizes)] - - # Setting first solution heuristic (cheapest addition). 
- search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters() - search_parameters.first_solution_strategy = ( - routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC) - if sec_local_search > 0: - # Additionally do local search - search_parameters.local_search_metaheuristic = ( - routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH) - search_parameters.time_limit_ms = 1000 * sec_local_search - # Solve the problem. - assignment = routing.SolveWithParameters(search_parameters) - - assert assignment is not None, "ORTools was unable to find a feasible solution" - - index = routing.Start(0) - route = [] - while not routing.IsEnd(index): - node_index = routing.IndexToNode(index) - route.append(node_index) - index = assignment.Value(routing.NextVar(index)) - # The constant total of prizes is not taken into account by ORTOOLS - # This returns - total prize collected = total prize not collected - total prize - return assignment.ObjectiveValue() / 10000000. - sum(prize), route - - #print_solution(data, routing, assignment) - -######## -# Main # -######## -def main(): - """Entry point of the program""" - # Instantiate the data problem. - data = DataProblem() - - # Create Routing Model - routing = pywrapcp.RoutingModel(data.num_locations, data.num_vehicles, - data.depot) - - # Define weight of each edge - distance_evaluator = CreateDistanceEvaluator(data).distance_evaluator - routing.SetArcCostEvaluatorOfAllVehicles(distance_evaluator) - # Add Capacity constraint - # prize_evaluator = CreatePrizeEvaluator(data).prize_evaluator - # add_capacity_constraints(routing, data, prize_evaluator) - - # Setting first solution heuristic (cheapest addition). - search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters() - search_parameters.first_solution_strategy = ( - routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC) # pylint: disable=no-member - # Solve the problem. - assignment = routing.SolveWithParameters(search_parameters) - print_solution(data, routing, assignment) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/AM/problems/op/opga/README.md b/AM/problems/op/opga/README.md deleted file mode 100644 index e5aeb4e..0000000 --- a/AM/problems/op/opga/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# orienteering -GA for orienteering problem diff --git a/AM/problems/op/opga/__init__.py b/AM/problems/op/opga/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/op/opga/opevo.py b/AM/problems/op/opga/opevo.py deleted file mode 100644 index 0acf9f9..0000000 --- a/AM/problems/op/opga/opevo.py +++ /dev/null @@ -1,154 +0,0 @@ -import sys -import random -import time -from . 
import oph - -#fitness will take a set s and a set of weights and return a tuple containing the fitness and the best path -def fitness( chrom, s, start_point, end_point, tmax ): - augs = [] - for i in range( len( s ) ): - augs.append( ( s[ i ][0], - s[ i ][1], - s[ i ][2], - s[ i ][3], - s[ i ][4] + chrom[ i ] ) ) - if debug: - print ('fitness---------------------------------') - print ('augs:') - print (augs) - #best = oph.ellinit_replacement( augs, start_point, end_point, tmax ) - ellset = oph.ell_sub( tmax, start_point, end_point, augs ) - #best = oph.initialize( ellset, start_point, end_point, tmax )[0] - best = oph.init_replacement( ellset, start_point, end_point, tmax )[0] - if debug: - print ('best:') - print (best) - print ('best real reward:') - print ([ x[3] for x in best ]) - print (len( s )) - print ([ s[ x[3] - 2 ] for x in best[ 1:len( best ) - 1 ] ]) - print ([ s[ x[3] - 2 ][2] for x in best[ 1:len( best ) - 1 ] ]) - print (( sum( [ s[ x[3] - 2 ][2] for x in best[ 1:len( best ) - 1 ] ] ), best )) - return ( sum( [ s[ x[3] - 2 ][2] for x in best[ 1:len( best ) - 1 ] ] ), best ) - -def crossover( c1, c2 ): - assert( len( c1 ) == len( c2 ) ) - point = random.randrange( len( c1 ) ) - first = random.randrange( 2 ) - if( first ): - return c1[:point] + c2[point:] - else: - return c2[:point] + c1[point:] - -def mutate( chrom, mchance, msigma ): - return [ x + random.gauss( 0, msigma ) if random.randrange( mchance ) == 0 else - x for x in chrom ] - -def run_alg_f( f, tmax, N ): - random.seed() - cpoints = [] - an_unused_value = f.readline() # ignore first line of file - for i in range( N ): - cpoints.append( tuple( [ float( x ) for x in f.readline().split() ] ) ) - if debug: - print ('N: ', N) - return run_alg(cpoints, tmax) - -def run_alg(points, tmax, return_sol=False, verbose=True): - cpoints = [tuple(p) + (i, 0) for i, p in enumerate(points)] - start_point = cpoints.pop( 0 ) - end_point = cpoints.pop( 0 ) - assert( oph.distance( start_point, end_point ) < tmax ) - popsize = 10 - genlimit = 10 - kt = 5 - isigma = 10 - msigma = 7 - mchance = 2 - elitismn = 2 - if( debug ): - print ('data set size:', len( cpoints ) + 2) - print ('tmax: ', tmax) - print ('parameters:') - print ('generations: ', genlimit) - print ('population size: ', popsize) - print ('ktournament size:', kt) - print ('mutation chance: ', mchance) - print (str( elitismn ) + '-elitism') - - start_time = time.clock() - #generate initial random population - pop = [] - for i in range( popsize + elitismn ): - chrom = [] - for j in range( len( cpoints ) ): - chrom.append( random.gauss( 0, isigma ) ) - chrom = ( fitness( chrom, cpoints, start_point, end_point, tmax )[0], chrom ) - while( i - j > 0 and j < elitismn and chrom > pop[ i - 1 - j ] ): - j += 1 - pop.insert( i - j, chrom ) - - bestfit = 0 - for i in range( genlimit ): - nextgen = [] - for j in range( popsize ): - #select parents in k tournaments - parents = sorted( random.sample( pop, kt ) )[ kt - 2: ] #optimize later - #crossover and mutate - offspring = mutate( crossover( parents[0][1], parents[1][1] ), mchance, msigma ) - offspring = ( fitness( offspring, cpoints, start_point, end_point, tmax )[0], offspring ) - if( offspring[0] > bestfit ): - bestfit = offspring[0] - if verbose: - print (bestfit) - if( elitismn > 0 and offspring > pop[ popsize ] ): - l = 0 - while( l < elitismn and offspring > pop[ popsize + l ] ): - l += 1 - pop.insert( popsize + l, offspring ) - nextgen.append( pop.pop( popsize ) ) - else: - nextgen.append( offspring ) - pop = nextgen + pop[ 
popsize: ] - - bestchrom = sorted( pop )[ popsize + elitismn - 1 ] - end_time = time.clock() - - if verbose: - print ('time:') - print (end_time - start_time) - print ('best fitness:') - print (bestchrom[0]) - print ('best path:') - best_path = fitness( bestchrom[1], cpoints, start_point, end_point, tmax )[1] - if verbose: - print ([ x[3] for x in best_path ]) - - print ('their stuff:') - stuff = oph.initialize( oph.ell_sub( tmax, start_point, end_point, cpoints ) - , start_point, end_point, tmax )[0] - if verbose: - print ('fitness:', sum( [ x[2] for x in stuff ] )) - print ('my stuff:') - stuff2 = oph.ellinit_replacement( cpoints, start_point, end_point, tmax ) - if verbose: - print ('fitness:', sum( [ x[2] for x in stuff2 ] )) - print ('checking correctness...') - total_distance = ( oph.distance( start_point, cpoints[ best_path[ 1 ][3] - 2 ] ) + - oph.distance( end_point, cpoints[ best_path[ len( best_path ) - 2 ][3] - 2 ] ) ) - for i in range( 1, len( best_path ) - 3 ): - total_distance += oph.distance( cpoints[ best_path[ i ][3] - 2 ], - cpoints[ best_path[ i + 1 ][3] - 2 ] ) - if verbose: - print ('OK' if total_distance <= tmax else 'not OK') - print ('tmax: ', tmax) - print ('total distance:', total_distance) - if return_sol: - return ( bestchrom[0], best_path, end_time - start_time ) - return ( bestchrom[0], end_time - start_time ) - -if( __name__ == '__main__' ): - debug = True if 'd' in sys.argv else False - run_alg( open( sys.argv[1] ), int( sys.argv[2] ), int( sys.argv[3] ) ) -else: - debug = False diff --git a/AM/problems/op/opga/oph.py b/AM/problems/op/opga/oph.py deleted file mode 100644 index a7c020e..0000000 --- a/AM/problems/op/opga/oph.py +++ /dev/null @@ -1,131 +0,0 @@ -import math - -def distance( p1, p2 ): - return math.sqrt( ( p1[0] - p2[0] ) ** 2 + ( p1[1] - p2[1] ) ** 2 ) - -#returns a path (list of points) through s with high value -def ellinit_replacement( s1, start_point, end_point, tmax ): - s = list( s1 ) - path = [ start_point, end_point ] - length = distance( start_point, end_point ) - found = True - while( found == True and len( s ) > 0 ): - min_added_length = -1 - max_added_reward = 0 - for j in range( len( s ) ): - for k in range( len( path ) - 1 ): - added_length = ( distance( path[ k ], s[ j ] ) + - distance( path[ k + 1 ], s[ j ] ) - - distance( path[ k ], path[ k + 1 ] ) ) # optimize later - if( length + added_length < tmax and s[ j ][2] > max_added_reward ): - min_added_length = added_length - max_added_reward = s[ j ][2] - minpoint = j - pathpoint = k + 1 - if( min_added_length > 0 ): - #add to path - path.insert( pathpoint, s.pop( minpoint ) ) - length = length + min_added_length - else: - found = False - return path - -#returns a list of L paths with the best path in the first position -#by weight rather than length -def init_replacement( s1, start_point, end_point, tmax ): - s = list( s1 ) - L = len( s ) if len( s ) <= 10 else 10 - if( L == 0 ): - #print 'something is probably wrong' - #actually maybe not - return [ [ start_point, end_point ] ] - - #decorate and sort by weight - dsub = sorted( [ ( x[4], x ) for x in s ] )[::-1] #this is different - ls = dsub[ :L ] - rest = dsub[ L: ] - paths = [] - for i in range( L ): - path = [ start_point, ls[ i ][1] , end_point ] - length = distance( path[0], path[1] ) + distance( path[1], path[2] ) - assert( length < tmax ) - arest = ls[ :i ] + ls[ i + 1: ] + rest - arest = [ x[1] for x in arest ] #undecorate - assert( len( arest ) + len( path ) == len( s ) + 2 ) - found = True - while( found == True and 
len( arest ) > 0 ): - min_added_length = -1 - max_weight = 0 - for j in range( len( arest ) ): - for k in range( len( path ) - 1 ): - added_length = ( distance( path[ k ], arest[ j ] ) + - distance( path[ k + 1 ], arest[ j ] ) - - distance( path[ k ], path[ k + 1 ] ) ) # optimize later - if( length + added_length < tmax and arest[ j ][4] < max_weight ): - min_added_length = added_length - max_weight = arest[ j ][4] - minpoint = j - pathpoint = k + 1 - if( min_added_length > 0 ): - #add to path - path.insert( pathpoint, arest.pop( minpoint ) ) - length = length + min_added_length - else: - found = False - if( length < tmax ): - paths.append( path ) - - assert( len( paths ) > 0 ) - return [ x[1] for x in sorted( [ ( sum( [ y[2] for y in z ] ), z ) for z in paths ] )[::-1] ] - - -#returns the subset of s that is on/in the ellipse defined by foci f1, f2 and the major axis -def ell_sub( axis, f1, f2, s ): - result = [] - for item in s: - if( distance( item, f1 ) + distance( item, f2 ) <= axis ): - result.append( item ) - return result - -#returns a list of L paths with the best path in the first position -def initialize( s, start_point, end_point, tmax ): - L = len( s ) if len( s ) <= 10 else 10 - if( L == 0 ): - return [ [ start_point, end_point ] ] - - dsub = sorted( [ ( distance( x, start_point ) + distance( x, end_point ), x ) for x in s ] - )[::-1] #optimize later - ls = dsub[ :L ] - rest = dsub[ L: ] - paths = [] - for i in range( L ): - path = [ start_point, ls[ i ][1] , end_point ] - length = ls[ i ][0] - assert( length == distance( path[0], path[1] ) + distance( path[1], path[2] ) ) - arest = ls[ :i ] + ls[ i + 1: ] + rest - arest = [ x[1] for x in arest ] #undecorate - assert( len( arest ) + len( path ) == len( s ) + 2 ) - found = True - while( found == True and len( arest ) > 0 ): - min_added = -1 - for j in range( len( arest ) ): - for k in range( len( path ) - 1 ): - added_length = ( distance( path[ k ], arest[ j ] ) + - distance( path[ k + 1 ], arest[ j ] ) - - distance( path[ k ], path[ k + 1 ] ) ) # optimize later - if( length + added_length < tmax and ( added_length < min_added or min_added < 0 ) ): - min_added = added_length - minpoint = j - pathpoint = k + 1 - if( min_added > 0 ): - #add to path - path.insert( pathpoint, arest.pop( minpoint ) ) - length = length + min_added - else: - found = False - paths.append( path ) - - assert( len( [ x[1] for x in sorted( [ ( sum( [ y[2] for y in z ] ), z ) for z in paths ] - )[::-1] ] ) > 0 ) - return [ x[1] for x in sorted( [ ( sum( [ y[2] for y in z ] ), z ) for z in paths ] )[::-1] ] - diff --git a/AM/problems/op/opga/optest.py b/AM/problems/op/opga/optest.py deleted file mode 100644 index a05e9e6..0000000 --- a/AM/problems/op/opga/optest.py +++ /dev/null @@ -1,33 +0,0 @@ -import time -import opevo - -files = [ 'test instances/set_64_1_15.txt' ] -tmaxs = [ range( 15, 80 + 1, 5 ) ] -Ns = [ 64 ] - -test_runs = 30 - -assert( len( files ) == len( tmaxs ) and len( tmaxs ) == len( Ns ) ) - -for i in range( len( files ) ): - f = open( files[ i ] ) - of = open( files[ i ][ :len( files[ i ] ) - 4 ] + '_results.dat', 'a' ) - - of.write( time.asctime() + '\n' ) - of.write( 't avgfit avgtime bestfit\n' ) - for t in tmaxs[ i ]: - fit_sum = float( 0 ) - time_sum = float( 0 ) - best_fit = 0 - for j in range( test_runs ): - print('TEST %i/%i' % ( j + 1, test_runs )) - f.seek( 0 ) - result = opevo.run_alg_f( f, t, Ns[ i ] ) - fit_sum += result[0] - time_sum += result[1] - best_fit = result[0] if result[0] > best_fit else best_fit - #find avg fit, 
time, best fit then write to file - of.write( ' '.join( [ str( x ) for x in [ t, fit_sum / test_runs, time_sum / test_runs, - best_fit ] ] ) + '\n' ) - f.close() - of.close() diff --git a/AM/problems/op/opga/test instances/set_64_1_15.txt b/AM/problems/op/opga/test instances/set_64_1_15.txt deleted file mode 100644 index 9fc9247..0000000 --- a/AM/problems/op/opga/test instances/set_64_1_15.txt +++ /dev/null @@ -1,65 +0,0 @@ -15 1 -0.000 -7.000 0 -0.000 7.000 0 --1.000 -6.000 6 -1.000 -6.000 6 --2.000 -5.000 12 -0.000 -5.000 6 -2.000 -5.000 12 --3.000 -4.000 18 --1.000 -4.000 12 -1.000 -4.000 12 -3.000 -4.000 18 --4.000 -3.000 24 --2.000 -3.000 18 -0.000 -3.000 12 -2.000 -3.000 18 -4.000 -3.000 24 --5.000 -2.000 30 --3.000 -2.000 24 --1.000 -2.000 18 -1.000 -2.000 18 -3.000 -2.000 24 -5.000 -2.000 30 --6.000 -1.000 36 --4.000 -1.000 30 --2.000 -1.000 24 -0.000 -1.000 18 -2.000 -1.000 24 -4.000 -1.000 30 -6.000 -1.000 36 --7.000 0.000 42 --5.000 0.000 36 --3.000 0.000 30 --1.000 0.000 24 -1.000 0.000 24 -3.000 0.000 30 -5.000 0.000 36 -7.000 0.000 42 --6.000 1.000 36 --4.000 1.000 30 --2.000 1.000 24 -0.000 1.000 18 -2.000 1.000 24 -4.000 1.000 30 -6.000 1.000 36 --5.000 2.000 30 --3.000 2.000 24 --1.000 2.000 18 -1.000 2.000 18 -3.000 2.000 24 -5.000 2.000 30 --4.000 3.000 24 --2.000 3.000 18 -0.000 3.000 12 -2.000 3.000 18 -4.000 3.000 24 --3.000 4.000 18 --1.000 4.000 12 -1.000 4.000 12 -3.000 4.000 18 --2.000 5.000 12 -0.000 5.000 6 -2.000 5.000 12 --1.000 6.000 6 -1.000 6.000 6 diff --git a/AM/problems/op/problem_op.py b/AM/problems/op/problem_op.py deleted file mode 100644 index a0d2559..0000000 --- a/AM/problems/op/problem_op.py +++ /dev/null @@ -1,141 +0,0 @@ -from torch.utils.data import Dataset -import torch -import os -import pickle -from problems.op.state_op import StateOP -from utils.beam_search import beam_search - - -class OP(object): - - NAME = 'op' # Orienteering problem - - @staticmethod - def get_costs(dataset, pi): - if pi.size(-1) == 1: # In case all tours directly return to depot, prevent further problems - assert (pi == 0).all(), "If all length 1 tours, they should be zero" - # Return - return torch.zeros(pi.size(0), dtype=torch.float, device=pi.device), None - - # Check that tours are valid, i.e. 
contain 0 to n -1 - sorted_pi = pi.data.sort(1)[0] - # Make sure each node visited once at most (except for depot) - assert ((sorted_pi[:, 1:] == 0) | (sorted_pi[:, 1:] > sorted_pi[:, :-1])).all(), "Duplicates" - - prize_with_depot = torch.cat( - ( - torch.zeros_like(dataset['prize'][:, :1]), - dataset['prize'] - ), - 1 - ) - p = prize_with_depot.gather(1, pi) - - # Gather dataset in order of tour - loc_with_depot = torch.cat((dataset['depot'][:, None, :], dataset['loc']), 1) - d = loc_with_depot.gather(1, pi[..., None].expand(*pi.size(), loc_with_depot.size(-1))) - - length = ( - (d[:, 1:] - d[:, :-1]).norm(p=2, dim=-1).sum(1) # Prevent error if len 1 seq - + (d[:, 0] - dataset['depot']).norm(p=2, dim=-1) # Depot to first - + (d[:, -1] - dataset['depot']).norm(p=2, dim=-1) # Last to depot, will be 0 if depot is last - ) - assert (length <= dataset['max_length'] + 1e-5).all(), \ - "Max length exceeded by {}".format((length - dataset['max_length']).max()) - - # We want to maximize total prize but code minimizes so return negative - return -p.sum(-1), None - - @staticmethod - def make_dataset(*args, **kwargs): - return OPDataset(*args, **kwargs) - - @staticmethod - def make_state(*args, **kwargs): - return StateOP.initialize(*args, **kwargs) - - @staticmethod - def beam_search(input, beam_size, expand_size=None, - compress_mask=False, model=None, max_calc_batch_size=4096): - - assert model is not None, "Provide model" - - fixed = model.precompute_fixed(input) - - def propose_expansions(beam): - return model.propose_expansions( - beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size - ) - - state = OP.make_state( - input, visited_dtype=torch.int64 if compress_mask else torch.uint8 - ) - - return beam_search(state, beam_size, propose_expansions) - - -def generate_instance(size, prize_type): - # Details see paper - MAX_LENGTHS = { - 20: 2., - 50: 3., - 100: 4. - } - - loc = torch.FloatTensor(size, 2).uniform_(0, 1) - depot = torch.FloatTensor(2).uniform_(0, 1) - # Methods taken from Fischetti et al. 1998 - if prize_type == 'const': - prize = torch.ones(size) - elif prize_type == 'unif': - prize = (1 + torch.randint(0, 100, size=(size, ))) / 100. - else: # Based on distance to depot - assert prize_type == 'dist' - prize_ = (depot[None, :] - loc).norm(p=2, dim=-1) - prize = (1 + (prize_ / prize_.max(dim=-1, keepdim=True)[0] * 99).int()).float() / 100. 
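
For reference, a self-contained NumPy sketch of the three prize schemes handled just above ('const', 'unif', and the distance-based 'dist' variant following Fischetti et al. 1998). The function name, the use of NumPy instead of torch, and the seeding are illustrative assumptions, not part of the deleted module.

import numpy as np

def sample_op_instance(size, prize_type, max_length, rng=None):
    """Toy OP instance using the same prize conventions as the deleted generate_instance."""
    rng = np.random.default_rng(rng)
    loc = rng.uniform(0.0, 1.0, size=(size, 2))    # node coordinates in the unit square
    depot = rng.uniform(0.0, 1.0, size=2)
    if prize_type == 'const':
        prize = np.ones(size)                       # every node is worth the same
    elif prize_type == 'unif':
        prize = (1 + rng.integers(0, 100, size=size)) / 100.0   # uniform over {0.01, ..., 1.00}
    else:                                           # 'dist': prize grows with distance from the depot
        d = np.linalg.norm(loc - depot[None, :], axis=-1)
        prize = (1 + (d / d.max() * 99).astype(int)) / 100.0    # farthest node gets prize 1.00
    return {'loc': loc, 'depot': depot, 'prize': prize, 'max_length': max_length}

inst = sample_op_instance(20, 'dist', max_length=2.0, rng=0)
print(round(inst['prize'].min(), 2), round(inst['prize'].max(), 2))   # prizes lie in (0, 1]
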
- - return { - 'loc': loc, - # Uniform 1 - 9, scaled by capacities - 'prize': prize, - 'depot': depot, - 'max_length': torch.tensor(MAX_LENGTHS[size]) - } - - -class OPDataset(Dataset): - - def __init__(self, filename=None, size=50, num_samples=1000000, offset=0, distribution='const'): - super(OPDataset, self).__init__() - assert distribution is not None, "Data distribution must be specified for OP" - # Currently the distribution can only vary in the type of the prize - prize_type = distribution - - self.data_set = [] - if filename is not None: - assert os.path.splitext(filename)[1] == '.pkl' - - with open(filename, 'rb') as f: - data = pickle.load(f) - self.data = [ - { - 'loc': torch.FloatTensor(loc), - 'prize': torch.FloatTensor(prize), - 'depot': torch.FloatTensor(depot), - 'max_length': torch.tensor(max_length) - } - for depot, loc, prize, max_length in (data[offset:offset+num_samples]) - ] - else: - self.data = [ - generate_instance(size, prize_type) - for i in range(num_samples) - ] - - self.size = len(self.data) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.data[idx] diff --git a/AM/problems/op/state_op.py b/AM/problems/op/state_op.py deleted file mode 100644 index c1c6db8..0000000 --- a/AM/problems/op/state_op.py +++ /dev/null @@ -1,159 +0,0 @@ -import torch -from typing import NamedTuple -from utils.boolmask import mask_long2bool, mask_long_scatter -import torch.nn.functional as F - - -class StateOP(NamedTuple): - # Fixed input - coords: torch.Tensor # Depot + loc - prize: torch.Tensor - # Max length is not a single value, but one for each node indicating max length tour should have when arriving - # at this node, so this is max_length - d(depot, node) - max_length: torch.Tensor - - # If this state contains multiple copies (i.e. beam search) for the same instance, then for memory efficiency - # the coords and prizes tensors are not kept multiple times, so we need to use the ids to index the correct rows. 
- ids: torch.Tensor # Keeps track of original fixed data index of rows - - # State - prev_a: torch.Tensor - visited_: torch.Tensor # Keeps track of nodes that have been visited - lengths: torch.Tensor - cur_coord: torch.Tensor - cur_total_prize: torch.Tensor - i: torch.Tensor # Keeps track of step - - @property - def visited(self): - if self.visited_.dtype == torch.uint8: - return self.visited_ - else: - return mask_long2bool(self.visited_, n=self.coords.size(-2)) - - @property - def dist(self): - return (self.coords[:, :, None, :] - self.coords[:, None, :, :]).norm(p=2, dim=-1) - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - ids=self.ids[key], - prev_a=self.prev_a[key], - visited_=self.visited_[key], - lengths=self.lengths[key], - cur_coord=self.cur_coord[key], - cur_total_prize=self.cur_total_prize[key], - ) - - # Warning: cannot override len of NamedTuple, len should be number of fields, not batch size - # def __len__(self): - # return len(self.used_capacity) - - @staticmethod - def initialize(input, visited_dtype=torch.uint8): - depot = input['depot'] - loc = input['loc'] - prize = input['prize'] - max_length = input['max_length'] - - batch_size, n_loc, _ = loc.size() - coords = torch.cat((depot[:, None, :], loc), -2) - return StateOP( - coords=coords, - prize=F.pad(prize, (1, 0), mode='constant', value=0), # add 0 for depot - # max_length is max length allowed when arriving at node, so subtract distance to return to depot - # Additionally, substract epsilon margin for numeric stability - max_length=max_length[:, None] - (depot[:, None, :] - coords).norm(p=2, dim=-1) - 1e-6, - ids=torch.arange(batch_size, dtype=torch.int64, device=loc.device)[:, None], # Add steps dimension - prev_a=torch.zeros(batch_size, 1, dtype=torch.long, device=loc.device), - visited_=( # Visited as mask is easier to understand, as long more memory efficient - # Keep visited_ with depot so we can scatter efficiently (if there is an action for depot) - torch.zeros( - batch_size, 1, n_loc + 1, - dtype=torch.uint8, device=loc.device - ) - if visited_dtype == torch.uint8 - else torch.zeros(batch_size, 1, (n_loc + 1 + 63) // 64, dtype=torch.int64, device=loc.device) # Ceil - ), - lengths=torch.zeros(batch_size, 1, device=loc.device), - cur_coord=input['depot'][:, None, :], # Add step dimension - cur_total_prize=torch.zeros(batch_size, 1, device=loc.device), - i=torch.zeros(1, dtype=torch.int64, device=loc.device) # Vector with length num_steps - ) - - def get_remaining_length(self): - # max_length[:, 0] is max length arriving at depot so original max_length - return self.max_length[self.ids, 0] - self.lengths - - def get_final_cost(self): - - assert self.all_finished() - # The cost is the negative of the collected prize since we want to maximize collected prize - return -self.cur_total_prize - - def update(self, selected): - - assert self.i.size(0) == 1, "Can only update if state represents single step" - - # Update the state - selected = selected[:, None] # Add dimension for step - prev_a = selected - - # Add the length - cur_coord = self.coords[self.ids, selected] - lengths = self.lengths + (cur_coord - self.cur_coord).norm(p=2, dim=-1) # (batch_dim, 1) - - # Add the collected prize - cur_total_prize = self.cur_total_prize + self.prize[self.ids, selected] - - if self.visited_.dtype == torch.uint8: - # Note: here we do not subtract one as we have to scatter so the first column allows scattering depot - # Add 
one dimension since we write a single value - visited_ = self.visited_.scatter(-1, prev_a[:, :, None], 1) - else: - # This works, by check_unset=False it is allowed to set the depot visited a second a time - visited_ = mask_long_scatter(self.visited_, prev_a, check_unset=False) - - return self._replace( - prev_a=prev_a, visited_=visited_, - lengths=lengths, cur_coord=cur_coord, cur_total_prize=cur_total_prize, i=self.i + 1 - ) - - def all_finished(self): - # All must be returned to depot (and at least 1 step since at start also prev_a == 0) - # This is more efficient than checking the mask - return self.i.item() > 0 and (self.prev_a == 0).all() - # return self.visited[:, :, 0].all() # If we have visited the depot we're done - - def get_current_node(self): - """ - Returns the current node where 0 is depot, 1...n are nodes - :return: (batch_size, num_steps) tensor with current nodes - """ - return self.prev_a - - def get_mask(self): - """ - Gets a (batch_size, n_loc + 1) mask with the feasible actions (0 = depot), depends on already visited and - remaining capacity. 0 = feasible, 1 = infeasible - Forbids to visit depot twice in a row, unless all nodes have been visited - :return: - """ - - exceeds_length = ( - self.lengths[:, :, None] + (self.coords[self.ids, :, :] - self.cur_coord[:, :, None, :]).norm(p=2, dim=-1) - > self.max_length[self.ids, :] - ) - # Note: this always allows going to the depot, but that should always be suboptimal so be ok - # Cannot visit if already visited or if length that would be upon arrival is too large to return to depot - # If the depot has already been visited then we cannot visit anymore - visited_ = self.visited.to(exceeds_length.dtype) - mask = visited_ | visited_[:, :, 0:1] | exceeds_length - # Depot can always be visited - # (so we do not hardcode knowledge that this is strictly suboptimal if other options are available) - mask[:, :, 0] = 0 - return mask - - def construct_solutions(self, actions): - return actions diff --git a/AM/problems/op/tsiligirides.py b/AM/problems/op/tsiligirides.py deleted file mode 100644 index 5b45448..0000000 --- a/AM/problems/op/tsiligirides.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -from problems.op.state_op import StateOP - - -def op_tsiligirides(batch, sample=False, power=4.0): - state = StateOP.initialize(batch) - - all_a = [] - while not state.all_finished(): - # Compute scores - mask = state.get_mask() - p = ( - (mask[..., 1:] == 0).float() * - state.prize[state.ids, 1:] / - ((state.coords[state.ids, 1:, :] - state.cur_coord[:, :, None, :]).norm(p=2, dim=-1) + 1e-6) - ) ** power - bestp, besta = p.topk(4, dim=-1) - bestmask = mask[..., 1:].gather(-1, besta) - - # If no feasible actions, must go to depot - # mask == 0 means feasible, so if mask == 0 sums to 0 there are no feasible and - # all corresponding ps should be 0, so we need to add a column with a 1 that corresponds - # to selecting the end destination - to_depot = ((bestmask == 0).sum(-1, keepdim=True) == 0).float() - # best_p should be zero if we have to go to depot, but because of numeric stabilities, it isn't - p_ = torch.cat((to_depot, bestp), -1) - pnorm = p_ / p_.sum(-1, keepdim=True) - - if sample: - a = pnorm[:, 0, :].multinomial(1) # Sample action - else: - # greedy - a = pnorm[:, 0, :].max(-1)[1].unsqueeze(-1) # Add 'sampling dimension' - - # a == 0 means depot, otherwise subtract one - final_a = torch.cat((torch.zeros_like(besta[..., 0:1]), besta + 1), -1)[:, 0, :].gather(-1, a) - - selected = final_a[..., 0] # Squeeze unnecessary sampling 
dimension - state = state.update(selected) - all_a.append(selected) - return torch.stack(all_a, -1) - diff --git a/AM/problems/pctsp/PCTSP/.gitignore b/AM/problems/pctsp/PCTSP/.gitignore deleted file mode 100644 index 12ec999..0000000 --- a/AM/problems/pctsp/PCTSP/.gitignore +++ /dev/null @@ -1,332 +0,0 @@ -## Mac .DS_Store -.DS_Store - -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. -## -## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore - -# User-specific files -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUNIT -*.VisualState.xml -TestResult.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_i.h -*.ilk -*.meta -*.obj -*.iobj -*.pch -*.pdb -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# JustCode is a .NET coding add-in -.JustCode - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* -!.axoCover/settings.json - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. 
-!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
-*.vbw - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# JetBrains Rider -.idea/ -*.sln.iml - -# CodeRush -.cr/ - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ diff --git a/AM/problems/pctsp/PCTSP/Instances/problem_20_100_100_1000.pctsp b/AM/problems/pctsp/PCTSP/Instances/problem_20_100_100_1000.pctsp deleted file mode 100644 index 1945fae..0000000 --- a/AM/problems/pctsp/PCTSP/Instances/problem_20_100_100_1000.pctsp +++ /dev/null @@ -1,27 +0,0 @@ - - 0 10 17 4 13 11 4 31 85 62 53 59 90 19 82 25 52 67 86 2 - - - 1000000 57 70 43 55 35 50 40 77 16 21 8 45 10 94 92 57 58 8 9 - - - 0 274 163 189 282 865 187 563 639 364 267 730 113 95 994 363 79 272 283 572 - 274 0 978 857 422 812 397 80 751 559 286 798 940 254 888 677 726 196 551 648 - 163 978 0 102 441 458 720 123 1000 605 850 442 175 675 143 701 99 292 726 151 - 189 857 102 0 382 927 43 953 680 839 478 953 412 21 344 184 129 926 234 673 - 282 422 441 382 0 546 89 895 109 23 425 80 419 669 242 114 588 88 737 826 - 865 812 458 927 546 0 413 146 126 633 765 286 355 469 62 211 951 316 168 320 - 187 397 720 43 89 413 0 458 714 815 388 30 440 570 258 578 980 949 73 290 - 563 80 123 953 895 146 458 0 709 627 583 561 82 249 956 869 81 941 742 949 - 639 751 1000 680 109 126 714 709 0 993 883 584 267 431 413 80 38 680 798 710 - 364 559 605 839 23 633 815 627 993 0 978 156 397 146 183 246 245 575 147 698 - 267 286 850 478 425 765 388 583 883 978 0 603 610 740 582 546 172 121 307 787 - 730 798 442 953 80 286 30 561 584 156 603 0 734 189 324 55 802 862 114 753 - 113 940 175 412 419 355 440 82 267 397 610 734 0 598 836 884 467 771 721 87 - 95 254 675 21 669 469 570 249 431 146 740 189 598 0 138 2 365 457 537 702 - 994 888 143 344 242 62 258 956 413 183 582 324 836 138 0 336 990 679 241 88 - 363 677 701 184 114 211 578 869 80 246 546 55 884 2 336 0 189 824 224 876 - 79 726 99 129 588 951 980 81 38 245 172 802 467 365 990 189 0 99 264 652 - 272 196 292 926 88 316 949 941 680 575 121 862 771 457 679 824 99 0 760 347 - 283 551 726 234 737 168 73 742 798 147 307 114 721 537 241 224 264 760 0 845 - 572 648 151 673 826 320 290 949 710 698 787 753 87 702 88 876 652 347 845 0 diff --git a/AM/problems/pctsp/PCTSP/PCPTSP/main.cpp b/AM/problems/pctsp/PCTSP/PCPTSP/main.cpp deleted file mode 100644 index a6fd97c..0000000 --- a/AM/problems/pctsp/PCTSP/PCPTSP/main.cpp +++ /dev/null @@ -1,616 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -struct Params { - vector> vertices; - vector> distanceMatrix; - vector visited; - unsigned int minTotalPrize; - vector> orderSwapTwoOpt; - - Params() { - minTotalPrize = 0; - } -}; - -struct Solution { - vector route; - double cost; - double penalty; - unsigned int 
prize; - - Solution() { - cost = numeric_limits::infinity(); - penalty = numeric_limits::infinity(); - prize = 0; - } -}; - -vector openFile(unsigned int size, unsigned int maxPrize, unsigned int maxPenalty, unsigned int maxCost) { - - vector lines; - string file = "../Instances/problem_" + to_string(size) + "_" + to_string(maxPrize) + "_" + to_string(maxPenalty) + "_" + to_string(maxCost) + ".pctsp"; - ifstream inFile(file); - - if (inFile.good()) { - string sLine; - - unsigned int j = 0; - while (getline(inFile, sLine)) { - if (sLine.length() != 0 && sLine != "\r") { - lines.push_back(sLine); - j++; - } - } - } - inFile.close(); - return lines; -} - -vector openFileName(string file) { - - vector lines; - ifstream inFile(file); - - if (inFile.good()) { - string sLine; - - unsigned int j = 0; - while (getline(inFile, sLine)) { - if (sLine.length() != 0 && sLine != "\r") { - lines.push_back(sLine); - j++; - } - } - } - inFile.close(); - return lines; -} - -void createGraph(const vector &lines, Params *params) { - - istringstream iss(lines[0]); - vector tokens{ istream_iterator{iss}, istream_iterator{} }; - for (unsigned int i = 0; i < tokens.size(); i++) { - params->vertices.push_back({ (unsigned int)stoi(tokens[i]) }); - } - - istringstream iss2(lines[1]); - vector tokens2{ istream_iterator{iss2}, istream_iterator{} }; - for (unsigned int i = 0; i < tokens2.size(); i++) { - params->vertices[i].push_back((unsigned int)stoi(tokens2[i])); - } - - for (unsigned int i = 2; i < lines.size(); i++) { - params->distanceMatrix.emplace_back(); - istringstream iss3(lines[i]); - vector tokens3{ istream_iterator{iss3}, istream_iterator{} }; - for (unsigned int j = 0; j < tokens.size(); j++) { - params->distanceMatrix[i - 2].push_back((unsigned int)stoi(tokens3[j])); - } - } -} - -int genRandom(int i) { - return std::rand() % i; -} - -void randomConst(Params *params, Solution *solution) { - - vector selected; - for (unsigned int i = 1; i < params->vertices.size(); i++) { - params->visited[i] = false; - selected.push_back(i); - } - random_shuffle(selected.begin(), selected.end(), genRandom); - - solution->cost = 0; - solution->penalty = 0; - solution->route.push_back(0); - params->visited[0] = true; - unsigned int i = 0; - while (solution->prize < params->minTotalPrize) { - solution->cost += params->distanceMatrix[solution->route.back()][selected[i]]; - solution->prize += params->vertices[selected[i]][0]; - params->visited[selected[i]] = true; - solution->route.push_back(selected[i]); - i++; - } - solution->cost += params->distanceMatrix[solution->route.back()][0]; - solution->route.push_back(0); - for (unsigned int i = 0; i < params->visited.size(); i++) { - if (!params->visited[i]) - solution->penalty += params->vertices[i][1]; - } -} - -void greedyRandomizedConst(Params *params, Solution *solution) { - - double minCost, currentCost; - unsigned int solutionSize, selectedPos; - auto *solutionCandidate = new Solution(); - vector rlc; - - for (unsigned int i = 0; i < 10; i++) { - solutionCandidate->prize = 0; - solutionCandidate->cost = 0; - solutionCandidate->route = {}; - solutionCandidate->route.push_back(0); - solutionSize = (unsigned int)((i + 1) * params->vertices.size() / 10); - params->visited[0] = true; - for (unsigned int j = 1; j < params->vertices.size(); j++) - params->visited[j] = false; - - while (solutionCandidate->prize < params->minTotalPrize || solutionCandidate->route.size() < solutionSize) { - minCost = numeric_limits::infinity(); - for (unsigned int j = 0; j < params->vertices.size(); 
j++) { - if (!params->visited[j]) { - currentCost = params->distanceMatrix[solutionCandidate->route.back()][j]; - if (currentCost < minCost) - minCost = currentCost; - } - } - rlc = {}; - for (unsigned int j = 0; j < params->vertices.size(); j++) { - if (!params->visited[j] && (params->distanceMatrix[solutionCandidate->route.back()][j] <= 1.2 * minCost)) - rlc.push_back(j); - } - - selectedPos = (unsigned int)(rand() % rlc.size()); - selectedPos = rlc[selectedPos]; - params->visited[selectedPos] = true; - solutionCandidate->prize += params->vertices[selectedPos][0]; - solutionCandidate->cost += params->distanceMatrix[solutionCandidate->route.back()][selectedPos]; - solutionCandidate->route.push_back(selectedPos); - } - - solutionCandidate->penalty = 0; - for (unsigned int j = 0; j < params->visited.size(); j++) { - if (!params->visited[j]) - solutionCandidate->penalty += params->vertices[j][1]; - } - - solutionCandidate->cost += params->distanceMatrix[solutionCandidate->route.back()][0]; - solutionCandidate->route.push_back(0); - if (solutionCandidate->cost + solutionCandidate->penalty < solution->cost + solution->penalty) { - solution->cost = solutionCandidate->cost; - solution->penalty = solutionCandidate->penalty; - solution->prize = solutionCandidate->prize; - solution->route = solutionCandidate->route; - } - } - delete solutionCandidate; -} - -bool addNode(Params *params, Solution *solution, Solution *bestSolution) { - double modifiedCost, modifiedPenalty; - unsigned int modifiedPrize; - int iBest = -1, jBest = -1; - - for (unsigned int i = 1; i < params->vertices.size() - 1; i++) { - if (!params->visited[i]) { - for (unsigned int j = 1; j < solution->route.size(); j++) { - modifiedCost = solution->cost - params->distanceMatrix[solution->route[j - 1]][solution->route[j]] + - params->distanceMatrix[solution->route[j - 1]][i] + params->distanceMatrix[i][solution->route[j]]; - modifiedPenalty = solution->penalty - params->vertices[i][1]; - modifiedPrize = solution->prize + params->vertices[i][0]; - - if ((modifiedCost + modifiedPenalty) < (bestSolution->cost + bestSolution->penalty)) { - iBest = i; - jBest = j; - bestSolution->cost = modifiedCost; - bestSolution->penalty = modifiedPenalty; - bestSolution->prize = modifiedPrize; - } - } - } - } - if (iBest != -1) { - solution->cost = bestSolution->cost; - solution->penalty = bestSolution->penalty; - solution->prize = bestSolution->prize; - solution->route.insert(solution->route.begin() + jBest, (unsigned int)iBest); - params->visited[iBest] = true; - return true; - } - return false; -} - -bool removeNode(Params *params, Solution *solution, Solution *bestSolution) { - - double modifiedCost, modifiedPenalty; - unsigned int modifiedPrize; - int iBest = -1; - - for (unsigned int i = 1; i < solution->route.size() - 1; i++) { - modifiedCost = solution->cost - params->distanceMatrix[solution->route[i - 1]][solution->route[i]] - - params->distanceMatrix[solution->route[i]][solution->route[i + 1]] + - params->distanceMatrix[solution->route[i - 1]][solution->route[i + 1]]; - modifiedPenalty = solution->penalty + params->vertices[solution->route[i]][1]; - modifiedPrize = solution->prize - params->vertices[solution->route[i]][0]; - - if ((modifiedPrize >= params->minTotalPrize) && (modifiedCost + modifiedPenalty < bestSolution->cost + bestSolution->penalty)) { - iBest = i; - bestSolution->cost = modifiedCost; - bestSolution->penalty = modifiedPenalty; - bestSolution->prize = modifiedPrize; - } - } - if (iBest != -1) { - solution->cost = 
bestSolution->cost; - solution->penalty = bestSolution->penalty; - solution->prize = bestSolution->prize; - params->visited[solution->route[iBest]] = false; - solution->route.erase(solution->route.begin() + iBest); - return true; - } - return false; -} - -bool swapNodes(Params *params, Solution *solution, Solution *bestSolution) { - - double modifiedCost; - int iBest = -1, jBest = -1; - - // for (const vector &pos: params->orderSwapTwoOpt) { - for (unsigned int i = 1; i < solution->route.size() - 1; i++) { - for (unsigned int j = i + 1; j < solution->route.size() - 1; j++) { - modifiedCost = solution->cost - params->distanceMatrix[solution->route[i - 1]][solution->route[i]] - - params->distanceMatrix[solution->route[j]][solution->route[j + 1]] + - params->distanceMatrix[solution->route[i - 1]][solution->route[j]] + - params->distanceMatrix[solution->route[j]][solution->route[i + 1]] + - params->distanceMatrix[solution->route[j - 1]][solution->route[i]] + - params->distanceMatrix[solution->route[i]][solution->route[j + 1]]; - - if (j != i + 1) - modifiedCost = modifiedCost - params->distanceMatrix[solution->route[i]][solution->route[i + 1]] - - params->distanceMatrix[solution->route[j - 1]][solution->route[j]]; - - if (modifiedCost < bestSolution->cost) { - iBest = i; - jBest = j; - bestSolution->cost = modifiedCost; - } - } - } - - if (iBest != -1) { - solution->cost = bestSolution->cost; - unsigned int temp; - temp = solution->route[(unsigned int)iBest]; - solution->route[(unsigned int)iBest] = solution->route[(unsigned int)jBest]; - solution->route[(unsigned int)jBest] = temp; - return true; - } - return false; -} - -bool twoOpt(Params *params, Solution *solution, Solution *bestSolution) { - - double modifiedCost; - int iBest = -1, jBest = -1; - - // for (const vector &pos: params->orderSwapTwoOpt) { - for (unsigned int i = 1; i < solution->route.size() - 1; i++) { - for (unsigned int j = i + 1; j < solution->route.size() - 1; j++) { - modifiedCost = solution->cost - params->distanceMatrix[solution->route[i - 1]][solution->route[i]] - - params->distanceMatrix[solution->route[j]][solution->route[j + 1]] + - params->distanceMatrix[solution->route[i - 1]][solution->route[j]] + - params->distanceMatrix[solution->route[i]][solution->route[j + 1]]; - - if (modifiedCost < bestSolution->cost) { - iBest = i; - jBest = j; - bestSolution->cost = modifiedCost; - } - } - } - - if (iBest != -1) { - solution->cost = bestSolution->cost; - bestSolution->route = {}; - copy(solution->route.begin() + iBest, solution->route.begin() + jBest + 1, - back_inserter(bestSolution->route)); - reverse(bestSolution->route.begin(), bestSolution->route.end()); - solution->route.erase(solution->route.begin() + iBest, solution->route.begin() + jBest + 1); - solution->route.insert(solution->route.begin() + iBest, bestSolution->route.begin(), bestSolution->route.end()); - return true; - } - - return false; -} - -void orderLS(Params *params, Solution *solution) { - params->orderSwapTwoOpt = {}; - for (unsigned int i = 1; i < solution->route.size() - 1; i++) { - for (unsigned int j = i + 1; j < solution->route.size() - 1; j++) { - params->orderSwapTwoOpt.push_back({ i, j }); - } - } -} - -void shuffleIndices(Params *params) { - // Shuffling the jobs order vector - random_shuffle(params->orderSwapTwoOpt.begin(), params->orderSwapTwoOpt.end(), genRandom); -} - -void localSearch(Params *params, Solution *solution) { - - bool foundBetter1, foundBetter2, foundBetter3, foundBetter4; - auto *bestSolution = new Solution(); - unsigned 
int improved = 0; - - for (unsigned int i = 0; i < params->visited.size(); i++) - params->visited[i] = false; - for (unsigned int i = 0; i < solution->route.size(); i++) { - params->visited[solution->route[i]] = true; - } - bestSolution->cost = solution->cost; - bestSolution->penalty = solution->penalty; - bestSolution->prize = solution->prize; - - while (true) { - foundBetter1 = addNode(params, solution, bestSolution); - // orderLS(params, solution); - // shuffleIndices(params); - foundBetter2 = swapNodes(params, solution, bestSolution); - foundBetter3 = removeNode(params, solution, bestSolution); - // orderLS(params, solution); - // shuffleIndices(params); - foundBetter4 = twoOpt(params, solution, bestSolution); - if (!foundBetter1 && !foundBetter2 && !foundBetter3 && !foundBetter4) - break; - improved++; - } - // cout << improved << endl; - delete bestSolution; -} - -void doubleBridge(Params *params, Solution *solutionCandidate) { - - solutionCandidate->route.pop_back(); - - unsigned int position1 = 1 + (unsigned int)(rand() % (int)(solutionCandidate->route.size() / 3)); - unsigned int position2 = position1 + 1 + (unsigned int)(rand() % (int)(solutionCandidate->route.size() / 3)); - unsigned int position3 = position2 + 1 + (unsigned int)(rand() % (int)(solutionCandidate->route.size() / 3)); - - vector temp = {}; - - copy(solutionCandidate->route.begin(), solutionCandidate->route.begin() + position1, back_inserter(temp)); - temp.insert(temp.end(), solutionCandidate->route.begin() + position3, solutionCandidate->route.end()); - temp.insert(temp.end(), solutionCandidate->route.begin() + position2, solutionCandidate->route.begin() + position3); - temp.insert(temp.end(), solutionCandidate->route.begin() + position1, solutionCandidate->route.begin() + position2); - - solutionCandidate->route = temp; - solutionCandidate->route.push_back(0); - solutionCandidate->cost = 0; - for (unsigned int i = 0; i < solutionCandidate->route.size() - 1; i++) - solutionCandidate->cost += params->distanceMatrix[solutionCandidate->route[i]][solutionCandidate->route[i + 1]]; -} - -void perturbation(Params *params, Solution *solutionCandidate, unsigned int intensity) { - // Need at least 4 nodes / two internal nodes (0 1 2 0) to perform double bridge - if (solutionCandidate->route.size() < 4) - return; - for (unsigned int i = 0; i < intensity; i++) { - doubleBridge(params, solutionCandidate); - } -} - -void ILS(Params *params, Solution *solution) { - auto *bestSolution = new Solution(); - bestSolution->route = solution->route; - bestSolution->cost = solution->cost; - bestSolution->penalty = solution->penalty; - bestSolution->prize = solution->prize; - - auto *modifiedSolution = new Solution(); - modifiedSolution->route = solution->route; - modifiedSolution->cost = solution->cost; - modifiedSolution->penalty = solution->penalty; - modifiedSolution->prize = solution->prize; - - unsigned int maxIter = 40000; - unsigned int maxNoImprov = 20000; - unsigned int maxReboot = 4001; - unsigned int maxNbReboots = 4; - unsigned int iterations = 0; - unsigned int noImprov = 0; - unsigned int reboot = 0; - unsigned int nbReboots = 0; - - while (noImprov < maxNoImprov && iterations < maxIter) { - - perturbation(params, modifiedSolution, 2); - localSearch(params, modifiedSolution); - - iterations++; - noImprov++; - reboot++; - - if (modifiedSolution->cost + modifiedSolution->penalty < solution->cost + solution->penalty) { - // cout << iterations << endl; - // cout << modifiedSolution->cost + modifiedSolution->penalty << endl; - 
solution->route = modifiedSolution->route; - solution->cost = modifiedSolution->cost; - solution->penalty = modifiedSolution->penalty; - solution->prize = modifiedSolution->prize; - if (modifiedSolution->cost + modifiedSolution->penalty < bestSolution->cost + bestSolution->penalty) { - bestSolution->route = modifiedSolution->route; - bestSolution->cost = modifiedSolution->cost; - bestSolution->penalty = modifiedSolution->penalty; - bestSolution->prize = modifiedSolution->prize; - reboot = 0; - noImprov = 0; - } - } - else { - modifiedSolution->route = solution->route; - modifiedSolution->cost = solution->cost; - modifiedSolution->penalty = solution->penalty; - modifiedSolution->prize = solution->prize; - } - - if (nbReboots < maxNbReboots && reboot >= maxReboot) { - modifiedSolution->cost = numeric_limits::infinity(); - greedyRandomizedConst(params, modifiedSolution); - localSearch(params, modifiedSolution); - solution->route = modifiedSolution->route; - solution->cost = modifiedSolution->cost; - solution->penalty = modifiedSolution->penalty; - solution->prize = modifiedSolution->prize; - if (modifiedSolution->cost + modifiedSolution->penalty < bestSolution->cost + bestSolution->penalty) { - bestSolution->route = modifiedSolution->route; - bestSolution->cost = modifiedSolution->cost; - bestSolution->penalty = modifiedSolution->penalty; - bestSolution->prize = modifiedSolution->prize; - noImprov = 0; - } - reboot = 0; - nbReboots++; - } - } - solution->route = bestSolution->route; - solution->cost = bestSolution->cost; - solution->penalty = bestSolution->penalty; - solution->prize = bestSolution->prize; - // cout << iterations << endl; - // cout << solution->cost + solution->penalty << endl; - // cout << "--------" << endl; - delete modifiedSolution; - delete bestSolution; -} -// -//int main() { -// -// unsigned int runs = 20; -// vector sizes = { 20, 40, 60, 80, 100, 200, 300, 400, 500 }; -// vector penalties = { 100, 1000 }; -// vector costs = { 1000, 10000 }; -// vector lines; -// double bestResult; -// double averageResult, averageTime; -// -// for (const unsigned int &size : sizes) { -// for (const unsigned int &penalty : penalties) { -// for (const unsigned int &cost : costs) { -// if (!(penalty == 1000 && cost == 1000)) { -// bestResult = numeric_limits::infinity(); -// averageResult = 0; -// averageTime = 0; -// for (unsigned int k = 0; k < runs; k++) { -// srand((unsigned int)time(nullptr)); -// const clock_t start = clock(); -// auto *params = new Params(); -// -// lines = openFile(size, 100, penalty, cost); -// createGraph(lines, params); -// for (unsigned int i = 0; i < params->vertices.size(); i++) -// params->visited.push_back(false); -// -// auto *solution = new Solution(); -// // randomConst(params, solution); -// greedyRandomizedConst(params, solution); -// -// // orderLS(params, solution); -// localSearch(params, solution); -// // for (unsigned int i = 0; i < solution->route.size(); i++) -// // cout << solution->route[i] << ", "; -// // cout << endl; -// ILS(params, solution); -// -// if (solution->cost + solution->penalty < bestResult) -// bestResult = solution->cost + solution->penalty; -// averageResult += (solution->cost + solution->penalty); -// averageTime += (float(clock() - start) / CLOCKS_PER_SEC); -// -// delete solution; -// delete params; -// } -// cout << "Best Result: " << bestResult << endl; -// cout << "Average Result: " << averageResult / runs << endl; -// cout << "Average Time: " << averageTime / runs << endl; -// } -// } -// } -// } -// return 0; -//} - 
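
The doubleBridge routine above is the classic 4-opt "kick" that the ILS uses to escape local optima which 2-opt and node insertion/removal cannot undo. The following is a rough Python rendering of that move for illustration only; it is a sketch of the same idea, not code from this repository.

import random

def double_bridge(route):
    """Double-bridge move on a closed route [0, ..., 0]: cut at three random interior
    points and reassemble the segments in the order 1-4-3-2, keeping the depot fixed."""
    assert route[0] == 0 and route[-1] == 0 and len(route) >= 4
    r = route[:-1]                       # drop the trailing depot, keep the leading one
    third = len(r) // 3
    p1 = 1 + random.randrange(third)
    p2 = p1 + 1 + random.randrange(third)
    p3 = p2 + 1 + random.randrange(third)
    return r[:p1] + r[p3:] + r[p2:p3] + r[p1:p2] + [0]

random.seed(0)
print(double_bridge([0, 1, 2, 3, 4, 5, 6, 7, 8, 0]))
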
- -int main(int argc, char *argv[]) { - - string filename = argv[1]; - unsigned int minTotalPrize = stoi(argv[2]); - unsigned int runs = 20; - unsigned int seed = 1234; - if (argc >= 4) - { - runs = stoi(argv[3]); - } - if (argc >= 5) - { - seed = stoi(argv[4]); - } - - vector lines; - double bestResult; - double averageResult, averageTime; - vector bestRoute; - - bestResult = numeric_limits::infinity(); - bestRoute = vector(); - averageResult = 0; - averageTime = 0; - for (unsigned int k = 0; k < runs; k++) { - cout << "Run: " << k << endl; - srand((unsigned int)seed + k); - const clock_t start = clock(); - auto *params = new Params(); - params->minTotalPrize = minTotalPrize; - - // lines = openFile(size, 100, penalty, cost); - lines = openFileName(filename); - createGraph(lines, params); - for (unsigned int i = 0; i < params->vertices.size(); i++) - params->visited.push_back(false); - - auto *solution = new Solution(); - // randomConst(params, solution); - greedyRandomizedConst(params, solution); - - // orderLS(params, solution); - localSearch(params, solution); - // for (unsigned int i = 0; i < solution->route.size(); i++) - // cout << solution->route[i] << ", "; - // cout << endl; - ILS(params, solution); - - if (solution->cost + solution->penalty < bestResult) - bestResult = solution->cost + solution->penalty; - bestRoute = solution->route; - averageResult += (solution->cost + solution->penalty); - averageTime += (float(clock() - start) / CLOCKS_PER_SEC); - - delete solution; - delete params; - } - cout << "Best Result Cost: " << bestResult << endl; - cout << "Best Result Route:"; - for (auto i: bestRoute) - cout << ' ' << i; - cout << endl; - cout << "Average Result: " << averageResult / runs << endl; - cout << "Average Time: " << averageTime / runs << endl; - return 0; -} - diff --git a/AM/problems/pctsp/__init__.py b/AM/problems/pctsp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/pctsp/pctsp_baseline.py b/AM/problems/pctsp/pctsp_baseline.py deleted file mode 100644 index e9f3420..0000000 --- a/AM/problems/pctsp/pctsp_baseline.py +++ /dev/null @@ -1,455 +0,0 @@ -import argparse -import os -import numpy as np -from utils import run_all_in_pool -from utils.data_utils import check_extension, load_dataset, save_dataset -from subprocess import check_call, check_output -import re -import time -from datetime import timedelta -import random -from scipy.spatial import distance_matrix -from .salesman.pctsp.model.pctsp import Pctsp -from .salesman.pctsp.algo.ilocal_search import ilocal_search -from .salesman.pctsp.model import solution - -MAX_LENGTH_TOL = 1e-5 - - -def get_pctsp_executable(): - path = os.path.join("pctsp", "PCTSP", "PCPTSP") - sourcefile = os.path.join(path, "main.cpp") - execfile = os.path.join(path, "main.out") - if not os.path.isfile(execfile): - print ("Compiling...") - check_call(["g++", "-g", "-Wall", sourcefile, "-std=c++11", "-o", execfile]) - print ("Done!") - assert os.path.isfile(execfile), "{} does not exist! 
Compilation failed?".format(execfile) - return os.path.abspath(execfile) - - -def solve_pctsp_log(executable, directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10): - - problem_filename = os.path.join(directory, "{}.pctsp{}.pctsp".format(name, runs)) - output_filename = os.path.join(directory, "{}.pctsp{}.pkl".format(name, runs)) - log_filename = os.path.join(directory, "{}.pctsp{}.log".format(name, runs)) - - try: - # May have already been run - if not os.path.isfile(output_filename): - write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name) - with open(log_filename, 'w') as f: - start = time.time() - output = check_output( - # exe, filename, min_total_prize (=1), num_runs - [executable, problem_filename, float_to_scaled_int_str(1.), str(runs)], - stderr=f - ).decode('utf-8') - duration = time.time() - start - f.write(output) - - save_dataset((output, duration), output_filename) - else: - output, duration = load_dataset(output_filename) - - # Now parse output - tour = None - for line in output.splitlines(): - heading = "Best Result Route: " - if line[:len(heading)] == heading: - tour = np.array(line[len(heading):].split(" ")).astype(int) - break - assert tour is not None, "Could not find tour in output!" - - assert tour[0] == 0, "Tour should start with depot" - assert tour[-1] == 0, "Tour should end with depot" - tour = tour[1:-1] # Strip off depot - - return calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour), tour.tolist(), duration - except Exception as e: - print("Exception occured") - print(e) - return None - - -def solve_stochastic_pctsp_log( - executable, directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10, append='all'): - - try: - - problem_filename = os.path.join(directory, "{}.stochpctsp{}{}.pctsp".format(name, append, runs)) - output_filename = os.path.join(directory, "{}.stochpctsp{}{}.pkl".format(name, append, runs)) - log_filename = os.path.join(directory, "{}.stochpctsp{}{}.log".format(name, append, runs)) - - # May have already been run - if not os.path.isfile(output_filename): - - total_start = time.time() - - outputs = [] - durations = [] - final_tour = [] - - coord = [depot] + loc - - mask = np.zeros(len(coord), dtype=bool) - dist = distance_matrix(coord, coord) - penalty = np.array(penalty) - deterministic_prize = np.array(deterministic_prize) - - it = 0 - total_collected_prize = 0. 
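
The while-loop that follows re-solves the PCTSP on the not-yet-visited nodes and commits part of each solution before re-planning from the last visited node (the append='first'|'half'|'all' modes). The sketch below is a simplified, self-contained rendering of that re-planning pattern; the greedy plan_fn stands in for the external C++ solver, and every name in it is an illustrative assumption rather than code from this file.

import numpy as np
from scipy.spatial import distance_matrix

def replan_until_prize(coord, prize, min_prize, plan_fn, commit='all'):
    """Repeatedly solve on the unvisited nodes, commit (part of) the plan, re-anchor at the last node."""
    n = len(coord)
    dist = distance_matrix(coord, coord)
    visited = np.zeros(n, dtype=bool)
    visited[0] = True                         # node 0 is the depot
    tour, collected = [], 0.0
    while collected < min_prize and not visited.all():
        cur = tour[-1] if tour else 0
        cand = np.flatnonzero(~visited)       # nodes still available to the inner solver
        order = plan_fn(cur, cand, dist, prize)
        if len(order) == 0:
            break                             # inner solver may decide to stop early
        keep = order[:1] if commit == 'first' else order
        tour.extend(int(i) for i in keep)
        visited[keep] = True
        collected = prize[np.array(tour) - 1].sum()   # prize is indexed without the depot
    return tour, collected

def greedy_plan(cur, cand, dist, prize):
    # stand-in solver: rank candidates by prize per unit of distance from the current node
    score = prize[cand - 1] / (dist[cur, cand] + 1e-9)
    return cand[np.argsort(-score)]

rng = np.random.default_rng(0)
coord = rng.uniform(size=(11, 2))             # row 0 is the depot
prize = rng.uniform(size=10)
prize /= prize.sum()                          # total prize is 1, as in the deleted code
print(replan_until_prize(coord, prize, min_prize=1.0, plan_fn=greedy_plan, commit='first'))
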
- # As long as we have not visited all nodes we repeat - # even though we have already satisfied the total prize collected constraint - # since the algorithm may decide to include more nodes to avoid further penalties - while len(final_tour) < len(stochastic_prize): - - # Mask all nodes already visited (not the depot) - mask[final_tour] = True - - # The distance from the 'start' or 'depot' is the distance from the 'current node' - # this way we mimic as if we have a separate start and end by the assymetric distance matrix - # Note: this violates the triangle inequality and the distance from 'depot to depot' becomes nonzero - # but the program seems to deal with this well - if len(final_tour) > 0: # in the first iteration we are at depot and distance matrix is ok - dist[0, :] = dist[final_tour[-1], :] - - remaining_deterministic_prize = deterministic_prize[~mask[1:]] - write_pctsp_dist(problem_filename, - dist[np.ix_(~mask, ~mask)], penalty[~mask[1:]], remaining_deterministic_prize) - # If the remaining deterministic prize is less than the prize we should still collect - # set this lower value as constraint since otherwise problem is infeasible - # compute total remaining deterministic prize after converting to ints - # otherwise we may still have problems with rounding - # Note we need to clip 1 - total_collected_prize between 0 (constraint can already be satisfied) - # and the maximum achievable with the remaining_deterministic_prize - min_prize_int = max(0, min( - float_to_scaled_int(1. - total_collected_prize), - sum([float_to_scaled_int(v) for v in remaining_deterministic_prize]) - )) - with open(log_filename, 'a') as f: - start = time.time() - output = check_output( - # exe, filename, min_total_prize (=1), num_runs - [executable, problem_filename, str(min_prize_int), str(runs)], - stderr=f - ).decode('utf-8') - durations.append(time.time() - start) - outputs.append(output) - - # Now parse output - tour = None - for line in output.splitlines(): - heading = "Best Result Route: " - if line[:len(heading)] == heading: - tour = np.array(line[len(heading):].split(" ")).astype(int) - break - assert tour is not None, "Could not find tour in output!" 
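
A few lines below, np.arange(len(coord), dtype=int)[~mask][tour] maps tour positions in the reduced (unvisited-only) problem back to original node ids. A tiny standalone illustration of that indexing, with made-up values:

import numpy as np

mask = np.array([False, True, False, False, True, False])   # True = already visited (index 0 is the depot)
tour = np.array([2, 1, 3])                                   # positions within the reduced problem, depot stripped

node_ids = np.arange(len(mask))[~mask]   # original ids of the nodes the reduced problem was built from
print(node_ids)                          # [0 2 3 5]
print(node_ids[tour])                    # [3 2 5] -> original node ids visited by this partial plan
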
- - assert tour[0] == 0, "Tour should start with depot" - assert tour[-1] == 0, "Tour should end with depot" - tour = tour[1:-1] # Strip off depot - - # Now find to which nodes these correspond - tour_node_ids = np.arange(len(coord), dtype=int)[~mask][tour] - - if len(tour_node_ids) == 0: - # The inner algorithm can decide to stop, but does not have to - assert total_collected_prize > 1 - 1e-5, "Collected prize should be one" - break - - if append == 'first': - final_tour.append(tour_node_ids[0]) - elif append == 'half': - final_tour.extend(tour_node_ids[:max(len(tour_node_ids) // 2, 1)]) - else: - assert append == 'all' - final_tour.extend(tour_node_ids) - - total_collected_prize = calc_pctsp_total(stochastic_prize, final_tour) - it = it + 1 - - os.remove(problem_filename) - final_cost = calc_pctsp_cost(depot, loc, penalty, stochastic_prize, final_tour) - total_duration = time.time() - total_start - save_dataset((final_cost, final_tour, total_duration, outputs, durations), output_filename) - - else: - final_cost, final_tour, total_duration, outputs, durations = load_dataset(output_filename) - - return final_cost, final_tour, total_duration - except Exception as e: - print("Exception occured") - print(e) - return None - - -def solve_salesman(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10): - - problem_filename = os.path.join(directory, "{}.salesman{}.pctsp".format(name, runs)) - output_filename = os.path.join(directory, "{}.salesman{}.pkl".format(name, runs)) - - try: - # May have already been run - if not os.path.isfile(output_filename): - write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name) - - start = time.time() - - random.seed(1234) - pctsp = Pctsp() - pctsp.load(problem_filename, float_to_scaled_int(1.)) - s = solution.random(pctsp, start_size=int(len(pctsp.prize) * 0.7)) - s = ilocal_search(s, n_runs=runs) - - output = (s.route[:s.size], s.quality) - - duration = time.time() - start - - save_dataset((output, duration), output_filename) - else: - output, duration = load_dataset(output_filename) - - # Now parse output - tour = output[0][:] - assert tour[0] == 0, "Tour should start with depot" - assert tour[-1] != 0, "Tour should not end with depot" - tour = tour[1:] # Strip off depot - - total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour) - assert (float_to_scaled_int(total_cost) - output[1]) / float(output[1]) < 1e-5 - return total_cost, tour, duration - except Exception as e: - print("Exception occured") - print(e) - return None - - -def solve_gurobi(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, - disable_cache=False, timeout=None, gap=None): - # Lazy import so we do not need to have gurobi installed to run this script - from .pctsp_gurobi import solve_euclidian_pctsp as solve_euclidian_pctsp_gurobi - - try: - problem_filename = os.path.join(directory, "{}.gurobi{}{}.pkl".format( - name, "" if timeout is None else "t{}".format(timeout), "" if gap is None else "gap{}".format(gap))) - - if os.path.isfile(problem_filename) and not disable_cache: - (cost, tour, duration) = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - - # Must collect 1 or the sum of the prices if it is less then 1. 
- cost, tour = solve_euclidian_pctsp_gurobi( - depot, loc, penalty, deterministic_prize, min(sum(deterministic_prize), 1.), - threads=1, timeout=timeout, gap=gap - ) - duration = time.time() - start # Measure clock time - save_dataset((cost, tour, duration), problem_filename) - - # First and last node are depot(s), so first node is 2 but should be 1 (as depot is 0) so subtract 1 - assert tour[0] == 0 - tour = tour[1:] - - total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour) - assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect" - return total_cost, tour, duration - - except Exception as e: - # For some stupid reason, sometimes OR tools cannot find a feasible solution? - # By letting it fail we do not get total results, but we can retry by the caching mechanism - print("Exception occured") - print(e) - return None - - -def solve_ortools(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, - sec_local_search=0, disable_cache=False): - # Lazy import so we do not require ortools by default - from .pctsp_ortools import solve_pctsp_ortools - - try: - problem_filename = os.path.join(directory, "{}.ortools{}.pkl".format(name, sec_local_search)) - if os.path.isfile(problem_filename) and not disable_cache: - objval, tour, duration = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - objval, tour = solve_pctsp_ortools(depot, loc, deterministic_prize, penalty, - min(sum(deterministic_prize), 1.), sec_local_search=sec_local_search) - duration = time.time() - start - save_dataset((objval, tour, duration), problem_filename) - assert tour[0] == 0, "Tour must start with depot" - tour = tour[1:] - total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour) - assert abs(total_cost - objval) <= 1e-5, "Cost is incorrect" - return total_cost, tour, duration - except Exception as e: - # For some stupid reason, sometimes OR tools cannot find a feasible solution? 
- # By letting it fail we do not get total results, but we dcan retry by the caching mechanism - print("Exception occured") - print(e) - return None - - -def calc_pctsp_total(vals, tour): - # Subtract 1 since vals index start with 0 while tour indexing starts with 1 as depot is 0 - assert (np.array(tour) > 0).all(), "Depot cannot be in tour" - return np.array(vals)[np.array(tour) - 1].sum() - - -def calc_pctsp_length(depot, loc, tour): - loc_with_depot = np.vstack((np.array(depot)[None, :], np.array(loc))) - sorted_locs = loc_with_depot[np.concatenate(([0], tour, [0]))] - return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum() - - -def calc_pctsp_cost(depot, loc, penalty, prize, tour): - # With some tolerance we should satisfy minimum prize - assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates" - assert calc_pctsp_total(prize, tour) >= 1 - 1e-5 or len(tour) == len(prize), \ - "Tour should collect at least 1 as total prize or visit all nodes" - # Penalty is only incurred for locations not visited, so charge total penalty minus penalty of locations visited - return calc_pctsp_length(depot, loc, tour) + np.sum(penalty) - calc_pctsp_total(penalty, tour) - - -def write_pctsp(filename, depot, loc, penalty, prize, name="problem"): - coord = [depot] + loc - return write_pctsp_dist(filename, distance_matrix(coord, coord), penalty, prize) - - -def float_to_scaled_int_str(v): # Program only accepts ints so scale everything by 10^7 - return str(float_to_scaled_int(v)) - - -def float_to_scaled_int(v): - return int(v * 10000000 + 0.5) - - -def write_pctsp_dist(filename, dist, penalty, prize): - - with open(filename, 'w') as f: - f.write("\n".join([ - "", - " ".join([float_to_scaled_int_str(p) for p in [0] + list(prize)]), - "", - "", - " ".join([float_to_scaled_int_str(p) for p in [0] + list(penalty)]), - "", - "", - *( - " ".join(float_to_scaled_int_str(d) for d in d_row) - for d_row in dist - ) - ])) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("method", - help="Name of the method to evaluate, 'pctsp', 'salesman' or 'stochpctsp(first|half|all)'") - parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument("-o", default=None, help="Name of the results file to write") - parser.add_argument("--cpus", type=int, help="Number of CPUs to use, defaults to all cores") - parser.add_argument('--disable_cache', action='store_true', help='Disable caching') - parser.add_argument('--progress_bar_mininterval', type=float, default=0.1, help='Minimum interval') - parser.add_argument('-n', type=int, help="Number of instances to process") - parser.add_argument('--offset', type=int, help="Offset where to start processing") - parser.add_argument('--results_dir', default='results', help="Name of results directory") - - opts = parser.parse_args() - - assert opts.o is None or len(opts.datasets) == 1, "Cannot specify result filename with more than one dataset" - - for dataset_path in opts.datasets: - - assert os.path.isfile(check_extension(dataset_path)), "File does not exist!" 
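The cost convention implemented by calc_pctsp_cost above is worth spelling out: the objective is the closed tour length plus the penalties of the nodes that were not visited, and tour indices are 1-based because index 0 is the depot. A small self-contained check with made-up coordinates (helper bodies follow the functions above, with the assertions omitted for brevity):

import numpy as np

def calc_pctsp_total(vals, tour):
    # Prizes/penalties are 0-indexed per node, tours are 1-indexed (0 = depot)
    return np.array(vals)[np.array(tour) - 1].sum()

def calc_pctsp_length(depot, loc, tour):
    loc_with_depot = np.vstack((np.array(depot)[None, :], np.array(loc)))
    sorted_locs = loc_with_depot[np.concatenate(([0], tour, [0]))]
    return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum()

def calc_pctsp_cost(depot, loc, penalty, prize, tour):
    # Length of the closed tour plus penalties of the unvisited nodes
    return calc_pctsp_length(depot, loc, tour) + np.sum(penalty) - calc_pctsp_total(penalty, tour)

depot = [0., 0.]
loc = [[0., 1.], [1., 0.], [1., 1.]]   # nodes 1, 2, 3
penalty = [0.5, 0.2, 0.3]
prize = [0.6, 0.6, 0.6]
tour = [1, 2]                          # visit nodes 1 and 2, skip node 3

# Length: 0->1 (1) + 1->2 (sqrt(2)) + 2->0 (1) ~ 3.414; unvisited penalty: 0.3
print(calc_pctsp_cost(depot, loc, penalty, prize, tour))  # ~3.714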
- - dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1]) - - if opts.o is None: - results_dir = os.path.join(opts.results_dir, "pctsp", dataset_basename) - os.makedirs(results_dir, exist_ok=True) - - out_file = os.path.join(results_dir, "{}{}{}-{}{}".format( - dataset_basename, - "offs{}".format(opts.offset) if opts.offset is not None else "", - "n{}".format(opts.n) if opts.n is not None else "", - opts.method, ext - )) - else: - out_file = opts.o - - assert opts.f or not os.path.isfile( - out_file), "File already exists! Try running with -f option to overwrite." - - match = re.match(r'^([a-z]+)(\d*)$', opts.method) - assert match - method = match[1] - runs = 1 if match[2] == '' else int(match[2]) - - if method in ("pctsp", "salesman", "gurobi", "gurobigap", "gurobit", "ortools") or method[:10] == "stochpctsp": - - target_dir = os.path.join(results_dir, "{}-{}".format( - dataset_basename, - opts.method - )) - assert opts.f or not os.path.isdir(target_dir), \ - "Target dir already exists! Try running with -f option to overwrite." - - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - - dataset = load_dataset(dataset_path) - - if method[:6] == "gurobi": - use_multiprocessing = True # We run one thread per instance - - def run_func(args): - return solve_gurobi(*args, disable_cache=opts.disable_cache, - timeout=runs if method[6:] == "t" else None, - gap=float(runs) if method[6:] == "gap" else None) - elif method == "pctsp": - executable = get_pctsp_executable() - use_multiprocessing = False - - def run_func(args): - return solve_pctsp_log(executable, *args, runs=runs) - elif method == "salesman": - use_multiprocessing = True - - def run_func(args): - return solve_salesman(*args, runs=runs) - elif method == "ortools": - use_multiprocessing = True - - def run_func(args): - return solve_ortools(*args, sec_local_search=runs, disable_cache=opts.disable_cache) - else: - assert method[:10] == "stochpctsp" - append = method[10:] - assert append in ('first', 'half', 'all') - use_multiprocessing = True - - def run_func(args): - return solve_stochastic_pctsp_log(executable, *args, runs=runs, append=append) - - results, parallelism = run_all_in_pool( - run_func, - target_dir, dataset, opts, use_multiprocessing=use_multiprocessing - ) - - else: - assert False, "Unknown method: {}".format(opts.method) - - costs, tours, durations = zip(*results) # Not really costs since they should be negative - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - - save_dataset((results, parallelism), out_file) diff --git a/AM/problems/pctsp/pctsp_gurobi.py b/AM/problems/pctsp/pctsp_gurobi.py deleted file mode 100644 index 7d630a8..0000000 --- a/AM/problems/pctsp/pctsp_gurobi.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python - -# Copyright 2017, Gurobi Optimization, Inc. - -# Solve a traveling salesman problem on a set of -# points using lazy constraints. The base MIP model only includes -# 'degree-2' constraints, requiring each node to have exactly -# two incident edges. Solutions to this model may contain subtours - -# tours that don't visit every city. The lazy constraint callback -# adds new constraints to cut them off. 
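For reference, the method-string parsing in the evaluation script above splits a trailing integer off the method name and reuses it as the number of runs, the time limit (gurobit), or the MIP gap in percent (gurobigap). A short sketch of how a few example method strings are interpreted (the strings themselves are only illustrations):

import re

for method_str in ("pctsp10", "salesman", "gurobit30", "gurobigap5", "stochpctsphalf"):
    match = re.match(r'^([a-z]+)(\d*)$', method_str)
    method = match[1]
    runs = 1 if match[2] == '' else int(match[2])
    print(method_str, "->", method, runs)

# pctsp10        -> pctsp 10          (10 restarts of the C++ solver)
# salesman       -> salesman 1        (1 run of iterated local search)
# gurobit30      -> gurobit 30        (Gurobi with a 30s time limit)
# gurobigap5     -> gurobigap 5       (Gurobi with a 5% MIP gap)
# stochpctsphalf -> stochpctsphalf 1  (stochastic re-solving with append='half')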
- -from gurobipy import * - - -def solve_euclidian_pctsp(depot, loc, penalty, prize, min_prize, threads=0, timeout=None, gap=None): - """ - Solves the Euclidan pctsp problem to pctsptimality using the MIP formulation - with lazy subtour elimination constraint generation. - :param points: list of (x, y) coordinate - :return: - """ - - points = [depot] + loc - n = len(points) - - # Callback - use lazy constraints to eliminate sub-tours - - def subtourelim(model, where): - if where == GRB.Callback.MIPSOL: - # make a list of edges selected in the solution - vals = model.cbGetSolution(model._vars) - selected = tuplelist((i, j) for i, j in model._vars.keys() if vals[i, j] > 0.5) - # find the shortest cycle in the selected edge list - tour = subtour(selected) - if tour is not None: - # add subtour elimination constraint for every pair of cities in tour - # model.cbLazy(quicksum(model._vars[i, j] - # for i, j in itertools.combinations(tour, 2)) - # <= len(tour) - 1) - - model.cbLazy(quicksum(model._vars[i, j] - for i, j in itertools.combinations(tour, 2)) - <= quicksum(model._dvars[i] for i in tour) * (len(tour) - 1) / float(len(tour))) - - # Given a tuplelist of edges, find the shortest subtour - - def subtour(edges, exclude_depot=True): - unvisited = list(range(n)) - #cycle = range(n + 1) # initial length has 1 more city - cycle = None - while unvisited: # true if list is non-empty - thiscycle = [] - neighbors = unvisited - while neighbors: - current = neighbors[0] - thiscycle.append(current) - unvisited.remove(current) - neighbors = [j for i, j in edges.select(current, '*') if j in unvisited] - # If we do not yet have a cycle or this is the shorter cycle, keep this cycle - # Unless it contains the depot while we do not want the depot - if ( - (cycle is None or len(cycle) > len(thiscycle)) - and len(thiscycle) > 1 and not (0 in thiscycle and exclude_depot) - ): - cycle = thiscycle - return cycle - - # Dictionary of Euclidean distance between each pair of points - - dist = {(i,j) : - math.sqrt(sum((points[i][k]-points[j][k])**2 for k in range(2))) - for i in range(n) for j in range(i)} - - m = Model() - m.Params.outputFlag = False - - # Create variables - vars = m.addVars(dist.keys(), obj=dist, vtype=GRB.BINARY, name='e') - for i,j in vars.keys(): - vars[j,i] = vars[i,j] # edge in pctspposite direction - - # Depot vars can be 2 - for i,j in vars.keys(): - if i == 0 or j == 0: - vars[i,j].vtype = GRB.INTEGER - vars[i,j].ub = 2 - - penalty_dict = { - i + 1: -p # We get penalties for the nodes not visited so we 'save the penalties' for the nodes visited - for i, p in enumerate(penalty) - } - delta = m.addVars(range(1, n), obj=penalty_dict, vtype=GRB.BINARY, name='delta') - - # Add degree-2 constraint (2 * delta for nodes which are not the depot) - m.addConstrs(vars.sum(i,'*') == (2 if i == 0 else 2 * delta[i]) for i in range(n)) - - - # Minimum prize constraint - assert min_prize <= sum(prize) - # Subtract 1 from i since prizes are 0 indexed while delta vars start with 1 (0 is depot) - m.addConstr(quicksum(var * prize[i - 1] for i, var in delta.items()) >= min_prize) - - # optimize model - - m._vars = vars - m._dvars = delta - m.Params.lazyConstraints = 1 - m.Params.threads = threads - if timeout: - m.Params.timeLimit = timeout - if gap: - m.Params.mipGap = gap * 0.01 # Percentage - # For the correct objective, we need to add the sum of the penalties, which are subtracted when nodes are visited - # this is important for the relative gap termination criterion - m.objcon = sum(penalty) - 
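As read from the model construction above, the MIP can be restated compactly as follows (notation introduced here for readability: x_ij are the edge variables, delta_i the visit indicators, gamma_i the penalties, p_i the prizes and p_min the min_prize argument); the subtour cuts are the lazy constraints added in subtourelim:

\begin{aligned}
\min\;& \sum_{i<j} d_{ij}\,x_{ij} \;+\; \sum_{i\ge 1} \gamma_i\,(1-\delta_i) \\
\text{s.t.}\;& \sum_j x_{0j} = 2, \qquad \sum_j x_{ij} = 2\,\delta_i \quad (i \ge 1) \\
& \sum_{\{i,j\}\subseteq S} x_{ij} \;\le\; \frac{|S|-1}{|S|} \sum_{i\in S} \delta_i \qquad \text{for every subtour } S \text{ with } 0 \notin S \text{ (added lazily)} \\
& \sum_{i\ge 1} p_i\,\delta_i \;\ge\; p_{\min} \\
& x_{ij}\in\{0,1\}\ (i,j\ge 1), \qquad x_{0j}\in\{0,1,2\}, \qquad \delta_i\in\{0,1\}
\end{aligned}

The constant sum of penalties set through m.objcon is what turns the -gamma_i * delta_i objective terms into gamma_i * (1 - delta_i), so the reported objective and the relative MIP gap refer to the true PCTSP cost rather than a shifted one.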
m.optimize(subtourelim) - - vals = m.getAttr('x', vars) - selected = tuplelist((i,j) for i,j in vals.keys() if vals[i,j] > 0.5) - - tour = subtour(selected, exclude_depot=False) - assert tour[0] == 0, "Tour should start with depot" - - return m.objVal, tour \ No newline at end of file diff --git a/AM/problems/pctsp/pctsp_ortools.py b/AM/problems/pctsp/pctsp_ortools.py deleted file mode 100644 index 78831ee..0000000 --- a/AM/problems/pctsp/pctsp_ortools.py +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python -# This Python file uses the following encoding: utf-8 -# Copyright 2015 Tin Arm Engineering AB -# Copyright 2018 Google LLC -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Capacitated Vehicle Routing Problem (CVRP). - - This is a sample using the routing library python wrapper to solve a CVRP - problem. - A description of the problem can be found here: - http://en.wikipedia.org/wiki/Vehicle_routing_problem. - - Distances are in meters. -""" - -from __future__ import print_function -from collections import namedtuple -from six.moves import xrange -from ortools.constraint_solver import pywrapcp -from ortools.constraint_solver import routing_enums_pb2 -import math - -########################### -# Problem Data Definition # -########################### -# Vehicle declaration -Vehicle = namedtuple('Vehicle', ['capacity']) - - -def float_to_scaled_int(v): - return int(v * 10000000 + 0.5) - - -class DataProblem(): - """Stores the data for the problem""" - - def __init__(self, depot, loc, prize, penalty, min_prize): - """Initializes the data for the problem""" - # Locations in block unit - self._locations = [(float_to_scaled_int(l[0]), float_to_scaled_int(l[1])) for l in [depot] + loc] - - self._prizes = [float_to_scaled_int(v) for v in prize] - - self._penalties = [float_to_scaled_int(v) for v in penalty] - - # Check that min_prize is feasible - assert sum(prize) >= min_prize - # After scaling and rounding, however, it can possible not be feasible so relax constraint - self._min_prize = min(float_to_scaled_int(min_prize), sum(self.prizes)) - - @property - def vehicle(self): - """Gets a vehicle""" - return Vehicle() - - @property - def num_vehicles(self): - """Gets number of vehicles""" - return 1 - - @property - def locations(self): - """Gets locations""" - return self._locations - - @property - def num_locations(self): - """Gets number of locations""" - return len(self.locations) - - @property - def depot(self): - """Gets depot location index""" - return 0 - - @property - def prizes(self): - """Gets prizes at each location""" - return self._prizes - - @property - def penalties(self): - """Gets penalties at each location""" - return self._penalties - - @property - def min_prize(self): - """Gets penalties at each location""" - return self._min_prize - - -####################### -# Problem Constraints # -####################### -def euclidian_distance(position_1, position_2): - """Computes the Euclidian distance between two points""" - return int(math.sqrt((position_1[0] - position_2[0]) ** 2 + 
(position_1[1] - position_2[1]) ** 2) + 0.5) - - -class CreateDistanceEvaluator(object): # pylint: disable=too-few-public-methods - """Creates callback to return distance between points.""" - - def __init__(self, data): - """Initializes the distance matrix.""" - self._distances = {} - - # precompute distance between location to have distance callback in O(1) - for from_node in xrange(data.num_locations): - self._distances[from_node] = {} - for to_node in xrange(data.num_locations): - if from_node == to_node: - self._distances[from_node][to_node] = 0 - else: - self._distances[from_node][to_node] = ( - euclidian_distance(data.locations[from_node], - data.locations[to_node])) - - def distance_evaluator(self, from_node, to_node): - """Returns the manhattan distance between the two nodes""" - return self._distances[from_node][to_node] - - -class CreatePrizeEvaluator(object): # pylint: disable=too-few-public-methods - """Creates callback to get prizes at each location.""" - - def __init__(self, data): - """Initializes the prize array.""" - self._prizes = data.prizes - - def prize_evaluator(self, from_node, to_node): - """Returns the prize of the current node""" - del to_node - return 0 if from_node == 0 else self._prizes[from_node - 1] - - -def add_min_prize_constraints(routing, data, prize_evaluator, min_prize): - """Adds capacity constraint""" - prize = 'Prize' - routing.AddDimension( - prize_evaluator, - 0, # null capacity slack - sum(data.prizes), # No upper bound - True, # start cumul to zero - prize) - capacity_dimension = routing.GetDimensionOrDie(prize) - for vehicle in xrange(data.num_vehicles): # only single vehicle - capacity_dimension.CumulVar(routing.End(vehicle)).RemoveInterval(0, min_prize) - - -def add_distance_constraint(routing, distance_evaluator, maximum_distance): - """Add Global Span constraint""" - distance = "Distance" - routing.AddDimension( - distance_evaluator, - 0, # null slack - maximum_distance, # maximum distance per vehicle - True, # start cumul to zero - distance) - - -########### -# Printer # -########### -def print_solution(data, routing, assignment): - """Prints assignment on console""" - print('Objective: {}'.format(assignment.ObjectiveValue())) - total_distance = 0 - total_load = 0 - capacity_dimension = routing.GetDimensionOrDie('Capacity') - for vehicle_id in xrange(data.num_vehicles): - index = routing.Start(vehicle_id) - plan_output = 'Route for vehicle {}:\n'.format(vehicle_id) - distance = 0 - while not routing.IsEnd(index): - load_var = capacity_dimension.CumulVar(index) - plan_output += ' {} Load({}) -> '.format( - routing.IndexToNode(index), assignment.Value(load_var)) - previous_index = index - index = assignment.Value(routing.NextVar(index)) - distance += routing.GetArcCostForVehicle(previous_index, index, - vehicle_id) - load_var = capacity_dimension.CumulVar(index) - plan_output += ' {0} Load({1})\n'.format( - routing.IndexToNode(index), assignment.Value(load_var)) - plan_output += 'Distance of the route: {}m\n'.format(distance) - plan_output += 'Load of the route: {}\n'.format(assignment.Value(load_var)) - print(plan_output) - total_distance += distance - total_load += assignment.Value(load_var) - print('Total Distance of all routes: {}m'.format(total_distance)) - print('Total Load of all routes: {}'.format(total_load)) - - -def solve_pctsp_ortools(depot, loc, prize, penalty, min_prize, sec_local_search=0): - data = DataProblem(depot, loc, prize, penalty, min_prize) - - # Create Routing Model - routing = 
pywrapcp.RoutingModel(data.num_locations, data.num_vehicles, - data.depot) - - # Define weight of each edge - distance_evaluator = CreateDistanceEvaluator(data).distance_evaluator - routing.SetArcCostEvaluatorOfAllVehicles(distance_evaluator) - - # Add minimum total prize constraint - prize_evaluator = CreatePrizeEvaluator(data).prize_evaluator - add_min_prize_constraints(routing, data, prize_evaluator, data.min_prize) - - # Add penalties for missed nodes - nodes = [routing.AddDisjunction([int(c + 1)], p) for c, p in enumerate(data.penalties)] - - # Setting first solution heuristic (cheapest addition). - search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters() - search_parameters.first_solution_strategy = ( - routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC) - if sec_local_search > 0: - # Additionally do local search - search_parameters.local_search_metaheuristic = ( - routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH) - search_parameters.time_limit_ms = 1000 * sec_local_search - # Solve the problem. - assignment = routing.SolveWithParameters(search_parameters) - - assert assignment is not None, "ORTools was unable to find a feasible solution" - - index = routing.Start(0) - route = [] - while not routing.IsEnd(index): - node_index = routing.IndexToNode(index) - route.append(node_index) - index = assignment.Value(routing.NextVar(index)) - return assignment.ObjectiveValue() / 10000000., route diff --git a/AM/problems/pctsp/problem_pctsp.py b/AM/problems/pctsp/problem_pctsp.py deleted file mode 100644 index a3cff91..0000000 --- a/AM/problems/pctsp/problem_pctsp.py +++ /dev/null @@ -1,184 +0,0 @@ -from torch.utils.data import Dataset -import torch -import os -import pickle -from problems.pctsp.state_pctsp import StatePCTSP -from utils.beam_search import beam_search - - -class PCTSP(object): - - NAME = 'pctsp' # Prize Collecting TSP, without depot, with penalties - - @staticmethod - def _get_costs(dataset, pi, stochastic=False): - if pi.size(-1) == 1: # In case all tours directly return to depot, prevent further problems - assert (pi == 0).all(), "If all length 1 tours, they should be zero" - # Return - return torch.zeros(pi.size(0), dtype=torch.float, device=pi.device), None - - # Check that tours are valid, i.e. 
contain 0 to n -1 - sorted_pi = pi.data.sort(1)[0] - # Make sure each node visited once at most (except for depot) - assert ((sorted_pi[:, 1:] == 0) | (sorted_pi[:, 1:] > sorted_pi[:, :-1])).all(), "Duplicates" - - prize = dataset['stochastic_prize'] if stochastic else dataset['deterministic_prize'] - prize_with_depot = torch.cat( - ( - torch.zeros_like(prize[:, :1]), - prize - ), - 1 - ) - p = prize_with_depot.gather(1, pi) - - # Either prize constraint should be satisfied or all prizes should be visited - assert ( - (p.sum(-1) >= 1 - 1e-5) | - (sorted_pi.size(-1) - (sorted_pi == 0).int().sum(-1) == dataset['loc'].size(-2)) - ).all(), "Total prize does not satisfy min total prize" - penalty_with_depot = torch.cat( - ( - torch.zeros_like(dataset['penalty'][:, :1]), - dataset['penalty'] - ), - 1 - ) - pen = penalty_with_depot.gather(1, pi) - - # Gather dataset in order of tour - loc_with_depot = torch.cat((dataset['depot'][:, None, :], dataset['loc']), 1) - d = loc_with_depot.gather(1, pi[..., None].expand(*pi.size(), loc_with_depot.size(-1))) - - length = ( - (d[:, 1:] - d[:, :-1]).norm(p=2, dim=-1).sum(1) # Prevent error if len 1 seq - + (d[:, 0] - dataset['depot']).norm(p=2, dim=-1) # Depot to first - + (d[:, -1] - dataset['depot']).norm(p=2, dim=-1) # Last to depot, will be 0 if depot is last - ) - # We want to maximize total prize but code minimizes so return negative - # Incurred penalty cost is total penalty cost - saved penalty costs of nodes visited - return length + dataset['penalty'].sum(-1) - pen.sum(-1), None - - @staticmethod - def make_dataset(*args, **kwargs): - return PCTSPDataset(*args, **kwargs) - - @staticmethod - def beam_search(input, beam_size, expand_size=None, - compress_mask=False, model=None, max_calc_batch_size=4096): - - assert model is not None, "Provide model" - - fixed = model.precompute_fixed(input) - - def propose_expansions(beam): - return model.propose_expansions( - beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size - ) - - # With beam search we always consider the deterministic case - state = PCTSPDet.make_state( - input, visited_dtype=torch.int64 if compress_mask else torch.uint8 - ) - - return beam_search(state, beam_size, propose_expansions) - - -class PCTSPDet(PCTSP): - - @staticmethod - def get_costs(dataset, pi): - return PCTSP._get_costs(dataset, pi, stochastic=False) - - @staticmethod - def make_state(*args, **kwargs): - return StatePCTSP.initialize(*args, **kwargs, stochastic=False) - - -class PCTSPStoch(PCTSP): - - # Stochastic variant of PCTSP, the real (stochastic) prize is only revealed when node is visited - - @staticmethod - def get_costs(dataset, pi): - return PCTSP._get_costs(dataset, pi, stochastic=True) - - @staticmethod - def make_state(*args, **kwargs): - return StatePCTSP.initialize(*args, **kwargs, stochastic=True) - - -def generate_instance(size, penalty_factor=3): - depot = torch.rand(2) - loc = torch.rand(size, 2) - - # For the penalty to make sense it should be not too large (in which case all nodes will be visited) nor too small - # so we want the objective term to be approximately equal to the length of the tour, which we estimate with half - # of the nodes by half of the tour length (which is very rough but similar to op) - # This means that the sum of penalties for all nodes will be approximately equal to the tour length (on average) - # The expected total (uniform) penalty of half of the nodes (since approx half will be visited by the constraint) - # is (n / 2) / 2 = n / 4 so divide by this 
means multiply by 4 / n, - # However instead of 4 we use penalty_factor (3 works well) so we can make them larger or smaller - MAX_LENGTHS = { - 20: 2., - 50: 3., - 100: 4. - } - penalty_max = MAX_LENGTHS[size] * (penalty_factor) / float(size) - penalty = torch.rand(size) * penalty_max - - # Take uniform prizes - # Now expectation is 0.5 so expected total prize is n / 2, we want to force to visit approximately half of the nodes - # so the constraint will be that total prize >= (n / 2) / 2 = n / 4 - # equivalently, we divide all prizes by n / 4 and the total prize should be >= 1 - deterministic_prize = torch.rand(size) * 4 / float(size) - - # In the deterministic setting, the stochastic_prize is not used and the deterministic prize is known - # In the stochastic setting, the deterministic prize is the expected prize and is known up front but the - # stochastic prize is only revealed once the node is visited - # Stochastic prize is between (0, 2 * expected_prize) such that E(stochastic prize) = E(deterministic_prize) - stochastic_prize = torch.rand(size) * deterministic_prize * 2 - - return { - 'depot': depot, - 'loc': loc, - 'penalty': penalty, - 'deterministic_prize': deterministic_prize, - 'stochastic_prize': stochastic_prize - } - - -class PCTSPDataset(Dataset): - - def __init__(self, filename=None, size=50, num_samples=1000000, offset=0, distribution=None): - super(PCTSPDataset, self).__init__() - - self.data_set = [] - if filename is not None: - assert os.path.splitext(filename)[1] == '.pkl' - - with open(filename, 'rb') as f: - data = pickle.load(f) - self.data = [ - { - 'depot': torch.FloatTensor(depot), - 'loc': torch.FloatTensor(loc), - 'penalty': torch.FloatTensor(penalty), - 'deterministic_prize': torch.FloatTensor(deterministic_prize), - 'stochastic_prize': torch.tensor(stochastic_prize) - } - for depot, loc, penalty, deterministic_prize, stochastic_prize in (data[offset:offset+num_samples]) - ] - else: - self.data = [ - generate_instance(size) - for i in range(num_samples) - ] - - self.size = len(self.data) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.data[idx] diff --git a/AM/problems/pctsp/salesman/.gitignore b/AM/problems/pctsp/salesman/.gitignore deleted file mode 100644 index 894a44c..0000000 --- a/AM/problems/pctsp/salesman/.gitignore +++ /dev/null @@ -1,104 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
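To make the scaling argument in generate_instance above concrete: prizes are drawn as Uniform(0,1) * 4/n, so the expected prize per node is 2/n and the expected total is 2; requiring a total of at least 1 therefore forces roughly half the nodes to be visited. The penalty scale is then chosen so that the expected total penalty of the roughly n/2 unvisited nodes is of the same order as the partial tour length. A quick numeric check for n = 50 (these are expectations, not a simulation; max_length is the TSP-50 value from MAX_LENGTHS above):

n = 50
penalty_factor = 3
max_length = 3.        # rough tour length for 50 uniform nodes, as in MAX_LENGTHS

# Prizes: Uniform(0, 1) * 4 / n, so E[prize per node] = 2 / n and E[total] = 2.
expected_total_prize = n * 0.5 * 4 / n
print(expected_total_prize)          # 2.0 -> the constraint "total >= 1" needs ~half the nodes

# Penalties: Uniform(0, penalty_max) with penalty_max = max_length * penalty_factor / n.
penalty_max = max_length * penalty_factor / n
expected_unvisited_penalty = (n / 2) * penalty_max / 2
print(expected_unvisited_penalty)    # 2.25 -> same order as a tour over ~n/2 nodes (~1.5-2)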
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ diff --git a/AM/problems/pctsp/salesman/README.md b/AM/problems/pctsp/salesman/README.md deleted file mode 100644 index 3b3db6d..0000000 --- a/AM/problems/pctsp/salesman/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# salesman -Prize Collecting Travelling Salesman Problem diff --git a/AM/problems/pctsp/salesman/__init__.py b/AM/problems/pctsp/salesman/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/pctsp/salesman/pctsp/__init__.py b/AM/problems/pctsp/salesman/pctsp/__init__.py deleted file mode 100755 index a8e6b45..0000000 --- a/AM/problems/pctsp/salesman/pctsp/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# package qextractor -# -# Copyright (c) 2015 Rafael Reis -# -""" -Package qextractor - Packages for building and evaluating a machine learning -model to tackle the Quotation Extractor Task - -""" -__version__="1.0" -__author__ = "Rafael Reis " \ No newline at end of file diff --git a/AM/problems/pctsp/salesman/pctsp/__main__.py b/AM/problems/pctsp/salesman/pctsp/__main__.py deleted file mode 100644 index 6b77efb..0000000 --- a/AM/problems/pctsp/salesman/pctsp/__main__.py +++ /dev/null @@ -1,2 +0,0 @@ -# from qextractor.application import main -# main() \ No newline at end of file diff --git a/AM/problems/pctsp/salesman/pctsp/algo/__init__.py b/AM/problems/pctsp/salesman/pctsp/algo/__init__.py deleted file mode 100644 index 80aa94f..0000000 --- a/AM/problems/pctsp/salesman/pctsp/algo/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# package algo -# -# Copyright (c) 2018 Rafael Reis -# -""" -Package algo - Algorithms for solving the Prize Collecting Travelling Salesman Problem - -""" -__version__="1.0" -__author__ = "Rafael Reis " diff --git a/AM/problems/pctsp/salesman/pctsp/algo/geni.py b/AM/problems/pctsp/salesman/pctsp/algo/geni.py deleted file mode 100644 index 60686f1..0000000 --- a/AM/problems/pctsp/salesman/pctsp/algo/geni.py +++ /dev/null @@ -1,30 +0,0 @@ -# module geni.py -# -# Copyright (c) 2018 Rafael Reis -# -""" -geni module - Auxiliary functions to the GENI method. 
- -""" -__version__="1.0" - -import numpy as np -import sys - -def geni(v, s, max_i): - quality_1 = 0 - quality_2 = 0 - - s_star = Solution() - s_start.quality = sys.maxint - - for i in range(1, max_i): - quality_1 = quality_after_insertion_1(v, i, ) - quality_2 = quality_after_insertion_2() - - if quality_1 < quality_2 and quality_1 < s_star.quality: - s_star = insertion_1(s) - elif quality_2 < quality_1 and quality_2 < s_star.quality: - s_star = insertion_2(s) - - return s_star diff --git a/AM/problems/pctsp/salesman/pctsp/algo/genius.py b/AM/problems/pctsp/salesman/pctsp/algo/genius.py deleted file mode 100644 index ef124ed..0000000 --- a/AM/problems/pctsp/salesman/pctsp/algo/genius.py +++ /dev/null @@ -1,27 +0,0 @@ -# module genius.py -# -# Copyright (c) 2018 Rafael Reis -# -""" -genius module - Implements GENIUS, an algorithm for generation of a solution. - -""" -__version__="1.0" - -from pctsp.model.pctsp import * -from pctsp.model import solution - -import numpy as np - -def genius(pctsp): - s = solution.random(pctsp, size=3) - s = geni(pstsp, s) - s = us(pctsp, s) - - return s - -def geni(pctsp, s): - return - -def us(pctsp, s): - return diff --git a/AM/problems/pctsp/salesman/pctsp/algo/ilocal_search.py b/AM/problems/pctsp/salesman/pctsp/algo/ilocal_search.py deleted file mode 100644 index 9c8bebe..0000000 --- a/AM/problems/pctsp/salesman/pctsp/algo/ilocal_search.py +++ /dev/null @@ -1,98 +0,0 @@ -# module ilocal_search.py -# -# Copyright (c) 2018 Rafael Reis -# -""" -ilocal_search module - Implements Iterate Local Search algorithm. - -""" -__version__="1.0" - -import numpy as np -import random - -def ilocal_search(s, n_runs=10): - h = s.copy() - best = s.copy() - times = [1000] * n_runs # random.sample(range(1000, 2000), n_runs) - - while len(times) > 0: - time = times.pop() - t = 0 - s_tabu = s.copy() - while t < time: - r = tweak(s_tabu.copy()) - if r.quality < s_tabu.quality: - s_tabu = r - - if s_tabu.is_valid(): - s = s_tabu - t += 1 - - if s.quality < best.quality and s.is_valid(): - best = s - - h = newHomeBase(h, s) - s = perturb(h) - - return best - -def tweak(solution): - s = solution - - s_1 = m1(solution.copy()) - s_2 = m2(solution.copy()) - - if (s_1 and s_1.quality < solution.quality - and (not s_2 or s_1.quality < s_2.quality) - ):#and s_1.is_valid()): - s = s_1 - elif (s_2 and s_2.quality < solution.quality - and (not s_1 or s_2.quality < s_1.quality) - ):#and s_2.is_valid()): - s = s_2 - else: - s_3 = m3(solution.copy()) - if (s_3 and s_3.quality < solution.quality - ):#and s_3.is_valid()): - s = s_3 - - return s - -def newHomeBase(h, s): - if s.quality <= h.quality: - return s - else: - return h - -def perturb(solution): - s = solution.copy() - if s.size > 5: - quant = int(s.size/5) - s.remove_cities(quant=quant) - - return s - -def m1(solution): - size = solution.size - length = len(solution.route) - - if size > 1 and size < length: - i = random.randrange(1, size) - j = random.randrange(size, length) - solution.swap(i, j) - - return solution - -def m2(solution): - if solution.size > 1: - i = random.randrange(1, solution.size) - solution.remove_city(index=i) - - return solution - -def m3(solution): - if solution.size < len(solution.route): - solution.add_city() - - return solution diff --git a/AM/problems/pctsp/salesman/pctsp/application.py b/AM/problems/pctsp/salesman/pctsp/application.py deleted file mode 100644 index f725541..0000000 --- a/AM/problems/pctsp/salesman/pctsp/application.py +++ /dev/null @@ -1,60 +0,0 @@ -# module application.py -# -# Copyright 
(c) 2015 Rafael Reis -# -""" -application module - Main module that solves the Prize Collecting Travelling Salesman Problem - -""" - -from pctsp.model.pctsp import * -from pctsp.model import solution -from pctsp.algo.genius import genius -from pctsp.algo import ilocal_search as ils -from pkg_resources import resource_filename -import random - -INPUT_INSTANCE_FILE = resource_filename('pctsp', 'data/problem_20_100_100_1000.pctsp') - -def solve_instance(filename, min_prize, runs=10, seed=1234): - random.seed(seed) - pctsp = Pctsp() - pctsp.load(filename, min_prize) - s = solution.random(pctsp, size=int(len(pctsp.prize) * 0.7)) - s = ils.ilocal_search(s, n_runs=runs) - - return (s.route[1:], s.quality) - -def main(): - """Main function, that solves the PCTSP. - - """ - pctsp = Pctsp() - pctsp.load(INPUT_INSTANCE_FILE, 386) - #pctsp.prize = np.array([0, 4, 8, 3]) - #pctsp.penal = np.array([1000, 7, 11, 17]) - #pctsp.cost = np.array([[0, 1, 1, 1], [1, 0, 1, 1], [1, 1, 0, 1], [1, 1, 1, 0]]) - # print(pctsp.type) - - size = int(len(pctsp.prize)*0.7) - - s = solution.random(pctsp, size=size) - print(s.route) - print(s.size) - print(s.quality) - print(s.is_valid()) - - print("\n") - - # s = genius(pctsp) - # print(s.route) - # print(s.quality) - - s = ils.ilocal_search(s) - print(s.route) - print(s.size) - print(s.quality) - print(s.is_valid()) - -if __name__ == '__main__': - main() diff --git a/AM/problems/pctsp/salesman/pctsp/model/__init__.py b/AM/problems/pctsp/salesman/pctsp/model/__init__.py deleted file mode 100644 index a5cc455..0000000 --- a/AM/problems/pctsp/salesman/pctsp/model/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# package model -# -# Copyright (c) 2018 Rafael Reis -# -""" -Package model - Models of Prize Collecting Travelling Salesman Problem - -""" -__version__="1.0" -__author__ = "Rafael Reis " diff --git a/AM/problems/pctsp/salesman/pctsp/model/pctsp.py b/AM/problems/pctsp/salesman/pctsp/model/pctsp.py deleted file mode 100644 index 2a0661d..0000000 --- a/AM/problems/pctsp/salesman/pctsp/model/pctsp.py +++ /dev/null @@ -1,42 +0,0 @@ -# module pctsp.py -# -# Copyright (c) 2018 Rafael Reis -# -""" -pctsp module - Implements Pctsp, a class that describes an instance of the problem.. - -""" -__version__="1.0" - -import numpy as np -import re - -class Pctsp(object): - """ - Attributes: - c (:obj:`list` of :obj:`list`): Costs from i to j - p (:obj:`list` of :obj:`int`): Prize for visiting each city i - gama (:obj:`list` of :obj:`int`): Penalty for not visiting each city i - """ - def __init__(self): - self.prize = [] - self.penal = [] - self.cost = [] - self.prize_min = 0 - - def load(self, file_name, prize_min): - - f = open(file_name,'r') - for i,line in enumerate(f): - if i is 5: break - if i is 1: self.prize = np.fromstring(line, dtype=int, sep=' ') - if i is 4: self.penal = np.fromstring(line, dtype=int, sep=' ') - - f.close() - - self.cost = np.loadtxt(file_name, dtype=int, skiprows=7) - self.prize_min = prize_min - - assert sum(self.prize) >= prize_min, "Infeasible" - - diff --git a/AM/problems/pctsp/salesman/pctsp/model/solution.py b/AM/problems/pctsp/salesman/pctsp/model/solution.py deleted file mode 100644 index 05de807..0000000 --- a/AM/problems/pctsp/salesman/pctsp/model/solution.py +++ /dev/null @@ -1,164 +0,0 @@ -# module solution.py -# -# Copyright (c) 2018 Rafael Reis -# -""" -solution module - Implements Solution, a class that describes a solution for the problem. 
- -""" -__version__="1.0" - -import numpy as np -import copy -import sys -from random import shuffle - -def random(pctsp, start_size): - s = Solution(pctsp) - length = len(pctsp.prize) - - # Modification: start from start_size but increase after maximum number of iterations in case no feasible solution - # is found. When the full length is used, there should always be a feasible solution - for size in range(start_size, length + 1): - if size: s.size = size - - i = 0 - min_solutions = 30 - max_solutions = 1000 - - while i < min_solutions or (i < max_solutions and not s.is_valid()): - r = Solution(pctsp) - if size: r.size = size - cities = list(range(1, length, 1)) - shuffle(cities) # Shuffle in place - r.route = [0] + cities # The city 0 is always the first - - if r.quality < s.quality and r.is_valid(): - s = r - - i += 1 - if s.is_valid(): - break - assert s.is_valid() - return s - - -class Solution(object): - """ - Attributes: - route (:obj:`list` of :obj:`int`): The list of cities in the visiting order - size (:obj:`int`): The quantity of the first cities to be considered in the route list - quality (:obj:`int`): The quality of the solution - """ - - def __init__(self, pctsp, size=None): - self._route = [] - - if size: - self.size = size - else: - self.size = len(pctsp.prize) # Default size value is the total of cities - - self.quality = sys.maxsize - self.pctsp = pctsp - self.prize = 0 - - """ - Computes the quality of the solution. - """ - def compute(self): - self.prize = 0 - self.quality = 0 - - for i,city in enumerate(self._route): - if i < self.size: - self.prize += self.pctsp.prize[city] - if i > 0: - previousCity = self._route[i - 1] - self.quality += self.pctsp.cost[previousCity][city] - if i + 1 == self.size: - self.quality += self.pctsp.cost[city][0] - else: - self.quality += self.pctsp.penal[city] - - def copy(self): - cp = copy.copy(self) - cp._route = list(self._route) - - return cp - - def swap(self, i, j): - city_i = self._route[i] - city_i_prev = self._route[i-1] - city_i_next = self._route[(i+1) % self.size] - - city_j = self._route[j] - - self.quality = (self.quality - - self.pctsp.cost[city_i_prev][city_i] - self.pctsp.cost[city_i][city_i_next] - + self.pctsp.cost[city_i_prev][city_j] + self.pctsp.cost[city_j][city_i_next] - - self.pctsp.penal[city_j] + self.pctsp.penal[city_i]) - self.prize = self.prize - self.pctsp.prize[city_i] + self.pctsp.prize[city_j] - - self._route[j], self._route[i] = self._route[i], self._route[j] - - def is_valid(self): - return self.prize >= self.pctsp.prize_min - - def add_city(self): - city_l = self._route[self.size - 1] - city_add = self._route[self.size] - - self.quality = (self.quality - - self.pctsp.cost[city_l][0] - - self.pctsp.penal[city_add] - + self.pctsp.cost[city_l][city_add] - + self.pctsp.cost[city_add][0]) - - self.size += 1 - self.prize += self.pctsp.prize[city_add] - - def remove_city(self, index): - city_rem = self._route[index] - city_rem_prev = self._route[index-1] - city_rem_next = self._route[(index+1)%self.size] - - self.quality = (self.quality - - self.pctsp.cost[city_rem_prev][city_rem] - self.pctsp.cost[city_rem][city_rem_next] - + self.pctsp.penal[city_rem] - + self.pctsp.cost[city_rem_prev][city_rem_next]) - self.prize -= self.pctsp.prize[city_rem] - - del self._route[index] - self._route.append(city_rem) - - self.size -= 1 - - def remove_cities(self, quant): - for i in range(self.size-quant,self.size): - city_rem = self._route[i] - city_rem_prev = self._route[i-1] - - self.quality = (self.quality - - 
self.pctsp.cost[city_rem_prev][city_rem] - + self.pctsp.penal[city_rem]) - self.prize -= self.pctsp.prize[city_rem] - - city_rem = self._route[self.size-1] - city_l = self._route[self.size-quant-1] - self.quality = (self.quality - self.pctsp.cost[city_rem][0] - + self.pctsp.cost[city_l][0]) - - self.size -= quant - - def print_route(self): - print(self._route) - - @property - def route(self): - return self._route - - @route.setter - def route(self, r): - self._route = r - self.compute() diff --git a/AM/problems/pctsp/salesman/pctsp/model/tests/__init__.py b/AM/problems/pctsp/salesman/pctsp/model/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/pctsp/salesman/pctsp/model/tests/test_solution.py b/AM/problems/pctsp/salesman/pctsp/model/tests/test_solution.py deleted file mode 100644 index c4e8ad2..0000000 --- a/AM/problems/pctsp/salesman/pctsp/model/tests/test_solution.py +++ /dev/null @@ -1,60 +0,0 @@ -# python -m pctsp.model.tests.test_solution -import unittest - -from pctsp.model import solution -from pctsp.model import pctsp -import numpy as np - -class TestTrain(unittest.TestCase): - def setUp(self): - self.p = pctsp.Pctsp() - self.p.prize = np.array([0, 4, 8, 3]) - self.p.penal = np.array([1000, 7, 11, 17]) - self.p.cost = np.array([[0, 1, 1, 1], [1, 0, 1, 1], [1, 1, 0, 1], [1, 1, 1, 0]]) - - def test_quality(self): - s = solution.Solution(self.p) - s.route = [0, 1, 2, 3] - print("Quality: ", s.quality) - self.assertEqual(s.quality, 4) - - def test_quality_2(self): - s = solution.Solution(self.p, size=2) - s.route = [0, 1, 2, 3] - print("Quality: ", s.quality) - self.assertEqual(s.quality, 30) - - def test_swap(self): - s = solution.Solution(self.p, size=3) - s.route = [0, 1, 2, 3] - - s.swap(1,3) - print("Quality: ", s.quality) - print("route:", s.route) - self.assertEqual(s.quality, 10) - - def test_add_city(self): - s = solution.Solution(self.p, size=3) - s.route = [0, 1, 2, 3] - - s.add_city() - print("Quality: ", s.quality) - self.assertEqual(s.quality, 4) - - def test_remove_city(self): - s = solution.Solution(self.p) - s.route = [0, 1, 2, 3] - - s.remove_city(3) - print("Quality: ", s.quality) - self.assertEqual(s.quality, 20) - - def test_remove_cities(self): - s = solution.Solution(self.p) - s.route = [0, 1, 2, 3] - - s.remove_cities(quant=3) - self.assertEqual(s.quality, 35) - -if __name__ == '__main__': - unittest.main() diff --git a/AM/problems/pctsp/state_pctsp.py b/AM/problems/pctsp/state_pctsp.py deleted file mode 100644 index 9555dad..0000000 --- a/AM/problems/pctsp/state_pctsp.py +++ /dev/null @@ -1,167 +0,0 @@ -import torch -from typing import NamedTuple -from utils.boolmask import mask_long2bool, mask_long_scatter -import torch.nn.functional as F - - -class StatePCTSP(NamedTuple): - # Fixed input - coords: torch.Tensor # Depot + loc - expected_prize: torch.Tensor - real_prize: torch.Tensor - penalty: torch.Tensor - - # If this state contains multiple copies (i.e. beam search) for the same instance, then for memory efficiency - # the coords and prizes tensors are not kept multiple times, so we need to use the ids to index the correct rows. 
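Referring back to the expected values in test_solution.py above: with prizes [0, 4, 8, 3], penalties [1000, 7, 11, 17] and unit travel costs, the quality of a solution is the travel cost over the first `size` cities of the route (closed back to city 0) plus the penalties of the cities left out. A quick arithmetic check of one of the asserted values, using only the toy data from the tests:

prize = [0, 4, 8, 3]
penal = [1000, 7, 11, 17]
# Unit cost between every pair of distinct cities.

# size = 4, route [0, 1, 2, 3]: travel 0-1-2-3-0 = 4, nothing skipped -> quality 4
# size = 2, route [0, 1, 2, 3]: travel 0-1-0 = 2, skip cities 2 and 3 -> 2 + 11 + 17 = 30
print(2 + penal[2] + penal[3])  # 30, matching test_quality_2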
- ids: torch.Tensor # Keeps track of original fixed data index of rows - - # State - prev_a: torch.Tensor - visited_: torch.Tensor # Keeps track of nodes that have been visited - lengths: torch.Tensor - cur_total_prize: torch.Tensor - cur_total_penalty: torch.Tensor - cur_coord: torch.Tensor - i: torch.Tensor # Keeps track of step - - @property - def visited(self): - if self.visited_.dtype == torch.uint8: - return self.visited_ - else: - return mask_long2bool(self.visited_, n=self.coords.size(-2)) - - @property - def dist(self): - return (self.coords[:, :, None, :] - self.coords[:, None, :, :]).norm(p=2, dim=-1) - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - ids=self.ids[key], - prev_a=self.prev_a[key], - visited_=self.visited_[key], - lengths=self.lengths[key], - cur_total_prize=self.cur_total_prize[key], - cur_total_penalty=self.cur_total_penalty[key], - cur_coord=self.cur_coord[key], - ) - - # Warning: cannot override len of NamedTuple, len should be number of fields, not batch size - # def __len__(self): - # return len(self.used_capacity) - - @staticmethod - def initialize(input, visited_dtype=torch.uint8, stochastic=False): - depot = input['depot'] - loc = input['loc'] - # For both deterministic and stochastic variant, model sees only deterministic (expected) prize - expected_prize = input['deterministic_prize'] - # This is the prize that is actually obtained at each node - real_prize = input['stochastic_prize' if stochastic else 'deterministic_prize'] - penalty = input['penalty'] - - batch_size, n_loc, _ = loc.size() - coords = torch.cat((depot[:, None, :], loc), -2) - # For prize, prepend 0 (corresponding to depot) so we can gather efficiently - - real_prize_with_depot = torch.cat((torch.zeros_like(real_prize[:, :1]), real_prize), -1) - penalty_with_depot = F.pad(penalty, (1, 0), mode='constant', value=0) - - return StatePCTSP( - coords=coords, - expected_prize=expected_prize, - real_prize=real_prize_with_depot, - penalty=penalty_with_depot, - ids=torch.arange(batch_size, dtype=torch.int64, device=loc.device)[:, None], # Add steps dimension - prev_a=torch.zeros(batch_size, 1, dtype=torch.long, device=loc.device), - visited_=( # Visited as mask is easier to understand, as long more memory efficient - # Keep visited_ with depot so we can scatter efficiently (if there is an action for depot) - torch.zeros( - batch_size, 1, n_loc + 1, - dtype=torch.uint8, device=loc.device - ) - if visited_dtype == torch.uint8 - else torch.zeros(batch_size, 1, (n_loc + 63) // 64, dtype=torch.int64, device=loc.device) # Ceil - ), - lengths=torch.zeros(batch_size, 1, device=loc.device), - cur_total_prize=torch.zeros(batch_size, 1, device=loc.device), - cur_total_penalty=penalty.sum(-1)[:, None], # Sum penalties (all when nothing is visited), add step dim - cur_coord=input['depot'][:, None, :], # Add step dimension - i=torch.zeros(1, dtype=torch.int64, device=loc.device) # Vector with length num_steps - ) - - def get_remaining_prize_to_collect(self): - # returns the remaining prize to collect, or 0 if already collected the minimum (1.0) - return torch.clamp(1 - self.cur_total_prize, min=0) - - def get_final_cost(self): - - assert self.all_finished() - # assert self.visited_. 
- # We are at the depot so no need to add remaining distance - return self.lengths + self.cur_total_penalty - - def update(self, selected): - - assert self.i.size(0) == 1, "Can only update if state represents single step" - - # Update the state - selected = selected[:, None] # Add dimension for step - prev_a = selected - - # Add the length - cur_coord = self.coords[self.ids, selected] - lengths = self.lengths + (cur_coord - self.cur_coord).norm(p=2, dim=-1) # (batch_dim, 1) - # Add current total prize - cur_total_prize = self.cur_total_prize + self.real_prize[self.ids, selected] - cur_total_penalty = self.cur_total_penalty + self.penalty[self.ids, selected] - - if self.visited_.dtype == torch.uint8: - # Note: here we do not subtract one as we have to scatter so the first column allows scattering depot - # Add one dimension since we write a single value - visited_ = self.visited_.scatter(-1, prev_a[:, :, None], 1) - else: - # This works, by check_unset=False it is allowed to set the depot visited a second a time - visited_ = mask_long_scatter(self.visited_, prev_a, check_unset=False) - - return self._replace( - prev_a=prev_a, visited_=visited_, - lengths=lengths, cur_total_prize=cur_total_prize, cur_total_penalty=cur_total_penalty, cur_coord=cur_coord, - i=self.i + 1 - ) - - def all_finished(self): - # All must be returned to depot (and at least 1 step since at start also prev_a == 0) - # This is more efficient than checking the mask - return self.i.item() > 0 and (self.prev_a == 0).all() - # return self.visited[:, :, 0].all() # If we have visited the depot we're done - - def get_current_node(self): - """ - Returns the current node where 0 is depot, 1...n are nodes - :return: (batch_size, num_steps) tensor with current nodes - """ - return self.prev_a - - def get_mask(self): - """ - Gets a (batch_size, n_loc + 1) mask with the feasible actions (0 = depot), depends on already visited and - remaining capacity. 0 = feasible, 1 = infeasible - Forbids to visit depot twice in a row, unless all nodes have been visited - :return: - """ - - # Note: this always allows going to the depot, but that should always be suboptimal so be ok - # Cannot visit if already visited or if the depot has already been visited then we cannot visit anymore - visited_ = self.visited - mask = ( - visited_ | visited_[:, :, 0:1] - ) - # Cannot visit depot if not yet collected 1 total prize and there are unvisited nodes - mask[:, :, 0] = (self.cur_total_prize < 1.) 
& (visited_[:, :, 1:].int().sum(-1) < visited_[:, :, 1:].size(-1)) - - return mask > 0 # Hacky way to return bool or uint8 depending on pytorch version - - def construct_solutions(self, actions): - return actions diff --git a/AM/problems/tsp/.gitignore b/AM/problems/tsp/.gitignore deleted file mode 100644 index 9575714..0000000 --- a/AM/problems/tsp/.gitignore +++ /dev/null @@ -1 +0,0 @@ -concorde/ \ No newline at end of file diff --git a/AM/problems/tsp/__init__.py b/AM/problems/tsp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/tsp/install_concorde.sh b/AM/problems/tsp/install_concorde.sh deleted file mode 100755 index 447d678..0000000 --- a/AM/problems/tsp/install_concorde.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -mkdir concorde -cd concorde -mkdir qsopt -cd qsopt -# Download qsopt -if [[ "$OSTYPE" == "darwin"* ]]; then - curl -O http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/mac64/qsopt.a - curl -O http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/mac64/qsopt.h - curl -O http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/mac64/qsopt -else - wget http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/centos/qsopt.a - wget http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/centos/qsopt.h - wget http://www.math.uwaterloo.ca/~bico/qsopt/beta/codes/centos/qsopt -fi -cd .. -wget http://www.math.uwaterloo.ca/tsp/concorde/downloads/codes/src/co031219.tgz -tar xf co031219.tgz -cd concorde -if [[ "$OSTYPE" == "darwin"* ]]; then - ./configure --with-qsopt=$(pwd)/../qsopt --host=powerpc-apple-macos -else - ./configure --with-qsopt=$(realpath ../qsopt) -fi -make -TSP/concorde -s 99 -k 100 -cd ../.. \ No newline at end of file diff --git a/AM/problems/tsp/problem_tsp.py b/AM/problems/tsp/problem_tsp.py deleted file mode 100644 index 91adb75..0000000 --- a/AM/problems/tsp/problem_tsp.py +++ /dev/null @@ -1,196 +0,0 @@ -from torch.utils.data import Dataset -import torch -import os -import pickle -import numpy as np -from problems.tsp.state_tsp import StateTSP -from utils.beam_search import beam_search -from utils import move_to - - -def generate_GM_tsp_data_grid(dataset_size, graph_size, num_modes=-1, low=0, high=1): - """ - GMM-9: each mode with N points; overall clipped to the 0-1 square. - sc: propto stdev of modes arounf the perfect grid; sc1: stdev at each mode. - Code from "On the Generalization of Neural Combinatorial Optimization Heuristics". - """ - import scipy - from scipy import stats - from numpy.random import default_rng - from numpy import meshgrid, array - # print(">> Generating data using Gaussian Mixture.") - dataset = [] - - for i in range(dataset_size): - cur_gauss = np.empty([0, 2]) - remaining_elements = graph_size - modes_done = 0 - sc = 1. / 9. 
- sc1 = .045 - - rng = default_rng() - z = array((1., 3., 5.)) / 6 - z = array(meshgrid(z, z)) # perfect grid\n", - z += rng.uniform(-sc, sc, size=z.shape) # shake it a bit\n", - z = z.reshape(2, 9) - cells_chosen = np.random.choice(9, num_modes, replace=False) - - mu_x_array = [] - mu_y_array = [] - for mode in cells_chosen: - # grid_x = mode//3 - # grid_y = mode % 3 - mu_x = z[0][mode] - mu_y = z[1][mode] - mu_x_array.append(mu_x) - mu_y_array.append(mu_y) - - elements_in_this_mode = int(remaining_elements / (num_modes - modes_done)) - samples_x = scipy.stats.truncnorm.rvs((low - mu_x) / sc1, (high - mu_x) / sc1, loc=mu_x, scale=sc1, - size=elements_in_this_mode) - samples_y = scipy.stats.truncnorm.rvs((low - mu_y) / sc1, (high - mu_y) / sc1, loc=mu_y, scale=sc1, - size=elements_in_this_mode) - samples = np.stack((samples_x, samples_y), axis=1) - cur_gauss = np.concatenate((cur_gauss, samples)) - remaining_elements = remaining_elements - elements_in_this_mode - modes_done += 1 - - data = torch.Tensor(cur_gauss) - data = data.reshape(graph_size, 2) - dataset.append(data) - - # print(num_modes, " dataset ", dataset[0]) - - return dataset - - -def generate_tsp_data_mg(dataset_size, graph_size): - ''' - formal test setting, generate GMM TSP-50 data (number dataset_size). every part dataset_size//12 - Code from AAAI-2022 "Learning to Solve Travelling Salesman Problem with Hardness-Adaptive Curriculum". - ''' - - def mg(cdist=100, graph_size=50): - ''' - GMM create one instance of TSP-50, using cdist - ''' - from sklearn.preprocessing import MinMaxScaler - nc = np.random.randint(3, 7) - nums = np.random.multinomial(graph_size, np.ones(nc) / nc) - xy = [] - for num in nums: - center = np.random.uniform(0, cdist, size=(1, 2)) - nxy = np.random.multivariate_normal(mean=center.squeeze(), cov=np.eye(2, 2), size=(num,)) - xy.extend(nxy) - - xy = np.array(xy) - xy = MinMaxScaler().fit_transform(xy) - return xy - - pern = [dataset_size // 11] * 10 + [dataset_size-dataset_size//11*10] - res = [] - # uni = np.random.uniform(size=(dataset_size - pern * 11, graph_size, 2)) - # res.append(uni) - for i, cdist in enumerate([1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]): - # GMM create a batch size instance of TSP-50, using cdist - xy_ = [] - for j in range(pern[i]): - xy_.append(mg(cdist, graph_size)) - res.append(np.array(xy_)) - res = np.concatenate(res, axis=0) - - return res - - -def generate_uniform_tsp_data(dataset_size, graph_size, low=0, high=1): - return [torch.FloatTensor(graph_size, 2).uniform_(low, high) for i in range(dataset_size)] - - -class TSP(object): - - NAME = 'tsp' - - @staticmethod - def get_costs(dataset, pi): - # Check that tours are valid, i.e. 
contain 0 to n -1 - assert ( - torch.arange(pi.size(1), out=pi.data.new()).view(1, -1).expand_as(pi) == - pi.data.sort(1)[0] - ).all(), "Invalid tour" - - # Gather dataset in order of tour - d = dataset.gather(1, pi.unsqueeze(-1).expand_as(dataset)) - - # Length is distance (L2-norm of difference) from each next location from its prev and of last from first - return (d[:, 1:] - d[:, :-1]).norm(p=2, dim=2).sum(1) + (d[:, 0] - d[:, -1]).norm(p=2, dim=1), None - - @staticmethod - def make_dataset(*args, **kwargs): - return TSPDataset(*args, **kwargs) - - @staticmethod - def make_state(*args, **kwargs): - return StateTSP.initialize(*args, **kwargs) - - @staticmethod - def beam_search(input, beam_size, expand_size=None, - compress_mask=False, model=None, max_calc_batch_size=4096): - - assert model is not None, "Provide model" - - fixed = model.precompute_fixed(input) - - def propose_expansions(beam): - return model.propose_expansions( - beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size - ) - - state = TSP.make_state( - input, visited_dtype=torch.int64 if compress_mask else torch.uint8 - ) - - return beam_search(state, beam_size, propose_expansions) - - -class TSPDataset(Dataset): - - def __init__(self, filename=None, size=None, num_samples=10000, offset=0, distribution=None, task=None): - super(TSPDataset, self).__init__() - - if filename is not None: - assert os.path.splitext(filename)[1] == '.pkl' - with open(filename, 'rb') as f: - data = pickle.load(f) - self.data = [torch.FloatTensor(row) for row in (data[offset:offset+num_samples])] - else: - if task['variation_type'] == 'size': - self.data = generate_uniform_tsp_data(num_samples, task['graph_size'], task['low'], task['high']) - elif task['variation_type'] == 'scale': - self.data = generate_uniform_tsp_data(num_samples, task['graph_size'], task['low'], task['high']) - elif task['variation_type'] == 'dist': - self.data = generate_GM_tsp_data_grid(num_samples, task['graph_size'], task['num_modes']) - elif task['variation_type'] == 'mix_dist_size': - self.data = generate_GM_tsp_data_grid(num_samples, task['graph_size'], task['num_modes']) - elif task['variation_type'] in ['adv', 'size_uniform', 'size_two_cluster', 'size_imbalanced', 'size_increasing_order', 'size_decreasing_order']: - self.data = [torch.FloatTensor(task['graph_size'], 2).uniform_(0, 1) for i in range(num_samples)] - else: - print("[!] Default: generating uniform distribution data.") - self.data = [torch.FloatTensor(size, 2).uniform_(0, 1) for i in range(num_samples)] - - # check validity of dataset, strange bugs: when moving cpu -> gpu, coordinate may out of (0, 1) range in rare instances even using uniform_(0, 1) e.g., [1.7500e+38, 2.4132e-01]. - low = task['low'] if task is not None else 0 - high = task['high'] if task is not None else 1 - for i, x in enumerate(self.data): - x = move_to(x, torch.device("cuda")) - if (x < low).any() or (x > high).any(): - torch.set_printoptions(profile="full") - self.data[i] = torch.clamp(self.data[i], min=low, max=high) - print("[!] 
Generated dataset violates valid range ({}-{}): {}".format(low, high, x)) - - self.size = len(self.data) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.data[idx] diff --git a/AM/problems/tsp/state_tsp.py b/AM/problems/tsp/state_tsp.py deleted file mode 100644 index 4730605..0000000 --- a/AM/problems/tsp/state_tsp.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -from typing import NamedTuple -from utils.boolmask import mask_long2bool, mask_long_scatter - - -class StateTSP(NamedTuple): - # Fixed input - loc: torch.Tensor - dist: torch.Tensor - - # If this state contains multiple copies (i.e. beam search) for the same instance, then for memory efficiency - # the loc and dist tensors are not kept multiple times, so we need to use the ids to index the correct rows. - ids: torch.Tensor # Keeps track of original fixed data index of rows - - # State - first_a: torch.Tensor - prev_a: torch.Tensor - visited_: torch.Tensor # Keeps track of nodes that have been visited - lengths: torch.Tensor - cur_coord: torch.Tensor - i: torch.Tensor # Keeps track of step - - @property - def visited(self): - if self.visited_.dtype == torch.uint8: - return self.visited_ - else: - return mask_long2bool(self.visited_, n=self.loc.size(-2)) - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - ids=self.ids[key], - first_a=self.first_a[key], - prev_a=self.prev_a[key], - visited_=self.visited_[key], - lengths=self.lengths[key], - cur_coord=self.cur_coord[key] if self.cur_coord is not None else None, - ) - - @staticmethod - def initialize(loc, visited_dtype=torch.uint8): - - batch_size, n_loc, _ = loc.size() - prev_a = torch.zeros(batch_size, 1, dtype=torch.long, device=loc.device) - return StateTSP( - loc=loc, - dist=(loc[:, :, None, :] - loc[:, None, :, :]).norm(p=2, dim=-1), - ids=torch.arange(batch_size, dtype=torch.int64, device=loc.device)[:, None], # Add steps dimension - first_a=prev_a, - prev_a=prev_a, - # Keep visited with depot so we can scatter efficiently (if there is an action for depot) - visited_=( # Visited as mask is easier to understand, as long more memory efficient - torch.zeros( - batch_size, 1, n_loc, - dtype=torch.uint8, device=loc.device - ) - if visited_dtype == torch.uint8 - else torch.zeros(batch_size, 1, (n_loc + 63) // 64, dtype=torch.int64, device=loc.device) # Ceil - ), - lengths=torch.zeros(batch_size, 1, device=loc.device), - cur_coord=None, - i=torch.zeros(1, dtype=torch.int64, device=loc.device) # Vector with length num_steps - ) - - def get_final_cost(self): - - assert self.all_finished() - # assert self.visited_. 
- - return self.lengths + (self.loc[self.ids, self.first_a, :] - self.cur_coord).norm(p=2, dim=-1) - - def update(self, selected): - - # Update the state - prev_a = selected[:, None] # Add dimension for step - - # Add the length - # cur_coord = self.loc.gather( - # 1, - # selected[:, None, None].expand(selected.size(0), 1, self.loc.size(-1)) - # )[:, 0, :] - cur_coord = self.loc[self.ids, prev_a] - lengths = self.lengths - if self.cur_coord is not None: # Don't add length for first action (selection of start node) - lengths = self.lengths + (cur_coord - self.cur_coord).norm(p=2, dim=-1) # (batch_dim, 1) - - # Update should only be called with just 1 parallel step, in which case we can check this way if we should update - first_a = prev_a if self.i.item() == 0 else self.first_a - - if self.visited_.dtype == torch.uint8: - # Add one dimension since we write a single value - visited_ = self.visited_.scatter(-1, prev_a[:, :, None], 1) - else: - visited_ = mask_long_scatter(self.visited_, prev_a) - - return self._replace(first_a=first_a, prev_a=prev_a, visited_=visited_, - lengths=lengths, cur_coord=cur_coord, i=self.i + 1) - - def all_finished(self): - # Exactly n steps - return self.i.item() >= self.loc.size(-2) - - def get_current_node(self): - return self.prev_a - - def get_mask(self): - return self.visited > 0 # Hacky way to return bool or uint8 depending on pytorch version - - def get_nn(self, k=None): - # Insert step dimension - # Nodes already visited get inf so they do not make it - if k is None: - k = self.loc.size(-2) - self.i.item() # Number of remaining - return (self.dist[self.ids, :, :] + self.visited.float()[:, :, None, :] * 1e6).topk(k, dim=-1, largest=False)[1] - - def get_nn_current(self, k=None): - assert False, "Currently not implemented, look into which neighbours to use in step 0?" 
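# A minimal standalone sketch (all names below are local to this sketch): update() above adds one
# Euclidean segment per decoding step and get_final_cost() closes the tour back to the first node,
# which should agree with the closed-tour formula used in TSP.get_costs.
import torch

def incremental_tour_length(loc, tour):
    # loc: (n, 2) coordinates, tour: (n,) permutation of 0..n-1
    length = torch.zeros(())
    prev = loc[tour[0]]
    for idx in tour[1:]:                                   # one segment per step, as in update()
        length = length + (loc[idx] - prev).norm(p=2)
        prev = loc[idx]
    return length + (loc[tour[0]] - prev).norm(p=2)        # closing edge, as in get_final_cost()

def closed_tour_length(loc, tour):
    d = loc[tour]                                          # coordinates in visiting order
    return (d[1:] - d[:-1]).norm(p=2, dim=1).sum() + (d[0] - d[-1]).norm(p=2)

loc, tour = torch.rand(10, 2), torch.randperm(10)
assert torch.allclose(incremental_tour_length(loc, tour), closed_tour_length(loc, tour), atol=1e-6)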
- # Note: if this is called in step 0, it will have k nearest neighbours to node 0, which may not be desired - # so it is probably better to use k = None in the first iteration - if k is None: - k = self.loc.size(-2) - k = min(k, self.loc.size(-2) - self.i.item()) # Number of remaining - return ( - self.dist[ - self.ids, - self.prev_a - ] + - self.visited.float() * 1e6 - ).topk(k, dim=-1, largest=False)[1] - - def construct_solutions(self, actions): - return actions diff --git a/AM/problems/tsp/tsp_baseline.py b/AM/problems/tsp/tsp_baseline.py deleted file mode 100644 index 7d7fe67..0000000 --- a/AM/problems/tsp/tsp_baseline.py +++ /dev/null @@ -1,449 +0,0 @@ -import argparse -import numpy as np -import os -import time -from datetime import timedelta -from scipy.spatial import distance_matrix -from utils import run_all_in_pool -from utils.data_utils import check_extension, load_dataset, save_dataset -from subprocess import check_call, check_output, CalledProcessError -from problems.vrp.vrp_baseline import get_lkh_executable -import torch -from tqdm import tqdm -import re - - -def solve_gurobi(directory, name, loc, disable_cache=False, timeout=None, gap=None): - # Lazy import so we do not need to have gurobi installed to run this script - from problems.tsp.tsp_gurobi import solve_euclidian_tsp as solve_euclidian_tsp_gurobi - - try: - problem_filename = os.path.join(directory, "{}.gurobi{}{}.pkl".format( - name, "" if timeout is None else "t{}".format(timeout), "" if gap is None else "gap{}".format(gap))) - - if os.path.isfile(problem_filename) and not disable_cache: - (cost, tour, duration) = load_dataset(problem_filename) - else: - # 0 = start, 1 = end so add depot twice - start = time.time() - - cost, tour = solve_euclidian_tsp_gurobi(loc, threads=1, timeout=timeout, gap=gap) - duration = time.time() - start # Measure clock time - save_dataset((cost, tour, duration), problem_filename) - - # First and last node are depot(s), so first node is 2 but should be 1 (as depot is 0) so subtract 1 - total_cost = calc_tsp_length(loc, tour) - assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect" - return total_cost, tour, duration - - except Exception as e: - # For some stupid reason, sometimes OR tools cannot find a feasible solution? 
- # By letting it fail we do not get total results, but we dcan retry by the caching mechanism - print("Exception occured") - print(e) - return None - - -def solve_concorde_log(executable, directory, name, loc, disable_cache=False): - - problem_filename = os.path.join(directory, "{}.tsp".format(name)) - tour_filename = os.path.join(directory, "{}.tour".format(name)) - output_filename = os.path.join(directory, "{}.concorde.pkl".format(name)) - log_filename = os.path.join(directory, "{}.log".format(name)) - - # if True: - try: - # May have already been run - if os.path.isfile(output_filename) and not disable_cache: - tour, duration = load_dataset(output_filename) - else: - write_tsplib(problem_filename, loc, name=name) - - with open(log_filename, 'w') as f: - start = time.time() - try: - # Concorde is weird, will leave traces of solution in current directory so call from target dir - check_call([executable, '-s', '1234', '-x', '-o', - os.path.abspath(tour_filename), os.path.abspath(problem_filename)], - stdout=f, stderr=f, cwd=directory) - except CalledProcessError as e: - # Somehow Concorde returns 255 - assert e.returncode == 255 - duration = time.time() - start - - tour = read_concorde_tour(tour_filename) - save_dataset((tour, duration), output_filename) - - return calc_tsp_length(loc, tour), tour, duration - - except Exception as e: - print("Exception occured") - print(e) - return None - - -def solve_lkh_log(executable, directory, name, loc, runs=1, disable_cache=False): - - problem_filename = os.path.join(directory, "{}.lkh{}.vrp".format(name, runs)) - tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs)) - output_filename = os.path.join(directory, "{}.lkh{}.pkl".format(name, runs)) - param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs)) - log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs)) - - try: - # May have already been run - if os.path.isfile(output_filename) and not disable_cache: - tour, duration = load_dataset(output_filename) - else: - write_tsplib(problem_filename, loc, name=name) - - params = {"PROBLEM_FILE": problem_filename, "OUTPUT_TOUR_FILE": tour_filename, "RUNS": runs, "SEED": 1234} - write_lkh_par(param_filename, params) - - with open(log_filename, 'w') as f: - start = time.time() - check_call([executable, param_filename], stdout=f, stderr=f) - duration = time.time() - start - - tour = read_tsplib(tour_filename) - save_dataset((tour, duration), output_filename) - - return calc_tsp_length(loc, tour), tour, duration - - except Exception as e: - print("Exception occured") - print(e) - return None - - -def write_lkh_par(filename, parameters): - default_parameters = { # Use none to include as flag instead of kv - "MAX_TRIALS": 10000, - "RUNS": 10, - "TRACE_LEVEL": 1, - "SEED": 0 - } - with open(filename, 'w') as f: - for k, v in {**default_parameters, **parameters}.items(): - if v is None: - f.write("{}\n".format(k)) - else: - f.write("{} = {}\n".format(k, v)) - - -def write_tsplib(filename, loc, name="problem"): - - with open(filename, 'w') as f: - f.write("\n".join([ - "{} : {}".format(k, v) - for k, v in ( - ("NAME", name), - ("TYPE", "TSP"), - ("DIMENSION", len(loc)), - ("EDGE_WEIGHT_TYPE", "EUC_2D"), - ) - ])) - f.write("\n") - f.write("NODE_COORD_SECTION\n") - f.write("\n".join([ - "{}\t{}\t{}".format(i + 1, int(x * 10000000 + 0.5), int(y * 10000000 + 0.5)) # tsplib does not take floats - for i, (x, y) in enumerate(loc) - ])) - f.write("\n") - f.write("EOF\n") - - -def read_concorde_tour(filename): - 
with open(filename, 'r') as f: - n = None - tour = [] - for line in f: - if n is None: - n = int(line) - else: - tour.extend([int(node) for node in line.rstrip().split(" ")]) - assert len(tour) == n, "Unexpected tour length" - return tour - - -def read_tsplib(filename): - with open(filename, 'r') as f: - tour = [] - dimension = 0 - started = False - for line in f: - if started: - loc = int(line) - if loc == -1: - break - tour.append(loc) - if line.startswith("DIMENSION"): - dimension = int(line.split(" ")[-1]) - - if line.startswith("TOUR_SECTION"): - started = True - - assert len(tour) == dimension - tour = np.array(tour).astype(int) - 1 # Subtract 1 as depot is 1 and should be 0 - return tour.tolist() - - -def calc_tsp_length(loc, tour): - assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates" - assert len(tour) == len(loc) - sorted_locs = np.array(loc)[np.concatenate((tour, [tour[0]]))] - return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum() - - -def _calc_insert_cost(D, prv, nxt, ins): - """ - Calculates insertion costs of inserting ins between prv and nxt - :param D: distance matrix - :param prv: node before inserted node, can be vector - :param nxt: node after inserted node, can be vector - :param ins: node to insert - :return: - """ - return ( - D[prv, ins] - + D[ins, nxt] - - D[prv, nxt] - ) - - -def run_insertion(loc, method): - n = len(loc) - D = distance_matrix(loc, loc) - - mask = np.zeros(n, dtype=bool) - tour = [] # np.empty((0, ), dtype=int) - for i in range(n): - feas = mask == 0 - feas_ind = np.flatnonzero(mask == 0) - if method == 'random': - # Order of instance is random so do in order for deterministic results - a = i - elif method == 'nearest': - if i == 0: - a = 0 # order does not matter so first is random - else: - a = feas_ind[D[np.ix_(feas, ~feas)].min(1).argmin()] # node nearest to any in tour - elif method == 'cheapest': - assert False, "Not yet implemented" # try all and find cheapest insertion cost - - elif method == 'farthest': - if i == 0: - a = D.max(1).argmax() # Node with farthest distance to any other node - else: - a = feas_ind[D[np.ix_(feas, ~feas)].min(1).argmax()] # node which has closest node in tour farthest - mask[a] = True - - if len(tour) == 0: - tour = [a] - else: - # Find index with least insert cost - ind_insert = np.argmin( - _calc_insert_cost( - D, - tour, - np.roll(tour, -1), - a - ) - ) - tour.insert(ind_insert + 1, a) - - cost = D[tour, np.roll(tour, -1)].sum() - return cost, tour - - -def solve_insertion(directory, name, loc, method='random'): - start = time.time() - cost, tour = run_insertion(loc, method) - duration = time.time() - start - return cost, tour, duration - - -def calc_batch_pdist(dataset): - diff = (dataset[:, :, None, :] - dataset[:, None, :, :]) - return torch.matmul(diff[:, :, :, None, :], diff[:, :, :, :, None]).squeeze(-1).squeeze(-1).sqrt() - - -def nearest_neighbour(dataset, start='first'): - dist = calc_batch_pdist(dataset) - - batch_size, graph_size, _ = dataset.size() - - total_dist = dataset.new(batch_size).zero_() - - if not isinstance(start, torch.Tensor): - if start == 'random': - start = dataset.new().long().new(batch_size).zero_().random_(0, graph_size) - elif start == 'first': - start = dataset.new().long().new(batch_size).zero_() - elif start == 'center': - _, start = dist.mean(2).min(1) # Minimum total distance to others - else: - assert False, "Unknown start: {}".format(start) - - current = start - dist_to_startnode = torch.gather(dist, 2, current.view(-1, 1, 
1).expand(batch_size, graph_size, 1)).squeeze(2) - tour = [current] - - for i in range(graph_size - 1): - # Mark out current node as option - dist.scatter_(2, current.view(-1, 1, 1).expand(batch_size, graph_size, 1), np.inf) - nn_dist = torch.gather(dist, 1, current.view(-1, 1, 1).expand(batch_size, 1, graph_size)).squeeze(1) - - min_nn_dist, current = nn_dist.min(1) - total_dist += min_nn_dist - tour.append(current) - - total_dist += torch.gather(dist_to_startnode, 1, current.view(-1, 1)).squeeze(1) - - return total_dist, torch.stack(tour, dim=1) - - -def solve_all_nn(dataset_path, eval_batch_size=1024, no_cuda=False, dataset_n=None, progress_bar_mininterval=0.1): - import torch - from torch.utils.data import DataLoader - from problems import TSP - from utils import move_to - - dataloader = DataLoader( - TSP.make_dataset(filename=dataset_path, num_samples=dataset_n if dataset_n is not None else 1000000), - batch_size=eval_batch_size - ) - device = torch.device("cuda:0" if torch.cuda.is_available() and not no_cuda else "cpu") - results = [] - for batch in tqdm(dataloader, mininterval=progress_bar_mininterval): - start = time.time() - batch = move_to(batch, device) - - lengths, tours = nearest_neighbour(batch) - lengths_check, _ = TSP.get_costs(batch, tours) - - assert (torch.abs(lengths - lengths_check.data) < 1e-5).all() - - duration = time.time() - start - results.extend( - [(cost.item(), np.trim_zeros(pi.cpu().numpy(), 'b'), duration) for cost, pi in zip(lengths, tours)]) - - return results, eval_batch_size - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("method", - help="Name of the method to evaluate, 'nn', 'gurobi' or '(nearest|random|farthest)_insertion'") - parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument("-o", default=None, help="Name of the results file to write") - parser.add_argument("--cpus", type=int, help="Number of CPUs to use, defaults to all cores") - parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA (only for Tsiligirides)') - parser.add_argument('--disable_cache', action='store_true', help='Disable caching') - parser.add_argument('--max_calc_batch_size', type=int, default=1000, help='Size for subbatches') - parser.add_argument('--progress_bar_mininterval', type=float, default=0.1, help='Minimum interval') - parser.add_argument('-n', type=int, help="Number of instances to process") - parser.add_argument('--offset', type=int, help="Offset where to start processing") - parser.add_argument('--results_dir', default='results', help="Name of results directory") - - opts = parser.parse_args() - - assert opts.o is None or len(opts.datasets) == 1, "Cannot specify result filename with more than one dataset" - - for dataset_path in opts.datasets: - - assert os.path.isfile(check_extension(dataset_path)), "File does not exist!" - - dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1]) - - if opts.o is None: - results_dir = os.path.join(opts.results_dir, "tsp", dataset_basename) - os.makedirs(results_dir, exist_ok=True) - - out_file = os.path.join(results_dir, "{}{}{}-{}{}".format( - dataset_basename, - "offs{}".format(opts.offset) if opts.offset is not None else "", - "n{}".format(opts.n) if opts.n is not None else "", - opts.method, ext - )) - else: - out_file = opts.o - - assert opts.f or not os.path.isfile( - out_file), "File already exists! 
Try running with -f option to overwrite." - - match = re.match(r'^([a-z_]+)(\d*)$', opts.method) - assert match - method = match[1] - runs = 1 if match[2] == '' else int(match[2]) - - if method == "nn": - assert opts.offset is None, "Offset not supported for nearest neighbor" - - eval_batch_size = opts.max_calc_batch_size - - results, parallelism = solve_all_nn( - dataset_path, eval_batch_size, opts.no_cuda, opts.n, - opts.progress_bar_mininterval - ) - elif method in ("gurobi", "gurobigap", "gurobit", "concorde", "lkh") or method[-9:] == 'insertion': - - target_dir = os.path.join(results_dir, "{}-{}".format( - dataset_basename, - opts.method - )) - assert opts.f or not os.path.isdir(target_dir), \ - "Target dir already exists! Try running with -f option to overwrite." - - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - - # TSP contains single loc array rather than tuple - dataset = [(instance, ) for instance in load_dataset(dataset_path)] - - if method == "concorde": - use_multiprocessing = False - executable = os.path.abspath(os.path.join('problems', 'tsp', 'concorde', 'concorde', 'TSP', 'concorde')) - - def run_func(args): - return solve_concorde_log(executable, *args, disable_cache=opts.disable_cache) - - elif method == "lkh": - use_multiprocessing = False - executable = get_lkh_executable() - - def run_func(args): - return solve_lkh_log(executable, *args, runs=runs, disable_cache=opts.disable_cache) - - elif method[:6] == "gurobi": - use_multiprocessing = True # We run one thread per instance - - def run_func(args): - return solve_gurobi(*args, disable_cache=opts.disable_cache, - timeout=runs if method[6:] == "t" else None, - gap=float(runs) if method[6:] == "gap" else None) - else: - assert method[-9:] == "insertion" - use_multiprocessing = True - - def run_func(args): - return solve_insertion(*args, opts.method.split("_")[0]) - - results, parallelism = run_all_in_pool( - run_func, - target_dir, dataset, opts, use_multiprocessing=use_multiprocessing - ) - - else: - assert False, "Unknown method: {}".format(opts.method) - - costs, tours, durations = zip(*results) # Not really costs since they should be negative - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - - save_dataset((results, parallelism), out_file) diff --git a/AM/problems/vrp/.gitignore b/AM/problems/vrp/.gitignore deleted file mode 100644 index 2fbcab9..0000000 --- a/AM/problems/vrp/.gitignore +++ /dev/null @@ -1 +0,0 @@ -lkh/ \ No newline at end of file diff --git a/AM/problems/vrp/__init__.py b/AM/problems/vrp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AM/problems/vrp/encode-attend-navigate/Neural_Reinforce.py b/AM/problems/vrp/encode-attend-navigate/Neural_Reinforce.py deleted file mode 100644 index e028070..0000000 --- a/AM/problems/vrp/encode-attend-navigate/Neural_Reinforce.py +++ /dev/null @@ -1,464 +0,0 @@ - -# coding: utf-8 - -# # Neural Combinatorial Optimization - -# In[1]: - - -#-*- coding: utf-8 -*- -import tensorflow as tf -distr = tf.contrib.distributions - -import numpy as np -from tqdm import tqdm -import os -import matplotlib.pyplot as plt - -from utils import embed_seq, encode_seq, full_glimpse, pointer 
-from data_generator import DataGenerator - -from datetime import timedelta - -import time - -# ## 1. Data Generator - -# In[2]: - - -dataset = DataGenerator() # Create Data Generator - -input_batch = dataset.test_batch(batch_size=128, max_length=50, dimension=2, seed=123) # Generate some data -dataset.visualize_2D_trip(input_batch[0]) # 2D plot for coord batch - - -# ## 2. Config - -# In[19]: - - -import argparse - -parser = argparse.ArgumentParser(description='Configuration file') -arg_lists = [] - -def add_argument_group(name): - arg = parser.add_argument_group(name) - arg_lists.append(arg) - return arg - -def str2bool(v): - return v.lower() in ('true', '1') - -# Data -data_arg = add_argument_group('Data') -data_arg.add_argument('--batch_size', type=int, default=512, help='batch size') -data_arg.add_argument('--max_length', type=int, default=50, help='number of cities') ##### ##### -data_arg.add_argument('--dimension', type=int, default=2, help='city dimension') -data_arg.add_argument('--greedy', action='store_true', help='Use greedy decoding') - -# Network -net_arg = add_argument_group('Network') -net_arg.add_argument('--input_embed', type=int, default=128, help='actor critic input embedding') -net_arg.add_argument('--num_neurons', type=int, default=512, help='encoder inner layer neurons') -net_arg.add_argument('--num_stacks', type=int, default=3, help='encoder num stacks') -net_arg.add_argument('--num_heads', type=int, default=16, help='encoder num heads') -net_arg.add_argument('--query_dim', type=int, default=360, help='decoder query space dimension') -net_arg.add_argument('--num_units', type=int, default=256, help='decoder and critic attention product space') -net_arg.add_argument('--num_neurons_critic', type=int, default=256, help='critic n-1 layer') - -# Train / test parameters -train_arg = add_argument_group('Training') -train_arg.add_argument('--nb_epochs', type=int, default=100, help='nb epochs') -train_arg.add_argument('--nb_steps', type=int, default=2500, help='nb steps') -train_arg.add_argument('--init_B', type=float, default=7., help='critic init baseline') -train_arg.add_argument('--lr_start', type=float, default=0.001, help='actor learning rate') -train_arg.add_argument('--lr_decay_step', type=int, default=5000, help='lr1 decay step') -train_arg.add_argument('--lr_decay_rate', type=float, default=0.96, help='lr1 decay rate') -train_arg.add_argument('--temperature', type=float, default=1.0, help='pointer initial temperature') -train_arg.add_argument('--C', type=float, default=10.0, help='pointer tanh clipping') -train_arg.add_argument('--is_training', type=str2bool, default=True, help='switch to inference mode when model is trained') -train_arg.add_argument('--n_test', nargs='*', type=int, help='sizes to test on') - -def get_config(): - config, unparsed = parser.parse_known_args() - return config, unparsed - - -# In[20]: - - -config, _ = get_config() -print(vars(config)) -dir_ = str(config.dimension)+'D_'+'TSP'+str(config.max_length) +'_b'+str(config.batch_size)+'_e'+str(config.input_embed)+'_n'+str(config.num_neurons)+'_s'+str(config.num_stacks)+'_h'+str(config.num_heads)+ '_q'+str(config.query_dim) +'_u'+str(config.num_units)+'_c'+str(config.num_neurons_critic)+ '_lr'+str(config.lr_start)+'_d'+str(config.lr_decay_step)+'_'+str(config.lr_decay_rate)+ '_T'+str(config.temperature)+ '_steps'+str(config.nb_steps)+'_i'+str(config.init_B) -print(dir_) - - -# ## 3. 
Model - -# In[21]: - - -class Actor(object): - - def __init__(self): - - # Data config - self.batch_size = config.batch_size # batch size - self.max_length = config.max_length # input sequence length (number of cities) - self.dimension = config.dimension # dimension of a city (coordinates) - self.greedy = config.greedy # whether to use greedy decoding - - # Network config - self.input_embed = config.input_embed # dimension of embedding space - self.num_neurons = config.num_neurons # dimension of hidden states (encoder) - self.num_stacks = config.num_stacks # encoder num stacks - self.num_heads = config.num_heads # encoder num heads - self.query_dim = config.query_dim # decoder query space dimension - self.num_units = config.num_units # dimension of attention product space (decoder and critic) - self.num_neurons_critic = config.num_neurons_critic # critic n-1 layer num neurons - self.initializer = tf.contrib.layers.xavier_initializer() # variables initializer - - # Training config (actor and critic) - self.global_step = tf.Variable(0, trainable=False, name="global_step") # actor global step - self.global_step2 = tf.Variable(0, trainable=False, name="global_step2") # critic global step - self.init_B = config.init_B # critic initial baseline - self.lr_start = config.lr_start # initial learning rate - self.lr_decay_step = config.lr_decay_step # learning rate decay step - self.lr_decay_rate = config.lr_decay_rate # learning rate decay rate - self.is_training = config.is_training # swith to False if test mode - - # Tensor block holding the input sequences [Batch Size, Sequence Length, Features] - self.input_ = tf.placeholder(tf.float32, [None, self.max_length, self.dimension], name="input_coordinates") - - with tf.variable_scope("actor"): self.encode_decode() - with tf.variable_scope("critic"): self.build_critic() - with tf.variable_scope("environment"): self.build_reward() - with tf.variable_scope("optimizer"): self.build_optim() - self.merged = tf.summary.merge_all() - - - def encode_decode(self): - actor_embedding = embed_seq(input_seq=self.input_, from_=self.dimension, to_= self.input_embed, is_training=self.is_training, BN=True, initializer=self.initializer) - actor_encoding = encode_seq(input_seq=actor_embedding, input_dim=self.input_embed, num_stacks=self.num_stacks, num_heads=self.num_heads, num_neurons=self.num_neurons, is_training=self.is_training) - if self.is_training == False: - actor_encoding = tf.tile(actor_encoding,[self.batch_size,1,1]) - - idx_list, log_probs, entropies = [], [], [] # tours index, log_probs, entropies - mask = tf.zeros((self.batch_size, self.max_length)) # mask for actions - - n_hidden = actor_encoding.get_shape().as_list()[2] # input_embed - W_ref = tf.get_variable("W_ref",[1, n_hidden, self.num_units],initializer=self.initializer) - W_q = tf.get_variable("W_q",[self.query_dim, self.num_units],initializer=self.initializer) - v = tf.get_variable("v",[self.num_units],initializer=self.initializer) - - encoded_ref = tf.nn.conv1d(actor_encoding, W_ref, 1, "VALID") # actor_encoding is the ref for actions [Batch size, seq_length, n_hidden] - query1 = tf.zeros((self.batch_size, n_hidden)) # initial state - query2 = tf.zeros((self.batch_size, n_hidden)) # previous state - query3 = tf.zeros((self.batch_size, n_hidden)) # previous previous state - - W_1 =tf.get_variable("W_1",[n_hidden, self.query_dim],initializer=self.initializer) # update trajectory (state) - W_2 =tf.get_variable("W_2",[n_hidden, self.query_dim],initializer=self.initializer) - W_3 
=tf.get_variable("W_3",[n_hidden, self.query_dim],initializer=self.initializer) - - for step in range(self.max_length): # sample from POINTER - query = tf.nn.relu(tf.matmul(query1, W_1) + tf.matmul(query2, W_2) + tf.matmul(query3, W_3)) - logits = pointer(encoded_ref=encoded_ref, query=query, mask=mask, W_ref=W_ref, W_q=W_q, v=v, C=config.C, temperature=config.temperature) - prob = distr.Categorical(logits) # logits = masked_scores - idx = prob.mode() if self.greedy else prob.sample() - - idx_list.append(idx) # tour index - log_probs.append(prob.log_prob(idx)) # log prob - entropies.append(prob.entropy()) # entropies - mask = mask + tf.one_hot(idx, self.max_length) # mask - - idx_ = tf.stack([tf.range(self.batch_size,dtype=tf.int32), idx],1) # idx with batch - query3 = query2 - query2 = query1 - query1 = tf.gather_nd(actor_encoding, idx_) # update trajectory (state) - - idx_list.append(idx_list[0]) # return to start - self.tour = tf.stack(idx_list, axis=1) # permutations - self.log_prob = tf.add_n(log_probs) # corresponding log-probability for backprop - self.entropies = tf.add_n(entropies) - tf.summary.scalar('log_prob_mean', tf.reduce_mean(self.log_prob)) - tf.summary.scalar('entropies_mean', tf.reduce_mean(self.entropies)) - - - def build_reward(self): # reorder input % tour and return tour length (euclidean distance) - self.permutations = tf.stack([tf.tile(tf.expand_dims(tf.range(self.batch_size,dtype=tf.int32),1),[1,self.max_length+1]),self.tour],2) - if self.is_training==True: - self.ordered_input_ = tf.gather_nd(self.input_,self.permutations) - else: - self.ordered_input_ = tf.gather_nd(tf.tile(self.input_,[self.batch_size,1,1]),self.permutations) - self.ordered_input_ = tf.transpose(self.ordered_input_,[2,1,0]) # [features, seq length +1, batch_size] Rq: +1 because end = start - - ordered_x_ = self.ordered_input_[0] # ordered x, y coordinates [seq length +1, batch_size] - ordered_y_ = self.ordered_input_[1] # ordered y coordinates [seq length +1, batch_size] - delta_x2 = tf.transpose(tf.square(ordered_x_[1:]-ordered_x_[:-1]),[1,0]) # [batch_size, seq length] delta_x**2 - delta_y2 = tf.transpose(tf.square(ordered_y_[1:]-ordered_y_[:-1]),[1,0]) # [batch_size, seq length] delta_y**2 - - inter_city_distances = tf.sqrt(delta_x2+delta_y2) # sqrt(delta_x**2 + delta_y**2) this is the euclidean distance between each city: depot --> ... 
---> depot [batch_size, seq length] - self.distances = tf.reduce_sum(inter_city_distances, axis=1) # [batch_size] - self.reward = tf.cast(self.distances,tf.float32) # define reward from tour length - tf.summary.scalar('reward_mean', tf.reduce_mean(self.reward)) - - - def build_critic(self): - critic_embedding = embed_seq(input_seq=self.input_, from_=self.dimension, to_= self.input_embed, is_training=self.is_training, BN=True, initializer=self.initializer) - critic_encoding = encode_seq(input_seq=critic_embedding, input_dim=self.input_embed, num_stacks=self.num_stacks, num_heads=self.num_heads, num_neurons=self.num_neurons, is_training=self.is_training) - frame = full_glimpse(ref=critic_encoding, from_=self.input_embed, to_=self.num_units, initializer=tf.contrib.layers.xavier_initializer()) # Glimpse on critic_encoding [Batch_size, input_embed] - - with tf.variable_scope("ffn"): # 2 dense layers for predictions - h0 = tf.layers.dense(frame, self.num_neurons_critic, activation=tf.nn.relu, kernel_initializer=self.initializer) - w1 = tf.get_variable("w1", [self.num_neurons_critic, 1], initializer=self.initializer) - b1 = tf.Variable(self.init_B, name="b1") - self.predictions = tf.squeeze(tf.matmul(h0, w1)+b1) - tf.summary.scalar('predictions_mean', tf.reduce_mean(self.predictions)) - - def build_optim(self): - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): # Update moving_mean and moving_variance for BN - - with tf.name_scope('reinforce'): - lr1 = tf.train.natural_exp_decay(learning_rate=self.lr_start, global_step=self.global_step, decay_steps=self.lr_decay_step, decay_rate=self.lr_decay_rate, staircase=False, name="learning_rate1") # learning rate actor - tf.summary.scalar('lr', lr1) - opt1 = tf.train.AdamOptimizer(learning_rate=lr1) # Optimizer - self.loss = tf.reduce_mean(tf.stop_gradient(self.reward-self.predictions)*self.log_prob, axis=0) # loss actor - gvs1 = opt1.compute_gradients(self.loss) # gradients - capped_gvs1 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs1 if grad is not None] # L2 clip - self.trn_op1 = opt1.apply_gradients(grads_and_vars=capped_gvs1, global_step=self.global_step) # minimize op actor - - with tf.name_scope('state_value'): - lr2 = tf.train.natural_exp_decay(learning_rate=self.lr_start, global_step=self.global_step2, decay_steps=self.lr_decay_step, decay_rate=self.lr_decay_rate, staircase=False, name="learning_rate2") # learning rate critic - opt2 = tf.train.AdamOptimizer(learning_rate=lr2) # Optimizer - loss2 = tf.losses.mean_squared_error(self.reward, self.predictions) # loss critic - gvs2 = opt2.compute_gradients(loss2) # gradients - capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs2 if grad is not None] # L2 clip - self.trn_op2 = opt2.apply_gradients(grads_and_vars=capped_gvs2, global_step=self.global_step2) # minimize op critic - - -# In[22]: - - -tf.reset_default_graph() -actor = Actor() # Build graph - - -# In[23]: - - -variables_to_save = [v for v in tf.global_variables() if 'Adam' not in v.name] # Save & restore all the variables. 
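# The actor/critic objectives above are REINFORCE with a learned baseline: the advantage
# (reward - predictions) is frozen with tf.stop_gradient for the actor loss, while the critic is
# regressed onto the reward with an MSE loss, each with its own Adam optimizer and gradient
# clipping. A minimal PyTorch rendering of the same two losses, with random tensors standing in
# for the model outputs (a sketch only, not the project's training code):
import torch
import torch.nn.functional as F

log_prob = torch.randn(128, requires_grad=True)   # sum_t log p(a_t) for each sampled tour
reward = 10.0 * torch.rand(128)                   # tour lengths (lower is better)
baseline = torch.rand(128, requires_grad=True)    # critic's predicted tour lengths

actor_loss = ((reward - baseline).detach() * log_prob).mean()   # .detach() plays the role of stop_gradient
critic_loss = F.mse_loss(baseline, reward)
(actor_loss + critic_loss).backward()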
-saver = tf.train.Saver(var_list=variables_to_save, keep_checkpoint_every_n_hours=1.0) - - -# In[24]: - - -with tf.Session() as sess: # start session - sess.run(tf.global_variables_initializer()) # Run initialize op - variables_names = [v.name for v in tf.trainable_variables() if 'Adam' not in v.name] - values = sess.run(variables_names) - for k, v in zip(variables_names, values): - #print("Variable: ", k, "Shape: ", v.shape) # print all variables - pass - - -# ## 4. Train - -# In[25]: - - -np.random.seed(123) # reproducibility -tf.set_random_seed(123) - - -# In[26]: - - -with tf.Session() as sess: # start session - sess.run(tf.global_variables_initializer()) # run initialize op - writer = tf.summary.FileWriter('summary/'+dir_, sess.graph) # summary writer - - save_path = "save/"+dir_ - if not os.path.exists(save_path): - os.makedirs(save_path) - for epoch in range(config.nb_epochs): - print(f'Start epoch {epoch}') - for i in tqdm(range(config.nb_steps), mininterval=30): # Forward pass & train step - input_batch = dataset.train_batch(actor.batch_size, actor.max_length, actor.dimension) - feed = {actor.input_: input_batch} # get feed dict - reward, predictions, summary, _, _ = sess.run([actor.reward, actor.predictions, actor.merged, actor.trn_op1, actor.trn_op2], feed_dict=feed) - - if i % 500 == 0: - print('reward',np.mean(reward)) - print('predictions',np.mean(predictions)) - writer.add_summary(summary,i) - if epoch % 10 == 0: - saver.save(sess, save_path+"/actor{epoch}.ckpt") # save the variables to disk - - if config.nb_epochs > 0: - saver.save(sess, save_path+"/actor.ckpt") # save the variables to disk - print("Training COMPLETED! Model saved in file: %s" % save_path) - - -# ## 5. Test - -# In[ ]: - - -import pickle -from sklearn.decomposition import PCA - -def preprocess_instance(sequence): # Generate random TSP instance - sequence = np.array(sequence) - pca = PCA(n_components=sequence.shape[-1]) # center & rotate coordinates - sequence = pca.fit_transform(sequence) - return sequence - -def load_dataset(size): - with open(f"tsp{size}_test_seed1234.pkl", 'rb') as f: - return [np.array(inst) for inst in pickle.load(f)] -# return [ -# preprocess_instance(seq) -# for seq in ds -# ] - -def calc_tsp_length(loc, tour): - assert len(np.unique(tour)) == len(tour), "Tour cannot contain duplicates" - assert len(tour) == len(loc) - sorted_locs = np.array(loc)[np.concatenate((tour, [tour[0]]))] - return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum() - -test_instance_size = 100 -dataset_test = load_dataset(test_instance_size) - - -# In[ ]: - - - - - -# In[ ]: - - - -def eval_dataset(ds, greedy, num_samples, save_path, max_length, seed=123): - np.random.seed(seed) # reproducibility - tf.set_random_seed(seed) - - config.is_training = False - config.greedy = greedy - config.batch_size = num_samples ##### ##### - config.max_length = max_length ##### ##### - config.temperature = 1.2 ##### ##### - - tf.reset_default_graph() - actor = Actor() # Build graph - - variables_to_save = [v for v in tf.global_variables() if 'Adam' not in v.name] # Save & restore all the variables. - saver = tf.train.Saver(var_list=variables_to_save, keep_checkpoint_every_n_hours=1.0) - - with tf.Session() as sess: # start session - sess.run(tf.global_variables_initializer()) # Run initialize op - - # save_path = "save/"+dir_ - saver.restore(sess, save_path+"/actor.ckpt") # Restore variables from disk. 
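# preprocess_instance() above centres and rotates each instance with a full-rank PCA before it is
# fed to the actor. That is a translation followed by an orthogonal map, so pairwise distances
# (and therefore tour lengths) are unchanged. A quick standalone check, assuming only numpy,
# scipy and scikit-learn (names local to this sketch):
import numpy as np
from scipy.spatial import distance_matrix
from sklearn.decomposition import PCA

coords = np.random.rand(50, 2)                        # one random TSP-50 instance
rotated = PCA(n_components=2).fit_transform(coords)   # centre & rotate, as in preprocess_instance
assert np.allclose(distance_matrix(coords, coords), distance_matrix(rotated, rotated))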
- - predictions_length, predictions_length_w2opt = [], [] - results, results_w2opt = [], [] - # for i in tqdm(range(1000)): # test instance - # seed_ = 1+i - # input_batch = dataset.test_batch(1, actor.max_length, actor.dimension, seed=seed_, shuffle=False) - for i, coords in enumerate(tqdm(ds, mininterval=30)): - start = time.time() - input_batch = [preprocess_instance(coords)] # Add batch dimension - feed = {actor.input_: input_batch} # Get feed dict - tour, reward = sess.run([actor.tour, actor.reward], feed_dict=feed) # sample tours - j = np.argmin(reward) # find best solution - best_permutation = tour[j][:-1] - predictions_length.append(reward[j]) - -# print('reward (before 2 opt)',reward[j]) -# dataset.visualize_2D_trip(input_batch[0][best_permutation]) -# dataset.visualize_sampling(tour) - - results.append((calc_tsp_length(coords, best_permutation), best_permutation, time.time() - start)) - - - opt_tour, opt_length = dataset.loop2opt(input_batch[0][best_permutation]) - predictions_length_w2opt.append(opt_length) - -# print('reward (with 2 opt)', opt_length) -# dataset.visualize_2D_trip(opt_tour) - - # Find permutation corresponding to opt_tour - opt_tour_ind = np.linalg.norm(opt_tour[:, None, :] - input_batch[0][None, :, :], axis=-1).argmin(-1) - - results_w2opt.append((calc_tsp_length(coords, opt_tour_ind), opt_tour_ind, time.time() - start)) - - - predictions_length = np.asarray(predictions_length) # average tour length - predictions_length_w2opt = np.asarray(predictions_length_w2opt) - print("Testing COMPLETED ! Mean length1:",np.mean(predictions_length), "Mean length2:",np.mean(predictions_length_w2opt)) - - return results, results_w2opt - -# n1, bins1, patches1 = plt.hist(predictions_length, 50, facecolor='b', alpha=0.75) # Histogram -# n2, bins2, patches2 = plt.hist(predictions_length_w2opt, 50, facecolor='g', alpha=0.75) # Histogram -# plt.xlabel('Tour length') -# plt.ylabel('Counts') -# plt.axis([3., 9., 0, 250]) -# plt.grid(True) -# plt.show() - - -# In[ ]: - - -def print_summary(results, parallelism=1): - costs, tours, durations = zip(*results) # Not really costs since they should be negative - print("Number of instances: {}".format(len(costs))) - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - -def save_results(size, method, results, parallelism=1): - print(f"----- results for {method}") - print_summary(results) - results_dir = 'results' - os.makedirs(results_dir, exist_ok=True) - out_file = os.path.join(results_dir, "tsp{}_test_seed1234-{}.pkl".format( - size, - method - )) - - with open(out_file, 'wb') as f: - pickle.dump((results, parallelism), f, pickle.HIGHEST_PROTOCOL) - -# print_summary(results) -# print_summary(results_2opt) -# save_results(config.max_length, 'deudongreedy', results_greedy) -# save_results(config.max_length, 'deudongreedy2opt', results_greedy_2opt) - - -# In[ ]: - -train_size = config.max_length -for test_instance_size in config.n_test: - - dataset_test = load_dataset(test_instance_size) - save_path = "save/"+dir_ - parallelism = 1 - # eval_dataset(dataset_test50, False, 10, save_path50) - results_greedy, results_greedy_2opt = eval_dataset(dataset_test, True, 1, save_path, test_instance_size) - 
save_results(test_instance_size, f'deudon{train_size}greedy', results_greedy) - save_results(test_instance_size, f'deudon{train_size}greedy2opt', results_greedy_2opt) - results_sampling, results_sampling_2opt = eval_dataset(dataset_test, False, 1280, save_path, test_instance_size) - save_results(test_instance_size, f'deudon{train_size}sampling', results_sampling) - save_results(test_instance_size, f'deudon{train_size}sampling2opt', results_sampling_2opt) - diff --git a/AM/problems/vrp/encode-attend-navigate/data_generator.py b/AM/problems/vrp/encode-attend-navigate/data_generator.py deleted file mode 100644 index 313d6d6..0000000 --- a/AM/problems/vrp/encode-attend-navigate/data_generator.py +++ /dev/null @@ -1,109 +0,0 @@ -#-*- coding: utf-8 -*- -import numpy as np -import matplotlib.pyplot as plt -import math -from sklearn.decomposition import PCA - - -# Compute a sequence's reward -def reward(tsp_sequence): - tour = np.concatenate((tsp_sequence, np.expand_dims(tsp_sequence[0],0))) # sequence to tour (end=start) - inter_city_distances = np.sqrt(np.sum(np.square(tour[:-1,:2]-tour[1:,:2]),axis=1)) # tour length - return np.sum(inter_city_distances) # reward - -# Swap city[i] with city[j] in sequence -def swap2opt(tsp_sequence,i,j): - new_tsp_sequence = np.copy(tsp_sequence) - new_tsp_sequence[i:j+1] = np.flip(tsp_sequence[i:j+1], axis=0) # flip or swap ? - return new_tsp_sequence - -# One step of 2opt = one double loop and return first improved sequence -def step2opt(tsp_sequence): - seq_length = tsp_sequence.shape[0] - distance = reward(tsp_sequence) - for i in range(1,seq_length-1): - for j in range(i+1,seq_length): - new_tsp_sequence = swap2opt(tsp_sequence,i,j) - new_distance = reward(new_tsp_sequence) - if new_distance < distance: - return new_tsp_sequence, new_distance - return tsp_sequence, distance - - -class DataGenerator(object): - - def __init__(self): - pass - - def gen_instance(self, max_length, dimension, seed=0): # Generate random TSP instance - if seed!=0: np.random.seed(seed) - sequence = np.random.rand(max_length, dimension) # (max_length) cities with (dimension) coordinates in [0,1] - pca = PCA(n_components=dimension) # center & rotate coordinates - sequence = pca.fit_transform(sequence) - return sequence - - def train_batch(self, batch_size, max_length, dimension): # Generate random batch for training procedure - input_batch = [] - for _ in range(batch_size): - input_ = self.gen_instance(max_length, dimension) # Generate random TSP instance - input_batch.append(input_) # Store batch - return input_batch - - def test_batch(self, batch_size, max_length, dimension, seed=0, shuffle=False): # Generate random batch for testing procedure - input_batch = [] - input_ = self.gen_instance(max_length, dimension, seed=seed) # Generate random TSP instance - for _ in range(batch_size): - sequence = np.copy(input_) - if shuffle==True: - np.random.shuffle(sequence) # Shuffle sequence - input_batch.append(sequence) # Store batch - return input_batch - - def loop2opt(self, tsp_sequence, max_iter=2000): # Iterate step2opt max_iter times (2-opt local search) - best_reward = reward(tsp_sequence) - new_tsp_sequence = np.copy(tsp_sequence) - for _ in range(max_iter): - new_tsp_sequence, new_reward = step2opt(new_tsp_sequence) - if new_reward < best_reward: - best_reward = new_reward - else: - break - return new_tsp_sequence, best_reward - - def visualize_2D_trip(self, trip): # Plot tour - plt.figure(1) - colors = ['red'] # First city red - for i in range(len(trip)-1): - colors.append('blue') - 
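# step2opt()/loop2opt() above perform first-improvement 2-opt: reversing the segment between
# positions i and j replaces edges (i-1, i) and (j, j+1) by (i-1, j) and (i, j+1), leaving every
# other edge length untouched. A tiny standalone check of that edge-swap identity (names local
# to this sketch):
import numpy as np

def tour_length(seq):
    closed = np.concatenate((seq, seq[:1]), axis=0)
    return np.linalg.norm(closed[1:] - closed[:-1], axis=1).sum()

rng = np.random.default_rng(0)
seq = rng.random((20, 2))                     # cities in visiting order (closed tour)
i, j = 4, 12                                  # reverse positions i..j, with 1 <= i < j <= n - 2

new_seq = seq.copy()
new_seq[i:j + 1] = seq[i:j + 1][::-1]         # the swap2opt move

d = np.linalg.norm
delta = (d(seq[i - 1] - seq[j]) + d(seq[i] - seq[j + 1])
         - d(seq[i - 1] - seq[i]) - d(seq[j] - seq[j + 1]))
assert np.isclose(tour_length(new_seq), tour_length(seq) + delta)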
- plt.scatter(trip[:,0], trip[:,1], color=colors) # Plot cities - tour=np.array(list(range(len(trip))) + [0]) # Plot tour - X = trip[tour, 0] - Y = trip[tour, 1] - plt.plot(X, Y,"--") - - plt.xlim(-0.75,0.75) - plt.ylim(-0.75,0.75) - plt.xlabel('X') - plt.ylabel('Y') - plt.show() - - def visualize_sampling(self, permutations): # Heatmap of permutations (x=cities; y=steps) - max_length = len(permutations[0]) - grid = np.zeros([max_length,max_length]) # initialize heatmap grid to 0 - - transposed_permutations = np.transpose(permutations) - for t, cities_t in enumerate(transposed_permutations): # step t, cities chosen at step t - city_indices, counts = np.unique(cities_t,return_counts=True,axis=0) - for u,v in zip(city_indices, counts): - grid[t][u]+=v # update grid with counts from the batch of permutations - - fig = plt.figure(1) # plot heatmap - ax = fig.add_subplot(1,1,1) - ax.set_aspect('equal') - plt.imshow(grid, interpolation='nearest', cmap='gray') - plt.colorbar() - plt.title('Sampled permutations') - plt.ylabel('Time t') - plt.xlabel('City i') - plt.show() \ No newline at end of file diff --git a/AM/problems/vrp/encode-attend-navigate/utils.py b/AM/problems/vrp/encode-attend-navigate/utils.py deleted file mode 100644 index a7fe993..0000000 --- a/AM/problems/vrp/encode-attend-navigate/utils.py +++ /dev/null @@ -1,100 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import print_function -import tensorflow as tf -import numpy as np -from tqdm import tqdm - - -# Embed input sequence [batch_size, seq_length, from_] -> [batch_size, seq_length, to_] -def embed_seq(input_seq, from_, to_, is_training, BN=True, initializer=tf.contrib.layers.xavier_initializer()): - with tf.variable_scope("embedding"): # embed + BN input set - W_embed = tf.get_variable("weights",[1,from_, to_], initializer=initializer) - embedded_input = tf.nn.conv1d(input_seq, W_embed, 1, "VALID", name="embedded_input") - if BN == True: embedded_input = tf.layers.batch_normalization(embedded_input, axis=2, training=is_training, name='layer_norm', reuse=None) - return embedded_input - - -# Apply multihead attention to a 3d tensor with shape [batch_size, seq_length, n_hidden]. 
-# Attention size = n_hidden should be a multiple of num_head -# Returns a 3d tensor with shape of [batch_size, seq_length, n_hidden] -def multihead_attention(inputs, num_units=None, num_heads=16, dropout_rate=0.1, is_training=True): - with tf.variable_scope("multihead_attention", reuse=None): - # Linear projections - Q = tf.layers.dense(inputs, num_units, activation=tf.nn.relu) # [batch_size, seq_length, n_hidden] - K = tf.layers.dense(inputs, num_units, activation=tf.nn.relu) # [batch_size, seq_length, n_hidden] - V = tf.layers.dense(inputs, num_units, activation=tf.nn.relu) # [batch_size, seq_length, n_hidden] - # Split and concat - Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # [batch_size, seq_length, n_hidden/num_heads] - K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # [batch_size, seq_length, n_hidden/num_heads] - V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # [batch_size, seq_length, n_hidden/num_heads] - # Multiplication - outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # num_heads*[batch_size, seq_length, seq_length] - # Scale - outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) - # Activation - outputs = tf.nn.softmax(outputs) # num_heads*[batch_size, seq_length, seq_length] - # Dropouts - outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) - # Weighted sum - outputs = tf.matmul(outputs, V_) # num_heads*[batch_size, seq_length, n_hidden/num_heads] - # Restore shape - outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # [batch_size, seq_length, n_hidden] - # Residual connection - outputs += inputs # [batch_size, seq_length, n_hidden] - # Normalize - outputs = tf.layers.batch_normalization(outputs, axis=2, training=is_training, name='ln', reuse=None) # [batch_size, seq_length, n_hidden] - - return outputs - - -# Apply point-wise feed forward net to a 3d tensor with shape [batch_size, seq_length, n_hidden] -# Returns: a 3d tensor with the same shape and dtype as inputs -def feedforward(inputs, num_units=[2048, 512], is_training=True): - with tf.variable_scope("ffn", reuse=None): - # Inner layer - params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, "activation": tf.nn.relu, "use_bias": True} - outputs = tf.layers.conv1d(**params) - # Readout layer - params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, "activation": None, "use_bias": True} - outputs = tf.layers.conv1d(**params) - # Residual connection - outputs += inputs - # Normalize - outputs = tf.layers.batch_normalization(outputs, axis=2, training=is_training, name='ln', reuse=None) # [batch_size, seq_length, n_hidden] - return outputs - - -# Encode input sequence [batch_size, seq_length, n_hidden] -> [batch_size, seq_length, n_hidden] -def encode_seq(input_seq, input_dim, num_stacks, num_heads, num_neurons, is_training, dropout_rate=0.): - with tf.variable_scope("stack"): - for i in range(num_stacks): # block i - with tf.variable_scope("block_{}".format(i)): # Multihead Attention + Feed Forward - input_seq = multihead_attention(input_seq, num_units=input_dim, num_heads=num_heads, dropout_rate=dropout_rate, is_training=is_training) - input_seq = feedforward(input_seq, num_units=[num_neurons, input_dim], is_training=is_training) - return input_seq # encoder_output is the ref for actions [Batch size, Sequence Length, Num_neurons] - - -# From a query (decoder output) [Batch size, n_hidden] and a set of reference (encoder_output) [Batch size, seq_length, n_hidden] -# predict a distribution 
over next decoder input -def pointer(encoded_ref, query, mask, W_ref, W_q, v, C=10., temperature=1.0): - encoded_query = tf.expand_dims(tf.matmul(query, W_q), 1) # [Batch size, 1, n_hidden] - scores = tf.reduce_sum(v * tf.tanh(encoded_ref + encoded_query), [-1]) # [Batch size, seq_length] - scores = C*tf.tanh(scores/temperature) # control entropy - masked_scores = tf.clip_by_value(scores -100000000.*mask, -100000000., 100000000.) # [Batch size, seq_length] - return masked_scores - - -# From a query [Batch size, n_hidden], glimpse at a set of reference vectors (ref) [Batch size, seq_length, n_hidden] -def full_glimpse(ref, from_, to_, initializer=tf.contrib.layers.xavier_initializer()): - with tf.variable_scope("glimpse"): - W_ref_g =tf.get_variable("W_ref_g",[1,from_, to_],initializer=initializer) - W_q_g =tf.get_variable("W_q_g",[from_, to_],initializer=initializer) - v_g =tf.get_variable("v_g",[to_],initializer=initializer) - # Attending mechanism - encoded_ref_g = tf.nn.conv1d(ref, W_ref_g, 1, "VALID", name="encoded_ref_g") # [Batch size, seq_length, n_hidden] - scores_g = tf.reduce_sum(v_g * tf.tanh(encoded_ref_g), [-1], name="scores_g") # [Batch size, seq_length] - attention_g = tf.nn.softmax(scores_g, name="attention_g") - # 1 glimpse = Linear combination of reference vectors (defines new query vector) - glimpse = tf.multiply(ref, tf.expand_dims(attention_g,2)) - glimpse = tf.reduce_sum(glimpse,1) - return glimpse \ No newline at end of file diff --git a/AM/problems/vrp/problem_vrp.py b/AM/problems/vrp/problem_vrp.py deleted file mode 100644 index 7ff1c3e..0000000 --- a/AM/problems/vrp/problem_vrp.py +++ /dev/null @@ -1,205 +0,0 @@ -from torch.utils.data import Dataset -import torch -import os -import pickle - -from problems.vrp.state_cvrp import StateCVRP -from problems.vrp.state_sdvrp import StateSDVRP -from utils.beam_search import beam_search - - -class CVRP(object): - - NAME = 'cvrp' # Capacitated Vehicle Routing Problem - - VEHICLE_CAPACITY = 1.0 # (w.l.o.g. vehicle capacity is 1, demands should be scaled) - - @staticmethod - def get_costs(dataset, pi): - batch_size, graph_size = dataset['demand'].size() - # Check that tours are valid, i.e. contain 0 to n -1 - sorted_pi = pi.data.sort(1)[0] - - # Sorting it should give all zeros at front and then 1...n - assert ( - torch.arange(1, graph_size + 1, out=pi.data.new()).view(1, -1).expand(batch_size, graph_size) == - sorted_pi[:, -graph_size:] - ).all() and (sorted_pi[:, :-graph_size] == 0).all(), "Invalid tour" - - # Visiting depot resets capacity so we add demand = -capacity (we make sure it does not become negative) - demand_with_depot = torch.cat( - ( - torch.full_like(dataset['demand'][:, :1], -CVRP.VEHICLE_CAPACITY), - dataset['demand'] - ), - 1 - ) - d = demand_with_depot.gather(1, pi) - - used_cap = torch.zeros_like(dataset['demand'][:, 0]) - for i in range(pi.size(1)): - used_cap += d[:, i] # This will reset/make capacity negative if i == 0, e.g. 
depot visited - # Cannot use less than 0 - used_cap[used_cap < 0] = 0 - assert (used_cap <= CVRP.VEHICLE_CAPACITY + 1e-5).all(), "Used more than capacity" - - # Gather dataset in order of tour - loc_with_depot = torch.cat((dataset['depot'][:, None, :], dataset['loc']), 1) - d = loc_with_depot.gather(1, pi[..., None].expand(*pi.size(), loc_with_depot.size(-1))) - - # Length is distance (L2-norm of difference) of each next location to its prev and of first and last to depot - return ( - (d[:, 1:] - d[:, :-1]).norm(p=2, dim=2).sum(1) - + (d[:, 0] - dataset['depot']).norm(p=2, dim=1) # Depot to first - + (d[:, -1] - dataset['depot']).norm(p=2, dim=1) # Last to depot, will be 0 if depot is last - ), None - - @staticmethod - def make_dataset(*args, **kwargs): - return VRPDataset(*args, **kwargs) - - @staticmethod - def make_state(*args, **kwargs): - return StateCVRP.initialize(*args, **kwargs) - - @staticmethod - def beam_search(input, beam_size, expand_size=None, - compress_mask=False, model=None, max_calc_batch_size=4096): - - assert model is not None, "Provide model" - - fixed = model.precompute_fixed(input) - - def propose_expansions(beam): - return model.propose_expansions( - beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size - ) - - state = CVRP.make_state( - input, visited_dtype=torch.int64 if compress_mask else torch.uint8 - ) - - return beam_search(state, beam_size, propose_expansions) - - -class SDVRP(object): - - NAME = 'sdvrp' # Split Delivery Vehicle Routing Problem - - VEHICLE_CAPACITY = 1.0 # (w.l.o.g. vehicle capacity is 1, demands should be scaled) - - @staticmethod - def get_costs(dataset, pi): - batch_size, graph_size = dataset['demand'].size() - - # Each node can be visited multiple times, but we always deliver as much demand as possible - # We check that at the end all demand has been satisfied - demands = torch.cat( - ( - torch.full_like(dataset['demand'][:, :1], -SDVRP.VEHICLE_CAPACITY), - dataset['demand'] - ), - 1 - ) - rng = torch.arange(batch_size, out=demands.data.new().long()) - used_cap = torch.zeros_like(dataset['demand'][:, 0]) - a_prev = None - for a in pi.transpose(0, 1): - assert a_prev is None or (demands[((a_prev == 0) & (a == 0)), :] == 0).all(), \ - "Cannot visit depot twice if any nonzero demand" - d = torch.min(demands[rng, a], SDVRP.VEHICLE_CAPACITY - used_cap) - demands[rng, a] -= d - used_cap += d - used_cap[a == 0] = 0 - a_prev = a - assert (demands == 0).all(), "All demand must be satisfied" - - # Gather dataset in order of tour - loc_with_depot = torch.cat((dataset['depot'][:, None, :], dataset['loc']), 1) - d = loc_with_depot.gather(1, pi[..., None].expand(*pi.size(), loc_with_depot.size(-1))) - - # Length is distance (L2-norm of difference) of each next location to its prev and of first and last to depot - return ( - (d[:, 1:] - d[:, :-1]).norm(p=2, dim=2).sum(1) - + (d[:, 0] - dataset['depot']).norm(p=2, dim=1) # Depot to first - + (d[:, -1] - dataset['depot']).norm(p=2, dim=1) # Last to depot, will be 0 if depot is last - ), None - - @staticmethod - def make_dataset(*args, **kwargs): - return VRPDataset(*args, **kwargs) - - @staticmethod - def make_state(*args, **kwargs): - return StateSDVRP.initialize(*args, **kwargs) - - @staticmethod - def beam_search(input, beam_size, expand_size=None, - compress_mask=False, model=None, max_calc_batch_size=4096): - assert model is not None, "Provide model" - assert not compress_mask, "SDVRP does not support compression of the mask" - - fixed = model.precompute_fixed(input) - - 
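# CVRP.get_costs above validates a flat tour by giving the depot a "demand" of -VEHICLE_CAPACITY
# and clamping the running load at zero, so every depot visit simply resets the load. A toy trace
# of that bookkeeping on one instance (demands made up for the sketch, capacity scaled to 1.0 as
# in the class):
import torch

demand = torch.tensor([0.3, 0.4, 0.5, 0.2])                      # customers 1..4
pi = torch.tensor([1, 2, 0, 3, 4, 0])                            # route over depot (0) and customers
demand_with_depot = torch.cat((torch.tensor([-1.0]), demand))    # depot entry = -capacity

used_cap = torch.zeros(())
for node in pi:
    used_cap = torch.clamp(used_cap + demand_with_depot[node], min=0.0)  # depot resets the load
    assert used_cap <= 1.0 + 1e-5, "route would exceed vehicle capacity"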
def propose_expansions(beam): - return model.propose_expansions( - beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size - ) - - state = SDVRP.make_state(input) - - return beam_search(state, beam_size, propose_expansions) - - -def make_instance(args): - depot, loc, demand, capacity, *args = args - grid_size = 1 - if len(args) > 0: - depot_types, customer_types, grid_size = args - return { - 'loc': torch.tensor(loc, dtype=torch.float) / grid_size, - 'demand': torch.tensor(demand, dtype=torch.float) / capacity, - 'depot': torch.tensor(depot, dtype=torch.float) / grid_size - } - - -class VRPDataset(Dataset): - - def __init__(self, filename=None, size=50, num_samples=1000000, offset=0, distribution=None): - super(VRPDataset, self).__init__() - - self.data_set = [] - if filename is not None: - assert os.path.splitext(filename)[1] == '.pkl' - - with open(filename, 'rb') as f: - data = pickle.load(f) - self.data = [make_instance(args) for args in data[offset:offset+num_samples]] - - else: - - # From VRP with RL paper https://arxiv.org/abs/1802.04240 - CAPACITIES = { - 10: 20., - 20: 30., - 50: 40., - 100: 50. - } - - self.data = [ - { - 'loc': torch.FloatTensor(size, 2).uniform_(0, 1), - # Uniform 1 - 9, scaled by capacities - 'demand': (torch.FloatTensor(size).uniform_(0, 9).int() + 1).float() / CAPACITIES[size], - 'depot': torch.FloatTensor(2).uniform_(0, 1) - } - for i in range(num_samples) - ] - - self.size = len(self.data) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.data[idx] diff --git a/AM/problems/vrp/state_cvrp.py b/AM/problems/vrp/state_cvrp.py deleted file mode 100644 index 81ba1e6..0000000 --- a/AM/problems/vrp/state_cvrp.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch -from typing import NamedTuple -from utils.boolmask import mask_long2bool, mask_long_scatter - - -class StateCVRP(NamedTuple): - # Fixed input - coords: torch.Tensor # Depot + loc - demand: torch.Tensor - - # If this state contains multiple copies (i.e. beam search) for the same instance, then for memory efficiency - # the coords and demands tensors are not kept multiple times, so we need to use the ids to index the correct rows. 
- ids: torch.Tensor # Keeps track of original fixed data index of rows - - # State - prev_a: torch.Tensor - used_capacity: torch.Tensor - visited_: torch.Tensor # Keeps track of nodes that have been visited - lengths: torch.Tensor - cur_coord: torch.Tensor - i: torch.Tensor # Keeps track of step - - VEHICLE_CAPACITY = 1.0 # Hardcoded - - @property - def visited(self): - if self.visited_.dtype == torch.uint8: - return self.visited_ - else: - return mask_long2bool(self.visited_, n=self.demand.size(-1)) - - @property - def dist(self): - return (self.coords[:, :, None, :] - self.coords[:, None, :, :]).norm(p=2, dim=-1) - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - ids=self.ids[key], - prev_a=self.prev_a[key], - used_capacity=self.used_capacity[key], - visited_=self.visited_[key], - lengths=self.lengths[key], - cur_coord=self.cur_coord[key], - ) - - # Warning: cannot override len of NamedTuple, len should be number of fields, not batch size - # def __len__(self): - # return len(self.used_capacity) - - @staticmethod - def initialize(input, visited_dtype=torch.uint8): - - depot = input['depot'] - loc = input['loc'] - demand = input['demand'] - - batch_size, n_loc, _ = loc.size() - return StateCVRP( - coords=torch.cat((depot[:, None, :], loc), -2), - demand=demand, - ids=torch.arange(batch_size, dtype=torch.int64, device=loc.device)[:, None], # Add steps dimension - prev_a=torch.zeros(batch_size, 1, dtype=torch.long, device=loc.device), - used_capacity=demand.new_zeros(batch_size, 1), - visited_=( # Visited as mask is easier to understand, as long more memory efficient - # Keep visited_ with depot so we can scatter efficiently - torch.zeros( - batch_size, 1, n_loc + 1, - dtype=torch.uint8, device=loc.device - ) - if visited_dtype == torch.uint8 - else torch.zeros(batch_size, 1, (n_loc + 63) // 64, dtype=torch.int64, device=loc.device) # Ceil - ), - lengths=torch.zeros(batch_size, 1, device=loc.device), - cur_coord=input['depot'][:, None, :], # Add step dimension - i=torch.zeros(1, dtype=torch.int64, device=loc.device) # Vector with length num_steps - ) - - def get_final_cost(self): - - assert self.all_finished() - - return self.lengths + (self.coords[self.ids, 0, :] - self.cur_coord).norm(p=2, dim=-1) - - def update(self, selected): - - assert self.i.size(0) == 1, "Can only update if state represents single step" - - # Update the state - selected = selected[:, None] # Add dimension for step - prev_a = selected - n_loc = self.demand.size(-1) # Excludes depot - - # Add the length - cur_coord = self.coords[self.ids, selected] - # cur_coord = self.coords.gather( - # 1, - # selected[:, None].expand(selected.size(0), 1, self.coords.size(-1)) - # )[:, 0, :] - lengths = self.lengths + (cur_coord - self.cur_coord).norm(p=2, dim=-1) # (batch_dim, 1) - - # Not selected_demand is demand of first node (by clamp) so incorrect for nodes that visit depot! 
- #selected_demand = self.demand.gather(-1, torch.clamp(prev_a - 1, 0, n_loc - 1)) - selected_demand = self.demand[self.ids, torch.clamp(prev_a - 1, 0, n_loc - 1)] - - # Increase capacity if depot is not visited, otherwise set to 0 - #used_capacity = torch.where(selected == 0, 0, self.used_capacity + selected_demand) - used_capacity = (self.used_capacity + selected_demand) * (prev_a != 0).float() - - if self.visited_.dtype == torch.uint8: - # Note: here we do not subtract one as we have to scatter so the first column allows scattering depot - # Add one dimension since we write a single value - visited_ = self.visited_.scatter(-1, prev_a[:, :, None], 1) - else: - # This works, will not set anything if prev_a -1 == -1 (depot) - visited_ = mask_long_scatter(self.visited_, prev_a - 1) - - return self._replace( - prev_a=prev_a, used_capacity=used_capacity, visited_=visited_, - lengths=lengths, cur_coord=cur_coord, i=self.i + 1 - ) - - def all_finished(self): - return self.i.item() >= self.demand.size(-1) and self.visited.all() - - def get_finished(self): - return self.visited.sum(-1) == self.visited.size(-1) - - def get_current_node(self): - return self.prev_a - - def get_mask(self): - """ - Gets a (batch_size, n_loc + 1) mask with the feasible actions (0 = depot), depends on already visited and - remaining capacity. 0 = feasible, 1 = infeasible - Forbids to visit depot twice in a row, unless all nodes have been visited - :return: - """ - - if self.visited_.dtype == torch.uint8: - visited_loc = self.visited_[:, :, 1:] - else: - visited_loc = mask_long2bool(self.visited_, n=self.demand.size(-1)) - - # For demand steps_dim is inserted by indexing with id, for used_capacity insert node dim for broadcasting - exceeds_cap = (self.demand[self.ids, :] + self.used_capacity[:, :, None] > self.VEHICLE_CAPACITY) - # Nodes that cannot be visited are already visited or too much demand to be served now - mask_loc = visited_loc.to(exceeds_cap.dtype) | exceeds_cap - - # Cannot visit the depot if just visited and still unserved nodes - mask_depot = (self.prev_a == 0) & ((mask_loc == 0).int().sum(-1) > 0) - return torch.cat((mask_depot[:, :, None], mask_loc), -1) - - def construct_solutions(self, actions): - return actions diff --git a/AM/problems/vrp/state_sdvrp.py b/AM/problems/vrp/state_sdvrp.py deleted file mode 100644 index 1970602..0000000 --- a/AM/problems/vrp/state_sdvrp.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch -from typing import NamedTuple - - -class StateSDVRP(NamedTuple): - # Fixed input - coords: torch.Tensor - demand: torch.Tensor - - # If this state contains multiple copies (i.e. beam search) for the same instance, then for memory efficiency - # the coords and demands tensors are not kept multiple times, so we need to use the ids to index the correct rows. 
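    # Split-delivery bookkeeping: instead of a visited mask, demands_with_depot stores the
    # remaining demand per node, and update() subtracts the delivered amount
    # min(remaining_demand, VEHICLE_CAPACITY - used_capacity), so a customer may be served
    # over several visits until its remaining demand reaches zero.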
- ids: torch.Tensor # Keeps track of original fixed data index of rows - - # State - prev_a: torch.Tensor - used_capacity: torch.Tensor - demands_with_depot: torch.Tensor # Keeps track of remaining demands - lengths: torch.Tensor - cur_coord: torch.Tensor - i: torch.Tensor # Keeps track of step - - VEHICLE_CAPACITY = 1.0 # Hardcoded - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - ids=self.ids[key], - prev_a=self.prev_a[key], - used_capacity=self.used_capacity[key], - demands_with_depot=self.demands_with_depot[key], - lengths=self.lengths[key], - cur_coord=self.cur_coord[key], - ) - - @staticmethod - def initialize(input): - - depot = input['depot'] - loc = input['loc'] - demand = input['demand'] - - batch_size, n_loc, _ = loc.size() - return StateSDVRP( - coords=torch.cat((depot[:, None, :], loc), -2), - demand=demand, - ids=torch.arange(batch_size, dtype=torch.int64, device=loc.device)[:, None], # Add steps dimension - prev_a=torch.zeros(batch_size, 1, dtype=torch.long, device=loc.device), - used_capacity=demand.new_zeros(batch_size, 1), - demands_with_depot=torch.cat(( - demand.new_zeros(batch_size, 1), - demand[:, :] - ), 1)[:, None, :], - lengths=torch.zeros(batch_size, 1, device=loc.device), - cur_coord=input['depot'][:, None, :], # Add step dimension - i=torch.zeros(1, dtype=torch.int64, device=loc.device) # Vector with length num_steps - ) - - def get_final_cost(self): - - assert self.all_finished() - - return self.lengths + (self.coords[self.ids, 0, :] - self.cur_coord).norm(p=2, dim=-1) - - def update(self, selected): - - assert self.i.size(0) == 1, "Can only update if state represents single step" - - # Update the state - selected = selected[:, None] # Add dimension for step - prev_a = selected - - # Add the length - cur_coord = self.coords[self.ids, selected] - lengths = self.lengths + (cur_coord - self.cur_coord).norm(p=2, dim=-1) # (batch_dim, 1) - - # Not selected_demand is demand of first node (by clamp) so incorrect for nodes that visit depot! - selected_demand = self.demands_with_depot.gather(-1, prev_a[:, :, None])[:, :, 0] - delivered_demand = torch.min(selected_demand, self.VEHICLE_CAPACITY - self.used_capacity) - - # Increase capacity if depot is not visited, otherwise set to 0 - #used_capacity = torch.where(selected == 0, 0, self.used_capacity + delivered_demand) - used_capacity = (self.used_capacity + delivered_demand) * (prev_a != 0).float() - - # demands_with_depot = demands_with_depot.clone()[:, 0, :] - # Add one dimension since we write a single value - demands_with_depot = self.demands_with_depot.scatter( - -1, - prev_a[:, :, None], - self.demands_with_depot.gather(-1, prev_a[:, :, None]) - delivered_demand[:, :, None] - ) - - return self._replace( - prev_a=prev_a, used_capacity=used_capacity, demands_with_depot=demands_with_depot, - lengths=lengths, cur_coord=cur_coord, i=self.i + 1 - ) - - def all_finished(self): - return self.i.item() >= self.demands_with_depot.size(-1) and not (self.demands_with_depot > 0).any() - - def get_current_node(self): - return self.prev_a - - def get_mask(self): - """ - Gets a (batch_size, n_loc + 1) mask with the feasible actions (0 = depot), depends on already visited and - remaining capacity. 
0 = feasible, 1 = infeasible - Forbids to visit depot twice in a row, unless all nodes have been visited - :return: - """ - - # Nodes that cannot be visited are already visited or too much demand to be served now - mask_loc = (self.demands_with_depot[:, :, 1:] == 0) | (self.used_capacity[:, :, None] >= self.VEHICLE_CAPACITY) - - # Cannot visit the depot if just visited and still unserved nodes - mask_depot = (self.prev_a == 0) & ((mask_loc == 0).int().sum(-1) > 0) - return torch.cat((mask_depot[:, :, None], mask_loc), -1) - - def construct_solutions(self, actions): - return actions diff --git a/AM/problems/vrp/vrp_baseline.py b/AM/problems/vrp/vrp_baseline.py deleted file mode 100644 index 522028f..0000000 --- a/AM/problems/vrp/vrp_baseline.py +++ /dev/null @@ -1,265 +0,0 @@ -import argparse -import os -import numpy as np -import re -from utils.data_utils import check_extension, load_dataset, save_dataset -from subprocess import check_call, check_output -from urllib.parse import urlparse -import tempfile -import time -from datetime import timedelta -from utils import run_all_in_pool - - -def get_lkh_executable(url="http://www.akira.ruc.dk/~keld/research/LKH-3/LKH-3.0.4.tgz"): - - cwd = os.path.abspath(os.path.join("problems", "vrp", "lkh")) - os.makedirs(cwd, exist_ok=True) - - file = os.path.join(cwd, os.path.split(urlparse(url).path)[-1]) - filedir = os.path.splitext(file)[0] - - if not os.path.isdir(filedir): - print("{} not found, downloading and compiling".format(filedir)) - - check_call(["wget", url], cwd=cwd) - assert os.path.isfile(file), "Download failed, {} does not exist".format(file) - check_call(["tar", "xvfz", file], cwd=cwd) - - assert os.path.isdir(filedir), "Extracting failed, dir {} does not exist".format(filedir) - check_call("make", cwd=filedir) - os.remove(file) - - executable = os.path.join(filedir, "LKH") - assert os.path.isfile(executable) - return os.path.abspath(executable) - - -def solve_lkh(executable, depot, loc, demand, capacity): - with tempfile.TemporaryDirectory() as tempdir: - problem_filename = os.path.join(tempdir, "problem.vrp") - output_filename = os.path.join(tempdir, "output.tour") - param_filename = os.path.join(tempdir, "params.par") - - starttime = time.time() - write_vrplib(problem_filename, depot, loc, demand, capacity) - params = {"PROBLEM_FILE": problem_filename, "OUTPUT_TOUR_FILE": output_filename} - write_lkh_par(param_filename, params) - output = check_output([executable, param_filename]) - result = read_vrplib(output_filename, n=len(demand)) - duration = time.time() - starttime - return result, output, duration - - -def solve_lkh_log(executable, directory, name, depot, loc, demand, capacity, grid_size=1, runs=1, disable_cache=False): - - problem_filename = os.path.join(directory, "{}.lkh{}.vrp".format(name, runs)) - tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs)) - output_filename = os.path.join(directory, "{}.lkh{}.pkl".format(name, runs)) - param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs)) - log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs)) - - try: - # May have already been run - if os.path.isfile(output_filename) and not disable_cache: - tour, duration = load_dataset(output_filename) - else: - write_vrplib(problem_filename, depot, loc, demand, capacity, grid_size, name=name) - - params = {"PROBLEM_FILE": problem_filename, "OUTPUT_TOUR_FILE": tour_filename, "RUNS": runs, "SEED": 1234} - write_lkh_par(param_filename, params) - - with open(log_filename, 'w') as 
f: - start = time.time() - check_call([executable, param_filename], stdout=f, stderr=f) - duration = time.time() - start - - tour = read_vrplib(tour_filename, n=len(demand)) - - save_dataset((tour, duration), output_filename) - - return calc_vrp_cost(depot, loc, tour), tour, duration - - except Exception as e: - raise - print("Exception occured") - print(e) - return None - - -def calc_vrp_cost(depot, loc, tour): - assert (np.sort(tour)[-len(loc):] == np.arange(len(loc)) + 1).all(), "All nodes must be visited once!" - # TODO validate capacity constraints - loc_with_depot = np.vstack((np.array(depot)[None, :], np.array(loc))) - sorted_locs = loc_with_depot[np.concatenate(([0], tour, [0]))] - return np.linalg.norm(sorted_locs[1:] - sorted_locs[:-1], axis=-1).sum() - - -def write_lkh_par(filename, parameters): - default_parameters = { # Use none to include as flag instead of kv - "SPECIAL": None, - "MAX_TRIALS": 10000, - "RUNS": 10, - "TRACE_LEVEL": 1, - "SEED": 0 - } - with open(filename, 'w') as f: - for k, v in {**default_parameters, **parameters}.items(): - if v is None: - f.write("{}\n".format(k)) - else: - f.write("{} = {}\n".format(k, v)) - - -def read_vrplib(filename, n): - with open(filename, 'r') as f: - tour = [] - dimension = 0 - started = False - for line in f: - if started: - loc = int(line) - if loc == -1: - break - tour.append(loc) - if line.startswith("DIMENSION"): - dimension = int(line.split(" ")[-1]) - - if line.startswith("TOUR_SECTION"): - started = True - - assert len(tour) == dimension - tour = np.array(tour).astype(int) - 1 # Subtract 1 as depot is 1 and should be 0 - tour[tour > n] = 0 # Any nodes above the number of nodes there are is also depot - assert tour[0] == 0 # Tour should start with depot - assert tour[-1] != 0 # Tour should not end with depot - return tour[1:].tolist() - - -def write_vrplib(filename, depot, loc, demand, capacity, grid_size, name="problem"): - - with open(filename, 'w') as f: - f.write("\n".join([ - "{} : {}".format(k, v) - for k, v in ( - ("NAME", name), - ("TYPE", "CVRP"), - ("DIMENSION", len(loc) + 1), - ("EDGE_WEIGHT_TYPE", "EUC_2D"), - ("CAPACITY", capacity) - ) - ])) - f.write("\n") - f.write("NODE_COORD_SECTION\n") - f.write("\n".join([ - "{}\t{}\t{}".format(i + 1, int(x / grid_size * 100000 + 0.5), int(y / grid_size * 100000 + 0.5)) # VRPlib does not take floats - #"{}\t{}\t{}".format(i + 1, x, y) - for i, (x, y) in enumerate([depot] + loc) - ])) - f.write("\n") - f.write("DEMAND_SECTION\n") - f.write("\n".join([ - "{}\t{}".format(i + 1, d) - for i, d in enumerate([0] + demand) - ])) - f.write("\n") - f.write("DEPOT_SECTION\n") - f.write("1\n") - f.write("-1\n") - f.write("EOF\n") - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("method", help="Name of the method to evaluate, 'lkh' only") - parser.add_argument("datasets", nargs='+', help="Filename of the dataset(s) to evaluate") - parser.add_argument("-f", action='store_true', help="Set true to overwrite") - parser.add_argument("-o", default=None, help="Name of the results file to write") - parser.add_argument("--cpus", type=int, help="Number of CPUs to use, defaults to all cores") - parser.add_argument('--disable_cache', action='store_true', help='Disable caching') - parser.add_argument('--progress_bar_mininterval', type=float, default=0.1, help='Minimum interval') - parser.add_argument('-n', type=int, help="Number of instances to process") - parser.add_argument('--offset', type=int, help="Offset where to start processing") - 
parser.add_argument('--results_dir', default='results', help="Name of results directory") - - opts = parser.parse_args() - - assert opts.o is None or len(opts.datasets) == 1, "Cannot specify result filename with more than one dataset" - - for dataset_path in opts.datasets: - - assert os.path.isfile(check_extension(dataset_path)), "File does not exist!" - - dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1]) - - if opts.o is None: - results_dir = os.path.join(opts.results_dir, "vrp", dataset_basename) - os.makedirs(results_dir, exist_ok=True) - - out_file = os.path.join(results_dir, "{}{}{}-{}{}".format( - dataset_basename, - "offs{}".format(opts.offset) if opts.offset is not None else "", - "n{}".format(opts.n) if opts.n is not None else "", - opts.method, ext - )) - else: - out_file = opts.o - - assert opts.f or not os.path.isfile( - out_file), "File already exists! Try running with -f option to overwrite." - - match = re.match(r'^([a-z_]+)(\d*)$', opts.method) - assert match - method = match[1] - runs = 1 if match[2] == '' else int(match[2]) - - if method == "lkh": - executable = get_lkh_executable() - - target_dir = os.path.join(results_dir, "{}-{}".format( - dataset_basename, - opts.method - )) - assert opts.f or not os.path.isdir(target_dir), \ - "Target dir already exists! Try running with -f option to overwrite." - - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - - # TSP contains single loc array rather than tuple - dataset = load_dataset(dataset_path) - - use_multiprocessing = False - - def run_func(args): - directory, name, *args = args - depot, loc, demand, capacity, *args = args - grid_size = 1 - if len(args) > 0: - depot_types, customer_types, grid_size = args - - return solve_lkh_log( - executable, - directory, name, - depot, loc, demand, capacity, grid_size, - runs=runs, disable_cache=opts.disable_cache - ) - - # Note: only processing n items is handled by run_all_in_pool - results, parallelism = run_all_in_pool( - run_func, - target_dir, dataset, opts, use_multiprocessing=use_multiprocessing - ) - - else: - assert False, "Unknown method: {}".format(opts.method) - - costs, tours, durations = zip(*results) # Not really costs since they should be negative - print("Average cost: {} +- {}".format(np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) - print("Average serial duration: {} +- {}".format( - np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) - print("Average parallel duration: {}".format(np.mean(durations) / parallelism)) - print("Calculated total duration: {}".format(timedelta(seconds=int(np.sum(durations) / parallelism)))) - - save_dataset((results, parallelism), out_file) diff --git a/AM/reinforce_baselines.py b/AM/reinforce_baselines.py deleted file mode 100644 index 37cf0ac..0000000 --- a/AM/reinforce_baselines.py +++ /dev/null @@ -1,249 +0,0 @@ -import torch -import torch.nn.functional as F -from torch.utils.data import Dataset -from scipy.stats import ttest_rel -import copy -from train import rollout, get_inner_model - -class Baseline(object): - - def wrap_dataset(self, dataset): - return dataset - - def unwrap_batch(self, batch): - return batch, None - - def eval(self, x, c): - raise NotImplementedError("Override this method") - - def get_learnable_parameters(self): - return [] - - def epoch_callback(self, model, epoch): - pass - - def state_dict(self): - return {} - - def load_state_dict(self, state_dict): - pass - - -class WarmupBaseline(Baseline): - - def __init__(self, baseline, n_epochs=1, 
warmup_exp_beta=0.8, ): - super(Baseline, self).__init__() - - self.baseline = baseline - assert n_epochs > 0, "n_epochs to warmup must be positive" - self.warmup_baseline = ExponentialBaseline(warmup_exp_beta) - self.alpha = 0 - self.n_epochs = n_epochs - - def wrap_dataset(self, dataset): - if self.alpha > 0: - return self.baseline.wrap_dataset(dataset) - return self.warmup_baseline.wrap_dataset(dataset) - - def unwrap_batch(self, batch): - if self.alpha > 0: - return self.baseline.unwrap_batch(batch) - return self.warmup_baseline.unwrap_batch(batch) - - def eval(self, x, c): - - if self.alpha == 1: - return self.baseline.eval(x, c) - if self.alpha == 0: - return self.warmup_baseline.eval(x, c) - v, l = self.baseline.eval(x, c) - vw, lw = self.warmup_baseline.eval(x, c) - # Return convex combination of baseline and of loss - return self.alpha * v + (1 - self.alpha) * vw, self.alpha * l + (1 - self.alpha * lw) - - def epoch_callback(self, model, epoch): - # Need to call epoch callback of inner model (also after first epoch if we have not used it) - self.baseline.epoch_callback(model, epoch) - self.alpha = (epoch + 1) / float(self.n_epochs) - if epoch < self.n_epochs: - print("Set warmup alpha = {}".format(self.alpha)) - - def state_dict(self): - # Checkpointing within warmup stage makes no sense, only save inner baseline - return self.baseline.state_dict() - - def load_state_dict(self, state_dict): - # Checkpointing within warmup stage makes no sense, only load inner baseline - self.baseline.load_state_dict(state_dict) - - -class NoBaseline(Baseline): - - def eval(self, x, c): - return 0, 0 # No baseline, no loss - - -class ExponentialBaseline(Baseline): - - def __init__(self, beta): - super(Baseline, self).__init__() - - self.beta = beta - self.v = None - - def eval(self, x, c): - - if self.v is None: - v = c.mean() - else: - v = self.beta * self.v + (1. 
- self.beta) * c.mean() - - self.v = v.detach() # Detach since we never want to backprop - return self.v, 0 # No loss - - def state_dict(self): - return { - 'v': self.v - } - - def load_state_dict(self, state_dict): - self.v = state_dict['v'] - - -class CriticBaseline(Baseline): - - def __init__(self, critic): - super(Baseline, self).__init__() - - self.critic = critic - - def eval(self, x, c): - v = self.critic(x) - # Detach v since actor should not backprop through baseline, only for loss - return v.detach(), F.mse_loss(v, c.detach()) - - def get_learnable_parameters(self): - return list(self.critic.parameters()) - - def epoch_callback(self, model, epoch): - pass - - def state_dict(self): - return { - 'critic': self.critic.state_dict() - } - - def load_state_dict(self, state_dict): - critic_state_dict = state_dict.get('critic', {}) - if not isinstance(critic_state_dict, dict): # backwards compatibility - critic_state_dict = critic_state_dict.state_dict() - self.critic.load_state_dict({**self.critic.state_dict(), **critic_state_dict}) - - -class RolloutBaseline(Baseline): - - def __init__(self, model, problem, opts, epoch=0, task=None, update_baseline=True): - super(Baseline, self).__init__() - - self.problem = problem - self.opts = opts - self.task = task - - if update_baseline: - self._update_model(model, epoch) - else: - self.model = copy.deepcopy(model) - - def _update_model(self, model, epoch, dataset=None): - self.model = copy.deepcopy(model) - # Always generate baseline dataset when updating model to prevent overfitting to the baseline dataset - - if dataset is not None: - if len(dataset) != self.opts.val_size: - print("Warning: not using saved baseline dataset since val_size does not match") - dataset = None - elif (dataset[0] if self.problem.NAME == 'tsp' else dataset[0]['loc']).size(0) != self.opts.graph_size: - print("Warning: not using saved baseline dataset since graph_size does not match") - dataset = None - - if dataset is None: - self.dataset = self.problem.make_dataset(num_samples=self.opts.val_size, distribution=self.opts.data_distribution, task=self.task) - else: - self.dataset = dataset - print("Evaluating baseline model on evaluation dataset") - self.bl_vals = rollout(self.model, self.dataset, self.opts).cpu().numpy() - self.mean = self.bl_vals.mean() - self.epoch = epoch - - def wrap_dataset(self, dataset): - print("Evaluating baseline on dataset...") - # Need to convert baseline to 2D to prevent converting to double, see - # https://discuss.pytorch.org/t/dataloader-gives-double-instead-of-float/717/3 - return BaselineDataset(dataset, rollout(self.model, dataset, self.opts).view(-1, 1)) - - def unwrap_batch(self, batch): - return batch['data'], batch['baseline'].view(-1) # Flatten result to undo wrapping as 2D - - def eval(self, x, c): - # Use volatile mode for efficient inference (single batch so we do not use rollout function) - with torch.no_grad(): - v, _ = self.model(x) - - # There is no loss - return v, 0 - - def epoch_callback(self, model, epoch): - """ - Challenges the current baseline with the model and replaces the baseline model if it is improved. 
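        The candidate's greedy rollout costs on the held-out baseline dataset are compared with
        the stored baseline costs; if the candidate mean is lower, a one-sided paired t-test
        (ttest_rel, p / 2) is applied and the baseline weights are replaced only when
        p_val < opts.bl_alpha.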
- :param model: The model to challenge the baseline by - :param epoch: The current epoch - """ - print("Evaluating candidate model on evaluation dataset") - candidate_vals = rollout(model, self.dataset, self.opts).cpu().numpy() - candidate_mean = candidate_vals.mean() - - print("Epoch {} candidate mean {}, baseline epoch {} mean {}, difference {}".format( - epoch, candidate_mean, self.epoch, self.mean, candidate_mean - self.mean)) - if candidate_mean - self.mean < 0: - # Calc p value - t, p = ttest_rel(candidate_vals, self.bl_vals) - - p_val = p / 2 # one-sided - assert t < 0, "T-statistic should be negative" - # print("p-value: {}".format(p_val)) - if p_val < self.opts.bl_alpha: - # print('Update baseline') - self._update_model(model, epoch) - - def state_dict(self): - return { - 'model': self.model, - 'dataset': self.dataset, - 'epoch': self.epoch - } - - def load_state_dict(self, state_dict): - # We make it such that it works whether model was saved as data parallel or not - print(">> Load state dict in Baseline.") - load_model = copy.deepcopy(self.model) - get_inner_model(load_model).load_state_dict(get_inner_model(state_dict['model']).state_dict()) - self._update_model(load_model, state_dict['epoch'], state_dict['dataset']) - - -class BaselineDataset(Dataset): - - def __init__(self, dataset=None, baseline=None): - super(BaselineDataset, self).__init__() - - self.dataset = dataset - self.baseline = baseline - assert (len(self.dataset) == len(self.baseline)) - - def __getitem__(self, item): - return { - 'data': self.dataset[item], - 'baseline': self.baseline[item] - } - - def __len__(self): - return len(self.dataset) diff --git a/AM/run_meta.py b/AM/run_meta.py deleted file mode 100755 index 7611aa6..0000000 --- a/AM/run_meta.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python - -import os -import json -import random -import pprint as pp -from datetime import datetime - -import torch -import torch.optim as optim - -from nets.critic_network import CriticNetwork -from options import get_options -from train import meta_train_epoch, validate, get_inner_model -from reinforce_baselines import NoBaseline, ExponentialBaseline, CriticBaseline, RolloutBaseline, WarmupBaseline -from nets.attention_model import AttentionModel -from nets.pointer_network import PointerNetwork, CriticNetworkLSTM -from utils import torch_load_cpu, load_problem, seed_everything, save_checkpoint -from generate_dataset import generate_train_task - - -def run(opts): - # hard-coded - opts.shuffle = True - opts.graph_size = -1 - opts.variation_type = "size" - opts.baseline_every_Xepochs_for_META = 7 - opts.val_dataset = "../data/size/tsp/tsp100_validation_seed4321.pkl" - - # Pretty print the run args - pp.pprint(vars(opts)) - - # Set the random seed - seed_everything(opts.seed) - - # Optionally configure tensorboard - tb_logger = None - # if not opts.no_tensorboard: - # tb_logger = TbLogger(os.path.join(opts.log_dir, "{}_{}".format(opts.problem, opts.graph_size), opts.run_name)) - - os.makedirs(opts.save_dir) - # Save arguments so exact configuration can always be found - with open(os.path.join(opts.save_dir, "args.json"), 'w') as f: - json.dump(vars(opts), f, indent=True) - - # Set the device - opts.device = torch.device("cuda" if opts.use_cuda else "cpu") - - # Figure out what's the problem - problem = load_problem(opts.problem) - - # Load data from load_path - load_data = {} - assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given" - load_path = opts.load_path if 
opts.load_path is not None else opts.resume - if load_path is not None: - print(' [*] Loading data from {}'.format(load_path)) - load_data = torch_load_cpu(load_path) - if opts.resume: - epoch_resume = int(os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1]) - opts.epoch_start = epoch_resume + 1 - - # Initialize model - model_class = { - 'attention': AttentionModel, - 'pointer': PointerNetwork - }.get(opts.model, None) - assert model_class is not None, "Unknown model: {}".format(model_class) - model_meta = model_class( - opts.embedding_dim, - opts.hidden_dim, - problem, - n_encode_layers=opts.n_encode_layers, - mask_inner=True, - mask_logits=True, - normalization=opts.normalization, - tanh_clipping=opts.tanh_clipping, - checkpoint_encoder=opts.checkpoint_encoder, - shrink_size=opts.shrink_size - ).to(opts.device) - - # if opts.use_cuda and torch.cuda.device_count() > 1: - # model_meta = torch.nn.DataParallel(model_meta) - - # Overwrite model parameters by parameters to load - model_ = get_inner_model(model_meta) - model_.load_state_dict({**model_.state_dict(), **load_data.get('model', {})}) - - # generate tasks based on task distribution. - tasks_list = generate_train_task(opts) - - baseline_dict, val_dict = {}, {} - print("{} tasks in task list: {}".format(len(tasks_list), tasks_list)) - - for task in tasks_list: - baseline = RolloutBaseline(model_meta, problem, opts, task=task) - baseline_dict[str(task)] = baseline - val_dataset = problem.make_dataset(num_samples=opts.val_size, distribution=opts.data_distribution, task=task) - val_dict[str(task)] = val_dataset - - alpha = opts.alpha - start_time = datetime.now() - for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs): - if (datetime.now() - start_time).total_seconds() >= 24*60*60: - print(">> Time Out: 24hrs. Training finished {} epochs".format(epoch)) - break - print(">> Epoch {}, alpha: {}".format(epoch, alpha)) - if opts.shuffle: - random.shuffle(tasks_list) - for index_task, task in enumerate(tasks_list): - baseline = baseline_dict[str(task)] - val_dataset = val_dict[str(task)] - meta_train_epoch(model_meta, baseline, epoch, val_dataset, problem, tb_logger, opts, alpha, task) - - alpha = alpha * opts.alpha_decay - - if (opts.checkpoint_epochs != 0 and epoch % opts.checkpoint_epochs == 0) or epoch == opts.n_epochs - 1: - print('Saving model and state...') - save_checkpoint(model_meta, os.path.join(opts.save_dir, 'epoch-{}.pt'.format(epoch))) - - # add validation here. 
- if opts.val_dataset is not None: - val_dataset = problem.make_dataset(filename=opts.val_dataset) - avg_reward = validate(model_meta, val_dataset, opts) - print(">> Epoch {} avg_cost on TSP100 validation set {}".format(epoch, avg_reward)) - - -if __name__ == "__main__": - run(get_options()) diff --git a/AM/run_multi.py b/AM/run_multi.py deleted file mode 100644 index e2a6f24..0000000 --- a/AM/run_multi.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python - -import json -import tqdm -import pprint as pp - -import torch -import torch.optim as optim -import os -from options import get_options -from torch.utils.data import DataLoader -from train import train_epoch, get_inner_model, clip_grad_norms, get_hard_samples, validate -from reinforce_baselines import NoBaseline, ExponentialBaseline, CriticBaseline, RolloutBaseline, WarmupBaseline -from nets.attention_model import AttentionModel, set_decode_type -from nets.pointer_network import PointerNetwork, CriticNetworkLSTM -from utils import torch_load_cpu, load_problem, seed_everything, save_checkpoint, move_to -from generate_dataset import generate_train_task -import datetime - - -def run(opts): - # hard-coded - opts.graph_size = -1 # for variation_type == size - opts.variation_type = "mix_dist_size" - update_task = False # update AM by batch (default) or task (implementation of "On the Generalization of Neural Combinatorial Optimization Heuristics") - eps = 0 - opts.val_dataset = "../data/size/tsp/tsp100_validation_seed4321.pkl" - # opts.baseline_every_Xepochs_for_META = 40 # set to default value for multi-AM / oracle-AM - - # Pretty print the run args - pp.pprint(vars(opts)) - - # Set the random seed - seed_everything(opts.seed) - - # Optionally configure tensorboard - tb_logger = None - # if not opts.no_tensorboard: - # tb_logger = TbLogger(os.path.join(opts.log_dir, "{}_{}".format(opts.problem, opts.graph_size), opts.run_name)) - - os.makedirs(opts.save_dir) - # Save arguments so exact configuration can always be found - with open(os.path.join(opts.save_dir, "args.json"), 'w') as f: - json.dump(vars(opts), f, indent=True) - - # Set the device - opts.device = torch.device("cuda" if opts.use_cuda else "cpu") - - # Figure out what's the problem - problem = load_problem(opts.problem) - - # Load data from load_path - load_data = {} - assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given" - load_path = opts.load_path if opts.load_path is not None else opts.resume - if load_path is not None: - print(' [*] Loading data from {}'.format(load_path)) - load_data = torch_load_cpu(load_path) - if opts.resume: - epoch_resume = int(os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1]) - opts.epoch_start = epoch_resume + 1 - - # Initialize model - model_class = { - 'attention': AttentionModel, - 'pointer': PointerNetwork - }.get(opts.model, None) - assert model_class is not None, "Unknown model: {}".format(model_class) - model_common = model_class( - opts.embedding_dim, - opts.hidden_dim, - problem, - n_encode_layers=opts.n_encode_layers, - mask_inner=True, - mask_logits=True, - normalization=opts.normalization, - tanh_clipping=opts.tanh_clipping, - checkpoint_encoder=opts.checkpoint_encoder, - shrink_size=opts.shrink_size - ).to(opts.device) - - # if opts.use_cuda and torch.cuda.device_count() > 1: - # model_meta = torch.nn.DataParallel(model_meta) - - # Overwrite model parameters by parameters to load - model_ = get_inner_model(model_common) - model_.load_state_dict({**model_.state_dict(), 
**load_data.get('model', {})}) - - # generate tasks based on task distribution. - tasks_list = generate_train_task(opts) - - baseline_dict, val_dict = {}, {} - print("{} tasks in task list: {}".format(len(tasks_list), tasks_list)) - - for task in tasks_list: - baseline = RolloutBaseline(model_common, problem, opts, task=task) - baseline_dict[str(task)] = baseline - val_dataset = problem.make_dataset(num_samples=opts.val_size, distribution=opts.data_distribution, task=task) - val_dict[str(task)] = val_dataset - - optimizer_common = optim.Adam(model_common.parameters(), opts.lr_model) - lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer_common, lambda epoch1: opts.lr_decay ** epoch1) - start_time = datetime.now() - - for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs): - if (datetime.now() - start_time).total_seconds() >= 24 * 60 * 60: - print(">> Time Out: 24hrs. Training finished {} epochs".format(epoch)) - break - if not update_task: - for task in tasks_list: - baseline = baseline_dict[str(task)] - val_dataset = val_dict[str(task)] - train_epoch(model_common, optimizer_common, baseline, lr_scheduler, epoch, val_dataset, problem, tb_logger, opts, task, eps=eps) - else: - for task in tasks_list: - baseline = baseline_dict[str(task)] - val_dataset = val_dict[str(task)] - - # update by task - epoch_size = opts.batch_size * opts.k_tune_steps - training_dataset = baseline.wrap_dataset(problem.make_dataset(num_samples=epoch_size, distribution=opts.data_distribution, task=task)) - training_dataloader = DataLoader(training_dataset, batch_size=opts.batch_size, num_workers=1) - model_common.train() - set_decode_type(model_common, "sampling") - loss = 0 - for batch_id, batch in enumerate(tqdm(training_dataloader, disable=opts.no_progress_bar)): - x, bl_val = baseline.unwrap_batch(batch) - x = move_to(x, opts.device) - bl_val = move_to(bl_val, opts.device) if bl_val is not None else None - if eps > 0: - x = get_hard_samples(model_common, x, eps, batch_size=x.size(0), baseline=baseline) - if bl_val is not None: - bl_val, _ = baseline.eval(x, None) - model_common.train() - set_decode_type(model_common, "sampling") - cost, log_likelihood = model_common(x) - bl_val, bl_loss = baseline.eval(x, cost) if bl_val is None else (bl_val, 0) - reinforce_loss = ((cost - bl_val) * log_likelihood).mean() - batch_loss = reinforce_loss + bl_loss - loss += batch_loss * x.size(0) - loss = loss / epoch_size - optimizer_common.zero_grad() - loss.backward() - clip_grad_norms(optimizer_common.param_groups, opts.max_grad_norm) - optimizer_common.step() - lr_scheduler.step() - if epoch % opts.baseline_every_Xepochs_for_META == 0: - # avg_reward = validate(model_common, val_dataset, opts) - baseline.epoch_callback(model_common, epoch) - - if (opts.checkpoint_epochs != 0 and epoch % opts.checkpoint_epochs == 0) or epoch == opts.n_epochs - 1: - print('Saving model and state...') - save_checkpoint(model_common, os.path.join(opts.save_dir, 'epoch-{}.pt'.format(epoch))) - - # add validation here. 
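            # NOTE: the validation block below appears to be carried over from run_meta.py; within
            # this script the trained model is model_common (model_meta is never defined here) and
            # the validation file is set via opts.val_dataset at the top of run().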
- if opts.val_dataset is not None: - val_dataset = problem.make_dataset(filename=opts.val_dataset_path) - avg_reward = validate(model_meta, val_dataset, opts) - print(">> Epoch {} avg_cost on TSP100 validation set {}".format(epoch, avg_reward)) - - -if __name__ == "__main__": - run(get_options()) diff --git a/AM/train.py b/AM/train.py deleted file mode 100644 index 8a08c63..0000000 --- a/AM/train.py +++ /dev/null @@ -1,309 +0,0 @@ -import os -import copy -import time -from tqdm import tqdm -import torch -import math -import pickle -from datetime import datetime - -import torch.optim as optim -from torch.utils.data import DataLoader -from torch.nn import DataParallel - -from nets.attention_model import set_decode_type -from utils.log_utils import log_values -from utils import move_to - - -def get_inner_model(model): - return model.module if isinstance(model, DataParallel) else model - - -def validate(model, dataset, opts, return_all_costs=False, return_pi=False): - # Validate - print('Validating...') - if return_pi: - cost, pi = rollout(model, dataset, opts, return_pi=True) - else: - cost = rollout(model, dataset, opts, return_pi=False) - avg_cost = cost.mean() - print('Validation overall avg_cost: {} +- {}'.format(avg_cost, torch.std(cost) / math.sqrt(len(cost)))) - - if return_all_costs and return_pi: - return avg_cost, cost, pi - if return_all_costs and not return_pi: - return avg_cost, cost - return avg_cost - - -def rollout(model, dataset, opts, return_pi=False): - # Put in greedy evaluation mode! - set_decode_type(model, "greedy") - model.eval() - - def eval_model_bat(bat): - with torch.no_grad(): - cost, _, pi = model(move_to(bat, opts.device), return_pi=True) - return cost.data.cpu(), pi.cpu() - - if not return_pi: - return torch.cat([ - eval_model_bat(bat)[0] - for bat in tqdm(DataLoader(dataset, batch_size=opts.eval_batch_size), disable=opts.no_progress_bar) - ], 0) - else: - cost_array, pi_array = [], [] - for bat in tqdm(DataLoader(dataset, batch_size=opts.eval_batch_size), disable=opts.no_progress_bar): - cost_, pi_ = eval_model_bat(bat) - cost_array.append(cost_) - pi_array.append(pi_) - return torch.cat(cost_array, 0), torch.cat(pi_array, 0) - - -def clip_grad_norms(param_groups, max_norm=math.inf): - """ - Clips the norms for all param groups to max_norm and returns gradient norms before clipping - :param optimizer: - :param max_norm: - :param gradient_norms_log: - :return: grad_norms, clipped_grad_norms: list with (clipped) gradient norms per group - """ - grad_norms = [ - torch.nn.utils.clip_grad_norm_( - group['params'], - max_norm if max_norm > 0 else math.inf, # Inf so no clipping but still call to calc - norm_type=2 - ) - for group in param_groups - ] - grad_norms_clipped = [min(g_norm, max_norm) for g_norm in grad_norms] if max_norm > 0 else grad_norms - return grad_norms, grad_norms_clipped - - -def get_hard_samples(model, data, eps=5, batch_size=1024, baseline=None): - from torch.autograd import Variable - model.eval() - set_decode_type(model, "greedy") - - def minmax(xy_): - ''' - min max batch of graphs [b,n,2] - ''' - xy_ = (xy_ - xy_.min(dim=1, keepdims=True)[0]) / (xy_.max(dim=1, keepdims=True)[0] - xy_.min(dim=1, keepdims=True)[0]) - return xy_ - - def get_hard(model, data, eps): - data.requires_grad_() - cost, ll, pi = model(data, return_pi=True) - if baseline is not None: - with torch.no_grad(): - cost_b, _ = baseline.eval(data, None) # only support for rollout now. 
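            # The step below is, roughly, one gradient-ascent step on the node coordinates:
            # the gradient of eps * (cost / cost_b) * log_likelihood points towards instances on
            # which the policy is weak relative to its rollout baseline, and minmax() then
            # rescales the perturbed coordinates back into the unit square.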
- # cost, ll = model(data) - delta = torch.autograd.grad(eps*((cost/cost_b)*ll).mean(), data)[0] - else: - # As dividend is viewed as constant, it can be omitted in gradient calculation. - delta = torch.autograd.grad(eps*(cost*ll).mean(), data)[0] - ndata = data.detach() + delta - ndata = minmax(ndata) - ndata = Variable(ndata, requires_grad=False) - return ndata - - # dataloader = DataLoader(data, batch_size=batch_size) - # hard = torch.cat([get_hard(model, data, eps) for data in dataloader], dim=0) - # return hard - return get_hard(model, data, eps) - - -def tune_and_test(task, model_meta, baseline, epoch, test_dataset, problem, tb_logger, opts, fine_tuning_dataset=None, dict_results_task_sample_iter_wise=None): - """ - test_dataset: Test dataset - fine_tuning_dataset: dataset used for fine-tuning the model - TODO: 1. why not fine_tuning_dataset = test_dataset? - """ - - print("task ", task) - sequence_updated_reward = [] - step = 0 - start_time = time.time() - COUNTER_FINE_TUNE = 0 - - training_dataset = baseline.wrap_dataset(fine_tuning_dataset) - num_fine_tune_step_epochs = opts.test_num_step_epochs # not 30; it depends upon (fine tuning dataset used) - num_batch_size = 256 if task['graph_size'] < 150 else 128 - - print("size of fine tuning dataset ", len(fine_tuning_dataset)) - print("num_batch_size ", num_batch_size) - - rand_sampler = torch.utils.data.RandomSampler(training_dataset, num_samples=len(training_dataset), replacement=True) - training_dataloader = DataLoader(training_dataset, batch_size=num_batch_size, num_workers=1, sampler=rand_sampler) - model_task = copy.deepcopy(model_meta) - - avg_reward, all_costs = validate(model_task, test_dataset, opts, return_all_costs=True, return_pi=False) - print(" >> AVG_COST {}, BEFORE TUNING on task {}".format(avg_reward, task)) - - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE] = {} - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['cost'] = all_costs - # if opts.rescale_for_testing is not None: # only for scratch part since we didn't want to train again. 
- # dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['cost'] = dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['cost']*(task['rescale_for_testing']/3.0) - # if COUNTER_FINE_TUNE % 50 == 0: - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['avg_cost'] = avg_reward.item() - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['current_time'] = datetime.now() - print("COUNTER FINE TUNE {}, AVG COSTS {}".format(COUNTER_FINE_TUNE, avg_reward.item())) - - sequence_updated_reward.append(avg_reward) - model_task.train() - set_decode_type(model_task, "sampling") - optimizer = optim.Adam(model_task.parameters(), lr=opts.lr_model*0.1) - print("num_fine_tune_step_epochs ", num_fine_tune_step_epochs) - time_spent_in_fine_tuning = 0 - - for outer_step_id in range(num_fine_tune_step_epochs): - print("Fine-tune epoch ", outer_step_id) - for batch_id, batch in enumerate(tqdm(training_dataloader, disable=opts.no_progress_bar)): - # if time_spent_in_fine_tuning > 180 or (COUNTER_FINE_TUNE == 250000 and opts.longer_fine_tune == 0): - # if COUNTER_FINE_TUNE > num_fine_tune_step_epochs: - # return updated_reward - time_before_update = datetime.now() - model_task.train() - set_decode_type(model_task, "sampling") - train_batch(model_task, optimizer, baseline, epoch, batch_id, step, batch, tb_logger, opts) - time_after_update = datetime.now() - time_taken_for_update = (time_after_update - time_before_update).total_seconds() / 60.0 - time_spent_in_fine_tuning += time_taken_for_update - step += 1 - COUNTER_FINE_TUNE += 1 - print(">> Time spent in fine-tuning {} minutes, {} steps".format(time_spent_in_fine_tuning, COUNTER_FINE_TUNE)) - - # if COUNTER_FINE_TUNE % 10 == 0 or COUNTER_FINE_TUNE == 1: - updated_reward, updated_all_costs = validate(model_task, test_dataset, opts, return_all_costs=True, return_pi=False) - print(" COST AFTER TUNING ", updated_reward) - sequence_updated_reward.append(updated_reward) - # if dict_results_task_sample_iter_wise is not None: - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE] = {} - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['cost'] = updated_all_costs - # dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['pi'] = None - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['current_time'] = datetime.now() - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['time_spent_in_fine_tuning'] = time_spent_in_fine_tuning - dict_results_task_sample_iter_wise[COUNTER_FINE_TUNE]['avg_cost'] = updated_reward.item() - - epoch_duration = time.time() - start_time - updated_reward = validate(model_task, test_dataset, opts, return_all_costs=False, return_pi=False) - - if num_fine_tune_step_epochs == 0: - print("****** No fine tuning done **** ") - else: - print(">> {} steps within {} epochs fine-tuning finished, took {} s".format(COUNTER_FINE_TUNE, num_fine_tune_step_epochs, time.strftime('%H:%M:%S', time.gmtime(epoch_duration)))) - print(">> AFTER TUNING on task ", task) - print(">> COST AFTER TUNING ", updated_reward) - - for index, x in enumerate(sequence_updated_reward): - print(x.item(), end=' -> ') - print("") - - return updated_reward - - -def train_epoch(model, optimizer, baseline, lr_scheduler, epoch, val_dataset, problem, tb_logger, opts, task, eps=0): - """ - Implementation of ordinary AM training (update by batch). 
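    Each epoch samples a fresh dataset of batch_size * k_tune_steps instances for the given task,
    performs one Adam update per batch on the REINFORCE loss
    ((cost - bl_val) * log_likelihood).mean(), and refreshes the rollout baseline every
    opts.baseline_every_Xepochs_for_META epochs via baseline.epoch_callback.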
- """ - print("Start train epoch {}, lr={} on task {}".format(epoch, optimizer.param_groups[0]['lr'], task)) - step = 0 - start_time = time.time() - - # Generate new training data for each epoch - epoch_size = opts.batch_size * opts.k_tune_steps - training_dataset = baseline.wrap_dataset(problem.make_dataset(num_samples=epoch_size, distribution=opts.data_distribution, task=task)) - training_dataloader = DataLoader(training_dataset, batch_size=opts.batch_size, num_workers=1) - - # Put model in train mode! - model.train() - set_decode_type(model, "sampling") - - for batch_id, batch in enumerate(tqdm(training_dataloader, disable=opts.no_progress_bar)): - train_batch(model, optimizer, baseline, epoch, batch_id, step, batch, tb_logger, opts, eps=eps) - step += 1 - - # update baseline model - if epoch % opts.baseline_every_Xepochs_for_META == 0: - # avg_reward = validate(model, val_dataset, opts) - baseline.epoch_callback(model, epoch) - - # lr_scheduler should be called at end of epoch - lr_scheduler.step() - - epoch_duration = time.time() - start_time - print("Finished epoch {}, took {} s".format(epoch, time.strftime('%H:%M:%S', time.gmtime(epoch_duration)))) - - -def meta_train_epoch(model_meta, baseline, epoch, val_dataset, problem, tb_logger, opts, alpha, task, eps=0): - """ - Implementation for meta-learning framework. - """ - lr = opts.lr_model * (opts.lr_decay ** epoch) - print("Start train epoch {}, lr={}, alpha={} on task {}".format(epoch, lr, alpha, task)) - step = 0 - start_time = time.time() - - # Generate new training data for each epoch - epoch_size = opts.batch_size * opts.k_tune_steps - training_dataset = baseline.wrap_dataset(problem.make_dataset(num_samples=epoch_size, distribution=opts.data_distribution, task=task)) - training_dataloader = DataLoader(training_dataset, batch_size=opts.batch_size, num_workers=1) - - # Put model in train mode! 
- current_weights = copy.deepcopy(model_meta.state_dict()) - model_meta.train() - set_decode_type(model_meta, "sampling") - optimizer = optim.Adam(model_meta.parameters(), lr=lr) - - for batch_id, batch in enumerate(tqdm(training_dataloader, disable=opts.no_progress_bar)): - train_batch(model_meta, optimizer, baseline, epoch, batch_id, step, batch, tb_logger, opts, eps=eps) - step += 1 - - candidate_weights = model_meta.state_dict() - state_dict = {candidate: (current_weights[candidate] + alpha * (candidate_weights[candidate] - current_weights[candidate])) for candidate in candidate_weights} - - # update baseline model - if epoch % opts.baseline_every_Xepochs_for_META == 0: - # avg_reward = validate(model, val_dataset, opts) - baseline.epoch_callback(model_meta, epoch) - - model_meta.load_state_dict(state_dict) - - epoch_duration = time.time() - start_time - print("Finished epoch {}, took {} s".format(epoch, time.strftime('%H:%M:%S', time.gmtime(epoch_duration)))) - - -def train_batch(model, optimizer, baseline, epoch, batch_id, step, batch, tb_logger, opts, eps=0): - x, bl_val = baseline.unwrap_batch(batch) - x = move_to(x, opts.device) - bl_val = move_to(bl_val, opts.device) if bl_val is not None else None - - if eps > 0: - x = get_hard_samples(model, x, eps, batch_size=x.size(0), baseline=baseline) - if bl_val is not None: - bl_val, _ = baseline.eval(x, None) - - # Evaluate model, get costs and log probabilities - model.train() - set_decode_type(model, "sampling") - cost, log_likelihood = model(x) - - # Evaluate baseline, get baseline loss if any (only for critic) - bl_val, bl_loss = baseline.eval(x, cost) if bl_val is None else (bl_val, 0) - - # Calculate loss - reinforce_loss = ((cost - bl_val) * log_likelihood).mean() - loss = reinforce_loss + bl_loss - - # Perform backward pass and optimization step - optimizer.zero_grad() - loss.backward() - # Clip gradient norms and get (clipped) gradient norms for logging - grad_norms = clip_grad_norms(optimizer.param_groups, opts.max_grad_norm) - optimizer.step() - - # Logging - # if step % int(opts.log_step) == 0: - # log_values(cost, grad_norms, epoch, batch_id, step, log_likelihood, reinforce_loss, bl_loss, tb_logger, opts) diff --git a/AM/utils/__init__.py b/AM/utils/__init__.py deleted file mode 100644 index e7d4dc0..0000000 --- a/AM/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .functions import * \ No newline at end of file diff --git a/AM/utils/beam_search.py b/AM/utils/beam_search.py deleted file mode 100644 index 5e68d20..0000000 --- a/AM/utils/beam_search.py +++ /dev/null @@ -1,218 +0,0 @@ -import time -import torch -from typing import NamedTuple -from utils.lexsort import torch_lexsort - - -def beam_search(*args, **kwargs): - beams, final_state = _beam_search(*args, **kwargs) - return get_beam_search_results(beams, final_state) - - -def get_beam_search_results(beams, final_state): - beam = beams[-1] # Final beam - if final_state is None: - return None, None, None, None, beam.batch_size - - # First state has no actions/parents and should be omitted when backtracking - actions = [beam.action for beam in beams[1:]] - parents = [beam.parent for beam in beams[1:]] - - solutions = final_state.construct_solutions(backtrack(parents, actions)) - return beam.score, solutions, final_state.get_final_cost()[:, 0], final_state.ids.view(-1), beam.batch_size - - -def _beam_search(state, beam_size, propose_expansions=None, - keep_states=False): - - beam = BatchBeam.initialize(state) - - # Initial state - beams = [beam if keep_states else 
beam.clear_state()] - - # Perform decoding steps - while not beam.all_finished(): - - # Use the model to propose and score expansions - parent, action, score = beam.propose_expansions() if propose_expansions is None else propose_expansions(beam) - if parent is None: - return beams, None - - # Expand and update the state according to the selected actions - beam = beam.expand(parent, action, score=score) - - # Get topk - beam = beam.topk(beam_size) - - # Collect output of step - beams.append(beam if keep_states else beam.clear_state()) - - # Return the final state separately since beams may not keep state - return beams, beam.state - - -class BatchBeam(NamedTuple): - """ - Class that keeps track of a beam for beam search in batch mode. - Since the beam size of different entries in the batch may vary, the tensors are not (batch_size, beam_size, ...) - but rather (sum_i beam_size_i, ...), i.e. flattened. This makes some operations a bit cumbersome. - """ - score: torch.Tensor # Current heuristic score of each entry in beam (used to select most promising) - state: None # To track the state - parent: torch.Tensor - action: torch.Tensor - batch_size: int # Can be used for optimizations if batch_size = 1 - device: None # Track on which device - - # Indicates for each row to which batch it belongs (0, 0, 0, 1, 1, 2, ...), managed by state - @property - def ids(self): - return self.state.ids.view(-1) # Need to flat as state has steps dimension - - def __getitem__(self, key): - assert torch.is_tensor(key) or isinstance(key, slice) # If tensor, idx all tensors by this tensor: - return self._replace( - # ids=self.ids[key], - score=self.score[key] if self.score is not None else None, - state=self.state[key], - parent=self.parent[key] if self.parent is not None else None, - action=self.action[key] if self.action is not None else None - ) - - # Do not use __len__ since this is used by namedtuple internally and should be number of fields - # def __len__(self): - # return len(self.ids) - - @staticmethod - def initialize(state): - batch_size = len(state.ids) - device = state.ids.device - return BatchBeam( - score=torch.zeros(batch_size, dtype=torch.float, device=device), - state=state, - parent=None, - action=None, - batch_size=batch_size, - device=device - ) - - def propose_expansions(self): - mask = self.state.get_mask() - # Mask always contains a feasible action - expansions = torch.nonzero(mask[:, 0, :] == 0) - parent, action = torch.unbind(expansions, -1) - return parent, action, None - - def expand(self, parent, action, score=None): - return self._replace( - score=score, # The score is cleared upon expanding as it is no longer valid, or it must be provided - state=self.state[parent].update(action), # Pass ids since we replicated state - parent=parent, - action=action - ) - - def topk(self, k): - idx_topk = segment_topk_idx(self.score, k, self.ids) - return self[idx_topk] - - def all_finished(self): - return self.state.all_finished() - - def cpu(self): - return self.to(torch.device('cpu')) - - def to(self, device): - if device == self.device: - return self - return self._replace( - score=self.score.to(device) if self.score is not None else None, - state=self.state.to(device), - parent=self.parent.to(device) if self.parent is not None else None, - action=self.action.to(device) if self.action is not None else None - ) - - def clear_state(self): - return self._replace(state=None) - - def size(self): - return self.state.ids.size(0) - - -def segment_topk_idx(x, k, ids): - """ - Finds the topk per segment of 
data x given segment ids (0, 0, 0, 1, 1, 2, ...). - Note that there may be fewer than k elements in a segment so the returned length index can vary. - x[result], ids[result] gives the sorted elements per segment as well as corresponding segment ids after sorting. - :param x: - :param k: - :param ids: - :return: - """ - assert x.dim() == 1 - assert ids.dim() == 1 - - # Since we may have varying beam size per batch entry we cannot reshape to (batch_size, beam_size) - # And use default topk along dim -1, so we have to be creative - # Now we have to get the topk per segment which is really annoying :( - # we use lexsort on (ids, score), create array with offset per id - # offsets[ids] then gives offsets repeated and only keep for which arange(len) < offsets + k - splits_ = torch.nonzero(ids[1:] - ids[:-1]) - - if len(splits_) == 0: # Only one group - _, idx_topk = x.topk(min(k, x.size(0))) - return idx_topk - - splits = torch.cat((ids.new_tensor([0]), splits_[:, 0] + 1)) - # Make a new array in which we store for each id the offset (start) of the group - # This way ids does not need to be increasing or adjacent, as long as each group is a single range - group_offsets = splits.new_zeros((splits.max() + 1,)) - group_offsets[ids[splits]] = splits - offsets = group_offsets[ids] # Look up offsets based on ids, effectively repeating for the repetitions per id - - # We want topk so need to sort x descending so sort -x (be careful with unsigned data type!) - idx_sorted = torch_lexsort((-(x if x.dtype != torch.uint8 else x.int()).detach(), ids)) - - # This will filter first k per group (example k = 2) - # ids = [0, 0, 0, 1, 1, 1, 1, 2] - # splits = [0, 3, 7] - # offsets = [0, 0, 0, 3, 3, 3, 3, 7] - # offs+2 = [2, 2, 2, 5, 5, 5, 5, 9] - # arange = [0, 1, 2, 3, 4, 5, 6, 7] - # filter = [1, 1, 0, 1, 1, 0, 0, 1] - # Use filter to get only topk of sorting idx - return idx_sorted[torch.arange(ids.size(0), out=ids.new()) < offsets + k] - - -def backtrack(parents, actions): - - # Now backtrack to find aligned action sequences in reversed order - cur_parent = parents[-1] - reversed_aligned_sequences = [actions[-1]] - for parent, sequence in reversed(list(zip(parents[:-1], actions[:-1]))): - reversed_aligned_sequences.append(sequence.gather(-1, cur_parent)) - cur_parent = parent.gather(-1, cur_parent) - - return torch.stack(list(reversed(reversed_aligned_sequences)), -1) - - -class CachedLookup(object): - - def __init__(self, data): - self.orig = data - self.key = None - self.current = None - - def __getitem__(self, key): - assert not isinstance(key, slice), "CachedLookup does not support slicing, " \ - "you can slice the result of an index operation instead" - - assert torch.is_tensor(key) # If tensor, idx all tensors by this tensor: - - if self.key is None: - self.key = key - self.current = self.orig[key] - elif len(key) != len(self.key) or (key != self.key).any(): - self.key = key - self.current = self.orig[key] - - return self.current diff --git a/AM/utils/boolmask.py b/AM/utils/boolmask.py deleted file mode 100644 index 4764745..0000000 --- a/AM/utils/boolmask.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -import torch.nn.functional as F - - -def _pad_mask(mask): - # By taking -size % 8, we get 0 if exactly divisible by 8 - # and required padding otherwise (i.e. 
-1 % 8 = 7 pad) - pad = -mask.size(-1) % 8 - if pad != 0: - mask = F.pad(mask, [0, pad]) - return mask, mask.size(-1) // 8 - - -def _mask_bool2byte(mask): - assert mask.dtype == torch.uint8 - # assert (mask <= 1).all() # Precondition, disabled for efficiency - mask, d = _pad_mask(mask) - return (mask.view(*mask.size()[:-1], d, 8) << torch.arange(8, out=mask.new())).sum(-1, dtype=torch.uint8) - - -def _mask_byte2long(mask): - assert mask.dtype == torch.uint8 - mask, d = _pad_mask(mask) - # Note this corresponds to a temporary factor 8 - # memory overhead by converting to long before summing - # Alternatively, aggregate using for loop - return (mask.view(*mask.size()[:-1], d, 8).long() << (torch.arange(8, dtype=torch.int64, device=mask.device) * 8)).sum(-1) - - -def mask_bool2long(mask): - assert mask.dtype == torch.uint8 - return _mask_byte2long(_mask_bool2byte(mask)) - - -def _mask_long2byte(mask, n=None): - if n is None: - n = 8 * mask.size(-1) - return (mask[..., None] >> (torch.arange(8, out=mask.new()) * 8))[..., :n].to(torch.uint8).view(*mask.size()[:-1], -1)[..., :n] - - -def _mask_byte2bool(mask, n=None): - if n is None: - n = 8 * mask.size(-1) - return (mask[..., None] & (mask.new_ones(8) << torch.arange(8, out=mask.new()) * 1)).view(*mask.size()[:-1], -1)[..., :n] > 0 - - -def mask_long2bool(mask, n=None): - assert mask.dtype == torch.int64 - return _mask_byte2bool(_mask_long2byte(mask), n=n) - - -def mask_long_scatter(mask, values, check_unset=True): - """ - Sets values in mask in dimension -1 with arbitrary batch dimensions - If values contains -1, nothing is set - Note: does not work for setting multiple values at once (like normal scatter) - """ - assert mask.size()[:-1] == values.size() - rng = torch.arange(mask.size(-1), out=mask.new()) - values_ = values[..., None] # Need to broadcast up do mask dim - # This indicates in which value of the mask a bit should be set - where = (values_ >= (rng * 64)) & (values_ < ((rng + 1) * 64)) - # Optional: check that bit is not already set - assert not (check_unset and ((mask & (where.long() << (values_ % 64))) > 0).any()) - # Set bit by shifting a 1 to the correct position - # (% not strictly necessary as bitshift is cyclic) - # since where is 0 if no value needs to be set, the bitshift has no effect - return mask | (where.long() << (values_ % 64)) diff --git a/AM/utils/data_utils.py b/AM/utils/data_utils.py deleted file mode 100644 index 54a3ee8..0000000 --- a/AM/utils/data_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import pickle - - -def check_extension(filename): - if os.path.splitext(filename)[1] != ".pkl": - return filename + ".pkl" - return filename - - -def save_dataset(dataset, filename): - - filedir = os.path.split(filename)[0] - - if not os.path.isdir(filedir): - os.makedirs(filedir) - - with open(check_extension(filename), 'wb') as f: - pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) - - -def load_dataset(filename): - - with open(check_extension(filename), 'rb') as f: - return pickle.load(f) \ No newline at end of file diff --git a/AM/utils/functions.py b/AM/utils/functions.py deleted file mode 100644 index 9c9657c..0000000 --- a/AM/utils/functions.py +++ /dev/null @@ -1,237 +0,0 @@ -import warnings - -import torch -import numpy as np -import os, random -import json -from tqdm import tqdm -from multiprocessing.dummy import Pool as ThreadPool -from multiprocessing import Pool -import torch.nn.functional as F - - -def seed_everything(seed=2022): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - 
torch.cuda.manual_seed_all(seed) - - -def save_checkpoint(model, path): - from torch.nn import DataParallel - if isinstance(model, dict): - d = { - 'rng_state': torch.get_rng_state(), - 'cuda_rng_state': torch.cuda.get_rng_state_all(), - } - for k, v in model.items(): - v = v.module if isinstance(v, DataParallel) else v - d['model{}'.format(k)] = v - else: - model = model.module if isinstance(model, DataParallel) else model - d = { - 'model': model.state_dict(), - 'rng_state': torch.get_rng_state(), - 'cuda_rng_state': torch.cuda.get_rng_state_all(), - } - torch.save(d, path) - print(">> Save checkpoint {} to {}".format(d.keys(), path)) - - -def load_problem(name): - from problems import TSP, CVRP, SDVRP, OP, PCTSPDet, PCTSPStoch - problem = { - 'tsp': TSP, - 'cvrp': CVRP, - 'sdvrp': SDVRP, - 'op': OP, - 'pctsp_det': PCTSPDet, - 'pctsp_stoch': PCTSPStoch, - }.get(name, None) - assert problem is not None, "Currently unsupported problem: {}!".format(name) - return problem - - -def torch_load_cpu(load_path): - return torch.load(load_path, map_location=lambda storage, loc: storage) # Load on CPU - - -def move_to(var, device): - if isinstance(var, dict): - return {k: move_to(v, device) for k, v in var.items()} - return var.to(device) - - -def _load_model_file(load_path, model): - """Loads the model with parameters from the file and returns optimizer state dict if it is in the file""" - - # Load the model parameters from a saved state - load_optimizer_state_dict = None - print(' [*] Loading model from {}'.format(load_path)) - - load_data = torch.load( - os.path.join( - os.getcwd(), - load_path - ), map_location=lambda storage, loc: storage) - - if isinstance(load_data, dict): - load_optimizer_state_dict = load_data.get('optimizer', None) - load_model_state_dict = load_data.get('model', load_data) - else: - load_model_state_dict = load_data.state_dict() - - state_dict = model.state_dict() - - state_dict.update(load_model_state_dict) - - model.load_state_dict(state_dict) - - return model, load_optimizer_state_dict - - -def load_args(filename): - with open(filename, 'r') as f: - args = json.load(f) - - # Backwards compatibility - if 'data_distribution' not in args: - args['data_distribution'] = None - probl, *dist = args['problem'].split("_") - if probl == "op": - args['problem'] = probl - args['data_distribution'] = dist[0] - return args - - -def load_model(path, epoch=None): - from nets.attention_model import AttentionModel - from nets.pointer_network import PointerNetwork - - if os.path.isfile(path): - model_filename = path - path = os.path.dirname(model_filename) - elif os.path.isdir(path): - if epoch is None: - epoch = max( - int(os.path.splitext(filename)[0].split("-")[1]) - for filename in os.listdir(path) - if os.path.splitext(filename)[1] == '.pt' - ) - model_filename = os.path.join(path, 'epoch-{}.pt'.format(epoch)) - else: - assert False, "{} is not a valid directory or file".format(path) - - args = load_args(os.path.join(path, 'args.json')) - - problem = load_problem(args['problem']) - - model_class = { - 'attention': AttentionModel, - 'pointer': PointerNetwork - }.get(args.get('model', 'attention'), None) - assert model_class is not None, "Unknown model: {}".format(model_class) - - model = model_class( - args['embedding_dim'], - args['hidden_dim'], - problem, - n_encode_layers=args['n_encode_layers'], - mask_inner=True, - mask_logits=True, - normalization=args['normalization'], - tanh_clipping=args['tanh_clipping'], - checkpoint_encoder=args.get('checkpoint_encoder', False), - 
shrink_size=args.get('shrink_size', None) - ) - # Overwrite model parameters by parameters to load - load_data = torch_load_cpu(model_filename) - model.load_state_dict({**model.state_dict(), **load_data.get('model', {})}) - - model, *_ = _load_model_file(model_filename, model) - - model.eval() # Put in eval mode - - return model, args - - -def parse_softmax_temperature(raw_temp): - # Load from file - if os.path.isfile(raw_temp): - return np.loadtxt(raw_temp)[-1, 0] - return float(raw_temp) - - -def run_all_in_pool(func, directory, dataset, opts, use_multiprocessing=True): - # # Test - # res = func((directory, 'test', *dataset[0])) - # return [res] - - num_cpus = os.cpu_count() if opts.cpus is None else opts.cpus - - w = len(str(len(dataset) - 1)) - offset = getattr(opts, 'offset', None) - if offset is None: - offset = 0 - ds = dataset[offset:(offset + opts.n if opts.n is not None else len(dataset))] - pool_cls = (Pool if use_multiprocessing and num_cpus > 1 else ThreadPool) - with pool_cls(num_cpus) as pool: - results = list(tqdm(pool.imap( - func, - [ - ( - directory, - str(i + offset).zfill(w), - *problem - ) - for i, problem in enumerate(ds) - ] - ), total=len(ds), mininterval=opts.progress_bar_mininterval)) - - failed = [str(i + offset) for i, res in enumerate(results) if res is None] - assert len(failed) == 0, "Some instances failed: {}".format(" ".join(failed)) - return results, num_cpus - - -def do_batch_rep(v, n): - if isinstance(v, dict): - return {k: do_batch_rep(v_, n) for k, v_ in v.items()} - elif isinstance(v, list): - return [do_batch_rep(v_, n) for v_ in v] - elif isinstance(v, tuple): - return tuple(do_batch_rep(v_, n) for v_ in v) - - return v[None, ...].expand(n, *v.size()).contiguous().view(-1, *v.size()[1:]) - - -def sample_many(inner_func, get_cost_func, input, batch_rep=1, iter_rep=1): - """ - :param input: (batch_size, graph_size, node_dim) input node features - :return: - """ - input = do_batch_rep(input, batch_rep) - - costs = [] - pis = [] - for i in range(iter_rep): - _log_p, pi = inner_func(input) - # pi.view(-1, batch_rep, pi.size(-1)) - cost, mask = get_cost_func(input, pi) - - costs.append(cost.view(batch_rep, -1).t()) - pis.append(pi.view(batch_rep, -1, pi.size(-1)).transpose(0, 1)) - - max_length = max(pi.size(-1) for pi in pis) - # (batch_size * batch_rep, iter_rep, max_length) => (batch_size, batch_rep * iter_rep, max_length) - pis = torch.cat( - [F.pad(pi, (0, max_length - pi.size(-1))) for pi in pis], - 1 - ) # .view(embeddings.size(0), batch_rep * iter_rep, max_length) - costs = torch.cat(costs, 1) - - # (batch_size) - mincosts, argmincosts = costs.min(-1) - # (batch_size, minlength) - minpis = pis[torch.arange(pis.size(0), out=argmincosts.new()), argmincosts] - - return minpis, mincosts diff --git a/AM/utils/lexsort.py b/AM/utils/lexsort.py deleted file mode 100644 index c6a943e..0000000 --- a/AM/utils/lexsort.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -import numpy as np - - -def torch_lexsort(keys, dim=-1): - if keys[0].is_cuda: - return _torch_lexsort_cuda(keys, dim) - else: - # Use numpy lex sort - return torch.from_numpy(np.lexsort([k.numpy() for k in keys], axis=dim)) - - -def _torch_lexsort_cuda(keys, dim=-1): - """ - Function calculates a lexicographical sort order on GPU, similar to np.lexsort - Relies heavily on undocumented behavior of torch.sort, namely that when sorting more than - 2048 entries in the sorting dim, it performs a sort using Thrust and it uses a stable sort - 
https://github.com/pytorch/pytorch/blob/695fd981924bd805704ecb5ccd67de17c56d7308/aten/src/THC/generic/THCTensorSort.cu#L330 - """ - - MIN_NUMEL_STABLE_SORT = 2049 # Minimum number of elements for stable sort - - # Swap axis such that sort dim is last and reshape all other dims to a single (batch) dimension - reordered_keys = tuple(key.transpose(dim, -1).contiguous() for key in keys) - flat_keys = tuple(key.view(-1) for key in keys) - d = keys[0].size(dim) # Sort dimension size - numel = flat_keys[0].numel() - batch_size = numel // d - batch_key = torch.arange(batch_size, dtype=torch.int64, device=keys[0].device)[:, None].repeat(1, d).view(-1) - - flat_keys = flat_keys + (batch_key,) - - # We rely on undocumented behavior that the sort is stable provided that - if numel < MIN_NUMEL_STABLE_SORT: - n_rep = (MIN_NUMEL_STABLE_SORT + numel - 1) // numel # Ceil - rep_key = torch.arange(n_rep, dtype=torch.int64, device=keys[0].device)[:, None].repeat(1, numel).view(-1) - flat_keys = tuple(k.repeat(n_rep) for k in flat_keys) + (rep_key,) - - idx = None # Identity sorting initially - for k in flat_keys: - if idx is None: - _, idx = k.sort(-1) - else: - # Order data according to idx and then apply - # found ordering to current idx (so permutation of permutation) - # such that we can order the next key according to the current sorting order - _, idx_ = k[idx].sort(-1) - idx = idx[idx_] - - # In the end gather only numel and strip of extra sort key - if numel < MIN_NUMEL_STABLE_SORT: - idx = idx[:numel] - - # Get only numel (if we have replicated), swap axis back and shape results - return idx[:numel].view(*reordered_keys[0].size()).transpose(dim, -1) % d diff --git a/AM/utils/log_utils.py b/AM/utils/log_utils.py deleted file mode 100644 index 68c4ad2..0000000 --- a/AM/utils/log_utils.py +++ /dev/null @@ -1,24 +0,0 @@ -def log_values(cost, grad_norms, epoch, batch_id, step, - log_likelihood, reinforce_loss, bl_loss, tb_logger, opts): - avg_cost = cost.mean().item() - grad_norms, grad_norms_clipped = grad_norms - - # Log values to screen - print('epoch: {}, train_batch_id: {}, avg_cost: {}'.format(epoch, batch_id, avg_cost)) - - print('grad_norm: {}, clipped: {}'.format(grad_norms[0], grad_norms_clipped[0])) - - # Log values to tensorboard - if not opts.no_tensorboard: - tb_logger.log_value('avg_cost', avg_cost, step) - - tb_logger.log_value('actor_loss', reinforce_loss.item(), step) - tb_logger.log_value('nll', -log_likelihood.mean().item(), step) - - tb_logger.log_value('grad_norm', grad_norms[0], step) - tb_logger.log_value('grad_norm_clipped', grad_norms_clipped[0], step) - - if opts.baseline == 'critic': - tb_logger.log_value('critic_loss', bl_loss.item(), step) - tb_logger.log_value('critic_grad_norm', grad_norms[1], step) - tb_logger.log_value('critic_grad_norm_clipped', grad_norms_clipped[1], step) diff --git a/AM/utils/monkey_patch.py b/AM/utils/monkey_patch.py deleted file mode 100644 index 5d90614..0000000 --- a/AM/utils/monkey_patch.py +++ /dev/null @@ -1,70 +0,0 @@ -import torch -from itertools import chain -from collections import defaultdict, Iterable -from copy import deepcopy - - -def load_state_dict(self, state_dict): - """Loads the optimizer state. - Arguments: - state_dict (dict): optimizer state. Should be an object returned - from a call to :meth:`state_dict`. 
- """ - # deepcopy, to be consistent with module API - state_dict = deepcopy(state_dict) - # Validate the state_dict - groups = self.param_groups - saved_groups = state_dict['param_groups'] - - if len(groups) != len(saved_groups): - raise ValueError("loaded state dict has a different number of " - "parameter groups") - param_lens = (len(g['params']) for g in groups) - saved_lens = (len(g['params']) for g in saved_groups) - if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)): - raise ValueError("loaded state dict contains a parameter group " - "that doesn't match the size of optimizer's group") - - # Update the state - id_map = {old_id: p for old_id, p in - zip(chain(*(g['params'] for g in saved_groups)), - chain(*(g['params'] for g in groups)))} - - def cast(param, value): - """Make a deep copy of value, casting all tensors to device of param.""" - if torch.is_tensor(value): - # Floating-point types are a bit special here. They are the only ones - # that are assumed to always match the type of params. - if any(tp in type(param.data).__name__ for tp in {'Half', 'Float', 'Double'}): - value = value.type_as(param.data) - value = value.to(param.device) - return value - elif isinstance(value, dict): - return {k: cast(param, v) for k, v in value.items()} - elif isinstance(value, Iterable): - return type(value)(cast(param, v) for v in value) - else: - return value - - # Copy state assigned to params (and cast tensors to appropriate types). - # State that is not assigned to params is copied as is (needed for - # backward compatibility). - state = defaultdict(dict) - for k, v in state_dict['state'].items(): - if k in id_map: - param = id_map[k] - state[param] = cast(param, v) - else: - state[k] = v - - # Update parameter groups, setting their 'params' value - def update_group(group, new_group): - new_group['params'] = group['params'] - return new_group - - param_groups = [ - update_group(g, ng) for g, ng in zip(groups, saved_groups)] - self.__setstate__({'state': state, 'param_groups': param_groups}) - - -torch.optim.Optimizer.load_state_dict = load_state_dict \ No newline at end of file diff --git a/AM/utils/tensor_functions.py b/AM/utils/tensor_functions.py deleted file mode 100644 index 1e09f75..0000000 --- a/AM/utils/tensor_functions.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch - - -def compute_in_batches(f, calc_batch_size, *args, n=None): - """ - Computes memory heavy function f(*args) in batches - :param n: the total number of elements, optional if it cannot be determined as args[0].size(0) - :param f: The function that is computed, should take only tensors as arguments and return tensor or tuple of tensors - :param calc_batch_size: The batch size to use when computing this function - :param args: Tensor arguments with equally sized first batch dimension - :return: f(*args), this should be one or multiple tensors with equally sized first batch dimension - """ - if n is None: - n = args[0].size(0) - n_batches = (n + calc_batch_size - 1) // calc_batch_size # ceil - if n_batches == 1: - return f(*args) - - # Run all batches - # all_res = [f(*batch_args) for batch_args in zip(*[torch.chunk(arg, n_batches) for arg in args])] - # We do not use torch.chunk such that it also works for other classes that support slicing - all_res = [f(*(arg[i * calc_batch_size:(i + 1) * calc_batch_size] for arg in args)) for i in range(n_batches)] - - # Allow for functions that return None - def safe_cat(chunks, dim=0): - if chunks[0] is None: - assert all(chunk is None for chunk in chunks) - 
return None - return torch.cat(chunks, dim) - - # Depending on whether the function returned a tuple we need to concatenate each element or only the result - if isinstance(all_res[0], tuple): - return tuple(safe_cat(res_chunks, 0) for res_chunks in zip(*all_res)) - return safe_cat(all_res, 0) diff --git a/POMO/TSP/TSProblemDef.py b/POMO/TSP/TSProblemDef.py index 04496ba..d1af6e9 100644 --- a/POMO/TSP/TSProblemDef.py +++ b/POMO/TSP/TSProblemDef.py @@ -1,6 +1,11 @@ - +import os, sys +import glob import torch +import pickle import numpy as np +os.chdir(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, "..") # for utils +from utils.functions import show, seed_everything, load_dataset, save_dataset def generate_task_set(meta_params): @@ -19,17 +24,30 @@ def generate_task_set(meta_params): return task_set -def get_random_problems(batch_size, problem_size, num_modes=0, cdist=0, distribution='uniform'): +def get_random_problems(batch_size, problem_size, num_modes=0, cdist=0, distribution='uniform', path=None): """ Generate TSP data within range of [0, 1] """ # uniform distribution problems.shape: (batch, problem, 2) if distribution == "uniform": - problems = torch.rand(size=(batch_size, problem_size, 2)) + problems = np.random.uniform(0, 1, [batch_size, problem_size, 2]) + # problems = torch.rand(size=(batch_size, problem_size, 2)) elif distribution == "gaussian_mixture": problems = generate_gaussian_mixture_tsp(batch_size, problem_size, num_modes=num_modes, cdist=cdist) + elif distribution in ["uniform_rectangle", "gaussian", "cluster", "diagonal", "tsplib"]: + problems = generate_tsp_dist(batch_size, problem_size, distribution) else: raise NotImplementedError + + # save as + if path is not None: + with open(os.path.join(path, "tsp{}_{}.pkl".format(problem_size, distribution)), "wb") as f: + pickle.dump(problems, f, pickle.HIGHEST_PROTOCOL) + + # return tensor + if not torch.is_tensor(problems): + problems = torch.Tensor(problems) + return problems @@ -76,22 +94,144 @@ def gaussian_mixture(graph_size=100, num_modes=0, cdist=1): return xy if num_modes == 0 and cdist == 0: - return torch.rand(size=(dataset_size, graph_size, 2)) + return np.random.uniform(0, 1, [dataset_size, graph_size, 2]) else: res = [] for i in range(dataset_size): res.append(gaussian_mixture(graph_size=graph_size, num_modes=num_modes, cdist=cdist)) - return torch.Tensor(np.array(res)) + return np.array(res) + + +def generate_tsp_dist(n_samples, n_nodes, distribution): + """ + Generate tsp instances with different distributions: ["cluster", "uniform_rectangle", "diagonal", "gaussian", "tsplib"] + from "Generative Adversarial Training for Neural Combinatorial Optimization Models". 
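+    A minimal usage sketch (output shape follows from the asserts at the end of this function):
+        x = generate_tsp_dist(n_samples=4, n_nodes=50, distribution="diagonal")
+        # x: np.ndarray of shape (4, 50, 2), with coordinates rescaled into the unit square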
+ """ + if distribution == "cluster": # time-consuming + x = [] + for i in range(n_samples): + print(n_nodes, i) + loc = [] + n_cluster = np.random.randint(low=3, high=9) + loc.append(np.random.randint(1000, size=[1, n_cluster, 2])) + prob = np.zeros((1000, 1000)) + coord = np.concatenate([np.tile(np.arange(1000).reshape(-1, 1, 1), [1, 1000, 1]), + np.tile(np.arange(1000).reshape(1, -1, 1), [1000, 1, 1])], -1) + for j in range(n_cluster): + dist = np.sqrt(np.sum((coord - loc[-1][0, j, :]) ** 2, -1)) + dist = np.exp(-dist / 40) + prob += dist + for j in range(n_cluster): + prob[loc[-1][0, j, 0], loc[-1][0, j, 1]] = 0 + prob = prob / prob.sum() + index = np.random.choice(1000000, n_nodes - n_cluster, replace=False, p=prob.reshape(-1)) + coord = coord[index // 1000, index % 1000] + loc.append(coord.reshape(1, -1, 2)) + loc = np.concatenate(loc, 1) + x.append(loc) + x = np.concatenate(x, 0) / 1000 + elif distribution == "uniform_rectangle": + data = [] + for i in range(n_samples): + width = np.random.uniform(0, 1) + x1 = np.random.uniform(0, 1, [1, n_nodes, 1]) + x2 = np.random.uniform(0.5 - width / 2, 0.5 + width / 2, [1, n_nodes, 1]) + if np.random.randint(2) == 0: + data.append(np.concatenate([x1, x2], 2)) + else: + data.append(np.concatenate([x2, x1], 2)) + x = np.concatenate(data, 0) + elif distribution == "diagonal": + data = [] + for i in range(n_samples): + x = np.random.uniform(low=0, high=1, size=(1, n_nodes, 1)) + r = np.random.uniform(low=0, high=1) + if np.random.randint(4) == 0: + x = np.concatenate([x, x * r + (1 - r) / 2], 2) + elif np.random.randint(4) == 1: + x = np.concatenate([x, (1 - x) * r + (1 - r) / 2], 2) + elif np.random.randint(4) == 2: + x = np.concatenate([x * r + (1 - r) / 2, x], 2) + else: + x = np.concatenate([(1 - x) * r + (1 - r) / 2, x], 2) + width = np.random.uniform(low=0.05, high=0.2) + x += np.random.uniform(low=-width / 2, high=width / 2, size=(1, n_nodes, 2)) + data.append(x) + x = np.concatenate(data, 0) + elif distribution == "gaussian": + data = [] + for i in range(n_samples): + mean = [0.5, 0.5] + cov = np.random.uniform(0, 1) + cov = [[1.0, cov], [cov, 1.0]] + x = np.random.multivariate_normal(mean, cov, [1, n_nodes]) + data.append(x) + x = np.concatenate(data, 0) + elif distribution == "tsplib": + file_names = glob.glob("../../data/TSP/tsplib/*.tsp") + data = [] + for file_name in file_names: + with open(file_name, "r") as f: + lines = f.readlines() + for i in range(len(lines)): + if lines[i].strip().split(":")[0].split(" ")[0] == "DIMENSION": + nodes = int(lines[i].strip().split(" ")[-1]) + x = [] + for i in range(len(lines)): + if lines[i].strip() == "NODE_COORD_SECTION": + for j in range(i + 1, i + nodes + 1): + line = [float(n) for n in lines[j].strip().split()] + assert j - i == int(line[0]) + x.append([line[1], line[2]]) + break + if len(x) == 0: + continue + x = np.array(x) + print(x.shape) + + if x.shape[0] < 500: + continue + for i in range(500): + index = np.random.choice(x.shape[0], n_nodes, replace=False) + x_new = x[index] + data.append(x_new.reshape(1, n_nodes, 2)) + + x = np.concatenate(data, 0) + x = x[np.random.permutation(x.shape[0])] + print(x.shape) + assert n_samples <= x.shape[0] + x = x[:n_samples] + print(x.shape) + + if distribution != "uniform_rectangle": + x_min, x_max = x.min(1), x.max(1) + x = x - x_min.reshape(-1, 1, 2) + x = x / (x_max - x_min).max(-1).reshape(-1, 1, 1) + x = x + (1 - x.max(1)).reshape(-1, 1, 2) / 2 + + np.random.shuffle(x) + + assert x.shape[0] == n_samples + assert x.shape[1] == n_nodes + assert 
x.shape[2] == 2
+
+    return x
 
 
 if __name__ == "__main__":
-    import os, sys
-    os.chdir(os.path.dirname(os.path.abspath(__file__)))
-    sys.path.insert(0, "..") # for utils
-    from utils.functions import show, seed_everything
-    seed_everything(seed=1234)
-
-    data = generate_gaussian_mixture_tsp(dataset_size=64, graph_size=100, num_modes=1, cdist=1)
-    print(type(data), data.size(), data)
-    x, y = data[0, :, 0].tolist(), data[0, :, -1].tolist()
-    show([x], [y], label=["Gaussian Mixture"], title="TSP100", xdes="x", ydes="y", path="./tsp.pdf")
+    """
+    train seed: 1234
+    val seed: 2022
+    test seed: 2023
+    """
+    path = "../../data/TSP"
+    seed_everything(seed=2023)
+
+    for dist in ["uniform", "uniform_rectangle", "gaussian", "cluster", "diagonal", "tsplib"]:
+        print(">> Generating TSP instances following {} distribution!".format(dist))
+        get_random_problems(20000, 100, distribution=dist, path=path)
+
+    # data = generate_gaussian_mixture_tsp(dataset_size=64, graph_size=100, num_modes=1, cdist=1)
+    # print(type(data), data.size(), data)
+    # x, y = data[0, :, 0].tolist(), data[0, :, -1].tolist()
+    # show([x], [y], label=["Gaussian Mixture"], title="TSP100", xdes="x", ydes="y", path="./tsp.pdf")
diff --git a/AM/problems/tsp/tsp_gurobi.py b/POMO/TSP/baselines.py
similarity index 59%
rename from AM/problems/tsp/tsp_gurobi.py
rename to POMO/TSP/baselines.py
index 9174b07..f3d8cbd 100644
--- a/AM/problems/tsp/tsp_gurobi.py
+++ b/POMO/TSP/baselines.py
@@ -1,26 +1,29 @@
-#!/usr/bin/python
-
-# Copyright 2017, Gurobi Optimization, Inc.
-
-# Solve a traveling salesman problem on a set of
-# points using lazy constraints. The base MIP model only includes
-# 'degree-2' constraints, requiring each node to have exactly
-# two incident edges. Solutions to this model may contain subtours -
-# tours that don't visit every city. The lazy constraint callback
-# adds new constraints to cut them off.
-
+import os, sys
+import time
+import glob
+import pickle
 import argparse
 import numpy as np
-from utils.data_utils import load_dataset, save_dataset
 from gurobipy import *
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, "..") # for utils
+from utils.functions import seed_everything, load_dataset, save_dataset
 
 
 def solve_euclidian_tsp(points, threads=0, timeout=None, gap=None):
     """
-    Solves the Euclidian TSP problem to optimality using the MIP formulation
+    Copyright 2017, Gurobi Optimization, Inc.
+    Solve a traveling salesman problem on a set of
+    points using lazy constraints. The base MIP model only includes
+    'degree-2' constraints, requiring each node to have exactly
+    two incident edges. Solutions to this model may contain subtours -
+    tours that don't visit every city. The lazy constraint callback
+    adds new constraints to cut them off.
+
+    Solves the Euclidean TSP problem to optimality using the MIP formulation
     with lazy subtour elimination constraint generation.
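+
+    Usage sketch (assuming, as the "[(obj, route), ...]" note in the __main__ block below
+    suggests, that the function returns an (objective, tour) pair):
+        cost, tour = solve_euclidian_tsp([(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 0.0)])
+        # cost should be ~4.0 for this unit square; tour is the visiting order of the points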
-    :param points: list of (x, y) coordinate
-    :return:
+    :param points: list of (x, y) coordinates
+    :return:
     """
 
     n = len(points)
@@ -119,3 +122,33 @@ def solve_all_gurobi(dataset):
         result = solve_euclidian_tsp(instance)
         results.append(result)
     return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compute (near-)opt solution.")
+    parser.add_argument('--baseline', type=str, default='gurobi', choices=['gurobi', 'lkh3', 'concorde', 'farthest_insertion'], help="which baseline to use")
+    parser.add_argument('--path', type=str, default="../../data/TSP", help='Directory containing the dataset .pkl files')
+    parser.add_argument('--offset', type=int, default=0, help='Offset where to start in dataset (default 0)')
+    parser.add_argument('--timelimit', type=int, default=0, help='time limit for baseline')
+    parser.add_argument('--num_samples', type=int, default=10000, help='Number of samples to evaluate (default 10000)')
+
+    args = parser.parse_args()
+
+    # Note: with the defaults, only instances [0:10000] of each dataset are solved for testing
+    file_names = glob.glob(os.path.join(args.path, "*.pkl"))
+    for file_name in file_names:
+        data = load_dataset(file_name)
+        print(">> {}: Solving dataset {}".format(args.baseline, file_name))
+        start_time = time.time()
+        if args.baseline == "gurobi":
+            res = solve_all_gurobi(data[args.offset:args.offset+args.num_samples]) # [(obj, route), ...]
+            print(">> Completed within {}s".format(time.time() - start_time))
+            # save the results
+            path = os.path.join(args.path, args.baseline)
+            if not os.path.exists(path):
+                os.makedirs(path)
+            path = os.path.join(path, os.path.split(file_name)[-1])
+            with open(path, "wb") as f:
+                pickle.dump(res, f, pickle.HIGHEST_PROTOCOL)
+        else:
+            raise NotImplementedError
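+
+    # Sketch: the pickled results can be read back with the same helper used for the
+    # datasets, e.g. (file name is illustrative):
+    #   res = load_dataset(os.path.join(args.path, args.baseline, "tsp100_uniform.pkl"))
+    #   print(sum(obj for obj, _ in res) / len(res))  # average (near-)optimal tour length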