From 2250a1e8f1276eb99f98c163b9a12b168bd23bd7 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Wed, 4 Dec 2024 13:34:05 -0500
Subject: [PATCH 1/8] fix greedy cpu allocation

---
 carveme/cli/carve.py              | 39 +++++++++++++++++++++++--------
 carveme/reconstruction/diamond.py |  4 ++--
 docs/usage.rst                    | 11 +++++++--
 3 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/carveme/cli/carve.py b/carveme/cli/carve.py
index 4d63ae5..358ab64 100755
--- a/carveme/cli/carve.py
+++ b/carveme/cli/carve.py
@@ -15,17 +15,17 @@
 import os
 import os.path
 import pandas as pd
-from multiprocessing import Pool
+from multiprocessing import Pool, cpu_count
 from glob import glob
 import subprocess
 
 
-def first_run_check():
+def first_run_check(ncores):
     diamond_db = project_dir + config.get('generated', 'diamond_db')
     if not os.path.exists(diamond_db):
         print("Running diamond for the first time, please wait while we build the internal database...")
         fasta_file = project_dir + config.get('generated', 'fasta_file')
-        cmd = ['diamond', 'makedb', '--in', fasta_file, '-d', diamond_db[:-5]]
+        cmd = ['diamond', 'makedb', '--threads', ncores,  '--in', fasta_file, '-d', diamond_db[:-5]]
         try:
             exit_code = subprocess.call(cmd)
         except OSError:
@@ -45,7 +45,7 @@ def build_model_id(name):
 def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None, universe=None, universe_file=None,
          ensemble_size=None, verbose=False, debug=False, flavor=None, gapfill=None, blind_gapfill=False, init=None,
          mediadb=None, default_score=None, uptake_score=None, soft_score=None, soft=None, hard=None, reference=None,
-         ref_score=None, recursive_mode=False):
+             ref_score=None, recursive_mode=False, ncores=None):
 
     if recursive_mode:
         model_id = os.path.splitext(os.path.basename(inputfile))[0]
@@ -108,7 +108,7 @@ def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None
             print('Running diamond...')
         diamond_db = project_dir + config.get('generated', 'diamond_db')
         blast_output = os.path.splitext(inputfile)[0] + '.tsv'
-        exit_code = run_blast(inputfile, input_type, blast_output, diamond_db, diamond_args, verbose)
+        exit_code = run_blast(inputfile, input_type, blast_output, diamond_db, diamond_args, ncores, verbose)
 
         if exit_code is None:
             print('Unable to run diamond (make sure diamond is available in your PATH).')
@@ -313,6 +313,16 @@ def main():
 
     parser.add_argument('--blind-gapfill', action='store_true', help=argparse.SUPPRESS)
 
+    parser.add_argument('--njobs', type=int, default=cpu_count(),
+                        help="number of concurrent tasks to run via "
+                        "multiprocessing.Pool; defaults to " +
+                        "multiprocessing.cpu_count() " +
+                        " (%(default)s)")
+    parser.add_argument('--ncores', type=int, default=1,
+                        help="number of cores to pass to "
+                        "each multiprocessing.Pool job (eg diamond); " +
+                        "jobs. default: %(default)s")
+
     args = parser.parse_args()
 
     if args.gapfill and args.ensemble:
@@ -345,12 +355,19 @@ def main():
     else:
         flavor = config.get('sbml', 'default_flavor')
 
+    if (args.ncores * args.njobs) > cpu_count():
+        parser.error(f'--ncores ({args.ncores}) multiplied by --njobs ({args.njobs}) cannot exceed {cpu_count()}')
+
     if args.solver:
         set_default_solver(args.solver)
 #    else:
 #        set_default_solver(config.get('solver', 'default_solver'))
 
-    first_run_check()
+    # give the initial diamond run all available resources
+    first_run_check(args.ncores * args.njobs)
+
+    if args.gapfill and args.ensemble:
+        parser.error('Gap fill and ensemble generation cannot currently be combined (not implemented yet).')
 
     if not args.recursive:
         if len(args.input) > 1:
@@ -377,7 +394,8 @@ def main():
             soft=args.soft,
             hard=args.hard,
             reference=args.reference,
-            ref_score=args.reference_score
+            ref_score=args.reference_score,
+            ncores=args.ncores,
         )
 
     else:
@@ -404,10 +422,11 @@ def f(x):
                 hard=args.hard,
                 reference=args.reference,
                 ref_score=args.reference_score,
-                recursive_mode=True
-            )
+                recursive_mode=True,
+                ncores=args.ncores,
+        )
 
-        p = Pool()
+        p = Pool(args.njobs)
         p.map(f, args.input)
 
 
diff --git a/carveme/reconstruction/diamond.py b/carveme/reconstruction/diamond.py
index 5bb2457..1244797 100755
--- a/carveme/reconstruction/diamond.py
+++ b/carveme/reconstruction/diamond.py
@@ -25,7 +25,7 @@ def load_diamond_results(filename, drop_unused_cols=True):
     return data
 
 
-def run_blast(inputfile, input_type, outputfile, database, args=None, verbose=True):
+def run_blast(inputfile, input_type, outputfile, database, args=None, ncores=None, verbose=True):
     """ Run blast aligment with Diamond.
 
     Args:
@@ -58,7 +58,7 @@ def run_blast(inputfile, input_type, outputfile, database, args=None, verbose=Tr
     cmd += ['-o', outputfile]
 
     if not args:
-        args = "--more-sensitive --top 10 --quiet"
+        args = f"--more-sensitive --top 10 --quiet --threads {ncores}"
 
     cmd += args.split()
 
diff --git a/docs/usage.rst b/docs/usage.rst
index 172b5e7..c764324 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -64,6 +64,15 @@ This can be combined with *-o* to change the output folder:
 
     $ carve -r myfolder/*.faa -o mymodels/
 
+To balance the number of samples processed concurrently against the number of cores allocated to each,
+specify the ``--njobs`` and ``--ncores`` arguments. ``--njobs`` is passed to ``multiprocessing.Pool()``,
+while ``--ncores`` is passed to Diamond. By default, one single-threaded job is run per available CPU.
+To instead run Diamond with 4 threads on at most 2 concurrent samples, use:
+
+.. code-block:: console
+
+    $ carve --ncores 4 --njobs 2 -r myfolder/*.faa -o mymodels/
+
 
 Gap Filling
 -----------
@@ -152,5 +161,3 @@ You can initialize the community with a pre-defined medium (just like during sin
 .. code-block:: console
 
     $ merge_community [input files] -i M9
-
-

From 4635a440678dda8b53fefe15bcb46c97aec6fb92 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Wed, 4 Dec 2024 14:23:50 -0500
Subject: [PATCH 2/8] fix subprocess syntax

---
 carveme/cli/carve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/carveme/cli/carve.py b/carveme/cli/carve.py
index 358ab64..35c44e9 100755
--- a/carveme/cli/carve.py
+++ b/carveme/cli/carve.py
@@ -25,7 +25,7 @@ def first_run_check(ncores):
     if not os.path.exists(diamond_db):
         print("Running diamond for the first time, please wait while we build the internal database...")
         fasta_file = project_dir + config.get('generated', 'fasta_file')
-        cmd = ['diamond', 'makedb', '--threads', ncores,  '--in', fasta_file, '-d', diamond_db[:-5]]
+        cmd = ['diamond', 'makedb', '--threads', str(ncores),  '--in', fasta_file, '-d', diamond_db[:-5]]
         try:
             exit_code = subprocess.call(cmd)
         except OSError:

From 0f4c3513c9103645500bbaa47df049e3bfe8feba Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Wed, 4 Dec 2024 15:00:02 -0500
Subject: [PATCH 3/8] fix path in setup.py

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 4ef0ce1..ea4184f 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
         'data/input/media_db.tsv',
         'data/input/metabolomics_park2016.csv',
         'data/input/unbalanced_metabolites.csv',
-        'data/input/bigg_proteins.faa',
+        'data/generated/bigg_proteins.faa',
         'data/input/equilibrator_compounds.tsv.gz',
         'data/input/refseq_release_201.tsv.gz',
         'data/generated/bigg_gibbs.csv',
@@ -100,7 +100,7 @@
     keywords='carveme',
     classifiers=[
         'Development Status :: 5 - Production/Stable',
-        'Environment :: Console', 
+        'Environment :: Console',
         'Intended Audience :: Science/Research',
         'Topic :: Scientific/Engineering :: Bio-Informatics',
         'Programming Language :: Python :: 3.6',

From ea96f0ac8d5ab2f867d597179388fe01b44db480 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Wed, 4 Dec 2024 15:27:29 -0500
Subject: [PATCH 4/8] fix setup.py to catch missing included_files

---
 setup.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index ea4184f..0c70e7f 100644
--- a/setup.py
+++ b/setup.py
@@ -16,19 +16,19 @@
 included_files = {
     'carveme': [
         'config.cfg',
-        'data/input/bigg_models.csv',
+        'data/input/bigg_models.tsv',
         'data/input/biomass_db.tsv',
-        'data/input/manually_curated.csv',
+        'data/input/manually_curated.tsv',
         'data/input/media_db.tsv',
-        'data/input/metabolomics_park2016.csv',
+#        'data/input/metabolomics_park2016.csv', deleted 5cbc611af5aa265c39882f7a88bf357f3261b170
         'data/input/unbalanced_metabolites.csv',
         'data/generated/bigg_proteins.faa',
-        'data/input/equilibrator_compounds.tsv.gz',
+        'data/input/mnx_compounds.tsv',
         'data/input/refseq_release_201.tsv.gz',
-        'data/generated/bigg_gibbs.csv',
+#        'data/generated/bigg_gibbs.csv', # deleted c897f41d7d03c27ca12ecd9ee97337355338c378
         'data/generated/bigg_gprs.csv.gz',
         'data/generated/model_specific_data.csv.gz',
-        'data/generated/universe_draft.xml.gz',
+        'data/generated/bigg_universe.xml.gz',
         'data/generated/universe_bacteria.xml.gz',
         'data/generated/universe_grampos.xml.gz',
         'data/generated/universe_gramneg.xml.gz',
@@ -70,7 +70,15 @@
         'data/benchmark/results/essentiality.tsv',
     ]
 }
-
+missing_files = []
+for path in included_files["carveme"]:
+    fullpath = os.path.join("carveme", path)
+    if not os.path.exists(fullpath):
+        missing_files.append(fullpath)
+if missing_files:
+    print("files required for install are not found:\n")
+    print("\n".join(missing_files))
+    raise ValueError("missing files; exiting")
 
 setup(
     name='carveme',

From dfc24dc9d55319025971b95d2b4e0c54cf327fa1 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Thu, 5 Dec 2024 09:31:42 -0500
Subject: [PATCH 5/8] add config check for missing files

---
 carveme/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/carveme/__init__.py b/carveme/__init__.py
index 6a12535..5e689b2 100755
--- a/carveme/__init__.py
+++ b/carveme/__init__.py
@@ -10,8 +10,12 @@
 
 config = ConfigParser()
 config.read(project_dir + 'config.cfg')
+for k,v in config.items():
+    if os.pathsep in v:
+        if not os.path.exists(v):
+            raise ValueError(f'file {v} not found')
 
 #set_default_solver(config.get('solver', 'default_solver'))
 #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol')
 #default_parameters[Parameter.OPTIMALITY_TOL] = config.getfloat('solver', 'opt_tol')
-#default_parameters[Parameter.INT_FEASIBILITY_TOL] = config.getfloat('solver', 'int_feas_tol')
\ No newline at end of file
+#default_parameters[Parameter.INT_FEASIBILITY_TOL] = config.getfloat('solver', 'int_feas_tol')

From d1d4e7c0b40c8a412df64ced4fe58123ff89ce86 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Thu, 5 Dec 2024 13:09:21 -0500
Subject: [PATCH 6/8] move config check to setup.py from __init__

---
 carveme/__init__.py |  5 -----
 carveme/config.cfg  |  3 ---
 setup.py            | 13 ++++++++++++-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/carveme/__init__.py b/carveme/__init__.py
index 5e689b2..663d16f 100755
--- a/carveme/__init__.py
+++ b/carveme/__init__.py
@@ -9,11 +9,6 @@
 project_dir = os.path.abspath(os.path.dirname(__file__)) + '/'
 
 config = ConfigParser()
-config.read(project_dir + 'config.cfg')
-for k,v in config.items():
-    if os.pathsep in v:
-        if not os.path.exists(v):
-            raise ValueError(f'file {v} not found')
 
 #set_default_solver(config.get('solver', 'default_solver'))
 #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol')
diff --git a/carveme/config.cfg b/carveme/config.cfg
index b00cca0..079b468 100644
--- a/carveme/config.cfg
+++ b/carveme/config.cfg
@@ -1,7 +1,6 @@
 [input]
 biomass_library = data/input/biomass_db.tsv
 media_library = data/input/media_db.tsv
-metabolomics = data/input/metabolomics_park2016.csv
 refseq = data/input/refseq_release_201.tsv.gz
 mnx_compounds = data/input/mnx_compounds.tsv
 bigg_models = data/input/bigg_models.tsv
@@ -14,7 +13,6 @@ bigg_gprs = data/generated/bigg_gprs.csv.gz
 model_specific_data = data/generated/model_specific_data.csv.gz
 gene_annotations = data/generated/gene_annotations.tsv.gz
 bigg_universe = data/generated/bigg_universe.xml.gz
-bigg_annotated = data/generated/bigg_annotated.xml.gz
 default_universe = data/generated/universe_bacteria.xml.gz
 fasta_file = data/generated/bigg_proteins.faa
 diamond_db = data/generated/bigg_proteins.dmnd
@@ -35,4 +33,3 @@ default_flavor = bigg
 
 [gapfill]
 max_uptake = 100
-
diff --git a/setup.py b/setup.py
index 0c70e7f..56dddda 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,8 @@
 # -*- coding: utf-8 -*-
 
 """The setup script."""
-
+import os
+from configparser import ConfigParser
 from setuptools import setup, find_packages
 
 with open('README.rst') as readme_file:
@@ -80,6 +81,16 @@
     print("\n".join(missing_files))
     raise ValueError("missing files; exiting")
 
+config = ConfigParser()
+project_dir = "carveme"
+config.read(os.path.join(project_dir, 'config.cfg'))
+for chunk in ["input", "generated"]:
+    for k,v in config[chunk].items():
+        vpath = os.path.join(project_dir, v)
+        if not os.path.exists(vpath) and k != "diamond_db":
+            raise ValueError(f'file {vpath} not found')
+
+
 setup(
     name='carveme',
     version='1.6.2',

From d9e19a350fe5cd556bbb12f0bb25408cb0587f4a Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Thu, 5 Dec 2024 15:09:06 -0500
Subject: [PATCH 7/8] fix config syntax

---
 carveme/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/carveme/__init__.py b/carveme/__init__.py
index 663d16f..bb21ac4 100755
--- a/carveme/__init__.py
+++ b/carveme/__init__.py
@@ -9,6 +9,7 @@
 project_dir = os.path.abspath(os.path.dirname(__file__)) + '/'
 
 config = ConfigParser()
+config.read(project_dir + 'config.cfg')
 
 #set_default_solver(config.get('solver', 'default_solver'))
 #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol')

From f204628ba8ee9a061710d125b49e3d9d6a657985 Mon Sep 17 00:00:00 2001
From: Nick Waters <watersn@mskcc.org>
Date: Thu, 5 Dec 2024 15:21:37 -0500
Subject: [PATCH 8/8] add better check for config files missing in
 included_files

---
 setup.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 56dddda..283ffcf 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
         'data/generated/bigg_proteins.faa',
         'data/input/mnx_compounds.tsv',
         'data/input/refseq_release_201.tsv.gz',
+        'data/generated/gene_annotations.tsv.gz',
 #        'data/generated/bigg_gibbs.csv', # deleted c897f41d7d03c27ca12ecd9ee97337355338c378
         'data/generated/bigg_gprs.csv.gz',
         'data/generated/model_specific_data.csv.gz',
@@ -84,11 +85,18 @@
 config = ConfigParser()
 project_dir = "carveme"
 config.read(os.path.join(project_dir, 'config.cfg'))
+config_files = []
 for chunk in ["input", "generated"]:
     for k,v in config[chunk].items():
         vpath = os.path.join(project_dir, v)
-        if not os.path.exists(vpath) and k != "diamond_db":
+        if k in ["folder", "diamond_db"]: continue
+        if not os.path.exists(vpath):
             raise ValueError(f'file {vpath} not found')
+        elif v not in included_files["carveme"]:
+            raise ValueError(f'config file {vpath} not included in setup.py')
+        else:
+            config_files.append(v)
+
 
 
 setup(