From 2250a1e8f1276eb99f98c163b9a12b168bd23bd7 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Wed, 4 Dec 2024 13:34:05 -0500 Subject: [PATCH 1/8] fix greedy cpu allocation --- carveme/cli/carve.py | 39 +++++++++++++++++++++++-------- carveme/reconstruction/diamond.py | 4 ++-- docs/usage.rst | 11 +++++++-- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/carveme/cli/carve.py b/carveme/cli/carve.py index 4d63ae5..358ab64 100755 --- a/carveme/cli/carve.py +++ b/carveme/cli/carve.py @@ -15,17 +15,17 @@ import os import os.path import pandas as pd -from multiprocessing import Pool +from multiprocessing import Pool, cpu_count from glob import glob import subprocess -def first_run_check(): +def first_run_check(ncores): diamond_db = project_dir + config.get('generated', 'diamond_db') if not os.path.exists(diamond_db): print("Running diamond for the first time, please wait while we build the internal database...") fasta_file = project_dir + config.get('generated', 'fasta_file') - cmd = ['diamond', 'makedb', '--in', fasta_file, '-d', diamond_db[:-5]] + cmd = ['diamond', 'makedb', '--threads', ncores, '--in', fasta_file, '-d', diamond_db[:-5]] try: exit_code = subprocess.call(cmd) except OSError: @@ -45,7 +45,7 @@ def build_model_id(name): def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None, universe=None, universe_file=None, ensemble_size=None, verbose=False, debug=False, flavor=None, gapfill=None, blind_gapfill=False, init=None, mediadb=None, default_score=None, uptake_score=None, soft_score=None, soft=None, hard=None, reference=None, - ref_score=None, recursive_mode=False): + ref_score=None, recursive_mode=False, ncores=None): if recursive_mode: model_id = os.path.splitext(os.path.basename(inputfile))[0] @@ -108,7 +108,7 @@ def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None print('Running diamond...') diamond_db = project_dir + config.get('generated', 'diamond_db') blast_output = 
os.path.splitext(inputfile)[0] + '.tsv' - exit_code = run_blast(inputfile, input_type, blast_output, diamond_db, diamond_args, verbose) + exit_code = run_blast(inputfile, input_type, blast_output, diamond_db, diamond_args, ncores, verbose) if exit_code is None: print('Unable to run diamond (make sure diamond is available in your PATH).') @@ -313,6 +313,16 @@ def main(): parser.add_argument('--blind-gapfill', action='store_true', help=argparse.SUPPRESS) + parser.add_argument('--njobs', type=int, default=cpu_count(), + help="number of concurrent tasks to run via " + "multiprocessing.Pool; defaults to " + + "multiprocessing.cpu_count() " + + " (%(default)s)") + parser.add_argument('--ncores', type=int, default=1, + help="number of cores to pass to " + "each multiprocessing.Pool job (eg diamond); " + + "jobs. default: %(default)s") + args = parser.parse_args() if args.gapfill and args.ensemble: @@ -345,12 +355,19 @@ def main(): else: flavor = config.get('sbml', 'default_flavor') + if (args.ncores * args.njobs) > cpu_count(): + parser.error(f'--ncores ({args.ncores}) multiplied by --njobs ({args.njobs}) cannot exceed {cpu_count()}') + if args.solver: set_default_solver(args.solver) # else: # set_default_solver(config.get('solver', 'default_solver')) - first_run_check() + # give the initial diamond run all available resources + first_run_check(args.ncores * args.njobs) + + if args.gapfill and args.ensemble: + parser.error('Gap fill and ensemble generation cannot currently be combined (not implemented yet).') if not args.recursive: if len(args.input) > 1: @@ -377,7 +394,8 @@ def main(): soft=args.soft, hard=args.hard, reference=args.reference, - ref_score=args.reference_score + ref_score=args.reference_score, + ncores=args.ncores, ) else: @@ -404,10 +422,11 @@ def f(x): hard=args.hard, reference=args.reference, ref_score=args.reference_score, - recursive_mode=True - ) + recursive_mode=True, + ncores=args.ncores, + ) - p = Pool() + p = Pool(args.njobs) p.map(f, args.input) 
diff --git a/carveme/reconstruction/diamond.py b/carveme/reconstruction/diamond.py index 5bb2457..1244797 100755 --- a/carveme/reconstruction/diamond.py +++ b/carveme/reconstruction/diamond.py @@ -25,7 +25,7 @@ def load_diamond_results(filename, drop_unused_cols=True): return data -def run_blast(inputfile, input_type, outputfile, database, args=None, verbose=True): +def run_blast(inputfile, input_type, outputfile, database, args=None, ncores=None, verbose=True): """ Run blast aligment with Diamond. Args: @@ -58,7 +58,7 @@ def run_blast(inputfile, input_type, outputfile, database, args=None, verbose=Tr cmd += ['-o', outputfile] if not args: - args = "--more-sensitive --top 10 --quiet" + args = f"--more-sensitive --top 10 --quiet --threads {ncores}" cmd += args.split() diff --git a/docs/usage.rst b/docs/usage.rst index 172b5e7..c764324 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -64,6 +64,15 @@ This can be combined with *-o* to change the output folder: $ carve -r myfolder/*.faa -o mymodels/ +To balance the number of samples processed concurrently against the number of cores allocated to each sample, +specify the `--njobs` and `--ncores` arguments. `--njobs` is passed to `multiprocessing.Pool()`, +while `--ncores` is passed to Diamond. By default, `--njobs` is the number of available CPUs and `--ncores` is 1 (one single-threaded job per CPU). +To instead run Diamond with 4 threads on at most 2 concurrent samples, use: + +.. code-block:: console + + $ carve --ncores 4 --njobs 2 -r myfolder/*.faa -o mymodels/ + Gap Filling ----------- @@ -152,5 +161,3 @@ You can initialize the community with a pre-defined medium (just like during sin .. 
code-block:: console $ merge_community [input files] -i M9 - - From 4635a440678dda8b53fefe15bcb46c97aec6fb92 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Wed, 4 Dec 2024 14:23:50 -0500 Subject: [PATCH 2/8] fix subprocess syntax --- carveme/cli/carve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/carveme/cli/carve.py b/carveme/cli/carve.py index 358ab64..35c44e9 100755 --- a/carveme/cli/carve.py +++ b/carveme/cli/carve.py @@ -25,7 +25,7 @@ def first_run_check(ncores): if not os.path.exists(diamond_db): print("Running diamond for the first time, please wait while we build the internal database...") fasta_file = project_dir + config.get('generated', 'fasta_file') - cmd = ['diamond', 'makedb', '--threads', ncores, '--in', fasta_file, '-d', diamond_db[:-5]] + cmd = ['diamond', 'makedb', '--threads', str(ncores), '--in', fasta_file, '-d', diamond_db[:-5]] try: exit_code = subprocess.call(cmd) except OSError: From 0f4c3513c9103645500bbaa47df049e3bfe8feba Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Wed, 4 Dec 2024 15:00:02 -0500 Subject: [PATCH 3/8] fix path in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4ef0ce1..ea4184f 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ 'data/input/media_db.tsv', 'data/input/metabolomics_park2016.csv', 'data/input/unbalanced_metabolites.csv', - 'data/input/bigg_proteins.faa', + 'data/generated/bigg_proteins.faa', 'data/input/equilibrator_compounds.tsv.gz', 'data/input/refseq_release_201.tsv.gz', 'data/generated/bigg_gibbs.csv', @@ -100,7 +100,7 @@ keywords='carveme', classifiers=[ 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', + 'Environment :: Console', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'Programming Language :: Python :: 3.6', From ea96f0ac8d5ab2f867d597179388fe01b44db480 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Wed, 4 Dec 2024 
15:27:29 -0500 Subject: [PATCH 4/8] fix setup.py to catch missing included_files --- setup.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index ea4184f..0c70e7f 100644 --- a/setup.py +++ b/setup.py @@ -16,19 +16,19 @@ included_files = { 'carveme': [ 'config.cfg', - 'data/input/bigg_models.csv', + 'data/input/bigg_models.tsv', 'data/input/biomass_db.tsv', - 'data/input/manually_curated.csv', + 'data/input/manually_curated.tsv', 'data/input/media_db.tsv', - 'data/input/metabolomics_park2016.csv', +# 'data/input/metabolomics_park2016.csv', deleted 5cbc611af5aa265c39882f7a88bf357f3261b170 'data/input/unbalanced_metabolites.csv', 'data/generated/bigg_proteins.faa', - 'data/input/equilibrator_compounds.tsv.gz', + 'data/input/mnx_compounds.tsv', 'data/input/refseq_release_201.tsv.gz', - 'data/generated/bigg_gibbs.csv', +# 'data/generated/bigg_gibbs.csv', # deleted c897f41d7d03c27ca12ecd9ee97337355338c378 'data/generated/bigg_gprs.csv.gz', 'data/generated/model_specific_data.csv.gz', - 'data/generated/universe_draft.xml.gz', + 'data/generated/bigg_universe.xml.gz', 'data/generated/universe_bacteria.xml.gz', 'data/generated/universe_grampos.xml.gz', 'data/generated/universe_gramneg.xml.gz', @@ -70,7 +70,15 @@ 'data/benchmark/results/essentiality.tsv', ] } - +missing_files = [] +for path in included_files["carveme"]: + fullpath = os.path.join("carveme", path) + if not os.path.exists(fullpath): + missing_files.append(fullpath) +if missing_files: + print("files required for install are not found:\n") + print("\n".join(missing_files)) + raise ValueError("missing files; exiting") setup( name='carveme', From dfc24dc9d55319025971b95d2b4e0c54cf327fa1 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 5 Dec 2024 09:31:42 -0500 Subject: [PATCH 5/8] add config check for missing files --- carveme/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/carveme/__init__.py 
b/carveme/__init__.py index 6a12535..5e689b2 100755 --- a/carveme/__init__.py +++ b/carveme/__init__.py @@ -10,8 +10,12 @@ config = ConfigParser() config.read(project_dir + 'config.cfg') +for k,v in config.items(): + if os.pathsep in v: + if not os.path.exists(v): + raise ValueError(f'file {v} not found') #set_default_solver(config.get('solver', 'default_solver')) #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol') #default_parameters[Parameter.OPTIMALITY_TOL] = config.getfloat('solver', 'opt_tol') -#default_parameters[Parameter.INT_FEASIBILITY_TOL] = config.getfloat('solver', 'int_feas_tol') \ No newline at end of file +#default_parameters[Parameter.INT_FEASIBILITY_TOL] = config.getfloat('solver', 'int_feas_tol') From d1d4e7c0b40c8a412df64ced4fe58123ff89ce86 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 5 Dec 2024 13:09:21 -0500 Subject: [PATCH 6/8] move config check to setup.py from __init__ --- carveme/__init__.py | 5 ----- carveme/config.cfg | 3 --- setup.py | 13 ++++++++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/carveme/__init__.py b/carveme/__init__.py index 5e689b2..663d16f 100755 --- a/carveme/__init__.py +++ b/carveme/__init__.py @@ -9,11 +9,6 @@ project_dir = os.path.abspath(os.path.dirname(__file__)) + '/' config = ConfigParser() -config.read(project_dir + 'config.cfg') -for k,v in config.items(): - if os.pathsep in v: - if not os.path.exists(v): - raise ValueError(f'file {v} not found') #set_default_solver(config.get('solver', 'default_solver')) #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol') diff --git a/carveme/config.cfg b/carveme/config.cfg index b00cca0..079b468 100644 --- a/carveme/config.cfg +++ b/carveme/config.cfg @@ -1,7 +1,6 @@ [input] biomass_library = data/input/biomass_db.tsv media_library = data/input/media_db.tsv -metabolomics = data/input/metabolomics_park2016.csv refseq = data/input/refseq_release_201.tsv.gz mnx_compounds = 
data/input/mnx_compounds.tsv bigg_models = data/input/bigg_models.tsv @@ -14,7 +13,6 @@ bigg_gprs = data/generated/bigg_gprs.csv.gz model_specific_data = data/generated/model_specific_data.csv.gz gene_annotations = data/generated/gene_annotations.tsv.gz bigg_universe = data/generated/bigg_universe.xml.gz -bigg_annotated = data/generated/bigg_annotated.xml.gz default_universe = data/generated/universe_bacteria.xml.gz fasta_file = data/generated/bigg_proteins.faa diamond_db = data/generated/bigg_proteins.dmnd @@ -35,4 +33,3 @@ default_flavor = bigg [gapfill] max_uptake = 100 - diff --git a/setup.py b/setup.py index 0c70e7f..56dddda 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- """The setup script.""" - +import os +from configparser import ConfigParser from setuptools import setup, find_packages with open('README.rst') as readme_file: @@ -80,6 +81,16 @@ print("\n".join(missing_files)) raise ValueError("missing files; exiting") +config = ConfigParser() +project_dir = "carveme" +config.read(os.path.join(project_dir, 'config.cfg')) +for chunk in ["input", "generated"]: + for k,v in config[chunk].items(): + vpath = os.path.join(project_dir, v) + if not os.path.exists(vpath) and k != "diamond_db": + raise ValueError(f'file {vpath} not found') + + setup( name='carveme', version='1.6.2', From d9e19a350fe5cd556bbb12f0bb25408cb0587f4a Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 5 Dec 2024 15:09:06 -0500 Subject: [PATCH 7/8] fix config syntax --- carveme/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/carveme/__init__.py b/carveme/__init__.py index 663d16f..bb21ac4 100755 --- a/carveme/__init__.py +++ b/carveme/__init__.py @@ -9,6 +9,7 @@ project_dir = os.path.abspath(os.path.dirname(__file__)) + '/' config = ConfigParser() +config.read(project_dir + 'config.cfg') #set_default_solver(config.get('solver', 'default_solver')) #default_parameters[Parameter.FEASIBILITY_TOL] = config.getfloat('solver', 'feas_tol') From 
f204628ba8ee9a061710d125b49e3d9d6a657985 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 5 Dec 2024 15:21:37 -0500 Subject: [PATCH 8/8] add better check for config files missing in included_files --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 56dddda..283ffcf 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ 'data/generated/bigg_proteins.faa', 'data/input/mnx_compounds.tsv', 'data/input/refseq_release_201.tsv.gz', + 'data/generated/gene_annotations.tsv.gz', # 'data/generated/bigg_gibbs.csv', # deleted c897f41d7d03c27ca12ecd9ee97337355338c378 'data/generated/bigg_gprs.csv.gz', 'data/generated/model_specific_data.csv.gz', @@ -84,11 +85,18 @@ config = ConfigParser() project_dir = "carveme" config.read(os.path.join(project_dir, 'config.cfg')) +config_files = [] for chunk in ["input", "generated"]: for k,v in config[chunk].items(): vpath = os.path.join(project_dir, v) - if not os.path.exists(vpath) and k != "diamond_db": + if k in ["folder", "diamond_db"]: continue + if not os.path.exists(vpath): raise ValueError(f'file {vpath} not found') + elif v not in included_files["carveme"]: + raise ValueError(f'config file {vpath} not included in setup.py') + else: + config_files.append(v) + setup(