diff --git a/src/cli.rs b/src/cli.rs index c61b534..5d06be8 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -831,6 +831,12 @@ pub fn build_cli() -> Command { .value_parser(clap::value_parser!(usize)) .default_value("200000"), ) + .arg( + Arg::new("min-contig-count") + .long("min-contig-count") + .value_parser(clap::value_parser!(usize)) + .default_value("10"), + ) .arg( Arg::new("n-neighbours") .long("n-neighbours") @@ -1241,6 +1247,12 @@ pub fn build_cli() -> Command { .value_parser(clap::value_parser!(usize)) .default_value("200000"), ) + .arg( + Arg::new("min-contig-count") + .long("min-contig-count") + .value_parser(clap::value_parser!(usize)) + .default_value("10"), + ) .arg( Arg::new("n-neighbours") .long("n-neighbours") diff --git a/src/recover/recover_engine.rs b/src/recover/recover_engine.rs index 5b13843..c3058c0 100644 --- a/src/recover/recover_engine.rs +++ b/src/recover/recover_engine.rs @@ -65,6 +65,7 @@ struct RecoverEngine { n_contigs: usize, min_bin_size: usize, min_contig_size: usize, + min_contig_count: usize, filtered_contigs: HashSet, max_retries: usize } @@ -127,6 +128,7 @@ impl RecoverEngine { let ef_construction = m.get_one::("ef-construction").unwrap().clone(); let max_layers = m.get_one::("max-layers").unwrap().clone(); let min_bin_size = m.get_one::("min-bin-size").unwrap().clone(); + let min_contig_count = m.get_one::("min-contig-count").unwrap().clone(); let n_contigs = coverage_table.table.nrows(); let max_retries = m.get_one::("max-retries").unwrap().clone(); @@ -145,6 +147,7 @@ impl RecoverEngine { n_contigs, min_bin_size, min_contig_size, + min_contig_count, // filtered_contigs, filtered_contigs: HashSet::new(), max_retries @@ -257,6 +260,7 @@ impl RecoverEngine { n_neighbours: self.n_neighbours, min_bin_size: self.min_bin_size, min_contig_size: self.min_contig_size, + min_contig_count: self.min_contig_count, mags_to_refine: mag_paths, checkm_results: None, threads: rayon::current_num_threads(), diff --git a/src/refine/refinery.rs b/src/refine/refinery.rs index 75fdf1a..2a752a5 100644 --- a/src/refine/refinery.rs +++ b/src/refine/refinery.rs @@ -29,6 +29,7 @@ pub struct RefineEngine { pub(crate) mags_to_refine: Vec, pub(crate) min_contig_size: usize, pub(crate) min_bin_size: usize, + pub(crate) min_contig_count: usize, pub(crate) n_neighbours: usize, pub(crate) bin_unbinned: bool, pub(crate) max_retries: usize @@ -65,6 +66,7 @@ impl RefineEngine { let min_contig_size = *m.get_one::("min-contig-size").unwrap(); let min_bin_size = *m.get_one::("min-bin-size").unwrap(); + let min_contig_count = *m.get_one::("min-contig-count").unwrap(); let n_neighbours = *m.get_one::("n-neighbours").unwrap(); let max_retries = *m.get_one::("max-retries").unwrap(); @@ -78,6 +80,7 @@ impl RefineEngine { mags_to_refine, min_contig_size, min_bin_size, + min_contig_count, n_neighbours, bin_unbinned: false, max_retries @@ -100,11 +103,11 @@ impl RefineEngine { // check if we have unbinned in self.mags_to_refine // if so move them straigh to unchanged - self.mags_to_refine.iter().for_each(|genome| { - if genome.contains(removal_string) { - self.copy_bin_to_output(genome, UNCHANGED_LOC, UNCHANGED_BIN_TAG).unwrap(); + for genome in self.mags_to_refine.iter() { + if genome.contains(removal_string) || !self.passes_requirements(genome)? { + self.copy_bin_to_output(genome, UNCHANGED_LOC, UNCHANGED_BIN_TAG)?; } - }); + } self.mags_to_refine.retain(|genome| !genome.contains(removal_string)); @@ -358,6 +361,31 @@ impl RefineEngine { Ok(output_json) } + fn passes_requirements(&self, bin_path: &str) -> Result { + let mut passes = true; + let (contig_count, genome_size) = self.get_count_and_size_above_size(bin_path, self.min_contig_size)?; + if contig_count < self.min_contig_count || genome_size < self.min_bin_size { + passes = false; + } + + return Ok(passes); + } + + fn get_count_and_size_above_size(&self, bin_path: &str, min_size: usize) -> Result<(usize, usize)> { + let mut reader = parse_fastx_file(path::Path::new(&bin_path))?; + let mut contig_count = 0; + let mut genome_size = 0; + while let Some(seq) = reader.next() { + let seq = seq?; + if seq.seq().len() >= min_size { + contig_count += 1; + genome_size += seq.seq().len(); + } + } + + Ok((contig_count, genome_size)) + } + fn get_original_contig_count(&self, bin_path: &str) -> Result { let mut reader = parse_fastx_file(path::Path::new(&bin_path))?; let mut contig_count = 0;