Skip to content

Commit

Permalink
Merge pull request #55 from rhysnewell/iss-54
Browse files Browse the repository at this point in the history
fix: add min contig count checker to refine
  • Loading branch information
rhysnewell authored Jul 17, 2024
2 parents e5d4a07 + 2caac06 commit 96ada78
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 4 deletions.
12 changes: 12 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,12 @@ pub fn build_cli() -> Command {
.value_parser(clap::value_parser!(usize))
.default_value("200000"),
)
.arg(
Arg::new("min-contig-count")
.long("min-contig-count")
.value_parser(clap::value_parser!(usize))
.default_value("10"),
)
.arg(
Arg::new("n-neighbours")
.long("n-neighbours")
Expand Down Expand Up @@ -1241,6 +1247,12 @@ pub fn build_cli() -> Command {
.value_parser(clap::value_parser!(usize))
.default_value("200000"),
)
.arg(
Arg::new("min-contig-count")
.long("min-contig-count")
.value_parser(clap::value_parser!(usize))
.default_value("10"),
)
.arg(
Arg::new("n-neighbours")
.long("n-neighbours")
Expand Down
4 changes: 4 additions & 0 deletions src/recover/recover_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ struct RecoverEngine {
n_contigs: usize,
min_bin_size: usize,
min_contig_size: usize,
min_contig_count: usize,
filtered_contigs: HashSet<String>,
max_retries: usize
}
Expand Down Expand Up @@ -127,6 +128,7 @@ impl RecoverEngine {
let ef_construction = m.get_one::<usize>("ef-construction").unwrap().clone();
let max_layers = m.get_one::<usize>("max-layers").unwrap().clone();
let min_bin_size = m.get_one::<usize>("min-bin-size").unwrap().clone();
let min_contig_count = m.get_one::<usize>("min-contig-count").unwrap().clone();

let n_contigs = coverage_table.table.nrows();
let max_retries = m.get_one::<usize>("max-retries").unwrap().clone();
Expand All @@ -145,6 +147,7 @@ impl RecoverEngine {
n_contigs,
min_bin_size,
min_contig_size,
min_contig_count,
// filtered_contigs,
filtered_contigs: HashSet::new(),
max_retries
Expand Down Expand Up @@ -257,6 +260,7 @@ impl RecoverEngine {
n_neighbours: self.n_neighbours,
min_bin_size: self.min_bin_size,
min_contig_size: self.min_contig_size,
min_contig_count: self.min_contig_count,
mags_to_refine: mag_paths,
checkm_results: None,
threads: rayon::current_num_threads(),
Expand Down
36 changes: 32 additions & 4 deletions src/refine/refinery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ pub struct RefineEngine {
pub(crate) mags_to_refine: Vec<String>,
pub(crate) min_contig_size: usize,
pub(crate) min_bin_size: usize,
pub(crate) min_contig_count: usize,
pub(crate) n_neighbours: usize,
pub(crate) bin_unbinned: bool,
pub(crate) max_retries: usize
Expand Down Expand Up @@ -65,6 +66,7 @@ impl RefineEngine {

let min_contig_size = *m.get_one::<usize>("min-contig-size").unwrap();
let min_bin_size = *m.get_one::<usize>("min-bin-size").unwrap();
let min_contig_count = *m.get_one::<usize>("min-contig-count").unwrap();
let n_neighbours = *m.get_one::<usize>("n-neighbours").unwrap();
let max_retries = *m.get_one::<usize>("max-retries").unwrap();

Expand All @@ -78,6 +80,7 @@ impl RefineEngine {
mags_to_refine,
min_contig_size,
min_bin_size,
min_contig_count,
n_neighbours,
bin_unbinned: false,
max_retries
Expand All @@ -100,11 +103,11 @@ impl RefineEngine {

// check if we have unbinned in self.mags_to_refine
// if so move them straight to unchanged
self.mags_to_refine.iter().for_each(|genome| {
if genome.contains(removal_string) {
self.copy_bin_to_output(genome, UNCHANGED_LOC, UNCHANGED_BIN_TAG).unwrap();
for genome in self.mags_to_refine.iter() {
if genome.contains(removal_string) || !self.passes_requirements(genome)? {
self.copy_bin_to_output(genome, UNCHANGED_LOC, UNCHANGED_BIN_TAG)?;
}
});
}

self.mags_to_refine.retain(|genome| !genome.contains(removal_string));

Expand Down Expand Up @@ -358,6 +361,31 @@ impl RefineEngine {
Ok(output_json)
}

/// Reports whether the bin at `bin_path` meets the minimum contig-count and
/// size thresholds configured on this engine.
///
/// Only contigs of at least `self.min_contig_size` bases are tallied. The bin
/// passes when that tally reaches `self.min_contig_count` contigs *and* the
/// combined length of those contigs reaches `self.min_bin_size`.
///
/// # Errors
/// Propagates any error returned by `get_count_and_size_above_size`.
fn passes_requirements(&self, bin_path: &str) -> Result<bool> {
    let (n_contigs, total_bases) =
        self.get_count_and_size_above_size(bin_path, self.min_contig_size)?;

    let enough_contigs = n_contigs >= self.min_contig_count;
    let enough_bases = total_bases >= self.min_bin_size;

    Ok(enough_contigs && enough_bases)
}

/// Tallies the records in the file at `bin_path` whose sequence is at least
/// `min_size` long.
///
/// Returns `(count, total_length)`: the number of qualifying records and the
/// sum of their sequence lengths. Records shorter than `min_size` contribute
/// to neither value.
///
/// # Errors
/// Propagates errors from `parse_fastx_file` and from reading individual
/// records.
fn get_count_and_size_above_size(&self, bin_path: &str, min_size: usize) -> Result<(usize, usize)> {
    let mut records = parse_fastx_file(path::Path::new(bin_path))?;
    let (mut n_kept, mut bases_kept) = (0usize, 0usize);

    while let Some(record) = records.next() {
        // Measure the sequence once and reuse the length below.
        let length = record?.seq().len();
        if length < min_size {
            continue;
        }
        n_kept += 1;
        bases_kept += length;
    }

    Ok((n_kept, bases_kept))
}

fn get_original_contig_count(&self, bin_path: &str) -> Result<usize> {
let mut reader = parse_fastx_file(path::Path::new(&bin_path))?;
let mut contig_count = 0;
Expand Down

0 comments on commit 96ada78

Please sign in to comment.