Skip to content

Commit

Permalink
Preprint revisions (#55)
Browse files Browse the repository at this point in the history
* Fix read length histogram bin label
* Update base modification statistics verified with modkit results
  • Loading branch information
jonperdomo authored Aug 5, 2024
1 parent 612d5cc commit e9d8d41
Show file tree
Hide file tree
Showing 8 changed files with 343 additions and 211 deletions.
3 changes: 1 addition & 2 deletions include/bam_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ class BAM_Module{

int run(Input_Para& input_params, Output_BAM& final_output);
int calculateStatistics(Input_Para& input_params, Output_BAM& final_output);
static void batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex);
static void batchStatistics(HTSReader& reader, int batch_size, std::unordered_set<std::string> read_ids, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex, double base_mod_threshold);

// RRMS
// Read the RRMS CSV file and store the read IDs (accepted or rejected)
std::unordered_set<std::string> readRRMSFile(std::string rrms_csv_file, bool accepted_reads);
};
Expand Down
4 changes: 2 additions & 2 deletions include/hts_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ class HTSReader {
int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution);

// Read the next batch of records from the BAM file
int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids);
int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold);

// Return if the reader has finished reading the BAM file
bool hasNextRecord();

// Return the number of records in the BAM file using the BAM index
int64_t getNumRecords(const std::string &bam_file_name);
int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, double base_mod_threshold);

std::map<int, int> getQueryToRefMap(bam1_t *record);

Expand Down
12 changes: 9 additions & 3 deletions include/output_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,21 @@ class Output_BAM : public Output_FQ
uint64_t modified_base_count = ZeroDefault; // Total number of modified bases mapped to the reference genome
uint64_t modified_base_count_forward = ZeroDefault; // Total number of modified bases in the genome on the forward strand
uint64_t modified_base_count_reverse = ZeroDefault; // Total number of modified bases in the genome on the reverse strand
// uint64_t c_modified_base_count = ZeroDefault; // Total C modified bases
uint64_t cpg_modified_base_count = ZeroDefault; // Total C modified bases in CpG sites (combined forward and reverse)
uint64_t cpg_modified_base_count_forward = ZeroDefault; // Total C modified bases in CpG sites on the forward strand
uint64_t cpg_modified_base_count_reverse = ZeroDefault; // Total C modified bases in CpG sites on the reverse strand
uint64_t cpg_genome_count = ZeroDefault; // Total number of CpG sites in the genome
double percent_modified_cpg = ZeroDefault; // Percentage of CpG sites with modified bases (forward and reverse)

// Preprint revisions: Remove all counts with unique positions in the
// reference genome, and only report raw counts
uint64_t sample_modified_base_count = ZeroDefault; // Total number of modified bases passing the threshold
uint64_t sample_modified_base_count_forward = ZeroDefault; // Total number of modified bases passing the threshold on the forward strand
uint64_t sample_modified_base_count_reverse = ZeroDefault; // Total number of modified bases passing the threshold on the reverse strand
uint64_t sample_cpg_forward_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique)
uint64_t sample_cpg_reverse_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique)
std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions; // chr -> vector of (position, strand) for modified bases passing the threshold

// Test counts
uint64_t test_count = ZeroDefault; // Test count
uint64_t test_count2 = ZeroDefault; // Test count 2
Expand All @@ -208,9 +216,7 @@ class Output_BAM : public Output_FQ
// Signal data section
int read_count = ZeroDefault;
int base_count = ZeroDefault;
// std::vector<Base_Move_Table> read_move_table;
std::unordered_map<std::string, Base_Move_Table> read_move_table;
// std::map<std::string, Base_Move_Table> read_move_table = {};

// POD5 signal-level information is stored in a map of read names to a map of
// reference positions to a tuple of (ts, ns, move table vector)
Expand Down
46 changes: 42 additions & 4 deletions src/bam_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
std::mutex output_mutex;
std::mutex cout_mutex;

// Get the modified base threshold if available
double base_mod_threshold = input_params.base_mod_threshold;

// Loop through the input files
int file_count = (int) input_params.num_input_files;
//std::cout << "Number of input files = " << file_count << std::endl;
Expand All @@ -79,7 +82,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_

// Get the number of records in the file using the BAM index
std::cout << "Getting number of records..." << std::endl;
int num_records = reader.getNumRecords(filepath);
int num_records = reader.getNumRecords(filepath, final_output, base_mod_threshold);
std::cout << "Number of records = " << num_records << std::endl;

// Exit if there are no records
Expand Down Expand Up @@ -117,8 +120,11 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
std::vector<std::thread> thread_vector;
for (int thread_index=0; thread_index<thread_count; thread_index++){

// Copy the input read IDs to a new vector
std::unordered_set<std::string> rrms_read_ids_copy = input_params.rrms_read_ids;

// Create a thread
std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, std::ref(input_params),std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex));
std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, rrms_read_ids_copy,std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex), base_mod_threshold);

// Add the thread to the vector
thread_vector.push_back(std::move(t));
Expand Down Expand Up @@ -149,6 +155,31 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
ref_query.setFilepath(input_params.ref_genome);
std::cout << "Reference genome read." << std::endl;

// Preprint revision: Loop through the base modifications and find
// the CpG modifications
for (auto const &it : final_output.sample_c_modified_positions)
{
std::string chr = it.first;
std::vector<std::pair<int32_t, int>> cpg_mods = it.second;

// Determine if it resides in a CpG site
for (auto const &it2 : cpg_mods)
{
int64_t ref_pos = (int64_t) it2.first;
int strand = it2.second;

// Update the strand-specific CpG modified base count
if (strand == 0 && ref_query.getBase(chr, ref_pos) == "C" && ref_query.getBase(chr, ref_pos + 1) == "G")
{
final_output.sample_cpg_forward_count++;
}
else if (strand == 1 && ref_query.getBase(chr, ref_pos) == "G" && ref_query.getBase(chr, ref_pos - 1) == "C")
{
final_output.sample_cpg_reverse_count++;
}
}
}

// Loop through the base modifications and find the CpG
// modifications
double base_mod_threshold = input_params.base_mod_threshold;
Expand Down Expand Up @@ -267,11 +298,11 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
return exit_code;
}

void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex)
void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unordered_set<std::string> read_ids, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex, double base_mod_threshold)
{
// Read the next N records
Output_BAM record_output;
reader.readNextRecords(batch_size, record_output, bam_mutex, input_params.rrms_read_ids);
reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold);

// Update the final output
output_mutex.lock();
Expand Down Expand Up @@ -340,6 +371,13 @@ std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_fi
// Store the read ID if the decision matches the pattern
if (decision == pattern){
rrms_read_ids.insert(read_id);

std::string test_id = "65d8befa-eec0-4496-bf2b-aa1a84e6dc5e";
if (read_id == test_id){
std::cout << "[TEST 1] Found test ID: " << test_id << std::endl;
}

// std::cout << read_id << std::endl;
}
}

Expand Down
Loading

0 comments on commit e9d8d41

Please sign in to comment.