diff --git a/include/bam_module.h b/include/bam_module.h index b583f4d..c6a7d02 100644 --- a/include/bam_module.h +++ b/include/bam_module.h @@ -18,9 +18,8 @@ class BAM_Module{ int run(Input_Para& input_params, Output_BAM& final_output); int calculateStatistics(Input_Para& input_params, Output_BAM& final_output); - static void batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex); + static void batchStatistics(HTSReader& reader, int batch_size, std::unordered_set read_ids, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex, double base_mod_threshold); - // RRMS // Read the RRMS CSV file and store the read IDs (accepted or rejected) std::unordered_set readRRMSFile(std::string rrms_csv_file, bool accepted_reads); }; diff --git a/include/hts_reader.h b/include/hts_reader.h index 4704d28..505eeea 100644 --- a/include/hts_reader.h +++ b/include/hts_reader.h @@ -41,13 +41,13 @@ class HTSReader { int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution); // Read the next batch of records from the BAM file - int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set& read_ids); + int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set& read_ids, double base_mod_threshold); // Return if the reader has finished reading the BAM file bool hasNextRecord(); // Return the number of records in the BAM file using the BAM index - int64_t getNumRecords(const std::string &bam_file_name); + int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, double base_mod_threshold); std::map getQueryToRefMap(bam1_t *record); diff --git a/include/output_data.h b/include/output_data.h index 7e15f6d..e8b9b2c 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -190,13 +190,21 @@ class Output_BAM : public Output_FQ uint64_t modified_base_count = ZeroDefault; // Total number of modified bases mapped to the reference genome uint64_t modified_base_count_forward = ZeroDefault; // Total number of modified bases in the genome on the forward strand uint64_t modified_base_count_reverse = ZeroDefault; // Total number of modified bases in the genome on the reverse strand - // uint64_t c_modified_base_count = ZeroDefault; // Total C modified bases uint64_t cpg_modified_base_count = ZeroDefault; // Total C modified bases in CpG sites (combined forward and reverse) uint64_t cpg_modified_base_count_forward = ZeroDefault; // Total C modified bases in CpG sites on the forward strand uint64_t cpg_modified_base_count_reverse = ZeroDefault; // Total C modified bases in CpG sites on the reverse strand uint64_t cpg_genome_count = ZeroDefault; // Total number of CpG sites in the genome double percent_modified_cpg = ZeroDefault; // Percentage of CpG sites with modified bases (forward and reverse) + // Preprint revisions: Remove all counts with unique positions in the + // reference genome, and only report raw counts + uint64_t sample_modified_base_count = ZeroDefault; // Total number of modified bases passing the threshold + uint64_t sample_modified_base_count_forward = ZeroDefault; // Total number of modified bases passing the threshold on the forward strand + uint64_t sample_modified_base_count_reverse = ZeroDefault; // Total number of modified bases passing the threshold on the reverse strand + uint64_t sample_cpg_forward_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique) + uint64_t sample_cpg_reverse_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique) + std::map>> sample_c_modified_positions; // chr -> vector of (position, strand) for modified bases passing the threshold + // Test counts uint64_t test_count = ZeroDefault; // Test count uint64_t test_count2 = ZeroDefault; // Test count 2 @@ -208,9 +216,7 @@ class Output_BAM : public Output_FQ // Signal data section int read_count = ZeroDefault; int base_count = ZeroDefault; - // std::vector read_move_table; std::unordered_map read_move_table; - // std::map read_move_table = {}; // POD5 signal-level information is stored in a map of read names to a map of // reference positions to a tuple of (ts, ns, move table vector) diff --git a/src/bam_module.cpp b/src/bam_module.cpp index 810c9d6..8a85bf5 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -60,6 +60,9 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ std::mutex output_mutex; std::mutex cout_mutex; + // Get the modified base threshold if available + double base_mod_threshold = input_params.base_mod_threshold; + // Loop through the input files int file_count = (int) input_params.num_input_files; //std::cout << "Number of input files = " << file_count << std::endl; @@ -79,7 +82,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ // Get the number of records in the file using the BAM index std::cout << "Getting number of records..." << std::endl; - int num_records = reader.getNumRecords(filepath); + int num_records = reader.getNumRecords(filepath, final_output, base_mod_threshold); std::cout << "Number of records = " << num_records << std::endl; // Exit if there are no records @@ -117,8 +120,11 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ std::vector thread_vector; for (int thread_index=0; thread_index rrms_read_ids_copy = input_params.rrms_read_ids; + // Create a thread - std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, std::ref(input_params),std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex)); + std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, rrms_read_ids_copy,std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex), base_mod_threshold); // Add the thread to the vector thread_vector.push_back(std::move(t)); @@ -149,6 +155,31 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ ref_query.setFilepath(input_params.ref_genome); std::cout << "Reference genome read." << std::endl; + // Preprint revision: Loop through the base modifications and find + // the CpG modifications + for (auto const &it : final_output.sample_c_modified_positions) + { + std::string chr = it.first; + std::vector> cpg_mods = it.second; + + // Determine if it resides in a CpG site + for (auto const &it2 : cpg_mods) + { + int64_t ref_pos = (int64_t) it2.first; + int strand = it2.second; + + // Update the strand-specific CpG modified base count + if (strand == 0 && ref_query.getBase(chr, ref_pos) == "C" && ref_query.getBase(chr, ref_pos + 1) == "G") + { + final_output.sample_cpg_forward_count++; + } + else if (strand == 1 && ref_query.getBase(chr, ref_pos) == "G" && ref_query.getBase(chr, ref_pos - 1) == "C") + { + final_output.sample_cpg_reverse_count++; + } + } + } + // Loop through the base modifications and find the CpG // modifications double base_mod_threshold = input_params.base_mod_threshold; @@ -267,11 +298,11 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ return exit_code; } -void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex) +void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unordered_set read_ids, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex, double base_mod_threshold) { // Read the next N records Output_BAM record_output; - reader.readNextRecords(batch_size, record_output, bam_mutex, input_params.rrms_read_ids); + reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold); // Update the final output output_mutex.lock(); @@ -340,6 +371,13 @@ std::unordered_set BAM_Module::readRRMSFile(std::string rrms_csv_fi // Store the read ID if the decision matches the pattern if (decision == pattern){ rrms_read_ids.insert(read_id); + + std::string test_id = "65d8befa-eec0-4496-bf2b-aa1a84e6dc5e"; + if (read_id == test_id){ + std::cout << "[TEST 1] Found test ID: " << test_id << std::endl; + } + + // std::cout << read_id << std::endl; } } diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 6a29067..870086b 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -83,7 +83,7 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *bas } // Read the next batch of records from the BAM file and store QC in the output_data object -int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set& read_ids){ +int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set& read_ids, double base_mod_threshold){ int record_count = 0; int exit_code = 0; @@ -91,13 +91,22 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu bool read_ids_present = false; if (read_ids.size() > 0){ read_ids_present = true; + printMessage("Filtering reads by read ID"); + + printMessage("Number of read IDs: " + std::to_string(read_ids.size())); + printMessage("First read ID: " + *read_ids.begin()); + // Check if the first read ID has any newlines, carriage returns, tabs, + // or spaces + if (read_ids.begin()->find_first_of("\n\r\t ") != std::string::npos) { + printError("Read IDs cannot contain newlines, carriage returns, tabs, or spaces"); + return 1; + } } // Access the base quality histogram from the output_data object uint64_t *base_quality_distribution = output_data.seq_quality_info.base_quality_distribution; // Do QC on each record and store the results in the output_data object - bool first_pod5_tag = false; while ((record_count < batch_size) && (exit_code >= 0)) { // Create a record object bam1_t* record = bam_init1(); @@ -117,13 +126,18 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Get the read (query) name std::string query_name = bam_get_qname(record); - // Determine if this read should be skipped - if (read_ids_present){ + // Check if the read name has any newlines, carriage returns, tabs, or + // spaces + if (query_name.find_first_of("\n\r\t ") != std::string::npos) { + printError("BAM Read names cannot contain newlines, carriage returns, tabs, or spaces"); + return 1; + } + // Determine if this read should be skipped + if (read_ids_present) { // Determine if this read should be skipped if (read_ids.find(query_name) == read_ids.end()){ - // std::cout << "Skipping read " << query_name << std::endl; continue; // Skip this read } } @@ -141,29 +155,18 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // present if (!this->has_pod5_tags.test_and_set()) { printMessage("POD5 tags found (ts, ns, mv)"); - first_pod5_tag = true; } // Get the ts and ns tags int32_t ts = bam_aux2i(ts_tag); int32_t ns = bam_aux2i(ns_tag); - // if (first_pod5_tag) { - // printMessage("ts: " + std::to_string(ts) + ", ns: " + std::to_string(ns)); - // } // Get the move table (start at 1 to skip the tag type) - int max_print = 15; int32_t length = bam_auxB_len(mv_tag); std::vector move_table(length); std::vector> sequence_move_table; // Store the sequence move table with indices - // if (first_pod5_tag) { - // printMessage("Move table length: " + std::to_string(length)); - // } - - int base_signal_length = 0; // Iterate over the move table values - int prev_value = 0; int current_index = ts; std::vector signal_index_vector; int move_value = 0; @@ -188,112 +191,8 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu break; } output_data.addReadMoveTable(query_name, seq_str, signal_index_vector, ts, ns); - - // if (first_pod5_tag) { - // printMessage("Signal vector length: " - // + std::to_string(signal_index_vector.size()) + ", Sequence string length: " - // + std::to_string(seq_str.length())); - // // printMessage("Base signal length: " + std::to_string(base_signal_length) + ", Sequence string length: " + std::to_string(seq_str.length())); - - // // printMessage("Base vector length: " + std::to_string(sequence_move_table.size())); - // // printMessage("Test count: " + std::to_string(test_count)); - // // printMessage("Sequence string length: " + std::to_string(seq_str.length())); - // } } - // Follow here to get base modification tags: - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274 - hts_base_mod_state *state = hts_base_mod_state_alloc(); - std::map> query_base_modifications; - - // Parse the base modification tags if a primary alignment - read_mutex.lock(); - int ret = bam_parse_basemod(record, state); - read_mutex.unlock(); - if (ret >= 0 && !(record->core.flag & BAM_FSECONDARY) && !(record->core.flag & BAM_FSUPPLEMENTARY) && !(record->core.flag & BAM_FUNMAP)) { - - // Get the chromosome if alignments are present - bool alignments_present = true; - std::string chr; - if (record->core.tid < 0) { - alignments_present = false; - } else { - chr = this->header->target_name[record->core.tid]; - } - - // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative, - // but it always yields 0...) - int strand = (record->core.flag & BAM_FREVERSE) ? 1 : 0; - - // Iterate over the state object to get the base modification tags - // using bam_next_basemod - hts_base_mod mods[10]; - int n = 0; - int pos = 0; - std::vector query_pos; - while ((n=bam_next_basemod(record, state, mods, 10, &pos)) > 0) { - for (int i = 0; i < n; i++) { - // Update the prediction count - output_data.modified_prediction_count++; - - // Note: The modified base value can be a positive char (e.g. 'm', - // 'h') (DNA Mods DB) or negative integer (ChEBI ID): - // https://github.com/samtools/hts-specs/issues/741 - // DNA Mods: https://dnamod.hoffmanlab.org/ - // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839 - // Header line: - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215 - - // TODO: Look into htslib error with missing strand information - - // Determine the probability of the modification (-1 if - // unknown) - double probability = -1; - if (mods[i].qual != -1) { - probability = mods[i].qual / 256.0; - } - - // Add the modification to the query base modifications map - this->addModificationToQueryMap(query_base_modifications, pos, mods[i].modified_base, mods[i].canonical_base, probability, strand); - query_pos.push_back(pos); - } - } - - // Set the atomic flag and print a message if the base modification - // tags are present - if (query_pos.size() > 0 && !this->has_mm_ml_tags.test_and_set()) { - printMessage("Base modification data found (MM, ML tags)"); - } - - // If alignments are present, get the reference positions of the query positions - if (alignments_present && query_pos.size() > 0) { - // Get the query to reference position mapping - std::map query_to_ref_map = this->getQueryToRefMap(record); - std::vector ref_pos(query_pos.size(), -1); - - // Loop through the query and reference positions and add the - // reference positions to the output data - for (size_t i = 0; i < query_pos.size(); i++) { - // Get the reference position from the query to reference - // map - if (query_to_ref_map.find(query_pos[i]) != query_to_ref_map.end()) { - ref_pos[i] = query_to_ref_map[query_pos[i]]; - - // Add the modification to the output data - char mod_type = std::get<0>(query_base_modifications[query_pos[i]]); - char canonical_base = std::get<1>(query_base_modifications[query_pos[i]]); - double likelihood = std::get<2>(query_base_modifications[query_pos[i]]); - int strand = std::get<3>(query_base_modifications[query_pos[i]]); - output_data.add_modification(chr, ref_pos[i], mod_type, canonical_base, likelihood, strand); - } - } - } - } - - // Deallocate the state object - hts_base_mod_state_free(state); - // Determine if this is an unmapped read if (record->core.flag & BAM_FUNMAP) { Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info; @@ -450,7 +349,7 @@ bool HTSReader::hasNextRecord(){ } // Return the number of records in the BAM file using the BAM index -int64_t HTSReader::getNumRecords(const std::string & bam_filename){ +int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &final_output, double base_mod_threshold){ samFile* bam_file = sam_open(bam_filename.c_str(), "r"); bam_hdr_t* bam_header = sam_hdr_read(bam_file); bam1_t* bam_record = bam_init1(); @@ -458,6 +357,141 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename){ int64_t num_reads = 0; while (sam_read1(bam_file, bam_header, bam_record) >= 0) { num_reads++; + + // Base modification tag analysis (not yet ready for multi-threading) + // Follow here to get base modification tags: + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274 + hts_base_mod_state *state = hts_base_mod_state_alloc(); + std::map> query_base_modifications; + + // Preprint revisions: New data structure that does not require unique + // positions for each base modification + // chr -> vector of (position, strand) for C modified bases passing the threshold + std::vector> c_modified_positions; + + // Parse the base modification tags if a primary alignment + int ret = bam_parse_basemod(bam_record, state); + if (ret >= 0 && !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP)) { + + // Get the chromosome if alignments are present + bool alignments_present = true; + std::string chr; + std::map query_to_ref_map; + if (bam_record->core.tid < 0) { + alignments_present = false; + } else { + chr = bam_header->target_name[bam_record->core.tid]; + + // Get the query to reference position mapping + query_to_ref_map = this->getQueryToRefMap(bam_record); + } + + // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative, + // but it always yields 0...) + int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0; + + // Iterate over the state object to get the base modification tags + // using bam_next_basemod + hts_base_mod mods[10]; + int n = 0; + //int pos = 0; + int32_t pos = 0; + std::vector query_pos; + while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) { + for (int i = 0; i < n; i++) { + // Update the prediction count + final_output.modified_prediction_count++; + + // Note: The modified base value can be a positive char (e.g. 'm', + // 'h') (DNA Mods DB) or negative integer (ChEBI ID): + // https://github.com/samtools/hts-specs/issues/741 + // DNA Mods: https://dnamod.hoffmanlab.org/ + // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839 + // Header line: + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215 + + // TODO: Look into htslib error with missing strand information + + // Determine the probability of the modification (-1 if + // unknown) + double probability = -1; + if (mods[i].qual != -1) { + probability = mods[i].qual / 256.0; + + // If the probability is greater than the threshold, + // update the count + if (probability >= base_mod_threshold) { + final_output.sample_modified_base_count++; + + // Update the modified base count for the strand + if (strand == 0) { + final_output.sample_modified_base_count_forward++; + } else { + final_output.sample_modified_base_count_reverse++; + } + + // Preprint revisions: Store the modified positions + char canonical_base_char = std::toupper(mods[i].canonical_base); + char mod_type = mods[i].modified_base; + if (canonical_base_char == 'C' && mod_type == 'm') { + + // Convert the query position to reference position if available + if (alignments_present) { + if (query_to_ref_map.find(pos) != query_to_ref_map.end()) { + int32_t ref_pos = query_to_ref_map[pos]; + c_modified_positions.push_back(std::make_pair(ref_pos, strand)); + } + } + } + } + } + + // Add the modification to the query base modifications map + this->addModificationToQueryMap(query_base_modifications, pos, mods[i].modified_base, mods[i].canonical_base, probability, strand); + query_pos.push_back(pos); + } + } + + // Set the atomic flag and print a message if the base modification + // tags are present + if (query_pos.size() > 0 && !this->has_mm_ml_tags.test_and_set()) { + printMessage("Base modification data found (MM, ML tags)"); + } + + // If alignments are present, get the reference positions of the query positions + if (alignments_present && query_pos.size() > 0) { + std::vector ref_pos(query_pos.size(), -1); + + // Loop through the query and reference positions and add the + // reference positions to the output data + for (size_t i = 0; i < query_pos.size(); i++) { + // Get the reference position from the query to reference + // map + if (query_to_ref_map.find(query_pos[i]) != query_to_ref_map.end()) { + ref_pos[i] = query_to_ref_map[query_pos[i]]; + + // Add the modification to the output data + char mod_type = std::get<0>(query_base_modifications[query_pos[i]]); + char canonical_base = std::get<1>(query_base_modifications[query_pos[i]]); + double likelihood = std::get<2>(query_base_modifications[query_pos[i]]); + int strand = std::get<3>(query_base_modifications[query_pos[i]]); + final_output.add_modification(chr, ref_pos[i], mod_type, canonical_base, likelihood, strand); + } + } + } + // Preprint revisions: Append the modified positions to the output data + if (c_modified_positions.size() > 0) { + if (final_output.sample_c_modified_positions.find(chr) == final_output.sample_c_modified_positions.end()) { + final_output.sample_c_modified_positions[chr] = c_modified_positions; + } else { + final_output.sample_c_modified_positions[chr].insert(final_output.sample_c_modified_positions[chr].end(), c_modified_positions.begin(), c_modified_positions.end()); + } + } + } + + // Deallocate the state object + hts_base_mod_state_free(state); } bam_destroy1(bam_record); diff --git a/src/output_data.cpp b/src/output_data.cpp index 6d0ddb3..3423d66 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -405,6 +405,21 @@ void Output_BAM::add(Output_BAM &output_data) int end = it->second.getSequenceEnd(); this->addReadMoveTable(read_id, sequence_data_str, signal_index, start, end); } + + // Preprint revisions: Update base modification counts + this->sample_modified_base_count += output_data.sample_modified_base_count; + this->sample_modified_base_count_forward += output_data.sample_modified_base_count_forward; + this->sample_modified_base_count_reverse += output_data.sample_modified_base_count_reverse; + + for ( auto it = output_data.sample_c_modified_positions.begin(); it != output_data.sample_c_modified_positions.end(); ++it ){ + std::string chr = it->first; + std::vector> positions = it->second; + if (this->sample_c_modified_positions.find(chr) == this->sample_c_modified_positions.end()){ + this->sample_c_modified_positions[chr] = positions; + } else { + this->sample_c_modified_positions[chr].insert(this->sample_c_modified_positions[chr].end(), positions.begin(), positions.end()); + } + } } void Output_BAM::global_sum(){ diff --git a/src/plot_utils.py b/src/plot_utils.py index f29d6e8..19ff5eb 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -232,63 +232,132 @@ def plot_basic_info(output_data, file_type): # Plot the read length histograms def read_lengths_histogram(data, font_size): + linear_bin_count = 10 + log_bin_count = 10 + annotation_size = 10 # Annotation font size mean, median, n50 = data.mean_read_length, data.median_read_length, data.n50_read_length # Read the read lengths array in float64 format read_lengths = np.array(data.read_lengths, dtype=np.float64) - # Calculate a histogram of read lengths - hist, edges = np.histogram(read_lengths, bins=10) + # If there are no read lengths, throw an error + if len(read_lengths) == 0: + raise ValueError("No read lengths found") + + # Calculate a histogram of read lengths, but don't center the bins + # edges = np.linspace(1, np.max(read_lengths), num=bin_count + 1) + edges = np.linspace(np.min(read_lengths), np.max(read_lengths), num=linear_bin_count + 1) + hist, _ = np.histogram(read_lengths, bins=edges) # Create a figure with two subplots + # fig = make_subplots( + # rows=2, cols=1, + # subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.5) fig = make_subplots( - rows=2, cols=1, - subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.3) - - customdata = np.dstack((edges[:-1], edges[1:], hist))[0, :, :] - fig.add_trace(go.Bar(x=edges, y=hist, customdata=customdata, - hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp
Counts:%{customdata[' - '2]:.0f}', - marker_color='#36a5c7'), row=1, col=1) + rows=1, cols=2, + subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.0) + linear_col=1 + log_col=2 + + linear_bindata = np.dstack((edges[:-1], edges[1:], hist))[0, :, :] + # linear_bin_centers = np.round((linear_bindata[:, 0] + linear_bindata[:, 1]) / 2, 0) + fig.add_trace(go.Bar(x=edges, y=hist, customdata=linear_bindata, + hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp
Counts:%{customdata[2]:.0f}', + marker_color='#36a5c7'), row=1, col=linear_col) fig.add_vline(mean, line_width=1, line_dash="dash", annotation_text='Mean', annotation_bgcolor="black", - annotation_textangle=90, row=1, col=1) + annotation_textangle=90, row=1, col=linear_col) fig.add_vline(median, line_width=1, line_dash="dash", annotation_text='Median', annotation_bgcolor="blue", - annotation_textangle=90, row=1, col=1) + annotation_textangle=90, row=1, col=linear_col) fig.add_vline(n50, line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green", - annotation_textangle=90, row=1, col=1) + annotation_textangle=90, row=1, col=linear_col) # Log histogram # Get the log10 histogram of read lengths read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0)) - log_hist, log_edges = np.histogram(read_lengths_log, bins=len(edges)) + # log_hist, log_edges = np.histogram(read_lengths_log, bins=bin_count) + log_edges = np.linspace(0, np.max(read_lengths_log), num=log_bin_count + 1) + log_hist, _ = np.histogram(read_lengths_log, bins=log_edges) xd = log_edges - customdata = np.dstack((np.power(10, log_edges)[:-1], np.power(10, log_edges)[1:], log_hist))[0, :, :] + log_bindata = np.dstack((np.power(10, log_edges)[:-1], np.power(10, log_edges)[1:], log_hist))[0, :, :] + # log_bin_centers = np.round((log_bindata[:, 0] + log_bindata[:, 1]) / 2, 0) yd = log_hist - fig.add_trace(go.Bar(x=xd, y=yd, customdata=customdata, + fig.add_trace(go.Bar(x=xd, y=yd, customdata=log_bindata, hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp
Counts:%{customdata[2]:.0f}', - marker_color='#36a5c7'), row=2, col=1) + marker_color='#36a5c7'), row=1, col=log_col) fig.add_vline(np.log10(mean), line_width=1, line_dash="dash", annotation_text='Mean', annotation_bgcolor="black", - annotation_textangle=90, row=2, col=1) + annotation_textangle=90, row=1, col=log_col) fig.add_vline(np.log10(median), line_width=1, line_dash="dash", annotation_text='Median', annotation_bgcolor="blue", - annotation_textangle=90, row=2, col=1) + annotation_textangle=90, row=1, col=log_col) fig.add_vline(np.log10(n50), line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green", - annotation_textangle=90, row=2, col=1) + annotation_textangle=90, row=1, col=log_col) fig.update_annotations(font=dict(color="white")) # Set tick value range for the log scale - tick_vals = list(range(0, 5)) - fig.update_xaxes( - range=[0, 5], - tickmode='array', - tickvals=tick_vals, - ticktext=['{:,}'.format(10 ** x) for x in tick_vals], - ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) - - fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=1) + # Use the bin edge centers as the tick values + # tick_vals = (log_edges[:-1] + log_edges[1:]) / 2 + # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals] + tick_vals = log_edges + # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals] + + # Format the tick labels to be in kilobases (kb) if the value is greater than + # 1000, and in bases (b) otherwise + # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals] + # tick_labels = ['{:,}kb'.format(int(x) for x in log_bin_centers) if x > + # 1000 else '{:,}b'.format(int(x)) for x in log_bin_centers] + tick_labels = [] + for i in range(len(log_bindata)): + # Format the tick labels to be in kilobases (kb) if the value is greater + # than 1000, in megabases (Mb) if the value is greater than 1,000,000, + # and in bases (b) if less than 1000 + left_val = log_bindata[i][0] + left_val_str = '{:,}Mb'.format(int(left_val / 1000000)) if left_val > 1000000 else '{:,}kb'.format(int(left_val / 1000)) if left_val > 1000 else '{:,}bp'.format(int(left_val)) + + right_val = log_bindata[i][1] + right_val_str = '{:,}Mb'.format(int(right_val / 1000000)) if right_val > 1000000 else '{:,}kb'.format(int(right_val / 1000)) if right_val > 1000 else '{:,}bp'.format(int(right_val)) + + tick_labels.append('{}-{}'.format(left_val_str, right_val_str)) + + fig.update_xaxes(ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=1, col=log_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45) + # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) + # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1, tickvals=tick_vals) + # tick_vals = list(range(0, 5)) + # fig.update_xaxes( + # range=[0, np.max(log_edges)], + # tickmode='array', + # tickvals=tick_vals, + # ticktext=['{:,}'.format(10 ** x) for x in tick_vals], + # ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) + + # Set the tick value range for the linear scale + # tick_vals = (edges[:-1] + edges[1:]) / 2 + # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals] + tick_vals = edges + # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals] + + # Format the tick labels to be the range of the bin centers + tick_labels = [] + for i in range(len(linear_bindata)): + # Format the tick labels to be in kilobases (kb) if the value is greater + # than 1000, in megabases (Mb) if the value is greater than 1,000,000, + # and in bases (b) if less than 1000 + left_val = linear_bindata[i][0] + left_val_str = '{:,}Mb'.format(int(left_val / 1000000)) if left_val > 1000000 else '{:,}kb'.format(int(left_val / 1000)) if left_val > 1000 else '{:,}bp'.format(int(left_val)) + + right_val = linear_bindata[i][1] + right_val_str = '{:,}Mb'.format(int(right_val / 1000000)) if right_val > 1000000 else '{:,}kb'.format(int(right_val / 1000)) if right_val > 1000 else '{:,}bp'.format(int(right_val)) + + tick_labels.append('{}-{}'.format(left_val_str, right_val_str)) + + # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals] + # tick_labels = ['{:,}kb'.format(int(x)) if x > 1000 else + # '{:,}b'.format(int(x)) for x in linear_bin_centers] + linear_col=1 + fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=linear_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45) + # fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=1, range=[0, np.max(edges)], tickvals=tick_vals) fig.update_yaxes(ticks="outside", title_text='Counts', title_standoff=0) # Update the layout @@ -298,8 +367,9 @@ def read_lengths_histogram(data, font_size): # fig.update_layout(font=dict(size=font_size), autosize=True) fig.update_annotations(font_size=annotation_size) - html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) - + # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + html_obj = fig.to_html(full_html=False, default_height=500, default_width=1200) + return html_obj # Save the 'Base quality' plot image. @@ -811,22 +881,23 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Add the base modification statistics to the table table_str += "Total Predictions{:,d}".format(total_predictions) table_str += "Probability Threshold{:.2f}".format(base_modification_threshold) - table_str += "Total Modified Bases in the Genome{:,d}".format(total_modifications) - table_str += "Total in the Forward Strand{:,d}".format(total_forward_modifications) - table_str += "Total in the Reverse Strand{:,d}".format(total_reverse_modifications) - # table_str += "Total Modified Cs{:,d}".format(total_c_modifications) - table_str += "Total CpG Sites in the Genome{:,d}".format(genome_cpg_count) - table_str += "Total Modified Cs in CpG Sites (Forward Strand){:,d}".format(cpg_forward_modifications) - table_str += "Total Modified Cs in CpG Sites (Reverse Strand){:,d}".format(cpg_reverse_modifications) - table_str += "Total Modified Cs in CpG Sites (Combined Strands){:,d}".format(cpg_modifications) - table_str += "Percentage of CpG Sites with Modifications (Combined Strands){:.2f}%".format(pct_modified_cpg) - - # Add percentage of CpG sites with modifications (forward and reverse - # strands) - # table_str += "Percentage of CpG Sites with Modifications (Forward Strand){:.2f}%".format(pct_modified_cpg_forward) - # table_str += "Percentage of CpG Sites with Modifications (Reverse Strand){:.2f}%".format(pct_modified_cpg_reverse) - # table_str += "Percentage of Cs in CpG Sites{:.2f}%".format(cpg_modification_percentage) + # table_str += "Total Modified Bases in the Genome{:,d}".format(total_modifications) + # table_str += "Total in the Forward Strand{:,d}".format(total_forward_modifications) + # table_str += "Total in the Reverse Strand{:,d}".format(total_reverse_modifications) + # table_str += "Total CpG Sites in the Genome{:,d}".format(genome_cpg_count) + # table_str += "Total Modified Cs in CpG Sites (Forward Strand){:,d}".format(cpg_forward_modifications) + # table_str += "Total Modified Cs in CpG Sites (Reverse Strand){:,d}".format(cpg_reverse_modifications) + # table_str += "Total Modified Cs in CpG Sites (Combined Strands){:,d}".format(cpg_modifications) + # table_str += "Percentage of CpG Sites with Modifications (Combined Strands){:.2f}%".format(pct_modified_cpg) + + # Preprint revision + table_str += "Total Modified Bases in the Sample{:,d}".format(output_data.sample_modified_base_count) + table_str += "Total in the Forward Strand{:,d}".format(output_data.sample_modified_base_count_forward) + table_str += "Total in the Reverse Strand{:,d}".format(output_data.sample_modified_base_count_reverse) + table_str += "Total modified CpG Sites in the Sample (Forward Strand){:,d}".format(output_data.sample_cpg_forward_count) + table_str += "Total modified CpG Sites in the Sample (Reverse Strand){:,d}".format(output_data.sample_cpg_reverse_count) + + # Close the table table_str += "\n\n" plot_filepaths["base_mods"]['detail'] = table_str diff --git a/src/ref_query.cpp b/src/ref_query.cpp index 1c36662..256e5d1 100644 --- a/src/ref_query.cpp +++ b/src/ref_query.cpp @@ -169,39 +169,8 @@ std::pair RefQuery::getCpGModificationCounts(int strand) uint64_t unmodified_count = 0; std::cout << "Calculating CpG modification counts..." << std::endl; - - std::cout << " [TEST] Total CpG modified count: " << this->cpg_modified_count << std::endl; - std::cout << " [TEST] Total CpG sites: " << this->cpg_total_count << std::endl; - std::cout << " [TEST] CpG test count: " << this->test_count << std::endl; modified_count = this->cpg_modified_count; unmodified_count = this->cpg_total_count - this->cpg_modified_count; - -// // Iterate over each chromosome in the CpG site map -// // uint32_t cpg_site_count = 0; -// for (const auto& chr_pos_map : this->chr_pos_to_cpg) -// { -// // uint32_t chr_cpg_site_count = 0; -// -// // Iterate over each CpG site in the chromosome -// for (const auto& pos_to_cpg : chr_pos_map.second) -// { -// // Get the position and CpG site -// // int64_t pos = pos_to_cpg.first; -// bool is_cpg = pos_to_cpg.second; -// -// // Check if the CpG site is modified -// if (is_cpg) -// { -// // Increment the modified count -// modified_count++; -// } else { -// // Increment the unmodified count -// unmodified_count++; -// } -// // cpg_site_count++; -// // chr_cpg_site_count++; -// } -// } std::cout << "Modified CpG sites: " << modified_count << std::endl; std::cout << "Unmodified CpG sites: " << unmodified_count << std::endl; std::cout << "Total CpG sites: " << modified_count + unmodified_count << std::endl; @@ -231,7 +200,7 @@ std::string RefQuery::getBase(std::string chr, int64_t pos) // Check if the position is out of range if (static_cast(pos) >= sequence.size()) { - throw std::out_of_range("Index out of range"); + return "N"; } // Get the base