Skip to content

Commit

Permalink
52 base modification tags (#53)
Browse files Browse the repository at this point in the history
* Add base modification (methylation) QC

* Add POD5 + dorado basecalled BAM QC
  • Loading branch information
jonperdomo authored Jul 25, 2024
1 parent 5353ed2 commit 612d5cc
Show file tree
Hide file tree
Showing 19 changed files with 1,454 additions and 254 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
uses: dsaltares/[email protected]
with:
repo: 'WGLab/LongReadSum'
version: 'tags/v0.1.0'
version: 'tags/v1.3.1'
file: 'SampleData.zip'

- name: Unzip assets
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ SampleData

# Testing scripts
linktoscripts
single_mod.summary.txt
single_mod.tin.xls
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ INCL_DIR := $(CURDIR)/include
SRC_DIR := $(CURDIR)/src
LIB_DIR := $(CURDIR)/lib

all:
# Generate the SWIG Python/C++ wrappers
# All targets
all: swig_build compile

# Generate the SWIG Python/C++ wrappers
swig_build:
swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i

# Compile the C++ shared libraries into lib/
python setup.py build_ext --build-lib $(LIB_DIR)
# Compile the C++ shared libraries into lib/
compile:
python3 setup.py build_ext --build-lib $(LIB_DIR)
11 changes: 11 additions & 0 deletions include/hts_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <fstream>
#include <mutex>
#include <stdint.h>
#include <atomic>

#include "output_data.h"

Expand All @@ -27,6 +28,11 @@ class HTSReader {
bam_hdr_t* header; // read the BAM header
bam1_t* record;
int record_count = 0;

// Atomic flags for whether certain BAM flags are present
std::atomic_flag has_nm_tag = ATOMIC_FLAG_INIT; // NM tag for number of mismatches using edit distance
std::atomic_flag has_mm_ml_tags = ATOMIC_FLAG_INIT; // MM and ML tags for modified base information
std::atomic_flag has_pod5_tags = ATOMIC_FLAG_INIT; // POD5 tags for signal-level information (ts, ns)

// Bool for whether the reading is complete
bool reading_complete = false;
Expand All @@ -43,6 +49,11 @@ class HTSReader {
// Return the number of records in the BAM file using the BAM index
int64_t getNumRecords(const std::string &bam_file_name);

std::map<int, int> getQueryToRefMap(bam1_t *record);

// Add a modification to the base modification map
void addModificationToQueryMap(std::map<int32_t, std::tuple<char, char, double, int>> &base_modifications, int32_t pos, char mod_type, char canonical_base, double likelihood, int strand);

HTSReader(const std::string &bam_file_name);
~HTSReader();
};
Expand Down
4 changes: 3 additions & 1 deletion include/input_parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ class Input_Para{
std::string rrms_csv; // CSV file with accepted/rejected read IDs (RRMS module)
bool rrms_filter; // Generate RRMS stats for accepted (true) or rejected (false) reads
std::unordered_set<std::string> rrms_read_ids; // List of read IDs from RRMS CSV file (accepted or rejected)
std::string ref_genome; // Reference genome file for BAM base modification analysis
double base_mod_threshold; // Base modification threshold for BAM base modification analysis

// Functions
std::string add_input_file(const std::string& _ip_file);
std::string add_input_file(const std::string& input_filepath);

Input_Para();

Expand Down
194 changes: 131 additions & 63 deletions include/output_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Define the output structures for each module.
#include <string>
#include <vector>
#include <map>
#include <unordered_map>
#include <stdint.h>

#include "input_parameters.h"
Expand Down Expand Up @@ -110,58 +111,142 @@ class Output_FQ : public Output_FA
};


// Define the base modification data structure (modification type, canonical
// base, likelihood, strand: 0 for forward, 1 for reverse, and CpG flag: T/F)
using Base_Modification = std::tuple<char, char, double, int, bool>;

// Define the signal-level data structure for POD5 (ts, ns, move table vector)
using POD5_Signal_Data = std::tuple<int32_t, int32_t, std::vector<int32_t>>;

// Base class for storing a read's base signal data
class Base_Signals
{
public:
std::string read_name;
int base_count;
std::string sequence_data_str; // Sequence of bases
std::vector<std::vector<int>> basecall_signals; // 2D vector of base signals

// Methods
int getBaseCount();
std::string getReadName();
std::string getSequenceString();
std::vector<std::vector<int>> getDataVector();
Base_Signals(std::string read_name, std::string sequence_data_str, std::vector<std::vector<int>> basecall_signals);
};

// Base class for storing a read's sequence and move table (basecalled POD5 in
// BAM format)
class Base_Move_Table
{
public:
std::string sequence_data_str; // Sequence of bases
std::vector<int> base_signal_index; // 2D vector of signal indices for each base
int sequence_start; // Signal index of the first base (ts)
int sequence_end; // Signal index of the last base (ns)

// Methods
std::string getSequenceString();
std::vector<int> getBaseSignalIndex();
int getSequenceStart();
int getSequenceEnd();
Base_Move_Table(std::string sequence_data_str, std::vector<int> base_signal_index, int start, int end);
Base_Move_Table();
};


// BAM output
class Output_BAM : public Output_FQ
{
public:
uint64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/
uint64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment
uint64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple seconard alignment
uint64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment
uint64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with secondary alignment;
uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment.
uint64_t forward_alignment = ZeroDefault; // Total number of forward alignments
uint64_t reverse_alignment = ZeroDefault; // Total number of reverse alignments
int reads_with_mods = ZeroDefault; // Total number of reads with modification tags
int reads_with_mods_pos_strand = ZeroDefault; // Total number of reads with modification tags on the positive strand
int reads_with_mods_neg_strand = ZeroDefault; // Total number of reads with modification tags on the negative strand

// Map of reads with supplementary alignments
std::map<std::string, bool> reads_with_supplementary;

// Map of reads with secondary alignments
std::map<std::string, bool> reads_with_secondary;

// Similar to Output_FA: below are for mapped.
uint64_t num_matched_bases = ZeroDefault; // the number of matched bases with =
uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X
uint64_t num_ins_bases = ZeroDefault; // the number of inserted bases;
uint64_t num_del_bases = ZeroDefault; // the number of deleted bases;
uint64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases;

// The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators
int num_columns = ZeroDefault; // the number of columns
double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns

std::vector<int> accuracy_per_read;

Basic_Seq_Statistics mapped_long_read_info;
Basic_Seq_Statistics unmapped_long_read_info;

Basic_Seq_Quality_Statistics mapped_seq_quality_info;
Basic_Seq_Quality_Statistics unmapped_seq_quality_info;

// Add a batch of records to the output
void add(Output_BAM &t_output_bam);

// Calculate QC across all records
void global_sum();
uint64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/
uint64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment
uint64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple seconard alignment
uint64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment
uint64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with secondary alignment;
uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment.
uint64_t forward_alignment = ZeroDefault; // Total number of forward alignments
uint64_t reverse_alignment = ZeroDefault; // Total number of reverse alignments
std::map<std::string, bool> reads_with_supplementary; // Map of reads with supplementary alignments
std::map<std::string, bool> reads_with_secondary; // Map of reads with secondary alignments

// Similar to Output_FA: below are for mapped.
uint64_t num_matched_bases = ZeroDefault; // the number of matched bases with =
uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X
uint64_t num_ins_bases = ZeroDefault; // the number of inserted bases;
uint64_t num_del_bases = ZeroDefault; // the number of deleted bases;
uint64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases;

// The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators
int num_columns = ZeroDefault; // the number of columns
double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns
std::vector<int> accuracy_per_read;

// Number of modified bases by position in the reference:
// chr -> reference position -> (modification type, canonical base, maximum
// likelihood, strand)
std::map<std::string, std::map<int32_t, Base_Modification>> base_modifications;
uint64_t modified_prediction_count = ZeroDefault; // Total number of modified base predictions
uint64_t modified_base_count = ZeroDefault; // Total number of modified bases mapped to the reference genome
uint64_t modified_base_count_forward = ZeroDefault; // Total number of modified bases in the genome on the forward strand
uint64_t modified_base_count_reverse = ZeroDefault; // Total number of modified bases in the genome on the reverse strand
// uint64_t c_modified_base_count = ZeroDefault; // Total C modified bases
uint64_t cpg_modified_base_count = ZeroDefault; // Total C modified bases in CpG sites (combined forward and reverse)
uint64_t cpg_modified_base_count_forward = ZeroDefault; // Total C modified bases in CpG sites on the forward strand
uint64_t cpg_modified_base_count_reverse = ZeroDefault; // Total C modified bases in CpG sites on the reverse strand
uint64_t cpg_genome_count = ZeroDefault; // Total number of CpG sites in the genome
double percent_modified_cpg = ZeroDefault; // Percentage of CpG sites with modified bases (forward and reverse)

// Test counts
uint64_t test_count = ZeroDefault; // Test count
uint64_t test_count2 = ZeroDefault; // Test count 2

// Counts for each type of modification:
// Modification type -> count
std::map<char, int> modification_type_counts;

// Signal data section
int read_count = ZeroDefault;
int base_count = ZeroDefault;
// std::vector<Base_Move_Table> read_move_table;
std::unordered_map<std::string, Base_Move_Table> read_move_table;
// std::map<std::string, Base_Move_Table> read_move_table = {};

// Save the output to a summary text file
void save_summary(std::string &output_file, Input_Para &params, Output_BAM &output_data);
// POD5 signal-level information is stored in a map of read names to a map of
// reference positions to a tuple of (ts, ns, move table vector)
std::unordered_map<std::string, POD5_Signal_Data> pod5_signal_data;

Output_BAM();
~Output_BAM();
Basic_Seq_Statistics mapped_long_read_info;
Basic_Seq_Statistics unmapped_long_read_info;

Basic_Seq_Quality_Statistics mapped_seq_quality_info;
Basic_Seq_Quality_Statistics unmapped_seq_quality_info;

// Add modified base data
void add_modification(std::string chr, int32_t ref_pos, char mod_type, char canonical_base, double likelihood, int strand);

// Return the modification information
std::map<std::string, std::map<int32_t, Base_Modification>> get_modifications();

// POD5 signal data functions
int getReadCount();
void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end);
std::vector<int> getReadMoveTable(std::string read_id);
std::string getReadSequence(std::string read_id);
int getReadSequenceStart(std::string read_id);
int getReadSequenceEnd(std::string read_id);

// Add a batch of records to the output
void add(Output_BAM &t_output_bam);

// Calculate QC across all records
void global_sum();

// Save the output to a summary text file
void save_summary(std::string &output_file, Input_Para &params, Output_BAM &output_data);

Output_BAM();
~Output_BAM();
};


Expand Down Expand Up @@ -193,23 +278,6 @@ class Output_SeqTxt : public Output_Info
void global_sum();
};

// Base class for storing a read's base signal data
class Base_Signals
{
public:
std::string read_name;
int base_count;
std::string sequence_data_str; // Sequence of bases
std::vector<std::vector<int>> basecall_signals; // 2D vector of base signals

// Methods
int getBaseCount();
std::string getReadName();
std::string getSequenceString();
std::vector<std::vector<int>> getDataVector();
Base_Signals(std::string read_name, std::string sequence_data_str, std::vector<std::vector<int>> basecall_signals);
};

// FAST5 output
class Output_FAST5 : public Output_FA
{
Expand Down
52 changes: 52 additions & 0 deletions include/ref_query.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// RefQuery: A class for querying a reference genome in FASTA format

#ifndef REF_QUERY_H
#define REF_QUERY_H

#include <string>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <vector>

class RefQuery {
private:
std::string fasta_filepath;
std::vector<std::string> chromosomes;
std::unordered_map<std::string, std::string> chr_to_seq;
// uint32_t cpg_site_count = 0;
uint64_t cpg_modified_count = 0;
uint64_t cpg_total_count = 0;
uint64_t test_count = 0;

// Map of reference position (0-indexed) to CpG site (true/false) on the
// forward strand for each chromosome
// std::map<std::string, std::map<int64_t, bool>> chr_pos_to_cpg;

// Map of reference position (0-indexed) to CpG site (true/false) on the
// reverse strand for each chromosome
// std::map<std::string, std::map<int64_t, bool>> chr_pos_to_cpg_rev;

// Map of reference position (0-indexed) to CpG site (true/false) for
// all chromosomes
// std::map<std::string, std::map<int64_t, bool>> chr_pos_to_cpg;

// Map of chromosome to CpG site positions
std::unordered_map<std::string, std::unordered_set<int64_t>> chr_to_cpg;

// Map of chromosome to CpG site positions with modifications
std::unordered_map<std::string, std::unordered_set<int64_t>> chr_to_cpg_mod;

// Reverse strand CpG site map
// std::map<std::string, std::map<int64_t, bool>> chr_pos_to_cpg_rev;

public:
int setFilepath(std::string fasta_filepath);
std::string getFilepath();
std::string getBase(std::string chr, int64_t pos);
void generateCpGMap();
void addCpGSiteModification(std::string chr, int64_t pos, int strand);
std::pair<uint64_t, uint64_t> getCpGModificationCounts(int strand);
};

#endif // REF_QUERY_H
Loading

0 comments on commit 612d5cc

Please sign in to comment.