.Rhistory

path_somatic <- unlist(strsplit(lof_pth, "%"))[2]
somatic_vcf_dt <- vcf_to_dt(path_somatic)
germline_vcf_dt <- vcf_to_dt(path_germline)
somatic_bedpe <- build_bedpe_with_metadata(somatic_vcf_dt)
germline_bedepe <- build_bedpe_with_metadata(germline_vcf_dt)
somatic_bedpe[, CLASS := "SOMATIC"]
germline_bedepe[, CLASS := "GERMLINE"]
test_bed <- rbind(somatic_bedpe, germline_bedepe)
#adapting somatic_id to however file is called
sid_idx <- length(unlist(strsplit(path_somatic, "/")))
gid_idx <- length(unlist(strsplit(path_germline, "/")))
somatic_id <- unlist(strsplit(unlist(strsplit(path_somatic, "/"))[sid_idx], "[.]"))[1]
germline_id <-  unlist(strsplit(unlist(strsplit(path_germline, "/"))[gid_idx], "[.]"))[1]
if(somatic_id == germline_id) {
#output <- paste0("/xchip/beroukhimlab/wolu/testing_svaba/outputs/final_bedpe/final_bedpe_only/", somatic_id, "_combined_germ_soma_test.bedpe")
output <- paste0("/xchip/beroukhimlab/wolu/testing_svaba/scripts/vcf2bedpe/", somatic_id, ".bedpe")
write.table(test_bed, output, row.names = F, col.names = T, sep = "\t", quote = F)
}
}
# Run merge() on the sample's path pair.
# NOTE(review): within this history the first call precedes the assignment of
# file_path on the next line, so it presumably relied on a value from an
# earlier session (or failed) — confirm.
merge(file_path)
# "%"-separated pair of VCF paths; merge() reads the part before "%" as the
# germline VCF and the part after it as the somatic VCF.
file_path <- 'Z:/Shu/tcga_svaba_germline_marcin/2984.svaba.germline.sv.vcf%Z:/siyun/data/insertions/pcawg/2984.svaba.filtered.somatic.sv.vcf'
merge(file_path)
# Combine the germline and somatic SV calls of one sample into one BEDPE file.
#
# lof_pth: single string of the form "<germline_vcf>%<somatic_vcf>".
#
# Each VCF is read with vcf_to_dt(), converted with
# build_bedpe_with_metadata(), tagged with a CLASS column ("SOMATIC" or
# "GERMLINE") and the two tables are stacked. The stacked table is written as
# a tab-separated file named "<id>.bedpe", where <id> is the file name up to
# its first "."; nothing is written (and a warning is raised) when the two
# files do not share the same <id>.
#
# Called for its side effect; returns NULL invisibly.
# NOTE(review): this definition masks base::merge() in the calling
# environment.
merge <- function(lof_pth) {
  paths <- unlist(strsplit(lof_pth, "%"))
  path_germline <- paths[1]
  path_somatic <- paths[2]

  somatic_vcf_dt <- vcf_to_dt(path_somatic)
  germline_vcf_dt <- vcf_to_dt(path_germline)
  somatic_bedpe <- build_bedpe_with_metadata(somatic_vcf_dt)
  germline_bedepe <- build_bedpe_with_metadata(germline_vcf_dt)
  somatic_bedpe[, CLASS := "SOMATIC"]
  germline_bedepe[, CLASS := "GERMLINE"]
  test_bed <- rbind(somatic_bedpe, germline_bedepe)

  # Sample ID = file name (directories stripped) up to the first "."
  somatic_id <- unlist(strsplit(basename(path_somatic), "[.]"))[1]
  germline_id <- unlist(strsplit(basename(path_germline), "[.]"))[1]

  if (somatic_id != germline_id) {
    # Previously this case was skipped without any message.
    warning(
      "Somatic ID '", somatic_id, "' and germline ID '", germline_id,
      "' differ; no output written.",
      call. = FALSE
    )
    return(invisible(NULL))
  }

  #output <- paste0("/xchip/beroukhimlab/wolu/testing_svaba/outputs/final_bedpe/final_bedpe_only/", somatic_id, "_combined_germ_soma_test.bedpe")
  output <- paste0("Z:/wolu/testing_svaba/scripts/vcf2bedpe/", somatic_id, ".bedpe")
  write.table(test_bed, output,
              row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
  invisible(NULL)
}
# Re-run merge() with the redefined function, then read the written BEDPE
# back in and open it in the viewer for a visual check.
merge(file_path)
chk <- fread('Z:/wolu/testing_svaba/scripts/vcf2bedpe/2984.bedpe')
View(chk)
# Build a BEDPE-style table (one row per breakpoint pair) from a table with
# one row per breakend.
#
# merged_dt: data.table with one row per breakend. The code reads the columns
#   ID (split on ":" into pair index and mate number), uid, seqnames, start,
#   end, sid, sample, QUAL, strand, REF, ALT, MAPQ, SPAN, HOMSEQ, INSERTION,
#   NDISC, FILTER, TUMALT, GENO and TUMOR.
#   NOTE: merged_dt is modified by reference (columns mates_idx and
#   which_mate are added).
#
# Returns a data.table with chrom1/start1/end1, chrom2/start2/end2, name,
# score, strand1, strand2, REF_1, ALT_1, REF_2, ALT_2, the metadata columns
# listed in the body and MAPQ_1/MAPQ_2. Pairs that do not consist of exactly
# one mate 1 and one mate 2 are dropped with a warning. Stops if the two
# mates of a pair carry different sid values.
build_bedpe_with_metadata <- function(merged_dt) {
  cat("Building bedpe...\n")
  # Nothing to pair up: return the same empty table the loop would produce
  # instead of iterating over 1:0 and emitting spurious warnings/errors.
  if (nrow(merged_dt) == 0) {
    return(as.data.table(NULL))
  }
  ### get mate indexes
  merged_dt[, mates_idx := unlist(strsplit(ID, ":"))[1], by = "uid"]
  # Evaluated once per uid group, so all rows of a group receive the value
  # taken from the group's first ID; the per-row value is recomputed below.
  merged_dt[, which_mate := unlist(strsplit(ID, ":"))[2], by = "uid"]

  pair_ids <- unique(merged_dt$mates_idx)        # hoisted out of the loop
  pair_rows <- vector("list", length(pair_ids))  # preallocated, bound once

  for (i in seq_along(pair_ids)) {
    foo <- merged_dt[mates_idx == pair_ids[i]]
    if (nrow(foo) != 2) {
      mes <- paste0("Breakpoint ",  pair_ids[i], " has incorrect number of mates for ", foo$sample, " It has been removed.")
      warning(mes[1])
      next
    }

    #### build bedpe
    foo[, split_ID := seq_len(nrow(foo))]
    foo[, which_mate := unlist(strsplit(ID, ":"))[2], by = "split_ID"]
    foo1 <- foo[which_mate == 1]
    foo2 <- foo[which_mate == 2]
    if (nrow(foo1) != 1 || nrow(foo2) != 1) {
      # Two rows but not one of each mate (e.g. duplicated mate label):
      # treat like any other malformed pair instead of failing later on.
      warning(paste0("Breakpoint ", pair_ids[i], " does not have exactly one mate 1 and one mate 2 for ", foo$sample[1], " It has been removed."))
      next
    }
    if (foo1$sid != foo2$sid) {
      stop("Multiple samples are being processed, one at a time please...")
    }

    bedpe_base <- as.data.frame(cbind(foo1$seqnames, foo1$start, foo1$end,
                                      foo2$seqnames, foo2$start, foo2$end))
    colnames(bedpe_base) <- c("chrom1", "start1", "end1", "chrom2","start2","end2")
    bedpe_base <- cbind(bedpe_base, paste0(foo$sample[1],"_", foo$mates_idx[1]))
    colnames(bedpe_base)[7] <- "name"
    bedpe_base <- cbind(bedpe_base, foo$QUAL[1])
    colnames(bedpe_base)[8] <- "score"
    bedpe_base <- cbind(bedpe_base, foo1$strand[1])
    bedpe_base <- cbind(bedpe_base, foo2$strand[1])
    colnames(bedpe_base)[9:10] <- c("strand1", "strand2")
    refs_alts <- as.data.frame(cbind(foo1$REF[1], foo1$ALT[1],
                                     foo2$REF[1], foo2$ALT[1]))
    colnames(refs_alts) <- c("REF_1","ALT_1","REF_2","ALT_2")
    bedpe_base <- cbind(bedpe_base, refs_alts)
    # Per-pair metadata is taken from mate 1.
    bedpe_base <- cbind(bedpe_base, foo1[,c("SPAN", "HOMSEQ","INSERTION","NDISC","FILTER","sample", "TUMALT", 'GENO','TUMOR')])
    mapqs <- as.data.frame(cbind(foo1$MAPQ[1], foo2$MAPQ[1]))
    colnames(mapqs) <- c("MAPQ_1","MAPQ_2")
    bedpe_base <- cbind(bedpe_base, mapqs)

    pair_rows[[i]] <- bedpe_base
  }

  # Skipped pairs left NULL slots, which rbindlist() ignores.
  rbindlist(pair_rows)
}
# Re-run merge() after redefining build_bedpe_with_metadata() and inspect the
# rewritten output file.
merge(file_path)
chk <- fread('Z:/wolu/testing_svaba/scripts/vcf2bedpe/2984.bedpe')
View(chk)
##### functions #####
# Read the records of a VCF into a data.table (one row per record) and parse
# the INFO / genotype fields used downstream.
#
# vcf_path: path to a plain-text VCF. Header lines (starting with "#") are
#   removed with grep before parsing, so grep must be available on the PATH.
#
# Returns a data.table. If the file yields no records, the empty fread()
# result is returned as is (without any of the derived columns). Otherwise
# the first 10 (or 11) columns are renamed to seqnames, start, ID, REF, ALT,
# QUAL, FILTER, INFO, GENO, NORMAL (and TUMOR), and these columns are added:
#   SPAN, EVDNC, MAPQ, HOMSEQ, INSERTION, NDISC, SVMETHOD  parsed from INFO
#   sample   file name up to its first "_"
#   uid      ID with a trailing ":1" / ":2" removed
#   TUMALT/TUMCOV/TUMLOD, NORMALT/NORMCOV/NORMLOD
#            2nd / 3rd / 9th ":"-separated field of TUMOR / NORMAL, taken
#            from the first row of each uid
#   strand, inv, altstrand, altpos, altchr  derived from ALT
#   end      copy of start
#   sid      file name without its last extension
# Records whose uid has any breakend on a contig starting with "G" or "M" are
# removed, and "chr" is prepended to seqnames.
vcf_to_dt <- function(vcf_path) {
  cat(paste0(vcf_path, "\n"))
  if (!file.exists(vcf_path)) {
    print(paste("File does not exist",vcf_path))
  }
  cat("Reading file...\n")
  # Quote the path so spaces or shell metacharacters in it cannot break (or
  # alter) the command line.
  vcf_dt <- fread(cmd = paste("grep -v '^#'", shQuote(vcf_path)), sep = '\t')
  # Set colnames of vcf_dt to standard...
  if (nrow(vcf_dt) == 0) {
    return(vcf_dt)
  }
  if (ncol(vcf_dt) == 10) {
    setnames(vcf_dt, c("seqnames","start","ID","REF","ALT","QUAL","FILTER","INFO","GENO","NORMAL"))
  } else {
    setnames(vcf_dt, c("seqnames","start","ID","REF","ALT","QUAL","FILTER","INFO","GENO","NORMAL","TUMOR"), skip_absent=TRUE)
  }
  cat("Gathering Metadata...\n")
  if ("INFO" %in% colnames(vcf_dt)) {
    vcf_dt[, SPAN := as.numeric(gsub(".*?SPAN=([-0-9]+).*","\\1",INFO))]
    vcf_dt[, sample := gsub("(.*?)_.*","\\1",basename(vcf_path))]
    vcf_dt[, uid := gsub("([0-9]+):(1|2)", "\\1", ID)]
    vcf_dt[, EVDNC := gsub(".*?EVDNC=([A-Z]+).*", "\\1", INFO)]
    vcf_dt[, MAPQ := as.integer(gsub(".*?;MAPQ=([0-9]+).*", "\\1", INFO))]
    # When the tag is absent gsub() returns the whole INFO string, which
    # contains ";" — that is mapped to "" on the following line.
    vcf_dt[, HOMSEQ := gsub(".*?;HOMSEQ=([A-Z]+).*", "\\1", INFO)]
    vcf_dt[, HOMSEQ := ifelse(grepl(";", HOMSEQ), "", HOMSEQ)]
    vcf_dt[, INSERTION := gsub(".*?;INSERTION=([A-Z]+).*", "\\1", INFO)]
    vcf_dt[, INSERTION := ifelse(grepl(";", INSERTION), "", INSERTION)]
    vcf_dt[, NDISC := as.numeric(gsub(".*?NDISC=([0-9]+).*", "\\1", INFO))]
    # Text between "SVMETHOD=" and ";NDISC".
    vcf_dt[, SVMETHOD := substr(INFO,regexpr("SVMETHOD=",INFO)+nchar("SVMETHOD="),regexpr(";NDISC",INFO)-1)]
  }
  # More extraction regexpr stuff...
  if ("TUMOR" %in% colnames(vcf_dt)) {
    vcf_dt[, TUMALT := as.integer(strsplit(TUMOR, ":")[[1]][2]), by = uid]
    vcf_dt[, TUMCOV := as.integer(strsplit(TUMOR, ":")[[1]][3]), by = uid]
    vcf_dt[, TUMLOD := as.numeric(strsplit(TUMOR, ":")[[1]][9]), by = uid]
  }
  if ("NORMAL" %in% colnames(vcf_dt)) {
    vcf_dt[, NORMCOV := as.integer(strsplit(NORMAL, ":")[[1]][3]), by = uid]
    vcf_dt[, NORMALT := as.integer(strsplit(NORMAL, ":")[[1]][2]), by = uid]
    vcf_dt[, NORMLOD := as.numeric(strsplit(NORMAL, ":")[[1]][9]), by = uid]
  }
  cat("Cleaning up...\n")
  # "-" when ALT starts with a bracket, "+" otherwise.
  vcf_dt[, strand := ifelse(grepl("^\\[", ALT) | grepl("^\\]", ALT), '-', '+')]
  vcf_dt[, inv := strand[1] == strand[2], by = uid]
  vcf_dt[, altstrand := rev(strand), by = uid]
  vcf_dt[, altpos := as.integer(gsub(".*?:([0-9]+).*", "\\1", ALT))]
  vcf_dt[, altchr := gsub(".*?(\\[|\\])(.*?):([0-9]+).*", "\\2", ALT)]
  vcf_dt[, end := start]
  # Drop every uid that has a breakend on a contig starting with G or M.
  bad.ix <- vcf_dt[grepl("^G|^M", seqnames), uid]
  vcf_dt <- vcf_dt[!uid %in% bad.ix]
  vcf_dt[, sid := basename(tools::file_path_sans_ext(vcf_path))]
  vcf_dt[, seqnames := paste0("chr", seqnames)]
  return(vcf_dt)
}
# Build a BEDPE-style table (one row per breakpoint pair) from a table with
# one row per breakend.
#
# merged_dt: data.table with one row per breakend. The code reads the columns
#   ID (split on ":" into pair index and mate number), uid, seqnames, start,
#   end, sid, sample, QUAL, strand, REF, ALT, MAPQ, SPAN, HOMSEQ, INSERTION,
#   NDISC, FILTER, EVDNC, TUMALT, GENO and TUMOR.
#   NOTE: merged_dt is modified by reference (columns mates_idx and
#   which_mate are added).
#
# Returns a data.table with chrom1/start1/end1, chrom2/start2/end2, name,
# score, strand1, strand2, REF_1, ALT_1, REF_2, ALT_2, the metadata columns
# listed in the body (including EVDNC) and MAPQ_1/MAPQ_2. Pairs that do not
# consist of exactly one mate 1 and one mate 2 are dropped with a warning.
# Stops if the two mates of a pair carry different sid values.
build_bedpe_with_metadata <- function(merged_dt) {
  cat("Building bedpe...\n")
  # Nothing to pair up: return the same empty table the loop would produce
  # instead of iterating over 1:0 and emitting spurious warnings/errors.
  if (nrow(merged_dt) == 0) {
    return(as.data.table(NULL))
  }
  ### get mate indexes
  merged_dt[, mates_idx := unlist(strsplit(ID, ":"))[1], by = "uid"]
  # Evaluated once per uid group, so all rows of a group receive the value
  # taken from the group's first ID; the per-row value is recomputed below.
  merged_dt[, which_mate := unlist(strsplit(ID, ":"))[2], by = "uid"]

  pair_ids <- unique(merged_dt$mates_idx)        # hoisted out of the loop
  pair_rows <- vector("list", length(pair_ids))  # preallocated, bound once

  for (i in seq_along(pair_ids)) {
    foo <- merged_dt[mates_idx == pair_ids[i]]
    if (nrow(foo) != 2) {
      mes <- paste0("Breakpoint ",  pair_ids[i], " has incorrect number of mates for ", foo$sample, " It has been removed.")
      warning(mes[1])
      next
    }

    #### build bedpe
    foo[, split_ID := seq_len(nrow(foo))]
    foo[, which_mate := unlist(strsplit(ID, ":"))[2], by = "split_ID"]
    foo1 <- foo[which_mate == 1]
    foo2 <- foo[which_mate == 2]
    if (nrow(foo1) != 1 || nrow(foo2) != 1) {
      # Two rows but not one of each mate (e.g. duplicated mate label):
      # treat like any other malformed pair instead of failing later on.
      warning(paste0("Breakpoint ", pair_ids[i], " does not have exactly one mate 1 and one mate 2 for ", foo$sample[1], " It has been removed."))
      next
    }
    if (foo1$sid != foo2$sid) {
      stop("Multiple samples are being processed, one at a time please...")
    }

    bedpe_base <- as.data.frame(cbind(foo1$seqnames, foo1$start, foo1$end,
                                      foo2$seqnames, foo2$start, foo2$end))
    colnames(bedpe_base) <- c("chrom1", "start1", "end1", "chrom2","start2","end2")
    bedpe_base <- cbind(bedpe_base, paste0(foo$sample[1],"_", foo$mates_idx[1]))
    colnames(bedpe_base)[7] <- "name"
    bedpe_base <- cbind(bedpe_base, foo$QUAL[1])
    colnames(bedpe_base)[8] <- "score"
    bedpe_base <- cbind(bedpe_base, foo1$strand[1])
    bedpe_base <- cbind(bedpe_base, foo2$strand[1])
    colnames(bedpe_base)[9:10] <- c("strand1", "strand2")
    refs_alts <- as.data.frame(cbind(foo1$REF[1], foo1$ALT[1],
                                     foo2$REF[1], foo2$ALT[1]))
    colnames(refs_alts) <- c("REF_1","ALT_1","REF_2","ALT_2")
    bedpe_base <- cbind(bedpe_base, refs_alts)
    # Per-pair metadata is taken from mate 1.
    bedpe_base <- cbind(bedpe_base, foo1[,c("SPAN", "HOMSEQ","INSERTION","NDISC","FILTER","sample", "EVDNC","TUMALT", 'GENO','TUMOR')])
    mapqs <- as.data.frame(cbind(foo1$MAPQ[1], foo2$MAPQ[1]))
    colnames(mapqs) <- c("MAPQ_1","MAPQ_2")
    bedpe_base <- cbind(bedpe_base, mapqs)

    pair_rows[[i]] <- bedpe_base
  }

  # Skipped pairs left NULL slots, which rbindlist() ignores.
  rbindlist(pair_rows)
}
# Re-run merge() now that the BEDPE builder also carries EVDNC, and reload
# the written file.
merge(file_path)
chk <- fread('Z:/wolu/testing_svaba/scripts/vcf2bedpe/2984.bedpe')
# Combine the germline and somatic SV calls of one sample, filter them and
# write the result as a BEDPE file.
#
# lof_pth: single string of the form "<germline_vcf>%<somatic_vcf>".
#
# Each VCF is read with vcf_to_dt(), converted with
# build_bedpe_with_metadata(), tagged with a CLASS column ("SOMATIC" or
# "GERMLINE") and the two tables are stacked. Two columns are then derived
# from the ":"-separated TUMOR string (NALT = 1st field, NALT_SR = 3rd field)
# and rows are kept only if all of the following hold:
#   * MAPQ_1 or MAPQ_2 equals 60
#   * EVDNC is not "DSCRD"
#   * NALT_SR, read as a number, is greater than 1
#   * SPAN is greater than 49, or equals -1
# The filtered table is written as "<id>.bedpe", where <id> is the file name
# up to its first "."; nothing is written (and a warning is raised) when the
# two files do not share the same <id>.
#
# Called for its side effect; returns NULL invisibly.
# NOTE(review): this definition masks base::merge() in the calling
# environment.
merge <- function(lof_pth) {
  paths <- unlist(strsplit(lof_pth, "%"))
  path_germline <- paths[1]
  path_somatic <- paths[2]

  somatic_vcf_dt <- vcf_to_dt(path_somatic)
  germline_vcf_dt <- vcf_to_dt(path_germline)
  somatic_bedpe <- build_bedpe_with_metadata(somatic_vcf_dt)
  germline_bedepe <- build_bedpe_with_metadata(germline_vcf_dt)
  somatic_bedpe[, CLASS := "SOMATIC"]
  germline_bedepe[, CLASS := "GERMLINE"]
  test_bed <- rbind(somatic_bedpe, germline_bedepe)

  # Sample ID = file name (directories stripped) up to the first "."
  somatic_id <- unlist(strsplit(basename(path_somatic), "[.]"))[1]
  germline_id <- unlist(strsplit(basename(path_germline), "[.]"))[1]

  if (somatic_id != germline_id) {
    # Previously this case was skipped without any message.
    warning(
      "Somatic ID '", somatic_id, "' and germline ID '", germline_id,
      "' differ; no output written.",
      call. = FALSE
    )
    return(invisible(NULL))
  }

  #output <- paste0("/xchip/beroukhimlab/wolu/testing_svaba/outputs/final_bedpe/final_bedpe_only/", somatic_id, "_combined_germ_soma_test.bedpe")
  output <- paste0("Z:/wolu/testing_svaba/scripts/vcf2bedpe/", somatic_id, ".bedpe")

  test_bed[, NALT_SR := unlist(strsplit(TUMOR, ":"))[3], by = 'TUMOR']
  test_bed[, NALT := unlist(strsplit(TUMOR, ":"))[1], by = 'TUMOR']

  tmp <- test_bed[MAPQ_1 == 60 | MAPQ_2 == 60] #takes only reads that have a mapq score that are 60 or higher since MAPQ scores are capped at 60
  tmp <- tmp[!(EVDNC == 'DSCRD')] #removes reads that are discordant
  # NALT_SR comes out of strsplit() as text; compare it as a number so the
  # threshold is numeric rather than a lexicographic string comparison.
  tmp <- tmp[as.numeric(NALT_SR) > 1] #the number of reads covering the site/depth of coverage must be greater than 1
  tmp <- tmp[SPAN > 49 | SPAN == -1] #SPAN = -1 refers to a translocation. Span shorter than 50bp are considered simple indels

  write.table(tmp, output,
              sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
  #write.table(test_bed, output, row.names = F, col.names = T, sep = "\t", quote = F)
  invisible(NULL)
}
# Run the filtering version of merge() and inspect the written file.
merge(file_path)
chk <- fread('Z:/wolu/testing_svaba/scripts/vcf2bedpe/2984.bedpe')
View(chk)
# Print, then re-assign, the "%"-separated germline/somatic path pair.
file_path
file_path <- "Z:/Shu/tcga_svaba_germline_marcin/2984.svaba.germline.sv.vcf%Z:/siyun/data/insertions/pcawg/2984.svaba.filtered.somatic.sv.vcf"
# Load the helper functions from the script file (sourced twice in this
# history) and call merge_filter(), which is defined there, not here.
source('C:/Users/wchukwu/Downloads/GaTSV/processing_scripts.R')
source('C:/Users/wchukwu/Downloads/GaTSV/processing_scripts.R') #store all source scripts for pre-processing
chk <- merge_filter(file_path)
View(chk)
# Copy reference files from the Z: share into ./data under the GaTSV folder.
# All relative paths below depend on this working directory.
setwd('C:/Users/wchukwu/Downloads/GaTSV/')
gnomad_germline_hg38all = readRDS('Z:/wolu/testing_svaba/outputs/20240327_svmupdates/gnomAD4.0/gnomAD4.0_hg38.rds')
View(gnomad_germline_hg38all)
# NOTE(review): this first attempt lacks the leading comma for j; the next
# line is the corrected call that drops the ID column.
gnomad_germline_hg38all[ID:=NULL]
gnomad_germline_hg38all[,ID:=NULL]
View(gnomad_germline_hg38all)
# NOTE(review): the result of saveRDS() is assigned back to the variable here
# and in the similar lines below; saveRDS() is documented to return NULL, so
# the variable presumably no longer holds the data afterwards (hence the
# readRDS() that follows) — confirm.
gnomad_germline_hg38all = saveRDS(gnomad_germline_hg38all,'./data/gnomAD.v4.hg38.rds')
gnomad_germline_hg38all = readRDS('./data/gnomAD.v4.hg38.rds')
gnomad_germline_hg19all = readRDS("Z:/wolu/testing_svaba/outputs/20240327_svmupdates/gnomAD4.0/20240328_gnomad_hg19_v4.rds")
View(gnomad_germline_hg19all)
gnomad_germline_hg19all[,span_hg19:=NULL]
gnomad_germline_hg19all = saveRDS(gnomad_germline_hg19all,'./data/gnomAD.v4.hg19.liftover.rds')
gnomad_germline_hg19all = readRDS('./data/gnomAD.v4.hg19.liftover.rds')
# The ".bed" files below are read with readRDS(), i.e. they are treated as
# RDS files despite the extension.
LINE_dt_hg38 = readRDS("Z:/wolu/testing_svaba/extdata/repeatmasker_hg38_LINE.bed")
View(LINE_dt_hg38)
reptimedata = readRDS("Z:/wolu/testing_svaba/scripts/add_features/reptimedata.rds")
View(reptimedata)
# Inspect the slots of the replication-timing object.
reptimedata@metadata
View(reptimedata@metadata)
View(reptimedata@ranges)
# Re-save each reference file under ./data with a normalised file name.
LINE_dt_hg38 = saveRDS(readRDS("Z:/wolu/testing_svaba/extdata/repeatmasker_hg38_LINE.bed"),'./data/repeatmasker.hg38.LINE.bed')
SINE_dt_hg38 = saveRDS(readRDS("Z:/wolu/testing_svaba/extdata/repeatmasker_hg38_SINE.bed"),'./data/repeatmasker.hg38.SINE.bed')
LINE_dt_hg19 = saveRDS(readRDS("Z:/wolu/testing_svaba/extdata/repeat_masker_hg19_LINE.bed"),'./data/repeatmasker.hg19.LINE.bed')
SINE_dt_hg19 = saveRDS(readRDS("Z:/wolu/testing_svaba/extdata/repeat_masker_hg19_SINE.bed"),'./data/repeatmasker.hg19.SINE.bed')
hg19_genes <- saveRDS(readRDS('Z:/wolu/testing_svaba/outputs/20240327_svmupdates/gencode/hg19_geneRanges.rds'),'./data/gencode.genes.hg19.rds')
hg19_exons <- saveRDS(readRDS('Z:/wolu/testing_svaba/outputs/20240327_svmupdates/gencode/hg19_exonRanges.rds'),'./data/gencode.exons.hg19.rds')
# Build hg38 gene / exon GRanges from the GENCODE v46 GTF and save them.
library(data.table)
library(stringr)
library(GenomicRanges)
rm(gnomad_germline_hg38all)
#20240528_do the same processing for hg38 downloaded from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.annotation.gtf.gz
# NOTE(review): the trailing comment on the next line cites the release_19
# URL, while the file being read is the v46 annotation — confirm which source
# is meant.
gencode_v46_hg38 <- fread("C:/Users/wchukwu/Downloads/gencode.v46.annotation.gtf.gz",skip = "chr1") #downloaded from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz
View(gencode_v46_hg38)
# Pull attribute values out of column V9 (text between `; <key> "` and `";`).
gencode_v46_hg38[,gene_type:=str_extract(V9,"(?<=; gene_type \")(.*?)(?=\";)")]
View(gencode_v46_hg38)
gencode_v46_hg38[,gene_status:=str_extract(V9,"(?<=; gene_status \")(.*?)(?=\";)")]
# First attempt: protein-coding records that also have gene_status "KNOWN".
# NOTE(review): this filter is dropped further down after inspecting the
# unique gene_status values, presumably because it matched nothing — confirm.
genes <- gencode_v46_hg38[V3=='gene'& gene_status=='KNOWN' & gene_type=='protein_coding']
exons <- gencode_v46_hg38[V3=='exon'& gene_status=='KNOWN' & gene_type=='protein_coding']
# Remove "chr" from the chromosome names.
genes <- genes[,V1:=gsub('chr','',V1)]
exons <- exons[,V1:=gsub('chr','',V1)]
hg38_geneRanges <- makeGRangesFromDataFrame(genes,seqnames.field = 'V1', start.field = 'V4',
end.field = 'V5',strand.field = 'V7')
hg38_exonRanges <- makeGRangesFromDataFrame(exons,seqnames.field = 'V1', start.field = 'V4',
end.field = 'V5',strand.field = 'V7')
saveRDS(hg38_geneRanges,"C:/Users/wchukwu/Downloads/GaTSV/data/gencode.genes.hg38.rds")
saveRDS(hg38_exonRanges,"C:/Users/wchukwu/Downloads/GaTSV/data/gencode.exons.hg38.rds")
# Inspect which attribute values are actually present.
View(gencode_v46_hg38)
unique(gencode_v46_hg38$gene_type)
unique(gencode_v46_hg38$gene_status)
gencode_v46_hg38[,tag:=str_extract(V9,"(?<=; tag \")(.*?)(?=\";)")]
unique(gencode_v46_hg38$tag)
# Second attempt: filter on record type and gene_type only.
genes <- gencode_v46_hg38[V3=='gene' & gene_type=='protein_coding']
exons <- gencode_v46_hg38[V3=='exon'& gene_type=='protein_coding']
View(exons)
# NOTE(review): the hg38 file is loaded into a variable named hg19_exons
# here; it is overwritten with the hg19 file two lines below.
hg19_exons <- readRDS('./data/gencode.exons.hg38.rds')
View(hg19_exons)
hg19_exons <- readRDS('./data/gencode.exons.hg19.rds')
hg19_genes <- readRDS('./data/gencode.genes.hg19.rds')
View(hg19_genes)
genes <- genes[,V1:=gsub('chr','',V1)]
exons <- exons[,V1:=gsub('chr','',V1)]
# Drop the helper columns from genes (exons keeps them).
genes[,gene_status:=NULL]
genes[,tag:=NULL]
View(genes)
# Rebuild the ranges and overwrite the files saved by the first attempt.
hg38_geneRanges <- makeGRangesFromDataFrame(genes,seqnames.field = 'V1', start.field = 'V4',
end.field = 'V5',strand.field = 'V7')
hg38_exonRanges <- makeGRangesFromDataFrame(exons,seqnames.field = 'V1', start.field = 'V4',
end.field = 'V5',strand.field = 'V7')
saveRDS(hg38_geneRanges,"C:/Users/wchukwu/Downloads/GaTSV/data/gencode.genes.hg38.rds")
saveRDS(hg38_exonRanges,"C:/Users/wchukwu/Downloads/GaTSV/data/gencode.exons.hg38.rds")
# Copy the replication-timing reference into ./data.
# NOTE(review): the variable receives the return value of saveRDS(), not the
# data; it is re-read from ./data at the end of this section.
reptimedata_hg19 = saveRDS(readRDS("Z:/wolu/testing_svaba/scripts/add_features/reptimedata.rds"),'./data/reptime.hg19.rds')
View(chk)
file_path
gc()
file_path <- "Z:/Shu/tcga_svaba_germline_marcin/2984.svaba.germline.sv.vcf%Z:/siyun/data/insertions/pcawg/2984.svaba.filtered.somatic.sv.vcf"
# Attach the packages used by the processing scripts (data.table and
# parallel are each requested twice).
require(data.table)
require(parallel)
require(gUtils)
require(data.table)
require(GenomicRanges)
require(parallel)
require(stats4)
require(BiocGenerics)
require(rlang)
require(S4Vectors)
require(IRanges)
require(GenomeInfoDb)
print("Setting Working Directory")
# Setting working directory to the /figures folder
# NOTE(review): no setwd() call follows in this history; the next line only
# prints the path of the open editor file, and it is issued before
# rstudioapi is attached on the line after it — confirm intent.
getSourceEditorContext()$path
library(rstudioapi)
source('./scripts/processing_scripts.R') #store all source scripts for pre-processing
chk <- merge_filter(file_path)
cat('Loading reference files...\n')
# Reference tables saved under ./data earlier in this history.
gnomad_hg38 = readRDS('./data/gnomAD.v4.hg38.rds')
gnomad_hg19 = readRDS('./data/gnomAD.v4.hg19.liftover.rds')
LINE_dt_hg38 = readRDS('./data/repeatmasker.hg38.LINE.bed')
SINE_dt_hg38 = readRDS('./data/repeatmasker.hg38.SINE.bed')
LINE_dt_hg19 = readRDS('./data/repeatmasker.hg19.LINE.bed')
SINE_dt_hg19 = readRDS('./data/repeatmasker.hg19.SINE.bed')
hg19_genes = readRDS('./data/gencode.genes.hg19.rds')
hg19_exons=readRDS('./data/gencode.exons.hg19.rds')
hg38_genes=readRDS('./data/gencode.genes.hg38.rds')
hg38_exons=readRDS('./data/gencode.exons.hg38.rds')
reptimedata_hg19 = readRDS('./data/reptime.hg19.rds')
# Add the per-SV features computed by the helpers from processing_scripts.R
# to a BEDPE table.
#
# file:    data.table of breakpoint pairs with chrom1 / chrom2 columns.
# n_cores: passed to the helpers as `cores` and to mclapply() as mc.cores.
#
# Steps: remove "chr" from chrom1/chrom2, then closest_germline() and
# closest_line_sine() on the whole table, followed by find_closest_sv(),
# count_sv_5mbp() and rep_time() applied row by row; each step works on the
# output of the previous one. rep_time() receives the global
# reptimedata_hg19 object. Returns the table produced by the last step.
run_it <- function(file, n_cores) {
  #sample.name <- unlist(strsplit(lof_pth, "filtered_only/"))[2]
  #print(sample.name) #added this line
  #output_path <- paste0('/xchip/beroukhimlab/wolu/testing_svaba/outputs/modified_fuzzy_and_features/modified_features/',sample.name,'_fuzzy.bedpe')
  # Work on a copy: := below would otherwise rewrite chrom1/chrom2 in the
  # caller's table by reference.
  bedpe <- copy(file)
  bedpe[, chrom1 := gsub('chr','',chrom1)]
  bedpe[, chrom2 := gsub('chr','',chrom2)]
  fuzzy <- closest_germline(bp = bedpe, cores = n_cores, genome = 'hg19')
  line_sine <- closest_line_sine(bp = fuzzy, genome = 'hg19', cores = n_cores)
  cat("Finding distance to closest SV... \n")
  # seq_len() keeps the row loops empty when a table has no rows (1:0 would
  # visit rows 1 and 0).
  nearest_sv_dist <- rbindlist(mclapply(seq_len(nrow(line_sine)), find_closest_sv, line_sine, mc.cores = n_cores))
  cat("done. \n")
  cat("Finding no of SVs in 5Mbp window... \n")
  sv_annotated <- rbindlist(mclapply(seq_len(nrow(nearest_sv_dist)), count_sv_5mbp, nearest_sv_dist, mc.cores = n_cores))
  cat("done. \n")
  cat("Adding replication timing info... \n")
  reptime_added <- rbindlist(mclapply(seq_len(nrow(sv_annotated)), rep_time, sv_annotated, reptimedata = reptimedata_hg19, mc.cores = n_cores))
  cat("done. \n")
  #all_features <- sv_annotations(bp = line_sine, cores = n_cores)
  #write.table(all_features, output_path, sep = '\t', row.names = F, col.names = T, quote = F)
  return(reptime_added)
}
# Print the first five rows of the filtered table.
chk[1:5]
# Add the per-SV features computed by the helpers from processing_scripts.R
# to a BEDPE table.
#
# file:    data.table of breakpoint pairs with chrom1 / chrom2 columns.
# n_cores: passed to the helpers as `cores` and to mclapply() as mc.cores.
#
# Steps: remove "chr" from chrom1/chrom2, then closest_germline() and
# closest_line_sine() on the whole table, followed by find_closest_sv(),
# count_sv_5mbp() and rep_time() applied row by row; each step works on the
# output of the previous one. All genome-aware helpers are called with
# genome = 'hg19'. Returns the table produced by the last step.
run_it <- function(file, n_cores) {
  #sample.name <- unlist(strsplit(lof_pth, "filtered_only/"))[2]
  #print(sample.name) #added this line
  #output_path <- paste0('/xchip/beroukhimlab/wolu/testing_svaba/outputs/modified_fuzzy_and_features/modified_features/',sample.name,'_fuzzy.bedpe')
  # Work on a copy: := below would otherwise rewrite chrom1/chrom2 in the
  # caller's table by reference.
  bedpe <- copy(file)
  bedpe[, chrom1 := gsub('chr','',chrom1)]
  bedpe[, chrom2 := gsub('chr','',chrom2)]
  fuzzy <- closest_germline(bp = bedpe, cores = n_cores, genome = 'hg19')
  line_sine <- closest_line_sine(bp = fuzzy, genome = 'hg19', cores = n_cores)
  cat("Finding distance to closest SV... \n")
  # seq_len() keeps the row loops empty when a table has no rows (1:0 would
  # visit rows 1 and 0).
  nearest_sv_dist <- rbindlist(mclapply(seq_len(nrow(line_sine)), find_closest_sv, line_sine, mc.cores = n_cores))
  cat("done. \n")
  cat("Finding no of SVs in 5Mbp window... \n")
  sv_annotated <- rbindlist(mclapply(seq_len(nrow(nearest_sv_dist)), count_sv_5mbp, nearest_sv_dist, mc.cores = n_cores))
  cat("done. \n")
  cat("Adding replication timing info... \n")
  reptime_added <- rbindlist(mclapply(seq_len(nrow(sv_annotated)), rep_time, sv_annotated, genome = 'hg19', mc.cores = n_cores))
  cat("done. \n")
  #all_features <- sv_annotations(bp = line_sine, cores = n_cores)
  #write.table(all_features, output_path, sep = '\t', row.names = F, col.names = T, quote = F)
  return(reptime_added)
}
# Try run_it() on the first five rows, then step through a nearest-SV
# distance calculation by hand for one row.
chk2 <- run_it(chk[1:5],n_cores = 1)
View(chk)
row_l <- chk[3,]
View(row_l)
i<- 3
bedpe_l <- chk[1:5,]
row_l <- bedpe_l[i,] #the particular row of the annotated bedpe file we are working with
cat(i,'\n')
###create the reference dt here with the row in question removed
bedpe_l_wo_row <- bedpe_l[-i,]
# Split the remaining events into same-chromosome and cross-chromosome ones.
sample_intra_events <- bedpe_l_wo_row[chrom1 == chrom2,]
sample_inter_events <- bedpe_l_wo_row[chrom1 != chrom2,]
# Intra events contribute one interval (start1 to start2); inter events
# contribute one interval per breakend.
ref_intra <- data.table(sample_intra_events$chrom1,sample_intra_events$start1,sample_intra_events$start2)
ref_inter1 <-  data.table(sample_inter_events$chrom1,sample_inter_events$start1,sample_inter_events$end1)
ref_inter2 <- data.table(sample_inter_events$chrom2,sample_inter_events$start2,sample_inter_events$end2)
sample_ref_dt <- rbind(ref_intra,ref_inter1,ref_inter2) #this is a reference table of SVs that lacks the row in question
colnames(sample_ref_dt) <- c('seqnames','start','end')
# Width-1 ranges at the two breakpoints of the row under test.
row_l_str1 <- GRanges(row_l$chrom1, IRanges(as.numeric(as.character(row_l$start1)),width=1))
row_l_str2 <- GRanges(row_l$chrom2, IRanges(as.numeric(as.character(row_l$start2)),width=1))
row_l$chrom1 == row_l$chrom2
### both bedpe and ref should be sorted so lower bkpt comes first
ref_sub <- sample_ref_dt[seqnames == row_l$chrom1] #includes all events affecting that chromsome
nrow(ref_sub) == 0
ref_sub_ranges <- makeGRangesFromDataFrame(ref_sub, seqnames.field="seqnames", start.field="start", end.field ="end")
# %&% is not defined in this history; presumably the gUtils operator for
# overlapping ranges — confirm.
check_str1_overlap <- ref_sub_ranges %&% row_l_str1
check_str2_overlap <- ref_sub_ranges %&% row_l_str2
length(check_str1_overlap)>=1
# Distance is 0 when the breakpoint overlaps a reference interval, otherwise
# the smallest absolute distance to any interval start or end.
ref_sub[,str1_dist_l := ifelse(length(check_str1_overlap)>=1, 0, min(abs(start - as.numeric(as.character(row_l$start1))), abs(end - as.numeric(as.character(row_l$start1)))))] #automatically returns the min value
View(ref_sub)
# Check the individual pieces of the expression above; the retry below wraps
# start / end in as.numeric(), presumably because those columns are not
# numeric in ref_sub — confirm.
as.numeric(as.character(row_l$start1))
ref_sub$start-as.numeric(as.character(row_l$start1))
as.numeric(ref_sub$start)-as.numeric(as.character(row_l$start1))
View(bedpe_l)
ref_sub[,str1_dist_l := ifelse(length(check_str1_overlap)>=1, 0, min(abs(as.numeric(start) - as.numeric(as.character(row_l$start1))), abs(as.numeric(end) - as.numeric(as.character(row_l$start1)))))] #automatically returns the min value
View(ref_sub)
# Reload the helper scripts and retry the five-row run.
source('./scripts/processing_scripts.R') #store all source scripts for pre-processing
chk2 <- run_it(chk[1:5],n_cores = 1)
View(chk2)
# Load a previously annotated file for the same sample as a reference, then
# try the gene/exon annotation and TP53 steps on the five-row result.
annot_file_2984 <- fread('Z:/wolu/testing_svaba/outputs/20240327_svmupdates/updated_annot/2984_combined_germ_soma_test.bedpe.filtered_fuzzy.bedpe.annot.bedpe.more.bedpe.bedpe')
View(annot_file_2984)
# Attempts at building the chromosome-name vector; the third form is the one
# reused below.
as.character(c(1:22),'X','Y')
as.character(c(1:22),c('X','Y'))
c(as.character(c(1:22)),'X','Y')
#reptimedata_hg38 = readRDS('./data/reptime.hg38.rds') #unfortunately, we haven't processed a hg38 equivalent of the replication timing reference file.
cohort_metadata <- fread("./data/20240417_cohort_metadata.csv")
View(cohort_metadata)
# First pass on chk: keep only rows whose two chromosomes are both in 1-22,
# X, Y.
tmp<- chk
chroms <-  c(as.character(c(1:22)),'X','Y')
tmp$chrom1 <- as.character(tmp$chrom1)
tmp$chrom2 <- as.character(tmp$chrom2)
store_indices = which(!tmp$chrom1 %in% chroms| !tmp$chrom2 %in% chroms) #done to address the observation that some SVs were mapping to hpv
# NOTE(review): store_indices holds integer row numbers; `!` on it in i
# relies on data.table treating a leading `!` as "all rows except these" —
# confirm tmp is a data.table here.
bedpe_clean <- tmp[!store_indices,]
bedpe_clean$start1 <- as.numeric(bedpe_clean$start1)
bedpe_clean$start2 <- as.numeric(bedpe_clean$start2)
View(tmp)
View(chk2)
# Second pass: same cleaning on chk2, the output of run_it().
tmp<- chk2
chroms <-  c(as.character(c(1:22)),'X','Y')
tmp$chrom1 <- as.character(tmp$chrom1)
tmp$chrom2 <- as.character(tmp$chrom2)
store_indices = which(!tmp$chrom1 %in% chroms| !tmp$chrom2 %in% chroms) #done to address the observation that some SVs were mapping to hpv
bedpe_clean <- tmp[!store_indices,]
bedpe_clean$start1 <- as.numeric(bedpe_clean$start1)
bedpe_clean$start2 <- as.numeric(bedpe_clean$start2)
source('./scripts/processing_scripts.R') #store all source scripts for pre-processing
View(bedpe_clean)
# Annotate each row with annot_geneexon() and compare with the reference
# file's CN_annot / exon_annot columns.
bedpe_annot <- rbindlist(mclapply(1:nrow(bedpe_clean), annot_geneexon, bedpe_clean, genome='hg19',mc.cores = 1))
View(bedpe_annot)
View(annot_file_2984)
head(annot_file_2984$CN_annot,5)
head(annot_file_2984$exon_annot,5)
# check_tp53() is re-run after each re-source of the helper scripts.
tp53_added <- check_tp53(bedpe_annot)
source('./scripts/processing_scripts.R') #store all source scripts for pre-processing
tp53_added <- check_tp53(bedpe_annot)
View(tp53_added)
# Look up the sample's mut_status in the cohort metadata by the part of
# `name` before the first ".".
filename <- unlist(strsplit(bedpe_annot$name[1],"[.]"))[1]
mut_status <- cohort_metadata$mut_status[which(cohort_metadata$uid == filename)]
View(cohort_metadata)
source('./scripts/processing_scripts.R') #store all source scripts for pre-processing
tp53_added <- check_tp53(bedpe_annot)
View(tp53_added)
# Produce the full filtered table for the complete run below.
filt_file <- merge_filter(file_path)
# Add the per-SV features computed by the helpers from processing_scripts.R
# to a BEDPE table.
#
# file:    data.table of breakpoint pairs with chrom1 / chrom2 columns.
# n_cores: passed to the helpers as `cores` and to mclapply() as mc.cores.
#
# Steps: remove "chr" from chrom1/chrom2, then closest_germline() and
# closest_line_sine() on the whole table, followed by find_closest_sv(),
# count_sv_5mbp() and rep_time() applied row by row; each step works on the
# output of the previous one. All genome-aware helpers are called with
# genome = 'hg19'. Returns the table produced by the last step.
run_it <- function(file, n_cores) {
  # Work on a copy: := below would otherwise rewrite chrom1/chrom2 in the
  # caller's table by reference.
  bedpe <- copy(file)
  bedpe[, chrom1 := gsub('chr','',chrom1)]
  bedpe[, chrom2 := gsub('chr','',chrom2)]
  fuzzy <- closest_germline(bp = bedpe, cores = n_cores, genome = 'hg19')
  line_sine <- closest_line_sine(bp = fuzzy, genome = 'hg19', cores = n_cores)
  cat("Finding distance to closest SV... \n")
  # seq_len() keeps the row loops empty when a table has no rows (1:0 would
  # visit rows 1 and 0).
  nearest_sv_dist <- rbindlist(mclapply(seq_len(nrow(line_sine)), find_closest_sv, line_sine, mc.cores = n_cores))
  cat("done. \n")
  cat("Finding no of SVs in 5Mbp window... \n")
  sv_annotated <- rbindlist(mclapply(seq_len(nrow(nearest_sv_dist)), count_sv_5mbp, nearest_sv_dist, mc.cores = n_cores))
  cat("done. \n")
  cat("Adding replication timing info... \n")
  reptime_added <- rbindlist(mclapply(seq_len(nrow(sv_annotated)), rep_time, sv_annotated, genome = 'hg19', mc.cores = n_cores))
  cat("done. \n")
  return(reptime_added)
}
# Full run on the filtered table: feature annotation, chromosome clean-up,
# gene/exon annotation and TP53 status.
tmp <- run_it(file = filt_file, n_cores = 1)
# Keep only rows whose two chromosomes are both in 1-22, X, Y.
chroms <-  c(as.character(c(1:22)),'X','Y')
tmp$chrom1 <- as.character(tmp$chrom1)
tmp$chrom2 <- as.character(tmp$chrom2)
store_indices = which(!tmp$chrom1 %in% chroms| !tmp$chrom2 %in% chroms) #done to address the observation that some SVs were mapping to hpv
# NOTE(review): store_indices holds integer row numbers; `!` on it in i
# relies on data.table treating a leading `!` as "all rows except these" —
# confirm tmp is a data.table here.
bedpe_clean <- tmp[!store_indices,]
bedpe_clean$start1 <- as.numeric(bedpe_clean$start1)
bedpe_clean$start2 <- as.numeric(bedpe_clean$start2)
cat("Performing annotation...")
# NOTE(review): 1:nrow() iterates over rows 1 and 0 if bedpe_clean is empty.
bedpe_annot <- rbindlist(mclapply(1:nrow(bedpe_clean), annot_geneexon, bedpe_clean, genome='hg19',mc.cores = 1))
cat("done. \n")
cat("Checking TP53 status...")
tp53_added <- check_tp53(bedpe_annot)
cat("done. \n")
View(tp53_added)