-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrna_seq_basics.py
125 lines (104 loc) · 4.21 KB
/
rna_seq_basics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 26 12:02:37 2023
@author: user
"""
import pandas as pd
def read_expression_file(filename):
"""
Reads an RNA-seq expression file and returns a pandas DataFrame with gene expression values for each sample.
"""
df = pd.read_csv(filename, sep="\t", index_col=0)
return df
def filter_genes_by_expression(df, min_mean_expression=0, min_fraction_samples=0):
"""
Filters genes based on mean expression and the fraction of samples in which the gene is expressed.
"""
mean_expression = df.mean(axis=1)
fraction_samples_expressed = (df > 0).mean(axis=1)
mask = (mean_expression >= min_mean_expression) & (fraction_samples_expressed >= min_fraction_samples)
return df.loc[mask]
import pybedtools
import dexseq_prepare_annotation as dpa
import dexseq_count as dc
import dexseq_compare as dcomp
import subprocess
def prepare_annotation(annotation_file, output_dir):
"""
Runs dexseq_prepare_annotation.py to prepare the annotation file for use with DEXSeq.
"""
command = f"dexseq_prepare_annotation.py {annotation_file} {output_dir}"
subprocess.run(command, shell=True)
def count_reads(bam_file, annotation_file, output_dir):
"""
Runs dexseq_count.py to count reads overlapping exons in the annotation file.
"""
bam = pybedtools.BedTool(bam_file)
exons = dpa.AnnotationExons(annotation_file)
counts = dc.count_reads(bam, exons, output_dir)
return counts
def compare_conditions(counts1, counts2, output_dir, design_file):
"""
Runs dexseq_compare.py to identify differentially spliced exons between two conditions.
"""
result = dcomp.DexSeqComparison(counts1, counts2, output_dir, design_file)
return result
"""import subprocess"""
def run_rmats(bam_file1, bam_file2, output_dir):
"""
Runs rMATS to identify alternative splicing events between two samples.
"""
command = f"python rMATS/rMATS-turbo-Linux-UCS4/rmats.py " \
f"--b1 {bam_file1} " \
f"--b2 {bam_file2} " \
f"--gtf rMATS/gencode.v38.annotation.gtf " \
f"--od {output_dir}"
subprocess.run(command, shell=True)
def plot_rmats_event(event_id, output_file):
"""
Plots the inclusion levels of alternative splicing events identified by rMATS.
"""
command = f"python rMATS/rMATS-turbo-Linux-UCS4/rmats2sashimiplot.py " \
f"-b1 rMATS/{event_id}.bam " \
f"-b2 rMATS/{event_id}.bam " \
f"-t {event_id} " \
f"-e rMATS/{event_id}.txt " \
f"-o {output_file}"
subprocess.run(command, shell=True)
import pysam
"""import pybedtools"""
def calculate_psi(bam_file, annotation_file):
"""
Calculates the percent spliced in (PSI) value for each alternative splicing event in a BAM file.
"""
# Load the annotation file into a BedTool object
annotation = pybedtools.BedTool(annotation_file)
# Open the BAM file using pysam
bam = pysam.AlignmentFile(bam_file, "rb")
# Loop over each alternative splicing event in the annotation file
for feature in annotation:
# Extract the reads overlapping the event
reads = bam.fetch(feature.chrom, feature.start, feature.end)
# Count the number of reads supporting each splice variant
left_count = right_count = 0
for read in reads:
if read.is_unmapped or read.is_secondary:
continue
if read.is_reverse:
if read.pos < feature.start:
right_count += 1
elif read.aend > feature.end:
left_count += 1
else:
if read.pos < feature.start:
left_count += 1
elif read.aend > feature.end:
right_count += 1
# Calculate the PSI value
total_count = left_count + right_count
if total_count == 0:
psi = 0
else:
psi = 100.0 * right_count / total_count
# Print the result for this event
print(f"{feature.name}\t{psi}")