-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter.py
71 lines (60 loc) · 2.05 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import sys, re, sets
import os
from Bio import SeqIO
import pysam
infile = pysam.Samfile(sys.argv[1], "rb")
fasta_file = sys.argv[2]
offset = int(sys.argv[3])
outfile = sys.argv[4]
filtfile = sys.argv[5]
oligoT = 8
priming_len = 16
tempname = "_" + sys.argv[1] + ".filter.temp"
tempfile = pysam.Samfile(tempname, "wb", template=infile)
filtered = pysam.Samfile(filtfile, "wb", template=infile)
fasta_seqs = {}
for seq_record in SeqIO.parse(fasta_file, "fasta"):
fasta_seqs[seq_record.id] = seq_record
total = 0
mapped = 0
fake = 0
unmapped = 0
low_quality = 0
AT_high = 0
total_reads = sets.Set()
mapped_reads = sets.Set()
for read in infile.fetch():
total += 1
total_reads.add(read.qname)
if read.is_unmapped:
unmapped += 1
elif read.mapq < 0:
low_quality += 1
filtered.write(read)
elif ((read.seq.count('A') + read.seq.count('T')) / float(len(read.seq)) >= 0.80):
AT_high += 1
filtered.write(read)
else:
chrx = infile.getrname(read.tid)
if read.is_reverse:
pos = read.pos + offset
seq = str(fasta_seqs[chrx].seq[pos + read.rlen: pos + read.rlen + priming_len])
else:
pos = read.pos - offset
seq = str(fasta_seqs[chrx].seq[pos - priming_len: pos].reverse_complement())
sense = ["+", "-"][read.is_reverse]
#if (seq.count('A') + seq.count('G')) / float(oligoT) < 0.75:
if seq[0:oligoT].count('A') < oligoT and (seq.count('A') / float(priming_len)) < 0.75:
tempfile.write(read)
mapped += 1
mapped_reads.add(read.qname)
else:
fake += 1
filtered.write(read)
tempfile.close()
infile.close()
filtered.close()
#pysam.sort(tempname, outfile)
#pysam.index(outfile + ".bam")
os.remove(tempname)
print"Total reads: " + str(len(total_reads)), "Mapped reads: " + str(len(mapped_reads)), "Total mappings: " + str(total), "Final mappings: " + str(mapped), "low quality mappings: " + str(low_quality), "AT_high mappings: " + str(AT_high), "fakes alignment: " + str(fake)