forked from PapenfussLab/sv_benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgasv2vcf.py
116 lines (108 loc) · 3.78 KB
/
gasv2vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
#
# converts GASVPro output into VCF
#
import sys
print """##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CILEN,Number=2,Type=Integer,Description="Confidence interval around the length of the inserted material between breakends">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=LOCALISATION,Number=1,Type=Float,Description="Localisation">
##INFO=<ID=PRS,Number=1,Type=Float,Description="Num PRS">
##INFO=<ID=LLR,Number=1,Type=Float,Description="LogLikelihoodRatio">
#CHROM POS ID REF ALT QUAL FILTER INFO"""
lookup = {}
for line in open(sys.argv[1]):
chrIndex = str.strip(line).split(' ')
lookup[chrIndex[1]] = chrIndex[0]
for line in sys.stdin:
if line[0] == '#': continue
#Cluster_ID: LeftChr: LeftBreakPoint: RightChr: RightBreakPoint: Num PRS: Localization: Type: LogLikelihoodRatio:
input = line.split(' ')
clusterid = input[0]
leftChr = input[1]
leftStart = int(input[2].split(',')[0])
leftEnd = int(input[2].split(',')[1])
rightChr = input[3]
rightStart = int(input[4].split(',')[0])
rightEnd = int(input[4].split(',')[1])
numprs = input[5]
localisation = float(input[6])
type = input[7]
llr = float(input[8])
startMidPos = (leftEnd + leftStart) / 2
endMidPos = (rightEnd + rightStart) / 2
svLen = endMidPos - startMidPos
if type[0] == 'D':
svType = "DEL"
svLen = -1 * svLen
if type[0] == 'I':
svType = "INS"
if leftChr == rightChr:
print "{0} {1} {2} . . {3} PASS IMPRECISE;END={6};SVLEN={5};SVTYPE={4};CIPOS={7},{8};CIEND={9},{10};LOCALISATION={11};PRS={12};LLR={13}".format(
lookup[leftChr],
startMidPos,
clusterid,
numprs,
svType,
svLen,
endMidPos,
leftStart - startMidPos,
leftEnd - startMidPos,
rightStart - endMidPos,
rightEnd - endMidPos,
localisation,
numprs,
llr
)
else:
print "Unhandled interchromsomal event: " + input
# Output format before convertClusters has been run
# for line in sys.stdin:
# if line[0] == '#': continue
# #Cluster_ID: Num PRS: Localization: Type: List of PRS: LeftChr: RightChr: Boundary Points: Log_Likelihood_Ratio:
# input = line.split(' ')
# clusterid = input[0]
# pairCount = input[1]
# localisation = float(input[2])
# type = input[3]
# pairList = input[4]
# leftChr = input[5]
# rightChr = input[6]
# boundaryPoints = map(int, input[7].split(','))
# logLikelihood = float(input[8])
# # transform
# points = sorted(boundaryPoints)
# lenp = len(points)
# halflenp = lenp / 2
# startMidPos = sum(points[0:halflenp]) / halflenp
# endMidPos = sum(points[halflenp:lenp]) / halflenp
# svLen = endMidPos - startMidPos
# if type[0] == 'D':
# svType = "DEL"
# svLen = -1 * svLen
# if type[0] == 'I':
# svType = "INS"
# if leftChr == rightChr:
# print "{0} {1} {2} . . {3} PASS IMPRECISE;END={6};SVLEN={5};SVTYPE={4};CIPOS={7},{8};CIEND={9},{10};CILEN={11},{12};LOCALISATION={13}".format(
# lookup[leftChr],
# startMidPos,
# clusterid,
# logLikelihood / 10,
# svType,
# svLen,
# endMidPos,
# points[0] - startMidPos,
# points[halflenp-1] - startMidPos,
# points[halflenp] - endMidPos,
# points[lenp-1] - endMidPos,
# points[halflenp] - points[halflenp-1],
# points[lenp-1] - points[0],
# localisation
# )
# else:
# print "Unhandled interchromsomal event: " + input