-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathvcf_v42.ragel
265 lines (214 loc) · 12.5 KB
/
vcf_v42.ragel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/**
* Copyright 2014-2017 EMBL - European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
%%{
machine vcf_v42;
# Grammar of the VCF v4.2 format. The include below pulls in the machine named vcf from
# vcf.ragel; names used but not defined in this file (NL, CS, identifier, url, token_begin,
# the various *_err actions, ...) are presumably provided by that included machine.
include vcf "vcf.ragel";
#############################################
# Actions definition #
#############################################
########## Incorrect metadata and header actions ##########
# Fileformat line
# Error action attached with $err to fileformat_name: runs when the input stops matching
# the literal VCFv4.2. It hands a FileformatError carrying n_lines to
# ErrorPolicy::handle_error, then fhold leaves the current character unconsumed and fgoto
# transfers control to meta_section_skip, which discards the rest of the line.
action fileformat_error {
ErrorPolicy::handle_error(*this,
new FileformatError{n_lines, "The fileformat declaration is not 'fileformat=VCFv4.2'"});
fhold; fgoto meta_section_skip;
}
# FORMAT metadata
# Error action attached with $err to the Number value of a FORMAT meta line (see meta_format).
# Reports a MetaSectionError carrying n_lines through ErrorPolicy::handle_error; fhold leaves
# the current character unconsumed and fgoto continues in meta_section_skip.
action meta_format_number_err {
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G or dot"});
fhold; fgoto meta_section_skip;
}
# INFO metadata
# Error action attached with $err to the Number value of an INFO meta line (see meta_info).
# Reports a MetaSectionError carrying n_lines through ErrorPolicy::handle_error; fhold leaves
# the current character unconsumed and fgoto continues in meta_section_skip.
action meta_info_number_err {
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "INFO metadata Number is not a number, A, R, G or dot"});
fhold; fgoto meta_section_skip;
}
########## Incorrect records actions ##########
# Format
# Error action attached with $err to the FORMAT column of a record (see record).
# Reports a FormatBodyError carrying n_lines through ErrorPolicy::handle_error; unlike the
# metadata actions it resumes in body_section_skip, so parsing restarts at the next record.
action format_error {
ErrorPolicy::handle_error(*this, new FormatBodyError{n_lines});
fhold; fgoto body_section_skip;
}
#################################################
# Machine definition #
#################################################
## A chromosome must be a string with no white-spaces, and may be surrounded by < > symbols (for contigs)
# chrom_basic: one alphanumeric character, then any run of alphanumerics or punctuation
# other than angle brackets and comma.
chrom_basic = alnum ( alnum | punct - ( '<' | '>' | ',' ) )* ;
# chrom_contig: a chrom_basic name wrapped in angle brackets.
chrom_contig = '<' chrom_basic '>' ;
# chromosome: either form; used by the breakend rules in record_alt_sv.
chromosome = chrom_basic | chrom_contig ;
# File format line
# fileformat_name accepts only the literal VCFv4.2; a mismatch in any of its states runs
# fileformat_error. token_begin, token_middle and fileformat_end are not defined in this
# file (presumably they come from the included vcf.ragel).
fileformat_name = 'VCFv4.2' >token_begin $token_middle %fileformat_end $err(fileformat_error) ;
# The content of the first line: the ##fileformat= prefix followed by the version name.
fileformat = '##fileformat=' fileformat_name ;
# Meta-data
# meta_typeid: key of a generic meta line. Any non-empty run of printable characters without
# an equals sign, minus the reserved keys that have dedicated rules in meta_entry.
meta_typeid = ( ( print - '=' )+ - ('ALT' | 'FILTER' | 'FORMAT' | 'INFO' | 'assembly' | 'contig' | 'SAMPLE' | 'PEDIGREE' | 'pedigreeDB') ) >token_begin @token_middle %meta_typeid_generic ;
# meta_key: key inside a generic <...> entry. Alphanumerics, dot, underscore and hyphen are
# allowed, but strings made only of punctuation are subtracted.
meta_key = ( (alnum | '.' | '_' | '-' )+ - (punct)+ ) >token_begin @token_middle %meta_key_generic;
## A value can be expressed in multiple ways:
## - The line is a single key-value pair (meta_value): the value cannot start with an angle bracket, quote or linebreak, but it is free text thereafter
## - One of multiple key-value pairs, non-quoted (meta_field_value): the value cannot contain a comma -end of field-, or a closing angle bracket -end of entry-, or quote
## - One of multiple key-value pairs, quoted (meta_field_desc): the value can contain any character except for non-escaped quotes
meta_value = ((print - ('<' | '"' | NL)) (print)*) >token_begin $token_middle %token_end ;
meta_field_value= (print - (',' | '>' | '"'))+ >token_begin $token_middle %token_end ;
# meta_field_desc may be empty (star repetition); a backslash followed by any printable
# character is accepted as a unit, which is how an escaped quote gets through.
meta_field_desc = (('\\' print) | (print - '"'))* >token_begin $token_middle %token_end ;
# meta_values: a quoted description or a bare field value.
meta_values = ('"' meta_field_desc '"') | meta_field_value ;
# meta_field: one key=value pair of a generic entry.
meta_field = meta_key '=' meta_values ;
# meta_field_num: the Number attribute, i.e. a run of digits or one of A, R, G, dot.
meta_field_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' ) >token_begin @token_middle %token_end ;
# meta_field_type: the Type attribute; syntactically any run of letters is accepted here.
meta_field_type = (alpha)+ >token_begin @token_middle %token_end ;
# Bodies of the typed meta entries, i.e. the text between =< and > (or after = for assembly).
# In each rule the % leaving actions and $err error actions bind to the expression directly
# to their left, so every attribute carries its own error action.
# meta_alt: ID (an alt_id) and a quoted Description, then optional extra key="text" pairs.
meta_alt = 'ID=' %meta_id alt_id >token_begin @token_middle %token_end $err(meta_alt_id_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_assembly: a bare url.
meta_assembly = url $err(meta_url_err) ;
# meta_contig: ID is a chrom_basic name; further key=value pairs are optional and their
# values may be quoted or unquoted (meta_values).
meta_contig = 'ID=' %meta_id chrom_basic >token_begin $token_middle %token_end $err(meta_id_err)
(',' identifier '=' meta_values)* ;
# meta_filter: ID and a quoted Description, then optional extra key="text" pairs.
meta_filter = 'ID=' %meta_id identifier $err(meta_id_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_format: ID, Number, Type and Description in this fixed order, then optional extras.
# NOTE(review): the Type attribute reuses meta_info_type_err, so a bad FORMAT Type is
# presumably reported with the INFO wording - confirm whether vcf.ragel offers a
# FORMAT-specific action that should be used instead.
meta_format = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_num $err(meta_format_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_info: same layout as meta_format, with the INFO-specific Number error action.
meta_info = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_num $err(meta_info_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_pedigree: one or more comma-separated identifier=identifier pairs.
meta_pedigree = identifier $err(meta_id_err) '=' identifier $err(meta_id_err)
(',' identifier $err(meta_id_err) '=' identifier $err(meta_id_err))* ;
# meta_pedigreeDB: a bare url (the surrounding angle brackets are added in meta_entry).
meta_pedigreeDB = url $err(meta_url_err) ;
## TODO The error shown when 'Description' is missing shows that Mixture is not a valid string
# meta_sample: ID followed by Genomes, Mixture and Description, all mandatory and in this order.
meta_sample = 'ID=' %meta_id identifier $err(meta_id_err)
(',Genomes=' %meta_genomes meta_field_value) $err(meta_sample_genomes_err)
(',Mixture=' %meta_mixture meta_field_value) $err(meta_sample_mixture_err)
(',Description=' %meta_description '"' meta_field_desc '"') $err(meta_desc_err) ;
# meta_entry: one complete meta line without its line break. After the ## prefix comes either
# a reserved key with its dedicated body, or a generic key (meta_typeid) with a free-form value.
# Each reserved alternative fires its own meta_typeid_* leaving action once the key has been
# read and carries a type-specific $err action; assembly is the only reserved key whose value
# is not wrapped in angle brackets. meta_entry_end runs on leaving any alternative.
meta_entry = '##' (
('ALT' %meta_typeid_alt '=<' meta_alt '>' ) $err(meta_alt_err) |
('FILTER' %meta_typeid_filter '=<' meta_filter '>' ) $err(meta_filter_err) |
('FORMAT' %meta_typeid_format '=<' meta_format '>' ) $err(meta_format_err) |
('INFO' %meta_typeid_info '=<' meta_info '>' ) $err(meta_info_err) |
('assembly' %meta_typeid_assembly '=' meta_assembly ) $err(meta_assembly_err) |
('contig' %meta_typeid_contig '=<' meta_contig '>' ) $err(meta_contig_err) |
('SAMPLE' %meta_typeid_sample '=<' meta_sample '>' ) $err(meta_sample_err) |
('PEDIGREE' %meta_typeid_pedigree '=<' meta_pedigree '>' ) $err(meta_pedigree_err) |
('pedigreeDB' %meta_typeid_pedigreedb '=<' meta_pedigreeDB '>' ) $err(meta_pedigreedb_err) |
(meta_typeid '='
(
# Generic value forms: <k=v,...> list, "quoted text", <"quoted text">, or plain text
('<' meta_field (',' meta_field)* '>') |
('"' meta_field_desc '"') |
('<"' meta_field_desc '">') |
meta_value
)
)
) %meta_entry_end;
# Header between meta and records
# sample_name: any non-empty run of printable characters that is neither NL nor CS;
# sample_name_end runs each time a name is left.
sample_name = (print - (NL | CS))+ >token_begin @token_middle %sample_name_end;
# header: the eight mandatory column titles separated by CS (header_prefix_err covers that
# prefix), optionally followed by FORMAT and at least one sample name.
header = ('#CHROM' CS 'POS' CS 'ID' CS 'REF' CS 'ALT' CS 'QUAL' CS 'FILTER' CS 'INFO') $err(header_prefix_err)
(CS 'FORMAT' (CS sample_name)+ )? %header_end ;
# Records
# position: a natural number as defined by nat_number (not defined in this file).
position = nat_number ;
# record_chrom: CHROM column. When the name is wrapped in angle brackets the token actions
# are attached to the inner chrom_basic only, so the brackets lie outside the token actions.
record_chrom = chrom_basic >token_begin $token_middle %token_end |
'<' chrom_basic >token_begin $token_middle %token_end '>' ;
# record_position: POS column.
record_position = position >token_begin @token_middle %token_end ;
## ID must be a (list of) string with no white-spaces or semi-colons
# record_id: semicolon-separated record_id_value items, or the empty-field marker.
record_id = (record_id_value (';' record_id_value)*) | record_empty_field ;
# record_ref: REF column, a sequence matched by bases.
record_ref = bases >token_begin @token_middle %token_end ;
## Structural variants follow forms like:
## ]1:1234]ATG or ]<contig_1>:1234]ATG : paired breakends
## .AGT, AGT.: single breakends
# The four paired forms put the bracketed chromosome:position mate before or after the bases,
# using the same bracket character on both sides of the mate; the last two alternatives are
# the single breakends with a leading or trailing dot.
record_alt_sv = ']' chromosome ':' position ']' bases |
'[' chromosome ':' position '[' bases |
bases ']' chromosome ':' position ']' |
bases '[' chromosome ':' position '[' |
'.' bases |
bases '.' ;
## Main alternate allele rule
# One alternate allele of any supported kind; the token actions wrap the whole allele.
# record_alt_snv, record_alt_indel and record_alt_other are not defined in this file.
record_alt_data = ( record_alt_snv |
record_alt_indel |
record_alt_sv |
record_alt_other ) >token_begin $token_middle %token_end ;
# record_alt: ALT column, a comma-separated allele list or the empty-field marker.
record_alt = (record_alt_data (',' record_alt_data)*) | record_empty_field ;
# record_qual: QUAL column, a number matched by any_number or a single dot.
record_qual = (any_number | '.') >token_begin $token_middle %token_end ;
# record_filter: FILTER column, semicolon-separated record_fil_value items or the empty-field marker.
record_filter = (record_fil_value (';' record_fil_value)*) | record_empty_field ;
# info_key: a letter or underscore followed by letters, digits, underscores or dots;
# the literal 1000G is accepted as a special case even though it starts with a digit.
info_key = (( (alpha | '_') (alpha | digit | '_' | '.')*) | '1000G' ) ;
# info_value: printable characters other than whitespace and semicolon. Commas are not
# excluded, so a comma-separated list also matches as a single info_value.
info_value = (print - (space | ';'))+ ;
info_value_list = info_value (',' info_value)* ;
# info_entry: either key=value-list or a bare key (flag); each part has its own error action
# and the token actions wrap the whole entry.
info_entry = (
info_key $err(info_key_error) '=' info_value_list $err(info_value_error) |
info_key $err(info_key_error)
) >token_begin $token_middle %token_end ;
# record_info: INFO column, semicolon-separated entries or a single dot.
record_info = ( info_entry (';' info_entry)* ) |
( '.' >token_begin @token_middle %token_end ) ;
## Accepting non-alphanumeric characters is an addition in v4.3, but widely used in already existing files
# NOTE(review): the rule below accepts alphanumeric characters only, which does not match
# the remark above - confirm which of the two reflects the intended behaviour.
format_value = (alnum)+ >token_begin $token_middle %token_end ;
# record_format: FORMAT column, colon-separated format keys.
record_format = format_value (':' format_value)* ;
## In a sample, if a genotype is present it must be the first field
# record_sample: one sample column. Either a genotype optionally followed by a colon and
# sample_values, or sample_values alone; genotype and sample_values are not defined in this file.
record_sample = (
genotype $err(genotype_error) (':' sample_values)? |
sample_values
) >token_begin $token_middle %token_end ;
# record: one data line without its line break. The eight mandatory columns are separated by
# CS; each column runs record_field_end when it is left and has a column-specific error action.
# The FORMAT column is optional, but when present it must be followed by at least one sample
# column. record_end runs when the whole record is left.
record = (
record_chrom %record_field_end $err(chrom_error)
CS record_position %record_field_end $err(pos_error)
CS record_id %record_field_end $err(id_error)
CS record_ref %record_field_end $err(ref_error)
CS record_alt %record_field_end $err(alt_error)
CS record_qual %record_field_end $err(qual_error)
CS record_filter %record_field_end $err(filter_error)
CS record_info %record_field_end $err(info_error)
(CS record_format %record_field_end $err(format_error)
(CS record_sample %record_field_end $err(sample_error))+ )?
) %record_end;
# File sections, each with a section-level error action:
# - fileformat_section: the fileformat line and its line break
# - meta_section: zero or more meta lines, each terminated by NL
# - header_section: the column header line and its line break
# - body_section: zero or more records separated by NL (no NL required after the last one)
fileformat_section = (fileformat NL) $err(fileformat_section_error);
meta_section = (meta_entry NL)* $err(meta_section_error);
header_section = (header NL) $err(header_section_error);
body_section = (record (NL record)*)? $err(body_section_error);
# Machine start (fileformat, then optional meta, header, then optional records)
# meta_section_end is an entering action on body_section only; any number of trailing
# line breaks is accepted after the body.
main := fileformat_section
meta_section
header_section
body_section >meta_section_end
(NL)* ;
# Error recovery machines that skip until the next line and restart the
# most appropriate section state.
# Both consume every character up to the line break; the inline action runs on the
# transition that completes NL and jumps back into the named section. They are the fgoto
# targets of the error actions defined at the top of this specification.
meta_section_skip := [^\n]* NL @{ fgoto meta_section; };
body_section_skip := [^\n]* NL @{ fgoto body_section; };
}%%
// Keeps the static data Ragel generates for the vcf_v42 machine local to this
// translation unit.
namespace
{
    %% write data;
}
namespace ebi
{
  namespace vcf
  {

    /**
     * Constructs the v4.2 parser implementation.
     *
     * Both arguments are forwarded unchanged to the ParserImpl base initializer;
     * the body then runs the initialization code Ragel generates for the
     * vcf_v42 machine.
     *
     * @param source forwarded to ParserImpl
     * @param additionalChecks forwarded to ParserImpl
     */
    template <typename Configuration>
    ParserImpl_v42<Configuration>::ParserImpl_v42(std::shared_ptr<Source> source,
                                                  AdditionalChecks additionalChecks)
    : ParserImpl{source, additionalChecks}
    {
        %% write init;
    }

    /**
     * Runs the Ragel-generated vcf_v42 state machine over one buffer.
     *
     * The parameter names are the ones the generated execution code reads, so
     * they must stay as they are.
     *
     * @param p   first character to process
     * @param pe  one past the last character of this buffer
     * @param eof end-of-input marker used by the generated code (by Ragel
     *            convention equal to pe for the final buffer - confirm against
     *            the caller)
     */
    template <typename Configuration>
    void ParserImpl_v42<Configuration>::parse_buffer(const char * p, const char * pe, const char * eof)
    {
        %% write exec;
    }

  } // namespace vcf
} // namespace ebi