-
Notifications
You must be signed in to change notification settings - Fork 295
/
Copy pathhedy_grammar.py
348 lines (262 loc) · 13.2 KB
/
hedy_grammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import re
import warnings
from os import path
from functools import cache
import hedy_translation
"""
Because of the gradual nature of Hedy, the grammar of every level is just slightly different than the grammar of the
previous one. With the current approach every level only describes the grammar differences from its preceding level.
To get the grammar of level N, the grammar of level 1 is merged consecutively with the change definitions of all
levels up to N. To facilitate this approach, 2 features are added:
- Preprocessing rules appear in grammar rule definitions and allow for custom python logic to be applied before the
rule is merged. For example, `for:<needs_colon>` fetches the `for` rule from the base grammar and adds a colon at
the end of its definition. Another example is `if_error:<old_rule_to_error ifs>` which fetches the definition of
`ifs` from the base grammar. Preprocessors are also used to construct regular expressions which have to avoid
specific translated keywords, e.g. `elses: /([^\n ]| (?!<expand_keyword else>))+/` is transformed to
`elses: /([^\n ]| (?!else|ellers))+/` in Danish and `elses: /([^\n ]| (?!else|değilse))+/` in Turkish.
- Grammar merging operators, i.e. +=, -= and >>, allow adding parts to, removing parts from, and moving parts to the
end of a rule definition. For example, if level 1 contains the following definition `command: repeat | while` and
level 2 redefines the rule as `command: += for -= while >> repeat`, then the merged grammar of level 2 will have the
following definition `command: for | repeat`.
"""
@cache
def create_grammar(level, lang, skip_faulty):
    """Build the complete grammar text for the given level and language.

    The full level 1 grammar is taken as the base and the additions of levels 2..level are merged
    into it one after the other. When `skip_faulty` is truthy, the skip-faulty grammar for the level
    is merged in as well. The language is needed because some rules contain regular expressions that
    must avoid the translated keywords.

    The result is also written to the `grammars-Total` folder to ease debugging, and it is cached per
    (level, lang, skip_faulty) combination.
    """
    grammar = get_full_grammar_for_level(1)
    for current_level in range(2, level + 1):
        grammar = merge_grammars(grammar, get_additional_rules_for_level(current_level), lang)

    if skip_faulty:
        grammar = merge_grammars(grammar, read_skip_faulty_file(level), lang)

    # keywords and terminals never take part in merging, so they are simply appended at the end
    grammar = '\n'.join([grammar, get_keywords_for_language(lang), get_terminals()])

    # keep a copy of the final grammar on disk; this helps when debugging grammar issues
    save_total_grammar_file(level, grammar, lang)
    return grammar
def merge_grammars(grammar_text_1, grammar_text_2, lang):
    """Merge the grammar text `grammar_text_2` into the base grammar text `grammar_text_1`.

    Rules present in both grammars are combined with `merge_rules_operator`, rules that only exist in
    the base are kept as they are, and rules that only exist in the second grammar are added.
    Preprocessing rules of the second grammar are applied before merging. Rules reported as removed
    by the merge operators are dropped from the result. Returns the remaining rule lines, sorted and
    joined by newlines.
    """
    base = parse_grammar(grammar_text_1)
    additions = parse_grammar(grammar_text_2)
    apply_preprocessing_rules(additions, base, lang)

    merged_lines = []
    obsolete_names = []  # names of rules that the merge operators told us to drop
    for original in base.values():
        override = additions.get(original.name)
        if override is None:
            merged_lines.append(original.line)
            continue

        if original.value.strip() == override.value.strip():
            warnings.warn(f"The rule {original.name} is duplicated: {original.value}. Please check!")

        # resolves the merge operators (+=, -= and >>) used in the overriding definition
        merged_line, removed = merge_rules_operator(original.value, override.value,
                                                    original.name, override.line)
        obsolete_names.extend(removed)
        merged_lines.append(merged_line)

    merged_lines.extend(rule.line for rule in additions.values() if rule.name not in base)

    kept_lines = [line for line in merged_lines
                  if split_rule_name_and_value(line)[0] not in obsolete_names]
    return '\n'.join(sorted(kept_lines))
def read_file(*paths):
    """Return the UTF-8 decoded content of a file addressed relative to this module's folder.

    `paths` are the path segments that are joined onto the directory containing this file.
    """
    module_dir = path.abspath(path.dirname(__file__))
    target = path.join(module_dir, *paths)
    with open(target, "r", encoding="utf-8") as handle:
        content = handle.read()
    return content
def read_skip_faulty_file(level):
    """Return the content of the skip-faulty grammar that applies to the given level.

    Looks in the `grammars` folder next to this module for `skip-faulty-level{N}.lark`, starting
    with N = level and counting down to 1, and returns the content of the first file that exists.

    Raises:
        FileNotFoundError: when none of the candidate files exists. Without this check the function
            would return None and the failure would only surface later as an unrelated error when
            the result is parsed as a grammar.
    """
    grammars_dir = path.join(path.abspath(path.dirname(__file__)), 'grammars')
    for lvl in range(level, 0, -1):
        file_path = path.join(grammars_dir, f'skip-faulty-level{lvl}.lark')
        if path.isfile(file_path):
            with open(file_path, "r", encoding="utf-8") as file:
                return file.read()
    raise FileNotFoundError(
        f'No skip-faulty grammar file found in {grammars_dir} for level {level} or any lower level.')
def write_file(content, *paths):
    """Write `content` as UTF-8 text to a file addressed relative to this module's folder.

    `paths` are the path segments that are joined onto the directory containing this file.
    The file is opened in write mode, so existing content is replaced.
    """
    module_dir = path.abspath(path.dirname(__file__))
    target = path.join(module_dir, *paths)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(content)
def get_keywords_for_language(language):
    """Return the keyword grammar for `language`, falling back to English if it has no file."""
    localized_file = f'keywords-{language}.lark'
    try:
        keywords = read_file('grammars', localized_file)
    except FileNotFoundError:
        # no keyword file for this language, use the English keywords instead
        keywords = read_file('grammars', 'keywords-en.lark')
    return keywords
def get_terminals():
    """Return the content of `grammars/terminals.lark`."""
    terminals_text = read_file('grammars', 'terminals.lark')
    return terminals_text
def save_total_grammar_file(level, grammar, lang_):
    """Store the grammar text in `grammars-Total`, in a file named after the level and language."""
    file_name = f'level{level}.{lang_}-Total.lark'
    write_file(grammar, 'grammars-Total', file_name)
def get_additional_rules_for_level(level):
    """Return the content of the additions file of the level, `grammars/level{level}-Additions.lark`."""
    file_name = f'level{level}-Additions.lark'
    return read_file('grammars', file_name)
def get_full_grammar_for_level(level):
    """Return the content of the complete grammar file of the level, `grammars/level{level}.lark`."""
    file_name = f'level{level}.lark'
    return read_file('grammars', file_name)
def parse_grammar(grammar):
    """Parse a grammar text into a dictionary that maps rule names to their parsed rules.

    Every line is treated as one rule. Empty lines and lines starting with `/` are skipped.
    If a name occurs more than once, the last definition wins.
    """
    parsed = {}
    for raw_line in grammar.split('\n'):
        if not raw_line or raw_line.startswith('/'):
            continue
        rule = parse_grammar_rule(raw_line)
        parsed[rule.name] = rule
    return parsed
def parse_grammar_rule(line):
    """Turn one grammar line into a GrammarRule, including the preprocessors it refers to.

    A preprocessor reference has the form `<processor_name>` or `<processor_name argument>`, where
    `processor_name` is one of the keys of PREPROCESS_RULES.
    """
    processors = []
    for processor_name in PREPROCESS_RULES:
        pattern = fr'<({processor_name})( +[\w_]+)?>'
        for found_name, found_arg in re.findall(pattern, line):
            processors.append(RuleProcessor(found_name, found_arg))

    rule_name, rule_value = split_rule_name_and_value(line)
    return GrammarRule(line=line, name=rule_name, value=rule_value, processors=processors)
class GrammarRule:
    """Holds the parts of a parsed grammar rule that are needed while merging grammars.

    Attributes:
        line: the complete text of the rule.
        name_with_priority: the rule name as written, possibly with a priority suffix such as `.1`.
        name: the rule name without priority suffix and surrounding whitespace.
        value: the definition of the rule.
        processors: the preprocessors referenced by the rule, if any.
    """

    def __init__(self, line, name, value, processors=None):
        self.line = line
        self.name_with_priority = name
        self.name = strip_priority_suffix(name).strip()
        self.value = value
        self.processors = processors

    def apply_processors(self, base_grammar, lang):
        """Replace every preprocessor reference in the definition with the text the preprocessor yields.

        A preprocessor without an argument receives the name of this rule as its argument.
        Both `value` and `line` are updated; nothing happens if the rule has no preprocessors.
        """
        if not self.processors:
            return

        expanded = self.value
        for proc in self.processors:
            replacement = proc.func(arg=proc.arg or self.name, lang=lang, base_grammar=base_grammar)
            expanded = expanded.replace(proc.match_string, replacement)

        self.value = expanded
        self.line = f'{self.name_with_priority}:{expanded}'

    def __str__(self):
        return f'{self.name}:{self.value}'

    def __repr__(self):
        return self.__str__()
class RuleProcessor:
    """Describes one preprocessor reference, e.g. `<old_rule_to_error ifs>`, found in a grammar rule.

    Attributes:
        name: the name of the preprocessor, a key of PREPROCESS_RULES.
        arg: the argument of the preprocessor without surrounding whitespace; may be empty.
        match_string: the text of the reference, which gets replaced in the rule definition.
        func: the function registered for the preprocessor in PREPROCESS_RULES.
    """

    def __init__(self, name, arg):
        self.name = name
        self.arg = arg.strip()
        # built from the unstripped argument so that the spacing of the reference is retained
        self.match_string = '<{}{}>'.format(name, arg)
        self.func = PREPROCESS_RULES[name]
def split_rule_name_and_value(s):
    """Split a rule into its name and its definition at the first `:`.

    The definition keeps any further `:` it contains. If there is no `:` at all, the whole input
    is returned as both the name and the definition.
    """
    name, separator, definition = s.partition(':')
    if not separator:
        return s, s
    return name, definition
def apply_preprocessing_rules(grammar, base_grammar, lang):
    """Apply the preprocessors of every rule in `grammar`, modifying the rules in place."""
    for rule_name in grammar:
        grammar[rule_name].apply_processors(base_grammar, lang)
def get_rule_from_grammar(rule_name, grammar):
    """Return the rule called `rule_name` from `grammar`; raise an Exception if it is not defined."""
    if rule_name in grammar:
        return grammar[rule_name]
    raise Exception(f'There is a reference to rule {rule_name} but it is not in the base grammar.')
#
# Grammar rule preprocessing functions
#
def needs_colon(**kwargs):
    """Return the definition of a rule from the base grammar with `_COLON` inserted before its body.

    The colon is placed right in front of the first occurrence of `_EOL (_SPACE command)` in the
    definition.

    Keyword arguments:
        arg: the name of the rule to look up in the base grammar.
        base_grammar: the parsed base grammar, a mapping of rule names to rules.

    Raises:
        Exception: if the rule is not in the base grammar, or if its definition does not contain
            `_EOL (_SPACE command)`.
    """
    rule_name = kwargs['arg']
    base_grammar = kwargs['base_grammar']
    rule = get_rule_from_grammar(rule_name, base_grammar)
    value = rule.value

    marker = '_EOL (_SPACE command)'
    pos = value.find(marker)
    if pos == -1:
        # find() signals "not found" with -1; slicing with it would silently put the colon in front
        # of the last character of the definition and yield a corrupt rule
        raise Exception(f"The rule {rule_name} cannot be followed by a colon because its definition "
                        f"does not contain '{marker}'.")
    return f'{value[0:pos]} _COLON {value[pos:]}'
def old_rule_to_error(**kwargs):
    """Return the definition that the rule named by `arg` has in the base grammar, i.e. its 'old' version."""
    previous_rule = get_rule_from_grammar(kwargs['arg'], kwargs['base_grammar'])
    return previous_rule.value
def expand_keyword(**kwargs):
    """Return all distinct values of the keyword `arg` for language `lang`, sorted and joined by `|`.

    For example, the keyword `else` produces `else|ellers` for Danish.
    """
    translations = get_translated_keyword(kwargs['arg'], kwargs['lang'])
    distinct_sorted = sorted(set(translations))
    return '|'.join(distinct_sorted)
def expand_keyword_first(**kwargs):
    """Return the distinct first characters of all values of the keyword `arg`, sorted and concatenated.

    The values are looked up for language `lang`.
    """
    translations = get_translated_keyword(kwargs['arg'], kwargs['lang'])
    first_letters = {translation[0] for translation in translations}
    return ''.join(sorted(first_letters))
def expand_keyword_not_followed_by_space(**kwargs):
    """Build a pattern that matches the first letter of a keyword value unless the rest of it follows.

    The values of the keyword `arg` for language `lang` are grouped by their first letter. Each
    group becomes the first letter followed by a negative lookahead for the remainders of the
    values, each remainder ending with a space. The groups are joined by `|`.
    For example, the values `else` and `ellers` produce `e(?!lse |llers )`.
    """
    suffixes_by_initial = {}
    for translation in get_translated_keyword(kwargs['arg'], kwargs['lang']):
        suffixes_by_initial.setdefault(translation[0], []).append(translation[1:] + ' ')

    alternatives = []
    for initial, suffixes in suffixes_by_initial.items():
        lookahead = '|'.join(suffixes)
        alternatives.append(f'{initial}(?!{lookahead})')
    return '|'.join(alternatives)
def get_translated_keyword(keyword, lang):
    """Return the values of `keyword` in language `lang`, followed by its English values.

    For English only the English values are returned. Empty values are left out.
    Raises an Exception if a language has no definition for the keyword.
    """
    def lookup(wanted, language_code):
        table = hedy_translation.keywords_to_dict(language_code)
        if wanted not in table:
            raise Exception(
                f"The keywords yaml file for language '{language_code}' has no definition for '{wanted}'.")
        return [variant for variant in table[wanted] if variant]

    localized = [] if lang == 'en' else lookup(keyword, lang)
    return localized + lookup(keyword, 'en')
# Maps the name of a preprocessing rule, as it is written between `<` and `>` in a grammar rule
# definition, to the function that produces the replacement text. Each function is called with
# the keyword arguments `arg`, `lang` and `base_grammar`.
PREPROCESS_RULES = {
    'needs_colon': needs_colon,
    'old_rule_to_error': old_rule_to_error,
    'expand_keyword': expand_keyword,
    'expand_keyword_first': expand_keyword_first,
    'expand_keyword_not_followed_by_space': expand_keyword_not_followed_by_space,
}
#
# Grammar merging operators: +=, -=, >>
#
ADD_GRAMMAR_MERGE_OP = '+='  # adds alternatives to a rule
REMOVE_GRAMMAR_MERGE_OP = '-='  # removes alternatives from a rule
LAST_GRAMMAR_MERGE_OP = '>>'  # moves alternatives to the end of a rule
GRAMMAR_MERGE_OPERATORS = [ADD_GRAMMAR_MERGE_OP, REMOVE_GRAMMAR_MERGE_OP, LAST_GRAMMAR_MERGE_OP]


def merge_rules_operator(prev_definition, new_definition, name, complete_line):
    """Merge the new definition of a rule into its previous definition.

    Args:
        prev_definition: the definition of the rule in the base grammar, alternatives separated by `|`.
        new_definition: the new definition, which may use the merge operators +=, -= and >>.
        name: the name of the rule, used to build the merged rule.
        complete_line: the complete line of the new rule; returned unchanged when the new definition
            uses no merge operator, i.e. when it replaces the previous definition entirely.

    Returns:
        A tuple of the resulting rule line and the list of alternatives removed with `-=`.

    Raises:
        Exception: if an alternative passed to `-=` or `>>` is not part of the previous definition.
    """
    op_to_arg = get_operator_to_argument(new_definition)
    if not op_to_arg:
        # no operators at all: the new rule simply overrides the previous one
        return complete_line, []

    add_arg = op_to_arg.get(ADD_GRAMMAR_MERGE_OP, '')
    remove_arg = op_to_arg.get(REMOVE_GRAMMAR_MERGE_OP, '')
    last_arg = op_to_arg.get(LAST_GRAMMAR_MERGE_OP, '')

    # alternatives moved to the end are first taken out and then appended after the added ones
    remaining_commands = get_remaining_rules(prev_definition, remove_arg, last_arg)
    ordered_commands = split_rule(remaining_commands, add_arg, last_arg)

    new_rule = f"{name}: {' | '.join(ordered_commands)}"
    deletable = split_rule(remove_arg)
    return new_rule, deletable


def get_operator_to_argument(definition):
    """Creates a map of all used operators and their respective arguments e.g. {'+=': 'print | play', '>>': 'echo'}

    The argument of an operator is the text between the operator and the next operator in the
    definition (or the end of the definition). Operators may appear in any order. Only the first
    occurrence of each operator is considered.
    """
    # Order the operators by their position in the text. Slicing "up to the next operator" is only
    # correct if "next" means next in the definition, not next in GRAMMAR_MERGE_OPERATORS.
    operator_to_index = sorted(
        ((op, definition.find(op)) for op in GRAMMAR_MERGE_OPERATORS if op in definition),
        key=lambda op_and_index: op_and_index[1],
    )

    result = {}
    for i, (op, index) in enumerate(operator_to_index):
        start_index = index + len(op)
        if i + 1 < len(operator_to_index):
            _, end_index = operator_to_index[i + 1]
        else:
            end_index = len(definition)
        result[op] = definition[start_index:end_index].strip()
    return result


def get_remaining_rules(orig_def, *sub_def):
    """Return the alternatives of `orig_def` that are not listed in any of `sub_def`, joined by ` | `.

    Raises:
        Exception: if `sub_def` lists an alternative that is not part of `orig_def`.
    """
    original_commands = split_rule(orig_def)
    commands_after_minus = split_rule(*sub_def)
    misses = [c for c in commands_after_minus if c not in original_commands]
    if misses:
        raise Exception(f"Command(s) {'|'.join(misses)} do not exist in the previous definition")
    remaining_commands = [cmd for cmd in original_commands if cmd not in commands_after_minus]
    return ' | '.join(remaining_commands)


def split_rule(*rules):
    """Split the given definitions at `|` and return all non-empty alternatives, stripped, in order."""
    return [c.strip() for rule in rules for c in rule.split('|') if c.strip() != '']
def strip_priority_suffix(rule):
    """Return the rule name without its priority suffix, e.g. `name` for `name.1` or `name.-100`.

    Input that does not start with a name followed by a priority is returned unchanged.
    """
    has_priority = re.match(r"\w+\.-?\d+", rule) is not None
    return rule.split('.')[0] if has_priority else rule