Skip to content

Commit

Permalink
feat: convert Chinese articles to simp/trad
Browse files Browse the repository at this point in the history
  • Loading branch information
Crissium committed Oct 1, 2023
1 parent 718dea5 commit 63106d9
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 12 deletions.
19 changes: 17 additions & 2 deletions server/app/dictionaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

class Dictionaries:
_LEGACY_LOOKUP_API_PATTERN = r'/api/lookup/([^/]+)/([^/]+)'
_CACHE_API_PATTERN = r'/api/cache/([^/]+)/([^/]+)'
_REPLACEMENT_TEXT = '!!@@SUBSTITUTION@@!!'

def _load_dictionary(self, dictionary_info: 'dict') -> 'None':
match dictionary_info['dictionary_format']:
Expand Down Expand Up @@ -83,6 +85,19 @@ def get_spelling_suggestions(self, group_name: 'str', key: 'str') -> 'list[str]'
suggestions = [simplify(suggestion) for suggestion in spelling_suggestions(key, self.settings.group_lang(group_name)) if db_manager.entry_exists_in_dictionaries(simplify(suggestion), names_dictionaries_of_group)]
return db_manager.select_entries_with_keys(suggestions, names_dictionaries_of_group, [], self.settings.misc_configs['num_suggestions'])

def _safely_convert_chinese_article(self, article: 'str') -> 'str':
    """
    Convert the Chinese text of an article without touching cache API references.

    A direct call to convert_chinese() also converts things like API references.
    Every substring matching _CACHE_API_PATTERN is therefore swapped for
    _REPLACEMENT_TEXT before the conversion and put back, in its original order
    and verbatim, afterwards. For now only cache API calls are protected.
    """
    preference = self.settings.preferences['chinese_preference']

    # Keep the full matched text so it can be restored byte-for-byte,
    # without re-assembling the URL from its captured groups.
    protected = [found.group(0) for found in re.finditer(self._CACHE_API_PATTERN, article)]
    if not protected:
        return convert_chinese(article, preference)

    masked = re.sub(self._CACHE_API_PATTERN, self._REPLACEMENT_TEXT, article)
    converted = convert_chinese(masked, preference)

    # Split once on the placeholder and interleave the saved references: a single
    # pass over the text instead of one str.replace() scan per reference.
    # maxsplit is capped at the number of saved references, so any further
    # occurrence of the placeholder text is left exactly as it is.
    head, *tails = converted.split(self._REPLACEMENT_TEXT, len(protected))
    restored = [head]
    for reference, tail in zip(protected, tails):
        restored.append(reference)
        restored.append(tail)
    return ''.join(restored)

def suggestions(self, group_name: 'str', key: 'str') -> 'list[str]':
"""
Return matched headwords if the key is found;
Expand Down Expand Up @@ -138,9 +153,9 @@ def extract_articles_from_dictionary(dictionary_name: 'str') -> 'None':
keys_found = [key for key in keys if db_manager.entry_exists_in_dictionary(key, dictionary_name)]
article = self.dictionaries[dictionary_name].entries_definitions(keys_found)
if article:
if 'zh' in group_lang:
article = self._safely_convert_chinese_article(article)
article = re.sub(self._LEGACY_LOOKUP_API_PATTERN, replace_legacy_lookup_api, article)
# if 'zh' in group_lang:
# article = convert_chinese(article, self.settings.preferences['chinese_preference'])
if not autoplay_found and article.find('autoplay') != -1:
autoplay_found = True
articles.append((dictionary_name, self.settings.display_name_of_dictionary(dictionary_name), article))
Expand Down
17 changes: 10 additions & 7 deletions server/app/langs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,13 @@ def convert_chinese(text: 'str', preference: 'str') -> 'str':
"""
Convert Chinese characters to Traditional or Simplified, and localise expressions.
"""
match preference:
case 'cn':
return chinese.to_simplified.convert(text)
case 'tw':
return chinese.to_traditional.convert(text)
case _:
return text
if chinese.opencc_found:
match preference:
case 'cn':
return chinese.to_simplified.convert(text)
case 'tw':
return chinese.to_traditional.convert(text)
case _:
return text
else:
return text
8 changes: 5 additions & 3 deletions server/app/langs/chinese.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,17 @@ def is_chinese(s: 'str') -> 'bool':
try:
from opencc import OpenCC

_to_traditional = OpenCC('s2twp.json')
_to_simplified = OpenCC('tw2sp.json')
opencc_found = True
to_traditional = OpenCC('s2twp.json')
to_simplified = OpenCC('tw2sp.json')
def transliterate(s: 'str') -> 'list[str]':
"""
Two-way conversion of Chinese characters.
Returns Traditional and Simplified Chinese.
"""
return [_to_traditional.convert(s), _to_simplified.convert(s)]
return [to_traditional.convert(s), to_simplified.convert(s)]
except ImportError:
opencc_found = False
def transliterate(s: 'str') -> 'list[str]':
"""
No conversion.
Expand Down

0 comments on commit 63106d9

Please sign in to comment.