Skip to content

Commit

Permalink
fix: speed up definition extraction somewhat
Browse files Browse the repository at this point in the history
  • Loading branch information
Crissium committed Oct 29, 2023
1 parent bd2b286 commit b682a9f
Show file tree
Hide file tree
Showing 12 changed files with 492 additions and 215 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- Minimalist web interface
- Separable client and server components
- Works as expected
- DSL, StarDict, MDict supported
- Cross-platform (Linux, Windows, macOS, Android, limited iOS)

## Roadmap
Expand All @@ -48,7 +49,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] Add support for ABBYY Lingvo DSL format[^4]
- [ ] Reduce DSL indexing and parsing time
- [X] Reduce the memory footprint of the MDict Reader
- [ ] Inline styles to prevent them from being applied to the whole page (The commented-out implementation in `mdict_reader.py` breaks richly-formatted dictionaries.)[^5]
- [ ] Inline styles to prevent them from being applied to the whole page (The commented-out implementation in [`server/app/dicts/mdict/html_cleaner.py`](/server/app/dicts/mdict/html_cleaner.py) breaks richly-formatted dictionaries.)[^5]
- [X] Reorganise APIs (to facilitate dictionary groups)
- [X] Ignore diacritics when searching (testing still wanted from speakers of Turkish and Asian languages other than CJK)
- [X] Ignore case when searching
Expand All @@ -63,6 +64,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] Allow configuring suggestion matching mode, listening address, running mode, etc. via a configuration file, without modifying code
- [X] Add a timestamp field to suggestions to avoid newer suggestions being overridden by older ones
- [ ] Use a linter
- [ ] Full-text search

### Client-side

Expand Down Expand Up @@ -167,7 +169,7 @@ I would also express my gratitude to Jiang Qian for his suggestions, encourageme

[^4]: I tested with an extremely ill-formed DSL dictionary, and before such devilry my cleaning code is powerless. I will look into how GoldenDict handles this.

[^5]: The use of a custom styling manager such as Dark Reader is recommended until I fix this, as styles for different dictionaries meddle with each other.
[^5]: The use of a custom styling manager such as Dark Reader is recommended until I fix this, as styles for different dictionaries meddle with each other. Or better, if you know CSS, you could just edit the dictionaries' stylesheets to make them less intrusive and individualistic.

[^6]: A Russian-speaking friend told me that it is unusual to type Russian on an American keyboard, so whether this feature is useful is open to doubt.

Expand Down
8 changes: 3 additions & 5 deletions server/app/dictionaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
import re
from .settings import Settings
from . import db_manager
from .dicts.base_reader import BaseReader
from .dicts.mdict_reader import MDictReader
from .dicts.stardict_reader import StarDictReader
from .dicts.dsl_reader import DSLReader
from .dicts import BaseReader, DSLReader, StarDictReader, MDictReader
from .langs import is_lang, transliterate, stem, spelling_suggestions, orthographic_forms, convert_chinese
import logging

Expand Down Expand Up @@ -154,6 +151,7 @@ def query(self, group_name: 'str', key: 'str') -> 'list[tuple[str, str, str]]':
articles = []
def replace_legacy_lookup_api(match: 're.Match') -> 'str':
return '/api/query/%s/%s' % (group_name, match.group(2))

def extract_articles_from_dictionary(dictionary_name: 'str') -> 'None':
nonlocal autoplay_found
keys_found = [key for key in keys if db_manager.entry_exists_in_dictionary(key, dictionary_name)]
Expand All @@ -168,7 +166,7 @@ def extract_articles_from_dictionary(dictionary_name: 'str') -> 'None':
else:
articles.append((dictionary_name, self.settings.display_name_of_dictionary(dictionary_name), article.replace('autoplay', '')))

with concurrent.futures.ThreadPoolExecutor() as executor:
with concurrent.futures.ThreadPoolExecutor(len(names_dictionaries_of_group)) as executor:
executor.map(extract_articles_from_dictionary, names_dictionaries_of_group)

if len(articles) > 0:
Expand Down
4 changes: 4 additions & 0 deletions server/app/dicts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .base_reader import BaseReader
from .dsl_reader import DSLReader
from .stardict_reader import StarDictReader
from .mdict_reader import MDictReader
6 changes: 4 additions & 2 deletions server/app/dicts/dsl/markup_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _correct_media_references(self, html: 'str') -> 'tuple[str, list[str]]':
def _extract_files(self, files_to_be_extracted: 'list[str]') -> 'None':
# ZipFile's extractall() is too slow, so we use a thread pool to extract files in parallel.
with ZipFile(self._resources_filename) as zip_file:
with concurrent.futures.ThreadPoolExecutor() as executor:
with concurrent.futures.ThreadPoolExecutor(len(files_to_be_extracted)) as executor:
executor.map(zip_file.extract, files_to_be_extracted, [self._resources_dir] * len(files_to_be_extracted))

def _clean_html(self, html: 'str') -> 'str':
Expand All @@ -254,7 +254,9 @@ def _clean_html(self, html: 'str') -> 'str':

return html

def convert(self, text: 'str', headword: 'str') -> 'str':
# def convert(self, text: 'str', headword: 'str') -> 'str':
def convert(self, record: 'tuple[str, str]') -> 'str':
text, headword = record
for line in text.splitlines():
if line.startswith(' [m') and not line.endswith('[/m]'):
text = text.replace(line, line + '[/m]')
Expand Down
30 changes: 18 additions & 12 deletions server/app/dicts/dsl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import idzip
from json import detect_encoding
from pathlib import Path
import concurrent.futures
from .base_reader import BaseReader
from .. import db_manager
from .dsl import DSLConverter
Expand Down Expand Up @@ -192,22 +193,27 @@ def __init__(self,
if remove_resources_after_extraction:
os.remove(resources_filename)

def _get_records(self, offset: 'int', size: 'int') -> 'str':
	def _get_record(self, f: 'idzip.api.IdzipFile', offset: 'int', size: 'int') -> 'str':
		"""
		Return the original DSL markup of one record.

		:param f: an already-opened idzip (.dz) file handle, shared across calls
			so that the archive is opened only once per batch.
		:param offset: byte offset of the record in the decompressed stream.
		:param size: length of the record in bytes.
		"""
		# Only dictzip-compressed dictionaries are supported by this reader.
		assert os.path.splitext(self.filename)[1] == '.dz'
		f.seek(offset)
		data = f.read(size)
		# detect_encoding comes from the json module; records must be UTF-8.
		assert detect_encoding(data) == 'utf-8'
		return data.decode('utf-8')

def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[tuple[str, str]]':
records = []
with idzip.open(self.filename) as f:
f.seek(offset)
data = f.read(size)
assert detect_encoding(data) == 'utf-8'
return data.decode('utf-8')
for word, offset, size in locations:
records.append((self._get_record(f, offset, size), word))
return records

def entry_definition(self, entry: str) -> str:
locations = db_manager.get_entries(entry, self.name)
records = []
for word, offset, length in locations:
record = self._get_records(offset, length)
records.append(self._converter.convert(record, word))

return self._ARTICLE_SEPARATOR.join(records)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
3 changes: 3 additions & 0 deletions server/app/dicts/mdict/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .readmdict import MDD, MDX
from .html_cleaner import HTMLCleaner
from . import lzo
136 changes: 136 additions & 0 deletions server/app/dicts/mdict/html_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import os
import shutil
from pathlib import Path
import re
# import css_inline

class HTMLCleaner:
	"""
	Post-processes the raw HTML of a single MDict article so it can be served
	by the web app:

	- strips non-printing control characters,
	- copies referenced .css/.js files into the cache directory and rewrites
	  their paths to the /api/cache/ endpoint,
	- rewrites ``entry://`` cross-references to the /api/lookup/ endpoint,
	- converts ``sound://`` links into HTML <audio> elements,
	- rewrites <img> sources to the /api/cache/ endpoint.
	"""

	# Control characters (except \t, \n, \r) that occasionally appear in MDX records.
	NON_PRINTING_CHARS_PATTERN = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')

	def __init__(self, filename, dict_name: 'str', resources_dir: 'str') -> 'None':
		# Path of the .mdx file; referenced .css/.js files are looked up in its directory.
		self._filename = filename
		# Directory where copied resources are cached for serving.
		self._resources_dir = resources_dir
		self._href_root_dir = '/api/cache/' + dict_name + '/'
		self._lookup_url_root = '/api/lookup/' + dict_name + '/'

	def _fix_file_path(self, definition_html: 'str', file_extension: 'str') -> 'str':
		"""
		For every quoted filename reference ending in ``file_extension``, copy the
		file from the dictionary's directory into the cache directory (refreshing
		the copy when the source is newer) and prefix the reference with the
		cache URL root.
		"""
		extension_position = 0
		while (extension_position := definition_html.find(file_extension, extension_position)) != -1:
			filename_position = definition_html.rfind('"', 0, extension_position) + 1
			filename = definition_html[filename_position:extension_position + len(file_extension)]
			file_path_on_disk = os.path.join(os.path.dirname(self._filename), filename)
			new_file_path_on_disk = os.path.join(self._resources_dir, filename)
			if not os.path.isfile(new_file_path_on_disk):
				if os.path.isfile(file_path_on_disk):
					Path(self._resources_dir).mkdir(parents=True, exist_ok=True)
					shutil.copy(file_path_on_disk, new_file_path_on_disk)
					definition_html = definition_html[:filename_position] + self._href_root_dir + definition_html[filename_position:]
					# The insertion shifted this occurrence right; skip over the
					# prefix so the same occurrence is not scanned twice.
					extension_position += len(self._href_root_dir)
			else:
				# Guard getmtime(): the source file may have been deleted after
				# the cached copy was made (getmtime raises on missing paths).
				if os.path.isfile(file_path_on_disk) and os.path.getmtime(file_path_on_disk) > os.path.getmtime(new_file_path_on_disk):
					shutil.copy(file_path_on_disk, new_file_path_on_disk)
				definition_html = definition_html[:filename_position] + self._href_root_dir + definition_html[filename_position:]
				extension_position += len(self._href_root_dir)
			extension_position += len(file_extension)
		return definition_html

	# NOTE: disabled — breaks richly-formatted dictionaries (see README).
	# def _inline_styles(self, html_content: 'str') -> 'str': # CSS path(s) is inside the HTML file
	# 	# Find all CSS references
	# 	# regex won't work. Maybe it's simply because that I haven't mastered the dark art.
	# 	css_references = []
	# 	css_extension_position = 0
	# 	while (css_extension_position := html_content.find('.css"', css_extension_position)) != -1:
	# 		css_filename_position = html_content.rfind('"', 0, css_extension_position) + 1
	# 		css_filename = html_content[css_filename_position:css_extension_position] + '.css'
	# 		css_references.append(css_filename)
	# 		# Remove the CSS reference
	# 		link_tag_start_position = html_content.rfind('<link', 0, css_filename_position)
	# 		link_tag_end_position = html_content.find('>', link_tag_start_position) + 1
	# 		html_content = html_content[:link_tag_start_position] + html_content[link_tag_end_position:]
	# 		css_extension_position = link_tag_start_position
	#
	# 	for css in css_references:
	# 		# Read the CSS file
	# 		css_path = os.path.join(self._resources_dir, css.split('/')[-1])
	# 		with open(css_path) as css_file:
	# 			css_content = css_file.read()
	#
	# 		# Inline the CSS
	# 		inliner = css_inline.CSSInliner(load_remote_stylesheets=False, extra_css=css_content)
	# 		html_content = inliner.inline(html_content)
	#
	# 	return html_content

	def _fix_internal_href(self, definition_html: 'str') -> 'str':
		"""Turn same-page links like entry://#81305a…_nav2 into plain #fragment links."""
		return definition_html.replace('entry://#', '#')

	def _flatten_nested_a(self, definition_html: 'str', depth: 'int') -> 'str':
		"""
		Remove wrapper elements nested directly inside <a>…</a>, keeping their text.

		Some dictionaries wrap the link text, e.g.
		<a class="ref" href="…"><span class="orth">badly</span></a>;
		the <span> is dropped, the text preserved. Applied recursively ``depth`` times.
		"""
		if depth == 0:
			return definition_html
		else:
			a_closing_tag_pos = 0
			while (a_tag_start_pos := definition_html.find('<a', a_closing_tag_pos)) != -1:
				a_tag_end_pos = definition_html.find('>', a_tag_start_pos)
				inner_html_start_pos = definition_html.find('>', a_tag_end_pos + 1) + 1
				if (a_closing_tag_pos := definition_html.find('</a>', a_tag_end_pos, inner_html_start_pos)) != -1:
					# Plain <a>text</a> with no nested element: nothing to flatten.
					continue
				inner_html_end_pos = definition_html.find('</', inner_html_start_pos)
				inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
				a_closing_tag_pos = definition_html.find('</a>', inner_html_end_pos)
				definition_html = definition_html[:a_tag_end_pos + 1] + inner_html + definition_html[a_closing_tag_pos:]
			return self._flatten_nested_a(definition_html, depth - 1)

	def _fix_entry_cross_ref(self, definition_html: 'str') -> 'str':
		"""
		Rewrite entry:// cross-references to the lookup API.

		A record consisting of ``@@@LINK=<headword>`` is a whole-article redirect
		and is replaced by a single link to that headword.
		"""
		if definition_html.startswith('@@@LINK='): # strange special case
			last_non_whitespace_position = len(definition_html) - 1
			while definition_html[last_non_whitespace_position].isspace():
				last_non_whitespace_position -= 1
			entry_linked = definition_html[len('@@@LINK='):last_non_whitespace_position+1]
			return '<a href="%s">%s</a>' % (self._lookup_url_root + entry_linked, entry_linked)
		else:
			definition_html = definition_html.replace('entry://', self._lookup_url_root)
			return self._flatten_nested_a(definition_html, 3) # fingers crossed there are no more than three layers

	def _fix_sound_link(self, definition_html: 'str') -> 'str':
		"""
		Replace each <a href="sound://…">…</a> with an <audio> element serving
		the cached file; only the first sound on the page gets ``autoplay``.
		"""
		autoplay_string = 'autoplay'
		sound_element_template = '<audio controls %s src=%s>%s</audio>'
		while (sound_link_start_pos := definition_html.find('sound://')) != -1:
			sound_link_end_pos = definition_html.find('"', sound_link_start_pos)
			original_sound_link = definition_html[sound_link_start_pos:sound_link_end_pos]
			sound_link = original_sound_link.replace('sound://', self._href_root_dir)
			inner_html_start_pos = definition_html.find('>', sound_link_end_pos) + 1
			inner_html_end_pos = definition_html.find('</a>', inner_html_start_pos)
			inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
			outer_html_start_pos = definition_html.rfind('<a', 0, sound_link_start_pos)
			outer_html_end_pos = definition_html.find('</a>', inner_html_end_pos) + len('</a>')
			definition_html = definition_html[:outer_html_start_pos] + sound_element_template % (autoplay_string, sound_link, inner_html) + definition_html[outer_html_end_pos:]
			autoplay_string = ''

		return definition_html

	def _fix_img_src(self, definition_html: 'str') -> 'str':
		"""Point every <img src="…"> at the cache endpoint, dropping any file:// scheme."""
		img_tag_end_pos = 0
		while (img_tag_start_pos := definition_html.find('<img', img_tag_end_pos)) != -1:
			img_tag_end_pos = definition_html.find('>', img_tag_start_pos)
			img_src_start_pos = definition_html.find(' src="', img_tag_start_pos, img_tag_end_pos) + len(' src="')
			img_src_end_pos = definition_html.find('"', img_src_start_pos, img_tag_end_pos)
			img_src = definition_html[img_src_start_pos:img_src_end_pos]
			img_src = self._href_root_dir + img_src.replace('file://' , '')
			definition_html = definition_html[:img_src_start_pos] + img_src + definition_html[img_src_end_pos:]
		return definition_html

	def clean(self, definition_html: 'str') -> 'str':
		"""Run the full cleaning pipeline over one article's HTML and return the result."""
		definition_html = self.NON_PRINTING_CHARS_PATTERN.sub('', definition_html)
		definition_html = self._fix_file_path(definition_html, '.css')
		definition_html = self._fix_file_path(definition_html, '.js')
		definition_html = self._fix_internal_href(definition_html)
		definition_html = self._fix_entry_cross_ref(definition_html)
		definition_html = self._fix_sound_link(definition_html)
		definition_html = self._fix_img_src(definition_html)
		# definition_html = self._inline_styles(definition_html)  # disabled, see above
		return definition_html
Loading

0 comments on commit b682a9f

Please sign in to comment.