Skip to content

Commit

Permalink
fix: speed up definition extraction somewhat
Browse files Browse the repository at this point in the history
  • Loading branch information
Crissium committed Oct 29, 2023
1 parent bd2b286 commit b682a9f
Show file tree
Hide file tree
Showing 12 changed files with 492 additions and 215 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- Minimalist web interface
- Separable client and server components
- Works as expected
- DSL, StarDict, MDict supported
- Cross-platform (Linux, Windows, macOS, Android, limited iOS)

## Roadmap
Expand All @@ -48,7 +49,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] Add support for ABBYY Lingvo DSL format[^4]
- [ ] Reduce DSL indexing and parsing time
- [X] Reduce the memory footprint of the MDict Reader
- [ ] Inline styles to prevent them from being applied to the whole page (The commented-out implementation in `mdict_reader.py` breaks richly-formatted dictionaries.)[^5]
- [ ] Inline styles to prevent them from being applied to the whole page (The commented-out implementation in [`server/app/dicts/mdict/html_cleaner.py`](/server/app/dicts/mdict/html_cleaner.py) breaks richly-formatted dictionaries.)[^5]
- [X] Reorganise APIs (to facilitate dictionary groups)
- [X] Ignore diacritics when searching (testing still wanted from speakers of Turkish and Asian languages other than CJK)
- [X] Ignore case when searching
Expand All @@ -63,6 +64,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] Allow configuring suggestion matching mode, listening address, running mode, etc. via a configuration file, without modifying code
- [X] Add a timestamp field to suggestions to avoid newer suggestions being overridden by older ones
- [ ] Use a linter
- [ ] Full-text search

### Client-side

Expand Down Expand Up @@ -167,7 +169,7 @@ I would also express my gratitude to Jiang Qian for his suggestions, encourageme

[^4]: I tested with an extremely ill-formed DSL dictionary, and before such devilry my cleaning code is powerless. I will look into how GoldenDict handles this.

[^5]: The use of a custom styling manager such as Dark Reader is recommended until I fix this, as styles for different dictionaries meddle with each other.
[^5]: The use of a custom styling manager such as Dark Reader is recommended until I fix this, as styles for different dictionaries meddle with each other. Or better, if you know CSS, you could just edit the dictionaries' stylesheets to make them less intrusive and individualistic.

[^6]: A Russian-speaking friend told me that it is unusual to type Russian on an American keyboard, so whether this feature is useful is open to doubt.

Expand Down
8 changes: 3 additions & 5 deletions server/app/dictionaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
import re
from .settings import Settings
from . import db_manager
from .dicts.base_reader import BaseReader
from .dicts.mdict_reader import MDictReader
from .dicts.stardict_reader import StarDictReader
from .dicts.dsl_reader import DSLReader
from .dicts import BaseReader, DSLReader, StarDictReader, MDictReader
from .langs import is_lang, transliterate, stem, spelling_suggestions, orthographic_forms, convert_chinese
import logging

Expand Down Expand Up @@ -154,6 +151,7 @@ def query(self, group_name: 'str', key: 'str') -> 'list[tuple[str, str, str]]':
articles = []
def replace_legacy_lookup_api(match: 're.Match') -> 'str':
return '/api/query/%s/%s' % (group_name, match.group(2))

def extract_articles_from_dictionary(dictionary_name: 'str') -> 'None':
nonlocal autoplay_found
keys_found = [key for key in keys if db_manager.entry_exists_in_dictionary(key, dictionary_name)]
Expand All @@ -168,7 +166,7 @@ def extract_articles_from_dictionary(dictionary_name: 'str') -> 'None':
else:
articles.append((dictionary_name, self.settings.display_name_of_dictionary(dictionary_name), article.replace('autoplay', '')))

with concurrent.futures.ThreadPoolExecutor() as executor:
with concurrent.futures.ThreadPoolExecutor(len(names_dictionaries_of_group)) as executor:
executor.map(extract_articles_from_dictionary, names_dictionaries_of_group)

if len(articles) > 0:
Expand Down
4 changes: 4 additions & 0 deletions server/app/dicts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .base_reader import BaseReader
from .dsl_reader import DSLReader
from .stardict_reader import StarDictReader
from .mdict_reader import MDictReader
6 changes: 4 additions & 2 deletions server/app/dicts/dsl/markup_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _correct_media_references(self, html: 'str') -> 'tuple[str, list[str]]':
def _extract_files(self, files_to_be_extracted: 'list[str]') -> 'None':
# ZipFile's extractall() is too slow, so we use a thread pool to extract files in parallel.
with ZipFile(self._resources_filename) as zip_file:
with concurrent.futures.ThreadPoolExecutor() as executor:
with concurrent.futures.ThreadPoolExecutor(len(files_to_be_extracted)) as executor:
executor.map(zip_file.extract, files_to_be_extracted, [self._resources_dir] * len(files_to_be_extracted))

def _clean_html(self, html: 'str') -> 'str':
Expand All @@ -254,7 +254,9 @@ def _clean_html(self, html: 'str') -> 'str':

return html

def convert(self, text: 'str', headword: 'str') -> 'str':
# def convert(self, text: 'str', headword: 'str') -> 'str':
def convert(self, record: 'tuple[str, str]') -> 'str':
text, headword = record
for line in text.splitlines():
if line.startswith(' [m') and not line.endswith('[/m]'):
text = text.replace(line, line + '[/m]')
Expand Down
30 changes: 18 additions & 12 deletions server/app/dicts/dsl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import idzip
from json import detect_encoding
from pathlib import Path
import concurrent.futures
from .base_reader import BaseReader
from .. import db_manager
from .dsl import DSLConverter
Expand Down Expand Up @@ -192,22 +193,27 @@ def __init__(self,
if remove_resources_after_extraction:
os.remove(resources_filename)

def _get_records(self, offset: 'int', size: 'int') -> 'str':
	def _get_record(self, f: 'idzip.api.IdzipFile', offset: 'int', size: 'int') -> 'str':
		"""
		Return the original DSL markup of one record.

		:param f: an already-opened idzip (.dz) file handle, shared across calls
			so that the archive is opened only once per batch.
		:param offset: byte offset of the record in the decompressed stream.
		:param size: length of the record in bytes.
		"""
		# Only dictzip-compressed dictionaries are supported by this reader.
		assert os.path.splitext(self.filename)[1] == '.dz'
		f.seek(offset)
		data = f.read(size)
		# detect_encoding comes from the json module; records must be UTF-8.
		assert detect_encoding(data) == 'utf-8'
		return data.decode('utf-8')

def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[tuple[str, str]]':
records = []
with idzip.open(self.filename) as f:
f.seek(offset)
data = f.read(size)
assert detect_encoding(data) == 'utf-8'
return data.decode('utf-8')
for word, offset, size in locations:
records.append((self._get_record(f, offset, size), word))
return records

def entry_definition(self, entry: str) -> str:
locations = db_manager.get_entries(entry, self.name)
records = []
for word, offset, length in locations:
record = self._get_records(offset, length)
records.append(self._converter.convert(record, word))

return self._ARTICLE_SEPARATOR.join(records)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
3 changes: 3 additions & 0 deletions server/app/dicts/mdict/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .readmdict import MDD, MDX
from .html_cleaner import HTMLCleaner
from . import lzo
136 changes: 136 additions & 0 deletions server/app/dicts/mdict/html_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import os
import shutil
from pathlib import Path
import re
# import css_inline

class HTMLCleaner:
	"""
	Post-processes the raw HTML of a single MDict article so it can be served
	by the web app:

	- strips non-printing control characters,
	- copies referenced .css/.js files into the cache directory and rewrites
	  their paths to the /api/cache/ endpoint,
	- rewrites ``entry://`` cross-references to the /api/lookup/ endpoint,
	- converts ``sound://`` links into HTML <audio> elements,
	- rewrites <img> sources to the /api/cache/ endpoint.
	"""

	# Control characters (except \t, \n, \r) that occasionally appear in MDX records.
	NON_PRINTING_CHARS_PATTERN = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')

	def __init__(self, filename, dict_name: 'str', resources_dir: 'str') -> 'None':
		# Path of the .mdx file; referenced .css/.js files are looked up in its directory.
		self._filename = filename
		# Directory where copied resources are cached for serving.
		self._resources_dir = resources_dir
		self._href_root_dir = '/api/cache/' + dict_name + '/'
		self._lookup_url_root = '/api/lookup/' + dict_name + '/'

	def _fix_file_path(self, definition_html: 'str', file_extension: 'str') -> 'str':
		"""
		For every quoted filename reference ending in ``file_extension``, copy the
		file from the dictionary's directory into the cache directory (refreshing
		the copy when the source is newer) and prefix the reference with the
		cache URL root.
		"""
		extension_position = 0
		while (extension_position := definition_html.find(file_extension, extension_position)) != -1:
			filename_position = definition_html.rfind('"', 0, extension_position) + 1
			filename = definition_html[filename_position:extension_position + len(file_extension)]
			file_path_on_disk = os.path.join(os.path.dirname(self._filename), filename)
			new_file_path_on_disk = os.path.join(self._resources_dir, filename)
			if not os.path.isfile(new_file_path_on_disk):
				if os.path.isfile(file_path_on_disk):
					Path(self._resources_dir).mkdir(parents=True, exist_ok=True)
					shutil.copy(file_path_on_disk, new_file_path_on_disk)
					definition_html = definition_html[:filename_position] + self._href_root_dir + definition_html[filename_position:]
					# The insertion shifted this occurrence right; skip over the
					# prefix so the same occurrence is not scanned twice.
					extension_position += len(self._href_root_dir)
			else:
				# Guard getmtime(): the source file may have been deleted after
				# the cached copy was made (getmtime raises on missing paths).
				if os.path.isfile(file_path_on_disk) and os.path.getmtime(file_path_on_disk) > os.path.getmtime(new_file_path_on_disk):
					shutil.copy(file_path_on_disk, new_file_path_on_disk)
				definition_html = definition_html[:filename_position] + self._href_root_dir + definition_html[filename_position:]
				extension_position += len(self._href_root_dir)
			extension_position += len(file_extension)
		return definition_html

	# NOTE: disabled — breaks richly-formatted dictionaries (see README).
	# def _inline_styles(self, html_content: 'str') -> 'str': # CSS path(s) is inside the HTML file
	# 	# Find all CSS references
	# 	# regex won't work. Maybe it's simply because that I haven't mastered the dark art.
	# 	css_references = []
	# 	css_extension_position = 0
	# 	while (css_extension_position := html_content.find('.css"', css_extension_position)) != -1:
	# 		css_filename_position = html_content.rfind('"', 0, css_extension_position) + 1
	# 		css_filename = html_content[css_filename_position:css_extension_position] + '.css'
	# 		css_references.append(css_filename)
	# 		# Remove the CSS reference
	# 		link_tag_start_position = html_content.rfind('<link', 0, css_filename_position)
	# 		link_tag_end_position = html_content.find('>', link_tag_start_position) + 1
	# 		html_content = html_content[:link_tag_start_position] + html_content[link_tag_end_position:]
	# 		css_extension_position = link_tag_start_position
	#
	# 	for css in css_references:
	# 		# Read the CSS file
	# 		css_path = os.path.join(self._resources_dir, css.split('/')[-1])
	# 		with open(css_path) as css_file:
	# 			css_content = css_file.read()
	#
	# 		# Inline the CSS
	# 		inliner = css_inline.CSSInliner(load_remote_stylesheets=False, extra_css=css_content)
	# 		html_content = inliner.inline(html_content)
	#
	# 	return html_content

	def _fix_internal_href(self, definition_html: 'str') -> 'str':
		"""Turn same-page links like entry://#81305a…_nav2 into plain #fragment links."""
		return definition_html.replace('entry://#', '#')

	def _flatten_nested_a(self, definition_html: 'str', depth: 'int') -> 'str':
		"""
		Remove wrapper elements nested directly inside <a>…</a>, keeping their text.

		Some dictionaries wrap the link text, e.g.
		<a class="ref" href="…"><span class="orth">badly</span></a>;
		the <span> is dropped, the text preserved. Applied recursively ``depth`` times.
		"""
		if depth == 0:
			return definition_html
		else:
			a_closing_tag_pos = 0
			while (a_tag_start_pos := definition_html.find('<a', a_closing_tag_pos)) != -1:
				a_tag_end_pos = definition_html.find('>', a_tag_start_pos)
				inner_html_start_pos = definition_html.find('>', a_tag_end_pos + 1) + 1
				if (a_closing_tag_pos := definition_html.find('</a>', a_tag_end_pos, inner_html_start_pos)) != -1:
					# Plain <a>text</a> with no nested element: nothing to flatten.
					continue
				inner_html_end_pos = definition_html.find('</', inner_html_start_pos)
				inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
				a_closing_tag_pos = definition_html.find('</a>', inner_html_end_pos)
				definition_html = definition_html[:a_tag_end_pos + 1] + inner_html + definition_html[a_closing_tag_pos:]
			return self._flatten_nested_a(definition_html, depth - 1)

	def _fix_entry_cross_ref(self, definition_html: 'str') -> 'str':
		"""
		Rewrite entry:// cross-references to the lookup API.

		A record consisting of ``@@@LINK=<headword>`` is a whole-article redirect
		and is replaced by a single link to that headword.
		"""
		if definition_html.startswith('@@@LINK='): # strange special case
			last_non_whitespace_position = len(definition_html) - 1
			while definition_html[last_non_whitespace_position].isspace():
				last_non_whitespace_position -= 1
			entry_linked = definition_html[len('@@@LINK='):last_non_whitespace_position+1]
			return '<a href="%s">%s</a>' % (self._lookup_url_root + entry_linked, entry_linked)
		else:
			definition_html = definition_html.replace('entry://', self._lookup_url_root)
			return self._flatten_nested_a(definition_html, 3) # fingers crossed there are no more than three layers

	def _fix_sound_link(self, definition_html: 'str') -> 'str':
		"""
		Replace each <a href="sound://…">…</a> with an <audio> element serving
		the cached file; only the first sound on the page gets ``autoplay``.
		"""
		autoplay_string = 'autoplay'
		sound_element_template = '<audio controls %s src=%s>%s</audio>'
		while (sound_link_start_pos := definition_html.find('sound://')) != -1:
			sound_link_end_pos = definition_html.find('"', sound_link_start_pos)
			original_sound_link = definition_html[sound_link_start_pos:sound_link_end_pos]
			sound_link = original_sound_link.replace('sound://', self._href_root_dir)
			inner_html_start_pos = definition_html.find('>', sound_link_end_pos) + 1
			inner_html_end_pos = definition_html.find('</a>', inner_html_start_pos)
			inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
			outer_html_start_pos = definition_html.rfind('<a', 0, sound_link_start_pos)
			outer_html_end_pos = definition_html.find('</a>', inner_html_end_pos) + len('</a>')
			definition_html = definition_html[:outer_html_start_pos] + sound_element_template % (autoplay_string, sound_link, inner_html) + definition_html[outer_html_end_pos:]
			autoplay_string = ''

		return definition_html

	def _fix_img_src(self, definition_html: 'str') -> 'str':
		"""Point every <img src="…"> at the cache endpoint, dropping any file:// scheme."""
		img_tag_end_pos = 0
		while (img_tag_start_pos := definition_html.find('<img', img_tag_end_pos)) != -1:
			img_tag_end_pos = definition_html.find('>', img_tag_start_pos)
			img_src_start_pos = definition_html.find(' src="', img_tag_start_pos, img_tag_end_pos) + len(' src="')
			img_src_end_pos = definition_html.find('"', img_src_start_pos, img_tag_end_pos)
			img_src = definition_html[img_src_start_pos:img_src_end_pos]
			img_src = self._href_root_dir + img_src.replace('file://' , '')
			definition_html = definition_html[:img_src_start_pos] + img_src + definition_html[img_src_end_pos:]
		return definition_html

	def clean(self, definition_html: 'str') -> 'str':
		"""Run the full cleaning pipeline over one article's HTML and return the result."""
		definition_html = self.NON_PRINTING_CHARS_PATTERN.sub('', definition_html)
		definition_html = self._fix_file_path(definition_html, '.css')
		definition_html = self._fix_file_path(definition_html, '.js')
		definition_html = self._fix_internal_href(definition_html)
		definition_html = self._fix_entry_cross_ref(definition_html)
		definition_html = self._fix_sound_link(definition_html)
		definition_html = self._fix_img_src(definition_html)
		# definition_html = self._inline_styles(definition_html)  # disabled, see above
		return definition_html
Loading

0 comments on commit b682a9f

Please sign in to comment.