feat: allow pre-loading formatted articles
Crissium committed Nov 3, 2023
1 parent c8ea5c3 commit a3e85c8
Showing 9 changed files with 180 additions and 86 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -2,7 +2,7 @@

![favicon](/client/public/favicon.ico)

[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki)
[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki) (At least read the general notes before using.)

This project is intended to be a modern, from-the-ground-up, maintainable alternative to [GoldenDict](https://github.com/goldendict/goldendict)(-[ng](https://github.com/xiaoyifang/goldendict-ng)), developed with Flask and React.

@@ -58,7 +58,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] OpenCC Chinese conversion (please set your preference in `~/.silverdict/preferences.yaml` and add `zh` to the group with Chinese dictionaries)
- [X] Add the ability to set sources for automatic indexing, i.e. dictionaries put into the specified directories will be automatically added
- [X] Recursive source scanning
- [X] Multithreaded article extraction
- [X] Multithreaded article extraction (This project will benefit hugely from [no-GIL python](https://peps.python.org/pep-0703/))
- [X] Improve the performance of suggestions matching
- [X] Make the suggestion size customisable
- [X] Allow configuring the suggestion matching mode, listening address, running mode, etc. via a configuration file, without modifying code
@@ -74,8 +74,8 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [ ] Make the strings translatable
- [X] GoldenDict-like dictionary group support
- [X] A mobile-friendly interface (retouch needed)
- [ ] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
- [ ] A C++/Qt (or QML) desktop app (development is scheduled to begin in July, 2024)[^7]
- [X] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
- [ ] A C++/Qt (or QML) desktop app[^7]

### Issue backlog

8 changes: 8 additions & 0 deletions server/app/db_manager.py
@@ -115,6 +115,14 @@ def get_entries(key: 'str', dictionary_name: 'str') -> 'list[tuple[str, int, int
cursor.execute('select word, offset, size from entries where key = ? and dictionary_name = ?', (key, dictionary_name))
return cursor.fetchall()

def get_entries_all(dictionary_name: 'str') -> 'list[tuple[str, str, int, int]]':
"""
Returns a list of (key, word, offset, size).
"""
cursor = get_cursor()
cursor.execute('select key, word, offset, size from entries where dictionary_name = ? order by offset', (dictionary_name,))
return cursor.fetchall()

def delete_dictionary(dictionary_name: 'str') -> 'None':
cursor = get_cursor()
cursor.execute('delete from entries where dictionary_name = ?', (dictionary_name,))
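The new `get_entries_all` query returns a dictionary's complete entry list ordered by offset, which is what makes pre-loading cheap: a reader can stream through the payload file front to back instead of seeking randomly per headword. A minimal sketch of the pattern the readers below build on it (the import path, file path, and `decode_record` stub are illustrative assumptions, not part of this commit):

```python
from app import db_manager  # assumed import path inside the server package

def decode_record(raw: 'bytes', word: 'str') -> 'str':
    """Hypothetical stand-in for a format-specific record-to-HTML converter."""
    return raw.decode('utf-8', errors='replace')

content: 'dict[str, list[str]]' = {}  # key -> [definition_html], as in the readers
with open('/dictionaries/example.dict', 'rb') as f:  # illustrative path
    for key, word, offset, size in db_manager.get_entries_all('example'):
        f.seek(offset)  # offsets arrive sorted, so seeks only move forward
        content.setdefault(key, []).append(decode_record(f.read(size), word))
```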
17 changes: 11 additions & 6 deletions server/app/dictionaries.py
@@ -20,12 +20,12 @@ class Dictionaries:
def _load_dictionary(self, dictionary_info: 'dict') -> 'None':
match dictionary_info['dictionary_format']:
case 'MDict (.mdx)':
self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
case 'StarDict (.ifo)':
self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
case 'DSL (.dsl/.dsl.dz)':
if self.settings.preferences['running_mode'] == 'normal':
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
elif self.settings.preferences['running_mode'] == 'preparation':
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], True, True)
else: # 'server' mode
@@ -41,9 +41,14 @@ def __init__(self, app: 'Flask') -> 'None':
db_manager.create_table_entries()

self.dictionaries : 'dict[str, BaseReader]' = dict()
for dictionary_info in self.settings.dictionaries_list:
self._load_dictionary(dictionary_info)
logger.info('Dictionaries loaded into memory.')
if len(self.settings.dictionaries_of_group(Settings.NAME_GROUP_LOADED_INTO_MEMORY)) > 0: # on HDD it would confuse the I/O scheduler to load the dictionaries in parallel
for dictionary_info in self.settings.dictionaries_list:
self._load_dictionary(dictionary_info)
else:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self._load_dictionary, self.settings.dictionaries_list)

logger.info('Dictionaries loaded.')

def add_dictionary(self, dictionary_info: 'dict') -> 'None':
self._load_dictionary(dictionary_info)
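Dictionary loading is now parallelised only when no dictionary is marked for pre-loading: bulk sequential reads are fastest one at a time on a spinning disk, while ordinary index loading is I/O-bound and overlaps well across threads. A self-contained sketch of that decision (the boolean stands in for the commit's check of the `NAME_GROUP_LOADED_INTO_MEMORY` group; the loader body is a stub):

```python
import concurrent.futures
import time

def load_one(info: 'dict') -> 'str':
    time.sleep(0.1)  # stand-in for blocking file/index I/O
    return info['dictionary_name']

infos = [{'dictionary_name': f'dict{i}'} for i in range(4)]
preload_group_is_nonempty = False  # illustrative flag

if preload_group_is_nonempty:
    # Sequential: keep each pre-load one long read stream, which is kind to HDDs
    for info in infos:
        load_one(info)
else:
    # Blocking I/O releases the GIL, so threads overlap even on today's CPython
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(executor.map(load_one, infos))  # list() also re-raises worker errors
```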
3 changes: 1 addition & 2 deletions server/app/dicts/dsl/markup_converter.py
@@ -199,8 +199,7 @@ def _clean_tags(self, line: 'str') -> 'str':
line = line.replace("\\[", "[").replace("\\]", "]")

# preserve newlines
if not line.endswith('>'):
print(line)
if not line.endswith('>') and not line.endswith('[/m]'):
line += '<br/>'

return line
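Besides removing a stray debug `print`, the fix stops appending `<br/>` to lines that end with `[/m]`, DSL's margin-closing tag, which would otherwise pick up a spurious line break after conversion. The rule in isolation, as a small illustrative sketch:

```python
def preserve_newline(line: 'str') -> 'str':
    # Append an explicit line break only when the line ends with neither an
    # HTML tag ('>') nor a DSL margin terminator ('[/m]').
    if not line.endswith('>') and not line.endswith('[/m]'):
        line += '<br/>'
    return line

assert preserve_newline('plain text') == 'plain text<br/>'
assert preserve_newline('<i>tagged</i>') == '<i>tagged</i>'
assert preserve_newline('[m1]indented[/m]') == '[m1]indented[/m]'
```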
36 changes: 25 additions & 11 deletions server/app/dicts/dsl_reader.py
@@ -32,7 +32,7 @@ class DSLReader(BaseReader):
@staticmethod
def _cleanup_text(text: 'str') -> 'str':
# Get rid of the BOM
text = text.replace('\ufeff', '')
text = text.replace('\ufeff', '', 1)

# Remove the {·} marker (note: the character inside the braces is not the same as the similar-looking middle dot used to separate syllables)
text = text.replace('{·}', '')
@@ -60,7 +60,7 @@ def _clean_up(dsl_decompressed_path: 'str') -> 'None':
"""
with open(dsl_decompressed_path, 'rb') as f:
data = f.read()
text = data.decode(detect_encoding(data))
text = data.decode(detect_encoding(data)) # TODO: json's detect_encoding() is not always reliable
del data
text = DSLReader._cleanup_text(text)
text = DSLReader._clean_up_opening_whitespace(text)
@@ -98,7 +98,8 @@ def __init__(self,
display_name: 'str',
performs_cleanup: 'bool'=True, # Make sure your dsl is already cleaned up if it is False
extract_resources: 'bool'=False,
remove_resources_after_extraction: 'bool'=True) -> 'None':
remove_resources_after_extraction: 'bool'=True,
load_content_into_memory: 'bool'=False) -> 'None':
super().__init__(name, filename, display_name)
filename_no_extension, extension = os.path.splitext(filename)
is_compressed = extension == '.dz'
@@ -179,6 +180,15 @@ def __init__(self,
Path(os.path.join(self._CACHE_ROOT, self.name)).mkdir(parents=True, exist_ok=True)
self._converter = DSLConverter(self.filename, self.name, os.path.join(self._CACHE_ROOT, self.name), extract_resources)

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
locations_all = db_manager.get_entries_all(self.name)
with idzip.open(self.filename) as f:
for key, word, offset, size in locations_all:
record = self._get_record(f, offset, size)
self._content.setdefault(key, []).append(self._converter.convert((record, word)))

if extract_resources:
from zipfile import ZipFile

@@ -209,11 +219,15 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
records.append((self._get_record(f, offset, size), word))
return records

def entry_definition(self, entry: str) -> str:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
def entry_definition(self, entry: 'str') -> 'str':
if self._loaded_content_into_memory:
articles = self._content.get(entry)
return self._ARTICLE_SEPARATOR.join(articles)
else:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
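With the flag set, `entry_definition` reduces to a dict lookup plus a join: decompression, record extraction, and DSL-to-HTML conversion all happened once in the constructor. A hedged usage sketch (import path, names, and file path are made up; the constructor signature is the one shown above):

```python
from app.dicts.dsl_reader import DSLReader  # assumed import path

# Pay the full conversion cost once, at start-up...
reader = DSLReader(
    'example_dsl',                    # internal dictionary name
    '/dictionaries/example.dsl.dz',   # illustrative path
    'Example DSL Dictionary',
    load_content_into_memory=True,
)

# ...then every lookup is served straight from the in-memory cache.
html = reader.entry_definition('headword')
```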
53 changes: 41 additions & 12 deletions server/app/dicts/mdict_reader.py
@@ -2,6 +2,8 @@
import zlib
import os
from pathlib import Path
import pickle
import io
try:
import lzo
except ImportError:
@@ -16,9 +18,9 @@
logger.setLevel(logging.INFO)

class MDictReader(BaseReader):
FILENAME_MDX_PICKLE = 'mdx.pickle'
def _write_to_cache_dir(self, resource_filename: 'str', data: 'bytes') -> 'None':
absolute_path = os.path.join(self._resources_dir, resource_filename)
Path(os.path.dirname(absolute_path)).mkdir(parents=True, exist_ok=True)
with open(absolute_path, 'wb') as f:
f.write(data)

@@ -27,13 +29,24 @@ def __init__(self,
filename: 'str',
display_name: 'str',
extract_resources: 'bool'=True,
remove_resources_after_extraction: 'bool'=False) -> 'None':
remove_resources_after_extraction: 'bool'=False,
load_content_into_memory: 'bool'=False) -> 'None':
"""
It is recommended to set remove_resources_after_extraction to True on a server when you have a local backup.
"""
super().__init__(name, filename, display_name)
filename_no_extension, extension = os.path.splitext(filename)
self._resources_dir = os.path.join(self._CACHE_ROOT, name)
Path(self._resources_dir).mkdir(parents=True, exist_ok=True)

self._mdict = MDX(filename)
filename_mdx_pickle = os.path.join(self._resources_dir, self.FILENAME_MDX_PICKLE)
if os.path.isfile(filename_mdx_pickle):
mdx_pickled = True
with open(filename_mdx_pickle, 'rb') as f:
self._mdict = pickle.load(f)
else:
mdx_pickled = False
self._mdict = MDX(filename)

if not db_manager.dictionary_exists(self.name):
db_manager.drop_index()
@@ -48,12 +61,24 @@ def __init__(self,
db_manager.create_index()
logger.info('Entries of dictionary %s added to database' % self.name)

del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
if not mdx_pickled:
del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
with open(filename_mdx_pickle, 'wb') as f:
pickle.dump(self._mdict, f)

filename_no_extension, extension = os.path.splitext(filename)
self._resources_dir = os.path.join(self._CACHE_ROOT, name)
self.html_cleaner = HTMLCleaner(filename, name, self._resources_dir)

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
# with open(self._mdict._fname, 'rb') as f:
# self._content = io.BytesIO(f.read())
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
locations_all = db_manager.get_entries_all(self.name)
with open(self._mdict._fname, 'rb') as f:
for key, word, offset, length in locations_all:
record = self._get_record(f, offset, length)
self._content.setdefault(key, []).append(self.html_cleaner.clean(record))

if extract_resources and not os.path.isdir(self._resources_dir): # Only extract the files once
# Load the resource files (.mdd), if any
# For example, for the dictionary collinse22f.mdx, there are four .mdd files:
@@ -170,9 +195,13 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
return records

def entry_definition(self, entry: 'str') -> 'str':
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# Cleaning up HTML actually takes some time to complete
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self.html_cleaner.clean, records))
return self._ARTICLE_SEPARATOR.join(records)
if self._loaded_content_into_memory:
articles = self._content.get(entry)
return self._ARTICLE_SEPARATOR.join(articles)
else:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# Cleaning up HTML actually takes some time to complete
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self.html_cleaner.clean, records))
return self._ARTICLE_SEPARATOR.join(records)
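Parsing an .mdx header and key index on every start-up is wasteful, so the reader now pickles the parsed `MDX` object (its bulky `_key_list` already deleted) into the cache directory and reloads it on the next run. Here is the underlying load-or-build-then-dump pattern as a standalone sketch; note the same simplification as in the hunk above: nothing invalidates a stale pickle if the source file later changes.

```python
import os
import pickle
from typing import Callable, TypeVar

T = TypeVar('T')

def load_or_build(cache_path: 'str', build: 'Callable[[], T]') -> 'T':
    """Return the cached object if present; otherwise build, cache and return it."""
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)  # assumes the cache is trusted and current
    obj = build()
    with open(cache_path, 'wb') as f:
        pickle.dump(obj, f)
    return obj
```

Since `pickle.load` can execute arbitrary code, this is only safe for cache files the application itself wrote.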
105 changes: 59 additions & 46 deletions server/app/dicts/stardict/stardict.py
@@ -225,8 +225,8 @@ def get_syn(self, synonym_word):
class DictFileReader(object):
"""Read the .dict file, store the data in memory for querying.
"""
def __init__(self, filename, dict_ifo, dict_index):

def __init__(self, filename, dict_ifo, dict_index, load_content_into_memory=False):
"""Constructor.
Arguments:
@@ -237,20 +237,33 @@
self._dict_ifo = dict_ifo
self._dict_index = dict_index
self._offset = 0
self._loaded_content_into_memory = load_content_into_memory
compressed = os.path.splitext(filename)[1] == ".dz"
if compressed:
#with gzip.open(filename, "rb") as dict_file:
# self._dict_file = dict_file.read()
self.fd = idzip.open(filename)
if load_content_into_memory:
if compressed:
with idzip.open(filename) as f:
self._content = f.read()
else:
with open(filename, "rb") as f:
self._content = f.read()
else:
self.fd = open(filename, "rb")
if compressed:
#with gzip.open(filename, "rb") as dict_file:
# self._dict_file = dict_file.read()
self.fd = idzip.open(filename)
else:
self.fd = open(filename, "rb")

def close(self):
self.fd.close()
if not self._loaded_content_into_memory:
self.fd.close()

def _get_dict_by_offset_size_internal(self, offset, size, sametypesequence, result):
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if self._loaded_content_into_memory:
self._dict_file = self._content[offset:(offset+size)]
else:
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if sametypesequence:
result.append(self._get_entry_sametypesequence(0, size))
else:
Expand All @@ -262,45 +275,45 @@ def get_dict_by_offset_size(self, offset, size):
self._get_dict_by_offset_size_internal(offset, size, sametypesequence, result)
return result

def get_dict_by_word(self, word):
"""Get the word's dictionary data by it's name.
# def get_dict_by_word(self, word):
# """Get the word's dictionary data by it's name.

Arguments:
- `word`: word name.
Return:
The specified word's dictionary data, in form of dict as below:
{type_identifier: infomation, ...}
in which type_identifier can be any character in "mlgtxykwhnrWP".
"""
if type(word) != type(b""):
word = word.encode("utf-8")
indexes = self._dict_index.get_index_by_word(word)
if indexes == False:
return False
sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
result = list()
for index in indexes:
self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
return result
# Arguments:
# - `word`: word name.
# Return:
# The specified word's dictionary data, in form of dict as below:
# {type_identifier: infomation, ...}
# in which type_identifier can be any character in "mlgtxykwhnrWP".
# """
# if type(word) != type(b""):
# word = word.encode("utf-8")
# indexes = self._dict_index.get_index_by_word(word)
# if indexes == False:
# return False
# sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
# result = list()
# for index in indexes:
# self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
# return result

def get_dict_by_index(self, index):
"""Get the word's dictionary data by it's index infomation.
# def get_dict_by_index(self, index):
# """Get the word's dictionary data by it's index infomation.

Arguments:
- `index`: index of a word entrt in .idx file.'
Return:
The specified word's dictionary data, in form of dict as below:
{type_identifier: infomation, ...}
in which type_identifier can be any character in "mlgtxykwhnrWP".
"""
word, offset, size = self._dict_index.get_index_by_num(index)
sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if sametypesequence:
return self._get_entry_sametypesequence(0, size)
else:
return self._get_entry(0, size)
# Arguments:
# - `index`: index of a word entrt in .idx file.'
# Return:
# The specified word's dictionary data, in form of dict as below:
# {type_identifier: infomation, ...}
# in which type_identifier can be any character in "mlgtxykwhnrWP".
# """
# word, offset, size = self._dict_index.get_index_by_num(index)
# sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
# self.fd.seek(offset)
# self._dict_file = self.fd.read(size)
# if sametypesequence:
# return self._get_entry_sametypesequence(0, size)
# else:
# return self._get_entry(0, size)

def _get_entry(self, offset, size):
result = dict()
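For StarDict, pre-loading replaces each per-lookup `seek`/`read` on the (possibly dictzip-compressed) .dict file with a plain slice of a bytes buffer read once in the constructor. The two access paths are equivalent, as this standalone sketch shows (the payload is fabricated):

```python
import io

payload = b'0123456789abcdefghij'  # stand-in for a whole decompressed .dict file
offset, size = 10, 6               # as stored in the .idx index

f = io.BytesIO(payload)            # file-like handle, as in the old code path
f.seek(offset)
from_file = f.read(size)           # old path: seek + read per lookup

from_memory = payload[offset:offset + size]  # new path: slice of preloaded bytes
assert from_file == from_memory == b'abcdef'
```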