fix: load dictionary files into memory instead
Crissium committed Nov 3, 2023
1 parent a3e85c8 commit 4615beb
Showing 3 changed files with 49 additions and 68 deletions.
server/app/dicts/dsl_reader.py (21 additions, 20 deletions)
@@ -182,12 +182,8 @@ def __init__(self,
 
 		self._loaded_content_into_memory = load_content_into_memory
 		if load_content_into_memory:
-			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
-			locations_all = db_manager.get_entries_all(self.name)
 			with idzip.open(self.filename) as f:
-				for key, word, offset, size in locations_all:
-					record = self._get_record(f, offset, size)
-					self._content.setdefault(key, []).append(self._converter.convert((record, word)))
+				self._content = f.read()
 
 		if extract_resources:
 			from zipfile import ZipFile
@@ -212,22 +208,27 @@ def _get_record(self, f: 'idzip.api.IdzipFile', offset: 'int', size: 'int') -> 'str':
 		assert detect_encoding(data) == 'utf-8'
 		return data.decode('utf-8')
 
+	def _get_record_from_cache(self, offset: 'int', size: 'int') -> 'str':
+		return self._content[offset:offset+size].decode('utf-8')
+
 	def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[tuple[str, str]]':
 		records = []
-		with idzip.open(self.filename) as f:
-			for word, offset, size in locations:
-				records.append((self._get_record(f, offset, size), word))
-		return records
-
-	def entry_definition(self, entry: 'str') -> 'str':
 		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
+			# for word, offset, size in locations:
+			# 	records.append((self._get_record_from_cache(offset, size), word))
+			with concurrent.futures.ThreadPoolExecutor(len(locations)) as executor:
+				executor.map(lambda location: records.append((self._get_record_from_cache(location[1], location[2]), location[0])), locations)
 		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			# records = [self._converter.convert(*record) for record in records]
-			# DSL parsing is expensive, so we'd better parallelise it
-			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-				records = list(executor.map(self._converter.convert, records))
-			return self._ARTICLE_SEPARATOR.join(records)
+			with idzip.open(self.filename) as f:
+				for word, offset, size in locations:
+					records.append((self._get_record(f, offset, size), word))
+		return records
+
+	def entry_definition(self, entry: str) -> str:
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		# records = [self._converter.convert(*record) for record in records]
+		# DSL parsing is expensive, so we'd better parallelise it
+		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+			records = list(executor.map(self._converter.convert, records))
+		return self._ARTICLE_SEPARATOR.join(records)
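
Note on the DSL reader: idzip.open decompresses the dictzipped source transparently, so the single read() above caches the whole uncompressed dictionary as bytes, and each lookup reduces to slicing that buffer at the (offset, size) pair recorded in the database. A minimal sketch of the idea follows; InMemoryDsl is a hypothetical name, not the project's actual class.

import idzip  # third-party package: python-idzip

class InMemoryDsl:
	def __init__(self, filename: 'str') -> None:
		# idzip.open decompresses transparently, so one read()
		# yields the entire uncompressed dictionary as bytes.
		with idzip.open(filename) as f:
			self._content = f.read()

	def get_record(self, offset: 'int', size: 'int') -> 'str':
		# A lookup is now a pure slice of the cached buffer:
		# no reopening, seeking or decompressing per query.
		return self._content[offset:offset + size].decode('utf-8')

The in-memory branch of _get_records_in_batch above appears to rely on ThreadPoolExecutor.map submitting every task eagerly and on the with block waiting for completion at shutdown, so records is fully populated on return, though not necessarily in the same order as locations.
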
server/app/dicts/mdict_reader.py (14 additions, 20 deletions)
@@ -70,15 +70,9 @@ def __init__(self,
 
 		self._loaded_content_into_memory = load_content_into_memory
 		if load_content_into_memory:
-			# with open(self._mdict._fname, 'rb') as f:
-			# 	self._content = io.BytesIO(f.read())
-			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
-			locations_all = db_manager.get_entries_all(self.name)
 			with open(self._mdict._fname, 'rb') as f:
-				for key, word, offset, length in locations_all:
-					record = self._get_record(f, offset, length)
-					self._content.setdefault(key, []).append(self.html_cleaner.clean(record))
-
+				self._content = io.BytesIO(f.read())
+
 		if extract_resources and not os.path.isdir(self._resources_dir): # Only extract the files once
 			# Load the resource files (.mdd), if any
 			# For example, for the dictionary collinse22f.mdx, there are four .mdd files:
@@ -189,19 +183,19 @@ def _get_record_v1v2(self, f, offset: 'int', length: 'int') -> 'str':
 
 	def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[str]':
 		# word is not used in mdict, which is present in the article itself.
-		mdict_fp = open(self._mdict._fname, 'rb')
+		if self._loaded_content_into_memory:
+			mdict_fp = self._content
+		else:
+			mdict_fp = open(self._mdict._fname, 'rb')
 		records = [self._get_record(mdict_fp, offset, length) for word, offset, length in locations]
-		mdict_fp.close()
+		if not self._loaded_content_into_memory:
+			mdict_fp.close()
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
-		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			# Cleaning up HTML actually takes some time to complete
-			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-				records = list(executor.map(self.html_cleaner.clean, records))
-			return self._ARTICLE_SEPARATOR.join(records)
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		# Cleaning up HTML actually takes some time to complete
+		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+			records = list(executor.map(self.html_cleaner.clean, records))
+		return self._ARTICLE_SEPARATOR.join(records)
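
Note on the MDict reader: here the cache is the raw .mdx file wrapped in io.BytesIO, which supports the same seek/read protocol as a real file handle, so _get_record works unchanged against either source; the only behavioural difference is that the shared buffer must not be closed after each batch, hence the conditional close(). A reduced sketch of this drop-in substitution, with hypothetical helper names:

import io
from typing import BinaryIO

def open_source(path: 'str', cache_in_memory: 'bool') -> 'BinaryIO':
	if cache_in_memory:
		with open(path, 'rb') as f:
			# Seekable in-memory file object, reusable across batches.
			return io.BytesIO(f.read())
	return open(path, 'rb')  # throwaway handle; the caller must close it

def read_record(fp: 'BinaryIO', offset: 'int', length: 'int') -> 'bytes':
	fp.seek(offset)
	return fp.read(length)

One caveat: a shared BytesIO carries a single seek position, so concurrent lookups against it would race; the commit only reads from it inside a sequential list comprehension, where this is safe.
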
server/app/dicts/stardict_reader.py (14 additions, 28 deletions)
@@ -20,7 +20,7 @@ def _stardict_filenames(base_filename: 'str') -> 'tuple[str, str, str, str]':
 	if not os.path.isfile(idxfile):
 		idxfile += '.gz'
 	dictfile = base_filename + '.dict.dz'
-	synfile = base_filename + 'syn.dz' # not used at the moment
+	synfile = base_filename + 'syn.dz'
 	return ifofile, idxfile, dictfile, synfile

def __init__(self,
Expand All @@ -45,30 +45,16 @@ def __init__(self,
logger.info('Entries of dictionary %s added to database' % self.name)

self._relative_root_dir = name
# assert self._relative_root_dir == name
# This assertion won't hold when the filename contains dots
self._resources_dir = os.path.join(self._CACHE_ROOT, self._relative_root_dir)

self.ifo_reader = IfoFileReader(self.ifofile)

self._html_cleaner = HtmlCleaner(self.name, os.path.dirname(self.filename), self._resources_dir)
self._xdxf_cleaner = XdxfCleaner()

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
locations_all = db_manager.get_entries_all(self.name)
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
if not os.path.isfile(self.dictfile): # it is possible that it is not dictzipped
from idzip.command import _compress
class Options:
suffix = '.dz'
keep = False
_compress(self.dictfile[:-len(Options.suffix)], Options)
dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
for key, word, offset, size in locations_all:
records = self._get_records(dict_reader, offset, size)
self._content.setdefault(key, []).extend([self._clean_up_markup(r, word) for r in records])
dict_reader.close()
self._content_dictfile = DictFileReader(self.dictfile, self.ifo_reader, None, True)

self._html_cleaner = HtmlCleaner(self.name, os.path.dirname(self.filename), self._resources_dir)
self._xdxf_cleaner = XdxfCleaner()

def _get_records(self, dict_reader: 'DictFileReader', offset: 'int', size: 'int') -> 'list[tuple[str, str]]':
"""
@@ -112,18 +98,18 @@ class Options:
 					suffix = '.dz'
 					keep = False
 				_compress(self.dictfile[:-len(Options.suffix)], Options)
-		dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
+		if self._loaded_content_into_memory:
+			dict_reader = self._content_dictfile
+		else:
+			dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
 		records = []
 		for word, offset, size in locations:
 			records.extend([self._clean_up_markup(r, word) for r in self._get_records(dict_reader, offset, size)])
-		dict_reader.close()
+		if not self._loaded_content_into_memory:
+			dict_reader.close()
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
-		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			return self._ARTICLE_SEPARATOR.join(records)
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		return self._ARTICLE_SEPARATOR.join(records)
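
Note on the StarDict reader: it follows the same open-once, close-conditionally shape, keeping a single DictFileReader (self._content_dictfile) alive for the object's lifetime instead of precomputing a key-to-HTML dict; the trailing True argument presumably asks that reader to hold the .dict payload in memory. The pattern shared by all three readers, reduced to a runnable sketch with illustrative names:

from typing import BinaryIO, Optional

class CachedReader:
	def __init__(self, path: 'str', keep_open: 'bool') -> None:
		self._path = path
		self._keep_open = keep_open
		# With keep_open, the handle lives as long as the object,
		# mirroring self._content_dictfile above.
		self._fp: 'Optional[BinaryIO]' = open(path, 'rb') if keep_open else None

	def batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[bytes]':
		fp = self._fp if self._keep_open else open(self._path, 'rb')
		records = []
		for _word, offset, size in locations:
			fp.seek(offset)
			records.append(fp.read(size))
		if not self._keep_open:
			fp.close()  # only throwaway handles are closed
		return records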
