fix: load dictionary files into memory instead
Crissium committed Nov 3, 2023
1 parent a3e85c8 commit 4615beb
Showing 3 changed files with 49 additions and 68 deletions.
server/app/dicts/dsl_reader.py (21 additions, 20 deletions)
@@ -182,12 +182,8 @@ def __init__(self,
 
 		self._loaded_content_into_memory = load_content_into_memory
 		if load_content_into_memory:
-			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
-			locations_all = db_manager.get_entries_all(self.name)
 			with idzip.open(self.filename) as f:
-				for key, word, offset, size in locations_all:
-					record = self._get_record(f, offset, size)
-					self._content.setdefault(key, []).append(self._converter.convert((record, word)))
+				self._content = f.read()
 
 		if extract_resources:
 			from zipfile import ZipFile
@@ -212,22 +208,27 @@ def _get_record(self, f: 'idzip.api.IdzipFile', offset: 'int', size: 'int') -> 'str':
 		assert detect_encoding(data) == 'utf-8'
 		return data.decode('utf-8')
 
+	def _get_record_from_cache(self, offset: 'int', size: 'int') -> 'str':
+		return self._content[offset:offset+size].decode('utf-8')
+
 	def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[tuple[str, str]]':
 		records = []
-		with idzip.open(self.filename) as f:
-			for word, offset, size in locations:
-				records.append((self._get_record(f, offset, size), word))
-		return records
-
-	def entry_definition(self, entry: 'str') -> 'str':
 		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
+			# for word, offset, size in locations:
+			# 	records.append((self._get_record_from_cache(offset, size), word))
+			with concurrent.futures.ThreadPoolExecutor(len(locations)) as executor:
+				executor.map(lambda location: records.append((self._get_record_from_cache(location[1], location[2]), location[0])), locations)
 		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			# records = [self._converter.convert(*record) for record in records]
-			# DSL parsing is expensive, so we'd better parallelise it
-			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-				records = list(executor.map(self._converter.convert, records))
-			return self._ARTICLE_SEPARATOR.join(records)
+			with idzip.open(self.filename) as f:
+				for word, offset, size in locations:
+					records.append((self._get_record(f, offset, size), word))
+		return records
+
+	def entry_definition(self, entry: str) -> str:
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		# records = [self._converter.convert(*record) for record in records]
+		# DSL parsing is expensive, so we'd better parallelise it
+		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+			records = list(executor.map(self._converter.convert, records))
+		return self._ARTICLE_SEPARATOR.join(records)
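
Note on the DSL reader: idzip.open decompresses the dictzipped source transparently, so the single read() above caches the whole uncompressed dictionary as bytes, and each lookup reduces to slicing that buffer at the (offset, size) pair recorded in the database. A minimal sketch of the idea follows; InMemoryDsl is a hypothetical name, not the project's actual class.

import idzip  # third-party package: python-idzip

class InMemoryDsl:
	def __init__(self, filename: 'str') -> None:
		# idzip.open decompresses transparently, so one read()
		# yields the entire uncompressed dictionary as bytes.
		with idzip.open(filename) as f:
			self._content = f.read()

	def get_record(self, offset: 'int', size: 'int') -> 'str':
		# A lookup is now a pure slice of the cached buffer:
		# no reopening, seeking or decompressing per query.
		return self._content[offset:offset + size].decode('utf-8')

The in-memory branch of _get_records_in_batch above appears to rely on ThreadPoolExecutor.map submitting every task eagerly and on the with block waiting for completion at shutdown, so records is fully populated on return, though not necessarily in the same order as locations.
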
server/app/dicts/mdict_reader.py (14 additions, 20 deletions)
@@ -70,15 +70,9 @@ def __init__(self,
 
 		self._loaded_content_into_memory = load_content_into_memory
 		if load_content_into_memory:
-			# with open(self._mdict._fname, 'rb') as f:
-			# 	self._content = io.BytesIO(f.read())
-			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
-			locations_all = db_manager.get_entries_all(self.name)
 			with open(self._mdict._fname, 'rb') as f:
-				for key, word, offset, length in locations_all:
-					record = self._get_record(f, offset, length)
-					self._content.setdefault(key, []).append(self.html_cleaner.clean(record))
-
+				self._content = io.BytesIO(f.read())
+
 		if extract_resources and not os.path.isdir(self._resources_dir): # Only extract the files once
 			# Load the resource files (.mdd), if any
 			# For example, for the dictionary collinse22f.mdx, there are four .mdd files:
@@ -189,19 +183,19 @@ def _get_record_v1v2(self, f, offset: 'int', length: 'int') -> 'str':
 
 	def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[str]':
 		# word is not used in mdict, which is present in the article itself.
-		mdict_fp = open(self._mdict._fname, 'rb')
+		if self._loaded_content_into_memory:
+			mdict_fp = self._content
+		else:
+			mdict_fp = open(self._mdict._fname, 'rb')
 		records = [self._get_record(mdict_fp, offset, length) for word, offset, length in locations]
-		mdict_fp.close()
+		if not self._loaded_content_into_memory:
+			mdict_fp.close()
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
-		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			# Cleaning up HTML actually takes some time to complete
-			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-				records = list(executor.map(self.html_cleaner.clean, records))
-			return self._ARTICLE_SEPARATOR.join(records)
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		# Cleaning up HTML actually takes some time to complete
+		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+			records = list(executor.map(self.html_cleaner.clean, records))
+		return self._ARTICLE_SEPARATOR.join(records)
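
Note on the MDict reader: here the cache is the raw .mdx file wrapped in io.BytesIO, which supports the same seek/read protocol as a real file handle, so _get_record works unchanged against either source; the only behavioural difference is that the shared buffer must not be closed after each batch, hence the conditional close(). A reduced sketch of this drop-in substitution, with hypothetical helper names:

import io
from typing import BinaryIO

def open_source(path: 'str', cache_in_memory: 'bool') -> 'BinaryIO':
	if cache_in_memory:
		with open(path, 'rb') as f:
			# Seekable in-memory file object, reusable across batches.
			return io.BytesIO(f.read())
	return open(path, 'rb')  # throwaway handle; the caller must close it

def read_record(fp: 'BinaryIO', offset: 'int', length: 'int') -> 'bytes':
	fp.seek(offset)
	return fp.read(length)

One caveat: a shared BytesIO carries a single seek position, so concurrent lookups against it would race; the commit only reads from it inside a sequential list comprehension, where this is safe.
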
server/app/dicts/stardict_reader.py (14 additions, 28 deletions)
@@ -20,7 +20,7 @@ def _stardict_filenames(base_filename: 'str') -> 'tuple[str, str, str, str]':
 	if not os.path.isfile(idxfile):
 		idxfile += '.gz'
 	dictfile = base_filename + '.dict.dz'
-	synfile = base_filename + 'syn.dz' # not used at the moment
+	synfile = base_filename + 'syn.dz'
 	return ifofile, idxfile, dictfile, synfile

def __init__(self,
Expand All @@ -45,30 +45,16 @@ def __init__(self,
logger.info('Entries of dictionary %s added to database' % self.name)

self._relative_root_dir = name
# assert self._relative_root_dir == name
# This assertion won't hold when the filename contains dots
self._resources_dir = os.path.join(self._CACHE_ROOT, self._relative_root_dir)

self.ifo_reader = IfoFileReader(self.ifofile)

self._html_cleaner = HtmlCleaner(self.name, os.path.dirname(self.filename), self._resources_dir)
self._xdxf_cleaner = XdxfCleaner()

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
locations_all = db_manager.get_entries_all(self.name)
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
if not os.path.isfile(self.dictfile): # it is possible that it is not dictzipped
from idzip.command import _compress
class Options:
suffix = '.dz'
keep = False
_compress(self.dictfile[:-len(Options.suffix)], Options)
dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
for key, word, offset, size in locations_all:
records = self._get_records(dict_reader, offset, size)
self._content.setdefault(key, []).extend([self._clean_up_markup(r, word) for r in records])
dict_reader.close()
self._content_dictfile = DictFileReader(self.dictfile, self.ifo_reader, None, True)

self._html_cleaner = HtmlCleaner(self.name, os.path.dirname(self.filename), self._resources_dir)
self._xdxf_cleaner = XdxfCleaner()

def _get_records(self, dict_reader: 'DictFileReader', offset: 'int', size: 'int') -> 'list[tuple[str, str]]':
"""
@@ -112,18 +98,18 @@ class Options:
 					suffix = '.dz'
 					keep = False
 				_compress(self.dictfile[:-len(Options.suffix)], Options)
-		dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
+		if self._loaded_content_into_memory:
+			dict_reader = self._content_dictfile
+		else:
+			dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
 		records = []
 		for word, offset, size in locations:
 			records.extend([self._clean_up_markup(r, word) for r in self._get_records(dict_reader, offset, size)])
-		dict_reader.close()
+		if not self._loaded_content_into_memory:
+			dict_reader.close()
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		if self._loaded_content_into_memory:
-			articles = self._content.get(entry)
-			return self._ARTICLE_SEPARATOR.join(articles)
-		else:
-			locations = db_manager.get_entries(entry, self.name)
-			records = self._get_records_in_batch(locations)
-			return self._ARTICLE_SEPARATOR.join(records)
+		locations = db_manager.get_entries(entry, self.name)
+		records = self._get_records_in_batch(locations)
+		return self._ARTICLE_SEPARATOR.join(records)
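
Note on the StarDict reader: it follows the same open-once, close-conditionally shape, keeping a single DictFileReader (self._content_dictfile) alive for the object's lifetime instead of precomputing a key-to-HTML dict; the trailing True argument presumably asks that reader to hold the .dict payload in memory. The pattern shared by all three readers, reduced to a runnable sketch with illustrative names:

from typing import BinaryIO, Optional

class CachedReader:
	def __init__(self, path: 'str', keep_open: 'bool') -> None:
		self._path = path
		self._keep_open = keep_open
		# With keep_open, the handle lives as long as the object,
		# mirroring self._content_dictfile above.
		self._fp: 'Optional[BinaryIO]' = open(path, 'rb') if keep_open else None

	def batch(self, locations: 'list[tuple[str, int, int]]') -> 'list[bytes]':
		fp = self._fp if self._keep_open else open(self._path, 'rb')
		records = []
		for _word, offset, size in locations:
			fp.seek(offset)
			records.append(fp.read(size))
		if not self._keep_open:
			fp.close()  # only throwaway handles are closed
		return records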
