From a3e85c8f096d204801eb776c00fd565dd26d43fc Mon Sep 17 00:00:00 2001
From: Xing Yi
Date: Fri, 3 Nov 2023 11:33:49 +0800
Subject: [PATCH] feat: allow pre-loading formatted articles

---
 README.md                                |   8 +-
 server/app/db_manager.py                 |   8 ++
 server/app/dictionaries.py               |  17 ++--
 server/app/dicts/dsl/markup_converter.py |   3 +-
 server/app/dicts/dsl_reader.py           |  36 +++++---
 server/app/dicts/mdict_reader.py         |  53 +++++++---
 server/app/dicts/stardict/stardict.py    | 105 +++++++++++++----
 server/app/dicts/stardict_reader.py      |  31 +++++--
 server/app/settings.py                   |   5 ++
 9 files changed, 180 insertions(+), 86 deletions(-)

diff --git a/README.md b/README.md
index 100ef3d..d86b22e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ![favicon](/client/public/favicon.ico)
 
-[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki)
+[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki) (At least read the general notes before using.)
 
 This project is intended to be a modern, from-the-ground-up, maintainable alternative to [GoldenDict](https://github.com/goldendict/goldendict)(-[ng](https://github.com/xiaoyifang/goldendict-ng)), developed with Flask and React.
 
@@ -58,7 +58,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
 - [X] OpenCC Chinese conversion (please set your preference in `~/.silverdict/preferences.yaml` and add `zh` to the group with Chinese dictionaries)
 - [X] Add the ability to set sources for automatic indexing, i.e. dictionaries put into the specified directories will be automatically added
 - [X] Recursive source scanning
-- [X] Multithreaded article extraction
+- [X] Multithreaded article extraction (This project will benefit hugely from [no-GIL python](https://peps.python.org/pep-0703/))
 - [X] Improve the performance of suggestions matching
 - [X] Make the suggestion size customisable
 - [X] Allow configure suggestion matching mode, listening address, running mode, etc. via a configuration file, without modifying code
@@ -74,8 +74,8 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
 - [ ] Make the strings translatable
 - [X] GoldenDict-like dictionary group support
 - [X] A mobile-friendly interface (retouch needed)
-- [ ] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
-- [ ] A C++/Qt (or QML) desktop app (development is scheduled to begin in July, 2024)[^7]
+- [X] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
+- [ ] A C++/Qt (or QML) desktop app[^7]
 
 ### Issue backlog
 
diff --git a/server/app/db_manager.py b/server/app/db_manager.py
index 6a628e0..99af9d3 100644
--- a/server/app/db_manager.py
+++ b/server/app/db_manager.py
@@ -115,6 +115,14 @@ def get_entries(key: 'str', dictionary_name: 'str') -> 'list[tuple[str, int, int
 	cursor.execute('select word, offset, size from entries where key = ? and dictionary_name = ?', (key, dictionary_name))
 	return cursor.fetchall()
 
+def get_entries_all(dictionary_name: 'str') -> 'list[tuple[str, str, int, int]]':
+	"""
+	Returns a list of (key, word, offset, size), ordered by offset.
+	"""
+	cursor = get_cursor()
+	cursor.execute('select key, word, offset, size from entries where dictionary_name = ? order by offset', (dictionary_name,))
+	return cursor.fetchall()
+
 def delete_dictionary(dictionary_name: 'str') -> 'None':
 	cursor = get_cursor()
 	cursor.execute('delete from entries where dictionary_name = ?', (dictionary_name,))
diff --git a/server/app/dictionaries.py b/server/app/dictionaries.py
index e579c31..a2be50f 100644
--- a/server/app/dictionaries.py
+++ b/server/app/dictionaries.py
@@ -20,12 +20,12 @@ class Dictionaries:
 	def _load_dictionary(self, dictionary_info: 'dict') -> 'None':
 		match dictionary_info['dictionary_format']:
 			case 'MDict (.mdx)':
-				self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
+				self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
 			case 'StarDict (.ifo)':
-				self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
+				self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
 			case 'DSL (.dsl/.dsl.dz)':
 				if self.settings.preferences['running_mode'] == 'normal':
-					self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
+					self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
 				elif self.settings.preferences['running_mode'] == 'preparation':
 					self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], True, True)
 				else: # 'server' mode
@@ -41,9 +41,14 @@ def __init__(self, app: 'Flask') -> 'None':
 		db_manager.create_table_entries()
 
 		self.dictionaries : 'dict[str, BaseReader]' = dict()
-		for dictionary_info in self.settings.dictionaries_list:
-			self._load_dictionary(dictionary_info)
-		logger.info('Dictionaries loaded into memory.')
+		if len(self.settings.dictionaries_of_group(Settings.NAME_GROUP_LOADED_INTO_MEMORY)) > 0: # loading in parallel would confuse the I/O scheduler on HDDs, so load sequentially when any dictionary is to be read into memory
+			for dictionary_info in self.settings.dictionaries_list:
+				self._load_dictionary(dictionary_info)
+		else:
+			with concurrent.futures.ThreadPoolExecutor() as executor:
+				executor.map(self._load_dictionary, self.settings.dictionaries_list)
+
+		logger.info('Dictionaries loaded.')
 
 	def add_dictionary(self, dictionary_info: 'dict') -> 'None':
 		self._load_dictionary(dictionary_info)
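The new get_entries_all() deliberately orders rows by offset, so a preload pass walks each data file front to back instead of seeking randomly. All three readers below consume it the same way; roughly as follows (a sketch only — open_data_file() and the import path are illustrative placeholders, not code from this patch):

	from app import db_manager  # assuming the server package is importable

	def preload_articles(reader) -> 'dict[str, list[str]]':
		content = {}  # key -> [formatted article], as in the readers below
		with reader.open_data_file() as f:  # hypothetical format-specific helper
			# Rows arrive ordered by offset, so each read continues
			# where the previous one ended.
			for key, word, offset, size in db_manager.get_entries_all(reader.name):
				content.setdefault(key, []).append(reader._get_record(f, offset, size))
		return content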
diff --git a/server/app/dicts/dsl/markup_converter.py b/server/app/dicts/dsl/markup_converter.py
index cdedebd..aca2eb1 100644
--- a/server/app/dicts/dsl/markup_converter.py
+++ b/server/app/dicts/dsl/markup_converter.py
@@ -199,8 +199,7 @@ def _clean_tags(self, line: 'str') -> 'str':
 		line = line.replace("\\[", "[").replace("\\]", "]")
 
 		# preserve newlines
-		if not line.endswith('>'):
-			print(line)
+		if not line.endswith('>') and not line.endswith('[/m]'):
 			line += '<br/>'
 
 		return line
diff --git a/server/app/dicts/dsl_reader.py b/server/app/dicts/dsl_reader.py
index 748dd87..1c993d8 100644
--- a/server/app/dicts/dsl_reader.py
+++ b/server/app/dicts/dsl_reader.py
@@ -32,7 +32,7 @@ class DSLReader(BaseReader):
 	@staticmethod
 	def _cleanup_text(text: 'str') -> 'str':
 		# Get rid of the BOM
-		text = text.replace('\ufeff', '')
+		text = text.replace('\ufeff', '', 1)
 
 		# Remove the {·} marker (note: this is not the same as {·}, which is used to separate syllables)
 		text = text.replace('{·}', '')
@@ -60,7 +60,7 @@ def _clean_up(dsl_decompressed_path: 'str') -> 'None':
 		"""
 		with open(dsl_decompressed_path, 'rb') as f:
 			data = f.read()
-		text = data.decode(detect_encoding(data))
+		text = data.decode(detect_encoding(data)) # TODO: json's detect_encoding() is not always reliable
 		del data
 		text = DSLReader._cleanup_text(text)
 		text = DSLReader._clean_up_opening_whitespace(text)
@@ -98,7 +98,8 @@ def __init__(self,
 			display_name: 'str',
 			performs_cleanup: 'bool'=True, # Make sure your dsl is already cleaned up if it is False
 			extract_resources: 'bool'=False,
-			remove_resources_after_extraction: 'bool'=True) -> 'None':
+			remove_resources_after_extraction: 'bool'=True,
+			load_content_into_memory: 'bool'=False) -> 'None':
 		super().__init__(name, filename, display_name)
 		filename_no_extension, extension = os.path.splitext(filename)
 		is_compressed = extension == '.dz'
@@ -179,6 +180,15 @@
 		Path(os.path.join(self._CACHE_ROOT, self.name)).mkdir(parents=True, exist_ok=True)
 		self._converter = DSLConverter(self.filename, self.name, os.path.join(self._CACHE_ROOT, self.name), extract_resources)
 
+		self._loaded_content_into_memory = load_content_into_memory
+		if load_content_into_memory:
+			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
+			locations_all = db_manager.get_entries_all(self.name)
+			with idzip.open(self.filename) as f:
+				for key, word, offset, size in locations_all:
+					record = self._get_record(f, offset, size)
+					self._content.setdefault(key, []).append(self._converter.convert((record, word)))
+
 		if extract_resources:
 			from zipfile import ZipFile
@@ -209,11 +219,15 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
 				records.append((self._get_record(f, offset, size), word))
 		return records
 
-	def entry_definition(self, entry: str) -> str:
-		locations = db_manager.get_entries(entry, self.name)
-		records = self._get_records_in_batch(locations)
-		# records = [self._converter.convert(*record) for record in records]
-		# DSL parsing is expensive, so we'd better parallelise it
-		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-			records = list(executor.map(self._converter.convert, records))
-		return self._ARTICLE_SEPARATOR.join(records)
+	def entry_definition(self, entry: 'str') -> 'str':
+		if self._loaded_content_into_memory:
+			articles = self._content.get(entry, []) # default to [] so unknown entries yield '' like the cold path
+			return self._ARTICLE_SEPARATOR.join(articles)
+		else:
+			locations = db_manager.get_entries(entry, self.name)
+			records = self._get_records_in_batch(locations)
+			# records = [self._converter.convert(*record) for record in records]
+			# DSL parsing is expensive, so we'd better parallelise it
+			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+				records = list(executor.map(self._converter.convert, records))
+			return self._ARTICLE_SEPARATOR.join(records)
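DSL markup conversion dominates the cold path, hence the per-entry thread pool above. One caveat when reusing the pattern: ThreadPoolExecutor rejects max_workers=0, so an entry that resolves to zero records must be guarded against. A defensive variant (the function name is illustrative, not from this patch):

	import concurrent.futures

	def convert_parallel(converter, records: 'list[tuple[str, str]]') -> 'list[str]':
		# Each element is a (raw_record, headword) pair, as produced
		# by _get_records_in_batch() above.
		if not records: # ThreadPoolExecutor(0) raises ValueError
			return []
		with concurrent.futures.ThreadPoolExecutor(max_workers=len(records)) as executor:
			return list(executor.map(converter.convert, records))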
diff --git a/server/app/dicts/mdict_reader.py b/server/app/dicts/mdict_reader.py
index e21b36c..4fc6e99 100644
--- a/server/app/dicts/mdict_reader.py
+++ b/server/app/dicts/mdict_reader.py
@@ -2,6 +2,8 @@
 import zlib
 import os
 from pathlib import Path
+import pickle
+import io
 try:
 	import lzo
 except ImportError:
@@ -16,9 +18,9 @@
 logger.setLevel(logging.INFO)
 
 class MDictReader(BaseReader):
+	FILENAME_MDX_PICKLE = 'mdx.pickle'
 	def _write_to_cache_dir(self, resource_filename: 'str', data: 'bytes') -> 'None':
 		absolute_path = os.path.join(self._resources_dir, resource_filename)
-		Path(os.path.dirname(absolute_path)).mkdir(parents=True, exist_ok=True)
 		with open(absolute_path, 'wb') as f:
 			f.write(data)
@@ -27,13 +29,24 @@ def __init__(self,
 			filename: 'str',
 			display_name: 'str',
 			extract_resources: 'bool'=True,
-			remove_resources_after_extraction: 'bool'=False) -> 'None':
+			remove_resources_after_extraction: 'bool'=False,
+			load_content_into_memory: 'bool'=False) -> 'None':
 		"""
 		It is recommended to set remove_resources_after_extraction to True on a server when you have local backup.
 		"""
 		super().__init__(name, filename, display_name)
+		filename_no_extension, extension = os.path.splitext(filename)
+		self._resources_dir = os.path.join(self._CACHE_ROOT, name)
+		Path(self._resources_dir).mkdir(parents=True, exist_ok=True)
 
-		self._mdict = MDX(filename)
+		filename_mdx_pickle = os.path.join(self._resources_dir, self.FILENAME_MDX_PICKLE)
+		if os.path.isfile(filename_mdx_pickle):
+			mdx_pickled = True
+			with open(filename_mdx_pickle, 'rb') as f:
+				self._mdict = pickle.load(f)
+		else:
+			mdx_pickled = False
+			self._mdict = MDX(filename)
 
 		if not db_manager.dictionary_exists(self.name):
 			db_manager.drop_index()
@@ -48,12 +61,24 @@
 			db_manager.create_index()
 			logger.info('Entries of dictionary %s added to database' % self.name)
 
-		del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
+		if not mdx_pickled:
+			del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
+			with open(filename_mdx_pickle, 'wb') as f:
+				pickle.dump(self._mdict, f)
 
-		filename_no_extension, extension = os.path.splitext(filename)
-		self._resources_dir = os.path.join(self._CACHE_ROOT, name)
 		self.html_cleaner = HTMLCleaner(filename, name, self._resources_dir)
 
+		self._loaded_content_into_memory = load_content_into_memory
+		if load_content_into_memory:
+			# with open(self._mdict._fname, 'rb') as f:
+			# 	self._content = io.BytesIO(f.read())
+			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
+			locations_all = db_manager.get_entries_all(self.name)
+			with open(self._mdict._fname, 'rb') as f:
+				for key, word, offset, length in locations_all:
+					record = self._get_record(f, offset, length)
+					self._content.setdefault(key, []).append(self.html_cleaner.clean(record))
+
 		if extract_resources and not os.path.isdir(self._resources_dir): # Only extract the files once
 			# Load the resource files (.mdd), if any
 			# For example, for the dictionary collinse22f.mdx, there are four .mdd files:
@@ -170,9 +195,13 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		locations = db_manager.get_entries(entry, self.name)
-		records = self._get_records_in_batch(locations)
-		# Cleaning up HTML actually takes some time to complete
-		with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
-			records = list(executor.map(self.html_cleaner.clean, records))
-		return self._ARTICLE_SEPARATOR.join(records)
+		if self._loaded_content_into_memory:
+			articles = self._content.get(entry, []) # default to [] so unknown entries yield '' like the cold path
+			return self._ARTICLE_SEPARATOR.join(articles)
+		else:
+			locations = db_manager.get_entries(entry, self.name)
+			records = self._get_records_in_batch(locations)
+			# Cleaning up HTML actually takes some time to complete
+			with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
+				records = list(executor.map(self.html_cleaner.clean, records))
+			return self._ARTICLE_SEPARATOR.join(records)
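Pickling the parsed MDX object trades a little cache-directory space for much faster startups: the key list is parsed once, trimmed, and later launches just deserialise the result. A condensed sketch of the idea (the patch interleaves the dump with database indexing and the _key_list trim; the helper below is illustration only, and note there is no invalidation — a stale mdx.pickle must be deleted by hand if the .mdx file changes):

	import os
	import pickle

	def load_or_parse(pickle_path: 'str', parse) -> 'object':
		# Return the cached object if present, else parse and cache it.
		if os.path.isfile(pickle_path):
			with open(pickle_path, 'rb') as f:
				return pickle.load(f)
		obj = parse()
		with open(pickle_path, 'wb') as f:
			pickle.dump(obj, f)
		return obj

	# e.g. mdict = load_or_parse(filename_mdx_pickle, lambda: MDX(filename))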
diff --git a/server/app/dicts/stardict/stardict.py b/server/app/dicts/stardict/stardict.py
index f3db180..39a275f 100644
--- a/server/app/dicts/stardict/stardict.py
+++ b/server/app/dicts/stardict/stardict.py
@@ -225,8 +225,8 @@ def get_syn(self, synonym_word):
 class DictFileReader(object):
 	"""Read the .dict file, store the data in memory for querying.
 	"""
-
-	def __init__(self, filename, dict_ifo, dict_index):
+
+	def __init__(self, filename, dict_ifo, dict_index, load_content_into_memory=False):
 		"""Constructor.
 
 		Arguments:
@@ -237,20 +237,33 @@ def __init__(self, dict_ifo, dict_index):
 		self._dict_ifo = dict_ifo
 		self._dict_index = dict_index
 		self._offset = 0
+		self._loaded_content_into_memory = load_content_into_memory
 		compressed = os.path.splitext(filename)[1] == ".dz"
-		if compressed:
-			#with gzip.open(filename, "rb") as dict_file:
-			#	self._dict_file = dict_file.read()
-			self.fd = idzip.open(filename)
+		if load_content_into_memory:
+			if compressed:
+				with idzip.open(filename) as f:
+					self._content = f.read()
+			else:
+				with open(filename, "rb") as f:
+					self._content = f.read()
 		else:
-			self.fd = open(filename, "rb")
+			if compressed:
+				#with gzip.open(filename, "rb") as dict_file:
+				#	self._dict_file = dict_file.read()
+				self.fd = idzip.open(filename)
+			else:
+				self.fd = open(filename, "rb")
 
 	def close(self):
-		self.fd.close()
+		if not self._loaded_content_into_memory:
+			self.fd.close()
 
 	def _get_dict_by_offset_size_internal(self, offset, size, sametypesequence, result):
-		self.fd.seek(offset)
-		self._dict_file = self.fd.read(size)
+		if self._loaded_content_into_memory:
+			self._dict_file = self._content[offset:(offset+size)]
+		else:
+			self.fd.seek(offset)
+			self._dict_file = self.fd.read(size)
 		if sametypesequence:
 			result.append(self._get_entry_sametypesequence(0, size))
 		else:
@@ -262,45 +275,45 @@ def get_dict_by_offset_size(self, offset, size):
 		self._get_dict_by_offset_size_internal(offset, size, sametypesequence, result)
 		return result
 
-	def get_dict_by_word(self, word):
-		"""Get the word's dictionary data by it's name.
+	# def get_dict_by_word(self, word):
+	# 	"""Get the word's dictionary data by its name.
 
-		Arguments:
-		- `word`: word name.
-		Return:
-		The specified word's dictionary data, in form of dict as below:
-		{type_identifier: infomation, ...}
-		in which type_identifier can be any character in "mlgtxykwhnrWP".
-		"""
-		if type(word) != type(b""):
-			word = word.encode("utf-8")
-		indexes = self._dict_index.get_index_by_word(word)
-		if indexes == False:
-			return False
-		sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
-		result = list()
-		for index in indexes:
-			self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
-		return result
+	# 	Arguments:
+	# 	- `word`: word name.
+	# 	Return:
+	# 	The specified word's dictionary data, in form of dict as below:
+	# 	{type_identifier: information, ...}
+	# 	in which type_identifier can be any character in "mlgtxykwhnrWP".
+	# 	"""
+	# 	if type(word) != type(b""):
+	# 		word = word.encode("utf-8")
+	# 	indexes = self._dict_index.get_index_by_word(word)
+	# 	if indexes == False:
+	# 		return False
+	# 	sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
+	# 	result = list()
+	# 	for index in indexes:
+	# 		self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
+	# 	return result
 
-	def get_dict_by_index(self, index):
-		"""Get the word's dictionary data by it's index infomation.
+	# def get_dict_by_index(self, index):
+	# 	"""Get the word's dictionary data by its index information.
 
-		Arguments:
-		- `index`: index of a word entrt in .idx file.'
-		Return:
-		The specified word's dictionary data, in form of dict as below:
-		{type_identifier: infomation, ...}
-		in which type_identifier can be any character in "mlgtxykwhnrWP".
-		"""
-		word, offset, size = self._dict_index.get_index_by_num(index)
-		sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
-		self.fd.seek(offset)
-		self._dict_file = self.fd.read(size)
-		if sametypesequence:
-			return self._get_entry_sametypesequence(0, size)
-		else:
-			return self._get_entry(0, size)
+	# 	Arguments:
+	# 	- `index`: index of a word entry in .idx file.
+	# 	Return:
+	# 	The specified word's dictionary data, in form of dict as below:
+	# 	{type_identifier: information, ...}
+	# 	in which type_identifier can be any character in "mlgtxykwhnrWP".
+	# 	"""
+	# 	word, offset, size = self._dict_index.get_index_by_num(index)
+	# 	sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
+	# 	self.fd.seek(offset)
+	# 	self._dict_file = self.fd.read(size)
+	# 	if sametypesequence:
+	# 		return self._get_entry_sametypesequence(0, size)
+	# 	else:
+	# 		return self._get_entry(0, size)
 
 	def _get_entry(self, offset, size):
 		result = dict()
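With load_content_into_memory set, DictFileReader keeps the fully decompressed .dict payload and every lookup becomes a bytes slice; the two branches of _get_dict_by_offset_size_internal() are meant to return identical spans. The invariant, as a standalone sketch (function name is illustrative):

	import idzip

	def read_span(path: 'str', offset: 'int', size: 'int', content: 'bytes | None' = None) -> 'bytes':
		# content is the whole decompressed payload when preloaded, None otherwise.
		if content is not None:
			return content[offset:offset + size]
		with idzip.open(path) as f: # idzip provides gzip-compatible random access
			f.seek(offset)
			return f.read(size)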
diff --git a/server/app/dicts/stardict_reader.py b/server/app/dicts/stardict_reader.py
index 56b0792..652c002 100644
--- a/server/app/dicts/stardict_reader.py
+++ b/server/app/dicts/stardict_reader.py
@@ -20,13 +20,14 @@ def _stardict_filenames(base_filename: 'str') -> 'tuple[str, str, str, str]':
 		if not os.path.isfile(idxfile):
 			idxfile += '.gz'
 		dictfile = base_filename + '.dict.dz'
-		synfile = base_filename + 'syn.dz'
+		synfile = base_filename + 'syn.dz' # not used at the moment
 		return ifofile, idxfile, dictfile, synfile
 
 	def __init__(self,
 			name: 'str',
 			filename: 'str', # .ifo
-			display_name: 'str',) -> 'None':
+			display_name: 'str',
+			load_content_into_memory: 'bool'=False) -> 'None':
 		super().__init__(name, filename, display_name)
 		filename_no_extension, extension = os.path.splitext(filename)
 		self.ifofile, idxfile, self.dictfile, synfile = self._stardict_filenames(filename_no_extension)
@@ -53,6 +54,22 @@
 		self._html_cleaner = HtmlCleaner(self.name, os.path.dirname(self.filename), self._resources_dir)
 		self._xdxf_cleaner = XdxfCleaner()
 
+		self._loaded_content_into_memory = load_content_into_memory
+		if load_content_into_memory:
+			locations_all = db_manager.get_entries_all(self.name)
+			self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
+			if not os.path.isfile(self.dictfile): # it is possible that it is not dictzipped
+				from idzip.command import _compress
+				class Options:
+					suffix = '.dz'
+					keep = False
+				_compress(self.dictfile[:-len(Options.suffix)], Options)
+			dict_reader = DictFileReader(self.dictfile, self.ifo_reader, None)
+			for key, word, offset, size in locations_all:
+				records = self._get_records(dict_reader, offset, size)
+				self._content.setdefault(key, []).extend([self._clean_up_markup(r, word) for r in records])
+			dict_reader.close()
+
 	def _get_records(self, dict_reader: 'DictFileReader', offset: 'int', size: 'int') -> 'list[tuple[str, str]]':
 		"""
 		Returns a list of tuples (cttype, article).
@@ -103,6 +120,10 @@ class Options:
 		return records
 
 	def entry_definition(self, entry: 'str') -> 'str':
-		locations = db_manager.get_entries(entry, self.name)
-		records = self._get_records_in_batch(locations)
-		return self._ARTICLE_SEPARATOR.join(records)
+		if self._loaded_content_into_memory:
+			articles = self._content.get(entry, []) # default to [] so unknown entries yield '' like the cold path
+			return self._ARTICLE_SEPARATOR.join(articles)
+		else:
+			locations = db_manager.get_entries(entry, self.name)
+			records = self._get_records_in_batch(locations)
+			return self._ARTICLE_SEPARATOR.join(records)
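The dictzip step above leans on _compress(), a private helper of python-idzip that expects an argparse-style options object carrying at least suffix and keep; the nested Options class mimics exactly that. The same trick as a reusable wrapper, assuming the private interface stays stable:

	from idzip.command import _compress # private API: may change between releases

	def dictzip(path: 'str') -> 'str':
		# Compress e.g. 'foo.dict' into the seekable 'foo.dict.dz'.
		class Options:
			suffix = '.dz'
			keep = False # remove the uncompressed original afterwards
		_compress(path, Options)
		return path + Options.suffix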
diff --git a/server/app/settings.py b/server/app/settings.py
index 442b45d..9cbdb13 100644
--- a/server/app/settings.py
+++ b/server/app/settings.py
@@ -100,6 +100,8 @@ class Settings:
 
 	NGRAM_LEN = 4
 
+	NAME_GROUP_LOADED_INTO_MEMORY = 'Memory'
+
 	def _preferences_valid(self) -> 'bool':
 		return all(key in self.preferences.keys() for key in ['listening_address', 'suggestions_mode', 'running_mode']) and self.preferences['suggestions_mode'] in ('right-side', 'both-sides') and self.preferences['running_mode'] in ('normal', 'preparation', 'server')
@@ -429,6 +431,9 @@ def dictionaries_of_group(self, group_name: 'str') -> 'list[str]':
 		names = [dictionary_name for dictionary_name, groups in self.junction_table.items() if group_name in groups]
 		# junction_table's keys are unordered, so we need to sort the list according to the order in dictionary_list
 		return [dictionary_info['dictionary_name'] for dictionary_info in self.dictionaries_list if dictionary_info['dictionary_name'] in names]
+
+	def dictionary_is_in_group(self, dictionary_name: 'str', group_name: 'str') -> 'bool':
+		return group_name in self.junction_table[dictionary_name]
 
 	def add_source(self, source: 'str') -> 'None':
 		if not source in self.misc_configs['sources']:
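Putting it together: membership in the reserved 'Memory' group is the single switch that decides whether a reader preloads its formatted articles. A hypothetical caller, mirroring _load_dictionary() in dictionaries.py above:

	preload = settings.dictionary_is_in_group(name, Settings.NAME_GROUP_LOADED_INTO_MEMORY)
	reader = MDictReader(name, filename, display_name, load_content_into_memory=preload)

Note that dictionary_is_in_group() indexes junction_table directly and so assumes every known dictionary already has a groups entry; that holds for callers passing names taken from dictionaries_list.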