feat: allow pre-loading formatted articles
Crissium committed Nov 3, 2023
1 parent c8ea5c3 commit a3e85c8
Showing 9 changed files with 180 additions and 86 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -2,7 +2,7 @@

![favicon](/client/public/favicon.ico)

[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki)
[Documentation and Guides](https://github.com/Crissium/SilverDict/wiki) (At least read the general notes before using.)

This project is intended to be a modern, from-the-ground-up, maintainable alternative to [GoldenDict](https://github.com/goldendict/goldendict)(-[ng](https://github.com/xiaoyifang/goldendict-ng)), developed with Flask and React.

@@ -58,7 +58,7 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [X] OpenCC Chinese conversion (please set your preference in `~/.silverdict/preferences.yaml` and add `zh` to the group with Chinese dictionaries)
- [X] Add the ability to set sources for automatic indexing, i.e. dictionaries put into the specified directories will be automatically added
- [X] Recursive source scanning
- [X] Multithreaded article extraction
- [X] Multithreaded article extraction (This project will benefit hugely from [no-GIL python](https://peps.python.org/pep-0703/))
- [X] Improve the performance of suggestions matching
- [X] Make the suggestion size customisable
- [X] Allow configuring the suggestion matching mode, listening address, running mode, etc. via a configuration file, without modifying code
@@ -74,8 +74,8 @@ The dark theme is not built in, but rendered with the [Dark Reader Firefox exten
- [ ] Make the strings translatable
- [X] GoldenDict-like dictionary group support
- [X] A mobile-friendly interface (retouch needed)
- [ ] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
- [ ] A C++/Qt (or QML) desktop app (development is scheduled to begin in July, 2024)[^7]
- [X] [A real mobile app](https://github.com/Crissium/SilverDict-mobile)
- [ ] A C++/Qt (or QML) desktop app[^7]

### Issue backlog

8 changes: 8 additions & 0 deletions server/app/db_manager.py
@@ -115,6 +115,14 @@ def get_entries(key: 'str', dictionary_name: 'str') -> 'list[tuple[str, int, int
cursor.execute('select word, offset, size from entries where key = ? and dictionary_name = ?', (key, dictionary_name))
return cursor.fetchall()

def get_entries_all(dictionary_name: 'str') -> 'list[tuple[str, str, int, int]]':
"""
Returns a list of (key, word, offset, size).
"""
cursor = get_cursor()
cursor.execute('select key, word, offset, size from entries where dictionary_name = ? order by offset', (dictionary_name,))
return cursor.fetchall()

def delete_dictionary(dictionary_name: 'str') -> 'None':
cursor = get_cursor()
cursor.execute('delete from entries where dictionary_name = ?', (dictionary_name,))
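The new `get_entries_all` query returns a dictionary's complete entry list ordered by offset, which is what makes pre-loading cheap: a reader can stream through the payload file front to back instead of seeking randomly per headword. A minimal sketch of the pattern the readers below build on it (the import path, file path, and `decode_record` stub are illustrative assumptions, not part of this commit):

```python
from app import db_manager  # assumed import path inside the server package

def decode_record(raw: 'bytes', word: 'str') -> 'str':
    """Hypothetical stand-in for a format-specific record-to-HTML converter."""
    return raw.decode('utf-8', errors='replace')

content: 'dict[str, list[str]]' = {}  # key -> [definition_html], as in the readers
with open('/dictionaries/example.dict', 'rb') as f:  # illustrative path
    for key, word, offset, size in db_manager.get_entries_all('example'):
        f.seek(offset)  # offsets arrive sorted, so seeks only move forward
        content.setdefault(key, []).append(decode_record(f.read(size), word))
```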
17 changes: 11 additions & 6 deletions server/app/dictionaries.py
@@ -20,12 +20,12 @@ class Dictionaries:
def _load_dictionary(self, dictionary_info: 'dict') -> 'None':
match dictionary_info['dictionary_format']:
case 'MDict (.mdx)':
self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = MDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
case 'StarDict (.ifo)':
self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = StarDictReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
case 'DSL (.dsl/.dsl.dz)':
if self.settings.preferences['running_mode'] == 'normal':
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'])
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], load_content_into_memory=self.settings.dictionary_is_in_group(dictionary_info['dictionary_name'], Settings.NAME_GROUP_LOADED_INTO_MEMORY))
elif self.settings.preferences['running_mode'] == 'preparation':
self.dictionaries[dictionary_info['dictionary_name']] = DSLReader(dictionary_info['dictionary_name'], dictionary_info['dictionary_filename'], dictionary_info['dictionary_display_name'], True, True)
else: # 'server' mode
@@ -41,9 +41,14 @@ def __init__(self, app: 'Flask') -> 'None':
db_manager.create_table_entries()

self.dictionaries : 'dict[str, BaseReader]' = dict()
for dictionary_info in self.settings.dictionaries_list:
self._load_dictionary(dictionary_info)
logger.info('Dictionaries loaded into memory.')
if len(self.settings.dictionaries_of_group(Settings.NAME_GROUP_LOADED_INTO_MEMORY)) > 0: # on HDD it would confuse the I/O scheduler to load the dictionaries in parallel
for dictionary_info in self.settings.dictionaries_list:
self._load_dictionary(dictionary_info)
else:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self._load_dictionary, self.settings.dictionaries_list)

logger.info('Dictionaries loaded.')

def add_dictionary(self, dictionary_info: 'dict') -> 'None':
self._load_dictionary(dictionary_info)
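Dictionary loading is now parallelised only when no dictionary is marked for pre-loading: bulk sequential reads are fastest one at a time on a spinning disk, while ordinary index loading is I/O-bound and overlaps well across threads. A self-contained sketch of that decision (the boolean stands in for the commit's check of the `NAME_GROUP_LOADED_INTO_MEMORY` group; the loader body is a stub):

```python
import concurrent.futures
import time

def load_one(info: 'dict') -> 'str':
    time.sleep(0.1)  # stand-in for blocking file/index I/O
    return info['dictionary_name']

infos = [{'dictionary_name': f'dict{i}'} for i in range(4)]
preload_group_is_nonempty = False  # illustrative flag

if preload_group_is_nonempty:
    # Sequential: keep each pre-load one long read stream, which is kind to HDDs
    for info in infos:
        load_one(info)
else:
    # Blocking I/O releases the GIL, so threads overlap even on today's CPython
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(executor.map(load_one, infos))  # list() also re-raises worker errors
```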
3 changes: 1 addition & 2 deletions server/app/dicts/dsl/markup_converter.py
@@ -199,8 +199,7 @@ def _clean_tags(self, line: 'str') -> 'str':
line = line.replace("\\[", "[").replace("\\]", "]")

# preserve newlines
if not line.endswith('>'):
print(line)
if not line.endswith('>') and not line.endswith('[/m]'):
line += '<br/>'

return line
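Besides removing a stray debug `print`, the fix stops appending `<br/>` to lines that end with `[/m]`, DSL's margin-closing tag, which would otherwise pick up a spurious line break after conversion. The rule in isolation, as a small illustrative sketch:

```python
def preserve_newline(line: 'str') -> 'str':
    # Append an explicit line break only when the line ends with neither an
    # HTML tag ('>') nor a DSL margin terminator ('[/m]').
    if not line.endswith('>') and not line.endswith('[/m]'):
        line += '<br/>'
    return line

assert preserve_newline('plain text') == 'plain text<br/>'
assert preserve_newline('<i>tagged</i>') == '<i>tagged</i>'
assert preserve_newline('[m1]indented[/m]') == '[m1]indented[/m]'
```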
36 changes: 25 additions & 11 deletions server/app/dicts/dsl_reader.py
@@ -32,7 +32,7 @@ class DSLReader(BaseReader):
@staticmethod
def _cleanup_text(text: 'str') -> 'str':
# Get rid of the BOM
text = text.replace('\ufeff', '')
text = text.replace('\ufeff', '', 1)

# Remove the {·} marker (note: the character inside the braces is not the same as the similar-looking middle dot used to separate syllables)
text = text.replace('{·}', '')
@@ -60,7 +60,7 @@ def _clean_up(dsl_decompressed_path: 'str') -> 'None':
"""
with open(dsl_decompressed_path, 'rb') as f:
data = f.read()
text = data.decode(detect_encoding(data))
text = data.decode(detect_encoding(data)) # TODO: json's detect_encoding() is not always reliable
del data
text = DSLReader._cleanup_text(text)
text = DSLReader._clean_up_opening_whitespace(text)
@@ -98,7 +98,8 @@ def __init__(self,
display_name: 'str',
performs_cleanup: 'bool'=True, # Make sure your dsl is already cleaned up if it is False
extract_resources: 'bool'=False,
remove_resources_after_extraction: 'bool'=True) -> 'None':
remove_resources_after_extraction: 'bool'=True,
load_content_into_memory: 'bool'=False) -> 'None':
super().__init__(name, filename, display_name)
filename_no_extension, extension = os.path.splitext(filename)
is_compressed = extension == '.dz'
@@ -179,6 +180,15 @@ def __init__(self,
Path(os.path.join(self._CACHE_ROOT, self.name)).mkdir(parents=True, exist_ok=True)
self._converter = DSLConverter(self.filename, self.name, os.path.join(self._CACHE_ROOT, self.name), extract_resources)

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
locations_all = db_manager.get_entries_all(self.name)
with idzip.open(self.filename) as f:
for key, word, offset, size in locations_all:
record = self._get_record(f, offset, size)
self._content.setdefault(key, []).append(self._converter.convert((record, word)))

if extract_resources:
from zipfile import ZipFile

@@ -209,11 +219,15 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
records.append((self._get_record(f, offset, size), word))
return records

def entry_definition(self, entry: str) -> str:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
def entry_definition(self, entry: 'str') -> 'str':
if self._loaded_content_into_memory:
articles = self._content.get(entry)
return self._ARTICLE_SEPARATOR.join(articles)
else:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# records = [self._converter.convert(*record) for record in records]
# DSL parsing is expensive, so we'd better parallelise it
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self._converter.convert, records))
return self._ARTICLE_SEPARATOR.join(records)
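With the flag set, `entry_definition` reduces to a dict lookup plus a join: decompression, record extraction, and DSL-to-HTML conversion all happened once in the constructor. A hedged usage sketch (import path, names, and file path are made up; the constructor signature is the one shown above):

```python
from app.dicts.dsl_reader import DSLReader  # assumed import path

# Pay the full conversion cost once, at start-up...
reader = DSLReader(
    'example_dsl',                    # internal dictionary name
    '/dictionaries/example.dsl.dz',   # illustrative path
    'Example DSL Dictionary',
    load_content_into_memory=True,
)

# ...then every lookup is served straight from the in-memory cache.
html = reader.entry_definition('headword')
```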
53 changes: 41 additions & 12 deletions server/app/dicts/mdict_reader.py
@@ -2,6 +2,8 @@
import zlib
import os
from pathlib import Path
import pickle
import io
try:
import lzo
except ImportError:
@@ -16,9 +18,9 @@
logger.setLevel(logging.INFO)

class MDictReader(BaseReader):
FILENAME_MDX_PICKLE = 'mdx.pickle'
def _write_to_cache_dir(self, resource_filename: 'str', data: 'bytes') -> 'None':
absolute_path = os.path.join(self._resources_dir, resource_filename)
Path(os.path.dirname(absolute_path)).mkdir(parents=True, exist_ok=True)
with open(absolute_path, 'wb') as f:
f.write(data)

@@ -27,13 +29,24 @@ def __init__(self,
filename: 'str',
display_name: 'str',
extract_resources: 'bool'=True,
remove_resources_after_extraction: 'bool'=False) -> 'None':
remove_resources_after_extraction: 'bool'=False,
load_content_into_memory: 'bool'=False) -> 'None':
"""
It is recommended to set remove_resources_after_extraction to True on a server when you have a local backup.
"""
super().__init__(name, filename, display_name)
filename_no_extension, extension = os.path.splitext(filename)
self._resources_dir = os.path.join(self._CACHE_ROOT, name)
Path(self._resources_dir).mkdir(parents=True, exist_ok=True)

self._mdict = MDX(filename)
filename_mdx_pickle = os.path.join(self._resources_dir, self.FILENAME_MDX_PICKLE)
if os.path.isfile(filename_mdx_pickle):
mdx_pickled = True
with open(filename_mdx_pickle, 'rb') as f:
self._mdict = pickle.load(f)
else:
mdx_pickled = False
self._mdict = MDX(filename)

if not db_manager.dictionary_exists(self.name):
db_manager.drop_index()
@@ -48,12 +61,24 @@ def __init__(self,
db_manager.create_index()
logger.info('Entries of dictionary %s added to database' % self.name)

del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
if not mdx_pickled:
del self._mdict._key_list # a hacky way to reduce memory usage without touching the library
with open(filename_mdx_pickle, 'wb') as f:
pickle.dump(self._mdict, f)

filename_no_extension, extension = os.path.splitext(filename)
self._resources_dir = os.path.join(self._CACHE_ROOT, name)
self.html_cleaner = HTMLCleaner(filename, name, self._resources_dir)

self._loaded_content_into_memory = load_content_into_memory
if load_content_into_memory:
# with open(self._mdict._fname, 'rb') as f:
# self._content = io.BytesIO(f.read())
self._content : 'dict[str, list[str]]' = {} # key -> [definition_html]
locations_all = db_manager.get_entries_all(self.name)
with open(self._mdict._fname, 'rb') as f:
for key, word, offset, length in locations_all:
record = self._get_record(f, offset, length)
self._content.setdefault(key, []).append(self.html_cleaner.clean(record))

if extract_resources and not os.path.isdir(self._resources_dir): # Only extract the files once
# Load the resource files (.mdd), if any
# For example, for the dictionary collinse22f.mdx, there are four .mdd files:
@@ -170,9 +195,13 @@ def _get_records_in_batch(self, locations: 'list[tuple[str, int, int]]') -> 'lis
return records

def entry_definition(self, entry: 'str') -> 'str':
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# Cleaning up HTML actually takes some time to complete
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self.html_cleaner.clean, records))
return self._ARTICLE_SEPARATOR.join(records)
if self._loaded_content_into_memory:
articles = self._content.get(entry)
return self._ARTICLE_SEPARATOR.join(articles)
else:
locations = db_manager.get_entries(entry, self.name)
records = self._get_records_in_batch(locations)
# Cleaning up HTML actually takes some time to complete
with concurrent.futures.ThreadPoolExecutor(len(records)) as executor:
records = list(executor.map(self.html_cleaner.clean, records))
return self._ARTICLE_SEPARATOR.join(records)
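Parsing an .mdx header and key index on every start-up is wasteful, so the reader now pickles the parsed `MDX` object (its bulky `_key_list` already deleted) into the cache directory and reloads it on the next run. Here is the underlying load-or-build-then-dump pattern as a standalone sketch; note the same simplification as in the hunk above: nothing invalidates a stale pickle if the source file later changes.

```python
import os
import pickle
from typing import Callable, TypeVar

T = TypeVar('T')

def load_or_build(cache_path: 'str', build: 'Callable[[], T]') -> 'T':
    """Return the cached object if present; otherwise build, cache and return it."""
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)  # assumes the cache is trusted and current
    obj = build()
    with open(cache_path, 'wb') as f:
        pickle.dump(obj, f)
    return obj
```

Since `pickle.load` can execute arbitrary code, this is only safe for cache files the application itself wrote.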
105 changes: 59 additions & 46 deletions server/app/dicts/stardict/stardict.py
@@ -225,8 +225,8 @@ def get_syn(self, synonym_word):
class DictFileReader(object):
"""Read the .dict file, store the data in memory for querying.
"""
def __init__(self, filename, dict_ifo, dict_index):

def __init__(self, filename, dict_ifo, dict_index, load_content_into_memory=False):
"""Constructor.
Arguments:
@@ -237,20 +237,33 @@
self._dict_ifo = dict_ifo
self._dict_index = dict_index
self._offset = 0
self._loaded_content_into_memory = load_content_into_memory
compressed = os.path.splitext(filename)[1] == ".dz"
if compressed:
#with gzip.open(filename, "rb") as dict_file:
# self._dict_file = dict_file.read()
self.fd = idzip.open(filename)
if load_content_into_memory:
if compressed:
with idzip.open(filename) as f:
self._content = f.read()
else:
with open(filename, "rb") as f:
self._content = f.read()
else:
self.fd = open(filename, "rb")
if compressed:
#with gzip.open(filename, "rb") as dict_file:
# self._dict_file = dict_file.read()
self.fd = idzip.open(filename)
else:
self.fd = open(filename, "rb")

def close(self):
self.fd.close()
if not self._loaded_content_into_memory:
self.fd.close()

def _get_dict_by_offset_size_internal(self, offset, size, sametypesequence, result):
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if self._loaded_content_into_memory:
self._dict_file = self._content[offset:(offset+size)]
else:
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if sametypesequence:
result.append(self._get_entry_sametypesequence(0, size))
else:
Expand All @@ -262,45 +275,45 @@ def get_dict_by_offset_size(self, offset, size):
self._get_dict_by_offset_size_internal(offset, size, sametypesequence, result)
return result

def get_dict_by_word(self, word):
"""Get the word's dictionary data by it's name.
# def get_dict_by_word(self, word):
# """Get the word's dictionary data by it's name.

Arguments:
- `word`: word name.
Return:
The specified word's dictionary data, in form of dict as below:
{type_identifier: infomation, ...}
in which type_identifier can be any character in "mlgtxykwhnrWP".
"""
if type(word) != type(b""):
word = word.encode("utf-8")
indexes = self._dict_index.get_index_by_word(word)
if indexes == False:
return False
sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
result = list()
for index in indexes:
self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
return result
# Arguments:
# - `word`: word name.
# Return:
# The specified word's dictionary data, in form of dict as below:
# {type_identifier: infomation, ...}
# in which type_identifier can be any character in "mlgtxykwhnrWP".
# """
# if type(word) != type(b""):
# word = word.encode("utf-8")
# indexes = self._dict_index.get_index_by_word(word)
# if indexes == False:
# return False
# sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
# result = list()
# for index in indexes:
# self._get_dict_by_offset_size_internal(index[0], index[1], sametypesequence, result)
# return result

def get_dict_by_index(self, index):
"""Get the word's dictionary data by it's index infomation.
# def get_dict_by_index(self, index):
# """Get the word's dictionary data by it's index infomation.

Arguments:
- `index`: index of a word entrt in .idx file.'
Return:
The specified word's dictionary data, in form of dict as below:
{type_identifier: infomation, ...}
in which type_identifier can be any character in "mlgtxykwhnrWP".
"""
word, offset, size = self._dict_index.get_index_by_num(index)
sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
self.fd.seek(offset)
self._dict_file = self.fd.read(size)
if sametypesequence:
return self._get_entry_sametypesequence(0, size)
else:
return self._get_entry(0, size)
# Arguments:
# - `index`: index of a word entrt in .idx file.'
# Return:
# The specified word's dictionary data, in form of dict as below:
# {type_identifier: infomation, ...}
# in which type_identifier can be any character in "mlgtxykwhnrWP".
# """
# word, offset, size = self._dict_index.get_index_by_num(index)
# sametypesequence = self._dict_ifo.get_ifo("sametypesequence")
# self.fd.seek(offset)
# self._dict_file = self.fd.read(size)
# if sametypesequence:
# return self._get_entry_sametypesequence(0, size)
# else:
# return self._get_entry(0, size)

def _get_entry(self, offset, size):
result = dict()
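For StarDict, pre-loading replaces each per-lookup `seek`/`read` on the (possibly dictzip-compressed) .dict file with a plain slice of a bytes buffer read once in the constructor. The two access paths are equivalent, as this standalone sketch shows (the payload is fabricated):

```python
import io

payload = b'0123456789abcdefghij'  # stand-in for a whole decompressed .dict file
offset, size = 10, 6               # as stored in the .idx index

f = io.BytesIO(payload)            # file-like handle, as in the old code path
f.seek(offset)
from_file = f.read(size)           # old path: seek + read per lookup

from_memory = payload[offset:offset + size]  # new path: slice of preloaded bytes
assert from_file == from_memory == b'abcdef'
```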