forked from sveinbjornt/ensk.is
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb.py
executable file
·230 lines (186 loc) · 8.13 KB
/
db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env python3
"""
Ensk.is - Free and open English-Icelandic dictionary
Copyright (c) 2021-2024, Sveinbjorn Thordarson <[email protected]>
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
Dictionary database singleton.
"""
from typing import List, Dict
import os
import logging
import sqlite3
from pathlib import Path
from dict import CATEGORIES
DB_FILENAME = "dict.db"  # SQLite file name; the file lives next to this module

CACHED_STATEMENTS = 1024  # Passed to sqlite3.connect() as cached_statements
CACHE_SIZE_KB = 1024 * 32  # 32 MB


class EnskDatabase:
    """Singleton wrapper around the SQLite dictionary database.

    The database file is created, with its schema, next to this module the
    first time the class is instantiated and no file exists. The connection
    itself is opened lazily by conn().

    Rows are returned as plain dicts keyed by column name.
    """

    _instance = None

    def __init__(self, read_only: bool = False):
        """Set the access mode and forget any previously opened connection.

        NB: Python runs __init__ on every ``EnskDatabase(...)`` call, also
        when __new__ hands back the already existing singleton. Each such
        call therefore resets the cached connection and the access mode.
        """
        self.db_conn = None
        self.read_only = read_only

    def __new__(cls, read_only: bool = False):
        """Singleton pattern."""
        if cls._instance is None:
            logging.info("Instantiating database")
            cls._instance = super().__new__(cls)
            basepath, _ = os.path.split(os.path.realpath(__file__))
            cls._dbpath = os.path.join(basepath, DB_FILENAME)
            cls.read_only = read_only
            # Create database file and schema if no DB file exists
            if not Path(cls._dbpath).is_file():
                cls._instance._create()
        return cls._instance

    def _create(self) -> None:
        """Create database file and generate database schema."""
        logging.info("Creating database %s", self._dbpath)

        # Connecting creates the database file. This connection is only
        # used for schema setup, so make sure it is always closed again.
        conn = sqlite3.connect(self._dbpath)
        try:
            # Create dictionary table
            conn.execute(
                """
                CREATE TABLE dictionary (
                    id INTEGER UNIQUE PRIMARY KEY NOT NULL,
                    word TEXT,
                    definition TEXT,
                    ipa_uk TEXT,
                    ipa_us TEXT,
                    page_num INTEGER
                );
                """
            )
            # Create metadata table
            conn.execute(
                """
                CREATE TABLE metadata (
                    key TEXT UNIQUE PRIMARY KEY NOT NULL,
                    value TEXT
                );
                """
            )
            conn.commit()
        finally:
            conn.close()

    def reinstantiate(self) -> "EnskDatabase":
        """Discard the current singleton and return a freshly built one.

        The new instance is built through the regular constructor so that it
        is fully initialised (a bare __new__ call skips __init__ and leaves
        the object without a db_conn attribute). The access mode of the
        instance this is called on is carried over. The database file is
        recreated if it no longer exists.
        """
        read_only = self.read_only
        EnskDatabase._instance = None
        return EnskDatabase(read_only=read_only)

    @staticmethod
    def _row_to_dict(cursor: sqlite3.Cursor, row: tuple) -> Dict:
        """Row factory: map column names to the values of a result row."""
        return {col[0]: value for col, value in zip(cursor.description, row)}

    def conn(self) -> sqlite3.Connection:
        """Open database connection lazily."""
        if not self.db_conn:
            # Open database file via URI. as_uri() percent-escapes characters
            # such as '?', '#' and '%' that would otherwise be interpreted as
            # URI syntax rather than as part of the file path.
            db_uri = Path(self._dbpath).resolve().as_uri()
            if self.read_only:
                db_uri += "?mode=ro"
            logging.info("Opening database connection at %s", db_uri)
            self.db_conn = sqlite3.connect(
                db_uri,
                uri=True,
                # Thread check is only enforced for writable connections
                check_same_thread=(self.read_only is False),
                cached_statements=CACHED_STATEMENTS,
            )
            # Set cache size (a negative value is a size in KiB, not pages)
            self.db_conn.execute(f"PRAGMA cache_size = -{CACHE_SIZE_KB}")
            # Return rows as key-value dicts
            self.db_conn.row_factory = self._row_to_dict
        return self.db_conn

    def add_entry(
        self,
        w: str,
        definition: str,
        ipa_uk: str,
        ipa_us: str,
        page_num: int,
        commit: bool = False,  # Whether to commit changes to database immediately
    ) -> None:
        """Add a single entry to the dictionary.

        Unless commit is True, the insert stays pending until a later commit
        on the same connection.
        """
        conn = self.conn()
        conn.execute(
            "INSERT INTO dictionary (word, definition, ipa_uk, ipa_us, page_num) VALUES (?,?,?,?,?)",
            [w, definition, ipa_uk, ipa_us, page_num],
        )
        if commit:
            conn.commit()

    def add_metadata(self, key: str, value: str) -> None:
        """Add a single metadata entry to the database and commit."""
        conn = self.conn()
        conn.execute("INSERT INTO metadata (key, value) VALUES (?,?)", [key, value])
        conn.commit()

    def _consume(self, cursor: sqlite3.Cursor) -> List[Dict]:
        """Consume cursor and return its rows sorted case-insensitively by word.

        The word column is nullable; rows with a NULL word sort as if the
        word were an empty string, i.e. first.
        """
        return sorted(cursor, key=lambda row: (row["word"] or "").lower())

    def read_all_entries(self) -> List[Dict]:
        """Read and return all entries."""
        return self._consume(self.conn().execute("SELECT * FROM dictionary"))

    def read_all_original(self) -> List[Dict]:
        """Read and return all original entries from the Zoega dictionary."""
        return self._consume(
            self.conn().execute("SELECT * FROM dictionary WHERE page_num!=0")
        )

    def read_all_additions(self) -> List[Dict]:
        """Read and return all entries not present in the original Zoega dictionary."""
        return self._consume(
            self.conn().execute("SELECT * FROM dictionary WHERE page_num=0")
        )

    def read_all_duplicates(self) -> List[Dict]:
        """Read and return one row per word that occurs more than once.

        Each row is a dict with the dictionary columns of one entry from the
        group plus a "COUNT(*)" key holding the number of entries sharing that
        word. Unlike the other readers, the result is not sorted.
        """
        selected = self.conn().execute(
            "SELECT *, COUNT(*) FROM dictionary GROUP BY word HAVING COUNT(*) > 1"
        )
        return list(selected)

    def read_all_without_ipa(self, lang: str = "uk") -> List[Dict]:
        """Read and return all entries whose IPA for the given variant
        ("uk" or "us") is an empty string."""
        assert lang in ["uk", "us"]
        # The column name is picked from two fixed values, never taken from
        # the caller verbatim, so interpolating it into the SQL is safe.
        ipa_col = "ipa_uk" if lang == "uk" else "ipa_us"
        return self._consume(
            self.conn().execute(f"SELECT * FROM dictionary WHERE {ipa_col}=''")
        )

    def read_all_with_no_page(self) -> List[Dict]:
        """Read and return all entries without a page number (page_num is 0).

        Same query as read_all_additions().
        """
        return self.read_all_additions()

    def read_all_capitalized(self) -> List[Dict]:
        """Read and return all entries with capitalized words (A-Z initial)."""
        return self._consume(
            self.conn().execute("SELECT * FROM dictionary WHERE word GLOB '[A-Z]*'")
        )

    def read_all_in_wordcat(self, cat=None) -> List[Dict]:
        """Read all entries in a given word category.

        cat is the category abbreviation without the trailing period. An
        entry matches if its definition contains the abbreviation followed
        by a period and a space.
        """
        assert cat is not None

        # Return empty list if category is not valid
        if cat + "." not in CATEGORIES:
            return []

        # Bind the pattern as a parameter instead of building the SQL string
        selected = self.conn().execute(
            "SELECT * FROM dictionary WHERE definition LIKE ?", [f"%{cat}. %"]
        )
        return self._consume(selected)