docs: update docs to match #14

mr-martian · Sep 25, 2024 · 9a19334 · 9a19334
1 parent a4d0794
commit 9a19334
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 21 deletions.
diff --git a/Architecture.md b/Architecture.md
@@ -6,7 +6,11 @@ The primary product of this repository is a program named `rebabel-format`. This
 
 Processes operate on [SQLite](https://sqlite.org) databases of linguistic information. These databases contain three things:
 - **Units** represent linguistic objects, such as sentences, words, or morphemes. They have a type, such as `"sentence"`.
-- **Tiers** define what data can be associated with units. They have a feature name, such as `"Gender"`, a tier name, such as `"Morphology"` (so that tiers can be grouped), what unit type they apply to, and what type of value they contain (string, integer, boolean, or reference to another unit). (Tiers are also sometimes referred to as "features" in the code. The fact that this object has multiple names that both overlap with the names of its attributes is a confusion that should probably be fixed at some point.)
+- **Tiers** define what data can be associated with units. They have:
+  - A **name**, such as `"UD:FEATS:Gender"` (where `:` separates components, so this is UD data, subcategory FEATS, and specific feature name Gender).
+  - A **unit type** that they apply to
+  - A **value type** specifying what they contain (string, integer, boolean, or reference to another unit).
+  - (Tiers are also sometimes referred to as "features" in the code. The fact that this object has multiple names that both overlap with the names of its attributes is a confusion that should probably be fixed at some point.)
 - **Feature Values** are values for a particular tier for a particular unit. They are divided into definite features, which must be unique per tier-unit pair and are associated with a user and can have a confidence indicator (integer), and suggestions, which are not unique and have a probability field.
 
 The database schema is defined in [`schema.sql`](rebabel_format/schema.sql) and the Python interface for it is in [`db.py`](rebabel_format/db.py).

diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ type = "sentence"
 # N is a word
 type = "word"
 # N has a feature named UD:upos with the value NOUN
-features = [{tier = "UD", feature = "upos", value = "NOUN"}]
+features = [{feature = "UD:upos", value = "NOUN"}]
 # N is part of S
 parent = "S"
 
@@ -56,13 +56,11 @@ parent = "S"
 
 # a different way of listing features
 [[query.V.features]]
-tier = "UD"
-feature = "upos"
+feature = "UD:upos"
 value = "VERB"
 
 [[query.V.features]]
-tier = "UD/FEATS"
-feature = "Person"
+feature = "UD:FEATS:Person"
 value = "3"
 ```
 
@@ -121,8 +119,8 @@ rebabel_format.run_command(
   mappings=[
     # use CoNLL-U sentence nodes where FlexText expects phrases
     {'in_type': 'sentence', 'out_type': 'phrase'},
-    # use UD:lemma where FlexText wants FlexText/en:txt
-    {'in_feature': 'UD:lemma', 'out_feature': 'FlexText/en:txt'},
+    # use UD:lemma where FlexText wants FlexText:en:txt
+    {'in_feature': 'UD:lemma', 'out_feature': 'FlexText:en:txt'},
   ],
   # settings specific to the FlexText writer:
   # the highest non-empty node will be the phrase

diff --git a/docs/parameters.md b/docs/parameters.md
@@ -38,10 +38,6 @@ In either case, methods on `SomeWriter` can simply refer to `self.warn_on_invali
 - `type`: the type that a provided value must be (according to `isinstance`)
 - `help`: the documentation string for this parameter
 
-## `FeatureParameter`
-
-The input to this parameter type should specify a tier name and feature name. This can be done as a string (`"tier:feature"`), as a dictionary (`{"tier": tier, "feature": feature}`), or as an iterable of length 2 (`[tier, feature]`). It will be normalized to a tuple (`(tier, feature)`).
-
 ## `QueryParameter`
 
 Input to this parameter should be a dictionary which is checked to ensure that it is a valid query.

diff --git a/docs/readers.md b/docs/readers.md
@@ -11,7 +11,7 @@ class SomeReader(Reader):
     def read_file(self, fin):
         for line_number, line in enumerate(fin):
             self.set_type(line_number, 'line')
-            self.set_feature(line_number, 'something', 'line', 'str',
+            self.set_feature(line_number, 'something:line', 'str',
                              line.strip())
         self.finish_block()
 ```
@@ -27,7 +27,7 @@ Within `read_file`, units are created when information about them is specified,
 - `set_type(name, type)`: specify the unit type of `name`; if the type of a unit is not specified, a `ReaderError` will be raised
 - `set_parent(child_name, parent_name)`: set the primary parent of a given unit
 - `add_relation(child_name, parent_name)`: set a non-primary parent of a given unit
-- `set_feature(name, tier, feature, type, value)`: set `tier:feature` to `value` for unit `name`, creating the feature with type `type`, if necessary
+- `set_feature(name, feature, type, value)`: set `feature` to `value` for unit `name`, creating the feature with type `type`, if necessary
 - `finish_block(keep_uids=False)`: indicates that a segment of data is complete and should be committed to the database
   - by default, the list of names accumulated by the other methods will be cleared; this can be prevented by setting `keep_uids=True`, which is useful for cases where the input has globally unique IDs, is very large, and has relations spanning the file
 

diff --git a/docs/writers.md b/docs/writers.md
@@ -15,21 +15,19 @@ class SomeWriter(Writer):
     indent = Parameter(type=str, default='\t')
 
     def write(self, fout):
-        s_feat = self.table.add_features('S', ['something:text'])[0]
+        self.table.add_features('S', ['something:text'])
 
         w_feat_names = ['something:lemma', 'something:pos']
-        w_feat_ids = self.table.add_features('W', w_feat_names)
-        w_lemma = w_feat_ids[0]
-        w_pos = w_feat_ids[1]
+        self.table.add_features('W', w_feat_names)
 
         current_sentence = None
         for units, features in self.table.results():
             if units['S'] != current_sentence:
-                fout.write(str(features[units['S']].get(s_feat, '')) + '\n')
+                fout.write(str(features[units['S']].get('something:text', '')) + '\n')
                 current_sentence = units['S']
             fout.write(self.indent)
-            fout.write(str(features[units['W']].get(w_lemma, '')))
+            fout.write(str(features[units['W']].get('something:lemma', '')))
             fout.write(' ')
-            fout.write(str(features[units['W']].get(w_pos, '')))
+            fout.write(str(features[units['W']].get('something:pos', '')))
             fout.write('\n')
 ```