Re-space incorrectly-encoded hashes

Return a flag when hash respacing occurs. Adjusts some typing in tests/tests.py.
DDMAL · May 27, 2024 · 3499a67 · 3499a67
1 parent a063a39
commit 3499a67
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 35 deletions.
diff --git a/tests/tests.py b/tests/tests.py
@@ -6,6 +6,9 @@
 import json
 import csv
 
+from typing import List, Dict
+from typing_extensions import TypedDict
+
 from volpiano_display_utilities.latin_word_syllabification import (
     syllabify_word,
     split_word_by_syl_bounds,
@@ -31,7 +34,7 @@ class TestWordSyllabification(unittest.TestCase):
     Tests functions in latin_text_syllabification.
     """
 
-    def test_syllabify_word(self):
+    def test_syllabify_word(self) -> None:
         """Tests syllabify_word."""
         # Read test words from csv file and get syllable boundaries.
         # ie. "Be-ne-dic-tus" -> [2, 4, 7]
@@ -53,22 +56,26 @@ def test_syllabify_word(self):
             with self.subTest(word=word):
                 self.assertEqual(syllabify_word(word, return_string=False), expected)
 
-    def test_split_word_by_syl_bounds(self):
+    def test_split_word_by_syl_bounds(self) -> None:
         """
         Tests split_word_by_syl_bounds.
 
         Test 1, 2, and 2+ syllable words.
         """
         test_words = {"Benedictus": "Be-ne-dic-tus", "qui": "qui", "venit": "ve-nit"}
-        word_syl_bounds = {"Benedictus": [2, 4, 7], "qui": [], "venit": [2]}
+        word_syl_bounds: Dict[str, List[int]] = {
+            "Benedictus": [2, 4, 7],
+            "qui": [],
+            "venit": [2],
+        }
         for word, expected in test_words.items():
             with self.subTest(word=word):
                 self.assertEqual(
                     "".join(split_word_by_syl_bounds(word, word_syl_bounds[word])),
                     expected,
                 )
 
-    def test_character_check(self):
+    def test_character_check(self) -> None:
         """
         Tests that an error is raised with invalid characters.
         """
@@ -86,25 +93,25 @@ class TestCantusTextSyllabification(unittest.TestCase):
     Tests functions in cantus_text_syllabification.
     """
 
-    def test_cantus_exceptions(self):
+    def test_cantus_exceptions(self) -> None:
         """Tests syllabification of a few words that are exceptions
         in the Cantus Database."""
         exception_word = "euouae"
-        syllabified_word = flatten_syllabified_text(syllabify_text(exception_word))
+        syllabified_word = flatten_syllabified_text(syllabify_text(exception_word)[0])
         self.assertEqual(syllabified_word, "e-u-o-u-a-e")
         exception_word_capitalized = "Euouae"
         syllabified_word_capitalized = flatten_syllabified_text(
-            syllabify_text(exception_word_capitalized)
+            syllabify_text(exception_word_capitalized)[0]
         )
         self.assertEqual(syllabified_word_capitalized, "E-u-o-u-a-e")
 
-    def test_clean_text(self):
+    def test_clean_text(self) -> None:
         """Tests _clean_text."""
         initial_text = "abcdefg @#$&*[^@]#${}|~[]/|\\"
         expected_text = "abcdefg #[]#{}|~[]|"
         self.assertEqual(_clean_text(initial_text), expected_text)
 
-    def test_prepare_string_for_syllabification(self):
+    def test_prepare_string_for_syllabification(self) -> None:
         """Tests _prepare_string_for_syllabification."""
         str_hyphen_start = "-ABCDEFG"
         str_hyphen_end = "ABCDEFG-"
@@ -117,7 +124,7 @@ def test_prepare_string_for_syllabification(self):
             ("ABCDEFG", False, True),
         )
 
-    def test_split_text_sections(self):
+    def test_split_text_sections(self) -> None:
         """
         Tests _split_text_sections.
 
@@ -134,15 +141,20 @@ def test_split_text_sections(self):
         ]
         self.assertEqual(_split_text_sections(start_str), sectioned)
 
-    def test_syllabify_text(self):
+    def test_syllabify_text(self) -> None:
         """Tests syllabify_text. Constructs a test string with all possible cases."""
 
         # Full text of test:
         # "Sanctus sanctus sanctus # Sabaoth plen- # sunt # -li et {terra gloria} tua
         # Bene- {dictus} qui venit {#} no- {#} -ne {#} -omini
         # {cantic- #} {#} {# -ovum} quia mirabilia fecit | salvavit sibi dextera
         # eius et brachium sanctum eius | ~Gloria | ~Ipsum [Canticum]"
-        test_cases = [
+        class TestCaseType(TypedDict):
+            case_name: str
+            test_string: str
+            expected_result: List[List[List[str]]]
+
+        test_cases: List[TestCaseType] = [
             {
                 "case_name": "Normal Text",
                 "test_string": "Sanctus sanctus sanctus",
@@ -230,14 +242,17 @@ def test_syllabify_text(self):
         ]
         for test_case in test_cases:
             with self.subTest(test_case["case_name"]):
-                syllabified_text = syllabify_text(test_case["test_string"])
+                syllabified_text, adjusted_spacing = syllabify_text(
+                    test_case["test_string"]
+                )
                 syllabified_text_list = [
                     section.section for section in syllabified_text
                 ]
                 self.assertEqual(
                     syllabified_text_list,
                     test_case["expected_result"],
                 )
+                self.assertFalse(adjusted_spacing)
         # Test presyllabified text
         presyllabified_text = (
             # Test case where a syllable break has been added
@@ -276,21 +291,42 @@ def test_syllabify_text(self):
             [["ecce"], ["enim"], ["ex"], ["hoc"], ["be-", "a-", "tam"]],
         ]
         with self.subTest("Presyllabified Text"):
-            syllabified_text = syllabify_text(
+            syllabified_text, adjusted_spacing = syllabify_text(
                 presyllabified_text, text_presyllabified=True
             )
             syllabified_text_list = [section.section for section in syllabified_text]
             self.assertEqual(
                 syllabified_text_list,
                 expected_result,
             )
+            self.assertFalse(adjusted_spacing)
         with self.subTest("Improperly encoded #"):
-            preceding_hash_text = "rorate #-li de super"
-            following_hash_text = "rorate cae-# de super"
-            with self.assertRaises(LatinError):
-                syllabify_text(preceding_hash_text)
-            with self.assertRaises(LatinError):
-                syllabify_text(following_hash_text)
+            test_no_space_before_hash = "rorate #-li de super"
+            test_no_space_after_hash = "rorate cae-# de super"
+            expected_no_space_before_hash = [
+                [["ro-", "ra-", "te"],
+                ["#"], ["-li"],
+                ["de"], ["su-", "per"]],
+            ]
+            expected_no_space_after_hash = [
+                [["ro-", "ra-", "te"],
+                ["cae-"], ["#"],
+                ["de"], ["su-", "per"]],
+            ]
+            syllabified_text_no_space_before_hash, _ = syllabify_text(
+                test_no_space_before_hash
+            )
+            syllabified_text_no_space_after_hash, _ = syllabify_text(
+                test_no_space_after_hash
+            )
+            self.assertEqual(
+                [section.section for section in syllabified_text_no_space_before_hash],
+                expected_no_space_before_hash,
+            )
+            self.assertEqual(
+                [section.section for section in syllabified_text_no_space_after_hash],
+                expected_no_space_after_hash,
+            )
         with self.subTest("Improperly encoded [ & ]"):
             test_with_bad_bracket = "rorate | caeli [de super]"
             with self.assertRaises(LatinError):
@@ -302,7 +338,7 @@ class TestVolpianoSyllabification(unittest.TestCase):
     Tests functions for syllabifying volpiano in volpiano_syllabification.py.
     """
 
-    def test_prepare_volpiano_for_alignment(self):
+    def test_prepare_volpiano_for_alignment(self) -> None:
         standard_volpiano = "1---g---h---3"
         volpiano_with_extra_starting_matter = "tf-g-1---g-1--h---3"
         expected = "g---h---3"
@@ -326,8 +362,14 @@ def test_prepare_volpiano_for_alignment(self):
             )
             self.assertTrue(vol_chars_rmvd_flag)
 
-    def test_syllabify_volpiano(self):
-        volpiano_syllabification_test_cases = [
+    def test_syllabify_volpiano(self) -> None:
+        class TestCaseType(TypedDict):
+            case_name: str
+            volpiano: str
+            vol_improperly_encoded: bool
+            expected_result: List[List[List[str]]]
+
+        volpiano_syllabification_test_cases: List[TestCaseType] = [
             {
                 "case_name": "Section divided by barline '3' + standard spacing",
                 "volpiano": "a-b--c---d---e---3---",
@@ -390,7 +432,7 @@ def test_syllabify_volpiano(self):
                     vol_improperly_encoded, test_case["vol_improperly_encoded"]
                 )
 
-    def test_adjust_missing_music_spacing_for_rendering(self):
+    def test_adjust_missing_music_spacing_for_rendering(self) -> None:
         with self.subTest("Not a missing music section"):
             volpiano = "a-b--c---"
             text_length = 3
@@ -432,7 +474,7 @@ class TestTextVolpianoAlignment(unittest.TestCase):
     text_volpiano_alignment.py.
     """
 
-    def test_align_text_volpiano(self):
+    def test_align_text_volpiano(self) -> None:
         """
         Tests align_text_volpiano.
         """

diff --git a/volpiano_display_utilities/cantus_text_syllabification.py b/volpiano_display_utilities/cantus_text_syllabification.py
@@ -39,6 +39,14 @@
 STR_ENDS_W_HYPHEN_REGEX = re.compile(r"\-$")
 # Matches pipes and missing music sectioners ("{" and "}")
 TEXT_SECTIONER_REGEX = re.compile(r"(\||\{.*?\}|~.*?(?=\||$))")
+# Matches each hash together with a single preceding space, if
+# present, except hashes that are immediately preceded by an open
+# curly brace or that occur at the start of the string
+SPACE_PRECEDING_HASH_REGEX = re.compile(r"(?<=[^\{]) ?#")
+# Matches each hash together with a single following space, if
+# present, except hashes that are immediately followed by a close
+# curly brace or that occur at the end of the string
+SPACE_FOLLOWING_HASH_REGEX = re.compile(r"# ?(?=[^\}])")
 
 
 def _clean_text(text: str) -> str:
@@ -63,13 +71,31 @@ def _detect_invalid_characters(text: str) -> bool:
     return bool(INVALID_CHAR_REGEX.search(text))
 
 
+def _space_hash_signs(text: str) -> Tuple[str, bool]:
+    """
+    Spaces out hash signs ("#") in a text string to
+    ensure that they are syllabified as individual
+    words.
+
+    text [str]: text string to space out
+
+    returns [str, bool]: text string with hash signs spaced out
+        and boolean indicating whether spacing adjustments
+        were made
+    """
+    spaced_text = SPACE_PRECEDING_HASH_REGEX.sub(" #", text)
+    spaced_text = SPACE_FOLLOWING_HASH_REGEX.sub("# ", spaced_text)
+    adjusted = spaced_text != text
+    return spaced_text, adjusted
+
+
 def _prepare_string_for_syllabification(word_str: str) -> Tuple[str, bool, bool]:
     """
-    Complete preparation of a string before syllabification.
+    Complete preparation of a word string before syllabification.
     Hyphens are removed from the beginning and end of the string,
     and the presence of these hyphens is recorded.
 
-    word_str [str]: string to prepare
+    word_str [str]: string containing a word (or partial word) to prepare
 
     returns [tuple[str,bool,bool]]: prepared string, whether a hyphen
         was removed from the beginning of the string, whether a hyphen
@@ -102,7 +128,7 @@ def syllabify_text(
     text: str,
     clean_text: bool = False,
     text_presyllabified: bool = False,
-) -> List[SyllabifiedTextSection]:
+) -> Tuple[List[SyllabifiedTextSection], bool]:
     """
     Syllabifies a text string that has been encoded in the style
     of the Cantus Database. Texts are syllabified word by word,
@@ -120,9 +146,9 @@ def syllabify_text(
         This function finds a syllable split if and only if a hyphen is
         present (ie. no additional syllabification is performed).
 
-    returns [SyllabifiedTextSection]: an object of class SyllabifiedTextSection
-        that contains the syllabified text string. See class docstring for more
-        information.
+    returns [tuple[list[SyllabifiedTextSection], bool]]: a list of SyllabifiedTextSection
+        objects that contain the syllabified text string and a boolean indicating whether
+        spacing adjustments were made. See class docstring for more information.
     """
 
     logging.debug("Syllabifying text: %s", text)
@@ -135,6 +161,8 @@ def syllabify_text(
             raise ValueError(
                 "Invalid characters detected in text string. To clean, use clean_text=True."
             )
+    # Space out hash signs if necessary
+    text, spacing_adjusted = _space_hash_signs(text)
     # Split text into sections. Sections are divided by pipes ("|") or enclosed
     # in curly braces ("{}") or square brackets ("[]").
     text_sections = _split_text_sections(text)
@@ -195,10 +223,10 @@ def syllabify_text(
                 syllabified_section.append(syllabified_word)
         syllabified_text.append(SyllabifiedTextSection(syllabified_section))
     logging.debug("Syllabified text: %s", ", ".join(str(s) for s in syllabified_text))
-    return syllabified_text
+    return syllabified_text, spacing_adjusted
 
 
-def flatten_syllabified_text(syllabified_text=List[SyllabifiedTextSection]) -> str:
+def flatten_syllabified_text(syllabified_text: List[SyllabifiedTextSection]) -> str:
     """
     Flattens a list of syllabified text sections to a string.
 

diff --git a/volpiano_display_utilities/text_volpiano_alignment.py b/volpiano_display_utilities/text_volpiano_alignment.py
@@ -335,11 +335,13 @@ def align_text_and_volpiano(
     review_encoding_flag: bool = False
     # If cleaning of text is required, we set the review_encoding_flag to True
     try:
-        syllabified_text = syllabify_text(
+        syllabified_text, spacing_adjusted = syllabify_text(
             chant_text, clean_text=False, text_presyllabified=text_presyllabified
         )
+        if spacing_adjusted:
+            review_encoding_flag = True
     except ValueError:
-        syllabified_text = syllabify_text(
+        syllabified_text, spacing_adjusted = syllabify_text(
             chant_text, clean_text=True, text_presyllabified=text_presyllabified
         )
         review_encoding_flag = True