Skip to content

Commit

Permalink
Re-space incorrectly-encoded hashes
Browse files Browse the repository at this point in the history
Return flag when hash respacing occurs

Adjusts some typing in tests/tests.py
  • Loading branch information
dchiller committed May 27, 2024
1 parent a063a39 commit 3499a67
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 35 deletions.
92 changes: 67 additions & 25 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import json
import csv

from typing import List, Dict
from typing_extensions import TypedDict

from volpiano_display_utilities.latin_word_syllabification import (
syllabify_word,
split_word_by_syl_bounds,
Expand All @@ -31,7 +34,7 @@ class TestWordSyllabification(unittest.TestCase):
Tests functions in latin_text_syllabification.
"""

def test_syllabify_word(self):
def test_syllabify_word(self) -> None:
"""Tests syllabify_word."""
# Read test words from csv file and get syllable boundaries.
# ie. "Be-ne-dic-tus" -> [2, 4, 7]
Expand All @@ -53,22 +56,26 @@ def test_syllabify_word(self):
with self.subTest(word=word):
self.assertEqual(syllabify_word(word, return_string=False), expected)

def test_split_word_by_syl_bounds(self):
def test_split_word_by_syl_bounds(self) -> None:
"""
Tests split_word_by_syl_bounds.
Test 1, 2, and 2+ syllable words.
"""
test_words = {"Benedictus": "Be-ne-dic-tus", "qui": "qui", "venit": "ve-nit"}
word_syl_bounds = {"Benedictus": [2, 4, 7], "qui": [], "venit": [2]}
word_syl_bounds: Dict[str, List[int]] = {
"Benedictus": [2, 4, 7],
"qui": [],
"venit": [2],
}
for word, expected in test_words.items():
with self.subTest(word=word):
self.assertEqual(
"".join(split_word_by_syl_bounds(word, word_syl_bounds[word])),
expected,
)

def test_character_check(self):
def test_character_check(self) -> None:
"""
Tests that an error is raised with invalid characters.
"""
Expand All @@ -86,25 +93,25 @@ class TestCantusTextSyllabification(unittest.TestCase):
Tests functions in cantus_text_syllabification.
"""

def test_cantus_exceptions(self):
def test_cantus_exceptions(self) -> None:
"""Tests syllabification of a few words that are exceptions
in the Cantus Database."""
exception_word = "euouae"
syllabified_word = flatten_syllabified_text(syllabify_text(exception_word))
syllabified_word = flatten_syllabified_text(syllabify_text(exception_word)[0])
self.assertEqual(syllabified_word, "e-u-o-u-a-e")
exception_word_capitalized = "Euouae"
syllabified_word_capitalized = flatten_syllabified_text(
syllabify_text(exception_word_capitalized)
syllabify_text(exception_word_capitalized)[0]
)
self.assertEqual(syllabified_word_capitalized, "E-u-o-u-a-e")

def test_clean_text(self):
def test_clean_text(self) -> None:
"""Tests _clean_text."""
initial_text = "abcdefg @#$&*[^@]#${}|~[]/|\\"
expected_text = "abcdefg #[]#{}|~[]|"
self.assertEqual(_clean_text(initial_text), expected_text)

def test_prepare_string_for_syllabification(self):
def test_prepare_string_for_syllabification(self) -> None:
"""Tests _prepare_string_for_syllabification."""
str_hyphen_start = "-ABCDEFG"
str_hyphen_end = "ABCDEFG-"
Expand All @@ -117,7 +124,7 @@ def test_prepare_string_for_syllabification(self):
("ABCDEFG", False, True),
)

def test_split_text_sections(self):
def test_split_text_sections(self) -> None:
"""
Tests _split_text_sections.
Expand All @@ -134,15 +141,20 @@ def test_split_text_sections(self):
]
self.assertEqual(_split_text_sections(start_str), sectioned)

def test_syllabify_text(self):
def test_syllabify_text(self) -> None:
"""Tests syllabify_text. Constructs a test string with all possible cases."""

# Full text of test:
# "Sanctus sanctus sanctus # Sabaoth plen- # sunt # -li et {terra gloria} tua
# Bene- {dictus} qui venit {#} no- {#} -ne {#} -omini
# {cantic- #} {#} {# -ovum} quia mirabilia fecit | salvavit sibi dextera
# eius et brachium sanctum eius | ~Gloria | ~Ipsum [Canticum]"
test_cases = [
class TestCaseType(TypedDict):
case_name: str
test_string: str
expected_result: List[List[List[str]]]

test_cases: List[TestCaseType] = [
{
"case_name": "Normal Text",
"test_string": "Sanctus sanctus sanctus",
Expand Down Expand Up @@ -230,14 +242,17 @@ def test_syllabify_text(self):
]
for test_case in test_cases:
with self.subTest(test_case["case_name"]):
syllabified_text = syllabify_text(test_case["test_string"])
syllabified_text, adjusted_spacing = syllabify_text(
test_case["test_string"]
)
syllabified_text_list = [
section.section for section in syllabified_text
]
self.assertEqual(
syllabified_text_list,
test_case["expected_result"],
)
self.assertFalse(adjusted_spacing)
# Test presyllabified text
presyllabified_text = (
# Test case where a syllable break has been added
Expand Down Expand Up @@ -276,21 +291,42 @@ def test_syllabify_text(self):
[["ecce"], ["enim"], ["ex"], ["hoc"], ["be-", "a-", "tam"]],
]
with self.subTest("Presyllabified Text"):
syllabified_text = syllabify_text(
syllabified_text, adjusted_spacing = syllabify_text(
presyllabified_text, text_presyllabified=True
)
syllabified_text_list = [section.section for section in syllabified_text]
self.assertEqual(
syllabified_text_list,
expected_result,
)
self.assertFalse(adjusted_spacing)
with self.subTest("Improperly encoded #"):
preceding_hash_text = "rorate #-li de super"
following_hash_text = "rorate cae-# de super"
with self.assertRaises(LatinError):
syllabify_text(preceding_hash_text)
with self.assertRaises(LatinError):
syllabify_text(following_hash_text)
test_no_space_before_hash = "rorate #-li de super"
test_no_space_after_hash = "rorate cae-# de super"
expected_no_space_before_hash = [
[["ro-", "ra-", "te"],
["#"], ["-li"],
["de"], ["su-", "per"]],
]
expected_no_space_after_hash = [
[["ro-", "ra-", "te"],
["cae-"], ["#"],
["de"], ["su-", "per"]],
]
syllabified_text_no_space_before_hash, _ = syllabify_text(
test_no_space_before_hash
)
syllabified_text_no_space_after_hash, _ = syllabify_text(
test_no_space_after_hash
)
self.assertEqual(
[section.section for section in syllabified_text_no_space_before_hash],
expected_no_space_before_hash,
)
self.assertEqual(
[section.section for section in syllabified_text_no_space_after_hash],
expected_no_space_after_hash,
)
with self.subTest("Improperly encoded [ & ]"):
test_with_bad_bracket = "rorate | caeli [de super]"
with self.assertRaises(LatinError):
Expand All @@ -302,7 +338,7 @@ class TestVolpianoSyllabification(unittest.TestCase):
Tests functions for syllabifying volpiano in volpiano_syllabification.py.
"""

def test_prepare_volpiano_for_alignment(self):
def test_prepare_volpiano_for_alignment(self) -> None:
standard_volpiano = "1---g---h---3"
volpiano_with_extra_starting_matter = "tf-g-1---g-1--h---3"
expected = "g---h---3"
Expand All @@ -326,8 +362,14 @@ def test_prepare_volpiano_for_alignment(self):
)
self.assertTrue(vol_chars_rmvd_flag)

def test_syllabify_volpiano(self):
volpiano_syllabification_test_cases = [
def test_syllabify_volpiano(self) -> None:
class TestCaseType(TypedDict):
case_name: str
volpiano: str
vol_improperly_encoded: bool
expected_result: List[List[List[str]]]

volpiano_syllabification_test_cases: List[TestCaseType] = [
{
"case_name": "Section divided by barline '3' + standard spacing",
"volpiano": "a-b--c---d---e---3---",
Expand Down Expand Up @@ -390,7 +432,7 @@ def test_syllabify_volpiano(self):
vol_improperly_encoded, test_case["vol_improperly_encoded"]
)

def test_adjust_missing_music_spacing_for_rendering(self):
def test_adjust_missing_music_spacing_for_rendering(self) -> None:
with self.subTest("Not a missing music section"):
volpiano = "a-b--c---"
text_length = 3
Expand Down Expand Up @@ -432,7 +474,7 @@ class TestTextVolpianoAlignment(unittest.TestCase):
text_volpiano_alignment.py.
"""

def test_align_text_volpiano(self):
def test_align_text_volpiano(self) -> None:
"""
Tests align_text_volpiano.
"""
Expand Down
44 changes: 36 additions & 8 deletions volpiano_display_utilities/cantus_text_syllabification.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@
STR_ENDS_W_HYPHEN_REGEX = re.compile(r"\-$")
# Matches pipes and missing music sectioners ("{" and "}")
TEXT_SECTIONER_REGEX = re.compile(r"(\||\{.*?\}|~.*?(?=\||$))")
# Matches every hash that is not yet separated from the text before it,
# together with a single preceding space, if any. Hashes at the start of
# the string or immediately preceded by an open curly brace are skipped.
# The lookbehind also excludes a space so that a hash that is already
# spaced (e.g. "{ #" or " #" at the start of the string) is left alone
# instead of receiving a second space.
SPACE_PRECEDING_HASH_REGEX = re.compile(r"(?<=[^\{ ]) ?#")
# Matches every hash that is not yet separated from the text after it,
# together with a single following space, if any. Hashes at the end of
# the string or immediately followed by a close curly brace are skipped.
# The lookahead also excludes a space so that a hash that is already
# spaced (e.g. "# }" or a trailing "# ") is left alone instead of
# receiving a second space.
SPACE_FOLLOWING_HASH_REGEX = re.compile(r"# ?(?=[^\} ])")


def _clean_text(text: str) -> str:
Expand All @@ -63,13 +71,31 @@ def _detect_invalid_characters(text: str) -> bool:
return bool(INVALID_CHAR_REGEX.search(text))


def _space_hash_signs(text: str) -> Tuple[str, bool]:
    """
    Inserts spaces around hash signs ("#") in a text string so
    that each hash is syllabified as a word of its own.

    The text is first passed through SPACE_PRECEDING_HASH_REGEX
    (substituting " #") and the result is then passed through
    SPACE_FOLLOWING_HASH_REGEX (substituting "# ").

    text [str]: text string in which to space out hash signs

    returns [str, bool]: the text after both substitutions, and a
        boolean that is True when that text differs from the input
        (ie. when a spacing adjustment was made)
    """
    with_leading_space = SPACE_PRECEDING_HASH_REGEX.sub(" #", text)
    respaced = SPACE_FOLLOWING_HASH_REGEX.sub("# ", with_leading_space)
    return respaced, respaced != text


def _prepare_string_for_syllabification(word_str: str) -> Tuple[str, bool, bool]:
"""
Complete preparation of a string before syllabification.
Complete preparation of a word string before syllabification.
Hyphens are removed from the beginning and end of the string,
and the presence of these hyphens is recorded.
word_str [str]: string to prepare
word_str [str]: string containing a word (or partial word) to prepare
returns [tuple[str,bool,bool]]: prepared string, whether a hyphen
was removed from the beginning of the string, whether a hyphen
Expand Down Expand Up @@ -102,7 +128,7 @@ def syllabify_text(
text: str,
clean_text: bool = False,
text_presyllabified: bool = False,
) -> List[SyllabifiedTextSection]:
) -> Tuple[List[SyllabifiedTextSection], bool]:
"""
Syllabifies a text string that has been encoded in the style
of the Cantus Database. Texts are syllabified word by word,
Expand All @@ -120,9 +146,9 @@ def syllabify_text(
This function finds a syllable split if and only if a hyphen is
present (ie. no additional syllabification is performed).
returns [SyllabifiedTextSection]: an object of class SyllabifiedTextSection
that contains the syllabified text string. See class docstring for more
information.
returns [tuple[list[SyllabifiedTextSection], bool]]: a list of objects of class
SyllabifiedTextSection that contain the syllabified text string, and a boolean
indicating whether spacing adjustments were made around hash signs ("#").
See the SyllabifiedTextSection class docstring for more information.
"""

logging.debug("Syllabifying text: %s", text)
Expand All @@ -135,6 +161,8 @@ def syllabify_text(
raise ValueError(
"Invalid characters detected in text string. To clean, use clean_text=True."
)
# Space out hash signs if necessary
text, spacing_adjusted = _space_hash_signs(text)
# Split text into sections. Sections are divided by pipes ("|") or enclosed
# in curly braces ("{}") or square brackets ("[]").
text_sections = _split_text_sections(text)
Expand Down Expand Up @@ -195,10 +223,10 @@ def syllabify_text(
syllabified_section.append(syllabified_word)
syllabified_text.append(SyllabifiedTextSection(syllabified_section))
logging.debug("Syllabified text: %s", ", ".join(str(s) for s in syllabified_text))
return syllabified_text
return syllabified_text, spacing_adjusted


def flatten_syllabified_text(syllabified_text=List[SyllabifiedTextSection]) -> str:
def flatten_syllabified_text(syllabified_text: List[SyllabifiedTextSection]) -> str:
"""
Flattens a list of syllabified text sections to a string.
Expand Down
6 changes: 4 additions & 2 deletions volpiano_display_utilities/text_volpiano_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,13 @@ def align_text_and_volpiano(
review_encoding_flag: bool = False
# If cleaning of text or re-spacing of hash signs is required, we set the review_encoding_flag to True
try:
syllabified_text = syllabify_text(
syllabified_text, spacing_adjusted = syllabify_text(
chant_text, clean_text=False, text_presyllabified=text_presyllabified
)
if spacing_adjusted:
review_encoding_flag = True
except ValueError:
syllabified_text = syllabify_text(
syllabified_text, spacing_adjusted = syllabify_text(
chant_text, clean_text=True, text_presyllabified=text_presyllabified
)
review_encoding_flag = True
Expand Down

0 comments on commit 3499a67

Please sign in to comment.