Skip to content

Commit

Permalink
Fix issues with empty lines and too long lines
Browse files Browse the repository at this point in the history
  • Loading branch information
mbanon committed Sep 6, 2023
1 parent 60a4cbd commit 066750a
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 23 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## v1.6.2
- Fix division by 0 error on empty sentences.
- Fixed rules that were giving false positives on empty sentences (no titles, wrong language)
- For performance, long sentences (>1024 chars.) are ignored by default; only "not_too_long" is output. Added the "--dont_ignore_long" flag to override this
behaviour.

## v1.6.1

### Changed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "monocleaner"
version = "1.6.1"
version = "1.6.2"
requires-python = ">=3.8"
authors = [
{ name = "Prompsit Language Engineering", email = "[email protected]" },
Expand Down
61 changes: 39 additions & 22 deletions src/monocleaner/hardrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def c_no_empty(self, sentence):
return sentence != ""

def c_no_titles(self, sentence):
    """Return True unless the sentence consists of a single token.

    An empty sentence passes this rule (returns True) so that it is not
    reported as a title. Otherwise the sentence is stripped of surrounding
    whitespace and must contain more than one space-separated token.

    Args:
        sentence: The text to check.

    Returns:
        True when the sentence is empty or has more than one token,
        False when it is a single token.
    """
    if not sentence:
        return True
    # Split on a literal single space, not on arbitrary whitespace.
    tokens = sentence.strip().split(" ")
    return len(tokens) > 1

def c_not_too_long(self, sentence):
Expand All @@ -85,12 +87,16 @@ def c_no_bad_encoding(self, sentence):
return True

def c_no_only_symbols(self, sentence):
    """Return True when enough of the sentence is matched by ``regex_alpha``.

    An empty sentence passes this rule (returns True); the guard also
    prevents a division by zero on the sentence length. Otherwise the number
    of ``regex_alpha`` matches divided by the sentence length must be
    greater than 0.1.

    Args:
        sentence: The text to check.

    Returns:
        True when the sentence is empty or the match ratio exceeds 0.1.
    """
    if not sentence:
        return True
    # regex_alpha is a module-level pattern defined outside this block.
    alpha_ratio = len(regex_alpha.findall(sentence)) / len(sentence)
    return alpha_ratio > 0.1

def c_no_only_numbers(self, sentence):
    """Return True when the sentence is not dominated by ``regex_numbers`` matches.

    An empty sentence passes this rule (returns True); the guard runs first
    so that no division by a zero length can occur. Otherwise the number of
    ``regex_numbers`` matches divided by the sentence length must be below a
    threshold: 0.7 when ``self.language`` is in ``CJK``, 0.5 otherwise.

    Args:
        sentence: The text to check.

    Returns:
        True when the sentence is empty or the match ratio is below the
        threshold.
    """
    if not sentence:
        return True
    # CJK and regex_numbers are module-level names defined outside this block.
    threshold = 0.7 if self.language in CJK else 0.5
    numeric_ratio = len(regex_numbers.findall(sentence)) / len(sentence)
    return numeric_ratio < threshold

def c_no_urls(self, sentence):
Expand Down Expand Up @@ -163,7 +169,7 @@ def c_no_repeated_words(self, sentence):
return True

def z_no_wrong_language(self, sentence):
if not self.disable_lang_ident:
if (not self.disable_lang_ident) and len(sentence) > 0:
# Obtain fastspell prediction, lowercasing helps in small langs
langid = self.fastspell.getlang(sentence.lower())

Expand Down Expand Up @@ -224,6 +230,7 @@ def initialization():
parser.add_argument("--detect_script", action='store_true', help="Detect writing script with FastSpell (only Serbo-Croatian is supported)")
parser.add_argument("--annotated_output", action='store_true', help="Add hardrules annotation for each sentence")
parser.add_argument("--run_all_rules", action='store_true', help="Run all hardrules for each sentence instead of stopping at the first one discarded")
parser.add_argument('--dont_ignore_long', default=False, action='store_true', help="Don't ignore too long sentences")
parser.add_argument("--debug", action='store_true')
parser.add_argument("-q", "--quiet", action='store_true')
parser.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
Expand Down Expand Up @@ -268,39 +275,49 @@ def main():
hardrules = Hardrules(args)

nline = 0

for line in args.input:
nline += 1
tag = ""
parts = line.rstrip("\n").split("\t")

if len(parts) >= args.scol:
sentence = parts[args.scol-1]
else:
logging.error(f" scol ({args.scol}) index above column number ({len(parts)}) on line {nline}")
continue
sentence = ""
tag = "c_missing_columns"
#continue

hr_result = hardrules.wrong_segment(args, sentence)
tag = hr_result
langid = args.language

# Language identification rule and output
if not args.disable_lang_ident:
# If run all rules is enabled, run the identification method.
# If it doesn't pass, then set the tag accordingly if other hardrules have failed.
if args.run_all_rules:
langid, res = hardrules.z_no_wrong_language(line)

if not res:
if tag == 'keep':
tag = 'no_wrong_language'
else:
tag += '+no_wrong_language'
else:
# If run all rules is disabled, then only run identification method when all other hardrules have passed
if tag == 'keep':
langid, res = hardrules.z_no_wrong_language(line)
if not args.dont_ignore_long and (len(line) > 1024):
tag = "c_not_too_long"
#continue

if tag == "":
hr_result = hardrules.wrong_segment(args, sentence)
tag = hr_result
langid = args.language

# Language identification rule and output
if (not args.disable_lang_ident) and len(line) > 0:
# If run all rules is enabled, run the identification method.
# If it doesn't pass, then set the tag accordingly if other hardrules have failed.
if args.run_all_rules:
langid, res = hardrules.z_no_wrong_language(sentence)

if not res:
tag = 'no_wrong_language'
if tag == 'keep':
tag = 'no_wrong_language'
else:
tag += '+no_wrong_language'
else:
# If run all rules is disabled, then only run identification method when all other hardrules have passed
if tag == 'keep':
langid, res = hardrules.z_no_wrong_language(sentence)

if not res:
tag = 'no_wrong_language'

score = 1
if tag != "keep":
Expand Down

0 comments on commit 066750a

Please sign in to comment.