Merge pull request #180 from krr-up/feature/authfmt

add script to format authors
krr-up · Jun 16, 2024 · 4f17caa · 4f17caa
2 parents bc765b1 + 955249f
commit 4f17caa
Show file tree

Hide file tree

Showing 6 changed files with 533 additions and 91 deletions.
diff --git a/authfmt.py b/authfmt.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Script using the bibtexparser module to format author names in our
+bibliography.
+- It abbreviates first names: Roland Kaminski becomes R. Kaminski.
+
+It does well with most names, in particular with all names composed of two
+words. For names with more than two words, the split into first and last names
+is in general ambiguous, so you may need to fix some of them manually:
+`Juan Carlos Nieves` should be parsed as `{Juan Carlos} Nieves`
+`Manuel Ojeda Aciego` as `Manuel {Ojeda Aciego}`.
+Protected special names can be added to the `config_authfmt.toml` file.
+"""
+import sys
+import os
+import tomllib
+import bibtexparser as bp
+from splitnames import splitname
+from bibfmt import check_min_version, cleanup_record, _parser, _writer
+
+
+def split_names_to_strs(names: str) -> list[str]:
+    """
+    Split the given string containing people names into a list of strings representing the name of each person.
+    """
+    return names.replace("\n", " ").split(" and ")
+
+
+def format_first_name(name: str) -> str:
+    if len(name) > 2 and not name.startswith("{\\"):
+        name = f"{name[0]}."
+    return name
+
+
+def format_name_dict(name: dict) -> dict:
+    """
+    Format a name represented as a dictionary.
+    """
+    if "first" in name:
+        first_name = " ".join(name["first"])
+        name["first"] = [format_first_name(first_name)]
+    return name
+
+
+def name_dict_to_str(name: dict) -> str:
+    """
+    Concatenate the name information into a string.
+    """
+    first = " ".join(name.get("first", []))
+    von = " ".join(name.get("von", []))
+    last = " ".join(name.get("last", []))
+    jr = " ".join(name.get("jr", []))
+    previous = first != ""
+    if previous and von:
+        von = f" {von}"
+    if previous and last:
+        last = f" {last}"
+    if previous and jr:
+        jr = f" {jr}"
+    return f"{first}{von}{last}{jr}"
+
+
+def config_special_names(config) -> dict[str, str]:
+    """
+    Return a dictionary mapping special names to their formatted version.
+    """
+    special_names = config["special_names"]
+    for k, name in special_names.items():
+        name = [w.strip() for w in name.split("|")]
+        special_names[k] = (
+            name[0]
+            if len(name) == 1
+            else name_dict_to_str(
+                format_name_dict(splitname(", ".join(name[1:] + [name[0]])))
+            )
+        )
+    return special_names
+
+
+CONFIG = tomllib.load(
+    open(os.path.join(os.path.dirname(__file__), "config_authfmt.toml"), "rb")
+)
+CONFIG_SPECIAL_NAMES = config_special_names(CONFIG)
+
+
+def format_name(name: str) -> str:
+    """
+    Format the given string containing a person name.
+    """
+    if name in CONFIG_SPECIAL_NAMES:
+        return CONFIG_SPECIAL_NAMES[name]
+    return name_dict_to_str(format_name_dict(splitname(name)))
+
+
+def format_names(names: str) -> str:
+    """
+    Format the given string containing people names.
+    """
+    return " and ".join(format_name(name) for name in split_names_to_strs(names))
+
+
+def format_entry_names(entry):
+    """
+    Format the names in the given entry.
+    """
+    new_entry = entry.copy()
+    if "author" in new_entry:
+        new_entry["author"] = format_names(new_entry["author"])
+    if "editor" in entry:
+        new_entry["editor"] = format_names(entry["editor"])
+    return new_entry
+
+
+def format_entry(entry):
+    """
+    Format the given entry.
+    """
+    return format_entry_names(cleanup_record(entry))
+
+
+def format_bib(path):
+    """
+    Format the given bibliography file.
+    """
+    # read bibliography
+    with open(path, "r") as f:
+        db = bp.load(f, _parser(customization=format_entry))
+
+    # write the bibliography
+    with open(path, "w") as f:
+        bp.dump(db, f, _writer(sorted_entries=False))
+
+
+if __name__ == "__main__":
+    check_min_version()
+    format_bib("krr.bib")
+    format_bib("procs.bib")
+    sys.exit(0)
diff --git a/bibfmt.py b/bibfmt.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
-'''
+"""
 Script using the bibtexparser module to cleanup_record and pretty print our
 bibliography.
-'''
+"""
 
 import sys
 from io import StringIO
@@ -18,97 +18,106 @@
 
 
 def check_min_version():
-    '''
+    """
     Ensure that a new enough version of bibtexparser is used.
-    '''
-    vers = bp.__version__.split('.')
+    """
+    vers = bp.__version__.split(".")
     if (int(vers[0]), int(vers[1])) < (1, 2):
-        raise RuntimeError('The script requires at least bibtexparser version 1.2.')
+        raise RuntimeError("The script requires at least bibtexparser version 1.2.")
+
 
 def is_ascii(x):
-    '''
+    """
     Return true if the given string contains ascii symbols only.
-    '''
+    """
     try:
-        x.encode('ascii')
+        x.encode("ascii")
         return True
     except UnicodeEncodeError:
         return False
 
+
 # Map from unicode symbols to latex expressions.
 #
 # The bibtexparser.latexenc module also maps some ascii characters to unicode
 # symbols. Such characters are ignored in the map.
-UNICODE_TO_LATEX = {key: value
-                    for key, value in unicode_to_latex_map.items()
-                    if not is_ascii(key)}
+UNICODE_TO_LATEX = {
+    key: value for key, value in unicode_to_latex_map.items() if not is_ascii(key)
+}
+
 
 def apply_on_expression(x, f):
-    '''
+    """
     Apply the function f for converting strings to bibtex expressions as
     returned by the bibtexparser module.
-    '''
+    """
     if isinstance(x, str):
         return f(x)
     if isinstance(x, BibDataStringExpression):
         x.apply_on_strings(f)
     return x
 
+
 def cleanup_expression(x):
-    '''
+    """
     Convert the given string containing unicode symbols into a string with
     latex escapes only.
-    '''
+    """
     ret = []
     for char in x:
-        if char in (' ', '{', '}'):
+        if char in (" ", "{", "}"):
             ret.append(char)
         else:
             ret.append(UNICODE_TO_LATEX.get(char, char))
-    return ''.join(ret)
+    return "".join(ret)
+
 
 def cleanup_record(x):
-    '''
+    """
     Cleanup a record as returned by the bibtexparser module.
-    '''
+    """
     for val in x:
-        if val in ('ID',):
+        if val in ("ID",):
             continue
         x[val] = apply_on_expression(x[val], cleanup_expression)
-        if val.lower() == 'pages':
-            x[val] = x[val].replace('--', '-')
+        if val.lower() == "pages":
+            x[val] = x[val].replace("--", "-")
     return x
 
-def _parser():
-    '''
+
+def _parser(customization=cleanup_record):
+    """
     Return a configured bibtex parser.
-    '''
+    """
     parser = BibTexParser()
     parser.interpolate_strings = False
-    parser.customization = cleanup_record
+    parser.customization = customization
     return parser
 
-def _writer():
-    '''
+
+def _writer(sorted_entries=True):
+    """
     Return a configured bibtex writer.
-    '''
+    """
     writer = BibTexWriter()
-    writer.indent = '  '
-    writer.order_entries_by = ('ID',)
-    writer.display_order = ['title', 'author', 'editor']
+    writer.indent = "  "
+    writer.order_entries_by = ("ID",) if sorted_entries else None
+    writer.display_order = ["title", "author", "editor"]
     return writer
 
+
 def _fixdb(db):
-    '''
+    """
     Currently sorts the strings in the database.
-    '''
+    """
     db.strings = OrderedDict(sorted(db.strings.items()))
     return db
 
+
 def format_bib(path):
-    '''
+    """
     Format the given bibliography file.
-    '''
+    """
     # read bibliography
     with open(path, "r") as f:
         db = _fixdb(bp.load(f, _parser()))
@@ -117,10 +126,11 @@ def format_bib(path):
     with open(path, "w") as f:
         bp.dump(db, f, _writer())
 
+
 def check_bib(path):
-    '''
+    """
     Check if the given bibliography is correctly formatted.
-    '''
+    """
     # read bibliography
     with open(path, "r") as f:
         in_ = f.read()
@@ -131,44 +141,43 @@ def check_bib(path):
     out = StringIO()
     bp.dump(db, out, _writer())
 
-    return [x for x in ndiff(in_.splitlines(), out.getvalue().splitlines()) if x[0] != ' ']
+    return [
+        x for x in ndiff(in_.splitlines(), out.getvalue().splitlines()) if x[0] != " "
+    ]
 
 
 def run():
-    '''
+    """
     Run the applications.
-    '''
+    """
     check_min_version()
 
     parser = ArgumentParser(
-        prog='bibfmt',
-        description='Autoformat and check bibliography.')
+        prog="bibfmt", description="Autoformat and check bibliography."
+    )
     subparsers = parser.add_subparsers(
-        metavar='command',
-        dest='command',
-        help='available subcommands',
-        required=True)
-    subparsers.add_parser(
-        'check',
-        help='check whether bibliography is correctly formatted')
+        metavar="command", dest="command", help="available subcommands", required=True
+    )
     subparsers.add_parser(
-        'format',
-        help='format the bibliography')
+        "check", help="check whether bibliography is correctly formatted"
+    )
+    subparsers.add_parser("format", help="format the bibliography")
 
     res = parser.parse_args()
 
     if res.command == "format":
-        format_bib('krr.bib')
-        format_bib('procs.bib')
+        format_bib("krr.bib")
+        format_bib("procs.bib")
         return 0
 
     assert res.command == "check"
-    diff = check_bib('krr.bib') + check_bib('procs.bib')
+    diff = check_bib("krr.bib") + check_bib("procs.bib")
     if diff:
         for x in diff:
             print(x, file=sys.stderr)
         return 1
     return 0
 
+
 if __name__ == "__main__":
     sys.exit(run())
diff --git a/config_authfmt.toml b/config_authfmt.toml
@@ -0,0 +1,5 @@
+[special_names]
+'The STREAM Group' = 'The STREAM Group'
+'Juan Carlos Nieves' = 'Juan Carlos | Nieves'
+'Tran Cao Son' = 'Tran Cao | Son'
+'Cesar Augusto Tacla' = 'Cesar Augusto | Tacla'