Skip to content

Commit

Permalink
Merge pull request #180 from krr-up/feature/authfmt
Browse files Browse the repository at this point in the history
add script to format authors
  • Loading branch information
rkaminsk authored Jun 16, 2024
2 parents bc765b1 + 955249f commit 4f17caa
Show file tree
Hide file tree
Showing 6 changed files with 533 additions and 91 deletions.
138 changes: 138 additions & 0 deletions authfmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
Script using the bibtexparser module to format author names in our
bibliography.
- It abbreviates first names: Roland Kaminski becomes R. Kaminski.
It does well with most names. In particular, all the names composed of two
words. For names with more than two words the splitting of first/last names is
in general ambiguous, so you may need to fix some of them manually:
`Juan Carlos Nieves` should be parsed as `{Juan Carlos} Nieves`
`Manuel Ojeda Aciego` as `Manuel {Ojeda Aciego}`.
Protected special names can be added to the `config_authfmt.toml` file.
"""
import sys
import os
import tomllib
import bibtexparser as bp
from splitnames import splitname
from bibfmt import check_min_version, cleanup_record, _parser, _writer


def split_names_to_strs(names: str) -> list[str]:
    """
    Split the given string containing people names into a list of strings
    representing the name of each person.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into a
    single space before the string is split on the ` and ` separator. This way
    a name list wrapped over several indented lines still yields names without
    leading or trailing blanks.

    An empty or whitespace-only string yields `[""]`.
    """
    # str.split() without arguments splits on any whitespace run and drops
    # leading/trailing whitespace, which normalizes wrapped field values.
    normalized = " ".join(names.split())
    return normalized.split(" and ")


def format_first_name(name: str) -> str:
    """
    Abbreviate a first name to its initial: `Roland` becomes `R.`.

    The name is returned unchanged if
    - it has at most two characters (for example `R.`), or
    - it starts with a brace or a backslash, that is, with a protected group
      or a LaTeX command such as `{\\'A}lvaro`. Taking the first character of
      such a name would emit an unbalanced brace or a dangling backslash.
    """
    if len(name) <= 2 or name.startswith(("{", "\\")):
        return name
    return f"{name[0]}."


def format_name_dict(name: dict) -> dict:
    """
    Format a name represented as a dictionary of name parts.

    The words stored under the `first` key are joined with spaces, passed
    through `format_first_name`, and stored back as a one-element list. The
    dictionary is modified in place and returned. A dictionary without a
    `first` key is returned untouched.
    """
    if "first" not in name:
        return name
    joined = " ".join(name["first"])
    name["first"] = [format_first_name(joined)]
    return name


def name_dict_to_str(name: dict) -> str:
    """
    Concatenate the name information into a string.

    The parts stored under the keys `first`, `von`, `last` and `jr` (each a
    list of words; a missing key counts as empty) are emitted in this order.
    Empty parts are skipped and the remaining ones are separated by exactly
    one space, so a name without a first part still keeps a blank between its
    `von` and `last` parts.
    """
    parts = (" ".join(name.get(key, [])) for key in ("first", "von", "last", "jr"))
    return " ".join(part for part in parts if part)


def config_special_names(config) -> dict[str, str]:
    """
    Return a dictionary mapping special names to their formatted version.

    The entries are read from the `special_names` table of the given
    configuration; a configuration without this table yields an empty
    dictionary. A value without `|` is used verbatim (stripped of surrounding
    whitespace). A value of the form `First | Rest...` is split at `|`, the
    leading part is moved to the end, and the parts are joined with commas
    before being parsed with `splitname` and formatted like any other name.

    The configuration itself is left unmodified; a new dictionary is returned.
    """
    formatted = {}
    for key, value in config.get("special_names", {}).items():
        parts = [part.strip() for part in value.split("|")]
        if len(parts) == 1:
            formatted[key] = parts[0]
            continue
        # The first name goes last, giving the comma separated form
        # (presumably BibTeX's "von Last, Jr, First") in which the split
        # between first and last names is explicit.
        reordered = ", ".join(parts[1:] + [parts[0]])
        formatted[key] = name_dict_to_str(format_name_dict(splitname(reordered)))
    return formatted


# Load the configuration stored next to this script. The file is opened in
# binary mode as required by tomllib and closed again right after parsing.
with open(
    os.path.join(os.path.dirname(__file__), "config_authfmt.toml"), "rb"
) as _config_file:
    CONFIG = tomllib.load(_config_file)
CONFIG_SPECIAL_NAMES = config_special_names(CONFIG)


def format_name(name: str) -> str:
    """
    Format the given string containing a single person name.

    Names listed in `CONFIG_SPECIAL_NAMES` are replaced by their configured
    version; every other name is parsed with `splitname`, has its first name
    formatted, and is assembled back into a string.
    """
    if name not in CONFIG_SPECIAL_NAMES:
        parsed = splitname(name)
        return name_dict_to_str(format_name_dict(parsed))
    return CONFIG_SPECIAL_NAMES[name]


def format_names(names: str) -> str:
    """
    Format the given string containing the names of several people.

    The string is split into individual names, each name is formatted with
    `format_name`, and the results are joined with ` and ` again.
    """
    formatted = [format_name(person) for person in split_names_to_strs(names)]
    return " and ".join(formatted)


def format_entry_names(entry):
    """
    Format the names in the `author` and `editor` fields of the given entry.

    The given entry is not modified; a shallow copy with the formatted fields
    is returned. Fields that are absent are left absent.
    """
    result = entry.copy()
    for field in ("author", "editor"):
        if field in result:
            result[field] = format_names(result[field])
    return result


def format_entry(entry):
    """
    Format the given entry.

    The entry is first passed through `cleanup_record` and the names of the
    result are then formatted with `format_entry_names`.
    """
    cleaned = cleanup_record(entry)
    return format_entry_names(cleaned)


def format_bib(path):
    """
    Format the given bibliography file in place.

    The file at `path` is parsed with `format_entry` applied to every entry
    and then written back to the same path without re-sorting the entries.
    The file is read and written as UTF-8 so that the result does not depend
    on the locale of the machine running the script.
    """
    # read bibliography
    with open(path, "r", encoding="utf-8") as f:
        db = bp.load(f, _parser(customization=format_entry))

    # write the bibliography
    with open(path, "w", encoding="utf-8") as f:
        bp.dump(db, f, _writer(sorted_entries=False))


if __name__ == "__main__":
    check_min_version()
    # Format both bibliography files of the repository in place.
    for bib_path in ("krr.bib", "procs.bib"):
        format_bib(bib_path)
    sys.exit(0)
121 changes: 65 additions & 56 deletions bibfmt.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
'''
"""
Script using the bibtexparser module to cleanup_record and pretty print our
bibliography.
'''
"""

import sys
from io import StringIO
Expand All @@ -18,97 +18,106 @@


def check_min_version():
'''
"""
Ensure that a new enough version of bibtexparser is used.
'''
vers = bp.__version__.split('.')
"""
vers = bp.__version__.split(".")
if (int(vers[0]), int(vers[1])) < (1, 2):
raise RuntimeError('The script requires at least bibtexparser version 1.2.')
raise RuntimeError("The script requires at least bibtexparser version 1.2.")


def is_ascii(x):
'''
"""
Return true if the given string contains ascii symbols only.
'''
"""
try:
x.encode('ascii')
x.encode("ascii")
return True
except UnicodeEncodeError:
return False


# Map from unicode symbols to latex expressions.
#
# The bibtexparser.latexenc module also maps some ascii characters to unicode
# symbols. Such characters are ignored in the map.
UNICODE_TO_LATEX = {key: value
for key, value in unicode_to_latex_map.items()
if not is_ascii(key)}
UNICODE_TO_LATEX = {
key: value for key, value in unicode_to_latex_map.items() if not is_ascii(key)
}


def apply_on_expression(x, f):
'''
"""
Apply the function f for converting strings to bibtex expressions as
returned by the bibtexparser module.
'''
"""
if isinstance(x, str):
return f(x)
if isinstance(x, BibDataStringExpression):
x.apply_on_strings(f)
return x


def cleanup_expression(x):
'''
"""
Convert the given string containing unicode symbols into a string with
latex escapes only.
'''
"""
ret = []
for char in x:
if char in (' ', '{', '}'):
if char in (" ", "{", "}"):
ret.append(char)
else:
ret.append(UNICODE_TO_LATEX.get(char, char))
return ''.join(ret)
return "".join(ret)


def cleanup_record(x):
'''
"""
Cleanup a record as returned by the bibtexparser module.
'''
"""
for val in x:
if val in ('ID',):
if val in ("ID",):
continue
x[val] = apply_on_expression(x[val], cleanup_expression)
if val.lower() == 'pages':
x[val] = x[val].replace('--', '-')
if val.lower() == "pages":
x[val] = x[val].replace("--", "-")
return x

def _parser():
'''

def _parser(customization=cleanup_record):
"""
Return a configured bibtex parser.
'''
"""
parser = BibTexParser()
parser.interpolate_strings = False
parser.customization = cleanup_record
parser.customization = customization
return parser

def _writer():
'''

def _writer(sorted_entries=True):
"""
Return a configured bibtex writer.
'''
"""
writer = BibTexWriter()
writer.indent = ' '
writer.order_entries_by = ('ID',)
writer.display_order = ['title', 'author', 'editor']
writer.indent = " "
writer.order_entries_by = ("ID",) if sorted_entries else None
writer.display_order = ["title", "author", "editor"]
return writer


def _fixdb(db):
'''
"""
Currently sorts the strings in the database.
'''
"""
db.strings = OrderedDict(sorted(db.strings.items()))
return db


def format_bib(path):
'''
"""
Format the given bibliography file.
'''
"""
# read bibliography
with open(path, "r") as f:
db = _fixdb(bp.load(f, _parser()))
Expand All @@ -117,10 +126,11 @@ def format_bib(path):
with open(path, "w") as f:
bp.dump(db, f, _writer())


def check_bib(path):
'''
"""
Check if the given bibliography is correctly formatted.
'''
"""
# read bibliography
with open(path, "r") as f:
in_ = f.read()
Expand All @@ -131,44 +141,43 @@ def check_bib(path):
out = StringIO()
bp.dump(db, out, _writer())

return [x for x in ndiff(in_.splitlines(), out.getvalue().splitlines()) if x[0] != ' ']
return [
x for x in ndiff(in_.splitlines(), out.getvalue().splitlines()) if x[0] != " "
]


def run():
'''
"""
Run the applications.
'''
"""
check_min_version()

parser = ArgumentParser(
prog='bibfmt',
description='Autoformat and check bibliography.')
prog="bibfmt", description="Autoformat and check bibliography."
)
subparsers = parser.add_subparsers(
metavar='command',
dest='command',
help='available subcommands',
required=True)
subparsers.add_parser(
'check',
help='check whether bibliography is correctly formatted')
metavar="command", dest="command", help="available subcommands", required=True
)
subparsers.add_parser(
'format',
help='format the bibliography')
"check", help="check whether bibliography is correctly formatted"
)
subparsers.add_parser("format", help="format the bibliography")

res = parser.parse_args()

if res.command == "format":
format_bib('krr.bib')
format_bib('procs.bib')
format_bib("krr.bib")
format_bib("procs.bib")
return 0

assert res.command == "check"
diff = check_bib('krr.bib') + check_bib('procs.bib')
diff = check_bib("krr.bib") + check_bib("procs.bib")
if diff:
for x in diff:
print(x, file=sys.stderr)
return 1
return 0


if __name__ == "__main__":
sys.exit(run())
5 changes: 5 additions & 0 deletions config_authfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[special_names]
'The STREAM Group' = 'The STREAM Group'
'Juan Carlos Nieves' = 'Juan Carlos | Nieves'
'Tran Cao Son' = 'Tran Cao | Son'
'Cesar Augusto Tacla' = 'Cesar Augusto | Tacla'
Loading

0 comments on commit 4f17caa

Please sign in to comment.