diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a717ba9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,202 @@ +# folders +scripts/* + +# test files +test_file_* + +# extentions +*.csv +*.txt + +# IDE +.vscode/* + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# LaTeX auxiliary files +*.aux +*.bak +*.bbl +*-blx.bib +*.blg +*.lof +*.log +*.lot +*.out +*.run.xml +*.synctex.gz +*.tdo +*.toc +*~txs0 +*.bib +*.bcf +*.glo +*.alg +*.acn +*.acr +*.glg +*.gls +*.ist +*.nlo + + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..bfcb53c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-added-large-files + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-json + - id: check-yaml + - id: debug-statements + +- repo: https://github.com/pre-commit/pre-commit + rev: v3.8.0 + hooks: + - id: validate_manifest + +- repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] + +- repo: https://github.com/gitleaks/gitleaks + rev: v8.19.3 + hooks: + - id: gitleaks diff --git a/README.md b/README.md index 46e6690..dfde708 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,44 @@ # LaTeX Paper Linter -This script checks for common mistakes in LaTeX source files of scientific papers. +This script checks for common mistakes in LaTeX source files of scientific papers. -## Usage +## Installation - python3 paperlint.py [-i/x ] [--error] +Install directly from this repository: -Provide either a single .tex file to check or a path to recursively check all .tex files in that directory! +```bash +pip install paperlinter@git+https://github.com/misc0110/Paper-Linter +``` + +## Usage: CLI + +``` +paperlinter [-i/x ] [--error] +``` + +
+ +Usage: Script Mode + +* Clone repository: + ```bash + git clone https://github.com/misc0110/Paper-Linter + ``` +* Navigate to the `Paper-Linter` directory and run: + ``` + python -m paperlinter [-i/x ] [--error] + ``` + +
+ +## Usage: Explanation + +Provide either a single .tex file to check or a path to recursively check all `.tex` files in that directory! By default, all rules are used for checking the document. -The switches can be configured with the `-x` and `-i` parameters to exclude and include entire categories of rules or single rules. -The include/exclude switches are evaluated in the order they are specified. -For example, `-i typography` only activates the typography rules, whereas `-i all -x typography -i cite-space` enables all rules without the typography rules, but enables the `cite-space` rule from the typography category. +The switches can be configured with the `-x` and `-i` parameters to exclude and include entire categories of rules or single rules. +The include/exclude switches are evaluated in the order they are specified. + +For example, `-i typography` only activates the typography rules, whereas `-i all -x typography -i cite-space` enables all rules without the typography rules, but enables the `cite-space` rule from the typography category. If `--error` is provided, the tool exits with error code 1 if there are warnings. @@ -211,7 +239,7 @@ This category includes warning of things that are discouraged or wrong for the s * **Description**: Warns if multiple `\cite` commands are used instead of having multiple citation keys inside one `\cite` * **Switch**: `multiple-cites` -#### Sentence starting with a Conjunction +#### Sentence starting with a Conjunction * **Description**: Warns if a sentence starts with a conjunction ("And", "But", "Or") * **Switch**: `conjunction-start` @@ -262,3 +290,7 @@ This category includes warnings for everything related to (cross-)references (sw #### Duplicate Keys in Citations * **Description**: Warns if a `cite` command has duplicate entries * **Switch**: `cite-duplicate` + +## License + +This project is licensed under the MIT License. See the LICENSE file for details. 
diff --git a/paperlinter/__init__.py b/paperlinter/__init__.py new file mode 100644 index 0000000..b555355 --- /dev/null +++ b/paperlinter/__init__.py @@ -0,0 +1,5 @@ +"""Top-level package.""" + +__license__ = "MIT" + +from .paperlint import main, run_linter_once diff --git a/paperlinter/__main__.py b/paperlinter/__main__.py new file mode 100644 index 0000000..7f773b5 --- /dev/null +++ b/paperlinter/__main__.py @@ -0,0 +1,8 @@ +"""Top-level package.""" + +__license__ = "MIT" + +from .paperlint import main + +if __name__ == "__main__": + main() diff --git a/paperlint.py b/paperlinter/paperlint.py old mode 100755 new mode 100644 similarity index 58% rename from paperlint.py rename to paperlinter/paperlint.py index 755a40f..3f45aed --- a/paperlint.py +++ b/paperlinter/paperlint.py @@ -1,39 +1,43 @@ #!/usr/bin/env python3 +import os import re import sys -import os +import traceback + +tex = None +tex_lines = None +tex_lines_clean = None +in_env = None +envs = None + +PROG_NAME = "paperlinter" def usage(): - print("%s [-x ] [-i ] [-i/x ] [--error]" % sys.argv[0]) - sys.exit(1) + print( + f"{PROG_NAME} [-x ] [-i ] [-i/x ] [--error]" + ) -if len(sys.argv) < 2: - usage() - -tex_files = [] +def get_tex_files(): + if not sys.argv[1].endswith(".tex"): + for path, subdirs, files in os.walk(sys.argv[1]): + for f in files: + if f.endswith(".tex"): + tex_files.append(os.path.join(path, f)) + else: + tex_files = [sys.argv[1]] -if(not sys.argv[1].endswith(".tex")): - for path, subdirs, files in os.walk(sys.argv[1]): - for f in files: - if f.endswith(".tex"): - tex_files.append(os.path.join(path,f)) -else: - tex_files = [sys.argv[1]] -tex = None -tex_lines = None -tex_lines_clean = None -in_env = None -envs = None - -def next_file(file): +def next_file(file: str) -> None: + global tex, tex_lines, tex_lines_clean, in_env, envs try: - tex = open(file).read() + with open(file, mode="r", encoding="utf-8") as content_file: + tex = content_file.read() except: print("Could not open '%s'" % 
sys.argv[1]) + print(f"Error: {traceback.format_exc()}") sys.exit(1) tex_lines = tex.split("\n") @@ -63,8 +67,11 @@ def preprocess(): if "%" in tex_lines[i]: idx = tex_lines[i].index("%") if idx > 0 and tex_lines[i][idx - 1] != "\\": - tex_lines_clean[i] = tex_lines[i][0:max(0, (tex_lines[i].index("%") - 1))] - if tex_lines_clean[i].startswith("%"): tex_lines_clean[i] = "" + tex_lines_clean[i] = tex_lines[i][ + 0 : max(0, (tex_lines[i].index("%") - 1)) + ] + if tex_lines_clean[i].startswith("%"): + tex_lines_clean[i] = "" else: tex_lines_clean[i] = tex_lines[i] else: @@ -85,11 +92,13 @@ def in_any_float(line): return True return False + def in_code(line): if "lstlisting" in in_env: return in_env["lstlisting"][line] return False + def in_equation(line): if "equation" in in_env and in_env["equation"][line]: return True @@ -105,9 +114,10 @@ def in_equation(line): return True if "proposition" in in_env and in_env["proposition"][line]: return True - + return False + def check_space_before_cite(): warns = [] for i, l in enumerate(tex_lines): @@ -117,27 +127,35 @@ def check_space_before_cite(): warns.append((i, "No space before \\cite", b.span(0))) return warns + def check_float_alignment(env): warns = [] for i, l in enumerate(tex_lines): b = re.search("\\\\begin\{%s\}" % env, l) if b: if not re.search("%s}\[[^\]]*[htbH][^\]]*\]" % env, l): - warns.append((i, "%s without alignment: %s" % (env, l.strip()), b.span())) + warns.append( + (i, "%s without alignment: %s" % (env, l.strip()), b.span()) + ) return warns + def check_figure_alignment(): return check_float_alignment("figure") + def check_table_alignment(): return check_float_alignment("table") + def check_listing_alignment(): return check_float_alignment("listing") + def check_float_has_label(env): warns = [] - if env not in envs: return warns + if env not in envs: + return warns for r in envs[env]: label = False for i in range(*r): @@ -151,7 +169,8 @@ def check_float_has_label(env): def check_float_has_caption(env): 
warns = [] - if env not in envs: return warns + if env not in envs: + return warns for r in envs[env]: label = False for i in range(*r): @@ -162,9 +181,11 @@ def check_float_has_caption(env): warns.append((r[0], "%s without a caption" % env)) return warns + def check_float_caption_label_order(env): warns = [] - if env not in envs: return warns + if env not in envs: + return warns for r in envs[env]: label = -1 caption = -1 @@ -176,13 +197,16 @@ def check_float_caption_label_order(env): if b: label = i if label > -1 and caption > -1 and label < caption: - warns.append((r[0], "label before caption in %s, swap for correct references" % env)) + warns.append( + (r[0], "label before caption in %s, swap for correct references" % env) + ) return warns def check_no_resizebox_for_tables(): warns = [] - if "table" not in envs: return warns + if "table" not in envs: + return warns for r in envs["table"]: rb = False b = None @@ -202,36 +226,52 @@ def check_weird_units(): for i, l in enumerate(tex_lines): for b in block: if b in l: - warns.append((i, "use \\hsize instead of %s" % b, (l.index(b), l.index(b) + len(b)))) + warns.append( + ( + i, + "use \\hsize instead of %s" % b, + (l.index(b), l.index(b) + len(b)), + ) + ) return warns + def check_figure_has_label(): return check_float_has_label("figure") + def check_table_has_label(): return check_float_has_label("table") + def check_listing_has_label(): return check_float_has_label("listing") + def check_figure_has_caption(): return check_float_has_caption("figure") + def check_table_has_caption(): return check_float_has_caption("table") + def check_listing_has_caption(): return check_float_has_caption("listing") + def check_figure_caption_label_order(): return check_float_caption_label_order("figure") + def check_table_caption_label_order(): return check_float_caption_label_order("table") + def check_listing_caption_label_order(): return check_float_caption_label_order("listing") + def check_todos(): warns = [] for i, l in 
enumerate(tex_lines_clean): @@ -244,29 +284,38 @@ def check_notes(): warns = [] for i, l in enumerate(tex_lines_clean): if "\\note" in l: - warns.append((i, "\\note found", (l.index("\\note"), l.index("\\note") + 5))) + warns.append( + (i, "\\note found", (l.index("\\note"), l.index("\\note") + 5)) + ) if "\\todo" in l: - warns.append((i, "\\todo found", (l.index("\\todo"), l.index("\\todo") + 5))) + warns.append( + (i, "\\todo found", (l.index("\\todo"), l.index("\\todo") + 5)) + ) return warns def check_math_numbers(): warns = [] for i, l in enumerate(tex_lines): - n = re.search("\\$\\d+\\$", tex_lines[i]) + n = re.search("\\$\\d+\\$", tex_lines[i]) if n and not in_any_float(i): - warns.append((i, "Number in math mode, consider using siunit instead", n.span())) + warns.append( + (i, "Number in math mode, consider using siunit instead", n.span()) + ) return warns def check_large_numbers_without_si(): warns = [] for i, l in enumerate(tex_lines): - n = re.search("[\\s\(]\\d{5,}[\\s\),\.]", tex_lines[i]) + n = re.search("[\\s\(]\\d{5,}[\\s\),\.]", tex_lines[i]) if n and not in_any_float(i): - warns.append((i, "Large number without formating, consider using siunit", n.span())) + warns.append( + (i, "Large number without formating, consider using siunit", n.span()) + ) return warns + def check_env_not_in_float(env, float_env): warns = [] if env in envs: @@ -274,14 +323,16 @@ def check_env_not_in_float(env, float_env): if (float_env not in in_env) or (not in_env[float_env][e[0]]): warns.append((e[0], "%s not within %s environment" % (env, float_env))) return warns - + def check_listing_in_correct_float(): return check_env_not_in_float("lstlisting", "listing") + def check_tabular_in_correct_float(): return check_env_not_in_float("tabular", "table") + def check_tikz_in_correct_float(): return check_env_not_in_float("tikzpicture", "figure") @@ -318,7 +369,7 @@ def check_short_form(): def check_labels_referenced(): warns = [] - labels = [] 
#re.findall("\\\\label\{([^\\}]+)\}", tex) + labels = [] # re.findall("\\\\label\{([^\\}]+)\}", tex) for i, l in enumerate(tex_lines_clean): lab = re.search("\\\\label\{([^\\}]+)\}", l) if lab: @@ -344,7 +395,13 @@ def check_section_capitalization(): words = n.group(2).split(" ") for w in words: if len(w) > 4 and w[0].islower(): - warns.append((i, "Wrong capitalization of header", (l.index(w), l.index(w) + 1))) + warns.append( + ( + i, + "Wrong capitalization of header", + (l.index(w), l.index(w) + 1), + ) + ) break except: pass @@ -354,10 +411,16 @@ def check_section_capitalization(): def check_quotation(): warns = [] for i, l in enumerate(tex_lines_clean): - ws = re.search("[^\\\\]\"\\w+", l) - we = re.search("\\w+\"", l) + ws = re.search('[^\\\\]"\\w+', l) + we = re.search('\\w+"', l) if (ws or we) and not in_code(i): - warns.append((i, "Wrong quotation, use `` and '' instead of \"", ws.span() if ws else we.span())) + warns.append( + ( + i, + "Wrong quotation, use `` and '' instead of \"", + ws.span() if ws else we.span(), + ) + ) return warns @@ -367,7 +430,13 @@ def check_hline_in_table(): hl = re.search("\\\\hline", l) if hl: if "tabular" in in_env and in_env["tabular"]: - warns.append((i, "\\hline in table, consider using \\toprule, \\midrule, \\bottomrule.", hl.span())) + warns.append( + ( + i, + "\\hline in table, consider using \\toprule, \\midrule, \\bottomrule.", + hl.span(), + ) + ) return warns @@ -388,11 +457,15 @@ def check_headers_without_text(): nx = i while (nx + 1) < len(tex_lines): nx += 1 - if len(tex_lines[nx].strip()) == 0: continue - if tex_lines[nx].strip().startswith("%"): continue + if len(tex_lines[nx].strip()) == 0: + continue + if tex_lines[nx].strip().startswith("%"): + continue nn = re.search("(section|paragraph)\\{([^\\}]+)\\}", tex_lines[nx]) if nn: - warns.append((i, "Section header without text before next header", n.span())) + warns.append( + (i, "Section header without text before next header", n.span()) + ) break return warns 
@@ -401,9 +474,15 @@ def check_one_sentence_paragraphs(): warns = [] for i, l in enumerate(tex_lines): if i > 0 and i < len(tex_lines) - 1: - if len(tex_lines[i - 1].strip()) == 0 and len(tex_lines[i + 1].strip()) == 0 and len(tex_lines[i].strip()) > 0: - if tex_lines[i].strip().startswith("\\"): continue - if ". " in tex_lines[i]: continue + if ( + len(tex_lines[i - 1].strip()) == 0 + and len(tex_lines[i + 1].strip()) == 0 + and len(tex_lines[i].strip()) > 0 + ): + if tex_lines[i].strip().startswith("\\"): + continue + if ". " in tex_lines[i]: + continue warns.append((i, "One-sentence paragraph", (0, len(tex_lines[i])))) return warns @@ -421,9 +500,17 @@ def check_unbalanced_brackets(): warns = [] for i, l in enumerate(tex_lines): if l.count("(") != l.count(")") and not in_code(i): - first = min(l.index("(") if l.count("(") > 0 else len(l), l.index(")") if l.count(")") > 0 else len(l)) - last = max(l.rindex("(") if l.count("(") > 0 else len(l), l.rindex(")") if l.count(")") > 0 else len(l)) - warns.append((i, "Mismatch of opening and closing parenthesis", (first, last))) + first = min( + l.index("(") if l.count("(") > 0 else len(l), + l.index(")") if l.count(")") > 0 else len(l), + ) + last = max( + l.rindex("(") if l.count("(") > 0 else len(l), + l.rindex(")") if l.count(")") > 0 else len(l), + ) + warns.append( + (i, "Mismatch of opening and closing parenthesis", (first, last)) + ) return warns @@ -441,7 +528,9 @@ def check_ellipsis(): for i, l in enumerate(tex_lines): el = re.search("\\w+\\.\\.\\.", l) if el: - warns.append((i, "Ellipsis \"...\" discouraged in academic writing", el.span())) + warns.append( + (i, 'Ellipsis "..." 
discouraged in academic writing', el.span()) + ) return warns @@ -450,7 +539,9 @@ def check_etc(): for i, l in enumerate(tex_lines): el = re.search("\\s+etc[\\.\\w]", l) if el: - warns.append((i, "Unspecific \"etc\" discouraged in academic writing", el.span())) + warns.append( + (i, 'Unspecific "etc" discouraged in academic writing', el.span()) + ) return warns @@ -471,7 +562,7 @@ def check_table_top_caption(): tab = -1 for intab in range(*table): if re.search("\\\\caption\\{", tex_lines[intab]): - caption = intab + caption = intab if re.search("\\\\begin\\{tabular", tex_lines[intab]): tab = intab if tab != -1 and caption != -1 and tab < caption: @@ -479,18 +570,30 @@ def check_table_top_caption(): return warns - def check_punctuation_end_of_line(): warns = [] for i, l in enumerate(tex_lines_clean): sl = l.strip() - if len(sl) < 10: continue - if len(sl.split(" ")) < 8: continue - if in_any_float(i): continue - if "lstlisting" in in_env and in_env["lstlisting"][i]: continue - if sl.startswith("\\") or sl.startswith("%"): continue - if sl.endswith("\\\\") or sl.endswith("}"): continue - if sl.endswith(".") or sl.endswith("!") or sl.endswith("?") or sl.endswith(":") or sl.endswith(";"): continue + if len(sl) < 10: + continue + if len(sl.split(" ")) < 8: + continue + if in_any_float(i): + continue + if "lstlisting" in in_env and in_env["lstlisting"][i]: + continue + if sl.startswith("\\") or sl.startswith("%"): + continue + if sl.endswith("\\\\") or sl.endswith("}"): + continue + if ( + sl.endswith(".") + or sl.endswith("!") + or sl.endswith("?") + or sl.endswith(":") + or sl.endswith(";") + ): + continue p = re.search("\\s*[\\w})$]+[\\.!?}{:;\\\\]\\s*$", l.rstrip()) if not p: warns.append((i, "Line ends without punctuation", (len(l) - 2, len(l)))) @@ -511,7 +614,7 @@ def check_will(): for i, l in enumerate(tex_lines): w = re.search("\\s+will\\s+", l) if w: - warns.append((i, "Usage of \"will\" is discouraged.", w.span())) + warns.append((i, 'Usage of "will" is 
discouraged.', w.span())) return warns @@ -522,7 +625,13 @@ def check_subsection_count(): for i, l in enumerate(tex_lines): if re.search("\\\\section{", l): if last_section != -1 and len(subsections) == 1: - warns.append((last_section, "Section only has one subsection", re.search("\\\\section{", tex_lines[last_section]).span())) + warns.append( + ( + last_section, + "Section only has one subsection", + re.search("\\\\section{", tex_lines[last_section]).span(), + ) + ) last_section = i subsections = [] if re.search("\\\\subsection{", l): @@ -550,7 +659,13 @@ def check_center_in_float(): if "center" in envs: for c in envs["center"]: if in_any_float(c[0]): - warns.append((c[0], "Use \\centering instead of \\begin{center} inside floats", re.search("\\\\begin\{center\}", tex_lines[c[0]]).span())) + warns.append( + ( + c[0], + "Use \\centering instead of \\begin{center} inside floats", + re.search("\\\\begin\{center\}", tex_lines[c[0]]).span(), + ) + ) return warns @@ -568,7 +683,9 @@ def check_eqnarray(): for i, l in enumerate(tex_lines): ap = re.search("\\\\begin\{eqnarray\}", l) if ap: - warns.append((i, "Use \\begin{align} instead of \\begin{eqnarray}", ap.span())) + warns.append( + (i, "Use \\begin{align} instead of \\begin{eqnarray}", ap.span()) + ) return warns @@ -600,13 +717,20 @@ def check_acm_pc(): ("\\bdummy\\-?\\s?head\\b", "temporary head"), ("\\bgender\\-?\\s?bender\\b", "plug-socket adapter"), ("\\borphaned\\-?\\s?object\\b", "unreferenced/unlinked object"), - ("\\bsanity\\-?\\s?check", "coherence/quick/well-formedness check") + ("\\bsanity\\-?\\s?check", "coherence/quick/well-formedness check"), ] for i, l in enumerate(tex_lines): for r in replace: w = re.search(r[0], l) if w: - warns.append((i, "Discouraged term \"%s\", consider replacing with \"%s\"" % (w.group(), r[1]), w.span())) + warns.append( + ( + i, + 'Discouraged term "%s", consider replacing with "%s"' + % (w.group(), r[1]), + w.span(), + ) + ) return warns @@ -618,7 +742,13 @@ def 
check_cite_noun(): warns.append((i, "Citation is used as noun", ap.span())) ap = re.search("^\\s*\\\\cite", l) if ap: - warns.append((i, "Citation at the beginning of a sentence (probably as noun)", ap.span())) + warns.append( + ( + i, + "Citation at the beginning of a sentence (probably as noun)", + ap.span(), + ) + ) return warns @@ -632,7 +762,13 @@ def check_cite_duplicate(): if len(c) != len(list(set(c))): seen = set() dupes = [x for x in c if x in seen or seen.add(x)] - warns.append((i, "Duplicate citation key: %s" % ", ".join(dupes), re.search(dupes[0], l).span())) + warns.append( + ( + i, + "Duplicate citation key: %s" % ", ".join(dupes), + re.search(dupes[0], l).span(), + ) + ) return warns @@ -641,7 +777,13 @@ def check_multicite(): for i, l in enumerate(tex_lines): cites = re.search("\\\\citeA?\\{[^\\}]+\\}\\s*\\\\citeA?\\{[^\\}]+\\}", l) if cites: - warns.append((i, "Multiple \\cite commands, use multiple citation keys in one \\cite instead", cites.span())) + warns.append( + ( + i, + "Multiple \\cite commands, use multiple citation keys in one \\cite instead", + cites.span(), + ) + ) return warns @@ -650,30 +792,51 @@ def check_conjunction_start(): for i, l in enumerate(tex_lines_clean): p = re.search("[\\.!?]\\s+(And|Or|But)[\\s,]", l.rstrip()) if p: - warns.append((i, "Starting a sentence with a conjunction is discouraged", p.span())) + warns.append( + (i, "Starting a sentence with a conjunction is discouraged", p.span()) + ) p = re.search("^(And|Or|But)[\\s,]", l.rstrip()) if p: - warns.append((i, "Starting a sentence with a conjunction is discouraged", p.span())) + warns.append( + (i, "Starting a sentence with a conjunction is discouraged", p.span()) + ) return warns def check_brackets_space(): warns = [] for i, l in enumerate(tex_lines_clean): - if in_code(i) or in_equation(i) or (len(l.strip()) > 0 and l.strip()[0] in ["\\", "%"]): continue + if ( + in_code(i) + or in_equation(i) + or (len(l.strip()) > 0 and l.strip()[0] in ["\\", "%"]) + ): + 
continue p = re.search("[^\\s\\{~\\\\]\\([^(s\\))]", l.rstrip()) if p: - if l.rstrip()[:p.span()[1]].count("$") % 2 == 0: # only if it is not in an equation - warns.append((i, "There must be a space before an opening parenthesis", p.span())) + if ( + l.rstrip()[: p.span()[1]].count("$") % 2 == 0 + ): # only if it is not in an equation + warns.append( + (i, "There must be a space before an opening parenthesis", p.span()) + ) p = re.search("\\(\\s", l.rstrip()) if p: - if l.rstrip()[:p.span()[1]].count("$") % 2 == 0: # only if it is not in an equation - warns.append((i, "There must be no space after an opening parenthesis", p.span())) + if ( + l.rstrip()[: p.span()[1]].count("$") % 2 == 0 + ): # only if it is not in an equation + warns.append( + (i, "There must be no space after an opening parenthesis", p.span()) + ) p = re.search("\\s\\)", l.rstrip()) if p: - if l.rstrip()[:p.span()[1]].count("$") % 2 == 0: # only if it is not in an equation - warns.append((i, "There must be no space before a closing parenthesis", p.span())) - return warns + if ( + l.rstrip()[: p.span()[1]].count("$") % 2 == 0 + ): # only if it is not in an equation + warns.append( + (i, "There must be no space before a closing parenthesis", p.span()) + ) + return warns def check_acronym_capitalization(): @@ -681,31 +844,43 @@ def check_acronym_capitalization(): acronyms = [] acronym_first = {} for i, l in enumerate(tex_lines_clean): - if in_code(i): continue + if in_code(i): + continue p = re.search("\\b[A-Z]{3,}\\b", l) if p and p.group() not in acronyms: pos = p.span()[0] - if pos > 0 and l[pos - 1] == '\\': + if pos > 0 and l[pos - 1] == "\\": continue acronyms.append(p.group()) acronym_first[p.group()] = i for i, l in enumerate(tex_lines_clean): - if in_code(i): continue + if in_code(i): + continue for a in acronyms: p = re.search("\\b%s\\b" % a, l.upper()) if p: - found = l[p.span()[0]:p.span()[1]] - if found[-1] == 's': # ignore plural + found = l[p.span()[0] : p.span()[1]] + if found[-1] == 
"s": # ignore plural found = found[:-1] - if l[:p.span()[0]].count("{") != l[:p.span()[0]].count("}"): # probably inside a reference or label + if l[: p.span()[0]].count("{") != l[: p.span()[0]].count( + "}" + ): # probably inside a reference or label continue - if "@" in l: # probably a mail address + if "@" in l: # probably a mail address continue - if p.span()[0] > 0 and l[p.span()[0] - 1] == '\\': - continue # probably a macro + if p.span()[0] > 0 and l[p.span()[0] - 1] == "\\": + continue # probably a macro if not found.isupper(): - warns.append((i, "(Potential) acronym with wrong capitalization (first defined in Line %d)" % (acronym_first[a] + 1), p.span())) - return warns + warns.append( + ( + i, + "(Potential) acronym with wrong capitalization (first defined in Line %d)" + % (acronym_first[a] + 1), + p.span(), + ) + ) + return warns + def check_numeral(): warns = [] @@ -719,13 +894,19 @@ def check_numeral(): ("\\bnine\\b", "9"), ("\\bten\\b", "10"), ("\\beleven\\b", "11"), - ("\\btwelve\\b", "12") + ("\\btwelve\\b", "12"), ] for i, l in enumerate(tex_lines): for r in replace: w = re.search(r[0], l) if w: - warns.append((i, "Numeral \"%s\" should be replaced with \"%s\"" % (w.group(), r[1]), w.span())) + warns.append( + ( + i, + 'Numeral "%s" should be replaced with "%s"' % (w.group(), r[1]), + w.span(), + ) + ) return warns @@ -740,7 +921,7 @@ def check_colors(): "\\bmagenta\\b", "\\bcyan\\b", "\\bbrown\\b", - "\\bpink\\b" + "\\bpink\\b", ] modifiers = [ "\\bdott?(ed)?\\b", @@ -751,14 +932,17 @@ def check_colors(): "\\bhatch", "\\bcross", "\\bcheck", - "\\bpattern" + "\\bpattern", ] for i, l in enumerate(tex_lines): for c in cols: w = re.search(c, l) if w: # check for = or { in front of color - if w.span()[0] > 0 and (l[w.span()[0] - 1] == "=" or l[w.span()[0] - 1] == "{"): continue + if w.span()[0] > 0 and ( + l[w.span()[0] - 1] == "=" or l[w.span()[0] - 1] == "{" + ): + continue # reduce false positives by looking for modifiers mod = False for m in 
modifiers: @@ -766,7 +950,14 @@ def check_colors(): mod = True break if not mod: - warns.append((i, "Colors (\"%s\") without a modifier such as dashed/dotted/... should be avoided." % (w[0]), w.span())) + warns.append( + ( + i, + 'Colors ("%s") without a modifier such as dashed/dotted/... should be avoided.' + % (w[0]), + w.span(), + ) + ) return warns @@ -778,7 +969,18 @@ def check_inconsistent_word_style(): if styled and "newcommand" not in l: if styled[2] in word_style: if styled[1] != word_style[styled[2]][1][1]: - warns.append((i, "Word '%s' is styled inconsistently, used with \\text%s before at line %d" % (styled[2], word_style[styled[2]][1][1], word_style[styled[2]][0] + 1), styled.span())) + warns.append( + ( + i, + "Word '%s' is styled inconsistently, used with \\text%s before at line %d" + % ( + styled[2], + word_style[styled[2]][1][1], + word_style[styled[2]][0] + 1, + ), + styled.span(), + ) + ) else: word_style[styled[2]] = (i, styled) return warns @@ -790,27 +992,43 @@ def check_missing_word_style(): for i, l in enumerate(tex_lines_clean): styled = re.search("\\\\text([^\\{]+)\{([^\\}]+)\}", l) if styled: - if len(styled[2]) <= 3: continue # reduce false positives for variables + if len(styled[2]) <= 3: + continue # reduce false positives for variables if styled[2] in word_style: word_style[styled[2]][2] += 1 else: word_style[styled[2]] = [i, styled, 1] - + for i, l in enumerate(tex_lines_clean): - if in_code(i): continue + if in_code(i): + continue for s in word_style.keys(): - if word_style[s][2] == 1: continue # reduce false positives, e.g., when the word is emphasized once + if word_style[s][2] == 1: + continue # reduce false positives, e.g., when the word is emphasized once try: w = re.search("\\b%s\\b" % s, l) except: continue if w: if w.span()[0] > 0 and l[w.span()[0] - 1] != "{": - warns.append((i, "Word '%s' used without a style, used with \\text%s before at line %d (and %d other location%s)" % (s, word_style[s][1][1], word_style[s][0] + 1, 
word_style[s][2], "s" if word_style[s][2] == 1 else ""), w.span())) + warns.append( + ( + i, + "Word '%s' used without a style, used with \\text%s before at line %d (and %d other location%s)" + % ( + s, + word_style[s][1][1], + word_style[s][0] + 1, + word_style[s][2], + "s" if word_style[s][2] == 1 else "", + ), + w.span(), + ) + ) return warns -def print_warnings(warn, output = True): +def print_warnings(warn, output=True): warnings = 0 sorted_warn = sorted(warn, key=lambda tup: tup[0][0]) for cw in sorted_warn: @@ -818,21 +1036,28 @@ def print_warnings(warn, output = True): if w[0] != -1 and tex_lines[w[0]].strip().startswith("%"): continue - if output: - print("\033[33mWarning %d\033[0m: " % (warnings + 1), end = "") + if output: + print("\033[33mWarning %d\033[0m: " % (warnings + 1), end="") warnings += 1 if w[0] != -1: - if output: print("Line %d: %s" % (w[0] + 1, w[1]), end = "") + if output: + print("Line %d: %s" % (w[0] + 1, w[1]), end="") else: - if output: print(w[1], end = "") - + if output: + print(w[1], end="") + if output: - print(" \033[90m[%s]\033[0m" % cw[1], end = "") + print(" \033[90m[%s]\033[0m" % cw[1], end="") print("") if len(w) > 2: - if output: print(" %s" % tex_lines[w[0]].replace("\t", " ")) - if output: print(" %s\033[33m%s\033[0m" % (" " * w[2][0], "^" * (w[2][1] - w[2][0]))) + if output: + print(" %s" % tex_lines[w[0]].replace("\t", " ")) + if output: + print( + " %s\033[33m%s\033[0m" + % (" " * w[2][0], "^" * (w[2][1] - w[2][0])) + ) return warnings @@ -843,73 +1068,80 @@ def print_warnings(warn, output = True): CATEGORY_REFERENCE = 16 checks = [ - (check_space_before_cite, CATEGORY_TYPOGRAPHY, "cite-space"), - (check_figure_alignment, CATEGORY_STYLE, "figure-alignment"), - (check_table_alignment, CATEGORY_STYLE, "table-alignment"), - (check_listing_alignment, CATEGORY_STYLE, "listing-alignment"), - (check_figure_has_label, CATEGORY_REFERENCE, "figure-label"), - (check_table_has_label, CATEGORY_REFERENCE, "table-label"), - 
(check_listing_has_label, CATEGORY_REFERENCE, "listing-label"), - (check_figure_has_caption, CATEGORY_STYLE, "figure-caption"), - (check_table_has_caption, CATEGORY_STYLE, "table-caption"), - (check_listing_has_caption, CATEGORY_STYLE, "listing-caption"), - (check_no_resizebox_for_tables, CATEGORY_STYLE, "resize-table"), - (check_weird_units, CATEGORY_STYLE, "dimensions"), - (check_figure_caption_label_order, CATEGORY_REFERENCE, "figure-caption-order"), - (check_table_caption_label_order, CATEGORY_REFERENCE, "table-caption-order"), - (check_listing_caption_label_order, CATEGORY_REFERENCE, "listing-caption-order"), - (check_todos, CATEGORY_GENERAL, "todo"), - (check_notes, CATEGORY_GENERAL, "note"), - (check_math_numbers, CATEGORY_TYPOGRAPHY, "math-numbers"), - (check_large_numbers_without_si, CATEGORY_TYPOGRAPHY, "si"), - (check_listing_in_correct_float, CATEGORY_REFERENCE, "listing-float"), - (check_tabular_in_correct_float, CATEGORY_REFERENCE, "tabular-float"), - (check_tikz_in_correct_float, CATEGORY_REFERENCE, "tikz-float"), - (check_comment_has_space, CATEGORY_TYPOGRAPHY, "comment-space"), - (check_percent_without_siunix, CATEGORY_TYPOGRAPHY, "percentage"), - (check_short_form, CATEGORY_GENERAL, "short-form"), - (check_labels_referenced, CATEGORY_REFERENCE, "label-referenced"), - (check_section_capitalization, CATEGORY_VISUAL, "capitalization"), - (check_quotation, CATEGORY_TYPOGRAPHY, "quotes"), - (check_hline_in_table, CATEGORY_VISUAL, "hline"), - (check_space_before_punctuation, CATEGORY_TYPOGRAPHY, "punctuation-space"), - (check_headers_without_text, CATEGORY_VISUAL, "two-header"), - (check_one_sentence_paragraphs, CATEGORY_VISUAL, "single-sentence"), - (check_multiple_sentences_per_line, CATEGORY_GENERAL, "multiple-sentences"), - (check_unbalanced_brackets, CATEGORY_TYPOGRAPHY, "unbalanced-brackets"), - (check_and_or, CATEGORY_TYPOGRAPHY, "and-or"), - (check_ellipsis, CATEGORY_TYPOGRAPHY, "ellipsis"), - (check_etc, CATEGORY_STYLE, "etc"), - 
(check_punctuation_end_of_line, CATEGORY_TYPOGRAPHY, "punctuation"), - (check_footnote, CATEGORY_TYPOGRAPHY, "footnote"), - (check_table_vertical_lines, CATEGORY_VISUAL, "vline"), - (check_table_top_caption, CATEGORY_STYLE, "table-top-caption"), - (check_will, CATEGORY_GENERAL, "will"), - (check_subsection_count, CATEGORY_VISUAL, "single-subsection"), - (check_mixed_compact_and_item, CATEGORY_VISUAL, "mixed-compact"), - (check_center_in_float, CATEGORY_VISUAL, "float-center"), - (check_appendix, CATEGORY_STYLE, "appendix"), - (check_eqnarray, CATEGORY_VISUAL, "eqnarray"), - (check_acm_pc, CATEGORY_STYLE, "inclusion"), - (check_cite_noun, CATEGORY_STYLE, "cite-noun"), - (check_cite_duplicate, CATEGORY_REFERENCE, "cite-duplicate"), - (check_conjunction_start, CATEGORY_STYLE, "conjunction-start"), - (check_brackets_space, CATEGORY_TYPOGRAPHY, "bracket-spacing"), - (check_acronym_capitalization, CATEGORY_TYPOGRAPHY, "acronym-capitalization"), - (check_numeral, CATEGORY_GENERAL, "numeral"), - (check_multicite, CATEGORY_STYLE, "multiple-cites"), - (check_colors, CATEGORY_VISUAL, "colors"), - (check_inconsistent_word_style, CATEGORY_TYPOGRAPHY, "inconsistent-textstyle"), - (check_missing_word_style, CATEGORY_TYPOGRAPHY, "missing-textstyle") + (check_space_before_cite, CATEGORY_TYPOGRAPHY, "cite-space"), + (check_figure_alignment, CATEGORY_STYLE, "figure-alignment"), + (check_table_alignment, CATEGORY_STYLE, "table-alignment"), + (check_listing_alignment, CATEGORY_STYLE, "listing-alignment"), + (check_figure_has_label, CATEGORY_REFERENCE, "figure-label"), + (check_table_has_label, CATEGORY_REFERENCE, "table-label"), + (check_listing_has_label, CATEGORY_REFERENCE, "listing-label"), + (check_figure_has_caption, CATEGORY_STYLE, "figure-caption"), + (check_table_has_caption, CATEGORY_STYLE, "table-caption"), + (check_listing_has_caption, CATEGORY_STYLE, "listing-caption"), + (check_no_resizebox_for_tables, CATEGORY_STYLE, "resize-table"), + (check_weird_units, CATEGORY_STYLE, 
"dimensions"), + (check_figure_caption_label_order, CATEGORY_REFERENCE, "figure-caption-order"), + (check_table_caption_label_order, CATEGORY_REFERENCE, "table-caption-order"), + (check_listing_caption_label_order, CATEGORY_REFERENCE, "listing-caption-order"), + (check_todos, CATEGORY_GENERAL, "todo"), + (check_notes, CATEGORY_GENERAL, "note"), + (check_math_numbers, CATEGORY_TYPOGRAPHY, "math-numbers"), + (check_large_numbers_without_si, CATEGORY_TYPOGRAPHY, "si"), + (check_listing_in_correct_float, CATEGORY_REFERENCE, "listing-float"), + (check_tabular_in_correct_float, CATEGORY_REFERENCE, "tabular-float"), + (check_tikz_in_correct_float, CATEGORY_REFERENCE, "tikz-float"), + (check_comment_has_space, CATEGORY_TYPOGRAPHY, "comment-space"), + (check_percent_without_siunix, CATEGORY_TYPOGRAPHY, "percentage"), + (check_short_form, CATEGORY_GENERAL, "short-form"), + (check_labels_referenced, CATEGORY_REFERENCE, "label-referenced"), + (check_section_capitalization, CATEGORY_VISUAL, "capitalization"), + (check_quotation, CATEGORY_TYPOGRAPHY, "quotes"), + (check_hline_in_table, CATEGORY_VISUAL, "hline"), + (check_space_before_punctuation, CATEGORY_TYPOGRAPHY, "punctuation-space"), + (check_headers_without_text, CATEGORY_VISUAL, "two-header"), + (check_one_sentence_paragraphs, CATEGORY_VISUAL, "single-sentence"), + (check_multiple_sentences_per_line, CATEGORY_GENERAL, "multiple-sentences"), + (check_unbalanced_brackets, CATEGORY_TYPOGRAPHY, "unbalanced-brackets"), + (check_and_or, CATEGORY_TYPOGRAPHY, "and-or"), + (check_ellipsis, CATEGORY_TYPOGRAPHY, "ellipsis"), + (check_etc, CATEGORY_STYLE, "etc"), + (check_punctuation_end_of_line, CATEGORY_TYPOGRAPHY, "punctuation"), + (check_footnote, CATEGORY_TYPOGRAPHY, "footnote"), + (check_table_vertical_lines, CATEGORY_VISUAL, "vline"), + (check_table_top_caption, CATEGORY_STYLE, "table-top-caption"), + (check_will, CATEGORY_GENERAL, "will"), + (check_subsection_count, CATEGORY_VISUAL, "single-subsection"), + 
(check_mixed_compact_and_item, CATEGORY_VISUAL, "mixed-compact"), + (check_center_in_float, CATEGORY_VISUAL, "float-center"), + (check_appendix, CATEGORY_STYLE, "appendix"), + (check_eqnarray, CATEGORY_VISUAL, "eqnarray"), + (check_acm_pc, CATEGORY_STYLE, "inclusion"), + (check_cite_noun, CATEGORY_STYLE, "cite-noun"), + (check_cite_duplicate, CATEGORY_REFERENCE, "cite-duplicate"), + (check_conjunction_start, CATEGORY_STYLE, "conjunction-start"), + (check_brackets_space, CATEGORY_TYPOGRAPHY, "bracket-spacing"), + (check_acronym_capitalization, CATEGORY_TYPOGRAPHY, "acronym-capitalization"), + (check_numeral, CATEGORY_GENERAL, "numeral"), + (check_multicite, CATEGORY_STYLE, "multiple-cites"), + (check_colors, CATEGORY_VISUAL, "colors"), + (check_inconsistent_word_style, CATEGORY_TYPOGRAPHY, "inconsistent-textstyle"), + (check_missing_word_style, CATEGORY_TYPOGRAPHY, "missing-textstyle"), ] category_switches = [ - ("all", CATEGORY_GENERAL | CATEGORY_REFERENCE | CATEGORY_STYLE | CATEGORY_TYPOGRAPHY | CATEGORY_VISUAL), - ("general", CATEGORY_GENERAL), - ("reference", CATEGORY_REFERENCE), - ("style", CATEGORY_STYLE), + ( + "all", + CATEGORY_GENERAL + | CATEGORY_REFERENCE + | CATEGORY_STYLE + | CATEGORY_TYPOGRAPHY + | CATEGORY_VISUAL, + ), + ("general", CATEGORY_GENERAL), + ("reference", CATEGORY_REFERENCE), + ("style", CATEGORY_STYLE), ("typography", CATEGORY_TYPOGRAPHY), - ("visual", CATEGORY_VISUAL) + ("visual", CATEGORY_VISUAL), ] @@ -932,7 +1164,7 @@ def add_categories(cat, new_cat): if new_cat & cats[1]: cat.add(cats[2]) - + def remove_categories(cat, rem_cat): if type(rem_cat) is str: full_cat = [x[0] for x in category_switches] @@ -949,19 +1181,52 @@ def remove_categories(cat, rem_cat): cat.remove(cats[2]) +def run_linter_once(filename: str) -> None: + + if not os.path.exists(filename): + raise FileNotFoundError(f"No file found: {filename}") + + nr_warnings, nr_suppressed = 0, 0 + used_categories = set() + add_categories(used_categories, "all") + + 
next_file(filename) + print("Inspecting file \033[94m'%s'\033[0m" % filename) + + preprocess() + + warnings = [] + suppressed = [] + for c in checks: + add_warn = c[0]() + if c[2] in used_categories: + warnings += [(x, c[2]) for x in add_warn] + else: + suppressed += [(x, c[2]) for x in add_warn] + + nr_warnings += print_warnings(warnings) + nr_suppressed += print_warnings(suppressed, output=False) + + def main(): + if len(sys.argv) < 2: + usage() + sys.exit(1) + + tex_files = get_tex_files() + nr_warnings = 0 nr_suppressed = 0 idx = 1 has_rules = False exit_code = False - + # -x to exclude, -i to include used_categories = set() add_categories(used_categories, "all") - + while idx < len(sys.argv): arg = sys.argv[idx] if arg == "-x": @@ -989,7 +1254,7 @@ def main(): else: print("Missing switch after -i") usage() - + if arg == "--error": exit_code = True idx += 1 @@ -1000,7 +1265,7 @@ def main(): for file in tex_files: next_file(file) print("Inspecting file \033[94m'%s'\033[0m" % file) - + preprocess() warnings = [] @@ -1013,7 +1278,7 @@ def main(): suppressed += [(x, c[2]) for x in add_warn] nr_warnings += print_warnings(warnings) - nr_suppressed += print_warnings(suppressed, output = False) + nr_suppressed += print_warnings(suppressed, output=False) print("") print("%d warnings printed; %d suppressed warnings" % (nr_warnings, nr_suppressed)) @@ -1023,4 +1288,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4d0fa2d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools", "setuptools-scm"] + +[project] +name = "paperlinter" +version = "0.2.3" +description = "Checks for common mistakes in LaTeX source files of scientific papers" +keywords = ["LaTeX", "TeX", "linter", "paper"] +readme = "README.md" +requires-python = ">=3.8" +authors = [ + {name = "Michael Schwarz"}, +] +classifiers = [ + "License :: OSI 
Approved :: MIT License", +] + +[project.scripts] +paperlinter = "paperlinter:main" + +[project.urls] +Homepage = "https://github.com/misc0110/Paper-Linter/"