Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tsv cleanup script and workflow #82

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
bd0f961
general tsv cleaner
eharkins Jul 18, 2020
d7ffc97
add --sort-col to bin/clean-tsv-metadata
eharkins Aug 10, 2020
8b76a51
add clean-tsv workflow
eharkins Aug 10, 2020
5959b73
clean-tsv-metadata: 0-based sort col + error handle
eharkins Aug 27, 2020
541c155
clean-tsv-metadata: fix pandas quotes
eharkins Aug 27, 2020
bbe727a
clean-tsv-metadata: correct header arg to_csv
eharkins Aug 27, 2020
8f5d995
clean-tsv-metadata: allow header and n_cols
eharkins Aug 27, 2020
4ef3b29
clean-tsv-metadata: input file positional arg
eharkins Aug 27, 2020
3c28524
clean-tsv-metadata: rename
eharkins Aug 27, 2020
a932f6c
clean-tsv-metadata: --header -> --no-header
eharkins Aug 27, 2020
a7ad918
clean-tsv-metadata: add output_file;
eharkins Aug 27, 2020
3b8d794
clean-tsv-metadata: fix column truncation
eharkins Sep 2, 2020
10d3b6f
clean-tsv-metadata: too many col error msg
eharkins Sep 2, 2020
54268fb
clean-tsv: Stop reading in TSV twice
kairstenfay Sep 2, 2020
83bc065
clean-tsv: Guard against negative and 0 --n-cols
kairstenfay Sep 2, 2020
920a598
clean-tsv: Improve --sort-col error message
kairstenfay Sep 3, 2020
6d9ec07
clean-tsv: Add type hints to functions
kairstenfay Sep 3, 2020
fb99922
clean-tsv: Fix typo
kairstenfay Sep 3, 2020
40a62a0
clean-tsv: Wrap long lines
kairstenfay Sep 3, 2020
37b8814
clean-tsv: Print TSV to stdout by default
kairstenfay Sep 3, 2020
b7edc34
clean-tsv: Add --sort option
kairstenfay Sep 16, 2020
01ee180
Add a script for cleaning manually maintained TSVs
kairstenfay Sep 16, 2020
22cef3f
Revert "add clean-tsv workflow"
kairstenfay Sep 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions bin/clean-source-data-tsvs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# usage: clean-source-data-tsvs [--sort]
# clean-source-data-tsvs --help
#
# Cleans the two biggest, manually maintained source-data TSVs:
# gisaid_annotations.tsv and location_hierarchy.tsv. Optionally sorts the files
# with the --sort option.
#
set -euo pipefail  # abort on command failure (-e), unset variables (-u), or a failing pipeline stage (pipefail)

main() {
    # Split gisaid_annotations.tsv into its paper and non-paper rows, run
    # ./bin/clean-tsv on each half and on location_hierarchy.tsv, then
    # reassemble the annotations file with the non-paper rows first.
    #
    # Options:
    #   -h, --help   print the usage comments at the top of this file and exit
    #   --sort       also sort each file while cleaning it
    local sort=0
    local arg

    for arg; do
        case "$arg" in
            -h|--help)
                print-help
                exit
                ;;
            --sort)
                sort=1
                break
                ;;
        esac
    done

    # All paths below are relative to the repository root (the parent of bin/).
    cd "$(dirname "$0")/.."

    local annotations=./source-data/gisaid_annotations.tsv
    local hierarchy=./source-data/location_hierarchy.tsv

    # Paper and non-paper annotations should be processed separately, because
    # the build maintainers have kept title and paper_url at the bottom half of
    # the file. This makes it easier for build maintainers to add other stuff at
    # the top.
    local nonpaper_annotations paper_annotations
    nonpaper_annotations="$(mktemp -t nonpaper-annotations-XXXXXX.tsv)"
    paper_annotations="$(mktemp -t paper-annotations-XXXXXX.tsv)"

    # The temp paths are expanded into the trap string right here, so the
    # cleanup still works after these locals have gone out of scope.
    trap "rm -f '$nonpaper_annotations' '$paper_annotations'" EXIT

    # grep exits with status 1 when it selects no lines. That is a legitimate
    # outcome here (a file with only paper rows, or none at all), so only
    # statuses other than 1 (real errors) are allowed to abort under `set -e`.
    grep -v "paper\|title" "$annotations" > "$nonpaper_annotations" || [[ $? -eq 1 ]]
    grep "paper\|title" "$annotations" > "$paper_annotations" || [[ $? -eq 1 ]]

    # Extra clean-tsv arguments used only with --sort. Paper rows are sorted
    # with priority on column 1 (0-based); everything else by columns in order.
    local -a sort_args=() paper_sort_args=()
    if [[ "$sort" == 1 ]]; then
        sort_args=(--sort)
        paper_sort_args=(--sort-col 1)
    fi

    # The ${array[@]+"${array[@]}"} form expands to nothing for an empty array
    # without tripping `set -u` on older bash releases.
    #
    # An empty half has nothing to clean, so it is skipped instead of being
    # handed to clean-tsv.
    if [[ -s "$nonpaper_annotations" ]]; then
        ./bin/clean-tsv "$nonpaper_annotations" --output-tsv "$nonpaper_annotations" \
            --n-cols 4 --no-header ${sort_args[@]+"${sort_args[@]}"}
    fi

    if [[ -s "$paper_annotations" ]]; then
        ./bin/clean-tsv "$paper_annotations" --output-tsv "$paper_annotations" \
            --n-cols 4 --no-header ${paper_sort_args[@]+"${paper_sort_args[@]}"}
    fi

    ./bin/clean-tsv "$hierarchy" --output-tsv "$hierarchy" \
        --n-cols 4 ${sort_args[@]+"${sort_args[@]}"}

    # Reassemble only after every cleaning step succeeded: non-paper rows
    # first, paper rows at the bottom.
    cat "$nonpaper_annotations" "$paper_annotations" > "$annotations"
}

print-help() {
    # Echo the comment block at the top of this script ($0) as the help text.
    # The shebang line is skipped; each following comment line is printed with
    # its leading "#" and at most one space after it removed. Reading stops at
    # the first line that is not a comment.
    local text
    while read -r text; do
        case "$text" in
            '#!'*)
                continue
                ;;
            '#'*)
                text="${text#\#}"
                text="${text# }"
                echo "$text"
                ;;
            *)
                break
                ;;
        esac
    done < "$0"
}

# Script entry point: forward every command-line argument to main.
main "$@"
126 changes: 126 additions & 0 deletions bin/clean-tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
import argparse
import csv
import sys
from typing import Optional, TextIO, Union

import pandas as pd


def positive_integer(string: str) -> int:
    """
    Custom type checker for argparse.

    Returns *string* converted to an ``int``.

    Raises :class:`argparse.ArgumentTypeError` if the given *string* is not a
    valid int or if it's less than or equal to zero.
    """
    message = "--n-cols must be an integer > 0"

    try:
        value = int(string)
    except ValueError:
        # The ValueError adds nothing for a CLI user, so drop it from the chain.
        raise argparse.ArgumentTypeError(message) from None

    # An explicit check rather than `assert`: assertions are removed when
    # Python runs with -O, which would let 0 and negative values through.
    if value <= 0:
        raise argparse.ArgumentTypeError(message)

    return value

def clean_tsv_file(input_file: str, output: Union[str, TextIO], n_cols: Optional[int],
                   header: bool, sort: bool, sort_col: Optional[str]) -> None:
    """
    Clean the TSV at *input_file* and write the result to *output*.

    1. Read the tsv file. Every cell is read as text, so values such as
       ``007``, ``1.0`` or ``NA`` are written back exactly as they were read;
       only empty cells count as missing.
    2. Make sure there are n columns (n-1 tabs) in each row (if *n_cols* is
       passed).
    3. If *sort* is true, sort rows alphabetically by the columns in the order
       they appear, with *sort_col* (if given) taking priority. *sort_col*
       only sets the column priority; nothing is sorted unless *sort* is true.
    4. Write out the tsv file, without quoting, to *output* (a path or an open
       text stream).

    *header* says whether the first row holds the column names. *sort_col* is
    a column name when *header* is true, otherwise a 0-based column index
    given as a string.

    Raises:
        Exception: if pandas cannot parse the file (typically because rows
            have inconsistent widths, or *n_cols* exceeds the real width).
        ValueError: if *header* is true but the file has no rows or its first
            row has missing values, or if *sort_col* does not identify a
            column.
    """
    try:
        data = pd.read_csv(input_file,
                           sep="\t",
                           header=None,
                           names=range(n_cols) if n_cols else None,
                           usecols=range(n_cols) if n_cols else None,
                           # Keep cells as text: a cleaner must not reformat
                           # numbers or blank out literal "NA"/"null" values.
                           dtype=str,
                           keep_default_na=False,
                           na_values=[""])

    except pd.errors.ParserError as e:
        if n_cols is None:
            raise Exception(f"Error decoding tsv «{input_file}». Please provide an argument for "
                "--n-cols specifying the number of columns to enforce. This number must not exceed "
                "the total number of columns in the tsv.") from e
        raise Exception(f"Error decoding tsv: {e}. \n\n Did you specify too many columns with "
            "--n-cols? --n-cols can't add extra columns, it just enforces up to the existing "
            "number of columns in the tsv.") from e

    if n_cols:
        # Drop excess columns as specified by --n-cols
        data = data[data.columns[0:n_cols]]

    col_names = list(data.columns)

    if header:
        if data.empty:
            raise ValueError(f"Error: «{input_file}» has no rows, so it has no header. "
                "Did you mean to specify --no-header?")

        # "Pop" first row and replace column names with it
        first_row = data.iloc[0]

        # Raised explicitly rather than asserted so the check survives
        # `python -O`.
        if not first_row.notnull().all():
            raise ValueError("Error: You have missing values in your tsv header. "
                "Did you mean to specify --no-header?")

        col_names = list(first_row)
        data.columns = col_names
        data = data.drop(0).reset_index(drop=True)

    if sort_col is not None:
        try:
            # Headerless files have integer column labels, so the requested
            # column must be an integer too. Both int() and list.remove()
            # signal a bad column with ValueError.
            sort_key = sort_col if header else int(sort_col)
            col_names.remove(sort_key)
        except ValueError:
            raise ValueError(f"«{sort_col}» is not a column name in file, or if --no-header was "
                "used, there are not this many columns in the file.") from None

        # Sort on the requested column first, then the rest in file order.
        col_names = [sort_key] + col_names

    if sort:
        data = data.sort_values(col_names)

    data.to_csv(output, sep="\t", index=False, header=header, quoting=csv.QUOTE_NONE)


if __name__ == "__main__":
    # Command-line entry point: parse the options below and hand them to
    # clean_tsv_file().
    parser = argparse.ArgumentParser(
        description="""Sort tsv and clean tabs.""",
        formatter_class=argparse.RawTextHelpFormatter
    )

    parser.add_argument(
        "tsv",
        type=str,
        help="Path to file in TSV format to be cleaned and sorted.\n")

    # When --output-tsv is omitted the default is the sys.stdout stream itself
    # rather than a path.
    parser.add_argument(
        "--output-tsv",
        type=str,
        dest="output",
        default=sys.stdout,
        help="Path to write cleaned and sorted output TSV. Defaults to stdout.")

    parser.add_argument(
        "--n-cols",
        type=positive_integer,
        help="Number of columns to enforce. Extra tabs will be added for rows with fewer than "
            "this number, and rows with more than this number will be truncated to this number "
            "of columns/tabs.\n")

    parser.add_argument(
        "--sort",
        action='store_true',
        help="Sort output TSV alphabetically by columns in the order they appear. This is the "
            "default sorting behavior, and it can be overridden with --sort-col.")

    parser.add_argument(
        "--sort-col",
        type=str,
        help="A column name (or 0-based index for headerless tsvs) to prioritize sorting; "
            "overrides the --sort option.\n")

    parser.add_argument(
        "--no-header",
        action='store_true',
        help="Do not read in the first line of the file as the column names.\n")

    args = parser.parse_args()
    clean_tsv_file(input_file=args.tsv,
                   output=args.output,
                   n_cols=args.n_cols,
                   header=not args.no_header,
                   # Passing --sort-col implies sorting even without --sort.
                   sort=args.sort or (args.sort_col is not None),
                   sort_col=args.sort_col)