-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path20.processing
37 lines (32 loc) · 940 Bytes
/
20.processing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
# Annotate one deduplicated batch of a collection/language pair.
#
# Required environment:
#   HQ_ENTRY  - space-separated "<lang> <collection> <batch>" triple
#               (presumably the per-task entry set by the job scheduler — confirm)
#   WORKSPACE - root directory holding dedup/, annotated/ and robotstxt/
#               (presumably defined by .env — confirm)
#
# .env and .checks are sourced from the current directory before strict mode
# is enabled, so their own failures do not abort the script here.
source .env
source .checks
set -euo pipefail

# Split the task entry into its three fields.
IFS=" " read -ra args <<< "$HQ_ENTRY"
if (( ${#args[@]} < 3 )); then
  # Fail with a readable message instead of set -u's "unbound variable".
  echo "HQ_ENTRY must be '<lang> <collection> <batch>', got: '$HQ_ENTRY'" >&2
  exit 1
fi
L=${args[0]}
COLL=${args[1]}
batch=${args[2]}

# Input batch produced by the dedup step, and where the annotated batch goes.
INPUT="$WORKSPACE/dedup/$COLL/$L/$batch"
out_dir="$WORKSPACE/annotated/$COLL/$L"
OUTPUT="$out_dir/$batch"
mkdir -p -- "$out_dir"
echo "running $L $COLL $batch"
# Run annotation parallel loop.
# Stages, each streaming into the next:
#   1. zstdcat decompresses the input batch.
#   2. `annotator` runs inside the container with the collection's
#      disallowed-urls.fst (only $WORKSPACE/robotstxt is bind-mounted).
#   3. GNU parallel runs inside ONE container and fans out to
#      scripts/annotate.py, so we avoid launching many containers.
#      The container needs the current dir mounted to find scripts/.
#      -k keeps output in input order; --halt now,fail=1 aborts on first failure.
#   4. zstdmt recompresses into a temporary file.
# /usr/bin/time -v reports resource usage of the two container stages on stderr.
# The result is written to "$OUTPUT.tmp" and renamed only after the whole
# pipeline succeeded, so "$OUTPUT" never holds a partial batch.
zstdcat "$INPUT" \
  | /usr/bin/time -v \
    singularity exec --bind "$WORKSPACE/robotstxt" monotextor.sif \
    annotator -d "$WORKSPACE/robotstxt/$COLL/disallowed-urls.fst" \
  | /usr/bin/time -v \
    singularity exec --bind "$(pwd -P)" --pwd "$(pwd -P)" monotextor.sif \
    parallel --pipe -k \
    --halt now,fail=1 \
    -j200 --block 20M python scripts/annotate.py -a "$L" \
  | zstdmt -T64 -10 \
  > "$OUTPUT.tmp" \
  || {
    # PIPESTATUS still refers to the pipeline above; [*] joins the
    # per-stage exit codes into a single space-separated string.
    echo "Error in pipeline: ${PIPESTATUS[*]}" >&2
    exit 1
  }
mv -- "$OUTPUT.tmp" "$OUTPUT"