Skip to content

Commit

Permalink
Add retries to mergepdf (#77)
Browse files Browse the repository at this point in the history
* update limits

* Add retries to mergepdf
  • Loading branch information
joecorall authored Jan 20, 2025
1 parent 619743b commit 1c3a177
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 22 deletions.
4 changes: 2 additions & 2 deletions ci/k8s/crayfits.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ spec:
spec:
containers:
- name: scyllaridae-crayfits
image: lehighlts/scyllaridae-fits:main
image: lehighlts/scyllaridae-fits:main-14a4905
imagePullPolicy: IfNotPresent
resources:
requests:
memory: "32Mi"
cpu: "100m"
limits:
memory: "128Mi"
memory: "1Gi"
ports:
- containerPort: 8080
hostPort: 8083
Expand Down
8 changes: 3 additions & 5 deletions ci/k8s/fits.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ spec:
app: fits
ports:
- protocol: TCP
port: 8084
targetPort: 8080
port: 8080
---
apiVersion: apps/v1
kind: Deployment
Expand All @@ -27,7 +26,7 @@ spec:
spec:
containers:
- name: fits
image: islandora/fits:3.4
image: islandora/fits:3.4.12
imagePullPolicy: IfNotPresent
env:
- name: FITS_MAX_REQUEST_SIZE
Expand All @@ -39,10 +38,9 @@ spec:
memory: "256Mi"
cpu: "250m"
limits:
memory: "2Gi"
memory: "4Gi"
ports:
- containerPort: 8080
hostPort: 8084
readinessProbe:
httpGet:
path: /fits/version
Expand Down
10 changes: 6 additions & 4 deletions ci/k8s/houdini.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ spec:
app: islandora-imagemagick
ports:
- protocol: TCP
port: 8080
port: 8090
targetPort: 8080
---
apiVersion: apps/v1
kind: Deployment
Expand All @@ -26,16 +27,17 @@ spec:
spec:
containers:
- name: scyllaridae-imagemagick
image: lehighlts/scyllaridae-imagemagick:main-dff1880
image: lehighlts/scyllaridae-imagemagick:main-8a5b743
imagePullPolicy: IfNotPresent
resources:
requests:
memory: "128Mi"
memory: "256Mi"
cpu: "100m"
limits:
memory: "2Gi"
memory: "10Gi"
ports:
- containerPort: 8080
hostPort: 8090
readinessProbe:
httpGet:
path: /healthcheck
Expand Down
4 changes: 2 additions & 2 deletions ci/k8s/mergepdf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
spec:
containers:
- name: scyllaridae-mergepdf
image: lehighlts/scyllaridae-mergepdf:main-14b2276
image: lehighlts/scyllaridae-mergepdf:main-4a2d8ca
imagePullPolicy: IfNotPresent
env:
- name: MAX_THREADS
Expand All @@ -37,7 +37,7 @@ spec:
memory: "128Mi"
cpu: "500m"
limits:
memory: "4Gi"
memory: "24Gi"
ports:
- containerPort: 8080
hostPort: 8088
Expand Down
4 changes: 2 additions & 2 deletions ci/k8s/whisper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ spec:
spec:
containers:
- name: scyllaridae-whisper
image: lehighlts/scyllaridae-whisper:main
image: lehighlts/scyllaridae-whisper:main-14a4905
imagePullPolicy: IfNotPresent
resources:
requests:
memory: "2Gi"
cpu: "500m"
limits:
memory: "5Gi"
memory: "10Gi"
nvidia.com/gpu: "4"
ports:
- containerPort: 8080
Expand Down
39 changes: 32 additions & 7 deletions examples/mergepdf/cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,29 @@ TMP_DIR=$(mktemp -d)
# Running index of the image being processed; used to name the per-image
# work files ("$TMP_DIR/img_$I") and incremented after each job is launched.
I=0
# Upper bound on concurrently running background jobs; can be overridden
# from the environment and falls back to 5.
MAX_THREADS=${MAX_THREADS:-5}
# PIDs of the background jobs launched so far; its length is compared against
# MAX_THREADS before another job is started.
PIDS=()
# Number of download/resize attempts made per image by download_and_process.
RETRIES=3

# iterate over all images in the IIIF manifest
URLS=$(curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | awk -F '/' '{print $7}'|sed -e 's/%2F/\//g' -e 's/%3A/:/g')
# Function to download and process the image with retries
#######################################
# Download an image and pipe it through ImageMagick to resize it, retrying
# when the pipeline fails.
# Globals:   RETRIES (read) - maximum number of attempts; 3 when unset/empty
# Arguments: $1 - URL of the image to fetch
#            $2 - path the resized image is written to
# Outputs:   retry and failure diagnostics on stderr; nothing on stdout
# Returns:   0 as soon as one attempt succeeds, 1 once all attempts failed
#######################################
download_and_process() {
  local url="$1"
  local output_file="$2"
  # Fall back to a sane default so the function still works when the caller
  # has not defined RETRIES.
  local max_attempts="${RETRIES:-3}"
  local attempt=0

  while (( attempt < max_attempts )); do
    # "1000x\>" only shrinks images wider than 1000px; smaller ones are kept.
    # NOTE(review): without pipefail the status tested here is magick's, so a
    # curl failure is only noticed when magick rejects the (empty) input.
    if curl -s "$url" | magick - -resize 1000x\> "$output_file" > /dev/null 2>&1; then
      return 0
    fi
    attempt=$(( attempt + 1 ))

    # Announce a retry and back off only when another attempt will actually
    # follow. Diagnostics go to stderr to keep stdout clean.
    if (( attempt < max_attempts )); then
      printf 'Retrying (%s/%s) for %s...\n' "$attempt" "$max_attempts" "$url" >&2
      sleep 1
    fi
  done

  printf 'Failed to process %s after %s attempts.\n' "$url" "$max_attempts" >&2
  return 1
}

# Iterate over all images in the IIIF manifest
URLS=$(curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | awk -F '/' '{print $7}' | sed -e 's/%2F/\//g' -e 's/%3A/:/g')
while read -r URL; do
# If we have reached the max thread limit, wait for any one job to finish
if [ "${#PIDS[@]}" -ge "$MAX_THREADS" ]; then
Expand All @@ -24,11 +44,16 @@ while read -r URL; do

# Run each job in the background
(
# download and resize image to max 1000px width
curl -s "$URL" | magick -[0] -resize 1000x\> "$TMP_DIR/img_$I" || curl -s "$URL" | magick - -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1
# make an OCR'd PDF from the image
tesseract "$TMP_DIR/img_$I" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
rm "$TMP_DIR/img_$I"
local_img="$TMP_DIR/img_$I"

# Download and resize the image with retry logic
if ! download_and_process "$URL" "$local_img"; then
exit 1
fi

# Make an OCR'd PDF from the image
tesseract "$local_img" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
rm "$local_img"
) &
PIDS+=("$!")
I="$(( I + 1))"
Expand Down

0 comments on commit 1c3a177

Please sign in to comment.