# Update json-stylesheet.xsl #41
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Converts TEI XML person records to JSON with Saxon/XSLT, concatenates the
# results into one OpenSearch bulk (NDJSON) file, and uploads that file to S3.
name: Process XML to JSON and HTML

on:
  push:
    branches:
      - 'development_backup_cbss-bibls'  # current data branch

permissions:
  id-token: write  # lets the job request the OIDC token used to assume the AWS role
  contents: read

jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repository
      - name: Checkout repository
        uses: actions/checkout@v4

      # Step 2: Install Java and Saxon (XSLT processor)
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'

      # The jar is fetched from Maven Central, not from GitHub.
      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3 (disabled): Find updated XML files
      # NOTE: if this is re-enabled, `::set-output` is deprecated; write to
      # "$GITHUB_OUTPUT" instead, and escape the dot in the grep pattern.
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files

      # Step 3: Identify the first 5 XML files for testing
      - name: Identify first 5 XML files
        run: |
          # Sort so the same five files are selected on every run; plain
          # `find` output order depends on the filesystem.
          find ./data/persons/tei -name '*.xml' | LC_ALL=C sort | head -n 5 > xml_files.txt
          # Fail early instead of uploading an empty bulk file later on.
          if [ ! -s xml_files.txt ]; then
            echo "::error::No XML files found under ./data/persons/tei"
            exit 1
          fi
          echo "Processing the first 5 XML files:"
          cat xml_files.txt

      # Step 4: Run XSLT Transformations and Merge into Single JSON
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          # The default `bash -e` shell does not enable pipefail, so a Saxon
          # failure in `java ... | tr ...` would otherwise go unnoticed.
          set -euo pipefail
          : > bulk_data.json # Create (or truncate) the bulk JSON file
          while IFS= read -r file; do
            echo "Processing $file"
            # Extract the document type from the file path (first match only,
            # so a path containing several keywords cannot produce a
            # multi-line value).
            if [[ "$file" =~ (work|subject|person|place|spear|bibl) ]]; then
              doc_type="${BASH_REMATCH[1]}"
            else
              echo "::error::Cannot determine document type from path: $file"
              exit 1
            fi
            # File name without directory and .xml suffix; used in the document id
            filename=$(basename "$file" .xml)
            # Apply XSLT for JSON conversion and flatten it to a single line.
            # stdin is redirected so the JVM can never consume xml_files.txt.
            doc=$(java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl < /dev/null | tr -d '\n')
            if [ -z "$doc" ]; then
              echo "::error::XSLT transformation produced no output for $file"
              exit 1
            fi
            # Index header for the OpenSearch bulk format, then the document,
            # each on its own line. Values are passed as printf arguments so
            # '%' or '\' in a file name cannot alter the format string.
            printf '{"index":{"_index":"syriaca-index-1","_id":"%s-%s"}}\n' "$doc_type" "$filename" >> bulk_data.json
            printf '%s\n' "$doc" >> bulk_data.json
          done < xml_files.txt

      # Step 5: Configure AWS credentials
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 6: Upload JSON files to S3
      - name: Upload JSON files to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          # NOTE(review): this overrides the region configured in Step 5 with
          # the value of the AWS_REGION secret - confirm the two agree.
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find . -name "*.html"); do
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/testHtml
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: us-east-1

      # - name: JSON file to OpenSearch
      #   run: |
      #     curl -X POST "${OPENSEARCH_URL}/_bulk" \
      #       -H "Content-Type: application/json" \
      #       --data-binary "@bulk_data.json"
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}