Update main.yml #27
Workflow file for this run
name: Process XML to JSON and HTML
on:
  push:
    branches:
      - 'development_backup_cbss-bibls' # current data branch
permissions:
  id-token: write
  contents: read
jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repository
      - name: Checkout repository
        uses: actions/checkout@v3
      # Step 2: Install Java and Saxon for XSLT processing
      - name: Set up JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'temurin'
      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar
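      # Note: the jar downloaded above is invoked in Step 4 with Saxon-HE's
      # command-line syntax, i.e. java -jar saxon.jar -s:<source> -xsl:<stylesheet> -o:<output>,
      # where the bracketed names are placeholders rather than files in this repo.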
      # - name: Install Saxon and dependencies
      #   run: |
      #     wget https://www.saxonica.com/download/hej/SaxonHE10-6J.zip -O saxon.zip
      #     unzip saxon.zip
      # Step 3: Find updated XML files
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files
      # Step 3: Identify the first 5 person XML files
      - name: Identify first 5 XML files
        run: |
          find ./data/persons/tei -name '*.xml' | head -n 5 > xml_files.txt
          echo "Processing the first 5 XML files:"
          cat xml_files.txt
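      # xml_files.txt now lists up to five paths matching ./data/persons/tei/*.xml;
      # the limit of five appears intended for an initial test run of the pipeline.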
      # Step 4: Run XSLT transformations and merge results into a single bulk JSON file
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          mkdir -p json-data   # Ensure the output folder exists
          touch bulk_data.json # Create the bulk JSON file
          while IFS= read -r file; do
            echo "Processing $file"
            # Create the index header for OpenSearch bulk format
            filename=$(basename "${file%.xml}")
            echo "{\"index\":{\"_index\":\"syriaca-index-1\",\"_id\":\"$filename\"}}" >> bulk_data.json
            # Apply the XSLT for JSON conversion and append the result to bulk_data.json
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl -o:temp.json
            cat temp.json >> bulk_data.json
            echo "" >> bulk_data.json # Ensure a newline between documents
          done < xml_files.txt
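      # For reference: bulk_data.json is written in OpenSearch bulk (NDJSON) form,
      # i.e. an action line followed by a document line for each record, roughly:
      #   {"index":{"_index":"syriaca-index-1","_id":"<filename without .xml>"}}
      #   { ...JSON emitted by json-stylesheet.xsl for that file... }
      # (the placeholders above are illustrative; actual fields depend on the stylesheet)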
      # Step 5: Configure AWS credentials
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data
      # Step 6: Upload JSON files to S3
      - name: Upload JSON files to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      # Step 7: Upload HTML files to S3
      # - name: Sync HTML files to S3
      #   run: |
      #     for html_file in $(find . -name "*.html"); do
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/ --acl public-read
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: us-east-1
      # Step 8: Index JSON files into OpenSearch
      # - name: Index JSON files to OpenSearch
      #   run: |
      #     for json_file in $(find . -name "*.json"); do
      #       curl -X POST "https://your-opensearch-endpoint/_bulk" \
      #         -H "Content-Type: application/json" \
      #         --data-binary "@$json_file"
      #     done
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
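      # Note on Step 8 (disabled): "your-opensearch-endpoint" is a placeholder and would
      # presumably be swapped for the OPENSEARCH_URL secret before enabling this step.
      # The _bulk endpoint expects newline-delimited JSON terminated by a final newline,
      # which matches the format Step 4 writes to bulk_data.json.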