# .github/workflows/main.yml

# Workflow: transform TEI XML records into bulk JSON and publish the result to S3.
name: Process XML to JSON and HTML

# Triggered by every push to the branch that currently holds the data.
on:
  push:
    branches: [development_backup_cbss-bibls]

# contents: read  - enough to check out the repository.
# id-token: write - allows the job to request an OIDC token; presumably needed
#                   by the AWS role assumption later in the job.
permissions:
  contents: read
  id-token: write
  
jobs:
  # Single job: transforms a small sample of TEI person records to bulk JSON
  # and uploads the resulting file to S3.
  process_and_transform:
    runs-on: ubuntu-latest
    # Steps run in order on the same runner; files written by one step
    # (saxon.jar, xml_files.txt, bulk_data.json) are read by later steps.
    steps:
      # Step 1: Check out the repository so the XML data and stylesheet are
      # available in the workspace.
      # v4 runs on the Node 20 runtime; v3 relied on the deprecated Node 16 runtime.
      - name: Checkout repository
        uses: actions/checkout@v4

      # Step 2: Install the Java runtime used to run the Saxon XSLT processor.
      # v4 runs on the Node 20 runtime; v3 relied on the deprecated Node 16 runtime.
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '11'
          
      # Fetch the Saxon-HE 10.6 jar and store it as saxon.jar in the workspace.
      # The jar is served by Maven Central (repo1.maven.org), not GitHub, so the
      # step name states the real source.
      - name: Download Saxon-HE from Maven Central
        run: |
          # --no-verbose keeps the progress bar out of the job log; wget still
          # exits non-zero on HTTP errors, which fails the step.
          wget --no-verbose https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3 (disabled): Find updated XML files
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files
      # Step 3: Pick a small, reproducible sample of XML files for testing.
      # Writes the selected paths, one per line, to xml_files.txt.
      - name: Identify first 5 XML files
        run: |
          # Sort before taking the head so "first 5" is the same on every run;
          # find alone returns entries in directory order.
          find ./data/persons/tei -name '*.xml' | sort | head -n 5 > xml_files.txt

          # Stop here instead of building and uploading an empty bulk file.
          if [ ! -s xml_files.txt ]; then
            echo "::error::No XML files found under ./data/persons/tei"
            exit 1
          fi

          echo "Processing the first 5 XML files:"
          cat xml_files.txt

      # Step 4: Run the XSLT transformation on every file listed in
      # xml_files.txt and merge the results into bulk_data.json.
      # For each input file two lines are appended: an index action line
      # (_index "syriaca-index-1", _id "<type>-<filename>") followed by the
      # transformed document with its newlines removed.
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          # The default run shell only sets -e. pipefail is required so that a
          # Saxon failure is not hidden by the exit status of the trailing `tr`.
          set -euo pipefail

          # Truncate (not just touch) so a pre-existing file cannot leak
          # stale entries into the upload.
          : > bulk_data.json

          while IFS= read -r file; do
            echo "Processing $file"

            # Document type = first known keyword found in the path. grep -o
            # prints every match on its own line, so keep only the first one;
            # otherwise a newline would end up inside the _id value.
            type=$(printf '%s\n' "$file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1) || true
            if [ -z "$type" ]; then
              echo "::error::Cannot determine document type from path: $file"
              exit 1
            fi

            # File name without directory and .xml suffix.
            filename=$(basename "$file" .xml)

            # Action line of the bulk format. Values are passed as printf
            # arguments, never as part of the format string.
            printf '{"index":{"_index":"syriaca-index-1","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json

            # Document line: transform with Saxon and collapse to one line.
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl | tr -d '\n' >> bulk_data.json
            printf '\n' >> bulk_data.json  # terminate the document line
          done < xml_files.txt


      # Step 5: Configure AWS credentials by assuming the role stored in the
      # AWS_SROPHE_ROLE secret, so later steps can call the AWS CLI.
      # v4 runs on the Node 20 runtime; v2 relied on the deprecated Node 16 runtime.
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          role-session-name: GitHub-OIDC-data
          aws-region: us-east-1
       
      # Step 6: Copy bulk_data.json to the front-end bucket under the fixed key
      # json-data/bulk_data_persons.json.
      # NOTE(review): AWS_ACCOUNT_ID is not referenced by the command below, and
      # AWS_REGION comes from a secret here while the credentials step uses a
      # literal us-east-1 - presumably this value takes precedence for this
      # step; confirm whether either variable is still needed.
      - name: Upload JSON files to S3
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: >-
          aws s3 cp bulk_data.json
          s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json

      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find . -name "*.html"); do
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/testHtml 
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: us-east-1

      # - name: JSON file to OpenSearch
      #   run: |
      #     curl -X POST "${OPENSEARCH_URL}/_bulk" \
      #     -H "Content-Type: application/json" \
      #     --data-binary "@bulk_data.json"
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}