name: Process XML to JSON and HTML

on:
  push:
    branches:
      - 'development_backup_cbss-bibls' # current data branch
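
# id-token: write below lets the job request a GitHub OIDC token, which
# aws-actions/configure-aws-credentials exchanges for short-lived AWS
# credentials for the role in AWS_SROPHE_ROLE (no stored access keys needed).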
permissions:
  id-token: write
  contents: read

jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repositories
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Checkout syriaca repository (code repo)
        uses: actions/checkout@v3
        with:
          repository: srophe/syriaca
          ref: staticSite
          path: syriaca # Check it out into a subfolder

      # Step 2: Install Java and Saxon for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'temurin'

      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3: Configure AWS credentials via OIDC
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 4: Identify the first 5 XML files for testing
      # (Alternative: process only the XML files changed by the last push.
      # This needs fetch-depth: 2 on the checkout step so HEAD~1 exists.)
      # - name: Identify updated XML files
      #   id: files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '\.xml$')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "updated_files=$UPDATED_FILES" >> "$GITHUB_OUTPUT"
      - name: Identify first 5 XML files
        run: |
          find ./data/persons/tei -name '*.xml' | head -n 5 > xml_files.txt
          echo "Processing the first 5 XML files:"
          cat xml_files.txt
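          # To process the full persons corpus once testing is done, drop the
          # `head -n 5` filter above.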

      # Step 5: Run XSLT transformations and merge into a single bulk JSON file
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          touch bulk_data.json # Create the bulk JSON file
          while IFS= read -r file; do
            echo "Processing $file"
            # Derive the document type from the file path (head -n 1 guards
            # against multiple matches, e.g. "persons" in both dir and filename)
            type=$(echo "$file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "JSON type: $type"
            # Extract the filename and write the index header for the OpenSearch bulk format
            filename=$(basename "${file%.xml}")
            echo "Processing $filename for JSON"
            printf '{"index":{"_index":"syriaca-index-1","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json
            # Apply XSLT for JSON conversion and append it to bulk_data.json directly
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl | tr -d '\n' >> bulk_data.json
            echo "" >> bulk_data.json # Add a newline after the document entry
            # HTML conversion and a per-file S3 upload originally ran here;
            # both are superseded by the dedicated HTML steps below.
            # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
            # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
          done < xml_files.txt
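          # For reference, each file contributes one action line and one source line
          # in OpenSearch bulk format, e.g. (field names depend on json-stylesheet.xsl,
          # and the _id shown is hypothetical):
          #   {"index":{"_index":"syriaca-index-1","_id":"person-123"}}
          #   {"title":"...","uri":"..."}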
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 6: Create a directory for static HTML files
      - name: Create static HTML directory
        run: |
          mkdir -p data-html
          echo "Created HTML directory"

      - name: Run XSLT Transformations for HTML
        run: |
          while IFS= read -r file; do
            echo "Processing $file for HTML"
            # Derive the document type from the file path
            type=$(echo "$file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "HTML type: $type"
            # Extract the filename for the output path
            filename=$(basename "${file%.xml}")
            echo "HTML filename: $filename"
            # Run the XSLT transformation located in the root of the syriaca-data
            # repository; the per-type output directory must exist first
            mkdir -p "data-html/${type}"
            java -jar saxon.jar -s:"$file" -xsl:html-stylesheet.xsl -o:"data-html/${type}/${filename}.html"
          done < xml_files.txt
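          # Result: one page per record under data-html/<type>/, e.g.
          # data-html/person/example-person.html (hypothetical filename).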

      # Step 7: Upload files to S3
      - name: Upload JSON file to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      - name: Upload HTML files to S3
        run: |
          find ./data-html -name '*.html' | while IFS= read -r html_file; do
            # type is derived for logging only; this test upload flattens
            # everything under testHtml/ instead of nesting by ${type}/
            type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "type: $type"
            aws s3 cp "$html_file" "s3://srophe-syriaca-front-end/testHtml/$(basename "$html_file")"
          done
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 8 (currently disabled): POST the bulk file to OpenSearch
      # - name: JSON file to OpenSearch
      #   run: |
      #     curl -X POST "${OPENSEARCH_URL}/_bulk" \
      #       -H "Content-Type: application/json" \
      #       --data-binary "@bulk_data.json"
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
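      # A minimal sketch for when the step above is re-enabled, assuming jq on the
      # runner (preinstalled on ubuntu-latest): _bulk returns HTTP 200 even when
      # individual items fail, so check the response's "errors" flag explicitly.
      # curl -s -X POST "${OPENSEARCH_URL}/_bulk" \
      #   -H "Content-Type: application/json" \
      #   --data-binary "@bulk_data.json" | jq -e '.errors == false'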