Update main.yml #27
Workflow file for this run
name: Process XML to JSON and HTML
on:
  push:
    branches:
      - 'development_backup_cbss-bibls' # current data branch
permissions:
  id-token: write
  contents: read
jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repository
      - name: Checkout repository
        uses: actions/checkout@v3
      # Step 2: Install Java and Saxon for XSLT processing
      - name: Set up JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'temurin'
      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar
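      # Note: the jar downloaded above is invoked in Step 4 with Saxon-HE's
      # command-line syntax, i.e. java -jar saxon.jar -s:<source> -xsl:<stylesheet> -o:<output>,
      # where the bracketed names are placeholders rather than files in this repo.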
      # - name: Install Saxon and dependencies
      #   run: |
      #     wget https://www.saxonica.com/download/hej/SaxonHE10-6J.zip -O saxon.zip
      #     unzip saxon.zip
      # Step 3: Find updated XML files
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files
      # Step 3: Identify the first 5 person XML files
      - name: Identify first 5 XML files
        run: |
          find ./data/persons/tei -name '*.xml' | head -n 5 > xml_files.txt
          echo "Processing the first 5 XML files:"
          cat xml_files.txt
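      # xml_files.txt now lists up to five paths matching ./data/persons/tei/*.xml;
      # the limit of five appears intended for an initial test run of the pipeline.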
      # Step 4: Run XSLT transformations and merge results into a single bulk JSON file
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          mkdir -p json-data   # Ensure the output folder exists
          touch bulk_data.json # Create the bulk JSON file
          while IFS= read -r file; do
            echo "Processing $file"
            # Create the index header for OpenSearch bulk format
            filename=$(basename "${file%.xml}")
            echo "{\"index\":{\"_index\":\"syriaca-index-1\",\"_id\":\"$filename\"}}" >> bulk_data.json
            # Apply the XSLT for JSON conversion and append the result to bulk_data.json
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl -o:temp.json
            cat temp.json >> bulk_data.json
            echo "" >> bulk_data.json # Ensure a newline between documents
          done < xml_files.txt
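      # For reference: bulk_data.json is written in OpenSearch bulk (NDJSON) form,
      # i.e. an action line followed by a document line for each record, roughly:
      #   {"index":{"_index":"syriaca-index-1","_id":"<filename without .xml>"}}
      #   { ...JSON emitted by json-stylesheet.xsl for that file... }
      # (the placeholders above are illustrative; actual fields depend on the stylesheet)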
      # Step 5: Configure AWS credentials
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data
      # Step 6: Upload JSON files to S3
      - name: Upload JSON files to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      # Step 7: Upload HTML files to S3
      # - name: Sync HTML files to S3
      #   run: |
      #     for html_file in $(find . -name "*.html"); do
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/ --acl public-read
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: us-east-1
      # Step 8: Index JSON files into OpenSearch
      # - name: Index JSON files to OpenSearch
      #   run: |
      #     for json_file in $(find . -name "*.json"); do
      #       curl -X POST "https://your-opensearch-endpoint/_bulk" \
      #         -H "Content-Type: application/json" \
      #         --data-binary "@$json_file"
      #     done
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
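      # Note on Step 8 (disabled): "your-opensearch-endpoint" is a placeholder and would
      # presumably be swapped for the OPENSEARCH_URL secret before enabling this step.
      # The _bulk endpoint expects newline-delimited JSON terminated by a final newline,
      # which matches the format Step 4 writes to bulk_data.json.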