# main.yml
# Workflow: converts TEI XML records into an OpenSearch bulk-format JSON file
# using Saxon/XSLT, then uploads that file to S3.
name: Process XML to JSON and HTML

on:
  push:
    branches:
      - 'development_backup_cbss-bibls'  # current data branch

permissions:
  id-token: write  # needed to request the OIDC token used for AWS role assumption
  contents: read

jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repository
      - name: Checkout repository
        uses: actions/checkout@v4

      # Step 2: Install Java and Saxon and dependencies for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'

      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3 (disabled): Find updated XML files
      # NOTE(review): `::set-output` is deprecated; write to "$GITHUB_OUTPUT"
      # instead if this step is ever re-enabled.
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files

      # Step 3: Identify the first 5 XML files for testing
      - name: Identify first 5 XML files
        run: |
          # Sort so the same five files are selected on every run; find's
          # traversal order is not guaranteed.
          find ./data/persons/tei -name '*.xml' | sort | head -n 5 > xml_files.txt
          # An empty list would otherwise produce (and upload) an empty bulk file.
          if [ ! -s xml_files.txt ]; then
            echo "::error::No XML files found under ./data/persons/tei"
            exit 1
          fi
          echo "Processing the first 5 XML files:"
          cat xml_files.txt

      # Step 4: Run XSLT Transformations and Merge into Single JSON
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          # Abort on any failure, including a failed Saxon run inside a pipeline.
          set -euo pipefail
          : > bulk_data.json  # Start from an empty bulk JSON file
          type_pattern='work|subject|person|place|spear|bibl'
          while IFS= read -r file; do
            echo "Processing $file"
            # Extract the document type from the file path (first match only,
            # so the value is always a single line).
            if [[ $file =~ $type_pattern ]]; then
              doc_type=${BASH_REMATCH[0]}
            else
              echo "::error::Cannot determine document type for $file"
              exit 1
            fi
            # Extract the filename without its .xml extension
            filename=$(basename "$file" .xml)
            # Apply XSLT for JSON conversion and flatten it to a single line.
            # Done before anything is written so a failed transform never
            # leaves an index header without its document. stdin is redirected
            # so java cannot consume the file list the loop is reading.
            doc=$(java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl < /dev/null | tr -d '\n')
            # Index header for OpenSearch bulk format, then the document line.
            # Values are passed as printf arguments, not embedded in the format.
            printf '{"index":{"_index":"syriaca-index-1","_id":"%s-%s"}}\n' "$doc_type" "$filename" >> bulk_data.json
            printf '%s\n' "$doc" >> bulk_data.json
          done < xml_files.txt

      # Step 5: Configure AWS credentials
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 6: Upload JSON files to S3
      # NOTE(review): the step-level AWS_REGION below takes precedence over the
      # region configured in Step 5 (us-east-1) — confirm the secret is set and
      # that the two agree.
      - name: Upload JSON files to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find . -name "*.html"); do
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/testHtml
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: us-east-1

      # - name: JSON file to OpenSearch
      #   run: |
      #     curl -X POST "${OPENSEARCH_URL}/_bulk" \
      #       -H "Content-Type: application/json" \
      #       --data-binary "@bulk_data.json"
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}