name: Process XML to JSON and HTML

on:
  push:
    branches:
      - 'development_backup_cbss-bibls' # current data branch
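
# id-token: write below lets the job request a GitHub OIDC token, which
# aws-actions/configure-aws-credentials exchanges for short-lived AWS
# credentials for the role in AWS_SROPHE_ROLE (no stored access keys needed).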
permissions:
  id-token: write
  contents: read

jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repositories
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Checkout syriaca repository (code repo)
        uses: actions/checkout@v3
        with:
          repository: srophe/syriaca
          ref: staticSite
          path: syriaca # Check it out into a subfolder

      # Step 2: Install Java and Saxon for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'temurin'

      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3: Configure AWS credentials via OIDC
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 4: Identify the first 5 XML files for testing
      # (Alternative: process only the XML files changed by the last push.
      # This needs fetch-depth: 2 on the checkout step so HEAD~1 exists.)
      # - name: Identify updated XML files
      #   id: files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '\.xml$')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "updated_files=$UPDATED_FILES" >> "$GITHUB_OUTPUT"
      - name: Identify first 5 XML files
        run: |
          find ./data/persons/tei -name '*.xml' | head -n 5 > xml_files.txt
          echo "Processing the first 5 XML files:"
          cat xml_files.txt
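          # To process the full persons corpus once testing is done, drop the
          # `head -n 5` filter above.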

      # Step 5: Run XSLT transformations and merge into a single bulk JSON file
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          touch bulk_data.json # Create the bulk JSON file
          while IFS= read -r file; do
            echo "Processing $file"
            # Derive the document type from the file path (head -n 1 guards
            # against multiple matches, e.g. "persons" in both dir and filename)
            type=$(echo "$file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "JSON type: $type"
            # Extract the filename and write the index header for the OpenSearch bulk format
            filename=$(basename "${file%.xml}")
            echo "Processing $filename for JSON"
            printf '{"index":{"_index":"syriaca-index-1","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json
            # Apply XSLT for JSON conversion and append it to bulk_data.json directly
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl | tr -d '\n' >> bulk_data.json
            echo "" >> bulk_data.json # Add a newline after the document entry
            # HTML conversion and a per-file S3 upload originally ran here;
            # both are superseded by the dedicated HTML steps below.
            # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
            # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
          done < xml_files.txt
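          # For reference, each file contributes one action line and one source line
          # in OpenSearch bulk format, e.g. (field names depend on json-stylesheet.xsl,
          # and the _id shown is hypothetical):
          #   {"index":{"_index":"syriaca-index-1","_id":"person-123"}}
          #   {"title":"...","uri":"..."}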
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 6: Create a directory for static HTML files
      - name: Create static HTML directory
        run: |
          mkdir -p data-html
          echo "Created HTML directory"

      - name: Run XSLT Transformations for HTML
        run: |
          while IFS= read -r file; do
            echo "Processing $file for HTML"
            # Derive the document type from the file path
            type=$(echo "$file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "HTML type: $type"
            # Extract the filename for the output path
            filename=$(basename "${file%.xml}")
            echo "HTML filename: $filename"
            # Run the XSLT transformation located in the root of the syriaca-data
            # repository; the per-type output directory must exist first
            mkdir -p "data-html/${type}"
            java -jar saxon.jar -s:"$file" -xsl:html-stylesheet.xsl -o:"data-html/${type}/${filename}.html"
          done < xml_files.txt
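          # Result: one page per record under data-html/<type>/, e.g.
          # data-html/person/example-person.html (hypothetical filename).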

      # Step 7: Upload files to S3
      - name: Upload JSON file to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/bulk_data_persons.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      - name: Upload HTML files to S3
        run: |
          find ./data-html -name '*.html' | while IFS= read -r html_file; do
            # type is derived for logging only; this test upload flattens
            # everything under testHtml/ instead of nesting by ${type}/
            type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|spear|bibl' | head -n 1)
            echo "type: $type"
            aws s3 cp "$html_file" "s3://srophe-syriaca-front-end/testHtml/$(basename "$html_file")"
          done
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 8 (currently disabled): POST the bulk file to OpenSearch
      # - name: JSON file to OpenSearch
      #   run: |
      #     curl -X POST "${OPENSEARCH_URL}/_bulk" \
      #       -H "Content-Type: application/json" \
      #       --data-binary "@bulk_data.json"
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
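      # A minimal sketch for when the step above is re-enabled, assuming jq on the
      # runner (preinstalled on ubuntu-latest): _bulk returns HTTP 200 even when
      # individual items fail, so check the response's "errors" flag explicitly.
      # curl -s -X POST "${OPENSEARCH_URL}/_bulk" \
      #   -H "Content-Type: application/json" \
      #   --data-binary "@bulk_data.json" | jq -e '.errors == false'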