-
Notifications
You must be signed in to change notification settings - Fork 16
229 lines (187 loc) · 8.48 KB
/
main.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
name: Process XML to JSON and HTML

# Runs on every push to the current data branch.
on:
  push:
    branches:
      - development_backup_cbss-bibls  # current data branch
    # paths:
    #   - 'data/**'  # Trigger only on changes to data files

# id-token: write lets the job request an OIDC token, which the
# configure-aws-credentials steps use to assume the AWS role.
permissions:
  id-token: write
  contents: read

jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
# Step 1: Check out the repositories
# This first checkout places the data repository (the one this workflow
# lives in) at the workspace root.
# v4 replaces v3, which runs on the deprecated Node 16 runtime.
- name: Checkout repository
  uses: actions/checkout@v4
# Check out the code repository (branch staticSite) next to the data.
# v4 replaces v3, which runs on the deprecated Node 16 runtime.
- name: Checkout syriaca repository (code repo)
  uses: actions/checkout@v4
  with:
    repository: srophe/syriaca
    ref: staticSite
    path: syriaca  # Check it out into a subfolder
# Step 2: Install Java and Saxon for XSLT
# The Java runtime is needed to run saxon.jar in the transformation step.
# v4 replaces v3, which runs on the deprecated Node 16 runtime.
- name: Set up JDK 11
  uses: actions/setup-java@v4
  with:
    java-version: '11'
    distribution: 'temurin'
# Fetch the Saxon-HE 10.6 XSLT processor and store it as saxon.jar in the
# workspace root. The jar comes from Maven Central, not GitHub, so the step
# name says so.
- name: Download Saxon-HE from Maven Central
  run: |
    # -nv keeps the log to one line per download while still reporting errors
    wget -nv https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar
# Step 3: Assume the AWS role through GitHub OIDC
# The role ARN comes from the AWS_SROPHE_ROLE secret; the resulting
# credentials are used by the later AWS CLI step.
# v4 replaces v2, which runs on the deprecated Node 16 runtime.
- name: Configure AWS credentials from AWS account
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
    aws-region: us-east-1
    role-session-name: GitHub-OIDC-data
# Step 4 (disabled): Find only the XML files changed by the latest commit
# - name: Identify updated XML files
# id: files
# run: |
# echo "Ensuring commit history is fetched..."
# git fetch --unshallow || echo "Repository is already fully cloned."
# echo "Checking for updated XML files..."
# if git rev-list --count HEAD > 1; then
# # Find updated files between the last commit and the current HEAD
# UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '\.xml$')
# else
# # If there's no prior commit, process all XML files
# UPDATED_FILES=$(git ls-files | grep '\.xml$')
# fi
# # Check if any XML files were updated
# if [ -z "$UPDATED_FILES" ]; then
# echo "No XML files were updated."
# echo "::set-output name=updated_files::"
# exit 0
# fi
# Output the list of updated files
# echo "$UPDATED_FILES" > xml_files.txt
# echo "Updated XML files:"
# cat xml_files.txt
# echo "::set-output name=updated_files::$UPDATED_FILES"
# shell: bash
# Step 4: Identify XML files for batch conversions
# Writes the path of every bibl TEI record to xml_files.txt (one per line)
# and echoes the list to the job log.
- name: Identify XML files
  run: |
    list_file=xml_files.txt
    # Kept for reference: sample only the first 10 place records instead
    # find ./data/places/tei -name '*.xml' | head -n 10 > xml_files.txt
    find ./data/bibl/tei -name '*.xml' > "$list_file"
    echo "Processing XML files:"
    cat "$list_file"
# Step 5: Run XSLT Transformations and Merge into Single JSON
# Reads the paths listed in xml_files.txt, converts each record to JSON with
# json-stylesheet.xsl, and appends it to bulk_data.json in OpenSearch bulk
# format: one {"index": ...} action line followed by one single-line document.
- name: Run XSLT Transformations and Create Bulk JSON
  env:
    AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
    AWS_REGION: ${{ secrets.AWS_REGION }}
  run: |
    if [ ! -s xml_files.txt ]; then
      echo "No XML files to process."
      exit 0
    fi

    # Truncate instead of touch so a pre-existing bulk_data.json is never
    # appended to.
    : > bulk_data.json
    doc_tmp=$(mktemp)
    failed=0

    # Commented out code in this section is kept for possible later use
    # mkdir -p data-html
    # echo "Created HTML directory"

    while IFS= read -r file; do
      # Extract the document type from the file path
      type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl')
      # grep -o prints every match, so one path can yield several types:
      # 'subject' wins over any other match, and 'bibl' is indexed as 'cbss'
      if [[ "$type" == *"subject"* ]]; then
        type="subject"
      elif [[ "$type" == *"bibl"* ]]; then
        type="cbss"
      fi

      # File name without directory and .xml suffix; used as the document id
      filename=$(basename "${file%.xml}")
      echo "Processing $filename for JSON"

      # Transform into a scratch file first. Piping Saxon straight into the
      # bulk file hid its exit status and left an action line without a
      # document whenever a transformation failed.
      # stdin is redirected so Saxon can never consume the file list.
      if ! java -jar saxon.jar "-s:$file" -xsl:json-stylesheet.xsl docType="$type" < /dev/null > "$doc_tmp" \
        || [ ! -s "$doc_tmp" ]; then
        echo "::warning file=$file::XSLT transformation failed or produced no output; skipping $filename"
        failed=$((failed + 1))
        continue
      fi

      # Index header for the OpenSearch bulk format. The values are passed as
      # printf arguments so a '%' or '\' in a name is not read as a format.
      printf '{"index":{"_index":"syriaca-index-10","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json
      # The bulk format needs each document on exactly one line
      tr -d '\n' < "$doc_tmp" >> bulk_data.json
      echo "" >> bulk_data.json  # Add a newline after the document entry

      # Apply XSLT for HTML conversion and capture any error
      # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
      # # Upload the HTML file to S3
      # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
    done < xml_files.txt

    rm -f "$doc_tmp"
    if [ "$failed" -gt 0 ]; then
      echo "::warning::$failed file(s) could not be transformed and were left out of bulk_data.json"
    fi
# Step 6: Convert HTML files
# - name: Create static HTML directory
# run: |
# mkdir -p data-html
# echo "Created HTML directory"
# - name: Run XSLT Transformations for HTML
# run: |
# while IFS= read -r file; do
# echo "Processing $file for HTML"
# # Extract the document type from the file path
# type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
# # Extract the filename and create the index header for OpenSearch bulk format
# filename=$(basename ${file%.xml})
# echo "html filename: $filename"
# # if [ "$type" == "bibl" ]; then
# # type="cbss"
# # fi
# echo "HTML type $type"
# # Run the XSLT transformation located in the root of syriaca-data repository
# java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${type}/${filename}.html
# # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${filename}.html
# done < xml_files.txt
# Step 6: Assume the AWS role again before the upload
# NOTE(review): same settings as the earlier credentials step; presumably
# repeated so the session is fresh after the long transformation run. Confirm
# before removing.
# v4 replaces v2, which runs on the deprecated Node 16 runtime.
- name: Configure AWS credentials from AWS account
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
    aws-region: us-east-1
    role-session-name: GitHub-OIDC-data
# Step 7: Upload files to S3
# Copies the generated bulk file to the front-end bucket under
# json-data/advancedsearchfields/index_10_bibl.json.
- name: Upload JSON file to S3
  env:
    AWS_REGION: ${{ secrets.AWS_REGION }}
    AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
  run: |
    # The transformation step exits successfully without creating the file
    # when there is nothing to process. Skip here too, rather than fail or
    # overwrite the published file with an empty one.
    if [ ! -s bulk_data.json ]; then
      echo "bulk_data.json is missing or empty; skipping S3 upload."
      exit 0
    fi
    aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/advancedsearchfields/index_10_bibl.json
# - name: Upload HTML files to S3
# run: |
# for html_file in $(find ./data-html -name "*.html"); do
# type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
# echo "html_file $html_file"
# if [ "$type" == "subject" ]; then
# type="taxonomy"
# fi
# if [ "$type" == "bibl" ]; then
# type="cbss"
# fi
# # Copy html file to S3 with the idno path
# aws s3 cp $html_file s3://srophe-syriaca-front-end/${type}/$(basename ${html_file%.html})
# done
# env:
# AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
# AWS_REGION: ${{ secrets.AWS_REGION }}
# Step 8: Upload JSON data to OpenSearch
# Posts bulk_data.json to the _bulk endpoint and fails the job when the
# request is rejected or any document could not be indexed. The full response
# is kept in response.json; rejected items are also saved to
# failed_entries.json.
- name: JSON file to OpenSearch
  env:
    OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
    OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
    OPENSEARCH_PASSWORD: ${{ secrets.OPENSEARCH_PASSWORD }}
  run: |
    # Nothing to send when the transformation step produced no documents
    if [ ! -s bulk_data.json ]; then
      echo "bulk_data.json is missing or empty; nothing to index."
      exit 0
    fi

    RESPONSE=$(curl -s -o response.json -w "%{http_code}" -XPOST "$OPENSEARCH_URL/_bulk" \
      -H "Content-Type: application/json" \
      -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
      --data-binary "@bulk_data.json")
    echo "HTTP response code: $RESPONSE"
    cat response.json

    # curl -s does not fail on HTTP errors, so a rejected request (bad
    # credentials, oversized body, server error) must be caught here; such a
    # response need not contain an "errors" field at all.
    if [[ "$RESPONSE" != 2* ]]; then
      echo "Bulk request was rejected with HTTP status $RESPONSE"
      exit 1
    fi

    # Parse the response for failed items using jq. This runs before the
    # exit below so the per-document details are reported when the upload
    # has errors.
    FAILED_ENTRIES=$(jq -c '.items[] | select(.index.status >= 400) | {id: .index._id, error: .index.error}' response.json)
    if [[ -n "$FAILED_ENTRIES" ]]; then
      echo "Failed entries:"
      echo "$FAILED_ENTRIES" > failed_entries.json
      echo "$FAILED_ENTRIES"  # Prints to the console
    fi

    # Check for errors in the response
    if [[ -n "$FAILED_ENTRIES" ]] || grep -q '"errors":true' response.json; then
      echo "Errors occurred during bulk upload"
      exit 1
    fi

    echo "All entries were successfully indexed."