This repository has been archived by the owner on May 10, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Taskfile.yml
203 lines (189 loc) · 7.59 KB
/
Taskfile.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# Taskfile schema version.
version: '3'
# Global "silent" switch for all tasks (presumably suppresses echoing of each command; see Task docs).
silent: true
# Task definitions follow.
tasks:
# Default entry point: run the harvest task, then the transform task.
default:
  desc: harvest and transform
  cmds:
    - { task: harvest }
    - { task: transform }
# Start over: empty input/ before harvesting and empty output/ before transforming.
reset:
  desc: delete cache; harvest and transform
  cmds:
    - rm -rf input/*
    - { task: harvest }
    - rm -rf output/*
    - { task: transform }
# Umbrella task: runs both installer tasks one after the other.
install:
  desc: install requirements into subdirectories
  cmds:
    - { task: install_openrefine }
    - { task: install_vufindharvest }
# Installs OpenRefine 3.5.2 and openrefine-client 0.3.10 into .openrefine and
# patches the start script and refine.ini for unattended use.
# Download steps are chained with && so that a failed download aborts the step
# instead of being masked by the exit status of a later command.
install_openrefine:
  cmds:
    - | # install OpenRefine into subdirectory .openrefine
      mkdir -p .openrefine &&
      wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz &&
      tar -xzf openrefine.tar.gz -C .openrefine --strip 1 &&
      rm openrefine.tar.gz
    - | # fix path issue in OpenRefine startup file
      sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine"
    - | # do not try to open OpenRefine in browser
      sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini"
    - | # set autosave period to 1440 minutes (24 hours) instead of the 5-minute default
      sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini"
    - | # install openrefine-client into subdirectory .openrefine
      # wget -O leaves an empty file behind on failure; without && the chmod
      # would succeed on it and the step would wrongly report success.
      wget --no-verbose -O .openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux &&
      chmod +x .openrefine/client
# Installs VuFindHarvest 4.1.0 into .vufindharvest.
# Needs php and composer on the PATH; each is checked before anything is downloaded.
install_vufindharvest:
  cmds:
    - | # requirement php
      [ -n "$(which php 2> /dev/null)" ] || {
        echo 1>&2 "This task needs PHP 7.3+"; exit 1
      }
    - | # requirement composer
      [ -n "$(which composer 2> /dev/null)" ] || {
        echo 1>&2 "This task needs Composer https://getcomposer.org"; exit 1
      }
    - | # install VuFindHarvest into subdirectory .vufindharvest
      mkdir -p .vufindharvest
      wget --no-verbose -O vufindharvest.zip https://github.com/vufind-org/vufindharvest/archive/v4.1.0.zip
      unzip vufindharvest.zip &&
        mv vufindharvest-*/* .vufindharvest &&
        rm -r vufindharvest*
      # resolve the PHP dependencies inside the install directory
      (cd .vufindharvest && composer install)
# Harvests records via VuFindHarvest into input/, strips the 11-character
# filename prefix, applies deletions and checks that enough records are present.
harvest:
  desc: harvest all records from OAI-PMH
  cmds:
    - | # requirement VuFindHarvest
      if [ ! -f .vufindharvest/bin/harvest_oai.php ]; then
        echo 1>&2 "VuFindHarvest missing; try task install_vufindharvest"; exit 1
      fi
    - | # prepare workspace
      mkdir -p input
    - | # harvest all journals
      (cd input && php ../.vufindharvest/bin/harvest_oai.php --ini ../config/harvest.ini)
    - | # rename harvested files
      # Only files that still carry the harvester's prefix are renamed. Matching
      # every '*_*.xml' would also hit files renamed by an earlier run whose
      # remaining name contains an underscore, cutting another 11 characters off.
      # NOTE(review): assumes the prefix is "<10-digit Unix time>_", which is what
      # the ${n:11} offset suggests - confirm against VuFindHarvest's file naming.
      (cd input &&
      find . -name '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_*.xml' |
      while IFS= read -r path; do
        f="${path##*/}"   # file name without directory
        d="${path%/*}"    # directory part
        n="${f%.*}"       # file name without extension
        mv -f "$path" "${d}"/"${n:11}.xml"
      done)
    - | # handle deleted records
      # NOTE(review): transform_split writes its files directly into output/, so the
      # output path below (which includes the input subdirectory) may never match - confirm.
      (cd input &&
      find . -name '*.delete' |
      while IFS= read -r path; do
        f="${path##*/}"
        d="${path%/*}"
        n="${f%.*}"
        rm -f "${d}"/"${n:11}.xml"
        rm -f ../output/"${d}"/"${n:11}.xml"
        rm "$path"
      done)
    - | # check results
      minimum="2400"
      count="$(find input -name '*.xml' | wc -l)"
      if [ "$minimum" -le "$count" ]; then
        echo "Download enthält $count Datensätze"
      else
        echo 1>&2 "Anzahl der heruntergeladenen Datensätze (${count}) entspricht nicht der erwarteten Mindestanzahl (${minimum})! Bitte manuell prüfen."; exit 1
      fi
# Orchestrates the conversion: start OpenRefine, run the client, split and
# validate the result. The sources/generates lists declare the task's input
# and output file patterns.
transform:
  desc: transform data to mets/mods xml
  sources:
    - input/*/*.xml
    - config/**
  generates:
    - output/**
  cmds:
    - { task: transform_start }
    - defer: { task: transform_stop } # will run even when one of the following commands fail
    - { task: transform_client }
    - { task: transform_split }
    - { task: transform_validate }
# Launches a fresh OpenRefine instance on port 3333 in the background and
# blocks (for at most 30 seconds) until its web endpoint answers.
transform_start:
  cmds:
    - | # requirement OpenRefine
      [ -f .openrefine/refine ] || {
        echo 1>&2 "OpenRefine missing; try task install_openrefine"; exit 1
      }
    - | # delete temporary files of previous run
      rm -rf .openrefine/data .openrefine/log.txt
    - | # launch OpenRefine with specific data directory and redirect its output to a log file
      .openrefine/refine -v warn -p 3333 -m 5120M -d data >> .openrefine/log.txt 2>&1 &
    - | # wait until OpenRefine API is available
      # polls once per second until the page served on port 3333 mentions "OpenRefine"
      timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:3333 | cat | grep -q -o OpenRefine; do sleep 1; done"
# Drives OpenRefine through openrefine-client: imports the recently harvested
# XML files as project "rub-journals", applies every rule file in config/*.json
# and exports the result via the template in config/template.txt.
# All client output is shown and appended to .openrefine/log.txt.
transform_client:
  cmds:
    - | # prepare workspace
      mkdir -p output
    - | # create temporary zip archive with new files (modified less than 1 day ago)
      # The file list is piped to zip (-@ reads one name per line from stdin).
      # The previous unquoted $(find ...) inside a here-string depended on the
      # shell keeping the newlines between file names intact.
      find input -name '*.xml' -mtime -1 |
        zip -r -q input/tmp.zip config/schema.xml -@
    - | # import (requires absolute path)
      .openrefine/client \
      --create "$(readlink -m input/tmp.zip)" \
      --format xml \
      --recordPath oai_dc:dc \
      --storeEmptyStrings false --trimStrings true \
      --projectName rub-journals \
      > >(tee -a .openrefine/log.txt) 2>&1
    - | # delete temporary zip archive
      rm input/tmp.zip
    - | # apply transformation rules
      for f in config/*.json; do
        .openrefine/client rub-journals --apply "$f" > >(tee -a .openrefine/log.txt) 2>&1
      done
    - | # templating export to METS:MODS
      .openrefine/client rub-journals \
      --template "$(< config/template.txt)" \
      --rowSeparator '' \
      --output "$(readlink -m output/rub-journals.txt)" \
      > >(tee -a .openrefine/log.txt) 2>&1
# Stops the OpenRefine instance listening on port 3333, archives its projects
# and fails if the log mentions an error or exception.
# The transform task runs this as a deferred step, so it must also cope with
# OpenRefine never having started.
transform_stop:
  cmds:
    - | # kill OpenRefine gracefully and print stats
      PID="$(lsof -t -i:3333)"
      if [ -n "$PID" ]; then
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > >(tee -a .openrefine/log.txt) 2>&1
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > >(tee -a .openrefine/log.txt) 2>&1
        # wait until the process has really exited before touching its data directory
        kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
      else
        # nothing is listening: skip the stats and kill, which would fail on an empty PID
        echo 1>&2 "OpenRefine is not running on port 3333; nothing to stop"
      fi
    - | # archive OpenRefine projects
      for p in .openrefine/data/*.project/; do
        # an unmatched glob stays literal; skip it instead of letting tar fail
        [ -d "$p" ] || continue
        # ${p:17:13} is the 13 characters following ".openrefine/data/" (used as archive name)
        tar -czf .openrefine/data/"${p:17:13}.openrefine.tar.gz" -C "$p" .
      done
    - | # check log file
      if grep -i 'exception\|error' .openrefine/log.txt
      then echo 1>&2 "log contains warnings!"; echo; cat .openrefine/log.txt; exit 1
      fi
# Cuts the single export file into one file per <mets:mets> element and names
# each piece after the recordIdentifier it contains.
transform_split:
  cmds:
    - | # split into one file per record
      (cd output &&
      csplit -s -z rub-journals.txt '/<mets:mets /' "{*}" &&
      rm rub-journals.txt)
    - | # rename files
      # the xx* files are the pieces produced by the csplit step above
      (cd output &&
      for piece in xx*; do
        id="$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$piece")"
        mv "$piece" "${id}.xml"
      done)
# Counts the generated XML files and validates all of them against config/mets.xsd.
transform_validate:
  desc: check output and validate against METS schema
  cmds:
    - | # requirement xmllint
      [ -n "$(which xmllint 2> /dev/null)" ] || {
        echo 1>&2 "This task needs xmllint"; exit 1
      }
    - | # check results
      # NOTE(review): unlike the equivalent check in the harvest task, a count below
      # the minimum only prints a message here and does not exit 1 - confirm this is intended.
      minimum="2400"
      count="$(find output -name '*.xml' | wc -l)"
      if [ "$count" -ge "$minimum" ]; then
        echo "Es wurden $count Datensätze generiert"
      else
        echo 1>&2 "Anzahl der generierten Datensätze (${count}) entspricht nicht der erwarteten Mindestanzahl (${minimum})! Bitte manuell prüfen."
      fi
    - | # validate against METS schema
      xmllint --schema config/mets.xsd --noout output/*.xml
# Stages everything, commits with a UTC timestamp and pushes.
git:
  desc: commit and push if something changed
  cmds:
    - git add -A
    # a failing commit (e.g. nothing to commit) must not fail the task
    - git commit -m "latest change $(date -u)" || exit 0
    - git push