Skip to content

Commit

Permalink
Merge pull request #173 from bodleian/processing-overhaul
Browse files Browse the repository at this point in the history
Processing overhaul
  • Loading branch information
holfordm authored Aug 13, 2018
2 parents d7bfe0c + 3ba9ff1 commit 923cb64
Show file tree
Hide file tree
Showing 13 changed files with 885 additions and 469 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
processing/html/
processing/solr/
processing/analysis/results/
processing/lib/
*.log
*.tmp
39 changes: 34 additions & 5 deletions processing/convert2HTML.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
exclude-result-prefixes="tei html xs"
version="2.0">

<xsl:import href="https://raw.githubusercontent.com/bodleian/consolidated-tei-schema/master/msdesc2html.xsl"/>
<xsl:import href="lib/msdesc2html.xsl"/>

<!-- Only set this variable if you want full URLs hardcoded into the HTML
on the web site (previewManuscript.xsl overrides this to do so when previewing.) -->
Expand All @@ -17,10 +17,39 @@
<!-- Any templates added below will override the templates in the shared
imported stylesheet, allowing customization of manuscript display for each catalogue. -->





<xsl:template match="msDesc/msIdentifier/altIdentifier[@type='former']">
    <!-- Shows a former shelfmark as its own labelled paragraph.
         TODO: Move this template to msdesc2html.xsl? -->
    <p>Former shelfmark: <xsl:apply-templates/></p>
</xsl:template>

<xsl:template match="title[@key]">
    <!-- Renders a keyed title as a styled span wrapping a link whose target is
         built from $website-url, '/catalog/' and the first space-separated
         token of @key.
         TODO: Move this template to msdesc2html.xsl? -->
    <span>
        <xsl:attribute name="class">
            <!-- The generic "title" class is omitted for titles whose parent is an msItem -->
            <xsl:if test="not(parent::msItem)">
                <xsl:text>title </xsl:text>
            </xsl:if>
            <xsl:text>tei-title</xsl:text>
            <!-- Italic only when the source specifies neither @rend nor @type -->
            <xsl:if test="not(@rend) and not(@type)">
                <xsl:text> italic</xsl:text>
            </xsl:if>
        </xsl:attribute>
        <a>
            <xsl:attribute name="href">
                <xsl:value-of select="$website-url"/>
                <xsl:text>/catalog/</xsl:text>
                <xsl:value-of select="tokenize(@key, ' ')[1]"/>
            </xsl:attribute>
            <xsl:apply-templates/>
        </a>
    </span>
    <!-- Emit a comma separator when the next sibling element is a note, unless
         that note begins (after optional whitespace) with a capital letter,
         "(" or ",", or its first child element is an lb with no non-whitespace
         text in front of it. The text nodes before the lb are joined before
         normalizing, because normalize-space() accepts at most one item and
         would raise a type error if, say, a comment split that text in two. -->
    <xsl:if test="following-sibling::*[1][self::note and not(matches(., '^\s*[A-Z(,]')) and not(child::*[1][self::lb and string-length(normalize-space(string-join(preceding-sibling::text(), ''))) = 0])]">
        <xsl:text>, </xsl:text>
    </xsl:if>
</xsl:template>


</xsl:stylesheet>
10 changes: 8 additions & 2 deletions processing/generate-html.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
echo
echo "Generating HTML for customized manuscript view..."

# Change directory to the location of this script
cd "${0%/*}"
# Refuse to run unless the current directory is a "processing" folder that
# sits directly inside a folder whose name ends in "-mss"
case "$(pwd)" in
    *-mss/processing)
        ;;
    *)
        echo "This script must be run from the processing folder"
        exit 1
        ;;
esac

# Create subfolder to keep generated files out of GitHub
if [ ! -d "html" ]; then
Expand All @@ -17,3 +19,7 @@ echo "Transforming TEI files in collections folder using convert2HTML.xsl on $(d

# Run XSLT on all TEI files in collections path (using pwd to get the full path,
# not a relative one, which is what the XSL needs). The path argument and the
# log file name are quoted so that a location containing spaces is passed to
# Saxon as a single argument instead of being split into several.
if ! java -Xmx1G -Xms1G -cp "saxon/saxon9he.jar" net.sf.saxon.Transform -it:batch -xsl:convert2HTML.xsl "collections-path=$(pwd)/../collections/" 2>> "$LOGFILE"; then
    # Any non-zero exit status from Saxon stops the run here
    echo "XSLT failed. Re-indexing cancelled. Please raise an issue on GitHub, attaching $LOGFILE"
    exit 1
fi
Empty file modified processing/generate-sitemap.sh
100644 → 100755
Empty file.
112 changes: 60 additions & 52 deletions processing/generate-solr-document.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/usr/bin/env bash

# Command arguments
# $1 = xQuery file
# $1 = XQuery file
# $2 = Output file
# $3 = Solr `type` (for deleting only one type of record)
# $4 = Solr address for indexing
# $5 = Optional mode:
# Append 'force' to disable checking for data issues and push to Solr without prompting
# Append 'noindex' to generate the files and do the checking but not push to Solr
# 'force' to disable checking for data issues and push to Solr without prompting
# 'noindex' to generate the files and do the checking but not push to Solr
# 'reuse' to send files previously created to Solr without rebuilding them

echo

Expand All @@ -16,70 +17,79 @@ if [ $# -lt 4 ]; then
exit 1;
fi

# Change directory to the location of this script
cd "${0%/*}"
# Refuse to run unless the current directory is a "processing" folder that
# sits directly inside a folder whose name ends in "-mss"
case "$(pwd)" in
    *-mss/processing)
        ;;
    *)
        echo "This script must be run from the processing folder"
        exit 1
        ;;
esac

# Create subfolder to keep generated files out of GitHub
if [ ! -d "solr" ]; then
mkdir solr
if [ ! -d "lib" ]; then
echo "Missing processing/lib subfolder"
exit 1;
fi

# Make sure the solr output subfolder exists (generated files are kept there, out of GitHub)
[ -d "solr" ] || mkdir solr

# Start log file
LOGFILE="solr/$3.log"
echo "Processing TEI files in collections folder using $1 on $(date +"%Y-%m-%d %H:%M:%S") to be sent to $4 for re-indexing." > $LOGFILE

# Run XQuery to build Solr XML index files
echo "Generating Solr XML file containing $3 records..."
java -Xmx1G -Xms1G -cp "saxon/saxon9he.jar" net.sf.saxon.Query -xi:on -q:$1 1> solr/$2 2>> $LOGFILE
if [ $? -gt 0 ]; then
echo "XQuery failed. Re-indexing of $3 records cancelled. Please raise an issue on GitHub, attaching $LOGFILE"
exit 1;
# In 'reuse' mode just announce that earlier output is being resent; in every
# other mode start a fresh log file for this run. The redirect target is
# quoted so a log path containing whitespace is not an ambiguous redirect.
if [[ "$5" == "reuse" ]]; then
    echo "Sending previously generated index files to $4"
else
    echo "Processing TEI files in collections folder using $1 on $(date +"%Y-%m-%d %H:%M:%S") to be sent to $4 for re-indexing." > "$LOGFILE"
fi

# Clean up log file (because XQuery/Saxon appends some junk to the end of each line)
# Doesn't work in git-bash which lacks the rev command
if hash rev 2>/dev/null; then
rev $LOGFILE | cut -f 2- | rev > $LOGFILE.tmp && mv $LOGFILE.tmp $LOGFILE
fi
if [ ! "$5" == "reuse" ]; then

# Check what's been logged
errors=$(grep -ic "^error" $LOGFILE)
if [ $errors -gt 0 ]; then
echo "There are $errors error messages in $LOGFILE so re-indexing of $3 records cannot proceed."
exit 1;
fi
warnings=$(grep -ic "^warn" $LOGFILE)
infos=$(grep -ic "^info" $LOGFILE)
if [ $warnings -gt 0 ] || [ $infos -gt 0 ]; then
echo "There are $warnings warning and $infos info messages in $LOGFILE"
if [ ! "$5" == "force" ] && [ ! "$5" == "noindex" ]; then
while true; do
read -p "Do you wish to rebuild the $3 index? [Yes|No|Quit|View] " answer
case $answer in
[Yy]|YES|Yes|yes ) break;;
[Nn]|NO|No|no ) echo "Re-indexing of $3 records cancelled. Proceeding to next index."; exit 0;;
[Qq]|QUIT|Quit|quit ) echo "Re-indexing of $3 records cancelled. Abandoning all further indexing."; exit 1;;
[Vv]|VIEW|View|view ) less $LOGFILE; echo;;
* ) echo;;
esac
done
# Run XQuery to build Solr XML index files. File names are quoted so that
# arguments containing whitespace are not split into several words.
echo "Generating Solr XML file containing $3 records..."
if ! java -Xmx1G -Xms1G -cp "saxon/saxon9he.jar" net.sf.saxon.Query -xi:on "-q:$1" 1> "solr/$2" 2>> "$LOGFILE"; then
    echo "XQuery failed. Re-indexing of $3 records cancelled. Please raise an issue on GitHub, attaching $LOGFILE"
    exit 1
fi

# Clean up log file (because XQuery/Saxon appends some junk to the end of each line)
# Doesn't work in git-bash which lacks the rev command
if hash rev 2>/dev/null; then
    # Reversing each line lets cut drop the LAST tab-separated field
    rev "$LOGFILE" | cut -f 2- | rev > "$LOGFILE.tmp" && mv "$LOGFILE.tmp" "$LOGFILE"
fi

# Check what's been logged: any line starting with "error" (case-insensitive) is fatal
errors=$(grep -ic "^error" "$LOGFILE")
if [ "${errors:-0}" -gt 0 ]; then
    echo "There are $errors error messages in $LOGFILE so re-indexing of $3 records cannot proceed."
    exit 1
fi
warnings=$(grep -ic "^warn" "$LOGFILE")
infos=$(grep -ic "^info" "$LOGFILE")
if [ "${warnings:-0}" -gt 0 ] || [ "${infos:-0}" -gt 0 ]; then
    echo "There are $warnings warning and $infos info messages in $LOGFILE"
    # Only the default (interactive) mode asks for confirmation
    if [ "$5" != "force" ] && [ "$5" != "noindex" ]; then
        while true; do
            if ! read -r -p "Do you wish to rebuild the $3 index? [Yes|No|Quit|View] " answer; then
                # No answer can ever arrive (end of input, e.g. stdin is not a
                # terminal): treat it as Quit instead of prompting forever
                echo
                echo "Re-indexing of $3 records cancelled. Abandoning all further indexing."
                exit 1
            fi
            case $answer in
                [Yy]|YES|Yes|yes ) break;;
                [Nn]|NO|No|no ) echo "Re-indexing of $3 records cancelled. Proceeding to next index."; exit 0;;
                [Qq]|QUIT|Quit|quit ) echo "Re-indexing of $3 records cancelled. Abandoning all further indexing."; exit 1;;
                [Vv]|VIEW|View|view ) less "$LOGFILE"; echo;;
                * ) echo;;
            esac
        done
    fi
fi
fi

if [ ! "$5" == "noindex" ]; then

# Emptying index on Solr. Doing so for both place and organization will result in only one of them
# being indexed; so if we're indexing organizations, then, skip the empty step.
if [ ! $1 == "organizations.xquery" ]; then
echo "Emptying Solr of $3 records..."
curl -fsS "http://${4}:8983/solr/medieval-mss/update?stream.body=<delete><query>type:${3}</query></delete>&commit=true" 1>> $LOGFILE 2>> $LOGFILE
fi
# Emptying index on Solr
echo "Emptying Solr of $3 records..."
curl -fsS "http://${4}:8983/solr/medieval-mss/update?stream.body=<delete><query>type:${3}</query></delete>&commit=true" 1>> $LOGFILE 2>> $LOGFILE

# Upload generated XML to Solr
if [ $? -gt 0 ]; then
echo "Emptying Solr failed. Try again later. If problem persists, please raise an issue on GitHub, attaching $LOGFILE."
exit 1;
else
# Upload to Solr
echo "Sending new $3 records to Solr..."
curl -fsS "http://${4}:8983/solr/medieval-mss/update?commit=true" --data-binary @solr/$2 -H "Content-Type: text/xml" 1>> $LOGFILE 2>> $LOGFILE
if [ $? -eq 0 ]; then
Expand All @@ -91,8 +101,6 @@ if [ ! "$5" == "noindex" ]; then
fi
fi
else
echo "Re-indexing skipped in $5 mode."
echo "Processing $3 records finished. Sending to Solr skipped in $5 mode."
exit 0;
fi


93 changes: 73 additions & 20 deletions processing/index-all-prd.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,82 @@

# Command arguments
# $1 = Optional mode:
# Append 'force' to disable checking for data issues and push to Solr without prompting
# Append 'noindex' to generate the files and do the checking but not push to Solr
# 'force' to disable checking for data issues and push to Solr without prompting
# 'noindex' to generate the files and do the checking but not push to Solr
# 'reuse' to send files previously created to Solr without rebuilding them

if [ ! "$1" == "force" ]; then
# Give up if any one index fails or is abandoned
set -e
fi
SERVER="solr01-prd.bodleian.ox.ac.uk"

cd "${0%/*}"
# Refuse to run unless the current directory is a "processing" folder that
# sits directly inside a folder whose name ends in "-mss"
case "$(pwd)" in
    *-mss/processing)
        ;;
    *)
        echo "This script must be run from the processing folder"
        exit 1
        ;;
esac

# Re-index manuscripts (includes rebuilding customized manuscript HTML pages, which must be run first)
./generate-html.sh
./generate-solr-document.sh manuscripts.xquery mss_index.xml manuscript solr01-prd.bodleian.ox.ac.uk $1
if [ "$1" == "reuse" ]; then

# Reindex places (includes organizations, which must be run second)
./generate-solr-document.sh places.xquery places_index.xml place solr01-prd.bodleian.ox.ac.uk $1
if [ ! "$1" == "noindex" ]; then
echo "Place index will be incomplete until organizations have also been reindexed."
fi
./generate-solr-document.sh organizations.xquery organizations_index.xml organization solr01-prd.bodleian.ox.ac.uk $1
# Nothing can be resent if nothing has been generated. Stop before calling
# generate-solr-document.sh, which empties each Solr index before uploading
# and would otherwise leave production emptied with no files to refill it.
if [ ! -d "solr" ]; then
    echo "Cannot find the solr subfolder, so there are no previously generated files to send."
    exit 1
fi

# Age of the solr folder in whole hours.
# NOTE(review): this is the modification time of the folder itself, which may
# not change when a file inside it is overwritten in place - confirm that it
# reflects the age of the index files closely enough.
ageofsolrdir=$(( ($(date +%s) - $(date -r solr +%s)) / 3600 ))
if [ "$ageofsolrdir" -ge 1 ]; then
    echo -n "The solr files are $ageofsolrdir hours old. "
    while true; do
        if ! read -r -p "Are you sure you want to send these to the production Solr server?? [Yes|No] " answer; then
            # No answer can ever arrive (end of input, e.g. stdin is not a
            # terminal): give up instead of prompting forever
            echo
            echo "Abandoning re-indexing. The production server has not been updated."
            exit 1
        fi
        case $answer in
            [Yy]|YES|Yes|yes ) break;;
            [Nn]|NO|No|no ) echo "Abandoning re-indexing. The production server has not been updated."; exit 0;;
            * ) echo;;
        esac
    done
fi
# Send the files to Solr, two record types at a time
printf "manuscript\nwork\nperson\nplace" | xargs -I {} -P 2 ./generate-solr-document.sh "{}s.xquery" "{}s_index.xml" {} "$SERVER" "$1"

else

# Saxon can hang when it is unable to download library modules, so work from a
# local copy in lib and fail at once if that copy cannot be fetched. A lib
# symlink (set up for development and testing) is left alone. This stands in
# for writing a custom resolver or using the dreaded Git submodules.
if [ ! -L "lib" ]; then
    if [ -d "lib" ]; then
        # Age of the real lib folder in whole hours
        ageoflibdir=$((($(date +%s) - $(date -r lib +%s)) / 3600))
        if [ $ageoflibdir -ge 1 ]; then
            # An hour old or more: discard it so a fresh copy gets fetched below
            rm -rf lib
        fi
    fi
    if [ ! -e "lib" ]; then
        # Nothing there (never fetched, or just removed): shallow-clone a fresh copy
        if ! git clone -q --depth 1 https://github.com/bodleian/consolidated-tei-schema.git lib; then
            echo "Cannot download library files from GitHub. Check your network connection."
            exit 1
        fi
    fi
fi

# Reindex people
./generate-solr-document.sh people.xquery people_index.xml person solr01-prd.bodleian.ox.ac.uk $1
# The HTML has to be rebuilt without errors before any indexing may begin
if ! ./generate-html.sh; then
    echo "Indexing cannot proceed because HTML could not be generated for all manuscripts"
    exit 1
fi

# Reindex works
./generate-solr-document.sh works.xquery works_index.xml work solr01-prd.bodleian.ox.ac.uk $1
# Choose how the four indexes get built, depending on the mode in $1.
# NOTE(review): the file names are derived from the record type as
# <type>s.xquery and <type>s_index.xml (e.g. persons.xquery) - confirm the
# query files exist under those names.
case "$1" in
    force|noindex)
        # Unattended modes: run two index builds in parallel
        echo "Rebuilding index files two at a time..."
        if ! printf "manuscript\nwork\nperson\nplace" | xargs -I {} -P 2 ./generate-solr-document.sh "{}s.xquery" "{}s_index.xml" {} $SERVER $1; then
            echo
            echo "WARNING: One or more index files was not completed. The web site will not be fully updated. Check the log files in the solr subfolder."
            exit 1
        fi
        ;;
    *)
        # Default mode is interactive - build one index at a time, prompting
        # before sending to Solr; set -e stops at the first index that fails
        # or is abandoned
        set -e
        for record_type in manuscript work person place; do
            # $SERVER and $1 are deliberately left unquoted, exactly as before,
            # so an absent mode adds no extra (empty) argument
            ./generate-solr-document.sh "${record_type}s.xquery" "${record_type}s_index.xml" "$record_type" $SERVER $1
        done
        ;;
esac
fi
Loading

0 comments on commit 923cb64

Please sign in to comment.