#!/bin/bash
# weekly routine on Sunday pm /Monday am. This script should be run from /data/export/canto-space
# Define canto-space root
CANTOSPACE="/data/export/canto-space"
# Define logs folder inside canto-space
LOGS="logs"
# Full path of the run log written by log() below
LOGFILE="${CANTOSPACE}/${LOGS}/canto_weekly_update.log"
# Function to log messages with date stamp
# e.g. log "Canto script completed successfully"
# generates something like
# 2020-05-20 10:24:37: Canto script completed successfully
function log () {
  # Timestamped logger: appends "YYYY-MM-DD HH:MM:SS: <message>" to $LOGFILE.
  # Arguments: $* - message text
  # Globals:   LOGFILE (read)
  local DATESTAMP
  DATESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
  # BUG FIX: original computed the stamp but never emitted the message
  echo "${DATESTAMP}: ${*}" >> "${LOGFILE}"
}
# copy/update of ontologies from /data/export/curfiles/ontologies/trunk/ into 'canto-space/import_export'
# GM comment: I don't really know how rsync works so I didn't know which options to choose (so didn't try to!) and the syntax may not be correct, but the first path should be OK if Canto is installed on the current vm.
# VT comment: the second path should be to /canto-space/import_export/
# JWRN comment: Done
# Copy/update one ontology file from the curation area into
# canto-space/import_export, and flag (via DATACHANGED) whether its
# contents changed since the previous run.
# Arguments: $1 - ontology file name (e.g. "go-basic.obo")
# Globals:   CANTOSPACE (read), DATACHANGED (written: "YES" when changed)
# NOTE(review): the enclosing "function update_obo_file ()" header and the
# closing fi/} were missing from the damaged source; reconstructed here to
# match the call site "update_obo_file ${FILE}" later in the script.
function update_obo_file () {
  local FILENAME=${1}
  # first, check that the FILE exists upstream before trying to sync it
  if [[ -e "/data/export/curfiles/ontologies/trunk/${FILENAME}" ]]; then
    # Record a sha1 of the current local copy before rsync so we can detect changes
    log "Creating ${CANTOSPACE}/import_export/${FILENAME}.sha"
    sha1sum "${CANTOSPACE}/import_export/${FILENAME}" > "${CANTOSPACE}/import_export/${FILENAME}.sha"
    # Sync file from source
    log "Updating ${FILENAME}..."
    /usr/bin/rsync -a "/data/export/curfiles/ontologies/trunk/${FILENAME}" "${CANTOSPACE}/import_export/${FILENAME}"
    # Get hash of the file after the sync
    HASH=$(sha1sum "${CANTOSPACE}/import_export/${FILENAME}")
    # Get sha hash saved prior to rsync
    SAVEDHASH=$(< "${CANTOSPACE}/import_export/${FILENAME}.sha")
    # If HASH == SAVEDHASH nothing has changed
    if [[ "${HASH}" == "${SAVEDHASH}" ]]; then
      log "${CANTOSPACE}/import_export/${FILENAME} is unmodified"
    else
      log "${CANTOSPACE}/import_export/${FILENAME} has been modified. Ontologies will be reloaded into Canto"
      DATACHANGED="YES"
    fi
  else
    log "'${FILENAME}' does not exist as a FILE, skipping ..."
  fi
}
# Restart the Canto service: clear the memcached cache, then restart the
# running pombase/canto-base Docker container, then exit the script.
# Globals: calls log; DOCKERNAME (written)
# NOTE(review): the damaged source was missing the "if" that pairs with the
# "Canto restarted / failed to restart" branches and the closing "}".
# "docker restart ${DOCKERNAME}" is the presumed restart command — confirm
# against the original deployment before relying on this.
function restart_canto()
{
  # Reset cache (restart memcached)
  if /etc/init.d/memcached restart; then
    log "Cache cleared successfully"
  else
    log "Cache clearing failed!"
  fi
  # Canto restart
  # Get Container ID from output
  DOCKERNAME=$(docker ps -a --filter status=running | awk '$2 ~ /pombase\/canto-base/ { print $1 }')
  if docker restart "${DOCKERNAME}"; then
    log "Canto restarted successfully"
  else
    log "Canto failed to restart successfully!"
  fi
  log "Finished"
  # Terminates the whole script — callers rely on this to stop processing
  exit
}
# Compare the freshly-downloaded marker file with the copy saved by the
# previous run. If unchanged there is no new data, so just restart Canto
# (restart_canto exits the script); otherwise fall through to processing.
# Globals: MARKERFILE, PREVMARKERFILE (read); calls log, restart_canto
# NOTE(review): the closing "}" was missing from the damaged source.
function check_if_canto_restart_required()
{
  # Compare contents of previous canto_done to current.
  # If no change then reload canto only
  # if no previous canto_done then create one
  if [[ ! -e "${PREVMARKERFILE}" ]]; then
    cp "${MARKERFILE}" "${PREVMARKERFILE}"
  else
    # For both these sha1sum commands array syntax is used to capture just
    # the hash for comparison. Default behaviour is to print hash and
    # filename passed in. This breaks the comparison
    # Get hash of MARKERFILE
    MARKERHASH=($(sha1sum "${MARKERFILE}"))
    # Get hash of PREVMARKERFILE
    PREVHASH=($(sha1sum "${PREVMARKERFILE}"))
    # Make copy of marker file for comparison next time
    cp "${MARKERFILE}" "${PREVMARKERFILE}"
    if [[ "${MARKERHASH}" == "${PREVHASH}" ]]; then
      log "No change in database. No further processing required."
      # Restart canto (does not return)
      restart_canto
    else
      log "${MARKERFILE} has changed. Processing data..."
    fi
  fi
}
# Function to retry a command until successful, with a max number of attempts.
# Usage: retry <command> [args...]
# On repeated failure, logs the failure and hands off to restart_canto
# (which exits the script).
# NOTE(review): the damaged source was missing the "max" assignment and the
# loop wrapper that the "break" requires, plus the closing "}". max=5 is an
# assumption — confirm against the original.
function retry {
  # Starting attempt number
  local n=1
  # Max number of attempts
  local max=5
  while true; do
    # Run command passed in and break loop if successful
    "${@}" && break || {
      # Else loop while attempt no is less than max
      if [ "${n}" -le "${max}" ]; then
        # log status
        log "File retrieval failed. Attempt ${n}/${max}"
        # Increment attempt counter
        ((n++))
      else
        # Hit max attempts and still failed so giving up
        log "${*} failed after ${max} attempts."
        # Restart canto (exits the script)
        restart_canto
        # Safety net in case restart_canto ever returns
        break
      fi
    }
  done
}
log "Starting"
# Create archive directory if required
ARCHIVE="${CANTOSPACE}/archive"
if [[ ! -d "${ARCHIVE}" ]]; then
  log "Creating ${ARCHIVE}"
  mkdir "${ARCHIVE}"
fi
# Create canto_support directory if required
DATA="${CANTOSPACE}/canto_support"
if [[ ! -d "${DATA}" ]]; then
  log "Creating ${DATA}"
  mkdir "${DATA}"
fi
# Set filename of file to pull from upstream server
MARKERFILE="${DATA}/canto_done"
# Set filename of file from previous run
PREVMARKERFILE="${DATA}/canto_done.previous"
# Use retry function to pull marker file from deneb
retry /usr/bin/scp -q fbadmin@deneb.pdn.cam.ac.uk:instance/canto_done "${MARKERFILE}"
# Get DBNAME from downloaded file; abort if the marker file is missing or blank.
# NOTE(review): the missing/blank checks in the damaged source had no
# else/if structure; reconstructed so the error branches are reachable.
if [[ -e "${MARKERFILE}" ]]; then
  DBNAME=$(< "${MARKERFILE}")
  if [[ -z "${DBNAME}" ]]; then
    log "${DBNAME} is blank, cannot continue"
    exit 1
  fi
else
  log "${MARKERFILE} does not exist, cannot continue"
  exit 1
fi
# Transform $DBNAME: splice the new database name into the Canto config
CANTO_CONFIG="${CANTOSPACE}/canto/canto_deploy.yaml"
if [[ -e "${CANTO_CONFIG}" ]]; then
  log "Inserting ${DBNAME} into ${CANTO_CONFIG}"
  sed -i.bak -E "s/(^[[:space:]]+\-[[:space:]]\"dbi\:Pg\:dbname=)\w+(\;[[:space:]]host=deneb\.pdn\.cam\.ac\.uk\")/\1${DBNAME}\2/" "${CANTO_CONFIG}"
else
  log "${CANTO_CONFIG} does not exist, cannot continue"
  exit 1
fi
# Loop through files and check if FILE has been updated
for FILE in "fly_anatomy.obo" "flybase_controlled_vocabulary.obo" "fly_development.obo" "go-basic.obo"; do
  update_obo_file "${FILE}"
done
#replace merged ontology and reload all ontologies to Canto
# VT comment: the 'if' routine here may save significant time - takes 20min on my local vm
# JWRN comment: reload is triggered by DATACHANGED, set when any obo file's
# sha1 changed during update_obo_file above.
# If DATACHANGED was set to YES in function above then reload Canto data
if [[ "${DATACHANGED}" == "YES" ]]; then
  log "Reloading ontologies into Canto"
  # increasing OWLTOOLS memory (java) to cope with the demanding ontology reloading step
  export OWLTOOLS_MEMORY=20g
  # redo/replace merged FBbt-GO.obo ontology
  sh "${CANTOSPACE}/FBbt-GO_routine/FBbt-GO_routine.sh"
  # replace extension_config.tsv
  sh "${CANTOSPACE}/extension_config-Rscript/list-to-extension_config.sh"
  # reload the ontologies and extension configuration
  # JWRN comment: sudo removed — the script is expected to run as root
  ./canto/script/canto_docker --non-interactive ./script/canto_load.pl --process-extension-config --ontology /import_export/FBbt-GO.obo --ontology /import_export/fly_development.obo --ontology /import_export/flybase_controlled_vocabulary.obo
fi
# If DBNAME hasn't changed then restart canto and quit (restart_canto exits)
check_if_canto_restart_required
# data import (using Gillian's scripts in the vm - see point 7.d in
# https://docs.google.com/document/d/19C-J8sJmZb_OSluxyzBWJxUkdR_N4sIpgjHI7u5pp0I/edit)
# run the script to generate new information into canto ONLY if the
# fbrf_input_list.tsv file (list of newly thin-curated papers) exists;
# if there are no new papers in a particular week the file will be empty.
# Define refs import file
CURATEDPAPERLISTFILE="./fbrf_input_list.tsv"
# Define JSON import file (path as seen from inside the Docker container).
# Defined up front so the archiving step below can test ".${JSONIMPORTFILE}"
# even when the import branch did not run — in the damaged source it was set
# inside the if, making the later "-e .${JSONIMPORTFILE}" test degrade to
# "-e ." which is always true.
JSONIMPORTFILE="/import_export/import-fbrfs.json"
/usr/bin/perl /data/export/support_scripts/get_fbrfs_to_add_to_canto.pl /data/export/support_scripts/modules_server.cfg > "${CURATEDPAPERLISTFILE}"
cp "${CURATEDPAPERLISTFILE}" "${CANTOSPACE}/import_export/fbrf_input_list.tsv"
# Test fbrf_input_list.tsv exists and isn't empty
if [[ ( -e "${CURATEDPAPERLISTFILE}" ) && ( -s "${CURATEDPAPERLISTFILE}" ) ]]; then
  # make the json input file
  log "Creating ${JSONIMPORTFILE}"
  /usr/bin/perl /data/export/support_scripts/canto_json_input_maker.pl /data/export/support_scripts/modules_server.cfg "${CURATEDPAPERLISTFILE}" > ".${JSONIMPORTFILE}"
  # load the json file into canto
  # VT comment: this next step may take some time, depending on the amount of new data
  log "Importing ${JSONIMPORTFILE} into Canto"
  ./canto/script/canto_docker --non-interactive ./script/canto_add.pl --sessions-from-json "${JSONIMPORTFILE}" "ignore@flybase.org" 7227
else
  log "Not loading ./import_export/import-fbrfs.json into Canto. Either ./fbrf_input_list.tsv does not exist or is empty..."
fi
# Archive the paper list (if present) with a dated name, removing the originals
if [[ -e "${CURATEDPAPERLISTFILE}" ]]; then
  # Define filename
  FBRFARCHIVE="fbrf_input_list_$(date +"%Y-%m-%d")"
  # Copy fbrf_input_list.tsv to archive
  log "Copying ${CANTOSPACE}/import_export/fbrf_input_list.tsv to ${ARCHIVE}/${FBRFARCHIVE}"
  # Archive file with correct name (and delete original)
  mv "${CANTOSPACE}/import_export/fbrf_input_list.tsv" "${ARCHIVE}/${FBRFARCHIVE}"
  rm "${CURATEDPAPERLISTFILE}"
fi
# As this is not running relative to Docker we need to prefix the path
# with . to choose import-export in current directory. Otherwise script would look in
# root of disk.
if [[ -e ".${JSONIMPORTFILE}" ]]; then
  JSONARCHIVE="import-fbrfs_json_$(date +"%Y-%m-%d")"
  # Copy /import_export/import-fbrfs.json to archive
  log "Copying /import_export/import-fbrfs.json to ${ARCHIVE}/${JSONARCHIVE}"
  # Archive file with correct name (and delete original)
  mv ".${JSONIMPORTFILE}" "${ARCHIVE}/${JSONARCHIVE}"
fi
# Restart Canto post processing
# NOTE(review): the source file was truncated here; the comment above implies
# the run ends with a restart — confirm against the original script.
restart_canto