weekly_routine.sh

#!/bin/bash

# weekly routine on Sunday pm /Monday am. This script should be run from /data/export/canto-space

# Define canto-space root
CANTOSPACE="/data/export/canto-space"

# Define logs folder inside canto-space
LOGS="logs"

# Define log file
LOGFILE="${CANTOSPACE}/${LOGS}/canto_weekly_update.log"

# Redirect all output to logfile
exec &>> "${LOGFILE}"

# Function to log messages with date stamp
# e.g. log "Canto script completed successfully"
# generates something like
# 2020-05-20 10:24:37: Canto script completed successfully

function log () {
    local DATESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
    /bin/echo "${DATESTAMP}: ${1}"
}

# copy/update of ontologies from /data/export/curfiles/ontologies/trunk/ into 'canto-space/import_export'
# GM comment: I don't really know how rsync works so I didn't know which options to choose (so didn't try to!) and the syntax may not be correct, but the first path should be OK if Canto is installed on the current vm.
# VT comment: the second path should be to /canto-space/import_export/
# JWRN comment: Done

function update_obo_file()
{
    local FILENAME=${1}

    # first, check that the FILE exists and svn update if so
    if [[ -e "/data/export/curfiles/ontologies/trunk/${FILENAME}" ]]; then

        # Create sha hash for $FILE before rsync
        log "Creating ${CANTOSPACE}/import_export/${FILENAME}.sha"
        sha1sum "${CANTOSPACE}/import_export/${FILENAME}" > "${CANTOSPACE}/import_export/${FILENAME}.sha"

        # Sync files from source
        log "Updating ${FILENAME}..."
        /usr/bin/rsync -a /data/export/curfiles/ontologies/trunk/${FILENAME} "${CANTOSPACE}/import_export/${FILENAME}"

        # Get hash from $FILE
        HASH=$(sha1sum "${CANTOSPACE}/import_export/${FILENAME}")

        # Get sha hash prior to rsync
        SAVEDHASH=$(< "${CANTOSPACE}/import_export/${FILENAME}.sha")

        # If HASH == SAVEDHASH nothing has changed
        if [[ "${HASH}" == "${SAVEDHASH}" ]]; then
            log "${CANTOSPACE}/import_export/${FILENAME} is unmodified"
        else
            log "${CANTOSPACE}/import_export/${FILENAME} has been modified. Ontologies will be reloaded into Canto"
            DATACHANGED="YES"
        fi
        
    else

        log "'${FILENAME}' does not exist as a FILE, skipping ..."

    fi

}

function restart_canto()
{
    # Reset cache (restart memcached)
    if /etc/init.d/memcached restart; then
        log "Cache cleared successfully"
    else
        log "Cache clearing failed!"
    fi

    # Canto restart
    # Get Container ID from output
    DOCKERNAME=$(docker ps -a --filter status=running | awk '$2 ~ /pombase\/canto-base/ { print $1 }')
    if docker restart "${DOCKERNAME}"; then
        log "Canto restarted successfully"
    else
        log "Canto failed to restart sucessfully!"
    fi

    log "Finished"

    exit

}

function check_if_canto_restart_required()
{
    # Compare contents of previous canto_done to current.
    # If no change then reload canto only

    # if no previous canto_done then create one
    if [[ ! -e "${PREVMARKERFILE}" ]]; then
        cp "${MARKERFILE}" "${PREVMARKERFILE}"
    else
        # For both these sh2sum commands array syntax is used to capture just
        # the hash for comparison. Default behaviour is to print hash and
        # filename passed in. This breaks the comparison

        # Get hash of MARKERFILE
        MARKERHASH=($(sha1sum "${MARKERFILE}"))

        # Get hash of PREVMARKERFILE
        PREVHASH=($(sha1sum "${PREVMARKERFILE}"))

        # Make copy of marker file for comparison next time
        cp "${MARKERFILE}" "${PREVMARKERFILE}"

        if [[ "${MARKERHASH}" == "${PREVHASH}" ]]; then
            log "No change in database. No further processing required."

            # Restart canto
            restart_canto
        else
            log "${MARKERFILE} has changed. Processing data..."
        fi
    fi


}

# Function to retry command until sucessful with max number of attempts
function retry {

    # Starting number
    local n=1

    # Max number of attempts
    local max=5

    # Sleep period in seconds until retry
    # 600 = 10 mins
    local delay=600

    # Loop 
    while true; do

        # Run array of commands passed in and break loop if successful
        "${@}" && break || {

            # Else loop while attempt no is less than max
            if [ "${n}" -le "${max}" ]; then

                # log status
                log "File retrieval failed. Attempt ${n}/${max}"

                # Sleep for period
                sleep "${delay}"

                # Increment attempt counter
                ((n++))
            else
                # Hit max attempts and still failed so giving up
                log "${@} failed after ${max} attempts."
                
                # Restart canto
                restart_canto

            fi
        }
    done
}

log "Starting"

# Create archive directory if required
ARCHIVE="${CANTOSPACE}/archive"

 if [[ ! -d "${ARCHIVE}" ]]; then
    log "Creating ${ARCHIVE}"
    mkdir "${ARCHIVE}"
fi

# Create canto_support directory if required
DATA="${CANTOSPACE}/canto_support"

 if [[ ! -d "${DATA}" ]]; then
    log "Creating ${DATA}"
    mkdir "${DATA}"
fi

# Set filename of file to pull from upstream server
MARKERFILE="${DATA}/canto_done"

# Set filename of file from previous run
PREVMARKERFILE="${DATA}/canto_done.previous"

# Use retry function to pull marker file from deneb
retry /usr/bin/scp -q fbadmin@deneb.pdn.cam.ac.uk:instance/canto_done "${MARKERFILE}"

# Get DBNAME from downloaded file
if [[ -e "${MARKERFILE}" ]]; then
    DBNAME=$(< "${MARKERFILE}")
else
    log "${MARKERFILE} does not exist, cannot continue"
    exit 1
fi

# Check DBNAME is not blank
if [[ -z "${DBNAME}" ]]; then
    log "${DBNAME} is blank, cannot continue"
    exit 1
fi

# Transform $DBNAME 
CANTO_CONFIG="${CANTOSPACE}/canto/canto_deploy.yaml"
log "Inserting ${DBNAME} into ${CANTO_CONFIG}"

if [[ -e "${CANTO_CONFIG}" ]]; then
    sed -i.bak -E "s/(^[[:space:]]+\-[[:space:]]\"dbi\:Pg\:dbname=)\w+(\;[[:space:]]host=deneb\.pdn\.cam\.ac\.uk\")/\1${DBNAME}\2/" "${CANTO_CONFIG}"
else
    log "${CANTO_CONFIG} does not exist, cannot continue"
    exit 1
fi

# Loop through files and check if FILE has been updated
for FILE in "fly_anatomy.obo" "flybase_controlled_vocabulary.obo" "fly_development.obo" "go-basic.obo"; do
    update_obo_file ${FILE}
done


#replace merged ontology and reload all ontologies to Canto
# VT comment: Ideally, add the following 'if' routine . If hard to implement, remove the 'if' routine and make the three commands run by default
# VT comments2: the 'if' loop here may save significant time - takes 20min on my local vm
# JWRN comment: how do we know an ontology has been changed? Is there a piece of information we can write out and read back in?
# VT comments2: response to JWRN's question above: as these files will not be necessarily updated every week, the simpler way would be to check the timestamp. If no changes in the last, say, 24h, then you can assume no change has been made
# JWRN comment: may be easier to just update whatever and improve in time.
# JWRN comment: commenting out test for the moment.

# If DATACHANGED was set to YES in function above then reload Canto data
if [[ "${DATACHANGED}" == "YES" ]]; then

    log "Reloading ontologies into Canto"

    # increasing OWLTOOLS memory (java) to cope with the demanding ontology reloading step
    export OWLTOOLS_MEMORY=20g

    # redo/replace merged FBbt-GO.obo ontology
    sh ${CANTOSPACE}/FBbt-GO_routine/FBbt-GO_routine.sh

    # replace extension_config.tsv
    sh ${CANTOSPACE}/extension_config-Rscript/list-to-extension_config.sh

    # reload the ontologies	and extension configuration
    # JWRN comment: I suspect the sudo here is superfluous as the script is running as root so removed
    ./canto/script/canto_docker  --non-interactive ./script/canto_load.pl --process-extension-config --ontology  /import_export/FBbt-GO.obo --ontology    /import_export/fly_development.obo --ontology   /import_export/flybase_controlled_vocabulary.obo
fi

# If DBNAME hasn't changed then restart canto and quit
check_if_canto_restart_required

# data import (using Gillian's scripts in the vm - see point 7.d in https://docs.google.com/document/d/19C-J8sJmZb_OSluxyzBWJxUkdR_N4sIpgjHI7u5pp0I/edit)
# GM comment: the following 'if' command should work
# run the script to generate new information into canto ONLY if the fbrf_input_list.tsv file exists

# make fbrf_input_list.tsv (list of newly thin-curated papers)
# if there are no new papers to add in a particular week, the output file,
# fbrf_input_list.tsv, will be empty

# Define refs import file
CURATEDPAPERLISTFILE="./fbrf_input_list.tsv"

log "Creating ${CURATEDPAPERLISTFILE}"
/usr/bin/perl /data/export/support_scripts/get_fbrfs_to_add_to_canto.pl /data/export/support_scripts/modules_server.cfg > "${CURATEDPAPERLISTFILE}"

cp "${CURATEDPAPERLISTFILE}" "${CANTOSPACE}/import_export/fbrf_input_list.tsv"

# Test fbrf_input_list.tsv exists and isn't empty
if [[ ( -e "${CURATEDPAPERLISTFILE}" ) && ( -s "${CURATEDPAPERLISTFILE}" ) ]]; then

    # Define JSON import file
    JSONIMPORTFILE="/import_export/import-fbrfs.json"

    # make the json input file
    log "Creating ${JSONIMPORTFILE}"
    /usr/bin/perl /data/export/support_scripts/canto_json_input_maker.pl /data/export/support_scripts/modules_server.cfg "${CURATEDPAPERLISTFILE}" > ".${JSONIMPORTFILE}"

    # load the json file into canto
    # VT comment: this next step make take some time, depending on the amount of new data
    # JWRN comment: again I suspect the sudo is superfluous so removed
    log "Importing ${JSONIMPORTFILE} into Canto"
    ./canto/script/canto_docker  --non-interactive ./script/canto_add.pl --sessions-from-json "${JSONIMPORTFILE}" "ignore@flybase.org" 7227
    
else
    log "Not loading ./import_export/import-fbrfs.json into Canto. Either ./fbrf_input_list.tsv does not exist or is empty..."
fi

if [[ -e "${CURATEDPAPERLISTFILE}" ]]; then

    # Define filename
    FBRFARCHIVE="fbrf_input_list_$(date +"%Y-%m-%d")"

    # Copy fbrf_input_list.tsv to archive
    log "Copying ${CANTOSPACE}/import_export/fbrf_input_list.tsv to ${ARCHIVE}/${FBRFARCHIVE}"

    # Archive file with correct name (and delete original)
    mv "${CANTOSPACE}/import_export/fbrf_input_list.tsv" "${ARCHIVE}/${FBRFARCHIVE}"
    rm "${CURATEDPAPERLISTFILE}"

fi

# As this is not running relative to Docker we need to prefix the path
# with . to choose import-export in current directory. Otherwise script would look in
# root of disk. 
if [[ -e ".${JSONIMPORTFILE}" ]]; then

    # Define filename
    JSONARCHIVE="import-fbrfs_json_$(date +"%Y-%m-%d")"

    # Copy /import_export/import-fbrfs.json to archive
    log "Copying /import_export/import-fbrfs.json to ${ARCHIVE}/${JSONARCHIVE}"

    # Archive file with correct name (and delete original)
    mv ".${JSONIMPORTFILE}" "${ARCHIVE}/${JSONARCHIVE}"
fi

# Restart Canto post processing
restart_canto