#!/bin/bash
# Weekly routine, run Sunday pm / Monday am.
# This script should be run from /data/export/canto-space.

# Canto-space root directory.
CANTOSPACE="/data/export/canto-space"
# Logs folder inside canto-space.
LOGS="logs"
# Log file for this run.
LOGFILE="${CANTOSPACE}/${LOGS}/canto_weekly_update.log"

# Redirect all stdout/stderr (appending) to the logfile.
exec &>> "${LOGFILE}"

# Log a message with a date stamp.
# e.g. log "Canto script completed successfully"
# generates something like
# 2020-05-20 10:24:37: Canto script completed successfully
function log () {
    local DATESTAMP
    DATESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
    /bin/echo "${DATESTAMP}: ${1}"
}

# Copy/update one ontology file from /data/export/curfiles/ontologies/trunk/
# into "${CANTOSPACE}/import_export/", and set the global DATACHANGED=YES
# if the file's content changed since the last run (compared via sha1 hashes
# taken before and after the rsync).
# Arguments: $1 - ontology filename (e.g. "go-basic.obo")
# GM comment: I don't really know how rsync works so I didn't know which
# options to choose (so didn't try to!) and the syntax may not be correct,
# but the first path should be OK if Canto is installed on the current vm.
# VT comment: the second path should be to /canto-space/import_export/
# JWRN comment: Done
function update_obo_file() {
    local FILENAME=${1}

    # First, check that the FILE exists upstream, and update if so.
    if [[ -e "/data/export/curfiles/ontologies/trunk/${FILENAME}" ]]; then
        # Create sha hash for $FILENAME before rsync.
        log "Creating ${CANTOSPACE}/import_export/${FILENAME}.sha"
        sha1sum "${CANTOSPACE}/import_export/${FILENAME}" > "${CANTOSPACE}/import_export/${FILENAME}.sha"

        # Sync the file from source (quoted so odd filenames cannot split).
        log "Updating ${FILENAME}..."
        /usr/bin/rsync -a "/data/export/curfiles/ontologies/trunk/${FILENAME}" "${CANTOSPACE}/import_export/${FILENAME}"

        # Hash of the freshly-synced file. Note: sha1sum output includes the
        # filename, but both hashes are taken on the same path so the
        # comparison is still valid.
        HASH=$(sha1sum "${CANTOSPACE}/import_export/${FILENAME}")
        # Hash recorded prior to the rsync.
        SAVEDHASH=$(< "${CANTOSPACE}/import_export/${FILENAME}.sha")

        # If HASH == SAVEDHASH nothing has changed.
        if [[ "${HASH}" == "${SAVEDHASH}" ]]; then
            log "${CANTOSPACE}/import_export/${FILENAME} is unmodified"
        else
            log "${CANTOSPACE}/import_export/${FILENAME} has been modified. Ontologies will be reloaded into Canto"
            DATACHANGED="YES"
        fi
    else
        log "'${FILENAME}' does not exist as a FILE, skipping ..."
    fi
}

# Clear the memcached cache, restart the Canto docker container, then exit
# the script. NOTE(review): this is used as the terminal step on both the
# success and give-up paths, so it always exits.
function restart_canto() {
    # Reset cache (restart memcached).
    if /etc/init.d/memcached restart; then
        log "Cache cleared successfully"
    else
        log "Cache clearing failed!"
    fi

    # Canto restart.
    # Get the running container ID whose image column matches
    # pombase/canto-base.
    DOCKERNAME=$(docker ps -a --filter status=running | awk '$2 ~ /pombase\/canto-base/ { print $1 }')
    if docker restart "${DOCKERNAME}"; then
        log "Canto restarted successfully"
    else
        log "Canto failed to restart successfully!"
    fi

    log "Finished"
    exit
}

# Compare the marker file from this run against the copy saved by the
# previous run. If nothing changed, restart Canto and stop (restart_canto
# exits). If there is no previous copy, create one and carry on.
function check_if_canto_restart_required() {
    if [[ ! -e "${PREVMARKERFILE}" ]]; then
        cp "${MARKERFILE}" "${PREVMARKERFILE}"
    else
        # For both these sha1sum commands array syntax is used to capture
        # just the hash for comparison. Default behaviour is to print hash
        # and filename passed in, which would break the comparison.
        # Get hash of MARKERFILE.
        MARKERHASH=($(sha1sum "${MARKERFILE}"))
        # Get hash of PREVMARKERFILE.
        PREVHASH=($(sha1sum "${PREVMARKERFILE}"))

        # Make a copy of the marker file for comparison next time.
        cp "${MARKERFILE}" "${PREVMARKERFILE}"

        if [[ "${MARKERHASH}" == "${PREVHASH}" ]]; then
            log "No change in database. No further processing required."
            # Restart canto (exits the script).
            restart_canto
        else
            log "${MARKERFILE} has changed. Processing data..."
        fi
    fi
}

# Retry a command until successful, with a max number of attempts.
# Arguments: the command and its arguments, passed through verbatim.
# On hitting the max attempt count, gives up and calls restart_canto
# (which exits the script).
function retry {
    # Starting attempt number.
    local n=1
    # Max number of attempts.
    local max=5
    # Sleep period in seconds until retry (600 = 10 mins).
    local delay=600

    while true; do
        # Run the command passed in and break the loop if successful.
        "${@}" && break || {
            # Else retry while the attempt number is at most max.
            if [ "${n}" -le "${max}" ]; then
                log "File retrieval failed. Attempt ${n}/${max}"
                # Sleep for the retry period.
                sleep "${delay}"
                # Increment attempt counter.
                ((n++))
            else
                # Hit max attempts and still failed, so give up.
                # "${*}" joins the command into one word for the message.
                log "${*} failed after ${max} attempts."
                # Restart canto (exits the script).
                restart_canto
            fi
        }
    done
}

log "Starting"

# Archive folder for input files.
ARCHIVE="/data/export/archives/canto_input_archive"

# Create the canto_support directory if required.
DATA="${CANTOSPACE}/canto_support"
if [[ ! -d "${DATA}" ]]; then
    log "Creating ${DATA}"
    mkdir "${DATA}"
fi

# Filename of the marker file to pull from the upstream server.
MARKERFILE="${DATA}/canto_done"
# Filename of the marker file from the previous run.
PREVMARKERFILE="${DATA}/canto_done.previous"

# Use the retry function to pull the marker file from deneb.
retry /usr/bin/scp -q fbadmin@deneb.pdn.cam.ac.uk:instance/canto_done "${MARKERFILE}"

# Get DBNAME from the downloaded file.
if [[ -e "${MARKERFILE}" ]]; then
    DBNAME=$(< "${MARKERFILE}")
else
    log "${MARKERFILE} does not exist, cannot continue"
    exit 1
fi

# Check DBNAME is not blank. (Name the variable literally: expanding an
# empty ${DBNAME} in the message would log nothing useful.)
if [[ -z "${DBNAME}" ]]; then
    log "DBNAME is blank, cannot continue"
    exit 1
fi

# Insert $DBNAME into the Canto deploy config (keeps a .bak of the original).
CANTO_CONFIG="${CANTOSPACE}/canto/canto_deploy.yaml"
log "Inserting ${DBNAME} into ${CANTO_CONFIG}"
if [[ -e "${CANTO_CONFIG}" ]]; then
    sed -i.bak -E "s/(^[[:space:]]+\-[[:space:]]\"dbi\:Pg\:dbname=)\w+(\;[[:space:]]host=deneb\.pdn\.cam\.ac\.uk\")/\1${DBNAME}\2/" "${CANTO_CONFIG}"
else
    log "${CANTO_CONFIG} does not exist, cannot continue"
    exit 1
fi

# Loop through the ontology files and check whether each has been updated.
for FILE in "fly_anatomy.obo" "flybase_controlled_vocabulary.obo" "fly_development.obo" "go-basic.obo"; do
    update_obo_file "${FILE}"
done

# Replace the merged ontology and reload all ontologies into Canto.
# VT comment: Ideally, add the following 'if' routine. If hard to implement,
# remove the 'if' routine and make the three commands run by default.
# VT comments2: the 'if' loop here may save significant time - takes 20min
# on my local vm.
# JWRN comment: how do we know an ontology has been changed? Is there a
# piece of information we can write out and read back in?
# VT comments2: response to JWRN's question above: as these files will not
# necessarily be updated every week, the simpler way would be to check the
# timestamp. If no changes in the last, say, 24h, then you can assume no
# change has been made.
# JWRN comment: may be easier to just update whatever and improve in time.
# JWRN comment: commenting out test for the moment.
# If DATACHANGED was set to YES in update_obo_file then reload Canto data.
if [[ "${DATACHANGED}" == "YES" ]]; then
    log "Reloading ontologies into Canto"
    # Increase OWLTOOLS memory (java) to cope with the demanding ontology
    # reloading step.
    export OWLTOOLS_MEMORY=20g
    # Redo/replace the merged FBbt-GO.obo ontology.
    sh "${CANTOSPACE}/FBbt-GO_routine/FBbt-GO_routine.sh"
    # Replace extension_config.tsv.
    sh "${CANTOSPACE}/extension_config-Rscript/list-to-extension_config.sh"
    # Reload the ontologies and extension configuration.
    # JWRN comment: I suspect the sudo here is superfluous as the script is
    # running as root so removed.
    ./canto/script/canto_docker --non-interactive ./script/canto_load.pl --process-extension-config --ontology /import_export/FBbt-GO.obo --ontology /import_export/fly_development.obo --ontology /import_export/flybase_controlled_vocabulary.obo
fi

# If DBNAME hasn't changed then restart canto and quit.
check_if_canto_restart_required

# Data import (using Gillian's scripts in the vm - see point 7.d in
# https://docs.google.com/document/d/19C-J8sJmZb_OSluxyzBWJxUkdR_N4sIpgjHI7u5pp0I/edit)
# GM comment: the following 'if' command should work.
# Run the script to generate new information into canto ONLY if the
# fbrf_input_list.tsv file exists.
# Make fbrf_input_list.tsv (list of newly thin-curated papers); if there are
# no new papers to add in a particular week, the output file will be empty.

# Refs import file (relative: script runs from canto-space).
CURATEDPAPERLISTFILE="./fbrf_input_list.tsv"
log "Creating ${CURATEDPAPERLISTFILE}"
/usr/bin/perl /data/export/support_scripts/get_fbrfs_to_add_to_canto.pl /data/export/support_scripts/modules_server.cfg > "${CURATEDPAPERLISTFILE}"
cp "${CURATEDPAPERLISTFILE}" "${CANTOSPACE}/import_export/fbrf_input_list.tsv"

# Test that fbrf_input_list.tsv exists and isn't empty.
if [[ ( -e "${CURATEDPAPERLISTFILE}" ) && ( -s "${CURATEDPAPERLISTFILE}" ) ]]; then
    # JSON import file, as a container-absolute path; prefix with "." for
    # host-side access (see comment below).
    JSONIMPORTFILE="/import_export/import-fbrfs.json"

    # Make the json input file.
    log "Creating ${JSONIMPORTFILE}"
    /usr/bin/perl /data/export/support_scripts/canto_json_input_maker.pl /data/export/support_scripts/modules_server.cfg "${CURATEDPAPERLISTFILE}" > ".${JSONIMPORTFILE}"

    # Load the json file into canto.
    # VT comment: this next step may take some time, depending on the amount
    # of new data.
    # JWRN comment: again I suspect the sudo is superfluous so removed.
    log "Importing ${JSONIMPORTFILE} into Canto"
    ./canto/script/canto_docker --non-interactive ./script/canto_add.pl --sessions-from-json "${JSONIMPORTFILE}" "ignore@flybase.org" 7227
else
    log "Not loading ./import_export/import-fbrfs.json into Canto. Either ./fbrf_input_list.tsv does not exist or is empty..."
fi

if [[ -e "${CURATEDPAPERLISTFILE}" ]]; then
    # Archive filename, date-stamped.
    FBRFARCHIVE="fbrf_input_list_$(date +"%Y-%m-%d")"
    log "Copying ${CANTOSPACE}/import_export/fbrf_input_list.tsv to ${ARCHIVE}/${FBRFARCHIVE}"
    # Archive file with correct name (and delete original).
    mv "${CANTOSPACE}/import_export/fbrf_input_list.tsv" "${ARCHIVE}/${FBRFARCHIVE}"
    rm "${CURATEDPAPERLISTFILE}"
fi

# As this is not running relative to Docker we need to prefix the path with
# "." to choose import_export in the current directory; otherwise the script
# would look in the root of the disk.
# Guard on JSONIMPORTFILE being set: if the import branch above was skipped,
# ".${JSONIMPORTFILE}" would expand to "." (which always exists) and the mv
# would wrongly target the current directory.
if [[ -n "${JSONIMPORTFILE:-}" && -e ".${JSONIMPORTFILE}" ]]; then
    # Archive filename, date-stamped.
    JSONARCHIVE="import-fbrfs_json_$(date +"%Y-%m-%d")"
    log "Copying /import_export/import-fbrfs.json to ${ARCHIVE}/${JSONARCHIVE}"
    # Archive file with correct name (and delete original).
    mv ".${JSONIMPORTFILE}" "${ARCHIVE}/${JSONARCHIVE}"
fi

# Restart Canto post processing (exits the script).
restart_canto