diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e80fdcb24bf4ea8997508ec55708dd06e4d61221..4f223c2aca4471d04628759a42cdff9a3ac68ad2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,6 +47,7 @@ stages: - echo "pip dir - " $PIP_CACHE_DIR - echo "full-fat run date - " $FULL_FAT_RUN_DATE - echo "custom full-fat run date - " $CUSTOM_RUN_DATE + - echo "full-fat copy output - " $FULL_FAT_COPY_OUTPUT - source /storage/app/miniconda3/bin/activate /storage/app/EWS_prod/envs/conda/py3EWS/ - pip install coverage diff --git a/configs/docker/build/build.sh b/configs/docker/build/build.sh index 168a52ace4e624c1f4274e1cf47416c00b0082f7..414ad74deeaa1e064e1180347ec16d3bbda923b3 100644 --- a/configs/docker/build/build.sh +++ b/configs/docker/build/build.sh @@ -6,6 +6,6 @@ cp ../../conda/conda-env-py3EWS-withbuilds.yml .; # the docker container will run commands using this user and group, and will have access to the relevant filesystems # that are mounted. If the default command is run, the user credentials are set to be those of the Cam group server -##tjhis is a comment that is in the new lb584 branch +##this is yet another comment that is in the new lb584 branch sudo docker build -t ews_coordinator -f Dockerfile .; rm -v conda-env-py3EWS-withbuilds.yml; diff --git a/coordinator/Processor.py b/coordinator/Processor.py index 100b7a851c9142400ae532f29664e4e750fc3368..20e9c01bdbf1fd4431cb6456dddbc18ada1bbfbc 100755 --- a/coordinator/Processor.py +++ b/coordinator/Processor.py @@ -1,4 +1,4 @@ -#Processor.py +# Processor.py '''To be used for handling any operational component of wheat rust early warning system. @@ -16,7 +16,8 @@ Example usage:: or:: $ ./run_Processor.sh -p Deposition --islive --config config_Nepal_template.json --noupload -s 20200715 ''' -from typing import List +from abc import abstractmethod, ABCMeta +from typing import List, Union, Any print("Make sure to `conda activate py3EWSepi` environment!") @@ -33,582 +34,620 @@ import shutil import sys # gitlab projects -from flagdir import jobStatus # created by jws52 +from flagdir import jobStatus # created by jws52 # submodules of this project import BufferingSMTPHandler import ProcessorComponents from ProcessorUtils import ( - append_item_to_list, - clear_up, - endScript, - endJob, - open_and_check_config, - PasswordODKFilter, - short_name + append_item_to_list, + clear_up, + endScript, + endJob, + open_and_check_config, + PasswordODKFilter, + short_name ) -# initialise default values for configuration -script_name = 'Processor' +class Processor: -timeNow = datetime.datetime.today() -dateToday = timeNow.date() -todayString = timeNow.strftime('%Y%m%d') -nowString = timeNow.strftime('%Y%m%d-%H%M-%S') + __metaclass__ = ABCMeta -# get the path to this script -script_path = os.path.dirname(__file__)+'/' + ### CLASS LEVEL VARIABLES - WILL BE SHARED BY ALL INSTANCES OF THIS CLASS + log_path_default = None + loglevels = None + #### -coordinator_path = script_path -# log file for all jobs -log_path_project = f"{coordinator_path}logs/log.txt" + def __init__(self) -> None: + super().__init__() + self.setup() -# job-specific log file will be written here until a job directory exits, when it will be moved there -log_path_default = f"{coordinator_path}logs/log_{nowString}.txt" -# get the email credentials file path from the environment variables -assert 'EMAIL_CRED' in os.environ -email_credential_fn = os.environ['EMAIL_CRED'] -assert os.path.exists(email_credential_fn) + def setup(self): + # initialise default values for configuration -with 
open(email_credential_fn,'r') as f: - gmail_config = json.load(f) + script_name = 'Processor' -# check contents -required_keys = ['user','pass','host','port','toaddrs'] -for required_key in required_keys: - assert required_key in gmail_config + timeNow = datetime.datetime.today() + dateToday = timeNow.date() + self.todayString = timeNow.strftime('%Y%m%d') + self.nowString = timeNow.strftime('%Y%m%d-%H%M-%S') -# load config from python dictionary (could be loaded from json file) -# TODO: smtp handler can only use tls, but ssl is more secure. Look into defining/writing a suitable smtp handler -logConfigDict = { - 'version' : 1, - 'disable_existing_loggers': False, - 'formatters' : { - 'simple' : { - 'format' : '%(name)s : %(levelname)s - %(message)s' + # get the path to this script + script_path = os.path.dirname(__file__) + '/' + + coordinator_path = script_path + + # log file for all jobs + log_path_project = f"{coordinator_path}logs/log.txt" + + # job-specific log file will be written here until a job directory exits, when it will be moved there + self.log_path_default = f"{coordinator_path}logs/log_{self.nowString}.txt" + + # get the email credentials file path from the environment variables + assert 'EMAIL_CRED' in os.environ + email_credential_fn = os.environ['EMAIL_CRED'] + assert os.path.exists(email_credential_fn) + + with open(email_credential_fn, 'r') as f: + gmail_config = json.load(f) + + # check contents + required_keys = ['user', 'pass', 'host', 'port', 'toaddrs'] + for required_key in required_keys: + assert required_key in gmail_config + + # load config from python dictionary (could be loaded from json file) + # TODO: smtp handler can only use tls, but ssl is more secure. Look into defining/writing a suitable smtp handler + logConfigDict = { + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'simple': { + 'format': '%(name)s : %(levelname)s - %(message)s' }, - 'detailed' : { - 'format' : f""" - For command: - {' '.join(sys.argv)} + 'detailed': { + 'format': f""" + For command: + {' '.join(sys.argv)} - %(levelname)s in %(name)s encountered at %(asctime)s: + %(levelname)s in %(name)s encountered at %(asctime)s: - %(message)s + %(message)s - Resolve this error and restart processing. - - """, - 'datefmt' : '%Y-%m-%d %H:%M:%S' - } - }, - 'filters' : { - 'mask_passwords' : { - '()' : PasswordODKFilter - } - }, - 'handlers' : { - - # logging for project - 'handler_rot_file' : { - 'class' : 'logging.handlers.TimedRotatingFileHandler', - 'level' : 'INFO', - 'formatter' : 'simple', - 'filters' : ['mask_passwords'], - 'filename' : log_path_project, - # time of day on given day - 'when' : 'W2', # rotate file every Wednesday - 'atTime' : datetime.time(1,0,0), # at 1am local time - 'backupCount' : 12, - }, + Resolve this error and restart processing. - # logging for job - 'handler_file' : { - 'class' : 'logging.FileHandler', - 'level' : 'INFO', - 'formatter' : 'simple', - 'filters' : ['mask_passwords'], - 'filename' : log_path_default, - 'mode' : 'a', # 'a' for append + """, + 'datefmt': '%Y-%m-%d %H:%M:%S' + } }, - # to email errors to maintainers - 'handler_buffered_email': { - 'class': 'BufferingSMTPHandler.BufferingSMTPHandler', - 'level': 'ERROR', - 'server': (gmail_config['host'], gmail_config['port']), # host, port. 
465 fsor SSL, 587 for tls - 'credentials': (gmail_config['user'], gmail_config['pass']), - 'fromaddr': gmail_config['user'], - 'toaddrs': gmail_config['toaddrs'], - 'subject': 'ERROR in EWS Processor', - 'formatter': 'detailed', - 'filters': ['mask_passwords'], - 'capacity': 100 + 'filters': { + 'mask_passwords': { + '()': PasswordODKFilter + } }, - }, - 'loggers' : { - # this is activated when this script is imported - # i.e. with logging.getLogger('Process.') - script_name : { - 'level' : 'INFO', - 'handlers' : ['handler_rot_file','handler_file','handler_buffered_email'], - 'propagate' : True, + 'handlers': { + + # logging for project + 'handler_rot_file': { + 'class': 'logging.handlers.TimedRotatingFileHandler', + 'level': 'INFO', + 'formatter': 'simple', + 'filters': ['mask_passwords'], + 'filename': log_path_project, + # time of day on given day + 'when': 'W2', # rotate file every Wednesday + 'atTime': datetime.time(1, 0, 0), # at 1am local time + 'backupCount': 12, + }, + + # logging for job + 'handler_file': { + 'class': 'logging.FileHandler', + 'level': 'INFO', + 'formatter': 'simple', + 'filters': ['mask_passwords'], + 'filename': self.log_path_default, + 'mode': 'a', # 'a' for append + }, + # to email errors to maintainers + 'handler_buffered_email': { + 'class': 'BufferingSMTPHandler.BufferingSMTPHandler', + 'level': 'ERROR', + 'server': (gmail_config['host'], gmail_config['port']), + # host, port. 465 fsor SSL, 587 for tls + 'credentials': (gmail_config['user'], gmail_config['pass']), + 'fromaddr': gmail_config['user'], + 'toaddrs': gmail_config['toaddrs'], + 'subject': 'ERROR in EWS Processor', + 'formatter': 'detailed', + 'filters': ['mask_passwords'], + 'capacity': 100 + }, }, - # this is activated when this script is called on the command line - # or from a bash script - # i.e. with logging.getLogger(__name__) when name == '__main__' - '__main__' : { - 'level' : 'INFO', - 'handlers' : ['handler_rot_file','handler_file','handler_buffered_email'], - 'propagate' : True, + 'loggers': { + # this is activated when this script is imported + # i.e. with logging.getLogger('Process.') + script_name: { + 'level': 'INFO', + 'handlers': ['handler_rot_file', 'handler_file', 'handler_buffered_email'], + 'propagate': True, + }, + # this is activated when this script is called on the command line + # or from a bash script + # i.e. with logging.getLogger(__name__) when name == '__main__' + '__main__': { + 'level': 'INFO', + 'handlers': ['handler_rot_file', 'handler_file', 'handler_buffered_email'], + 'propagate': True, + } } } -} - -logging.config.dictConfig(logConfigDict) - -print(__name__) -# create a logger named according to how the file is called -#logger = logging.getLogger(__name__) -logger = logging.getLogger(script_name) - -loglevels = {'debug':logging.DEBUG, - 'info':logging.INFO, - 'warning':logging.WARNING, - 'error':logging.ERROR, - 'critical':logging.CRITICAL, - } - -def move_default_logfile_handler(dstPathName,srcPathName=log_path_default,FileHandlerName='handler_file',): - '''For on-the-fly move of logging from default file to another. 
Copies the - contents of the source log file to destination, switches file handler in - logger, then removes source log file.''' - - logger.info(f"Moving logfile location from:\n{srcPathName}\nto:\n{dstPathName}") - - # copy old log file to new filename - srcPath = Path(srcPathName) - dstPath = Path(dstPathName) - assert srcPath.exists() - assert dstPath.parent.is_dir() - - oldFileHandler = [h for h in logger.handlers if h.name==FileHandlerName][0] - oldFormatter = oldFileHandler.formatter - - # define new file handler - newfilehandler = logging.FileHandler(dstPath,mode=oldFileHandler.mode) - newfilehandler.setLevel(oldFileHandler.level) - newfilehandler.setFormatter(oldFormatter) - - shutil.copyfile(srcPath,dstPath) - - # add handler for destination file - logger.info('Adding new logging file handler to destination path') - - logger.addHandler(newfilehandler) - - # remove handler for source file - logger.info('Stopping write to old file handler') - logger.removeHandler(oldFileHandler) - oldFileHandler.close() - logger.info('Successfully stopped write to old file handler') - - # delete old log file - logger.info('Deleting old log file, since all content available in new log file stream') - os.rename(srcPathName,srcPathName+'removed') - - return - - -def parse_and_check_args(todayString) -> dict: - - # define the command line arguments - my_parser = argparse.ArgumentParser(description='Command-line arguments for coordinator script of env suitability model') - - # Add the arguments - # positional arguments do not start with - or -- and are always required - # optional arguments start with - or -- and default is required = False - - my_parser.add_argument( - '-p', '--component', - type = str, - choices = list(short_name.keys()), - required = True, - dest = 'component', - help = '''Name of EWS component to process, which must be present - in the config file.''') - - my_parser.add_argument( - '-c', '--config', - metavar = 'path', - type = str, - nargs = '+', # require at least one path - dest = 'config_paths', - required = True, - #default = ['config_Ethiopia_template_stripe.json'], # remove once live - #default = ['config_Bangladesh_template_stripe.json'], # remove once live - #default = ['config_Nepal_template_stripe.json'], # remove once live - help = '''path to a config file(s). More than one can be provided, - in which case each is worked on in turn (e.g. one each for stripe, stem, leaf). 
- Do not place other options between these.''') - - my_parser.add_argument( - '-l','--loglevel', - action = 'store', - choices = list(loglevels.keys()), - default = 'info', - help = 'verbosity of log messaging (debug, info, warning, error, critical)\n default is debug', - dest = 'log_level', # this names the attribute that will be parsed - ) - - my_parser.add_argument( - '--islive', - action = 'store_true', - help = 'If live, email messages are sent to maintainers for warning and errors', - dest = 'live', - ) - - my_parser.add_argument( - '-s','--start-date','-i','--initial-date', - metavar = 'YYYYMMDD', - action = 'store', - default = todayString, - help = 'Initial day of calculation, starting at 00 UTC (Default is today)', - dest = 'start_date', - ) - - my_parser.add_argument( - '--noupload', - action = 'store_true', - help = 'whether results of script should be saved to willow public directory' - ) - - my_parser.add_argument( - '--clearup', - action = 'store_true', - help = 'whether to delete mid-process files at the end of a successful job', - dest = 'clearup', - ) - - # get an object holding all of the args - args = my_parser.parse_args() - - # Check the args - - logger.info(f"Command-line options are:\n{args}") - - if not isinstance(args.config_paths,list): - logger.error('Expecting a list of config paths') - raise RuntimeError - - # check the startstring - if args.start_date is not todayString: - try: - # check date string is formatted correctly - provided_start_date = datetime.datetime.strptime(args.start_date,'%Y%m%d') - today_date = datetime.datetime.strptime(todayString,'%Y%m%d') - - # early limit is quite arbitrary, but this is earliest year of available survey data for Ethiopia - date_limit_early = datetime.datetime.strptime('20070101','%Y%m%d') - assert date_limit_early < provided_start_date - assert provided_start_date <= today_date - except (ValueError, AssertionError) as e: - logger.exception("Provided start date string is formatted incorrectly or out of range, or end date not also defined") - raise + logging.config.dictConfig(logConfigDict) + + print(__name__) + # create a logger named according to how the file is called + # logger = logging.getLogger(__name__) + self.logger = logging.getLogger(script_name) + + self.loglevels = {'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL, + } + + + def move_default_logfile_handler(self, dstPathName, srcPathName = None, FileHandlerName = 'handler_file', ): + '''For on-the-fly move of logging from default file to another. 
Copies the + contents of the source log file to destination, switches file handler in + logger, then removes source log file.''' + + if srcPathName is None: + srcPathName = self.log_path_default + + self.logger.info(f"Moving logfile location from:\n{srcPathName}\nto:\n{dstPathName}") + + # copy old log file to new filename + srcPath = Path(srcPathName) + dstPath = Path(dstPathName) + assert srcPath.exists() + assert dstPath.parent.is_dir() + + oldFileHandler = [h for h in self.logger.handlers if h.name == FileHandlerName][0] + oldFormatter = oldFileHandler.formatter + + # define new file handler + newfilehandler = logging.FileHandler(dstPath, mode = oldFileHandler.mode) + newfilehandler.setLevel(oldFileHandler.level) + newfilehandler.setFormatter(oldFormatter) + + shutil.copyfile(srcPath, dstPath) + + # add handler for destination file + self.logger.info('Adding new logging file handler to destination path') + + self.logger.addHandler(newfilehandler) + + # remove handler for source file + self.logger.info('Stopping write to old file handler') + self.logger.removeHandler(oldFileHandler) + oldFileHandler.close() + self.logger.info('Successfully stopped write to old file handler') + + # delete old log file + self.logger.info('Deleting old log file, since all content available in new log file stream') + os.rename(srcPathName, srcPathName + 'removed') + + return + + + def parse_and_check_args(self) -> dict: + + # define the command line arguments + my_parser = argparse.ArgumentParser( + description = 'Command-line arguments for coordinator script of env suitability model') + + # Add the arguments + # positional arguments do not start with - or -- and are always required + # optional arguments start with - or -- and default is required = False + + my_parser.add_argument( + '-p', '--component', + type = str, + choices = list(short_name.keys()), + required = False, + dest = 'component', + help = '''Name of EWS component to process, which must be present + in the config file.''') + + my_parser.add_argument( + '-c', '--config', + metavar = 'path', + type = str, + nargs = '+', # require at least one path + dest = 'config_paths', + required = True, + # default = ['config_Ethiopia_template_stripe.json'], # remove once live + # default = ['config_Bangladesh_template_stripe.json'], # remove once live + # default = ['config_Nepal_template_stripe.json'], # remove once live + help = '''path to a config file(s). More than one can be provided, + in which case each is worked on in turn (e.g. one each for stripe, stem, leaf). 
+ Do not place other options between these.''') + + my_parser.add_argument( + '-l', '--loglevel', + action = 'store', + choices = list(self.loglevels.keys()), + default = 'info', + help = 'verbosity of log messaging (debug, info, warning, error, critical)\n default is debug', + dest = 'log_level', # this names the attribute that will be parsed + ) + + my_parser.add_argument( + '--islive', + action = 'store_true', + help = 'If live, email messages are sent to maintainers for warning and errors', + dest = 'live', + ) + + my_parser.add_argument( + '-s', '--start-date', '-i', '--initial-date', + metavar = 'YYYYMMDD', + action = 'store', + default = self.todayString, + help = 'Initial day of calculation, starting at 00 UTC (Default is today)', + dest = 'start_date', + ) + + my_parser.add_argument( + '--noupload', + action = 'store_true', + help = 'whether results of script should be saved to willow public directory' + ) + + my_parser.add_argument( + '--clearup', + action = 'store_true', + help = 'whether to delete mid-process files at the end of a successful job', + dest = 'clearup', + ) + + # get an object holding all of the args + args = my_parser.parse_args() + + # Check the args + + self.logger.info(f"Command-line options are:\n{args}") + + if not isinstance(args.config_paths, list): + self.logger.error('Expecting a list of config paths') + raise RuntimeError + + # check the startstring + if args.start_date is not self.todayString: + try: + # check date string is formatted correctly + provided_start_date = datetime.datetime.strptime(args.start_date, '%Y%m%d') + today_date = datetime.datetime.strptime(self.todayString, '%Y%m%d') - dictionary: dict = vars(args) - return dictionary + # early limit is quite arbitrary, but this is earliest year of available survey data for Ethiopia + date_limit_early = datetime.datetime.strptime('20070101', '%Y%m%d') + assert date_limit_early < provided_start_date + assert provided_start_date <= today_date + except (ValueError, AssertionError) as e: + self.logger.exception( + "Provided start date string is formatted incorrectly or out of range, or end date not also defined") + raise -def set_log_level(log_level: str): - new_log_level = loglevels[log_level] + dictionary: dict = vars(args) + return dictionary - # modify log level of all loggers - logger.info(f"logging level being changed to {new_log_level} because of command-line option") - loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] - for logger_i in loggers: logger_i.setLevel(new_log_level) + def set_log_level(self, log_level: str): + new_log_level = self.loglevels[log_level] -def build_universal_config(configs: list,component: str, universal_config=None): - '''This config obtains aspects of each dict in configs that must be common to - them all. 
''' + # modify log level of all self.loggers + self.logger.info(f"logging level being changed to {new_log_level} because of command-line option") + loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] + for logger_i in loggers: logger_i.setLevel(new_log_level) - # initialise universal config - if not universal_config: - universal_config = { - 'WorkspacePathout' : set(), - 'ProcessPreJob' : set(), - 'ProcessInJob' : set(), - 'ProcessEWSPlotting' : set(), - 'ServerPath' : set(), - 'ServerName' : set(), - 'ServerKey' : set()} - keys = universal_config.keys() + def build_universal_config(self, configs: list, component: str, universal_config = None): + '''This config obtains aspects of each dict in configs that must be common to + them all. ''' - # get value of each key from each config file - for configFile in configs: + # initialise universal config + if not universal_config: + universal_config = { + 'WorkspacePathout': set(), + # 'ProcessPreJob': set(), + # 'ProcessInJob': set(), + # 'ProcessEWSPlotting': set(), + 'ServerPath': set(), + 'ServerName': set(), + 'ServerKey': set()} - try: + keys = universal_config.keys() - config_i = open_and_check_config(configFile) + # get value of each key from each config file + for configFile in configs: - except: + try: - logger.exception(f"Failure in opening or checking config {configFile}") - endScript(premature=True) + config_i = open_and_check_config(configFile) - for key in keys: - - try: - universal_config[key].add(config_i[key]) - - except KeyError: + except: - # key must be in component sub-dict - universal_config[key].add(config_i[component][key]) + self.logger.exception(f"Failure in opening or checking config {configFile}") + endScript(premature = True) - # Check for and keep only one value per key - for key in keys: + for key in keys: - if len(universal_config[key]) > 1: + try: + universal_config[key].add(config_i[key]) - logger.error(f"Config files point to multiple {key} but this script can only handle one.") - endScript(premature=True) + except KeyError: - universal_config[key] = universal_config[key].pop() + # key must be in component sub-dict + universal_config[key].add(config_i[component][key]) - return universal_config + # Check for and keep only one value per key + for key in keys: -def run_Process(args: dict): + if len(universal_config[key]) > 1: + self.logger.error(f"Config files point to multiple {key} but this script can only handle one.") + endScript(premature = True) - # check initial state of each config file, and gather terms that must apply - # across all provided configs + universal_config[key] = universal_config[key].pop() - if not args["live"]: - # remove the log handler that would send emails - logger.handlers = [h for h in logger.handlers if not isinstance(h, BufferingSMTPHandler.BufferingSMTPHandler)] + return universal_config - config_paths: List[str] = args['config_paths'] - component: str = args['component'] - start_date: str = args['start_date'] - noupload: bool = args['noupload'] - clearup: bool = args['clearup'] - universal_config = build_universal_config(config_paths, component) - - universal_config['StartString'] = start_date + def run_process(self, args: dict): + # check initial state of each config file, and gather terms that must apply + # across all provided configs - logger.info(f"Universal config is\n{json.dumps(universal_config,indent=2)}") + if not args["live"]: + # remove the log handler that would send emails + self.logger.handlers = [h for h in self.logger.handlers if + not 
isinstance(h, BufferingSMTPHandler.BufferingSMTPHandler)] - workspacePath = universal_config['WorkspacePathout'] + config_paths: List[str] = args['config_paths'] + component: str = args['component'] + start_date: str = args['start_date'] + noupload: bool = args['noupload'] + clearup: bool = args['clearup'] - process_pre_job = getattr(ProcessorComponents, universal_config['ProcessPreJob']) + universal_config = self.build_universal_config(config_paths, component) - process_in_job = getattr(ProcessorComponents, universal_config['ProcessInJob']) + universal_config['StartString'] = start_date - process_EWS_plotting = getattr(ProcessorComponents, universal_config['ProcessEWSPlotting']) + self.logger.info(f"Universal config is\n{json.dumps(universal_config, indent = 2)}") - # determine job directory - jobPath = f'{workspacePath}{short_name[component]}_{start_date}' - - logger.info(f"Job path will be {jobPath}") + workspacePath = universal_config['WorkspacePathout'] - # note that file date represents preceding 3 hours, so day's data starts at file timestamp 0300 UTC - startTime = datetime.datetime.strptime(start_date+'03','%Y%m%d%H') + # process_pre_job = getattr(ProcessorComponents, universal_config['ProcessPreJob']) + # + # process_in_job = getattr(ProcessorComponents, universal_config['ProcessInJob']) + # + # process_EWS_plotting = getattr(ProcessorComponents, universal_config['ProcessEWSPlotting']) - # run any checks before creating a job directory - # if this fails, then make a note once there is a job directory - ready = process_pre_job(args) - - if not ready: - logger.info(f"Process_pre_job raised an error, continuing to create job directory to make a record of it") + # determine job directory + jobPath: str = f'{workspacePath}{short_name[component]}_{start_date}' - # create job directory - Path(jobPath).mkdir(parents=True, exist_ok=True) + self.logger.info(f"Job path will be {jobPath}") - # lock job directory - with jobStatus(jobPath) as status: + # note that file date represents preceding 3 hours, so day's data starts at file timestamp 0300 UTC + startTime = datetime.datetime.strptime(start_date + '03', '%Y%m%d%H') - #lawrence comment in/out - # check for a status file in job directory - if status.had_initial_status: - logger.info(f"Job path already exists and has status {status.status}") + # run any checks before creating a job directory + # if this fails, then make a note once there is a job directory + ready = self.process_pre_job(args) - endScript(premature = status.status not in ['SUCCESS','INPROGRESS']) + if not ready: + self.logger.info(f"Process_pre_job raised an error, continuing to create job directory to make a record of it") - logger.info(f"Current status of job directory is {status.status}") + # create job directory + Path(jobPath).mkdir(parents = True, exist_ok = True) - # now that we have a useable job directory, move the log file there - logPathJob = f"{jobPath}/log.txt" + # lock job directory + status: jobStatus + with jobStatus(jobPath) as status: - move_default_logfile_handler(dstPathName=logPathJob) - - # make a record if process_pre_job failed - if not ready: - logger.error(f"Process_pre_job raised an error so making a record in the job file. 
For details, see earlier warnings log") - status.reset('ERROR') - - endJob(status,ignore_inprogress=True,premature=False) - - # files and directories that will be uploaded to public server - FilesToSend = [] + # lawrence comment in/out + # check for a status file in job directory + if status.had_initial_status: + self.logger.info(f"Job path already exists and has status {status.status}") - # files and directories that will be earmarked for removal after a - # successful job - paths_to_clear = [] + endScript(premature = status.status not in ['SUCCESS', 'INPROGRESS']) - logger.info('Starting to work on each configuration') + self.logger.info(f"Current status of job directory is {status.status}") - for configIndex, configtemplate in enumerate(config_paths): + # now that we have a useable job directory, move the log file there + logPathJob = f"{jobPath}/log.txt" - config_paths_length: int = len(config_paths) - logger.info(f'Working on config {configIndex+1} of {config_paths_length}') - - try: - configjson = open_and_check_config(configtemplate) - except: - logger.exception(f"Failure in opening or checking config {configtemplate}") - # TODO: This case should test flagdir.jobStatus.__exit__() - raise # endJob('ERROR',premature=True) - - # provide specific case details to template config - - configjson['StartTime'] = startTime.strftime('%Y-%m-%d-%H%M') - configjson['StartString'] = start_date - - # from configtemplate create configFileName to describe the specific job - component: str = component - configFileName = f"{os.path.basename(configtemplate).replace('.json','')}_{component}" - - configjson['ConfigFilePath'] = configFileName - - # write the complete configuration file to job directory - with open(f"{jobPath}/{configFileName}.json",'w') as write_file: - json.dump(configjson,write_file,indent=4) - - proc_description = universal_config['ProcessInJob'] - try: - proc_out = process_in_job(jobPath,status,configjson,component) - except: - logger.exception(f"Some error in {proc_description}()") - status.reset('ERROR') - endJob(status,premature=True) - - # Set default case - # This would be improved by implementing a class structure - if proc_out is None: - proc_out = { - 'output' : None, - 'clearup' : None} - - if 'output' in proc_out.keys(): - - append_item_to_list( - proc_out['output'], - FilesToSend, - proc_description, - status) - - if 'clearup' in proc_out.keys(): - - append_item_to_list( - proc_out['clearup'], - paths_to_clear, - proc_description, - status) - - # Run EWS-plotting command - - proc_description = universal_config['ProcessEWSPlotting'] - try: - EWSPlottingOutputs = process_EWS_plotting(jobPath,configjson) - except: - logger.exception(f"Error in {proc_description}()") + self.move_default_logfile_handler(dstPathName = logPathJob) + + # make a record if process_pre_job failed + if not ready: + self.logger.error( + f"Process_pre_job raised an error so making a record in the job file. 
For details, see earlier warnings log") status.reset('ERROR') - endJob(status,premature=True) - logger.info('Finished with EWS-Plotting, appending images to list for transfer') - - if EWSPlottingOutputs: + endJob(status, ignore_inprogress = True, premature = False) - append_item_to_list( - EWSPlottingOutputs, - FilesToSend, - proc_description, - status) + # files and directories that will be uploaded to public server + FilesToSend = [] - logger.info(f'Finished with config {configIndex+1} of {config_paths_length}') + # files and directories that will be earmarked for removal after a + # successful job + paths_to_clear = [] - # send results to remote server + self.logger.info('Starting to work on each configuration') - if not noupload: - try: - ProcessorComponents.upload(universal_config, FilesToSend, component) - except IndexError: - status.reset('WARNING') + for configIndex, configtemplate in enumerate(config_paths): - except: - logger.exception('Failed to upload files to remote server') - status.reset('ERROR') - endJob(status,premature=True) - - # check if there is a second location on willow to provide results - if 'ServerPathExtra' in configjson[component]: + config_paths_length: int = len(config_paths) + self.logger.info(f'Working on config {configIndex + 1} of {config_paths_length}') + + try: + configjson = open_and_check_config(configtemplate) + except: + self.logger.exception(f"Failure in opening or checking config {configtemplate}") + # TODO: This case should test flagdir.jobStatus.__exit__() + raise # endJob('ERROR',premature=True) + + # provide specific case details to template config + + configjson['StartTime'] = startTime.strftime('%Y-%m-%d-%H%M') + configjson['StartString'] = start_date + + # from configtemplate create configFileName to describe the specific job + component: str = component + configFileName = f"{os.path.basename(configtemplate).replace('.json', '')}_{component}" + + configjson['ConfigFilePath'] = configFileName + + # write the complete configuration file to job directory + with open(f"{jobPath}/{configFileName}.json", 'w') as write_file: + json.dump(configjson, write_file, indent = 4) + + # proc_description = universal_config['ProcessInJob'] + proc_description = 'ProcessInJob' + try: + proc_out = self.process_in_job(jobPath, status, configjson, component) + except: + self.logger.exception(f"Error in process_in_job") + status.reset('ERROR') + endJob(status, premature = True) + + # Set default case + # This would be improved by implementing a class structure + if proc_out is None: + proc_out = { + 'output': None, + 'clearup': None} + + if 'output' in proc_out.keys(): + append_item_to_list( + proc_out['output'], + FilesToSend, + proc_description, + status) + + if 'clearup' in proc_out.keys(): + append_item_to_list( + proc_out['clearup'], + paths_to_clear, + proc_description, + status) + + # Run EWS-plotting command + + # proc_description = universal_config['ProcessEWSPlotting'] + proc_description = 'ProcessEWSPlotting' + try: + EWSPlottingOutputs = self.process_post_job(jobPath, configjson) + except: + self.logger.exception(f"Error in {proc_description}()") + status.reset('ERROR') + endJob(status, premature = True) - logger.info('There is an extra path to send results to:') - - extra_path = configjson[component]['ServerPathExtra'] + self.logger.info('Finished with EWS-Plotting, appending images to list for transfer') - logger.info(extra_path) + if EWSPlottingOutputs: + append_item_to_list( + EWSPlottingOutputs, + FilesToSend, + proc_description, + status) - 
universal_config_extra = universal_config.copy() - universal_config_extra['ServerPath'] = extra_path + self.logger.info(f'Finished with config {configIndex + 1} of {config_paths_length}') + # send results to remote server + + if not noupload: try: - ProcessorComponents.upload(universal_config_extra, FilesToSend, component) + ProcessorComponents.upload(universal_config, FilesToSend, component) except IndexError: status.reset('WARNING') except: - logger.exception('Failed to upload files to extra directory on remote server') + self.logger.exception('Failed to upload files to remote server') status.reset('ERROR') - endJob(status,premature=True) - - else: - logger.info('Because noupload argument was present, not sending results to remote server') + endJob(status, premature = True) + + # check if there is a second location on willow to provide results + if 'ServerPathExtra' in configjson[component]: + + self.logger.info('There is an extra path to send results to:') + + extra_path = configjson[component]['ServerPathExtra'] - status.reset('SUCCESS') - - if status.is_success() & (clearup is True): + self.logger.info(extra_path) - logger.info('Clearing up') + universal_config_extra = universal_config.copy() + universal_config_extra['ServerPath'] = extra_path - clearup_dest_dir = f"{workspacePath}/clearup/{short_name[component]}_{start_date}/" - Path(clearup_dest_dir).mkdir(parents=True, exist_ok=True) + try: + ProcessorComponents.upload(universal_config_extra, FilesToSend, component) + except IndexError: + status.reset('WARNING') - logger.info(f"While developing, moving directories to this directory : {clearup_dest_dir}") + except: + self.logger.exception('Failed to upload files to extra directory on remote server') + status.reset('ERROR') + endJob(status, premature = True) - clear_up( paths_to_clear, clearup_dest = clearup_dest_dir) + else: + self.logger.info('Because noupload argument was present, not sending results to remote server') - endScript(premature=False) + status.reset('SUCCESS') + if status.is_success() & (clearup is True): + self.logger.info('Clearing up') -if __name__ == '__main__': - try: - logger.info("==========") - logger.info(f"Logging started at {datetime.datetime.now().strftime('%Y %b %d %H:%M:%S')}") - # load configurations - args_dict: dict = parse_and_check_args(todayString) - set_log_level(args_dict['log_level']) - run_Process(args_dict) - except SystemExit: + clearup_dest_dir = f"{workspacePath}/clearup/{short_name[component]}_{start_date}/" + Path(clearup_dest_dir).mkdir(parents = True, exist_ok = True) - logger.info('run_process() exited') - pass - except: - logger.exception('Uncaught exception in run_Process:') + self.logger.info(f"While developing, moving directories to this directory : {clearup_dest_dir}") + clear_up(paths_to_clear, clearup_dest = clearup_dest_dir) + + endScript(premature = False) + + @abstractmethod + def process_pre_job(self, args): + raise NotImplementedError + + @abstractmethod + def process_in_job(self, jobPath, status, configjson, component) -> object: + raise NotImplementedError + + @abstractmethod + def process_post_job(self, jobPath, configjson): + raise NotImplementedError + + + def run_processor(self, component: str): + print("Make sure to `conda activate py3EWSepi` environment!") + print("Make sure that flagdir package is available (on PYTHONPATH)") + try: + self.logger.info("==========") + self.logger.info(f"Logging started at {datetime.datetime.now().strftime('%Y %b %d %H:%M:%S')}") + # load configurations + args_dict: dict = 
self.parse_and_check_args() + args_dict["component"] = component + self.set_log_level(args_dict['log_level']) + self.run_process(args_dict) + except SystemExit as e: + print("caught with code " + str(e.code)) + self.logger.info('run_process() exited') + sys.exit(e.code) + except: + self.logger.exception('Uncaught exception in run_Process:') diff --git a/coordinator/ProcessorAdvisory.py b/coordinator/ProcessorAdvisory.py index a5260965fe267aacaf65ecf5a1860f4d23783398..c3f22028dc2c13e0447eacbbb7d0b51e30aebca5 100644 --- a/coordinator/ProcessorAdvisory.py +++ b/coordinator/ProcessorAdvisory.py @@ -6,43 +6,67 @@ import logging # gitlab projects # TODO: Package these projects so they are robust for importing from AdvisoryBuilder import DataGatherer # created by jws52 +from Processor import Processor from ProcessorUtils import add_filters_to_sublogger, short_name -logger = logging.getLogger('Processor.Advisory') -add_filters_to_sublogger(logger) -def process_in_job_advisory(jobPath,status,config,component): - '''Generates a word processor file containing some basic survey statistics - and output figures from deposition, environmental suitability, and - eventually also the epi model. This template advisory is intended to speed - up the process of writing advisories. The intended user is a local expert - who edits the content of the document. - Uses the gitlab project EWS-advisory-builder.''' +class ProcessorAdvisory(Processor): - config_advisory = config[component].copy() + def process_pre_job(self, args): + return True - config_advisory['jobPath'] = jobPath - # provide top-level arguments to advisory config - for k,v in config.items(): - if k not in short_name.keys(): - config_advisory[k]=v + def process_in_job(self, jobPath, status, configjson, component) -> object: + self.process_in_job_advisory(jobPath, status, configjson, component) - dateString = config['StartString'] - layout = 'tight' + def process_post_job(self, jobPath, configjson): + pass - logging.info(f"Running for scenario {config_advisory}, {dateString}, {layout}") - report_names = DataGatherer.run_each_subregion(config_advisory, dateString, layout) + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Advisory') + add_filters_to_sublogger(logger) - # pass the report filenames to upload to the remote server - proc_out = {} - # Output files available for upload - proc_out['output'] = report_names - # Processing files available for clearing - proc_out['clearup'] = None + def process_in_job_advisory(self, jobPath, status, config, component): + '''Generates a word processor file containing some basic survey statistics + and output figures from deposition, environmental suitability, and + eventually also the epi model. This template advisory is intended to speed + up the process of writing advisories. The intended user is a local expert + who edits the content of the document. 
+ Uses the gitlab project EWS-advisory-builder.''' - return proc_out + config_advisory = config[component].copy() + + config_advisory['jobPath'] = jobPath + + # provide top-level arguments to advisory config + for k,v in config.items(): + if k not in short_name.keys(): + config_advisory[k]=v + + dateString = config['StartString'] + + layout = 'tight' + + logging.info(f"Running for scenario {config_advisory}, {dateString}, {layout}") + + report_names = DataGatherer.run_each_subregion(config_advisory, dateString, layout) + + # pass the report filenames to upload to the remote server + + proc_out = {} + # Output files available for upload + proc_out['output'] = report_names + # Processing files available for clearing + proc_out['clearup'] = None + + return proc_out + + +if __name__ == '__main__': + processor = ProcessorAdvisory() + processor.run_processor("Advisory") \ No newline at end of file diff --git a/coordinator/ProcessorComponents.py b/coordinator/ProcessorComponents.py index f14d4982c78b27c313d150599c98b6b317f097d0..53a132ead95dff9ed484951dc8d5421aeec03108 100644 --- a/coordinator/ProcessorComponents.py +++ b/coordinator/ProcessorComponents.py @@ -11,36 +11,12 @@ from typing import List # All of the process_* functions are callable from config files for the three # coordinator stages: pre, in (during) and plotting. -from ProcessorAdvisory import ( - process_in_job_advisory -) -from ProcessorDeposition import ( - process_in_job_dep, - process_EWS_plotting_dep -) -from ProcessorEnvironment import ( - process_in_job_env2_0, - process_copy_past_job_env2_0, - process_EWS_plotting_env2_0 -) -from ProcessorEpidemiology import ( - process_pre_job_epi, - process_in_job_epi, - process_EWS_plotting_epi, -) -from ProcessorScraper import ( - process_in_job_media_scraper, -) from ProcessorServer import ( process_pre_job_server_download, upload ) -from ProcessorSurveys import ( - process_pre_job_survey, - process_in_job_survey, - process_EWS_plotting_survey -) + from ProcessorUtils import ( add_filters_to_sublogger, query_past_successes diff --git a/coordinator/ProcessorDeposition.py b/coordinator/ProcessorDeposition.py index 5ecac01dbcc3730b92beff605a5d010792787b36..9fcba63baafb000262aec7a91d02c72ff67b16b6 100644 --- a/coordinator/ProcessorDeposition.py +++ b/coordinator/ProcessorDeposition.py @@ -10,6 +10,8 @@ from string import Template import iris from iris.cube import CubeList +from Processor import Processor +from ProcessorServer import process_pre_job_server_download from ProcessorUtils import ( get_only_existing_globs, subprocess_and_log, @@ -18,142 +20,168 @@ from ProcessorUtils import ( from ews_postprocessing.deposition.deposition_post_processor import DepositionPostProcessor -logger = logging.getLogger('Processor.Deposition') -add_filters_to_sublogger(logger) +class ProcessorDeposition(Processor): -def process_in_job_dep(jobPath,status,config,component): - logger.info('started process_in_job_dep()') + """ LIFECYCLE FUNCTIONS INHERITED FROM PROCESSOR.PY """ - file_path = Template(config[component]['ServerPathTemplate']).substitute(**config) - file_name = Template(config[component]['InputFileTemplate']).substitute(**config) + def process_pre_job(self, args): + return process_pre_job_server_download(args) - logger.info(f"Expecting to work with {file_name}") - if os.path.exists(f"{jobPath}/{file_name}"): - logger.info('Directory already exists in job directory, so nothing to do here') - return + def process_in_job(self, jobPath, status, configjson, component) -> object: + return 
self.process_in_job_dep(jobPath, status, configjson, component) - logger.info('Copying file from remote server to job directory') - # TODO: perform ssh file transfer in python instead of subprocess - server_name: str = config['ServerName'] - if server_name == "": - cmd_scp = ["scp", f"{file_path}/{file_name}.tar.gz", jobPath] - else: - cmd_scp = ["scp", "-i", config['ServerKey'], "-o", "StrictHostKeyChecking=no", - f"{server_name}:{file_path}/{file_name}.tar.gz", jobPath] + def process_post_job(self, jobPath, configjson): + return self.process_EWS_plotting_dep(jobPath, configjson) - description_short = 'dep scp' - description_long = 'scp from server to job directory' - subprocess_and_log(cmd_scp, description_short, description_long) - logger.info('untarring the input file') + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Deposition') + add_filters_to_sublogger(logger) - # TODO: untar file in python (with tarfile module) instead of subprocess - cmd_tar = ["tar","-xzf",f"{jobPath}/{file_name}.tar.gz","-C",jobPath] - description_short = 'dep tars' - description_long = 'untar the downloaded file' - subprocess_and_log(cmd_tar, description_short, description_long) + """ LIFECYCLE FUNCTIONS INHERITED FROM PROCESSOR.PY """ - # basic check that contents are as expected - # 132 files of NAME .txt timesteps and one summary png file - # if len(glob(f"{jobPath}/{file_name}/deposition_srcs_allregions_C1_T*.txt")) != 56: - # msg = f"Unexpect number of deposition .txt files in input tar file. Expected 56." - # logger.error(msg) - # raise RuntimeError(msg) + def process_in_job_dep(self, jobPath, status, config, component): + self.logger.info('started process_in_job_dep()') - # basic check that contents are as expected (56 timepoints in the file) - cube_wildcard = f"{jobPath}/{file_name}/deposition_srcs_allregions*.nc" - cubes: CubeList = iris.load(cube_wildcard) - for cube in cubes: - coord = cube.coord("time") - timepoint_count = coord.shape[0] - if timepoint_count != 56: - msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}" - logger.error(msg) - raise RuntimeError(msg) + file_path = Template(config[component]['ServerPathTemplate']).substitute(**config) + file_name = Template(config[component]['InputFileTemplate']).substitute(**config) - proc_out = {} - # Output files available for upload - proc_out['output'] = None - # Processing files available for clearing - proc_out['clearup'] = [f"{jobPath}/{file_name}.tar.gz"] + self.logger.info(f"Expecting to work with {file_name}") - return proc_out + if os.path.exists(f"{jobPath}/{file_name}"): + self.logger.info('Directory already exists in job directory, so nothing to do here') + return -def process_EWS_plotting_dep(jobPath,config): - '''Returns a list of output files for transfer.''' + self.logger.info('Copying file from remote server to job directory') - logger.info('started process_EWS_plotting_dep()') + # TODO: perform ssh file transfer in python instead of subprocess + server_name: str = config['ServerName'] + if server_name == "": + cmd_scp = ["scp", f"{file_path}/{file_name}.tar.gz", jobPath] + else: + cmd_scp = ["scp", "-i", config['ServerKey'], "-o", "StrictHostKeyChecking=no", + f"{server_name}:{file_path}/{file_name}.tar.gz", jobPath] - # initialise environment - regions = config['SubRegionNames'] + description_short = 'dep scp' + description_long = 'scp from server to job directory' + subprocess_and_log(cmd_scp, description_short, description_long) - deposition_file_name = 
Template(config['Deposition']['InputFileTemplate']).substitute(**config) + self.logger.info('untarring the input file') - deposition_path = f"{jobPath}/{deposition_file_name}" + # TODO: untar file in python (with tarfile module) instead of subprocess + cmd_tar = ["tar","-xzf",f"{jobPath}/{file_name}.tar.gz","-C",jobPath] + description_short = 'dep tars' + description_long = 'untar the downloaded file' + subprocess_and_log(cmd_tar, description_short, description_long) - # get the file name from the config - # this file name can be a glob, as long as matches can all be loaded by iris - deposition_data_file_name = Template(config['Deposition']['DataFileTemplate']).substitute(**config) - name_file_wildcard = f"{deposition_path}/{deposition_data_file_name}" + # basic check that contents are as expected + # 132 files of NAME .txt timesteps and one summary png file + # if len(glob(f"{jobPath}/{file_name}/deposition_srcs_allregions_C1_T*.txt")) != 56: + # msg = f"Unexpect number of deposition .txt files in input tar file. Expected 56." + # self.logger.error(msg) + # raise RuntimeError(msg) - EWSPlottingOutputGlobs = [] + # basic check that contents are as expected (56 timepoints in the file) + cube_wildcard = f"{jobPath}/{file_name}/deposition_srcs_allregions*.nc" + cubes: CubeList = iris.load(cube_wildcard) + for cube in cubes: + coord = cube.coord("time") + timepoint_count = coord.shape[0] + if timepoint_count != 56: + msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}" + self.logger.error(msg) + raise RuntimeError(msg) - for region in regions: + proc_out = {} + # Output files available for upload + proc_out['output'] = None + # Processing files available for clearing + proc_out['clearup'] = [f"{jobPath}/{file_name}.tar.gz"] - output_dir = f"{jobPath}/plotting/{region.lower()}" + return proc_out - Path(output_dir).mkdir(parents=True, exist_ok=True) - sys_config = config['Deposition']['EWS-Plotting']['SysConfig'] - name_extraction_config = config['Deposition']['EWS-Plotting']['NameExtractionConfig'] - run_config = config['Deposition']['EWS-Plotting']['RunConfig'] - run_config_norm = config['Deposition']['EWS-Plotting']['RunConfigNorm'] - chart_config = config['Deposition']['EWS-Plotting'][region]['ChartConfig'] - normalize = config['Deposition']['EWS-Plotting'][region]['Normalize'] - extraction_file_prefix = 'deposition_' + region.lower() + def process_EWS_plotting_dep(self, jobPath, config): + '''Returns a list of output files for transfer.''' - # Note that this runs all disease types available + self.logger.info('started process_EWS_plotting_dep()') - logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{name_extraction_config}\n{run_config}\n{run_config_norm}\n{chart_config}") + # initialise environment + regions = config['SubRegionNames'] - depo_processor = DepositionPostProcessor() - depo_processor.set_param_config_files(sys_config_file_arg = sys_config, - depo_name_extraction_config_file_arg = name_extraction_config, - chart_config_file_arg = chart_config, - depo_plotting_run_config_file_arg = run_config, - depo_plotting_normalized_run_config_file_arg = run_config_norm, - name_file_wildcard_arg = name_file_wildcard, - wheat_sources_dir_arg = deposition_path, - output_dir_arg = output_dir, - issue_date_arg = config['StartString'], - extraction_file_prefix_arg = extraction_file_prefix) + deposition_file_name = Template(config['Deposition']['InputFileTemplate']).substitute(**config) + deposition_path = f"{jobPath}/{deposition_file_name}" 
- # asia/east africa env suit should not perform normalization, false gets passed here for these areas - depo_processor.name_extract_params.NORMALIZE = (normalize.upper() == "TRUE") + # get the file name from the config + # this file name can be a glob, as long as matches can all be loaded by iris + deposition_data_file_name = Template(config['Deposition']['DataFileTemplate']).substitute(**config) + name_file_wildcard = f"{deposition_path}/{deposition_data_file_name}" - depo_processor.process() + EWSPlottingOutputGlobs = [] - # check the output - EWSPlottingOutputDir = f"{output_dir}/images/" - #EWSPlottingOutputGlobs += [ - # # daily plots - # f"{EWSPlottingOutputDir}Daily/deposition_{region.lower()}_*_daily_20*.png", - # # weekly plots - # f"{EWSPlottingOutputDir}Weekly/deposition_{region.lower()}_*_total_20*.png"] + for region in regions: - EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"] + output_dir = f"{jobPath}/plotting/{region.lower()}" - EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) + Path(output_dir).mkdir(parents=True, exist_ok=True) - # check there is some output from EWS-plotting - if not EWSPlottingOutputGlobs: - logger.error('EWS-Plotting did not produce any output') - raise RuntimeError + sys_config = config['Deposition']['EWS-Plotting']['SysConfig'] + name_extraction_config = config['Deposition']['EWS-Plotting']['NameExtractionConfig'] + run_config = config['Deposition']['EWS-Plotting']['RunConfig'] + run_config_norm = config['Deposition']['EWS-Plotting']['RunConfigNorm'] + chart_config = config['Deposition']['EWS-Plotting'][region]['ChartConfig'] + normalize = config['Deposition']['EWS-Plotting'][region]['Normalize'] + extraction_file_prefix = 'deposition_' + region.lower() - # provide list for transfer - EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)]) + # Note that this runs all disease types available - return EWSPlottingOutputs + self.logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{name_extraction_config}\n{run_config}\n{run_config_norm}\n{chart_config}") + + depo_processor = DepositionPostProcessor() + depo_processor.set_param_config_files(sys_config_file_arg = sys_config, + depo_name_extraction_config_file_arg = name_extraction_config, + chart_config_file_arg = chart_config, + depo_plotting_run_config_file_arg = run_config, + depo_plotting_normalized_run_config_file_arg = run_config_norm, + name_file_wildcard_arg = name_file_wildcard, + wheat_sources_dir_arg = deposition_path, + output_dir_arg = output_dir, + issue_date_arg = config['StartString'], + extraction_file_prefix_arg = extraction_file_prefix) + + + # asia/east africa env suit should not perform normalization, false gets passed here for these areas + depo_processor.name_extract_params.NORMALIZE = (normalize.upper() == "TRUE") + + depo_processor.process() + + # check the output + EWSPlottingOutputDir = f"{output_dir}/images/" + #EWSPlottingOutputGlobs += [ + # # daily plots + # f"{EWSPlottingOutputDir}Daily/deposition_{region.lower()}_*_daily_20*.png", + # # weekly plots + # f"{EWSPlottingOutputDir}Weekly/deposition_{region.lower()}_*_total_20*.png"] + + EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"] + + EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) + + # check there is some output from EWS-plotting + if not EWSPlottingOutputGlobs: + self.logger.error('EWS-Plotting did not produce any output') + raise RuntimeError + + # provide 
list for transfer + EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)]) + + return EWSPlottingOutputs + + +if __name__ == '__main__': + processor = ProcessorDeposition() + processor.run_processor("Deposition") diff --git a/coordinator/ProcessorEnvironment.py b/coordinator/ProcessorEnvironment.py index f356f7ba25d05791c2203d58f773f4bc854b8f25..df64fc01a2216e1b7141c9e0951db5424c3e9438 100644 --- a/coordinator/ProcessorEnvironment.py +++ b/coordinator/ProcessorEnvironment.py @@ -12,211 +12,233 @@ import tarfile import iris from iris.cube import CubeList +from Processor import Processor +from ProcessorServer import process_pre_job_server_download from ews_postprocessing.environmental_suitability.env_suit_post_processor import EnvSuitPostProcessor from ews_postprocessing.utils.disease_info import EnvSuitDiseaseInfo import EnvSuitPipeline as esp from ProcessorUtils import ( - get_only_existing_globs, - subprocess_and_log, - add_filters_to_sublogger, - remove_path_from_tar_members, - short_name + get_only_existing_globs, + subprocess_and_log, + remove_path_from_tar_members, + short_name, add_filters_to_sublogger ) -logger = logging.getLogger('Processor.Environment') -add_filters_to_sublogger(logger) - -def process_in_job_env2_0(jobPath,status,config,component): - '''Download met data from remote, prepare data, and run :class:`EnvSuitPipeline` pipeline.''' - - logger.info('started process_in_job_env2_0()') - - logger.info('Copying file from remote server to job directory') - - file_path = Template(config[component]['ServerPathTemplate']).substitute(**config) - file_name = Template(config[component]['InputFileTemplate']).substitute(**config) - - #TODO: check if file exists already (may be the case for multiple configs in one) - - # TODO: perform ssh file transfer in python instead of subprocess - server_name: str = config['ServerName'] - if server_name == "": - cmd_scp: list = ["scp", f"{file_path}/{file_name}.tar.gz", jobPath] - else: - cmd_scp: list = ["scp", "-i", config['ServerKey'], "-o", "StrictHostKeyChecking=no", - f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath] - - description_short = 'env2 scp' - description_long = 'Copying file from remote server to job directory' - # lawrence comment in/out - subprocess_and_log(cmd_scp,description_short, description_long) - - logger.info('untarring the input file') - - # untar incoming name data - output_directory = f"{jobPath}/NAME_Met_as_netcdf" - Path(output_directory).mkdir(parents=True, exist_ok=True) - tarfile_name = f"{jobPath}/{file_name}.tar.gz" - with tarfile.open(tarfile_name) as tar: - members = remove_path_from_tar_members(tar) - tar.extractall(output_directory, members = members) - - # basic check that contents are as expected for 7-day forecast (57 timepoints in all files) - cube_wildcard = f"{output_directory}/*.nc" - cubes: CubeList = iris.load(cube_wildcard) - - # land_fraction and topography will only have a single timepoint (as these dont change over time), so we can ignore - # these when sense-checking the expected number of timepoints - ignore_list = ["LAND_FRACTION", "TOPOGRAPHY"] - - for cube in cubes: - var_name = cube.name() - coord = cube.coord("time") - timepoint_count = coord.shape[0] - if timepoint_count != 57 and var_name not in ignore_list: - msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}" - logger.error(msg) - raise RuntimeError(msg) - - region = config['RegionName'] - - logger.info(f"Calling environmental suitability 
2.0 for {region} so wait for output to appear") - - pipeline_config = config["Environment"] - try: - #todo lawrence comment this back to original (extracted=False) - esp.run_pipeline(pipeline_config, region, config["StartString"], extracted=False) - except: - logger.exception(f"Some failure when running EnvSuitPipeline.py") - raise - - logger.info('Finished running environmental suitability 2.0') - - # TODO: Check that the output appears as expected +class ProcessorEnvironment(Processor): - proc_out = {} - # Output files available for upload - proc_out['output'] = None - # Processing files available for clearing - proc_out['clearup'] = [f"{jobPath}/{file_name}.tar.gz"] - - return proc_out - -def process_copy_past_job_env2_0(jobPath,status,config,component): - '''For when we want to skip process_in_job() to test the other components of - this script. Currently hard-wired.''' - - # TODO: remove this hard-wired assumption - jobPath_to_copy = f"{jobPath}/../{short_name['Environment']}_{config['StartString']}_bak/" - - assert os.path.exists(jobPath_to_copy) - - dir_src = f"{jobPath_to_copy}/processed/" - - dir_dst = f"{jobPath}/processed/" - - logger.info(f"Copying from {dir_src}") - - logger.info(f"to {dir_dst}") + def process_pre_job(self, args): + return process_pre_job_server_download(args) - copy_tree(dir_src,dir_dst) - logger.info('Copying complete') + def process_in_job(self, jobPath, status, configjson, component) -> object: + return self.process_in_job_env2_0(jobPath, status, configjson, component) - proc_out = {} - # Output files available for upload - proc_out['output'] = None - # Processing files available for clearing - proc_out['clearup'] = None - return proc_out + def process_post_job(self, jobPath, configjson): + return self.process_EWS_plotting_env2_0(jobPath, configjson) -'''class EWSPlottingEnvSuit(EWSPlottingEnvSuitBase): - def set_custom_params(self, - sys_params_dict: dict, - chart_params_dict: dict, - run_params_dict: dict, - disease_csv_template_arg: str, - diseases: List[EnvSuitDiseaseInfo]): - # this is unique to the asia/east africa env suit, as we are not filtering within country boundaries - run_params_dict[RUN_PARAMS.FILTER_FOR_COUNTRY_KEY] = "False"''' - -#TODO test if this works -def process_EWS_plotting_env2_0(jobPath,config): - '''Configures the plotting arguments and calls EWS-plotting as a python module. 
- Returns a list of output files for transfer.''' - - logger.info('started process_EWS_plotting_env2_0()') - - main_region = config['RegionName'] - - input_dir = f"{jobPath}/processed/{main_region}" - - subregions = config['SubRegionNames'] - - EWSPlottingOutputGlobs = [] - - # work on each region - for region in subregions: - - output_dir = f"{jobPath}/plotting/{region.lower()}" - csv_template_dir = input_dir + "/{DISEASE_DIR}/RIE_value.csv" - - Path(output_dir).mkdir(parents=True, exist_ok=True) - - sys_config = config['Environment']['EWS-Plotting']['SysConfig'] - run_config = config['Environment']['EWS-Plotting']['RunConfig'] - chart_config = config['Environment']['EWS-Plotting'][region]['ChartConfig'] - filter_for_country = config['Environment']['EWS-Plotting'][region]['FilterForCountry'] - - # Note that this runs all disease types available - - logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}") - - env_suit_processor = EnvSuitPostProcessor() - env_suit_processor.set_param_config_files(sys_params_file_arg = sys_config, - chart_params_file_arg = chart_config, - run_params_file_arg = run_config, - es_output_dir_arg = output_dir, - issue_date_arg = config['StartString'], - disease_csv_template_arg = csv_template_dir) - - env_suit_processor.run_params.FILTER_FOR_COUNTRY = (filter_for_country.upper() == "TRUE") - - # Include further diseases in plotting. In this case the irrigated suitabilite for the rusts. - # TODO: move this part out into a config - extra_diseases = [ - EnvSuitDiseaseInfo("Stem rust temp-only", "stem_rust_temponly", config['StartString'], "StemRust_TempOnly", csv_template_dir), - EnvSuitDiseaseInfo("Leaf rust temp-only", "leaf_rust_temponly", config['StartString'], "LeafRust_TempOnly", csv_template_dir), - EnvSuitDiseaseInfo("Stripe rust temp-only", "stripe_temponly", config['StartString'], "StripeRust_TempOnly", csv_template_dir), - ] - - env_suit_processor.add_diseases(diseases=extra_diseases) - - env_suit_processor.process() + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Environment') + add_filters_to_sublogger(logger) + def process_in_job_env2_0(self, jobPath,status,config,component): + '''Download met data from remote, prepare data, and run :class:`EnvSuitPipeline` pipeline.''' + + self.logger.info('started process_in_job_env2_0()') + + self.logger.info('Copying file from remote server to job directory') + + file_path = Template(config[component]['ServerPathTemplate']).substitute(**config) + file_name = Template(config[component]['InputFileTemplate']).substitute(**config) + + #TODO: check if file exists already (may be the case for multiple configs in one) + + # TODO: perform ssh file transfer in python instead of subprocess + server_name: str = config['ServerName'] + if server_name == "": + cmd_scp: list = ["scp", f"{file_path}/{file_name}.tar.gz", jobPath] + else: + cmd_scp: list = ["scp", "-i", config['ServerKey'], "-o", "StrictHostKeyChecking=no", + f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath] + + description_short = 'env2 scp' + description_long = 'Copying file from remote server to job directory' + # lawrence comment in/out + subprocess_and_log(cmd_scp,description_short, description_long) + + self.logger.info('untarring the input file') + + # untar incoming name data + output_directory = f"{jobPath}/NAME_Met_as_netcdf" + Path(output_directory).mkdir(parents=True, exist_ok=True) + tarfile_name = f"{jobPath}/{file_name}.tar.gz" + with 
tarfile.open(tarfile_name) as tar: + members = remove_path_from_tar_members(tar) + tar.extractall(output_directory, members = members) + + # basic check that contents are as expected for 7-day forecast (57 timepoints in all files) + cube_wildcard = f"{output_directory}/*.nc" + cubes: CubeList = iris.load(cube_wildcard) + + # land_fraction and topography will only have a single timepoint (as these dont change over time), so we can ignore + # these when sense-checking the expected number of timepoints + ignore_list = ["LAND_FRACTION", "TOPOGRAPHY"] + + for cube in cubes: + var_name = cube.name() + coord = cube.coord("time") + timepoint_count = coord.shape[0] + if timepoint_count != 57 and var_name not in ignore_list: + msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}" + self.logger.error(msg) + raise RuntimeError(msg) + + region = config['RegionName'] + + self.logger.info(f"Calling environmental suitability 2.0 for {region} so wait for output to appear") + + pipeline_config = config["Environment"] + try: + #todo lawrence comment this back to original (extracted=False) + esp.run_pipeline(pipeline_config, region, config["StartString"], extracted=False) + except: + self.logger.exception(f"Some failure when running EnvSuitPipeline.py") + raise + + self.logger.info('Finished running environmental suitability 2.0') + + # TODO: Check that the output appears as expected + + proc_out = {} + # Output files available for upload + proc_out['output'] = None + # Processing files available for clearing + proc_out['clearup'] = [f"{jobPath}/{file_name}.tar.gz"] + + return proc_out + + def process_copy_past_job_env2_0(self, jobPath,status,config,component): + '''For when we want to skip process_in_job() to test the other components of + this script. Currently hard-wired.''' + + # TODO: remove this hard-wired assumption + jobPath_to_copy = f"{jobPath}/../{short_name['Environment']}_{config['StartString']}_bak/" + + assert os.path.exists(jobPath_to_copy) + + dir_src = f"{jobPath_to_copy}/processed/" + + dir_dst = f"{jobPath}/processed/" + + self.logger.info(f"Copying from {dir_src}") + + self.logger.info(f"to {dir_dst}") + + copy_tree(dir_src,dir_dst) + + self.logger.info('Copying complete') + + proc_out = {} + # Output files available for upload + proc_out['output'] = None + # Processing files available for clearing + proc_out['clearup'] = None + + return proc_out + + '''class EWSPlottingEnvSuit(EWSPlottingEnvSuitBase): + + def set_custom_params(self, + sys_params_dict: dict, + chart_params_dict: dict, + run_params_dict: dict, + disease_csv_template_arg: str, + diseases: List[EnvSuitDiseaseInfo]): + # this is unique to the asia/east africa env suit, as we are not filtering within country boundaries + run_params_dict[RUN_PARAMS.FILTER_FOR_COUNTRY_KEY] = "False"''' + + #TODO test if this works + def process_EWS_plotting_env2_0(self, jobPath,config): + '''Configures the plotting arguments and calls EWS-plotting as a python module. 
+ Returns a list of output files for transfer.''' + + self.logger.info('started process_EWS_plotting_env2_0()') + + main_region = config['RegionName'] + + input_dir = f"{jobPath}/processed/{main_region}" + + subregions = config['SubRegionNames'] + + EWSPlottingOutputGlobs = [] + + # work on each region + for region in subregions: + + output_dir = f"{jobPath}/plotting/{region.lower()}" + csv_template_dir = input_dir + "/{DISEASE_DIR}/RIE_value.csv" + + Path(output_dir).mkdir(parents=True, exist_ok=True) + + sys_config = config['Environment']['EWS-Plotting']['SysConfig'] + run_config = config['Environment']['EWS-Plotting']['RunConfig'] + chart_config = config['Environment']['EWS-Plotting'][region]['ChartConfig'] + filter_for_country = config['Environment']['EWS-Plotting'][region]['FilterForCountry'] + + # Note that this runs all disease types available + + self.logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}") + + env_suit_processor = EnvSuitPostProcessor() + env_suit_processor.set_param_config_files(sys_params_file_arg = sys_config, + chart_params_file_arg = chart_config, + run_params_file_arg = run_config, + es_output_dir_arg = output_dir, + issue_date_arg = config['StartString'], + disease_csv_template_arg = csv_template_dir) + + env_suit_processor.run_params.FILTER_FOR_COUNTRY = (filter_for_country.upper() == "TRUE") + + # Include further diseases in plotting. In this case the irrigated suitabilite for the rusts. + # TODO: move this part out into a config + extra_diseases = [ + EnvSuitDiseaseInfo("Stem rust temp-only", "stem_rust_temponly", config['StartString'], "StemRust_TempOnly", csv_template_dir), + EnvSuitDiseaseInfo("Leaf rust temp-only", "leaf_rust_temponly", config['StartString'], "LeafRust_TempOnly", csv_template_dir), + EnvSuitDiseaseInfo("Stripe rust temp-only", "stripe_temponly", config['StartString'], "StripeRust_TempOnly", csv_template_dir), + ] + + env_suit_processor.add_diseases(diseases=extra_diseases) + + env_suit_processor.process() + + # check the output + EWSPlottingOutputDir = f"{output_dir}/images/" + #EWSPlottingOutputGlobs += [ + # # daily plots + # f"{EWSPlottingOutputDir}Daily/suitability_{region.lower()}_*_rust_daily_20*.png", + # # weekly plots + # f"{EWSPlottingOutputDir}Weekly/suitability_{region.lower()}_*_rust_total_20*.png"] + + EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"] + # check the output - EWSPlottingOutputDir = f"{output_dir}/images/" - #EWSPlottingOutputGlobs += [ - # # daily plots - # f"{EWSPlottingOutputDir}Daily/suitability_{region.lower()}_*_rust_daily_20*.png", - # # weekly plots - # f"{EWSPlottingOutputDir}Weekly/suitability_{region.lower()}_*_rust_total_20*.png"] - - EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"] - - # check the output - EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) - - # check there is some output from EWS-plotting - if not EWSPlottingOutputGlobs: - logger.error('EWS-Plotting did not produce any output') - raise RuntimeError + EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) + + # check there is some output from EWS-plotting + if not EWSPlottingOutputGlobs: + self.logger.error('EWS-Plotting did not produce any output') + raise RuntimeError + + # provide list for transfer + EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)]) + + return EWSPlottingOutputs - # provide list for transfer - EWSPlottingOutputs = sorted([file 
for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)]) - return EWSPlottingOutputs +if __name__ == '__main__': + processor = ProcessorEnvironment() + processor.run_processor("Environment") diff --git a/coordinator/ProcessorEpidemiology.py b/coordinator/ProcessorEpidemiology.py index 23f193b8bdec60a2e096e6355425bec507cdc381..ceb85afeb318cf7b7b2a6bcc45d97eb60fc196cd 100644 --- a/coordinator/ProcessorEpidemiology.py +++ b/coordinator/ProcessorEpidemiology.py @@ -22,6 +22,7 @@ from EpiModel import ( # created by rs481 plotRaster ) from EpiModel.EpiPrep import lister, loader, prep, updater +from Processor import Processor from ews_postprocessing.epi.epi_post_processor import EPIPostPostProcessor from ProcessorUtils import ( @@ -34,781 +35,804 @@ from ProcessorUtils import ( disease_latin_name_dict ) -logger = logging.getLogger('Processor.Epi') -add_filters_to_sublogger(logger) +class ProcessorEpidemiology(Processor): + def process_pre_job(self, args): + return self.process_pre_job_epi(args) -def calc_epi_date_range(init_str,span_days=[0,6]): - '''Date range is determined relative to init_date. - span_days is usually defined in the job config file. Day zero is current - day, negative values point to past (historical or analysis) days, and - positive values point to forecast days. - Returns a start_date and end_date.''' - init_date = datetime.datetime.strptime(init_str,'%Y%m%d') + def process_in_job(self, jobPath, status, configjson, component) -> object: + return self.process_in_job_epi(jobPath, status, configjson, component) - # note that filename date represents preceding 3 hours, so day's data - # starts at file timestamp 0300 UTC - threehour_shift = datetime.timedelta(hours=3) - # add 24hrs so that final day is fully included - day_shift = datetime.timedelta(days=1) + def process_post_job(self, jobPath, configjson): + return self.process_EWS_plotting_epi(jobPath, configjson) - # if more than 999 days - if len(str(span_days[0]))>3: - # assume it is a date string - start_date = datetime.datetime.strptime(span_days[0]+'0300','%Y%m%d%H%M') - else: - date_shift0 = datetime.timedelta(days=span_days[0]) - start_date = init_date + date_shift0 + threehour_shift + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Epi') + add_filters_to_sublogger(logger) - if len(str(span_days[1]))>3: - # assume it is a date string - end_date = datetime.strptime(span_days[1]+'0000','%Y%m%d%H%M') - end_date = end_date + day_shift - else: - date_shift1 = datetime.timedelta(days=span_days[1]) + def calc_epi_date_range(self, init_str, span_days = [0, 6]): + '''Date range is determined relative to init_date. + span_days is usually defined in the job config file. Day zero is current + day, negative values point to past (historical or analysis) days, and + positive values point to forecast days. 
+ Returns a start_date and end_date.''' - end_date = init_date + date_shift1 +day_shift + init_date = datetime.datetime.strptime(init_str,'%Y%m%d') - return start_date, end_date + # note that filename date represents preceding 3 hours, so day's data + # starts at file timestamp 0300 UTC + threehour_shift = datetime.timedelta(hours=3) -def process_pre_job_epi(input_args: dict): - '''Returns a boolean as to whether the job is ready for full processing.''' + # add 24hrs so that final day is fully included + day_shift = datetime.timedelta(days=1) - logger.info('started process_pre_job_epi()') + # if more than 999 days + if len(str(span_days[0]))>3: + # assume it is a date string + start_date = datetime.datetime.strptime(self.span_days[0]+'0300','%Y%m%d%H%M') + else: + date_shift0 = datetime.timedelta(days=span_days[0]) - # check pre-requisite jobs are complete - query_past_successes(input_args) + start_date = init_date + date_shift0 + threehour_shift - config_fns: List[str] = input_args['config_paths'] + if len(str(span_days[1]))>3: + # assume it is a date string + end_date = datetime.strptime(span_days[1]+'0000','%Y%m%d%H%M') - for configFile in config_fns: + end_date = end_date + day_shift + else: + date_shift1 = datetime.timedelta(days=span_days[1]) - # they should be working if the script made it this far, no need to try - config_i = open_and_check_config(configFile) + end_date = init_date + date_shift1 +day_shift - #determine end time, from config file - arg_start_date: str = input_args['start_date'] - calc_span_days = config_i['Epidemiology']['CalculationSpanDays'] - assert len(calc_span_days) == 2 + return start_date, end_date - start_time, end_time = calc_epi_date_range(arg_start_date,calc_span_days) + def process_pre_job_epi(self, input_args: dict): + '''Returns a boolean as to whether the job is ready for full processing.''' - # warn if it is a long timespan - date_diff = end_time - start_time - if date_diff.days > 100: - logger.warning("More than 100 days will be calculated over, likely longer than any single season") + self.logger.info('started process_pre_job_epi()') - return True + # check pre-requisite jobs are complete + query_past_successes(input_args) + config_fns: List[str] = input_args['config_paths'] -def create_epi_config_string(config,jobPath,startString,endString): + for configFile in config_fns: - configtemplate_fn = config['ConfigFilePath'] - configName_withoutEpi = f"{os.path.basename(configtemplate_fn).replace('.json','')}_{startString}-{endString}" + # they should be working if the script made it this far, no need to try + config_i = open_and_check_config(configFile) - # create a string describing every epi calc configuration - epiStrings = [] - for epiconf in config['Epidemiology']['Epi']: - epiKwargsString = ''.join([f"{k}{v}" for k,v in epiconf['modelArguments'].items()]) + #determine end time, from config file + arg_start_date: str = input_args['start_date'] + calc_span_days = config_i['Epidemiology']['CalculationSpanDays'] + assert len(calc_span_days) == 2 - # drop any repetitive elements of kwarg - epiKwargsString = epiKwargsString.replace('infectionprevious','') - epiKwargsString = epiKwargsString.replace('capbeta','cb') + start_time, end_time = self.calc_epi_date_range(arg_start_date,calc_span_days) - epiCaseString = f"{epiconf['model'].lower()}{epiKwargsString}" + # warn if it is a long timespan + date_diff = end_time - start_time + if date_diff.days > 100: + self.logger.warning("More than 100 days will be calculated over, likely longer than any single 
season") - # provide to configuration for output filename - epiconf["infectionRasterFileName"] = f"{jobPath}/infections_{configName_withoutEpi}_{epiCaseString}" + return True - epiStrings += [epiCaseString] - epiString = '-'.join(epiStrings) + def create_epi_config_string(self, config,jobPath,startString,endString): - config_filename = f"{configName_withoutEpi}_{epiString}" + configtemplate_fn = config['ConfigFilePath'] + configName_withoutEpi = f"{os.path.basename(configtemplate_fn).replace('.json','')}_{startString}-{endString}" - logger.debug(f"length of config filename is {len(config_filename)}.") + # create a string describing every epi calc configuration + epiStrings = [] + for epiconf in config['Epidemiology']['Epi']: + epiKwargsString = ''.join([f"{k}{v}" for k,v in epiconf['modelArguments'].items()]) - if len(config_filename) > 254: - logger.info(f"filename length is too long, it will raise an OSError, using a short form instead") + # drop any repetitive elements of kwarg + epiKwargsString = epiKwargsString.replace('infectionprevious','') + epiKwargsString = epiKwargsString.replace('capbeta','cb') - # epi cases are not described in filename, an interested user - # must look in the json file for details. - config_filename = configName_withoutEpi + epiCaseString = f"{epiconf['model'].lower()}{epiKwargsString}" - assert len(config_filename) <= 254 + # provide to configuration for output filename + epiconf["infectionRasterFileName"] = f"{jobPath}/infections_{configName_withoutEpi}_{epiCaseString}" - return config_filename + epiStrings += [epiCaseString] -def are_indices_close(idx1: MultiIndex, idx2: MultiIndex, atol=2.51e-6) -> bool: - """An absolute tolerance of 2.51e-6 relates to differences between the - grid's of NAME vn7.2 output and met-extractor in 2022.""" + epiString = '-'.join(epiStrings) - assert idx1.nlevels == idx2.nlevels - num_levels = idx1.nlevels + config_filename = f"{configName_withoutEpi}_{epiString}" - # a stricter check is idx_i.equals(idx_0) + self.logger.debug(f"length of config filename is {len(config_filename)}.") - levels_close = [] - for i in range(num_levels): - close_i = allclose(idx1.get_level_values(i),idx2.get_level_values(i),atol=atol,rtol=0) - levels_close += [close_i] - - return all(levels_close) + if len(config_filename) > 254: + self.logger.info(f"filename length is too long, it will raise an OSError, using a short form instead") -def raster_to_series(raster_fn): + # epi cases are not described in filename, an interested user + # must look in the json file for details. 
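# A minimal sketch of the date arithmetic in calc_epi_date_range for the common case where
# span_days holds day offsets such as [0, 6]: day zero starts at 03:00 UTC (each filename
# timestamp covers the preceding 3 hours) and the end gains an extra 24 h so the final day is
# fully included. The init string below is a made-up example, not taken from a real job.
import datetime

init_date = datetime.datetime.strptime('20200715', '%Y%m%d')
span_days = [0, 6]

start_date = init_date + datetime.timedelta(days=span_days[0]) + datetime.timedelta(hours=3)
end_date = init_date + datetime.timedelta(days=span_days[1]) + datetime.timedelta(days=1)

# start_date -> 2020-07-15 03:00, end_date -> 2020-07-22 00:00, i.e. a fully covered 7-day window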
+ config_filename = configName_withoutEpi - with rio_open(raster_fn,'r') as host_raster: - host_arr = host_raster.read(1) - shape = host_raster.shape + assert len(config_filename) <= 254 - # determine coordinates - coords = [host_raster.xy(i,j) for i in range(shape[0]) for j in range(shape[1])] - lons = unique([ci[0] for ci in coords]) - lats = unique([ci[1] for ci in coords]) - assert shape == (lats.size,lons.size) + return config_filename - # build into a dataframe - # (rasters start in the top left, so descending latitude coordinates) - host_df = DataFrame(data=host_arr,index=lats[::-1],columns=lons) - host_df.index.name = 'latitude' - host_df.columns.name = 'longitude' - # rearrange to ascending latitude corodinates - host_df.sort_index(axis='rows',inplace=True) - # make spatial coordinates a multi-index, like for dep and env suit csvs - host_series = host_df.stack() + def are_indices_close(self, idx1: MultiIndex, idx2: MultiIndex, atol=2.51e-6) -> bool: + """An absolute tolerance of 2.51e-6 relates to differences between the + grid's of NAME vn7.2 output and met-extractor in 2022.""" - return host_series + assert idx1.nlevels == idx2.nlevels + num_levels = idx1.nlevels -def rasters_to_csv( - raster_fns_dict: dict, - csv_fn: str, - ): - """Takes a dictionary of raster files with associated times and saves them - as rows of a single csv. The csv columns and index structure matches model - outputs as expected by the epi model. Used to prepare the host data.""" - - host_serieses = [] - first = True - for date_valid_from, raster_fn in raster_fns_dict.items(): - - host_series = raster_to_series(raster_fn) + # a stricter check is idx_i.equals(idx_0) - # for now, provide a nominal date of validity to enable a time column - # so far, using mapspam which is a static map, so time is irrelevant - host_series.name = date_valid_from + levels_close = [] + for i in range(num_levels): + close_i = allclose(idx1.get_level_values(i),idx2.get_level_values(i),atol=atol,rtol=0) + levels_close += [close_i] - # conform indices (handle float differences) - if first: - idx_0 = host_series.index + return all(levels_close) - if not first: - idx_i = host_series.index - - indices_are_close = are_indices_close(idx_0,idx_i) - assert indices_are_close, (f"Coordinates of host_rasters do not match.\nFailed for {raster_fn}.") - host_series.index = idx_0 - - first = False - - host_serieses += [host_series] - - host_df = DataFrame(host_serieses) + def raster_to_series(self, raster_fn): - host_df.to_csv(csv_fn) + with rio_open(raster_fn,'r') as host_raster: + host_arr = host_raster.read(1) + shape = host_raster.shape - return - -def get_model_divided_by_host_fraction( - dfm, - hostCSV, - model_colns=None, - **kwargs): - """when model_infection pressure has units of [ha_infected/ha_cell] - we want [ha_infected/ha_wheat] to compare with surveys - (because surveys only sample the wheat covered landscape) - so we must load the host raster and divide all model results by it. - - TODO: Instead of doing as post-processing in coordinator, best do it within - the ews-epidemiology package. 
- """ - - print('Converting units of prediction from ha_infected/ha_cell to ha_infect/ha_wheat') - - # load host raster - host_fn = hostCSV - host_df = read_csv(host_fn,index_col=0,header=[0,1]) - host_df.columns = host_df.columns.set_levels([lvl.astype('float') for lvl in host_df.columns.levels]) - host_df.index = to_datetime(host_df.index,format='%Y%m%d%H%M') - host_df = host_df.T + # determine coordinates + coords = [host_raster.xy(i,j) for i in range(shape[0]) for j in range(shape[1])] + lons = unique([ci[0] for ci in coords]) + lats = unique([ci[1] for ci in coords]) + assert shape == (lats.size,lons.size) - # conform the structure with infection dataframe + # build into a dataframe + # (rasters start in the top left, so descending latitude coordinates) + host_df = DataFrame(data=host_arr,index=lats[::-1],columns=lons) + host_df.index.name = 'latitude' + host_df.columns.name = 'longitude' + # rearrange to ascending latitude corodinates + host_df.sort_index(axis='rows',inplace=True) + # make spatial coordinates a multi-index, like for dep and env suit csvs + host_series = host_df.stack() - # conform indices (coordinates) - host_df.index = host_df.index.reorder_levels(['longitude','latitude']) - host_df.sort_index(level=['longitude','latitude'],ascending=[True,False],inplace=True) + return host_series - indices_are_close = are_indices_close(host_df.index,dfm.index) - assert indices_are_close, ('Coordinates of model grid do not match host map.') - host_df.index = dfm.index + def rasters_to_csv( + self, + raster_fns_dict: dict, + csv_fn: str, + ): + """Takes a dictionary of raster files with associated times and saves them + as rows of a single csv. The csv columns and index structure matches model + outputs as expected by the epi model. Used to prepare the host data.""" - # conform columns (dates) - column_end_dates = dfm.columns.map(lambda x: x[-12:]) - model_dates = to_datetime(column_end_dates, format='%Y%m%d%H%M') - datetime.timedelta(days=1) - dfm2 = dfm.copy() - dfm2.columns = model_dates - - # Set a host value for every model date, based forward-filling dated map to - # next available date - host_df_resampled = host_df.reindex(dfm2.columns,axis='columns',method='ffill') - assert not host_df_resampled.isna().any().any(), ('Dates of host rasters do not cover all model dates') - - # new approach, take advantage of pandas broadcasting - print('Applying unit conversion to all columns in output') - dfm3 = dfm2.divide(host_df_resampled) - # Handle cases of zero division - dfm3[host_df_resampled<=0]=0 - - # check for anomalously large values - where_too_big = dfm3 > 1.00001 - if any(where_too_big): - msg = 'ERROR: Unit conversion failed, host area seems to be smaller than predicted infection area in a cell' - print(msg) - raise Exception + host_serieses = [] + first = True + for date_valid_from, raster_fn in raster_fns_dict.items(): - # clip any values that are above 1 - # (Expect this is not needed, but may help resolve float precision issues) - dfm3.clip(0.,1.,inplace=True) - - # Retain original column names - dfm3.columns = dfm.columns + host_series = self.raster_to_series(raster_fn) - return dfm3 + # for now, provide a nominal date of validity to enable a time column + # so far, using mapspam which is a static map, so time is irrelevant + host_series.name = date_valid_from -def process_in_job_epi(jobPath,status,config,component): - logger.info('started process_in_job_epi()') + # conform indices (handle float differences) + if first: + idx_0 = host_series.index - # TODO: Some of this 
is modifying config before epi model is run. Determine - # how to account for that + if not first: + idx_i = host_series.index - # initialise any needed variables + indices_are_close = self.are_indices_close(idx_0,idx_i) + assert indices_are_close, (f"Coordinates of host_rasters do not match.\nFailed for {raster_fn}.") + host_series.index = idx_0 - reference_date_str = config['StartString'] - reference_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d') + first = False - start_date, end_date = calc_epi_date_range(reference_date_str,config['Epidemiology']['CalculationSpanDays']) + host_serieses += [host_series] - date_diff = end_date - start_date + host_df = DataFrame(host_serieses) - start_string = start_date.strftime('%Y-%m-%d-%H%M') - start_string_short = start_date.strftime('%Y%m%d%H%M') - end_string = end_date.strftime('%Y-%m-%d-%H%M') + host_df.to_csv(csv_fn) - # update config accordingly - config['ReferenceTime'] = reference_date_str - config['StartTime'] = start_string - config['StartTimeShort'] = start_string_short - config['EndTime'] = end_string + return - yesterday_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d') - datetime.timedelta(days=1) - yesterday_string = yesterday_date.strftime('%Y%m%d') + def get_model_divided_by_host_fraction( + self, + dfm, + hostCSV, + model_colns=None, + **kwargs): + """when model_infection pressure has units of [ha_infected/ha_cell] + we want [ha_infected/ha_wheat] to compare with surveys + (because surveys only sample the wheat covered landscape) + so we must load the host raster and divide all model results by it. - diseases = config['Epidemiology']['DiseaseNames'] + TODO: Instead of doing as post-processing in coordinator, best do it within + the ews-epidemiology package. + """ - def gather_dependent_models(config_epi,config,variable_name,start_date,reference_date,end_date,jobDataPath,lastjobDataPath,status,component='Deposition'): + print('Converting units of prediction from ha_infected/ha_cell to ha_infect/ha_wheat') - # This function is only prepared for components in this list - assert component in ['Deposition','Environment'] + # load host raster + host_fn = hostCSV + host_df = read_csv(host_fn,index_col=0,header=[0,1]) + host_df.columns = host_df.columns.set_levels([lvl.astype('float') for lvl in host_df.columns.levels]) + host_df.index = to_datetime(host_df.index,format='%Y%m%d%H%M') + host_df = host_df.T - # TODO: Simplify the set of required arguments. Check if config is necessary. 
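# A minimal sketch of the unit conversion at the heart of get_model_divided_by_host_fraction:
# each model column (ha_infected/ha_cell) is divided by a host fraction that is forward-filled
# onto every model date, with zero-host cells set to zero rather than inf/NaN and a final clip
# to guard against float-precision overshoot. All values below are made up.
import pandas as pd

dates = pd.to_datetime(['202007150000', '202007160000'], format='%Y%m%d%H%M')
model = pd.DataFrame([[0.02, 0.03], [0.00, 0.01]], index=['cell_a', 'cell_b'], columns=dates)
host = pd.DataFrame([[0.5], [0.0]], index=['cell_a', 'cell_b'], columns=dates[:1])

host_resampled = host.reindex(model.columns, axis='columns', method='ffill')
converted = model.divide(host_resampled)
converted[host_resampled <= 0] = 0      # no wheat in the cell -> report zero infection
converted = converted.clip(0.0, 1.0)    # infected area can never exceed the wheat area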
+ # conform the structure with infection dataframe - config_epi[component]['VariableName'] = variable_name # disease_latin_name_dict[disease]+'_DEPOSITION' + # conform indices (coordinates) + host_df.index = host_df.index.reorder_levels(['longitude','latitude']) + host_df.sort_index(level=['longitude','latitude'],ascending=[True,False],inplace=True) - config_epi[component]['FileNamePrepared'] = f"{jobDataPath}/data_input_{component.lower()}.csv" + indices_are_close = self.are_indices_close(host_df.index,dfm.index) + assert indices_are_close, ('Coordinates of model grid do not match host map.') + host_df.index = dfm.index - config_epi[component]['LastFileNamePrepared'] = f"{lastjobDataPath}/data_input_{component.lower()}.csv" + # conform columns (dates) + column_end_dates = dfm.columns.map(lambda x: x[-12:]) + model_dates = to_datetime(column_end_dates, format='%Y%m%d%H%M') - datetime.timedelta(days=1) + dfm2 = dfm.copy() + dfm2.columns = model_dates - # Use config-defined file lister - file_lister_name = config_epi[component]['FileListerFunction'] + # Set a host value for every model date, based forward-filling dated map to + # next available date + host_df_resampled = host_df.reindex(dfm2.columns,axis='columns',method='ffill') + assert not host_df_resampled.isna().any().any(), ('Dates of host rasters do not cover all model dates') - file_lister_func = getattr(lister,file_lister_name) + # new approach, take advantage of pandas broadcasting + print('Applying unit conversion to all columns in output') + dfm3 = dfm2.divide(host_df_resampled) + # Handle cases of zero division + dfm3[host_df_resampled<=0]=0 - config_for_lister = config.copy() - config_for_lister.update(config_epi) + # check for anomalously large values + where_too_big = dfm3 > 1.00001 + if any(where_too_big): + msg = 'ERROR: Unit conversion failed, host area seems to be smaller than predicted infection area in a cell' + print(msg) + raise Exception - lister_kwargs = {} - lister_kwargs['reference_date']=config['ReferenceTime'] + # clip any values that are above 1 + # (Expect this is not needed, but may help resolve float precision issues) + dfm3.clip(0.,1.,inplace=True) - loader_kwargs= {} + # Retain original column names + dfm3.columns = dfm.columns - loader_dict = { - 'Deposition' : loader.load_NAME_file, - 'Environment' : loader.load_env_file, - } + return dfm3 - loader_func = loader_dict[component] + def process_in_job_epi(self, jobPath,status,config,component): + self.logger.info('started process_in_job_epi()') - # Provide component-specific variables - if component == 'Deposition': + # TODO: Some of this is modifying config before epi model is run. 
Determine + # how to account for that - loader_kwargs['VariableName']= config_for_lister[component].get('VariableName') - loader_kwargs['VariableNameAlternative']= config_for_lister[component].get('VariableNameAlternative') + # initialise any needed variables - try: - # Make use of data prepared yesterday - updater.update_input( - config_for_lister, - reference_date, - end_date, - component=component, - file_lister=file_lister_func, - file_loader=loader_func, - lister_kwargs=lister_kwargs, - update_period_days=3, - **loader_kwargs) + reference_date_str = config['StartString'] + reference_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d') - assert os.path.isfile(config_epi[component]['FileNamePrepared']) + start_date, end_date = self.calc_epi_date_range(reference_date_str,config['Epidemiology']['CalculationSpanDays']) - except AssertionError: + date_diff = end_date - start_date - logger.exception(f"Unexpected error in {component} data preparation (updater)") + start_string = start_date.strftime('%Y-%m-%d-%H%M') + start_string_short = start_date.strftime('%Y%m%d%H%M') + end_string = end_date.strftime('%Y-%m-%d-%H%M') - # Performa a fresh load of the full time series + # update config accordingly + config['ReferenceTime'] = reference_date_str + config['StartTime'] = start_string + config['StartTimeShort'] = start_string_short + config['EndTime'] = end_string - try: + yesterday_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d') - datetime.timedelta(days=1) + yesterday_string = yesterday_date.strftime('%Y%m%d') + + diseases = config['Epidemiology']['DiseaseNames'] + + def gather_dependent_models(config_epi,config,variable_name,start_date,reference_date,end_date,jobDataPath,lastjobDataPath,status,component='Deposition'): + + # This function is only prepared for components in this list + assert component in ['Deposition','Environment'] + + # TODO: Simplify the set of required arguments. Check if config is necessary. 
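# A minimal sketch of the lister/loader dispatch used inside gather_dependent_models: the file
# lister is resolved by name with getattr on the lister module, and the loader is picked from a
# small dict keyed by component. The stub namespace and functions here are stand-ins, not the
# real EpiModel.EpiPrep API; the lister name normally comes from the component's FileListerFunction entry.
import types

lister = types.SimpleNamespace(list_deposition_files=lambda config, **kwargs: [])  # stand-in module

def load_NAME_file(path, **kwargs): return None   # stand-in for the deposition loader
def load_env_file(path, **kwargs): return None    # stand-in for the environment loader

loader_dict = {
    'Deposition': load_NAME_file,
    'Environment': load_env_file,
}

component = 'Deposition'
file_lister_func = getattr(lister, 'list_deposition_files')
loader_func = loader_dict[component]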
+ + config_epi[component]['VariableName'] = variable_name # disease_latin_name_dict[disease]+'_DEPOSITION' + + config_epi[component]['FileNamePrepared'] = f"{jobDataPath}/data_input_{component.lower()}.csv" + + config_epi[component]['LastFileNamePrepared'] = f"{lastjobDataPath}/data_input_{component.lower()}.csv" + + # Use config-defined file lister + file_lister_name = config_epi[component]['FileListerFunction'] + + file_lister_func = getattr(lister,file_lister_name) - prep.prep_input( + config_for_lister = config.copy() + config_for_lister.update(config_epi) + + lister_kwargs = {} + lister_kwargs['reference_date']=config['ReferenceTime'] + + loader_kwargs= {} + + loader_dict = { + 'Deposition' : loader.load_NAME_file, + 'Environment' : loader.load_env_file, + } + + loader_func = loader_dict[component] + + # Provide component-specific variables + if component == 'Deposition': + + loader_kwargs['VariableName']= config_for_lister[component].get('VariableName') + loader_kwargs['VariableNameAlternative']= config_for_lister[component].get('VariableNameAlternative') + + try: + # Make use of data prepared yesterday + updater.update_input( config_for_lister, - start_date, + reference_date, end_date, component=component, file_lister=file_lister_func, file_loader=loader_func, lister_kwargs=lister_kwargs, + update_period_days=3, **loader_kwargs) assert os.path.isfile(config_epi[component]['FileNamePrepared']) - except: + except AssertionError: - logger.exception(f"Unexpected error in {component} data preparation (full load)") - status.reset('ERROR') - endJob(status,premature=True) + self.logger.exception(f"Unexpected error in {component} data preparation (updater)") - return + # Performa a fresh load of the full time series + + try: + + prep.prep_input( + config_for_lister, + start_date, + end_date, + component=component, + file_lister=file_lister_func, + file_loader=loader_func, + lister_kwargs=lister_kwargs, + **loader_kwargs) - # get list of variable names to be loaded from deposition input - depo_variable_names = config['Epidemiology']['Deposition']['VariableNames'] - assert len(depo_variable_names) == len(diseases) + assert os.path.isfile(config_epi[component]['FileNamePrepared']) - # loop over each sub region + except: - region = config['RegionName'] - #for region in config['SubRegionNames']: + self.logger.exception(f"Unexpected error in {component} data preparation (full load)") + status.reset('ERROR') + endJob(status,premature=True) - for disease in diseases: + return - assert disease in disease_latin_name_dict.keys() + # get list of variable names to be loaded from deposition input + depo_variable_names = config['Epidemiology']['Deposition']['VariableNames'] + assert len(depo_variable_names) == len(diseases) - config['SubRegionName'] = region - config['DiseaseName'] = disease + # loop over each sub region - config_epi = config['Epidemiology'].copy() + region = config['RegionName'] + #for region in config['SubRegionNames']: - # TODO: CAUTION: Any iterations (e.g. disease or sub-region) are hidden - # in jobPath, and not retained in the config file. This is a provlem for - # process_EWS_plotting_epi which receives a single config file and must - # try a fudge to retrieve details for each iteration. - # This should be improved, either by making the one config file - # aware of all of the iterations, or looping over iterations in - # Processor.py with one iteration-specific config. 
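# A minimal sketch of the incremental-update-with-fallback pattern used when preparing each
# component's input CSV: first try to extend yesterday's prepared file, and if that cannot be
# used, rebuild the full time series from scratch. update_input and prep_input here are simple
# stand-ins for the EpiModel.EpiPrep updater/prep calls, and the file name is hypothetical.
import os

def update_input(prepared_csv):
    # stand-in: pretend yesterday's prepared file could not be reused
    raise AssertionError("no usable file from the previous run")

def prep_input(prepared_csv):
    # stand-in: build the prepared file from the full time series
    with open(prepared_csv, 'w') as f:
        f.write('longitude,latitude,value\n')

prepared_csv = 'data_input_deposition.csv'
try:
    update_input(prepared_csv)
    assert os.path.isfile(prepared_csv)
except AssertionError:
    # fall back to a fresh load of the full time series
    prep_input(prepared_csv)
    assert os.path.isfile(prepared_csv)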
- case_specific_path = f"{jobPath}/{region}/{disease}/" - Path(case_specific_path).mkdir(parents=True, exist_ok=True) + for disease in diseases: - logger.info(f"Preparing for epidemiology calc of {disease} in {region}") + assert disease in disease_latin_name_dict.keys() - # create config_filename to describe job configuration - config_filename = create_epi_config_string(config,case_specific_path,start_string,end_string) + config['SubRegionName'] = region + config['DiseaseName'] = disease - # prepare a directory for input data - jobDataPath = f"{case_specific_path}/input_data/" - Path(jobDataPath).mkdir(parents=True, exist_ok=True) + config_epi = config['Epidemiology'].copy() - lastjobDataPath = jobDataPath.replace(f"EPI_{reference_date_str}",f"EPI_{yesterday_string}") + # TODO: CAUTION: Any iterations (e.g. disease or sub-region) are hidden + # in jobPath, and not retained in the config file. This is a provlem for + # process_EWS_plotting_epi which receives a single config file and must + # try a fudge to retrieve details for each iteration. + # This should be improved, either by making the one config file + # aware of all of the iterations, or looping over iterations in + # Processor.py with one iteration-specific config. + case_specific_path = f"{jobPath}/{region}/{disease}/" + Path(case_specific_path).mkdir(parents=True, exist_ok=True) - # configure filename of prepared deposition data + self.logger.info(f"Preparing for epidemiology calc of {disease} in {region}") - if 'Deposition' in config_epi: + # create config_filename to describe job configuration + config_filename = self.create_epi_config_string(config,case_specific_path,start_string,end_string) - # determine which variable name to load for this disease - disease_idx = [i for i,j in enumerate(diseases) if j==disease][0] + # prepare a directory for input data + jobDataPath = f"{case_specific_path}/input_data/" + Path(jobDataPath).mkdir(parents=True, exist_ok=True) - variable_name = depo_variable_names[disease_idx] + lastjobDataPath = jobDataPath.replace(f"EPI_{reference_date_str}",f"EPI_{yesterday_string}") - gather_dependent_models( - config_epi, - config, - variable_name, - start_date, - reference_date, - end_date, - jobDataPath, - lastjobDataPath, - status, - component='Deposition') - - # configure filename of prepared deposition data + # configure filename of prepared deposition data - if 'Environment' in config_epi: + if 'Deposition' in config_epi: - logger.info('Preparing environmental suitability data') + # determine which variable name to load for this disease + disease_idx = [i for i,j in enumerate(diseases) if j==disease][0] - gather_dependent_models( - config_epi, - config, - variable_name, - start_date, - reference_date, - end_date, - jobDataPath, - lastjobDataPath, - status, - component='Environment') - - # prepare a copy of the host data + variable_name = depo_variable_names[disease_idx] - logger.info('Preparing a copy of the host raster data') - - # TargetRaster defines the grid that the epi model works on. - assert 'TargetRaster' in config_epi['Host'] + gather_dependent_models( + config_epi, + config, + variable_name, + start_date, + reference_date, + end_date, + jobDataPath, + lastjobDataPath, + status, + component='Deposition') - # It should have been generated in advance by the user, by reprojecting - # the available host map (e.g. MapSPAM) to the NAME output grid. - # wheat_raster_reprojection.py is available to support this. 
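# A minimal sketch of how the deposition variable name is selected for the current disease:
# DiseaseNames and the Deposition VariableNames are parallel lists, so the index of the disease
# picks the variable. The names below are hypothetical placeholders, not real config values.
diseases = ['StemRust', 'StripeRust']
depo_variable_names = ['STEM_DEPOSITION_VAR', 'STRIPE_DEPOSITION_VAR']
assert len(depo_variable_names) == len(diseases)

disease = 'StripeRust'
disease_idx = [i for i, j in enumerate(diseases) if j == disease][0]   # as written in the code
variable_name = depo_variable_names[disease_idx]

# an equivalent, arguably clearer lookup
variable_name_alt = dict(zip(diseases, depo_variable_names))[disease]
assert variable_name == variable_name_alt == 'STRIPE_DEPOSITION_VAR'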
+ # configure filename of prepared deposition data - if 'HostRasters' in config_epi['Host']: - # HostRasters is a dictionary with date: filename entries describing - # different host rasters valid at different times i.e. a simple - # representation of dynamic host, so prepare a host file as is done - # for the Deposition and Environment components. + if 'Environment' in config_epi: - # All host maps should have the same spatial grid as the TargetRaster + self.logger.info('Preparing environmental suitability data') - rasters_dict = config_epi['Host']['HostRasters'] + gather_dependent_models( + config_epi, + config, + variable_name, + start_date, + reference_date, + end_date, + jobDataPath, + lastjobDataPath, + status, + component='Environment') - dst_host_csv = f"{jobDataPath}/data_input_host.csv" + # prepare a copy of the host data - rasters_to_csv(rasters_dict,dst_host_csv) + self.logger.info('Preparing a copy of the host raster data') - else: - # There is a host raster applicable to all times, i.e. static host + # TargetRaster defines the grid that the epi model works on. + assert 'TargetRaster' in config_epi['Host'] - src_host = config_epi['Host']['TargetRaster'] - fn_host = os.path.basename(src_host) - dst_host = f"{jobDataPath}/{fn_host}" + # It should have been generated in advance by the user, by reprojecting + # the available host map (e.g. MapSPAM) to the NAME output grid. + # wheat_raster_reprojection.py is available to support this. - # copy the tif to the job directory and refer to that instead - shutil.copyfile(src_host,dst_host) - config_epi['Host']['TargetRaster'] = dst_host + if 'HostRasters' in config_epi['Host']: + # HostRasters is a dictionary with date: filename entries describing + # different host rasters valid at different times i.e. a simple + # representation of dynamic host, so prepare a host file as is done + # for the Deposition and Environment components. - logger.info('Preparing a copy of the host data as csv') + # All host maps should have the same spatial grid as the TargetRaster - dst_host_csv = dst_host.replace('.tif','.csv') + rasters_dict = config_epi['Host']['HostRasters'] - rasters_to_csv( - {'201001010000': dst_host}, - dst_host_csv) + dst_host_csv = f"{jobDataPath}/data_input_host.csv" - config_epi['Host']['HostCSV'] = dst_host_csv - config_epi['Host']['FileNamePrepared'] = dst_host_csv + self.rasters_to_csv(rasters_dict,dst_host_csv) - # provide fundamental config elements to config_epi - for k,v in config.items(): - if k not in short_name.keys(): - config_epi[k]=v + else: + # There is a host raster applicable to all times, i.e. 
static host - logger.debug('Incremental configuration looks like:') - def print_item(item): - logger.debug(f"Item {item}") - logger.debug(json.dumps(item,indent=2)) - def iterate(items): - for item in items.items(): - if hasattr(item,'items'): - # iterate - iterate(item) - else: - print_item(item) - iterate(config_epi) + src_host = config_epi['Host']['TargetRaster'] + fn_host = os.path.basename(src_host) + dst_host = f"{jobDataPath}/{fn_host}" - logger.debug('Complete configuration looks like:') - logger.debug(json.dumps(config_epi,indent=2)) + # copy the tif to the job directory and refer to that instead + shutil.copyfile(src_host,dst_host) + config_epi['Host']['TargetRaster'] = dst_host - # write the complete configuration file to job directory - with open(f"{case_specific_path}/{config_filename}.json",'w') as write_file: - json.dump(config_epi,write_file,indent=4) + self.logger.info('Preparing a copy of the host data as csv') - # run epi model + dst_host_csv = dst_host.replace('.tif','.csv') - try: - EpiModel.run_epi_model(f"{case_specific_path}/{config_filename}.json") - except: - logger.exception('Unexpected error in EpiModel') - raise + self.rasters_to_csv( + {'201001010000': dst_host}, + dst_host_csv) - # perform calc on output + config_epi['Host']['HostCSV'] = dst_host_csv + config_epi['Host']['FileNamePrepared'] = dst_host_csv - def calc_total(arr): - return 'total', arr.sum() + # provide fundamental config elements to config_epi + for k,v in config.items(): + if k not in short_name.keys(): + config_epi[k]=v - def calc_max(arr): - return 'maximum', arr.max() + self.logger.debug('Incremental configuration looks like:') + def print_item(item): + self.logger.debug(f"Item {item}") + self.logger.debug(json.dumps(item,indent=2)) + def iterate(items): + for item in items.items(): + if hasattr(item,'items'): + # iterate + iterate(item) + else: + print_item(item) + iterate(config_epi) - def calc_mean(arr): - return 'mean', arr.mean() + self.logger.debug('Complete configuration looks like:') + self.logger.debug(json.dumps(config_epi,indent=2)) - for epiconf in config['Epidemiology']['Epi']: + # write the complete configuration file to job directory + with open(f"{case_specific_path}/{config_filename}.json",'w') as write_file: + json.dump(config_epi,write_file,indent=4) + + # run epi model - outfile = epiconf["infectionRasterFileName"] + try: + EpiModel.run_epi_model(f"{case_specific_path}/{config_filename}.json") + except: + self.logger.exception('Unexpected error in EpiModel') + raise - with rio_open(outfile+'.tif','r') as infectionRaster: - infection = infectionRaster.read(1) + # perform calc on output - # define function to quantify overall result, for easy check - # TODO: Create a more meaningful result? - # TODO: make this configurable - analysis_func = calc_mean + def calc_total(arr): + return 'total', arr.sum() - analysis_desc, analysis_value = analysis_func(infection) + def calc_max(arr): + return 'maximum', arr.max() - logger.info(f"For case {outfile}") - logger.info('Infection {:s} is {:.2e}'.format( analysis_desc, analysis_value)) + def calc_mean(arr): + return 'mean', arr.mean() - # to save tif as png for easy viewing - logger.debug('Saving tif output as png for easier viewing') - plotRaster.save_raster_as_png(outfile) + for epiconf in config['Epidemiology']['Epi']: - # comparison figure + outfile = epiconf["infectionRasterFileName"] - # TODO: make this plot configurable? with function or args? 
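# A minimal sketch of the per-case summary applied to the infection raster after EpiModel runs:
# one of the calc_* helpers is chosen (the code hard-wires calc_mean) and applied to the raster
# band. The numpy array below is a made-up stand-in for infectionRaster.read(1).
import numpy as np

def calc_total(arr): return 'total', arr.sum()
def calc_max(arr):   return 'maximum', arr.max()
def calc_mean(arr):  return 'mean', arr.mean()

infection = np.array([[0.0, 0.1], [0.2, 0.3]])   # hypothetical 2x2 infection band
analysis_func = calc_mean
analysis_desc, analysis_value = analysis_func(infection)
print('Infection {:s} is {:.2e}'.format(analysis_desc, analysis_value))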
- #logger.info('Plotting epi output alongside contributing components') - # figure_func = getattr(EpiAnalysis,'plot_compare_host_env_dep_infection') - logger.info('Plotting composite image of epi formulations') - figure_func = getattr(EpiAnalysis,'plot_compare_epi_cases') + with rio_open(outfile+'.tif','r') as infectionRaster: + infection = infectionRaster.read(1) - # isolate the config for this function, in case of modifications - config_epi_for_comparison = config_epi.copy() + # define function to quantify overall result, for easy check + # TODO: Create a more meaningful result? + # TODO: make this configurable + analysis_func = calc_mean - fig,axes,cases = figure_func( - config_epi_for_comparison, - start_str = start_string, - end_str = end_string) + analysis_desc, analysis_value = analysis_func(infection) - SaveFileName = f"{case_specific_path}/EPI_{config_filename}_comparison" + self.logger.info(f"For case {outfile}") + self.logger.info('Infection {:s} is {:.2e}'.format( analysis_desc, analysis_value)) - fig.savefig(SaveFileName+'.png',dpi=300) + # to save tif as png for easy viewing + self.logger.debug('Saving tif output as png for easier viewing') + plotRaster.save_raster_as_png(outfile) - # slice the epi results into before forecast and in forecast + # comparison figure - for epiconf in config['Epidemiology']['Epi']: + # TODO: make this plot configurable? with function or args? + #logger.info('Plotting epi output alongside contributing components') + # figure_func = getattr(EpiAnalysis,'plot_compare_host_env_dep_infection') + self.logger.info('Plotting composite image of epi formulations') + figure_func = getattr(EpiAnalysis,'plot_compare_epi_cases') - outfile = epiconf["infectionRasterFileName"]+'_progression.csv' + # isolate the config for this function, in case of modifications + config_epi_for_comparison = config_epi.copy() - # load the full epi results - df_full = read_csv(outfile,header=[0],index_col=[0,1]) - column_date_fmt = f"X{config['StartTimeShort']}_X%Y%m%d%H%M" - df_full_dates = to_datetime(df_full.columns.astype('str'),format=column_date_fmt) + fig,axes,cases = figure_func( + config_epi_for_comparison, + start_str = start_string, + end_str = end_string) - unit_description = '' + SaveFileName = f"{case_specific_path}/EPI_{config_filename}_comparison" - if epiconf['rescale_output_by_host_raster'] is True: + fig.savefig(SaveFileName+'.png',dpi=300) - unit_description = '_per_ha_wheat' + # slice the epi results into before forecast and in forecast - model_colns = df_full.columns + for epiconf in config['Epidemiology']['Epi']: - # convert units from ha_infected/ha_cell to ha_infected/ha_wheat - - df_full = get_model_divided_by_host_fraction( - df_full, - config_epi['Host']['HostCSV'], - model_colns = model_colns) + outfile = epiconf["infectionRasterFileName"]+'_progression.csv' - # save to csv - outfile_hawheat = f"{epiconf['infectionRasterFileName']}{unit_description}_progression.csv" - df_full.to_csv(outfile_hawheat,header=True,index=True) + # load the full epi results + df_full = read_csv(outfile,header=[0],index_col=[0,1]) + column_date_fmt = f"X{config['StartTimeShort']}_X%Y%m%d%H%M" + df_full_dates = to_datetime(df_full.columns.astype('str'),format=column_date_fmt) - outfile_hawheat_final = f"{epiconf['infectionRasterFileName']}{unit_description}.csv" - df_full.iloc[:,-1].to_csv(outfile_hawheat_final,header=True,index=True) + unit_description = '' - # determine date to cut with - # plus 1 minute so midnight is associated with preceding day - date_to_cut = 
datetime.datetime.strptime(config['StartString']+'0001','%Y%m%d%H%M') - dates_after_cut = df_full_dates >= date_to_cut - idx = argmax(dates_after_cut)-1 + if epiconf['rescale_output_by_host_raster'] is True: - if idx == -1: - # only working on forecast dates so no season so far, skip - continue + unit_description = '_per_ha_wheat' - # build seasonsofar dataframe (only need the last date) - df_seasonsofar = df_full.iloc[:,idx] + model_colns = df_full.columns - # check column name is defined as expected - # from epi start time to forecast start time - column_name = f"X{config['StartTimeShort']}_X{config['StartString']}0000" - assert df_seasonsofar.name == column_name + # convert units from ha_infected/ha_cell to ha_infected/ha_wheat - # save to csv - fn_seasonsofar = f"{epiconf['infectionRasterFileName']}{unit_description}_seasonsofar.csv" - df_seasonsofar.to_csv(fn_seasonsofar,header=True,index=True) + df_full = self.get_model_divided_by_host_fraction( + df_full, + config_epi['Host']['HostCSV'], + model_colns = model_colns) - # build weekahead dataframe and save to csv - df_fc_start = df_full.iloc[:,idx] - df_fc_start_name = df_fc_start.name.split('_')[-1] + # save to csv + outfile_hawheat = f"{epiconf['infectionRasterFileName']}{unit_description}_progression.csv" + df_full.to_csv(outfile_hawheat,header=True,index=True) - df_fc_end = df_full.iloc[:,-1] - df_fc_end_name = df_fc_end.name.split('_')[-1] + outfile_hawheat_final = f"{epiconf['infectionRasterFileName']}{unit_description}.csv" + df_full.iloc[:,-1].to_csv(outfile_hawheat_final,header=True,index=True) - df_weekahead = df_fc_end - df_fc_start + # determine date to cut with + # plus 1 minute so midnight is associated with preceding day + date_to_cut = datetime.datetime.strptime(config['StartString']+'0001','%Y%m%d%H%M') + dates_after_cut = df_full_dates >= date_to_cut + idx = argmax(dates_after_cut)-1 - # defined column name - fn_weekahead = f"{epiconf['infectionRasterFileName']}{unit_description}_weekahead.csv" - df_weekahead.name = '_'.join([df_fc_start_name,df_fc_end_name]) + if idx == -1: + # only working on forecast dates so no season so far, skip + continue - # save to csv - df_weekahead.to_csv(fn_weekahead,header=True,index=True) - - proc_out = {} - # Output files available for upload - proc_out['output'] = None - # Processing files available for clearing - proc_out['clearup'] = None + # build seasonsofar dataframe (only need the last date) + df_seasonsofar = df_full.iloc[:,idx] - return proc_out + # check column name is defined as expected + # from epi start time to forecast start time + column_name = f"X{config['StartTimeShort']}_X{config['StartString']}0000" + assert df_seasonsofar.name == column_name -def process_EWS_plotting_epi(jobPath,config): - '''Returns a list of output files for transfer.''' + # save to csv + fn_seasonsofar = f"{epiconf['infectionRasterFileName']}{unit_description}_seasonsofar.csv" + df_seasonsofar.to_csv(fn_seasonsofar,header=True,index=True) - logger.info('started process_EWS_plotting_epi()') + # build weekahead dataframe and save to csv + df_fc_start = df_full.iloc[:,idx] + df_fc_start_name = df_fc_start.name.split('_')[-1] - # initalise necessary variables from config + df_fc_end = df_full.iloc[:,-1] + df_fc_end_name = df_fc_end.name.split('_')[-1] - start_date, end_date = calc_epi_date_range(config['StartString'],config['Epidemiology']['CalculationSpanDays']) + df_weekahead = df_fc_end - df_fc_start + + # defined column name + fn_weekahead = 
f"{epiconf['infectionRasterFileName']}{unit_description}_weekahead.csv" + df_weekahead.name = '_'.join([df_fc_start_name,df_fc_end_name]) + + # save to csv + df_weekahead.to_csv(fn_weekahead,header=True,index=True) - start_string = start_date.strftime('%Y%m%d') - end_string = end_date.strftime('%Y%m%d') + proc_out = {} + # Output files available for upload + proc_out['output'] = None + # Processing files available for clearing + proc_out['clearup'] = None - epi_case_operational = config['Epidemiology']['EWS-Plotting']['EpiCase'] + return proc_out - if epi_case_operational == 'none': - logger.info('Config specifies not to call to EWS-Plotting') - return [] + def process_EWS_plotting_epi(self, jobPath,config): + '''Returns a list of output files for transfer.''' - diseases = config['Epidemiology']['DiseaseNames'] + self.logger.info('started process_EWS_plotting_epi()') - # initialise environment - sys_config = config['Epidemiology']['EWS-Plotting']['SysConfig'] + # initalise necessary variables from config - chart_config = config['Epidemiology']['EWS-Plotting']['ChartConfig'] + start_date, end_date = self.calc_epi_date_range(config['StartString'],config['Epidemiology']['CalculationSpanDays']) - # use the first matching epi formulation - # TODO: Is there a more efficient way to select? - epi_filename = [ce['infectionRasterFileName'] for ce in config['Epidemiology']['Epi'] if ce['model']==epi_case_operational][0] + start_string = start_date.strftime('%Y%m%d') + end_string = end_date.strftime('%Y%m%d') - dep_regionnames = ['SouthAsia','Ethiopia'] + epi_case_operational = config['Epidemiology']['EWS-Plotting']['EpiCase'] - # TODO get deposition_dir from config['Epidemiology']['Deposition']['PathTemplate'] - dep_regionname = 'Ethiopia' #SouthAsia + if epi_case_operational == 'none': + self.logger.info('Config specifies not to call to EWS-Plotting') + return [] - deposition_dir = f"{config['WorkspacePath']}DEPOSITION_{start_string}/WR_NAME_{dep_regionname}_{start_string}/" + diseases = config['Epidemiology']['DiseaseNames'] - # TODO: handle multiple diseases and regions in Processor as a loop, or in the config - deposition_disease_name = [disease_latin_name_dict[disease]+'_DEPOSITION' for disease in diseases][0] + # initialise environment + sys_config = config['Epidemiology']['EWS-Plotting']['SysConfig'] - ews_plot_dir = f"{jobPath}/plotting/" + chart_config = config['Epidemiology']['EWS-Plotting']['ChartConfig'] - Path(ews_plot_dir).mkdir(parents=True, exist_ok=True) + # use the first matching epi formulation + # TODO: Is there a more efficient way to select? 
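# A minimal sketch of one possible answer to the TODO above about selecting the operational epi
# formulation: instead of taking element [0] of a list comprehension, next() returns the first
# match and fails with a clear message if nothing matches. The epi_cases list here is made up.
epi_cases = [
    {'model': 'Env', 'infectionRasterFileName': 'infections_env'},
    {'model': 'MeanT', 'infectionRasterFileName': 'infections_meant'},
]
epi_case_operational = 'MeanT'

epi_filename = next(
    (ce['infectionRasterFileName'] for ce in epi_cases if ce['model'] == epi_case_operational),
    None)
if epi_filename is None:
    raise ValueError(f"No epi formulation matches EpiCase '{epi_case_operational}'")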
+ epi_filename = [ce['infectionRasterFileName'] for ce in config['Epidemiology']['Epi'] if ce['model']==epi_case_operational][0] - # loop over diseases - EWSPlottingOutputGlobs = [] - for disease in diseases: - disease_short = disease.lower().replace('rust','') + dep_regionnames = ['SouthAsia','Ethiopia'] + + # TODO get deposition_dir from config['Epidemiology']['Deposition']['PathTemplate'] + dep_regionname = 'Ethiopia' #SouthAsia + + deposition_dir = f"{config['WorkspacePath']}DEPOSITION_{start_string}/WR_NAME_{dep_regionname}_{start_string}/" - # a fudge, guess disease type - # because config['Epidemiology']['ProcessInJob'] handles disease loop internally - # assumes disease name is the last directory before the filename # TODO: handle multiple diseases and regions in Processor as a loop, or in the config - disease_to_drop = os.path.dirname(epi_filename).split('/')[-1].replace('Rust','') - disease_to_add = disease.replace('Rust','') - epi_filename = epi_filename.replace(disease_to_drop,disease_to_add) + deposition_disease_name = [disease_latin_name_dict[disease]+'_DEPOSITION' for disease in diseases][0] - map_title = "Integrated prediction of Wheat $\\bf{" + disease_to_add + "}$ Rust infection" - if 'PlottingRegionName' not in config['Epidemiology']['EWS-Plotting']: - plotting_region_name_lower = config['RegionName'].lower() - else: - plotting_region_name_lower = config['Epidemiology']['EWS-Plotting']['PlottingRegionName'].lower() + ews_plot_dir = f"{jobPath}/plotting/" + + Path(ews_plot_dir).mkdir(parents=True, exist_ok=True) + + # loop over diseases + EWSPlottingOutputGlobs = [] + for disease in diseases: + disease_short = disease.lower().replace('rust','') + + # a fudge, guess disease type + # because config['Epidemiology']['ProcessInJob'] handles disease loop internally + # assumes disease name is the last directory before the filename + # TODO: handle multiple diseases and regions in Processor as a loop, or in the config + disease_to_drop = os.path.dirname(epi_filename).split('/')[-1].replace('Rust','') + disease_to_add = disease.replace('Rust','') + epi_filename = epi_filename.replace(disease_to_drop,disease_to_add) + + map_title = "Integrated prediction of Wheat $\\bf{" + disease_to_add + "}$ Rust infection" + if 'PlottingRegionName' not in config['Epidemiology']['EWS-Plotting']: + plotting_region_name_lower = config['RegionName'].lower() + else: + plotting_region_name_lower = config['Epidemiology']['EWS-Plotting']['PlottingRegionName'].lower() - epi_seasonsofar_fn = epi_filename+'_per_ha_wheat_seasonsofar.csv' + epi_seasonsofar_fn = epi_filename+'_per_ha_wheat_seasonsofar.csv' - epi_seasonincforecast_fn = epi_filename+'_per_ha_wheat.csv' + epi_seasonincforecast_fn = epi_filename+'_per_ha_wheat.csv' - seasonsofar_run_config = config['Epidemiology']['EWS-Plotting'].get('RunConfig_seasonsofar',None) + seasonsofar_run_config = config['Epidemiology']['EWS-Plotting'].get('RunConfig_seasonsofar',None) - # only run season so far (i.e. historic dates) if they exist - if (seasonsofar_run_config is not None) & os.path.exists(epi_seasonsofar_fn): + # only run season so far (i.e. 
historic dates) if they exist + if (seasonsofar_run_config is not None) & os.path.exists(epi_seasonsofar_fn): - logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{seasonsofar_run_config}\n{chart_config}") + self.logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{seasonsofar_run_config}\n{chart_config}") - epi_processor_1 = EPIPostPostProcessor() - epi_processor_1.set_param_config_files(sys_params_file_arg=sys_config, + epi_processor_1 = EPIPostPostProcessor() + epi_processor_1.set_param_config_files(sys_params_file_arg=sys_config, + chart_params_file_arg=chart_config, + run_params_file_arg=seasonsofar_run_config, + epi_input_csv_arg=epi_seasonsofar_fn, + disease_type_arg=disease_short+'_seasontodate', + issue_date_arg=start_string, + output_dir_arg=ews_plot_dir, + wheat_sources_dir_arg=deposition_dir, + wheat_source_disease_name_arg=deposition_disease_name, + map_title_arg=map_title, + chart_area_prefix=plotting_region_name_lower) + epi_processor_1.process() + + # prepare command for seasonplusforecast + + run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonplusforecast'] + + self.logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}") + + epi_processor_2 = EPIPostPostProcessor() + epi_processor_2.set_param_config_files(sys_params_file_arg=sys_config, chart_params_file_arg=chart_config, - run_params_file_arg=seasonsofar_run_config, - epi_input_csv_arg=epi_seasonsofar_fn, - disease_type_arg=disease_short+'_seasontodate', + run_params_file_arg=run_config, + epi_input_csv_arg=epi_seasonincforecast_fn, # for seasonplusforecast + #epi_input_csv_arg=epi_filename+'_weekahead.csv', # for weekahead + disease_type_arg=disease_short+'_seasonincforecast', issue_date_arg=start_string, output_dir_arg=ews_plot_dir, wheat_sources_dir_arg=deposition_dir, wheat_source_disease_name_arg=deposition_disease_name, map_title_arg=map_title, chart_area_prefix=plotting_region_name_lower) - epi_processor_1.process() - - # prepare command for seasonplusforecast - - run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonplusforecast'] + epi_processor_2.process() - logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}") + # check the output + EWSPlottingOutputDir = f"{ews_plot_dir}/images/" + # TODO: Make this smarter, connected to the results of EWSPlottingEPIBase.plot_epi() + EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}infection_{plotting_region_name_lower}_*{disease_short}*.png"] - epi_processor_2 = EPIPostPostProcessor() - epi_processor_2.set_param_config_files(sys_params_file_arg=sys_config, - chart_params_file_arg=chart_config, - run_params_file_arg=run_config, - epi_input_csv_arg=epi_seasonincforecast_fn, # for seasonplusforecast - #epi_input_csv_arg=epi_filename+'_weekahead.csv', # for weekahead - disease_type_arg=disease_short+'_seasonincforecast', - issue_date_arg=start_string, - output_dir_arg=ews_plot_dir, - wheat_sources_dir_arg=deposition_dir, - wheat_source_disease_name_arg=deposition_disease_name, - map_title_arg=map_title, - chart_area_prefix=plotting_region_name_lower) - epi_processor_2.process() + EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) - # check the output - EWSPlottingOutputDir = f"{ews_plot_dir}/images/" - # TODO: Make this smarter, connected to the results of EWSPlottingEPIBase.plot_epi() - EWSPlottingOutputGlobs += 
[f"{EWSPlottingOutputDir}infection_{plotting_region_name_lower}_*{disease_short}*.png"] + # check there is some output from EWS-plotting + if not EWSPlottingOutputGlobs: + self.logger.error('EWS-Plotting did not produce any output') + raise RuntimeError - EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False) + # provide to list for transfer + EWSPlottingOutputs = [item for EWSPlottingOutput in EWSPlottingOutputGlobs for item in glob(EWSPlottingOutput)] - # check there is some output from EWS-plotting - if not EWSPlottingOutputGlobs: - logger.error('EWS-Plotting did not produce any output') - raise RuntimeError + return EWSPlottingOutputs - # provide to list for transfer - EWSPlottingOutputs = [item for EWSPlottingOutput in EWSPlottingOutputGlobs for item in glob(EWSPlottingOutput)] - return EWSPlottingOutputs +if __name__ == '__main__': + processor = ProcessorEpidemiology() + processor.run_processor("Epidemiology") \ No newline at end of file diff --git a/coordinator/ProcessorScraper.py b/coordinator/ProcessorScraper.py index cee27963b2fc62b71aeb6cb671547ac4493921a2..9f3aa70f781ecef0a1efb812656d5e35fdc888f8 100644 --- a/coordinator/ProcessorScraper.py +++ b/coordinator/ProcessorScraper.py @@ -22,422 +22,451 @@ import certifi from numpy import where from pandas import concat, DataFrame, read_csv, Series, set_option +from Processor import Processor # gitlab projects # TODO: Package these projects so they are robust for importing from flagdir import jobStatus # created by jws52 from ProcessorUtils import add_filters_to_sublogger -logger = logging.getLogger('Processor.Scraper') -add_filters_to_sublogger(logger) -# date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv -# # ODK v1.11.2: -# FMT = '%d-%b-%Y' -# FMT_SHORT = '%d-%b-%Y' -# ODK v1.18.0: -FMT = '%b %d, %Y %H:%m:%S %p' -FMT_SHORT = '%b %d, %Y' +class ProcessorScraper(Processor): -# url location of latest news report search -URL_DEFAULT = 'http://arrcc-viewer.herokuapp.com/assets/sample_data/data.zip' + """ LIFECYCLE FUNCTIONS INHERITED FROM PROCESSOR.PY """ -def get_news_reports_from_url(job_dir: str, url = URL_DEFAULT) -> None: - '''Downloads the news report data available on the ARRCC media report - dashboard, into the provided directory. - - Does not return anything''' + def process_pre_job(self, args): + return True + + def process_in_job(self, jobPath, status, configjson, component) -> object: + return self.process_in_job_media_scraper(jobPath, status, configjson, component) - assert os.path.exists(job_dir) + def process_post_job(self, jobPath, configjson): + pass - # Get the zip file from the url and immediately write a local copy - fn_zip = f"{job_dir}/data.zip" - with open(fn_zip,'wb') as zipfile: - zipfile.write(requests.get(url).content) + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Scraper') + add_filters_to_sublogger(logger) - # unzip it - dir_unzip = f"{job_dir}/data/" - Path(dir_unzip).mkdir(parents=True, exist_ok=False) - cmd_unzip = ['unzip',fn_zip,'-d',dir_unzip] - subprocess.run(cmd_unzip) + """ LIFECYCLE FUNCTIONS INHERITED FROM PROCESSOR.PY """ - return -def read_news_reports(job_dir: str) -> DataFrame: - '''Opens the news reports in the provided directory. 
+ # date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv + # # ODK v1.11.2: + # FMT = '%d-%b-%Y' + # FMT_SHORT = '%d-%b-%Y' + # ODK v1.18.0: + FMT = '%b %d, %Y %H:%m:%S %p' + FMT_SHORT = '%b %d, %Y' + + # url location of latest news report search + URL_DEFAULT = 'http://arrcc-viewer.herokuapp.com/assets/sample_data/data.zip' + + def get_news_reports_from_url(self, job_dir: str, url = URL_DEFAULT) -> None: + '''Downloads the news report data available on the ARRCC media report + dashboard, into the provided directory. + + Does not return anything''' + + assert os.path.exists(job_dir) + + # Get the zip file from the url and immediately write a local copy + fn_zip = f"{job_dir}/data.zip" + with open(fn_zip,'wb') as zipfile: + zipfile.write(requests.get(url).content) + + # unzip it + dir_unzip = f"{job_dir}/data/" + Path(dir_unzip).mkdir(parents=True, exist_ok=False) + cmd_unzip = ['unzip',fn_zip,'-d',dir_unzip] + subprocess.run(cmd_unzip) + + return + + def read_news_reports(self, job_dir: str) -> DataFrame: + '''Opens the news reports in the provided directory. + + Returns a pandas dataframe.''' + + fn = f"{job_dir}/data/NEWS REPORTS.csv" + + dateparse = lambda x: datetime.datetime.strptime(x, '%d-%m-%y') + + df = read_csv( + fn, + index_col=0, + header=0, + parse_dates=['Date'], + date_parser=dateparse) + + return df + + def estimate_rust( + self, + description: Series, + disease: str, + return_type: str) -> Series: + '''Works with pandas series''' + + # check for alternative naming + if disease in ['yellow','stripe']: + + any_disease = description.str.contains('yellow') | description.str.contains('stripe') + + else: + any_disease = description.str.contains(disease) + + return_dict = { + 'incidence':'medium', + 'severity':'30' + } + + prevalence = where(any_disease,return_dict[return_type],'na') + + return prevalence + + def guess_stage( + self, + date: Series, + country: Series) -> Series: + + #TODO: provide typical phenology dates per country + + # all of the country + # based on Moin's estimates from vegetative to ripening & maturing + # note his Start Date and End Dates are always 30 days apart + # so, this is sticking to the middle of the range + stage_start_dates_bangladesh = { + 'tillering':'5 Dec', # ~26 days { combined 30 days to complete + 'boot' :'31 Dec', # ~4 days { + 'heading' :'4 Jan', # 20 days + 'flowering':'24 Jan', # 10 days + 'milk' :'3 Feb', # 10 days + 'dough' :'13 Feb', # ~15 days { combined 25 days to complete + 'maturity' :'28 Mar', # ~10 days { + 'NA' :'9 Mar'} # total ~95 days + + # main season wheat in Terai + # based on Madan's estimates of min and max duration, taking the mean + # from vegetative to ripening & maturing + stage_start_dates_nepal_terai = { + 'tillering':'24 Dec', # ~ 56 days { combined 66 days to complete + 'boot' :'18 Feb', # ~10 days { + 'heading' :'28 Feb', # 10 days + 'flowering':'9 Mar', # 5 days + 'milk' :'14 Mar', # ~12 days { combined 27 days to complete + 'dough' :'26 Mar', # ~15 days { + 'maturity' :'10 Apr', # 10 days + 'NA' :'20 Apr'} # total ~118 days + + # TODO: Less important: implement main season wheat in mid-hills from Madan's estimates + # and determine how to distinguish Terai from mid-hills in news report + stage_start_dates_nepal_midhills = { + 'tillering':'', + 'boot' :'', + 'heading' :'', + 'flowering':'', + 'milk' :'', + 'dough' :'', + 'maturity' :'', + 'NA' :''} + + + # mainly for the north + # assume the same as Nepal Terai as for last year. 
+ # TODO: get estimates specific to Pakistan + stage_start_dates_pakistan = stage_start_dates_nepal_terai + + # mainly for Haryana district + # assume the same as Nepal Terai as for last year. + # TODO: get estimates specific to India NW districts + stage_start_dates_india = stage_start_dates_nepal_terai + + stage_start_dates_by_country = { + 'Bangladesh' : stage_start_dates_bangladesh, + 'Nepal' : stage_start_dates_nepal_terai, + 'India' : stage_start_dates_india, + 'Pakistan' : stage_start_dates_pakistan} + + df = DataFrame({'date':date,'country':country}) + dates_by_entry = country.apply(lambda val: stage_start_dates_by_country[val]) + df2 = DataFrame.from_records(dates_by_entry.values) + df2.index = df.index + df3 = concat([df,df2],axis='columns') + + # handle Dec-Jan crossover (is there a neater way of doing this?) + + df3['year'] = df3['date'].apply(lambda di: di.year) + df3['lastyear'] = df3['date'].apply(lambda di: di.year-1) + + stages_thisyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['year']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()} + stages_lastyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['lastyear']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()} + df3_thisyear = DataFrame.from_records(stages_thisyear) + df3_lastyear = DataFrame.from_records(stages_lastyear) + + # Use knowledge of order of phenological stages to determine which dates are from last year + stage_order = ['tillering','boot','heading','flowering','milk','dough','maturity','NA'] + + df4 = df3_thisyear[stage_order] + + # check each stage in turn + for i,stage in enumerate(stage_order[:-1]): + stage_ip1 = stage_order[i+1] + # if the earlier stage has a later date, use last year's dates + df4[stage] = where(df4[stage]<df4[stage_ip1],df4[stage],df3_lastyear[stage]) + + # find out which stages start earlier than the survey date + date_compare = df4.le(date,axis='rows') + + # get name of latest valid stage + stage_series = date_compare.apply(lambda row: row.iloc[::-1].idxmax(),axis='columns') + + return stage_series + + def reformat_news_reports(self, df_in: DataFrame) -> DataFrame: + '''Reformats a dataframe of news reports to match BGRI wheat rust survey + data entries (making assumptions where necessary). 
First checks input is as + expected.''' + + #Check contents are as expected + cols = df_in.columns + expected_cols = ['Lon','Lat','Date','Type','Link','Country','State','District'] + + for expected_col in expected_cols: + assert expected_col in cols + + # re-order dataframe, with newest entry last + df = df_in.copy() + df.sort_values('Date',ascending=True,inplace=True) + + assumption_dict = { + 'field_area' : 1} + + output_dict = { + 'SubmissionDate' : df['Date'], + 'start' : df['Date'], + 'end' : df['Date'], + 'today' : df['Date'], + 'deviceid' : 999, + 'subscriberid' : 999, + 'imei' : 999, + 'phonenumber' : 999, + 'username' : 999, + 'country_list' : df['Country'], + 'blast_rust' : 'Rust', + 'surveyor_name' : 'News report', + 'institution' : 'na', + 'mobile_num' : 999, + 'site_information-survey_site' : 'Farmer field', + 'site_information-crop' : 'NA', + 'site_information-field_area' : assumption_dict['field_area'], + 'site_information-unit_m2' : 999, + 'site_information-field_size' : 999, + 'site_information-variety' : 'NA', + 'site_information-growth_stage' : self.guess_stage(df['Date'],df['Country']), + 'survey_infromation-location_name' : 999, + 'survey_infromation-location_blast' : 999, + 'survey_infromation-sampColor' : 999, + 'survey_infromation-dateRange' : 999, + 'survey_infromation-fieldNumber' : 999, + 'survey_infromation-diseaseIncidencePercentage' : 999, + 'survey_infromation-severityPercentage' : 999, + 'survey_infromation-survey_date' : df['Date'].apply(lambda cell: cell.strftime(self.FMT_SHORT)), + 'survey_infromation-site_name' : '"'+df['District'].astype(str)+', '+df['State'].astype(str)+', '+df['Country'].astype(str)+'"', + 'survey_infromation-location-Latitude' : df['Lat'], + 'survey_infromation-location-Longitude' : df['Lon'], + 'survey_infromation-location-Altitude' : -999, + 'survey_infromation-location-Accuracy' : -999, + 'stem_rust-stemrust_incidence' : self.estimate_rust(df['Type'],'stem','incidence'), + 'stem_rust-Stemrust_severity' : self.estimate_rust(df['Type'],'stem','severity'), + 'stem_rust-stemrust_host_plant_reaction' : 'na', + 'leaf_rust-leafrust_incidence' : self.estimate_rust(df['Type'],'leaf','incidence'), + 'leaf_rust-leafrust_severity' : self.estimate_rust(df['Type'],'leaf','severity'), + 'leaf_rust-leafrust_host_plant_reaction' : 'na', + 'yellow_rust-yellowrust_incidence' : self.estimate_rust(df['Type'],'yellow','incidence'), + 'yellow_rust-yellowrust_severity' : self.estimate_rust(df['Type'],'yellow','severity'), + 'yellow_rust-yellowrust_host_plant_reaction' : 'na', + 'septoria-septoria_incidence' : 'na', + 'septoria-septoria_severity' : 'na', + 'other_diseases_group-other_diseases' : -999, + 'score_diseases_count' : -999, + 'SET-OF-score_diseases' : -999, + 'samples_collected' : -999, + 'samples_type' : -999, + 'sample_size-number_stemrust_live' : -999, + 'sample_size-number_stemrust_dead_dna' : -999, + 'sample_size-number_yellowrust_live' : -999, + 'sample_size-number_yellowrust_dead' : -999, + 'sample_size-number_leafrust_live' : -999, + 'sample_size-using_barcode' : -999, + 'live_stemrust_samples_count' : -999, + 'SET-OF-live_stemrust_samples' : -999, + 'dead_stemrust_samples_count' : -999, + 'SET-OF-dead_stemrust_samples' : -999, + 'live_yellowrust_samples_count' : -999, + 'SET-OF-live_yellowrust_samples' : -999, + 'dead_yellowrust_samples_count' : -999, + 'SET-OF-dead_yellowrust_samples' : -999, + 'live_leafrust_samples_count' : -999, + 'SET-OF-live_leafrust_samples' : -999, + 'comment' : df['Link'], + 'meta-instanceID' : -999, + 
'meta-instanceName' : -999, + 'KEY' : -999} + + df_out = DataFrame(output_dict) + + return df_out + + EMAIL_MSG = """Subject: ARRCC latest scraped media reports + + Here is an update of what is on the ARRCC media scraper platform. - Returns a pandas dataframe.''' - - fn = f"{job_dir}/data/NEWS REPORTS.csv" - - dateparse = lambda x: datetime.datetime.strptime(x, '%d-%m-%y') - - df = read_csv( - fn, - index_col=0, - header=0, - parse_dates=['Date'], - date_parser=dateparse) - - return df - -def estimate_rust( - description: Series, - disease: str, - return_type: str) -> Series: - '''Works with pandas series''' - - # check for alternative naming - if disease in ['yellow','stripe']: - - any_disease = description.str.contains('yellow') | description.str.contains('stripe') - - else: - any_disease = description.str.contains(disease) - - return_dict = { - 'incidence':'medium', - 'severity':'30' - } - - prevalence = where(any_disease,return_dict[return_type],'na') - - return prevalence - -def guess_stage( - date: Series, - country: Series) -> Series: - - #TODO: provide typical phenology dates per country - - # all of the country - # based on Moin's estimates from vegetative to ripening & maturing - # note his Start Date and End Dates are always 30 days apart - # so, this is sticking to the middle of the range - stage_start_dates_bangladesh = { - 'tillering':'5 Dec', # ~26 days { combined 30 days to complete - 'boot' :'31 Dec', # ~4 days { - 'heading' :'4 Jan', # 20 days - 'flowering':'24 Jan', # 10 days - 'milk' :'3 Feb', # 10 days - 'dough' :'13 Feb', # ~15 days { combined 25 days to complete - 'maturity' :'28 Mar', # ~10 days { - 'NA' :'9 Mar'} # total ~95 days + The latest entry is below. The full set for this season, auto-formatted for input to NAME source calcs in a basic way, is available at: + {0} - # main season wheat in Terai - # based on Madan's estimates of min and max duration, taking the mean - # from vegetative to ripening & maturing - stage_start_dates_nepal_terai = { - 'tillering':'24 Dec', # ~ 56 days { combined 66 days to complete - 'boot' :'18 Feb', # ~10 days { - 'heading' :'28 Feb', # 10 days - 'flowering':'9 Mar', # 5 days - 'milk' :'14 Mar', # ~12 days { combined 27 days to complete - 'dough' :'26 Mar', # ~15 days { - 'maturity' :'10 Apr', # 10 days - 'NA' :'20 Apr'} # total ~118 days - - # TODO: Less important: implement main season wheat in mid-hills from Madan's estimates - # and determine how to distinguish Terai from mid-hills in news report - stage_start_dates_nepal_midhills = { - 'tillering':'', - 'boot' :'', - 'heading' :'', - 'flowering':'', - 'milk' :'', - 'dough' :'', - 'maturity' :'', - 'NA' :''} - - - # mainly for the north - # assume the same as Nepal Terai as for last year. - # TODO: get estimates specific to Pakistan - stage_start_dates_pakistan = stage_start_dates_nepal_terai - - # mainly for Haryana district - # assume the same as Nepal Terai as for last year. - # TODO: get estimates specific to India NW districts - stage_start_dates_india = stage_start_dates_nepal_terai - - stage_start_dates_by_country = { - 'Bangladesh' : stage_start_dates_bangladesh, - 'Nepal' : stage_start_dates_nepal_terai, - 'India' : stage_start_dates_india, - 'Pakistan' : stage_start_dates_pakistan} + Check all new webpages for validity and extra info (e.g. 
field area, variety), then edit and copy any relevant entries to: + /storage/app/EWS_prod/regions/SouthAsia/resources/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv - df = DataFrame({'date':date,'country':country}) - dates_by_entry = country.apply(lambda val: stage_start_dates_by_country[val]) - df2 = DataFrame.from_records(dates_by_entry.values) - df2.index = df.index - df3 = concat([df,df2],axis='columns') + Then, check that the survey data processor succeeds with these new entries. + + Thanks, Jake + + {1} + """ - # handle Dec-Jan crossover (is there a neater way of doing this?) + def send_email( + self, + output_fn: str, + data_str: str, + email_credential_fn: str, + ) -> None: - df3['year'] = df3['date'].apply(lambda di: di.year) - df3['lastyear'] = df3['date'].apply(lambda di: di.year-1) - stages_thisyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['year']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()} - stages_lastyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['lastyear']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()} - df3_thisyear = DataFrame.from_records(stages_thisyear) - df3_lastyear = DataFrame.from_records(stages_lastyear) - - # Use knowledge of order of phenological stages to determine which dates are from last year - stage_order = ['tillering','boot','heading','flowering','milk','dough','maturity','NA'] + msg = self.EMAIL_MSG.format(output_fn,data_str) - df4 = df3_thisyear[stage_order] + with open(email_credential_fn,'r') as f: + gmail_config = json.load(f) - # check each stage in turn - for i,stage in enumerate(stage_order[:-1]): - stage_ip1 = stage_order[i+1] - # if the earlier stage has a later date, use last year's dates - df4[stage] = where(df4[stage]<df4[stage_ip1],df4[stage],df3_lastyear[stage]) + maintainers = gmail_config['toaddrs'] - # find out which stages start earlier than the survey date - date_compare = df4.le(date,axis='rows') - - # get name of latest valid stage - stage_series = date_compare.apply(lambda row: row.iloc[::-1].idxmax(),axis='columns') - - return stage_series - -def reformat_news_reports(df_in: DataFrame) -> DataFrame: - '''Reformats a dataframe of news reports to match BGRI wheat rust survey - data entries (making assumptions where necessary). 
First checks input is as - expected.''' - - #Check contents are as expected - cols = df_in.columns - expected_cols = ['Lon','Lat','Date','Type','Link','Country','State','District'] - - for expected_col in expected_cols: - assert expected_col in cols - - # re-order dataframe, with newest entry last - df = df_in.copy() - df.sort_values('Date',ascending=True,inplace=True) - - assumption_dict = { - 'field_area' : 1} - - output_dict = { - 'SubmissionDate' : df['Date'], - 'start' : df['Date'], - 'end' : df['Date'], - 'today' : df['Date'], - 'deviceid' : 999, - 'subscriberid' : 999, - 'imei' : 999, - 'phonenumber' : 999, - 'username' : 999, - 'country_list' : df['Country'], - 'blast_rust' : 'Rust', - 'surveyor_name' : 'News report', - 'institution' : 'na', - 'mobile_num' : 999, - 'site_information-survey_site' : 'Farmer field', - 'site_information-crop' : 'NA', - 'site_information-field_area' : assumption_dict['field_area'], - 'site_information-unit_m2' : 999, - 'site_information-field_size' : 999, - 'site_information-variety' : 'NA', - 'site_information-growth_stage' : guess_stage(df['Date'],df['Country']), - 'survey_infromation-location_name' : 999, - 'survey_infromation-location_blast' : 999, - 'survey_infromation-sampColor' : 999, - 'survey_infromation-dateRange' : 999, - 'survey_infromation-fieldNumber' : 999, - 'survey_infromation-diseaseIncidencePercentage' : 999, - 'survey_infromation-severityPercentage' : 999, - 'survey_infromation-survey_date' : df['Date'].apply(lambda cell: cell.strftime(FMT_SHORT)), - 'survey_infromation-site_name' : '"'+df['District'].astype(str)+', '+df['State'].astype(str)+', '+df['Country'].astype(str)+'"', - 'survey_infromation-location-Latitude' : df['Lat'], - 'survey_infromation-location-Longitude' : df['Lon'], - 'survey_infromation-location-Altitude' : -999, - 'survey_infromation-location-Accuracy' : -999, - 'stem_rust-stemrust_incidence' : estimate_rust(df['Type'],'stem','incidence'), - 'stem_rust-Stemrust_severity' : estimate_rust(df['Type'],'stem','severity'), - 'stem_rust-stemrust_host_plant_reaction' : 'na', - 'leaf_rust-leafrust_incidence' : estimate_rust(df['Type'],'leaf','incidence'), - 'leaf_rust-leafrust_severity' : estimate_rust(df['Type'],'leaf','severity'), - 'leaf_rust-leafrust_host_plant_reaction' : 'na', - 'yellow_rust-yellowrust_incidence' : estimate_rust(df['Type'],'yellow','incidence'), - 'yellow_rust-yellowrust_severity' : estimate_rust(df['Type'],'yellow','severity'), - 'yellow_rust-yellowrust_host_plant_reaction' : 'na', - 'septoria-septoria_incidence' : 'na', - 'septoria-septoria_severity' : 'na', - 'other_diseases_group-other_diseases' : -999, - 'score_diseases_count' : -999, - 'SET-OF-score_diseases' : -999, - 'samples_collected' : -999, - 'samples_type' : -999, - 'sample_size-number_stemrust_live' : -999, - 'sample_size-number_stemrust_dead_dna' : -999, - 'sample_size-number_yellowrust_live' : -999, - 'sample_size-number_yellowrust_dead' : -999, - 'sample_size-number_leafrust_live' : -999, - 'sample_size-using_barcode' : -999, - 'live_stemrust_samples_count' : -999, - 'SET-OF-live_stemrust_samples' : -999, - 'dead_stemrust_samples_count' : -999, - 'SET-OF-dead_stemrust_samples' : -999, - 'live_yellowrust_samples_count' : -999, - 'SET-OF-live_yellowrust_samples' : -999, - 'dead_yellowrust_samples_count' : -999, - 'SET-OF-dead_yellowrust_samples' : -999, - 'live_leafrust_samples_count' : -999, - 'SET-OF-live_leafrust_samples' : -999, - 'comment' : df['Link'], - 'meta-instanceID' : -999, - 'meta-instanceName' : -999, - 'KEY' : -999} - - 
df_out = DataFrame(output_dict) - - return df_out - -EMAIL_MSG = """Subject: ARRCC latest scraped media reports - -Here is an update of what is on the ARRCC media scraper platform. - -The latest entry is below. The full set for this season, auto-formatted for input to NAME source calcs in a basic way, is available at: -{0} - -Check all new webpages for validity and extra info (e.g. field area, variety), then edit and copy any relevant entries to: -/storage/app/EWS_prod/regions/SouthAsia/resources/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv - -Then, check that the survey data processor succeeds with these new entries. - -Thanks, Jake - -{1} -""" - -def send_email( - output_fn: str, - data_str: str, - email_credential_fn: str, - ) -> None: - - - msg = EMAIL_MSG.format(output_fn,data_str) - - with open(email_credential_fn,'r') as f: - gmail_config = json.load(f) - - maintainers = gmail_config['toaddrs'] - - # Create a secure SSL context - context = ssl.create_default_context(cafile=certifi.where()) - - # It is indicated that gmail requires port 465 for SMTP_SSL, otherwise port - # 587 with .starttls() from - # https://realpython.com/python-send-email/#sending-a-plain-text-email I - # think port 587 is meant to make sense for the typical python logging - # smtphandler but that doesn't apply here - port = 465 # gmail_config['port'] - - with smtplib.SMTP_SSL(gmail_config['host'], port, context=context) as server: - - server.login(gmail_config['user'], gmail_config['pass']) - - server.sendmail(gmail_config['user'], maintainers, msg) - - logger.info('Message sent!') - - return - -def process_in_job_media_scraper( - jobPath: str, - status: jobStatus, - config: dict, - component: str = 'Scraper' - ) -> None: - """ - 1) Get a latest copy of the news report data from dashboard URL. - 2) TODO: Reformat to match BGRI wheat rust survey data entries . - 3) Filter to latest news reports. - 4) Output to csv. - 5) email any new reports to maintainers, so they can copy into - SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate. 
- """ - - config_scraper = config['Scraper'].copy() + # Create a secure SSL context + context = ssl.create_default_context(cafile=certifi.where()) - dateString = config['StartString'] + # It is indicated that gmail requires port 465 for SMTP_SSL, otherwise port + # 587 with .starttls() from + # https://realpython.com/python-send-email/#sending-a-plain-text-email I + # think port 587 is meant to make sense for the typical python logging + # smtphandler but that doesn't apply here + port = 465 # gmail_config['port'] - logger.info("1) Getting a latest copy of the news report data from dashboard URL") - - url = config['Scraper']['URL'] + with smtplib.SMTP_SSL(gmail_config['host'], port, context=context) as server: - get_news_reports_from_url(jobPath,url) + server.login(gmail_config['user'], gmail_config['pass']) - reports_in = read_news_reports(jobPath) + server.sendmail(gmail_config['user'], maintainers, msg) - # TODO: Reformat - logger.info("2) Reformat to match BGRI wheat rust survey data entries") - - # (making assumptions where necessary) - reports = reformat_news_reports(reports_in) + logger.info('Message sent!') - logger.info("3) Filter to latest news reports") + return - this_season_starts_str = config['Scraper']['seasonStartString'] # '20201201' - this_season_starts = datetime.datetime.strptime(this_season_starts_str,'%Y%m%d') + def process_in_job_media_scraper( + self, + jobPath: str, + status: jobStatus, + config: dict, + component: str = 'Scraper' + ) -> None: + """ + 1) Get a latest copy of the news report data from dashboard URL. + 2) TODO: Reformat to match BGRI wheat rust survey data entries . + 3) Filter to latest news reports. + 4) Output to csv. + 5) email any new reports to maintainers, so they can copy into + SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate. 
+ """ - latest_reports = reports[reports['SubmissionDate']>=this_season_starts] + config_scraper = config['Scraper'].copy() - # TODO: Low priority: Determine differences from last reformatted set of news reports + dateString = config['StartString'] - logger.info("4) Output to csv") + logger.info("1) Getting a latest copy of the news report data from dashboard URL") - output_fn = f"{jobPath}/latest_reports_as_proxy_surveys.csv" + url = config['Scraper']['URL'] - # date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv - latest_reports.to_csv( - output_fn, - index=False, - date_format=FMT) - - is_sending_email = config['Scraper'].get('SendEmail',True) - - if is_sending_email == True: - - logger.info("5) email any new reports to maintainers, so they can copy into") - logger.info("SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate") - - selected_columns = [ - 'SubmissionDate', - 'site_information-field_area', - 'site_information-growth_stage', - 'survey_infromation-site_name', - 'country_list', - 'stem_rust-stemrust_incidence', - 'stem_rust-Stemrust_severity', - 'yellow_rust-yellowrust_incidence', - 'yellow_rust-yellowrust_severity', - 'leaf_rust-leafrust_incidence', - 'leaf_rust-leafrust_severity', - 'comment'] - - # remove pandas display limitation, so full web address is shown from comment - set_option('display.max_colwidth', None) - latest_report_selection = latest_reports.iloc[-1,:].loc[selected_columns].__str__() - - # get the email credentials file path from the environment variables - assert 'EMAIL_CRED' in os.environ - email_credential_fn = os.environ['EMAIL_CRED'] - assert os.path.exists(email_credential_fn) - - send_email( - output_fn, - latest_report_selection, - email_credential_fn = email_credential_fn) + self.get_news_reports_from_url(jobPath,url) + + reports_in = self.read_news_reports(jobPath) + + # TODO: Reformat + logger.info("2) Reformat to match BGRI wheat rust survey data entries") + + # (making assumptions where necessary) + reports = self.reformat_news_reports(reports_in) - proc_out = {} - # Output files available for upload - proc_out['output'] = None - # Processing files available for clearing - proc_out['clearup'] = None + logger.info("3) Filter to latest news reports") - return proc_out + this_season_starts_str = config['Scraper']['seasonStartString'] # '20201201' + this_season_starts = datetime.datetime.strptime(this_season_starts_str,'%Y%m%d') + + latest_reports = reports[reports['SubmissionDate']>=this_season_starts] + + # TODO: Low priority: Determine differences from last reformatted set of news reports + + logger.info("4) Output to csv") + + output_fn = f"{jobPath}/latest_reports_as_proxy_surveys.csv" + + # date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv + latest_reports.to_csv( + output_fn, + index=False, + date_format=self.FMT) + + is_sending_email = config['Scraper'].get('SendEmail',True) + + if is_sending_email == True: + + logger.info("5) email any new reports to maintainers, so they can copy into") + logger.info("SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate") + + selected_columns = [ + 'SubmissionDate', + 'site_information-field_area', + 'site_information-growth_stage', + 'survey_infromation-site_name', + 'country_list', + 'stem_rust-stemrust_incidence', + 'stem_rust-Stemrust_severity', + 'yellow_rust-yellowrust_incidence', + 'yellow_rust-yellowrust_severity', + 'leaf_rust-leafrust_incidence', + 'leaf_rust-leafrust_severity', + 'comment'] + + # remove pandas 
display limitation, so full web address is shown from comment + set_option('display.max_colwidth', None) + latest_report_selection = latest_reports.iloc[-1,:].loc[selected_columns].__str__() + + # get the email credentials file path from the environment variables + assert 'EMAIL_CRED' in os.environ + email_credential_fn = os.environ['EMAIL_CRED'] + assert os.path.exists(email_credential_fn) + + self.send_email( + output_fn, + latest_report_selection, + email_credential_fn = email_credential_fn) + + proc_out = {} + # Output files available for upload + proc_out['output'] = None + # Processing files available for clearing + proc_out['clearup'] = None + + return proc_out + + +if __name__ == '__main__': + processor = ProcessorScraper() + processor.run_processor("Scraper") diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py index 2f2e9975493745b8fc7b6ac841218d92a9b007bc..33bbd9edbaa83381f05bf8569e69e79458e4b06e 100644 --- a/coordinator/ProcessorSurveys.py +++ b/coordinator/ProcessorSurveys.py @@ -29,6 +29,7 @@ from numpy import any as np_any from shutil import copyfile from pandas import read_csv, concat +from Processor import Processor from source_gen.clustering import run_case from ProcessorSurveysODK import get_ODK_form_as_csv @@ -43,401 +44,423 @@ from ProcessorUtils import ( add_filters_to_sublogger, ) -logger = logging.getLogger('Processor.Surveys') -add_filters_to_sublogger(logger) -GET_FORM_AS_CSV_DICT = { - 'ODK' : get_ODK_form_as_csv, - 'kobotoolbox' : get_kobotoolbox_form_as_csv, - 'WRSIS' : get_WRSIS_form_as_csv, - 'WRT' : get_WRT_form_as_csv, - 'ODKv2' : get_ODKv2_form_as_csv, - 'newODK' : get_newODK_form_as_csv, -} +class ProcessorSurveys(Processor): -def process_pre_job_survey(input_args): - '''Returns a boolean as to whether the job is ready for full processing.''' - logger.info('started process_pre_job_survey(), nothing to do') + def process_pre_job(self, args): + return self.process_pre_job_survey(args) - return True -def process_in_job_survey(jobPath,status,config,component): - logger.info('started process_in_job_survey()') + def process_in_job(self, jobPath, status, configjson, component) -> object: + return self.process_in_job_survey(jobPath, status, configjson, component) - logger.debug('Performing download(s) from ODK server') - credentials_filename = config['Survey']['ServerCredentialsFile'] - with open(credentials_filename) as credentials_file: + def process_post_job(self, jobPath, configjson): + return self.process_EWS_plotting_survey(jobPath, configjson) - cred: dict = json.load(credentials_file) - assert 'forms' in cred.keys() + def __init__(self) -> None: + super().__init__() + logger = logging.getLogger('Processor.Surveys') + add_filters_to_sublogger(logger) - csv_filenames = {} - for form in cred['forms']: + self.GET_FORM_AS_CSV_DICT = { + 'ODK': get_ODK_form_as_csv, + 'kobotoolbox': get_kobotoolbox_form_as_csv, + 'WRSIS': get_WRSIS_form_as_csv, + 'WRT': get_WRT_form_as_csv, + 'ODKv2': get_ODKv2_form_as_csv, + 'newODK': get_newODK_form_as_csv, + } - logger.debug(f"Starting to download {form['form_id']}") + def process_pre_job_survey(self, input_args): + '''Returns a boolean as to whether the job is ready for full processing.''' + self.logger.info('started process_pre_job_survey(), nothing to do') - assert form['type'] in GET_FORM_AS_CSV_DICT + return True - func_get_form_as_csv = GET_FORM_AS_CSV_DICT[form['type']] + def process_in_job_survey(self, jobPath,status,config,component): + self.logger.info('started process_in_job_survey()') - 
csv_filename = func_get_form_as_csv(form, jobPath, config, status) + self.logger.debug('Performing download(s) from ODK server') - csv_filenames[form['form_id']] = csv_filename + credentials_filename = config['Survey']['ServerCredentialsFile'] + with open(credentials_filename) as credentials_file: - # load each file of surveys as a dataframe - forms = {} - for form_name,form_fn in csv_filenames.items(): + cred: dict = json.load(credentials_file) - # some define column types, hardwired for now - col_types = {'comment':'str','KEY':'str'} + assert 'forms' in cred.keys() - form_df = read_csv(form_fn,dtype=col_types) + csv_filenames = {} + for form in cred['forms']: - forms[form_name] = form_df + self.logger.debug(f"Starting to download {form['form_id']}") - # create some standard dataframe modification functions - def add_column(df,coln,value): - df[coln]=value - return + assert form['type'] in self.GET_FORM_AS_CSV_DICT - def remove_column(df,coln,value): - del df[coln] - return + func_get_form_as_csv = self.GET_FORM_AS_CSV_DICT[form['type']] - def replace_column(df,coln,value): - df[coln]=value - return + csv_filename = func_get_form_as_csv(form, jobPath, config, status) - def filter_by_column(df,coln,value): - # CAUTION: This requires surveyor to provide the correct country - df.drop(df.loc[df[coln]!=value].index,inplace=True) - #TODO : for Kenya data, provide a coordinate-based filter - return + csv_filenames[form['form_id']] = csv_filename - def filter_by_list(df,coln,values): - # CAUTION: This requires surveyor to provide the correct list of countries - df.drop(df.loc[~df[coln].isin(values)].index,inplace=True) - return + # load each file of surveys as a dataframe + forms = {} + for form_name,form_fn in csv_filenames.items(): - func_types = { - 'add': add_column, - 'remove' : remove_column, - 'replace' : replace_column, - 'filter' : filter_by_column, - 'filter_by_list' : filter_by_list - } + # some define column types, hardwired for now + col_types = {'comment':'str','KEY':'str'} - # simple format alignment using edits on config - # (should this need to be much more sophisticated, reconsider the workflow) - if 'FormEdits' in config['Survey']: + form_df = read_csv(form_fn,dtype=col_types) - form_edits = config['Survey']['FormEdits'] + forms[form_name] = form_df - # loop over each form - for form_name, edits in form_edits.items(): + # create some standard dataframe modification functions + def add_column(df,coln,value): + df[coln]=value + return - form_df = forms[form_name] + def remove_column(df,coln,value): + del df[coln] + return - # loop over each type of edit - for func_type, columns in edits.items(): + def replace_column(df,coln,value): + df[coln]=value + return - # check the function is available - assert func_type in func_types + def filter_by_column(df,coln,value): + # CAUTION: This requires surveyor to provide the correct country + df.drop(df.loc[df[coln]!=value].index,inplace=True) + #TODO : for Kenya data, provide a coordinate-based filter + return - # loop over each column to modify - for coln,val in columns.items(): + def filter_by_list(df,coln,values): + # CAUTION: This requires surveyor to provide the correct list of countries + df.drop(df.loc[~df[coln].isin(values)].index,inplace=True) + return - # apply the edit - func_types[func_type](form_df,coln,val) + func_types = { + 'add': add_column, + 'remove' : remove_column, + 'replace' : replace_column, + 'filter' : filter_by_column, + 'filter_by_list' : filter_by_list + } - # Merge additional SurveyData files and rearrange 
columns to be consistent - # Assumes that the same columns are present in all forms - # and that the first form is the standard + # simple format alignment using edits on config + # (should this need to be much more sophisticated, reconsider the workflow) + if 'FormEdits' in config['Survey']: - first=True - for dfi in forms.values(): + form_edits = config['Survey']['FormEdits'] - if first: - standard_columns = dfi.columns.tolist() - dfm = dfi + # loop over each form + for form_name, edits in form_edits.items(): - logger.debug(f"First processed form contains {dfm.shape[0]} records") + form_df = forms[form_name] - first=False - continue + # loop over each type of edit + for func_type, columns in edits.items(): - # re-order columns to match first case (presumed standard format) - # and fill missing columns with empty strings - dfi = dfi.reindex(standard_columns,fill_value='',axis='columns') + # check the function is available + assert func_type in func_types - logger.debug(f"Next processed form contains {dfi.shape[0]} records") + # loop over each column to modify + for coln,val in columns.items(): - dfm = concat([dfm,dfi],axis='rows') + # apply the edit + func_types[func_type](form_df,coln,val) - # save the result - Export_csv_path = f"{jobPath}/ExportCSV/" - Path(Export_csv_path).mkdir(parents = True, exist_ok = True) - forms_fn = f"{Export_csv_path}/Merged_SurveyData.csv" - dfm.to_csv(forms_fn,index=False,quoting=csv.QUOTE_MINIMAL) + # Merge additional SurveyData files and rearrange columns to be consistent + # Assumes that the same columns are present in all forms + # and that the first form is the standard - logger.debug(f"Preparing to apply removals and additions to survey data") + first=True + for dfi in forms.values(): - processed_surveys_filepath = f"{Export_csv_path}/Processed_SurveyData.csv" + if first: + standard_columns = dfi.columns.tolist() + dfm = dfi - survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv" - survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv" - - # perform here in python, using the 'KEY' column - # check the key column is unique, if not raise a warning and remove duplicates - - if dfm['KEY'].unique().size != dfm['KEY'].size: - status.reset('WARNING') - logger.warning(f"KEY column is not unique, removing duplicates") - # count the number of duplicates - n_duplicates = dfm.shape[0] - dfm['KEY'].unique().size - # drop the duplicates - dfm = dfm.drop_duplicates(keep='first') - logger.warning(f"Removed {n_duplicates} duplicates") + self.logger.debug(f"First processed form contains {dfm.shape[0]} records") - df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str') - keys_to_rm = df_rm['KEY'] + first=False + continue - # check that all of the keys to remove exist in the original data - rm_keys_found = df_rm['KEY'].isin(dfm['KEY']) - n_rm_keys_found = rm_keys_found.sum() - n_rm_keys = rm_keys_found.size - if not np_all(rm_keys_found): - # this might happen if the run date is in the past - logger.warning(f"Only found {n_rm_keys_found} of {n_rm_keys} survey errors to remove") + # re-order columns to match first case (presumed standard format) + # and fill missing columns with empty strings + dfi = dfi.reindex(standard_columns,fill_value='',axis='columns') - rm_keys_not_found = df_rm[~rm_keys_found] - logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}") + self.logger.debug(f"Next processed form contains 
{dfi.shape[0]} records") - logger.debug(f"Type of keys that can be found include:\n{dfm['KEY'].dtype}") + dfm = concat([dfm,dfi],axis='rows') - dfm_short_keys = [val for val in dfm['KEY'].values if len(str(val)) <10] - logger.debug(f"Keys that can be found include:\n{dfm_short_keys}") + # save the result + Export_csv_path = f"{jobPath}/ExportCSV/" + Path(Export_csv_path).mkdir(parents = True, exist_ok = True) + forms_fn = f"{Export_csv_path}/Merged_SurveyData.csv" + dfm.to_csv(forms_fn,index=False,quoting=csv.QUOTE_MINIMAL) - # identify which surveys to remove - idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values) + self.logger.debug(f"Preparing to apply removals and additions to survey data") - #drop them in-place - dfm = dfm[~idx_to_rm] - logger.info(f"Removed {n_rm_keys_found} erroneous surveys") + processed_surveys_filepath = f"{Export_csv_path}/Processed_SurveyData.csv" - # add the extra entries - df_add = read_csv(survey_additions_filepath,dtype='str') - n_add_keys = df_add.shape[0] - df_join = concat([dfm,df_add]) - assert dfm.shape[0]+df_add.shape[0] == df_join.shape[0], 'Unexpected result of including additional surveys' + survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv" + survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv" - logger.info(f"Added {n_add_keys} additional surveys") + # perform here in python, using the 'KEY' column + # check the key column is unique, if not raise a warning and remove duplicates - # save as processed - df_join.to_csv(processed_surveys_filepath,index=False,quoting=csv.QUOTE_MINIMAL) + if dfm['KEY'].unique().size != dfm['KEY'].size: + status.reset('WARNING') + self.logger.warning(f"KEY column is not unique, removing duplicates") + # count the number of duplicates + n_duplicates = dfm.shape[0] - dfm['KEY'].unique().size + # drop the duplicates + dfm = dfm.drop_duplicates(keep='first') + self.logger.warning(f"Removed {n_duplicates} duplicates") - logger.debug('Preparing clustering calculation') + df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str') + keys_to_rm = df_rm['KEY'] - date = datetime.datetime.now() + # check that all of the keys to remove exist in the original data + rm_keys_found = df_rm['KEY'].isin(dfm['KEY']) + n_rm_keys_found = rm_keys_found.sum() + n_rm_keys = rm_keys_found.size + if not np_all(rm_keys_found): + # this might happen if the run date is in the past + self.logger.warning(f"Only found {n_rm_keys_found} of {n_rm_keys} survey errors to remove") - # prepare environment for clustering calc - call_R = False + rm_keys_not_found = df_rm[~rm_keys_found] + self.logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}") - upload_directory = f"{jobPath}/upload" - Path(upload_directory).mkdir(parents=True, exist_ok=True) + self.logger.debug(f"Type of keys that can be found include:\n{dfm['KEY'].dtype}") - if call_R: + dfm_short_keys = [val for val in dfm['KEY'].values if len(str(val)) <10] + self.logger.debug(f"Keys that can be found include:\n{dfm_short_keys}") - cluster_calc_path = "/storage/app/EWS_prod/code/wheat_source_generation/" + # identify which surveys to remove + idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values) - # clear old output - old_clustering_output_glob = f"{cluster_calc_path}/output/sources_*" - old_clustering_outputs = glob(old_clustering_output_glob) + #drop them in-place + dfm = dfm[~idx_to_rm] + 
self.logger.info(f"Removed {n_rm_keys_found} erroneous surveys") - logger.info('About to unlink old output from clustering calculation') - for path in old_clustering_outputs: - logger.info(f"unlinking {path}") - Path(path).unlink() + # add the extra entries + df_add = read_csv(survey_additions_filepath,dtype='str') + n_add_keys = df_add.shape[0] + df_join = concat([dfm,df_add]) + assert dfm.shape[0]+df_add.shape[0] == df_join.shape[0], 'Unexpected result of including additional surveys' + self.logger.info(f"Added {n_add_keys} additional surveys") - RPath = '/usr/local/R/bin/Rscript' + # save as processed + df_join.to_csv(processed_surveys_filepath,index=False,quoting=csv.QUOTE_MINIMAL) - clustering_script = f"{cluster_calc_path}/code/R/clustering.R" + self.logger.debug('Preparing clustering calculation') - clustering_env = { - **os.environ, - 'R_LIBS':'/home/ewsmanager/R-packages-EWS-clustering/x86_64-pc-linux-gnu-library/3.5', - 'PROJ_LIB' : '/usr/share/proj/', # conda env breaks the automatic assignment of PROJ_LIB - } + date = datetime.datetime.now() - clustering_config = config['Survey']['SourcesConfigFilename'] - assert os.path.isfile(clustering_config) + # prepare environment for clustering calc + call_R = False - clustering_calc = [RPath, - '--no-init-file', - clustering_script, - processed_surveys_filepath, - config['StartString'], - '-2', - '7', - config['Survey']['SourcesConfigFilename']] + upload_directory = f"{jobPath}/upload" + Path(upload_directory).mkdir(parents=True, exist_ok=True) - logger.debug('Performing clustering calculation') + if call_R: - description_short = 'wheat-source-generation' - description_long = 'source calculation on processed surveys' + cluster_calc_path = "/storage/app/EWS_prod/code/wheat_source_generation/" - try: - subprocess_and_log(clustering_calc, description_short, description_long, env=clustering_env) - except: - status.reset('ERROR') - endJob(status,premature=True) + # clear old output + old_clustering_output_glob = f"{cluster_calc_path}/output/sources_*" + old_clustering_outputs = glob(old_clustering_output_glob) - logger.debug('Checking output of clustering calculation') + self.logger.info('About to unlink old output from clustering calculation') + for path in old_clustering_outputs: + self.logger.info(f"unlinking {path}") + Path(path).unlink() - try: - logger.debug('Trying to copy the dataset processed for clustering') - clustering_proc_path_glob = f"{cluster_calc_path}/output/survey_data_processed_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv" - clustering_proc_path_list = glob(clustering_proc_path_glob) - if len(clustering_proc_path_list) == 0: - logger.debug(f"No processed files produced from clustering in {clustering_proc_path_glob}") - raise Exception + RPath = '/usr/local/R/bin/Rscript' - elif len(clustering_proc_path_list) > 1: - logger.debug(f"Multiple processed files produced from clustering in {clustering_proc_path_glob}") - raise Exception + clustering_script = f"{cluster_calc_path}/code/R/clustering.R" - else: - logger.debug('Found 1 processed file, placing copy of result in job directory') + clustering_env = { + **os.environ, + 'R_LIBS':'/home/ewsmanager/R-packages-EWS-clustering/x86_64-pc-linux-gnu-library/3.5', + 'PROJ_LIB' : '/usr/share/proj/', # conda env breaks the automatic assignment of PROJ_LIB + } - proc_filename = f"survey_data_processed_{config['StartString']}.csv" - proc_path = f"{output_directory}/{proc_filename}" + clustering_config = config['Survey']['SourcesConfigFilename'] + assert 
os.path.isfile(clustering_config) - logger.debug(f"as {proc_path}") + clustering_calc = [RPath, + '--no-init-file', + clustering_script, + processed_surveys_filepath, + config['StartString'], + '-2', + '7', + config['Survey']['SourcesConfigFilename']] - copyfile(clustering_proc_path_list[0], proc_path) + self.logger.debug('Performing clustering calculation') - except: - logger.debug('Failed to get a copy of the dataset processed for clustering') + description_short = 'wheat-source-generation' + description_long = 'source calculation on processed surveys' - clustering_output_path_glob = f"{cluster_calc_path}/output/sources_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv" - clustering_output_path_list = glob(clustering_output_path_glob) - if len(clustering_output_path_list) == 0: - logger.error(f"No output produced from clustering in {clustering_output_path_glob}") - status.reset('ERROR') - endJob(status,premature=True) - if len(clustering_output_path_list) > 1: - logger.error(f"Multiple outputs produced from clustering in {clustering_output_path_glob}") - status.reset('ERROR') - endJob(status,premature=True) + try: + subprocess_and_log(clustering_calc, description_short, description_long, env=clustering_env) + except: + status.reset('ERROR') + endJob(status,premature=True) - sources_path = clustering_output_path_list[0] + self.logger.debug('Checking output of clustering calculation') - elif 'Groups' in config['Survey']: - # if 'Groups' is defined in the config, create grouped survey files and run python version - - logger.debug('Preparing grouped survey files') - group_directory = f"{jobPath}/Groups" - Path(group_directory).mkdir(parents=True, exist_ok=True) + try: + self.logger.debug('Trying to copy the dataset processed for clustering') - origins_list = df_join["Origin"].unique() - groups = {i:[i] for i in origins_list} + clustering_proc_path_glob = f"{cluster_calc_path}/output/survey_data_processed_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv" + clustering_proc_path_list = glob(clustering_proc_path_glob) + if len(clustering_proc_path_list) == 0: + self.logger.debug(f"No processed files produced from clustering in {clustering_proc_path_glob}") + raise Exception - assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]) + elif len(clustering_proc_path_list) > 1: + self.logger.debug(f"Multiple processed files produced from clustering in {clustering_proc_path_glob}") + raise Exception - groups.update(config['Survey']['Groups']) - - # remove groups that are listed in GroupsToIgnore - if 'GroupsToIgnore' in config['Survey']: - for group_name in config['Survey']['GroupsToIgnore']: - if group_name in groups: - logger.info(f"Removing group {group_name} from list of groups") - del groups[group_name] - - for group_name,group_content in groups.items(): + else: + self.logger.debug('Found 1 processed file, placing copy of result in job directory') - logger.info(f"Creating survey group {group_name} which includes {group_content}") - - df_group = df_join.loc[df_join["Origin"].isin(group_content)] - - group_surveys_filename = f"surveys_{group_name}.csv" - group_surveys_filepath = f"{group_directory}/{group_surveys_filename}" - - df_group.to_csv(group_surveys_filepath, index=False, quoting=csv.QUOTE_MINIMAL) - - output_directory = f"{jobPath}/source_gen/{group_name}" + proc_filename = f"survey_data_processed_{config['StartString']}.csv" + proc_path = f"{output_directory}/{proc_filename}" + + self.logger.debug(f"as 
{proc_path}") + + copyfile(clustering_proc_path_list[0], proc_path) + + except: + self.logger.debug('Failed to get a copy of the dataset processed for clustering') + + clustering_output_path_glob = f"{cluster_calc_path}/output/sources_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv" + clustering_output_path_list = glob(clustering_output_path_glob) + if len(clustering_output_path_list) == 0: + self.logger.error(f"No output produced from clustering in {clustering_output_path_glob}") + status.reset('ERROR') + endJob(status,premature=True) + if len(clustering_output_path_list) > 1: + self.logger.error(f"Multiple outputs produced from clustering in {clustering_output_path_glob}") + status.reset('ERROR') + endJob(status,premature=True) + + sources_path = clustering_output_path_list[0] + + elif 'Groups' in config['Survey']: + # if 'Groups' is defined in the config, create grouped survey files and run python version + + self.logger.debug('Preparing grouped survey files') + group_directory = f"{jobPath}/Groups" + Path(group_directory).mkdir(parents=True, exist_ok=True) + + origins_list = df_join["Origin"].unique() + groups = {i:[i] for i in origins_list} + + assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]) + + groups.update(config['Survey']['Groups']) + + # remove groups that are listed in GroupsToIgnore + if 'GroupsToIgnore' in config['Survey']: + for group_name in config['Survey']['GroupsToIgnore']: + if group_name in groups: + self.logger.info(f"Removing group {group_name} from list of groups") + del groups[group_name] + + for group_name,group_content in groups.items(): + + self.logger.info(f"Creating survey group {group_name} which includes {group_content}") + + df_group = df_join.loc[df_join["Origin"].isin(group_content)] + + group_surveys_filename = f"surveys_{group_name}.csv" + group_surveys_filepath = f"{group_directory}/{group_surveys_filename}" + + df_group.to_csv(group_surveys_filepath, index=False, quoting=csv.QUOTE_MINIMAL) + + output_directory = f"{jobPath}/source_gen/{group_name}" + Path(output_directory).mkdir(parents=True, exist_ok=True) + + sources_path = run_case( + config_path = config['Survey']['pySourcesConfigFilename'], + survey_path = group_surveys_filepath, + reference_date = config['StartString'], + # Day 0 (current day) is always included + # Days -2,-1 and 0 are needed to initialise spores in NAME + # days 1-7 are forecast days + # days 8 and 9 are buffers in case NAME needs to run with the + # previous day's source job + day_offsets = [-2,9], + output_dir = output_directory) + + self.logger.debug('Placing copy of result in job directory with conventional name') + + output_filename = f"sources_{group_name}_{config['StartString']}.csv" + output_path = f"{jobPath}/upload/{output_filename}" + + self.logger.debug(f"as {output_path}") + + copyfile(sources_path, output_path) + else: + # run python version without grouping surveys + + output_directory = f"{jobPath}/source_gen" Path(output_directory).mkdir(parents=True, exist_ok=True) sources_path = run_case( config_path = config['Survey']['pySourcesConfigFilename'], - survey_path = group_surveys_filepath, + survey_path = processed_surveys_filepath, reference_date = config['StartString'], # Day 0 (current day) is always included # Days -2,-1 and 0 are needed to initialise spores in NAME # days 1-7 are forecast days - # days 8 and 9 are buffers in case NAME needs to run with the + # days 8 and 9 are buffers in case NAME needs to run with the # previous day's source job 
day_offsets = [-2,9], output_dir = output_directory) - - logger.debug('Placing copy of result in job directory with conventional name') - - output_filename = f"sources_{group_name}_{config['StartString']}.csv" - output_path = f"{jobPath}/upload/{output_filename}" - logger.debug(f"as {output_path}") - - copyfile(sources_path, output_path) - else: - # run python version without grouping surveys + self.logger.debug('Placing copy of result in job directory with conventional name') - output_directory = f"{jobPath}/source_gen" - Path(output_directory).mkdir(parents=True, exist_ok=True) - - sources_path = run_case( - config_path = config['Survey']['pySourcesConfigFilename'], - survey_path = processed_surveys_filepath, - reference_date = config['StartString'], - # Day 0 (current day) is always included - # Days -2,-1 and 0 are needed to initialise spores in NAME - # days 1-7 are forecast days - # days 8 and 9 are buffers in case NAME needs to run with the - # previous day's source job - day_offsets = [-2,9], - output_dir = output_directory) + output_filename = f"sources_{config['StartString']}.csv" + output_path = f"{jobPath}/upload/{output_filename}" - logger.debug('Placing copy of result in job directory with conventional name') + self.logger.debug(f"as {output_path}") - output_filename = f"sources_{config['StartString']}.csv" - output_path = f"{jobPath}/upload/{output_filename}" + copyfile(sources_path, output_path) - logger.debug(f"as {output_path}") - copyfile(sources_path, output_path) + upload_filenames = f"sources_*{config['StartString']}.csv" + upload_path = f"{jobPath}/upload/{upload_filenames}" + # glob list of output files + upload_path_list = glob(upload_path) - upload_filenames = f"sources_*{config['StartString']}.csv" - upload_path = f"{jobPath}/upload/{upload_filenames}" + proc_out = {} + # Output files available for upload + proc_out['output'] = upload_path_list + # Processing files available for clearing + proc_out['clearup'] = None - # glob list of output files - upload_path_list = glob(upload_path) + return proc_out - proc_out = {} - # Output files available for upload - proc_out['output'] = upload_path_list - # Processing files available for clearing - proc_out['clearup'] = None + #TODO + def process_EWS_plotting_survey(self, jobPath,config): + '''Returns a list of output files for transfer.''' - return proc_out + self.logger.info('started process_EWS_plotting_survey(), nothing to do') -#TODO -def process_EWS_plotting_survey(jobPath,config): - '''Returns a list of output files for transfer.''' + pass + return [] - logger.info('started process_EWS_plotting_survey(), nothing to do') - pass - return [] +if __name__ == '__main__': + processor = ProcessorSurveys() + processor.run_processor("Survey") \ No newline at end of file diff --git a/scripts/run_Processor.sh b/scripts/run_Processor.sh index 00366a535de3b6f6bcf3458f4ec0b6c6ba761c73..8e1fd2fd68a2221845a664294a8097460d2d17db 100755 --- a/scripts/run_Processor.sh +++ b/scripts/run_Processor.sh @@ -1,5 +1,52 @@ #!/bin/bash +original_args=("$@") + +SHORT=p:h +OPTS=$(getopt -a --options $SHORT -- "$@") +echo $OPTS +eval set -- "$OPTS" + +while : +do + case "$1" in + -p ) + component="$2" + shift 2 + ;; + -h | --help) + "Runs the appropriate Processor component using the -p flag, all other args are passed through" + exit 2 + ;; + --) + shift; + break + ;; + *) + echo "Unexpected option: $1" + ;; + esac +done + +printf "component is %s" "$component" + +if [ "$component" = "Environment" ]; then + 
processor_class="ProcessorEnvironment.py" +elif [ "$component" = "Deposition" ];then + processor_class="ProcessorDeposition.py" +elif [ "$component" = "Survey" ];then + processor_class="ProcessorSurveys.py" +elif [ "$component" = "Advisory" ];then + processor_class="ProcessorAdvisory.py" +elif [ "$component" = "Scraper" ];then + processor_class="ProcessorScraper.py" +elif [ "$component" = "Epidemiology" ];then + processor_class="ProcessorEpidemiology.py" +else + printf "component '%s' not recognised" "$component" +fi + + # directory containing all environment envs=/storage/app/EWS_prod/envs @@ -31,7 +78,9 @@ source /storage/app/miniconda3/bin/activate ${conda_env} proc_path="$( dirname "$(readlink -f "$0" )" )" # run the processor with all arguments -python ${proc_path}/../coordinator/Processor.py "$@" +processor=${proc_path}/../coordinator/${processor_class} +printf "processor is %s\n\n" "$processor" +python "${processor}" "${original_args[@]}" exit_code=$?; # deactivate conda environment diff --git a/tests/integration/full/full_test_advisory.py b/tests/integration/full/full_test_advisory.py index 372e5526326e6a7be894518de364ea74b4379088..f9fbf645a886599ea45478549583dcab43531daf 100644 --- a/tests/integration/full/full_test_advisory.py +++ b/tests/integration/full/full_test_advisory.py @@ -2,6 +2,10 @@ import copy import os import sys +from ProcessorAdvisory import ProcessorAdvisory +from ProcessorDeposition import ProcessorDeposition +from ProcessorEnvironment import ProcessorEnvironment +from ProcessorSurveys import ProcessorSurveys from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.advisory_test_suite import BaseAdvisoryTestSuite @@ -56,24 +60,35 @@ class FullTestAdvisory(BaseAdvisoryTestSuite.AdvisoryTestSuite): @staticmethod def run_dependent_pipelines(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + + depo_processor = ProcessorDeposition() IntegrationTestUtils.run_external_pipeline("Deposition", IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + depo_processor) + + env_processor = ProcessorEnvironment() IntegrationTestUtils.run_external_pipeline("Environment", IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + env_processor) + survey_processor = ProcessorSurveys() previous_day_string: str = IntegrationTestUtils.get_day_before_as_string(IntegrationTestUtils.TEST_START_DATE) IntegrationTestUtils.run_external_pipeline("Survey", previous_day_string, - IntegrationTestUtils.EMAIL_CRED_PATH) + survey_processor) pass @staticmethod def run_advisory_pipeline(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + + advisory_processor = ProcessorAdvisory() IntegrationTestUtils.run_external_pipeline(BaseAdvisoryTestSuite.AdvisoryTestSuite.ADVISORY_COMPONENT_NAME, IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + advisory_processor) if __name__ == '__main__': diff --git a/tests/integration/full/full_test_deposition.py b/tests/integration/full/full_test_deposition.py index 0402e79d1200a94170b67da7453e114efae3801c..c594f51e59fc820823d0419b8beef6401de731ff 100644 --- a/tests/integration/full/full_test_deposition.py +++ b/tests/integration/full/full_test_deposition.py @@ -2,6 +2,7 @@ import copy import os import sys +from ProcessorDeposition import ProcessorDeposition from 
integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.depo_test_suite import BaseDepoTestSuite @@ -45,9 +46,12 @@ class FullTestDeposition(BaseDepoTestSuite.DepoTestSuite): @staticmethod def run_depo_pipeline(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + depo_processor = ProcessorDeposition() IntegrationTestUtils.run_external_pipeline(BaseDepoTestSuite.DepoTestSuite.DEPO_COMPONENT_NAME, IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + depo_processor) if __name__ == '__main__': diff --git a/tests/integration/full/full_test_env_suit.py b/tests/integration/full/full_test_env_suit.py index aa4d5cbb60db37bda46d858db4503a7c34e1af4c..d6a9f390e44fb159e777dafd4d55c63e51bf3591 100644 --- a/tests/integration/full/full_test_env_suit.py +++ b/tests/integration/full/full_test_env_suit.py @@ -2,6 +2,7 @@ import copy import os import sys +from ProcessorEnvironment import ProcessorEnvironment from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.env_suit_test_suite import BaseEnvSuitTestSuite @@ -46,9 +47,12 @@ class FullTestEnvSuit(BaseEnvSuitTestSuite.EnvSuitTestSuite): @staticmethod def run_env_pipeline(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + env_processor = ProcessorEnvironment() IntegrationTestUtils.run_external_pipeline(BaseEnvSuitTestSuite.EnvSuitTestSuite.ENV_COMPONENT_NAME, IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + env_processor) if __name__ == '__main__': _success: bool = IntegrationTestUtils.run_full_integration_test_pipeline(FullTestEnvSuit, diff --git a/tests/integration/full/full_test_epi.py b/tests/integration/full/full_test_epi.py index 71561f282fa9f6621a04cc29addac584e574eef8..53dd143228dcdf1e4e9f23df490341b07cb9e15c 100644 --- a/tests/integration/full/full_test_epi.py +++ b/tests/integration/full/full_test_epi.py @@ -2,6 +2,9 @@ import copy import os import sys +from ProcessorDeposition import ProcessorDeposition +from ProcessorEnvironment import ProcessorEnvironment +from ProcessorEpidemiology import ProcessorEpidemiology from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.epi_test_suite import BaseEpiTestSuite @@ -57,24 +60,29 @@ class FullTestEpi(BaseEpiTestSuite.EpiTestSuite): @staticmethod def run_dependent_pipelines(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + + depo_processor = ProcessorDeposition() IntegrationTestUtils.run_external_pipeline("Deposition", IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + depo_processor) + env_processor = ProcessorEnvironment() IntegrationTestUtils.run_external_pipeline("Environment", IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + env_processor) - # previous_day_string: str = IntegrationTestUtils.get_day_before_as_string(IntegrationTestUtils.TEST_START_DATE) - # IntegrationTestUtils.run_external_pipeline("Survey", - # previous_day_string, - # IntegrationTestUtils.EMAIL_CRED_PATH) pass @staticmethod def run_epi_pipeline(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + + epi_processor = ProcessorEpidemiology() 
IntegrationTestUtils.run_external_pipeline(BaseEpiTestSuite.EpiTestSuite.EPI_COMPONENT_NAME, IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + epi_processor) if __name__ == '__main__': diff --git a/tests/integration/full/full_test_survey.py b/tests/integration/full/full_test_survey.py index 610cca52769ac4216c1d7d9bab8662874cda5d35..f38bcd7fe1bdaf9ca61b10cef778a82e5702cda3 100644 --- a/tests/integration/full/full_test_survey.py +++ b/tests/integration/full/full_test_survey.py @@ -2,6 +2,7 @@ import copy import os import sys +from ProcessorSurveys import ProcessorSurveys from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.survey_test_suite import BaseSurveyTestSuite @@ -40,9 +41,13 @@ class FullTestSurvey(BaseSurveyTestSuite.SurveyTestSuite): @staticmethod def run_survey_pipeline(): + # need EMAIL_CRED in the environment before we import Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + + survey_processor = ProcessorSurveys() IntegrationTestUtils.run_external_pipeline(BaseSurveyTestSuite.SurveyTestSuite.SURVEY_COMPONENT_NAME, IntegrationTestUtils.TEST_START_DATE, - IntegrationTestUtils.EMAIL_CRED_PATH) + survey_processor) if __name__ == '__main__': diff --git a/tests/integration/partial/integration_test_utils.py b/tests/integration/partial/integration_test_utils.py index 5b17c3d739cd34a5bd251323999ff130785fdc80..9903274e81c3eecb127359666ecc018a3c5dc606 100644 --- a/tests/integration/partial/integration_test_utils.py +++ b/tests/integration/partial/integration_test_utils.py @@ -4,12 +4,15 @@ import glob import json import os from importlib import reload -from typing import List +from typing import List, Type from unittest import TestSuite, TestLoader, TestCase, TestResult from zipfile import ZipFile from HtmlTestRunner import HTMLTestRunner +from Processor import Processor +from ProcessorDeposition import ProcessorDeposition + class IntegrationTestUtils: @@ -166,24 +169,20 @@ class IntegrationTestUtils: @staticmethod def run_partial_integration_test_pipeline(component: str, start_date: str, + processor: Processor, **kwargs): """ Runs the "run_Process" function in Processor.py with the given arguments for the partial integration tests. The full integration pipeline is run in the "run_full_integration_test_pipeline" function. 
+ :param processor: :param component: :param start_date: :param kwargs: :return: """ - # need EMAIL_CRED in the environment before we import Processor - os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH - import Processor - reload(Processor) - from Processor import run_Process, set_log_level - args_dict: dict = {} # note, possible to override these values in the kwargs loop below @@ -199,10 +198,10 @@ class IntegrationTestUtils: args_dict[key] = value log_level = args_dict['log_level'] - set_log_level(log_level) + processor.set_log_level(log_level) try: - run_Process(args_dict) + processor.run_process(args_dict) except SystemExit: # we will eventually want to throw these to the calling class to be dealt with pass @@ -210,16 +209,9 @@ class IntegrationTestUtils: @staticmethod def run_external_pipeline(component: str, start_date: str, - email_cred_path: str, + processor: Processor, **kwargs): - # need EMAIL_CRED in the environment before we import Processor - os.environ["EMAIL_CRED"] = email_cred_path - - import Processor - reload(Processor) # reload the class to reset all variables - from Processor import run_Process, set_log_level - args_dict: dict = {} # note, possible to override these values in the kwargs loop below @@ -235,10 +227,10 @@ class IntegrationTestUtils: args_dict[key] = value log_level = args_dict['log_level'] - set_log_level(log_level) + processor.set_log_level(log_level) try: - run_Process(args_dict) + processor.run_process(args_dict) except SystemExit: # we will eventually want to throw these to the calling class to be dealt with pass diff --git a/tests/integration/partial/test_advisory.py b/tests/integration/partial/test_advisory.py index c38bd377c16d87585472a068a7a9ba0d0aec526f..d8558cd7459895a0e0508f9fbc671c9225b5861c 100644 --- a/tests/integration/partial/test_advisory.py +++ b/tests/integration/partial/test_advisory.py @@ -2,6 +2,7 @@ import copy import os import unittest +from ProcessorAdvisory import ProcessorAdvisory from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.advisory_test_suite import BaseAdvisoryTestSuite @@ -68,8 +69,12 @@ class TestAdvisory(BaseAdvisoryTestSuite.AdvisoryTestSuite): @staticmethod def run_advisory_pipeline(): component = 'Advisory' - IntegrationTestUtils.run_partial_integration_test_pipeline(component, IntegrationTestUtils.TEST_START_DATE) - + # need EMAIL_CRED in the environment before we create a Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + advisory_processor = ProcessorAdvisory() + IntegrationTestUtils.run_partial_integration_test_pipeline(component, + IntegrationTestUtils.TEST_START_DATE, + advisory_processor) if __name__ == '__main__': unittest.main() diff --git a/tests/integration/partial/test_deposition.py b/tests/integration/partial/test_deposition.py index ace57114737eacac6385602e4f0976b341d34997..96268cb9510e7a6d5fc71c4f0474a43b622d139f 100644 --- a/tests/integration/partial/test_deposition.py +++ b/tests/integration/partial/test_deposition.py @@ -2,6 +2,7 @@ import copy import os import unittest +from ProcessorDeposition import ProcessorDeposition from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.depo_test_suite import BaseDepoTestSuite @@ -63,7 +64,12 @@ class TestDeposition(BaseDepoTestSuite.DepoTestSuite): @staticmethod def run_depo_pipeline(): component = 'Deposition' - IntegrationTestUtils.run_partial_integration_test_pipeline(component, 
IntegrationTestUtils.TEST_START_DATE) + # need EMAIL_CRED in the environment before we create a Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + deposition_processor = ProcessorDeposition() + IntegrationTestUtils.run_partial_integration_test_pipeline(component, + IntegrationTestUtils.TEST_START_DATE, + deposition_processor) if __name__ == '__main__': diff --git a/tests/integration/partial/test_env_suit.py b/tests/integration/partial/test_env_suit.py index b4ba788c281371ce430d1cab4d79202a1ab411f3..11169a6601406b1bb4c8a193dd9f2ce72097f47f 100644 --- a/tests/integration/partial/test_env_suit.py +++ b/tests/integration/partial/test_env_suit.py @@ -2,6 +2,7 @@ import copy import os import unittest +from ProcessorEnvironment import ProcessorEnvironment from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.env_suit_test_suite import BaseEnvSuitTestSuite @@ -42,7 +43,7 @@ class TestEnvSuit(BaseEnvSuitTestSuite.EnvSuitTestSuite): def write_temp_run_config_file(): nowstring: str = IntegrationTestUtils.get_now_string() prefix: str = "temp_env_" + nowstring - # prefix: str = "temp_env" + #prefix: str = "temp_env" default_config = IntegrationTestUtils.DEFAULT_CONFIG_FILE_PATH default_config_dict: dict = IntegrationTestUtils.load_json_file(default_config) @@ -66,7 +67,12 @@ class TestEnvSuit(BaseEnvSuitTestSuite.EnvSuitTestSuite): @staticmethod def run_env_pipeline(): component = 'Environment' - IntegrationTestUtils.run_partial_integration_test_pipeline(component, IntegrationTestUtils.TEST_START_DATE) + # need EMAIL_CRED in the environment before we create a Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + environment_processor = ProcessorEnvironment() + IntegrationTestUtils.run_partial_integration_test_pipeline(component, + IntegrationTestUtils.TEST_START_DATE, + environment_processor) if __name__ == '__main__': diff --git a/tests/integration/partial/test_epi.py b/tests/integration/partial/test_epi.py index 6f1d85ed3f69ac66a194d37b3680e31f1bff6c99..af7fb9eb307cf9fc9fa60d94402086383dffd366 100644 --- a/tests/integration/partial/test_epi.py +++ b/tests/integration/partial/test_epi.py @@ -2,6 +2,7 @@ import copy import os import unittest +from ProcessorEpidemiology import ProcessorEpidemiology from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.epi_test_suite import BaseEpiTestSuite @@ -67,7 +68,12 @@ class TestEpi(BaseEpiTestSuite.EpiTestSuite): @staticmethod def run_epi_pipeline(): component = 'Epidemiology' - IntegrationTestUtils.run_partial_integration_test_pipeline(component, IntegrationTestUtils.TEST_START_DATE) + # need EMAIL_CRED in the environment before we create a Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + epi_processor = ProcessorEpidemiology() + IntegrationTestUtils.run_partial_integration_test_pipeline(component, + IntegrationTestUtils.TEST_START_DATE, + epi_processor) if __name__ == '__main__': diff --git a/tests/integration/partial/test_survey.py b/tests/integration/partial/test_survey.py index a770fed939f046e069d6fe97165dcf7ad601dfdd..0026f5206ed6771858d74b04ef95f4888b1fda53 100644 --- a/tests/integration/partial/test_survey.py +++ b/tests/integration/partial/test_survey.py @@ -2,6 +2,7 @@ import copy import os import unittest +from ProcessorSurveys import ProcessorSurveys from integration.partial.integration_test_utils import IntegrationTestUtils from integration.test_suites.survey_test_suite 
import BaseSurveyTestSuite @@ -64,7 +65,12 @@ class TestSurvey(BaseSurveyTestSuite.SurveyTestSuite): @staticmethod def run_survey_pipeline(): component = 'Survey' - IntegrationTestUtils.run_partial_integration_test_pipeline(component, IntegrationTestUtils.TEST_START_DATE) + # need EMAIL_CRED in the environment before we create a Processor + os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH + survey_processor = ProcessorSurveys() + IntegrationTestUtils.run_partial_integration_test_pipeline(component, + IntegrationTestUtils.TEST_START_DATE, + survey_processor) if __name__ == '__main__':
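
For reference, the calling convention these test changes rely on is: put EMAIL_CRED into the environment first (the comment added in each test notes it must be there before a Processor is created), construct the concrete processor, and hand the instance to IntegrationTestUtils, which calls set_log_level() and run_process() on it. Below is a minimal sketch of driving one component this way, using only names that appear in this patch:

    import os

    from ProcessorDeposition import ProcessorDeposition
    from integration.partial.integration_test_utils import IntegrationTestUtils

    # EMAIL_CRED must be in the environment before the Processor is constructed
    os.environ["EMAIL_CRED"] = IntegrationTestUtils.EMAIL_CRED_PATH

    # build the concrete component and pass the instance to the test utilities;
    # run_external_pipeline() then calls processor.set_log_level() and
    # processor.run_process() with the argument dictionary it assembles
    depo_processor = ProcessorDeposition()
    IntegrationTestUtils.run_external_pipeline("Deposition",
                                               IntegrationTestUtils.TEST_START_DATE,
                                               depo_processor)

Both run_external_pipeline() and run_partial_integration_test_pipeline() still accept **kwargs, so individual entries of the assembled argument dictionary (for example log_level) can still be overridden per call.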
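
Each concrete processor module now also doubles as a script entry point: run_Processor.sh maps the -p component name onto the matching ProcessorX.py and runs it with the original arguments, and the module's __main__ block instantiates its class and calls run_processor() with the component name, as ProcessorSurveys.py does above. The sketch below shows the pattern a hypothetical new component would follow; ProcessorExample, the "Example" component name and the empty class body are illustrative assumptions, and the hooks a concrete processor must implement live in Processor.py rather than being repeated here:

    # ProcessorExample.py -- illustrative sketch only; "ProcessorExample" and the
    # "Example" component name are not part of this patch
    from Processor import Processor


    class ProcessorExample(Processor):
        """Hypothetical component; the real subclasses (ProcessorSurveys,
        ProcessorDeposition, etc.) implement their component-specific
        processing methods here."""


    if __name__ == '__main__':
        # mirrors the __main__ block added to ProcessorSurveys.py
        processor = ProcessorExample()
        processor.run_processor("Example")

To be dispatched by the wrapper, run_Processor.sh would also need a matching elif branch setting processor_class="ProcessorExample.py" for -p Example ahead of the python "${processor}" "${original_args[@]}" call.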