From cf4caf3f34e026f7d15eb8633672b70c0bb06ccb Mon Sep 17 00:00:00 2001
From: Jake Smith <jws52@cam.ac.uk>
Date: Wed, 31 Aug 2022 15:54:37 +0100
Subject: [PATCH] feat: Source switch and split ProcessorComponents

1) There is now a Python alternative to the R-based wheat-source-
generation, called source_gen. The functionality to call source_gen has
been added (see ProcessorSurveys.py L939 and L1032-1040), but it is not
yet switched on while we await testing of its output at the Met Office.
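
As a rough sketch of the intended switch (the names below are
hypothetical, not the ones actually used in ProcessorSurveys.py), the
idea is a config-driven branch between the two implementations,
defaulting to the existing R code:

    # Sketch only: 'UsePythonSourceGen' and both helper functions are
    # hypothetical names, shown for illustration.
    def generate_sources(config: dict, surveys_csv: str) -> str:
        '''Return the path of the generated sources csv, choosing the
        implementation from the Survey config. The Python path stays
        off by default until the Met Office has checked its output.'''
        if config['Survey'].get('UsePythonSourceGen', False):
            return run_python_source_gen(surveys_csv, config)  # wraps source_gen
        return run_r_clustering(surveys_csv, config)  # existing R workflow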

2) ProcessorComponents was huge (2100 lines!) and its structure was not
apparent. Each component's supporting functions now live in a separate
module, and a new ProcessorServer.py handles file downloads and uploads.
ProcessorComponents.py is now essentially an index that imports all of
the functions. This might inspire a more systematic (object-oriented?)
structure for each component within coordinator in the future.
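
One possible shape for that future structure (purely a sketch; none of
these class or method names exist in the codebase yet) is an abstract
base class that each component module implements:

    # Sketch of a possible object-oriented layout; all names hypothetical.
    from abc import ABC, abstractmethod

    class ProcessorComponent(ABC):
        '''One subclass per component (Survey, Environment, ...).'''

        @abstractmethod
        def process_pre_job(self, input_args) -> bool:
            '''Return True if the job is ready for full processing.'''

        @abstractmethod
        def process_in_job(self, job_path, status, config, component):
            '''Do the main work for this component.'''

        @abstractmethod
        def process_ews_plotting(self, job_path, status, config, component):
            '''Produce the EWS plotting output for this component.'''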
---
 Processor.py             |    7 +-
 ProcessorAdvisory.py     |   46 +
 ProcessorComponents.py   | 2175 +-------------------------------------
 ProcessorDeposition.py   |  137 +++
 ProcessorEnvironment.py  |  200 ++++
 ProcessorEpidemiology.py |  630 +++++++++++
 ProcessorServer.py       |  124 +++
 ProcessorSurveys.py      | 1060 +++++++++++++++++++
 ProcessorUtils.py        |   71 +-
 run_Processor.sh         |    3 +-
 10 files changed, 2319 insertions(+), 2134 deletions(-)
 create mode 100644 ProcessorAdvisory.py
 create mode 100644 ProcessorDeposition.py
 create mode 100644 ProcessorEnvironment.py
 create mode 100644 ProcessorEpidemiology.py
 create mode 100644 ProcessorServer.py
 create mode 100644 ProcessorSurveys.py

diff --git a/Processor.py b/Processor.py
index 4d224ee..ea4e64c 100755
--- a/Processor.py
+++ b/Processor.py
@@ -39,7 +39,12 @@ import BufferingSMTPHandler
 import EnvSuitPipeline as esp
 import NAMEPreProcessor as npp
 import ProcessorComponents
-from ProcessorUtils import endScript, endJob, open_and_check_config, PasswordODKFilter
+from ProcessorUtils import (
+        endScript,
+        endJob,
+        open_and_check_config,
+        PasswordODKFilter
+)
 
 # initialise default values for configuration
 
diff --git a/ProcessorAdvisory.py b/ProcessorAdvisory.py
new file mode 100644
index 0000000..1345f4e
--- /dev/null
+++ b/ProcessorAdvisory.py
@@ -0,0 +1,46 @@
+#ProcessorAdvisory.py
+'''Functions to process the advisory component.'''
+
+import logging
+
+# gitlab projects
+# TODO: Package these projects so they are robust for importing
+from AdvisoryBuilder import DataGatherer # created by jws52
+
+from ProcessorUtils import add_filters_to_sublogger, short_name
+
+logger = logging.getLogger('Processor.Advisory')
+add_filters_to_sublogger(logger)
+
+short_name = {
+        'Advisory' : 'SUMMARY',
+        'Deposition' : 'DEPOSITION',
+        'Environment' : 'ENVIRONMENT_2.0',
+        'Epidemiology' : 'EPI',
+        'Survey' : 'SURVEYDATA',
+        }
+
+def process_in_job_advisory(jobPath,status,config,component):
+    '''Generates a word processor file containing some basic survey statistics
+    and output figures from deposition, environmental suitability, and
+    eventually also the epi model. This template advisory is intended to speed
+    up the process of writing advisories. The intended user is a local expert
+    who edits the content of the document.
+    Uses the gitlab project EWS-advisory-builder.'''
+
+    config_advisory = config[component].copy()
+
+    # provide top-level arguments to advisory config
+    for k,v in config.items():
+        if k not in short_name.keys():
+            config_advisory[k]=v
+
+    dateString = config['StartString']
+
+    layout = 'tight'
+
+    report_names = DataGatherer.run_each_subregion(config_advisory, dateString, layout)
+
+    # pass the report filenames to upload to the remote server
+
+    return report_names
diff --git a/ProcessorComponents.py b/ProcessorComponents.py
index 138743e..e9ab6ff 100644
--- a/ProcessorComponents.py
+++ b/ProcessorComponents.py
@@ -1,54 +1,50 @@
 #ProcessorComponents.py
-'''Contains the specific functions to process survey data, environmental
-suitability, spore deposition and epidemiology. These functions are handled by
-Processor.py'''
+'''Contains imports of all the specific functions to process survey data,
+environmental suitability, spore deposition and epidemiology. These functions
+are handled by Processor.py.'''
 
-import csv
-import datetime
-from distutils.dir_util import copy_tree
-from glob import glob
-import json
-import tarfile
 import logging
 import os
-from pathlib import Path
-import re
-
-import iris
-import requests
-import shutil
-from shutil import copyfile
-import subprocess
-from string import Template
-
-from iris.cube import CubeList, Cube
-from numpy import all as np_all
-from numpy import argmax, unique
-from pandas import read_csv, Series, DataFrame, concat, to_datetime, json_normalize
-from rasterio import open as rio_open
-
-# gitlab projects
-# TODO: Package these projects so they are robust for importing
-from AdvisoryBuilder import DataGatherer # created by jws52
-from EpiModel import ( # created by rs481
-    EpiAnalysis,
-    EpiModel,
-    EpiPrep,
-    EpiPrepLister,
-    EpiPrepLoader,
-    plotRaster
-)
 
 # submodules of this project
-import EnvSuitPipeline as esp
-import NAMEPreProcessor as npp
-from plotting.common.utils import EnvSuitDiseaseInfo
-from plotting.common.plotting_coordinator.ews_env_disease_plotting_coordinator import EWSPlottingEnvSuitBase
-from plotting.common.plotting_coordinator.ews_depo_disease_plotting_coordinator import EWSPlottingDepoBase
-from plotting.common.plotting_coordinator.ews_epi_disease_plotting_coordinator import EWSPlottingEPIBase
-from ProcessorUtils import open_and_check_config, get_only_existing_globs, subprocess_and_log, endScript, endJob, \
-    add_filters_to_sublogger, remove_path_from_tar_members
-
+# All of the process_* functions are callable from config files for the three
+# coordinator stages: pre, in (during) and plotting. 
+from ProcessorAdvisory import process_in_job_advisory
+from ProcessorDeposition import (
+        process_in_job_dep, 
+        process_EWS_plotting_dep
+)
+from ProcessorEnvironment import (
+        process_in_job_env2_0,
+        process_copy_past_job_env2_0,
+        process_EWS_plotting_env2_0
+)
+from ProcessorEpidemiology import (
+        process_pre_job_epi,
+        process_in_job_epi,
+        process_EWS_plotting_epi,
+)
+from ProcessorServer import (
+        process_pre_job_server_download,
+        upload
+)
+from ProcessorSurveys import (
+        process_pre_job_survey, 
+        process_in_job_survey,
+        process_EWS_plotting_survey
+)
+from ProcessorUtils import (
+        open_and_check_config,
+        get_only_existing_globs,
+        subprocess_and_log,
+        endScript,
+        endJob,
+        add_filters_to_sublogger,
+        remove_path_from_tar_members,
+        query_past_successes,
+        short_name,
+        disease_latin_name_dict
+)
 
 # TODO: Replace subprocess scp and ssh commands with paramiko.SSHClient() instance
 
@@ -60,2093 +56,10 @@ script_path = os.path.dirname(__file__)+'/'
 
 coordinator_path = script_path
 
-short_name = {
-        'Advisory' : 'SUMMARY',
-        'Deposition' : 'DEPOSITION',
-        'Environment' : 'ENVIRONMENT_2.0',
-        'Epidemiology' : 'EPI',
-        'Survey' : 'SURVEYDATA',
-        }
-
-disease_latin_name_dict = {
-        'StemRust' : 'P_GRAMINIS',
-        'StripeRust' : 'P_STRIIFORMIS',
-        'LeafRust' : 'P_RECONDITA',
-        'WheatBlast' : 'M_ORYZAE'}
-
-def process_pre_job_survey(input_args):
-    '''Returns a boolean as to whether the job is ready for full processing.'''
-    logger.info('started process_pre_job_survey(), nothing to do')
-
-    return True
-
-def process_pre_job_server_download(input_args):
-    '''This is set up for environmental suitability v2.0 and deposition.
-    Returns a boolean as to whether the job is ready for full processing.'''
-
-    logger.info('started process_pre_job_willow_download()')
-
-    # Check if there is a file available on willow
-    logger.debug('Checking for file(s) on remote server')
-
-    for i,config_path in enumerate(input_args.config_paths):
-
-        config = open_and_check_config(config_path)
-
-        config['StartString'] = input_args.start_date
-
-        file_path = Template(config[input_args.component]['ServerPathTemplate']).substitute(**config)
-        file_name = Template(config[input_args.component]['InputFileTemplate']).substitute(**config)
-        logger.info(f"Checking for existence of {file_path}/{file_name}.tar.gz")
-
-        timenow = datetime.datetime.now(tz=datetime.timezone.utc).time()
-
-        cmd_ssh = ["ssh","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",config['ServerName'],f"test -f {file_path}/{file_name}.tar.gz"]
-        description_short = 'subprocess_ssh'
-        description_long = f"Checking for existence of {file_path}/{file_name}.tar.gz"
-
-        status = subprocess_and_log(cmd_ssh,description_short,description_long,check=False)
-
-        if status.returncode == 1:
-
-            # a time check in UTC. If it's late, raise warning, if very late, raise error
-
-            time_0 = config[input_args.component]['TimeExpectedAvailable']
-            time_0 = datetime.datetime.strptime(time_0,'%H%M')
-
-            time_until_warn = datetime.timedelta(hours=4)
-            time_until_error = datetime.timedelta(hours=5)
-
-            time_warn = (time_0 + time_until_warn).time()
-            time_error = (time_0 + time_until_error).time()
-
-            message = f"Data not yet available for config {i+1} of {len(input_args.config_paths)}, expected between {time_0.time()} and {time_error} and long before {time_error}"
-
-            if timenow > time_error:
-                # job is not able to proceed
-
-                logger.warning(message)
-
-                return False
-
-            elif timenow > time_warn:
-                # job is not ready to proceed
-
-                logger.warning(message)
-                endScript(premature=True)
-
-            else:
-                # some other problem with the job
-
-                logger.info(message)
-                endScript(premature=True)
-
-        elif status.returncode == 0:
-            logger.info(f"Data is available for config {i+1} of {len(input_args.config_paths)}, calculation shall proceed")
-
-    return True
-
-def calc_epi_date_range(init_str,span_days=[0,6]):
-    '''Date range is determined relative to init_date.
-    span_days is usually defined in the job config file. Day zero is current
-    day, negative values point to past (historical or analysis) days, and
-    positive values point to forecast days.
-    Returns a start_date and end_date.'''
-
-    init_date = datetime.datetime.strptime(init_str,'%Y%m%d')
-
-    # note that filename date represents preceding 3 hours, so day's data
-    #  starts at file timestamp 0300 UTC
-    threehour_shift = datetime.timedelta(hours=3)
-
-    # add 24hrs so that final day is fully included
-    day_shift = datetime.timedelta(days=1)
-
-    # if more than 999 days
-    if len(str(span_days[0]))>3:
-        # assume it is a date string
-        start_date = datetime.datetime.strptime(span_days[0]+'0300','%Y%m%d%H%M')
-    else:
-        date_shift0 = datetime.timedelta(days=span_days[0])
-
-        start_date = init_date + date_shift0 + threehour_shift
-
-    if len(str(span_days[1]))>3:
-        # assume it is a date string
-        end_date = datetime.strptime(span_days[1]+'0000','%Y%m%d%H%M')
-
-        end_date = end_date + day_shift
-    else:
-        date_shift1 = datetime.timedelta(days=span_days[1])
-
-        end_date = init_date + date_shift1 +day_shift
-
-    return start_date, end_date
-
-def query_proceed(necessary_file,description):
-
-    try:
-
-        assert os.path.isfile(necessary_file)
-
-        logger.info(f"Found:\n{necessary_file}\nso {description} job has succeeded for this date, this job shall run.")
-
-    except AssertionError as e:
-
-        logger.info(f"Failed to find:\n{necessary_file}\nso {description} job has not yet succeeded for this date, so cannot run this job.")
-
-        endScript(premature=True)
-
-        return False
-
-    return True
-
-def query_past_successes(input_args):
-    '''Checks if deposition and environment jobs are already completed
-    successfully. If not, it raises an error.'''
-
-    component = input_args.component
-
-    # check configs can be loaded
-    config_fns = input_args.config_paths
-    for configFile in config_fns:
-        try:
-            config_i = open_and_check_config(configFile)
-        except:
-            logger.exception(f"Failure in opening or checking config {configFile}")
-            endScript(premature=True)
-
-        # some config initialisation is necessary
-        config_i['StartString'] = input_args.start_date
-
-        # check if deposition data is readily available
-        dep_success_file = Template(config_i[component]['Deposition']['SuccessFileTemplate']).substitute(**config_i)
-        try:
-            query_proceed(dep_success_file,'deposition')
-        except:
-            dep_success_file_alt = Template(config_i[component]['Deposition']['AlternativeSuccessFileTemplate']).substitute(**config_i)
-            query_proceed(dep_success_file_alt,'deposition')
-
-        # check if environment data is readily available
-        env_success_file = Template(config_i[component]['Environment']['SuccessFileTemplate']).substitute(**config_i)
-        try:
-            query_proceed(env_success_file,'environment')
-        except:
-            env_success_file_alt = Template(config_i[component]['Environment']['AlternativeSuccessFileTemplate']).substitute(**config_i)
-            query_proceed(env_success_file_alt,'environment')
-
-    return True
-
-def process_pre_job_epi(input_args):
-    '''Returns a boolean as to whether the job is ready for full processing.'''
-
-    logger.info('started process_pre_job_epi()')
-
-    # check pre-requisite jobs are complete
-    query_past_successes(input_args)
-
-    config_fns = input_args.config_paths
-
-    for configFile in config_fns:
-
-        # they should be working if the script made it this far, no need to try
-        config_i = open_and_check_config(configFile)
-
-        #determine end time, from config file
-        arg_start_date = input_args.start_date
-        calc_span_days = config_i['Epidemiology']['CalculationSpanDays']
-        assert len(calc_span_days) == 2
-
-        start_time, end_time = calc_epi_date_range(arg_start_date,calc_span_days)
-
-        # warn if it is a long timespan
-        date_diff = end_time - start_time
-        if date_diff.days > 100:
-            logger.warning("More than 100 days will be calculated over, likely longer than any single season")
-
-    return True
-
-def process_in_job_advisory(jobPath,status,config,component):
-    '''Generates a word processor file containing some basic survey statistics
-    and output figures from deposition, environmental suitability, and
-    eventually also the epi model. This template advisory is intended to speed
-    up the process of writing advisories. The intended user is a local expert
-    who edits the content of the document.
-    Uses the gitlab project EWS-advisory-builder.'''
-
-    config_advisory = config[component].copy()
-
-    # provide top-level arguments to advisory config
-    for k,v in config.items():
-        if k not in short_name.keys():
-            config_advisory[k]=v
-
-    dateString = config['StartString']
-
-    layout = 'tight'
-
-    report_names = DataGatherer.run_each_subregion(config_advisory, dateString, layout)
-
-    # pass the report filenames to upload to the remote server
-
-    return report_names
-
-def get_ODK_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
-    '''Given a dict with a single ODK form to download from an ODK Aggregate
-    server, obtains it and converts to csv.'''
-
-    # Caution: Not tested whether different servers can be downloded to the same ODK_output_path
-    ODK_output_path = f"{jobPath}/ExportRawDB"
-
-    # get data from ODK server
-    description_short = 'ODK download'
-    description_long = 'survey download from ODK server'
-
-    # get path to ODK executable
-    ODK_jar = form_credentials['ODK_jar']
-    assert os.path.exists(ODK_jar)
-
-    ODK_download = ['java',
-            '-jar', ODK_jar,
-            '--pull_aggregate',
-            '--form_id', form_credentials['form_id'],
-            '--storage_directory', ODK_output_path,
-            '--odk_url', form_credentials['url'],
-            '--odk_username',form_credentials['user'],
-            '--odk_password',form_credentials['pass']]
-
-    ODK_download_success = True
-
-    logger.debug('Performing ' + description_long)
-
-    try:
-        # perform a pull from the ODK server, and if it fails write a warning message
-
-        subprocess_and_log(ODK_download,description_short,description_long,log_type='warning',check=True)
-
-    except subprocess.CalledProcessError as e:
-        status.reset('WARNING')
-        ODK_download_success = False
-
-    #TODO: Check it came down cleanly ($serverOutputDir is created whether cleanly or not, so test more explicitly):
-
-    ODK_csv_path = f"{jobPath}/ExportCSV/"
-
-    Path(ODK_csv_path).mkdir(parents=True, exist_ok=True)
-
-    ODK_csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
-
-    if ODK_download_success:
-        description_short = 'ODK export'
-        description_long = 'converting ODK download to csv'
-        logger.debug(description_long)
-
-        ODK_java_to_csv = ['java',
-                '-jar', ODK_jar,
-                '--export',
-                '--form_id', form_credentials['form_id'],
-                '--storage_directory',ODK_output_path,
-                '--export_directory',ODK_csv_path,
-                '--export_filename',ODK_csv_filename]
-
-        logger.debug('Performing ' + description_long)
-
-        try:
-            subprocess_and_log(ODK_java_to_csv,description_short,description_long,check=True)
-
-        except subprocess.CalledProcessError as e:
-            status.reset('WARNING')
-            ODK_download_success = False
-
-    if not ODK_download_success:
-
-        logger.info("Because ODK server download failed somewhere, trying to recover by copying recent download")
-
-        ODK_copy_success = False
-
-        days_back = 1
-        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
-        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
-
-        while ((not ODK_copy_success) and (days_back <= acceptable_days_back)):
-            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
-
-            past_date = current_date - datetime.timedelta(days=days_back)
-
-            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
-            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
-
-            past_ODK_csv_path = f"{past_jobPath}/ExportCSV/"
-
-            try:
-                # check that python or perl coordinator script succeeded for that date
-                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
-                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
-                assert success_py or success_perl
-
-                #logger.warning(f"Temporary rename of expected previous download, for jobs before ~Apr 2021")
-                #past_ODK_csv_filename = f"SurveyData.csv"
-                past_ODK_csv_filename = ODK_csv_filename
-
-                logger.info(f"Looking for {past_ODK_csv_path+past_ODK_csv_filename}")
-
-                copyfile(past_ODK_csv_path+past_ODK_csv_filename,ODK_csv_path+ODK_csv_filename)
-
-                assert os.path.isfile(ODK_csv_path+ODK_csv_filename)
-
-                ODK_copy_success = True
-            except:
-                logger.info(f"Not found an ODK download in {past_ODK_csv_path}")
-
-            days_back += 1
-
-        if not ODK_copy_success:
-            logger.error(f"Failed get a suitable copy of survey data.")
-            status.reset('ERROR')
-            endJob(status,premature=True)
-
-        logger.warning(f"Using ODK download from {past_jobPath}.")
-
-    return ODK_csv_path+ODK_csv_filename
-
-# TODO: Consider placing survey download functions to a separate file
-def get_from_kobotoolbox(url,form_id,form_token,**kwargs):
-
-    # Kenya survey form
-    #url = 'https://kf.kobotoolbox.org/'
-    #form_name = 'Wheat rust survey 1.0'
-    #form_id = 'akpyJHvYxkLKPkxFJnPyTW'
-    #form_token = '???' # this is sensitive
-
-    url_for_requests = url #f"{url}api/v2/"
-
-    url_to_get_form = f"{url_for_requests}assets/{form_id}/data.json"
-
-    headers = {'Authorization': f"Token {form_token}"}
-
-    response = requests.get(url_to_get_form,headers=headers)
-
-    if response.status_code != 200:
-        raise requests.exceptions.HTTPError('HTTP status was not 200')
-
-    logger.info('successful connection to kobotoolbox server')
-    return response
-
-def build_dataframe(response):
-
-    result_count = response.json()['count']
-
-    logger.info(f"{result_count} records")
-
-    request_results = response.json()['results']
-
-    # crude merging of list of dicts into pandas dataframe
-    df = DataFrame.from_records(request_results)
-
-    return df
-
-#parse columns into ODK format
-def parse_location_str(location_str):
-
-    # expecting a space-separated string containing four numbers which
-    # contain a decimal point
-    regex = r'(?P<lat>[-?0-9\.]+)\s(?P<lon>[-?0-9\.]+)\s(?P<alt>[0-9\.]+)\s(?P<acc>[0-9\.]+)'
-
-    # needed because the ODK names are too complicated for regex named groups
-    name_dict = {
-        'lat' : 'survey_infromation-location-Latitude',
-        'lon' : 'survey_infromation-location-Longitude',
-        'alt' : 'survey_infromation-location-Altitude',
-        'acc' : 'survey_infromation-location-Accuracy'
-        }
-
-    res = re.search(regex,location_str)
-
-    loc_series = Series(res.groupdict())
-
-    loc_series.rename(index=name_dict,inplace=True)
-
-    return loc_series
-
-def parse_location_kobotoolbox(series):
-
-    loc_df = series.apply(parse_location_str)
-
-    return loc_df
-
-def convert_date(date_str,fmt_in,fmt_out):
-
-    # in case any nan's creep in
-    if str(date_str)=='nan':
-        return 'nan'
-
-    # timezones in kobotoolbox data are irregular
-    # datetime needs +HHMM
-    # so setting up a regex to check for these cases and handle
-    pattern1 = '\+[0-9][0-9]$'
-    if re.search(pattern1,date_str):
-        # need to provide empty MM
-        date_str = date_str + '00'
-    pattern2 = '\+([0-9][0-9]):([0-9][0-9])$'
-    if re.search(pattern2,date_str):
-        # need to provide empty MM
-        date_str = re.sub(pattern2,'+\g<1>\g<2>',date_str)
-
-    date_in = datetime.datetime.strptime(date_str,fmt_in)
-    date_str_out = date_in.strftime(fmt_out)
-
-    return date_str_out
-
-def parse_date(series,name_out='date',fmt_in = '%Y-%m-%d',fmt_out= '%b %d, %Y'):
-
-    s_out = series.apply(convert_date,fmt_in=fmt_in,fmt_out=fmt_out)
-
-    s_out.rename(name_out,inplace=True)
-
-    return s_out
-
-# dict of functions callable within coln_parser_dict
-# so they can be obtained with a string in coln_parser_dict
-func_dict = {
-    'parse_date' : parse_date,
-    'parse_location_kobotoolbox' : parse_location_kobotoolbox
-}
-
-def parse_columns(df_in,coln_parser_dict):
-    '''Works on each type of conversion in turn.
-
-    coln_parse_dict is the configuration used to convert columns:
-    - keys are column names in the input dataframe
-    - values that are 'None' mean they should be dropped
-    - values that are string simply rename the column
-    - values that are tuples should be a runnable function with kwargs, where
-    the first item is the string identifier of the functionre and the rest is a
-    list of key,value pairs to be provided as kwargs, returns series/dataframe,
-    and drops key column.
-    # TODO: is it neccesary to provide dtype conversion somewhere (e.g. dates)?'''
-
-    df_out = df_in.copy()
-
-    # drop any indicated columns
-    coln_drop_list = [k for k,v in coln_parser_dict.items() if v == 'None']
-    logger.info(f"Dropping {len(coln_drop_list)} columns")
-    logger.debug(f"Columns being dropped are {coln_drop_list}")
-    for key in coln_drop_list:
-        del df_out[key]
-
-    # rename any indicated columns
-    coln_rename_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,str)}
-    logger.info(f"Renaming {len(coln_rename_dict)} columns")
-    logger.debug(f"Columns being renamed are {coln_rename_dict}")
-    df_out.rename(columns=coln_rename_dict,inplace=True)
-
-    # apply any functions
-    # callable only works in python 3.2+ apparently
-    coln_func_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,tuple)}
-    logger.info(f"Applying {len(coln_func_dict)} functions to columns")
-    logger.debug(f"Columns being renamed are {coln_rename_dict}")
-    dfs_to_concat = [df_out]
-
-    for key,val in coln_func_dict.items():
-
-        # TODO: there is a more pythonic way to get functions with a string
-        func = func_dict[val[0]]
-        assert callable(func)
-        kwargs = {k:v for k,v in val[1]}
-        columns_out = func(df_in[key],**kwargs)
-
-        if isinstance(columns_out,DataFrame):
-            num_outputs = columns_out.shape[-1]
-            column_names = columns_out.columns
-
-        elif isinstance(columns_out,Series):
-            num_outputs = 1
-            column_names = [columns_out.name]
-
-        logger.info(f"Adding {num_outputs} columns to dataframe")
-        logger.debug(f"New columns are {column_names}")
-
-        dfs_to_concat += [columns_out]
-
-        # drop the original column, now that it has been parsed with func
-        del df_out[key]
-
-    df_final = concat(dfs_to_concat,axis='columns')
-
-    return df_final
-
-def get_kobotoolbox_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
-    '''Given a dict with a single kobotoolbox form to download from a kobotoolbox
-    server, obtains it and converts to csv.'''
-
-    output_dir = 'Export_kobotoolbox'
-    output_path = f"{jobPath}/{output_dir}/"
-
-    Path(output_path).mkdir(parents=True, exist_ok=True)
-
-    # get data from kobotoolbox server
-
-    # keys are column names in the input dataframe
-    # values that are None mean they should be dropped
-    # values that are string simply rename the column
-    # values that are functions should be run with that key and returns series/dataframe
-    column_parser_dict = {
-        '__version__' : 'None',
-        '_attachments' : 'None',
-        '_bamboo_dataset_id' : 'None',
-        '_geolocation' : 'None', # looks like a duplication of survey_infromation/location
-        '_id' : 'None',
-        '_notes' : 'None',
-        '_status' : 'None',
-        '_submission_time' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%Y-%m-%dT%H:%M:%S'))),
-        '_submitted_by' : 'None',
-        '_tags' : 'None',
-        '_uuid' : 'KEY',
-        '_validation_status' : 'None',
-        '_xform_id_string' : 'None',
-        'comment' : 'comment',
-        'dead_stemrust_samples' : 'SET-OF-dead_stemrust_samples',
-        'dead_stemrust_samples_count' : 'dead_stemrust_samples_count',
-        'dead_yellowrust_samples' : 'SET-OF-dead_yellowrust_samples',
-        'dead_yellowrust_samples_count' : 'dead_yellowrust_samples_count',
-        'deviceid' : 'deviceid',
-        'end' : ('parse_date',(('name_out','end'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
-        'formhub/uuid' : 'None',
-        'imei' : 'imei',
-        'leaf_rust/leafrust_host_plant_reaction' : 'leaf_rust-leafrust_host_plant_reaction',
-        'leaf_rust/leafrust_incidence' : 'leaf_rust-leafrust_incidence',
-        'leaf_rust/leafrust_severity' : 'leaf_rust-leafrust_severity',
-        'live_leafrust_samples' : 'SET-OF-live_leafrust_samples',
-        'live_leafrust_samples_count' : 'live_leafrust_samples_count',
-        'live_stemrust_samples' : 'SET-OF-live_stemrust_samples',
-        'live_stemrust_samples_count' : 'live_stemrust_samples_count',
-        'live_yellowrust_samples' : 'SET-OF-live_yellowrust_samples',
-        'live_yellowrust_samples_count' : 'live_yellowrust_samples_count',
-        'meta/instanceID' : 'meta-instanceID',
-        'other_crop' : 'other_crop',
-        'other_diseases_group/other_diseases' : 'other_diseases_group-other_diseases',
-        'phonenumber' : 'phonenumber',
-        'sample_size/number_leafrust_live' : 'sample_size-number_leafrust_live',
-        'sample_size/number_stemrust_dead_dna' : 'sample_size-number_stemrust_dead_dna',
-        'sample_size/number_stemrust_live' : 'sample_size-number_stemrust_live',
-        'sample_size/number_yellowrust_dead' : 'sample_size-number_yellowrust_dead',
-        'sample_size/number_yellowrust_live' : 'sample_size-number_yellowrust_live',
-        'sample_size/using_barcode' : 'sample_size-using_barcode',
-        'samples_collected' : 'samples_collected',
-        'samples_type' : 'samples_type',
-        'score_diseases' : 'SET-OF-score_diseases',
-        'score_diseases_count' : 'score_diseases_count',
-        'septoria/septoria_incidence' : 'septoria-septoria_incidence',
-        'septoria/septoria_severity' : 'septoria-septoria_severity',
-        'site_information/crop' : 'site_information-crop',
-        'site_information/field_area' : 'site_information-field_area',
-        'site_information/growth_stage' : 'site_information-growth_stage',
-        'site_information/survey_site' : 'site_information-survey_site',
-        'site_information/variety' : 'site_information-variety',
-        'start' : ('parse_date',(('name_out','start'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
-        'stem_rust/Stemrust_severity' : 'stem_rust-Stemrust_severity',
-        'stem_rust/stemrust_host_plant_reaction' : 'stem_rust-stemrust_host_plant_reaction',
-        'stem_rust/stemrust_incidence' : 'stem_rust-stemrust_incidence',
-        'subscriberid' : 'subscriberid',
-        'survey_infromation/location' : ('parse_location_kobotoolbox',()),
-        'survey_infromation/location_name' : 'survey_infromation-location_name',
-        'survey_infromation/survey_date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%Y-%m-%d'))),
-        'surveyor_infromation/country' : 'surveyor_infromation-country',
-        'surveyor_infromation/institution' : 'surveyor_infromation-institution',
-        'surveyor_infromation/surveyor_name' : 'surveyor_infromation-surveyor_name',
-        'today' : ('parse_date',(('name_out','today'),('fmt_in','%Y-%m-%d'))),
-        'username' : 'username',
-        'yellow_rust/yellowrust_host_plant_reaction' : 'yellow_rust-yellowrust_host_plant_reaction',
-        'yellow_rust/yellowrust_incidence' : 'yellow_rust-yellowrust_incidence',
-        'yellow_rust/yellowrust_severity' : 'yellow_rust-yellowrust_severity',
-        }
-
-    logger.debug('Performing download')
-
-    # perform a pull from the server, and if it fails write a warning message
-
-    download_success = True
-
-    try:
-
-        request = get_from_kobotoolbox(**form_credentials)
-
-    except requests.exceptions.RequestException as e:
-        status.reset('WARNING')
-
-        download_success = False
-
-    # define filenames
-    csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
-
-    csv_processed_filename = f"SurveyDataProcessed.csv"
-    csv_processed_path = f"{output_path}/{csv_processed_filename}"
-
-    if download_success:
-        # parse dataframe
-
-        dataframe_raw = build_dataframe(request)
-
-        logger.debug('Saving raw csv file')
-
-        df_raw_filename = f"{output_path}/{csv_filename}.csv"
-
-        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
-
-        # process to match ODK format
-
-        dataframe_processed = parse_columns(dataframe_raw,column_parser_dict)
-
-        logger.debug('Saving processed csv file')
-
-        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
-
-    if not download_success:
-
-        logger.info("Because server download failed somewhere, trying to recover by copying recent download")
-
-        copy_success = False
-
-        days_back = 1
-        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
-        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
-
-        while ((not copy_success) and (days_back <= acceptable_days_back)):
-
-            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
-
-            past_date = current_date - datetime.timedelta(days=days_back)
-
-            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
-            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
-
-            past_output_path = f"{past_jobPath}/{output_dir}/"
-
-            try:
-                # check that python or perl coordinator script succeeded for that date
-                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
-                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
-                assert success_py or success_perl
-
-                past_csv_filename = csv_processed_filename
-
-                logger.info(f"Looking for {past_output_path+past_csv_filename}")
-
-                copyfile(past_output_path+past_csv_filename,csv_processed_path)
-
-                assert os.path.isfile(csv_processed_path)
-
-                copy_success = True
-            except:
-                logger.info(f"Not found a kobotoolbox download in {past_output_path}")
-
-            days_back += 1
-
-        if not copy_success:
-            logger.error(f"Failed get a suitable copy of survey data.")
-            status.reset('ERROR')
-            endJob(status,premature=True)
-
-        logger.warning(f"Using download from {past_jobPath}.")
-
-    return csv_processed_path
-
-def get_from_WRSIS(form_credentials: dict, startDate: str, endDate: str):
-    date_params = {
-        'fromDate':startDate,
-        'toDate':endDate}
-
-    # set up http session
-    session = requests.Session()
-
-    # provide authorisation
-    session.auth = (form_credentials['user'],form_credentials['pass'])
-
-    response = session.post(f"{form_credentials['url']}getUKMetSurveyData",json=date_params)
-
-    # possible HTTP responses as provided in the API document
-    # (I've seen some other responses though, e.g. 415)
-    # It seems there is another layer of status codes
-    status_codes = {
-            200 : 'OK',
-            201 : 'Created',
-            202 : 'Accepted (Request accepted, and queued for execution)',
-            400 : 'Bad request',
-            401 : 'Authentication failure',
-            403 : 'Forbidden',
-            404 : 'Resource not found',
-            405 : 'Method Not Allowed',
-            409 : 'Conflict',
-            412 : 'Precondition Failed',
-            413 : 'Request Entity Too Large',
-            500 : 'Internal Server Error',
-            501 : 'Not Implemented',
-            503 : 'Service Unavailable'}
-
-    # checking the HTTP status code (not the code in the response)
-    if response.status_code == 200:
-        logger.info('HTTP request succeeded OK')
-
-    elif response.status_code in status_codes:
-        logger.info("HTTP response did not succeed OK, code is {:d}: {:s} ".format(response.status_code,status_codes[response.status_code]))
-        raise requests.exceptions.HTTPError('HTTP status was not 200')
-
-    else:
-        logger.info("HTTP response did not succeed OK, unknown code {:d}".format(response.status_code))
-        raise requests.exceptions.HTTPError('HTTP status was not 200')
-
-    return response
-
-def categorize_incident(incident):
-    '''Converting incident values into category string.
-       TODO: float values are not handled'''
-
-    try:
-        incident_value = int(incident)
-
-        if  0 < incident_value <= 20:
-            incident_category = "low"
-        elif 20 < incident_value <= 40:
-            incident_category = "medium"
-        elif 40 < incident_value <= 100:
-            incident_category = "high"
-        else:
-           incident_category = "none"
-    except:
-        if incident.lower() in ["low", "medium", "high", "none", "na"]:
-            incident_category = incident.lower()
-        else:
-            incident_category = "none"
-
-    return incident_category
-
-def nested_to_flattened(df):
-    '''WRSIS rust data is in a nested format, so it require to be flattened.
-       To do this, the nested data need to be spareated into dedicated columns.'''
-
-    # check if the dataframe is empty, if it is then add the raw columns
-    if len(df.index) == 0:
-        logger.info('Recent WRSIS download is empty.')
-        logger.info('Adding raw columns.')
-        RAW_COLUMNS = ["Rust Details","Other Disease","Sample Details","Survey Details.Latitude","Survey Details.First Rust Observation Date","Survey Details.Longitude","Survey Details.Kebele Name","Survey Details.Publish Date","Survey Details.Region Name","Survey Details.Survey Date","Survey Details.Season","Survey Details.Planting Date","Survey Details.Woreda Name","Survey Details.Location other details","Survey Details.Tillering Date","Survey Details.Zone Name","Survey Other Details.Moisture","Survey Other Details.Soil colour","Survey Other Details.Weed Control","Survey Other Details.Irrigated","Site Information.Wheat Type","Site Information.Growth Stage","Site Information.Varity Name","Site Information.Survey Site","Site Information.Site Area","Surveyor Details.Surveyors","Surveyor Details.Country","Surveyor Details.Other Surveyors","Surveyor Details.Institution Name","Fungicide Details.Fungicide Name","Fungicide Details.Spray Date","Fungicide Details.EffectiveNess","Fungicide Details.Used Dose"]
-        for i in RAW_COLUMNS:
-            df[i] = ""
-
-    # add new columns
-    logger.info('Adding new columns')
-    NEW_COLUMNS = ['imei', 'sample_size-number_yellowrust_live', 'sample_size-number_stemrust_live', 'dead_stemrust_samples_count', 'samples_collected', 'sample_size-number_yellowrust_dead', 'live_leafrust_samples_count', 'other_crop', 'live_yellowrust_samples_count', 'subscriberid', 'sample_size-using_barcode', 'start', 'score_diseases_count', 'phonenumber', 'survey_infromation-location-Accuracy', 'SET-OF-live_yellowrust_samples', 'SET-OF-score_diseases', 'meta-instanceID', 'deviceid', 'end', 'samples_type', 'live_stemrust_samples_count', 'dead_yellowrust_samples_count', 'SET-OF-live_leafrust_samples', 'KEY', 'other_diseases_group-other_diseases', 'survey_infromation-location-Altitude', 'SET-OF-dead_stemrust_samples', 'comment', 'sample_size-number_leafrust_live', 'today', 'SET-OF-dead_yellowrust_samples', 'username', 'SET-OF-live_stemrust_samples', 'sample_size-number_stemrust_dead_dna']
-
-    for i in NEW_COLUMNS:
-        df[i] = ""
-
-    #TODO: replace with a better KEY column
-    df["KEY"] = df.index
-
-    # add dedicated rust columns, with default values
-    NEW_RUST_COLUMNS = {"Stem Rust.Incident":"none","Stem Rust.Severity":"-9","Stem Rust.Reaction":"na",
-                   "Leaf Rust.Incident":"none","Leaf Rust.Severity":"-9","Leaf Rust.Reaction":"na",
-                   "Yellow Rust.Incident":"none","Yellow Rust.Severity":"-9","Yellow Rust.Reaction":"na",
-                   "Septoria.Incident":"none","Septoria.Severity":"0"}
-
-    for i in NEW_RUST_COLUMNS.keys():
-        df[i] = NEW_RUST_COLUMNS[i]
-
-    logger.info('Separating nested information into dedicated columns')
-
-    for index,row in df.iterrows():
-        nested_row = row["Rust Details"]
-        for rr in range(len(nested_row)):
-            # separating nested information into the dedicated columns
-            row[nested_row[rr]["Rust Type"] + ".Incident"] = categorize_incident(nested_row[rr]["Incident"])
-            row[nested_row[rr]["Rust Type"] + ".Severity"] = nested_row[rr]["Severity"]
-            row[nested_row[rr]["Rust Type"] + ".Reaction"] = nested_row[rr]["Reaction"]
-            df.loc[index] = row
-
-    return df
-
-def get_WRSIS_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
-    '''Given a dict with a single WRSIS form to download from WRSIS, obtains it and converts to csv.'''
-
-    output_dir = 'Export_WRSIS'
-    output_path = f"{jobPath}/{output_dir}/"
-
-    Path(output_path).mkdir(parents=True, exist_ok=True)
-
-    # get data from WRSIS
-
-    # keys are column names in the input dataframe
-    # values that are None mean they should be dropped
-    # values that are string simply rename the column
-    # values that are functions should be run with that key and returns series/dataframe
-    column_parser_dict = {
-        'Rust Details' : 'None',
-        'Other Disease' : 'None',
-        'Sample Details' : 'None',
-        'Survey Details.Latitude' : 'survey_infromation-location-Latitude',
-        'Survey Details.First Rust Observation Date' : 'None',
-        'Survey Details.Longitude' : 'survey_infromation-location-Longitude',
-        'Survey Details.Kebele Name' : 'None',
-        'Survey Details.Publish Date' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%d-%b-%Y'))),
-        'Survey Details.Region Name' : 'None',
-        'Survey Details.Survey Date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%d-%b-%Y'))),
-        'Survey Details.Season' : 'None',
-        'Survey Details.Planting Date' : 'None',
-        'Survey Details.Woreda Name' : 'None',
-        'Survey Details.Location other details' : 'None',
-        'Survey Details.Tillering Date' : 'None',
-        'Survey Details.Zone Name' : 'survey_infromation-location_name',
-        'Survey Other Details.Moisture' : 'None',
-        'Survey Other Details.Soil colour' : 'None',
-        'Survey Other Details.Weed Control' : 'None',
-        'Survey Other Details.Irrigated' : 'None',
-        'Site Information.Wheat Type' : 'site_information-crop',
-        'Site Information.Growth Stage' : 'site_information-growth_stage',
-        'Site Information.Varity Name' : 'site_information-variety',
-        'Site Information.Survey Site' : 'site_information-survey_site',
-        'Site Information.Site Area' : 'site_information-field_area',
-        'Surveyor Details.Surveyors' : 'surveyor_infromation-surveyor_name',
-        'Surveyor Details.Country' : 'surveyor_infromation-country',
-        'Surveyor Details.Institution Name' : 'surveyor_infromation-institution',
-        'Surveyor Details.Other Surveyors' : 'None',
-        #'Fungicide Details.Fungicide Name' : 'None',
-        #'Fungicide Details.Spray Date' : 'None',
-        #'Fungicide Details.EffectiveNess' : 'None',
-        #'Fungicide Details.Used Dose' : 'None',
-        "Yellow Rust.Severity" : 'yellow_rust-yellowrust_severity',
-        "Yellow Rust.Incident" : 'yellow_rust-yellowrust_incidence',
-        "Yellow Rust.Reaction" : 'yellow_rust-yellowrust_host_plant_reaction',
-        "Stem Rust.Severity" : 'stem_rust-Stemrust_severity',
-        "Stem Rust.Incident" : 'stem_rust-stemrust_incidence',
-        "Stem Rust.Reaction" : 'stem_rust-stemrust_host_plant_reaction',
-        "Leaf Rust.Severity" : 'leaf_rust-leafrust_severity',
-        "Leaf Rust.Incident" : 'leaf_rust-leafrust_incidence',
-        "Leaf Rust.Reaction" : 'leaf_rust-leafrust_host_plant_reaction',
-        "Septoria.Severity" : 'septoria-septoria_severity',
-        "Septoria.Incident" : 'septoria-septoria_incidence'
-    }
-
-    # perform a pull from the server, and if it fails write a warning message
-
-    download_success = True
-
-    start_date = datetime.datetime.strptime('01-03-2022','%d-%m-%Y').strftime('%d-%m-%Y') #TODO: set start date
-    end_date = datetime.datetime.strptime(config['StartString'], '%Y%m%d').strftime('%d-%m-%Y')
-
-    logger.debug(f'Performing download from WRSIS between {start_date} and {end_date}')
-
-    try:
-        request = get_from_WRSIS(form_credentials,start_date,end_date)
-
-    except requests.exceptions.RequestException as e:
-        status.reset('WARNING')
-
-        download_success = False
-
-    # define filenames
-    csv_filename = f"SurveyData_raw.csv"
-
-    csv_processed_filename = f"SurveyDataProcessed.csv"
-    csv_processed_path = f"{output_path}/{csv_processed_filename}"
-
-    if download_success:
-        # parse dataframe
-
-        logger.debug('Saving raw csv file')
-
-        df_raw_filename = f"{output_path}/{csv_filename}"
-        dataframe_raw = json_normalize(request.json()["response"]["Rust Survey Data"])
-
-        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
-
-        # flatten the nested dataframe
-        dataframe_flattened = nested_to_flattened(dataframe_raw)
+def do_nothing(*args, **kwargs):
+    '''Dummy function'''
 
-        # process to match ODK format
-        dataframe_processed = parse_columns(dataframe_flattened,column_parser_dict)
-
-        logger.debug('Saving processed csv file')
-
-        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
-
-    if not download_success:
-
-        logger.info("Because server download failed somewhere, trying to recover by copying recent download")
-
-        copy_success = False
-
-        days_back = 1
-        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
-        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
-
-        while ((not copy_success) and (days_back <= acceptable_days_back)):
-
-            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
-
-            past_date = current_date - datetime.timedelta(days=days_back)
-
-            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
-            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
-
-            past_output_path = f"{past_jobPath}/{output_dir}/"
-
-            try:
-                # check that python or perl coordinator script succeeded for that date
-                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
-                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
-                assert success_py or success_perl
-
-                past_csv_filename = csv_processed_filename
-
-                logger.info(f"Looking for {past_output_path+past_csv_filename}")
-
-                copyfile(past_output_path+past_csv_filename,csv_processed_path)
-
-                assert os.path.isfile(csv_processed_path)
-
-                copy_success = True
-            except:
-                logger.info(f"Not found a WRSIS download in {past_output_path}")
-
-            days_back += 1
-
-        if not copy_success:
-            logger.error(f"Failed get a suitable copy of survey data.")
-            status.reset('ERROR')
-            endJob(status,premature=True)
-
-        logger.warning(f"Using download from {past_jobPath}.")
-
-    return csv_processed_path
-
-def process_in_job_survey(jobPath,status,config,component):
-    logger.info('started process_in_job_survey()')
-
-    logger.debug('Performing download(s) from ODK server')
-
-    credentials_filename = config['Survey']['ServerCredentialsFile']
-    with open(credentials_filename) as credentials_file:
-
-        cred = json.load(credentials_file)
-
-        assert 'forms' in cred.keys()
-
-    csv_filenames = {}
-    for form in cred['forms']:
-
-        logger.debug(f"Starting to download {form['form_id']}")
-
-        get_form_as_csv_dict = {
-            'ODK' : get_ODK_form_as_csv,
-            'kobotoolbox' : get_kobotoolbox_form_as_csv,
-            'WRSIS' : get_WRSIS_form_as_csv
-        }
-
-        assert form['type'] in get_form_as_csv_dict
-
-        func_get_form_as_csv = get_form_as_csv_dict[form['type']]
-
-        csv_filename = func_get_form_as_csv(form, jobPath, config, status)
-
-        csv_filenames[form['form_id']] = csv_filename
-
-    # load each file of surveys as a dataframe
-    forms = {}
-    for form_name,form_fn in csv_filenames.items():
-
-        # some define column types, hardwired for now
-        col_types = {'comment':'str'}
-
-        form_df = read_csv(form_fn,dtype=col_types)
-
-        forms[form_name] = form_df
-
-    # create some standard dataframe modification functions
-    def add_column(df,coln,value):
-        df[coln]=value
-        return
-
-    def remove_column(df,coln,value):
-        del df[coln]
-        return
-
-    def replace_column(df,coln,value):
-        df[coln]=value
-        return
-
-    def filter_by_column(df,coln,value):
-        # CAUTION: This requires surveyor to provide the correct country
-        df.drop(df.loc[df[coln]!=value].index,inplace=True)
-        #TODO : for Kenya data, provide a coordinate-based filter
-        return
-
-    def filter_by_list(df,coln,values):
-        # CAUTION: This requires surveyor to provide the correct list of countries
-        df.drop(df.loc[~df[coln].isin(values)].index,inplace=True)
-        return
-
-    func_types = {
-        'add': add_column,
-        'remove' : remove_column,
-        'replace' : replace_column,
-        'filter' : filter_by_column,
-        'filter_by_list' : filter_by_list
-    }
-
-    # simple format alignment using edits on config
-    # (should this need to be much more sophisticated, reconsider the workflow)
-    if 'FormEdits' in config['Survey']:
-
-        form_edits = config['Survey']['FormEdits']
-
-        # loop over each form
-        for form_name, edits in form_edits.items():
-
-            form_df = forms[form_name]
-
-            # loop over each type of edit
-            for func_type, columns in edits.items():
-
-                # check the function is available
-                assert func_type in func_types
-
-                # loop over each column to modify
-                for coln,val in columns.items():
-
-                    # apply the edit
-                    func_types[func_type](form_df,coln,val)
-
-    # Merge additional SurveyData files and rearrange columns to be consistent
-    # Assumes that the same columns are present in all forms
-    # and that the first form is the standard
-
-    first=True
-    for dfi in forms.values():
-
-        if first:
-            standard_columns = dfi.columns.tolist()
-            dfm = dfi
-
-            logger.debug(f"First processed form contains {dfm.shape[0]} records")
-
-            first=False
-            continue
-
-        # re-order columns to match first case (presumed standard format)
-        dfi = dfi[standard_columns]
-
-        logger.debug(f"Next processed form contains {dfi.shape[0]} records")
-
-        dfm = concat([dfm,dfi],axis='rows')
-
-    # save the result
-    ODK_csv_path = f"{jobPath}/ExportCSV/"
-    forms_fn = f"{ODK_csv_path}/Merged_SurveyData.csv"
-    dfm.to_csv(forms_fn,index=False,quoting=csv.QUOTE_MINIMAL)
-
-    logger.debug(f"Preparing to apply removals and additions to ODK survey data")
-
-    processed_surveys_filepath = f"{ODK_csv_path}/Processed_SurveyData.csv"
-
-    survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
-    survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"
-
-    # perform here in python, using the 'KEY' column
-    # check the key column is unique
-
-    assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'
-
-    df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
-    keys_to_rm = df_rm['KEY']
-
-    # check that all of the keys to remove exist in the original data
-    rm_keys_found = df_rm['KEY'].apply(lambda cell: cell in dfm['KEY'].values)
-    n_rm_keys_found = rm_keys_found.sum()
-    n_rm_keys = rm_keys_found.size
-    if not np_all(rm_keys_found):
-        # this might happen if the run date is in the past
-        logger.warning(f"Only found {n_rm_keys_found} of {n_rm_keys} survey errors to remove")
-
-        rm_keys_not_found = df_rm[~rm_keys_found]
-        logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}")
-
-    # identify which surveys to remove
-    idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values)
-
-    #drop them in-place
-    dfm = dfm[~idx_to_rm]
-    logger.info(f"Removed {n_rm_keys_found} erroneous surveys")
-
-    # add the extra entries
-    df_add = read_csv(survey_additions_filepath,dtype='str')
-    n_add_keys = df_add.shape[0]
-    df_join = concat([dfm,df_add])
-    assert dfm.shape[0]+df_add.shape[0] == df_join.shape[0], 'Unexpected result of including additional surveys'
-
-    logger.info(f"Added {n_add_keys} additional surveys")
-
-    # save as processed
-    df_join.to_csv(processed_surveys_filepath,index=False,quoting=csv.QUOTE_MINIMAL)
-
-    logger.debug('Preparing clustering calculation')
-
-    date = datetime.datetime.now()
-
-    cluster_calc_path = "/storage/app/EWS_prod/code/wheat_source_generation/"
-
-    # clear old output
-    old_clustering_output_glob = f"{cluster_calc_path}/output/sources_*"
-    old_clustering_outputs = glob(old_clustering_output_glob)
-
-    logger.info('About to unlink old output from clustering calculation')
-    for path in old_clustering_outputs:
-        logger.info(f"unlinking {path}")
-        Path(path).unlink()
-
-    # prepare environment for clustering calc
-
-    RPath = '/usr/local/R/bin/Rscript'
-
-    clustering_script = f"{cluster_calc_path}/code/R/clustering.R"
-
-    clustering_env = {
-            **os.environ,
-            'R_LIBS':'/home/ewsmanager/R-packages-EWS-clustering/x86_64-pc-linux-gnu-library/3.5',
-            'PROJ_LIB' : '/usr/share/proj/', # conda env breaks the automatic assignment of PROJ_LIB
-            }
-
-    clustering_config = config['Survey']['SourcesConfigFilename']
-    assert os.path.isfile(clustering_config)
-
-    clustering_calc = [RPath,
-            '--no-init-file',
-            clustering_script,
-            processed_surveys_filepath,
-            config['StartString'],
-            '-2',
-            '7',
-            config['Survey']['SourcesConfigFilename']]
-
-    logger.debug('Performing clustering calculation')
-
-    description_short = 'wheat-source-generation'
-    description_long = 'source calculation on processed surveys'
-
-    try:
-        subprocess_and_log(clustering_calc, description_short, description_long, env=clustering_env)
-    except:
-        status.reset('ERROR')
-        endJob(status,premature=True)
-
-    logger.debug('Checking output of clustering calculation')
-
-    output_directory = f"{jobPath}/SURVEYDATA_{config['StartString']}_0000"
-    Path(output_directory).mkdir(parents=True, exist_ok=True)
-
-    try:
-        logger.debug('Trying to copy the dataset processed for clustering')
-
-        clustering_proc_path_glob = f"{cluster_calc_path}/output/survey_data_processed_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
-        clustering_proc_path_list = glob(clustering_proc_path_glob)
-        if len(clustering_proc_path_list) == 0:
-            logger.debug(f"No processed files produced from clustering in {clustering_proc_path_glob}")
-            raise Exception
-
-        elif len(clustering_proc_path_list) > 1:
-            logger.debug(f"Multiple processed files produced from clustering in {clustering_proc_path_glob}")
-            raise Exception
-
-        else:
-            logger.debug('Found 1 processed file, placing copy of result in job directory')
-
-            proc_filename = f"survey_data_processed_{config['StartString']}.csv"
-            proc_path = f"{output_directory}/{proc_filename}"
-
-            logger.debug(f"as {proc_path}")
-
-            copyfile(clustering_proc_path_list[0], proc_path)
-
-    except:
-        logger.debug('Failed to get a copy of the dataset processed for clustering')
-
-    clustering_output_path_glob = f"{cluster_calc_path}/output/sources_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
-    clustering_output_path_list = glob(clustering_output_path_glob)
-    if len(clustering_output_path_list) == 0:
-        logger.error(f"No output produced from clustering in {clustering_output_path_glob}")
-        status.reset('ERROR')
-        endJob(status,premature=True)
-    if len(clustering_output_path_list) > 1:
-        logger.error(f"Multiple outputs produced from clustering in {clustering_output_path_glob}")
-        status.reset('ERROR')
-        endJob(status,premature=True)
-
-    logger.debug('Placing copy of result in job directory')
-
-    output_filename = f"sources_{config['StartString']}.csv"
-    output_path = f"{output_directory}/{output_filename}"
-
-    logger.debug(f"as {output_path}")
-
-    copyfile(clustering_output_path_list[0], output_path)
-
-    return [output_path]
-
-def process_in_job_env2_0(jobPath,status,config,component):
-    logger.info('started process_in_job_env2_0()')
-
-    logger.info('Copying file from remote server to job directory')
-
-    file_path = Template(config[component]['ServerPathTemplate']).substitute(**config)
-    file_name = Template(config[component]['InputFileTemplate']).substitute(**config)
-
-    #TODO: check if file exists already (may be the case for multiple configs in one)
-
-    # TODO: perform ssh file transfer in python instead of subprocess
-    cmd_scp = ["scp","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath]
-    description_short = 'env2 scp'
-    description_long = 'Copying file from remote server to job directory'
-
-    # lawrence comment in/out
-    subprocess_and_log(cmd_scp,description_short, description_long)
-
-    logger.info('untarring the input file')
-
-    # untar incoming name data
-    output_directory = f"{jobPath}/NAME_Met_as_netcdf"
-    Path(output_directory).mkdir(parents=True, exist_ok=True)
-    tarfile_name = f"{jobPath}/{file_name}.tar.gz"
-    with tarfile.open(tarfile_name) as tar:
-        members = remove_path_from_tar_members(tar)
-        tar.extractall(output_directory, members = members)
-
-    # basic check that contents are as expected for 7-day forecast (57 timepoints in all files)
-    cube_wildcard = f"{output_directory}/*.nc"
-    cubes: CubeList = iris.load(cube_wildcard)
-    for cube in cubes:
-        coord = cube.coord("time")
-        timepoint_count = coord.shape[0]
-        if timepoint_count != 57:
-            msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}"
-            logger.error(msg)
-            raise RuntimeError(msg)
-
-    region = config['RegionName']
-
-    logger.info(f"Calling environmental suitability 2.0 for {region} so wait for output to appear")
-
-    pipeline_config = config["Environment"]
-    try:
-        #todo lawrence comment this back to original (extracted=False)
-        esp.run_pipeline(pipeline_config, region, config["StartString"], extracted=False)
-    except:
-        logger.exception(f"Some failure when running EnvSuitPipeline.py")
-        raise
-
-    logger.info('Finished running environmental suitability 2.0')
-
-    # TODO: Check that the output appears as expected
-
-    return
-
-def process_copy_past_job_env2_0(jobPath,status,config,component):
-    '''For when we want to skip process_in_job() to test the other components of
-    this script. Currently hard-wired.'''
-
-    # TODO: remove this hard-wired assumption
-    jobPath_to_copy = f"{jobPath}/../{short_name['Environment']}_{config['StartString']}_bak/"
-
-    assert os.path.exists(jobPath_to_copy)
-
-    dir_src = f"{jobPath_to_copy}/processed/"
-
-    dir_dst = f"{jobPath}/processed/"
-
-    logger.info(f"Copying from {dir_src}")
-
-    logger.info(f"to {dir_dst}")
-
-    copy_tree(dir_src,dir_dst)
-
-    logger.info('Copying complete')
-
-    return
-
-def process_in_job_dep(jobPath,status,config,component):
-    logger.info('started process_in_job_dep()')
-
-    file_path = Template(config[component]['ServerPathTemplate']).substitute(**config)
-    file_name = Template(config[component]['InputFileTemplate']).substitute(**config)
-
-    logger.info(f"Expecting to work with {file_name}")
-
-    if os.path.exists(f"{jobPath}/{file_name}"):
-        logger.info('Directory already exists in job directory, so nothing to do here')
-        return
-
-    logger.info('Copying file from remote server to job directory')
-
-    # TODO: perform ssh file transfer in python instead of subprocess
-    cmd_scp = ["scp","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath]
-    description_short = 'dep scp'
-    description_long = 'scp from server to job directory'
-    subprocess_and_log(cmd_scp, description_short, description_long)
-
-    logger.info('untarring the input file')
-
-    # TODO: untar file in python (with tarfile module) instead of subprocess
-    cmd_tar = ["tar","-xzf",f"{jobPath}/{file_name}.tar.gz","-C",jobPath]
-    description_short = 'dep tars'
-    description_long = 'untar the downloaded file'
-    subprocess_and_log(cmd_tar, description_short, description_long)
-
-    # basic check that contents are as expected
-    # 132 files of NAME .txt timesteps and one summary png file
-    if len(glob(f"{jobPath}/{file_name}/deposition_srcs_allregions_C1_T*.txt")) != 56:
-        msg = f"Unexpect number of deposition .txt files in input tar file. Expected 56."
-        logger.error(msg)
-        raise RuntimeError(msg)
-    return
-
-def create_epi_config_string(config,jobPath,startString,endString):
-
-    configtemplate_fn = config['ConfigFilePath']
-    configName_withoutEpi = f"{os.path.basename(configtemplate_fn).replace('.json','')}_{startString}-{endString}"
-
-    # create a string describing every epi calc configuration
-    epiStrings = []
-    for epiconf in config['Epidemiology']['Epi']:
-        epiKwargsString = ''.join([f"{k}{v}" for k,v in epiconf['modelArguments'].items()])
-
-        # drop any repetitive elements of kwarg
-        epiKwargsString = epiKwargsString.replace('infectionprevious','')
-        epiKwargsString = epiKwargsString.replace('capbeta','cb')
-
-        epiCaseString = f"{epiconf['model'].lower()}{epiKwargsString}"
-
-        # provide to configuration for output filename
-        epiconf["infectionRasterFileName"] = f"{jobPath}/infections_{configName_withoutEpi}_{epiCaseString}"
-
-        epiStrings += [epiCaseString]
-
-    epiString = '-'.join(epiStrings)
-
-    config_filename = f"{configName_withoutEpi}_{epiString}"
-
-    logger.debug(f"length of config filename is {len(config_filename)}.")
-
-    if len(config_filename) > 254:
-        logger.info(f"filename length is too long, it will raise an OSError, using a short form instead")
-
-        # epi cases are not described in filename, an interested user
-        # must look in the json file for details.
-        config_filename = configName_withoutEpi
-
-        assert len(config_filename) <= 254
-
-    return config_filename
-
-def raster_to_csv(raster_fn,csv_fn):
-
-    # create a csv version and save in the job directory,
-    # to compare host raster with dep and env suit
-    # note this can be time-varying by providing additional rows
-    with rio_open(raster_fn,'r') as host_raster:
-        host_arr = host_raster.read(1)
-        shape = host_raster.shape
-
-        # determine coordinates
-        coords = [host_raster.xy(i,j) for i in range(shape[0]) for j in range(shape[1])]
-        lons = unique([ci[0] for ci in coords])
-        lats = unique([ci[1] for ci in coords])
-        assert shape == (lats.size,lons.size)
-
-    # build into a dataframe
-    # (rasters start in the top left, so descending latitude coordinates)
-    host_df = DataFrame(data=host_arr,index=lats[::-1],columns=lons)
-    # rearrange to ascending latitude corodinates
-    host_df.sort_index(axis='rows',inplace=True)
-    # make spatial coordinates a multi-index, like for dep and env suit csvs
-    host_series = host_df.stack()
-    # for now, provide a nominal date of validity to enable a time column
-    # so far, using mapspam which is a static map, so time is irrelevant
-    host_series.name = '201908150000'
-    host_df2 = DataFrame(host_series).T
-
-    host_df2.to_csv(csv_fn)
-
-    return
-
-def process_in_job_epi(jobPath,status,config,component):
-    logger.info('started process_in_job_epi()')
-
-    # TODO: Some of this is modifying config before epi model is run. Determine
-    # how to account for that
-
-    # initialise any needed variables
-
-    reference_date_str = config['StartString']
-    reference_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d')
-
-    start_date, end_date = calc_epi_date_range(reference_date_str,config['Epidemiology']['CalculationSpanDays'])
-
-    date_diff = end_date - start_date
-
-    start_string = start_date.strftime('%Y-%m-%d-%H%M')
-    start_string_short = start_date.strftime('%Y%m%d%H%M')
-    end_string = end_date.strftime('%Y-%m-%d-%H%M')
-
-    # update config accordingly
-    config['ReferenceTime'] = reference_date_str
-    config['StartTime'] = start_string
-    config['StartTimeShort'] = start_string_short
-    config['EndTime'] = end_string
-
-    diseases = config['Epidemiology']['DiseaseNames']
-
-    def gather_deposition(config_epi,config,variable_name,start_date,end_date,jobDataPath,status):
-
-        # TODO: Simplify the set of required arguments . Check if config is necessary.
-
-        config_epi['Deposition']['VariableName'] = variable_name # disease_latin_name_dict[disease]+'_DEPOSITION'
-
-        config_epi['Deposition']['FileNamePrepared'] = f"{jobDataPath}/data_input_deposition.csv"
-
-        # Use config-defined file lister in config file instead of here
-        file_lister_dep_name = config_epi['Deposition'].get('FileListerFunction',None)
-
-        # when it isn't defined, guess what it should be
-        if file_lister_dep_name is None:
-
-            file_lister_dep_name = 'list_deposition_files_operational'
-
-            if date_diff > datetime.timedelta(days=7):
-
-                file_lister_dep_name = 'list_deposition_files_historical'
-                logger.info('Using historical method to prepare data on spore deposition')
-
-        file_lister_dep = getattr(EpiPrepLister,file_lister_dep_name)
-
-        config_for_lister = config.copy()
-        config_for_lister.update(config_epi)
-
-        # get bounds of host map, to exclude redundant deposition datapoints
-        hostRasterFileName = config_for_lister["Host"]["HostRaster"]
-        with rio_open(hostRasterFileName) as hostRaster:
-            bounds = hostRaster.bounds
-
-        lister_kwargs = {}
-        lister_kwargs['reference_date']=config['ReferenceTime']
-
-        loader_kwargs= {}
-        loader_kwargs['VariableName']= config_for_lister['Deposition'].get('VariableName')
-        loader_kwargs['VariableNameAlternative']= config_for_lister['Deposition'].get('VariableNameAlternative')
-        loader_kwargs['bounds'] = bounds
-
-        try:
-
-            EpiPrep.prep_input(config_for_lister,start_date,end_date,
-                    component='Deposition',
-                    file_lister=file_lister_dep,
-                    file_loader=EpiPrepLoader.load_NAME_file,
-                    lister_kwargs=lister_kwargs,
-                    **loader_kwargs)
-
-            assert os.path.isfile(config_epi['Deposition']['FileNamePrepared'])
-
-        except:
-
-            logger.exception(f"Unexpected error in deposition data preparation")
-            status.reset('ERROR')
-            endJob(status,premature=True)
-
-        return
-
-    # get list of variable names to be loaded from deposition input
-    depo_variable_names =  config['Epidemiology']['Deposition']['VariableNames']
-    assert len(depo_variable_names) == len(diseases)
-
-    # loop over each sub region
-
-    region = config['RegionName']
-    #for region in config['SubRegionNames']:
-
-    for disease in diseases:
-
-        assert disease in disease_latin_name_dict.keys()
-
-        config_epi = config['Epidemiology'].copy()
-
-        # TODO: CAUTION: Any iterations (e.g. disease or sub-region) are hidden
-        # in jobPath, and not retained in the config file. This is a provlem for
-        # process_EWS_plotting_epi which receives a single config file and must
-        # try a fudge to retrieve details for each iteration.
-        # This should be improved, either by making the one config file
-        # aware of all of the iterations, or looping over iterations in
-        # Processor.py with one iteration-specific config.
-        case_specific_path = f"{jobPath}/{region}/{disease}/"
-        Path(case_specific_path).mkdir(parents=True, exist_ok=True)
-
-        logger.info(f"Preparing for epidemiology calc of {disease} in {region}")
-
-        # create config_filename to describe job configuration
-        config_filename = create_epi_config_string(config,case_specific_path,start_string,end_string)
-
-        # prepare a directory for input data
-        jobDataPath = f"{case_specific_path}/input_data/"
-        Path(jobDataPath).mkdir(parents=True, exist_ok=True)
-
-        # configure filename of prepared deposition data
-
-        if 'Deposition' in config_epi:
-
-            # determine which variable name to load for this disease
-            disease_idx = [i for i,j in enumerate(diseases) if j==disease][0]
-
-            variable_name = depo_variable_names[disease_idx]
-
-            gather_deposition(config_epi,config,variable_name,start_date,end_date,jobDataPath,status)
-
-        # configure filename of prepared deposition data
-
-        if 'Environment' in config_epi:
-
-            logger.info('Preparing environmental suitability data')
-
-            config_epi['SubRegionName'] = region
-
-            config_epi['DiseaseName'] = disease
-
-            config_epi['Environment']['FileNamePrepared'] = f"{jobDataPath}/data_input_environment.csv"
-
-            # Use config-defined file lister in config file instead of here
-            file_lister_env_name = config_epi['Environment'].get('FileListerFunction',None)
-
-            # when it isn't defined, guess what it should be
-            if file_lister_env_name is None:
-
-                use_monthly_chunk=False # hard-coded for historical analysis
-                file_lister_env_name = 'list_env_suit_files_operational'
-
-                if (date_diff > datetime.timedelta(days=7)) & ('ENVIRONMENT_2.0' in config_epi['Environment']['PathTemplate']) & use_monthly_chunk:
-
-                    logger.info('Using monthly-chunk method to prepare data on environmental suitability')
-                    file_lister_env_name = 'list_env_suit_files_historical_monthlychunk'
-
-                elif date_diff > datetime.timedelta(days=7):
-
-                    logger.info('Using historical method to prepare data on environmental suitability')
-                    file_lister_env_name = 'list_env_suit_files_historical'
-
-            file_lister_env = getattr(EpiPrepLister,file_lister_env_name)
-
-            config_for_lister = config.copy()
-            config_for_lister.update(config_epi)
-
-            try:
-
-                EpiPrep.prep_input(config_for_lister,start_date,end_date,
-                        component='Environment',
-                        file_loader=EpiPrepLoader.load_env_file,
-                        file_lister=file_lister_env)
-
-                assert os.path.isfile(config_epi['Environment']['FileNamePrepared'])
-
-            except:
-
-                logger.exception(f"Unexpected error in env data preparation")
-                status.reset('ERROR')
-                endJob(status,premature=True)
-
-        # prepare a copy of the host data
-
-        logger.info('Preparing a copy of the host raster data')
-
-        src_host = config_epi['Host']['HostRaster']
-        fn_host = os.path.basename(src_host)
-        dst_host = f"{jobDataPath}/{fn_host}"
-
-        # copy the tif to the job directory and refer to that instead
-        shutil.copyfile(src_host,dst_host)
-        config_epi['Host']['HostRaster'] = dst_host
-
-        logger.info('Preparing a copy of the host data as csv')
-
-        dst_host_csv = dst_host.replace('.tif','.csv')
-
-        raster_to_csv(dst_host,dst_host_csv)
-
-        config_epi['Host']['HostCSV'] = dst_host_csv
-
-        # provide fundamental config elements to config_epi
-        for k,v in config.items():
-            if k not in short_name.keys():
-                config_epi[k]=v
-
-        logger.debug('Incremental configuration looks like:')
-        def print_item(item):
-            logger.debug(f"Item {item}")
-            logger.debug(json.dumps(item,indent=2))
-        def iterate(items):
-            for item in items.items():
-                if hasattr(item,'items'):
-                    # iterate
-                    iterate(item)
-                else:
-                    print_item(item)
-        iterate(config_epi)
-
-        logger.debug('Complete configuration looks like:')
-        logger.debug(json.dumps(config_epi,indent=2))
-
-        # write the complete configuration file to job directory
-        with open(f"{case_specific_path}/{config_filename}.json",'w') as write_file:
-            json.dump(config_epi,write_file,indent=4)
-
-        # run epi model
-
-        try:
-            EpiModel.run_epi_model(f"{case_specific_path}/{config_filename}.json")
-        except:
-            logger.exception('Unexpected error in EpiModel')
-            raise
-
-        # perform calc on output
-
-        def calc_total(arr):
-            return 'total', arr.sum()
-
-        def calc_max(arr):
-            return 'maximum', arr.max()
-
-        def calc_mean(arr):
-            return 'mean', arr.mean()
-
-        for epiconf in config['Epidemiology']['Epi']:
-
-            outfile = epiconf["infectionRasterFileName"]
-
-            with rio_open(outfile+'.tif','r') as infectionRaster:
-                infection = infectionRaster.read(1)
-
-                # define function to quantify overall result, for easy check
-                # TODO: Create a more meaningful result?
-                # TODO: make this configurable
-                analysis_func = calc_mean
-
-                analysis_desc, analysis_value = analysis_func(infection)
-
-                logger.info(f"For case {outfile}")
-                logger.info('Infection {:s} is {:.2e}'.format( analysis_desc, analysis_value))
-
-                # to save tif as png for easy viewing
-                logger.debug('Saving tif output as png for easier viewing')
-                plotRaster.save_raster_as_png(outfile)
-
-        # comparison figure
-
-        # TODO: make this plot configurable? with function or args?
-        #logger.info('Plotting epi output alongside contributing components')
-        # figure_func = getattr(EpiAnalysis,'plot_compare_host_env_dep_infection')
-        logger.info('Plotting composite image of epi formulations')
-        figure_func = getattr(EpiAnalysis,'plot_compare_epi_cases')
-
-        # isolate the config for this function, in case of modifications
-        config_epi_for_comparison = config_epi.copy()
-
-        fig,axes,cases = figure_func(
-                config_epi_for_comparison,
-                start_str = start_string,
-                end_str = end_string)
-
-        SaveFileName = f"{case_specific_path}/EPI_{config_filename}_comparison"
-
-        fig.savefig(SaveFileName+'.png',dpi=300)
-
-        # slice the epi results into before forecast and in forecast
-
-        for epiconf in config['Epidemiology']['Epi']:
-
-            outfile = epiconf["infectionRasterFileName"]+'_progression.csv'
-
-            fn_seasonsofar = epiconf["infectionRasterFileName"]+'_seasonsofar.csv'
-            fn_weekahead = epiconf["infectionRasterFileName"]+'_weekahead.csv'
-
-            # load the full epi results
-            df_full = read_csv(outfile,header=[0],index_col=[0,1])
-            column_date_fmt = f"X{config['StartTimeShort']}_X%Y%m%d%H%M"
-            df_full_dates = to_datetime(df_full.columns.astype('str'),format=column_date_fmt)
-
-            # determine date to cut with
-            # plus 1 minute so midnight is associated with preceding day
-            date_to_cut = datetime.datetime.strptime(config['StartString']+'0001','%Y%m%d%H%M')
-            dates_after_cut = df_full_dates >= date_to_cut
-            idx = argmax(dates_after_cut)-1
-
-            # build seasonsofar dataframe (only need the last date)
-            df_seasonsofar = df_full.iloc[:,idx]
-
-            # check column name is defined as expected
-            # from epi start time to forecast start time
-            column_name = f"X{config['StartTimeShort']}_X{config['StartString']}0000"
-            assert df_seasonsofar.name == column_name
-
-            #  save to csv
-            df_seasonsofar.to_csv(fn_seasonsofar,header=True,index=True)
-
-            # build weekahead dataframe and save to csv
-            df_fc_start = df_full.iloc[:,idx]
-            df_fc_start_name = df_fc_start.name.split('_')[-1]
-
-            df_fc_end = df_full.iloc[:,-1]
-            df_fc_end_name = df_fc_end.name.split('_')[-1]
-
-            df_weekahead = df_fc_end - df_fc_start
-
-            # defined column name
-            df_weekahead.name = '_'.join([df_fc_start_name,df_fc_end_name])
-
-            # save to csv
-            df_weekahead.to_csv(fn_weekahead,header=True,index=True)
-
-    return
-
-def do_nothing(*args, **kwargs):
-    '''Dummy function'''
-
-    logger.info('Called do_nothing(). Nothing to do here')
+    logger.info('Called do_nothing(). Nothing to do here')
 
     pass
     return []
-
-#TODO
-def process_EWS_plotting_survey(jobPath,config):
-    '''Returns a list of output files for transfer.'''
-
-    logger.info('started process_EWS_plotting_survey(), nothing to do')
-
-    pass
-    return []
-
-'''class EWSPlottingEnvSuit(EWSPlottingEnvSuitBase):
-
-    def set_custom_params(self,
-                          sys_params_dict: dict,
-                          chart_params_dict: dict,
-                          run_params_dict: dict,
-                          disease_csv_template_arg: str,
-                          diseases: List[EnvSuitDiseaseInfo]):
-        # this is unique to the asia/east africa env suit, as we are not filtering within country boundaries
-        run_params_dict[RUN_PARAMS.FILTER_FOR_COUNTRY_KEY] = "False"'''
-
-#TODO test if this works
-def process_EWS_plotting_env2_0(jobPath,config):
-    '''Configures the plotting arguments and calls EWS-plotting as a python module.
-    Returns a list of output files for transfer.'''
-
-    logger.info('started process_EWS_plotting_env2_0()')
-
-    main_region = config['RegionName']
-
-    input_dir = f"{jobPath}/processed/{main_region}"
-
-    subregions = config['SubRegionNames']
-
-    EWSPlottingOutputGlobs = []
-
-    # work on each region
-    for region in subregions:
-
-        output_dir = f"{jobPath}/plotting/{region.lower()}"
-        csv_template_dir = input_dir + "/{DISEASE_DIR}/RIE_value.csv"
-
-        Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-        sys_config = config['Environment']['EWS-Plotting']['SysConfig']
-        run_config = config['Environment']['EWS-Plotting']['RunConfig']
-        chart_config = config['Environment']['EWS-Plotting'][region]['ChartConfig']
-        filter_for_country = config['Environment']['EWS-Plotting'][region]['FilterForCountry']
-
-        # Note that this runs all disease types available
-
-        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
-
-        env_suit_plotter = EWSPlottingEnvSuitBase()
-        env_suit_plotter.set_param_config_files(sys_params_file_arg=sys_config,
-                                             chart_params_file_arg=chart_config,
-                                             run_params_file_arg=run_config,
-                                             es_output_dir_arg=output_dir,
-                                             issue_date_arg=config['StartString'],
-                                             disease_csv_template_arg=csv_template_dir)
-
-        env_suit_plotter.run_params.FILTER_FOR_COUNTRY = (filter_for_country.upper() == "TRUE")
-
-        # Include further diseases in plotting. In this case the irrigated suitabilite for the rusts.
-        # TODO: move this part out into a config
-        extra_diseases = [
-            EnvSuitDiseaseInfo("Stem rust temp-only", "stem_rust_temponly", config['StartString'], "StemRust_TempOnly", csv_template_dir),
-            EnvSuitDiseaseInfo("Leaf rust temp-only", "leaf_rust_temponly", config['StartString'], "LeafRust_TempOnly", csv_template_dir),
-            EnvSuitDiseaseInfo("Stripe rust temp-only", "stripe_temponly", config['StartString'], "StripeRust_TempOnly", csv_template_dir)
-        ]
-
-        env_suit_plotter.add_diseases(diseases=extra_diseases)
-
-        env_suit_plotter.plot_env_suit()
-
-        # check the output
-        EWSPlottingOutputDir = f"{output_dir}/images/"
-        #EWSPlottingOutputGlobs += [
-        #        # daily plots
-        #        f"{EWSPlottingOutputDir}Daily/suitability_{region.lower()}_*_rust_daily_20*.png",
-        #        # weekly plots
-        #        f"{EWSPlottingOutputDir}Weekly/suitability_{region.lower()}_*_rust_total_20*.png"]
-
-        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"]
-
-    # check the output
-    EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
-
-    # check there is some output from EWS-plotting
-    if not EWSPlottingOutputGlobs:
-        logger.error('EWS-Plotting did not produce any output')
-        raise RuntimeError
-
-    # provide list for transfer
-    EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)])
-
-    return EWSPlottingOutputs
-
-def process_EWS_plotting_dep(jobPath,config):
-    '''Returns a list of output files for transfer.'''
-
-    logger.info('started process_EWS_plotting_dep()')
-
-    # initialise environment
-    regions = config['SubRegionNames']
-
-    deposition_file_name = Template(config['Deposition']['InputFileTemplate']).substitute(**config)
-
-    deposition_path = f"{jobPath}/{deposition_file_name}"
-
-    # get the file name from the config
-    # this file name can be a glob, as long as matches can all be loaded by iris
-    deposition_data_file_name = Template(config['Deposition']['DataFileTemplate']).substitute(**config)
-    name_file_wildcard = f"{deposition_path}/{deposition_data_file_name}"
-
-    EWSPlottingOutputGlobs = []
-
-    for region in regions:
-
-        output_dir = f"{jobPath}/plotting/{region.lower()}"
-
-        Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-        sys_config = config['Deposition']['EWS-Plotting']['SysConfig']
-        name_extraction_config = config['Deposition']['EWS-Plotting']['NameExtractionConfig']
-        run_config = config['Deposition']['EWS-Plotting']['RunConfig']
-        run_config_norm = config['Deposition']['EWS-Plotting']['RunConfigNorm']
-        chart_config = config['Deposition']['EWS-Plotting'][region]['ChartConfig']
-        normalize = config['Deposition']['EWS-Plotting'][region]['Normalize']
-        extraction_file_prefix = 'deposition_' + region.lower()
-
-        # Note that this runs all disease types available
-
-        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{name_extraction_config}\n{run_config}\n{run_config_norm}\n{chart_config}")
-
-        depo_plotter = EWSPlottingDepoBase()
-        depo_plotter.set_param_config_files(sys_config_file_arg=sys_config,
-                                        depo_name_extraction_config_file_arg=name_extraction_config,
-                                        chart_config_file_arg= chart_config,
-                                        depo_plotting_run_config_file_arg=run_config,
-                                        depo_plotting_normalized_run_config_file_arg=run_config_norm,
-                                        name_file_wildcard_arg=name_file_wildcard,
-                                        wheat_sources_dir_arg=deposition_path,
-                                        output_dir_arg=output_dir,
-                                        issue_date_arg=config['StartString'],
-                                        extraction_file_prefix_arg=extraction_file_prefix)
-
-
-        # asia/east africa env suit should not perform normalization, false gets passed here for these areas
-        depo_plotter.name_extract_params.NORMALIZE = (normalize.upper() == "TRUE")
-
-        depo_plotter.plot_depo()
-
-        # check the output
-        EWSPlottingOutputDir = f"{output_dir}/images/"
-        #EWSPlottingOutputGlobs += [
-        #        # daily plots
-        #        f"{EWSPlottingOutputDir}Daily/deposition_{region.lower()}_*_daily_20*.png",
-        #        # weekly plots
-        #        f"{EWSPlottingOutputDir}Weekly/deposition_{region.lower()}_*_total_20*.png"]
-
-        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"]
-
-    EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
-
-    # check there is some output from EWS-plotting
-    if not EWSPlottingOutputGlobs:
-        logger.error('EWS-Plotting did not produce any output')
-        raise RuntimeError
-
-    # provide list for transfer
-    EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)])
-
-    return EWSPlottingOutputs
-
-def process_EWS_plotting_epi(jobPath,config):
-    '''Returns a list of output files for transfer.'''
-
-    logger.info('started process_EWS_plotting_epi()')
-
-    # initalise necessary variables from config
-
-    start_date, end_date = calc_epi_date_range(config['StartString'],config['Epidemiology']['CalculationSpanDays'])
-
-    start_string = start_date.strftime('%Y%m%d')
-    end_string = end_date.strftime('%Y%m%d')
-
-    epi_case_operational = config['Epidemiology']['EWS-Plotting']['EpiCase']
-
-    if epi_case_operational == 'none':
-        logger.info('Config specifies not to call to EWS-Plotting')
-        return []
-
-    diseases = config['Epidemiology']['DiseaseNames']
-
-    # initialise environment
-    sys_config = config['Epidemiology']['EWS-Plotting']['SysConfig']
-
-    chart_config = config['Epidemiology']['EWS-Plotting']['ChartConfig']
-
-    # use the first matching epi formulation
-    # TODO: Is there a more efficient way to select?
-    epi_filename = [ce['infectionRasterFileName'] for ce in config['Epidemiology']['Epi'] if ce['model']==epi_case_operational][0]
-
-    dep_regionnames = ['SouthAsia','Ethiopia']
-
-    # TODO get deposition_dir from config['Epidemiology']['Deposition']['PathTemplate']
-    dep_regionname = 'Ethiopia' #SouthAsia
-
-    deposition_dir = f"{config['WorkspacePath']}DEPOSITION_{start_string}/WR_NAME_{dep_regionname}_{start_string}/"
-
-    # TODO: handle multiple diseases and regions in Processor as a loop, or in the config
-    deposition_disease_name = [disease_latin_name_dict[disease]+'_DEPOSITION' for disease in diseases][0]
-
-    ews_plot_dir = f"{jobPath}/plotting/"
-
-    Path(ews_plot_dir).mkdir(parents=True, exist_ok=True)
-
-    # loop over diseases
-    EWSPlottingOutputGlobs = []
-    for disease in diseases:
-        disease_short = disease.lower().replace('rust','')
-
-        # a fudge, guess disease type
-        # because config['Epidemiology']['ProcessInJob'] handles disease loop internally
-        # assumes disease name is the last directory before the filename
-        # TODO: handle multiple diseases and regions in Processor as a loop, or in the config
-        disease_to_drop = os.path.dirname(epi_filename).split('/')[-1].replace('Rust','')
-        disease_to_add = disease.replace('Rust','')
-        epi_filename = epi_filename.replace(disease_to_drop,disease_to_add)
-
-        map_title = "Integrated prediction of Wheat $\\bf{" + disease_to_add + "}$ Rust infection"
-        if 'PlottingRegionName' not in config['Epidemiology']['EWS-Plotting']:
-            plotting_region_name_lower = config['RegionName'].lower()
-        else:
-            plotting_region_name_lower = config['Epidemiology']['EWS-Plotting']['PlottingRegionName'].lower()
-
-        run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonsofar']
-
-        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
-
-        epi_plotter = EWSPlottingEPIBase()
-        epi_plotter.set_param_config_files(sys_params_file_arg=sys_config,
-                                        chart_params_file_arg=chart_config,
-                                        run_params_file_arg=run_config,
-                                        epi_input_csv_arg=epi_filename+'_seasonsofar.csv',
-                                        disease_type_arg=disease_short+'_seasontodate',
-                                        issue_date_arg=start_string,
-                                        output_dir_arg=ews_plot_dir,
-                                        wheat_sources_dir_arg=deposition_dir,
-                                        wheat_source_disease_name_arg=deposition_disease_name,
-                                        map_title_arg=map_title,
-                                        chart_area_prefix=plotting_region_name_lower)
-        epi_plotter.plot_epi()
-
-        # prepare command for seasonplusforecast
-
-        run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonplusforecast']
-
-        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
-
-        epi_plotter_2 = EWSPlottingEPIBase()
-        epi_plotter_2.set_param_config_files(sys_params_file_arg=sys_config,
-                                        chart_params_file_arg=chart_config,
-                                        run_params_file_arg=run_config,
-                                        epi_input_csv_arg=epi_filename+'.csv', # for seasonplusforecast
-                                        #epi_input_csv_arg=epi_filename+'_weekahead.csv', # for weekahead
-                                        disease_type_arg=disease_short+'_seasonincforecast',
-                                        issue_date_arg=start_string,
-                                        output_dir_arg=ews_plot_dir,
-                                        wheat_sources_dir_arg=deposition_dir,
-                                        wheat_source_disease_name_arg=deposition_disease_name,
-                                        map_title_arg=map_title,
-                                        chart_area_prefix=plotting_region_name_lower)
-        epi_plotter_2.plot_epi()
-
-        # check the output
-        EWSPlottingOutputDir = f"{ews_plot_dir}/images/"
-        # TODO: Make this smarter, connected to the results of EWSPlottingEPIBase.plot_epi()
-        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}infection_{plotting_region_name_lower}_*{disease_short}*.png"]
-
-        EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
-
-        # check there is some output from EWS-plotting
-        if not EWSPlottingOutputGlobs:
-            logger.error('EWS-Plotting did not produce any output')
-            raise RuntimeError
-
-    # provide to list for transfer
-    EWSPlottingOutputs = [item for EWSPlottingOutput in EWSPlottingOutputGlobs for item in glob(EWSPlottingOutput)]
-
-    return EWSPlottingOutputs
-
-def upload(config,FilesToSend,component):
-
-    usual_path = f"{config['StartString']}_0000/"
-
-    component_path = {
-            'Environment' : usual_path,
-            'Deposition' : usual_path,
-            'Epidemiology' : usual_path,
-            'Survey' : f"SURVEYDATA_{config['StartString']}_0000/",
-            'Advisory' : usual_path }
-
-
-    # TODO: make path discern Daily or Weekly sub-directory
-
-    OutputServerPath = f"{config['ServerPath']}/{component_path[component]}"
-
-    logger.info(f"Trying upload to {config['ServerName']}:{OutputServerPath}")
-
-    logger.info(f"File(s) that will be put on remote server: {FilesToSend}")
-
-    if len(FilesToSend) == 0:
-        logger.warning('No files to send, so skipping this task')
-        raise IndexError
-
-    logger.debug("Making path directory on remote server if it doesn't already exist")
-
-    ssh_cmd = ["ssh","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",config['ServerName'], f"mkdir -p {OutputServerPath}"]
-
-    description_short = 'upload ssh'
-    description_long = 'make remote directory'
-    subprocess_and_log(ssh_cmd, description_short, description_long)
-
-    logger.debug('Sending file(s) to remote server')
-
-    scp_cmd = ["scp","-ri",config['ServerKey'],"-o","StrictHostKeyChecking=no",*FilesToSend, f"{config['ServerName']}:{OutputServerPath}"]
-
-    description_short = 'upload scp'
-    description_long = 'scp files to remote directory'
-    subprocess_and_log(scp_cmd, description_short, description_long)
-
-    return
diff --git a/ProcessorDeposition.py b/ProcessorDeposition.py
new file mode 100644
index 0000000..ef646ec
--- /dev/null
+++ b/ProcessorDeposition.py
@@ -0,0 +1,137 @@
+#ProcessorDeposition.py
+'''Functions to process the deposition component.'''
+
+from glob import glob
+import logging
+from pathlib import Path
+import os
+from string import Template
+
+from plotting.common.plotting_coordinator.ews_depo_disease_plotting_coordinator import EWSPlottingDepoBase
+
+from ProcessorUtils import (
+        open_and_check_config,
+        get_only_existing_globs,
+        subprocess_and_log,
+        endScript,
+        endJob,
+        add_filters_to_sublogger,
+        remove_path_from_tar_members
+)
+
+logger = logging.getLogger('Processor.Deposition')
+add_filters_to_sublogger(logger)
+
+def process_in_job_dep(jobPath,status,config,component):
+    logger.info('started process_in_job_dep()')
+
+    file_path = Template(config[component]['ServerPathTemplate']).substitute(**config)
+    file_name = Template(config[component]['InputFileTemplate']).substitute(**config)
+
+    logger.info(f"Expecting to work with {file_name}")
+
+    if os.path.exists(f"{jobPath}/{file_name}"):
+        logger.info('Directory already exists in job directory, so nothing to do here')
+        return
+
+    logger.info('Copying file from remote server to job directory')
+
+    # TODO: perform ssh file transfer in python instead of subprocess
+    cmd_scp = ["scp","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath]
+    description_short = 'dep scp'
+    description_long = 'scp from server to job directory'
+    subprocess_and_log(cmd_scp, description_short, description_long)
+
+    logger.info('untarring the input file')
+
+    # TODO: untar file in python (with tarfile module) instead of subprocess
+    cmd_tar = ["tar","-xzf",f"{jobPath}/{file_name}.tar.gz","-C",jobPath]
+    description_short = 'dep tars'
+    description_long = 'untar the downloaded file'
+    subprocess_and_log(cmd_tar, description_short, description_long)
+
+    # basic check that contents are as expected
+    # 132 files of NAME .txt timesteps and one summary png file
+    if len(glob(f"{jobPath}/{file_name}/deposition_srcs_allregions_C1_T*.txt")) != 56:
+        msg = f"Unexpect number of deposition .txt files in input tar file. Expected 56."
+        logger.error(msg)
+        raise RuntimeError(msg)
+    return
+
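+# Illustrative sketch only (the values below are assumptions, not taken from a
+# real config): process_in_job_dep() expects string.Template-style entries such as
+#
+#   "Deposition": {
+#       "ServerPathTemplate": "/storage/deposition/${StartString}",
+#       "InputFileTemplate": "WR_NAME_${RegionName}_${StartString}"
+#   }
+#
+# Template(...).substitute(**config) then fills the ${...} placeholders from the
+# top-level config keys.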
+
+def process_EWS_plotting_dep(jobPath,config):
+    '''Returns a list of output files for transfer.'''
+
+    logger.info('started process_EWS_plotting_dep()')
+
+    # initialise environment
+    regions = config['SubRegionNames']
+
+    deposition_file_name = Template(config['Deposition']['InputFileTemplate']).substitute(**config)
+
+    deposition_path = f"{jobPath}/{deposition_file_name}"
+
+    # get the file name from the config
+    # this file name can be a glob, as long as matches can all be loaded by iris
+    deposition_data_file_name = Template(config['Deposition']['DataFileTemplate']).substitute(**config)
+    name_file_wildcard = f"{deposition_path}/{deposition_data_file_name}"
+
+    EWSPlottingOutputGlobs = []
+
+    for region in regions:
+
+        output_dir = f"{jobPath}/plotting/{region.lower()}"
+
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+        sys_config = config['Deposition']['EWS-Plotting']['SysConfig']
+        name_extraction_config = config['Deposition']['EWS-Plotting']['NameExtractionConfig']
+        run_config = config['Deposition']['EWS-Plotting']['RunConfig']
+        run_config_norm = config['Deposition']['EWS-Plotting']['RunConfigNorm']
+        chart_config = config['Deposition']['EWS-Plotting'][region]['ChartConfig']
+        normalize = config['Deposition']['EWS-Plotting'][region]['Normalize']
+        extraction_file_prefix = 'deposition_' + region.lower()
+
+        # Note that this runs all disease types available
+
+        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{name_extraction_config}\n{run_config}\n{run_config_norm}\n{chart_config}")
+
+        depo_plotter = EWSPlottingDepoBase()
+        depo_plotter.set_param_config_files(sys_config_file_arg=sys_config,
+                                        depo_name_extraction_config_file_arg=name_extraction_config,
+                                        chart_config_file_arg= chart_config,
+                                        depo_plotting_run_config_file_arg=run_config,
+                                        depo_plotting_normalized_run_config_file_arg=run_config_norm,
+                                        name_file_wildcard_arg=name_file_wildcard,
+                                        wheat_sources_dir_arg=deposition_path,
+                                        output_dir_arg=output_dir,
+                                        issue_date_arg=config['StartString'],
+                                        extraction_file_prefix_arg=extraction_file_prefix)
+
+
+        # the Asia / East Africa env suit should not perform normalization, so "false" is passed here for those areas
+        depo_plotter.name_extract_params.NORMALIZE = (normalize.upper() == "TRUE")
+
+        depo_plotter.plot_depo()
+
+        # check the output
+        EWSPlottingOutputDir = f"{output_dir}/images/"
+        #EWSPlottingOutputGlobs += [
+        #        # daily plots
+        #        f"{EWSPlottingOutputDir}Daily/deposition_{region.lower()}_*_daily_20*.png",
+        #        # weekly plots
+        #        f"{EWSPlottingOutputDir}Weekly/deposition_{region.lower()}_*_total_20*.png"]
+
+        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"]
+
+    EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
+
+    # check there is some output from EWS-plotting
+    if not EWSPlottingOutputGlobs:
+        logger.error('EWS-Plotting did not produce any output')
+        raise RuntimeError
+
+    # provide list for transfer
+    EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)])
+
+    return EWSPlottingOutputs
diff --git a/ProcessorEnvironment.py b/ProcessorEnvironment.py
new file mode 100644
index 0000000..743937e
--- /dev/null
+++ b/ProcessorEnvironment.py
@@ -0,0 +1,200 @@
+#ProcessorEnvironment.py
+'''Functions to process the environment component.'''
+
+from distutils.dir_util import copy_tree
+from glob import glob
+import logging
+from pathlib import Path
+import os
+from string import Template
+import tarfile
+
+import iris
+from iris.cube import CubeList
+
+from plotting.common.utils import EnvSuitDiseaseInfo
+from plotting.common.plotting_coordinator.ews_env_disease_plotting_coordinator import EWSPlottingEnvSuitBase
+
+import EnvSuitPipeline as esp
+from ProcessorUtils import (
+        open_and_check_config,
+        get_only_existing_globs,
+        subprocess_and_log,
+        endScript,
+        endJob,
+        add_filters_to_sublogger,
+        remove_path_from_tar_members,
+        short_name,
+)
+
+logger = logging.getLogger('Processor.Environment')
+add_filters_to_sublogger(logger)
+
+
+def process_in_job_env2_0(jobPath,status,config,component):
+    logger.info('started process_in_job_env2_0()')
+
+    logger.info('Copying file from remote server to job directory')
+
+    file_path = Template(config[component]['ServerPathTemplate']).substitute(**config)
+    file_name = Template(config[component]['InputFileTemplate']).substitute(**config)
+
+    #TODO: check if file exists already (may be the case for multiple configs in one)
+
+    # TODO: perform ssh file transfer in python instead of subprocess
+    cmd_scp = ["scp","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",f"{config['ServerName']}:{file_path}/{file_name}.tar.gz", jobPath]
+    description_short = 'env2 scp'
+    description_long = 'Copying file from remote server to job directory'
+
+    # lawrence comment in/out
+    subprocess_and_log(cmd_scp,description_short, description_long)
+
+    logger.info('untarring the input file')
+
+    # untar incoming name data
+    output_directory = f"{jobPath}/NAME_Met_as_netcdf"
+    Path(output_directory).mkdir(parents=True, exist_ok=True)
+    tarfile_name = f"{jobPath}/{file_name}.tar.gz"
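+    # remove_path_from_tar_members() (from ProcessorUtils) presumably strips any
+    # leading directories so the netCDF files land directly in output_directory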
+    with tarfile.open(tarfile_name) as tar:
+        members = remove_path_from_tar_members(tar)
+        tar.extractall(output_directory, members = members)
+
+    # basic check that contents are as expected for 7-day forecast (57 timepoints in all files)
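+    # (57 is assumed to be 3-hourly output: 8 timepoints/day x 7 days, plus the analysis time)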
+    cube_wildcard = f"{output_directory}/*.nc"
+    cubes: CubeList = iris.load(cube_wildcard)
+    for cube in cubes:
+        coord = cube.coord("time")
+        timepoint_count = coord.shape[0]
+        if timepoint_count != 57:
+            msg = f"Unexpected number of timepoints ({timepoint_count}) in cube {cube.name()}"
+            logger.error(msg)
+            raise RuntimeError(msg)
+
+    region = config['RegionName']
+
+    logger.info(f"Calling environmental suitability 2.0 for {region} so wait for output to appear")
+
+    pipeline_config = config["Environment"]
+    try:
+        #todo lawrence comment this back to original (extracted=False)
+        esp.run_pipeline(pipeline_config, region, config["StartString"], extracted=False)
+    except:
+        logger.exception(f"Some failure when running EnvSuitPipeline.py")
+        raise
+
+    logger.info('Finished running environmental suitability 2.0')
+
+    # TODO: Check that the output appears as expected
+
+    return
+
+def process_copy_past_job_env2_0(jobPath,status,config,component):
+    '''For when we want to skip process_in_job() to test the other components of
+    this script. Currently hard-wired.'''
+
+    # TODO: remove this hard-wired assumption
+    jobPath_to_copy = f"{jobPath}/../{short_name['Environment']}_{config['StartString']}_bak/"
+
+    assert os.path.exists(jobPath_to_copy)
+
+    dir_src = f"{jobPath_to_copy}/processed/"
+
+    dir_dst = f"{jobPath}/processed/"
+
+    logger.info(f"Copying from {dir_src}")
+
+    logger.info(f"to {dir_dst}")
+
+    copy_tree(dir_src,dir_dst)
+
+    logger.info('Copying complete')
+
+    return
+
+'''class EWSPlottingEnvSuit(EWSPlottingEnvSuitBase):
+
+    def set_custom_params(self,
+                          sys_params_dict: dict,
+                          chart_params_dict: dict,
+                          run_params_dict: dict,
+                          disease_csv_template_arg: str,
+                          diseases: List[EnvSuitDiseaseInfo]):
+        # this is unique to the asia/east africa env suit, as we are not filtering within country boundaries
+        run_params_dict[RUN_PARAMS.FILTER_FOR_COUNTRY_KEY] = "False"'''
+
+#TODO test if this works
+def process_EWS_plotting_env2_0(jobPath,config):
+    '''Configures the plotting arguments and calls EWS-plotting as a python module.
+    Returns a list of output files for transfer.'''
+
+    logger.info('started process_EWS_plotting_env2_0()')
+
+    main_region = config['RegionName']
+
+    input_dir = f"{jobPath}/processed/{main_region}"
+
+    subregions = config['SubRegionNames']
+
+    EWSPlottingOutputGlobs = []
+
+    # work on each region
+    for region in subregions:
+
+        output_dir = f"{jobPath}/plotting/{region.lower()}"
+        csv_template_dir = input_dir + "/{DISEASE_DIR}/RIE_value.csv"
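+        # {DISEASE_DIR} is left as a literal placeholder here; it is presumably
+        # filled in per disease by the EWS-Plotting library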
+
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+        sys_config = config['Environment']['EWS-Plotting']['SysConfig']
+        run_config = config['Environment']['EWS-Plotting']['RunConfig']
+        chart_config = config['Environment']['EWS-Plotting'][region]['ChartConfig']
+        filter_for_country = config['Environment']['EWS-Plotting'][region]['FilterForCountry']
+
+        # Note that this runs all disease types available
+
+        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
+
+        env_suit_plotter = EWSPlottingEnvSuitBase()
+        env_suit_plotter.set_param_config_files(sys_params_file_arg=sys_config,
+                                             chart_params_file_arg=chart_config,
+                                             run_params_file_arg=run_config,
+                                             es_output_dir_arg=output_dir,
+                                             issue_date_arg=config['StartString'],
+                                             disease_csv_template_arg=csv_template_dir)
+
+        env_suit_plotter.run_params.FILTER_FOR_COUNTRY = (filter_for_country.upper() == "TRUE")
+
+        # Include further diseases in plotting. In this case the irrigated suitability for the rusts.
+        # TODO: move this part out into a config
+        extra_diseases = [
+            EnvSuitDiseaseInfo("Stem rust temp-only", "stem_rust_temponly", config['StartString'], "StemRust_TempOnly", csv_template_dir),
+            EnvSuitDiseaseInfo("Leaf rust temp-only", "leaf_rust_temponly", config['StartString'], "LeafRust_TempOnly", csv_template_dir),
+            EnvSuitDiseaseInfo("Stripe rust temp-only", "stripe_temponly", config['StartString'], "StripeRust_TempOnly", csv_template_dir)
+        ]
+
+        env_suit_plotter.add_diseases(diseases=extra_diseases)
+
+        env_suit_plotter.plot_env_suit()
+
+        # check the output
+        EWSPlottingOutputDir = f"{output_dir}/images/"
+        #EWSPlottingOutputGlobs += [
+        #        # daily plots
+        #        f"{EWSPlottingOutputDir}Daily/suitability_{region.lower()}_*_rust_daily_20*.png",
+        #        # weekly plots
+        #        f"{EWSPlottingOutputDir}Weekly/suitability_{region.lower()}_*_rust_total_20*.png"]
+
+        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}*"]
+
+    # check the output
+    EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
+
+    # check there is some output from EWS-plotting
+    if not EWSPlottingOutputGlobs:
+        logger.error('EWS-Plotting did not produce any output')
+        raise RuntimeError
+
+    # provide list for transfer
+    EWSPlottingOutputs = sorted([file for glob_str in EWSPlottingOutputGlobs for file in glob(glob_str)])
+
+    return EWSPlottingOutputs
diff --git a/ProcessorEpidemiology.py b/ProcessorEpidemiology.py
new file mode 100644
index 0000000..66e116a
--- /dev/null
+++ b/ProcessorEpidemiology.py
@@ -0,0 +1,630 @@
+#ProcessorEpidemiology.py
+'''Functions to process the epidemiology component.'''
+
+import datetime
+from glob import glob
+import json
+import logging
+from pathlib import Path
+import os
+import shutil
+
+from numpy import argmax, unique
+from pandas import read_csv, DataFrame, to_datetime
+from rasterio import open as rio_open
+
+# gitlab projects
+# TODO: Package these projects so they are robust for importing
+from EpiModel import ( # created by rs481
+    EpiAnalysis,
+    EpiModel,
+    EpiPrep,
+    EpiPrepLister,
+    EpiPrepLoader,
+    plotRaster
+)
+from plotting.common.plotting_coordinator.ews_epi_disease_plotting_coordinator import EWSPlottingEPIBase
+
+from ProcessorUtils import (
+        open_and_check_config,
+        get_only_existing_globs,
+        endScript,
+        endJob,
+        add_filters_to_sublogger,
+        query_past_successes,
+        query_proceed,
+        short_name,
+        disease_latin_name_dict
+)
+
+logger = logging.getLogger('Processor.Epi')
+add_filters_to_sublogger(logger)
+
+def calc_epi_date_range(init_str,span_days=[0,6]):
+    '''Date range is determined relative to the initialisation date init_str (YYYYMMDD).
+    span_days is usually defined in the job config file. Day zero is current
+    day, negative values point to past (historical or analysis) days, and
+    positive values point to forecast days.
+    Returns a start_date and end_date.'''
+
+    init_date = datetime.datetime.strptime(init_str,'%Y%m%d')
+
+    # note that filename date represents preceding 3 hours, so day's data
+    #  starts at file timestamp 0300 UTC
+    threehour_shift = datetime.timedelta(hours=3)
+
+    # add 24hrs so that final day is fully included
+    day_shift = datetime.timedelta(days=1)
+
+    # an entry longer than 3 characters cannot be a small day offset,
+    # so assume it is a date string of the form YYYYMMDD
+    if len(str(span_days[0]))>3:
+        start_date = datetime.datetime.strptime(span_days[0]+'0300','%Y%m%d%H%M')
+    else:
+        date_shift0 = datetime.timedelta(days=span_days[0])
+
+        start_date = init_date + date_shift0 + threehour_shift
+
+    if len(str(span_days[1]))>3:
+        # assume it is a date string
+        end_date = datetime.datetime.strptime(span_days[1]+'0000','%Y%m%d%H%M')
+
+        end_date = end_date + day_shift
+    else:
+        date_shift1 = datetime.timedelta(days=span_days[1])
+
+        end_date = init_date + date_shift1 + day_shift
+
+    return start_date, end_date
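+# Example with illustrative values: for a job initialised on 2022-08-31 with the
+# default span_days=[0,6],
+#   calc_epi_date_range('20220831')
+# returns (2022-08-31 03:00, 2022-09-07 00:00): the start is shifted forward by
+# 3 hours because each file timestamp covers the preceding 3 hours, and an extra
+# day is added so that the final forecast day is fully included.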
+
+def process_pre_job_epi(input_args):
+    '''Returns a boolean as to whether the job is ready for full processing.'''
+
+    logger.info('started process_pre_job_epi()')
+
+    # check pre-requisite jobs are complete
+    query_past_successes(input_args)
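+    # (the return value is not checked here; query_past_successes() is assumed to
+    # end the job itself if a pre-requisite has not completed)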
+
+    config_fns = input_args.config_paths
+
+    for configFile in config_fns:
+
+        # the config files should be valid if the script made it this far, so no need for a try block
+        config_i = open_and_check_config(configFile)
+
+        #determine end time, from config file
+        arg_start_date = input_args.start_date
+        calc_span_days = config_i['Epidemiology']['CalculationSpanDays']
+        assert len(calc_span_days) == 2
+
+        start_time, end_time = calc_epi_date_range(arg_start_date,calc_span_days)
+
+        # warn if it is a long timespan
+        date_diff = end_time - start_time
+        if date_diff.days > 100:
+            logger.warning("More than 100 days will be calculated over, likely longer than any single season")
+
+    return True
+
+
+def create_epi_config_string(config,jobPath,startString,endString):
+
+    configtemplate_fn = config['ConfigFilePath']
+    configName_withoutEpi = f"{os.path.basename(configtemplate_fn).replace('.json','')}_{startString}-{endString}"
+
+    # create a string describing every epi calc configuration
+    epiStrings = []
+    for epiconf in config['Epidemiology']['Epi']:
+        epiKwargsString = ''.join([f"{k}{v}" for k,v in epiconf['modelArguments'].items()])
+
+        # drop any repetitive elements of kwarg
+        epiKwargsString = epiKwargsString.replace('infectionprevious','')
+        epiKwargsString = epiKwargsString.replace('capbeta','cb')
+
+        epiCaseString = f"{epiconf['model'].lower()}{epiKwargsString}"
+
+        # provide to configuration for output filename
+        epiconf["infectionRasterFileName"] = f"{jobPath}/infections_{configName_withoutEpi}_{epiCaseString}"
+
+        epiStrings += [epiCaseString]
+
+    epiString = '-'.join(epiStrings)
+
+    config_filename = f"{configName_withoutEpi}_{epiString}"
+
+    logger.debug(f"length of config filename is {len(config_filename)}.")
+
+    if len(config_filename) > 254:
+        logger.info(f"filename length is too long, it will raise an OSError, using a short form instead")
+
+        # epi cases are not described in filename, an interested user
+        # must look in the json file for details.
+        config_filename = configName_withoutEpi
+
+        assert len(config_filename) <= 254
+
+    return config_filename
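+# Illustrative example (hypothetical values): with ConfigFilePath
+# 'config_Ethiopia.json', startString '2022-08-31-0300', endString
+# '2022-09-07-0000' and a single epi case
+#   {'model': 'SEIR', 'modelArguments': {'infectionprevious': 0.1, 'capbeta': 0.2}}
+# the returned string is
+#   'config_Ethiopia_2022-08-31-0300-2022-09-07-0000_seir0.1cb0.2'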
+
+
+def raster_to_csv(raster_fn,csv_fn):
+
+    # create a csv version and save in the job directory,
+    # to compare host raster with dep and env suit
+    # note this can be time-varying by providing additional rows
+    with rio_open(raster_fn,'r') as host_raster:
+        host_arr = host_raster.read(1)
+        shape = host_raster.shape
+
+        # determine coordinates
+        coords = [host_raster.xy(i,j) for i in range(shape[0]) for j in range(shape[1])]
+        lons = unique([ci[0] for ci in coords])
+        lats = unique([ci[1] for ci in coords])
+        assert shape == (lats.size,lons.size)
+
+    # build into a dataframe
+    # (rasters start in the top left, so descending latitude coordinates)
+    host_df = DataFrame(data=host_arr,index=lats[::-1],columns=lons)
+    # rearrange to ascending latitude coordinates
+    host_df.sort_index(axis='rows',inplace=True)
+    # make spatial coordinates a multi-index, like for dep and env suit csvs
+    host_series = host_df.stack()
+    # for now, provide a nominal date of validity to enable a time column
+    # so far, using mapspam which is a static map, so time is irrelevant
+    host_series.name = '201908150000'
+    host_df2 = DataFrame(host_series).T
+
+    host_df2.to_csv(csv_fn)
+
+    return
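+
+# Sketch of the csv layout written above (coordinates and values are
+# hypothetical): the two header rows hold the latitude and longitude levels of
+# the column multi-index, and the single data row is named by the nominal date:
+#                 8.0    8.0    8.5    8.5
+#                 37.0   37.5   37.0   37.5
+#   201908150000  0.12   0.00   0.43   0.07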
+
+def process_in_job_epi(jobPath,status,config,component):
+    logger.info('started process_in_job_epi()')
+
+    # TODO: Some of this is modifying config before epi model is run. Determine
+    # how to account for that
+
+    # initialise any needed variables
+
+    reference_date_str = config['StartString']
+    reference_date = datetime.datetime.strptime(reference_date_str,'%Y%m%d')
+
+    start_date, end_date = calc_epi_date_range(reference_date_str,config['Epidemiology']['CalculationSpanDays'])
+
+    date_diff = end_date - start_date
+
+    start_string = start_date.strftime('%Y-%m-%d-%H%M')
+    start_string_short = start_date.strftime('%Y%m%d%H%M')
+    end_string = end_date.strftime('%Y-%m-%d-%H%M')
+
+    # update config accordingly
+    config['ReferenceTime'] = reference_date_str
+    config['StartTime'] = start_string
+    config['StartTimeShort'] = start_string_short
+    config['EndTime'] = end_string
+
+    diseases = config['Epidemiology']['DiseaseNames']
+
+    def gather_deposition(config_epi,config,variable_name,start_date,end_date,jobDataPath,status):
+
+        # TODO: Simplify the set of required arguments. Check if config is necessary.
+
+        config_epi['Deposition']['VariableName'] = variable_name # disease_latin_name_dict[disease]+'_DEPOSITION'
+
+        config_epi['Deposition']['FileNamePrepared'] = f"{jobDataPath}/data_input_deposition.csv"
+
+        # Use the file lister named in the config file, if provided
+        file_lister_dep_name = config_epi['Deposition'].get('FileListerFunction',None)
+
+        # when it isn't defined, guess what it should be
+        if file_lister_dep_name is None:
+
+            file_lister_dep_name = 'list_deposition_files_operational'
+
+            if date_diff > datetime.timedelta(days=7):
+
+                file_lister_dep_name = 'list_deposition_files_historical'
+                logger.info('Using historical method to prepare data on spore deposition')
+
+        file_lister_dep = getattr(EpiPrepLister,file_lister_dep_name)
+
+        config_for_lister = config.copy()
+        config_for_lister.update(config_epi)
+
+        # get bounds of host map, to exclude redundant deposition datapoints
+        hostRasterFileName = config_for_lister["Host"]["HostRaster"]
+        with rio_open(hostRasterFileName) as hostRaster:
+            bounds = hostRaster.bounds
+
+        lister_kwargs = {}
+        lister_kwargs['reference_date']=config['ReferenceTime']
+
+        loader_kwargs= {}
+        loader_kwargs['VariableName']= config_for_lister['Deposition'].get('VariableName')
+        loader_kwargs['VariableNameAlternative']= config_for_lister['Deposition'].get('VariableNameAlternative')
+        loader_kwargs['bounds'] = bounds
+
+        try:
+
+            EpiPrep.prep_input(config_for_lister,start_date,end_date,
+                    component='Deposition',
+                    file_lister=file_lister_dep,
+                    file_loader=EpiPrepLoader.load_NAME_file,
+                    lister_kwargs=lister_kwargs,
+                    **loader_kwargs)
+
+            assert os.path.isfile(config_epi['Deposition']['FileNamePrepared'])
+
+        except Exception:
+
+            logger.exception("Unexpected error in deposition data preparation")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        return
+
+    # get list of variable names to be loaded from deposition input
+    depo_variable_names =  config['Epidemiology']['Deposition']['VariableNames']
+    assert len(depo_variable_names) == len(diseases)
+
+    # loop over each sub region
+
+    region = config['RegionName']
+    #for region in config['SubRegionNames']:
+
+    for disease in diseases:
+
+        assert disease in disease_latin_name_dict.keys()
+
+        config_epi = config['Epidemiology'].copy()
+
+        # TODO: CAUTION: Any iterations (e.g. disease or sub-region) are hidden
+        # in jobPath, and not retained in the config file. This is a problem for
+        # process_EWS_plotting_epi which receives a single config file and must
+        # try a fudge to retrieve details for each iteration.
+        # This should be improved, either by making the one config file
+        # aware of all of the iterations, or looping over iterations in
+        # Processor.py with one iteration-specific config.
+        case_specific_path = f"{jobPath}/{region}/{disease}/"
+        Path(case_specific_path).mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"Preparing for epidemiology calc of {disease} in {region}")
+
+        # create config_filename to describe job configuration
+        config_filename = create_epi_config_string(config,case_specific_path,start_string,end_string)
+
+        # prepare a directory for input data
+        jobDataPath = f"{case_specific_path}/input_data/"
+        Path(jobDataPath).mkdir(parents=True, exist_ok=True)
+
+        # configure filename of prepared deposition data
+
+        if 'Deposition' in config_epi:
+
+            # determine which variable name to load for this disease
+            disease_idx = [i for i,j in enumerate(diseases) if j==disease][0]
+
+            variable_name = depo_variable_names[disease_idx]
+
+            gather_deposition(config_epi,config,variable_name,start_date,end_date,jobDataPath,status)
+
+        # configure filename of prepared environmental suitability data
+
+        if 'Environment' in config_epi:
+
+            logger.info('Preparing environmental suitability data')
+
+            config_epi['SubRegionName'] = region
+
+            config_epi['DiseaseName'] = disease
+
+            config_epi['Environment']['FileNamePrepared'] = f"{jobDataPath}/data_input_environment.csv"
+
+            # Use the file lister named in the config file, if provided
+            file_lister_env_name = config_epi['Environment'].get('FileListerFunction',None)
+
+            # when it isn't defined, guess what it should be
+            if file_lister_env_name is None:
+
+                use_monthly_chunk=False # hard-coded for historical analysis
+                file_lister_env_name = 'list_env_suit_files_operational'
+
+                if (date_diff > datetime.timedelta(days=7)) & ('ENVIRONMENT_2.0' in config_epi['Environment']['PathTemplate']) & use_monthly_chunk:
+
+                    logger.info('Using monthly-chunk method to prepare data on environmental suitability')
+                    file_lister_env_name = 'list_env_suit_files_historical_monthlychunk'
+
+                elif date_diff > datetime.timedelta(days=7):
+
+                    logger.info('Using historical method to prepare data on environmental suitability')
+                    file_lister_env_name = 'list_env_suit_files_historical'
+
+            file_lister_env = getattr(EpiPrepLister,file_lister_env_name)
+
+            config_for_lister = config.copy()
+            config_for_lister.update(config_epi)
+
+            try:
+
+                EpiPrep.prep_input(config_for_lister,start_date,end_date,
+                        component='Environment',
+                        file_loader=EpiPrepLoader.load_env_file,
+                        file_lister=file_lister_env)
+
+                assert os.path.isfile(config_epi['Environment']['FileNamePrepared'])
+
+            except Exception:
+
+                logger.exception("Unexpected error in env data preparation")
+                status.reset('ERROR')
+                endJob(status,premature=True)
+
+        # prepare a copy of the host data
+
+        logger.info('Preparing a copy of the host raster data')
+
+        src_host = config_epi['Host']['HostRaster']
+        fn_host = os.path.basename(src_host)
+        dst_host = f"{jobDataPath}/{fn_host}"
+
+        # copy the tif to the job directory and refer to that instead
+        shutil.copyfile(src_host,dst_host)
+        config_epi['Host']['HostRaster'] = dst_host
+
+        logger.info('Preparing a copy of the host data as csv')
+
+        dst_host_csv = dst_host.replace('.tif','.csv')
+
+        raster_to_csv(dst_host,dst_host_csv)
+
+        config_epi['Host']['HostCSV'] = dst_host_csv
+
+        # provide fundamental config elements to config_epi
+        for k,v in config.items():
+            if k not in short_name.keys():
+                config_epi[k]=v
+
+        logger.debug('Incremental configuration looks like:')
+        def print_item(item):
+            logger.debug(f"Item {item}")
+            logger.debug(json.dumps(item,indent=2))
+        def iterate(items):
+            for key,value in items.items():
+                if hasattr(value,'items'):
+                    # recurse into nested dicts
+                    iterate(value)
+                else:
+                    print_item((key,value))
+        iterate(config_epi)
+
+        logger.debug('Complete configuration looks like:')
+        logger.debug(json.dumps(config_epi,indent=2))
+
+        # write the complete configuration file to job directory
+        with open(f"{case_specific_path}/{config_filename}.json",'w') as write_file:
+            json.dump(config_epi,write_file,indent=4)
+
+        # run epi model
+
+        try:
+            EpiModel.run_epi_model(f"{case_specific_path}/{config_filename}.json")
+        except:
+            logger.exception('Unexpected error in EpiModel')
+            raise
+
+        # perform calc on output
+
+        def calc_total(arr):
+            return 'total', arr.sum()
+
+        def calc_max(arr):
+            return 'maximum', arr.max()
+
+        def calc_mean(arr):
+            return 'mean', arr.mean()
+
+        for epiconf in config['Epidemiology']['Epi']:
+
+            outfile = epiconf["infectionRasterFileName"]
+
+            with rio_open(outfile+'.tif','r') as infectionRaster:
+                infection = infectionRaster.read(1)
+
+                # define function to quantify overall result, for easy check
+                # TODO: Create a more meaningful result?
+                # TODO: make this configurable
+                analysis_func = calc_mean
+
+                analysis_desc, analysis_value = analysis_func(infection)
+
+                logger.info(f"For case {outfile}")
+                logger.info('Infection {:s} is {:.2e}'.format( analysis_desc, analysis_value))
+
+                # to save tif as png for easy viewing
+                logger.debug('Saving tif output as png for easier viewing')
+                plotRaster.save_raster_as_png(outfile)
+
+        # comparison figure
+
+        # TODO: make this plot configurable? with function or args?
+        #logger.info('Plotting epi output alongside contributing components')
+        # figure_func = getattr(EpiAnalysis,'plot_compare_host_env_dep_infection')
+        logger.info('Plotting composite image of epi formulations')
+        figure_func = getattr(EpiAnalysis,'plot_compare_epi_cases')
+
+        # isolate the config for this function, in case of modifications
+        config_epi_for_comparison = config_epi.copy()
+
+        fig,axes,cases = figure_func(
+                config_epi_for_comparison,
+                start_str = start_string,
+                end_str = end_string)
+
+        SaveFileName = f"{case_specific_path}/EPI_{config_filename}_comparison"
+
+        fig.savefig(SaveFileName+'.png',dpi=300)
+
+        # slice the epi results into before forecast and in forecast
+
+        for epiconf in config['Epidemiology']['Epi']:
+
+            outfile = epiconf["infectionRasterFileName"]+'_progression.csv'
+
+            fn_seasonsofar = epiconf["infectionRasterFileName"]+'_seasonsofar.csv'
+            fn_weekahead = epiconf["infectionRasterFileName"]+'_weekahead.csv'
+
+            # load the full epi results
+            df_full = read_csv(outfile,header=[0],index_col=[0,1])
+            column_date_fmt = f"X{config['StartTimeShort']}_X%Y%m%d%H%M"
+            df_full_dates = to_datetime(df_full.columns.astype('str'),format=column_date_fmt)
+
+            # determine date to cut with
+            # plus 1 minute so midnight is associated with preceding day
+            date_to_cut = datetime.datetime.strptime(config['StartString']+'0001','%Y%m%d%H%M')
+            dates_after_cut = df_full_dates >= date_to_cut
+            idx = argmax(dates_after_cut)-1
+
+            # build seasonsofar dataframe (only need the last date)
+            df_seasonsofar = df_full.iloc[:,idx]
+
+            # check column name is defined as expected
+            # from epi start time to forecast start time
+            column_name = f"X{config['StartTimeShort']}_X{config['StartString']}0000"
+            assert df_seasonsofar.name == column_name
+
+            #  save to csv
+            df_seasonsofar.to_csv(fn_seasonsofar,header=True,index=True)
+
+            # build weekahead dataframe and save to csv
+            df_fc_start = df_full.iloc[:,idx]
+            df_fc_start_name = df_fc_start.name.split('_')[-1]
+
+            df_fc_end = df_full.iloc[:,-1]
+            df_fc_end_name = df_fc_end.name.split('_')[-1]
+
+            df_weekahead = df_fc_end - df_fc_start
+
+            # defined column name
+            df_weekahead.name = '_'.join([df_fc_start_name,df_fc_end_name])
+
+            # save to csv
+            df_weekahead.to_csv(fn_weekahead,header=True,index=True)
+
+    return
+
+def process_EWS_plotting_epi(jobPath,config):
+    '''Returns a list of output files for transfer.'''
+
+    logger.info('started process_EWS_plotting_epi()')
+
+    # initialise necessary variables from config
+
+    start_date, end_date = calc_epi_date_range(config['StartString'],config['Epidemiology']['CalculationSpanDays'])
+
+    start_string = start_date.strftime('%Y%m%d')
+    end_string = end_date.strftime('%Y%m%d')
+
+    epi_case_operational = config['Epidemiology']['EWS-Plotting']['EpiCase']
+
+    if epi_case_operational == 'none':
+        logger.info('Config specifies not to call to EWS-Plotting')
+        return []
+
+    diseases = config['Epidemiology']['DiseaseNames']
+
+    # initialise environment
+    sys_config = config['Epidemiology']['EWS-Plotting']['SysConfig']
+
+    chart_config = config['Epidemiology']['EWS-Plotting']['ChartConfig']
+
+    # use the first matching epi formulation
+    # TODO: Is there a more efficient way to select?
+    epi_filename = [ce['infectionRasterFileName'] for ce in config['Epidemiology']['Epi'] if ce['model']==epi_case_operational][0]
+
+    dep_regionnames = ['SouthAsia','Ethiopia']
+
+    # TODO get deposition_dir from config['Epidemiology']['Deposition']['PathTemplate']
+    dep_regionname = 'Ethiopia' #SouthAsia
+
+    deposition_dir = f"{config['WorkspacePath']}DEPOSITION_{start_string}/WR_NAME_{dep_regionname}_{start_string}/"
+
+    # TODO: handle multiple diseases and regions in Processor as a loop, or in the config
+    deposition_disease_name = [disease_latin_name_dict[disease]+'_DEPOSITION' for disease in diseases][0]
+
+    ews_plot_dir = f"{jobPath}/plotting/"
+
+    Path(ews_plot_dir).mkdir(parents=True, exist_ok=True)
+
+    # loop over diseases
+    EWSPlottingOutputGlobs = []
+    for disease in diseases:
+        disease_short = disease.lower().replace('rust','')
+
+        # a fudge, guess disease type
+        # because config['Epidemiology']['ProcessInJob'] handles disease loop internally
+        # assumes disease name is the last directory before the filename
+        # TODO: handle multiple diseases and regions in Processor as a loop, or in the config
+        disease_to_drop = os.path.dirname(epi_filename).split('/')[-1].replace('Rust','')
+        disease_to_add = disease.replace('Rust','')
+        epi_filename = epi_filename.replace(disease_to_drop,disease_to_add)
+
+        map_title = "Integrated prediction of Wheat $\\bf{" + disease_to_add + "}$ Rust infection"
+        if 'PlottingRegionName' not in config['Epidemiology']['EWS-Plotting']:
+            plotting_region_name_lower = config['RegionName'].lower()
+        else:
+            plotting_region_name_lower = config['Epidemiology']['EWS-Plotting']['PlottingRegionName'].lower()
+
+        run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonsofar']
+
+        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
+
+        epi_plotter = EWSPlottingEPIBase()
+        epi_plotter.set_param_config_files(sys_params_file_arg=sys_config,
+                                        chart_params_file_arg=chart_config,
+                                        run_params_file_arg=run_config,
+                                        epi_input_csv_arg=epi_filename+'_seasonsofar.csv',
+                                        disease_type_arg=disease_short+'_seasontodate',
+                                        issue_date_arg=start_string,
+                                        output_dir_arg=ews_plot_dir,
+                                        wheat_sources_dir_arg=deposition_dir,
+                                        wheat_source_disease_name_arg=deposition_disease_name,
+                                        map_title_arg=map_title,
+                                        chart_area_prefix=plotting_region_name_lower)
+        epi_plotter.plot_epi()
+
+        # prepare command for seasonplusforecast
+
+        run_config = config['Epidemiology']['EWS-Plotting']['RunConfig_seasonplusforecast']
+
+        logger.info(f"Running EWS-Plotting with the following configs:\n{sys_config}\n{run_config}\n{chart_config}")
+
+        epi_plotter_2 = EWSPlottingEPIBase()
+        epi_plotter_2.set_param_config_files(sys_params_file_arg=sys_config,
+                                        chart_params_file_arg=chart_config,
+                                        run_params_file_arg=run_config,
+                                        epi_input_csv_arg=epi_filename+'.csv', # for seasonplusforecast
+                                        #epi_input_csv_arg=epi_filename+'_weekahead.csv', # for weekahead
+                                        disease_type_arg=disease_short+'_seasonincforecast',
+                                        issue_date_arg=start_string,
+                                        output_dir_arg=ews_plot_dir,
+                                        wheat_sources_dir_arg=deposition_dir,
+                                        wheat_source_disease_name_arg=deposition_disease_name,
+                                        map_title_arg=map_title,
+                                        chart_area_prefix=plotting_region_name_lower)
+        epi_plotter_2.plot_epi()
+
+        # check the output
+        EWSPlottingOutputDir = f"{ews_plot_dir}/images/"
+        # TODO: Make this smarter, connected to the results of EWSPlottingEPIBase.plot_epi()
+        EWSPlottingOutputGlobs += [f"{EWSPlottingOutputDir}infection_{plotting_region_name_lower}_*{disease_short}*.png"]
+
+        EWSPlottingOutputGlobs = get_only_existing_globs(EWSPlottingOutputGlobs,inplace=False)
+
+        # check there is some output from EWS-plotting
+        if not EWSPlottingOutputGlobs:
+            logger.error('EWS-Plotting did not produce any output')
+            raise RuntimeError
+
+    # provide to list for transfer
+    EWSPlottingOutputs = [item for EWSPlottingOutput in EWSPlottingOutputGlobs for item in glob(EWSPlottingOutput)]
+
+    return EWSPlottingOutputs
diff --git a/ProcessorServer.py b/ProcessorServer.py
new file mode 100644
index 0000000..ed2f585
--- /dev/null
+++ b/ProcessorServer.py
@@ -0,0 +1,124 @@
+#ProcessorServer.py
+'''Functions to communicate with server sites for download and upload.'''
+
+import datetime
+import logging
+from string import Template
+
+from ProcessorUtils import (
+        open_and_check_config,
+        subprocess_and_log,
+        endScript,
+        add_filters_to_sublogger
+)
+
+logger = logging.getLogger('Processor.Server')
+add_filters_to_sublogger(logger)
+
+def process_pre_job_server_download(input_args):
+    '''This is set up for environmental suitability v2.0 and deposition.
+    Returns a boolean as to whether the job is ready for full processing.'''
+
+    logger.info('started process_pre_job_server_download()')
+
+    # Check if there is a file available on willow
+    logger.debug('Checking for file(s) on remote server')
+
+    for i,config_path in enumerate(input_args.config_paths):
+
+        config = open_and_check_config(config_path)
+
+        config['StartString'] = input_args.start_date
+
+        file_path = Template(config[input_args.component]['ServerPathTemplate']).substitute(**config)
+        file_name = Template(config[input_args.component]['InputFileTemplate']).substitute(**config)
+        logger.info(f"Checking for existence of {file_path}/{file_name}.tar.gz")
+
+        timenow = datetime.datetime.now(tz=datetime.timezone.utc).time()
+
+        cmd_ssh = ["ssh","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",config['ServerName'],f"test -f {file_path}/{file_name}.tar.gz"]
+        description_short = 'subprocess_ssh'
+        description_long = f"Checking for existence of {file_path}/{file_name}.tar.gz"
+
+        status = subprocess_and_log(cmd_ssh,description_short,description_long,check=False)
+
+        if status.returncode == 1:
+
+            # a time check in UTC. If it's late, raise warning, if very late, raise error
+
+            time_0 = config[input_args.component]['TimeExpectedAvailable']
+            time_0 = datetime.datetime.strptime(time_0,'%H%M')
+
+            time_until_warn = datetime.timedelta(hours=4)
+            time_until_error = datetime.timedelta(hours=5)
+
+            time_warn = (time_0 + time_until_warn).time()
+            time_error = (time_0 + time_until_error).time()
+
+            message = f"Data not yet available for config {i+1} of {len(input_args.config_paths)}, expected between {time_0.time()} and {time_warn}, and long before {time_error}"
+
+            if timenow > time_error:
+                # job is not able to proceed
+
+                logger.warning(message)
+
+                return False
+
+            elif timenow > time_warn:
+                # job is not ready to proceed
+
+                logger.warning(message)
+                endScript(premature=True)
+
+            else:
+                # data is not yet available, but it is before the warning window
+
+                logger.info(message)
+                endScript(premature=True)
+
+        elif status.returncode == 0:
+            logger.info(f"Data is available for config {i+1} of {len(input_args.config_paths)}, calculation shall proceed")
+
+    return True
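+
+# Worked example of the timing rules above (config value is hypothetical):
+# with TimeExpectedAvailable '0300', time_warn is 07:00 and time_error is
+# 08:00 UTC. A missing file before 07:00 simply ends the script, between
+# 07:00 and 08:00 it ends the script with a warning, and after 08:00 the
+# check returns False so the job is treated as unable to proceed.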
+
+def upload(config,FilesToSend,component):
+
+    usual_path = f"{config['StartString']}_0000/"
+
+    component_path = {
+            'Environment' : usual_path,
+            'Deposition' : usual_path,
+            'Epidemiology' : usual_path,
+            'Survey' : f"SURVEYDATA_{config['StartString']}_0000/",
+            'Advisory' : usual_path }
+
+
+    # TODO: make path discern Daily or Weekly sub-directory
+
+    OutputServerPath = f"{config['ServerPath']}/{component_path[component]}"
+
+    logger.info(f"Trying upload to {config['ServerName']}:{OutputServerPath}")
+
+    logger.info(f"File(s) that will be put on remote server: {FilesToSend}")
+
+    if len(FilesToSend) == 0:
+        logger.warning('No files to send, so skipping this task')
+        raise IndexError
+
+    logger.debug("Making path directory on remote server if it doesn't already exist")
+
+    ssh_cmd = ["ssh","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",config['ServerName'], f"mkdir -p {OutputServerPath}"]
+
+    description_short = 'upload ssh'
+    description_long = 'make remote directory'
+    subprocess_and_log(ssh_cmd, description_short, description_long)
+
+    logger.debug('Sending file(s) to remote server')
+
+    scp_cmd = ["scp","-ri",config['ServerKey'],"-o","StrictHostKeyChecking=no",*FilesToSend, f"{config['ServerName']}:{OutputServerPath}"]
+
+    description_short = 'upload scp'
+    description_long = 'scp files to remote directory'
+    subprocess_and_log(scp_cmd, description_short, description_long)
+
+    return
diff --git a/ProcessorSurveys.py b/ProcessorSurveys.py
new file mode 100644
index 0000000..c79128c
--- /dev/null
+++ b/ProcessorSurveys.py
@@ -0,0 +1,1060 @@
+#ProcessorSurveys.py
+'''Functions to process the survey component.'''
+
+import csv
+import datetime
+from glob import glob
+import json
+import logging
+from pathlib import Path
+import os
+import re
+import subprocess
+import requests
+
+from numpy import all as np_all
+from shutil import copyfile
+from pandas import read_csv, Series, DataFrame, concat, json_normalize
+
+from source_gen.clustering import run_case
+
+from ProcessorUtils import (
+        open_and_check_config,
+        subprocess_and_log,
+        endScript,
+        endJob,
+        add_filters_to_sublogger,
+)
+
+logger = logging.getLogger('Processor.Surveys')
+add_filters_to_sublogger(logger)
+
+def process_pre_job_survey(input_args):
+    '''Returns a boolean as to whether the job is ready for full processing.'''
+    logger.info('started process_pre_job_survey(), nothing to do')
+
+    return True
+
+def get_ODK_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
+    '''Given a dict with a single ODK form to download from an ODK Aggregate
+    server, obtains it and converts to csv.'''
+
+    # Caution: Not tested whether different servers can be downloaded to the same ODK_output_path
+    ODK_output_path = f"{jobPath}/ExportRawDB"
+
+    # get data from ODK server
+    description_short = 'ODK download'
+    description_long = 'survey download from ODK server'
+
+    # get path to ODK executable
+    ODK_jar = form_credentials['ODK_jar']
+    assert os.path.exists(ODK_jar)
+
+    ODK_download = ['java',
+            '-jar', ODK_jar,
+            '--pull_aggregate',
+            '--form_id', form_credentials['form_id'],
+            '--storage_directory', ODK_output_path,
+            '--odk_url', form_credentials['url'],
+            '--odk_username',form_credentials['user'],
+            '--odk_password',form_credentials['pass']]
+
+    ODK_download_success = True
+
+    logger.debug('Performing ' + description_long)
+
+    try:
+        # perform a pull from the ODK server, and if it fails write a warning message
+
+        subprocess_and_log(ODK_download,description_short,description_long,log_type='warning',check=True)
+
+    except subprocess.CalledProcessError as e:
+        status.reset('WARNING')
+        ODK_download_success = False
+
+    #TODO: Check it came down cleanly ($serverOutputDir is created whether cleanly or not, so test more explicitly):
+
+    ODK_csv_path = f"{jobPath}/ExportCSV/"
+
+    Path(ODK_csv_path).mkdir(parents=True, exist_ok=True)
+
+    ODK_csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
+
+    if ODK_download_success:
+        description_short = 'ODK export'
+        description_long = 'converting ODK download to csv'
+        logger.debug(description_long)
+
+        ODK_java_to_csv = ['java',
+                '-jar', ODK_jar,
+                '--export',
+                '--form_id', form_credentials['form_id'],
+                '--storage_directory',ODK_output_path,
+                '--export_directory',ODK_csv_path,
+                '--export_filename',ODK_csv_filename]
+
+        logger.debug('Performing ' + description_long)
+
+        try:
+            subprocess_and_log(ODK_java_to_csv,description_short,description_long,check=True)
+
+        except subprocess.CalledProcessError as e:
+            status.reset('WARNING')
+            ODK_download_success = False
+
+    if not ODK_download_success:
+
+        logger.info("Because the ODK server download failed, trying to recover by copying a recent download")
+
+        ODK_copy_success = False
+
+        days_back = 1
+        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
+        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
+
+        while ((not ODK_copy_success) and (days_back <= acceptable_days_back)):
+            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
+
+            past_date = current_date - datetime.timedelta(days=days_back)
+
+            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
+            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
+
+            past_ODK_csv_path = f"{past_jobPath}/ExportCSV/"
+
+            try:
+                # check that python or perl coordinator script succeeded for that date
+                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
+                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
+                assert success_py or success_perl
+
+                #logger.warning(f"Temporary rename of expected previous download, for jobs before ~Apr 2021")
+                #past_ODK_csv_filename = f"SurveyData.csv"
+                past_ODK_csv_filename = ODK_csv_filename
+
+                logger.info(f"Looking for {past_ODK_csv_path+past_ODK_csv_filename}")
+
+                copyfile(past_ODK_csv_path+past_ODK_csv_filename,ODK_csv_path+ODK_csv_filename)
+
+                assert os.path.isfile(ODK_csv_path+ODK_csv_filename)
+
+                ODK_copy_success = True
+            except:
+                logger.info(f"No ODK download found in {past_ODK_csv_path}")
+
+            days_back += 1
+
+        if not ODK_copy_success:
+            logger.error("Failed to get a suitable copy of survey data.")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.warning(f"Using ODK download from {past_jobPath}.")
+
+    return ODK_csv_path+ODK_csv_filename
+
+# TODO: Consider moving the survey download functions into a separate file
+def get_from_kobotoolbox(url,form_id,form_token,**kwargs):
+
+    # Kenya survey form
+    #url = 'https://kf.kobotoolbox.org/'
+    #form_name = 'Wheat rust survey 1.0'
+    #form_id = 'akpyJHvYxkLKPkxFJnPyTW'
+    #form_token = '???' # this is sensitive
+
+    url_for_requests = url #f"{url}api/v2/"
+
+    url_to_get_form = f"{url_for_requests}assets/{form_id}/data.json"
+
+    headers = {'Authorization': f"Token {form_token}"}
+
+    response = requests.get(url_to_get_form,headers=headers)
+
+    if response.status_code != 200:
+        raise requests.exceptions.HTTPError('HTTP status was not 200')
+
+    logger.info('successful connection to kobotoolbox server')
+    return response
+
+def build_dataframe(response):
+
+    result_count = response.json()['count']
+
+    logger.info(f"{result_count} records")
+
+    request_results = response.json()['results']
+
+    # crude merging of list of dicts into pandas dataframe
+    df = DataFrame.from_records(request_results)
+
+    return df
+
+#parse columns into ODK format
+def parse_location_str(location_str):
+
+    # expecting a space-separated string containing four numbers which
+    # contain a decimal point
+    regex = r'(?P<lat>[-0-9\.]+)\s(?P<lon>[-0-9\.]+)\s(?P<alt>[0-9\.]+)\s(?P<acc>[0-9\.]+)'
+
+    # needed because the ODK names are too complicated for regex named groups
+    name_dict = {
+        'lat' : 'survey_infromation-location-Latitude',
+        'lon' : 'survey_infromation-location-Longitude',
+        'alt' : 'survey_infromation-location-Altitude',
+        'acc' : 'survey_infromation-location-Accuracy'
+        }
+
+    res = re.search(regex,location_str)
+
+    loc_series = Series(res.groupdict())
+
+    loc_series.rename(index=name_dict,inplace=True)
+
+    return loc_series
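+
+# Illustrative parse (coordinates are hypothetical):
+#   parse_location_str('9.005401 38.763611 2354.0 5.0')
+# returns a Series with survey_infromation-location-Latitude '9.005401',
+# -Longitude '38.763611', -Altitude '2354.0' and -Accuracy '5.0';
+# values stay as strings, no numeric conversion happens here.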
+
+def parse_location_kobotoolbox(series):
+
+    loc_df = series.apply(parse_location_str)
+
+    return loc_df
+
+def convert_date(date_str,fmt_in,fmt_out):
+
+    # in case any nan's creep in
+    if str(date_str)=='nan':
+        return 'nan'
+
+    # timezones in kobotoolbox data are irregular
+    # datetime needs +HHMM
+    # so setting up a regex to check for these cases and handle
+    pattern1 = r'\+[0-9][0-9]$'
+    if re.search(pattern1,date_str):
+        # need to provide empty MM
+        date_str = date_str + '00'
+    pattern2 = r'\+([0-9][0-9]):([0-9][0-9])$'
+    if re.search(pattern2,date_str):
+        # need to remove the colon between HH and MM
+        date_str = re.sub(pattern2,r'+\g<1>\g<2>',date_str)
+
+    date_in = datetime.datetime.strptime(date_str,fmt_in)
+    date_str_out = date_in.strftime(fmt_out)
+
+    return date_str_out
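+
+# Illustrative timezone handling (timestamps are hypothetical): a trailing
+# '+03' is padded to '+0300' and '+03:00' is rewritten to '+0300', so e.g.
+# '2022-08-31T09:15:00.000+03:00' parses with '%Y-%m-%dT%H:%M:%S.%f%z'
+# before being reformatted with fmt_out.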
+
+def parse_date(series,name_out='date',fmt_in = '%Y-%m-%d',fmt_out= '%b %d, %Y'):
+
+    s_out = series.apply(convert_date,fmt_in=fmt_in,fmt_out=fmt_out)
+
+    s_out.rename(name_out,inplace=True)
+
+    return s_out
+
+# dict of functions callable within coln_parser_dict
+# so they can be obtained with a string in coln_parser_dict
+func_dict = {
+    'parse_date' : parse_date,
+    'parse_location_kobotoolbox' : parse_location_kobotoolbox
+}
+
+def parse_columns(df_in,coln_parser_dict):
+    '''Works on each type of conversion in turn.
+
+    coln_parser_dict is the configuration used to convert columns:
+    - keys are column names in the input dataframe
+    - values that are 'None' mean the column should be dropped
+    - values that are strings simply rename the column
+    - values that are tuples describe a function to apply: the first item is
+    the string identifier of the function in func_dict and the second is a
+    collection of (key,value) pairs provided as kwargs; the function returns a
+    series/dataframe and the original column is dropped.
+    # TODO: is it necessary to provide dtype conversion somewhere (e.g. dates)?'''
+
+    df_out = df_in.copy()
+
+    # drop any indicated columns
+    coln_drop_list = [k for k,v in coln_parser_dict.items() if v == 'None']
+    logger.info(f"Dropping {len(coln_drop_list)} columns")
+    logger.debug(f"Columns being dropped are {coln_drop_list}")
+    for key in coln_drop_list:
+        del df_out[key]
+
+    # rename any indicated columns
+    coln_rename_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,str)}
+    logger.info(f"Renaming {len(coln_rename_dict)} columns")
+    logger.debug(f"Columns being renamed are {coln_rename_dict}")
+    df_out.rename(columns=coln_rename_dict,inplace=True)
+
+    # apply any functions
+    # callable only works in python 3.2+ apparently
+    coln_func_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,tuple)}
+    logger.info(f"Applying {len(coln_func_dict)} functions to columns")
+    logger.debug(f"Columns being parsed with functions are {coln_func_dict}")
+    dfs_to_concat = [df_out]
+
+    for key,val in coln_func_dict.items():
+
+        # TODO: there is a more pythonic way to get functions with a string
+        func = func_dict[val[0]]
+        assert callable(func)
+        kwargs = {k:v for k,v in val[1]}
+        columns_out = func(df_in[key],**kwargs)
+
+        if isinstance(columns_out,DataFrame):
+            num_outputs = columns_out.shape[-1]
+            column_names = columns_out.columns
+
+        elif isinstance(columns_out,Series):
+            num_outputs = 1
+            column_names = [columns_out.name]
+
+        logger.info(f"Adding {num_outputs} columns to dataframe")
+        logger.debug(f"New columns are {column_names}")
+
+        dfs_to_concat += [columns_out]
+
+        # drop the original column, now that it has been parsed with func
+        del df_out[key]
+
+    df_final = concat(dfs_to_concat,axis='columns')
+
+    return df_final
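+
+# Minimal sketch of a coln_parser_dict, using entries from the kobotoolbox
+# parser defined below:
+#   {
+#     '_notes'           : 'None',       # drop the column
+#     'comment'          : 'comment',    # keep the column name as-is
+#     '_submission_time' : ('parse_date',
+#                           (('name_out','SubmissionDate'),
+#                            ('fmt_in','%Y-%m-%dT%H:%M:%S'))),
+#   }
+# The tuple entry runs parse_date on that column and replaces it with the
+# returned 'SubmissionDate' column.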
+
+def get_kobotoolbox_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
+    '''Given a dict with a single kobotoolbox form to download from a kobotoolbox
+    server, obtains it and converts to csv.'''
+
+    output_dir = 'Export_kobotoolbox'
+    output_path = f"{jobPath}/{output_dir}/"
+
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+
+    # get data from kobotoolbox server
+
+    # keys are column names in the input dataframe
+    # values that are None mean they should be dropped
+    # values that are string simply rename the column
+    # values that are functions should be run with that key and returns series/dataframe
+    column_parser_dict = {
+        '__version__' : 'None',
+        '_attachments' : 'None',
+        '_bamboo_dataset_id' : 'None',
+        '_geolocation' : 'None', # looks like a duplication of survey_infromation/location
+        '_id' : 'None',
+        '_notes' : 'None',
+        '_status' : 'None',
+        '_submission_time' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%Y-%m-%dT%H:%M:%S'))),
+        '_submitted_by' : 'None',
+        '_tags' : 'None',
+        '_uuid' : 'KEY',
+        '_validation_status' : 'None',
+        '_xform_id_string' : 'None',
+        'comment' : 'comment',
+        'dead_stemrust_samples' : 'SET-OF-dead_stemrust_samples',
+        'dead_stemrust_samples_count' : 'dead_stemrust_samples_count',
+        'dead_yellowrust_samples' : 'SET-OF-dead_yellowrust_samples',
+        'dead_yellowrust_samples_count' : 'dead_yellowrust_samples_count',
+        'deviceid' : 'deviceid',
+        'end' : ('parse_date',(('name_out','end'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
+        'formhub/uuid' : 'None',
+        'imei' : 'imei',
+        'leaf_rust/leafrust_host_plant_reaction' : 'leaf_rust-leafrust_host_plant_reaction',
+        'leaf_rust/leafrust_incidence' : 'leaf_rust-leafrust_incidence',
+        'leaf_rust/leafrust_severity' : 'leaf_rust-leafrust_severity',
+        'live_leafrust_samples' : 'SET-OF-live_leafrust_samples',
+        'live_leafrust_samples_count' : 'live_leafrust_samples_count',
+        'live_stemrust_samples' : 'SET-OF-live_stemrust_samples',
+        'live_stemrust_samples_count' : 'live_stemrust_samples_count',
+        'live_yellowrust_samples' : 'SET-OF-live_yellowrust_samples',
+        'live_yellowrust_samples_count' : 'live_yellowrust_samples_count',
+        'meta/instanceID' : 'meta-instanceID',
+        'other_crop' : 'other_crop',
+        'other_diseases_group/other_diseases' : 'other_diseases_group-other_diseases',
+        'phonenumber' : 'phonenumber',
+        'sample_size/number_leafrust_live' : 'sample_size-number_leafrust_live',
+        'sample_size/number_stemrust_dead_dna' : 'sample_size-number_stemrust_dead_dna',
+        'sample_size/number_stemrust_live' : 'sample_size-number_stemrust_live',
+        'sample_size/number_yellowrust_dead' : 'sample_size-number_yellowrust_dead',
+        'sample_size/number_yellowrust_live' : 'sample_size-number_yellowrust_live',
+        'sample_size/using_barcode' : 'sample_size-using_barcode',
+        'samples_collected' : 'samples_collected',
+        'samples_type' : 'samples_type',
+        'score_diseases' : 'SET-OF-score_diseases',
+        'score_diseases_count' : 'score_diseases_count',
+        'septoria/septoria_incidence' : 'septoria-septoria_incidence',
+        'septoria/septoria_severity' : 'septoria-septoria_severity',
+        'site_information/crop' : 'site_information-crop',
+        'site_information/field_area' : 'site_information-field_area',
+        'site_information/growth_stage' : 'site_information-growth_stage',
+        'site_information/survey_site' : 'site_information-survey_site',
+        'site_information/variety' : 'site_information-variety',
+        'start' : ('parse_date',(('name_out','start'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
+        'stem_rust/Stemrust_severity' : 'stem_rust-Stemrust_severity',
+        'stem_rust/stemrust_host_plant_reaction' : 'stem_rust-stemrust_host_plant_reaction',
+        'stem_rust/stemrust_incidence' : 'stem_rust-stemrust_incidence',
+        'subscriberid' : 'subscriberid',
+        'survey_infromation/location' : ('parse_location_kobotoolbox',()),
+        'survey_infromation/location_name' : 'survey_infromation-location_name',
+        'survey_infromation/survey_date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%Y-%m-%d'))),
+        'surveyor_infromation/country' : 'surveyor_infromation-country',
+        'surveyor_infromation/institution' : 'surveyor_infromation-institution',
+        'surveyor_infromation/surveyor_name' : 'surveyor_infromation-surveyor_name',
+        'today' : ('parse_date',(('name_out','today'),('fmt_in','%Y-%m-%d'))),
+        'username' : 'username',
+        'yellow_rust/yellowrust_host_plant_reaction' : 'yellow_rust-yellowrust_host_plant_reaction',
+        'yellow_rust/yellowrust_incidence' : 'yellow_rust-yellowrust_incidence',
+        'yellow_rust/yellowrust_severity' : 'yellow_rust-yellowrust_severity',
+        }
+
+    logger.debug('Performing download')
+
+    # perform a pull from the server, and if it fails write a warning message
+
+    download_success = True
+
+    try:
+
+        request = get_from_kobotoolbox(**form_credentials)
+
+    except requests.exceptions.RequestException as e:
+        status.reset('WARNING')
+
+        download_success = False
+
+    # define filenames
+    csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
+
+    csv_processed_filename = f"SurveyDataProcessed.csv"
+    csv_processed_path = f"{output_path}/{csv_processed_filename}"
+
+    if download_success:
+        # parse dataframe
+
+        dataframe_raw = build_dataframe(request)
+
+        logger.debug('Saving raw csv file')
+
+        df_raw_filename = f"{output_path}/{csv_filename}"
+
+        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
+
+        # process to match ODK format
+
+        dataframe_processed = parse_columns(dataframe_raw,column_parser_dict)
+
+        logger.debug('Saving processed csv file')
+
+        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    if not download_success:
+
+        logger.info("Because the server download failed, trying to recover by copying a recent download")
+
+        copy_success = False
+
+        days_back = 1
+        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
+        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
+
+        while ((not copy_success) and (days_back <= acceptable_days_back)):
+
+            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
+
+            past_date = current_date - datetime.timedelta(days=days_back)
+
+            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
+            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
+
+            past_output_path = f"{past_jobPath}/{output_dir}/"
+
+            try:
+                # check that python or perl coordinator script succeeded for that date
+                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
+                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
+                assert success_py or success_perl
+
+                past_csv_filename = csv_processed_filename
+
+                logger.info(f"Looking for {past_output_path+past_csv_filename}")
+
+                copyfile(past_output_path+past_csv_filename,csv_processed_path)
+
+                assert os.path.isfile(csv_processed_path)
+
+                copy_success = True
+            except:
+                logger.info(f"No kobotoolbox download found in {past_output_path}")
+
+            days_back += 1
+
+        if not copy_success:
+            logger.error("Failed to get a suitable copy of survey data.")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.warning(f"Using download from {past_jobPath}.")
+
+    return csv_processed_path
+
+def get_from_WRSIS(form_credentials: dict, startDate: str, endDate: str):
+    date_params = {
+        'fromDate':startDate,
+        'toDate':endDate}
+
+    # set up http session
+    session = requests.Session()
+
+    # provide authorisation
+    session.auth = (form_credentials['user'],form_credentials['pass'])
+
+    response = session.post(f"{form_credentials['url']}getUKMetSurveyData",json=date_params)
+
+    # possible HTTP responses as provided in the API document
+    # (I've seen some other responses though, e.g. 415)
+    # It seems there is another layer of status codes
+    status_codes = {
+            200 : 'OK',
+            201 : 'Created',
+            202 : 'Accepted (Request accepted, and queued for execution)',
+            400 : 'Bad request',
+            401 : 'Authentication failure',
+            403 : 'Forbidden',
+            404 : 'Resource not found',
+            405 : 'Method Not Allowed',
+            409 : 'Conflict',
+            412 : 'Precondition Failed',
+            413 : 'Request Entity Too Large',
+            500 : 'Internal Server Error',
+            501 : 'Not Implemented',
+            503 : 'Service Unavailable'}
+
+    # checking the HTTP status code (not the code in the response)
+    if response.status_code == 200:
+        logger.info('HTTP request succeeded OK')
+
+    elif response.status_code in status_codes:
+        logger.info("HTTP response did not succeed OK, code is {:d}: {:s} ".format(response.status_code,status_codes[response.status_code]))
+        raise requests.exceptions.HTTPError('HTTP status was not 200')
+
+    else:
+        logger.info("HTTP response did not succeed OK, unknown code {:d}".format(response.status_code))
+        raise requests.exceptions.HTTPError('HTTP status was not 200')
+
+    return response
+
+def categorize_incident(incident):
+    '''Converting incident values into category string.
+       TODO: float values are not handled'''
+
+    try:
+        incident_value = int(incident)
+
+        if  0 < incident_value <= 20:
+            incident_category = "low"
+        elif 20 < incident_value <= 40:
+            incident_category = "medium"
+        elif 40 < incident_value <= 100:
+            incident_category = "high"
+        else:
+            incident_category = "none"
+    except:
+        if incident.lower() in ["low", "medium", "high", "none", "na"]:
+            incident_category = incident.lower()
+        else:
+            incident_category = "none"
+
+    return incident_category
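+
+# Example mappings: categorize_incident('15') -> 'low',
+# categorize_incident('35') -> 'medium', categorize_incident('75') -> 'high',
+# categorize_incident('NA') -> 'na', and unrecognised text -> 'none'.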
+
+def nested_to_flattened(df):
+    '''WRSIS rust data is in a nested format, so it requires flattening.
+       To do this, the nested data need to be separated into dedicated columns.'''
+
+    # check if the dataframe is empty, if it is then add the raw columns
+    if len(df.index) == 0:
+        logger.info('Recent WRSIS download is empty.')
+        logger.info('Adding raw columns.')
+        RAW_COLUMNS = ["Rust Details","Other Disease","Sample Details","Survey Details.Latitude","Survey Details.First Rust Observation Date","Survey Details.Longitude","Survey Details.Kebele Name","Survey Details.Publish Date","Survey Details.Region Name","Survey Details.Survey Date","Survey Details.Season","Survey Details.Planting Date","Survey Details.Woreda Name","Survey Details.Location other details","Survey Details.Tillering Date","Survey Details.Zone Name","Survey Other Details.Moisture","Survey Other Details.Soil colour","Survey Other Details.Weed Control","Survey Other Details.Irrigated","Site Information.Wheat Type","Site Information.Growth Stage","Site Information.Varity Name","Site Information.Survey Site","Site Information.Site Area","Surveyor Details.Surveyors","Surveyor Details.Country","Surveyor Details.Other Surveyors","Surveyor Details.Institution Name","Fungicide Details.Fungicide Name","Fungicide Details.Spray Date","Fungicide Details.EffectiveNess","Fungicide Details.Used Dose"]
+        for i in RAW_COLUMNS:
+            df[i] = ""
+
+    # add new columns
+    logger.info('Adding new columns')
+    NEW_COLUMNS = ['imei', 'sample_size-number_yellowrust_live', 'sample_size-number_stemrust_live', 'dead_stemrust_samples_count', 'samples_collected', 'sample_size-number_yellowrust_dead', 'live_leafrust_samples_count', 'other_crop', 'live_yellowrust_samples_count', 'subscriberid', 'sample_size-using_barcode', 'start', 'score_diseases_count', 'phonenumber', 'survey_infromation-location-Accuracy', 'SET-OF-live_yellowrust_samples', 'SET-OF-score_diseases', 'meta-instanceID', 'deviceid', 'end', 'samples_type', 'live_stemrust_samples_count', 'dead_yellowrust_samples_count', 'SET-OF-live_leafrust_samples', 'KEY', 'other_diseases_group-other_diseases', 'survey_infromation-location-Altitude', 'SET-OF-dead_stemrust_samples', 'comment', 'sample_size-number_leafrust_live', 'today', 'SET-OF-dead_yellowrust_samples', 'username', 'SET-OF-live_stemrust_samples', 'sample_size-number_stemrust_dead_dna']
+
+    for i in NEW_COLUMNS:
+        df[i] = ""
+
+    #TODO: replace with a better KEY column
+    df["KEY"] = df.index
+
+    # add dedicated rust columns, with default values
+    NEW_RUST_COLUMNS = {"Stem Rust.Incident":"none","Stem Rust.Severity":"-9","Stem Rust.Reaction":"na",
+                   "Leaf Rust.Incident":"none","Leaf Rust.Severity":"-9","Leaf Rust.Reaction":"na",
+                   "Yellow Rust.Incident":"none","Yellow Rust.Severity":"-9","Yellow Rust.Reaction":"na",
+                   "Septoria.Incident":"none","Septoria.Severity":"0"}
+
+    for i in NEW_RUST_COLUMNS.keys():
+        df[i] = NEW_RUST_COLUMNS[i]
+
+    logger.info('Separating nested information into dedicated columns')
+
+    for index,row in df.iterrows():
+        nested_row = row["Rust Details"]
+        for rr in range(len(nested_row)):
+            # separating nested information into the dedicated columns
+            row[nested_row[rr]["Rust Type"] + ".Incident"] = categorize_incident(nested_row[rr]["Incident"])
+            row[nested_row[rr]["Rust Type"] + ".Severity"] = nested_row[rr]["Severity"]
+            row[nested_row[rr]["Rust Type"] + ".Reaction"] = nested_row[rr]["Reaction"]
+            df.loc[index] = row
+
+    return df
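+
+# Illustrative flattening (record values are hypothetical): a row whose
+# "Rust Details" entry is
+#   [{"Rust Type": "Stem Rust", "Incident": "35",
+#     "Severity": "20", "Reaction": "MS"}]
+# ends up with Stem Rust.Incident = 'medium' (via categorize_incident),
+# Stem Rust.Severity = '20' and Stem Rust.Reaction = 'MS', while the other
+# rust columns keep their default values.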
+
+def get_WRSIS_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
+    '''Given a dict with a single WRSIS form to download from WRSIS, obtains it and converts to csv.'''
+
+    output_dir = 'Export_WRSIS'
+    output_path = f"{jobPath}/{output_dir}/"
+
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+
+    # get data from WRSIS
+
+    # keys are column names in the input dataframe
+    # values that are None mean they should be dropped
+    # values that are string simply rename the column
+    # values that are functions should be run with that key and returns series/dataframe
+    column_parser_dict = {
+        'Rust Details' : 'None',
+        'Other Disease' : 'None',
+        'Sample Details' : 'None',
+        'Survey Details.Latitude' : 'survey_infromation-location-Latitude',
+        'Survey Details.First Rust Observation Date' : 'None',
+        'Survey Details.Longitude' : 'survey_infromation-location-Longitude',
+        'Survey Details.Kebele Name' : 'None',
+        'Survey Details.Publish Date' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%d-%b-%Y'))),
+        'Survey Details.Region Name' : 'None',
+        'Survey Details.Survey Date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%d-%b-%Y'))),
+        'Survey Details.Season' : 'None',
+        'Survey Details.Planting Date' : 'None',
+        'Survey Details.Woreda Name' : 'None',
+        'Survey Details.Location other details' : 'None',
+        'Survey Details.Tillering Date' : 'None',
+        'Survey Details.Zone Name' : 'survey_infromation-location_name',
+        'Survey Other Details.Moisture' : 'None',
+        'Survey Other Details.Soil colour' : 'None',
+        'Survey Other Details.Weed Control' : 'None',
+        'Survey Other Details.Irrigated' : 'None',
+        'Site Information.Wheat Type' : 'site_information-crop',
+        'Site Information.Growth Stage' : 'site_information-growth_stage',
+        'Site Information.Varity Name' : 'site_information-variety',
+        'Site Information.Survey Site' : 'site_information-survey_site',
+        'Site Information.Site Area' : 'site_information-field_area',
+        'Surveyor Details.Surveyors' : 'surveyor_infromation-surveyor_name',
+        'Surveyor Details.Country' : 'surveyor_infromation-country',
+        'Surveyor Details.Institution Name' : 'surveyor_infromation-institution',
+        'Surveyor Details.Other Surveyors' : 'None',
+        #'Fungicide Details.Fungicide Name' : 'None',
+        #'Fungicide Details.Spray Date' : 'None',
+        #'Fungicide Details.EffectiveNess' : 'None',
+        #'Fungicide Details.Used Dose' : 'None',
+        "Yellow Rust.Severity" : 'yellow_rust-yellowrust_severity',
+        "Yellow Rust.Incident" : 'yellow_rust-yellowrust_incidence',
+        "Yellow Rust.Reaction" : 'yellow_rust-yellowrust_host_plant_reaction',
+        "Stem Rust.Severity" : 'stem_rust-Stemrust_severity',
+        "Stem Rust.Incident" : 'stem_rust-stemrust_incidence',
+        "Stem Rust.Reaction" : 'stem_rust-stemrust_host_plant_reaction',
+        "Leaf Rust.Severity" : 'leaf_rust-leafrust_severity',
+        "Leaf Rust.Incident" : 'leaf_rust-leafrust_incidence',
+        "Leaf Rust.Reaction" : 'leaf_rust-leafrust_host_plant_reaction',
+        "Septoria.Severity" : 'septoria-septoria_severity',
+        "Septoria.Incident" : 'septoria-septoria_incidence'
+    }
+
+    # perform a pull from the server, and if it fails write a warning message
+
+    download_success = True
+
+    start_date = datetime.datetime.strptime('01-03-2022','%d-%m-%Y').strftime('%d-%m-%Y') #TODO: set start date
+    end_date = datetime.datetime.strptime(config['StartString'], '%Y%m%d').strftime('%d-%m-%Y')
+
+    logger.debug(f'Performing download from WRSIS between {start_date} and {end_date}')
+
+    try:
+        request = get_from_WRSIS(form_credentials,start_date,end_date)
+
+    except requests.exceptions.RequestException as e:
+        status.reset('WARNING')
+
+        download_success = False
+
+    # define filenames
+    csv_filename = f"SurveyData_raw.csv"
+
+    csv_processed_filename = f"SurveyDataProcessed.csv"
+    csv_processed_path = f"{output_path}/{csv_processed_filename}"
+
+    if download_success:
+        # parse dataframe
+
+        logger.debug('Saving raw csv file')
+
+        df_raw_filename = f"{output_path}/{csv_filename}"
+        dataframe_raw = json_normalize(request.json()["response"]["Rust Survey Data"])
+
+        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
+
+        # flatten the nested dataframe
+        dataframe_flattened = nested_to_flattened(dataframe_raw)
+
+        # process to match ODK format
+        dataframe_processed = parse_columns(dataframe_flattened,column_parser_dict)
+
+        logger.debug('Saving processed csv file')
+
+        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    if not download_success:
+
+        logger.info("Because the server download failed, trying to recover by copying a recent download")
+
+        copy_success = False
+
+        days_back = 1
+        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
+        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
+
+        while ((not copy_success) and (days_back <= acceptable_days_back)):
+
+            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
+
+            past_date = current_date - datetime.timedelta(days=days_back)
+
+            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
+            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
+
+            past_output_path = f"{past_jobPath}/{output_dir}/"
+
+            try:
+                # check that python or perl coordinator script succeeded for that date
+                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
+                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
+                assert success_py or success_perl
+
+                past_csv_filename = csv_processed_filename
+
+                logger.info(f"Looking for {past_output_path+past_csv_filename}")
+
+                copyfile(past_output_path+past_csv_filename,csv_processed_path)
+
+                assert os.path.isfile(csv_processed_path)
+
+                copy_success = True
+            except:
+                logger.info(f"Did not find a WRSIS download in {past_output_path}")
+
+            days_back += 1
+
+        if not copy_success:
+            logger.error(f"Failed get a suitable copy of survey data.")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.warning(f"Using download from {past_jobPath}.")
+
+    return csv_processed_path
+
+def process_in_job_survey(jobPath,status,config,component):
+    logger.info('started process_in_job_survey()')
+
+    logger.debug('Performing download(s) from ODK server')
+
+    credentials_filename = config['Survey']['ServerCredentialsFile']
+    with open(credentials_filename) as credentials_file:
+
+        cred = json.load(credentials_file)
+
+        assert 'forms' in cred.keys()
+
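+    # Each entry of cred['forms'] is expected to carry at least 'form_id' and
+    # 'type' (one of 'ODK', 'kobotoolbox' or 'WRSIS'), plus whatever server
+    # details the matching get_*_form_as_csv function needs, e.g.
+    # (illustrative only): {"form_id": "wheat_survey_v1", "type": "ODK", ...}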
+    csv_filenames = {}
+    for form in cred['forms']:
+
+        logger.debug(f"Starting to download {form['form_id']}")
+
+        get_form_as_csv_dict = {
+            'ODK' : get_ODK_form_as_csv,
+            'kobotoolbox' : get_kobotoolbox_form_as_csv,
+            'WRSIS' : get_WRSIS_form_as_csv
+        }
+
+        assert form['type'] in get_form_as_csv_dict
+
+        func_get_form_as_csv = get_form_as_csv_dict[form['type']]
+
+        csv_filename = func_get_form_as_csv(form, jobPath, config, status)
+
+        csv_filenames[form['form_id']] = csv_filename
+
+    # load each file of surveys as a dataframe
+    forms = {}
+    for form_name,form_fn in csv_filenames.items():
+
+        # define some column types, hardwired for now
+        col_types = {'comment':'str'}
+
+        form_df = read_csv(form_fn,dtype=col_types)
+
+        forms[form_name] = form_df
+
+    # create some standard dataframe modification functions
+    def add_column(df,coln,value):
+        df[coln]=value
+        return
+
+    def remove_column(df,coln,value):
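+        # 'value' is unused here; the signature is kept consistent with the
+        # other edit functions so they can share the func_types dispatch below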
+        del df[coln]
+        return
+
+    def replace_column(df,coln,value):
+        df[coln]=value
+        return
+
+    def filter_by_column(df,coln,value):
+        # CAUTION: This requires surveyor to provide the correct country
+        df.drop(df.loc[df[coln]!=value].index,inplace=True)
+        #TODO : for Kenya data, provide a coordinate-based filter
+        return
+
+    def filter_by_list(df,coln,values):
+        # CAUTION: This requires surveyor to provide the correct list of countries
+        df.drop(df.loc[~df[coln].isin(values)].index,inplace=True)
+        return
+
+    func_types = {
+        'add': add_column,
+        'remove' : remove_column,
+        'replace' : replace_column,
+        'filter' : filter_by_column,
+        'filter_by_list' : filter_by_list
+    }
+
+    # simple format alignment using edits on config
+    # (should this need to be much more sophisticated, reconsider the workflow)
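+    # config['Survey']['FormEdits'] maps form_id -> edit type -> column -> value,
+    # e.g. (illustrative only):
+    #   {'some_form_id': {'add': {'Origin': 'some_value'},
+    #                     'filter': {'surveyor_infromation-country': 'SomeCountry'}}}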
+    if 'FormEdits' in config['Survey']:
+
+        form_edits = config['Survey']['FormEdits']
+
+        # loop over each form
+        for form_name, edits in form_edits.items():
+
+            form_df = forms[form_name]
+
+            # loop over each type of edit
+            for func_type, columns in edits.items():
+
+                # check the function is available
+                assert func_type in func_types
+
+                # loop over each column to modify
+                for coln,val in columns.items():
+
+                    # apply the edit
+                    func_types[func_type](form_df,coln,val)
+
+    # Merge additional SurveyData files and rearrange columns to be consistent
+    # Assumes that the same columns are present in all forms
+    # and that the first form is the standard
+
+    first=True
+    for dfi in forms.values():
+
+        if first:
+            standard_columns = dfi.columns.tolist()
+            dfm = dfi
+
+            logger.debug(f"First processed form contains {dfm.shape[0]} records")
+
+            first=False
+            continue
+
+        # re-order columns to match first case (presumed standard format)
+        dfi = dfi[standard_columns]
+
+        logger.debug(f"Next processed form contains {dfi.shape[0]} records")
+
+        dfm = concat([dfm,dfi],axis='rows')
+
+    # save the result
+    ODK_csv_path = f"{jobPath}/ExportCSV/"
+    forms_fn = f"{ODK_csv_path}/Merged_SurveyData.csv"
+    dfm.to_csv(forms_fn,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    logger.debug(f"Preparing to apply removals and additions to ODK survey data")
+
+    processed_surveys_filepath = f"{ODK_csv_path}/Processed_SurveyData.csv"
+
+    survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
+    survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"
+
+    # perform here in python, using the 'KEY' column
+    # check the key column is unique
+
+    assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'
+
+    df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
+    keys_to_rm = df_rm['KEY']
+
+    # check that all of the keys to remove exist in the original data
+    rm_keys_found = df_rm['KEY'].apply(lambda cell: cell in dfm['KEY'].values)
+    n_rm_keys_found = rm_keys_found.sum()
+    n_rm_keys = rm_keys_found.size
+    if not np_all(rm_keys_found):
+        # this might happen if the run date is in the past
+        logger.warning(f"Only found {n_rm_keys_found} of {n_rm_keys} survey errors to remove")
+
+        rm_keys_not_found = df_rm[~rm_keys_found]
+        logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}")
+
+    # identify which surveys to remove
+    idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values)
+
+    # drop the erroneous surveys (note: this creates a filtered copy rather than dropping in-place)
+    dfm = dfm[~idx_to_rm]
+    logger.info(f"Removed {n_rm_keys_found} erroneous surveys")
+
+    # add the extra entries
+    df_add = read_csv(survey_additions_filepath,dtype='str')
+    n_add_keys = df_add.shape[0]
+    df_join = concat([dfm,df_add])
+    assert dfm.shape[0]+df_add.shape[0] == df_join.shape[0], 'Unexpected result of including additional surveys'
+
+    logger.info(f"Added {n_add_keys} additional surveys")
+
+    # save as processed
+    df_join.to_csv(processed_surveys_filepath,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    logger.debug('Preparing clustering calculation')
+
+    date = datetime.datetime.now()
+
+    # prepare environment for clustering calc
+    call_R = True
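+    # call_R switches between the R-based wheat_source_generation clustering
+    # and the python source_gen alternative (run_case) further below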
+
+    output_directory = f"{jobPath}/SURVEYDATA_{config['StartString']}_0000"
+    Path(output_directory).mkdir(parents=True, exist_ok=True)
+
+    if call_R:
+
+        cluster_calc_path = "/storage/app/EWS_prod/code/wheat_source_generation/"
+
+        # clear old output
+        old_clustering_output_glob = f"{cluster_calc_path}/output/sources_*"
+        old_clustering_outputs = glob(old_clustering_output_glob)
+
+        logger.info('About to unlink old output from clustering calculation')
+        for path in old_clustering_outputs:
+            logger.info(f"unlinking {path}")
+            Path(path).unlink()
+
+
+        RPath = '/usr/local/R/bin/Rscript'
+
+        clustering_script = f"{cluster_calc_path}/code/R/clustering.R"
+
+        clustering_env = {
+                **os.environ,
+                'R_LIBS':'/home/ewsmanager/R-packages-EWS-clustering/x86_64-pc-linux-gnu-library/3.5',
+                'PROJ_LIB' : '/usr/share/proj/', # conda env breaks the automatic assignment of PROJ_LIB
+                }
+
+        clustering_config = config['Survey']['SourcesConfigFilename']
+        assert os.path.isfile(clustering_config)
+
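+        # positional arguments to clustering.R: the processed survey csv, the
+        # run date, the day offsets (-2 and +7 days, mirroring the day_offsets
+        # passed to the python run_case alternative below) and the sources
+        # config file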
+        clustering_calc = [RPath,
+                '--no-init-file',
+                clustering_script,
+                processed_surveys_filepath,
+                config['StartString'],
+                '-2',
+                '7',
+                clustering_config]
+
+        logger.debug('Performing clustering calculation')
+
+        description_short = 'wheat-source-generation'
+        description_long = 'source calculation on processed surveys'
+
+        try:
+            subprocess_and_log(clustering_calc, description_short, description_long, env=clustering_env)
+        except:
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.debug('Checking output of clustering calculation')
+
+        try:
+            logger.debug('Trying to copy the dataset processed for clustering')
+
+            clustering_proc_path_glob = f"{cluster_calc_path}/output/survey_data_processed_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
+            clustering_proc_path_list = glob(clustering_proc_path_glob)
+            if len(clustering_proc_path_list) == 0:
+                logger.debug(f"No processed files produced from clustering in {clustering_proc_path_glob}")
+                raise Exception
+
+            elif len(clustering_proc_path_list) > 1:
+                logger.debug(f"Multiple processed files produced from clustering in {clustering_proc_path_glob}")
+                raise Exception
+
+            else:
+                logger.debug('Found 1 processed file, placing copy of result in job directory')
+
+                proc_filename = f"survey_data_processed_{config['StartString']}.csv"
+                proc_path = f"{output_directory}/{proc_filename}"
+
+                logger.debug(f"as {proc_path}")
+
+                copyfile(clustering_proc_path_list[0], proc_path)
+
+        except:
+            logger.debug('Failed to get a copy of the dataset processed for clustering')
+
+        clustering_output_path_glob = f"{cluster_calc_path}/output/sources_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
+        clustering_output_path_list = glob(clustering_output_path_glob)
+        if len(clustering_output_path_list) == 0:
+            logger.error(f"No output produced from clustering in {clustering_output_path_glob}")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+        if len(clustering_output_path_list) > 1:
+            logger.error(f"Multiple outputs produced from clustering in {clustering_output_path_glob}")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        sources_path = clustering_output_path_list[0]
+
+    else:
+        # run python version
+
+        sources_path = run_case(
+                config_path = config['Survey']['pySourcesConfigFilename'],
+                survey_path = processed_surveys_filepath,
+                reference_date = config['StartString'],
+                day_offsets = [-2,7],
+                output_dir = output_directory)
+
+    logger.debug('Placing copy of result in job directory with conventional name')
+
+    output_filename = f"sources_{config['StartString']}.csv"
+    output_path = f"{output_directory}/{output_filename}"
+
+    logger.debug(f"as {output_path}")
+
+    copyfile(sources_path, output_path)
+
+    return [output_path]
+
+#TODO
+def process_EWS_plotting_survey(jobPath,config):
+    '''Returns a list of output files for transfer.'''
+
+    logger.info('started process_EWS_plotting_survey(), nothing to do')
+
+    return []
diff --git a/ProcessorUtils.py b/ProcessorUtils.py
index 4e0dee3..1a8707d 100644
--- a/ProcessorUtils.py
+++ b/ProcessorUtils.py
@@ -6,10 +6,25 @@ import json
 import logging
 import os
 import re
+from string import Template
 import subprocess
+import sys
 import tarfile
 
-import sys
+
+short_name = {
+        'Advisory' : 'SUMMARY',
+        'Deposition' : 'DEPOSITION',
+        'Environment' : 'ENVIRONMENT_2.0',
+        'Epidemiology' : 'EPI',
+        'Survey' : 'SURVEYDATA',
+        }
+
+disease_latin_name_dict = {
+        'StemRust' : 'P_GRAMINIS',
+        'StripeRust' : 'P_STRIIFORMIS',
+        'LeafRust' : 'P_RECONDITA',
+        'WheatBlast' : 'M_ORYZAE'}
 
 # define logging filter to obscure ODK passwords
 
@@ -179,3 +194,57 @@ def remove_path_from_tar_members(tf: tarfile.TarFile):
     for member in tf.getmembers():
         member.path = os.path.basename(member.path)
         yield member
+
+def query_proceed(necessary_file,description):
+
+    try:
+
+        assert os.path.isfile(necessary_file)
+
+        logger.info(f"Found:\n{necessary_file}\nso {description} job has succeeded for this date, this job shall run.")
+
+    except AssertionError as e:
+
+        logger.info(f"Failed to find:\n{necessary_file}\nso {description} job has not yet succeeded for this date, so cannot run this job.")
+
+        endScript(premature=True)
+
+        return False
+
+    return True
+
+def query_past_successes(input_args):
+    '''Checks whether the deposition and environment jobs have already
+    completed successfully. If not, the script is ended prematurely.'''
+
+    component = input_args.component
+
+    # check configs can be loaded
+    config_fns = input_args.config_paths
+    for configFile in config_fns:
+        try:
+            config_i = open_and_check_config(configFile)
+        except:
+            logger.exception(f"Failure in opening or checking config {configFile}")
+            endScript(premature=True)
+
+        # some config initialisation is necessary
+        config_i['StartString'] = input_args.start_date
+
+        # check if deposition data is readily available
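+        # SuccessFileTemplate is a string.Template resolved against the config,
+        # e.g. (illustrative only):
+        #   '${WorkspacePathout}DEPOSITION_${StartString}/STATUS_SUCCESS'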
+        dep_success_file = Template(config_i[component]['Deposition']['SuccessFileTemplate']).substitute(**config_i)
+        try:
+            query_proceed(dep_success_file,'deposition')
+        except:
+            dep_success_file_alt = Template(config_i[component]['Deposition']['AlternativeSuccessFileTemplate']).substitute(**config_i)
+            query_proceed(dep_success_file_alt,'deposition')
+
+        # check if environment data is readily available
+        env_success_file = Template(config_i[component]['Environment']['SuccessFileTemplate']).substitute(**config_i)
+        try:
+            query_proceed(env_success_file,'environment')
+        except:
+            env_success_file_alt = Template(config_i[component]['Environment']['AlternativeSuccessFileTemplate']).substitute(**config_i)
+            query_proceed(env_success_file_alt,'environment')
+
+    return True
\ No newline at end of file
diff --git a/run_Processor.sh b/run_Processor.sh
index 722c2f1..02e20d6 100755
--- a/run_Processor.sh
+++ b/run_Processor.sh
@@ -13,8 +13,9 @@ advisory=${bin}/advisory_builder/
 met_processing=${bin}/met_extractor_v2/main/
 met_processor=${bin}/environmental_suitability/
 plotting=${bin}/plotting/
+source_gen=${bin}/source_gen/
 
-export PYTHONPATH=$PYTHONPATH:$flagdir:$epimodel:$advisory:$met_processing:$met_processor:$plotting
+export PYTHONPATH=$PYTHONPATH:$flagdir:$epimodel:$advisory:$met_processing:$met_processor:$plotting:$source_gen
 
 # provide path to email credentials for logging
 
-- 
GitLab