#ProcessorSurveys.py
'''Functions to process the survey component.'''

import csv
import datetime
from glob import glob
import json
import logging
from pathlib import Path
import os
import re
import subprocess
import requests

from numpy import all as np_all
from shutil import copyfile
from pandas import read_csv, Series, DataFrame, concat, json_normalize

from source_gen.clustering import run_case

from ProcessorUtils import (
        subprocess_and_log,
        endJob,
        add_filters_to_sublogger,
)

logger = logging.getLogger('Processor.Surveys')
add_filters_to_sublogger(logger)

def process_pre_job_survey(input_args):
    '''Returns a boolean as to whether the job is ready for full processing.'''
    logger.info('started process_pre_job_survey(), nothing to do')

    return True

def get_ODK_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
    '''Given a dict with a single ODK form to download from an ODK Aggregate
    server, obtains it and converts to csv.'''

    # Caution: it has not been tested whether different servers can be downloaded to the same ODK_output_path
    ODK_output_path = f"{jobPath}/ExportRawDB"

    # get data from ODK server
    description_short = 'ODK download'
    description_long = 'survey download from ODK server'

    # get path to ODK executable
    ODK_jar = form_credentials['ODK_jar']
    assert os.path.exists(ODK_jar)

    skip_download: bool = config['Survey'].get('SkipServerDownload', False)

    ODK_download_success = True

    if not skip_download:
        try:
            ODK_download = ['java',
                            '-jar', ODK_jar,
                            '--pull_aggregate',
                            '--form_id', form_credentials['form_id'],
                            '--storage_directory', ODK_output_path,
                            '--odk_url', form_credentials['url'],
                            '--odk_username', form_credentials['user'],
                            '--odk_password', form_credentials['pass']]

            logger.debug('Performing ' + description_long)

            # perform a pull from the ODK server, and if it fails write a warning message

            subprocess_and_log(ODK_download,description_short,description_long,log_type='warning',check=True)

        except subprocess.CalledProcessError as e:
            status.reset('WARNING')
            ODK_download_success = False

    #TODO: Check it came down cleanly ($serverOutputDir is created whether cleanly or not, so test more explicitly):

    ODK_csv_path = f"{jobPath}/ExportCSV/"

    Path(ODK_csv_path).mkdir(parents = True, exist_ok = True)

    ODK_csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"

    if ODK_download_success and not skip_download:

        description_short = 'ODK export'
        description_long = 'converting ODK download to csv'
        logger.debug(description_long)

        ODK_java_to_csv = ['java',
                '-jar', ODK_jar,
                '--export',
                '--form_id', form_credentials['form_id'],
                '--storage_directory',ODK_output_path,
                '--export_directory',ODK_csv_path,
                '--export_filename',ODK_csv_filename]

        logger.debug('Performing ' + description_long)

        try:
            subprocess_and_log(ODK_java_to_csv,description_short,description_long,check=True)

        except subprocess.CalledProcessError as e:
            status.reset('WARNING')
            ODK_download_success = False

    if not ODK_download_success or skip_download:

        logger.info("Because ODK server download failed somewhere (or we are skipping downloads), trying to recover by copying recent download")

        ODK_copy_success = False

        days_back = 1
        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")

        while ((not ODK_copy_success) and (days_back <= acceptable_days_back)):
            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')

            past_date = current_date - datetime.timedelta(days=days_back)

            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"

            past_ODK_csv_path = f"{past_jobPath}/ExportCSV/"

            try:
                # check that python or perl coordinator script succeeded for that date
                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
                assert success_py or success_perl

                #logger.warning(f"Temporary rename of expected previous download, for jobs before ~Apr 2021")
                #past_ODK_csv_filename = f"SurveyData.csv"
                past_ODK_csv_filename = ODK_csv_filename

                logger.info(f"Looking for {past_ODK_csv_path+past_ODK_csv_filename}")
                print(f"Looking for {past_ODK_csv_path+past_ODK_csv_filename}")

                copyfile(past_ODK_csv_path+past_ODK_csv_filename,ODK_csv_path+ODK_csv_filename)

                assert os.path.isfile(ODK_csv_path+ODK_csv_filename)

                ODK_copy_success = True
            except Exception:
                logger.info(f"Did not find an ODK download in {past_ODK_csv_path}")

            days_back += 1

        if not ODK_copy_success:
            logger.error(f"Failed get a suitable copy of survey data.")
            status.reset('ERROR')
            endJob(status,premature=True)

        logger.warning(f"Using ODK download from {past_jobPath}.")

    return ODK_csv_path+ODK_csv_filename

# TODO: Consider placing the survey download functions in a separate file
def get_from_kobotoolbox(url,form_id,form_token,**kwargs):
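    '''Download form data from a kobotoolbox server via its REST API.

    Returns the requests response object, and raises an HTTPError if the
    status code is not 200. Extra kwargs are accepted (and ignored) so that a
    full credentials dict can be passed with **.'''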

    # Kenya survey form
    #url = 'https://kf.kobotoolbox.org/'
    #form_name = 'Wheat rust survey 1.0'
    #form_id = 'akpyJHvYxkLKPkxFJnPyTW'
    #form_token = '???' # this is sensitive

    url_for_requests = url #f"{url}api/v2/"

    url_to_get_form = f"{url_for_requests}assets/{form_id}/data.json"

    headers = {'Authorization': f"Token {form_token}"}

    response = requests.get(url_to_get_form,headers=headers)

    if response.status_code != 200:
        raise requests.exceptions.HTTPError('HTTP status was not 200')

    logger.info('successful connection to kobotoolbox server')
    return response

def build_dataframe(response):
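    '''Convert the JSON payload of a kobotoolbox response into a pandas
    DataFrame, one row per survey record.'''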

    result_count = response.json()['count']

    logger.info(f"{result_count} records")

    request_results = response.json()['results']

    # crude merging of list of dicts into pandas dataframe
    df = DataFrame.from_records(request_results)

    return df

#parse columns into ODK format
def parse_location_str(location_str):
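    '''Parse a space-separated "lat lon alt acc" location string into a
    Series with ODK-style column names, e.g. (illustrative values only)
    "9.01 38.74 2400.0 5.0".'''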

    # expecting a space-separated string containing four decimal numbers:
    # latitude, longitude, altitude and accuracy (lat/lon may be negative)
    regex = r'(?P<lat>-?[0-9.]+)\s(?P<lon>-?[0-9.]+)\s(?P<alt>[0-9.]+)\s(?P<acc>[0-9.]+)'

    # needed because the ODK names are too complicated for regex named groups
    name_dict = {
        'lat' : 'survey_infromation-location-Latitude',
        'lon' : 'survey_infromation-location-Longitude',
        'alt' : 'survey_infromation-location-Altitude',
        'acc' : 'survey_infromation-location-Accuracy'
        }

    res = re.search(regex,location_str)

    loc_series = Series(res.groupdict())

    loc_series.rename(index=name_dict,inplace=True)

    return loc_series

def parse_location_kobotoolbox(series):
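    '''Apply parse_location_str to a Series of location strings, returning a
    DataFrame with one column per location component.'''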

    loc_df = series.apply(parse_location_str)

    return loc_df

def convert_date(date_str,fmt_in,fmt_out):
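    '''Reformat a date string from fmt_in to fmt_out, first normalising the
    irregular timezone suffixes seen in kobotoolbox data (e.g. "+03" or
    "+03:00" become "+0300") so that datetime.strptime can parse them.'''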

    # in case any nan's creep in
    if str(date_str)=='nan':
        return 'nan'

    # timezones in kobotoolbox data are irregular
    # datetime needs +HHMM
    # so use regexes to check for these cases and handle them
    pattern1 = r'\+[0-9][0-9]$'
    if re.search(pattern1,date_str):
        # need to provide empty MM
        date_str = date_str + '00'
    pattern2 = r'\+([0-9][0-9]):([0-9][0-9])$'
    if re.search(pattern2,date_str):
        # need to remove the colon between HH and MM
        date_str = re.sub(pattern2,r'+\g<1>\g<2>',date_str)

    date_in = datetime.datetime.strptime(date_str,fmt_in)
    date_str_out = date_in.strftime(fmt_out)

    return date_str_out

def parse_date(series,name_out='date',fmt_in = '%Y-%m-%d',fmt_out= '%b %d, %Y'):
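    '''Apply convert_date to a Series of date strings and rename the result
    to name_out.'''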

    s_out = series.apply(convert_date,fmt_in=fmt_in,fmt_out=fmt_out)

    s_out.rename(name_out,inplace=True)

    return s_out

# dict of functions callable within coln_parser_dict
# so they can be obtained with a string in coln_parser_dict
func_dict = {
    'parse_date' : parse_date,
    'parse_location_kobotoolbox' : parse_location_kobotoolbox
}

def parse_columns(df_in,coln_parser_dict):
    '''Works on each type of conversion in turn.

    coln_parser_dict is the configuration used to convert columns:
    - keys are column names in the input dataframe
    - values that are 'None' mean the column should be dropped
    - values that are strings simply rename the column
    - values that are tuples describe a function to run with kwargs: the first
      item is the string identifier of the function and the second is a list of
      (key,value) pairs to be provided as kwargs. The function returns a
      series/dataframe and the original column is dropped.
    # TODO: is it necessary to provide dtype conversion somewhere (e.g. dates)?'''
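    # Illustrative example of a coln_parser_dict (these column names are
    # hypothetical; see column_parser_dict in get_kobotoolbox_form_as_csv for
    # a real configuration):
    #   {
    #       'unwanted_column' : 'None',
    #       'old_column_name' : 'new_column_name',
    #       'date_column' : ('parse_date',(('name_out','date'),('fmt_in','%Y-%m-%d'))),
    #   }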

    df_out = df_in.copy()

    # drop any indicated columns
    coln_drop_list = [k for k,v in coln_parser_dict.items() if v == 'None']
    logger.info(f"Dropping {len(coln_drop_list)} columns")
    logger.debug(f"Columns being dropped are {coln_drop_list}")
    for key in coln_drop_list:
        del df_out[key]

    # rename any indicated columns
    coln_rename_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,str)}
    logger.info(f"Renaming {len(coln_rename_dict)} columns")
    logger.debug(f"Columns being renamed are {coln_rename_dict}")
    df_out.rename(columns=coln_rename_dict,inplace=True)

    # apply any functions
    # callable only works in python 3.2+ apparently
    coln_func_dict = {k:v for k,v in coln_parser_dict.items() if isinstance(v,tuple)}
    logger.info(f"Applying {len(coln_func_dict)} functions to columns")
    logger.debug(f"Columns being renamed are {coln_rename_dict}")
    dfs_to_concat = [df_out]

    for key,val in coln_func_dict.items():

        # TODO: there is a more pythonic way to get functions with a string
        func = func_dict[val[0]]
        assert callable(func)
        kwargs = {k:v for k,v in val[1]}
        columns_out = func(df_in[key],**kwargs)

        if isinstance(columns_out,DataFrame):
            num_outputs = columns_out.shape[-1]
            column_names = columns_out.columns

        elif isinstance(columns_out,Series):
            num_outputs = 1
            column_names = [columns_out.name]

        logger.info(f"Adding {num_outputs} columns to dataframe")
        logger.debug(f"New columns are {column_names}")

        dfs_to_concat += [columns_out]

        # drop the original column, now that it has been parsed with func
        del df_out[key]

    df_final = concat(dfs_to_concat,axis='columns')

    return df_final

def get_kobotoolbox_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
    '''Given a dict with a single kobotoolbox form to download from a kobotoolbox
    server, obtains it and converts to csv.'''

    output_dir = 'Export_kobotoolbox'
    output_path = f"{jobPath}/{output_dir}/"

    Path(output_path).mkdir(parents=True, exist_ok=True)

    # get data from kobotoolbox server

    # keys are column names in the input dataframe
    # values that are None mean they should be dropped
    # values that are string simply rename the column
    # values that are functions should be run with that key and returns series/dataframe
    column_parser_dict = {
        '__version__' : 'None',
        '_attachments' : 'None',
        '_bamboo_dataset_id' : 'None',
        '_geolocation' : 'None', # looks like a duplication of survey_infromation/location
        '_id' : 'None',
        '_notes' : 'None',
        '_status' : 'None',
        '_submission_time' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%Y-%m-%dT%H:%M:%S'))),
        '_submitted_by' : 'None',
        '_tags' : 'None',
        '_uuid' : 'KEY',
        '_validation_status' : 'None',
        '_xform_id_string' : 'None',
        'comment' : 'comment',
        'dead_stemrust_samples' : 'SET-OF-dead_stemrust_samples',
        'dead_stemrust_samples_count' : 'dead_stemrust_samples_count',
        'dead_yellowrust_samples' : 'SET-OF-dead_yellowrust_samples',
        'dead_yellowrust_samples_count' : 'dead_yellowrust_samples_count',
        'deviceid' : 'deviceid',
        'end' : ('parse_date',(('name_out','end'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
        'formhub/uuid' : 'None',
        'imei' : 'imei',
        'leaf_rust/leafrust_host_plant_reaction' : 'leaf_rust-leafrust_host_plant_reaction',
        'leaf_rust/leafrust_incidence' : 'leaf_rust-leafrust_incidence',
        'leaf_rust/leafrust_severity' : 'leaf_rust-leafrust_severity',
        'live_leafrust_samples' : 'SET-OF-live_leafrust_samples',
        'live_leafrust_samples_count' : 'live_leafrust_samples_count',
        'live_stemrust_samples' : 'SET-OF-live_stemrust_samples',
        'live_stemrust_samples_count' : 'live_stemrust_samples_count',
        'live_yellowrust_samples' : 'SET-OF-live_yellowrust_samples',
        'live_yellowrust_samples_count' : 'live_yellowrust_samples_count',
        'meta/instanceID' : 'meta-instanceID',
        'other_crop' : 'other_crop',
        'other_diseases_group/other_diseases' : 'other_diseases_group-other_diseases',
        'phonenumber' : 'phonenumber',
        'sample_size/number_leafrust_live' : 'sample_size-number_leafrust_live',
        'sample_size/number_stemrust_dead_dna' : 'sample_size-number_stemrust_dead_dna',
        'sample_size/number_stemrust_live' : 'sample_size-number_stemrust_live',
        'sample_size/number_yellowrust_dead' : 'sample_size-number_yellowrust_dead',
        'sample_size/number_yellowrust_live' : 'sample_size-number_yellowrust_live',
        'sample_size/using_barcode' : 'sample_size-using_barcode',
        'samples_collected' : 'samples_collected',
        'samples_type' : 'samples_type',
        'score_diseases' : 'SET-OF-score_diseases',
        'score_diseases_count' : 'score_diseases_count',
        'septoria/septoria_incidence' : 'septoria-septoria_incidence',
        'septoria/septoria_severity' : 'septoria-septoria_severity',
        'site_information/crop' : 'site_information-crop',
        'site_information/field_area' : 'site_information-field_area',
        'site_information/growth_stage' : 'site_information-growth_stage',
        'site_information/survey_site' : 'site_information-survey_site',
        'site_information/variety' : 'site_information-variety',
        'start' : ('parse_date',(('name_out','start'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
        'stem_rust/Stemrust_severity' : 'stem_rust-Stemrust_severity',
        'stem_rust/stemrust_host_plant_reaction' : 'stem_rust-stemrust_host_plant_reaction',
        'stem_rust/stemrust_incidence' : 'stem_rust-stemrust_incidence',
        'subscriberid' : 'subscriberid',
        'survey_infromation/location' : ('parse_location_kobotoolbox',()),
        'survey_infromation/location_name' : 'survey_infromation-location_name',
        'survey_infromation/survey_date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%Y-%m-%d'))),
        'surveyor_infromation/country' : 'surveyor_infromation-country',
        'surveyor_infromation/institution' : 'surveyor_infromation-institution',
        'surveyor_infromation/surveyor_name' : 'surveyor_infromation-surveyor_name',
        'today' : ('parse_date',(('name_out','today'),('fmt_in','%Y-%m-%d'))),
        'username' : 'username',
        'yellow_rust/yellowrust_host_plant_reaction' : 'yellow_rust-yellowrust_host_plant_reaction',
        'yellow_rust/yellowrust_incidence' : 'yellow_rust-yellowrust_incidence',
        'yellow_rust/yellowrust_severity' : 'yellow_rust-yellowrust_severity',
        }

    logger.debug('Performing download')

    # perform a pull from the server, and if it fails write a warning message

    download_success = True

    skip_download: bool = config['Survey'].get('SkipServerDownload', False)

    if not skip_download:
        try:

            request = get_from_kobotoolbox(**form_credentials)

        except requests.exceptions.RequestException as e:
            status.reset('WARNING')

            download_success = False

    # define filenames
    csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"

    csv_processed_filename = f"SurveyDataProcessed.csv"
    csv_processed_path = f"{output_path}/{csv_processed_filename}"

    if download_success and not skip_download:
        # parse dataframe

        dataframe_raw = build_dataframe(request)

        logger.debug('Saving raw csv file')

        df_raw_filename = f"{output_path}/{csv_filename}.csv"

        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)

        # process to match ODK format

        dataframe_processed = parse_columns(dataframe_raw,column_parser_dict)

        logger.debug('Saving processed csv file')

        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)

    if not download_success or skip_download:

        logger.info("Because server download failed somewhere (or we are skipping downloads), trying to recover by copying recent download")

        copy_success = False

        days_back = 1
        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")

        while ((not copy_success) and (days_back <= acceptable_days_back)):

            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')

            past_date = current_date - datetime.timedelta(days=days_back)

            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"

            past_output_path = f"{past_jobPath}/{output_dir}/"

            try:
                # check that python or perl coordinator script succeeded for that date
                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
                assert success_py or success_perl

                past_csv_filename = csv_processed_filename

                logger.info(f"Looking for {past_output_path+past_csv_filename}")

                copyfile(past_output_path+past_csv_filename,csv_processed_path)

                assert os.path.isfile(csv_processed_path)

                copy_success = True
            except Exception:
                logger.info(f"Did not find a kobotoolbox download in {past_output_path}")

            days_back += 1

        if not copy_success:
            logger.error(f"Failed get a suitable copy of survey data.")
            status.reset('ERROR')
            endJob(status,premature=True)

        logger.warning(f"Using download from {past_jobPath}.")

    return csv_processed_path

def get_from_WRSIS(form_credentials: dict, startDate: str, endDate: str):
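    '''POST a date-range query to the WRSIS getUKMetSurveyData endpoint and
    return the response. Uses HTTP basic authentication with the supplied
    credentials; startDate and endDate are strings in the format the caller
    provides (DD-MM-YYYY). Raises an HTTPError if the HTTP status code is
    not 200.'''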
    date_params = {
        'fromDate':startDate,
        'toDate':endDate}

    # set up http session
    session = requests.Session()

    # provide authorisation
    session.auth = (form_credentials['user'],form_credentials['pass'])

    response = session.post(f"{form_credentials['url']}getUKMetSurveyData",json=date_params)

    # possible HTTP responses as provided in the API document
    # (I've seen some other responses though, e.g. 415)
    # It seems there is another layer of status codes
    status_codes = {
            200 : 'OK',
            201 : 'Created',
            202 : 'Accepted (Request accepted, and queued for execution)',
            400 : 'Bad request',
            401 : 'Authentication failure',
            403 : 'Forbidden',
            404 : 'Resource not found',
            405 : 'Method Not Allowed',
            409 : 'Conflict',
            412 : 'Precondition Failed',
            413 : 'Request Entity Too Large',
            500 : 'Internal Server Error',
            501 : 'Not Implemented',
            503 : 'Service Unavailable'}

    # checking the HTTP status code (not the code in the response)
    if response.status_code == 200:
        logger.info('HTTP request succeeded OK')

    elif response.status_code in status_codes:
        logger.info("HTTP response did not succeed OK, code is {:d}: {:s} ".format(response.status_code,status_codes[response.status_code]))
        raise requests.exceptions.HTTPError('HTTP status was not 200')

    else:
        logger.info("HTTP response did not succeed OK, unknown code {:d}".format(response.status_code))
        raise requests.exceptions.HTTPError('HTTP status was not 200')

    return response

def categorize_incident(incident):
    '''Convert an incidence value into a category string.
       TODO: float values are not handled'''

    try:
        incident_value = int(incident)

        if 0 < incident_value <= 20:
            incident_category = "low"
        elif 20 < incident_value <= 40:
            incident_category = "medium"
        elif 40 < incident_value <= 100:
            incident_category = "high"
        else:
            incident_category = "none"
    except (ValueError, TypeError):
        if incident.lower() in ["low", "medium", "high", "none", "na"]:
            incident_category = incident.lower()
        else:
            incident_category = "none"

    return incident_category

def nested_to_flattened(df):
    '''WRSIS rust data is in a nested format, so it needs to be flattened.
       To do this, the nested data are separated into dedicated columns.'''

    # check if the dataframe is empty, if it is then add the raw columns
    if len(df.index) == 0:
        logger.info('Recent WRSIS download is empty.')
        logger.info('Adding raw columns.')
        RAW_COLUMNS = ["Rust Details","Other Disease","Sample Details","Survey Details.Latitude","Survey Details.First Rust Observation Date","Survey Details.Longitude","Survey Details.Kebele Name","Survey Details.Publish Date","Survey Details.Region Name","Survey Details.Survey Date","Survey Details.Season","Survey Details.Planting Date","Survey Details.Woreda Name","Survey Details.Location other details","Survey Details.Tillering Date","Survey Details.Zone Name","Survey Other Details.Moisture","Survey Other Details.Soil colour","Survey Other Details.Weed Control","Survey Other Details.Irrigated","Site Information.Wheat Type","Site Information.Growth Stage","Site Information.Varity Name","Site Information.Survey Site","Site Information.Site Area","Surveyor Details.Surveyors","Surveyor Details.Country","Surveyor Details.Other Surveyors","Surveyor Details.Institution Name","Fungicide Details.Fungicide Name","Fungicide Details.Spray Date","Fungicide Details.EffectiveNess","Fungicide Details.Used Dose"]
        for i in RAW_COLUMNS:
            df[i] = ""

    # add new columns
    logger.info('Adding new columns')
    NEW_COLUMNS = ['imei', 'sample_size-number_yellowrust_live', 'sample_size-number_stemrust_live', 'dead_stemrust_samples_count', 'samples_collected', 'sample_size-number_yellowrust_dead', 'live_leafrust_samples_count', 'other_crop', 'live_yellowrust_samples_count', 'subscriberid', 'sample_size-using_barcode', 'start', 'score_diseases_count', 'phonenumber', 'survey_infromation-location-Accuracy', 'SET-OF-live_yellowrust_samples', 'SET-OF-score_diseases', 'meta-instanceID', 'deviceid', 'end', 'samples_type', 'live_stemrust_samples_count', 'dead_yellowrust_samples_count', 'SET-OF-live_leafrust_samples', 'KEY', 'other_diseases_group-other_diseases', 'survey_infromation-location-Altitude', 'SET-OF-dead_stemrust_samples', 'comment', 'sample_size-number_leafrust_live', 'today', 'SET-OF-dead_yellowrust_samples', 'username', 'SET-OF-live_stemrust_samples', 'sample_size-number_stemrust_dead_dna']

    for i in NEW_COLUMNS:
        df[i] = ""

    #TODO: replace with a better KEY column
    df["KEY"] = df.index

    # add dedicated rust columns, with default values
    NEW_RUST_COLUMNS = {"Stem Rust.Incident":"none","Stem Rust.Severity":"-9","Stem Rust.Reaction":"na",
                   "Leaf Rust.Incident":"none","Leaf Rust.Severity":"-9","Leaf Rust.Reaction":"na",
                   "Yellow Rust.Incident":"none","Yellow Rust.Severity":"-9","Yellow Rust.Reaction":"na",
                   "Septoria.Incident":"none","Septoria.Severity":"0"}

    for i in NEW_RUST_COLUMNS.keys():
        df[i] = NEW_RUST_COLUMNS[i]

    logger.info('Separating nested information into dedicated columns')
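    # each "Rust Details" cell is expected to hold a list of dicts with keys
    # "Rust Type", "Incident", "Severity" and "Reaction", e.g. (illustrative):
    #   [{"Rust Type": "Stem Rust", "Incident": "30", "Severity": "20", "Reaction": "S"}]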

    for index,row in df.iterrows():
        nested_row = row["Rust Details"]
        for rr in range(len(nested_row)):
            # separating nested information into the dedicated columns
            row[nested_row[rr]["Rust Type"] + ".Incident"] = categorize_incident(nested_row[rr]["Incident"])
            row[nested_row[rr]["Rust Type"] + ".Severity"] = nested_row[rr]["Severity"]
            row[nested_row[rr]["Rust Type"] + ".Reaction"] = nested_row[rr]["Reaction"]
            df.loc[index] = row

    return df

def get_WRSIS_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
    '''Given a dict with a single WRSIS form to download from WRSIS, obtains it and converts to csv.'''

    output_dir = 'Export_WRSIS'
    output_path = f"{jobPath}/{output_dir}/"

    Path(output_path).mkdir(parents=True, exist_ok=True)

    # get data from WRSIS

    # keys are column names in the input dataframe
    # values that are None mean they should be dropped
    # values that are string simply rename the column
    # values that are functions should be run with that key and returns series/dataframe
    column_parser_dict = {
        'Rust Details' : 'None',
        'Other Disease' : 'None',
        'Sample Details' : 'None',
        'Survey Details.Latitude' : 'survey_infromation-location-Latitude',
        'Survey Details.First Rust Observation Date' : 'None',
        'Survey Details.Longitude' : 'survey_infromation-location-Longitude',
        'Survey Details.Kebele Name' : 'None',
        'Survey Details.Publish Date' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%d-%b-%Y'))),
        'Survey Details.Region Name' : 'None',
        'Survey Details.Survey Date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%d-%b-%Y'))),
        'Survey Details.Season' : 'None',
        'Survey Details.Planting Date' : 'None',
        'Survey Details.Woreda Name' : 'None',
        'Survey Details.Location other details' : 'None',
        'Survey Details.Tillering Date' : 'None',
        'Survey Details.Zone Name' : 'survey_infromation-location_name',
        'Survey Other Details.Moisture' : 'None',
        'Survey Other Details.Soil colour' : 'None',
        'Survey Other Details.Weed Control' : 'None',
        'Survey Other Details.Irrigated' : 'None',
        'Site Information.Wheat Type' : 'site_information-crop',
        'Site Information.Growth Stage' : 'site_information-growth_stage',
        'Site Information.Varity Name' : 'site_information-variety',
        'Site Information.Survey Site' : 'site_information-survey_site',
        'Site Information.Site Area' : 'site_information-field_area',
        'Surveyor Details.Surveyors' : 'surveyor_infromation-surveyor_name',
        'Surveyor Details.Country' : 'surveyor_infromation-country',
        'Surveyor Details.Institution Name' : 'surveyor_infromation-institution',
        'Surveyor Details.Other Surveyors' : 'None',
        #'Fungicide Details.Fungicide Name' : 'None',
        #'Fungicide Details.Spray Date' : 'None',
        #'Fungicide Details.EffectiveNess' : 'None',
        #'Fungicide Details.Used Dose' : 'None',
        "Yellow Rust.Severity" : 'yellow_rust-yellowrust_severity',
        "Yellow Rust.Incident" : 'yellow_rust-yellowrust_incidence',
        "Yellow Rust.Reaction" : 'yellow_rust-yellowrust_host_plant_reaction',
        "Stem Rust.Severity" : 'stem_rust-Stemrust_severity',
        "Stem Rust.Incident" : 'stem_rust-stemrust_incidence',
        "Stem Rust.Reaction" : 'stem_rust-stemrust_host_plant_reaction',
        "Leaf Rust.Severity" : 'leaf_rust-leafrust_severity',
        "Leaf Rust.Incident" : 'leaf_rust-leafrust_incidence',
        "Leaf Rust.Reaction" : 'leaf_rust-leafrust_host_plant_reaction',
        "Septoria.Severity" : 'septoria-septoria_severity',
        "Septoria.Incident" : 'septoria-septoria_incidence'
    }

    # perform a pull from the server, and if it fails write a warning message

    download_success = True

    start_date = datetime.datetime.strptime(config['Survey']['SeasonStartString'],'%Y%m%d').strftime('%d-%m-%Y')
    end_date = datetime.datetime.strptime(config['StartString'], '%Y%m%d').strftime('%d-%m-%Y')

    logger.debug(f'Performing download from WRSIS between {start_date} and {end_date}')

    skip_download: bool = config['Survey'].get('SkipServerDownload', False)

    if not skip_download:
        try:
            request = get_from_WRSIS(form_credentials,start_date,end_date)
            assert "response" in request.json().keys()

        except requests.exceptions.RequestException as e:
            status.reset('WARNING')

            download_success = False

        except AssertionError:
            logger.warning(f"WRSIS returned incomplete data:\n{request.json()}")
            status.reset('WARNING')

            download_success = False

    # define filenames
    csv_filename = f"SurveyData_raw.csv"

    csv_processed_filename = f"SurveyDataProcessed.csv"
    csv_processed_path = f"{output_path}/{csv_processed_filename}"

    if download_success and not skip_download:
        # parse dataframe

        logger.debug('Saving raw csv file')

        df_raw_filename = f"{output_path}/{csv_filename}"
        dataframe_raw = json_normalize(request.json()["response"]["Rust Survey Data"])

        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)

        # flatten the nested dataframe
        dataframe_flattened = nested_to_flattened(dataframe_raw)

        # process to match ODK format
        dataframe_processed = parse_columns(dataframe_flattened,column_parser_dict)

        logger.debug('Saving processed csv file')

        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)

    if not download_success or skip_download:

        logger.info("Because server download failed somewhere (or we are skipping downloads), trying to recover by copying recent download")

        copy_success = False

        days_back = 1
        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")

        while ((not copy_success) and (days_back <= acceptable_days_back)):

            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')

            past_date = current_date - datetime.timedelta(days=days_back)

            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"

            past_output_path = f"{past_jobPath}/{output_dir}/"

            try:
                # check that python or perl coordinator script succeeded for that date
                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
                assert success_py or success_perl

                past_csv_filename = csv_processed_filename

                logger.info(f"Looking for {past_output_path+past_csv_filename}")

                copyfile(past_output_path+past_csv_filename,csv_processed_path)

                assert os.path.isfile(csv_processed_path)

                copy_success = True
            except Exception:
                logger.info(f"Did not find a WRSIS download in {past_output_path}")

            days_back += 1

        if not copy_success:
            logger.error(f"Failed get a suitable copy of survey data.")
            status.reset('ERROR')
            endJob(status,premature=True)

        logger.warning(f"Using download from {past_jobPath}.")

    return csv_processed_path

def process_in_job_survey(jobPath,status,config,component):
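    '''Download all configured survey forms, merge them into a single csv,
    apply manual removals/additions, and run the source (clustering)
    calculation. Returns a dict of output files available for upload.'''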
    logger.info('started process_in_job_survey()')

    logger.debug('Performing download(s) from ODK server')

    credentials_filename = config['Survey']['ServerCredentialsFile']
    with open(credentials_filename) as credentials_file:

        cred: dict = json.load(credentials_file)

        assert 'forms' in cred.keys()

    csv_filenames = {}
    for form in cred['forms']:

        logger.debug(f"Starting to download {form['form_id']}")

        get_form_as_csv_dict = {
            'ODK' : get_ODK_form_as_csv,
            'kobotoolbox' : get_kobotoolbox_form_as_csv,
            'WRSIS' : get_WRSIS_form_as_csv
        }

        assert form['type'] in get_form_as_csv_dict

        func_get_form_as_csv = get_form_as_csv_dict[form['type']]

        csv_filename = func_get_form_as_csv(form, jobPath, config, status)

        csv_filenames[form['form_id']] = csv_filename

    # load each file of surveys as a dataframe
    forms = {}
    for form_name,form_fn in csv_filenames.items():

        # some define column types, hardwired for now
        col_types = {'comment':'str','KEY':'str'}

        form_df = read_csv(form_fn,dtype=col_types)

        forms[form_name] = form_df

    # create some standard dataframe modification functions
    def add_column(df,coln,value):
        df[coln]=value
        return

    def remove_column(df,coln,value):
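        # 'value' is unused here; the signature is kept consistent with the
        # other edit functions so they can all be called the same way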
        del df[coln]
        return

    def replace_column(df,coln,value):
        df[coln]=value
        return

    def filter_by_column(df,coln,value):
        # CAUTION: This requires the surveyor to provide the correct country
        df.drop(df.loc[df[coln]!=value].index,inplace=True)
        #TODO : for Kenya data, provide a coordinate-based filter
        return

    def filter_by_list(df,coln,values):
        # CAUTION: This requires the surveyor to provide the correct list of countries
        df.drop(df.loc[~df[coln].isin(values)].index,inplace=True)
        return

    func_types = {
        'add': add_column,
        'remove' : remove_column,
        'replace' : replace_column,
        'filter' : filter_by_column,
        'filter_by_list' : filter_by_list
    }

    # simple format alignment using edits on config
    # (should this need to be much more sophisticated, reconsider the workflow)
    if 'FormEdits' in config['Survey']:

        form_edits = config['Survey']['FormEdits']
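        # Illustrative structure of the FormEdits config entry (form ids,
        # column names and values here are hypothetical):
        #   FormEdits = {
        #       '<form_id>' : {
        #           'add' : {'<column>' : '<value>'},
        #           'filter_by_list' : {'<column>' : ['<value1>','<value2>']},
        #       }
        #   }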

        # loop over each form
        for form_name, edits in form_edits.items():

            form_df = forms[form_name]

            # loop over each type of edit
            for func_type, columns in edits.items():

                # check the function is available
                assert func_type in func_types

                # loop over each column to modify
                for coln,val in columns.items():

                    # apply the edit
                    func_types[func_type](form_df,coln,val)

    # Merge additional SurveyData files and rearrange columns to be consistent
    # Assumes that the same columns are present in all forms
    # and that the first form is the standard

    first=True
    for dfi in forms.values():

        if first:
            standard_columns = dfi.columns.tolist()
            dfm = dfi

            logger.debug(f"First processed form contains {dfm.shape[0]} records")

            first=False
            continue

        # re-order columns to match first case (presumed standard format)
        dfi = dfi[standard_columns]

        logger.debug(f"Next processed form contains {dfi.shape[0]} records")

        dfm = concat([dfm,dfi],axis='rows')

    # save the result
    ODK_csv_path = f"{jobPath}/ExportCSV/"
    forms_fn = f"{ODK_csv_path}/Merged_SurveyData.csv"
    dfm.to_csv(forms_fn,index=False,quoting=csv.QUOTE_MINIMAL)

    logger.debug(f"Preparing to apply removals and additions to ODK survey data")

    processed_surveys_filepath = f"{ODK_csv_path}/Processed_SurveyData.csv"

    survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
    survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"

    # perform here in python, using the 'KEY' column
    # check the key column is unique

    assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'

    df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
    keys_to_rm = df_rm['KEY']

    # check that all of the keys to remove exist in the original data
    rm_keys_found = df_rm['KEY'].isin(dfm['KEY'])
    n_rm_keys_found = rm_keys_found.sum()
    n_rm_keys = rm_keys_found.size
    if not np_all(rm_keys_found):
        # this might happen if the run date is in the past
        logger.warning(f"Only found {n_rm_keys_found} of {n_rm_keys} survey errors to remove")

        rm_keys_not_found = df_rm[~rm_keys_found]
        logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}")

        logger.debug(f"Type of keys that can be found include:\n{dfm['KEY'].dtype}")

        dfm_short_keys = [val for val in dfm['KEY'].values if len(str(val)) <10]
        logger.debug(f"Keys that can be found include:\n{dfm_short_keys}")

    # identify which surveys to remove
    idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values)

    # drop the erroneous surveys (note this produces a filtered copy rather than acting in-place)
    dfm = dfm[~idx_to_rm]
    logger.info(f"Removed {n_rm_keys_found} erroneous surveys")

    # add the extra entries
    df_add = read_csv(survey_additions_filepath,dtype='str')
    n_add_keys = df_add.shape[0]
    df_join = concat([dfm,df_add])
    assert dfm.shape[0]+df_add.shape[0] == df_join.shape[0], 'Unexpected result of including additional surveys'

    logger.info(f"Added {n_add_keys} additional surveys")

    # save as processed
    df_join.to_csv(processed_surveys_filepath,index=False,quoting=csv.QUOTE_MINIMAL)

    logger.debug('Preparing clustering calculation')

    date = datetime.datetime.now()

    # prepare environment for clustering calc
    call_R = False

    output_directory = f"{jobPath}/SURVEYDATA_{config['StartString']}_0000"
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    if call_R:

        cluster_calc_path = "/storage/app/EWS_prod/code/wheat_source_generation/"

        # clear old output
        old_clustering_output_glob = f"{cluster_calc_path}/output/sources_*"
        old_clustering_outputs = glob(old_clustering_output_glob)

        logger.info('About to unlink old output from clustering calculation')
        for path in old_clustering_outputs:
            logger.info(f"unlinking {path}")
            Path(path).unlink()


        RPath = '/usr/local/R/bin/Rscript'

        clustering_script = f"{cluster_calc_path}/code/R/clustering.R"

        clustering_env = {
                **os.environ,
                'R_LIBS':'/home/ewsmanager/R-packages-EWS-clustering/x86_64-pc-linux-gnu-library/3.5',
                'PROJ_LIB' : '/usr/share/proj/', # conda env breaks the automatic assignment of PROJ_LIB
                }

        clustering_config = config['Survey']['SourcesConfigFilename']
        assert os.path.isfile(clustering_config)

        clustering_calc = [RPath,
                '--no-init-file',
                clustering_script,
                processed_surveys_filepath,
                config['StartString'],
                '-2',
                '7',
                config['Survey']['SourcesConfigFilename']]

        logger.debug('Performing clustering calculation')

        description_short = 'wheat-source-generation'
        description_long = 'source calculation on processed surveys'

        try:
            subprocess_and_log(clustering_calc, description_short, description_long, env=clustering_env)
        except Exception:
            status.reset('ERROR')
            endJob(status,premature=True)

        logger.debug('Checking output of clustering calculation')

        try:
            logger.debug('Trying to copy the dataset processed for clustering')

            clustering_proc_path_glob = f"{cluster_calc_path}/output/survey_data_processed_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
            clustering_proc_path_list = glob(clustering_proc_path_glob)
            if len(clustering_proc_path_list) == 0:
                logger.debug(f"No processed files produced from clustering in {clustering_proc_path_glob}")
                raise Exception

            elif len(clustering_proc_path_list) > 1:
                logger.debug(f"Multiple processed files produced from clustering in {clustering_proc_path_glob}")
                raise Exception

            else:
                logger.debug('Found 1 processed file, placing copy of result in job directory')

                proc_filename = f"survey_data_processed_{config['StartString']}.csv"
                proc_path = f"{output_directory}/{proc_filename}"

                logger.debug(f"as {proc_path}")

                copyfile(clustering_proc_path_list[0], proc_path)

        except Exception:
            logger.debug('Failed to get a copy of the dataset processed for clustering')

        clustering_output_path_glob = f"{cluster_calc_path}/output/sources_{config['Survey']['SourcesRegionName']}_{date.strftime('%Y-%m-%d')}_*.csv"
        clustering_output_path_list = glob(clustering_output_path_glob)
        if len(clustering_output_path_list) == 0:
            logger.error(f"No output produced from clustering in {clustering_output_path_glob}")
            status.reset('ERROR')
            endJob(status,premature=True)
        if len(clustering_output_path_list) > 1:
            logger.error(f"Multiple outputs produced from clustering in {clustering_output_path_glob}")
            status.reset('ERROR')
            endJob(status,premature=True)

        sources_path = clustering_output_path_list[0]

    else:
        # run python version

        sources_path = run_case(
                config_path = config['Survey']['pySourcesConfigFilename'],
                survey_path = processed_surveys_filepath,
                reference_date = config['StartString'],
                day_offsets = [-2,7],
                output_dir = output_directory)

    logger.debug('Placing copy of result in job directory with conventional name')

    output_filename = f"sources_{config['StartString']}.csv"
    output_path = f"{output_directory}/{output_filename}"

    logger.debug(f"as {output_path}")

    copyfile(sources_path, output_path)

    proc_out = {}
    # Output files available for upload
    proc_out['output'] = [output_path]
    # Processing files available for clearing
    proc_out['clearup'] = None

    return proc_out

#TODO
def process_EWS_plotting_survey(jobPath,config):
    '''Returns a list of output files for transfer.'''

    logger.info('started process_EWS_plotting_survey(), nothing to do')

    return []