From cf533ecc6df90c89d5ec6c127f53c5cd6a12f2dc Mon Sep 17 00:00:00 2001
From: jws52 <jws52@cam.ac.uk>
Date: Thu, 19 Oct 2023 10:50:43 +0100
Subject: [PATCH] feat: Alternative new ODK form

---
 coordinator/ProcessorSurveys.py        |   4 +-
 coordinator/ProcessorSurveysnewODK.py  |  10 +
 coordinator/ProcessorSurveysnewODK2.py | 269 +++++++++++++++++++++++++
 3 files changed, 282 insertions(+), 1 deletion(-)
 create mode 100644 coordinator/ProcessorSurveysnewODK2.py

diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py
index 33bbd9e..0a70a84 100644
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -35,6 +35,7 @@ from source_gen.clustering import run_case
 from ProcessorSurveysODK import get_ODK_form_as_csv
 from ProcessorSurveysODKv2 import get_ODKv2_form_as_csv
 from ProcessorSurveysnewODK import get_newODK_form_as_csv
+from ProcessorSurveysnewODK2 import get_newODK2_form_as_csv
 from ProcessorSurveyskobotoolbox import get_kobotoolbox_form_as_csv
 from ProcessorSurveysWRSIS import get_WRSIS_form_as_csv
 from ProcessorSurveysWRT import get_WRT_form_as_csv
@@ -71,6 +72,7 @@ class ProcessorSurveys(Processor):
             'WRT': get_WRT_form_as_csv,
             'ODKv2': get_ODKv2_form_as_csv,
             'newODK': get_newODK_form_as_csv,
+            'newODK2' : get_newODK2_form_as_csv,
         }
 
     def process_pre_job_survey(self, input_args):
@@ -463,4 +465,4 @@ class ProcessorSurveys(Processor):
 
 if __name__ == '__main__':
     processor = ProcessorSurveys()
-    processor.run_processor("Survey")
\ No newline at end of file
+    processor.run_processor("Survey")
diff --git a/coordinator/ProcessorSurveysnewODK.py b/coordinator/ProcessorSurveysnewODK.py
index 90d5637..caa817c 100644
--- a/coordinator/ProcessorSurveysnewODK.py
+++ b/coordinator/ProcessorSurveysnewODK.py
@@ -39,13 +39,23 @@ cases_severity = {
     '10':10,
     '15':15,
     '20':20,
+    '25':25,
     '30':30,
+    '35':35,
     '40':40,
+    '45':45,
     '50':50,
+    '55':55,
     '60':60,
+    '65':65,
     '70':70,
+    '75':75,
     '80':80,
     '80+':80,
+    '85':85,
+    '90':90,
+    '95':95,
+    '100':100
 }
 
 def get_from_kobotoolbox(url,form_id,form_token,**kwargs):
diff --git a/coordinator/ProcessorSurveysnewODK2.py b/coordinator/ProcessorSurveysnewODK2.py
new file mode 100644
index 0000000..720a129
--- /dev/null
+++ b/coordinator/ProcessorSurveysnewODK2.py
@@ -0,0 +1,269 @@
+#ProcessorSurveysnewODK2.py
+"""Functions for parsing wheat rust survey records from the new ODK form on the kobotoolbox server."""
+
+import csv
+import datetime
+import logging
+import os
+from pathlib import Path
+import requests
+
+from shutil import copyfile
+from pandas import DataFrame
+
+from ProcessorSurveyUtils import parse_columns
+from ProcessorSurveysnewODK import (
+    cases_incident,
+    cases_severity,
+    get_from_kobotoolbox,
+    build_dataframe
+)
+from ProcessorUtils import (
+    endJob,
+    add_filters_to_sublogger,
+)
+
+logger = logging.getLogger('Processor.Surveys.kobotoolbox')
+add_filters_to_sublogger(logger)
+
+
+def get_newODK2_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
+    '''Download one kobotoolbox form and write it out as an ODK-style csv.
+
+    Pulls the form described by form_credentials from the server, saves the
+    raw records, and renames/parses columns via column_parser_dict. If the
+    download fails, or config['Survey']['SkipServerDownload'] is set, the
+    processed csv of a recent successful job is copied instead.
+
+    Args:
+        form_credentials: keyword arguments for get_from_kobotoolbox; must
+            include 'form_id', which is used in the raw csv filename.
+        jobPath: directory of the current job; output is written to its
+            'Export_newCSV2' subdirectory.
+        config: coordinator configuration; the 'Survey' section is always
+            read, 'StartString' and 'WorkspacePath' only by the fallback.
+        status: job status object; reset to 'WARNING' on a failed download
+            and to 'ERROR' (followed by endJob) if no fallback copy exists.
+
+    Returns:
+        Path of the processed csv file, as a string.
+    '''
+
+    output_dir = 'Export_newCSV2'
+    output_path = f"{jobPath}/{output_dir}/"
+
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+
+    # get data from kobotoolbox server
+
+    # keys are column names in the input dataframe
+    # values that are the string 'None' mean they should be dropped
+    # values that are any other string simply rename the column
+    # values that are tuples (parser name, arguments) should be run with that key and return a series/dataframe
+
+    column_parser_dict = {
+        '_id' : 'None',
+        'formhub/uuid' : 'None',
+        'start' : ('parse_date',(('name_out','start'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
+        'end' : ('parse_date',(('name_out','end'),('fmt_in','%Y-%m-%dT%H:%M:%S.%f%z'))),
+        'today' : ('parse_date',(('name_out','today'),('fmt_in','%Y-%m-%d'))),
+        'deviceid' : 'deviceid',
+        'imei' : 'imei',
+        'phonenumber' : 'None',
+        'username' : 'username',
+        'surveyor_information/region' : 'None',
+        'surveyor_information/country' : 'surveyor_infromation-country',
+        'surveyor_information/surveyor' : 'surveyor_infromation-surveyor_name',
+        'surveyor_information/institution' : 'surveyor_infromation-institution',
+        'survey_information/admin_level_1' : 'None',
+        'survey_information/admin_level_2' : 'None',
+        'survey_information/admin_level_3' : 'None',
+        'survey_information/admin_level_4' : 'None',
+        'survey_information/location' : 'None',
+        'survey_information/location_aggregate' : 'None',
+        'survey_information/location_name' : 'survey_infromation-location_name',
+        'survey_information/location_gps' : ('parse_location_kobotoolbox',()),
+        'survey_information/survey_date' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%Y-%m-%d'))),
+        'survey_information/survey_season' : 'None',
+        'site_information/survey_site' : 'site_information-survey_site',
+        'site_information/crop' : 'site_information-crop',
+        'site_information/growth_stage' : 'site_information-growth_stage',
+        'site_information/area_unit' : 'None',
+        'site_information/field_area' : 'None',
+        'site_information/field_area_in_ha' : 'site_information-field_area',
+        'site_information/variety' : 'site_information-variety',
+        'site_information/survey_site_other' : 'None',
+        'major_observed_diseases' : 'None',
+        'SR/SR_incidence' : ('parse_cases',(('name_out','stem_rust-stemrust_incidence'),('cases', cases_incident),('fillna','none'))),
+        'SR/SR_severity' : ('parse_cases',(('name_out','stem_rust-Stemrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0'))),
+        'SR/SR_IT' : 'stem_rust-stemrust_host_plant_reaction',
+        'SR/SR_image' : 'None',
+        'LR/LR_incidence' : ('parse_cases',(('name_out','leaf_rust-leafrust_incidence'),('cases', cases_incident),('fillna','none'))),
+        'LR/LR_severity' : ('parse_cases',(('name_out','leaf_rust-leafrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0'))),
+        'LR/LR_IT' : 'leaf_rust-leafrust_host_plant_reaction',
+        'LR/LR_image' : 'None',
+        'YR/YR_incidence' : ('parse_cases',(('name_out','yellow_rust-yellowrust_incidence'),('cases', cases_incident),('fillna','none'))),
+        'YR/YR_severity' : ('parse_cases',(('name_out','yellow_rust-yellowrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0'))),
+        'YR/YR_IT' : 'yellow_rust-yellowrust_host_plant_reaction',
+        'YR/YR_image' : 'None',
+        # NOTE(review): the next three map to an empty name rather than 'None';
+        # confirm whether these columns should be dropped or given real names.
+        'YR_head/YR_head_infection' : '',
+        'YR_head/YR_head_incidence' : '',
+        'YR_head/YR_head_severity' : '',
+        'YR_head/YR_head_image' : 'None',
+        'septoria/septoria_incidence' : 'septoria-septoria_incidence',
+        'septoria/septoria_severity' : 'septoria-septoria_severity',
+        'septoria/septoria_image' : 'None',
+        'blast/blast_incidence' : 'None',
+        'blast/blast_severity' : 'None',
+        'blast/blast_image' : 'None',
+        'SB/SB_incidence' : 'None',
+        'SB/SB_severity' : 'None',
+        'SB/SB_image' : 'None',
+        'FHB/FHB_incidence' : 'None',
+        'FHB/FHB_severity' : 'None',
+        'FHB/FHB_image' : 'None',
+        'other_observed_diseases_pests' : 'None',
+        'other_disease_reapeat' : 'None',
+        'score_diseases_count' : 'None',
+        'score_diseases' : 'None',
+        'observed_other_pests' : 'None',
+        'observed_other_pests_record_count' : 'None',
+        'observed_other_pests_record' : 'None',
+        'insects/insect_damage' : 'None',
+        'fungicide_applied' : 'None',
+        'Fungicide_information/fungicide_names' : 'None',
+        'Fungicide_information/Application_frequency' : 'None',
+        'Fungicide_application_information/application_dates_count' : 'None',
+        'Fungicide_application_information/application_dates' : 'None',
+        'samples_collected_y_n' : 'samples_collected',
+        'samples_type' : 'samples_type',
+        'samples_count' : 'None',
+        'samples' : 'None',
+        'comment' : 'comment',
+        '__version__' : 'None',
+        'meta/instanceID' : 'meta-instanceID',
+        '_xform_id_string' : 'None',
+        '_uuid' : 'KEY',
+        '_attachments' : 'None',
+        '_status' : 'None',
+        '_geolocation' : 'None', # looks like a duplication of survey_information/location
+        '_submission_time' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%Y-%m-%dT%H:%M:%S'))),
+        '_tags' : 'None',
+        '_notes' : 'None',
+        '_validation_status' : 'None',
+        '_submitted_by' : 'None',
+    }
+
+    # NOTE(review): this mapping is not used anywhere in this function;
+    # presumably a record of ODK columns this form lacks at top level - confirm.
+    unavailable_at_top_level = {
+        'dead_stemrust_samples' : 'SET-OF-dead_stemrust_samples',
+        'dead_stemrust_samples_count' : 'dead_stemrust_samples_count',
+        'dead_yellowrust_samples' : 'SET-OF-dead_yellowrust_samples',
+        'dead_yellowrust_samples_count' : 'dead_yellowrust_samples_count',
+        'live_leafrust_samples' : 'SET-OF-live_leafrust_samples',
+        'live_leafrust_samples_count' : 'live_leafrust_samples_count',
+        'live_stemrust_samples' : 'SET-OF-live_stemrust_samples',
+        'live_stemrust_samples_count' : 'live_stemrust_samples_count',
+        'live_yellowrust_samples' : 'SET-OF-live_yellowrust_samples',
+        'live_yellowrust_samples_count' : 'live_yellowrust_samples_count',
+    }
+
+    logger.debug('Performing download')
+
+    # perform a pull from the server, and if it fails write a warning message
+
+    download_success = True
+
+    skip_download: bool = config['Survey'].get('SkipServerDownload', False)
+
+    if not skip_download:
+        try:
+
+            request = get_from_kobotoolbox(**form_credentials)
+
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Download from kobotoolbox server failed: {e}")
+            status.reset('WARNING')
+
+            download_success = False
+
+    # define filenames
+    csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
+
+    csv_processed_filename = "SurveyDataProcessed.csv"
+    csv_processed_path = f"{output_path}/{csv_processed_filename}"
+
+    if download_success and not skip_download:
+        # parse dataframe
+
+        dataframe_raw = build_dataframe(request)
+
+        logger.debug('Saving raw csv file')
+
+        # csv_filename already ends in .csv, so no further suffix is added
+        df_raw_filename = f"{output_path}/{csv_filename}"
+
+        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
+
+        # process to match ODK format
+
+        dataframe_processed = parse_columns(dataframe_raw,column_parser_dict)
+
+        logger.debug('Saving processed csv file')
+
+        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    if not download_success or skip_download:
+
+        logger.info("Because server download failed somewhere (or we are skipping downloads), trying to recover by copying recent download")
+
+        copy_success = False
+
+        days_back = 1
+        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
+        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
+
+        while ((not copy_success) and (days_back <= acceptable_days_back)):
+
+            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
+
+            past_date = current_date - datetime.timedelta(days=days_back)
+
+            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
+            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
+
+            past_output_path = f"{past_jobPath}/{output_dir}/"
+
+            try:
+                # check that python or perl coordinator script succeeded for that date
+                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
+                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
+                if not (success_py or success_perl):
+                    raise FileNotFoundError(f"No success marker in {past_jobPath}")
+
+                past_csv_filename = csv_processed_filename
+
+                logger.info(f"Looking for {past_output_path+past_csv_filename}")
+
+                copyfile(past_output_path+past_csv_filename,csv_processed_path)
+
+                if not os.path.isfile(csv_processed_path):
+                    raise FileNotFoundError(csv_processed_path)
+
+                copy_success = True
+            except OSError:  # the explicit raises above and any copyfile failure
+                logger.info(f"Not found a kobotoolbox download in {past_output_path}")
+
+            days_back += 1
+
+        if not copy_success:
+            logger.error("Failed get a suitable copy of survey data.")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.warning(f"Using download from {past_jobPath}.")
+
+    return csv_processed_path
-- 
GitLab