From 062c82d603ee28b492a00769c08a3ac62b8e5e2f Mon Sep 17 00:00:00 2001
From: jws52 <jws52@cam.ac.uk>
Date: Thu, 10 Aug 2023 15:56:43 +0100
Subject: [PATCH] feat: Process surveys coming from file

This is useful for the new ODK form until the wheat rust toolbox is setup.
---
 coordinator/ProcessorSurveys.py      |   4 +-
 coordinator/ProcessorSurveysODKv2.py | 455 +++++++++++++++++++++++++++
 2 files changed, 458 insertions(+), 1 deletion(-)
 create mode 100644 coordinator/ProcessorSurveysODKv2.py

diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py
index 7c06856..4d5bb4b 100644
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -17,6 +17,7 @@ from pandas import read_csv, concat
 from source_gen.clustering import run_case
 
 from ProcessorSurveysODK import get_ODK_form_as_csv
+from ProcessorSurveysODKv2 import get_ODKv2_form_as_csv
 from ProcessorSurveyskobotoolbox import get_kobotoolbox_form_as_csv
 from ProcessorSurveysWRSIS import get_WRSIS_form_as_csv
 from ProcessorSurveysWRT import get_WRT_form_as_csv
@@ -33,7 +34,8 @@ GET_FORM_AS_CSV_DICT = {
     'ODK' : get_ODK_form_as_csv,
     'kobotoolbox' : get_kobotoolbox_form_as_csv,
     'WRSIS' : get_WRSIS_form_as_csv,
-    'WRT' : get_WRT_form_as_csv
+    'WRT' : get_WRT_form_as_csv,
+    'ODKv2' : get_ODKv2_form_as_csv
 }
 
 def process_pre_job_survey(input_args):
diff --git a/coordinator/ProcessorSurveysODKv2.py b/coordinator/ProcessorSurveysODKv2.py
new file mode 100644
index 0000000..45f4429
--- /dev/null
+++ b/coordinator/ProcessorSurveysODKv2.py
@@ -0,0 +1,455 @@
+#ProcessorSurveysODKv2.py
+"""Functions for parsing wheat rust survey records from the new ODK form. This
+is present on the kobotoolbox server housed in Kenya, but the intended workflow
+is that surveys are compiled on Wheat Rust Toolbox and models access surveys
+there. Until that is in place, the EWS will use this access method, which loads
+a file of surveys provided manually by Yoseph."""
+
+import csv
+import datetime
+import logging
+import os
+from pathlib import Path
+
+from pandas import read_csv
+from shutil import copyfile
+
+from ProcessorSurveyUtils import parse_columns
+from ProcessorUtils import (
+        endJob,
+        add_filters_to_sublogger,
+)
+
+logger = logging.getLogger('Processor.Surveys.ODKv2')
+add_filters_to_sublogger(logger)
+
+def get_from_file(
+        file_path: str,
+        **kwargs
+        ):
+    '''Load the survey csv file at file_path into a pandas dataframe.
+
+    Extra keyword arguments are accepted but not used.'''
+
+    df = read_csv(
+            file_path)
+
+    logger.info('successfully opened the survey file')
+    return df
+
+def get_ODKv2_form_as_csv(form_credentials: dict, jobPath: str, config: dict, status):
+    '''Load the manually-provided ODKv2 survey file named in form_credentials
+    and save it as csv with columns renamed and parsed to match the ODK format.
+
+    form_credentials: dict providing 'file_path' (the survey file to read)
+        and 'form_id' (used in the raw csv filename).
+    jobPath: job directory; output is written to {jobPath}/Export_ODK_v2/.
+    config: dict providing config['Survey'] (optional 'SkipServerDownload',
+        and 'AcceptableDowntimeDays'), 'StartString' and 'WorkspacePath'.
+    status: job status object; reset to WARNING if the file cannot be read,
+        and to ERROR (then passed to endJob) if no past copy is found.
+
+    If the file cannot be read, or SkipServerDownload is set, the processed
+    csv of the most recent successful SURVEYDATA job within
+    AcceptableDowntimeDays days before StartString is copied instead.
+
+    Returns the path to the processed csv file.'''
+
+    output_dir = 'Export_ODK_v2'
+    output_path = f"{jobPath}/{output_dir}/"
+
+    Path(output_path).mkdir(parents=True, exist_ok=True)
+
+    # get data from file
+
+    # dicts to convert ODKv2 disease prevalence categories to ODK format
+    cases_incidence = {
+        'None' : 'none', # Not validated as not yet present in available dataset
+        'Low (1-20%)' : 'low',
+        'Medium(21-40%)' : 'medium',
+        'High (>40%)' : 'high'
+    }
+
+    cases_severity = {
+        '0%' : 0, # Not validated as not yet present in available dataset
+        'Trace (1%)' : 1,
+        '5%' : 5,
+        '10%' : 10,
+        '15%' : 15,
+        '20%' : 20,
+        '25%' : 25,
+        '30%' : 30,
+        '35%' : 35,
+        '40%' : 40,
+        '45%' : 45,
+        '50%' : 50,
+        '55%' : 55,
+        '60%' : 60,
+        '65%' : 65,
+        '70%' : 70,
+        '75%' : 75,
+        '80%' : 80,
+        '85%' : 85,
+        '90%' : 90,
+        '95%' : 95,
+        '100%' : 100,
+    }
+
+
+    # keys are column names in the input dataframe
+    # values that are the string 'None' mean the column should be dropped
+    # values that are any other string simply rename the column
+    # values that are tuples name a parser function and its arguments, run with that key
+    column_parser_dict = {
+        'start' : ('parse_date',(('name_out','start'),('fmt_in','%Y-%m-%d %H:%M:%S'))),
+        'end' : ('parse_date',(('name_out','end'),('fmt_in','%Y-%m-%d %H:%M:%S'))),
+        'today' : ('parse_date',(('name_out','today'),('fmt_in','%Y-%m-%d'))),
+        'deviceid' : 'deviceid',
+        'imei' : 'imei',
+        'phonenumber' : 'phonenumber',
+        'username' : 'username',
+        'Survey Region' : 'None',
+        'Country Name' : 'surveyor_infromation-country',
+        "Surveyor's Name" : 'surveyor_infromation-surveyor_name',
+        'Institution Name' : 'surveyor_infromation-institution',
+        'Region' : 'None',
+        'Zone' : 'None',
+        'Woreda' : 'None',
+        'Kebele' : 'None',
+        'Location Name' : 'survey_infromation-location_name',
+        'Gps Location (Latitude, Longitude, Elevation)' : 'None',
+        '_Gps Location (Latitude, Longitude, Elevation)_latitude' : 'survey_infromation-location-Latitude',
+        '_Gps Location (Latitude, Longitude, Elevation)_longitude' : 'survey_infromation-location-Longitude',
+        '_Gps Location (Latitude, Longitude, Elevation)_altitude' : 'survey_infromation-location-Altitude',
+        '_Gps Location (Latitude, Longitude, Elevation)_precision' : 'survey_infromation-location-Accuracy',
+        'Date of Survey' : ('parse_date',(('name_out','survey_infromation-survey_date'),('fmt_in','%Y-%m-%d'))),
+        'Survey Season' : 'None',
+        'Survey Site' : 'site_information-survey_site',
+        'Other Suevey site (Specify)' : 'None',
+        'Crop' : 'site_information-crop',
+        'Other Crop (Specify)' : 'other_crop',
+        'Field Area (In Ha)' : 'site_information-field_area',
+        'Variety' : 'site_information-variety',
+        'Growth Stage' : 'site_information-growth_stage',
+        'Major Observed Diseases' : 'None',
+        'Major Observed Diseases/Stem Rust' : 'None',
+        'Major Observed Diseases/Leaf Rust' : 'None',
+        'Major Observed Diseases/Yellow/Stripe Rust' : 'None',
+        'Major Observed Diseases/Fusarium Head Blight' : 'None',
+        'Major Observed Diseases/Septoria tritici Blotch' : 'None',
+        'Major Observed Diseases/Spot Blotch' : 'None',
+        'Major Observed Diseases/Wheat Blast (MOT)' : 'None',
+        'Major Observed Diseases/Leaf Rust.1' : 'None',
+        'Stem Rust Incidence (%)' : ('parse_cases',(('name_out','stem_rust-stemrust_incidence'),('cases', cases_incidence),('dtype', str),('fillna','None'))),
+        'Stem Rust Severity (%)' : ('parse_cases',(('name_out','stem_rust-Stemrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0%'))),
+        'Host Plant Reaction To Stem Rust' : 'stem_rust-stemrust_host_plant_reaction',
+        'Stem Rust Image' : 'None',
+        'Stem Rust Image_URL' : 'None',
+        'Leaf Rust Incidence' : ('parse_cases',(('name_out','leaf_rust-leafrust_incidence'),('cases', cases_incidence),('dtype', str),('fillna','None'))),
+        'Leaf Rust Severity (%)' : ('parse_cases',(('name_out','leaf_rust-leafrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0%'))),
+        'Host Plant Reaction' : 'leaf_rust-leafrust_host_plant_reaction',
+        'Leaf Rust Image' : 'None',
+        'Leaf Rust Image_URL' : 'None',
+        'Yellow Rust Incidence(%)' : ('parse_cases',(('name_out','yellow_rust-yellowrust_incidence'),('cases', cases_incidence),('dtype', str),('fillna','None'))),
+        'Yellow Rust Severity (%)' : ('parse_cases',(('name_out','yellow_rust-yellowrust_severity'),('cases', cases_severity),('dtype', int),('fillna','0%'))),
+        'Host Plant Reaction.1' : 'yellow_rust-yellowrust_host_plant_reaction',
+        'Yellow Rust Image' : 'None',
+        'Yellow Rust Image_URL' : 'None',
+        'Observed Yellow Rust Head Infection?' : 'None',
+        'Yellow Rust Head Infection Incidence(%) (If Any)' : 'None',
+        'Yellow Head Infection Rust Severity (%)' : 'None',
+        'Yellow Rust Head Infection Image' : 'None',
+        'Yellow Rust Head Infection Image_URL' : 'None',
+        'Septoria Incidence (%)' : 'septoria-septoria_incidence',
+        'Septoria Severity' : 'septoria-septoria_severity',
+        'Septoria Image' : 'None',
+        'Septoria Image_URL' : 'None',
+        'Wheat Blast (Mot) Incidence (%)' : 'None',
+        'Wheat Blast (Mot) Severity' : 'None',
+        'Wheat Blast (Mot) Image' : 'None',
+        'Wheat Blast (Mot) Image_URL' : 'None',
+        'Spot Blotch Incidence (%)' : 'None',
+        'Spot Blotch Severity' : 'None',
+        'Spot Blotch Image' : 'None',
+        'Spot Blotch Image_URL' : 'None',
+        'Fusarium Head Blight Incidence (%)' : 'None',
+        'Fusarium Head Blight Severity' : 'None',
+        'Fusarium Head Blight Image' : 'None',
+        'Fusarium Head Blight Image_URL' : 'None',
+        'Other Diseases and Pests (Optional)' : 'None',
+        'Other Diseases and Pests (Optional)/Alternaria Leaf Blight' : 'None',
+        'Other Diseases and Pests (Optional)/Bacterial Stripe' : 'None',
+        'Other Diseases and Pests (Optional)/Bacterial_blight' : 'None',
+        'Other Diseases and Pests (Optional)/Barley Yellow Dwarf Virus (BYDV)' : 'None',
+        'Other Diseases and Pests (Optional)/Basal Glume Rot' : 'None',
+        'Other Diseases and Pests (Optional)/Common Bunt' : 'None',
+        'Other Diseases and Pests (Optional)/Common Root Rot' : 'None',
+        'Other Diseases and Pests (Optional)/Crown Rot' : 'None',
+        'Other Diseases and Pests (Optional)/Eye Spot' : 'None',
+        'Other Diseases and Pests (Optional)/Glume Blotch' : 'None',
+        'Other Diseases and Pests (Optional)/Loose Smut' : 'None',
+        'Other Diseases and Pests (Optional)/Pythium Root Rot' : 'None',
+        'Other Diseases and Pests (Optional)/Sclerotium Wilt' : 'None',
+        'Other Diseases and Pests (Optional)/Sharp Eyespot' : 'None',
+        'Other Diseases and Pests (Optional)/Soilborne Wheat Mosaic Virus (SBWMV)' : 'None',
+        'Other Diseases and Pests (Optional)/Take-All' : 'None',
+        'Other Diseases and Pests (Optional)/Tan Spot' : 'None',
+        'Other Diseases and Pests (Optional)/Wheat Streak Mosaic Virus (WSMV)' : 'None',
+        'Other Diseases and Pests (Optional)/Sunpest' : 'None',
+        'Other Diseases and Pests (Optional)/Fall Armyworm' : 'None',
+        'Other Diseases and Pests (Optional)/Aphid' : 'None',
+        'Other Diseases and Pests (Optional)/Other' : 'None',
+        'Alternaria Leaf Blight Incidence (%)' : 'None',
+        'Alternaria Leaf Blight Severity' : 'None',
+        'Alternaria Leaf Blight Image' : 'None',
+        'Alternaria Leaf Blight Image_URL' : 'None',
+        'Bacterial Stripe Incidence (%)' : 'None',
+        'Bacterial Stripe Severity' : 'None',
+        'Bacterial Stripe Image' : 'None',
+        'Bacterial Stripe Image_URL' : 'None',
+        'Barley Yellow Dwarf Virus (bydv) Incidence (%)' : 'None',
+        'Barley Yellow Dwarf Virus (bydv) Severity' : 'None',
+        'Barley Yellow Dwarf Virus (bydv) Image' : 'None',
+        'Barley Yellow Dwarf Virus (bydv) Image_URL' : 'None',
+        'Common Bunt Incidence (%)' : 'None',
+        'Common Bunt Severity' : 'None',
+        'Common Bunt Severity Image' : 'None',
+        'Common Bunt Severity Image_URL' : 'None',
+        'Basal Glume Rot Incidence (%)' : 'None',
+        'Basal Glume Rot Severity' : 'None',
+        'Basal Glume Rot Image' : 'None',
+        'Basal Glume Rot Image_URL' : 'None',
+        'Crown Rot Incidence (%)' : 'None',
+        'Crown Rot Severity' : 'None',
+        'Crown Rot Image' : 'None',
+        'Crown Rot Image_URL' : 'None',
+        'Eye Spot Incidence (%)' : 'None',
+        'Eye Spot Severity' : 'None',
+        'Eye Spot Image' : 'None',
+        'Eye Spot Image_URL' : 'None',
+        'Glume Blotch Incidence (%)' : 'None',
+        'Glume Blotch Severity' : 'None',
+        'Glume Blotch Image' : 'None',
+        'Glume Blotch Image_URL' : 'None',
+        'Loose Smut Incidence (%)' : 'None',
+        'Loose Smut Severity' : 'None',
+        'Loose Smut Image' : 'None',
+        'Loose Smut Image_URL' : 'None',
+        'Pythium Root Rot Incidence (%)' : 'None',
+        'Pythium Root Rot Severity' : 'None',
+        'Pythium Root Rot Image' : 'None',
+        'Pythium Root Rot Image_URL' : 'None',
+        'Sclerotium Wilt Incidence (%)' : 'None',
+        'Sclerotium Wilt Severity' : 'None',
+        'Sclerotium Wilt Image' : 'None',
+        'Sclerotium Wilt Image_URL' : 'None',
+        'Sharp Eyespot Incidence (%)' : 'None',
+        'Sharp Eyespot Severity' : 'None',
+        'Sharp Eyespot Image' : 'None',
+        'Sharp Eyespot Image_URL' : 'None',
+        'Soilborne Wheat Mosaic Virus (Sbwmv) Incidence (%)' : 'None',
+        'Soilborne Wheat Mosaic Virus (Sbwmv) Severity' : 'None',
+        'Soilborne Wheat Mosaic Virus (Sbwmv) Image' : 'None',
+        'Soilborne Wheat Mosaic Virus (Sbwmv) Image_URL' : 'None',
+        'Take-All Incidence (%)' : 'None',
+        'Take-All Severity' : 'None',
+        'Take-All Image' : 'None',
+        'Take-All Image_URL' : 'None',
+        'Tan Spot Incidence (%)' : 'None',
+        'Tan Spot Severity' : 'None',
+        'Tan Spot Image' : 'None',
+        'Tan Spot Image_URL' : 'None',
+        'Wheat Streak Mosaic Virus (Wsmv) Incidence (%)' : 'None',
+        'Wheat Streak Mosaic Virus (Wsmv) Severity' : 'None',
+        'Wheat Streak Mosaic Virus (Wsmv) Image' : 'None',
+        'Wheat Streak Mosaic Virus (Wsmv) Image_URL' : 'None',
+        'List Other Diseases and Pests' : 'None',
+        'Rate of Damage by Insects' : 'None',
+        'Fungicide Applied?' : 'None',
+        'Fungicide Names' : 'None',
+        'Number of Fungicide Application' : 'None',
+        'Samples Collected? (Y/N)' : 'samples_collected',
+        'Type of Samples Collected' : 'samples_type',
+        'Type of Samples Collected/Stem rust Live Sample' : 'None',
+        'Type of Samples Collected/Stem rust dead DNA Sample' : 'None',
+        'Type of Samples Collected/Yellow rust Live Sample' : 'None',
+        'Type of Samples Collected/Yellow rust dead DNA Sample' : 'None',
+        'Type of Samples Collected/Leaf rust Live Sample' : 'None',
+        'Type of Samples Collected/Fusarium Head Blight Live Sample' : 'None',
+        'Type of Samples Collected/Septoria Live Sample' : 'None',
+        'Type of Samples Collected/Wheat Blast (MOT) Live Sample' : 'None',
+        'Type of Samples Collected/Spot Blotch Live Sample' : 'None',
+        'Number of Stem Rust Live Sample' : 'None',
+        'Number of Stem Rust Dead Dna Samples' : 'None',
+        'Number of Yellow Rust Live Samples' : 'None',
+        'Number of Yellow Rust Dead Samples' : 'None',
+        'Number of Leaf Rust Live Samples' : 'None',
+        'Number of Fusarium Head Blight Live Sample' : 'None',
+        'Number of Septoria Live Sample' : 'None',
+        'Number of Spot Blotch Live Sample' : 'None',
+        'Number of Wheat Blast (Mot) Live Sample' : 'None',
+        'Additional Comments/ Observations' : 'comment',
+        'Survey Season.1' : 'None',
+        'Survey Season.2' : 'None',
+        'Specify Other' : 'None',
+        'Other Diseases and Pests (Optional).1' : 'None',
+        'Other Diseases and Pests (Optional)/Alternaria Leaf Blight.1' : 'None',
+        'Other Diseases and Pests (Optional)/Bacterial Stripe.1' : 'None',
+        'Other Diseases and Pests (Optional)/Bacterial_blight.1' : 'None',
+        'Other Diseases and Pests (Optional)/Barley Yellow Dwarf Virus (BYDV).1' : 'None',
+        'Other Diseases and Pests (Optional)/Basal Glume Rot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Common Bunt.1' : 'None',
+        'Other Diseases and Pests (Optional)/Common Root Rot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Crown Rot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Eye Spot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Glume Blotch.1' : 'None',
+        'Other Diseases and Pests (Optional)/Loose Smut.1' : 'None',
+        'Other Diseases and Pests (Optional)/Pythium Root Rot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Sclerotium Wilt.1' : 'None',
+        'Other Diseases and Pests (Optional)/Sharp Eyespot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Soilborne Wheat Mosaic Virus (SBWMV).1' : 'None',
+        'Other Diseases and Pests (Optional)/Take-All.1' : 'None',
+        'Other Diseases and Pests (Optional)/Tan Spot.1' : 'None',
+        'Other Diseases and Pests (Optional)/Wheat Streak Mosaic Virus (WSMV).1' : 'None',
+        'Other Diseases and Pests (Optional)/Sunpest.1' : 'None',
+        'Other Diseases and Pests (Optional)/Fall Armyworm.1' : 'None',
+        'Other Diseases and Pests (Optional)/Aphid.1' : 'None',
+        'Other Diseases and Pests (Optional)/Other.1' : 'None',
+        'Insects Observed (%)' : 'None',
+        'Insects Observed (%)/Fall Armyworm' : 'None',
+        'Insects Observed (%)/Aphid' : 'None',
+        'Insects Observed (%)/Other' : 'None',
+        '_id' : 'None',
+        '_uuid' : 'KEY',
+        '_submission_time' : ('parse_date',(('name_out','SubmissionDate'),('fmt_in','%Y-%m-%d %H:%M:%S'))),
+        '_validation_status' : 'None',
+        '_notes' : 'None',
+        '_status' : 'None',
+        '_submitted_by' : 'None',
+        '__version__' : 'None',
+        '_tags' : 'None',
+        '_index' : 'None',
+        # multiline key
+        "Image capture Instruction:\n"
+        "While scoring diseases you will be asked to capture an image of "
+        "the disease. Please follow the following instruction when you "
+        "are taking pictures.\nTake ONE leaf per image.\nThere must only "
+        "be one focal leaf in each image. Its ok to have leaves in the "
+        "background,\nsometimes it may be impossible to avoid having "
+        "other leaves in the image. However, the main leaf should be in "
+        "the foreground with no other leaves in front of it\n\nCenter the "
+        "focal leaf in the image\n\nThe focal leaf must be in focus (and "
+        "background is not more focused than leaf)\n\nThe focal leaf must "
+        "be fully in the image. You should be able to see all edges of "
+        "the leaf.\n\nHands, feet and other body parts/objects in images "
+        "are OK.\nThe focal leaf must have a mono-infection. Do not take "
+        "images of leaves that are co-infected unless instructed to do "
+        "so." : 'None'
+        # value for multiline key is just above
+    }
+
+    logger.debug('Performing download')
+
+    # load the survey file, and if that fails write a warning message
+
+    download_success = True
+
+    skip_download: bool = config['Survey'].get('SkipServerDownload', False)
+
+    if not skip_download:
+        try:
+
+            file_path = form_credentials['file_path']
+
+            # explicit check rather than assert, which is skipped under python -O
+            if not os.path.isfile(file_path):
+                raise FileNotFoundError(file_path)
+
+            request = get_from_file(file_path)
+
+        except (AssertionError, FileNotFoundError):
+
+            logger.warning(f"Failed to access '{file_path}'")
+
+            status.reset('WARNING')
+
+            download_success = False
+
+    # define filenames
+    csv_filename = f"SurveyData_{form_credentials['form_id']}.csv"
+
+    csv_processed_filename = "SurveyDataProcessed.csv"
+    csv_processed_path = f"{output_path}/{csv_processed_filename}"
+
+    if download_success and not skip_download:
+        # parse dataframe
+
+        dataframe_raw = request
+
+        logger.debug('Saving raw csv file')
+
+        # NOTE(review): csv_filename already ends in '.csv', so the raw file
+        # is saved as '...csv.csv'; kept as-is in case the name is relied on
+        df_raw_filename = f"{output_path}/{csv_filename}.csv"
+
+        dataframe_raw.to_csv(df_raw_filename,index=False,quoting=csv.QUOTE_MINIMAL)
+
+        # process to match ODK format
+
+        dataframe_processed = parse_columns(dataframe_raw,column_parser_dict)
+
+        logger.debug('Saving processed csv file')
+
+        dataframe_processed.to_csv(csv_processed_path,index=False,quoting=csv.QUOTE_MINIMAL)
+
+    if not download_success or skip_download:
+
+        logger.info("Because server download failed somewhere (or we are skipping downloads), trying to recover by copying recent download")
+
+        copy_success = False
+
+        days_back = 1
+        acceptable_days_back = int(config['Survey']['AcceptableDowntimeDays'])
+        logger.debug(f"Acceptable server downtime is set to {acceptable_days_back} days")
+
+        while ((not copy_success) and (days_back <= acceptable_days_back)):
+
+            current_date = datetime.datetime.strptime(config['StartString'],'%Y%m%d')
+
+            past_date = current_date - datetime.timedelta(days=days_back)
+
+            #past_jobPath = f"{config['WorkspacePathout']}{short_name[component]}_{past_date.strftime('%Y%m%d')}"
+            past_jobPath = f"{config['WorkspacePath']}/SURVEYDATA_{past_date.strftime('%Y%m%d')}"
+
+            past_output_path = f"{past_jobPath}/{output_dir}/"
+
+            try:
+                # check that python or perl coordinator script succeeded for that date
+                success_py = os.path.isfile(f"{past_jobPath}/STATUS_SUCCESS")
+                success_perl = os.path.isfile(f"{past_jobPath}/SURVEYDATA_SUCCESS.txt")
+                if not (success_py or success_perl):
+                    raise FileNotFoundError(f"No success file in {past_jobPath}")
+
+                past_csv_filename = csv_processed_filename
+
+                logger.info(f"Looking for {past_output_path+past_csv_filename}")
+
+                copyfile(past_output_path+past_csv_filename,csv_processed_path)
+
+                if not os.path.isfile(csv_processed_path):
+                    raise FileNotFoundError(csv_processed_path)
+
+                copy_success = True
+            except OSError:
+                logger.info(f"Not found a past copy of ODKv2 in {past_output_path}")
+
+            days_back += 1
+
+        if not copy_success:
+            logger.error("Failed get a suitable copy of survey data.")
+            status.reset('ERROR')
+            endJob(status,premature=True)
+
+        logger.warning(f"Using download from {past_jobPath}.")
+
+    return csv_processed_path
-- 
GitLab