From fb9ad6aceeac17e5ec4b63731b1522fa1c36a601 Mon Sep 17 00:00:00 2001 From: tm689 <tm689@cam.ac.uk> Date: Tue, 1 Oct 2024 14:33:34 +0100 Subject: [PATCH] feat: restructuring Grouping config to enable additional meta information for groups. --- ews/coordinator/processor_surveys.py | 33 ++++--- .../survey_config_EastAfrica_fc_live.json | 93 ++++++------------- 2 files changed, 48 insertions(+), 78 deletions(-) diff --git a/ews/coordinator/processor_surveys.py b/ews/coordinator/processor_surveys.py index 9c875fb..a07a70d 100644 --- a/ews/coordinator/processor_surveys.py +++ b/ews/coordinator/processor_surveys.py @@ -267,20 +267,20 @@ class ProcessorSurveys(ProcessorBase): upload_directory = f"{job_path}/upload" Path(upload_directory).mkdir(parents=True, exist_ok=True) - if 'Groups' in config: - # if 'Groups' is defined in the config, create grouped survey files and run python version + if 'Grouping' in config: + # if 'Grouping' is defined in the config, create grouped survey files and run python version sources calculation logger.info('Preparing grouped survey files') group_directory = f"{job_path}/Groups" Path(group_directory).mkdir(parents=True, exist_ok=True) # creating initial groups - groups = config['Groups'] + groups = config['Grouping']['Groups'] # check if columns requested in GroupBy are present in the dataframe - assert all([group_by in df_join.columns for group_by in config['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe' + assert all([group_by in df_join.columns for group_by in config['Grouping']['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe' - for group_by in config['GroupBy']: + for group_by in config['Grouping']['GroupBy']: logger.debug(f"grouping by {group_by}") # handle NaN values @@ -294,11 +294,11 @@ class ProcessorSurveys(ProcessorBase): assert not np_any([i in groups_list for i in groups]), f"Group name {[i for i in groups_list if i in groups]} already present in the {group_by} 
column. Please rename the group name in the config file." - groups.update({i:{group_by:[i]} for i in groups_list}) + groups.update({i:{'longname':i,'content':{group_by:[i]}} for i in groups_list}) # remove groups that are listed in GroupsToIgnore - if 'GroupsToIgnore' in config: - groups_to_ignore = config['GroupsToIgnore'] + if 'GroupsToIgnore' in config['Grouping']: + groups_to_ignore = config['Grouping']['GroupsToIgnore'] # add groups to ignore if all elements of the group are in the list of groups to ignore for group_name in groups: @@ -314,10 +314,13 @@ class ProcessorSurveys(ProcessorBase): del groups[group_name] # create a list of each group and their content - groups_list = DataFrame(columns=['Group','Content','SourcesFile']) + groups_list = DataFrame(columns=['Group','Group_longname','Content','SourcesFile']) - for group_name,group_content in groups.items(): + for group_name,group_meta in groups.items(): + group_longname = group_meta['longname'] + group_content = group_meta['content'] + logger.info(f"Creating survey group {group_name} which includes {group_content}") # applying the grouping, keeping only the surveys that have the elements listed in the group @@ -333,9 +336,9 @@ class ProcessorSurveys(ProcessorBase): output_directory = f"{job_path}/source_gen/{group_name}" Path(output_directory).mkdir(parents=True, exist_ok=True) - if 'SourcesConfigs' in config and group_name in config['SourcesConfigs']: - logger.info(f"Running source gen for {group_name} group wih config {config['SourcesConfigs'][group_name]}") - sources_config = config['SourcesConfigs'][group_name] + if 'sourcesConfig' in group_meta: + logger.info(f"Running source gen for {group_name} group wih config {group_meta['sourcesConfig']}") + sources_config = group_meta['sourcesConfig'] else: logger.info(f"Running source gen for {group_name} group wih default config {config['SourcesConfigDefault']}") sources_config = config['SourcesConfigDefault'] @@ -366,7 +369,7 @@ class 
ProcessorSurveys(ProcessorBase): copyfile(sources_path, output_path) # add group to the list of groups - groups_list = concat([groups_list, DataFrame({'Group':[group_name],'Content':[group_content],'SourcesFile':[output_filename]})]) + groups_list = concat([groups_list, DataFrame({'Group':[group_name],'Group_longname':[group_longname],'Content':[group_content],'SourcesFile':[output_filename]})]) # THIS CAN BE REMOVED ONCE THE GROUPS ARE PROPERLY PICKED UP BY THE METOFFICE if (group_name == 'PROD'): @@ -380,7 +383,7 @@ class ProcessorSurveys(ProcessorBase): copyfile(sources_path, output_path) # output groups_list to a file - groups_list_filename = f"{job_path}/upload/list_sources_{config['StartString']}.csv" + groups_list_filename = f"{job_path}/upload/groups_{config['StartString']}.csv" logger.debug(f"Outputting list of groups to {groups_list_filename}") groups_list.to_csv(groups_list_filename, index=False) diff --git a/tests/test_data/test_deployment/regions/EastAfrica/resources/configs/coordinator/survey_config_EastAfrica_fc_live.json b/tests/test_data/test_deployment/regions/EastAfrica/resources/configs/coordinator/survey_config_EastAfrica_fc_live.json index 5133d04..88ffd8b 100644 --- a/tests/test_data/test_deployment/regions/EastAfrica/resources/configs/coordinator/survey_config_EastAfrica_fc_live.json +++ b/tests/test_data/test_deployment/regions/EastAfrica/resources/configs/coordinator/survey_config_EastAfrica_fc_live.json @@ -5,74 +5,41 @@ "AcceptableDowntimeDays": 70, "SeasonStartString": "20220930", "ServerCredentialsFile": "${ConfigsPath}/coordinator/Cred-WRT.json", - "Groups" : { - "PROD" : { - "origin": [ - "CSV_01", - "CSV_02", - "ODK_01", - "ODK_02", - "CSV-CAM" - ] + "Grouping": { + "GroupBy": ["origin","published_level","survey_site"], + "Groups": { + "PROD": { + "longname": "Production", + "content" : { + "origin": ["CSV_01","CSV_02","ODK_01","ODK_02","CSV-CAM"] + } + }, + "WRT-Publ": { + "longname": "WRT published", + "content": { + "origin": 
["CSV_01","CSV_02","ODK_01","ODK_02"], + "published_level": ["Publ"] + }, + "sourcesConfig": "${ConfigsPath}/source_gen/config_EastAfrica_mapspam2017.json" + }, + "Trial-sites": { + "longname": "Trial sites", + "content": { + "origin": ["CSV_01","CSV_02","ODK_01","ODK_02","CSV-CAM"], + "survey_site": ["Trial"] + } + } }, - "WRT-Publ": { - "origin": [ - "CSV_01", - "CSV_02", - "ODK_01", - "ODK_02" - ], - "published_level": [ - "Publ" - ] - }, - "Trial-sites": { - "origin": [ - "CSV_01", - "CSV_02", - "ODK_01", - "ODK_02", - "CSV-CAM" - ], - "survey_site": [ - "Trial" - ] - } + "GroupsToIgnore": [ + "ODK-server","kobo-server","newODK","newODK2","CSV-CAM","CSV_01","CSV_02","ODK_01","ODK_02", + "Unpubl","Publ", + "Raw", + "Farmer","farmer_field","Seed_production","Weed","Road","Trial","VCU","Sentinel","Other" + ] }, - "GroupBy": [ - "origin", - "published_level", - "survey_site" - ], - "GroupsToIgnore": [ - "ODK-server", - "kobo-server", - "newODK", - "newODK2", - "CSV-CAM", - "CSV_01", - "CSV_02", - "ODK_01", - "ODK_02", - "Unpubl", - "Publ", - "Raw", - "Farmer", - "farmer_field", - "Seed_production", - "Weed", - "Road", - "Trial", - "VCU", - "Sentinel", - "Other" - ], "ODKDatabasePathTemplate": "${WorkspacePathout}/ODK_DB/", "SurveyFormat": "WRT", "SurveyorNameCol" : "NOT_USED?", "SourcesRegionName" : "EastAfrica", - "SourcesConfigs": { - "WRT-Publ": "${ConfigsPath}/source_gen/config_EastAfrica_mapspam2017.json" - }, "SourcesConfigDefault": "${ConfigsPath}/source_gen/config_EastAfrica_mapspam2017.json" } -- GitLab