From 7211f6dbff739ae03087561a22183281e17fec42 Mon Sep 17 00:00:00 2001 From: tm689 <tm689@cam.ac.uk> Date: Thu, 29 Feb 2024 17:27:28 +0000 Subject: [PATCH] feat: Advanced grouping. - Individual grouping based on GroupBy content (defined in config) - Allow grouping by multiple columns - Handling NaN while creating individual groups - Handling '+' separated group names --- coordinator/ProcessorSurveys.py | 44 +++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py index 8da4d69..e489732 100644 --- a/coordinator/ProcessorSurveys.py +++ b/coordinator/ProcessorSurveys.py @@ -363,29 +363,57 @@ class ProcessorSurveys(Processor): elif 'Groups' in config['Survey']: # if 'Groups' is defined in the config, create grouped survey files and run python version - self.logger.debug('Preparing grouped survey files') + self.logger.info('Preparing grouped survey files') group_directory = f"{jobPath}/Groups" Path(group_directory).mkdir(parents=True, exist_ok=True) - origins_list = df_join["Origin"].unique() - groups = {i:[i] for i in origins_list} + # creating initial groups + groups = config['Survey']['Groups'] - assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]), 'Group(s) defined in config already present in the survey data as origin' + # check if columns requested in GroupBy are present in the dataframe + assert all([group_by in df_join.columns for group_by in config['Survey']['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe' - groups.update(config['Survey']['Groups']) + for group_by in config['Survey']['GroupBy']: + self.logger.debug(f"grouping by {group_by}") + + # handle NaN values + if df_join[group_by].isna().any(): + self.logger.warning(f"Grouping by {group_by} contains NaN values. 
Filling NaN values with 'Unknown'") + df_join[group_by] = df_join[group_by].fillna('Unknown') + + groups_list = df_join[group_by].unique() + # remove 'Unknown' from the list of groups + groups_list = [i for i in groups_list if i != 'Unknown'] + + assert not np_any([i in groups_list for i in groups]), f"Group name {[i for i in groups_list if i in groups]} already present in the {group_by} column. Please rename the group name in the config file." + + groups.update({i:{group_by:[i]} for i in groups_list}) # remove groups that are listed in GroupsToIgnore if 'GroupsToIgnore' in config['Survey']: - for group_name in config['Survey']['GroupsToIgnore']: + groups_to_ignore = config['Survey']['GroupsToIgnore'] + + # add groups to ignore if all elements of the group are in the list of groups to ignore + for group_name in groups: + if '+' in group_name: + group_name_separated = group_name.split('+') + if all([i in groups_to_ignore for i in group_name_separated]): + self.logger.debug(f"Adding group {group_name} to list of groups to ignore") + groups_to_ignore.append(group_name) # TODO: rename group based on the unique element + + for group_name in groups_to_ignore: if group_name in groups: - self.logger.info(f"Removing group {group_name} from list of groups") + self.logger.debug(f"Removing group {group_name} from list of groups") del groups[group_name] for group_name,group_content in groups.items(): self.logger.info(f"Creating survey group {group_name} which includes {group_content}") - df_group = df_join.loc[df_join["Origin"].isin(group_content)] + # applying the grouping, keeping only the surveys that have the elements listed in the group + df_group = df_join + for group_by,group_elements in group_content.items(): + df_group = df_group.loc[df_group[group_by].isin(group_elements)] group_surveys_filename = f"surveys_{group_name}.csv" group_surveys_filepath = f"{group_directory}/{group_surveys_filename}" -- GitLab