From 7211f6dbff739ae03087561a22183281e17fec42 Mon Sep 17 00:00:00 2001
From: tm689 <tm689@cam.ac.uk>
Date: Thu, 29 Feb 2024 17:27:28 +0000
Subject: [PATCH] feat: Advanced grouping. - Individual grouping based on
 GroupBy content (defined in config) - Allow grouping by multiple columns -
 Handling NaN while creating individual groups - Handling '+' separated group
 names

---
 coordinator/ProcessorSurveys.py | 44 +++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py
index 8da4d69..e489732 100644
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -363,29 +363,57 @@ class ProcessorSurveys(Processor):
         elif 'Groups' in config['Survey']:
             # if 'Groups' is defined in the config, create grouped survey files and run python version
 
-            self.logger.debug('Preparing grouped survey files')
+            self.logger.info('Preparing grouped survey files')
             group_directory = f"{jobPath}/Groups"
             Path(group_directory).mkdir(parents=True, exist_ok=True)
 
-            origins_list = df_join["Origin"].unique()
-            groups = {i:[i] for i in origins_list}
+            # creating initial groups
+            groups = config['Survey']['Groups']
 
-            assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]), 'Group(s) defined in config already present in the survey data as origin'
+            # check if columns requested in GroupBy are present in the dataframe
+            assert all([group_by in df_join.columns for group_by in config['Survey']['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe'
 
-            groups.update(config['Survey']['Groups'])
+            for group_by in config['Survey']['GroupBy']:
+                self.logger.debug(f"grouping by {group_by}")
+
+                # handle NaN values
+                if df_join[group_by].isna().any():
+                    self.logger.warning(f"Grouping by {group_by} contains NaN values. Filling NaN values with 'Unknown'")
+                    df_join[group_by] = df_join[group_by].fillna('Unknown')
+                
+                groups_list = df_join[group_by].unique()
+                # remove 'Unknown' from the list of groups
+                groups_list = [i for i in groups_list if i != 'Unknown']
+
+                assert not np_any([i in groups_list for i in groups]), f"Group name {[i for i in groups_list if i in groups]} already present in the {group_by} column. Please rename the group name in the config file."
+                
+                groups.update({i:{group_by:[i]} for i in groups_list})
 
             # remove groups that are listed in GroupsToIgnore
             if 'GroupsToIgnore' in config['Survey']:
-                for group_name in config['Survey']['GroupsToIgnore']:
+                groups_to_ignore = config['Survey']['GroupsToIgnore']
+
+                # add groups to ignore if all elements of the group are in the list of groups to ignore
+                for group_name in groups:
+                    if '+' in group_name:
+                        group_name_separated = group_name.split('+')
+                        if all([i in groups_to_ignore for i in group_name_separated]):
+                            self.logger.debug(f"Adding group {group_name} to list of groups to ignore")
+                            groups_to_ignore.append(group_name) # TODO: rename group based on the unique element
+
+                for group_name in groups_to_ignore:
                     if group_name in groups:
-                        self.logger.info(f"Removing group {group_name} from list of groups")
+                        self.logger.debug(f"Removing group {group_name} from list of groups")
                         del groups[group_name]
 
             for group_name,group_content in groups.items():
 
                 self.logger.info(f"Creating survey group {group_name} which includes {group_content}")
 
-                df_group = df_join.loc[df_join["Origin"].isin(group_content)]
+                # applying the grouping, keeping only the surveys that have the elements listed in the group
+                df_group = df_join
+                for group_by,group_elements in group_content.items():
+                    df_group = df_group.loc[df_group[group_by].isin(group_elements)]
 
                 group_surveys_filename = f"surveys_{group_name}.csv"
                 group_surveys_filepath = f"{group_directory}/{group_surveys_filename}"
-- 
GitLab