feat: Advanced grouping.

- Individual grouping based on GroupBy content (defined in config)
- Allow grouping by multiple columns
- Handling NaN while creating individual groups
- Handling '+'-separated group names

feat: Advanced grouping.
7211f6db · Dr T. Mona · 1dc1aff0 · 7211f6db
Commit 7211f6db authored 1 year ago by Dr T. Mona
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -363,29 +363,57 @@ class ProcessorSurveys(Processor):
        elif 'Groups' in config['Survey']:
            # if 'Groups' is defined in the config, create grouped survey files and run python version

-            self.logger.debug('Preparing grouped survey files')
+            self.logger.info('Preparing grouped survey files')
            group_directory = f"{jobPath}/Groups"
            Path(group_directory).mkdir(parents=True, exist_ok=True)

-            origins_list = df_join["Origin"].unique()
-            groups = {i:[i] for i in origins_list}
+            # creating initial groups
+            groups = config['Survey']['Groups']

-            assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]), 'Group(s) defined in config already present in the survey data as origin'
+            # check if columns requested in GroupBy are present in the dataframe
+            assert all([group_by in df_join.columns for group_by in config['Survey']['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe'

-            groups.update(config['Survey']['Groups'])
+            for group_by in config['Survey']['GroupBy']:
+                self.logger.debug(f"grouping by {group_by}")
+
+                # handle NaN values
+                if df_join[group_by].isna().any():
+                    self.logger.warning(f"Grouping by {group_by} contains NaN values. Filling NaN values with 'Unknown'")
+                    df_join[group_by] = df_join[group_by].fillna('Unknown')
+                
+                groups_list = df_join[group_by].unique()
+                # remove 'Unknown' from the list of groups
+                groups_list = [i for i in groups_list if i != 'Unknown']
+
+                assert not np_any([i in groups_list for i in groups]), f"Group name {[i for i in groups_list if i in groups]} already present in the {group_by} column. Please rename the group name in the config file."
+                
+                groups.update({i:{group_by:[i]} for i in groups_list})

            # remove groups that are listed in GroupsToIgnore
            if 'GroupsToIgnore' in config['Survey']:
-                for group_name in config['Survey']['GroupsToIgnore']:
+                groups_to_ignore = config['Survey']['GroupsToIgnore']
+
+                # add groups to ignore if all elements of the group are in the list of groups to ignore
+                for group_name in groups:
+                    if '+' in group_name:
+                        group_name_separated = group_name.split('+')
+                        if all([i in groups_to_ignore for i in group_name_separated]):
+                            self.logger.debug(f"Adding group {group_name} to list of groups to ignore")
+                            groups_to_ignore.append(group_name) # TODO: rename group based on the unique element
+
+                for group_name in groups_to_ignore:
                    if group_name in groups:
-                        self.logger.info(f"Removing group {group_name} from list of groups")
+                        self.logger.debug(f"Removing group {group_name} from list of groups")
                        del groups[group_name]

            for group_name,group_content in groups.items():

                self.logger.info(f"Creating survey group {group_name} which includes {group_content}")

-                df_group = df_join.loc[df_join["Origin"].isin(group_content)]
+                # applying the grouping, keeping only the surveys that have the elements listed in the group
+                df_group = df_join
+                for group_by,group_elements in group_content.items():
+                    df_group = df_group.loc[df_group[group_by].isin(group_elements)]

                group_surveys_filename = f"surveys_{group_name}.csv"
                group_surveys_filepath = f"{group_directory}/{group_surveys_filename}"