FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
Commit 7211f6db authored by Dr T. Mona
Browse files

feat: Advanced grouping.

- Individual grouping based on GroupBy content (defined in config)
- Allow grouping by multiple columns
- Handling NaN while creating individual groups
- Handling '+' separated group names
parent 1dc1aff0
No related branches found
No related tags found
No related merge requests found
......@@ -363,29 +363,57 @@ class ProcessorSurveys(Processor):
elif 'Groups' in config['Survey']:
# if 'Groups' is defined in the config, create grouped survey files and run python version
self.logger.debug('Preparing grouped survey files')
self.logger.info('Preparing grouped survey files')
group_directory = f"{jobPath}/Groups"
Path(group_directory).mkdir(parents=True, exist_ok=True)
origins_list = df_join["Origin"].unique()
groups = {i:[i] for i in origins_list}
# creating initial groups
groups = config['Survey']['Groups']
assert not np_any([k in origins_list for k in config['Survey']['Groups'].keys()]), 'Group(s) defined in config already present in the survey data as origin'
# check if columns requested in GroupBy are present in the dataframe
assert all([group_by in df_join.columns for group_by in config['Survey']['GroupBy']]), 'Column(s) requested in GroupBy are not present in the dataframe'
groups.update(config['Survey']['Groups'])
for group_by in config['Survey']['GroupBy']:
self.logger.debug(f"grouping by {group_by}")
# handle NaN values
if df_join[group_by].isna().any():
self.logger.warning(f"Grouping by {group_by} contains NaN values. Filling NaN values with 'Unknown'")
df_join[group_by] = df_join[group_by].fillna('Unknown')
groups_list = df_join[group_by].unique()
# remove 'Unknown' from the list of groups
groups_list = [i for i in groups_list if i != 'Unknown']
assert not np_any([i in groups_list for i in groups]), f"Group name {[i for i in groups_list if i in groups]} already present in the {group_by} column. Please rename the group name in the config file."
groups.update({i:{group_by:[i]} for i in groups_list})
# remove groups that are listed in GroupsToIgnore
if 'GroupsToIgnore' in config['Survey']:
for group_name in config['Survey']['GroupsToIgnore']:
groups_to_ignore = config['Survey']['GroupsToIgnore']
# add groups to ignore if all elements of the group are in the list of groups to ignore
for group_name in groups:
if '+' in group_name:
group_name_separated = group_name.split('+')
if all([i in groups_to_ignore for i in group_name_separated]):
self.logger.debug(f"Adding group {group_name} to list of groups to ignore")
groups_to_ignore.append(group_name) # TODO: rename group based on the unique element
for group_name in groups_to_ignore:
if group_name in groups:
self.logger.info(f"Removing group {group_name} from list of groups")
self.logger.debug(f"Removing group {group_name} from list of groups")
del groups[group_name]
for group_name,group_content in groups.items():
self.logger.info(f"Creating survey group {group_name} which includes {group_content}")
df_group = df_join.loc[df_join["Origin"].isin(group_content)]
# applying the grouping, keeping only the surveys that have the elements listed in the group
df_group = df_join
for group_by,group_elements in group_content.items():
df_group = df_group.loc[df_group[group_by].isin(group_elements)]
group_surveys_filename = f"surveys_{group_name}.csv"
group_surveys_filepath = f"{group_directory}/{group_surveys_filename}"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment