From 0c6e68f892a1da2e0acf6f5313ee5b56387741ca Mon Sep 17 00:00:00 2001
From: tm689 <tm689@cam.ac.uk>
Date: Mon, 25 Sep 2023 11:25:04 +0100
Subject: [PATCH] feat: check the key column is unique, if not raise a warning
 and remove duplicates

---
 coordinator/ProcessorSurveys.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py
index 020f5e1..2f2e997 100644
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -188,11 +188,18 @@ def process_in_job_survey(jobPath,status,config,component):
     survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
     survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"
 
-    # perform here in python, using the 'KEY' column
-    # check the key column is unique
-
-    assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'
+    # perform here in python, using the 'KEY' column
+    # check the key column is unique, if not raise a warning and remove duplicates
+
+    if dfm['KEY'].unique().size != dfm['KEY'].size:
+        status.reset('WARNING')
+        logger.warning(f"KEY column is not unique, removing duplicates")
+        # count the number of duplicates
+        n_duplicates = dfm.shape[0] - dfm['KEY'].unique().size
+        # drop rows with a duplicated KEY, keeping the first occurrence
+        dfm = dfm.drop_duplicates(subset=['KEY'], keep='first')
+        logger.warning(f"Removed {n_duplicates} duplicates")
 
     df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
     keys_to_rm = df_rm['KEY']
-- 
GitLab