From 0c6e68f892a1da2e0acf6f5313ee5b56387741ca Mon Sep 17 00:00:00 2001
From: tm689 <tm689@cam.ac.uk>
Date: Mon, 25 Sep 2023 11:25:04 +0100
Subject: [PATCH] feat: check the key column is unique, if not raise a warning
 and remove duplicates

---
 coordinator/ProcessorSurveys.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py
index 020f5e1..2f2e997 100644
--- a/coordinator/ProcessorSurveys.py
+++ b/coordinator/ProcessorSurveys.py
@@ -188,11 +188,18 @@ def process_in_job_survey(jobPath,status,config,component):
 
     survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
     survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"
-
+
     # perform here in python, using the 'KEY' column
-    # check the key column is unique
-
-    assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'
+    # check the key column is unique, if not raise a warning and remove duplicates
+
+    if dfm['KEY'].unique().size != dfm['KEY'].size:
+        status.reset('WARNING')
+        logger.warning("KEY column is not unique, removing duplicates")
+        # count the number of rows with a duplicated KEY
+        n_duplicates = dfm.shape[0] - dfm['KEY'].unique().size
+        # drop rows whose KEY already appeared; subset=['KEY'] is required,
+        # otherwise only fully-identical rows (all columns equal) are removed
+        dfm = dfm.drop_duplicates(subset=['KEY'], keep='first')
+        logger.warning(f"Removed {n_duplicates} duplicates")
 
     df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
     keys_to_rm = df_rm['KEY']
-- 
GitLab