diff --git a/coordinator/ProcessorSurveys.py b/coordinator/ProcessorSurveys.py index 020f5e1851832193365509b25278fa8b2c7f725b..2f2e9975493745b8fc7b6ac841218d92a9b007bc 100644 --- a/coordinator/ProcessorSurveys.py +++ b/coordinator/ProcessorSurveys.py @@ -188,11 +188,18 @@ def process_in_job_survey(jobPath,status,config,component): survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv" survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv" - + # perform here in python, using the 'KEY' column - # check the key column is unique - - assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique' + # check the key column is unique, if not raise a warning and remove duplicates + + if dfm['KEY'].unique().size != dfm['KEY'].size: + status.reset('WARNING') + logger.warning(f"KEY column is not unique, removing duplicates") + # count the number of duplicates + n_duplicates = dfm.shape[0] - dfm['KEY'].unique().size + # drop the duplicates, keyed on the 'KEY' column to match the check above + dfm = dfm.drop_duplicates(subset=['KEY'], keep='first') + logger.warning(f"Removed {n_duplicates} duplicates") df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str') keys_to_rm = df_rm['KEY']