FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
Commit 0c6e68f8 authored by Dr T. Mona
Browse files

feat: check the key column is unique, if not raise a warning and remove duplicates

parent 472bf13e
No related branches found
No related tags found
No related merge requests found
......@@ -188,11 +188,18 @@ def process_in_job_survey(jobPath,status,config,component):
survey_errors_to_remove_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/SurveyDataErrorsToRemove.csv"
survey_additions_filepath = f"{config['ResourcesPath']}/coordinator/assets/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv"
# perform here in python, using the 'KEY' column
# check the key column is unique
assert dfm['KEY'].unique().size == dfm['KEY'].size, 'KEY column is not unique'
# Check that the KEY column is unique. If it is not, do not abort: reset the
# status to 'WARNING', log the problem, and keep only the first row seen for
# each KEY so the KEY-based steps below operate on unique keys.
duplicated_keys = dfm['KEY'].duplicated(keep='first')
if duplicated_keys.any():
    status.reset('WARNING')
    logger.warning("KEY column is not unique, removing duplicates")
    # count the rows whose KEY already appeared in an earlier row
    n_duplicates = int(duplicated_keys.sum())
    # De-duplicate on KEY only: drop_duplicates() without `subset` compares
    # whole rows, so rows repeating a KEY but differing in any other column
    # would survive and KEY would remain non-unique.
    dfm = dfm.drop_duplicates(subset='KEY', keep='first')
    logger.warning(f"Removed {n_duplicates} duplicates")
df_rm = read_csv(survey_errors_to_remove_filepath,dtype='str')
keys_to_rm = df_rm['KEY']
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment