From c530dc0830d8bda9f221f5e4c3a9ea8b116cb279 Mon Sep 17 00:00:00 2001
From: Tamas Mona <tm689@cam.ac.uk>
Date: Tue, 6 Sep 2022 15:46:28 +0100
Subject: [PATCH] fix: Read KEY column as string.

The WRSIS download KEY column is an integer, which caused a problem in the survey data removal subprocess. The solution is to always read the KEY column as a string.
---
 ProcessorSurveys.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ProcessorSurveys.py b/ProcessorSurveys.py
index 10d6298..1a2fc1b 100644
--- a/ProcessorSurveys.py
+++ b/ProcessorSurveys.py
@@ -796,7 +796,7 @@ def process_in_job_survey(jobPath,status,config,component):
     for form_name,form_fn in csv_filenames.items():
 
         # some define column types, hardwired for now
-        col_types = {'comment':'str'}
+        col_types = {'comment':'str','KEY':'str'}
 
         form_df = read_csv(form_fn,dtype=col_types)
 
@@ -901,7 +901,7 @@ def process_in_job_survey(jobPath,status,config,component):
     keys_to_rm = df_rm['KEY']
 
     # check that all of the keys to remove exist in the original data
-    rm_keys_found = df_rm['KEY'].apply(lambda cell: cell in dfm['KEY'].values)
+    rm_keys_found = df_rm['KEY'].isin(dfm['KEY'])
     n_rm_keys_found = rm_keys_found.sum()
     n_rm_keys = rm_keys_found.size
     if not np_all(rm_keys_found):
@@ -911,6 +911,11 @@ def process_in_job_survey(jobPath,status,config,component):
         rm_keys_not_found = df_rm[~rm_keys_found]
         logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}")
 
+        logger.debug(f"Type of keys that can be found include:\n{dfm['KEY'].dtype}")
+
+        dfm_short_keys = [val for val in dfm['KEY'].values if len(str(val)) <10]
+        logger.debug(f"Keys that can be found include:\n{dfm_short_keys}")
+
     # identify which surveys to remove
     idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values)
 
-- 
GitLab