fix: Read KEY column as string.

The WRSIS download KEY column is an integer, which caused a problem in the survey data removal subprocess. The solution is to always read the KEY column as a string.

fix: Read KEY column as string.
c530dc08 · Dr T. Mona · 45e1877e · c530dc08
Commit c530dc08 authored 2 years ago by Dr T. Mona
--- a/ProcessorSurveys.py
+++ b/ProcessorSurveys.py
@@ -796,7 +796,7 @@ def process_in_job_survey(jobPath,status,config,component):
    for form_name,form_fn in csv_filenames.items():
        # some define column types, hardwired for now
-        col_types = {'comment':'str'}
+        col_types = {'comment':'str','KEY':'str'}
        form_df = read_csv(form_fn,dtype=col_types)
@@ -901,7 +901,7 @@ def process_in_job_survey(jobPath,status,config,component):
    keys_to_rm = df_rm['KEY']
    # check that all of the keys to remove exist in the original data
-    rm_keys_found = df_rm['KEY'].apply(lambda cell: cell in dfm['KEY'].values)
+    rm_keys_found = df_rm['KEY'].isin(dfm['KEY'])
    n_rm_keys_found = rm_keys_found.sum()
    n_rm_keys = rm_keys_found.size
    if not np_all(rm_keys_found):
@@ -911,6 +911,11 @@ def process_in_job_survey(jobPath,status,config,component):
        rm_keys_not_found = df_rm[~rm_keys_found]
        logger.debug(f"Erroneous entries not found are:\n{rm_keys_not_found}")
+        logger.debug(f"Type of keys that can be found include:\n{dfm['KEY'].dtype}")
+        dfm_short_keys = [val for val in dfm['KEY'].values if len(str(val)) <10]
+        logger.debug(f"Keys that can be found include:\n{dfm_short_keys}")
    # identify which surveys to remove
    idx_to_rm = dfm['KEY'].apply(lambda cell: cell in keys_to_rm.values)