diff --git a/survey_reformatter/survey_reformatter.py b/survey_reformatter/survey_reformatter.py index 6ea8872baf5939c908fe20efbaf776ed6aaff4d1..e1bbc1bacd6dab95e5c6de4231d64706e1b7cf68 100644 --- a/survey_reformatter/survey_reformatter.py +++ b/survey_reformatter/survey_reformatter.py @@ -1,25 +1,44 @@ #survey_reformatter.py ''' -Takes xlsx files containing additional surveillance data provided by Yoseph and creates entries to the SurveyDataExtraToAdd.csv in Ethiopia Early Warning System. +Takes an xlsx file containing additional surveillance data provided by Yoseph and creates entries formatted for the Early Warning System clustering.R script. -Currently only checks for consistency (partially complete). +The region option can be 'Ethiopia' or 'SouthAsia'. -Not yet implemented reformatting to match example_SurveyDataExtraToAdd.csv. +verbose and verbose_warning are boolean options. -Some expectations can be configured, see variable names starting 'expected_'. +Checks for consistency (fairly thorough, but can be improved), then reformats. + +Some consistency checks can modify values. + +Some expectations can be configured by modifying the variables whose names start with 'expected_'. + +Input and output filenames are provided as command-line arguments. + +Output is appended to a new or existing file. ''' print('Check this is running in python3') -print('Warning: Septoria is not expected from input file') import pandas as pd import numpy as np import sys +region = 'Ethiopia' + +if not len(sys.argv) == 3: +    exit("ERROR: Should supply exactly two arguments: target excel data file name and output file name. \nProgram called as: "+str(sys.argv)) + +excelDataFilename = sys.argv[1] + +outputFileName = sys.argv[2] + +#excelDataFilename = '20190902_ETH_YosephA.xlsx' +#excelDataFilename = '20190905_ETH_YosephA.xlsx' +#outputFileName = 'test.csv' +#outputFileName = 'SurveyDataExtraToAddcopy.csv'
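+
+# Example invocation (illustrative; these filenames are assumptions):
+#   python3 survey_reformatter.py 20190902_ETH_YosephA.xlsx SurveyDataExtraToAdd.csv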
+
 # If you want to check all of the comparisons being tested
 # (e.g. if checking a new addition is now being used)
 # set verbose to True
 verbose=False
+verbose_warning=True
 
 if verbose:
     def verboseprint(message):
@@ -31,14 +50,6 @@ else:
 
-if not len(sys.argv) == 3:
-    exit("ERROR: Should supply exactly two arguments: target excel data file name and output file name. \nProgram called as: "+str(sys.argv))
-
-excelDataFilename = sys.argv[1]
-outputFileName = sys.argv[2]
-
-#excelDataFilename = '20190902_ETH_YosephA.xlsx'
-#excelDataFilename = '20190905_ETH_YosephA.xlsx'
-#outputFileName = 'test.csv'
-#outputFileName = 'SurveyDataExtraToAddcopy.csv'
-
 # define the formats of columns
 
 expected_num_cols = 53
 
@@ -100,17 +111,30 @@ expected_col_formats = {
 expected_cols = expected_col_formats.keys()
 
 # open the file
-df = pd.read_excel(excelDataFilename, converters = expected_col_formats, na_filter=False)
+df = pd.read_excel(excelDataFilename, converters = expected_col_formats, na_filter=False, parse_dates=['ObsDate'])
 
 # assume there are only two dimensions
 num_rows, num_cols = df.shape
 
 # for reporting errors
 def raise_error(message):
-    print('Error raised during')
-    print(message)
+    error_str = 'Error raised during '
+    print(error_str + message)
     quit()
 
+warning_counter = 0
+
+def raise_warning(message,counting=True):
+
+    global warning_counter
+    if counting: warning_counter += 1
+
+    warning_str = 'Warning {:d} raised: '.format(warning_counter)
+    if verbose_warning:
+        print(warning_str + message)
+
+raise_warning('Septoria is not expected from input file')
+
 # functions for consistenty check
 
@@ -127,10 +151,20 @@ def consistency_checks_overall(df):
 
         raise_error(message)
 
+    # when there is more than one survey at the same Location ID,
+    # each has a different Observation ID
+    message = 'Keeping only last Observation ID at each Location ID (assuming this is the latest survey)'
+    print(message)
+
+    df.drop_duplicates(['Location ID'], keep='last',inplace=True)
+    # update the number of rows
+    num_rows = df.shape[0]
+
     message = 'checking each row is a unique entry (assuming Location ID tells us this)'
     verboseprint(message)
     try:
-        num_ids = df.loc[:,'Location ID'].unique().size
+        ids = df.loc[:,'Location ID']
+        num_ids = ids.unique().size
         assert num_ids == num_rows
     except AssertionError as error_type:
         raise_error(message)
@@ -171,9 +205,9 @@ def valid_entries(row,coln,valid_values,message=None):
 
         raise_error(message)
 
-def valid_entry_span(row,coln,valid_span,message=None):
+def valid_entry_span(row,coln,valid_span,alt_value=None,message=None):
     '''Given a row of pandas data, and column name, checks that column value
-    is in list of valid values.'''
+    is within a valid span of values.'''
 
     if message is None:
         message = ''
 
@@ -181,8 +215,23 @@ def valid_entry_span(row,coln,valid_span,message=None):
     message += 'it actually returns {:s}'.format(str(row[coln]))
     verboseprint(message)
 
+    #if coln=='FieldArea' and row[coln]==-999.99: raise Exception
+
     try:
-        assert valid_span[0] <= row[coln] <= valid_span[1]
+        # check the value is a number (a blank string i.e. '' fails this test)
+        within_span = isinstance(row[coln],float)
+
+        # if a number, check it is within the given range
+        if within_span:
+            within_span = valid_span[0] <= row[coln] <= valid_span[1]
+
+        if alt_value is None:
+            assert within_span
+        elif not within_span:
+
+            message += '\nreplacing with default {:f}'.format(alt_value)
+            raise_warning(message,counting=True)
+            row[coln] = alt_value
 
     except AssertionError as error_type:
 
@@ -213,10 +262,58 @@ def consistent_maps(row,coln1,coln2,case1_to_2,message=None):
 
     except AssertionError as error_type:
 
-        raise_error(message)
+        #raise_error(message)
+        raise_warning(message,counting=True)
 
     return
 
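+# Illustrative example of the span check above (values assumed from the
+# limits defined later in this script): with valid_span=[0.001,100] and
+# alt_value=0.01, a FieldArea entry of -999.99 is replaced by 0.01 and a
+# warning is counted; with alt_value=None the same entry would stop the
+# script via raise_error.
+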
+def consistent_maps_with_replacement(row,coln1,coln2,case1_to_2,case2_remap1,message=None):
+    '''Checks that the contents of two columns follow the expected mapping.
+    If there is a mismatch, the coln1 value is rewritten based on the coln2 value.
+
+    case1_to_2 gives the expected mappings between coln1 and coln2 values.
+    When they do not map, case2_remap1 tells us how to modify the coln1 value
+    according to the coln2 value.
+    e.g.
+    case2_remap1 = {}
+    case2_remap1[-9] = -9
+    case2_remap1[0] = 0
+    case2_remap1[1] = 2
+    case2_remap1[2] = 2
+    case2_remap1[3] = 2
+    '''
+
+    # this case could be replaced with a dict mapping outcome of column 1 to expected outcome of column 2
+    case1 = row[coln1]
+    case2 = row[coln2]
+
+    if case1 in case1_to_2:
+        expect_case2 = case1_to_2[case1]
+    else:
+        # not expecting consistent behaviour so skip
+        return
+
+    if message is None: message = ''
+    message += '{:s} returns {:s}, so expected {:s} to return {:s}, '.format(coln1,str(case1),coln2,str(expect_case2))
+    message += 'it actually returns {:s}'.format(str(case2))
+
+    verboseprint(message)
+
+
+    if case2 == expect_case2:
+        return
+
+    elif case2 in case2_remap1:
+        newval = case2_remap1[case2]
+        message += '\nreplacing {:s} with {:s}'.format(coln1,str(newval))
+        row[coln1] = newval
+
+    else:
+        message += '\nno remapping defined for this value'
+        raise_error(message)
+
+    raise_warning(message,counting=True)
+
+    return
 
 # defining data structure for consistenty check
 
@@ -237,10 +334,16 @@
 
 # defining expected value maps between columns
 # This is a list of 3-value tuples: column name1, column name2, and dict that maps between them
 expected_maps = []
+# for cases where it is appropriate to make a modification for consistency
+expected_maps_with_replacement = []
 
 # survey site number and name
 expected_siteid_to_sitename = {}
 expected_siteid_to_sitename[1]='Farmer field'
+expected_siteid_to_sitename[2]='Weed'
+expected_siteid_to_sitename[3]='Road side'
+expected_siteid_to_sitename[4]='Trial'
+expected_siteid_to_sitename[-9]='N/A'
 expected_maps += [('SurveySiteID','SurveySiteName',expected_siteid_to_sitename)]
 
 # some other values are probably ok, but not encountered yet
@@ -248,16 +351,17 @@ expected_maps += [('SurveySiteID','SurveySiteName',expected_siteid_to_sitename)]
 expected_growthid_to_growthname = {}
 expected_growthid_to_growthname[1]='Tillering'
 expected_growthid_to_growthname[2]='Boot'
+expected_growthid_to_growthname[7]='Heading' # note ID is out of order
 expected_growthid_to_growthname[3]='Flowering'
 expected_growthid_to_growthname[4]='Milk'
 expected_growthid_to_growthname[5]='Dough'
-expected_growthid_to_growthname[6]='Milk'
-expected_growthid_to_growthname[7]='Heading'
+expected_growthid_to_growthname[6]='Maturity'
 expected_growthid_to_growthname[-9]='N/A'
 expected_maps += [('GrowthStageID','GrowthStageName',expected_growthid_to_growthname)]
 
 # severity match severity name
 expected_severity_to_severityname = {}
+expected_severity_to_severityname[-9]='N/A'
 expected_severity_to_severityname[0]='None (0)'
 expected_severity_to_severityname[1]='Low (less than 20 %)'
 expected_severity_to_severityname[2]='Moderate (20 - 40 %)'
@@ -266,25 +370,72 @@ expected_maps += [('Severity','SeverityName',expected_severity_to_severityname)]
 
 # incidence match incidence name
 expected_incidence_to_incidencename = {}
+expected_incidence_to_incidencename[-9]='N/A'
 expected_incidence_to_incidencename[0]='None (0)'
 expected_incidence_to_incidencename[1]='Low (less than 20 %)'
 expected_incidence_to_incidencename[2]='Moderate (20 - 40 %)'
 expected_incidence_to_incidencename[3]='High (more than 40 %)'
 expected_maps += [('Incidence','IncidenceName',expected_incidence_to_incidencename)]
 
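+# For example, a row with GrowthStageID=2 is expected to have
+# GrowthStageName='Boot'; since consistent_maps now raises a counted
+# warning rather than exiting, a mismatching name is reported but kept.
+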
-# severity and incidence match when null
+# severity and incidence match when None or N/A.
+# when only one of them is None or N/A, change the other value to be consistent
 expected_severity_to_incidence = {}
+expected_severity_to_incidence[-9]=-9
 expected_severity_to_incidence[0]=0
-expected_maps += [('Severity','Incidence',expected_severity_to_incidence)]
+
+remap_severity_from_incidence = {}
+remap_severity_from_incidence[-9] = -9
+remap_severity_from_incidence[0] = 0
+remap_severity_from_incidence[1] = 2
+remap_severity_from_incidence[2] = 2
+remap_severity_from_incidence[3] = 2
+
+expected_maps_with_replacement += [('Severity','Incidence',expected_severity_to_incidence, remap_severity_from_incidence)]
+#expected_maps_with_replacement += [('Severity.1','Incidence.1',expected_severity_to_incidence)] # leaf rust not important
+expected_maps_with_replacement += [('Severity.2','Incidence.2',expected_severity_to_incidence,remap_severity_from_incidence)]
+
+expected_incidence_to_severity = {v:k for k,v in expected_severity_to_incidence.items()}
+
+remap_incidence_from_severity = remap_severity_from_incidence
+
+expected_maps_with_replacement += [('Incidence','Severity',expected_incidence_to_severity, remap_incidence_from_severity)]
+#expected_maps_with_replacement += [('Incidence.1','Severity.1',expected_incidence_to_severity)] # leaf rust not important
+expected_maps_with_replacement += [('Incidence.2','Severity.2',expected_incidence_to_severity, remap_incidence_from_severity)]
+
+# do the same with the names of severity and incidence
+
+expected_severityname_to_incidencename = {}
+expected_severityname_to_incidencename['N/A']='N/A'
+expected_severityname_to_incidencename['None (0)']='None (0)'
+
+remap_severityname_from_incidencename = {}
+remap_severityname_from_incidencename['N/A'] = 'N/A'
+remap_severityname_from_incidencename['None (0)'] = 'None (0)'
+remap_severityname_from_incidencename['Low (less than 20 %)'] = 'Moderate (20 - 40 %)'
+remap_severityname_from_incidencename['Moderate (20 - 40 %)'] = 'Moderate (20 - 40 %)'
+remap_severityname_from_incidencename['High (more than 40 %)'] = 'Moderate (20 - 40 %)'
+
+expected_maps_with_replacement += [('SeverityName','IncidenceName',expected_severityname_to_incidencename, remap_severityname_from_incidencename)]
+#expected_maps_with_replacement += [('SeverityName.1','IncidenceName.1',expected_severityname_to_incidencename)] # leaf rust not important
+expected_maps_with_replacement += [('SeverityName.2','IncidenceName.2',expected_severityname_to_incidencename,remap_severityname_from_incidencename)]
+
+expected_incidencename_to_severityname = {v:k for k,v in expected_severityname_to_incidencename.items()}
+
+remap_incidencename_from_severityname = remap_severityname_from_incidencename
+
+expected_maps_with_replacement += [('IncidenceName','SeverityName',expected_incidencename_to_severityname, remap_incidencename_from_severityname)]
+#expected_maps_with_replacement += [('IncidenceName.1','SeverityName.1',expected_incidencename_to_severityname)] # leaf rust not important
+expected_maps_with_replacement += [('IncidenceName.2','SeverityName.2',expected_incidencename_to_severityname, remap_incidencename_from_severityname)]
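+
+# Worked example of the remapping above (row values assumed): Severity=0
+# with Incidence=2 violates expected_severity_to_incidence, so
+# remap_severity_from_incidence[2] rewrites Severity to 2 and a warning is
+# counted.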
 
 # infection stage id and name
 expected_infectionid_to_infectionname = {}
-#expected_infectionid_to_infectionname[1]='?' # not encountered yet
-#expected_infectionid_to_infectionname[2]='?'
+expected_infectionid_to_infectionname[1]='R'
+expected_infectionid_to_infectionname[2]='R-MR'
 expected_infectionid_to_infectionname[3]='MR'
-#expected_infectionid_to_infectionname[4]='?'
+expected_infectionid_to_infectionname[4]='MR-M'
 expected_infectionid_to_infectionname[5]='M (MR-MS)'
-#expected_infectionid_to_infectionname[6]='?'
+expected_infectionid_to_infectionname[6]='M-MS'
 expected_infectionid_to_infectionname[7]='MS'
 expected_infectionid_to_infectionname[8]='MS-S'
 expected_infectionid_to_infectionname[9]='S'
@@ -311,7 +462,7 @@ expected_col_values['SurveySiteName'] = expected_siteid_to_sitename.values() # s
 expected_col_values['GrowthStageID'] = expected_growthid_to_growthname.keys()
 expected_col_values['GrowthStageName'] = expected_growthid_to_growthname.values()
 
-print('Expect information on only three diseases: stem, lead and yellow rusts')
+print('Expect information on only three diseases: stem, leaf and yellow rusts')
 expected_col_values['Disease ID'] = [1]
 expected_col_values['Disease ID.1'] = [2]
 expected_col_values['Disease ID.2'] = [3]
@@ -338,17 +489,31 @@ expected_col_values['InfectionType.2'] = expected_col_values['InfectionType']
 expected_col_values['InfectionTypeName.2'] = expected_col_values['InfectionTypeName']
 expected_col_values['PathogenSpeciesName'] = expected_diseaseid_to_pathogenname.values()
 
-expected_col_values['CountryName'] = ['Ethiopia']
-expected_col_values['ObsYear'] = [2019]
+
+expected_col_values['ObsYear'] = range(2011,2019+1)
 
 # defining expected limits of values in certain columns
 expected_col_value_limits = {}
-expected_col_value_limits['FieldArea'] = [0.1,100]
-expected_col_value_limits['Altitude'] = [0,5000]
-#expected_col_value_limits['Longitude'] = [32.,48.1] # limits of the country of Ethiopia
-#expected_col_value_limits['Latitude'] = [3.,15.] # limits of the country of Ethiopia
-expected_col_value_limits['Longitude'] = [34.9,43.] # limits of the fixed shapefile of sources for Ethiopia
-expected_col_value_limits['Latitude'] = [4.7,15.] # limits of the fixed shapefile of sources for Ethiopia
+expected_col_value_limits['FieldArea'] = [0.001,100]
+expected_col_value_limits['Altitude'] = [-1000,5000]
+
+# defining values to be applied if outside of expected limits.
+expected_col_value_alternative = {}
+expected_col_value_alternative['FieldArea'] = 0.01
+
+if region == 'Ethiopia':
+    expected_col_values['CountryName'] = ['Ethiopia']
+
+    #expected_col_value_limits['Longitude'] = [32.,48.1] # limits of the country of Ethiopia
+    #expected_col_value_limits['Latitude'] = [3.,15.] # limits of the country of Ethiopia
+    expected_col_value_limits['Longitude'] = [34.9,43.] # limits of the fixed shapefile of sources for Ethiopia
+    expected_col_value_limits['Latitude'] = [4.7,15.] # limits of the fixed shapefile of sources for Ethiopia
+
+elif region == 'SouthAsia':
+    expected_col_values['CountryName'] = ['Bangladesh','Bhutan','India','Nepal','Pakistan']
+
+    expected_col_value_limits['Longitude'] = [65.,98.] # limits of the fixed shapefile of sources for South Asia
+    expected_col_value_limits['Latitude'] = [7.,36.] # limits of the fixed shapefile of sources for South Asia
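+
+# e.g. (illustrative) setting region = 'SouthAsia' at the top of the script
+# selects the South Asian country list and coordinate bounds defined above.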
 
 
 # run consistency checks
 
@@ -368,17 +533,22 @@ def consistent_fields_per_row(row):
 
     # check entries in defined columns are within expected limits
     for coln,valid_span in expected_col_value_limits.items():
 
-        valid_entry_span(row,coln,valid_span,message=message)
+        valid_entry_span(row,coln,valid_span,alt_value=expected_col_value_alternative.get(coln,None),message=message)
 
     # check values in column correspond to value occurring in another
     for coln1,coln2,namemap in expected_maps:
 
         consistent_maps(row,coln1,coln2,namemap,message=message)
 
-    return
+    # some other column comparisons, where mismatches require modification
+    for coln1,coln2,namemap,remap in expected_maps_with_replacement:
+
+        consistent_maps_with_replacement(row,coln1,coln2,namemap,remap,message=message)
+
+    return row
 
 print('Checking consistent entries per row')
-df.apply(consistent_fields_per_row,axis=1)
+df2 = df.apply(consistent_fields_per_row,axis=1)
+df = df2
 
 # define functions to convert to new dataframe, matching example csv.
 
@@ -397,7 +567,7 @@ def convert_ObsDate_to_datestr(series):
 
         # excel date format was read successfully
         # no need to convert from integer
-        datestr = series.apply(lambda date: date.strftime('%d-%b-%Y'))
+        datestr = series.dt.strftime('%d-%b-%Y')
 
         pass
 
@@ -419,7 +589,8 @@ def convert_severity(series):
         'Low (less than 20 %)':'10',
         'Moderate (20 - 40 %)':'30',
         'High (more than 40 %)':'50',
-        'None (0)':'0'}
+        'None (0)':'0',
+        'N/A':'na'}
 
     return series.map(map_severity_in_out)
 
@@ -429,7 +600,8 @@ def convert_incidence(series):
         'Low (less than 20 %)':'low',
         'Moderate (20 - 40 %)':'medium',
         'High (more than 40 %)':'high',
-        'None (0)':'none'}
+        'None (0)':'none',
+        'N/A':'na'}
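+    # e.g. this maps 'None (0)' -> 'none' and 'N/A' -> 'na'; labels missing
+    # from the dict become NaN (standard pandas Series.map behaviour)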
 
     return series.map(map_severity_in_out)
 
@@ -615,9 +787,9 @@ expected_map_output_input_function = {
     'surveyor_infromation-country':do_nothing,
     'survey_infromation-location-Latitude':do_nothing,
     'survey_infromation-location-Longitude':do_nothing,
-    'survey_infromation-location-Altitude':do_nothing, 
+    'survey_infromation-location-Altitude':do_nothing,
     'survey_infromation-location_name':do_nothing,
-    'site_information-field_area':do_nothing, 
+    'site_information-field_area':do_nothing, # TODO: replace NaN with fill value
     'site_information-survey_site':do_nothing,
     'surveyor_infromation-surveyor_name':do_nothing,
     'surveyor_infromation-institution':do_nothing,
@@ -725,52 +897,14 @@ for outcoln in default_output_cols:
 
 dfout = dfout.astype(expected_col_formats)
 
-# if writing its own file:
-#dfout.to_csv(outputFileName,index=False)
-
-# if appending
-
-#print('Appending rows from {:s} to {:s}'.format(excelDataFilename,outputFileName))
-
-# TODO check for duplicate rows in final file
-# This gets pretty complicated because:
-# Some duplicated entries have different dates, in July-August
-
-
-#df_original = pd.read_csv(outputFileName)#,na_filter=False)
-
-cols_to_check = [
-    #'surveyor_infromation-country',
-    #'surveyor_infromation-surveyor_name',
-    #'surveyor_infromation-institution',
-    #'survey_infromation-location_name',
-    #'survey_infromation-location-Latitude',
-    #'survey_infromation-location-Longitude',
-    'survey_infromation-location-Altitude',
-    #'survey_infromation-location-Accuracy',
-    #'survey_infromation-survey_date',
-    'site_information-survey_site',
-    #'site_information-crop',
-    'site_information-field_area',
-    'site_information-variety',
-    'site_information-growth_stage',
-    'yellow_rust-yellowrust_severity']
-
-
-#def duplicate_close(daf):
-#    ''' Trying to work like pd.DataFrame.duplciated() but using np.isclose.'''
-#    pass
-#    return
-
-#dfm = df_original.append(dfout)
-
-#duplicate_rows = np.where(dfm.duplicated(subset=cols_to_check))
 
+# append/write to file
+with open(outputFileName, 'a') as f:
 
-# remove them from dataframe to be appended
+    # write the header only if the file is new/empty
+    header = f.tell()==0
+    dfout.to_csv(f,header=header,index=False)
 
-# if merging indiscriminately
-with open(outputFileName, 'a') as f:
-    dfout.to_csv(f,header=False,index=False)
+print('{:d} warnings'.format(warning_counter))
 
 print('end of script')
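+
+# Note on the append logic above (assumed Python file semantics): opening
+# with mode 'a' positions the stream at end-of-file, so f.tell()==0 only
+# when the output file is new or empty, which is exactly when the header
+# row should be written.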