diff --git a/survey_reformatter/survey_reformatter.py b/survey_reformatter/survey_reformatter.py index 6ea8872baf5939c908fe20efbaf776ed6aaff4d1..e1bbc1bacd6dab95e5c6de4231d64706e1b7cf68 100644 --- a/survey_reformatter/survey_reformatter.py +++ b/survey_reformatter/survey_reformatter.py @@ -1,25 +1,44 @@ #survey_reformatter.py ''' -Takes xlsx files containing additional surveillance data provided by Yoseph and creates entries to the SurveyDataExtraToAdd.csv in Ethiopia Early Warning System. +Takes an xlsx file containing additional surveillance data provided by Yoseph and creates entries formatted for the Early Warning System clustering.R script. -Currently only checks for consistency (partially complete). +The region option can be 'Ethiopia' or 'SouthAsia'. -Not yet implemented reformatting to match example_SurveyDataExtraToAdd.csv. +verbose and verbose_warning are boolean options. -Some expectations can be configured, see variable names starting 'expected_'. +Checks for consistency (fairly thorough, but can be improved), then reformats. + +Some consistency checks can modify values. + +Some expectations can be configured by modifying the variables whose names start with 'expected_'. + +Input and output filenames are provided as command-line arguments. + +Output is appended to a new or existing file. ''' print('Check this is running in python3') -print('Warning: Septoria is not expected from input file') import pandas as pd import numpy as np import sys +region = 'Ethiopia' + +if not len(sys.argv) == 3: +    exit("ERROR: Should supply exactly two arguments: target excel data file name and output file name. \nProgram called as: "+str(sys.argv)) + +excelDataFilename = sys.argv[1] + +outputFileName = sys.argv[2] + +#excelDataFilename = '20190902_ETH_YosephA.xlsx' +#excelDataFilename = '20190905_ETH_YosephA.xlsx' +#outputFileName = 'test.csv' +#outputFileName = 'SurveyDataExtraToAddcopy.csv'
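+
+# Example invocation (illustrative; these filenames are assumptions):
+#   python3 survey_reformatter.py 20190902_ETH_YosephA.xlsx SurveyDataExtraToAdd.csv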
+
 # If you want to check all of the comparisons being tested
 # (e.g. if checking a new addition is now being used)
 # set verbose to True
 verbose=False
+verbose_warning=True
 
 if verbose:
     def verboseprint(message):
@@ -31,14 +50,6 @@ else:
 
-if not len(sys.argv) == 3:
-    exit("ERROR: Should supply exactly two arguments: target excel data file name and output file name. \nProgram called as: "+str(sys.argv))
-
-excelDataFilename = sys.argv[1]
-outputFileName = sys.argv[2]
-
-#excelDataFilename = '20190902_ETH_YosephA.xlsx'
-#excelDataFilename = '20190905_ETH_YosephA.xlsx'
-#outputFileName = 'test.csv'
-#outputFileName = 'SurveyDataExtraToAddcopy.csv'
-
 # define the formats of columns
 
 expected_num_cols = 53
 
@@ -100,17 +111,30 @@ expected_col_formats = {
 expected_cols = expected_col_formats.keys()
 
 # open the file
-df = pd.read_excel(excelDataFilename, converters = expected_col_formats, na_filter=False)
+df = pd.read_excel(excelDataFilename, converters = expected_col_formats, na_filter=False, parse_dates=['ObsDate'])
 
 # assume there are only two dimensions
 num_rows, num_cols = df.shape
 
 # for reporting errors
 def raise_error(message):
-    print('Error raised during')
-    print(message)
+    error_str = 'Error raised during '
+    print(error_str + message)
     quit()
 
+warning_counter = 0
+
+def raise_warning(message,counting=True):
+
+    global warning_counter
+    if counting: warning_counter += 1
+
+    warning_str = 'Warning {:d} raised: '.format(warning_counter)
+    if verbose_warning:
+        print(warning_str + message)
+
+raise_warning('Septoria is not expected from input file')
+
 # functions for consistenty check
 
@@ -127,10 +151,20 @@ def consistency_checks_overall(df):
 
         raise_error(message)
 
+    # when there is more than one survey at the same Location ID,
+    # each has a different Observation ID
+    message = 'Keeping only last Observation ID at each Location ID (assuming this is the latest survey)'
+    print(message)
+
+    df.drop_duplicates(['Location ID'], keep='last',inplace=True)
+    # update the number of rows
+    num_rows = df.shape[0]
+
     message = 'checking each row is a unique entry (assuming Location ID tells us this)'
     verboseprint(message)
     try:
-        num_ids = df.loc[:,'Location ID'].unique().size
+        ids = df.loc[:,'Location ID']
+        num_ids = ids.unique().size
         assert num_ids == num_rows
     except AssertionError as error_type:
         raise_error(message)
@@ -171,9 +205,9 @@ def valid_entries(row,coln,valid_values,message=None):
 
         raise_error(message)
 
-def valid_entry_span(row,coln,valid_span,message=None):
+def valid_entry_span(row,coln,valid_span,alt_value=None,message=None):
     '''Given a row of pandas data, and column name, checks that column value
-    is in list of valid values.'''
+    is within a valid span of values.'''
 
     if message is None:
         message = ''
 
@@ -181,8 +215,23 @@ def valid_entry_span(row,coln,valid_span,message=None):
     message += 'it actually returns {:s}'.format(str(row[coln]))
     verboseprint(message)
 
+    #if coln=='FieldArea' and row[coln]==-999.99: raise Exception
+
     try:
-        assert valid_span[0] <= row[coln] <= valid_span[1]
+        # check the value is a number (a blank string i.e. '' fails this test)
+        within_span = isinstance(row[coln],float)
+
+        # if a number, check it is within the given range
+        if within_span:
+            within_span = valid_span[0] <= row[coln] <= valid_span[1]
+
+        if alt_value is None:
+            assert within_span
+        elif not within_span:
+
+            message += '\nreplacing with default {:f}'.format(alt_value)
+            raise_warning(message,counting=True)
+            row[coln] = alt_value
 
     except AssertionError as error_type:
 
@@ -213,10 +262,58 @@ def consistent_maps(row,coln1,coln2,case1_to_2,message=None):
 
     except AssertionError as error_type:
 
-        raise_error(message)
+        #raise_error(message)
+        raise_warning(message,counting=True)
 
     return
 
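+# Illustrative example of the span check above (values assumed from the
+# limits defined later in this script): with valid_span=[0.001,100] and
+# alt_value=0.01, a FieldArea entry of -999.99 is replaced by 0.01 and a
+# warning is counted; with alt_value=None the same entry would stop the
+# script via raise_error.
+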
+def consistent_maps_with_replacement(row,coln1,coln2,case1_to_2,case2_remap1,message=None):
+    '''Checks that the contents of two columns follow the expected mapping.
+    If there is a mismatch, the coln1 value is rewritten based on the coln2 value.
+
+    case1_to_2 gives the expected mappings between coln1 and coln2 values.
+    When they do not map, case2_remap1 tells us how to modify the coln1 value
+    according to the coln2 value.
+    e.g.
+    case2_remap1 = {}
+    case2_remap1[-9] = -9
+    case2_remap1[0] = 0
+    case2_remap1[1] = 2
+    case2_remap1[2] = 2
+    case2_remap1[3] = 2
+    '''
+
+    # this case could be replaced with a dict mapping outcome of column 1 to expected outcome of column 2
+    case1 = row[coln1]
+    case2 = row[coln2]
+
+    if case1 in case1_to_2:
+        expect_case2 = case1_to_2[case1]
+    else:
+        # not expecting consistent behaviour so skip
+        return
+
+    if message is None: message = ''
+    message += '{:s} returns {:s}, so expected {:s} to return {:s}, '.format(coln1,str(case1),coln2,str(expect_case2))
+    message += 'it actually returns {:s}'.format(str(case2))
+
+    verboseprint(message)
+
+
+    if case2 == expect_case2:
+        return
+
+    elif case2 in case2_remap1:
+        newval = case2_remap1[case2]
+        message += '\nreplacing {:s} with {:s}'.format(coln1,str(newval))
+        row[coln1] = newval
+
+    else:
+        message += '\nno remapping defined for this value'
+        raise_error(message)
+
+    raise_warning(message,counting=True)
+
+    return
 
 # defining data structure for consistenty check
 
@@ -237,10 +334,16 @@
 
 # defining expected value maps between columns
 # This is a list of 3-value tuples: column name1, column name2, and dict that maps between them
 expected_maps = []
+# for cases where it is appropriate to make a modification for consistency
+expected_maps_with_replacement = []
 
 # survey site number and name
 expected_siteid_to_sitename = {}
 expected_siteid_to_sitename[1]='Farmer field'
+expected_siteid_to_sitename[2]='Weed'
+expected_siteid_to_sitename[3]='Road side'
+expected_siteid_to_sitename[4]='Trial'
+expected_siteid_to_sitename[-9]='N/A'
 expected_maps += [('SurveySiteID','SurveySiteName',expected_siteid_to_sitename)]
 
 # some other values are probably ok, but not encountered yet
@@ -248,16 +351,17 @@ expected_maps += [('SurveySiteID','SurveySiteName',expected_siteid_to_sitename)]
 expected_growthid_to_growthname = {}
 expected_growthid_to_growthname[1]='Tillering'
 expected_growthid_to_growthname[2]='Boot'
+expected_growthid_to_growthname[7]='Heading' # note ID is out of order
 expected_growthid_to_growthname[3]='Flowering'
 expected_growthid_to_growthname[4]='Milk'
 expected_growthid_to_growthname[5]='Dough'
-expected_growthid_to_growthname[6]='Milk'
-expected_growthid_to_growthname[7]='Heading'
+expected_growthid_to_growthname[6]='Maturity'
 expected_growthid_to_growthname[-9]='N/A'
 expected_maps += [('GrowthStageID','GrowthStageName',expected_growthid_to_growthname)]
 
 # severity match severity name
 expected_severity_to_severityname = {}
+expected_severity_to_severityname[-9]='N/A'
 expected_severity_to_severityname[0]='None (0)'
 expected_severity_to_severityname[1]='Low (less than 20 %)'
 expected_severity_to_severityname[2]='Moderate (20 - 40 %)'
@@ -266,25 +370,72 @@ expected_maps += [('Severity','SeverityName',expected_severity_to_severityname)]
 
 # incidence match incidence name
 expected_incidence_to_incidencename = {}
+expected_incidence_to_incidencename[-9]='N/A'
 expected_incidence_to_incidencename[0]='None (0)'
 expected_incidence_to_incidencename[1]='Low (less than 20 %)'
 expected_incidence_to_incidencename[2]='Moderate (20 - 40 %)'
 expected_incidence_to_incidencename[3]='High (more than 40 %)'
 expected_maps += [('Incidence','IncidenceName',expected_incidence_to_incidencename)]
 
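+# For example, a row with GrowthStageID=2 is expected to have
+# GrowthStageName='Boot'; since consistent_maps now raises a counted
+# warning rather than exiting, a mismatching name is reported but kept.
+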
-# severity and incidence match when null
+# severity and incidence match when None or N/A.
+# when only one of them is None or N/A, change the other value to be consistent
 expected_severity_to_incidence = {}
+expected_severity_to_incidence[-9]=-9
 expected_severity_to_incidence[0]=0
-expected_maps += [('Severity','Incidence',expected_severity_to_incidence)]
+
+remap_severity_from_incidence = {}
+remap_severity_from_incidence[-9] = -9
+remap_severity_from_incidence[0] = 0
+remap_severity_from_incidence[1] = 2
+remap_severity_from_incidence[2] = 2
+remap_severity_from_incidence[3] = 2
+
+expected_maps_with_replacement += [('Severity','Incidence',expected_severity_to_incidence, remap_severity_from_incidence)]
+#expected_maps_with_replacement += [('Severity.1','Incidence.1',expected_severity_to_incidence)] # leaf rust not important
+expected_maps_with_replacement += [('Severity.2','Incidence.2',expected_severity_to_incidence,remap_severity_from_incidence)]
+
+expected_incidence_to_severity = {v:k for k,v in expected_severity_to_incidence.items()}
+
+remap_incidence_from_severity = remap_severity_from_incidence
+
+expected_maps_with_replacement += [('Incidence','Severity',expected_incidence_to_severity, remap_incidence_from_severity)]
+#expected_maps_with_replacement += [('Incidence.1','Severity.1',expected_incidence_to_severity)] # leaf rust not important
+expected_maps_with_replacement += [('Incidence.2','Severity.2',expected_incidence_to_severity, remap_incidence_from_severity)]
+
+# do the same with the names of severity and incidence
+
+expected_severityname_to_incidencename = {}
+expected_severityname_to_incidencename['N/A']='N/A'
+expected_severityname_to_incidencename['None (0)']='None (0)'
+
+remap_severityname_from_incidencename = {}
+remap_severityname_from_incidencename['N/A'] = 'N/A'
+remap_severityname_from_incidencename['None (0)'] = 'None (0)'
+remap_severityname_from_incidencename['Low (less than 20 %)'] = 'Moderate (20 - 40 %)'
+remap_severityname_from_incidencename['Moderate (20 - 40 %)'] = 'Moderate (20 - 40 %)'
+remap_severityname_from_incidencename['High (more than 40 %)'] = 'Moderate (20 - 40 %)'
+
+expected_maps_with_replacement += [('SeverityName','IncidenceName',expected_severityname_to_incidencename, remap_severityname_from_incidencename)]
+#expected_maps_with_replacement += [('SeverityName.1','IncidenceName.1',expected_severityname_to_incidencename)] # leaf rust not important
+expected_maps_with_replacement += [('SeverityName.2','IncidenceName.2',expected_severityname_to_incidencename,remap_severityname_from_incidencename)]
+
+expected_incidencename_to_severityname = {v:k for k,v in expected_severityname_to_incidencename.items()}
+
+remap_incidencename_from_severityname = remap_severityname_from_incidencename
+
+expected_maps_with_replacement += [('IncidenceName','SeverityName',expected_incidencename_to_severityname, remap_incidencename_from_severityname)]
+#expected_maps_with_replacement += [('IncidenceName.1','SeverityName.1',expected_incidencename_to_severityname)] # leaf rust not important
+expected_maps_with_replacement += [('IncidenceName.2','SeverityName.2',expected_incidencename_to_severityname, remap_incidencename_from_severityname)]
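+
+# Worked example of the remapping above (row values assumed): Severity=0
+# with Incidence=2 violates expected_severity_to_incidence, so
+# remap_severity_from_incidence[2] rewrites Severity to 2 and a warning is
+# counted.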
 
 # infection stage id and name
 expected_infectionid_to_infectionname = {}
-#expected_infectionid_to_infectionname[1]='?' # not encountered yet
-#expected_infectionid_to_infectionname[2]='?'
+expected_infectionid_to_infectionname[1]='R'
+expected_infectionid_to_infectionname[2]='R-MR'
 expected_infectionid_to_infectionname[3]='MR'
-#expected_infectionid_to_infectionname[4]='?'
+expected_infectionid_to_infectionname[4]='MR-M'
 expected_infectionid_to_infectionname[5]='M (MR-MS)'
-#expected_infectionid_to_infectionname[6]='?'
+expected_infectionid_to_infectionname[6]='M-MS'
 expected_infectionid_to_infectionname[7]='MS'
 expected_infectionid_to_infectionname[8]='MS-S'
 expected_infectionid_to_infectionname[9]='S'
@@ -311,7 +462,7 @@ expected_col_values['SurveySiteName'] = expected_siteid_to_sitename.values() # s
 expected_col_values['GrowthStageID'] = expected_growthid_to_growthname.keys()
 expected_col_values['GrowthStageName'] = expected_growthid_to_growthname.values()
 
-print('Expect information on only three diseases: stem, lead and yellow rusts')
+print('Expect information on only three diseases: stem, leaf and yellow rusts')
 expected_col_values['Disease ID'] = [1]
 expected_col_values['Disease ID.1'] = [2]
 expected_col_values['Disease ID.2'] = [3]
@@ -338,17 +489,31 @@ expected_col_values['InfectionType.2'] = expected_col_values['InfectionType']
 expected_col_values['InfectionTypeName.2'] = expected_col_values['InfectionTypeName']
 expected_col_values['PathogenSpeciesName'] = expected_diseaseid_to_pathogenname.values()
 
-expected_col_values['CountryName'] = ['Ethiopia']
-expected_col_values['ObsYear'] = [2019]
+
+expected_col_values['ObsYear'] = range(2011,2019+1)
 
 # defining expected limits of values in certain columns
 expected_col_value_limits = {}
-expected_col_value_limits['FieldArea'] = [0.1,100]
-expected_col_value_limits['Altitude'] = [0,5000]
-#expected_col_value_limits['Longitude'] = [32.,48.1] # limits of the country of Ethiopia
-#expected_col_value_limits['Latitude'] = [3.,15.] # limits of the country of Ethiopia
-expected_col_value_limits['Longitude'] = [34.9,43.] # limits of the fixed shapefile of sources for Ethiopia
-expected_col_value_limits['Latitude'] = [4.7,15.] # limits of the fixed shapefile of sources for Ethiopia
+expected_col_value_limits['FieldArea'] = [0.001,100]
+expected_col_value_limits['Altitude'] = [-1000,5000]
+
+# defining values to be applied if outside of expected limits.
+expected_col_value_alternative = {}
+expected_col_value_alternative['FieldArea'] = 0.01
+
+if region == 'Ethiopia':
+    expected_col_values['CountryName'] = ['Ethiopia']
+
+    #expected_col_value_limits['Longitude'] = [32.,48.1] # limits of the country of Ethiopia
+    #expected_col_value_limits['Latitude'] = [3.,15.] # limits of the country of Ethiopia
+    expected_col_value_limits['Longitude'] = [34.9,43.] # limits of the fixed shapefile of sources for Ethiopia
+    expected_col_value_limits['Latitude'] = [4.7,15.] # limits of the fixed shapefile of sources for Ethiopia
+
+elif region == 'SouthAsia':
+    expected_col_values['CountryName'] = ['Bangladesh','Bhutan','India','Nepal','Pakistan']
+
+    expected_col_value_limits['Longitude'] = [65.,98.] # limits of the fixed shapefile of sources for South Asia
+    expected_col_value_limits['Latitude'] = [7.,36.] # limits of the fixed shapefile of sources for South Asia
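+
+# e.g. (illustrative) setting region = 'SouthAsia' at the top of the script
+# selects the South Asian country list and coordinate bounds defined above.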
 
 
 # run consistency checks
 
@@ -368,17 +533,22 @@ def consistent_fields_per_row(row):
 
     # check entries in defined columns are within expected limits
     for coln,valid_span in expected_col_value_limits.items():
 
-        valid_entry_span(row,coln,valid_span,message=message)
+        valid_entry_span(row,coln,valid_span,alt_value=expected_col_value_alternative.get(coln,None),message=message)
 
     # check values in column correspond to value occurring in another
     for coln1,coln2,namemap in expected_maps:
 
         consistent_maps(row,coln1,coln2,namemap,message=message)
 
-    return
+    # some other column comparisons, where mismatches require modification
+    for coln1,coln2,namemap,remap in expected_maps_with_replacement:
+
+        consistent_maps_with_replacement(row,coln1,coln2,namemap,remap,message=message)
+
+    return row
 
 print('Checking consistent entries per row')
-df.apply(consistent_fields_per_row,axis=1)
+df2 = df.apply(consistent_fields_per_row,axis=1)
+df = df2
 
 # define functions to convert to new dataframe, matching example csv.
 
@@ -397,7 +567,7 @@ def convert_ObsDate_to_datestr(series):
 
         # excel date format was read successfully
         # no need to convert from integer
-        datestr = series.apply(lambda date: date.strftime('%d-%b-%Y'))
+        datestr = series.dt.strftime('%d-%b-%Y')
 
         pass
 
@@ -419,7 +589,8 @@ def convert_severity(series):
         'Low (less than 20 %)':'10',
         'Moderate (20 - 40 %)':'30',
         'High (more than 40 %)':'50',
-        'None (0)':'0'}
+        'None (0)':'0',
+        'N/A':'na'}
 
     return series.map(map_severity_in_out)
 
@@ -429,7 +600,8 @@ def convert_incidence(series):
         'Low (less than 20 %)':'low',
         'Moderate (20 - 40 %)':'medium',
         'High (more than 40 %)':'high',
-        'None (0)':'none'}
+        'None (0)':'none',
+        'N/A':'na'}
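+    # e.g. this maps 'None (0)' -> 'none' and 'N/A' -> 'na'; labels missing
+    # from the dict become NaN (standard pandas Series.map behaviour)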
 
     return series.map(map_severity_in_out)
 
@@ -615,9 +787,9 @@ expected_map_output_input_function = {
     'surveyor_infromation-country':do_nothing,
     'survey_infromation-location-Latitude':do_nothing,
     'survey_infromation-location-Longitude':do_nothing,
-    'survey_infromation-location-Altitude':do_nothing, 
+    'survey_infromation-location-Altitude':do_nothing,
     'survey_infromation-location_name':do_nothing,
-    'site_information-field_area':do_nothing, 
+    'site_information-field_area':do_nothing, # TODO: replace NaN with fill value
     'site_information-survey_site':do_nothing,
     'surveyor_infromation-surveyor_name':do_nothing,
     'surveyor_infromation-institution':do_nothing,
@@ -725,52 +897,14 @@ for outcoln in default_output_cols:
 
 dfout = dfout.astype(expected_col_formats)
 
-# if writing its own file:
-#dfout.to_csv(outputFileName,index=False)
-
-# if appending
-
-#print('Appending rows from {:s} to {:s}'.format(excelDataFilename,outputFileName))
-
-# TODO check for duplicate rows in final file
-# This gets pretty complicated because:
-# Some duplicated entries have different dates, in July-August
-
-
-#df_original = pd.read_csv(outputFileName)#,na_filter=False)
-
-cols_to_check = [
-    #'surveyor_infromation-country',
-    #'surveyor_infromation-surveyor_name',
-    #'surveyor_infromation-institution',
-    #'survey_infromation-location_name',
-    #'survey_infromation-location-Latitude',
-    #'survey_infromation-location-Longitude',
-    'survey_infromation-location-Altitude',
-    #'survey_infromation-location-Accuracy',
-    #'survey_infromation-survey_date',
-    'site_information-survey_site',
-    #'site_information-crop',
-    'site_information-field_area',
-    'site_information-variety',
-    'site_information-growth_stage',
-    'yellow_rust-yellowrust_severity']
-
-
-#def duplicate_close(daf):
-#    ''' Trying to work like pd.DataFrame.duplciated() but using np.isclose.'''
-#    pass
-#    return
-
-#dfm = df_original.append(dfout)
-
-#duplicate_rows = np.where(dfm.duplicated(subset=cols_to_check))
 
+# append/write to file
+with open(outputFileName, 'a') as f:
 
-# remove them from dataframe to be appended
+    # write the header only if the file is new/empty
+    header = f.tell()==0
+    dfout.to_csv(f,header=header,index=False)
 
-# if merging indiscriminately
-with open(outputFileName, 'a') as f:
-    dfout.to_csv(f,header=False,index=False)
+print('{:d} warnings'.format(warning_counter))
 
 print('end of script')
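+
+# Note on the append logic above (assumed Python file semantics): opening
+# with mode 'a' positions the stream at end-of-file, so f.tell()==0 only
+# when the output file is new or empty, which is exactly when the header
+# row should be written.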