# SurveyDataPreprocessor.R

  
 
options(stringsAsFactors = FALSE)

library(dplyr)

# Survey data preprocessor.
#
# Reads three CSV files, removes from the original table every row that matches
# a row of the deletion table, appends the rows of the addition table, and
# writes the combined table to the output file.
#
# Positional command-line arguments:
#   1. original data CSV
#   2. CSV of rows to delete
#   3. CSV of rows to add
#   4. output CSV path

args <- commandArgs(trailingOnly = TRUE)

# Fail early with a usage message; otherwise a missing argument becomes NA and
# only surfaces later as an obscure error inside read.csv()/write.csv().
if (length(args) < 4) {
  stop(
    "Expected 4 arguments: <originalData.csv> <deleteRows.csv> ",
    "<addRows.csv> <output.csv>",
    call. = FALSE
  )
}

originalDataFileName <- args[1]
print(paste0("ORIGINAL DATA: ", originalDataFileName))

deleteRowsFileName <- args[2]
print(paste0("DELETE DATA: ", deleteRowsFileName))

addRowsFileName <- args[3]
print(paste0("ADD DATA: ", addRowsFileName))

outputFileName <- args[4]
print(paste0("OUTPUT DATA: ", outputFileName))


# Read a CSV with every column as character and with column names untouched.
# - colClasses = "character": type inference on the (much smaller) deletion
#   table can differ from the original table, and dplyr then cannot match rows
#   between the two tables.
# - check.names = FALSE: otherwise characters in the column names are altered
#   and no longer match what downstream elements require.
readCharacterCsv <- function(fileName) {
  read.csv(
    fileName,
    header = TRUE,
    check.names = FALSE,
    colClasses = "character"
  )
}

originalData <- readCharacterCsv(originalDataFileName)
print(paste0("Original rows: ", nrow(originalData)))

# Remove unwanted rows:
deleteData <- readCharacterCsv(deleteRowsFileName)
print(paste0("Deletion rows: ", nrow(deleteData)))

# Join on the columns the two tables share (the same keys dplyr would pick
# implicitly), stated explicitly so both joins below use identical keys.
joinColumns <- intersect(names(originalData), names(deleteData))
if (length(joinColumns) == 0) {
  stop(
    "The original and deletion tables have no column names in common",
    call. = FALSE
  )
}

cleanData <- anti_join(originalData, deleteData, by = joinColumns)
print(paste0("Clean rows: ", nrow(cleanData)))

# Warn about deletion rows that matched nothing in the original data.
# Counted directly rather than from row-count arithmetic: one deletion row can
# remove several duplicate original rows, and duplicated deletion rows remove
# nothing extra, so differences of nrow() can be negative or cancel out.
nRowNotDeleted <- nrow(anti_join(deleteData, originalData, by = joinColumns))
if (nRowNotDeleted != 0) {
  print(paste0("WARNING: Did not find a match for ", nRowNotDeleted, " rows in the deletion table"))
}

# Add extra rows:
extraData <- readCharacterCsv(addRowsFileName)
print(paste0("Extra rows: ", nrow(extraData)))

# rbind() requires the extra rows to have the same columns as the cleaned data.
joinedData <- rbind(cleanData, extraData)
print(paste0("Joined rows: ", nrow(joinedData)))


# Write output:
write.csv(joinedData, file = outputFileName, row.names = FALSE, quote = TRUE)