Survey data handles errors in ODK database and additional data sources

6c8a1573 · Richard Stutt · 571f95ca · 6c8a1573 · 6c8a1573 · 6c8a1573
Commit 6c8a1573 authored 6 years ago by Richard Stutt
--- a/SurveyDataErrorsToRemove.csv
+++ b/SurveyDataErrorsToRemove.csv
+SubmissionDate,start,end,today,deviceid,subscriberid,imei,phonenumber,username,surveyor_infromation-country,surveyor_infromation-surveyor_name,surveyor_infromation-institution,survey_infromation-location_name,survey_infromation-location-Latitude,survey_infromation-location-Longitude,survey_infromation-location-Altitude,survey_infromation-location-Accuracy,survey_infromation-survey_date,site_information-survey_site,site_information-crop,site_information-field_area,site_information-variety,site_information-growth_stage,other_crop,stem_rust-stemrust_incidence,stem_rust-Stemrust_severity,stem_rust-stemrust_host_plant_reaction,leaf_rust-leafrust_incidence,leaf_rust-leafrust_severity,leaf_rust-leafrust_host_plant_reaction,yellow_rust-yellowrust_incidence,yellow_rust-yellowrust_severity,yellow_rust-yellowrust_host_plant_reaction,septoria-septoria_incidence,septoria-septoria_severity,other_diseases_group-other_diseases,score_diseases_count,SET-OF-score_diseases,samples_collected,samples_type,sample_size-number_stemrust_live,sample_size-number_stemrust_dead_dna,sample_size-number_yellowrust_live,sample_size-number_yellowrust_dead,sample_size-number_leafrust_live,sample_size-using_barcode,live_stemrust_samples_count,SET-OF-live_stemrust_samples,dead_stemrust_samples_count,SET-OF-dead_stemrust_samples,live_yellowrust_samples_count,SET-OF-live_yellowrust_samples,dead_yellowrust_samples_count,SET-OF-dead_yellowrust_samples,live_leafrust_samples_count,SET-OF-live_leafrust_samples,comment,meta-instanceID,KEY
+06-Aug-2018 13:43:06,13-Jun-2018 12:26:53,06-Aug-2018 13:42:21,13-Jun-2018,FC:19:10:89:E2:6B,"",FC:19:10:89:E2:6B,"",mekele,Ethiopia,tesfay gebrekirstos,Tigray Agricultural Research Institute,Ilala,13.5235519900,39.5033560800,1991.0000000000,5.0000000000,06-Aug-2018,farmer_field,bread_wheat,2.0000000000,mekele-1,tillering,"",high,20,mr,low,10,mr,high,50,s,medium,55,glume_blotch loose_smut pythium_root_rot scab_fusarium_head_blight take_all tan_spot,6,uuid:58cb6a0f-8e47-4339-8080-0e320234415c/score_diseases,no,"","","","","","","","","","","","","","","","","",There is potential inoculum for the future.,uuid:58cb6a0f-8e47-4339-8080-0e320234415c,uuid:58cb6a0f-8e47-4339-8080-0e320234415c
--- a/SurveyDataPreprocessor.R
+++ b/SurveyDataPreprocessor.R
+
+options(stringsAsFactors = FALSE)
+
+library(dplyr)
+
+#Cleans the exported survey data: removes the rows listed in a "rows to delete" file,
+#optionally appends rows from a "rows to add" file, and writes the result as csv.
+#
+#Usage: Rscript SurveyDataPreprocessor.R <surveyData.csv> <rowsToDelete.csv> [<rowsToAdd.csv>] <output.csv>
+args = commandArgs(TRUE)
+
+if (length(args) < 3) {
+	stop("Expected arguments: <surveyData.csv> <rowsToDelete.csv> [<rowsToAdd.csv>] <output.csv>")
+}
+
+defaultDataFileName = args[1]
+
+deleteRowsFileName = args[2]
+
+#The rows-to-add file is optional: a caller with no extra rows may either leave the argument out
+#(three arguments, output file last) or pass an empty string in its place (four arguments).
+if (length(args) >= 4) {
+	addRowsFileName = args[3]
+	outputFileName = args[4]
+} else {
+	addRowsFileName = ""
+	outputFileName = args[3]
+}
+
+
+#Reason for using colClasses is that otherwise it gets the inference wrong for the deleteData table, which has much less data for it to work with,
+#and therefore dplyr can't match between the two tables
+defaultData = read.csv(defaultDataFileName, header = TRUE, colClasses = "character")
+
+#Remove unwanted rows:
+deleteData = read.csv(deleteRowsFileName, header = TRUE, colClasses = "character")
+
+#With no "by" argument the join matches on every column the two tables have in common
+cleanData = anti_join(defaultData, deleteData)
+
+#Warn about any row of deleteData that did not match (and therefore did not remove) a row of defaultData:
+notRemovedData = anti_join(deleteData, defaultData)
+
+if (nrow(notRemovedData) > 0) {
+	warning(nrow(notRemovedData), " row(s) in ", deleteRowsFileName, " did not match any row in ", defaultDataFileName)
+}
+
+#Add extra rows, if a file of extra rows was supplied:
+if (nzchar(addRowsFileName)) {
+	extraData = read.csv(addRowsFileName, header = TRUE, colClasses = "character")
+
+	joinedData = rbind(cleanData, extraData)
+} else {
+	joinedData = cleanData
+}
+
+#Write output:
+#NOTE(review): quote = FALSE writes values containing commas unquoted, which would misalign columns
+#for any reader of the output - confirm no field (e.g. comment) can contain a comma, or drop quote = FALSE
+write.csv(joinedData, file = outputFileName, row.names = FALSE, quote = FALSE)
+
+
+
--- a/SurveyDataProcessor.pl
+++ b/SurveyDataProcessor.pl
@@ -20,8 +20,10 @@ my $debugNoUpload = 0;
 my $workspacePath = "/storage/app/Ethiopia-EWS/Ethiopia-EWS-Workspace";
 my $coordinatorPath = "/storage/app/Ethiopia-EWS/Ethiopia-EWS-Coordinator";

+my $RPath = "/usr/local/R/bin/Rscript";
+
 my @maintainers = ("ca500\@cam.ac.uk", "ds603\@cam.ac.uk", "rs481\@cam.ac.uk");
-#@maintainers = ("rs481\@cam.ac.uk");print STDERR "DEBUG: MAINTAINERS LIST\n";
+@maintainers = ("rs481\@cam.ac.uk");print STDERR "DEBUG: MAINTAINERS LIST\n";

 my $todayString = strftime("%Y%m%d", localtime());

@@ -131,13 +133,21 @@ unless(-d $serverOutputDir) {
 my $csvOutputDir = $todayFolderPath."/ExportCSV";
 my $csvFileName = "SurveyData.csv";

+my $processedSurveyDataFileName = $csvOutputDir."/Processed_".$csvFileName;
+
+my $surveyDataErrorsToRemoveFileName = $coordinatorPath."/SurveyDataErrorsToRemove.csv";
+my $surveyDataAdditionalDataFileName = "";
+
 unless($serverFailure) {

 	my $dataToCSVCmd = "java -jar ".$serverJar." --export --form_id ".$serverFormID." --storage_directory ".$serverOutputDir." --export_directory ".$csvOutputDir." --export_filename ".$csvFileName;
 	system($dataToCSVCmd);

 	#Check we got the folder we were expecting
-	unless(-d $csvOutputDir) {
+	if (-d $csvOutputDir) {
+		#Do any necessary modifications to the survey data:
+		system($RPath, $coordinatorPath."/SurveyDataPreprocessor.R", $csvOutputDir."/".$csvFileName, $surveyDataErrorsToRemoveFileName, $surveyDataAdditionalDataFileName, $processedSurveyDataFileName); #List form: each argument is passed as-is, so an empty additional-data file name keeps its position
+	} else {
 		#TODO: If this fails, we should either retry after a few minutes, or just put up yesterdays data
 		if($debugOutput) {
 			print("DEBUG: ".$debugTimeString." csv Files not extracted: ".$csvOutputDir);
@@ -146,9 +156,9 @@ unless($serverFailure) {
 		#handleError("Failure: Didn't manage to convert to csv in output directory: ".$csvOutputDir);
 		$serverFailure = 1;
 	}
-
 }

+
 my $localOutputFolderName = $todayFolderPath."/".$jobIDString."_".$todayString."_0000";

 if(!$serverFailure) {
@@ -169,7 +179,7 @@ if(!$serverFailure) {
 	#Just in case the server was *very* slow to send the data
 	my $todayAndrewString = strftime("%Y-%m-%d", localtime());

-	my $clusteringCmd = "/usr/local/R/bin/Rscript code/R/clustering.R ".$csvOutputDir."/".$csvFileName." ".$todayString." -2 7 --plot";
+	my $clusteringCmd = $RPath." code/R/clustering.R ".$processedSurveyDataFileName." ".$todayString." -2 7 --plot";
 	system($clusteringCmd);

 	#Find the output file:
@@ -195,11 +205,6 @@ if(!$serverFailure) {
 	my $sourcesUploadFileName = $localOutputFolderName."/sources_".$todayString.".csv";
 	fcopy($clusteringOutputFile, $sourcesUploadFileName) or handleError("ERROR: Unable to copy output file ".$clusteringOutputFile." to the output upload staging folder ".$sourcesUploadFileName);

-	unless($debugNoUpload) {
-		#Transfer the output to public server for others to be able to pull it down:
-		system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/");
-	}
-
 } else {

 	#Server was not able to provide us with the latest data, so we need to look back in time until we find one that succeeded
@@ -233,10 +238,6 @@ if(!$serverFailure) {

 			dircopy($lastSuccessfulClusteringOutputPath, $localOutputFolderName);

-			my $ret = system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/");
-
-			print "Transfer output: ".$ret."\n";
-
 			my $warningTimeStamp = getTimestampNow();

 			my $warningString = $staleClusteringDataWarningString;
@@ -252,6 +253,12 @@ if(!$serverFailure) {

 }

+#Upload the survey data:
+unless($debugNoUpload) {
+	#Transfer the output to public server for others to be able to pull it down:
+	system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/");
+}
+
 #Successful exit, remove the in progress file:
 unlink($inProgressFilePath) or handleError("ERROR: Unable to remove in-progress lock file: ".$inProgressFilePath);