diff --git a/SurveyDataErrorsToRemove.csv b/SurveyDataErrorsToRemove.csv new file mode 100644 index 0000000000000000000000000000000000000000..9ffcf725919905f1ee41b5d6cdd2e1f3488aebf3 --- /dev/null +++ b/SurveyDataErrorsToRemove.csv @@ -0,0 +1,2 @@ +SubmissionDate,start,end,today,deviceid,subscriberid,imei,phonenumber,username,surveyor_infromation-country,surveyor_infromation-surveyor_name,surveyor_infromation-institution,survey_infromation-location_name,survey_infromation-location-Latitude,survey_infromation-location-Longitude,survey_infromation-location-Altitude,survey_infromation-location-Accuracy,survey_infromation-survey_date,site_information-survey_site,site_information-crop,site_information-field_area,site_information-variety,site_information-growth_stage,other_crop,stem_rust-stemrust_incidence,stem_rust-Stemrust_severity,stem_rust-stemrust_host_plant_reaction,leaf_rust-leafrust_incidence,leaf_rust-leafrust_severity,leaf_rust-leafrust_host_plant_reaction,yellow_rust-yellowrust_incidence,yellow_rust-yellowrust_severity,yellow_rust-yellowrust_host_plant_reaction,septoria-septoria_incidence,septoria-septoria_severity,other_diseases_group-other_diseases,score_diseases_count,SET-OF-score_diseases,samples_collected,samples_type,sample_size-number_stemrust_live,sample_size-number_stemrust_dead_dna,sample_size-number_yellowrust_live,sample_size-number_yellowrust_dead,sample_size-number_leafrust_live,sample_size-using_barcode,live_stemrust_samples_count,SET-OF-live_stemrust_samples,dead_stemrust_samples_count,SET-OF-dead_stemrust_samples,live_yellowrust_samples_count,SET-OF-live_yellowrust_samples,dead_yellowrust_samples_count,SET-OF-dead_yellowrust_samples,live_leafrust_samples_count,SET-OF-live_leafrust_samples,comment,meta-instanceID,KEY +06-Aug-2018 13:43:06,13-Jun-2018 12:26:53,06-Aug-2018 13:42:21,13-Jun-2018,FC:19:10:89:E2:6B,"",FC:19:10:89:E2:6B,"",mekele,Ethiopia,tesfay gebrekirstos,Tigray Agricultural Research Institute,Ilala,13.5235519900,39.5033560800,1991.0000000000,5.0000000000,06-Aug-2018,farmer_field,bread_wheat,2.0000000000,mekele-1,tillering,"",high,20,mr,low,10,mr,high,50,s,medium,55,glume_blotch loose_smut pythium_root_rot scab_fusarium_head_blight take_all tan_spot,6,uuid:58cb6a0f-8e47-4339-8080-0e320234415c/score_diseases,no,"","","","","","","","","","","","","","","","","",There is potential inoculum for the future.,uuid:58cb6a0f-8e47-4339-8080-0e320234415c,uuid:58cb6a0f-8e47-4339-8080-0e320234415c diff --git a/SurveyDataPreprocessor.R b/SurveyDataPreprocessor.R new file mode 100644 index 0000000000000000000000000000000000000000..545663a058ebd6c366053d256414bf8727b20552 --- /dev/null +++ b/SurveyDataPreprocessor.R @@ -0,0 +1,38 @@ + +options(stringsAsFactors = FALSE) + +library(dplyr) + +args = commandArgs(TRUE) + +defaultDataFileName = args[1] + +deleteRowsFileName = args[2] + +addRowsFileName = args[3] + +outputFileName = args[4] + + +#Reason for using colClasses is that otherwise it gets the inference wrong for the deleteData table, which has much less data for it to work with, +#and therefore dplyr can't match between the tow tables +defaultData = read.csv(defaultDataFileName, header = TRUE, colClasses = "character") + +#Remove unwanted rows: +deleteData = read.csv(deleteRowsFileName, header = TRUE, colClasses = "character") + +cleanData = anti_join(defaultData, deleteData) + +#TODO: Check and see if each row from deleteData was removed from defaultData - warn if a row not removed + +#Add extra rows: +#TODO: enable once data available +#extraData = read.csv(addRowsFileName, header = TRUE, colClasses = "character") + +#joinedData = rbind(cleanData, extraData) + +#Write output: +write.csv(joinedData, file = outputFileName, row.names = FALSE, quote = FALSE) + + + diff --git a/SurveyDataProcessor.pl b/SurveyDataProcessor.pl index 5d1a556ee8cd82b750e35a81dba3622b2b9eb2c8..fdbc59adaec000131d58f63169cd647555aae105 100755 --- a/SurveyDataProcessor.pl +++ b/SurveyDataProcessor.pl @@ -20,8 +20,10 @@ my $debugNoUpload = 0; my $workspacePath = "/storage/app/Ethiopia-EWS/Ethiopia-EWS-Workspace"; my $coordinatorPath = "/storage/app/Ethiopia-EWS/Ethiopia-EWS-Coordinator"; +my $RPath = "/usr/local/R/bin/Rscript"; + my @maintainers = ("ca500\@cam.ac.uk", "ds603\@cam.ac.uk", "rs481\@cam.ac.uk"); -#@maintainers = ("rs481\@cam.ac.uk");print STDERR "DEBUG: MAINTAINERS LIST\n"; +@maintainers = ("rs481\@cam.ac.uk");print STDERR "DEBUG: MAINTAINERS LIST\n"; my $todayString = strftime("%Y%m%d", localtime()); @@ -131,13 +133,21 @@ unless(-d $serverOutputDir) { my $csvOutputDir = $todayFolderPath."/ExportCSV"; my $csvFileName = "SurveyData.csv"; +my $processedSurveyDataFileName = $csvOutputDir."/Processed_".$csvFileName; + +my $surveyDataErrorsToRemoveFileName = $coordinatorPath."/SurveyDataErrorsToRemove.csv"; +my $surveyDataAdditionalDataFileName = ""; + unless($serverFailure) { my $dataToCSVCmd = "java -jar ".$serverJar." --export --form_id ".$serverFormID." --storage_directory ".$serverOutputDir." --export_directory ".$csvOutputDir." --export_filename ".$csvFileName; system($dataToCSVCmd); #Check we got the folder we were expecting - unless(-d $csvOutputDir) { + if (-d $csvOutputDir) { + #Do any necessary modifications to the survey data: + system($RPath." ".$coordinatorPath."/SurveyDataPreprocessor.R ".$csvOutputDir."/".$csvFileName." ".$surveyDataErrorsToRemoveFileName." ".$surveyDataAdditionalDataFileName." ".$processedSurveyDataFileName); + } else { #TODO: If this fails, we should either retry after a few minutes, or just put up yesterdays data if($debugOutput) { print("DEBUG: ".$debugTimeString." csv Files not extracted: ".$csvOutputDir); @@ -146,9 +156,9 @@ unless($serverFailure) { #handleError("Failure: Didn't manage to convert to csv in output directory: ".$csvOutputDir); $serverFailure = 1; } - } + my $localOutputFolderName = $todayFolderPath."/".$jobIDString."_".$todayString."_0000"; if(!$serverFailure) { @@ -169,7 +179,7 @@ if(!$serverFailure) { #Just in case the server was *very* slow to send the data my $todayAndrewString = strftime("%Y-%m-%d", localtime()); - my $clusteringCmd = "/usr/local/R/bin/Rscript code/R/clustering.R ".$csvOutputDir."/".$csvFileName." ".$todayString." -2 7 --plot"; + my $clusteringCmd = $RPath." code/R/clustering.R ".$processedSurveyDataFileName." ".$todayString." -2 7 --plot"; system($clusteringCmd); #Find the output file: @@ -195,11 +205,6 @@ if(!$serverFailure) { my $sourcesUploadFileName = $localOutputFolderName."/sources_".$todayString.".csv"; fcopy($clusteringOutputFile, $sourcesUploadFileName) or handleError("ERROR: Unable to copy output file ".$clusteringOutputFile." to the output upload staging folder ".$sourcesUploadFileName); - unless($debugNoUpload) { - #Transfer the output to public server for others to be able to pull it down: - system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/"); - } - } else { #Server was not able to provide us with the latest data, so we need to look back in time until we find one that succeeded @@ -233,10 +238,6 @@ if(!$serverFailure) { dircopy($lastSuccessfulClusteringOutputPath, $localOutputFolderName); - my $ret = system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/"); - - print "Transfer output: ".$ret."\n"; - my $warningTimeStamp = getTimestampNow(); my $warningString = $staleClusteringDataWarningString; @@ -252,6 +253,12 @@ if(!$serverFailure) { } +#Upload the survey data: +unless($debugNoUpload) { + #Transfer the output to public server for others to be able to pull it down: + system("scp -i ".$coordinatorPath."/ssh_key_willow"." -r ".$localOutputFolderName." willow.csx.cam.ac.uk:/var/www/html/Ethiopia/"); +} + #Successful exit, remove the in progress file: unlink($inProgressFilePath) or handleError("ERROR: Unable to remove in-progress lock file: ".$inProgressFilePath);