Commit 8e11ed07 authored by J.W. Smith

feat: Checker for ARRCC media scraper tool

A basic script that checks the ARRCC media scraper web tool for the
latest entries and roughly reformats them for input to the NAME source
calculations. The idea is to email this to a list of maintainers (me,
@tm689, @rs481) regularly (once a week). Because the system isn't smart
enough to filter out irrelevant news reports (e.g. general early-season
advice without a specific rust sighting), human intervention is required
to filter the entries and append them to the survey data. The media
scraper checker is set up as a cron task that runs every Monday at midday.
parent 58ffb7c8
# GatherScrapedMediaReports.py
'''Downloads a csv file of news reports and reformats them as survey records to
provide to the wheat rust early warning system.
The format and content of the csv of news reports is based on ARRCC work by
Asif Al Faisal (CIMMYT-Bangladesh).'''
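# Pipeline (see the __main__ block below): download the latest scraped reports into a
# job directory, reformat them as pseudo-survey records, filter to the current season,
# write a csv, and email a summary of the newest entry to the maintainers.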
import os
import json
import requests
import subprocess
from pathlib import Path
import datetime
from numpy import where
import pandas as pd
from pandas import read_csv, DataFrame
#maintainers = ['jws52@cam.ac.uk','tm689@cam.ac.uk','rs481@cam.ac.uk']
maintainers = ['jws52@cam.ac.uk']
# date format conforms to the format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv
# ODK v1.11.2:
# fmt = '%d-%b-%Y'
# fmt_short = '%d-%b-%Y'
# ODK v1.18.0 (%I:%M gives a 12-hour clock to pair with %p; the previous '%H:%m'
# mixed a 24-hour hour code with the month code):
fmt = '%b %d, %Y %I:%M:%S %p'
fmt_short = '%b %d, %Y'
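# Quick illustration of the format strings above (the datetime value is made up):
#   >>> datetime.datetime(2021, 2, 1, 13, 5, 0).strftime(fmt)
#   'Feb 01, 2021 01:05:00 PM'
#   >>> datetime.datetime(2021, 2, 1).strftime(fmt_short)
#   'Feb 01, 2021'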
def get_news_reports(job_dir):
    '''Downloads the news report data available on the ARRCC media report
    dashboard, into the provided directory.

    Does not return anything.'''

    assert os.path.exists(job_dir)

    # url location of latest news report search
    url = 'http://arrcc-viewer.herokuapp.com/assets/sample_data/data.zip'

    r = requests.get(url)

    # write a local copy of the zip file
    fn_zip = f"{job_dir}data.zip"
    with open(fn_zip,'wb') as zipfile:
        zipfile.write(r.content)

    # unzip it, failing loudly if the archive is bad
    dir_unzip = f"{job_dir}/data/"
    cmd_unzip = ['unzip',fn_zip,'-d',dir_unzip]
    subprocess.run(cmd_unzip, check=True)

    return
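# A minimal alternative sketch that avoids shelling out to `unzip`, using the
# standard-library zipfile module instead (assumes the archive fits in memory):
#   import io, zipfile
#   with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
#       zf.extractall(dir_unzip)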
def read_news_reports(job_dir):
    '''Opens the news reports in the provided directory.

    Returns a pandas dataframe.'''

    fn = f"{job_dir}/data/NEWS REPORTS.csv"

    df = read_csv(fn,index_col=0,header=0,parse_dates=['Date'])

    return df
def reformat_news_reports(df):
    '''Reformats a dataframe of news reports to match BGRI wheat rust survey
    data entries (making assumptions where necessary). First checks input is as
    expected.'''

    # Check contents are as expected
    cols = df.columns
    expected_cols = ['Lon','Lat','Date','Type','Link','Country','State','District']
    for expected_col in expected_cols:
        assert expected_col in cols

    assumption_dict = {
        'field_area' : 1}
    def guess_stage(date,country):
        # TODO: provide typical phenology dates per country

        # all of the country
        # based on Moin's estimates from vegetative to ripening & maturing
        # note his Start Date and End Dates are always 30 days apart
        # so, this is sticking to the middle of the range
        stage_start_dates_bangladesh = {
            'tillering':'5 Dec',  # ~26 days { combined 30 days to complete
            'boot'     :'31 Dec', # ~4 days  {
            'heading'  :'4 Jan',  # 20 days
            'flowering':'24 Jan', # 10 days
            'milk'     :'3 Feb',  # 10 days
            'dough'    :'13 Feb', # ~15 days { combined 25 days to complete
            'maturity' :'28 Mar', # ~10 days {
            'NA'       :'9 Mar'}  # total ~95 days

        # main season wheat in Terai
        # based on Madan's estimates of min and max duration, taking the mean
        # from vegetative to ripening & maturing
        stage_start_dates_nepal_terai = {
            'tillering':'24 Dec', # ~56 days { combined 66 days to complete
            'boot'     :'18 Feb', # ~10 days {
            'heading'  :'28 Feb', # 10 days
            'flowering':'9 Mar',  # 5 days
            'milk'     :'14 Mar', # ~12 days { combined 27 days to complete
            'dough'    :'26 Mar', # ~15 days {
            'maturity' :'10 Apr', # 10 days
            'NA'       :'20 Apr'} # total ~118 days

        # TODO: Less important: implement main season wheat in mid-hills from
        # Madan's estimates, and determine how to distinguish Terai from
        # mid-hills in a news report
        stage_start_dates_nepal_midhills = {
            'tillering':'',
            'boot'     :'',
            'heading'  :'',
            'flowering':'',
            'milk'     :'',
            'dough'    :'',
            'maturity' :'',
            'NA'       :''}

        # mainly for the north
        # assume the same as Nepal Terai, as for last year
        # TODO: get estimates specific to Pakistan
        stage_start_dates_pakistan = stage_start_dates_nepal_terai

        # mainly for Haryana district
        # assume the same as Nepal Terai, as for last year
        # TODO: get estimates specific to India NW districts
        stage_start_dates_india = stage_start_dates_nepal_terai

        stage_start_dates_by_country = {
            'Bangladesh' : stage_start_dates_bangladesh,
            'Nepal'      : stage_start_dates_nepal_terai,
            'India'      : stage_start_dates_india,
            'Pakistan'   : stage_start_dates_pakistan}
        df = DataFrame({'date':date,'country':country})
        dates_by_entry = country.apply(lambda val: stage_start_dates_by_country[val])
        df2 = DataFrame.from_records(dates_by_entry.values)
        df2.index = df.index
        df3 = pd.concat([df,df2],axis='columns')

        # handle Dec-Jan crossover (is there a neater way of doing this?)
        df3['year'] = df3['date'].apply(lambda di: di.year)
        df3['lastyear'] = df3['date'].apply(lambda di: di.year-1)

        stages_thisyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['year']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()}
        stages_lastyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['lastyear']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()}

        df3_thisyear = DataFrame.from_records(stages_thisyear)
        df3_lastyear = DataFrame.from_records(stages_lastyear)

        # Use knowledge of the order of phenological stages to determine which
        # dates belong to last year
        stage_order = ['tillering','boot','heading','flowering','milk','dough','maturity','NA']
        # take an explicit copy so the assignments below do not trigger
        # pandas' SettingWithCopyWarning
        df4 = df3_thisyear[stage_order].copy()

        # check each stage in turn
        for i,stage in enumerate(stage_order[:-1]):
            stage_ip1 = stage_order[i+1]
            # if the earlier stage has a later date, use last year's dates
            df4[stage] = where(df4[stage]<df4[stage_ip1],df4[stage],df3_lastyear[stage])

        # find out which stages start earlier than the survey date
        date_compare = df4.le(date,axis='rows')

        # get the name of the latest valid stage
        stage_series = date_compare.apply(lambda row: row.iloc[::-1].idxmax(),axis='columns')

        return stage_series
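    # guess_stage expects two aligned pandas Series (report dates and country names)
    # and returns, for each report, the name of the latest stage whose estimated
    # start date falls on or before the report date. Countries must appear as keys
    # of stage_start_dates_by_country, otherwise the lookup raises a KeyError.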
    def estimate_rust(description: pd.Series,disease: str,return_type: str):
        '''Works with pandas series'''

        # check for alternative naming
        if disease in ['yellow','stripe']:
            any_disease = description.str.contains('yellow') | description.str.contains('stripe')
        else:
            any_disease = description.str.contains(disease)

        return_dict = {
            'incidence':'medium',
            'severity':'30'
            }

        prevalence = where(any_disease,return_dict[return_type],'na')

        return prevalence
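    # Example call for estimate_rust (illustrative input; note the substring match
    # via str.contains is case-sensitive):
    #   >>> estimate_rust(pd.Series(['stripe rust seen near the border']),'yellow','incidence')
    #   array(['medium'], ...)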
    # NB: the 'survey_infromation' (sic) spelling is kept as-is so that column
    # names line up with the existing survey data files
    output_dict = {
        'SubmissionDate' : df['Date'],
        'start' : df['Date'],
        'end' : df['Date'],
        'today' : df['Date'],
        'deviceid' : 999,
        'subscriberid' : 999,
        'imei' : 999,
        'phonenumber' : 999,
        'username' : 999,
        'country_list' : df['Country'],
        'blast_rust' : 'Rust',
        'surveyor_name' : 'News report',
        'institution' : 'na',
        'mobile_num' : 999,
        'site_information-survey_site' : 'Farmer field',
        'site_information-crop' : 'NA',
        'site_information-field_area' : assumption_dict['field_area'],
        'site_information-unit_m2' : 999,
        'site_information-field_size' : 999,
        'site_information-variety' : 'NA',
        'site_information-growth_stage' : guess_stage(df['Date'],df['Country']),
        'survey_infromation-location_name' : 999,
        'survey_infromation-location_blast' : 999,
        'survey_infromation-sampColor' : 999,
        'survey_infromation-dateRange' : 999,
        'survey_infromation-fieldNumber' : 999,
        'survey_infromation-diseaseIncidencePercentage' : 999,
        'survey_infromation-severityPercentage' : 999,
        'survey_infromation-survey_date' : df['Date'].apply(lambda cell: cell.strftime(fmt_short)),
        'survey_infromation-site_name' : '"'+df['District'].astype(str)+', '+df['State'].astype(str)+', '+df['Country'].astype(str)+'"',
        'survey_infromation-location-Latitude' : df['Lat'],
        'survey_infromation-location-Longitude' : df['Lon'],
        'survey_infromation-location-Altitude' : -999,
        'survey_infromation-location-Accuracy' : -999,
        'stem_rust-stemrust_incidence' : estimate_rust(df['Type'],'stem','incidence'),
        'stem_rust-Stemrust_severity' : estimate_rust(df['Type'],'stem','severity'),
        'stem_rust-stemrust_host_plant_reaction' : 'na',
        'leaf_rust-leafrust_incidence' : estimate_rust(df['Type'],'leaf','incidence'),
        'leaf_rust-leafrust_severity' : estimate_rust(df['Type'],'leaf','severity'),
        'leaf_rust-leafrust_host_plant_reaction' : 'na',
        'yellow_rust-yellowrust_incidence' : estimate_rust(df['Type'],'yellow','incidence'),
        'yellow_rust-yellowrust_severity' : estimate_rust(df['Type'],'yellow','severity'),
        'yellow_rust-yellowrust_host_plant_reaction' : 'na',
        'septoria-septoria_incidence' : 'na',
        'septoria-septoria_severity' : 'na',
        'other_diseases_group-other_diseases' : -999,
        'score_diseases_count' : -999,
        'SET-OF-score_diseases' : -999,
        'samples_collected' : -999,
        'samples_type' : -999,
        'sample_size-number_stemrust_live' : -999,
        'sample_size-number_stemrust_dead_dna' : -999,
        'sample_size-number_yellowrust_live' : -999,
        'sample_size-number_yellowrust_dead' : -999,
        'sample_size-number_leafrust_live' : -999,
        'sample_size-using_barcode' : -999,
        'live_stemrust_samples_count' : -999,
        'SET-OF-live_stemrust_samples' : -999,
        'dead_stemrust_samples_count' : -999,
        'SET-OF-dead_stemrust_samples' : -999,
        'live_yellowrust_samples_count' : -999,
        'SET-OF-live_yellowrust_samples' : -999,
        'dead_yellowrust_samples_count' : -999,
        'SET-OF-dead_yellowrust_samples' : -999,
        'live_leafrust_samples_count' : -999,
        'SET-OF-live_leafrust_samples' : -999,
        'comment' : df['Link'],
        'meta-instanceID' : -999,
        'meta-instanceName' : -999,
        'KEY' : -999}

    df_out = DataFrame(output_dict)

    return df_out
email_msg = """Subject: ARRCC latest scraped media reports

Here is an update of what is on the ARRCC media scraper platform.

The latest entry is below. The full set for this season, auto-formatted for input to NAME source calcs in a basic way, is available at:
pine:{0}

Check all new webpages for validity and extra info (e.g. field area, variety), then edit and copy any relevant entries to:
pine:/storage/app/EWS/SouthAsia/Workspace/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv

Then, check that the survey data processor succeeds with these new entries.

Thanks, Jake

{1}
"""
def send_email(output_fn,data_str):

    import smtplib, ssl

    msg = email_msg.format(output_fn,data_str)

    with open('/storage/app/EWS/General/EWS-Coordinator/Cred_gmail.json','r') as f:
        gmailConfig = json.load(f)

    # Create a secure SSL context
    context = ssl.create_default_context()

    # gmail requires port 465 for SMTP_SSL, otherwise port 587 with .starttls()
    # from https://realpython.com/python-send-email/#sending-a-plain-text-email
    # I think port 587 is meant to make sense for the typical python logging smtphandler,
    # but that doesn't apply here
    port = 465 # gmailConfig['port']

    with smtplib.SMTP_SSL(gmailConfig['host'], port, context=context) as server:
        server.login(gmailConfig['user'], gmailConfig['pass'])
        server.sendmail(gmailConfig['user'], maintainers, msg)
        print('Message sent!')

    return
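# Expected shape of Cred_gmail.json, inferred from the lookups above (the host and
# user values here are purely illustrative; 'port' is only referenced in a comment):
#   {"host": "smtp.gmail.com", "user": "sender@example.com", "pass": "...", "port": 465}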
if __name__ == '__main__':

    # 0) Prepare directory for this job's files
    #job_dir = 'scrape_job/'
    date_str = datetime.datetime.now().strftime('%Y%m%d_%H%M')
    #output_dir = '/storage/app/EWS/General/EWS-Coordinator-Dev/scratch_SouthAsia'
    output_dir = '/storage/app/EWS/SouthAsia/Workspace/'
    job_dir = f"{output_dir}/REPORTS_{date_str}/"
    Path(job_dir).mkdir(parents=True, exist_ok=False)

    # 1) Get a latest copy of the news report data
    get_news_reports(job_dir)
    reports_in = read_news_reports(job_dir)

    # 2) Reformat to match BGRI wheat rust survey data entries
    #    (making assumptions where necessary)
    reports = reformat_news_reports(reports_in)

    # 3) Filter to the latest news reports
    this_season_starts_str = '01 Dec 2020'
    this_season_starts = datetime.datetime.strptime(this_season_starts_str,'%d %b %Y')
    latest_reports = reports[reports['SubmissionDate']>=this_season_starts]

    # TODO: Low priority: Determine differences from the last reformatted set of news reports

    # 4) Output to csv
    output_fn = f"{job_dir}/latest_reports_as_surveys.csv"
    # date format conforms to the format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv
    latest_reports.to_csv(output_fn,index=False,date_format=fmt)

    # 5) email any new reports to maintainers, so they can copy them into
    #    SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate
    selected_columns = [
        'SubmissionDate',
        'site_information-field_area',
        'site_information-growth_stage',
        'survey_infromation-site_name',
        'country_list',
        'stem_rust-stemrust_incidence',
        'stem_rust-Stemrust_severity',
        'yellow_rust-yellowrust_incidence',
        'yellow_rust-yellowrust_severity',
        'leaf_rust-leafrust_incidence',
        'leaf_rust-leafrust_severity',
        'comment']

    # remove pandas' display limitation, so the full web address from 'comment' is shown
    pd.set_option('display.max_colwidth', None)

    latest_report_selection = str(latest_reports.iloc[-1,:].loc[selected_columns])

    send_email(output_fn,latest_report_selection)
@@ -35,3 +35,6 @@
# Environmental suitability analysis (Wednesday only)
00 18,20,22 * * 3 /storage/app/EWS/General/EWS-Coordinator/run_Processor.sh -p Environment -c /storage/app/EWS/General/EWS-Coordinator/config_SouthAsia_an.json --islive
# check for scraped media reports (Monday only)
00 12 * * 1 /storage/app/EWS/General/EWS-Coordinator/run_GatherScrapedMediaReports.sh
\ No newline at end of file
#!/bin/bash
# activate conda environment of python modules so they can be imported
source /storage/app/miniconda3/bin/activate /storage/app/EWS/General/EWS-python/py3EWSepi
python /storage/app/EWS/General/EWS-Coordinator/GatherScrapedMediaReports.py "$@"
# deactivate conda environment
source /storage/app/miniconda3/bin/deactivate /storage/app/EWS/General/EWS-python/py3EWSepi