diff --git a/GatherScrapedMediaReports.py b/GatherScrapedMediaReports.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff90c1e867fe0448e75b41bc1105d1a498811c30
--- /dev/null
+++ b/GatherScrapedMediaReports.py
@@ -0,0 +1,388 @@
+#GatherScrapedMediaReports.py
+'''Downloads a csv file of news reports and reformats them as survey records to
+provide to the wheat rust early warning system.
+
+The format and content of the csv of news reports is based on ARRCC work by
+Asif Al Faisal (CIMMYT-Bangladesh).'''
+
+import os
+import json
+import requests
+import subprocess
+from pathlib import Path
+import datetime
+
+from numpy import where
+import pandas as pd
+from pandas import read_csv, DataFrame
+
+#maintainers = ['jws52@cam.ac.uk','tm689@cam.ac.uk','rs481@cam.ac.uk']
+maintainers = ['jws52@cam.ac.uk']
+
+# date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv
+# # ODK v1.11.2:
+# fmt = '%d-%b-%Y'
+# fmt_short = '%d-%b-%Y'
+# ODK v1.18.0 uses 12-hour times, so %I with %p, and %M for minutes:
+fmt = '%b %d, %Y %I:%M:%S %p'
+fmt_short = '%b %d, %Y'
+
+
+def get_news_reports(job_dir):
+    '''Downloads the news report data available on the ARRCC media report
+    dashboard, into the provided directory.
+
+    Does not return anything.'''
+
+    assert os.path.exists(job_dir)
+
+    # url location of latest news report search
+    url = 'http://arrcc-viewer.herokuapp.com/assets/sample_data/data.zip'
+
+    r = requests.get(url)
+    r.raise_for_status()
+
+    # write a local copy of the zip file
+    fn_zip = f"{job_dir}/data.zip"
+    with open(fn_zip,'wb') as zipfile:
+        zipfile.write(r.content)
+
+    # unzip it, failing loudly if unzip reports a problem
+    dir_unzip = f"{job_dir}/data/"
+    cmd_unzip = ['unzip',fn_zip,'-d',dir_unzip]
+    subprocess.run(cmd_unzip, check=True)
+
+    return
+
+def read_news_reports(job_dir):
+    '''Opens the news reports in the provided directory.
+
+    Returns a pandas dataframe.'''
+
+    fn = f"{job_dir}/data/NEWS REPORTS.csv"
+
+    df = read_csv(fn,index_col=0,header=0,parse_dates=['Date'])
+
+    return df
+
+def reformat_news_reports(df):
+    '''Reformats a dataframe of news reports to match BGRI wheat rust survey
+    data entries (making assumptions where necessary).
+    First checks input is as expected.'''
+
+    # Check contents are as expected
+    cols = df.columns
+    expected_cols = ['Lon','Lat','Date','Type','Link','Country','State','District']
+
+    for expected_col in expected_cols:
+        assert expected_col in cols
+
+    assumption_dict = {
+        'field_area' : 1}
+
+    def guess_stage(date,country):
+
+        # TODO: provide typical phenology dates per country
+
+        # all of the country
+        # based on Moin's estimates from vegetative to ripening & maturing
+        # note his Start Dates and End Dates are always 30 days apart
+        # so this is sticking to the middle of the range
+        stage_start_dates_bangladesh = {
+            'tillering':'5 Dec',  # ~26 days { combined 30 days to complete
+            'boot'     :'31 Dec', # ~4 days  {
+            'heading'  :'4 Jan',  # 20 days
+            'flowering':'24 Jan', # 10 days
+            'milk'     :'3 Feb',  # 10 days
+            'dough'    :'13 Feb', # ~15 days { combined 25 days to complete
+            'maturity' :'28 Feb', # ~10 days {
+            'NA'       :'9 Mar'}  # total ~95 days
+
+        # main season wheat in Terai
+        # based on Madan's estimates of min and max duration, taking the mean,
+        # from vegetative to ripening & maturing
+        stage_start_dates_nepal_terai = {
+            'tillering':'24 Dec', # ~56 days { combined 66 days to complete
+            'boot'     :'18 Feb', # ~10 days {
+            'heading'  :'28 Feb', # 10 days
+            'flowering':'9 Mar',  # 5 days
+            'milk'     :'14 Mar', # ~12 days { combined 27 days to complete
+            'dough'    :'26 Mar', # ~15 days {
+            'maturity' :'10 Apr', # 10 days
+            'NA'       :'20 Apr'} # total ~118 days
+
+        # TODO: Less important: implement main season wheat in mid-hills from
+        # Madan's estimates, and determine how to distinguish Terai from
+        # mid-hills in news reports
+        stage_start_dates_nepal_midhills = {
+            'tillering':'',
+            'boot'     :'',
+            'heading'  :'',
+            'flowering':'',
+            'milk'     :'',
+            'dough'    :'',
+            'maturity' :'',
+            'NA'       :''}
+
+        # mainly for the north
+        # assume the same as Nepal Terai, as was done last year
+        # TODO: get estimates specific to Pakistan
+        stage_start_dates_pakistan = stage_start_dates_nepal_terai
+
+        # mainly for Haryana district
+        # assume the same as Nepal Terai, as was done last year
+        # TODO: get estimates specific to India NW districts
+        stage_start_dates_india = stage_start_dates_nepal_terai
+
+        stage_start_dates_by_country = {
+            'Bangladesh' : stage_start_dates_bangladesh,
+            'Nepal'      : stage_start_dates_nepal_terai,
+            'India'      : stage_start_dates_india,
+            'Pakistan'   : stage_start_dates_pakistan}
+
+        df = DataFrame({'date':date,'country':country})
+        dates_by_entry = country.apply(lambda val: stage_start_dates_by_country[val])
+        df2 = DataFrame.from_records(dates_by_entry.values)
+        df2.index = df.index
+        df3 = pd.concat([df,df2],axis='columns')
+
+        # handle Dec-Jan crossover (is there a neater way of doing this?)
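+        # The crossover arises because the stage dictionaries store day-month
+        # strings only, while a season spans two calendar years. The approach
+        # below: place every stage date in the report's year first, then use
+        # the known ordering of stages to roll back any date that would
+        # otherwise come after a later stage. e.g. for a report dated
+        # 15 Jan 2021 in Bangladesh, 'boot' (31 Dec) rolls back to 2020
+        # because it would otherwise fall after 'heading' (4 Jan 2021).
+        # nb. this assumes reports are dated on or after 1 Jan; a December
+        # report would need the Jan-Mar stages shifted forward a year instead.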
+
+        df3['year'] = df3['date'].apply(lambda di: di.year)
+        df3['lastyear'] = df3['date'].apply(lambda di: di.year-1)
+
+        stages_thisyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['year']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()}
+        stages_lastyear = {coln:df3.apply(lambda row: datetime.datetime.strptime(f"{row[coln]} {row['lastyear']}",'%d %b %Y'),axis='columns') for coln in stage_start_dates_bangladesh.keys()}
+        df3_thisyear = DataFrame.from_records(stages_thisyear)
+        df3_lastyear = DataFrame.from_records(stages_lastyear)
+
+        # Use knowledge of the order of phenological stages to determine which
+        # dates are from last year
+        stage_order = ['tillering','boot','heading','flowering','milk','dough','maturity','NA']
+
+        # copy, so the assignments below write to a new frame rather than a view
+        df4 = df3_thisyear[stage_order].copy()
+
+        # check each stage in turn
+        for i,stage in enumerate(stage_order[:-1]):
+            stage_ip1 = stage_order[i+1]
+            # if the earlier stage has a later date, use last year's date
+            df4[stage] = where(df4[stage]<df4[stage_ip1],df4[stage],df3_lastyear[stage])
+
+        # find out which stages start earlier than the survey date
+        date_compare = df4.le(date,axis='rows')
+
+        # get the name of the latest valid stage
+        stage_series = date_compare.apply(lambda row: row.iloc[::-1].idxmax(),axis='columns')
+
+        return stage_series
+
+    def estimate_rust(description: pd.Series, disease: str, return_type: str):
+        '''Estimates incidence or severity for a disease from a series of news
+        report descriptions, returning a fixed guess wherever the disease is
+        mentioned and 'na' elsewhere.'''
+
+        # check for alternative naming
+        if disease in ['yellow','stripe']:
+            any_disease = description.str.contains('yellow') | description.str.contains('stripe')
+        else:
+            any_disease = description.str.contains(disease)
+
+        return_dict = {
+            'incidence':'medium',
+            'severity':'30'}
+
+        prevalence = where(any_disease,return_dict[return_type],'na')
+
+        return prevalence
+
+    # nb. 'survey_infromation' (sic) appears to match the column naming of the
+    # existing survey data, so it is left unchanged
+    output_dict = {
+        'SubmissionDate' : df['Date'],
+        'start' : df['Date'],
+        'end' : df['Date'],
+        'today' : df['Date'],
+        'deviceid' : 999,
+        'subscriberid' : 999,
+        'imei' : 999,
+        'phonenumber' : 999,
+        'username' : 999,
+        'country_list' : df['Country'],
+        'blast_rust' : 'Rust',
+        'surveyor_name' : 'News report',
+        'institution' : 'na',
+        'mobile_num' : 999,
+        'site_information-survey_site' : 'Farmer field',
+        'site_information-crop' : 'NA',
+        'site_information-field_area' : assumption_dict['field_area'],
+        'site_information-unit_m2' : 999,
+        'site_information-field_size' : 999,
+        'site_information-variety' : 'NA',
+        'site_information-growth_stage' : guess_stage(df['Date'],df['Country']),
+        'survey_infromation-location_name' : 999,
+        'survey_infromation-location_blast' : 999,
+        'survey_infromation-sampColor' : 999,
+        'survey_infromation-dateRange' : 999,
+        'survey_infromation-fieldNumber' : 999,
+        'survey_infromation-diseaseIncidencePercentage' : 999,
+        'survey_infromation-severityPercentage' : 999,
+        'survey_infromation-survey_date' : df['Date'].apply(lambda cell: cell.strftime(fmt_short)),
+        'survey_infromation-site_name' : '"'+df['District'].astype(str)+', '+df['State'].astype(str)+', '+df['Country'].astype(str)+'"',
+        'survey_infromation-location-Latitude' : df['Lat'],
+        'survey_infromation-location-Longitude' : df['Lon'],
+        'survey_infromation-location-Altitude' : -999,
+        'survey_infromation-location-Accuracy' : -999,
+        'stem_rust-stemrust_incidence' : estimate_rust(df['Type'],'stem','incidence'),
+        'stem_rust-Stemrust_severity' : estimate_rust(df['Type'],'stem','severity'),
+        'stem_rust-stemrust_host_plant_reaction' : 'na',
+        'leaf_rust-leafrust_incidence' : estimate_rust(df['Type'],'leaf','incidence'),
+        'leaf_rust-leafrust_severity' : estimate_rust(df['Type'],'leaf','severity'),
+        'leaf_rust-leafrust_host_plant_reaction' : 'na',
+        'yellow_rust-yellowrust_incidence' : estimate_rust(df['Type'],'yellow','incidence'),
+        'yellow_rust-yellowrust_severity' : estimate_rust(df['Type'],'yellow','severity'),
+        'yellow_rust-yellowrust_host_plant_reaction' : 'na',
+        'septoria-septoria_incidence' : 'na',
+        'septoria-septoria_severity' : 'na',
+        'other_diseases_group-other_diseases' : -999,
+        'score_diseases_count' : -999,
+        'SET-OF-score_diseases' : -999,
+        'samples_collected' : -999,
+        'samples_type' : -999,
+        'sample_size-number_stemrust_live' : -999,
+        'sample_size-number_stemrust_dead_dna' : -999,
+        'sample_size-number_yellowrust_live' : -999,
+        'sample_size-number_yellowrust_dead' : -999,
+        'sample_size-number_leafrust_live' : -999,
+        'sample_size-using_barcode' : -999,
+        'live_stemrust_samples_count' : -999,
+        'SET-OF-live_stemrust_samples' : -999,
+        'dead_stemrust_samples_count' : -999,
+        'SET-OF-dead_stemrust_samples' : -999,
+        'live_yellowrust_samples_count' : -999,
+        'SET-OF-live_yellowrust_samples' : -999,
+        'dead_yellowrust_samples_count' : -999,
+        'SET-OF-dead_yellowrust_samples' : -999,
+        'live_leafrust_samples_count' : -999,
+        'SET-OF-live_leafrust_samples' : -999,
+        'comment' : df['Link'],
+        'meta-instanceID' : -999,
+        'meta-instanceName' : -999,
+        'KEY' : -999}
+
+    df_out = DataFrame(output_dict)
+
+    return df_out
+
+email_msg = """Subject: ARRCC latest scraped media reports
+
+Here is an update of what is on the ARRCC media scraper platform.
+
+The latest entry is below. The full set for this season, auto-formatted for
+input to NAME source calcs in a basic way, is available at:
+pine:{0}
+
+Check all new webpages for validity and extra info (e.g. field area, variety),
+then edit and copy any relevant entries to:
+pine:/storage/app/EWS/SouthAsia/Workspace/SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv
+
+Then, check that the survey data processor succeeds with these new entries.
+
+Thanks, Jake
+
+{1}
+"""
+
+def send_email(output_fn,data_str):
+
+    import smtplib, ssl
+
+    msg = email_msg.format(output_fn,data_str)
+
+    with open('/storage/app/EWS/General/EWS-Coordinator/Cred_gmail.json','r') as f:
+        gmailConfig = json.load(f)
+
+    # Create a secure SSL context
+    context = ssl.create_default_context()
+
+    # gmail requires port 465 for SMTP_SSL (port 587 would be for SMTP with
+    # .starttls()), from https://realpython.com/python-send-email/#sending-a-plain-text-email
+    # Port 587 suits the typical python logging SMTPHandler, but that doesn't apply here.
+    port = 465 # gmailConfig['port']
+
+    with smtplib.SMTP_SSL(gmailConfig['host'], port, context=context) as server:
+        server.login(gmailConfig['user'], gmailConfig['pass'])
+        server.sendmail(gmailConfig['user'], maintainers, msg)
+
+    print('Message sent!')
+
+    return
+
+
+if __name__ == '__main__':
+
+    # 0) Prepare directory for this job's files
+
+    #job_dir = 'scrape_job/'
+
+    date_str = datetime.datetime.now().strftime('%Y%m%d_%H%M')
+    #output_dir = '/storage/app/EWS/General/EWS-Coordinator-Dev/scratch_SouthAsia'
+    output_dir = '/storage/app/EWS/SouthAsia/Workspace/'
+    job_dir = f"{output_dir}/REPORTS_{date_str}/"
+
+    Path(job_dir).mkdir(parents=True, exist_ok=False)
+
+    # 1) Get the latest copy of the news report data
+    get_news_reports(job_dir)
+
+    reports_in = read_news_reports(job_dir)
+
+    # 2) Reformat to match BGRI wheat rust survey data entries
+    # (making assumptions where necessary)
+    reports = reformat_news_reports(reports_in)
+
+    # 3) Filter to latest news reports
+    this_season_starts_str = '01 Dec 2020'
+    this_season_starts = datetime.datetime.strptime(this_season_starts_str,'%d %b %Y')
+
+    latest_reports = reports[reports['SubmissionDate']>=this_season_starts]
+
+    # TODO: Low priority: Determine differences from last reformatted set of news reports
+
+    # 4) Output to csv
+    output_fn = f"{job_dir}/latest_reports_as_surveys.csv"
+
+    # date format conforms to format used in SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv
+    latest_reports.to_csv(output_fn,index=False,date_format=fmt)
+
+    # 5) email any new reports to maintainers, so they can copy them into
+    # SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate
+    selected_columns = [
+        'SubmissionDate',
+        'site_information-field_area',
+        'site_information-growth_stage',
+        'survey_infromation-site_name',
+        'country_list',
+        'stem_rust-stemrust_incidence',
+        'stem_rust-Stemrust_severity',
+        'yellow_rust-yellowrust_incidence',
+        'yellow_rust-yellowrust_severity',
+        'leaf_rust-leafrust_incidence',
+        'leaf_rust-leafrust_severity',
+        'comment']
+
+    # remove pandas display limitation, so the full web address from the comment is shown
+    pd.set_option('display.max_colwidth', None)
+    latest_report_selection = str(latest_reports.iloc[-1,:].loc[selected_columns])
+
+    send_email(output_fn,latest_report_selection)
diff --git a/crontab.txt b/crontab.txt
index a64bc2a1c57326f3dda882c3e4f04223504ec99d..97fc148f54767f8a2ec9efbbe1a0ba3c647a032f 100644
--- a/crontab.txt
+++ b/crontab.txt
@@ -35,3 +35,6 @@
 # Environmental suitability analysis (Wednesday only)
 00 18,20,22 * * 3 /storage/app/EWS/General/EWS-Coordinator/run_Processor.sh -p Environment -c /storage/app/EWS/General/EWS-Coordinator/config_SouthAsia_an.json --islive
+
+# check for scraped media reports (Monday only)
+00 12 * * 1 /storage/app/EWS/General/EWS-Coordinator/run_GatherScrapedMediaReports.sh
\ No newline at end of file
diff --git a/run_GatherScrapedMediaReports.sh b/run_GatherScrapedMediaReports.sh
new file mode 100755
index 0000000000000000000000000000000000000000..82421b76f7c973f4ad28b7c60c6425ae6ca8d546
--- /dev/null
+++ b/run_GatherScrapedMediaReports.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# activate the conda environment of python modules so they can be imported
+source /storage/app/miniconda3/bin/activate /storage/app/EWS/General/EWS-python/py3EWSepi
+
+python /storage/app/EWS/General/EWS-Coordinator/GatherScrapedMediaReports.py "$@"
+
+# deactivate the conda environment (deactivate takes no environment argument)
+source /storage/app/miniconda3/bin/deactivate
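+
+# For a one-off manual run outside cron (assumption: the environment and
+# script paths above exist on the host), the wrapper can be invoked directly:
+#   bash /storage/app/EWS/General/EWS-Coordinator/run_GatherScrapedMediaReports.sh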