From 8001587be7c7bc985ab251a7590890e6b8def06a Mon Sep 17 00:00:00 2001 From: jws52 <jws52@cam.ac.uk> Date: Thu, 12 Jan 2023 17:05:23 +0000 Subject: [PATCH] feat: Optional email from Scraper And reports are sorted by date. And assumed disease prevalence is set to medium, not low. --- coordinator/ProcessorScraper.py | 85 +++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/coordinator/ProcessorScraper.py b/coordinator/ProcessorScraper.py index b4e1af5..11f3bf0 100644 --- a/coordinator/ProcessorScraper.py +++ b/coordinator/ProcessorScraper.py @@ -72,7 +72,14 @@ def read_news_reports(job_dir: str) -> DataFrame: fn = f"{job_dir}/data/NEWS REPORTS.csv" - df = read_csv(fn,index_col=0,header=0,parse_dates=['Date']) + dateparse = lambda x: datetime.datetime.strptime(x, '%d-%m-%y') + + df = read_csv( + fn, + index_col=0, + header=0, + parse_dates=['Date'], + date_parser=dateparse) return df @@ -91,8 +98,8 @@ def estimate_rust( any_disease = description.str.contains(disease) return_dict = { - 'incidence':'low', - 'severity':'10' + 'incidence':'medium', + 'severity':'30' } prevalence = where(any_disease,return_dict[return_type],'na') @@ -196,18 +203,22 @@ def guess_stage( return stage_series -def reformat_news_reports(df: DataFrame) -> DataFrame: +def reformat_news_reports(df_in: DataFrame) -> DataFrame: '''Reformats a dataframe of news reports to match BGRI wheat rust survey data entries (making assumptions where necessary). First checks input is as expected.''' #Check contents are as expected - cols = df.columns + cols = df_in.columns expected_cols = ['Lon','Lat','Date','Type','Link','Country','State','District'] for expected_col in expected_cols: assert expected_col in cols + # re-order dataframe, with newest entry last + df = df_in.copy() + df.sort_values('Date',ascending=True,inplace=True) + assumption_dict = { 'field_area' : 1} @@ -390,35 +401,39 @@ def process_in_job_media_scraper( index=False, date_format=FMT) - logger.info("5) email any new reports to maintainers, so they can copy into") - logger.info("SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate") - - selected_columns = [ - 'SubmissionDate', - 'site_information-field_area', - 'site_information-growth_stage', - 'survey_infromation-site_name', - 'country_list', - 'stem_rust-stemrust_incidence', - 'stem_rust-Stemrust_severity', - 'yellow_rust-yellowrust_incidence', - 'yellow_rust-yellowrust_severity', - 'leaf_rust-leafrust_incidence', - 'leaf_rust-leafrust_severity', - 'comment'] - - # remove pandas display limitation, so full web address is shown from comment - set_option('display.max_colwidth', None) - latest_report_selection = latest_reports.iloc[-1,:].loc[selected_columns].__str__() - - # get the email credentials file path from the environment variables - assert 'EMAIL_CRED' in os.environ - email_credential_fn = os.environ['EMAIL_CRED'] - assert os.path.exists(email_credential_fn) - - send_email( - output_fn, - latest_report_selection, - email_credential_fn = email_credential_fn) + is_sending_email = config['Scraper'].get('SendEmail',True) + + if is_sending_email == True: + + logger.info("5) email any new reports to maintainers, so they can copy into") + logger.info("SURVEYDATA_MANUAL/LIVE_SURVEYDATA_TOUSE.csv where appropriate") + + selected_columns = [ + 'SubmissionDate', + 'site_information-field_area', + 'site_information-growth_stage', + 'survey_infromation-site_name', + 'country_list', + 'stem_rust-stemrust_incidence', + 'stem_rust-Stemrust_severity', + 'yellow_rust-yellowrust_incidence', + 'yellow_rust-yellowrust_severity', + 'leaf_rust-leafrust_incidence', + 'leaf_rust-leafrust_severity', + 'comment'] + + # remove pandas display limitation, so full web address is shown from comment + set_option('display.max_colwidth', None) + latest_report_selection = latest_reports.iloc[-1,:].loc[selected_columns].__str__() + + # get the email credentials file path from the environment variables + assert 'EMAIL_CRED' in os.environ + email_credential_fn = os.environ['EMAIL_CRED'] + assert os.path.exists(email_credential_fn) + + send_email( + output_fn, + latest_report_selection, + email_credential_fn = email_credential_fn) return -- GitLab