From 9f19cef119fa67740a735a3c2ad90d7800b36dff Mon Sep 17 00:00:00 2001
From: jws52 <jws52@cam.ac.uk>
Date: Mon, 7 Nov 2022 14:49:40 +0000
Subject: [PATCH] fix: Check tar file completeness before download

Before this commit, the process_pre_job_server_download() function would
check if the necessary tar file exists on the remote server. But, because
it is written there by sftp, and sftp doesn't generate any lock file or
temp files, in-progress sftp writes look like they are available and the
job continues.

With this change, the job will only process if the tar file is completely
written.
---
 coordinator/ProcessorServer.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/coordinator/ProcessorServer.py b/coordinator/ProcessorServer.py
index cca3e76..c07349d 100644
--- a/coordinator/ProcessorServer.py
+++ b/coordinator/ProcessorServer.py
@@ -37,17 +37,28 @@ def process_pre_job_server_download(input_args: dict):
     file_path = Template(config[component]['ServerPathTemplate']).substitute(**config)
     file_name = Template(config[component]['InputFileTemplate']).substitute(**config)
 
-    logger.info(f"Checking for existence of {file_path}/{file_name}.tar.gz")
+    file_path_full = f"{file_path}/{file_name}.tar.gz"
+    logger.info(f"Checking for existence of {file_path_full}")
 
     timenow = datetime.datetime.now(tz=datetime.timezone.utc).time()
 
-    cmd_ssh = ["ssh","-i",config['ServerKey'],"-o","StrictHostKeyChecking=no",config['ServerName'],f"test -f {file_path}/{file_name}.tar.gz"]
+    # test whether the file exists (if not, returns error code 1)
+    # and test whether the tar file is complete (if not, error code is 2)
+    cmd_ssh = [
+        "ssh",
+        "-i",
+        config['ServerKey'],
+        "-o",
+        "StrictHostKeyChecking=no",
+        config['ServerName'],
+        f"test -f {file_path_full} && tar -tzf {file_path_full} >/dev/null"]
+
     description_short = 'subprocess_ssh'
-    description_long = f"Checking for existence of {file_path}/{file_name}.tar.gz"
+    description_long = f"Checking for existence of {file_path_full}"
 
     status = subprocess_and_log(cmd_ssh,description_short,description_long,check=False)
 
-    if status.returncode == 1:
+    if status.returncode > 0:
 
         # a time check in UTC. If it's late, raise warning, if very late, raise error
 
-- 
GitLab