From 82d1737b73652f9c9f9f000f17a66f61fdaee56d Mon Sep 17 00:00:00 2001 From: "Silas S. Brown" <ssb22@cam.ac.uk> Date: Mon, 9 Dec 2019 08:20:40 +0000 Subject: [PATCH] Update Annotator Generator --- annogen.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/annogen.py b/annogen.py index 7541ace..3725ef8 100644 --- a/annogen.py +++ b/annogen.py @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -program_name = "Annotator Generator v0.6891 (c) 2012-19 Silas S. Brown" +program_name = "Annotator Generator v0.6892 (c) 2012-19 Silas S. Brown" # See http://ssb22.user.srcf.net/adjuster/annogen.html @@ -3897,7 +3897,8 @@ def normalise(): try: f=open_try_bz2(checkpoint+os.sep+'normalised','rb') corpus_unistr = f.read().decode('utf-8') - return sys.stderr.write("Normalised copy loaded\n") + sys.stderr.write("Normalised copy loaded\n") + return True # loaded from checkpoint except: # if re-generating 'normalised', will also need to regenerate 'map' and 'checkpoint' if present assert main, "normalise checkpoint not readable in non-main module" rm_f(checkpoint+os.sep+'map.bz2') ; rm_f(checkpoint+os.sep+'map') @@ -4256,7 +4257,7 @@ def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,force_negate): negate = None # not yet set stuffToCheck = [] if not force_negate: stuffToCheck.append((okStrs,pAppend,pCovered,unique_substrings(okStrs,markedUp_unichars,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))) # a generator and associated parameters for positive indicators - if force_negate or len(okStrs) > len(badStrs) or not okStrs: stuffToCheck.append((badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s)))) # and for negative indicators, if appropriate + if force_negate or 5*len(okStrs) > len(badStrs) or not okStrs: stuffToCheck.append((badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s)))) # and for negative indicators, if appropriate (changed in v0.6892: still check for negative indicators if len(okStrs) is similar to len(badStrs) even if not strictly greater, but don't bother if len(okStrs) is MUCH less) while stuffToCheck and negate==None: for i in range(len(stuffToCheck)): strs,append,covered,generator = stuffToCheck[i] @@ -5245,8 +5246,10 @@ if main and not compile_only: diagnose_write(diagnose+" is not present in the corpus, even before normalisation") suppress = True else: suppress = False - normalise() - if diagnose and not suppress and not diagnose in corpus_unistr: diagnose_write(diagnose+" was in the corpus before normalisation, but not after") # (if running from a checkpoint, might want to rm normalised and redo the diagnose) + loaded_from_checkpoint = normalise() + if diagnose and not suppress and not diagnose in corpus_unistr: + diagnose_write(diagnose+" was in the corpus before normalisation, but not after") + if loaded_from_checkpoint: diagnose_write("You might want to remove "+checkpoint+os.sep+'normalised* and redo the diagnose') if normalise_only: sys.exit() if priority_list: if os.path.exists(priority_list): -- GitLab