diff --git a/annogen.py b/annogen.py
index 77866aee22fc951cc4d8e22ce45394dfd006b196..123fa9d458a7961cf6d49b5b7b3ae5b17a3332d9 100755
--- a/annogen.py
+++ b/annogen.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-program_name = "Annotator Generator v0.62 (c) 2012-17 Silas S. Brown"
+program_name = "Annotator Generator v0.621 (c) 2012-17 Silas S. Brown"
 
 # See http://people.ds.cam.ac.uk/ssb22/adjuster/annogen.html
 
@@ -2895,7 +2895,7 @@ def generate_map():
     checkpoint_exit()
 
 def setup_parallelism():
-    if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus etc) TODO: document that checkpoint also affects this
+    if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus and map etc)
     import commands
     try:
         commands.getoutput(
@@ -2944,9 +2944,12 @@ def setup_other_globals():
     if yarowsky_all: markedUp_unichars = None
     else: markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int)))
 def check_globals_are_set_up(): # for use during parallelism
+    global corpus_unistr
     try: corpus_unistr # if we fork()d, we may already have it
     except NameError:
-        normalise() # should get corpus_unistr from checkpoint
+        normalise() # should get corpus_unistr from checkpoint,
+        try: corpus_unistr # unless we're NOT normalising,
+        except NameError: corpus_unistr = openfile(infile).read().decode(incode) # in which case we have to load the corpus from scratch (it won't be stdin)
         generate_map() # similarly this should just be a read
         setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine
 
@@ -3381,6 +3384,7 @@ if main:
     if diagnose and not suppress and not diagnose in corpus_unistr: diagnose_write(diagnose+" was in the corpus before normalisation, but not after") # (if running from a checkpoint, might want to rm normalised and redo the diagnose)
     generate_map() ; setup_other_globals()
     executor = setup_parallelism()
+    if executor and capitalisation and annot_whitespace and infile==sys.stdin: open_try_bz2(checkpoint+os.sep+'normalised','wb').write(corpus_unistr.encode('utf-8')) # normalise won't have done it and the other nodes will need it (TODO: unless we're doing concurrent.futures with fork)
     try: rulesAndConds = analyse()
     finally: sys.stderr.write("\n") # so status line is not overwritten by 1st part of traceback on interrupt etc
     del _gp_cache