From 3afd34424316012221d54a35f5abb18efc62ce6d Mon Sep 17 00:00:00 2001
From: "Silas S. Brown" <ssb22@cam.ac.uk>
Date: Wed, 11 Jan 2017 11:09:45 +0000
Subject: [PATCH] Update Annotator Generator

git-svn-id: http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster@2684 29193198-4895-4776-b068-10539e920549
---
 annogen.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/annogen.py b/annogen.py
index 8105c22..f2d98d5 100755
--- a/annogen.py
+++ b/annogen.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-program_name = "Annotator Generator v0.61 (c) 2012-16 Silas S. Brown"
+program_name = "Annotator Generator v0.62 (c) 2012-17 Silas S. Brown"
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -234,9 +234,9 @@ parser.add_option("--time-estimate",
 
 parser.add_option("--single-core",
                   action="store_true",default=False,
-                  help="Use only one CPU core even when others are available. (If this option is not set, multiple cores are used if a 'futures' or 'mpi4py.futures' package is installed; this currently requires --checkpoint and is used only for large collocation checks in limited circumstances.  MPI is not currently likely to achieve much speed increase, but concurrent.futures is.)") # (limited circumstances: namely, words that occur in length-1 phrases)
+                  help="Use only one CPU core even when others are available. (If this option is not set, multiple cores are used if a 'futures' or 'mpi4py.futures' package is installed; this currently requires --checkpoint and is used only for large collocation checks in limited circumstances.)") # namely, words that occur in length-1 phrases
 
-main = (__name__ == "__main__")
+main = (__name__ == "__main__" and not os.environ.get("OMPI_COMM_WORLD_RANK","0").replace("0",""))
 if main: sys.stderr.write(program_name+"\n") # not sys.stdout: may or may not be showing --help (and anyway might want to process the help text for website etc)
 # else STILL parse options (if we're being imported for parallel processing)
 options, args = parser.parse_args()
@@ -2533,6 +2533,7 @@ def yarowsky_indicators(withAnnot_unistr,canBackground):
       if not distance: distance = ybytes_max
       yield negate,ret,distance
 def yarowsky_indicators_wrapped(withAnnot_unistr):
+    check_globals_are_set_up()
     return yarowsky_indicators(withAnnot_unistr,False).next()
 def getOkStarts(withAnnot_unistr):
     if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr]
@@ -2936,6 +2937,12 @@ def setup_other_globals():
         global markedUp_unichars
         if yarowsky_all: markedUp_unichars = None
         else: markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int)))
+def check_globals_are_set_up(): # for use during parallelism
+  try: corpus_unistr # if we fork()d, we may already have it
+  except NameError:
+    normalise() # should get corpus_unistr from checkpoint
+    generate_map() # similarly this should just be a read
+    setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine
 
 def analyse():
     accum = RulesAccumulator()
@@ -3001,7 +3008,7 @@ def test_manual_rules():
           if k not in precalc_sets: precalc_sets[k]=set()
       yb = []
       if not test_rule(l,yb).next() or len(yb):
-        sys.stderr.write("\nWARNING: Manual rule '%s' may contradict the examples" % (l.encode(terminal_charset),))
+        sys.stderr.write("\nWARNING: Manual rule '%s' may contradict the examples. " % (l.encode(terminal_charset),))
         global diagnose,diagnose_limit,ybytes
         od,odl,oy,diagnose,diagnose_limit,ybytes = diagnose,diagnose_limit,ybytes,markDown(l),0,ybytes_max
         test_rule(l,[]).next()
@@ -3365,13 +3372,6 @@ if main:
   try: rulesAndConds = analyse()
   finally: sys.stderr.write("\n") # so status line is not overwritten by 1st part of traceback on interrupt etc
   del _gp_cache
-else: # not main: set up corpus globals for parallel funcs:
-  try: corpus_unistr # if we fork()d, we may already have it
-  except NameError:
-    normalise() # should get corpus_unistr from checkpoint
-    generate_map() # similarly this should just be a read
-    setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine
-  setup_parallelism() # TODO: is this call really needed?
 
 if main:
  if c_filename: outfile = open(c_filename,"w")
-- 
GitLab