diff --git a/annogen.py b/annogen.py index 8f39036b66ff8a0fed0c6d5e6816277c22ce8da0..4c44816095390539b977c7ea7843e9679161f15d 100755 --- a/annogen.py +++ b/annogen.py @@ -2500,7 +2500,7 @@ def yarowsky_indicators(withAnnot_unistr,canBackground): if all(x.end()-x.start()==llen for x in re.finditer(re.escape(mdStart)+("("+re.escape(mdEnd)+"((?!"+re.escape(mdStart)+").)*.?"+re.escape(mdStart)+")?").join(re.escape(c) for c in list(nonAnnot)),corpus_unistr)): if nonAnnot==diagnose: diagnose_write("%s is default by majority-case rule after checking for dangerous overlaps etc" % (withAnnot_unistr,)) yield True ; return - run_in_background = canBackground and len(okStarts) > 500 and executor # TODO: is this 500 threshold correct? + run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds. TODO: does mpi4py.futures have a different 'sweet spot' here? (low priority unless we can get MPI to outdo concurrent.futures in this application) may_take_time = canBackground and len(okStarts) > 1000 if may_take_time: sys.stderr.write("\nLarge collocation check (%s has %d matches + %s), %s.... \n" % (withAnnot_unistr.encode(terminal_charset,'replace'),len(okStarts),badInfo(badStarts,nonAnnot),cond(run_in_background,"backgrounding","could take some time"))) if run_in_background: