  import sys ; aFormat = 'ruby'
  for a in sys.argv[1:]:
    if a.startswith("--"): aFormat=a[2:]
  if type("")==type(u""): sys.stdout.buffer.write(annotate(sys.stdin.buffer.read(),aFormat)) # Python 3
  else: sys.stdout.write(annotate(sys.stdin.read(),aFormat)) # Python 2
if __name__=="__main__": main()
""" # TODO: annotation-type option from command line in py

c_zlib = br"""static unsigned char *data=NULL;
static void init() {
  z_stream s; memset(&s,0,sizeof(s));
  s.next_in=origData; s.avail_in=%%ZLIBLEN%%;
  data=malloc(%%ORIGLEN%%); // TODO: check non-NULL
  s.next_out=data; s.avail_out=%%ORIGLEN%%;
  inflateInit(&s); inflate(&s, Z_NO_FLUSH); // TODO: check for memory and data-corruption errors
  inflateEnd(&s);
}
"""
c_datadrive = br"""
static unsigned char *dPtr; static int addrLen;

#include <stdlib.h>
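/* Bytecode format, as interpreted by readData()/topLevelMatch() below
   (a summary inferred from that code, which remains the authority):
     data[0]   = addrLen, the size in bytes of every absolute address
     data+1... = instructions, each starting with one opcode byte:
       0x80|n  : skip forward n (low 7 bits) bytes
       0..19   : short switchbyte: (opcode+1) candidate bytes, then 1-byte
                 relative jumps (none for the first candidate) + 1-byte default
       50      : jump to an addrLen-byte address
       51      : call the routine at an addrLen-byte address, then resume
       52      : return
       60      : switchbyte: count byte, candidate bytes, then addrLen-byte
                 target addresses plus a default address
       70      : s0()
       71 / 74 : copyBytes(n)                   (74 also returns)
       72 / 75 : o(n, annotation)               (75 also returns)
       73 / 76 : o2(n, annotation, title)       (76 also returns)
       80 / 81 : save / restore the input position
       90      : neartest: true-address, false-address, nearbytes, then
                 addresses of strings to test with near(); jumps to
                 true-address if any matches, else to false-address */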

static unsigned char * readAddr() {
  size_t i,addr=0;
  for (i=addrLen; i; i--) addr=(addr << 8) | *dPtr++;
  return data + addr;
}
static void readData() {
  POSTYPE *savedPositions = NULL;
  size_t numSavedPositions = 0;
  while(1) {
    unsigned char c = *dPtr++;
    if (c & 0x80) dPtr += (c&0x7F); // short relative forward jump (up to 128 bytes from addr after instruction)
    else if(c < 20) { // switchbyte with short jumps
      c++; // now c == nBytes
      unsigned char byte=(unsigned char)NEXTBYTE;
      int i;
      for (i=0; i<c; i++) if(byte==dPtr[i]) break;
      if(i) dPtr += dPtr[c+i-1]; dPtr += c+c; // relative from end of switch (after all bytes, 1-byte addresses (except 1st) and the 1-byte default address)
    } else switch(c) {
    case 50: /* jump */ dPtr = readAddr(); break;
    case 51: /* call */ {
      unsigned char *funcToCall=readAddr();
      unsigned char *retAddr = dPtr;
      dPtr = funcToCall; readData(); dPtr = retAddr;
      break; }
    case 52: /* return */
      if (savedPositions) free(savedPositions);
      return;
    case 60: /* switchbyte */ {
      int nBytes=(*dPtr++)+1, i;
      unsigned char byte=(unsigned char)NEXTBYTE;
      for (i=0; i<nBytes; i++) if(byte==dPtr[i]) break;
      dPtr += (nBytes + i * addrLen);
      dPtr = readAddr(); break; }
    case 70: s0(); break;
    case 71: case 74: /* copyBytes */ {
      int numBytes=*dPtr++;
      for(;numBytes;numBytes--)
        OutWriteByte(NEXT_COPY_BYTE);
      if(c==74) return; else break; }
    case 72: case 75: /* o */ {
      int numBytes=*dPtr++;
      char *annot = (char*)readAddr();
      o(numBytes,annot); if(c==75) return; else break; }
    case 73: case 76: /* o2 */ {
      int numBytes=*dPtr++;
      char *annot = (char*)readAddr();
      char *title = (char*)readAddr();
      o2(numBytes,annot,title);
      if(c==76) return; else break; }
    case 80: /* savepos */
      savedPositions=realloc(savedPositions,++numSavedPositions*sizeof(POSTYPE)); // TODO: check non-NULL?
      savedPositions[numSavedPositions-1]=THEPOS;
      break;
    case 81: /* restorepos */
      SETPOS(savedPositions[--numSavedPositions]);
      break;
    case 90: /* neartest */ {
      unsigned char *truePtr = readAddr();
      unsigned char *falsePtr = readAddr();
      setnear(*dPtr++); int found=0;
      while(dPtr < truePtr && dPtr < falsePtr) if(near((char*)readAddr())) { found = 1; break; }
      dPtr = found ? truePtr : falsePtr; break; }
      // default: TODO: error about corrupt data?
    }
  }
}
static void topLevelMatch() {
  addrLen = data[0];
  dPtr=data+1; readData();
}
"""

def splitWords(text,phrases=False):
    # split text into words, ignoring anything between markupStart and markupEnd
    # if phrases = True, instead of words, split on any non-whitespace char outside markupStart..markupEnd
    warnPhrases = phrases
    if phrases: it=re.finditer(phrasePattern,text)
    else: it=re.finditer(wordPattern,text)
    for i in it:
      y = i.group()
      if len(y) > 1000000 and warnPhrases:
        sys.stderr.write("WARNING: Your corpus needs more phrase delimiters!\nVery long phrases can take a LONG time to process.\n")
        warnPhrases = False
      yield y
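    # Illustration (assuming annogen's default ruby markup, i.e.
    # markupStart="<ruby><rb>", markupMid="</rb><rt>", markupEnd="</rt></ruby>"):
    #   text = '<ruby><rb>A</rb><rt>a</rt></ruby> <ruby><rb>B</rb><rt>b</rt></ruby>'
    #   splitWords(text)              -> each <ruby>...</ruby> unit separately
    #   splitWords(text,phrases=True) -> the whole whitespace-joined run as ONE phrase
    # (any non-whitespace text outside the markup acts as a phrase boundary)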

markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),flags=re.DOTALL)
wordPattern = re.escape(markupStart)+'.*?'+re.escape(markupEnd)
phrasePattern = re.compile(wordPattern+r'(\s*'+wordPattern+r')*',flags=re.DOTALL+re.UNICODE)
wordPattern = re.compile(wordPattern,flags=re.DOTALL)
wspPattern = re.compile(r"\s+",flags=re.UNICODE)

def annotationOnly(text):
    ret = []
    for w in re.finditer(markupPattern,text):
        if mreverse: ret.append(w.group(1))
        else: ret.append(w.group(2))
    return ' '.join(ret)

def markDown(text):
    # Return just the original text, without markup
    if mreverse: group=r"\2"
    else: group=r"\1"
    return re.sub(markupPattern,group,text)

def markUp(text,annotation):
  if mreverse: text,annotation = annotation,text
  return markupStart + text + markupMid + annotation + markupEnd
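  # Illustration (again assuming the default ruby markup, with mreverse unset):
  #   markUp(u"A",u"a")    == u"<ruby><rb>A</rb><rt>a</rt></ruby>"
  #   markDown(that)       == u"A"
  #   annotationOnly(that) == u"a"
  # with mreverse set, the roles of base text and annotation are swapped.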
    
def checkpoint_exit(doIt=1):
  if not checkpoint: return
  try: open(checkpoint+os.sep+"ExitASAP")
  except: return
  if doIt:
    assert main, "Only annogen's main module should call checkpoint_exit with doIt=1"
    os.remove(checkpoint+os.sep+"ExitASAP")
    sys.stderr.write("\nExitASAP found: exit\n")
    raise SystemExit
  else: return True
try: import cPickle as pickle
except:
  try: import pickle
  except: pickle = None
def read_checkpoint():
  t = pickle.Unpickler(open(checkpoint+os.sep+'checkpoint','rb')).load()
  sys.stderr.write("Checkpoint loaded from %d phrases\n" % t[0])
  return t
def write_checkpoint(t):
  pickle.Pickler(open(checkpoint+os.sep+'checkpoint-NEW','wb'),-1).dump(t) # better write to checkpoint-NEW, in case we reboot or have an OS-level "Out of memory" condition *while* checkpointing
  try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
  except OSError: # OS can't do it atomically?
    rm_f(checkpoint+os.sep+'checkpoint')
    try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
    except OSError: pass
  checkpoint_exit()

def status_update(phraseNo,numPhrases,wordsThisPhrase,nRules,phraseLastUpdate,lastUpdate,phraseLastCheckpoint,lastCheckpoint,coverP,nRej,startTime):
  phraseSec = (phraseNo-phraseLastUpdate)*1.0/(time.time()-lastUpdate)
  if phraseSec < 100:
    phraseSecS = "%.1f" % phraseSec
  else: phraseSecS = "%d" % int(phraseSec)
  progress = status_prefix + "%s phrase/sec (%d%%/#w=%d) rules=%d cover=%d%%" % (phraseSecS,int(100.0*phraseNo/numPhrases),wordsThisPhrase,nRules,coverP)
  if warn_yarowsky: progress += (" rej=%d" % nRej)
  if time_estimate:
    if phraseNo-phraseLastCheckpoint < 10: phraseMin = phraseSec*60 # current 'instantaneous' speed
    else: phraseMin = (phraseNo-phraseLastCheckpoint)*60/(time.time()-lastCheckpoint) # longer-term average
    minsLeft = (numPhrases-phraseNo)/phraseMin
    if minsLeft>60*24: progress += " %dd+" % int(minsLeft/60/24)
    elif minsLeft>60: progress += " %dh+" % int(minsLeft/60)
    elif minsLeft: progress += " %dmin+" % minsLeft
    # (including the + because this is liable to be an underestimate; see comment after the --time-estimate option)
    if len(progress) + 14 < screenWidth:
     progress += " (at %02d:%02d:%02d" % time.localtime()[3:6] # clock time: might be useful for checking if it seems stuck
     if len(progress) + 20 < screenWidth and not clear_eol == "  \r": # (being able to fit this in can be intermittent)
      elapsed = time.time() - startTime
      progress += ", analyse=%d:%02d:%02d" % (elapsed/3600,(elapsed%3600)/60,elapsed%60)
     progress += ")"
  sys.stderr.write(progress+clear_eol)
  sys.stderr.flush()

def normalise():
    if normalised_file == infile: return
    global capitalisation # might want to temp change it
    if (capitalisation or priority_list) and annot_whitespace: return
    global corpus_unistr
    if checkpoint:
      try:
        f=open_try_bz2(checkpoint+os.sep+'normalised','r')
        corpus_unistr = f.read().decode('utf-8')
        sys.stderr.write("Normalised copy loaded\n")
        return True # loaded from checkpoint
      except: # if re-generating 'normalised', will also need to regenerate 'map' and 'checkpoint' if present
        assert main, "normalise checkpoint not readable in non-main module"
        rm_f(checkpoint+os.sep+'map.bz2') ; rm_f(checkpoint+os.sep+'map')
        rm_f(checkpoint+os.sep+'checkpoint')
    else: assert main, "normalise called in non-main module and checkpoint isn't even set"
    sys.stderr.write("Normalising...");sys.stderr.flush()
    old_caps = capitalisation
    if priority_list: capitalisation = True # no point keeping it at False
    allWords = getAllWords()
    if removeSpace:
     corpus_unistr = re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),(markupEnd+markupStart).replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE) # so getOkStarts works consistently if corpus has some space-separated and some not
     corpus_unistr = re.sub(re.escape(markupStart)+'\s+',markupStart.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupMid),markupMid.replace('\\',r'\\'),re.sub(re.escape(markupMid)+'\s+',markupMid.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupEnd),markupEnd.replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE) # so we're tolerant of spurious whitespace between delimiters and markup (TODO: do this even if not removeSpace?)
     if not annot_whitespace:
      # normalise trailing hyphens e.g. from OCR'd scans:
      cu0 = corpus_unistr ; ff = 0
      for hTry in [1,2]:
        for w in allWords: # look for corpus words with a trailing hyphen to normalise
          if '-'+aoEnd in w:
            idx = w.index('-'+aoEnd)
            if w[:idx].endswith(aoStart) or w[:idx].endswith("-"): continue # ignore this one (a mess of some kind)
            if hTry==2: # ouch, this doesn't look good
              getBuf(sys.stderr).write((u" (can't normalise hyphens due to '%s') " % w).encode(terminal_charset,'replace')) ; sys.stderr.flush()
              corpus_unistr = cu0 ; break
            if mreverse: grp,mdG=r"-\1",r"\2"
            else: grp,mdG=r"-\2",r"\1"
            # TODO: batch up the following replacements by using something similar to Replacer but with a common destination regexp that takes groups from the 'w' entries as well.  (Low priority because don't typically get TOO many of these dangling hyphens in most corpuses.)
            corpus_unistr = re.sub(re.escape(w)+r"\s*"+re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),w.replace('\\',r'\\').replace('-'+aoEnd.replace('\\',r'\\'),grp+aoEnd.replace('\\',r'\\')).replace(mdEnd.replace('\\',r'\\'),mdG+mdEnd.replace('\\',r'\\')),corpus_unistr,flags=re.DOTALL+re.UNICODE)
            ff = 1
        if ff: allWords = getAllWords() # re-generate
      del cu0
    sys.stderr.write(":") ; sys.stderr.flush()
    KeywordProcessor = None
    # try: from flashtext.keyword import KeywordProcessor # pip install flashtext
    # except: pass
    # # commenting out the above because the fallback code below was ~20% faster than flashtext in my test (perhaps their benchmark figures rely on a non-empty non_word_boundaries set)
    if not KeywordProcessor:
     class KeywordProcessor:
      def __init__(self,*a,**k): self.dic = {}
      def __len__(self): return len(self.dic)
      def add_keyword(self,x,y):
        if diagnose and diagnose in x: diagnose_write("Replacer.add(%s,%s)" % (x,y))
        self.dic[x] = y
        if not (len(self.dic)%1500):
          sys.stderr.write('.') ; sys.stderr.flush()
      def replace_keywords(self,unistr):
        if not self.dic: return
        for exp in orRegexes(re.escape(k) for k in iterkeys(self.dic)):
          sys.stderr.write(';') ; sys.stderr.flush()
          unistr = re.sub(exp,lambda k:self.dic[k.group(0)],unistr)
        return unistr
    rpl = KeywordProcessor(case_sensitive=True)
    rpl.non_word_boundaries = set()
    cu_nosp = [None]
    def normWord(w):
      if '-' in w: hTry=set([w.replace('-','')]) # if not annot_whitespace, we'll replace any non-hyphenated 'run together' version by the version with the hyphen; that's often the sensible thing to do with pinyin etc (TODO more customisation??)
      else: hTry=None
      if not capitalisation:
        wl = w.lower() # (as long as it's all Unicode strings, .lower() and .upper() work with accents etc)
        if not w==wl and wl in allWords:
            # This word is NOT always capitalised.
            # Could be 'caps if at start of sentence'
            # (or title-case etc), but might also be
            # a corpus error, so check numbers.
            if allWords[wl]*5 < allWords[w] and allWords[wl] <= normalise_debug: # not really "Yarowsky" here in the normaliser, but similar principle
              typo_report("normalise-debug.txt","allow-caps-exceptions.txt",wl,(u"%s (%d instances) overrides %s (%d instances)" % (wl,allWords[wl],w,allWords[w])))
            # To simplify rules, make it always lower.
            w = wl
            if hTry: hTry.add(w.replace('-',''))
      if annot_whitespace or (keep_whitespace and markDown(w) in keep_whitespace): return w,None
      if not re.search(wspPattern,w): return w,hTry
      nowsp = re.sub(wspPattern,"",w)
      if not capitalisation and not nowsp.lower()==nowsp and nowsp.lower() in allWords: nowsp = nowsp.lower()
      if nowsp in allWords: return nowsp,hTry # varying whitespace in the annotation of a SINGLE word: probably simplest if we say the version without whitespace, if it exists, is 'canonical' (there might be more than one with-whitespace variant), at least until we can set the relative authority of the reference (TODO)
      ao,md = annotationOnly(w),markDown(w)
      aoS = ao.split()
      if len(md.split())==1 and len(md) <= 5 and len(aoS) <= len(md): # TODO: 5 configurable?  don't want different_ways_of_splitting to take too long
        # if not too many chars, try different ways of
        # assigning each word to chars, and see if any
        # of these exist in the corpus; if any does,
        # assume we have "ABC|a bc" <= "A|a BC|bc" type
        # situations - the latter shouldn't necessarily be
        # converted into the former, but the former might
        # be convertible into the latter to simplify rules
        if cu_nosp[0] == None:
          cu_nosp[0] = re.sub(wspPattern,"",corpus_unistr)
          if not capitalisation: cu_nosp[0] = cu_nosp[0].lower() # ignore capitalisation when searching for this
        if capitalisation: aoS2 = aoS
        else: aoS2 = [w0.lower() for w0 in aoS]
        for charBunches in different_ways_of_splitting(md,len(aoS)):
            mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS2)]
            multiword = "".join(mw)
            if multiword in cu_nosp[0]:
              # we're about to return a split version of the words, but we now have to pretend it went through the initial capitalisation logic that way (otherwise could get unnecessarily large collocation checks)
              if not capitalisation:
                mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS)] # the original capitalisation. for selective .lower()
                for i in range(len(mw)):
                  w0 = mw[i]
                  wl = w0.lower()
                  if not w0==wl and wl in allWords:
                    mw[i] = wl
              return "".join(mw),hTry
          # TODO: is there ANY time where we want multiword to take priority over the nowsp (no-whitespace) version above?  or even REPLACE multiword occurrences in the corpus with the 1-word nowsp version?? (must be VERY CAREFUL doing that)
      # TODO: anything else?
      return w,hTry
    for w in allWords: # normalise every distinct corpus word, queueing replacements in rpl
      w2,hTry = normWord(w)
      if hTry:
        hTry.add(w2.replace('-','')) # in case not already there
        for h in hTry:
          if h in allWords: rpl.add_keyword(h,w2)
      if not w==w2: rpl.add_keyword(w,w2)
    if len(rpl):
      corpus_unistr = rpl.replace_keywords(corpus_unistr)
    del rpl
    sys.stderr.write(" done\n")
    if normalised_file: openfile(normalised_file,'w').write(corpus_unistr.encode(incode))
    if checkpoint and capitalisation==old_caps: open_try_bz2(checkpoint+os.sep+'normalised','w').write(corpus_unistr.encode('utf-8'))
    capitalisation = old_caps
    checkpoint_exit()
def getAllWords():
  for phrase in splitWords(corpus_unistr,phrases=True):
    for w in splitWords(phrase): allWords[w]=allWords.setdefault(w,0)+1
  return allWords # do NOT cache (is called either side of the normaliser)
def orRegexes(escaped_keys):
  escaped_keys = list(escaped_keys) # don't just iterate
  try: yield re.compile('|'.join(escaped_keys))
  except OverflowError: # regex too big (e.g. default Python on Mac OS 10.7 i.e. Python 2.7.1 (r271:86832, Jul 31 2011, 19:30:53); probably some Windows versions also; does not affect Mac HomeBrew's Python 2.7.12)
    ek = escaped_keys[:int(len(escaped_keys)/2)]
    for r in orRegexes(ek): yield r
    ek = escaped_keys[len(ek):]
    for r in orRegexes(ek): yield r
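  # e.g. orRegexes([re.escape(w) for w in words]) normally yields a single
  # compiled regex "w1|w2|...";  on interpreters where compiling that one big
  # regex overflows, it recursively yields several smaller regexes that
  # together cover the same list of keys.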

def PairPriorities(markedDown_Phrases,existingFreqs={}):
    markedDown_Phrases = list(markedDown_Phrases)
    assert all(type(p)==list for p in markedDown_Phrases)
    mdwSet = set(existingFreqs.keys())
    for p in markedDown_Phrases: mdwSet.update(p)
    assert all(type(w)==unicode for w in mdwSet)
    votes = {} ; lastT = time.time()
    for pi in xrange(len(markedDown_Phrases)):
      if time.time() > lastT+2:
        sys.stderr.write("PairPriorities: %d%%%s" % (pi*100/len(markedDown_Phrases),clear_eol)) ; sys.stderr.flush()
        lastT = time.time()
      p=markedDown_Phrases[pi]
      for x in xrange(len(p)-1):
        a,b = p[x:x+2]
        combined = a+b
        for i in xrange(1,len(combined)):
            if i==len(a): continue
            elif i<len(a): prefer,over = a,combined[i:]
            else: prefer,over = b,combined[:i]
            if not (prefer in mdwSet and over in mdwSet and not prefer==over): continue
            k = tuple(sorted([prefer,over]))
            if k[0]==prefer: direction = 1
            else: direction = -1
            votes[k]=votes.get(k,0)+direction
            if diagnose in k: diagnose_write("Prefer %s over %s: %d vote from %s | %s" % (k+(direction,a,b)))
    sys.stderr.write("PairPriorities: done\n")
    del markedDown_Phrases
    global closure,gtThan,lessThan
    closure,gtThan, lessThan = set(),{},{}
    def addToClosure(a,b):
        # If a>b, then a>c for any b>c (c<b)
        # (actually b>=c but we don't have equality),
        # and c>b for any c>a.
        candidate = set([(a,b)]+[(a,c) for c in lessThan.get(b,[])]+[(c,b) for c in gtThan.get(a,[])])
        if closure==None: # no longer tracking closure
          if any(y in gtThan.get(x,{}) for (x,y) in candidate): return # contradiction
        else:
          if any((y,x) in closure for (x,y) in candidate):
            return # contradiction, use higher abs votes
          closure.update(candidate)
        for x,y in candidate:
            # x>y y<x, so y should be in lessThan[x]
            lessThan.setdefault(x,set()).add(y)
            gtThan.setdefault(y,set()).add(x)
        return True
    for _,v,a,b in reversed(sorted([(abs(v),v,a,b) for (a,b),v in votes.items()])):
        if v < 0: a,b = b,a
        r=addToClosure(a,b)
        if diagnose in (a,b):
          if r==None: r = False
          diagnose_write("addToClosure(%s,%s) [v=%d] returned %s" % (a,b,abs(v),repr(r)))
    trueClosure,closure = closure,None
    lastW,lastF,lastPriorW = set(),None,set()
    for _,w in reversed(sorted((f,w) for w,f in existingFreqs.items())): # highest frequency first
      if lastW and existingFreqs[w] < lastF:
        lastPriorW,lastW = lastW,set()
        lastF = existingFreqs[w]
      for W in lastPriorW: addToClosure(W,w)
      lastW.add(w)
    sys.stderr.write("%d words\n" % len(mdwSet))
    # Kahn 1962 - topological sort:
    no_incoming = set(w for w in mdwSet if not w in lessThan)
    del mdwSet ; mdwList = []
    while no_incoming:
      n = no_incoming.pop()
      mdwList.append(n)
      if not n in gtThan: continue
      for m in gtThan[n]:
        lessThan[m].remove(n)
        if not lessThan[m]:
          del lessThan[m] ; no_incoming.add(m)
      del gtThan[n]
    assert not lessThan and not gtThan, "graph has cycle(s), (%d,%d) edges remain" % (len(lessThan),len(gtThan))
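    # Worked micro-example of the above: if the only preference recorded was
    # addToClosure("a","b") (i.e. "a" outranks "b"), then lessThan=={"a":{"b"}}
    # and gtThan=={"b":{"a"}}, so "b" starts in no_incoming and mdwList comes
    # out as ["b","a"], lowest priority first, as assumed by the loop below.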
    tcA = set(w for w,_ in trueClosure)
    if diagnose: diagnose_write("%s in tcA %s" % (diagnose,diagnose in tcA))
    r = [] ; _cmpT,_cmpW=time.time(),False
    for w in mdwList: # lower priorities first
        if time.time() > _cmpT + 2:
          sys.stderr.write("Finalising: %d/%d%s" % (len(r),len(mdwList),clear_eol)) ; sys.stderr.flush()
          _cmpT=time.time()
          _cmpW=True
        if w in tcA:
          if w==diagnose:
            f0 = existingFreqs.get(w,0)
            found = False
            for i in xrange(len(r)):
              W,f = r[i]
              if (w,W) in trueClosure:
                found = True
                if 1+f > f0:
                  diagnose_write("Increasing f(%s) from %d to %d to outweigh %s (f=%d)" % (w,f0,1+f,W,f))
                  f0 = 1+f
                else: diagnose_write("f(%s)=%d already outweighs %d for %s" % (w,f0,f,W))
              elif (W,w) in trueClosure:
                found = True
                diagnose_write("Problem? %s (f=%d) before %s (f=%d)" % (W,f,w,f0))
            if not found: diagnose_write("No interactions with %s found among %d lower-priority words" % (w,len(r)))
            l = [f0-1]
          else: l = [r[i][1] for i in xrange(len(r)) if (w,r[i][0]) in trueClosure]
        else: l = []
        r.append((w,1+max([existingFreqs.get(w,0)-1]+l)))
    if _cmpW: sys.stderr.write("Finalising: done%s\n" % clear_eol)
    return sorted(r)

def skipToNext(thing): return "(?:(?!"+re.escape(thing)+").)*"+re.escape(thing) # not ".*?"+re.escape(thing) as it may absorb one 'thing' to match the rest of the regex later
if mreverse: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupMid,markupEnd,re.escape(markupEnd)+r'\s*'+re.escape(markupStart)+skipToNext(markupMid),markupStart,markupMid
else: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupStart,markupMid,re.escape(markupMid)+skipToNext(markupEnd)+r'\s*'+re.escape(markupStart),markupMid,markupEnd
mdSplitR="(?:"+mdSplitR+")?" # so can use it in .join(chars) to say "maybe word-break between these chars"

def different_ways_of_splitting(chars,numWords):
  if numWords > len(chars): return
  elif numWords == len(chars):
    yield list(chars) ; return
  elif numWords == 1:
    yield [chars] ; return
  spAt_try1 = int(len(chars) / numWords) + 1
  for spAt in list(range(spAt_try1,0,-1)) + list(range(spAt_try1+1, len(chars)-numWords+1)):
    for r in different_ways_of_splitting(chars[spAt:],numWords-1): yield [chars[:spAt]]+r
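  # e.g. list(different_ways_of_splitting(u"abcd",2)) ==
  #   [[u"abc",u"d"], [u"ab",u"cd"], [u"a",u"bcd"]]
  # (first-chunk sizes are tried from spAt_try1 downwards, then upwards)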

if type(u"")==type(""): # Python 3
  getNext = lambda gen: gen.__next__()
  iterkeys = lambda d: d.keys()
  itervalues = lambda d: d.values()
  iteritems = lambda d: d.items()
else: # Python 2
  getNext = lambda gen: gen.next()
  iterkeys = lambda d: d.iterkeys()
  itervalues = lambda d: d.itervalues()
  iteritems = lambda d: d.iteritems()

def yarowsky_indicators(withAnnot_unistr,canBackground):
    # yields True if rule always works (or in majority of cases with ymajority), or lists enough indicators to cover example instances and yields (negate, list, nbytes), or just list if empty.
    # (If too few indicators can be found, will list the ones it can, or empty if no clearly-distinguishable indicators can be found within ybytes of end of match.)
    # yield "backgrounded" = task has been backgrounded; getNext collects result
    nonAnnot=markDown(withAnnot_unistr)
    def unconditional_looks_ok(explain):
        # could we have this as an unconditional rule, with the other cases as exceptions that will be found first?  (NB this is not the same thing as a 'default-yes rule with exceptions', this is a rule with NO qualifying indicators either way)
        if len(nonAnnot)==1:
          if nonAnnot==diagnose: diagnose_write("%s is default by %s len=1 rule after removing irrelevant badStarts" % (withAnnot_unistr,explain))
          return True # should be safe, and should cover most "common short Chinese word with thousands of contexts" cases
        # If len 2 or more, it's risky because the correct solution could be to process just a fraction of the word now and the rest will become the start of a longer word, so we probably don't want it matching the whole lot by default: we'll want positive or negative indicators instead.
        # e.g. looking at rule AB, text ABC and correct segmentation is A BC, don't want it to 'greedily' match AB by default without positive indicators it should do so
        # Check for no "A BC" situations, i.e. can't find any possible SEQUENCE of rules that STARTS with ALL the characters in nonAnnot and that involves having them SPLIT across multiple words:
        # (The below might under-match if there's the appearance of a split rule but it actually has extra non-marked-up text in between, but it shouldn't over-match.)
        # TODO: if we can find the actual "A BC" sequences (instead of simply checking for their possibility as here), and if we can guarantee to make 'phrase'-length rules for all of them, then AB can still be the default.  This might be useful if okStarts is very much greater than badStarts.
        # (TODO: until the above is implemented, consider recommending --ymax-threshold=0, because, now that Yarowsky-like collocations can be negative, the 'following word' could just go in as a collocation with low ybytes)
        # TODO: also, if the exceptions to rule AB are always of the form "Z A B", and we can guarantee to generate a phrase rule for "Z A B", then AB can still be default.  (We should already catch this when the exceptions are "ZA B", but not when they are "Z A B", and --ymax-threshold=0 probably won't always help here, especially if Z==B; Mandarin "mei2you3" / "you3 mei2 you3" comes to mind)
        llen = len(mdStart)+len(nonAnnot)
        regex=re.compile(re.escape(mdStart) + mdSplitR.join(re.escape(c) for c in list(nonAnnot)))
        if all(x.end()-x.start()==llen for x in re.finditer(regex,corpus_unistr)):
          if nonAnnot==diagnose: diagnose_write("%s is default by %s rule after checking for dangerous overlaps etc" % (withAnnot_unistr,explain))
          return True
        if nonAnnot==diagnose: diagnose_write("%s cannot be default by %s due to %s" % (withAnnot_unistr,explain,', '.join(list(set(["'"+x.group()+"'" for x in re.finditer(regex,corpus_unistr) if not x.end()-x.start()==llen]))[:5])))
    if nonAnnot in yPriorityDic: # TODO: enforce len==1 ?
        if yPriorityDic[nonAnnot] == withAnnot_unistr:
            # we want this case to be the default
            if len(withAnnot_unistr)==1:
                if nonAnnot==diagnose: diagnose_write("ref-pri forces %s" % (withAnnot_unistr,))
                yield True ; return
            else:
                if nonAnnot==diagnose: diagnose_write("ref-pri wants %s by default: finding negative indicators only" % (withAnnot_unistr,))
                can_be_default = "must"
                # might not even need to get okStarts, etc
                if unconditional_looks_ok("ref-pri"):
                  yield True ; return
        else:
          if nonAnnot==diagnose: diagnose_write("ref-pri forbids default %s" % (withAnnot_unistr,))
          can_be_default = False # another is default, don't make this one default even if it occurs more
    else: can_be_default = True
    # First, find positions in corpus_markedDown which match withAnnot_unistr in corpus_unistr
    okStarts = getOkStarts(withAnnot_unistr)
    # now check for corpus_markedDown matches that *don't* have withAnnot_unistr
    badStarts = getBadStarts(nonAnnot,okStarts)
    if not badStarts:
      if nonAnnot==diagnose: diagnose_write("%s has no badStarts" % (withAnnot_unistr,))
      yield True ; return # rule always works, no Yarowsky-like indicators needed
    if can_be_default and len(okStarts) > len(badStarts) and len(nonAnnot)==1:
      if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule" % (withAnnot_unistr,))
      yield True ; return # duplicate of code below (can test for this case early before reducing-down badStarts)
    badStarts = getReallyBadStarts(badStarts,nonAnnot) # see its comments (ignore some badStarts)
    if not badStarts:
      if nonAnnot==diagnose: diagnose_write("%s has only probably-irrelevant badStarts" % (withAnnot_unistr,))
      yield True ; return
    # Now, if it's right more often than not:
    if can_be_default==True and len(okStarts) > len(badStarts) and unconditional_looks_ok("majority-case"): # (if can_be_default=="must", we have already checked for unconditional_looks_ok() above before computing okStarts and badStarts)
        yield True ; return
    run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds.  TODO: does mpi4py.futures have a different 'sweet spot' here? (low priority unless we can get MPI to outdo concurrent.futures in this application)
    may_take_time = canBackground and len(okStarts) > 1000
    if may_take_time:
      getBuf(sys.stderr).write((u"\nLarge collocation check (%s has %d matches + %s), %s....  \n" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot),cond(run_in_background,"backgrounding","could take some time"))).encode(terminal_charset,'replace'))
      if len(badStarts) <= yarowsky_debug: typo_report("yarowsky-debug.txt","allow-exceptions.txt",withAnnot_unistr,(u"%s has %d matches + %s" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot,False))))
    if run_in_background:
      job = executor.submit(yarowsky_indicators_wrapped,withAnnot_unistr) # recalculate the above on the other CPU in preference to passing, as memory might not be shared
      yield "backgrounded" ; yield job.result() ; return
    if ybytes_max > ybytes and (not ymax_threshold or len(nonAnnot) <= ymax_threshold):
      retList = [] ; append=retList.append
      for nbytes in range(ybytes,ybytes_max+1,ybytes_step):
        negate,ret,covered,toCover = tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must",nbytes==ybytes_max)
        if covered==toCover and len(ret)==1:
          if may_take_time: sys.stderr.write(" - using 1 indicator, negate=%s\n" % repr(negate))
          yield (negate,ret,nbytes) ; return # a single indicator that covers everything will be better than anything else we'll find
        append((-int(covered*100/toCover),len(ret),nbytes,negate,toCover,ret)) # (1st 4 of these are the sort keys: maximum coverage to nearest 1%, THEN minimum num indicators for the same coverage, THEN minimum nbytes (TODO: problems of very large nbytes might outweigh having more indicators; break if found 100% coverage by N?), THEN avoid negate)
        # TODO: try finding an OR-combination of indicators at *different* proximity lengths ?
      retList.sort()
      if nonAnnot==diagnose: diagnose_write("Best coverage is %d%% of %d" % (-retList[0][0],retList[0][-2]))
      negate,ret = retList[0][-3],retList[0][-1]
      distance = retList[0][2]
    else:
      negate,ret = tryNBytes(ybytes_max,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must")[:2]
      if ybytes < ybytes_max: distance = ybytes_max
      else: distance = None # all the same anyway
    if not ret and warn_yarowsky: getBuf(sys.stderr).write((u"Couldn't find ANY Yarowsky-like indicators for %s   \n" % withAnnot_unistr).encode(terminal_charset,'replace')) # (if nonAnnot==diagnose, this'll be reported by tryNBytes below)
    # TODO: if partially but not completely covered, shouldn't entirely count the word as 'covered' in analyse()
    elif ret and may_take_time: sys.stderr.write(" - using %d indicators, negate=%s\n" % (len(ret),repr(negate)))
    if not ret or (not distance and not negate):
      yield ret
    else:
      if not distance: distance = ybytes_max
      yield negate,ret,distance
typo_data = {}
def typo_report(debugFile,exceptionFile,withAnnot_unistr,msg_unistr):
  if not exceptionFile in typo_data:
    try: typo_data[exceptionFile]=set(splitWords(openfile(exceptionFile).read().decode(terminal_charset)))
    except IOError: typo_data[exceptionFile]=set()
  if withAnnot_unistr not in typo_data[exceptionFile]:
    if not debugFile in typo_data:
      typo_data[debugFile] = openfile(debugFile,'w')
      getBuf(sys.stderr).write(bold_on+"Writing to "+debugFile+bold_off+"\n")
      typo_data[debugFile].write("Put any of the following first-of-line words into allow-exceptions.txt to avoid being alerted here next time.\n\n")
    typo_data[debugFile].write((msg_unistr+u"\n").encode(terminal_charset,'replace'))
    typo_data[debugFile].flush() # in case interrupted
def yarowsky_indicators_wrapped(withAnnot_unistr):
    check_globals_are_set_up()
    return getNext(yarowsky_indicators(withAnnot_unistr,False))
def getOkStarts(withAnnot_unistr):
    if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr]
    walen = len(withAnnot_unistr)
    return set(x for x in precalc_sets[getNext(splitWords(withAnnot_unistr))] if corpus_unistr[m2c_map[x]:m2c_map[x]+walen]==withAnnot_unistr)
def getBadStarts(nonAnnot,okStarts):
  r = [] ; append=r.append
  l=len(nonAnnot)
  k = nonAnnot[:2]
  if k in bigramCache:
    for i in bigramCache[k]:
      if not i in okStarts and corpus_markedDown[i:i+l]==nonAnnot: append(i)
    return r
  find = corpus_markedDown.find
  i = find(nonAnnot)
  while i != -1:
    if not i in okStarts: append(i)
    i = find(nonAnnot,i+l)
  return r
def getReallyBadStarts(badStarts,nonAnnot):
    # Some of the badStarts can be ignored on the grounds that they should be picked up by other rules first: any where the nonAnnot match does not start at the start of a word (the rule matching the word starting earlier should get there first), and any where it starts at the start of a word that is longer than its own first word (the longest-first ordering should take care of this).  So keep only the ones where it starts at the start of a word and that word is no longer than len(nonAnnot).
    reallyBadStarts = [] ; append=reallyBadStarts.append
    nonAnnotLen = len(mdStart+nonAnnot+mdEnd)
    theRe = re.compile(re.escape(mdStart+nonAnnot[0])+".*?"+re.escape(mdEnd),flags=re.DOTALL)
    for b in badStarts:
      try: s = m2c_map[b]
      except KeyError: continue # it wasn't the start of a word (only start positions are in that map)
      m=theRe.search(corpus_unistr, s) # will either start at s, or after it if mreverse
      assert m, "m2c_map error? "+repr(nonAnnot[0])+" "+repr(b)+"->"+repr(s)+" not found ( "+repr(corpus_markedDown[b:b+25])+"... -> "+repr(corpus_unistr[s:s+50])+"...)"
      s,e = m.start(),m.end()
      if e-s > nonAnnotLen: continue # this word is too long, should be matched by a longer rule 1st
      append(b) # to reallyBadStarts
    return reallyBadStarts
def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,force_negate,try_harder=True):
    # try to find either positive or negative Yarowsky-like indicators, whichever gives a smaller set (or only negative ones if force_negate, used by ref_pri).  Negative indicators might be useful if there are many matches and only a few special exceptions.  (If not force_negate, then negative indicators are used only if they cover 100% of the exceptions; see below re negate==None)
    def bytesAround(start): return within_Nbytes(start+len(nonAnnot),nbytes)
    okStrs=list(set(bytesAround(s) for s in okStarts))
    badStrs=list(set(bytesAround(s) for s in badStarts))
    pOmit = unichr(1).join(badStrs) # omit anything that occurs in this string from +ve indicators
    nOmit = unichr(1).join(okStrs) # ditto for -ve indicators
    pCovered=[False]*len(okStrs)
    nCovered=[False]*len(badStrs)
    n2Covered=[False]*len(badStrs)
    pRet = [] ; pAppend=pRet.append
    nRet = [] ; nAppend=nRet.append
    n2Ret = [] ; nAppend2 = n2Ret.append
    negate = None # not yet set
    stuffToCheck = [] ; stuffChecked = []
    if not force_negate:
      l = []
      stuffChecked.append((l,"",pRet,pCovered))
      stuffToCheck.append((l,okStrs,pAppend,pCovered,unique_substrings(okStrs,markedUp_unichars,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))) # a generator and associated parameters for positive indicators
    diagnose_extra = []
    if force_negate or 5*len(okStrs) > len(badStrs) or not okStrs: # and for negative indicators, if appropriate: (changed in v0.6892: still check for negative indicators if len(okStrs) is similar to len(badStrs) even if not strictly greater, but don't bother if len(okStrs) is MUCH less)
      l = []
      stuffChecked.append((l,"negative",nRet,nCovered))
      stuffToCheck.append((l,badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s))))
      if try_harder and okStrs and not force_negate:
        l = [] ; stuffChecked.append((l,"overmatch-negative",n2Ret,n2Covered))
        stuffToCheck.append((l,badStrs,nAppend2,n2Covered,unique_substrings(badStrs,markedUp_unichars,lambda txt:False,lambda txt:(sum(1 for s in badStrs if txt in s),-sum(1 for s in okStrs if txt in s))))) # a harder try to find negative indicators (added in v0.6896): allow over-matching (equivalent to under-matching positive indicators) if it's the only way to get all badStrs covered; may be useful if the word can occur in isolation
    elif nonAnnot==diagnose: diagnose_extra.append("Not checking for negative indicators as 5*%d>%d=%s." % (len(okStrs),len(badStrs),repr(5*len(okStrs)>len(badStrs))))
    while stuffToCheck and negate==None:
      for i in range(len(stuffToCheck)):
        l,strs,append,covered,generator = stuffToCheck[i]
        try: indicator = getNext(generator)
        except StopIteration:
          del stuffToCheck[i] ; break
        found = True ; cChanged = False
        for j in xrange(len(strs)):
          if not covered[j] and indicator in strs[j]:
            covered[j]=cChanged=True
        if cChanged:
         append(indicator)
         if not l: l.append(True)
         if all(covered):
          if append==pAppend: negate=False
          elif append==nAppend: negate=True # negate with no overmatch allowed found all the exceptions, so use it (don't use it if it doesn't find ALL the exceptions, since we don't ever want an as-if 'overmatch positive' i.e. misidentifying a word/phrase in a place where the corpus explicitly DOESN'T have it, unless force_negate see comment below)
          else: # append==nAppend2 (negate with overmatch allowed): we managed to get all exceptions with overmatch-negative, but how much damage did our overmatching do to the NON-exceptions?
            fxCover = [True]*len(okStrs)
            for indicator in n2Ret:
              for i in xrange(len(okStrs)):
                if fxCover[i] and indicator in okStrs[i]:
                  # a negative indicator 'misfires' here, resulting in this okStr NOT being identified as 'ok'
                  fxCover[i] = False
            if sum(1 for x in fxCover if x) >= sum(1 for x in pCovered if x): negate="harder"
            else: diagnose_extra.append("Overmatch-negate got worse actual coverage than partial-positive.") # so don't set negate="harder", but we might still force_negate to set negate=True below
          break
    # and if negate==None AFTER this loop, didn't get all(pCovered) OR all(nCovered), in which case we fall back to negate=False (unless force_negate).  In other words, negative indicators normally have to cover ALL non-occurrences to be passed, whereas positive indicators just have to cover SOME.  This is in keeping with the idea of 'under-match is better than over-match' (because an under-matching negative indicator is like an over-matching positive one)
    if force_negate: negate = True
    if negate==True: ret,covered = nRet,nCovered
    elif negate=="harder": ret,covered = n2Ret,n2Covered
    else: ret,covered = pRet,pCovered
    if nonAnnot==diagnose:
      def report(actuallyChecked,negate,ret,covered):
        if not actuallyChecked: return ""
        if negate: indicators = negate+" indicators "
        else: indicators = "indicators "
        if ret:
          if len(ret) > 30: indicators=str(len(ret))+" "+indicators # +'/'.join(ret[:30]+['...'])
          else: indicators += '/'.join(ret)
        else: indicators = "no "+indicators
        if all(covered): notCovered = ""
        else:
          if negate: strs = badStrs
          else: strs = okStrs
          notCovered = [strs[i] for i in xrange(len(covered)) if not covered[i]]
          if len(notCovered) > 10: notCovered = notCovered[:10]+["..."]
          notCovered = ", not "+'/'.join(notCovered).replace('\n',"\\n")
        if negate=="overmatch-negative":
          overmatch=[s for s in okStrs if any(i in s for i in n2Ret)]
          if len(overmatch) > 10: overmatch = overmatch[:10]+["..."]
          if overmatch: notCovered += ", overmatch "+"/".join(overmatch).replace('\n',"\\n")
        return "%s (cover=%d/%d%s)" % (indicators,sum(1 for x in covered if x),len(covered),notCovered)
      if len(pOmit) > 200: pOmit = pOmit[:200]+"..."
      diagnose_extra = " ".join(diagnose_extra)
      if diagnose_extra: diagnose_extra=" "+diagnose_extra
      rr = ", ".join(r for r in [report(*i) for i in stuffChecked] if r)
      if not rr: rr = "nothing"
      diagnose_write("tryNBytes(%d) on %s (avoiding '%s') found %s%s" % (nbytes,withAnnot_unistr,pOmit.replace(unichr(1),'/').replace('\n',"\\n"),rr,diagnose_extra))
    return negate,ret,sum(1 for x in covered if x),len(covered)

def cond(a,b,c):
  if a: return b
  else: return c

def badInfo(badStarts,nonAnnot,for_tty=True):
  ret = u"%d false positive" % len(badStarts)
  if not len(badStarts)==1: ret += "s"
  if len(badStarts) > yarowsky_debug: return ret
  for wordStart in badStarts:
   wordEnd = wordStart + len(nonAnnot)
   contextStart,contextEnd=max(0,wordStart-5),wordEnd+5
   toRead = corpus_markedDown
   # but can we report it from the original corpus_unistr?
   if wordStart in m2c_map and wordEnd in m2c_map:
    toRead = corpus_unistr
    wordStart,wordEnd = m2c_map[wordStart],m2c_map[wordEnd]
    newCStart,newCEnd = contextStart,contextEnd
    while newCStart not in m2c_map and newCStart >= contextStart-5: newCStart-=1
    while newCEnd not in m2c_map and newCEnd<contextEnd+5: newCEnd+=1
    if newCStart in m2c_map: contextStart = m2c_map[newCStart]
    else: contextStart = max(0,wordStart - 15) # This might cut across markup, but better that than failing to report the original corpus and making it look like the words might not have "lined up" when actually they did.  Might also just cut into surrounding non-markup text (if the above loop simply couldn't find anything near enough because such text was in the way).
    if newCEnd in m2c_map: contextEnd = m2c_map[newCEnd]
    else: contextEnd = wordEnd + 15 # ditto
   if for_tty: ret += (u" (%s%s%s%s%s)" % (toRead[contextStart:wordStart],reverse_on,toRead[wordStart:wordEnd],reverse_off,toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r")
   else: ret += (u" (%s <<%s>> %s)" % (toRead[contextStart:wordStart],toRead[wordStart:wordEnd],toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r")
  return ret

def unique_substrings(texts,allowedChars,omitFunc,valueFunc):
    # yield unique substrings of texts, in increasing length, with equal lengths sorted by highest score returned by valueFunc, and omitting any where omitFunc is true, or that uses any character not in allowedChars (allowedChars==None means all allowed)
    if allowedChars:
        # remove non-allowedChars from texts, splitting into smaller strings as necessary
        texts2 = [] ; append=texts2.append
        for text in texts:
            start = 0
            for i in xrange(len(text)):
                if not text[i] in allowedChars:
                    if i>start: append(text[start:i])
                    start=i+1
            if start<len(text): append(text[start:])
        texts=texts2
    if not texts: return
    length=1 ; maxlen = max(len(t) for t in texts)
    while length <= maxlen:
        ret=set()
        # sys.stderr.write("Finding (l=%d)... " % len(texts))
        for text in texts: ret.update(text[s:s+length] for s in xrange(len(text)-length+1))
        l=[(valueFunc(k),k) for k in ret if not omitFunc(k)]
        # if length == ybytes_max and not l: sys.stderr.write("Debugger: omitFunc was true for all %s\n" % repr(ret))
        l.sort() ; l.reverse()
        # sys.stderr.write("%d of length %d\n" % (len(l),length))
        for v,k in l: yield k
        length += 1

def within_Nbytes(matchEndPos,nbytes):
    # return the Unicode characters within nbytes of matchEndPos, assuming the encoding will be outcode.  Used for the Yarowsky-like functions.
    # Assumes multibyte codes are self-synchronizing, i.e. if you start in the middle of a multibyte sequence, the first valid character will be the start of the next sequence, ok for utf-8 but TODO might not be the case for some codes
    return corpus_markedDown[max(0,matchEndPos-nbytes):matchEndPos].encode(outcode)[-nbytes:].decode(outcode,'ignore')+corpus_markedDown[matchEndPos:matchEndPos+nbytes].encode(outcode)[:nbytes].decode(outcode,'ignore')
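    # e.g. with outcode="utf-8" and a corpus of 3-byte CJK characters,
    # within_Nbytes(pos,9) returns roughly the 3 characters before pos plus the
    # 3 after it; the encode/slice/decode-with-'ignore' steps trim any partial
    # multibyte sequence at the cut points instead of returning mojibake.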

def test_rule(withAnnot_unistr,yBytesRet,canBackground=None):
    # Tests to see if the rule withAnnot_unistr is
    # ALWAYS right in the examples, i.e.
    # the number of occurrences of its marked-down text
    # in the continuous marked-down string should be
    # EXACTLY equal to the number of occurrences of the
    # marked-up version.
    # (If we deal only in rules that ALWAYS work, we can
    # build them up incrementally without "cross-talk")
    # yield "backgrounded" = task has been backgrounded; getNext collects result (nb we default to NOT canBackground, as test_rule is called from several places of which ONE can handle backgrounding)
    if primitive: yield True
    elif ybytes:
        # Doesn't have to be always right, but put the indicators in yBytesRet
        ybrG = yarowsky_indicators(withAnnot_unistr,canBackground)
        ybr = getNext(ybrG)
        if ybr == "backgrounded":
          yield ybr ; ybr = getNext(ybrG)
        if ybr==True or not ybr:
          yield ybr ; return
        yBytesRet.append(ybr) # (negate, list of indicators, nbytes)
        yield True
    else: # non-ybytes version: accept rule only if it exactly matches the corpus
      phrase = markDown(withAnnot_unistr)
      ret = corpus_markedDown.count(phrase) == len(getOkStarts(withAnnot_unistr))
      if diagnose and diagnose==phrase:
        diagnose_write("occurrences(%s)==occurrences(%s) = %s" % (phrase,withAnnot_unistr,ret))
      yield ret

def all_possible_rules(words,covered):
    # Iterate over ALL possible rules derived from the
    # word sequence (don't just "find the shortest context
    # that predicts each word" because that can have
    # trouble with overlaps; need to check them all and
    # stop when we've got enough to reproduce the example)
    # As optimisation, avoids returning rules for which
    # all(covered) over that rule's range
    if max_words: maxRuleLen = min(len(words),max_words)
    else: maxRuleLen = len(words)
    for ruleLen in range(1,maxRuleLen+1): # (sort by len)
        for wStart in range(len(words)-ruleLen+1):
          if not all(covered[wStart:wStart+ruleLen]):
            yield words[wStart:wStart+ruleLen]
            # caller join()s before adding to rules dict
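    # e.g. all_possible_rules([u"A",u"B",u"C"],[False]*3) yields, in order,
    # [u"A"],[u"B"],[u"C"],[u"A",u"B"],[u"B",u"C"],[u"A",u"B",u"C"]
    # (shortest rules first, assuming max_words does not cap the length;
    # sub-ranges whose words are already all covered are skipped)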

def checkCoverage(ruleAsWordlist,words,coveredFlags):
    # Updates coveredFlags and returns True if any changes
    # (if False, the new rule is redundant).
    # Don't worry about ybytes - assume the Yarowsky-like
    # indicators have been calculated correctly across the
    # whole text so we don't need to re-check them now.
    assert type(ruleAsWordlist)==type(words)==list
    try: start = words.index(ruleAsWordlist[0])
    except ValueError: return False
    ln = len(ruleAsWordlist)
    changedFlags = False
    while start <= len(words)-ln:
        if words[start:start+ln] == ruleAsWordlist:
            if not all(coveredFlags[start:start+ln]):
                coveredFlags[start:start+ln]=[True]*ln
                changedFlags = True
            start += ln
        else:
            try: start = words.index(ruleAsWordlist[0],start+1)
            except ValueError: break
    return changedFlags
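    # e.g. with words=[u"A",u"B",u"C",u"D"] and coveredFlags=[False]*4,
    # checkCoverage([u"B",u"C"],words,coveredFlags) returns True and leaves
    # coveredFlags==[False,True,True,False]; calling it again with the same
    # arguments returns False because nothing new gets covered.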

def wspJoin(l):
  if removeSpace: return "".join(l)
  else: return " ".join(l)

def potentially_bad_overlap(rulesAsWordlists,newRuleAsWords):
    # Allow overlaps only if rule(s) being overlapped are
    # entirely included within newRule.  Otherwise could
    # get problems generating closures of overlaps.
    # (If newRule not allowed, caller to try a longer one)
    # Additionally, if allow_overlaps, allow ANY overlap as
    # long as it's not found in the marked-down text.
    if len(newRuleAsWords)==1 or primitive or ybytes: return False
    for ruleAsWordlist in rulesAsWordlists:
        if len(ruleAsWordlist)==1: continue
        if not len(ruleAsWordlist)==len(newRuleAsWords) and longerStartsOrEndsWithTheShorter(ruleAsWordlist,newRuleAsWords): continue
        for overlapSize in range(1,min(len(x) for x in [newRuleAsWords,ruleAsWordlist])):
            if not (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] or newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize]): continue
            if not allow_overlaps: return True
            # Test to see if the examples "allow" this potentially-bad overlap
            def overlapOK(rAW): return not markDown(wspJoin(rAW)) in corpus_markedDown
            if (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] and not overlapOK(ruleAsWordlist[:-overlapSize]+newRuleAsWords)) or (newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize] and not overlapOK(newRuleAsWords[:-overlapSize]+ruleAsWordlist)): return True
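    # e.g. if [u"B",u"C"] is already a rule, a proposed rule [u"A",u"B",u"C"] is allowed
    # (the old rule is wholly contained at one end of the new one), but [u"C",u"D"] is a
    # potentially bad overlap; with allow_overlaps set it is rejected only if the
    # marked-down text of "B C D" actually occurs in the corpus.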

def longerStartsOrEndsWithTheShorter(l1,l2):
    if len(l1) > len(l2): l1,l2 = l2,l1
    return l2[:len(l1)]==l1 or l2[-len(l1):]==l1

class RulesAccumulator:
  def __init__(self):
    self.rules = {}
    self.rulesAsWordlists_By1stWord = {} # starting word -> list of possible rules (as wordlists) that might apply
    self.rulesAsWordlists = list() # all rules as words (list of lists) (used if not ybytes, TODO: integrate with rulesAsWordlists_By1stWord?)
    self.rejectedRules = set()
    self.seenPhrases = set() # de-duplicate, might speed up
    self.amend_rules = False
    if rulesFile: self.load()
  def save(self):
    sys.stderr.write("\nPickling rules to %s... " % rulesFile) ; sys.stderr.flush()
    f = openfile(rulesFile,'w')
    pickle.Pickler(f,-1).dump((self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases))
    # (don't save self.rejectedRules, there might be better clues next time)
    f.close() ; sys.stderr.write("done")
    sys.stderr.flush()
  def load(self):
    if not os.path.isfile(rulesFile):
      sys.stderr.write("%s does not exist, starting with blank rules\n" % rulesFile)
      return
    sys.stderr.write("Unpickling rules from %s... " % rulesFile) ; sys.stderr.flush()
    f = openfile(rulesFile)
    self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases = pickle.Unpickler(f).load()
    sys.stderr.write("done\n")
    self.amend_rules = True
    self.newRules = set()
  def remove_old_rules(self,words): # for incremental runs - removes previously-discovered rules that would have been suggested by this new phrase but that no longer 'work' with the rest of the corpus due to alterations elsewhere.  DOES NOT remove old rules that are not suggested by any phrase in the corpus because the phrases that suggested them have been removed or changed (TODO: might want an option for that, although fundamentally you shouldn't be relying on incremental runs if you're making a lot of changes to the corpus)
    for w in set(words):
      rulesAsWordlists = self.rulesAsWordlists_By1stWord.get(w,[])
      i=0
      while i<len(rulesAsWordlists):
        if max_words and len(rulesAsWordlists[i])>max_words:
          i += 1 ; continue # better leave that one alone if we're not reconsidering rules that long (e.g. running again with single_words when previous run wasn't)
        rule = wspJoin(rulesAsWordlists[i])
        if rule not in self.newRules and checkCoverage(rulesAsWordlists[i],words,[False]*len(words)): # rule would apply to the new phrase
          yBytesRet = []
          if not getNext(test_rule(rule,yBytesRet)) or potentially_bad_overlap(self.rulesAsWordlists,rulesAsWordlists[i]): # re-test fails.  In versions v0.543 and below, we just removed ALL rules that would apply to the new phrase, to see if they would be re-generated.  But that caused problems because addRulesForPhrase can return early if all(covered) due to other (longer) rules and we might be removing a perfectly good short rule that's needed elsewhere.  So we now re-test before removal.
            self.rejectedRules.add(rule)
            if not ybytes: self.rulesAsWordlists.discard(rulesAsWordlists[i])
            del rulesAsWordlists[i] ; del self.rules[rule]
            continue
          self.newRules.add(rule) # still current - add to newRules now to save calling test_rule again
          if len(yBytesRet): self.rules[rule] = yBytesRet[0] # overriding what it was before (since we've re-done test_rule for it, which might have returned a new set of Yarowsky-like indicators for the new version of the corpus)
        i += 1
  def addRulesForPhrase(self,phrase,canBackground=False):
    if phrase in self.seenPhrases or (diagnose_quick and diagnose):
      # if diagnose and (diagnose_quick or self.amend_rules) and mdStart+diagnose+mdEnd in phrase: pass # look at it again for diagnostics.  But do we accept a diagnose that spans multiple words?  should be pointed out by --diagnose-quick below if uncommented
      if diagnose and (diagnose_quick or self.amend_rules) and diagnose in markDown(phrase): pass # this version accepts diagnose of multiple words (and might also let some phrases through where it matches on an overlap)
      else:
        yield 0,0 ; return # TODO: document that this means the total 'covered' figure in the progress status is AFTER phrase de-duplication (otherwise we'd have to look up what the previous values were last time we saw it - no point doing that just for a quick statistic)
    self.seenPhrases.add(phrase)
    words = list(filter(lambda x:markDown(x).strip(),splitWords(phrase))) # filter out any that don't have base text (these will be input glitches, TODO: verify the annotation text is also just whitespace, warn if not)
    if not words:
      yield 0,0 ; return
    covered = [False]*len(words)
    # first see how much is covered by existing rules
    # (don't have to worry about the order, as we've been
    # careful about overlaps)
    if self.amend_rules: self.remove_old_rules(words) # NB if yignore this might not remove all, but still removes all that affect checkCoverage below
    for w in set(words):
      for ruleAsWordlist in self.rulesAsWordlists_By1stWord.get(w,[]):
        if checkCoverage(ruleAsWordlist,words,covered) and all(covered):
          yield len(covered),len(covered) ; return # no new rules needed
    for ruleAsWordlist in all_possible_rules(words,covered):
        rule = wspJoin(ruleAsWordlist) ; yBytesRet = []
        if rule in self.rejectedRules: continue
        if rule in self.rules: continue # this can still happen even now all_possible_rules takes 'covered' into account, because the above checkCoverage assumes the rule won't be applied in a self-overlapping fashion, whereas all_possible_rules makes no such assumption (TODO: fix this inconsistency?)
        rGen = test_rule(rule,yBytesRet,canBackground)
        r = getNext(rGen)
        if r=="backgrounded":
          yield r ; r = getNext(rGen)
        del rGen
        if not r or potentially_bad_overlap(self.rulesAsWordlists,ruleAsWordlist):
            self.rejectedRules.add(rule) # so we don't waste time evaluating it again (TODO: make sure rejectedRules doesn't get too big?)
            continue
        cc = checkCoverage(ruleAsWordlist,words,covered) # changes 'covered'
        assert cc, "this call to checkCoverage should never return False now that all_possible_rules takes 'covered' into account" # and it's a generator which is always working from the CURRENT copy of 'covered'
        if len(yBytesRet): self.rules[rule] = yBytesRet[0]
        else: self.rules[rule] = [] # unconditional
        if not ybytes: self.rulesAsWordlists.append(ruleAsWordlist)
        if not ruleAsWordlist[0] in self.rulesAsWordlists_By1stWord: self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]] = []
        self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]].append(ruleAsWordlist)
        if self.amend_rules: self.newRules.add(rule)
        handle_diagnose_limit(rule)
        if all(covered):
          yield len(covered),len(covered) ; return
    # If get here, failed to completely cover the phrase.
    # ruleAsWordlist should be set to the whole-phrase rule.
    yield sum(1 for x in covered if x),len(covered)
  def rulesAndConds(self):
    if self.amend_rules: return [(k,v) for k,v in self.rules.items() if not k in self.newRules] + [(k,v) for k,v in self.rules.items() if k in self.newRules] # new rules must come last for incremental runs, so they will override existing actions in byteSeq_to_action_dict when small changes have been made to the annotation of the same word (e.g. capitalisation-normalisation has been changed by the presence of new material)
    else: return self.rules.items()

def handle_diagnose_limit(rule):
  global diagnose,diagnose_limit
  if diagnose and diagnose_limit and diagnose==markDown(rule):
    diagnose_limit -= 1
    if not diagnose_limit:
      diagnose = False
      diagnose_write("limit reached, suppressing further diagnostics")

def generate_map():
    global m2c_map, precalc_sets, yPriorityDic
    if checkpoint:
      try:
        f=open_try_bz2(checkpoint+os.sep+'map','r')
        m2c_map,precalc_sets,yPriorityDic = pickle.Unpickler(f).load()
        return sys.stderr.write("Corpus map loaded\n")
      except: pass
    assert main, "Only main should generate corpus map"
    sys.stderr.write("Generating corpus map... ")
    m2c_map = {} ; precalc_sets = {}
    muStart = downLenSoFar = 0
    for s in re.finditer(re.escape(markupStart), corpus_unistr):
      s=s.start()
      md = markDown(corpus_unistr[muStart:s])
      if markupStart in md: errExit("examples have nested markup! "+repr(md))
      downLenSoFar += len(md)
      muStart = s
      m2c_map[downLenSoFar] = s
      # Added optimisation: do precalc_sets as well
      # (at least catch the 1-word cases)
      # -> this is now needed even if not ybytes
      e=corpus_unistr.find(markupEnd,s)
      if e>-1:
        e += len(markupEnd)
        k = corpus_unistr[s:e]
        if k not in precalc_sets: precalc_sets[k]=set()
        precalc_sets[k].add(downLenSoFar)
    yPriorityDic = {}
    if ref_pri and ybytes:
      sys.stderr.write("yPriorityDic ... ")
      for s in re.finditer(re.escape(reference_sep+ref_pri+ref_name_end), corpus_unistr):
        s = s.start()+len(reference_sep+ref_pri+ref_name_end)
        e = corpus_unistr.find(reference_sep,s)
        if e==-1: e=len(corpus_unistr)
        for w in splitWords(corpus_unistr[s:e]):
          wd = markDown(w)
          if wd in yPriorityDic: continue
          if diagnose==wd: diagnose_write("yPriorityDic[%s] = %s" % (wd,w))
          yPriorityDic[wd] = w
    sys.stderr.write("done\n")
    if checkpoint: pickle.Pickler(open_try_bz2(checkpoint+os.sep+'map','w'),-1).dump((m2c_map,precalc_sets,yPriorityDic))
    checkpoint_exit()

def setup_parallelism():
    if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus and map etc)
    args = getoutput("ps -p " + str(os.getpid()) + " -o args")
    try:
      args.index("-m mpi4py.futures") # ValueError if not found
      import mpi4py.futures # mpi4py v2.1+
      import mpi4py.MPI, mpi4py ; assert mpi4py.MPI.COMM_WORLD.size > 1, "mpi4py says world size is 1: likely a symptom of incorrectly-configured MPI.  Did you compile mpi4py using the same setup (e.g. MPICH or OpenMPI) as you are running?  mpi4py's config is: "+repr(mpi4py.get_config())
      return mpi4py.futures.MPIPoolExecutor()
    except ValueError: pass # but raise all other exceptions: if we're being run within mpi4py.futures then we want to know about MPI problems
    try:
      args.index("-m scoop") # ValueError if not found
      import scoop.futures
      return scoop.futures # submit() is at module level
    except ValueError: pass
    try:
      import concurrent.futures # sudo pip install futures (2.7 backport of 3.2 standard library)
      import multiprocessing
      num_cpus = multiprocessing.cpu_count()
      if num_cpus > 1: return concurrent.futures.ProcessPoolExecutor(num_cpus-1) # leave one for the CPU-heavy control task
    except: pass

def get_phrases():
    # Returns a list of phrases in processing order, with length-numbers inserted in the list.  Caches its result.
    global _gp_cache
    try: return _gp_cache