            case 72: {
              var numBytes = data.charCodeAt(dPtr++);
              var annot = readRefStr();
  s();
  output.push("<ruby><rb>");
  output.push(input.slice(copyP,copyP+numBytes));
  copyP += numBytes;
  output.push("</rb><rt>"); output.push(annot);
  output.push("</rt></ruby>"); break; }
            case 73: {
              var numBytes = data.charCodeAt(dPtr++);
              var annot = readRefStr();
              var title = readRefStr();
  s();
  output.push("<ruby title=\""); output.push(title);
  output.push("\"><rb>");
  output.push(input.slice(copyP,copyP+numBytes));
  copyP += numBytes;
  output.push("</rb><rt>"); output.push(annot);
  output.push("</rt></ruby>"); break; }
            case 80: sPos.push(p); break;
            case 81: p=sPos.pop(); break;
            case 90: {
                var tPtr = readAddr();
                var fPtr = readAddr();
                var nearbytes = data.charCodeAt(dPtr++);
  var o=p;
  if (o > nearbytes) o -= nearbytes; else o = 0;
  var max = p + nearbytes;
  if (max > inputLength) max = inputLength;
  var tStr = input.slice(o,max);
                var found = 0;
                while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; }
                dPtr = found ? tPtr : fPtr; break;
                }
        default: throw("corrupt data table at "+(dPtr-1)+" ("+data.charCodeAt(dPtr-1)+")");
            }
        }
    }
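    // Summary of the byte-code cases handled above (descriptive only):
    //   72      copy numBytes of input, wrapped as <ruby><rb>text</rb><rt>annotation</rt></ruby>
    //   73      as 72, but the <ruby> element also carries a title="..." attribute
    //   80 / 81 save / restore the read-ahead position p (look-ahead with backtracking)
    //   90      "near" test: jump to tPtr if any of the listed strings occurs within
    //           nearbytes of the current position, otherwise jump to fPtr
    //   anything else is reported as a corrupt data table (default case)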

while(p < inputLength) {
var oldPos=p;
dPtr=1;readData();
if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; }
}
return decodeURIComponent(escape(output.join(""))); // from UTF-8 back to Unicode
} // end of annotate function
};
function annotate(input) { return Annotator.annotate(input); }

if (typeof Backbone != "undefined" && Backbone.Model) { Annotator = Backbone.Model.extend(Annotator); annotate=function(input) { return new Annotator().annotate(input) } }
if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) {
  // Node.js command-line test
  fs=require('fs');
  process.stdout.write(annotate(fs.readFileSync('/dev/stdin').toString()));
} else if (typeof module != "undefined" && module.exports) { // Common.js
  module.exports = Annotator;
}
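// Illustrative use of the generated annotator (file and element names here are
// examples only, not produced by annogen itself):
//   var annotated = annotate("text in the corpus language"); // returns a string with <ruby> markup
//   document.getElementById("output").innerHTML = annotated;
// or pipe text through the Node.js command-line test above.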
"""

py_start = '# Python '+version_stamp+r"""

# You can import this module and call annotate(utf8 bytes)
# (from multiple threads if desired),
# or you can run from the command line on standard input.

# annotate has an optional second argument, which can be
# 'ruby' (default), 'raw' (annotation only) or 'braces'.

"""
py_end = r"""
class Annotator:
 version="""+'"'+version_stamp+r""""
 def __call__(self,inStr,aType):
  if aType=="ruby": self.startA,self.midA,self.endA = "<ruby><rb>","</rb><rt>","</rt></ruby>"
  elif aType=="raw": self.startA=self.midA=self.endA = ""
  elif aType=="braces": self.startA,self.midA,self.endA = "{","|","}"
  else: raise Exception("Unrecognised annotation type "+repr(aType))
  assert type(inStr)==str
  self.inStr = inStr
  self.addrLen = ord(data[0])
  self.inputLength = len(inStr)
  self.p = 0 # read-ahead pointer
  self.copyP = 0 # copy pointer
  self.output = []
  self.needSpace = 0 ; out = self.output
  while self.p < self.inputLength:
    oldPos = self.p
    self.dPtr = 1 ; self.readData()
    if oldPos == self.p:
      self.needSpace=0
      out.append(inStr[self.p])
      self.p += 1 ; self.copyP += 1
  return "".join(self.output)
 def readAddr(self):
  addr = 0
  for i in range(self.addrLen):
    addr=(addr << 8) | ord(data[self.dPtr])
    self.dPtr += 1
  return addr
 def readRefStr(self):
  a = self.readAddr(); l=ord(data[a])
  if l: return data[a+1:a+l+1]
  else: return data[a+1:data.index('\x00',a+1)]
 def s(self):
  if self.needSpace: self.output.append(" ")
  else: self.needSpace=1
 def readData(self):
  sPos = [] ; out = self.output
  while True:
    d = ord(data[self.dPtr]) ; self.dPtr += 1
    if d==50: self.dPtr = self.readAddr()
    elif d==51:
      func = self.readAddr() ; dO = self.dPtr
      self.dPtr = func ; self.readData() ; self.dPtr = dO
    elif d==52: return
    elif d==60:
      nBytes = ord(data[self.dPtr])+1 ; self.dPtr += 1

      if self.p>=len(self.inStr): i = -1
      else: i = data[self.dPtr:self.dPtr+nBytes].find(self.inStr[self.p]) ; self.p += 1
      if i==-1: i = nBytes
      self.dPtr += (nBytes + i * self.addrLen)
      self.dPtr = self.readAddr()
    elif d==71:
      numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
      out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
    elif d==72:
      numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
      annot = self.readRefStr()
      self.s()
      if self.startA:
        out.append(self.startA)
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
      out.append(self.midA) ; out.append(annot)
      out.append(self.endA)
    elif d==73:
      numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
      annot = self.readRefStr()
      title = self.readRefStr()
      self.s()
      if self.startA=="{": # omit title in braces mode
        out.append(self.startA)
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      elif self.startA:
        out.append("<ruby title=\"");out.append(title)
        out.append("\"><rb>");
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
      out.append(self.midA) ; out.append(annot)
      out.append(self.endA)
    elif d==80: sPos.append(self.p)
    elif d==81: self.p = sPos.pop()
    elif d==90:
      tPtr = self.readAddr()
      fPtr = self.readAddr()
      nearbytes = ord(data[self.dPtr]) ; self.dPtr += 1
      o = max(self.p-nearbytes,0)
      maxx = min(self.p+nearbytes,self.inputLength)
      tStr = self.inStr[o:maxx]
      found = 0
      while self.dPtr < tPtr and self.dPtr < fPtr:
        if self.readRefStr() in tStr:
          found = 1 ; break
      if found: self.dPtr = tPtr
      else: self.dPtr = fPtr
    else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1]))+")")

def annotate(inStr,p="ruby"): return Annotator()(inStr,p)
def main():
  import sys
  if sys.argv[-1].startswith("--"): param=sys.argv[-1][2:]
  else: param = "ruby"
  sys.stdout.write(annotate(sys.stdin.read(),param))
if __name__=="__main__": main()
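
# Illustrative use of this generated module (module and variable names here are
# examples only):
#   import my_annotator
#   html   = my_annotator.annotate(utf8_bytes)            # default "ruby" markup
#   braces = my_annotator.annotate(utf8_bytes,"braces")   # {text|annotation} markup
# or, from a shell:  python my_annotator.py --braces < input.txt > output.txt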
"""

c_zlib = r"""static unsigned char *data=NULL;
static void init() {
  z_stream s; memset(&s,0,sizeof(s));
  s.next_in=origData; s.avail_in=%%ZLIBLEN%%;
  data=malloc(%%ORIGLEN%%); // TODO: check non-NULL
  s.next_out=data; s.avail_out=%%ORIGLEN%%;
  inflateInit(&s); inflate(&s, Z_NO_FLUSH); // TODO: check for memory and data-corruption errors
  inflateEnd(&s);
}
"""
c_datadrive = r"""
static unsigned char *dPtr; static int addrLen;

#include <stdlib.h>

static unsigned char * readAddr() {
  size_t i,addr=0;
  for (i=addrLen; i; i--) addr=(addr << 8) | *dPtr++;
  return data + addr;
}
static void readData() {
  POSTYPE *savedPositions = NULL;
  size_t numSavedPositions = 0;
  while(1) {
    unsigned char c = *dPtr++;
    if (c & 0x80) dPtr += (c&0x7F); // short relative forward jump (up to 128 bytes from addr after instruction)
    else if(c < 20) { // switchbyte with short jumps
      c++; // now c == nBytes
      unsigned char byte=(unsigned char)NEXTBYTE;
      int i;
      for (i=0; i<c; i++) if(byte==dPtr[i]) break;
      dPtr += c+c+1 + dPtr[c+i]; // relative from end of switch (after all bytes, 1-byte addresses and the 1-byte default address: up to 256 bytes after)
    } else switch(c) {
    case 50: /* jump */ dPtr = readAddr(); break;
    case 51: /* call */ {
      unsigned char *funcToCall=readAddr();
      unsigned char *retAddr = dPtr;
      dPtr = funcToCall; readData(); dPtr = retAddr;
      break; }
    case 52: /* return */
      if (savedPositions) free(savedPositions);
      return;
    case 60: /* switchbyte */ {
      int nBytes=(*dPtr++)+1, i;
      unsigned char byte=(unsigned char)NEXTBYTE;
      for (i=0; i<nBytes; i++) if(byte==dPtr[i]) break;
      dPtr += (nBytes + i * addrLen);
      dPtr = readAddr(); break; }
    case 71: /* copyBytes */ {
      int numBytes=*dPtr++;
      for(;numBytes;numBytes--)
        OutWriteByte(NEXT_COPY_BYTE);
      break; }
    case 72: /* o */ {
      int numBytes=*dPtr++;
      char *annot = (char*)readAddr();
      o(numBytes,annot); break; }
    case 73: /* o2 */ {
      int numBytes=*dPtr++;
      char *annot = (char*)readAddr();
      char *title = (char*)readAddr();
      o2(numBytes,annot,title); break; }
    case 80: /* savepos */
      savedPositions=realloc(savedPositions,++numSavedPositions*sizeof(POSTYPE)); // TODO: check non-NULL?
      savedPositions[numSavedPositions-1]=THEPOS;
      break;
    case 81: /* restorepos */
      SETPOS(savedPositions[--numSavedPositions]);
      break;
    case 90: /* neartest */ {
      unsigned char *truePtr = readAddr();
      unsigned char *falsePtr = readAddr();
      setnear(*dPtr++); int found=0;
      while(dPtr < truePtr && dPtr < falsePtr) if(near((char*)readAddr())) { found = 1; break; }
      dPtr = found ? truePtr : falsePtr; break; }
      // default: TODO: error about corrupt data?
    }
  }
}
static void topLevelMatch() {
  addrLen = data[0];
  dPtr=data+1; readData();
}
"""

def splitWords(text,phrases=False):
    # split text into words, ignoring anything between markupStart and markupEnd
    # if phrases = True, instead of words, split on any non-whitespace char outside markupStart..markupEnd
    if phrases: it=re.finditer(phrasePattern,text)
    else: it=re.finditer(wordPattern,text)
    for i in it: yield i.group()
markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd))
wordPattern = re.escape(markupStart)+'.*?'+re.escape(markupEnd)
phrasePattern = re.compile(wordPattern+r'(\s*'+wordPattern+r')*')
wordPattern = re.compile(wordPattern)
wspPattern = re.compile(r"\s+")
def annotationOnly(text):
    # Return just the annotation text, without the base text or markup
    ret = []
    for w in re.finditer(markupPattern,text):
        if mreverse: ret.append(w.group(1))
        else: ret.append(w.group(2))
    return ' '.join(ret)

def markDown(text):
    # Return just the original text, without markup
    if mreverse: group=r"\2"
    else: group=r"\1"
    return re.sub(markupPattern,group,text)
def markUp(text,annotation):
  if mreverse: text,annotation = annotation,text
  return markupStart + text + markupMid + annotation + markupEnd
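# Worked example with hypothetical delimiters (the real markupStart/markupMid/markupEnd
# values are set elsewhere in this script): if markupStart="{", markupMid="|",
# markupEnd="}" and mreverse is false, then
#   markUp("ABC","abc")               == "{ABC|abc}"
#   markDown("{ABC|abc} {D|d}")       == "ABC D"
#   annotationOnly("{ABC|abc} {D|d}") == "abc d"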
    
def checkpoint_exit(doIt=1):
  if not checkpoint: return
  try: open(checkpoint+os.sep+"ExitASAP")
  except: return
  if doIt:
    assert main, "Only annogen's main module should call checkpoint_exit with doIt=1"
    os.remove(checkpoint+os.sep+"ExitASAP")
    sys.stderr.write("\nExitASAP found: exit\n")
    raise SystemExit
  else: return True
try: import cPickle as pickle
except:
  try: import pickle
  except: pickle = None
def read_checkpoint():
  t = pickle.Unpickler(open(checkpoint+os.sep+'checkpoint','rb')).load()
  sys.stderr.write("Checkpoint loaded from %d phrases\n" % t[0])
  return t
def write_checkpoint(t):
  pickle.Pickler(open(checkpoint+os.sep+'checkpoint-NEW','wb'),-1).dump(t) # better write to checkpoint-NEW, in case we reboot or have an OS-level "Out of memory" condition *while* checkpointing
  try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
  except OSError: # OS can't do it atomically?
    rm_f(checkpoint+os.sep+'checkpoint')
    try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
    except OSError: pass
  checkpoint_exit()
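# (Usage note, inferred from checkpoint_exit above: creating a file named ExitASAP in
#  the checkpoint directory asks a long analysis run to stop cleanly at the next
#  checkpoint opportunity; the file is removed before exiting.)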

def status_update(phraseNo,numPhrases,wordsThisPhrase,nRules,phraseLastUpdate,lastUpdate,phraseLastCheckpoint,lastCheckpoint,coverP,nRej,startTime):
  phraseSec = (phraseNo-phraseLastUpdate)*1.0/(time.time()-lastUpdate)
  if phraseSec < 100:
    phraseSecS = "%.1f" % phraseSec
  else: phraseSecS = "%d" % int(phraseSec)
  progress = status_prefix + "%s phrase/sec (%d%%/#w=%d) rules=%d cover=%d%%" % (phraseSecS,int(100.0*phraseNo/numPhrases),wordsThisPhrase,nRules,coverP)
  if warn_yarowsky: progress += (" rej=%d" % nRej)
  if time_estimate:
    if phraseNo-phraseLastCheckpoint < 10: phraseMin = phraseSec*60 # current 'instantaneous' speed
    else: phraseMin = (phraseNo-phraseLastCheckpoint)*60/(time.time()-lastCheckpoint) # longer-term average
    minsLeft = (numPhrases-phraseNo)/phraseMin
    if minsLeft>60*24: progress += " %dd+" % int(minsLeft/60/24)
    elif minsLeft>60: progress += " %dh+" % int(minsLeft/60)
    elif minsLeft: progress += " %dmin+" % minsLeft
    # (including the + because this is liable to be an underestimate; see comment after the --time-estimate option)
    if len(progress) + 14 < screenWidth:
     progress += " (at %02d:%02d:%02d" % time.localtime()[3:6] # clock time: might be useful for checking if it seems stuck
     if len(progress) + 20 < screenWidth and not clear_eol == "  \r": # (being able to fit this in can be intermittent)
      elapsed = time.time() - startTime
      progress += ", analyse=%d:%02d:%02d" % (elapsed/3600,(elapsed%3600)/60,elapsed%60)
     progress += ")"
  sys.stderr.write(progress+clear_eol)
def normalise():
    global corpus_unistr
    if capitalisation and annot_whitespace: return
    if checkpoint:
      try:
        f=open_try_bz2(checkpoint+os.sep+'normalised','rb')
        corpus_unistr = f.read().decode('utf-8')
        return
      except: # if re-generating 'normalised', will also need to regenerate 'map' and 'checkpoint' if present
        assert main, "normalise checkpoint not readable in non-main module"
        rm_f(checkpoint+os.sep+'map.bz2') ; rm_f(checkpoint+os.sep+'map')
        rm_f(checkpoint+os.sep+'checkpoint')
    else: assert main, "normalise called in non-main module and checkpoint isn't even set"
    allWords = getAllWords()
    if removeSpace:
     corpus_unistr = re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),markupEnd+markupStart,corpus_unistr) # so getOkStarts works consistently if corpus has some space-separated and some not
     if not annot_whitespace:
      # normalise trailing hyphens e.g. from OCR'd scans:
      cu0 = corpus_unistr ; ff = 0
      for hTry in [1,2]:
        for w in allWords:
          if '-'+aoEnd in w:
            idx = w.index('-'+aoEnd)
            if w[:idx].endswith(aoStart) or w[:idx].endswith("-"): continue # ignore this one (a mess of some kind)
            if hTry==2: # ouch, this doesn't look good
              sys.stderr.write(" (can't normalise hyphens due to '%s') " % w.encode(terminal_charset,'replace'))
              corpus_unistr = cu0 ; break
            if mreverse: grp,mdG=r"-\1",r"\2"
            else: grp,mdG=r"-\2",r"\1"
            # TODO: batch up the following replacements by using something similar to Replacer but with a common destination regexp that takes groups from the 'w' entries as well.  (Low priority because don't typically get TOO many of these dangling hyphens in most corpuses.)
            corpus_unistr = re.sub(re.escape(w)+r"\s*"+re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),re.escape(w).replace(re.escape('-'+aoEnd),grp+re.escape(aoEnd)).replace(re.escape(mdEnd),mdG+re.escape(mdEnd)),corpus_unistr)
            ff = 1
        if ff: allWords = getAllWords() # re-generate
      del cu0
    sys.stderr.write(":")
    class Replacer:
      def __init__(self): self.dic = {}
      def add(self,x,y):
        if diagnose and diagnose in x: diagnose_write("Replacer.add(%s,%s)" % (x,y))
        self.dic[x] = y
        if not (len(self.dic)%1500): sys.stderr.write('.') # write an occasional progress dot
      def flush(self):
        if not self.dic: return
        global corpus_unistr
        for exp in orRegexes(re.escape(k) for k in self.dic.iterkeys()):
          sys.stderr.write(';')
          corpus_unistr = re.sub(exp,lambda k:self.dic[k.group(0)],corpus_unistr)
    rpl = Replacer() ; rpl.cu_nosp = None
    def normWord(w):
      if '-' in w: hTry=set([w.replace('-','')]) # if not annot_whitespace, we'll replace any non-hyphenated 'run together' version by the version with the hyphen; that's often the sensible thing to do with pinyin etc (TODO more customisation??)
      else: hTry=None
      if not capitalisation:
        wl = w.lower() # (as long as it's all Unicode strings, .lower() and .upper() work with accents etc)
        if not w==wl and wl in allWords:
            # This word is NOT always capitalised, just
            # sometimes at the start of a sentence.
            # To simplify rules, make it always lower.
            w = wl
      if annot_whitespace or (keep_whitespace and markDown(w) in keep_whitespace): return w,None
      if not re.search(wspPattern,w): return w,hTry
      nowsp = re.sub(wspPattern,"",w)
      if not capitalisation and not nowsp.lower()==nowsp and nowsp.lower() in allWords: nowsp = nowsp.lower()
      if nowsp in allWords: return nowsp,hTry # varying whitespace in the annotation of a SINGLE word: probably simplest if we say the version without whitespace, if it exists, is 'canonical' (there might be more than one with-whitespace variant), at least until we can set the relative authority of the reference (TODO)
      ao,md = annotationOnly(w),markDown(w)
      aoS = ao.split()
      if len(md.split())==1 and len(md) <= 5 and len(aoS) <= len(md): # TODO: 5 configurable?  don't want different_ways_of_splitting to take too long
        # if not too many chars, try different ways of
        # assigning each word to chars, and see if any
        # of these exist in the corpus; if any does,
        # assume we have "ABC|a bc" <= "A|a BC|bc" type
        # situations - the latter shouldn't necessarily be
        # converted into the former, but the former might
        # be convertible into the latter to simplify rules
        if rpl.cu_nosp == None:
          rpl.cu_nosp = re.sub(wspPattern,"",corpus_unistr)
          if not capitalisation: rpl.cu_nosp = rpl.cu_nosp.lower() # ignore capitalisation when searching for this
        if capitalisation: aoS2 = aoS
        else: aoS2 = [w0.lower() for w0 in aoS]
        for charBunches in different_ways_of_splitting(md,len(aoS)):
            mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS2)]
            multiword = "".join(mw)
            if multiword in rpl.cu_nosp:
              # we're about to return a split version of the words, but we now have to pretend it went through the initial capitalisation logic that way (otherwise could get unnecessarily large collocation checks)
              if not capitalisation:
                mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS)] # the original capitalisation, for selective .lower()
                for i in range(len(mw)):
                  w0 = mw[i]
                  wl = w0.lower()
                  if not w0==wl and wl in allWords:
                    mw[i] = wl
                multiword = "".join(mw)
              return multiword,hTry
          # TODO: is there ANY time where we want multiword to take priority over the nowsp (no-whitespace) version above?  or even REPLACE multiword occurrences in the corpus with the 1-word nowsp version?? (must be VERY CAREFUL doing that)
      # TODO: anything else?
      return w,hTry
    for w in allWords:
      w2,hTry = normWord(w)
      if hTry:
        hTry.add(w2.replace('-','')) # in case not already there
        for h in hTry:
          if h in allWords: rpl.add(h,w2)
      if not w==w2: rpl.add(w,w2)
    rpl.flush()
    if checkpoint: open_try_bz2(checkpoint+os.sep+'normalised','wb').write(corpus_unistr.encode('utf-8'))
def getAllWords():
  allWords = set()
  for phrase in splitWords(corpus_unistr,phrases=True):
    allWords.update(splitWords(phrase))
  return allWords # do NOT cache (is called either side of the normaliser)
def orRegexes(escaped_keys):
  escaped_keys = list(escaped_keys) # don't just iterate
  try: yield re.compile('|'.join(escaped_keys))
  except OverflowError: # regex too big (e.g. default Python on Mac OS 10.7 i.e. Python 2.7.1 (r271:86832, Jul 31 2011, 19:30:53); probably some Windows versions also; does not affect Mac HomeBrew's Python 2.7.12)
    ek = escaped_keys[:len(escaped_keys)/2]
    for r in orRegexes(ek): yield r
    ek = escaped_keys[len(ek):]
    for r in orRegexes(ek): yield r
if mreverse: mdStart,mdEnd,aoStart,aoEnd = markupMid,markupEnd,markupStart,markupMid
else: mdStart,mdEnd,aoStart,aoEnd = markupStart,markupMid,markupMid,markupEnd
def different_ways_of_splitting(chars,numWords):
  if numWords > len(chars): return
  elif numWords == len(chars):
    yield list(chars) ; return
  elif numWords == 1:
    yield [chars] ; return
  spAt_try1 = len(chars) / numWords + 1
  for spAt in range(spAt_try1,0,-1) + range(spAt_try1+1, len(chars)-numWords+1):
    for r in different_ways_of_splitting(chars[spAt:],numWords-1): yield [chars[:spAt]]+r
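# Example: list(different_ways_of_splitting("ABC",2)) gives [["AB","C"], ["A","BC"]]
# - every way of cutting the characters into 2 non-empty groups, starting near an
# even split.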
def yarowsky_indicators(withAnnot_unistr,canBackground):
    # yields True if rule always works (or in majority of cases with ymajority), or lists enough indicators to cover example instances and yields (negate, list, nbytes), or just list if empty.
    # (If too few indicators can be found, will list the ones it can, or empty if no clearly-distinguishable indicators can be found within ybytes of end of match.)
    # yield "backgrounded" = task has been backgrounded; .next() collects result
    if nonAnnot in yPriorityDic: # TODO: enforce len==1 ?
        if yPriorityDic[nonAnnot] == withAnnot_unistr:
            # we want this case to be the default (TODO: can't we just put it straight into the rules when making yPriorityDic, and skip this?  although I'm not sure if that would give much of a speedup, as the phrase/sec count tends to go into the thousands anyway when it's processing a yPriorityDic section)
            if nonAnnot==diagnose: diagnose_write("yPriorityDic forces %s" % (withAnnot_unistr,))
            yield True ; return
          if nonAnnot==diagnose: diagnose_write("yPriorityDic forbids default %s" % (withAnnot_unistr,))
          can_be_default = False # another is default, don't make this one default even if it occurs more
    # First, find positions in corpus_markedDown which match withAnnot_unistr in corpus_unistr
    okStarts = getOkStarts(withAnnot_unistr)
    # now check for corpus_markedDown matches that *don't* have withAnnot_unistr
    badStarts = getBadStarts(nonAnnot,okStarts)
      if nonAnnot==diagnose: diagnose_write("%s has no badStarts" % (withAnnot_unistr,))
      yield True ; return # rule always works, no Yarowsky indicators needed
    if can_be_default and len(okStarts) > len(badStarts) and len(nonAnnot)==1:
      if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule" % (withAnnot_unistr,))
      yield True ; return # duplicate of code below (can test for this case early before reducing-down badStarts)
    badStarts = getReallyBadStarts(badStarts,nonAnnot) # see its comments (ignore some badStarts)
      if nonAnnot==diagnose: diagnose_write("%s has only probably-irrelevant badStarts" % (withAnnot_unistr,))
      yield True ; return
    # Now, if it's right more often than not:
    if can_be_default and len(okStarts) > len(badStarts):
        # could we have this as a "default" rule, with the other cases as exceptions that will be found first?
          if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule after removing irrelevant badStarts" % (withAnnot_unistr,))
          yield True ; return # should be safe, and should cover most "common short Chinese word with thousands of contexts" cases
        # If len 2 or more, it's risky because the correct solution could be to process just a fraction of the word now and the rest will become the start of a longer word, so we probably don't want it matching the whole lot by default unless can be sure about it
        # e.g. looking at rule AB, text ABC and correct segmentation is A BC, don't want it to 'greedily' match AB by default without positive indicators it should do so
        # Check for no "A BC" situations, i.e. can't find any possible SEQUENCE of rules that STARTS with ALL the characters in nonAnnot and that involves having them SPLIT across multiple words:
        # (The below might under-match if there's the appearance of a split rule but it actually has extra non-marked-up text in between, but it shouldn't over-match.)
        # TODO: if we can find the actual "A BC" sequences (instead of simply checking for their possibility as here), and if we can guarantee to make 'phrase'-length rules for all of them, then AB can still be the default.  This might be useful if okStarts is very much greater than badStarts.
        # (TODO: until the above is implemented, consider recommending --ymax-threshold=0, because, now that Yarowsky-like collocations can be negative, the 'following word' could just go in as a collocation with low ybytes)
        # TODO: also, if the exceptions to rule AB are always of the form "Z A B", and we can guarantee to generate a phrase rule for "Z A B", then AB can still be default.  (We should already catch this when the exceptions are "ZA B", but not when they are "Z A B", and --ymax-threshold=0 probably won't always help here, especially if Z==B; Mandarin "mei2you3" / "you3 mei2 you3" comes to mind)
        llen = len(mdStart)+len(nonAnnot)
        if all(x.end()-x.start()==llen for x in re.finditer(re.escape(mdStart)+("("+re.escape(mdEnd)+"((?!"+re.escape(mdStart)+").)*.?"+re.escape(mdStart)+")?").join(re.escape(c) for c in list(nonAnnot)),corpus_unistr)):
          if nonAnnot==diagnose: diagnose_write("%s is default by majority-case rule after checking for dangerous overlaps etc" % (withAnnot_unistr,))
          yield True ; return
    run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds.  TODO: does mpi4py.futures have a different 'sweet spot' here? (low priority unless we can get MPI to outdo concurrent.futures in this application)
    may_take_time = canBackground and len(okStarts) > 1000
    if may_take_time: sys.stderr.write("\nLarge collocation check (%s has %d matches + %s), %s....  \n" % (withAnnot_unistr.encode(terminal_charset,'replace'),len(okStarts),badInfo(badStarts,nonAnnot),cond(run_in_background,"backgrounding","could take some time")))
    if run_in_background:
      job = executor.submit(yarowsky_indicators_wrapped,withAnnot_unistr) # recalculate the above on the other CPU in preference to passing, as memory might not be shared
      yield "backgrounded" ; yield job.result() ; return
    if ybytes_max > ybytes and (not ymax_threshold or len(nonAnnot) <= ymax_threshold):
      retList = [] ; append = retList.append
      for nbytes in range(ybytes,ybytes_max+1,ybytes_step):
        negate,ret,covered,toCover = tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr)
        if covered==toCover and len(ret)==1:
          if may_take_time: sys.stderr.write(" - using 1 indicator, negate=%s\n" % repr(negate))
          yield (negate,ret,nbytes) ; return # a single indicator that covers everything will be better than anything else we'll find
        append((-int(covered*100/toCover),len(ret),nbytes,negate,toCover,ret)) # (1st 4 of these are the sort keys: maximum coverage to nearest 1%, THEN minimum num indicators for the same coverage, THEN minimum nbytes (TODO: problems of very large nbytes might outweigh having more indicators; break if found 100% coverage by N?), THEN avoid negate)
        # TODO: try finding an OR-combination of indicators at *different* proximity lengths ?
      retList.sort()
      if nonAnnot==diagnose: diagnose_write("Best coverage is %d%% of %d" % (-retList[0][0],retList[0][-2]))
      negate,ret = retList[0][-3],retList[0][-1]
      distance = retList[0][2]
    else:
      negate,ret = tryNBytes(ybytes_max,nonAnnot,badStarts,okStarts,withAnnot_unistr)[:2]
      if ybytes < ybytes_max: distance = ybytes_max
      else: distance = None # all the same anyway
    if not ret and warn_yarowsky: sys.stderr.write("Couldn't find ANY Yarowsky-like indicators for %s   \n" % (withAnnot_unistr.encode(terminal_charset,'replace'))) # (if nonAnnot==diagnose, this'll be reported by tryNBytes below)
    # elif ybytes_max > ybytes: sys.stderr.write("Debugger: %s best coverage=%d/%d by %d indicators at nbytes=%d   \n" % (withAnnot_unistr.encode(terminal_charset,'replace'),-retList[0][0],retList[0][3],retList[0][1],retList[0][2]))
    # TODO: if partially but not completely covered, shouldn't entirely count the word as 'covered' in analyse()
    elif ret and may_take_time: sys.stderr.write(" - using %d indicators, negate=%s\n" % (len(ret),repr(negate)))
    if not ret or (not distance and not negate):
      yield ret
    else:
      if not distance: distance = ybytes_max
      yield negate,ret,distance
def yarowsky_indicators_wrapped(withAnnot_unistr):
    check_globals_are_set_up()
    return yarowsky_indicators(withAnnot_unistr,False).next()
def getOkStarts(withAnnot_unistr):
    if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr]
    walen = len(withAnnot_unistr)
    return set(x for x in precalc_sets[splitWords(withAnnot_unistr).next()] if corpus_unistr[m2c_map[x]:m2c_map[x]+walen]==withAnnot_unistr)
def getBadStarts(nonAnnot,okStarts): return set(x.start() for x in re.finditer(re.escape(nonAnnot),corpus_markedDown) if not x.start() in okStarts)
def getReallyBadStarts(badStarts,nonAnnot):
    # Some of the badStarts can be ignored on the grounds that they should be picked up by other rules first: any where the nonAnnot match does not start at the start of a word (the rule matching the word starting earlier should get there first), and any where it starts at the start of a word that is longer than its own first word (the longest-first ordering should take care of this).  So keep only the ones where it starts at the start of a word and that word is no longer than len(nonAnnot).
    reallyBadStarts = [] ; append=reallyBadStarts.append
    nonAnnotLen = len(mdStart+nonAnnot+mdEnd)
    theRe = re.compile(re.escape(mdStart+nonAnnot[0])+".*?"+re.escape(mdEnd))
    for b in badStarts:
      try: s = m2c_map[b]
      except KeyError: continue # it wasn't the start of a word (only start positions are in that map)
      m=theRe.search(corpus_unistr, s) # will either start at s, or after it if mreverse
      s,e = m.start(),m.end()
      if e-s > nonAnnotLen: continue # this word is too long, should be matched by a longer rule 1st
      append(b) # to reallyBadStarts
    return reallyBadStarts
def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr):
    # try to find either positive or negative Yarowsky-like indicators, whichever gives a smaller set.  Negative indicators might be useful if there are many matches and only a few special exceptions (TODO: but put in an option to avoid checking for them as per v0.57 and below? although I'm not sure what application would have to be that careful but still use Yarowsky-like indicators)
    # (Negative indicators are used only if they cover 100% of the exceptions - see below re negate==None)
    def bytesAround(start): return within_Nbytes(start+len(nonAnnot),nbytes)
    okStrs=list(set(bytesAround(s) for s in okStarts))
    badStrs=list(set(bytesAround(s) for s in badStarts))
    pOmit = unichr(1).join(badStrs) # omit anything that occurs in this string from +ve indicators
    nOmit = unichr(1).join(okStrs) # ditto for -ve indicators
    pCovered=[False]*len(okStrs)
    nCovered=[False]*len(badStrs)
    pRet = [] ; pAppend=pRet.append
    nRet = [] ; nAppend=nRet.append
    negate = None # not yet set
    stuffToCheck = [(okStrs,pAppend,pCovered,unique_substrings(okStrs,markedUp_unichars,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))] # a generator and associated parameters for positive indicators
    if len(okStrs) > len(badStrs) or not okStrs: stuffToCheck.append((badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s)))) # and for negative indicators, if it seems badStrs are in the minority (or if not okStrs, which is for test_manual_rules) (TODO: smaller minority?  we'll try a string from each generator in turn, stopping if we find one that covers everything; that way we're hopefully more likely to finish early if one of the two is going to quickly give a string that matches everything, but TODO is this always so optimal in other cases?  especially if there are far more negative indicators than positive ones, in which case it's unlikely to end up being a "many matches and only a few special exceptions" situation, and checking through ALL the negative indicators is a lot of work for comparatively little benefit; TODO: also have 'if len(nAppend) > SOME_THRESHOLD and len(stuffToCheck)==2: del stuffToCheck[1] # give up on negative indicators if too many' ? )
    while stuffToCheck and negate==None:
      for i in range(len(stuffToCheck)):
        strs,append,covered,generator = stuffToCheck[i]
        try: indicator = generator.next()
        except StopIteration:
          del stuffToCheck[i] ; break
        found = True ; cChanged = False
        for i in xrange(len(strs)):
          if not covered[i] and indicator in strs[i]:
            covered[i]=cChanged=True
        if cChanged: append(indicator)
        if all(covered):
          if append==pAppend: negate=False
          else: negate=True
          break
    # and if negate==None AFTER this loop, didn't get all(pCovered) OR all(nCovered), in which case we fall back to negate=False.  In other words, negative indicators have to cover ALL non-occurrences to be passed, wheras positive indicators just have to cover SOME.  This is in keeping with the idea of 'under-match is better than over-match' (because an under-matching negative indicator is like an over-matching positive one)
    if negate: ret,covered = nRet,nCovered
    else: ret,covered = pRet,pCovered
    if nonAnnot==diagnose:
      if ret:
        if negate: indicators = "negative indicators "
        else: indicators = "indicators "
        if len(ret) > 30: indicators=str(len(ret))+" "+indicators # +'/'.join(ret[:30]+['...'])
        else: indicators += '/'.join(ret)
      else: indicators = "no indicators"
      if len(pOmit) > 200: pOmit = pOmit[:200]+"..."
      if all(covered): notCovered = ""
      else:
        if negate: strs = badStrs
        else: strs = okStrs
        notCovered = [strs[i] for i in xrange(len(covered)) if not covered[i]]
        if len(notCovered) > 10: notCovered = notCovered[:10]+["..."]
        notCovered = " (not "+'/'.join(notCovered).replace('\n',"\\n")+")"
      diagnose_write("tryNBytes(%d) on %s found %s (avoiding '%s'), covers %d/%d contexts%s" % (nbytes,withAnnot_unistr,indicators,pOmit.replace(unichr(1),'/').replace('\n',"\\n"),sum(1 for x in covered if x),len(covered),notCovered))
    return negate,ret,sum(1 for x in covered if x),len(covered)
def cond(a,b,c):
  if a: return b
  else: return c

def badInfo(badStarts,nonAnnot):
  ret = "%d false positive" % len(badStarts)
  if not len(badStarts)==1: ret += "s"
  if len(badStarts) > yarowsky_debug: return ret
  for wordStart in badStarts:
   wordEnd = wordStart + len(nonAnnot)
   contextStart,contextEnd=max(0,wordStart-5),wordEnd+5
   toRead = corpus_markedDown
   # but can we report it from the original corpus_unistr?
   if wordStart in m2c_map and wordEnd in m2c_map:
    toRead = corpus_unistr
    wordStart,wordEnd = m2c_map[wordStart],m2c_map[wordEnd]
    newCStart,newCEnd = contextStart,contextEnd
    while newCStart not in m2c_map and newCStart >= contextStart-5: newCStart-=1
    while newCEnd not in m2c_map and newCEnd<contextEnd+5: newCEnd+=1
    if newCStart in m2c_map: contextStart = m2c_map[newCStart]
    else: contextStart = max(0,wordStart - 15) # This might cut across markup, but better that than failing to report the original corpus and making it look like the words might not have "lined up" when actually they did.  Might also just cut into surrounding non-markup text (if the above loop simply couldn't find anything near enough because such text was in the way).
    if newCEnd in m2c_map: contextEnd = m2c_map[newCEnd]
    else: contextEnd = wordEnd + 15 # ditto
   ret += (u" (%s%s%s%s%s)" % (toRead[contextStart:wordStart],reverse_on,toRead[wordStart:wordEnd],reverse_off,toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r").encode(terminal_charset,'replace')
  return ret
def unique_substrings(texts,allowedChars,omitFunc,valueFunc):
    # yield unique substrings of texts, in increasing length, with equal lengths sorted by highest score returned by valueFunc, and omitting any where omitFunc is true, or that uses any character not in allowedChars (allowedChars==None means all allowed)
    if allowedChars:
        # remove non-allowedChars from texts, splitting into smaller strings as necessary
        texts2 = [] ; append = texts2.append
        for text in texts:
            start = 0
            for i in xrange(len(text)):
                if not text[i] in allowedChars:
                    if i>start: append(text[start:i])
                    start = i+1
            if start<len(text): append(text[start:])
        texts=texts2
    if not texts: return
    length=1 ; maxlen = max(len(t) for t in texts)
    while length <= maxlen:
        ret=set()
        # sys.stderr.write("Finding (l=%d)... " % len(texts))
        for text in texts: ret.update(text[s:s+length] for s in xrange(len(text)-length+1))
        l=[(valueFunc(k),k) for k in ret if not omitFunc(k)]
        # if length == ybytes_max and not l: sys.stderr.write("Debugger: omitFunc was true for all %s\n" % repr(ret))
        # sys.stderr.write("%d of length %d\n" % (len(l),length))
def within_Nbytes(matchEndPos,nbytes):
    # return the Unicode characters within nbytes of matchEndPos, assuming the encoding will be outcode.  Used for the Yarowsky-like functions.
    # Assumes multibyte codes are self-synchronizing, i.e. if you start in the middle of a multibyte sequence, the first valid character will be the start of the next sequence, ok for utf-8 but TODO might not be the case for some codes
    return corpus_markedDown[max(0,matchEndPos-nbytes):matchEndPos].encode(outcode)[-nbytes:].decode(outcode,'ignore')+corpus_markedDown[matchEndPos:matchEndPos+nbytes].encode(outcode)[:nbytes].decode(outcode,'ignore')
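# e.g. if outcode is "utf-8" and nbytes=6, a match in Chinese text gets roughly the
# 2 hanzi before the end of the match plus the 2 hanzi after it (3 bytes per hanzi),
# with any partial character at the window edge dropped by the encode/slice/decode.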
def test_rule(withAnnot_unistr,yBytesRet,canBackground=None):
    # Tests to see if the rule withAnnot_unistr is
    # ALWAYS right in the examples, i.e.
    # the number of occurrences of its marked-down text
    # in the continuous marked-down string should be
    # EXACTLY equal to the number of occurrences of the
    # marked-up version.
    # (If we deal only in rules that ALWAYS work, we can
    # build them up incrementally without "cross-talk")
    # yield "backgrounded" = task has been backgrounded; .next() collects result (nb we default to NOT canBackground, as test_rule is called from several places of which ONE can handle backgrounding)
    if primitive:
      yield True ; return
    if ybytes:
        # Doesn't have to be always right, but put the indicators in yBytesRet
        ybrG = yarowsky_indicators(withAnnot_unistr,canBackground)
        ybr = ybrG.next()
        if ybr == "backgrounded":
          yield ybr ; ybr = ybrG.next()
        if ybr==True or not ybr:
          yield ybr ; return
        yBytesRet.append(ybr) # (negate, list of indicators, nbytes)
        yield True ; return
    phrase = markDown(withAnnot_unistr)
    ret = corpus_markedDown.count(phrase) == len(getOkStarts(withAnnot_unistr))
    if diagnose and diagnose==phrase:
      diagnose_write("occurrences(%s)==occurrences(%s) = %s" % (phrase,withAnnot_unistr,ret))
    yield ret
def all_possible_rules(words,covered):
    # Iterate over ALL possible rules derived from the
    # word sequence (don't just "find the shortest context
    # that predicts each word" because that can have
    # trouble with overlaps; need to check them all and
    # stop when we've got enough to reproduce the example)
    # As optimisation, avoids returning rules for which
    # all(covered) over that rule's range
    if max_words: maxRuleLen = min(len(words),max_words)
    else: maxRuleLen = len(words)
    for ruleLen in range(1,maxRuleLen+1): # (sort by len)
        for wStart in range(len(words)-ruleLen+1):
          if not all(covered[wStart:wStart+ruleLen]):
            yield words[wStart:wStart+ruleLen]
            # caller join()s before adding to rules dict
def checkCoverage(ruleAsWordlist,words,coveredFlags):
    # Updates coveredFlags and returns True if any changes
    # (if False, the new rule is redundant).
    # Don't worry about ybytes - assume the Yarowsky-like
    # indicators have been calculated correctly across the
    # whole text so we don't need to re-check them now.
    assert type(ruleAsWordlist)==type(words)==list
    try: start = words.index(ruleAsWordlist[0])
    except ValueError: return False
    ln = len(ruleAsWordlist)
    changedFlags = False
    while start <= len(words)-ln:
        if words[start:start+ln] == ruleAsWordlist:
            if not all(coveredFlags[start:start+ln]):
                coveredFlags[start:start+ln]=[True]*ln
                changedFlags = True
            start += ln
        else:
            try: start = words.index(ruleAsWordlist[0],start+1)
            except ValueError: break
    return changedFlags
def wspJoin(l):
  if removeSpace: return "".join(l)
  else: return " ".join(l)

def potentially_bad_overlap(rulesAsWordlists,newRuleAsWords):
    # Allow overlaps only if rule(s) being overlapped are
    # entirely included within newRule.  Otherwise could
    # get problems generating closures of overlaps.
    # (If newRule not allowed, caller to try a longer one)
    # Additionally, if allow_overlaps, allow ANY overlap as
    # long as it's not found in the marked-down text.
    if len(newRuleAsWords)==1 or primitive or ybytes: return False
    for ruleAsWordlist in rulesAsWordlists:
        if len(ruleAsWordlist)==1: continue
        if not len(ruleAsWordlist)==len(newRuleAsWords) and longerStartsOrEndsWithTheShorter(ruleAsWordlist,newRuleAsWords): continue
        for overlapSize in range(1,min(len(x) for x in [newRuleAsWords,ruleAsWordlist])):
            if not (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] or newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize]): continue
            if not allow_overlaps: return True
            # Test to see if the examples "allow" this potentially-bad overlap
            def overlapOK(rAW): return not markDown(wspJoin(rAW)) in corpus_markedDown
            if (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] and not overlapOK(ruleAsWordlist[:-overlapSize]+newRuleAsWords)) or (newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize] and not overlapOK(newRuleAsWords[:-overlapSize]+ruleAsWordlist)): return True

def longerStartsOrEndsWithTheShorter(l1,l2):
    if len(l1) > len(l2): l1,l2 = l2,l1
    return l2[:len(l1)]==l1 or l2[-len(l1):]==l1
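# e.g. longerStartsOrEndsWithTheShorter(["A","B"],["A","B","C"]) == True   (prefix)
#      longerStartsOrEndsWithTheShorter(["B","C"],["A","B","C"]) == True   (suffix)
#      longerStartsOrEndsWithTheShorter(["A","C"],["A","B","C"]) == False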

class RulesAccumulator:
  def __init__(self):
    self.rules = {}
    self.rulesAsWordlists_By1stWord = {} # starting word -> list of possible rules (as wordlists) that might apply
    self.rulesAsWordlists = list() # all rules as words (list of lists) (used if not ybytes, TODO: integrate with rulesAsWordlists_By1stWord?)
    self.seenPhrases = set() # de-duplicate, might speed up
    self.rejectedRules = set() # rules we've checked and found not to work (not saved: there might be better clues next time)
    self.amend_rules = False
    if rulesFile: self.load()
  def save(self):
    sys.stderr.write("\nPickling rules to %s... " % rulesFile)
    f = openfile(rulesFile,'wb')
    pickle.Pickler(f,-1).dump((self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases))
    # (don't save self.rejectedRules, there might be better clues next time)
    f.close() ; sys.stderr.write("done")
  def load(self):
    if not os.path.isfile(rulesFile):
      sys.stderr.write("%s does not exist, starting with blank rules\n" % rulesFile)
      return
    sys.stderr.write("Unpickling rules from %s... " % rulesFile)
    f = openfile(rulesFile,'rb')
    self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases = pickle.Unpickler(f).load()
    sys.stderr.write("done\n")
    self.amend_rules = True
    self.newRules = set()
  def remove_old_rules(self,words): # for incremental runs - removes previously-discovered rules that would have been suggested by this new phrase but that no longer 'work' with the rest of the corpus due to alterations elsewhere.  DOES NOT remove old rules that are not suggested by any phrase in the corpus because the phrases that suggested them have been removed or changed (TODO: might want an option for that, although fundamentally you shouldn't be relying on incremental runs if you're making a lot of changes to the corpus)
    for w in set(words):
      rulesAsWordlists = self.rulesAsWordlists_By1stWord.get(w,[])
      i=0
      while i<len(rulesAsWordlists):
        if max_words and len(rulesAsWordlists[i])>max_words:
          i += 1 ; continue # better leave that one alone if we're not reconsidering rules that long (e.g. running again with single_words when previous run wasn't)
        rule = wspJoin(rulesAsWordlists[i])
        if rule not in self.newRules and checkCoverage(rulesAsWordlists[i],words,[False]*len(words)): # rule would apply to the new phrase
          yBytesRet = []
          if not test_rule(rule,yBytesRet).next() or potentially_bad_overlap(self.rulesAsWordlists,rulesAsWordlists[i]): # re-test fails.  In versions v0.543 and below, we just removed ALL rules that would apply to the new phrase, to see if they would be re-generated.  But that caused problems because addRulesForPhrase can return early if all(covered) due to other (longer) rules and we might be removing a perfectly good short rule that's needed elsewhere.  So we now re-test before removal.
            self.rejectedRules.add(rule)
            if not ybytes: self.rulesAsWordlists.discard(rulesAsWordlists[i])
            del rulesAsWordlists[i] ; del self.rules[rule]
            continue
          self.newRules.add(rule) # still current - add to newRules now to save calling test_rule again
          if len(yBytesRet): self.rules[rule] = yBytesRet[0] # overriding what it was before (since we've re-done test_rule for it, which might have returned a new set of Yarowsky-like indicators for the new version of the corpus)
        i += 1
  def addRulesForPhrase(self,phrase,canBackground=False):
    if phrase in self.seenPhrases or (diagnose_quick and diagnose):
      # if diagnose and (diagnose_quick or self.amend_rules) and mdStart+diagnose+mdEnd in phrase: pass # look at it again for diagnostics.  But do we accept a diagnose that spans multiple words?  should be pointed out by --diagnose-quick below if uncommented
      if diagnose and (diagnose_quick or self.amend_rules) and diagnose in markDown(phrase): pass # this version accepts diagnose of multiple words (and might also let some phrases through where it matches on an overlap)
      else:
        yield 0,0 ; return # TODO: document that this means the total 'covered' figure in the progress status is AFTER phrase de-duplication (otherwise we'd have to look up what the previous values were last time we saw it - no point doing that just for a quick statistic)
    words = filter(lambda x:markDown(x).strip(),splitWords(phrase)) # filter out any that don't have base text (these will be input glitches, TODO: verify the annotation text is also just whitespace, warn if not)
    if not words:
      yield 0,0 ; return
    covered = [False]*len(words)
    # first see how much is covered by existing rules
    # (don't have to worry about the order, as we've been
    # careful about overlaps)
    if self.amend_rules: self.remove_old_rules(words) # NB if yignore this might not remove all, but still removes all that affect checkCoverage below
    for w in words:
      for ruleAsWordlist in self.rulesAsWordlists_By1stWord.get(w,[]):
        if checkCoverage(ruleAsWordlist,words,covered) and all(covered):
          yield len(covered),len(covered) ; return # no new rules needed
    for ruleAsWordlist in all_possible_rules(words,covered):
        rule = wspJoin(ruleAsWordlist) ; yBytesRet = []
        if rule in self.rejectedRules: continue
        if rule in self.rules: continue # this can still happen even now all_possible_rules takes 'covered' into account, because the above checkCoverage assumes the rule won't be applied in a self-overlapping fashion, whereas all_possible_rules makes no such assumption (TODO: fix this inconsistency?)
        rGen = test_rule(rule,yBytesRet,canBackground)
        r = rGen.next()
        if r=="backgrounded":
          yield r ; r = rGen.next()
        del rGen
        if not r or potentially_bad_overlap(self.rulesAsWordlists,ruleAsWordlist):
            self.rejectedRules.add(rule) # so we don't waste time evaluating it again (TODO: make sure rejectedRules doesn't get too big?)
            continue
        cc = checkCoverage(ruleAsWordlist,words,covered) # changes 'covered'
        assert cc, "this call to checkCoverage should never return False now that all_possible_rules takes 'covered' into account" # and it's a generator which is always working from the CURRENT copy of 'covered'
        if len(yBytesRet): self.rules[rule] = yBytesRet[0]
        else: self.rules[rule] = [] # unconditional
        if not ybytes: self.rulesAsWordlists.append(ruleAsWordlist)
        if not ruleAsWordlist[0] in self.rulesAsWordlists_By1stWord: self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]] = []
        self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]].append(ruleAsWordlist)
        if self.amend_rules: self.newRules.add(rule)
        handle_diagnose_limit(rule)
        if all(covered):
          yield len(covered),len(covered) ; return
    # If get here, failed to completely cover the phrase.
    # ruleAsWordlist should be set to the whole-phrase rule.
    yield sum(1 for x in covered if x),len(covered)
  def rulesAndConds(self):
    if self.amend_rules: return [(k,v) for k,v in self.rules.items() if not k in self.newRules] + [(k,v) for k,v in self.rules.items() if k in self.newRules] # new rules must come last for incremental runs, so they will override existing actions in byteSeq_to_action_dict when small changes have been made to the annotation of the same word (e.g. capitalisation-normalisation has been changed by the presence of new material)
    else: return self.rules.items()
def handle_diagnose_limit(rule):
  global diagnose,diagnose_limit
  if diagnose and diagnose_limit and diagnose==markDown(rule):
    diagnose_limit -= 1
    if not diagnose_limit:
      diagnose = False
      diagnose_write("limit reached, suppressing further diagnostics")

def generate_map():
    global m2c_map, precalc_sets, yPriorityDic
    if checkpoint:
      try:
        f=open_try_bz2(checkpoint+os.sep+'map','rb')
        m2c_map,precalc_sets,yPriorityDic = pickle.Unpickler(f).load()
        return
      except: pass
    assert main, "Only main should generate corpus map"
    sys.stderr.write("Generating corpus map... ")
    m2c_map = {} ; precalc_sets = {}
    muStart = downLenSoFar = 0
    for s in re.finditer(re.escape(markupStart), corpus_unistr):
      s=s.start()
      downLenSoFar += len(markDown(corpus_unistr[muStart:s]))
      muStart = s
      m2c_map[downLenSoFar] = s
      # Added optimisation: do precalc_sets as well
      # (at least catch the 1-word cases)
      # -> this is now needed even if not ybytes
      e=corpus_unistr.find(markupEnd,s)
      if e>-1:
        e += len(markupEnd)
        k = corpus_unistr[s:e]
        if k not in precalc_sets: precalc_sets[k]=set()
        precalc_sets[k].add(downLenSoFar)
    yPriorityDic = {}
    if ref_pri and ybytes:
      sys.stderr.write("yPriorityDic ... ")
      for s in re.finditer(re.escape(reference_sep+ref_pri+ref_name_end), corpus_unistr):
        s = s.start()+len(reference_sep+ref_pri+ref_name_end)
        e = corpus_unistr.find(reference_sep,s)
        if e==-1: e=len(corpus_unistr)
        for w in splitWords(corpus_unistr[s:e]):
          wd = markDown(w)
          if wd in yPriorityDic: continue
          if diagnose==wd: diagnose_write("yPriorityDic[%s] = %s" % (wd,w))
          yPriorityDic[wd] = w
    if checkpoint: pickle.Pickler(open_try_bz2(checkpoint+os.sep+'map','wb'),-1).dump((m2c_map,precalc_sets,yPriorityDic))
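# (m2c_map maps each word-start offset in the markdown-only text to the offset of the
#  corresponding markup in corpus_unistr; precalc_sets maps each single marked-up word
#  to the set of marked-down offsets where it starts; yPriorityDic maps a marked-down
#  word to the marked-up form preferred by the priority reference when ref_pri and
#  ybytes are set.)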
def setup_parallelism():
    if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus and map etc)
    import commands
    try:
      commands.getoutput(
        "ps -p " + str(os.getpid()) + " -o args") \
        .index("-m mpi4py.futures") # ValueError if not found
      import mpi4py.futures # mpi4py v2.1+
      import mpi4py.MPI, mpi4py ; assert mpi4py.MPI.COMM_WORLD.size > 1, "mpi4py says world size is 1: likely a symptom of incorrectly-configured MPI.  Did you compile mpi4py using the same setup (e.g. MPICH or OpenMPI) as you are running?  mpi4py's config is: "+repr(mpi4py.get_config())
      return mpi4py.futures.MPIPoolExecutor()
    except ValueError: pass # but raise all other exceptions: if we're being run within mpi4py.futures then we want to know about MPI problems
    try:
      commands.getoutput(
        "ps -p " + str(os.getpid()) + " -o args") \
        .index("-m scoop") # ValueError if not found
      import scoop.futures
      return scoop.futures # submit() is at module level
    except ValueError: pass
    try:
      import concurrent.futures # sudo pip install futures (2.7 backport of 3.2 standard library)
      import multiprocessing
      num_cpus = multiprocessing.cpu_count()
      if num_cpus >= 2: return concurrent.futures.ProcessPoolExecutor(num_cpus-1) # leave one for the CPU-heavy control task
    except: pass
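# (Illustrative invocations matching the detection above; exact flags depend on your
#  MPI / SCOOP installation:
#    mpiexec -n 4 python -m mpi4py.futures annogen.py ...options...
#    python -m scoop annogen.py ...options...
#  otherwise concurrent.futures is used on multi-core machines when available.)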

def get_phrases():
    # Returns a list of phrases in processing order, with length-numbers inserted in the list.  Caches its result.
    global _gp_cache
    try: return _gp_cache
    except: pass
    # Due to the way we handle overlaps, it's better to process the shortest phrases first, as the longer phrases will yield more rule options and therefore more likely to be able to work around any "no-overlap" constraints imposed by already-processed examples.  Something like:
    p2 = []
    for p in splitWords(corpus_unistr,phrases=True):
      p2.append((min([len(p.split(markupStart)),len(p.split(markupMid)),len(p.split(markupEnd))]),len(p2),p)) # no need for splitWords(phrase) just to get len, but we do need the min-of-3 for robustness against the occasional markup error
    p2.sort() # by length, then by original position (note: if removing this sort, remove wordsThisPhrase from status_update)
    phrases = [] ; wordLen = None
    for p in p2:
      if not wordLen == p[0]:
        wordLen = p[0]
        phrases.append(wordLen-1) # because it's a .split length (really want an actual count, but it only has to be roughly right in this instance and splitLen-1 will do for speed)
      phrases.append(p[-1])
    _gp_cache = phrases ; return phrases

def setup_other_globals():
    global corpus_markedDown
    corpus_markedDown = markDown(corpus_unistr)
    if ybytes:
        global markedUp_unichars
        if yarowsky_all: markedUp_unichars = None
        else: markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int)))
def check_globals_are_set_up(): # for use during parallelism
  global corpus_unistr
  try: corpus_unistr # if we fork()d, we may already have it
  except NameError:
    normalise() # should get corpus_unistr from checkpoint,
    try: corpus_unistr # unless we're NOT normalising,
    except: corpus_unistr = openfile(infile).read().decode(incode) # in which case we have to load the corpus from scratch (it won't be stdin)
    generate_map() # similarly this should just be a read
    setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine

def analyse():
    covered = 0 # number of phrases we managed to 'cover' with our rules
    toCover = 0 # number of phrases we TRIED to cover (==covered if 100%)
    phraseNo = 0 ; wordLen = None
    accum = RulesAccumulator()
    if checkpoint:
      try: phraseNo,wordLen,covered,toCover,accum.__dict__ = read_checkpoint()
      except: pass
    phraseLastUpdate = phraseLastCheckpoint = phraseNo
    lastUpdate = lastCheckpoint = startTime = time.time()
    backgrounded = [] ; phrases = get_phrases()
    while phraseNo < len(phrases):
        if type(phrases[phraseNo])==int:
          wordLen = phrases[phraseNo]
          for b in backgrounded: # flush (TODO: duplicate code)
            coveredA,toCoverA = b.next()
            covered += coveredA ; toCover += toCoverA
          backgrounded = []
          phraseNo += 1 ; continue
        if toCover:
          if checkpoint and (checkpoint_exit(0) or time.time() >= lastCheckpoint + 1000): # TODO: configurable?
            sys.stderr.write("Checkpointing..."+clear_eol)