case 72: {
var numBytes = data.charCodeAt(dPtr++);
var annot = readRefStr();
s();
output.push("<ruby><rb>");
output.push(input.slice(copyP,copyP+numBytes));
copyP += numBytes;
output.push("</rb><rt>"); output.push(annot);
output.push("</rt></ruby>"); break; }
case 73: {
var numBytes = data.charCodeAt(dPtr++);
var annot = readRefStr();
var title = readRefStr();
s();
output.push("<ruby title=\""); output.push(title);
output.push("\"><rb>");
output.push(input.slice(copyP,copyP+numBytes));
copyP += numBytes;
output.push("</rb><rt>"); output.push(annot);
output.push("</rt></ruby>"); break; }
case 80: sPos.push(p); break;
case 81: p=sPos.pop(); break;
case 90: {
var tPtr = readAddr();
var fPtr = readAddr();
var nearbytes = data.charCodeAt(dPtr++);
var o=p;
if (o > nearbytes) o -= nearbytes; else o = 0;
var max = p + nearbytes;
if (max > inputLength) max = inputLength;
var tStr = input.slice(o,max);
var found = 0;
while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; }

dPtr = found ? tPtr : fPtr; break;
}
default: throw("corrupt data table at "+(dPtr-1)+" ("+data.charCodeAt(dPtr-1)+")");
}
}
}
while(p < inputLength) {
var oldPos=p;
dPtr=1;readData();
if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; }
}
return decodeURIComponent(escape(output.join(""))); // from UTF-8 back to Unicode
} // end of annotate function
};
function annotate(input) { return Annotator.annotate(input); }
if (typeof Backbone != "undefined" && Backbone.Model) { Annotator = Backbone.Model.extend(Annotator); annotate=function(input) { return new Annotator().annotate(input) } }
if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) {
// Node.js command-line test
fs=require('fs');
process.stdout.write(annotate(fs.readFileSync('/dev/stdin').toString()));
} else if (typeof module != "undefined" && module.exports) { // Common.js
module.exports = Annotator;
}
"""
py_start = '# Python '+version_stamp+r"""

# You can import this module and call annotate(utf8 bytes)
# (from multiple threads if desired),
# or you can run from the command line on standard input.
# annotate has an optional second argument, which can be
# 'ruby' (default), 'raw' (annotation only) or 'braces'.
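# Example (assuming the generated file was saved as annotator.py and the
# input bytes are covered by the annotation data):
#   import annotator
#   print annotator.annotate("input text as UTF-8 bytes", "braces")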
"""
py_end = r"""
class Annotator:
version="""+'"'+version_stamp+r""""

def __call__(self,inStr,aType):
if aType=="ruby": self.startA,self.midA,self.endA = "<ruby><rb>","</rb><rt>","</rt></ruby>"
elif aType=="raw": self.startA=self.midA=self.endA = ""
elif aType=="braces": self.startA,self.midA,self.endA = "{","|","}"
else: raise Exception("Unrecognised annotation type "+repr(aType))
assert type(inStr)==str
self.inStr = inStr
self.addrLen = ord(data[0])
self.inputLength = len(inStr)
self.p = 0 # read-ahead pointer
self.copyP = 0 # copy pointer
self.output = []
self.needSpace = 0 ; out = self.output
while self.p < self.inputLength:
oldPos = self.p
self.dPtr = 1 ; self.readData()
if oldPos == self.p:
self.needSpace=0
out.append(inStr[self.p])
self.p += 1 ; self.copyP += 1
return "".join(self.output)
def readAddr(self):
addr = 0
for i in range(self.addrLen):
addr=(addr << 8) | ord(data[self.dPtr])
self.dPtr += 1
return addr
def readRefStr(self):
a = self.readAddr(); l=ord(data[a])
if l: return data[a+1:a+l+1]
else: return data[a+1:data.index('\x00',a+1)]
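# (For illustration: readRefStr above expects each referenced string to be stored
# either as a length byte 1-255 followed by that many bytes, or, when the length
# byte is 0, as the bytes followed by a '\x00' terminator for longer strings.)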
def s(self):
if self.needSpace: self.output.append(" ")
else: self.needSpace=1
def readData(self):
sPos = [] ; out = self.output
while True:
d = ord(data[self.dPtr]) ; self.dPtr += 1
if d==50: self.dPtr = self.readAddr()
elif d==51:
func = self.readAddr() ; dO = self.dPtr
self.dPtr = func ; self.readData() ; self.dPtr = dO
elif d==52: return
elif d==60:
nBytes = ord(data[self.dPtr])+1 ; self.dPtr += 1
if self.p>=len(self.inStr): i = -1
else: i = data[self.dPtr:self.dPtr+nBytes].find(self.inStr[self.p]) ; self.p += 1

if i==-1: i = nBytes
self.dPtr += (nBytes + i * self.addrLen)
self.dPtr = self.readAddr()
elif d==71:
numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
elif d==72:
numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
annot = self.readRefStr()
self.s()
if self.startA:
out.append(self.startA)
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
out.append(self.midA) ; out.append(annot)
out.append(self.endA)
elif d==73:
numBytes = ord(data[self.dPtr]) ; self.dPtr += 1
annot = self.readRefStr()
title = self.readRefStr()
self.s()
if self.startA=="{": # omit title in braces mode
out.append(self.startA)
out.append(self.inStr[self.copyP:self.copyP+numBytes])
elif self.startA:
out.append("<ruby title=\"");out.append(title)
out.append("\"><rb>");
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
out.append(self.midA) ; out.append(annot)
out.append(self.endA)
elif d==80: sPos.append(self.p)
elif d==81: self.p = sPos.pop()
elif d==90:
tPtr = self.readAddr()
fPtr = self.readAddr()
nearbytes = ord(data[self.dPtr]) ; self.dPtr += 1
o = max(self.p-nearbytes,0)
maxx = min(self.p+nearbytes,self.inputLength)
tStr = self.inStr[o:maxx]
found = 0
while self.dPtr < tPtr and self.dPtr < fPtr:

if self.readRefStr() in tStr:
found = 1 ; break
if found: self.dPtr = tPtr
else: self.dPtr = fPtr
else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1]))+")")
def annotate(inStr,p="ruby"): return Annotator()(inStr,p)
def main():
import sys
if sys.argv[-1].startswith("--"): param=sys.argv[-1][2:]
else: param = "ruby"
sys.stdout.write(annotate(sys.stdin.read(),param))
if __name__=="__main__": main()
"""
c_zlib = r"""static unsigned char *data=NULL;
static void init() {
z_stream s; memset(&s,0,sizeof(s));
s.next_in=origData; s.avail_in=%%ZLIBLEN%%;
data=malloc(%%ORIGLEN%%); // TODO: check non-NULL
s.next_out=data; s.avail_out=%%ORIGLEN%%;
inflateInit(&s); inflate(&s, Z_NO_FLUSH); // TODO: check for memory and data-corruption errors
inflateEnd(&s);
}
"""
c_datadrive = r"""
static unsigned char *dPtr; static int addrLen;
#include <stdlib.h>
static unsigned char * readAddr() {
size_t i,addr=0;
for (i=addrLen; i; i--) addr=(addr << 8) | *dPtr++;
return data + addr;
}
static void readData() {
POSTYPE *savedPositions = NULL;
size_t numSavedPositions = 0;
while(1) {
unsigned char c = *dPtr++;
if (c & 0x80) dPtr += (c&0x7F); // short relative forward jump (up to 128 bytes from addr after instruction)
else if(c < 20) { // switchbyte with short jumps
c++; // now c == nBytes
unsigned char byte=(unsigned char)NEXTBYTE;
int i;
for (i=0; i<c; i++) if(byte==dPtr[i]) break;
dPtr += c+c+1 + dPtr[c+i]; // relative from end of switch (after all bytes, 1-byte addresses and the 1-byte default address: up to 256 bytes after)
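/* For illustration: with c==1 (so nBytes==2) the table after the opcode is laid
   out as [byte0][byte1][jump-if-byte0][jump-if-byte1][jump-if-no-match], each
   jump being a 1-byte offset counted from the end of that 5-byte table. */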
} else switch(c) {
case 50: /* jump */ dPtr = readAddr(); break;
case 51: /* call */ {
unsigned char *funcToCall=readAddr();
unsigned char *retAddr = dPtr;
dPtr = funcToCall; readData(); dPtr = retAddr;
break; }
case 52: /* return */
if (savedPositions) free(savedPositions);
return;
case 60: /* switchbyte */ {
int nBytes=(*dPtr++)+1, i;
unsigned char byte=(unsigned char)NEXTBYTE;
for (i=0; i<nBytes; i++) if(byte==dPtr[i]) break;
dPtr += (nBytes + i * addrLen);
dPtr = readAddr(); break; }
case 71: /* copy bytes from input */ {
int numBytes=*dPtr++;
for(;numBytes;numBytes--)
OutWriteByte(NEXT_COPY_BYTE);
break; }
case 72: /* annotate */ {
int numBytes=*dPtr++;
char *annot = (char*)readAddr();
o(numBytes,annot); break; }
case 73: /* annotate with title */ {
int numBytes=*dPtr++;
char *annot = (char*)readAddr();
char *title = (char*)readAddr();
o2(numBytes,annot,title); break; }
case 80: /* save position */
savedPositions=realloc(savedPositions,++numSavedPositions*sizeof(POSTYPE)); // TODO: check non-NULL?
savedPositions[numSavedPositions-1]=THEPOS;
break;
case 81: /* restore position */
SETPOS(savedPositions[--numSavedPositions]);
break;
case 90: /* near test */ {
unsigned char *truePtr = readAddr();
unsigned char *falsePtr = readAddr();
setnear(*dPtr++); int found=0;
while(dPtr < truePtr && dPtr < falsePtr) if(near((char*)readAddr())) { found = 1; break; }
dPtr = found ? truePtr : falsePtr; break; }
// default: TODO: error about corrupt data?
}
}
}
static void topLevelMatch() {
addrLen = data[0];
dPtr=data+1; readData();
}
"""

def splitWords(text,phrases=False):
# split text into words, ignoring anything between markupStart and markupEnd
# if phrases = True, instead of words, split on any non-whitespace char outside markupStart..markupEnd
if phrases: it=re.finditer(phrasePattern,text)
else: it=re.finditer(wordPattern,text)
for i in it: yield i.group()


markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd))
wordPattern = re.escape(markupStart)+'.*?'+re.escape(markupEnd)
phrasePattern = re.compile(wordPattern+r'(\s*'+wordPattern+r')*')
wordPattern = re.compile(wordPattern)
wspPattern = re.compile(r"\s+")


def annotationOnly(text):
ret = []

for w in re.finditer(markupPattern,text):
if mreverse: ret.append(w.group(1))
else: ret.append(w.group(2))

return ' '.join(ret)
def markDown(text):
# Return just the original text, without markup

if mreverse: group=r"\2"
else: group=r"\1"
return re.sub(markupPattern,group,text)


def markUp(text,annotation):
if mreverse: text,annotation = annotation,text
return markupStart + text + markupMid + annotation + markupEnd
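# Illustration (assuming mreverse is False and the same hypothetical "{", "|", "}" delimiters):
#   markUp(u'word', u'anno')               -> u'{word|anno}'
#   markDown(u'{word|anno} {w2|a2}')       -> u'word w2'
#   annotationOnly(u'{word|anno} {w2|a2}') -> u'anno a2'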

def checkpoint_exit(doIt=1):
if not checkpoint: return
try: open(checkpoint+os.sep+"ExitASAP")
except: return
if doIt:
assert main, "Only annogen's main module should call checkpoint_exit with doIt=1"

os.remove(checkpoint+os.sep+"ExitASAP")
sys.stderr.write("\nExitASAP found: exit\n")
raise SystemExit
else: return True
try: import cPickle as pickle
except:
try: import pickle
except: pickle = None
def read_checkpoint():
t = pickle.Unpickler(open(checkpoint+os.sep+'checkpoint','rb')).load()
sys.stderr.write("Checkpoint loaded from %d phrases\n" % t[0])
return t
def write_checkpoint(t):
pickle.Pickler(open(checkpoint+os.sep+'checkpoint-NEW','wb'),-1).dump(t) # better write to checkpoint-NEW, in case we reboot or have an OS-level "Out of memory" condition *while* checkpointing
try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
except OSError: # OS can't do it atomically?
try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
except OSError: pass
checkpoint_exit()
def status_update(phraseNo,numPhrases,wordsThisPhrase,nRules,phraseLastUpdate,lastUpdate,phraseLastCheckpoint,lastCheckpoint,coverP,nRej,startTime):
phraseSec = (phraseNo-phraseLastUpdate)*1.0/(time.time()-lastUpdate)
if phraseSec < 100:
phraseSecS = "%.1f" % phraseSec
else: phraseSecS = "%d" % int(phraseSec)
progress = status_prefix + "%s phrase/sec (%d%%/#w=%d) rules=%d cover=%d%%" % (phraseSecS,int(100.0*phraseNo/numPhrases),wordsThisPhrase,nRules,coverP)
if warn_yarowsky: progress += (" rej=%d" % nRej)
if time_estimate:
if phraseNo-phraseLastCheckpoint < 10: phraseMin = phraseSec*60 # current 'instantaneous' speed
else: phraseMin = (phraseNo-phraseLastCheckpoint)*60/(time.time()-lastCheckpoint) # longer-term average
minsLeft = (numPhrases-phraseNo)/phraseMin
if minsLeft>60*24: progress += " %dd+" % int(minsLeft/60/24)
elif minsLeft>60: progress += " %dh+" % int(minsLeft/60)
elif minsLeft: progress += " %dmin+" % minsLeft
# (including the + because this is liable to be an underestimate; see comment after the --time-estimate option)
if len(progress) + 14 < screenWidth:
progress += " (at %02d:%02d:%02d" % time.localtime()[3:6] # clock time: might be useful for checking if it seems stuck
if len(progress) + 20 < screenWidth and not clear_eol == " \r": # (being able to fit this in can be intermittent)
progress += ", analyse=%d:%02d:%02d" % (elapsed/3600,(elapsed%3600)/60,elapsed%60)
progress += ")"


def normalise():
if capitalisation and annot_whitespace: return

global corpus_unistr
if checkpoint:
try:
f=open_try_bz2(checkpoint+os.sep+'normalised','rb')

corpus_unistr = f.read().decode('utf-8')
return
except: # if re-generating 'normalised', will also need to regenerate 'map' and 'checkpoint' if present
assert main, "normalise checkpoint not readable in non-main module"
rm_f(checkpoint+os.sep+'map.bz2') ; rm_f(checkpoint+os.sep+'map')
rm_f(checkpoint+os.sep+'checkpoint')
else: assert main, "normalise called in non-main module and checkpoint isn't even set"

sys.stderr.write("Normalising...")

allWords = getAllWords()
if removeSpace:
corpus_unistr = re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),markupEnd+markupStart,corpus_unistr) # so getOkStarts works consistently if corpus has some space-separated and some not
if not annot_whitespace:

# normalise trailing hyphens e.g. from OCR'd scans:
cu0 = corpus_unistr ; ff = 0
for hTry in [1,2]:
for w in allWords:
if '-'+aoEnd in w:
idx = w.index('-'+aoEnd)
if w[:idx].endswith(aoStart) or w[:idx].endswith("-"): continue # ignore this one (a mess of some kind)
if hTry==2: # ouch, this doesn't look good
sys.stderr.write(" (can't normalise hyphens due to '%s') " % w.encode(terminal_charset,'replace'))

corpus_unistr = cu0 ; break
if mreverse: grp,mdG=r"-\1",r"\2"
else: grp,mdG=r"-\2",r"\1"
# TODO: batch up the following replacements by using something similar to Replacer but with a common destination regexp that takes groups from the 'w' entries as well. (Low priority because don't typically get TOO many of these dangling hyphens in most corpuses.)
corpus_unistr = re.sub(re.escape(w)+r"\s*"+re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),re.escape(w).replace(re.escape('-'+aoEnd),grp+re.escape(aoEnd)).replace(re.escape(mdEnd),mdG+re.escape(mdEnd)),corpus_unistr)
ff = 1
if ff: allWords = getAllWords() # re-generate
del cu0

class Replacer:
def __init__(self): self.dic = {}
def add(self,x,y):
if diagnose and diagnose in x: diagnose_write("Replacer.add(%s,%s)" % (x,y))

self.dic[x] = y
if not (len(self.dic)%1500): sys.stderr.write('.') # try this instead

def flush(self):
if not self.dic: return
global corpus_unistr
for exp in orRegexes(re.escape(k) for k in self.dic.iterkeys()):
sys.stderr.write(';')
corpus_unistr = re.sub(exp,lambda k:self.dic[k.group(0)],corpus_unistr)

self.dic = {}

rpl = Replacer() ; rpl.cu_nosp = None
def normWord(w):

if '-' in w: hTry=set([w.replace('-','')]) # if not annot_whitespace, we'll replace any non-hyphenated 'run together' version by the version with the hyphen; that's often the sensible thing to do with pinyin etc (TODO more customisation??)
else: hTry=None

if not capitalisation:
wl = w.lower() # (as long as it's all Unicode strings, .lower() and .upper() work with accents etc)

if not w==wl and wl in allWords:
# This word is NOT always capitalised, just
# sometimes at the start of a sentence.
# To simplify rules, make it always lower.

w = wl
if hTry: hTry.add(w.replace('-',''))
if annot_whitespace or (keep_whitespace and markDown(w) in keep_whitespace): return w,None
if not re.search(wspPattern,w): return w,hTry
nowsp = re.sub(wspPattern,"",w)
if not capitalisation and not nowsp.lower()==nowsp and nowsp.lower() in allWords: nowsp = nowsp.lower()

if nowsp in allWords: return nowsp,hTry # varying whitespace in the annotation of a SINGLE word: probably simplest if we say the version without whitespace, if it exists, is 'canonical' (there might be more than one with-whitespace variant), at least until we can set the relative authority of the reference (TODO)

ao,md = annotationOnly(w),markDown(w)
aoS = ao.split()
if len(md.split())==1 and len(md) <= 5 and len(aoS) <= len(md): # TODO: 5 configurable? don't want different_ways_of_splitting to take too long
# if not too many chars, try different ways of
# assigning each word to chars, and see if any
# of these exist in the corpus; if any does,
# assume we have "ABC|a bc" <= "A|a BC|bc" type
# situations - the latter shouldn't necessarily be
# converted into the former, but the former might
# be convertible into the latter to simplify rules
if rpl.cu_nosp == None:
rpl.cu_nosp = re.sub(wspPattern,"",corpus_unistr)
if not capitalisation: rpl.cu_nosp = rpl.cu_nosp.lower() # ignore capitalisation when searching for this
if capitalisation: aoS2 = aoS
else: aoS2 = [w0.lower() for w0 in aoS]

for charBunches in different_ways_of_splitting(md,len(aoS)):
mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS2)]

multiword = "".join(mw)
if multiword in rpl.cu_nosp:
# we're about to return a split version of the words, but we now have to pretend it went through the initial capitalisation logic that way (otherwise could get unnecessarily large collocation checks)
if not capitalisation:
mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS)] # the original capitalisation. for selective .lower()

for i in range(len(mw)):
w0 = mw[i]
wl = w0.lower()
if not w0==wl and wl in allWords:

mw[i] = wl
return "".join(mw),hTry

# TODO: is there ANY time where we want multiword to take priority over the nowsp (no-whitespace) version above? or even REPLACE multiword occurrences in the corpus with the 1-word nowsp version?? (must be VERY CAREFUL doing that)
# TODO: anything else?

return w,hTry
for w in allWords:
w2,hTry = normWord(w)
if hTry:
hTry.add(w2.replace('-','')) # in case not already there
for h in hTry:
if h in allWords: rpl.add(h,w2)
if not w==w2: rpl.add(w,w2)
rpl.flush()

sys.stderr.write(" done\n")
if checkpoint: open_try_bz2(checkpoint+os.sep+'normalised','wb').write(corpus_unistr.encode('utf-8'))

checkpoint_exit()
def getAllWords():
allWords = set()
for phrase in splitWords(corpus_unistr,phrases=True):
allWords.update(splitWords(phrase))
return allWords # do NOT cache (is called either side of the normaliser)
def orRegexes(escaped_keys):
escaped_keys = list(escaped_keys) # don't just iterate
try: yield re.compile('|'.join(escaped_keys))
except OverflowError: # regex too big (e.g. default Python on Mac OS 10.7 i.e. Python 2.7.1 (r271:86832, Jul 31 2011, 19:30:53); probably some Windows versions also; does not affect Mac HomeBrew's Python 2.7.12)
ek = escaped_keys[:len(escaped_keys)/2]
for r in orRegexes(ek): yield r
ek = escaped_keys[len(ek):]
for r in orRegexes(ek): yield r


if mreverse: mdStart,mdEnd,aoStart,aoEnd = markupMid,markupEnd,markupStart,markupMid
else: mdStart,mdEnd,aoStart,aoEnd = markupStart,markupMid,markupMid,markupEnd

def different_ways_of_splitting(chars,numWords):
if numWords > len(chars): return
elif numWords == len(chars):

yield list(chars) ; return

elif numWords == 1:
yield [chars] ; return
spAt_try1 = len(chars) / numWords + 1
for spAt in range(spAt_try1,0,-1) + range(spAt_try1+1, len(chars)-numWords+1):
for r in different_ways_of_splitting(chars[spAt:],numWords-1): yield [chars[:spAt]]+r
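# Illustration: different_ways_of_splitting(u"ABC",2) yields [u"AB",u"C"] and then
# [u"A",u"BC"] - i.e. every way of cutting the 3 characters into 2 non-empty words,
# starting from the split point spAt_try1 computed above.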

def yarowsky_indicators(withAnnot_unistr,canBackground):
# yields True if rule always works (or in majority of cases with ymajority), or lists enough indicators to cover example instances and yields (negate, list, nbytes), or just list if empty.

# (If too few indicators can be found, will list the ones it can, or empty if no clearly-distinguishable indicators can be found within ybytes of end of match.)
# yield "backgrounded" = task has been backgrounded; .next() collects result

nonAnnot=markDown(withAnnot_unistr)

if nonAnnot in yPriorityDic: # TODO: enforce len==1 ?
if yPriorityDic[nonAnnot] == withAnnot_unistr:
# we want this case to be the default (TODO: can't we just put it straight into the rules when making yPriorityDic, and skip this? although I'm not sure if that would give much of a speedup, as the phrase/sec count tends to go into the thousands anyway when it's processing a yPriorityDic section)
if nonAnnot==diagnose: diagnose_write("yPriorityDic forces %s" % (withAnnot_unistr,))

yield True ; return
else:
if nonAnnot==diagnose: diagnose_write("yPriorityDic forbids default %s" % (withAnnot_unistr,))

can_be_default = False # another is default, don't make this one default even if it occurs more

else: can_be_default = True
# First, find positions in corpus_markedDown which match withAnnot_unistr in corpus_unistr

okStarts = getOkStarts(withAnnot_unistr)
# now check for corpus_markedDown matches that *don't* have withAnnot_unistr
badStarts = getBadStarts(nonAnnot,okStarts)

if not badStarts:
if nonAnnot==diagnose: diagnose_write("%s has no badStarts" % (withAnnot_unistr,))
yield True ; return # rule always works, no Yarowsky indicators needed

if can_be_default and len(okStarts) > len(badStarts) and len(nonAnnot)==1:
if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule" % (withAnnot_unistr,))
yield True ; return # duplicate of code below (can test for this case early before reducing-down badStarts)

badStarts = getReallyBadStarts(badStarts,nonAnnot) # see its comments (ignore some badStarts)

if not badStarts:
if nonAnnot==diagnose: diagnose_write("%s has only probably-irrelevant badStarts" % (withAnnot_unistr,))

yield True ; return
# Now, if it's right more often than not:
if can_be_default and len(okStarts) > len(badStarts):
# could we have this as a "default" rule, with the other cases as exceptions that will be found first?

if len(nonAnnot)==1:
if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule after removing irrelevant badStarts" % (withAnnot_unistr,))
yield True ; return # should be safe, and should cover most "common short Chinese word with thousands of contexts" cases

# If len 2 or more, it's risky because the correct solution could be to process just a fraction of the word now and the rest will become the start of a longer word, so we probably don't want it matching the whole lot by default unless can be sure about it
# e.g. looking at rule AB, text ABC and correct segmentation is A BC, don't want it to 'greedily' match AB by default without positive indicators it should do so
# Check for no "A BC" situations, i.e. can't find any possible SEQUENCE of rules that STARTS with ALL the characters in nonAnnot and that involves having them SPLIT across multiple words:
# (The below might under-match if there's the appearance of a split rule but it actually has extra non-marked-up text in between, but it shouldn't over-match.)
# TODO: if we can find the actual "A BC" sequences (instead of simply checking for their possibility as here), and if we can guarantee to make 'phrase'-length rules for all of them, then AB can still be the default. This might be useful if okStarts is very much greater than badStarts.
# (TODO: until the above is implemented, consider recommending --ymax-threshold=0, because, now that Yarowsky-like collocations can be negative, the 'following word' could just go in as a collocation with low ybytes)
# TODO: also, if the exceptions to rule AB are always of the form "Z A B", and we can guarantee to generate a phrase rule for "Z A B", then AB can still be default. (We should already catch this when the exceptions are "ZA B", but not when they are "Z A B", and --ymax-threshold=0 probably won't always help here, especially if Z==B; Mandarin "mei2you3" / "you3 mei2 you3" comes to mind)

llen = len(mdStart)+len(nonAnnot)
if all(x.end()-x.start()==llen for x in re.finditer(re.escape(mdStart)+("("+re.escape(mdEnd)+"((?!"+re.escape(mdStart)+").)*.?"+re.escape(mdStart)+")?").join(re.escape(c) for c in list(nonAnnot)),corpus_unistr)):
if nonAnnot==diagnose: diagnose_write("%s is default by majority-case rule after checking for dangerous overlaps etc" % (withAnnot_unistr,))
run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds. TODO: does mpi4py.futures have a different 'sweet spot' here? (low priority unless we can get MPI to outdo concurrent.futures in this application)
may_take_time = canBackground and len(okStarts) > 1000
if may_take_time: sys.stderr.write("\nLarge collocation check (%s has %d matches + %s), %s.... \n" % (withAnnot_unistr.encode(terminal_charset,'replace'),len(okStarts),badInfo(badStarts,nonAnnot),cond(run_in_background,"backgrounding","could take some time")))
if run_in_background:
job = executor.submit(yarowsky_indicators_wrapped,withAnnot_unistr) # recalculate the above on the other CPU in preference to passing, as memory might not be shared
yield "backgrounded" ; yield job.result() ; return

if ybytes_max > ybytes and (not ymax_threshold or len(nonAnnot) <= ymax_threshold):

retList = [] ; append=retList.append

for nbytes in range(ybytes,ybytes_max+1,ybytes_step):
negate,ret,covered,toCover = tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr)
if covered==toCover and len(ret)==1:
if may_take_time: sys.stderr.write(" - using 1 indicator, negate=%s\n" % repr(negate))
yield (negate,ret,nbytes) ; return # a single indicator that covers everything will be better than anything else we'll find
append((-int(covered*100/toCover),len(ret),nbytes,negate,toCover,ret)) # (1st 4 of these are the sort keys: maximum coverage to nearest 1%, THEN minimum num indicators for the same coverage, THEN minimum nbytes (TODO: problems of very large nbytes might outweigh having more indicators; break if found 100% coverage by N?), THEN avoid negate)

retList.sort()
# TODO: try finding an OR-combination of indicators at *different* proximity lengths ?
if nonAnnot==diagnose: diagnose_write("Best coverage is %d%% of %d" % (-retList[0][0],retList[0][-2]))
negate,ret = retList[0][-3],retList[0][-1]

distance = retList[0][2]
else:
negate,ret = tryNBytes(ybytes_max,nonAnnot,badStarts,okStarts,withAnnot_unistr)[:2]

if ybytes < ybytes_max: distance = ybytes_max
else: distance = None # all the same anyway
if not ret and warn_yarowsky: sys.stderr.write("Couldn't find ANY Yarowsky-like indicators for %s \n" % (withAnnot_unistr.encode(terminal_charset,'replace'))) # (if nonAnnot==diagnose, this'll be reported by tryNBytes below)

# elif ybytes_max > ybytes: sys.stderr.write("Debugger: %s best coverage=%d/%d by %d indicators at nbytes=%d \n" % (withAnnot_unistr.encode(terminal_charset,'replace'),-retList[0][0],retList[0][3],retList[0][1],retList[0][2]))

# TODO: if partially but not completely covered, shouldn't entirely count the word as 'covered' in analyse()
elif ret and may_take_time: sys.stderr.write(" - using %d indicators, negate=%s\n" % (len(ret),repr(negate)))
if not ret or (not distance and not negate): yield ret
else:
if not distance: distance = ybytes_max
yield negate,ret,distance
def yarowsky_indicators_wrapped(withAnnot_unistr):
return yarowsky_indicators(withAnnot_unistr,False).next()

def getOkStarts(withAnnot_unistr):
if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr]
walen = len(withAnnot_unistr)
return set(x for x in precalc_sets[splitWords(withAnnot_unistr).next()] if corpus_unistr[m2c_map[x]:m2c_map[x]+walen]==withAnnot_unistr)
def getBadStarts(nonAnnot,okStarts): return set(x.start() for x in re.finditer(re.escape(nonAnnot),corpus_markedDown) if not x.start() in okStarts)

def getReallyBadStarts(badStarts,nonAnnot):
# Some of the badStarts can be ignored on the grounds that they should be picked up by other rules first: any where the nonAnnot match does not start at the start of a word (the rule matching the word starting earlier should get there first), and any where it starts at the start of a word that is longer than its own first word (the longest-first ordering should take care of this). So keep only the ones where it starts at the start of a word and that word is no longer than len(nonAnnot).

reallyBadStarts = [] ; append=reallyBadStarts.append
nonAnnotLen = len(mdStart+nonAnnot+mdEnd)
theRe = re.compile(re.escape(mdStart+nonAnnot[0])+".*?"+re.escape(mdEnd))
for b in badStarts:

try: s = m2c_map[b]
except KeyError: continue # it wasn't the start of a word (only start positions are in that map)
m=theRe.search(corpus_unistr, s) # will either start at s, or after it if mreverse
s,e = m.start(),m.end()
if e-s > nonAnnotLen: continue # this word is too long, should be matched by a longer rule 1st
append(b) # to reallyBadStarts
return reallyBadStarts
def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr):
# try to find either positive or negative Yarowsky-like indicators, whichever gives a smaller set. Negative indicators might be useful if there are many matches and only a few special exceptions (TODO: but put in an option to avoid checking for them as per v0.57 and below? although I'm not sure what application would have to be that careful but still use Yarowsky-like indicators)
# (Negative indicators are used only if they cover 100% of the exceptions - see below re negate==None)
def bytesAround(start): return within_Nbytes(start+len(nonAnnot),nbytes)
okStrs=list(set(bytesAround(s) for s in okStarts))
badStrs=list(set(bytesAround(s) for s in badStarts))
pOmit = unichr(1).join(badStrs) # omit anything that occurs in this string from +ve indicators
nOmit = unichr(1).join(okStrs) # ditto for -ve indicators
pCovered=[False]*len(okStrs)
nCovered=[False]*len(badStrs)
pRet = [] ; pAppend=pRet.append
nRet = [] ; nAppend=nRet.append
negate = None # not yet set
stuffToCheck = [(okStrs,pAppend,pCovered,unique_substrings(okStrs,markedUp_unichars,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))] # a generator and associated parameters for positive indicators
if len(okStrs) > len(badStrs) or not okStrs: stuffToCheck.append((badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s)))) # and for negative indicators, if it seems badStrs are in the minority (or if not okStrs, which is for test_manual_rules) (TODO: smaller minority? we'll try a string from each generator in turn, stopping if we find one that covers everything; that way we're hopefully more likely to finish early if one of the two is going to quickly give a string that matches everything, but TODO is this always so optimal in other cases? especially if there are far more negative indicators than positive ones, in which case it's unlikely to end up being a "many matches and only a few special exceptions" situation, and checking through ALL the negative indicators is a lot of work for comparatively little benefit; TODO: also have 'if len(nAppend) > SOME_THRESHOLD and len(stuffToCheck)==2: del stuffToCheck[1] # give up on negative indicators if too many' ? )
while stuffToCheck and negate==None:
for i in range(len(stuffToCheck)):
strs,append,covered,generator = stuffToCheck[i]
try: indicator = generator.next()
except StopIteration:
del stuffToCheck[i] ; break
found = True ; cChanged = False
for i in xrange(len(strs)):
if not covered[i] and indicator in strs[i]:
covered[i]=cChanged=True
if cChanged: append(indicator)
if all(covered):
if append==pAppend: negate=False
else: negate=True
break
# and if negate==None AFTER this loop, didn't get all(pCovered) OR all(nCovered), in which case we fall back to negate=False. In other words, negative indicators have to cover ALL non-occurrences to be passed, whereas positive indicators just have to cover SOME. This is in keeping with the idea of 'under-match is better than over-match' (because an under-matching negative indicator is like an over-matching positive one)
if negate: ret,covered = nRet,nCovered
else: ret,covered = pRet,pCovered

if nonAnnot==diagnose:
if ret:
if negate: indicators = "negative indicators "
else: indicators = "indicators "
if len(ret) > 30: indicators=str(len(ret))+" "+indicators # +'/'.join(ret[:30]+['...'])

else: indicators = "no indicators"
if len(pOmit) > 200: pOmit = pOmit[:200]+"..."
if all(covered): notCovered = ""
else:
if negate: strs = badStrs
else: strs = okStrs
notCovered = [strs[i] for i in xrange(len(covered)) if not covered[i]]
if len(notCovered) > 10: notCovered = notCovered[:10]+["..."]
notCovered = " (not "+'/'.join(notCovered).replace('\n',"\\n")+")"
diagnose_write("tryNBytes(%d) on %s found %s (avoiding '%s'), covers %d/%d contexts%s" % (nbytes,withAnnot_unistr,indicators,pOmit.replace(unichr(1),'/').replace('\n',"\\n"),sum(1 for x in covered if x),len(covered),notCovered))
return negate,ret,sum(1 for x in covered if x),len(covered)

def cond(a,b,c):
if a: return b
else: return c
def badInfo(badStarts,nonAnnot):
ret = "%d false positive" % len(badStarts)
if not len(badStarts)==1: ret += "s"
if len(badStarts) > yarowsky_debug: return ret
for wordStart in badStarts:
wordEnd = wordStart + len(nonAnnot)
contextStart,contextEnd=max(0,wordStart-5),wordEnd+5
# but can we report it from the original corpus_unistr?
if wordStart in m2c_map and wordEnd in m2c_map:
wordStart,wordEnd = m2c_map[wordStart],m2c_map[wordEnd]
while newCStart not in m2c_map and newCStart >= contextStart-5: newCStart-=1
while newCEnd not in m2c_map and newCEnd<contextEnd+5: newCEnd+=1
if newCStart in m2c_map: contextStart = m2c_map[newCStart]
else: contextStart = max(0,wordStart - 15) # This might cut across markup, but better that than failing to report the original corpus and making it look like the words might not have "lined up" when actually they did. Might also just cut into surrounding non-markup text (if the above loop simply couldn't find anything near enough because such text was in the way).
if newCEnd in m2c_map: contextEnd = m2c_map[newCEnd]
ret += (u" (%s%s%s%s%s)" % (toRead[contextStart:wordStart],reverse_on,toRead[wordStart:wordEnd],reverse_off,toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r").encode(terminal_charset,'replace')

return ret

def unique_substrings(texts,allowedChars,omitFunc,valueFunc):
# yield unique substrings of texts, in increasing length, with equal lengths sorted by highest score returned by valueFunc, and omitting any where omitFunc is true, or that uses any character not in allowedChars (allowedChars==None means all allowed)
if allowedChars:
# remove non-allowedChars from texts, splitting into smaller strings as necessary

texts2 = [] ; append=texts2.append

for text in texts:

start = 0
for i in xrange(len(text)):
if not text[i] in allowedChars:

if i>start: append(text[start:i])

start=i+1

if start<len(text): append(text[start:])

texts=texts2
if not texts: return
length=1 ; maxlen = max(len(t) for t in texts)
while length <= maxlen:
ret=set()
# sys.stderr.write("Finding (l=%d)... " % len(texts))
for text in texts: ret.update(text[s:s+length] for s in xrange(len(text)-length+1))
l=[(valueFunc(k),k) for k in ret if not omitFunc(k)]
# if length == ybytes_max and not l: sys.stderr.write("Debugger: omitFunc was true for all %s\n" % repr(ret))

l.sort() ; l.reverse()

# sys.stderr.write("%d of length %d\n" % (len(l),length))

for v,k in l: yield k
length += 1

def within_Nbytes(matchEndPos,nbytes):
# return the Unicode characters within nbytes of matchEndPos, assuming the encoding will be outcode. Used for the Yarowsky-like functions.

# Assumes multibyte codes are self-synchronizing, i.e. if you start in the middle of a multibyte sequence, the first valid character will be the start of the next sequence, ok for utf-8 but TODO might not be the case for some codes
return corpus_markedDown[max(0,matchEndPos-nbytes):matchEndPos].encode(outcode)[-nbytes:].decode(outcode,'ignore')+corpus_markedDown[matchEndPos:matchEndPos+nbytes].encode(outcode)[:nbytes].decode(outcode,'ignore')
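# e.g. with outcode="utf-8" and nbytes=3, a 3-byte CJK character adjacent to
# matchEndPos is kept whole, while a character that the 3-byte window would cut
# in half is silently dropped by the 'ignore' decode.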

def test_rule(withAnnot_unistr,yBytesRet,canBackground=None):

# Tests to see if the rule withAnnot_unistr is
# ALWAYS right in the examples, i.e.
# the number of occurrences of its marked-down text
# in the continuous marked-down string should be
# EXACTLY equal to the number of occurrences of the
# marked-up version.
# (If we deal only in rules that ALWAYS work, we can
# build them up incrementally without "cross-talk")
# yield "backgrounded" = task has been backgrounded; .next() collects result (nb we default to NOT canBackground, as test_rule is called from several places of which ONE can handle backgrounding)
if primitive:
yield True ; return

if ybytes:

# Doesn't have to be always right, but put the indicators in yBytesRet
ybrG = yarowsky_indicators(withAnnot_unistr,canBackground)
ybr = ybrG.next()
if ybr == "backgrounded":
yield ybr ; ybr = ybrG.next()
if ybr==True or not ybr:
yield ybr ; return
yBytesRet.append(ybr) # (negate, list of indicators, nbytes)

phrase = markDown(withAnnot_unistr)
ret = corpus_markedDown.count(phrase) == len(getOkStarts(withAnnot_unistr))

if diagnose and diagnose==phrase:
diagnose_write("occurrences(%s)==occurrences(%s) = %s" % (phrase,withAnnot_unistr,ret))

yield ret
def all_possible_rules(words,covered):

# Iterate over ALL possible rules derived from the
# word sequence (don't just "find the shortest context
# that predicts each word" because that can have
# trouble with overlaps; need to check them all and
# stop when we've got enough to reproduce the example)
# As optimisation, avoids returning rules for which
# all(covered) over that rule's range

if max_words: maxRuleLen = min(len(words),max_words)
else: maxRuleLen = len(words)
for ruleLen in range(1,maxRuleLen+1): # (sort by len)
for wStart in range(len(words)-ruleLen+1):
if not all(covered[wStart:wStart+ruleLen]):

yield words[wStart:wStart+ruleLen]
# caller join()s before adding to rules dict
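# For example, with words == [w1,w2,w3] and nothing covered yet, the candidate
# rules are yielded in the order [w1],[w2],[w3],[w1,w2],[w2,w3],[w1,w2,w3]
# (shortest first, subject to max_words).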


def checkCoverage(ruleAsWordlist,words,coveredFlags):

# Updates coveredFlags and returns True if any changes
# (if False, the new rule is redundant).

# Don't worry about ybytes - assume the Yarowsky-like

# indicators have been calculated correctly across the
# whole text so we don't need to re-check them now.
assert type(ruleAsWordlist)==type(words)==list

try: start = words.index(ruleAsWordlist[0])

except ValueError: return False

ln = len(ruleAsWordlist)

changedFlags = False

while start <= len(words)-ln:

if words[start:start+ln] == ruleAsWordlist:

if not all(coveredFlags[start:start+ln]):
coveredFlags[start:start+ln]=[True]*ln

changedFlags = True
start += ln

else:
try: start = words.index(ruleAsWordlist[0],start+1)

except ValueError: break

return changedFlags
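# For example, checkCoverage([w2],[w1,w2,w3],flags) with flags all-False sets
# flags[1]=True and returns True; a second identical call returns False because
# that occurrence is already covered, so the rule would now be redundant.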
def wspJoin(l):
if removeSpace: return "".join(l)
else: return " ".join(l)
def potentially_bad_overlap(rulesAsWordlists,newRuleAsWords):

# Allow overlaps only if rule(s) being overlapped are
# entirely included within newRule. Otherwise could
# get problems generating closures of overlaps.
# (If newRule not allowed, caller to try a longer one)
# Additionally, if allow_overlaps, allow ANY overlap as
# long as it's not found in the marked-down text.
if len(newRuleAsWords)==1 or primitive or ybytes: return False

for ruleAsWordlist in rulesAsWordlists:
if len(ruleAsWordlist)==1: continue
if not len(ruleAsWordlist)==len(newRuleAsWords) and longerStartsOrEndsWithTheShorter(ruleAsWordlist,newRuleAsWords): continue
for overlapSize in range(1,min(len(x) for x in [newRuleAsWords,ruleAsWordlist])):
if not (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] or newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize]): continue

if not allow_overlaps: return True
# Test to see if the examples "allow" this potentially-bad overlap
def overlapOK(rAW): return not markDown(wspJoin(rAW)) in corpus_markedDown

if (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] and not overlapOK(ruleAsWordlist[:-overlapSize]+newRuleAsWords)) or (newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize] and not overlapOK(newRuleAsWords[:-overlapSize]+ruleAsWordlist)): return True

def longerStartsOrEndsWithTheShorter(l1,l2):
if len(l1) > len(l2): l1,l2 = l2,l1
return l2[:len(l1)]==l1 or l2[-len(l1):]==l1

class RulesAccumulator:
def __init__(self):
self.rules = {}

self.rulesAsWordlists_By1stWord = {} # starting word -> list of possible rules (as wordlists) that might apply
self.rulesAsWordlists = list() # all rules as words (list of lists) (used if not ybytes, TODO: integrate with rulesAsWordlists_By1stWord?)

self.rejectedRules = set()

self.seenPhrases = set() # de-duplicate, might speed up
self.amend_rules = False
if rulesFile: self.load()
def save(self):
sys.stderr.write("\nPickling rules to %s... " % rulesFile)

f = openfile(rulesFile,'wb')
pickle.Pickler(f,-1).dump((self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases))
# (don't save self.rejectedRules, there might be better clues next time)

def load(self):
if not os.path.isfile(rulesFile):
sys.stderr.write("%s does not exist, starting with blank rules\n" % rulesFile)
return
sys.stderr.write("Unpickling rules from %s... " % rulesFile)
f = openfile(rulesFile,'rb')
self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases = pickle.Unpickler(f).load()
sys.stderr.write("done\n")
self.amend_rules = True
self.newRules = set()
def remove_old_rules(self,words): # for incremental runs - removes previously-discovered rules that would have been suggested by this new phrase but that no longer 'work' with the rest of the corpus due to alterations elsewhere. DOES NOT remove old rules that are not suggested by any phrase in the corpus because the phrases that suggested them have been removed or changed (TODO: might want an option for that, although fundamentally you shouldn't be relying on incremental runs if you're making a lot of changes to the corpus)

for w in set(words):
rulesAsWordlists = self.rulesAsWordlists_By1stWord.get(w,[])
i=0
while i<len(rulesAsWordlists):

if max_words and len(rulesAsWordlists[i])>max_words:
i += 1 ; continue # better leave that one alone if we're not reconsidering rules that long (e.g. running again with single_words when previous run wasn't)
rule = wspJoin(rulesAsWordlists[i])
if rule not in self.newRules and checkCoverage(rulesAsWordlists[i],words,[False]*len(words)): # rule would apply to the new phrase
yBytesRet = []
if not test_rule(rule,yBytesRet).next() or potentially_bad_overlap(self.rulesAsWordlists,rulesAsWordlists[i]): # re-test fails. In versions v0.543 and below, we just removed ALL rules that would apply to the new phrase, to see if they would be re-generated. But that caused problems because addRulesForPhrase can return early if all(covered) due to other (longer) rules and we might be removing a perfectly good short rule that's needed elsewhere. So we now re-test before removal.
if not ybytes: self.rulesAsWordlists.discard(rulesAsWordlists[i])
del rulesAsWordlists[i] ; del self.rules[rule]
continue
self.newRules.add(rule) # still current - add to newRules now to save calling test_rule again
if len(yBytesRet): self.rules[rule] = yBytesRet[0] # overriding what it was before (since we've re-done test_rule for it, which might have returned a new set of Yarowsky-like indicators for the new version of the corpus)
i += 1
def addRulesForPhrase(self,phrase,canBackground=False):
if phrase in self.seenPhrases or (diagnose_quick and diagnose):
# if diagnose and (diagnose_quick or self.amend_rules) and mdStart+diagnose+mdEnd in phrase: pass # look at it again for diagnostics. But do we accept a diagnose that spans multiple words? should be pointed out by --diagnose-quick below if uncommented
if diagnose and (diagnose_quick or self.amend_rules) and diagnose in markDown(phrase): pass # this version accepts diagnose of multiple words (and might also let some phrases through where it matches on an overlap)
else:
yield 0,0 ; return # TODO: document that this means the total 'covered' figure in the progress status is AFTER phrase de-duplication (otherwise we'd have to look up what the previous values were last time we saw it - no point doing that just for a quick statistic)

self.seenPhrases.add(phrase)

words = filter(lambda x:markDown(x).strip(),splitWords(phrase)) # filter out any that don't have base text (these will be input glitches, TODO: verify the annotation text is also just whitespace, warn if not)

covered = [False]*len(words)
# first see how much is covered by existing rules
# (don't have to worry about the order, as we've been
# careful about overlaps)
if self.amend_rules: self.remove_old_rules(words) # NB if yignore this might not remove all, but still removes all that affect checkCoverage below

for w in set(words):
for ruleAsWordlist in self.rulesAsWordlists_By1stWord.get(w,[]):
if checkCoverage(ruleAsWordlist,words,covered) and all(covered):
yield len(covered),len(covered) ; return # no new rules needed
for ruleAsWordlist in all_possible_rules(words,covered):
rule = wspJoin(ruleAsWordlist) ; yBytesRet = []
if rule in self.rejectedRules: continue
if rule in self.rules: continue # this can still happen even now all_possible_rules takes 'covered' into account, because the above checkCoverage assumes the rule won't be applied in a self-overlapping fashion, whereas all_possible_rules makes no such assumption (TODO: fix this inconsistency?)
rGen = test_rule(rule,yBytesRet,canBackground)
r = rGen.next()
if r=="backgrounded":
yield r ; r = rGen.next()
del rGen
if not r or potentially_bad_overlap(self.rulesAsWordlists,ruleAsWordlist):

self.rejectedRules.add(rule) # so we don't waste time evaluating it again (TODO: make sure rejectedRules doesn't get too big?)
continue
cc = checkCoverage(ruleAsWordlist,words,covered) # changes 'covered'
assert cc, "this call to checkCoverage should never return False now that all_possible_rules takes 'covered' into account" # and it's a generator which is always working from the CURRENT copy of 'covered'

if len(yBytesRet): self.rules[rule] = yBytesRet[0]
else: self.rules[rule] = [] # unconditional

if not ybytes: self.rulesAsWordlists.append(ruleAsWordlist)
if not ruleAsWordlist[0] in self.rulesAsWordlists_By1stWord: self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]] = []
self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]].append(ruleAsWordlist)
if self.amend_rules: self.newRules.add(rule)
if all(covered):
yield len(covered),len(covered) ; return

# If get here, failed to completely cover the phrase.

# ruleAsWordlist should be set to the whole-phrase rule.
yield sum(1 for x in covered if x),len(covered)
def rulesAndConds(self):
if self.amend_rules: return [(k,v) for k,v in self.rules.items() if not k in self.newRules] + [(k,v) for k,v in self.rules.items() if k in self.newRules] # new rules must come last for incremental runs, so they will override existing actions in byteSeq_to_action_dict when small changes have been made to the annotation of the same word (e.g. capitalisation-normalisation has been changed by the presence of new material)
else: return self.rules.items()

def handle_diagnose_limit(rule):
global diagnose,diagnose_limit
if diagnose and diagnose_limit and diagnose==markDown(rule):
diagnose_limit -= 1
if not diagnose_limit:
diagnose = False
diagnose_write("limit reached, suppressing further diagnostics")

def generate_map():
global m2c_map, precalc_sets, yPriorityDic

if checkpoint:
try:
f=open_try_bz2(checkpoint+os.sep+'map','rb')
m2c_map,precalc_sets,yPriorityDic = pickle.Unpickler(f).load()

return
except: pass
assert main, "Only main should generate corpus map"

sys.stderr.write("Generating corpus map... ")
m2c_map = {} ; precalc_sets = {}

muStart = downLenSoFar = 0
for s in re.finditer(re.escape(markupStart), corpus_unistr):
s=s.start()
downLenSoFar += len(markDown(corpus_unistr[muStart:s]))
muStart = s

m2c_map[downLenSoFar] = s
# Added optimisation: do precalc_sets as well
# (at least catch the 1-word cases)
# -> this is now needed even if not ybytes

e=corpus_unistr.find(markupEnd,s)
if e>-1:
e += len(markupEnd)
k = corpus_unistr[s:e]
if k not in precalc_sets: precalc_sets[k]=set()
precalc_sets[k].add(downLenSoFar)

yPriorityDic = {}

sys.stderr.write("yPriorityDic ... ")
for s in re.finditer(re.escape(reference_sep+ref_pri+ref_name_end), corpus_unistr):
s = s.start()+len(reference_sep+ref_pri+ref_name_end)
e = corpus_unistr.find(reference_sep,s)
if e==-1: e=len(corpus_unistr)
for w in splitWords(corpus_unistr[s:e]):
wd = markDown(w)
if diagnose==wd: diagnose_write("yPriorityDic[%s] = %s" % (wd,w))

yPriorityDic[wd] = w
sys.stderr.write("done\n")
if checkpoint: pickle.Pickler(open_try_bz2(checkpoint+os.sep+'map','wb'),-1).dump((m2c_map,precalc_sets,yPriorityDic))

checkpoint_exit()
if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus and map etc)
try:
commands.getoutput(
"ps -p " + str(os.getpid()) + " -o args") \
.index("-m mpi4py.futures") # ValueError if not found
import mpi4py.futures # mpi4py v2.1+
import mpi4py.MPI, mpi4py ; assert mpi4py.MPI.COMM_WORLD.size > 1, "mpi4py says world size is 1: likely a symptom of incorrectly-configured MPI. Did you compile mpi4py using the same setup (e.g. MPICH or OpenMPI) as you are running? mpi4py's config is: "+repr(mpi4py.get_config())
except ValueError: pass # but raise all other exceptions: if we're being run within mpi4py.futures then we want to know about MPI problems
try:
commands.getoutput(
"ps -p " + str(os.getpid()) + " -o args") \
.index("-m scoop") # ValueError if not found
import scoop.futures
return scoop.futures # submit() is at module level
except ValueError: pass
try:
import concurrent.futures # sudo pip install futures (2.7 backport of 3.2 standard library)
import multiprocessing
num_cpus = multiprocessing.cpu_count()
if num_cpus >= 2: return concurrent.futures.ProcessPoolExecutor(num_cpus-1) # leave one for the CPU-heavy control task
except: pass
def get_phrases():
# Returns a list of phrases in processing order, with length-numbers inserted in the list. Caches its result.
global _gp_cache
try: return _gp_cache
except: pass

# Due to the way we handle overlaps, it's better to process the shortest phrases first, as the longer phrases will yield more rule options and therefore more likely to be able to work around any "no-overlap" constraints imposed by already-processed examples. Something like:
p2 = []

for p in splitWords(corpus_unistr,phrases=True):
p2.append((min([len(p.split(markupStart)),len(p.split(markupMid)),len(p.split(markupEnd))]),len(p2),p)) # no need for splitWords(phrase) just to get len, but we do need the min-of-3 for robustness against the occasional markup error
p2.sort() # by length, then by original position (note: if removing this sort, remove wordsThisPhrase from status_update)
phrases = [] ; wordLen = None
for p in p2:
if not wordLen == p[0]:
wordLen = p[0]
phrases.append(wordLen-1) # because it's a .split length (really want an actual count, but it only has to be roughly right in this instance and splitLen-1 will do for speed)
phrases.append(p[-1])
_gp_cache = phrases ; return phrases
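# So the cached list looks like [1, <all 1-word phrases...>, 2, <all 2-word phrases...>, ...]
# with each integer giving the approximate word count of the phrases that follow it.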
def setup_other_globals():
global corpus_markedDown
corpus_markedDown = markDown(corpus_unistr)

if ybytes:
global markedUp_unichars
if yarowsky_all: markedUp_unichars = None
else: markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int)))
def check_globals_are_set_up(): # for use during parallelism
try: corpus_unistr # if we fork()d, we may already have it
except NameError:
normalise() # should get corpus_unistr from checkpoint,
try: corpus_unistr # unless we're NOT normalising,
except: corpus_unistr = openfile(infile).read().decode(incode) # in which case we have to load the corpus from scratch (it won't be stdin)
generate_map() # similarly this should just be a read
setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine

accum = RulesAccumulator()
covered = 0 # number of phrases we managed to 'cover' with our rules
toCover = 0 # number of phrases we TRIED to cover (==covered if 100%)
phraseNo = 0 ; wordLen = None

if checkpoint:
try: phraseNo,wordLen,covered,toCover,accum.__dict__ = read_checkpoint()

except: pass
phraseLastUpdate = phraseLastCheckpoint = phraseNo
lastUpdate = lastCheckpoint = startTime = time.time()
backgrounded = [] ; phrases = get_phrases()

while phraseNo < len(phrases):
if type(phrases[phraseNo])==int:
wordLen = phrases[phraseNo]
for b in backgrounded: # flush (TODO: duplicate code)
coveredA,toCoverA = b.next()
covered += coveredA ; toCover += toCoverA
backgrounded = []
phraseNo += 1 ; continue

if toCover:
if checkpoint and (checkpoint_exit(0) or time.time() >= lastCheckpoint + 1000): # TODO: configurable?
sys.stderr.write("Checkpointing..."+clear_eol)