import sys ; aFormat = 'ruby'
for a in sys.argv[1:]:
if a.startswith("--"): aFormat=a[2:]
if type("")==type(u""): sys.stdout.buffer.write(annotate(sys.stdin.buffer.read(),aFormat)) # Python 3
else: sys.stdout.write(annotate(sys.stdin.read(),aFormat)) # Python 2
if __name__=="__main__": main()
""" # TODO: annotation-type option from command line in py
c_zlib = br"""static unsigned char *data=NULL;
static void init() {
z_stream s; memset(&s,0,sizeof(s));
s.next_in=origData; s.avail_in=%%ZLIBLEN%%;
data=malloc(%%ORIGLEN%%); // TODO: check non-NULL
s.next_out=data; s.avail_out=%%ORIGLEN%%;
inflateInit(&s); inflate(&s, Z_NO_FLUSH); // TODO: check for memory and data-corruption errors
inflateEnd(&s);
}
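/* Note: the two %-delimited length placeholders above (the compressed and
   uncompressed sizes of the data table) are substituted when annogen
   generates this C code, and origData[] is expected to hold the
   zlib-compressed table which init() inflates into data[]. */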
"""
c_datadrive = br"""
static unsigned char *dPtr; static int addrLen;
#include <stdlib.h>
static unsigned char * readAddr() {
size_t i,addr=0;
for (i=addrLen; i; i--) addr=(addr << 8) | *dPtr++;
return data + addr;
}
static void readData() {
POSTYPE *savedPositions = NULL;
size_t numSavedPositions = 0;
while(1) {
unsigned char c = *dPtr++;
if (c & 0x80) dPtr += (c&0x7F); // short relative forward jump (up to 128 bytes from addr after instruction)
else if(c < 20) { // switchbyte with short jumps
c++; // now c == nBytes
unsigned char byte=(unsigned char)NEXTBYTE;
int i;
for (i=0; i<c; i++) if(byte==dPtr[i]) break;
if(i) dPtr += dPtr[c+i-1]; dPtr += c+c; // relative from end of switch (after all bytes, 1-byte addresses (except 1st) and the 1-byte default address)
} else switch(c) {
case 50: /* jump */ dPtr = readAddr(); break;
case 51: /* call */ {
unsigned char *funcToCall=readAddr();
unsigned char *retAddr = dPtr;
dPtr = funcToCall; readData(); dPtr = retAddr;
break; }
case 52: /* return */
if (savedPositions) free(savedPositions);
return;
case 60: /* switchbyte */ {
int nBytes=(*dPtr++)+1, i;
unsigned char byte=(unsigned char)NEXTBYTE;
for (i=0; i<nBytes; i++) if(byte==dPtr[i]) break;
dPtr += (nBytes + i * addrLen);
dPtr = readAddr(); break; }
case 70: s0(); break;
case 71: case 74: /* copyBytes */ {
int numBytes=*dPtr++;
for(;numBytes;numBytes--)
OutWriteByte(NEXT_COPY_BYTE);
if(c==74) return; else break; }
case 72: case 75: /* o */ {
int numBytes=*dPtr++;
char *annot = (char*)readAddr();
o(numBytes,annot); if(c==75) return; else break; }
case 73: case 76: /* o2 */ {
int numBytes=*dPtr++;
char *annot = (char*)readAddr();
char *title = (char*)readAddr();
o2(numBytes,annot,title);
if(c==76) return; else break; }
case 80: /* savepos */
savedPositions=realloc(savedPositions,++numSavedPositions*sizeof(POSTYPE)); // TODO: check non-NULL?
savedPositions[numSavedPositions-1]=THEPOS;
break;
case 81: /* restorepos */
SETPOS(savedPositions[--numSavedPositions]);
break;
case 90: /* neartest */ {
unsigned char *truePtr = readAddr();
unsigned char *falsePtr = readAddr();
setnear(*dPtr++); int found=0;
while(dPtr < truePtr && dPtr < falsePtr) if(near((char*)readAddr())) { found = 1; break; }
dPtr = found ? truePtr : falsePtr; break; }
// default: TODO: error about corrupt data?
}
}
}
static void topLevelMatch() {
addrLen = data[0];
dPtr=data+1; readData();
}
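/* Bytecode summary (see the switch in readData above; addresses are
   big-endian, addrLen bytes each):
     0x80|n : skip forward n bytes (n = 0..127)
     0..19  : compact switchbyte with 1-byte relative jump table
     50 jump   51 call   52 return   60 switchbyte (full addresses)
     70 s0()   71/74 copyBytes (74 then returns)
     72/75 o() (75 then returns)   73/76 o2() (76 then returns)
     80 savepos   81 restorepos   90 neartest */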
"""
def splitWords(text,phrases=False):
# split text into 'words', where a word is one markupStart..markupEnd unit (anything between markupStart and markupEnd never causes a split)
# if phrases = True, instead of words, yield phrases: maximal runs of such units separated only by whitespace (i.e. split on any non-whitespace char outside markupStart..markupEnd)
warnPhrases = phrases
if phrases: it=re.finditer(phrasePattern,text)
else: it=re.finditer(wordPattern,text)
for i in it:
y = i.group()
if len(y) > 1000000 and warnPhrases:
sys.stderr.write("WARNING: Your corpus needs more phrase delimiters!\nVery long phrases can take a LONG time to process.\n")
warnPhrases = False
yield y
markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),flags=re.DOTALL)
wordPattern = re.escape(markupStart)+'.*?'+re.escape(markupEnd)
phrasePattern = re.compile(wordPattern+r'(\s*'+wordPattern+r')*',flags=re.DOTALL+re.UNICODE)
wordPattern = re.compile(wordPattern,flags=re.DOTALL)
wspPattern = re.compile(r"\s+",flags=re.UNICODE)
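# Illustration of the word/phrase split (assumed delimiters for brevity:
# markupStart="{", markupMid="|", markupEnd="}"; the real ones are set elsewhere):
#   text = "{AB|ab} {C|c}, {D|d}"
#   splitWords(text)              -> "{AB|ab}", "{C|c}", "{D|d}"
#   splitWords(text,phrases=True) -> "{AB|ab} {C|c}", "{D|d}"
# (the comma is a non-whitespace char outside the markup, so it ends a phrase)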
def annotationOnly(text):
ret = []
for w in re.finditer(markupPattern,text):
if mreverse: ret.append(w.group(1))
else: ret.append(w.group(2))
return ' '.join(ret)
def markDown(text):
# Return just the original text, without markup
if mreverse: group=r"\2"
else: group=r"\1"
return re.sub(markupPattern,group,text)
def markUp(text,annotation):
if mreverse: text,annotation = annotation,text
return markupStart + text + markupMid + annotation + markupEnd
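# Illustration with the same assumed delimiters ("{","|","}") and mreverse=False:
#   markUp("AB","ab")               -> "{AB|ab}"
#   markDown("{AB|ab} {C|c}")       -> "AB C"
#   annotationOnly("{AB|ab} {C|c}") -> "ab c"
# (if mreverse is set, the base text and the annotation swap places inside the markup)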
def checkpoint_exit(doIt=1):
if not checkpoint: return
try: open(checkpoint+os.sep+"ExitASAP")
except: return
if doIt:
assert main, "Only annogen's main module should call checkpoint_exit with doIt=1"
os.remove(checkpoint+os.sep+"ExitASAP")
sys.stderr.write("\nExitASAP found: exit\n")
raise SystemExit
else: return True
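# (So a long-running analysis can be asked to stop at the next safe point by
# creating an empty file called ExitASAP inside the checkpoint directory,
# e.g. from a shell:  touch /path/to/checkpoint-dir/ExitASAP  - path illustrative)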
try: import cPickle as pickle
except:
try: import pickle
except: pickle = None
def read_checkpoint():
t = pickle.Unpickler(open(checkpoint+os.sep+'checkpoint','rb')).load()
sys.stderr.write("Checkpoint loaded from %d phrases\n" % t[0])
return t
def write_checkpoint(t):
pickle.Pickler(open(checkpoint+os.sep+'checkpoint-NEW','wb'),-1).dump(t) # better write to checkpoint-NEW, in case we reboot or have an OS-level "Out of memory" condition *while* checkpointing
try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
except OSError: # OS can't do it atomically?
rm_f(checkpoint+os.sep+'checkpoint')
try: os.rename(checkpoint+os.sep+'checkpoint-NEW',checkpoint+os.sep+'checkpoint')
except OSError: pass
checkpoint_exit()
def status_update(phraseNo,numPhrases,wordsThisPhrase,nRules,phraseLastUpdate,lastUpdate,phraseLastCheckpoint,lastCheckpoint,coverP,nRej,startTime):
phraseSec = (phraseNo-phraseLastUpdate)*1.0/(time.time()-lastUpdate)
if phraseSec < 100:
phraseSecS = "%.1f" % phraseSec
else: phraseSecS = "%d" % int(phraseSec)
progress = status_prefix + "%s phrase/sec (%d%%/#w=%d) rules=%d cover=%d%%" % (phraseSecS,int(100.0*phraseNo/numPhrases),wordsThisPhrase,nRules,coverP)
if warn_yarowsky: progress += (" rej=%d" % nRej)
if time_estimate:
if phraseNo-phraseLastCheckpoint < 10: phraseMin = phraseSec*60 # current 'instantaneous' speed
else: phraseMin = (phraseNo-phraseLastCheckpoint)*60/(time.time()-lastCheckpoint) # longer-term average
minsLeft = (numPhrases-phraseNo)/phraseMin
if minsLeft>60*24: progress += " %dd+" % int(minsLeft/60/24)
elif minsLeft>60: progress += " %dh+" % int(minsLeft/60)
elif minsLeft: progress += " %dmin+" % minsLeft
# (including the + because this is liable to be an underestimate; see comment after the --time-estimate option)
if len(progress) + 14 < screenWidth:
progress += " (at %02d:%02d:%02d" % time.localtime()[3:6] # clock time: might be useful for checking if it seems stuck
if len(progress) + 20 < screenWidth and not clear_eol == " \r": # (being able to fit this in can be intermittent)
elapsed = time.time() - startTime
progress += ", analyse=%d:%02d:%02d" % (elapsed/3600,(elapsed%3600)/60,elapsed%60)
progress += ")"
sys.stderr.write(progress+clear_eol)
sys.stderr.flush()
def normalise():
if normalised_file == infile: return
global capitalisation # might want to temp change it
if (capitalisation or priority_list) and annot_whitespace: return
global corpus_unistr
if checkpoint:
try:
f=open_try_bz2(checkpoint+os.sep+'normalised','r')
corpus_unistr = f.read().decode('utf-8')
sys.stderr.write("Normalised copy loaded\n")
return True # loaded from checkpoint
except: # if re-generating 'normalised', will also need to regenerate 'map' and 'checkpoint' if present
assert main, "normalise checkpoint not readable in non-main module"
rm_f(checkpoint+os.sep+'map.bz2') ; rm_f(checkpoint+os.sep+'map')
rm_f(checkpoint+os.sep+'checkpoint')
else: assert main, "normalise called in non-main module and checkpoint isn't even set"
sys.stderr.write("Normalising...");sys.stderr.flush()
old_caps = capitalisation
if priority_list: capitalisation = True # no point keeping it at False
allWords = getAllWords()
if removeSpace:
corpus_unistr = re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),(markupEnd+markupStart).replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE) # so getOkStarts works consistently if corpus has some space-separated and some not
corpus_unistr = re.sub(re.escape(markupStart)+r'\s+',markupStart.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupMid),markupMid.replace('\\',r'\\'),re.sub(re.escape(markupMid)+r'\s+',markupMid.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupEnd),markupEnd.replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE) # so we're tolerant of spurious whitespace between delimiters and markup (TODO: do this even if not removeSpace?)
if not annot_whitespace:
# normalise trailing hyphens e.g. from OCR'd scans:
cu0 = corpus_unistr ; ff = 0
for hTry in [1,2]:
for w in allWords.keys():

if '-'+aoEnd in w:
idx = w.index('-'+aoEnd)
if w[:idx].endswith(aoStart) or w[:idx].endswith("-"): continue # ignore this one (a mess of some kind)
if hTry==2: # ouch, this doesn't look good
getBuf(sys.stderr).write((u" (can't normalise hyphens due to '%s') " % w).encode(terminal_charset,'replace')) ; sys.stderr.flush()
corpus_unistr = cu0 ; break
if mreverse: grp,mdG=r"-\1",r"\2"
else: grp,mdG=r"-\2",r"\1"
# TODO: batch up the following replacements by using something similar to Replacer but with a common destination regexp that takes groups from the 'w' entries as well. (Low priority because don't typically get TOO many of these dangling hyphens in most corpuses.)
corpus_unistr = re.sub(re.escape(w)+r"\s*"+re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),w.replace('\\',r'\\').replace('-'+aoEnd.replace('\\',r'\\'),grp+aoEnd.replace('\\',r'\\')).replace(mdEnd.replace('\\',r'\\'),mdG+mdEnd.replace('\\',r'\\')),corpus_unistr,flags=re.DOTALL+re.UNICODE)
ff = 1
if ff: allWords = getAllWords() # re-generate
del cu0
sys.stderr.write(":") ; sys.stderr.flush()
KeywordProcessor = None
# try: from flashtext.keyword import KeywordProcessor # pip install flashtext
# except: pass
# # commenting out the above because the fallback code below was ~20% faster than flashtext in my test (perhaps their benchmark figures rely on a non-empty non_word_boundaries set)
if not KeywordProcessor:
class KeywordProcessor:
def __init__(self,*a,**k): self.dic = {}
def __len__(self): return len(self.dic)
def add_keyword(self,x,y):
if diagnose and diagnose in x: diagnose_write("Replacer.add(%s,%s)" % (x,y))
self.dic[x] = y
if not (len(self.dic)%1500):
sys.stderr.write('.') ; sys.stderr.flush()
def replace_keywords(self,unistr):
if not self.dic: return unistr # nothing to replace
for exp in orRegexes(re.escape(k) for k in iterkeys(self.dic)):
sys.stderr.write(';') ; sys.stderr.flush()
unistr = re.sub(exp,lambda k:self.dic[k.group(0)],unistr)
return unistr
rpl = KeywordProcessor(case_sensitive=True)
rpl.non_word_boundaries = set()
cu_nosp = [None]
def normWord(w):
if '-' in w: hTry=set([w.replace('-','')]) # if not annot_whitespace, we'll replace any non-hyphenated 'run together' version by the version with the hyphen; that's often the sensible thing to do with pinyin etc (TODO more customisation??)
else: hTry=None
if not capitalisation:
wl = w.lower() # (as long as it's all Unicode strings, .lower() and .upper() work with accents etc)
if not w==wl and wl in allWords:
# This word is NOT always capitalised.
# Could be 'caps if at start of sentence'
# (or title-case etc), but might also be
# a corpus error, so check numbers.
if allWords[wl]*5 < allWords[w] and allWords[wl] <= normalise_debug: # not really "Yarowsky" here in the normaliser, but similar principle
typo_report("normalise-debug.txt","allow-caps-exceptions.txt",wl,(u"%s (%d instances) overrides %s (%d instances)" % (wl,allWords[wl],w,allWords[w])))

# To simplify rules, make it always lower.
w = wl
if hTry: hTry.add(w.replace('-',''))
if annot_whitespace or (keep_whitespace and markDown(w) in keep_whitespace): return w,None
if not re.search(wspPattern,w): return w,hTry
nowsp = re.sub(wspPattern,"",w)
if not capitalisation and not nowsp.lower()==nowsp and nowsp.lower() in allWords: nowsp = nowsp.lower()
if nowsp in allWords: return nowsp,hTry # varying whitespace in the annotation of a SINGLE word: probably simplest if we say the version without whitespace, if it exists, is 'canonical' (there might be more than one with-whitespace variant), at least until we can set the relative authority of the reference (TODO)
ao,md = annotationOnly(w),markDown(w)
aoS = ao.split()
if len(md.split())==1 and len(md) <= 5 and len(aoS) <= len(md): # TODO: 5 configurable? don't want different_ways_of_splitting to take too long
# if not too many chars, try different ways of
# assigning each word to chars, and see if any
# of these exist in the corpus; if any does,
# assume we have "ABC|a bc" <= "A|a BC|bc" type
# situations - the latter shouldn't necessarily be
# converted into the former, but the former might
# be convertible into the latter to simplify rules
if cu_nosp[0] == None:
cu_nosp[0] = re.sub(wspPattern,"",corpus_unistr)
if not capitalisation: cu_nosp[0] = cu_nosp[0].lower() # ignore capitalisation when searching for this
if capitalisation: aoS2 = aoS
else: aoS2 = [w0.lower() for w0 in aoS]
for charBunches in different_ways_of_splitting(md,len(aoS)):
mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS2)]
multiword = "".join(mw)
if multiword in cu_nosp[0]:
# we're about to return a split version of the words, but we now have to pretend it went through the initial capitalisation logic that way (otherwise could get unnecessarily large collocation checks)
if not capitalisation:
mw = [markUp(c,w0) for c,w0 in zip(charBunches,aoS)] # the original capitalisation. for selective .lower()
for i in range(len(mw)):
w0 = mw[i]
wl = w0.lower()
if not w0==wl and wl in allWords:
mw[i] = wl
return "".join(mw),hTry
# TODO: is there ANY time where we want multiword to take priority over the nowsp (no-whitespace) version above? or even REPLACE multiword occurrences in the corpus with the 1-word nowsp version?? (must be VERY CAREFUL doing that)
# TODO: anything else?
return w,hTry
for w in allWords.keys():

w2,hTry = normWord(w)
if hTry:
hTry.add(w2.replace('-','')) # in case not already there
for h in hTry:
if h in allWords: rpl.add_keyword(h,w2)
if not w==w2: rpl.add_keyword(w,w2)
if len(rpl):
corpus_unistr = rpl.replace_keywords(corpus_unistr)
del rpl
sys.stderr.write(" done\n")
if normalised_file: openfile(normalised_file,'w').write(corpus_unistr.encode(incode))
if checkpoint and capitalisation==old_caps: open_try_bz2(checkpoint+os.sep+'normalised','w').write(corpus_unistr.encode('utf-8'))
capitalisation = old_caps
checkpoint_exit()
def getAllWords():
allWords = {}

for phrase in splitWords(corpus_unistr,phrases=True):
for w in splitWords(phrase): allWords[w]=allWords.setdefault(w,0)+1

return allWords # do NOT cache (is called either side of the normaliser)
def orRegexes(escaped_keys):
escaped_keys = list(escaped_keys) # don't just iterate
try: yield re.compile('|'.join(escaped_keys))
except OverflowError: # regex too big (e.g. default Python on Mac OS 10.7 i.e. Python 2.7.1 (r271:86832, Jul 31 2011, 19:30:53); probably some Windows versions also; does not affect Mac HomeBrew's Python 2.7.12)
ek = escaped_keys[:int(len(escaped_keys)/2)]
for r in orRegexes(ek): yield r
ek = escaped_keys[len(ek):]
for r in orRegexes(ek): yield r
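# Illustrative behaviour: orRegexes(["a","b","c","d"]) normally yields one
# compiled pattern a|b|c|d, but if that alternation is too big to compile
# (OverflowError) the key list is halved recursively, yielding two or more
# smaller compiled patterns which the caller applies in turn (as
# KeywordProcessor.replace_keywords does above).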
def PairPriorities(markedDown_Phrases,existingFreqs={}):
markedDown_Phrases = list(markedDown_Phrases)
assert all(type(p)==list for p in markedDown_Phrases)
mdwSet = set(existingFreqs.keys())
for p in markedDown_Phrases: mdwSet.update(p)
assert all(type(w)==unicode for w in mdwSet)
votes = {} ; lastT = time.time()
for pi in xrange(len(markedDown_Phrases)):
if time.time() > lastT+2:
sys.stderr.write("PairPriorities: %d%%%s" % (pi*100/len(markedDown_Phrases),clear_eol)) ; sys.stderr.flush()
lastT = time.time()
p=markedDown_Phrases[pi]
for x in xrange(len(p)-1):
a,b = p[x:x+2]
combined = a+b
for i in xrange(1,len(combined)):
if i==len(a): continue
elif i<len(a): prefer,over = a,combined[i:]
else: prefer,over = b,combined[:i]
if not (prefer in mdwSet and over in mdwSet and not prefer==over): continue
k = tuple(sorted([prefer,over]))
if k[0]==prefer: direction = 1
else: direction = -1
votes[k]=votes.get(k,0)+direction
if diagnose in k: diagnose_write("Prefer %s over %s: %d vote from %s | %s" % (k+(direction,a,b)))
sys.stderr.write("PairPriorities: done\n")
del markedDown_Phrases
global closure,gtThan,lessThan
closure,gtThan, lessThan = set(),{},{}
def addToClosure(a,b):
# If a>b, then a>c for any b>c (c<b)
# (actually b>=c but we don't have equality),
# and c>b for any c>a.
candidate = set([(a,b)]+[(a,c) for c in lessThan.get(b,[])]+[(c,b) for c in gtThan.get(a,[])])
if closure==None: # no longer tracking closure
if any(y in gtThan.get(x,{}) for (x,y) in candidate): return # contradiction
else:
if any((y,x) in closure for (x,y) in candidate):
return # contradiction, use higher abs votes
closure.update(candidate)
for x,y in candidate:
# x>y y<x, so y should be in lessThan[x]
lessThan.setdefault(x,set()).add(y)
gtThan.setdefault(y,set()).add(x)
return True
for _,v,a,b in reversed(sorted([(abs(v),v,a,b) for (a,b),v in votes.items()])):
if v < 0: a,b = b,a
r=addToClosure(a,b)
if diagnose in (a,b):
if r==None: r = False
diagnose_write("addToClosure(%s,%s) [v=%d] returned %s" % (a,b,abs(v),repr(r)))
trueClosure,closure = closure,None
lastW,lastF,lastPriorW = set(),None,set()
for _,w in reversed(sorted((f,w) for w,f in existingFreqs.items())): # highest frequency first
if lastW and existingFreqs[w] < lastF:
lastPriorW,lastW = lastW,set()
lastF = existingFreqs[w]
for W in lastPriorW: addToClosure(W,w)
lastW.add(w)
sys.stderr.write("%d words\n" % len(mdwSet))
# Kahn 1962 - topological sort:
no_incoming = set(w for w in mdwSet if not w in lessThan)
del mdwSet ; mdwList = []
while no_incoming:
n = no_incoming.pop()
mdwList.append(n)
if not n in gtThan: continue
for m in gtThan[n]:
lessThan[m].remove(n)
if not lessThan[m]:
del lessThan[m] ; no_incoming.add(m)
del gtThan[n]
assert not lessThan and not gtThan, "graph has cycle(s), (%d,%d) edges remain" % (len(lessThan),len(gtThan))
tcA = set(w for w,_ in trueClosure)
if diagnose: diagnose_write("%s in tcA %s" % (diagnose,diagnose in tcA))
r = [] ; _cmpT,_cmpW=time.time(),False
for w in mdwList: # lower priorities first
if time.time() > _cmpT + 2:
sys.stderr.write("Finalising: %d/%d%s" % (len(r),len(mdwList),clear_eol)) ; sys.stderr.flush()
_cmpT=time.time()
_cmpW=True
if w in tcA:
if w==diagnose:
f0 = existingFreqs.get(w,0)
found = False
for i in xrange(len(r)):
W,f = r[i]
if (w,W) in trueClosure:
found = True
if 1+f > f0:
diagnose_write("Increasing f(%s) from %d to %d to outweigh %s (f=%d)" % (w,f0,1+f,W,f))
f0 = 1+f
else: diagnose_write("f(%s)=%d already outweighs %d for %s" % (w,f0,f,W))
elif (W,w) in trueClosure:
found = True
diagnose_write("Problem? %s (f=%d) before %s (f=%d)" % (W,f,w,f0))
if not found: diagnose_write("No interactions with %s found among %d lower-priority words" % (w,len(r)))
l = [f0-1]
else: l = [r[i][1] for i in xrange(len(r)) if (w,r[i][0]) in trueClosure]
else: l = []
r.append((w,1+max([existingFreqs.get(w,0)-1]+l)))
if _cmpW: sys.stderr.write("Finalising: done%s\n" % clear_eol)
return sorted(r)
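# PairPriorities outline: (1) for each adjacent word pair in the corpus, vote on
# which of two overlapping dictionary entries should win at each possible split
# point; (2) process votes in decreasing order of absolute value, building a
# transitive closure of "prefer a over b" relations and discarding any vote that
# contradicts an earlier (stronger) one; (3) topologically sort the words (Kahn
# 1962) so lower-priority words come first; (4) walk that order, giving each word
# a frequency just high enough to outweigh every word it must beat, starting from
# existingFreqs.  Returns a sorted list of (word, frequency) pairs.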
def skipToNext(thing): return "(?:(?!"+re.escape(thing)+").)*"+re.escape(thing) # not ".*?"+re.escape(thing) as it may absorb one 'thing' to match the rest of the regex later
if mreverse: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupMid,markupEnd,re.escape(markupEnd)+r'\s*'+re.escape(markupStart)+skipToNext(markupMid),markupStart,markupMid
else: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupStart,markupMid,re.escape(markupMid)+skipToNext(markupEnd)+r'\s*'+re.escape(markupStart),markupMid,markupEnd
mdSplitR="(?:"+mdSplitR+")?" # so can use it in .join(chars) to say "maybe word-break between these chars"
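# Why skipToNext instead of ".*?"+re.escape(thing): when a later part of the
# pattern fails to match, ".*?" may backtrack past one occurrence of 'thing' and
# absorb it, matching a later occurrence instead; "(?:(?!thing).)*thing" can
# never match beyond the first occurrence.  mdSplitR therefore matches (at most
# once) the markup that can legitimately sit between two base-text characters -
# the rest of the current word's markup, optional whitespace, and the start of
# the next word's - so joining characters with it tolerates an optional word break.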
def different_ways_of_splitting(chars,numWords):
if numWords > len(chars): return
elif numWords == len(chars):
yield list(chars) ; return
elif numWords == 1:
yield [chars] ; return
spAt_try1 = int(len(chars) / numWords) + 1
for spAt in list(range(spAt_try1,0,-1)) + list(range(spAt_try1+1, len(chars)-numWords+1)):
for r in different_ways_of_splitting(chars[spAt:],numWords-1): yield [chars[:spAt]]+r
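# Illustrative call (the order comes from trying the spAt_try1 split first):
#   list(different_ways_of_splitting("abcd",2)) == [["abc","d"],["ab","cd"],["a","bcd"]]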
if type(u"")==type(""): # Python 3
getNext = lambda gen: gen.__next__()
iterkeys = lambda d: d.keys()
itervalues = lambda d: d.values()
iteritems = lambda d: d.items()
else: # Python 2
getNext = lambda gen: gen.next()
iterkeys = lambda d: d.iterkeys()
itervalues = lambda d: d.itervalues()
iteritems = lambda d: d.iteritems()
def yarowsky_indicators(withAnnot_unistr,canBackground):
# yields True if rule always works (or in majority of cases with ymajority), or lists enough indicators to cover example instances and yields (negate, list, nbytes), or just list if empty.
# (If too few indicators can be found, will list the ones it can, or empty if no clearly-distinguishable indicators can be found within ybytes of end of match.)
# yield "backgrounded" = task has been backgrounded; getNext collects result
nonAnnot=markDown(withAnnot_unistr)
def unconditional_looks_ok(explain):
# could we have this as an unconditional rule, with the other cases as exceptions that will be found first? (NB this is not the same thing as a 'default-yes rule with exceptions', this is a rule with NO qualifying indicators either way)
if len(nonAnnot)==1:
if nonAnnot==diagnose: diagnose_write("%s is default by %s len=1 rule after removing irrelevant badStarts" % (withAnnot_unistr,explain))
return True # should be safe, and should cover most "common short Chinese word with thousands of contexts" cases
# If len 2 or more, it's risky because the correct solution could be to process just a fraction of the word now and the rest will become the start of a longer word, so we probably don't want it matching the whole lot by default: we'll want positive or negative indicators instead.
# e.g. looking at rule AB, text ABC and correct segmentation is A BC, don't want it to 'greedily' match AB by default without positive indicators it should do so
# Check for no "A BC" situations, i.e. can't find any possible SEQUENCE of rules that STARTS with ALL the characters in nonAnnot and that involves having them SPLIT across multiple words:
# (The below might under-match if there's the appearance of a split rule but it actually has extra non-marked-up text in between, but it shouldn't over-match.)
# TODO: if we can find the actual "A BC" sequences (instead of simply checking for their possibility as here), and if we can guarantee to make 'phrase'-length rules for all of them, then AB can still be the default. This might be useful if okStarts is very much greater than badStarts.
# (TODO: until the above is implemented, consider recommending --ymax-threshold=0, because, now that Yarowsky-like collocations can be negative, the 'following word' could just go in as a collocation with low ybytes)
# TODO: also, if the exceptions to rule AB are always of the form "Z A B", and we can guarantee to generate a phrase rule for "Z A B", then AB can still be default. (We should already catch this when the exceptions are "ZA B", but not when they are "Z A B", and --ymax-threshold=0 probably won't always help here, especially if Z==B; Mandarin "mei2you3" / "you3 mei2 you3" comes to mind)
llen = len(mdStart)+len(nonAnnot)
regex=re.compile(re.escape(mdStart) + mdSplitR.join(re.escape(c) for c in list(nonAnnot)))
if all(x.end()-x.start()==llen for x in re.finditer(regex,corpus_unistr)):
if nonAnnot==diagnose: diagnose_write("%s is default by %s rule after checking for dangerous overlaps etc" % (withAnnot_unistr,explain))
return True
if nonAnnot==diagnose: diagnose_write("%s cannot be default by %s due to %s" % (withAnnot_unistr,explain,', '.join(list(set(["'"+x.group()+"'" for x in re.finditer(regex,corpus_unistr) if not x.end()-x.start()==llen]))[:5])))
if nonAnnot in yPriorityDic: # TODO: enforce len==1 ?
if yPriorityDic[nonAnnot] == withAnnot_unistr:
# we want this case to be the default
if len(withAnnot_unistr)==1:
if nonAnnot==diagnose: diagnose_write("ref-pri forces %s" % (withAnnot_unistr,))
yield True ; return
else:
if nonAnnot==diagnose: diagnose_write("ref-pri wants %s by default: finding negative indicators only" % (withAnnot_unistr,))
can_be_default = "must"
# might not even need to get okStarts, etc
if unconditional_looks_ok("ref-pri"):
yield True ; return
else:
if nonAnnot==diagnose: diagnose_write("ref-pri forbids default %s" % (withAnnot_unistr,))
can_be_default = False # another is default, don't make this one default even if it occurs more
else: can_be_default = True
# First, find positions in corpus_markedDown which match withAnnot_unistr in corpus_unistr
okStarts = getOkStarts(withAnnot_unistr)
# now check for corpus_markedDown matches that *don't* have withAnnot_unistr
badStarts = getBadStarts(nonAnnot,okStarts)
if not badStarts:
if nonAnnot==diagnose: diagnose_write("%s has no badStarts" % (withAnnot_unistr,))
yield True ; return # rule always works, no Yarowsky-like indicators needed
if can_be_default and len(okStarts) > len(badStarts) and len(nonAnnot)==1:
if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule" % (withAnnot_unistr,))
yield True ; return # duplicate of code below (can test for this case early before reducing-down badStarts)
badStarts = getReallyBadStarts(badStarts,nonAnnot) # see its comments (ignore some badStarts)
if not badStarts:
if nonAnnot==diagnose: diagnose_write("%s has only probably-irrelevant badStarts" % (withAnnot_unistr,))
yield True ; return
# Now, if it's right more often than not:
if can_be_default==True and len(okStarts) > len(badStarts) and unconditional_looks_ok("majority-case"): # (if can_be_default=="must", we have already checked for unconditional_looks_ok() above before computing okStarts and badStarts)
yield True ; return
run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds. TODO: does mpi4py.futures have a different 'sweet spot' here? (low priority unless we can get MPI to outdo concurrent.futures in this application)
may_take_time = canBackground and len(okStarts) > 1000
if may_take_time:
getBuf(sys.stderr).write((u"\nLarge collocation check (%s has %d matches + %s), %s.... \n" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot),cond(run_in_background,"backgrounding","could take some time"))).encode(terminal_charset,'replace'))
if len(badStarts) <= yarowsky_debug: typo_report("yarowsky-debug.txt","allow-exceptions.txt",withAnnot_unistr,(u"%s has %d matches + %s" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot,False))))

if run_in_background:
job = executor.submit(yarowsky_indicators_wrapped,withAnnot_unistr) # recalculate the above on the other CPU in preference to passing, as memory might not be shared
yield "backgrounded" ; yield job.result() ; return
if ybytes_max > ybytes and (not ymax_threshold or len(nonAnnot) <= ymax_threshold):
retList = [] ; append=retList.append
for nbytes in range(ybytes,ybytes_max+1,ybytes_step):
negate,ret,covered,toCover = tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must",nbytes==ybytes_max)
if covered==toCover and len(ret)==1:
if may_take_time: sys.stderr.write(" - using 1 indicator, negate=%s\n" % repr(negate))
yield (negate,ret,nbytes) ; return # a single indicator that covers everything will be better than anything else we'll find
append((-int(covered*100/toCover),len(ret),nbytes,negate,toCover,ret)) # (1st 4 of these are the sort keys: maximum coverage to nearest 1%, THEN minimum num indicators for the same coverage, THEN minimum nbytes (TODO: problems of very large nbytes might outweigh having more indicators; break if found 100% coverage by N?), THEN avoid negate)
# TODO: try finding an OR-combination of indicators at *different* proximity lengths ?
retList.sort()
if nonAnnot==diagnose: diagnose_write("Best coverage is %d%% of %d" % (-retList[0][0],retList[0][-2]))
negate,ret = retList[0][-3],retList[0][-1]
distance = retList[0][2]
else:
negate,ret = tryNBytes(ybytes_max,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must")[:2]
if ybytes < ybytes_max: distance = ybytes_max
else: distance = None # all the same anyway
if not ret and warn_yarowsky: getBuf(sys.stderr).write((u"Couldn't find ANY Yarowsky-like indicators for %s \n" % withAnnot_unistr).encode(terminal_charset,'replace')) # (if nonAnnot==diagnose, this'll be reported by tryNBytes below)
# TODO: if partially but not completely covered, shouldn't entirely count the word as 'covered' in analyse()
elif ret and may_take_time: sys.stderr.write(" - using %d indicators, negate=%s\n" % (len(ret),repr(negate)))
if not ret or (not distance and not negate):
yield ret
else:
if not distance: distance = ybytes_max
yield negate,ret,distance
typo_data = {}
def typo_report(debugFile,exceptionFile,withAnnot_unistr,msg_unistr):
if not exceptionFile in typo_data:
try: typo_data[exceptionFile]=set(splitWords(openfile(exceptionFile).read().decode(terminal_charset)))
except IOError: typo_data[exceptionFile]=set()
if withAnnot_unistr not in typo_data[exceptionFile]:
if not debugFile in typo_data:
typo_data[debugFile] = openfile(debugFile,'w')
getBuf(sys.stderr).write(bold_on+"Writing to "+debugFile+bold_off+"\n")
typo_data[debugFile].write(("Put any of the following first-of-line words into %s to avoid being alerted here next time.\n\n" % exceptionFile).encode(terminal_charset,'replace'))
typo_data[debugFile].write((msg_unistr+u"\n").encode(terminal_charset,'replace'))
typo_data[debugFile].flush() # in case interrupted

def yarowsky_indicators_wrapped(withAnnot_unistr):
check_globals_are_set_up()
return getNext(yarowsky_indicators(withAnnot_unistr,False))
def getOkStarts(withAnnot_unistr):
if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr]
walen = len(withAnnot_unistr)
return set(x for x in precalc_sets[getNext(splitWords(withAnnot_unistr))] if corpus_unistr[m2c_map[x]:m2c_map[x]+walen]==withAnnot_unistr)
def getBadStarts(nonAnnot,okStarts):
r = [] ; append=r.append
l=len(nonAnnot)
k = nonAnnot[:2]
if k in bigramCache:
for i in bigramCache[k]:
if not i in okStarts and corpus_markedDown[i:i+l]==nonAnnot: append(i)
return r
find = corpus_markedDown.find
i = find(nonAnnot)
while i != -1:
if not i in okStarts: append(i)
i = find(nonAnnot,i+l)
return r
def getReallyBadStarts(badStarts,nonAnnot):
# Some of the badStarts can be ignored on the grounds that they should be picked up by other rules first: any where the nonAnnot match does not start at the start of a word (the rule matching the word starting earlier should get there first), and any where it starts at the start of a word that is longer than its own first word (the longest-first ordering should take care of this). So keep only the ones where it starts at the start of a word and that word is no longer than len(nonAnnot).
reallyBadStarts = [] ; append=reallyBadStarts.append
nonAnnotLen = len(mdStart+nonAnnot+mdEnd)
theRe = re.compile(re.escape(mdStart+nonAnnot[0])+".*?"+re.escape(mdEnd),flags=re.DOTALL)
for b in badStarts:
try: s = m2c_map[b]
except KeyError: continue # it wasn't the start of a word (only start positions are in that map)
m=theRe.search(corpus_unistr, s) # will either start at s, or after it if mreverse
assert m, "m2c_map error? "+repr(nonAnnot[0])+" "+repr(b)+"->"+repr(s)+" not found ( "+repr(corpus_markedDown[b:b+25])+"... -> "+repr(corpus_unistr[s:s+50])+"...)"
s,e = m.start(),m.end()
if e-s > nonAnnotLen: continue # this word is too long, should be matched by a longer rule 1st
append(b) # to reallyBadStarts
return reallyBadStarts
def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,force_negate,try_harder=True):
# try to find either positive or negative Yarowsky-like indicators, whichever gives a smaller set (or only negative ones if force_negate, used by ref_pri). Negative indicators might be useful if there are many matches and only a few special exceptions. (If not force_negate, then negative indicators are used only if they cover 100% of the exceptions; see below re negate==None)
def bytesAround(start): return within_Nbytes(start+len(nonAnnot),nbytes)
okStrs=list(set(bytesAround(s) for s in okStarts))
badStrs=list(set(bytesAround(s) for s in badStarts))
pOmit = unichr(1).join(badStrs) # omit anything that occurs in this string from +ve indicators
nOmit = unichr(1).join(okStrs) # ditto for -ve indicators
pCovered=[False]*len(okStrs)
nCovered=[False]*len(badStrs)
n2Covered=[False]*len(badStrs)
pRet = [] ; pAppend=pRet.append
nRet = [] ; nAppend=nRet.append
n2Ret = [] ; nAppend2 = n2Ret.append
negate = None # not yet set
stuffToCheck = [] ; stuffChecked = []
if not force_negate:
l = []
stuffChecked.append((l,"",pRet,pCovered))
stuffToCheck.append((l,okStrs,pAppend,pCovered,unique_substrings(okStrs,markedUp_unichars,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))) # a generator and associated parameters for positive indicators
diagnose_extra = []
if force_negate or 5*len(okStrs) > len(badStrs) or not okStrs: # and for negative indicators, if appropriate: (changed in v0.6892: still check for negative indicators if len(okStrs) is similar to len(badStrs) even if not strictly greater, but don't bother if len(okStrs) is MUCH less)
l = []
stuffChecked.append((l,"negative",nRet,nCovered))
stuffToCheck.append((l,badStrs,nAppend,nCovered,unique_substrings(badStrs,markedUp_unichars,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s))))
if try_harder and okStrs and not force_negate:
l = [] ; stuffChecked.append((l,"overmatch-negative",n2Ret,n2Covered))
stuffToCheck.append((l,badStrs,nAppend2,n2Covered,unique_substrings(badStrs,markedUp_unichars,lambda txt:False,lambda txt:(sum(1 for s in badStrs if txt in s),-sum(1 for s in okStrs if txt in s))))) # a harder try to find negative indicators (added in v0.6896): allow over-matching (equivalent to under-matching positive indicators) if it's the only way to get all badStrs covered; may be useful if the word can occur in isolation
elif nonAnnot==diagnose: diagnose_extra.append("Not checking for negative indicators as 5*%d>%d=%s." % (len(okStrs),len(badStrs),repr(5*len(okStrs)>len(badStrs))))
while stuffToCheck and negate==None:
for i in range(len(stuffToCheck)):
l,strs,append,covered,generator = stuffToCheck[i]
try: indicator = getNext(generator)
except StopIteration:
del stuffToCheck[i] ; break
found = True ; cChanged = False
for j in xrange(len(strs)):
if not covered[j] and indicator in strs[j]:
covered[j]=cChanged=True
if cChanged:
append(indicator)
if not l: l.append(True)
if all(covered):
if append==pAppend: negate=False
elif append==nAppend: negate=True # negate with no overmatch allowed found all the exceptions, so use it (don't use it if it doesn't find ALL the exceptions, since we don't ever want an as-if 'overmatch positive' i.e. misidentifying a word/phrase in a place where the corpus explicitly DOESN'T have it, unless force_negate see comment below)
else: # append==nAppend2 (negate with overmatch allowed): we managed to get all exceptions with overmatch-negative, but how much damage did our overmatching do to the NON-exceptions?
fxCover = [True]*len(okStrs)
for indicator in n2Ret:
for i in xrange(len(okStrs)):
if fxCover[i] and indicator in okStrs[i]:
# a negative indicator 'misfires' here, resulting in this okStr NOT being identified as 'ok'
fxCover[i] = False
if sum(1 for x in fxCover if x) >= sum(1 for x in pCovered if x): negate="harder"
else: diagnose_extra.append("Overmatch-negate got worse actual coverage than partial-positive.") # so don't set negate="harder", but we might still force_negate to set negate=True below
break
# and if negate==None AFTER this loop, didn't get all(pCovered) OR all(nCovered), in which case we fall back to negate=False (unless force_negate). In other words, negative indicators normally have to cover ALL non-occurrences to be passed, whereas positive indicators just have to cover SOME. This is in keeping with the idea of 'under-match is better than over-match' (because an under-matching negative indicator is like an over-matching positive one)
if force_negate: negate = True
if negate==True: ret,covered = nRet,nCovered
elif negate=="harder": ret,covered = n2Ret,n2Covered
else: ret,covered = pRet,pCovered
if nonAnnot==diagnose:
def report(actuallyChecked,negate,ret,covered):
if not actuallyChecked: return ""
if negate: indicators = negate+" indicators "
else: indicators = "indicators "
if ret:
if len(ret) > 30: indicators=str(len(ret))+" "+indicators # +'/'.join(ret[:30]+['...'])
else: indicators += '/'.join(ret)
else: indicators = "no "+indicators
if all(covered): notCovered = ""
else:
if negate: strs = badStrs
else: strs = okStrs
notCovered = [strs[i] for i in xrange(len(covered)) if not covered[i]]
if len(notCovered) > 10: notCovered = notCovered[:10]+["..."]
notCovered = ", not "+'/'.join(notCovered).replace('\n',"\\n")
if negate=="overmatch-negative":
overmatch=[s for s in okStrs if any(i in s for i in n2Ret)]
if len(overmatch) > 10: overmatch = overmatch[:10]+["..."]
if overmatch: notCovered += ", overmatch "+"/".join(overmatch).replace('\n',"\\n")
return "%s (cover=%d/%d%s)" % (indicators,sum(1 for x in covered if x),len(covered),notCovered)
if len(pOmit) > 200: pOmit = pOmit[:200]+"..."
diagnose_extra = " ".join(diagnose_extra)
if diagnose_extra: diagnose_extra=" "+diagnose_extra
rr = ", ".join(r for r in [report(*i) for i in stuffChecked] if r)
if not rr: rr = "nothing"
diagnose_write("tryNBytes(%d) on %s (avoiding '%s') found %s%s" % (nbytes,withAnnot_unistr,pOmit.replace(unichr(1),'/').replace('\n',"\\n"),rr,diagnose_extra))
return negate,ret,sum(1 for x in covered if x),len(covered)
def cond(a,b,c):
if a: return b
else: return c
def badInfo(badStarts,nonAnnot,for_tty=True):
ret = u"%d false positive" % len(badStarts)
if not len(badStarts)==1: ret += "s"
if len(badStarts) > yarowsky_debug: return ret
for wordStart in badStarts:
wordEnd = wordStart + len(nonAnnot)
contextStart,contextEnd=max(0,wordStart-5),wordEnd+5
toRead = corpus_markedDown
# but can we report it from the original corpus_unistr?
if wordStart in m2c_map and wordEnd in m2c_map:
toRead = corpus_unistr
wordStart,wordEnd = m2c_map[wordStart],m2c_map[wordEnd]
newCStart,newCEnd = contextStart,contextEnd
while newCStart not in m2c_map and newCStart >= contextStart-5: newCStart-=1
while newCEnd not in m2c_map and newCEnd<contextEnd+5: newCEnd+=1
if newCStart in m2c_map: contextStart = m2c_map[newCStart]
else: contextStart = max(0,wordStart - 15) # This might cut across markup, but better that than failing to report the original corpus and making it look like the words might not have "lined up" when actually they did. Might also just cut into surrounding non-markup text (if the above loop simply couldn't find anything near enough because such text was in the way).
if newCEnd in m2c_map: contextEnd = m2c_map[newCEnd]
else: contextEnd = wordEnd + 15 # ditto
if for_tty: ret += (u" (%s%s%s%s%s)" % (toRead[contextStart:wordStart],reverse_on,toRead[wordStart:wordEnd],reverse_off,toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r")
else: ret += (u" (%s <<%s>> %s)" % (toRead[contextStart:wordStart],toRead[wordStart:wordEnd],toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r")
return ret
def unique_substrings(texts,allowedChars,omitFunc,valueFunc):
# yield unique substrings of texts, in increasing length, with equal lengths sorted by highest score returned by valueFunc, and omitting any where omitFunc is true, or that uses any character not in allowedChars (allowedChars==None means all allowed)
if allowedChars:
# remove non-allowedChars from texts, splitting into smaller strings as necessary
texts2 = [] ; append=texts2.append
for text in texts:
start = 0
for i in xrange(len(text)):
if not text[i] in allowedChars:
if i>start: append(text[start:i])
start=i+1
if start<len(text): append(text[start:])
texts=texts2
if not texts: return
length=1 ; maxlen = max(len(t) for t in texts)
while length <= maxlen:
ret=set()
# sys.stderr.write("Finding (l=%d)... " % len(texts))
for text in texts: ret.update(text[s:s+length] for s in xrange(len(text)-length+1))
l=[(valueFunc(k),k) for k in ret if not omitFunc(k)]
# if length == ybytes_max and not l: sys.stderr.write("Debugger: omitFunc was true for all %s\n" % repr(ret))
l.sort() ; l.reverse()
# sys.stderr.write("%d of length %d\n" % (len(l),length))
for v,k in l: yield k
length += 1
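# Illustrative call:
#   unique_substrings(["abc","abd"], None, lambda t: False,
#                     lambda t: sum(1 for s in ["abc","abd"] if t in s))
# yields every length-1 substring first ("b" and "a", present in both texts,
# before "c" and "d"), then length-2 ("ab" before "bc"/"bd"), then length-3.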
def within_Nbytes(matchEndPos,nbytes):
# return the Unicode characters within nbytes of matchEndPos, assuming the encoding will be outcode. Used for the Yarowsky-like functions.
# Assumes multibyte codes are self-synchronizing, i.e. if you start in the middle of a multibyte sequence, the first valid character will be the start of the next sequence, ok for utf-8 but TODO might not be the case for some codes
return corpus_markedDown[max(0,matchEndPos-nbytes):matchEndPos].encode(outcode)[-nbytes:].decode(outcode,'ignore')+corpus_markedDown[matchEndPos:matchEndPos+nbytes].encode(outcode)[:nbytes].decode(outcode,'ignore')
def test_rule(withAnnot_unistr,yBytesRet,canBackground=None):
# Tests to see if the rule withAnnot_unistr is
# ALWAYS right in the examples, i.e.
# the number of occurrences of its marked-down text
# in the continuous marked-down string should be
# EXACTLY equal to the number of occurrences of the
# marked-up version.
# (If we deal only in rules that ALWAYS work, we can
# build them up incrementally without "cross-talk")
# yield "backgrounded" = task has been backgrounded; getNext collects result (nb we default to NOT canBackground, as test_rule is called from several places of which ONE can handle backgrounding)
if primitive: yield True
elif ybytes:
# Doesn't have to be always right, but put the indicators in yBytesRet
ybrG = yarowsky_indicators(withAnnot_unistr,canBackground)
ybr = getNext(ybrG)
if ybr == "backgrounded":
yield ybr ; ybr = getNext(ybrG)
if ybr==True or not ybr:
yield ybr ; return
yBytesRet.append(ybr) # (negate, list of indicators, nbytes)
yield True
else: # non-ybytes version: accept rule only if it exactly matches the corpus
phrase = markDown(withAnnot_unistr)
ret = corpus_markedDown.count(phrase) == len(getOkStarts(withAnnot_unistr))
if diagnose and diagnose==phrase:
diagnose_write("occurrences(%s)==occurrences(%s) = %s" % (phrase,withAnnot_unistr,ret))
yield ret
def all_possible_rules(words,covered):
# Iterate over ALL possible rules derived from the
# word sequence (don't just "find the shortest context
# that predicts each word" because that can have
# trouble with overlaps; need to check them all and
# stop when we've got enough to reproduce the example)
# As optimisation, avoids returning rules for which
# all(covered) over that rule's range
if max_words: maxRuleLen = min(len(words),max_words)
else: maxRuleLen = len(words)
for ruleLen in range(1,maxRuleLen+1): # (sort by len)
for wStart in range(len(words)-ruleLen+1):
if not all(covered[wStart:wStart+ruleLen]):
yield words[wStart:wStart+ruleLen]
# caller join()s before adding to rules dict
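# Illustrative call (max_words=0 i.e. unlimited, nothing covered yet):
#   list(all_possible_rules(["A","B","C"],[False]*3)) ==
#     [["A"],["B"],["C"],["A","B"],["B","C"],["A","B","C"]]
# i.e. every contiguous sub-sequence, shortest rules first.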
def checkCoverage(ruleAsWordlist,words,coveredFlags):
# Updates coveredFlags and returns True if any changes
# (if False, the new rule is redundant).
# Don't worry about ybytes - assume the Yarowsky-like
# indicators have been calculated correctly across the
# whole text so we don't need to re-check them now.
assert type(ruleAsWordlist)==type(words)==list
try: start = words.index(ruleAsWordlist[0])
except ValueError: return False
ln = len(ruleAsWordlist)
changedFlags = False
while start <= len(words)-ln:
if words[start:start+ln] == ruleAsWordlist:
if not all(coveredFlags[start:start+ln]):
coveredFlags[start:start+ln]=[True]*ln
changedFlags = True
start += ln
else:
try: start = words.index(ruleAsWordlist[0],start+1)
except ValueError: break
return changedFlags
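# Illustrative call:
#   flags = [False]*5
#   checkCoverage(["B","C"], ["A","B","C","B","C"], flags)  # -> True
#   # flags is now [False,True,True,True,True]: both (non-overlapping)
#   # occurrences of the rule have been marked as covered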
def wspJoin(l):
if removeSpace: return "".join(l)
else: return " ".join(l)
def potentially_bad_overlap(rulesAsWordlists,newRuleAsWords):
# Allow overlaps only if rule(s) being overlapped are
# entirely included within newRule. Otherwise could
# get problems generating closures of overlaps.
# (If newRule not allowed, caller to try a longer one)
# Additionally, if allow_overlaps, allow ANY overlap as
# long as it's not found in the marked-down text.
if len(newRuleAsWords)==1 or primitive or ybytes: return False
for ruleAsWordlist in rulesAsWordlists:
if len(ruleAsWordlist)==1: continue
if not len(ruleAsWordlist)==len(newRuleAsWords) and longerStartsOrEndsWithTheShorter(ruleAsWordlist,newRuleAsWords): continue
for overlapSize in range(1,min(len(x) for x in [newRuleAsWords,ruleAsWordlist])):
if not (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] or newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize]): continue
if not allow_overlaps: return True
# Test to see if the examples "allow" this potentially-bad overlap
def overlapOK(rAW): return not markDown(wspJoin(rAW)) in corpus_markedDown
if (ruleAsWordlist[-overlapSize:] == newRuleAsWords[:overlapSize] and not overlapOK(ruleAsWordlist[:-overlapSize]+newRuleAsWords)) or (newRuleAsWords[-overlapSize:] == ruleAsWordlist[:overlapSize] and not overlapOK(newRuleAsWords[:-overlapSize]+ruleAsWordlist)): return True
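# Illustrative case: if ["B","C"] is already a rule and the new candidate is
# ["A","B"], they overlap at "B" without either list containing the other, so
# with allow_overlaps off this returns True and the caller tries a longer rule;
# with allow_overlaps on, it is rejected only if the marked-down text of the
# merged sequence A B C actually occurs in the corpus.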
def longerStartsOrEndsWithTheShorter(l1,l2):
if len(l1) > len(l2): l1,l2 = l2,l1
return l2[:len(l1)]==l1 or l2[-len(l1):]==l1
class RulesAccumulator:
def __init__(self):
self.rules = {}
self.rulesAsWordlists_By1stWord = {} # starting word -> list of possible rules (as wordlists) that might apply
self.rulesAsWordlists = list() # all rules as words (list of lists) (used if not ybytes, TODO: integrate with rulesAsWordlists_By1stWord?)
self.rejectedRules = set()
self.seenPhrases = set() # de-duplicate, might speed up
self.amend_rules = False
if rulesFile: self.load()
def save(self):
sys.stderr.write("\nPickling rules to %s... " % rulesFile) ; sys.stderr.flush()
f = openfile(rulesFile,'w')
pickle.Pickler(f,-1).dump((self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases))
# (don't save self.rejectedRules, there might be better clues next time)
f.close() ; sys.stderr.write("done")
sys.stderr.flush()
def load(self):
if not os.path.isfile(rulesFile):
sys.stderr.write("%s does not exist, starting with blank rules\n" % rulesFile)
return
sys.stderr.write("Unpickling rules from %s... " % rulesFile) ; sys.stderr.flush()
f = openfile(rulesFile)
self.rules,self.rulesAsWordlists_By1stWord,self.rulesAsWordlists,self.seenPhrases = pickle.Unpickler(f).load()
sys.stderr.write("done\n")
self.amend_rules = True
self.newRules = set()
def remove_old_rules(self,words): # for incremental runs - removes previously-discovered rules that would have been suggested by this new phrase but that no longer 'work' with the rest of the corpus due to alterations elsewhere. DOES NOT remove old rules that are not suggested by any phrase in the corpus because the phrases that suggested them have been removed or changed (TODO: might want an option for that, although fundamentally you shouldn't be relying on incremental runs if you're making a lot of changes to the corpus)
for w in set(words):
rulesAsWordlists = self.rulesAsWordlists_By1stWord.get(w,[])
i=0
while i<len(rulesAsWordlists):
if max_words and len(rulesAsWordlists[i])>max_words:
i += 1 ; continue # better leave that one alone if we're not reconsidering rules that long (e.g. running again with single_words when previous run wasn't)
rule = wspJoin(rulesAsWordlists[i])
if rule not in self.newRules and checkCoverage(rulesAsWordlists[i],words,[False]*len(words)): # rule would apply to the new phrase
yBytesRet = []
if not getNext(test_rule(rule,yBytesRet)) or potentially_bad_overlap(self.rulesAsWordlists,rulesAsWordlists[i]): # re-test fails. In versions v0.543 and below, we just removed ALL rules that would apply to the new phrase, to see if they would be re-generated. But that caused problems because addRulesForPhrase can return early if all(covered) due to other (longer) rules and we might be removing a perfectly good short rule that's needed elsewhere. So we now re-test before removal.
self.rejectedRules.add(rule)
if not ybytes: self.rulesAsWordlists.discard(rulesAsWordlists[i])
del rulesAsWordlists[i] ; del self.rules[rule]
continue
self.newRules.add(rule) # still current - add to newRules now to save calling test_rule again
if len(yBytesRet): self.rules[rule] = yBytesRet[0] # overriding what it was before (since we've re-done test_rule for it, which might have returned a new set of Yarowsky-like indicators for the new version of the corpus)
i += 1
def addRulesForPhrase(self,phrase,canBackground=False):
if phrase in self.seenPhrases or (diagnose_quick and diagnose):
# if diagnose and (diagnose_quick or self.amend_rules) and mdStart+diagnose+mdEnd in phrase: pass # look at it again for diagnostics. But do we accept a diagnose that spans multiple words? should be pointed out by --diagnose-quick below if uncommented
if diagnose and (diagnose_quick or self.amend_rules) and diagnose in markDown(phrase): pass # this version accepts diagnose of multiple words (and might also let some phrases through where it matches on an overlap)
else:
yield 0,0 ; return # TODO: document that this means the total 'covered' figure in the progress status is AFTER phrase de-duplication (otherwise we'd have to look up what the previous values were last time we saw it - no point doing that just for a quick statistic)
self.seenPhrases.add(phrase)
words = list(filter(lambda x:markDown(x).strip(),splitWords(phrase))) # filter out any that don't have base text (these will be input glitches, TODO: verify the annotation text is also just whitespace, warn if not)
if not words:
yield 0,0 ; return
covered = [False]*len(words)
# first see how much is covered by existing rules
# (don't have to worry about the order, as we've been
# careful about overlaps)
if self.amend_rules: self.remove_old_rules(words) # NB if yignore this might not remove all, but still removes all that affect checkCoverage below
for w in set(words):
for ruleAsWordlist in self.rulesAsWordlists_By1stWord.get(w,[]):
if checkCoverage(ruleAsWordlist,words,covered) and all(covered):
yield len(covered),len(covered) ; return # no new rules needed
for ruleAsWordlist in all_possible_rules(words,covered):
rule = wspJoin(ruleAsWordlist) ; yBytesRet = []
if rule in self.rejectedRules: continue
if rule in self.rules: continue # this can still happen even now all_possible_rules takes 'covered' into account, because the above checkCoverage assumes the rule won't be applied in a self-overlapping fashion, whereas all_possible_rules makes no such assumption (TODO: fix this inconsistency?)
rGen = test_rule(rule,yBytesRet,canBackground)
r = getNext(rGen)
if r=="backgrounded":
yield r ; r = getNext(rGen)
del rGen
if not r or potentially_bad_overlap(self.rulesAsWordlists,ruleAsWordlist):
self.rejectedRules.add(rule) # so we don't waste time evaluating it again (TODO: make sure rejectedRules doesn't get too big?)
continue
cc = checkCoverage(ruleAsWordlist,words,covered) # changes 'covered'
assert cc, "this call to checkCoverage should never return False now that all_possible_rules takes 'covered' into account" # and it's a generator which is always working from the CURRENT copy of 'covered'
if len(yBytesRet): self.rules[rule] = yBytesRet[0]
else: self.rules[rule] = [] # unconditional
if not ybytes: self.rulesAsWordlists.append(ruleAsWordlist)
if not ruleAsWordlist[0] in self.rulesAsWordlists_By1stWord: self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]] = []
self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]].append(ruleAsWordlist)
if self.amend_rules: self.newRules.add(rule)
handle_diagnose_limit(rule)
if all(covered):
yield len(covered),len(covered) ; return
# If get here, failed to completely cover the phrase.
# ruleAsWordlist should be set to the whole-phrase rule.
yield sum(1 for x in covered if x),len(covered)
def rulesAndConds(self):
if self.amend_rules: return [(k,v) for k,v in self.rules.items() if not k in self.newRules] + [(k,v) for k,v in self.rules.items() if k in self.newRules] # new rules must come last for incremental runs, so they will override existing actions in byteSeq_to_action_dict when small changes have been made to the annotation of the same word (e.g. capitalisation-normalisation has been changed by the presence of new material)
else: return self.rules.items()
def handle_diagnose_limit(rule):
global diagnose,diagnose_limit
if diagnose and diagnose_limit and diagnose==markDown(rule):
diagnose_limit -= 1
if not diagnose_limit:
diagnose = False
diagnose_write("limit reached, suppressing further diagnostics")
def generate_map():
global m2c_map, precalc_sets, yPriorityDic
if checkpoint:
try:
f=open_try_bz2(checkpoint+os.sep+'map','r')
m2c_map,precalc_sets,yPriorityDic = pickle.Unpickler(f).load()
return sys.stderr.write("Corpus map loaded\n")
except: pass
assert main, "Only main should generate corpus map"
sys.stderr.write("Generating corpus map... ")
m2c_map = {} ; precalc_sets = {}
muStart = downLenSoFar = 0
for s in re.finditer(re.escape(markupStart), corpus_unistr):
s=s.start()
md = markDown(corpus_unistr[muStart:s])
if markupStart in md: errExit("examples have nested markup! "+repr(md))
downLenSoFar += len(md)
muStart = s
m2c_map[downLenSoFar] = s
# Added optimisation: do precalc_sets as well
# (at least catch the 1-word cases)
# -> this is now needed even if not ybytes
e=corpus_unistr.find(markupEnd,s)
if e>-1:
e += len(markupEnd)
k = corpus_unistr[s:e]
if k not in precalc_sets: precalc_sets[k]=set()
precalc_sets[k].add(downLenSoFar)
yPriorityDic = {}
if ref_pri and ybytes:
sys.stderr.write("yPriorityDic ... ")
for s in re.finditer(re.escape(reference_sep+ref_pri+ref_name_end), corpus_unistr):
s = s.start()+len(reference_sep+ref_pri+ref_name_end)
e = corpus_unistr.find(reference_sep,s)
if e==-1: e=len(corpus_unistr)
for w in splitWords(corpus_unistr[s:e]):
wd = markDown(w)
if wd in yPriorityDic: continue
if diagnose==wd: diagnose_write("yPriorityDic[%s] = %s" % (wd,w))
yPriorityDic[wd] = w
sys.stderr.write("done\n")
if checkpoint: pickle.Pickler(open_try_bz2(checkpoint+os.sep+'map','w'),-1).dump((m2c_map,precalc_sets,yPriorityDic))
checkpoint_exit()
def setup_parallelism():
if single_core or not checkpoint: return # parallelise only if checkpoint (otherwise could have trouble sharing the normalised corpus and map etc)
args = getoutput("ps -p " + str(os.getpid()) + " -o args")
try:
args.index("-m mpi4py.futures") # ValueError if not found
import mpi4py.futures # mpi4py v2.1+
import mpi4py.MPI, mpi4py ; assert mpi4py.MPI.COMM_WORLD.size > 1, "mpi4py says world size is 1: likely a symptom of incorrectly-configured MPI. Did you compile mpi4py using the same setup (e.g. MPICH or OpenMPI) as you are running? mpi4py's config is: "+repr(mpi4py.get_config())
return mpi4py.futures.MPIPoolExecutor()
except ValueError: pass # but raise all other exceptions: if we're being run within mpi4py.futures then we want to know about MPI problems
try:
args.index("-m scoop") # ValueError if not found
import scoop.futures
return scoop.futures # submit() is at module level
except ValueError: pass
try:
import concurrent.futures # sudo pip install futures (2.7 backport of 3.2 standard library)
import multiprocessing
num_cpus = multiprocessing.cpu_count()
if num_cpus > 1: return concurrent.futures.ProcessPoolExecutor(num_cpus-1) # leave one for the CPU-heavy control task
except: pass
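# Illustrative invocations this function detects (exact command lines aside, a
# checkpoint directory must be set for parallelism to be used - see above):
#   mpiexec -n 8 python -m mpi4py.futures annogen.py ...   (MPI pool)
#   python -m scoop annogen.py ...                         (SCOOP)
# otherwise, if more than one CPU core is available, it falls back to a local
# concurrent.futures process pool, and returns None (no parallelism) if not.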
def get_phrases():
# Returns a list of phrases in processing order, with length-numbers inserted in the list. Caches its result.
global _gp_cache
try: return _gp_cache