diff --git a/annogen.py b/annogen.py index 80647389a81e04f7cbe146b9f237b76c1deb778e..e8bf858b82b6c53b39206493506e72b0375fbe3d 100755 --- a/annogen.py +++ b/annogen.py @@ -3660,15 +3660,19 @@ def PairPriorities(markedDown_Phrases,existingFreqs={}): votes[k]=votes.get(k,0)+direction sys.stderr.write("PairPriorities: done\n") del markedDown_Phrases - closure,lessThan,gtThan = set(),{},{} + global closure,gtThan,lessThan + closure,gtThan, lessThan = set(),{},{} # TODO: bitfield to save RAM on <16G machines? def addToClosure(a,b): # If a>b, then a>c for any b>c (c<b) # (actually b>=c but we don't have equality), # and c>b for any c>a. candidate = set([(a,b)]+[(a,c) for c in lessThan.get(b,[])]+[(c,b) for c in gtThan.get(a,[])]) - if any((y,x) in closure for (x,y) in candidate): + if closure==None: # no longer tracking closure + if any(y in gtThan.get(x,{}) for (x,y) in candidate): return + else: + if any((y,x) in closure for (x,y) in candidate): return # contradiction, use higher abs votes - closure.update(candidate) + closure.update(candidate) for x,y in candidate: # x>y y<x, so y should be in lessThan[x] if not x in lessThan: lessThan[x] = set() @@ -3678,35 +3682,45 @@ def PairPriorities(markedDown_Phrases,existingFreqs={}): for _,direction,a,b in reversed(sorted([(1+abs(v),v,a,b) for (a,b),v in votes.items()])): if direction < 0: a,b = b,a addToClosure(a,b) - trueClosure=closure.copy() + trueClosure,closure = closure,None fallback_order = [w for _,w in reversed(sorted((f,w) for w,f in existingFreqs.items()))] for i in xrange(len(fallback_order)-1): a,b = fallback_order[i:i+2] if not existingFreqs[a]==existingFreqs[b]: addToClosure(a,b) - global _cmp,_cmpN,_cmpT,_cmpW - _cmp,_cmpN,_cmpT,_cmpW = 0,0,time.time(),False + global _cmp,_cmpN,_cmpT,_cmpW,_cmpP + _cmp,_cmpN,_cmpT,_cmpW,_cmpP = 0,0,time.time(),False,0 def cmpFunc(x,y): # lower priorities first - global _cmp ; _cmp += 1 - if (x,y) in closure: return 1 - elif (y,x) in closure: return -1 + global _cmp,_cmpN,_cmpT,_cmpP + _cmp += 1 + if time.time() > _cmpT + 2: + sys.stderr.write("+%d (cmp=%d problems=%d)%s" % (_cmpN,_cmp,_cmpP,clear_eol)) + _cmpT,_cmpW = time.time(),True + if x in gtThan.get(y,{}): return 1 + elif y in gtThan.get(x,{}): return -1 elif x==y: return 0 else: # Make sure we're transitive later: - global _cmpN,_cmpT ; _cmpN += 1 - if time.time() > _cmpT + 2: - sys.stderr.write("+%d (cmp=%d)%s" % (_cmpN,_cmp,clear_eol)) - _cmpT,_cmpW = time.time(),True + _cmpN += 1 addToClosure(x,y) # (generates implied reln's) - if (x,y) in closure: return 1 + if x in gtThan.get(y,{}): return 1 addToClosure(y,x) # ditto - if not (y,x) in closure: - sys.stderr.write(("Warning: adding "+repr((x,y))+" contradicts "+repr(set((Y,X) for X,Y in set([(x,y)]+[(x,c) for z,c in closure if z==y]+[(c,y) for c,z in closure if c==x]) if (Y,X) in closure))+" but adding "+repr((y,x))+" contradicts "+repr(set((Y,X) for X,Y in set([(y,x)]+[(y,c) for z,c in closure if z==x]+[(c,x) for c,z in closure if c==y]) if (Y,X) in closure))+" -- adding "+repr((y,x))+" anyway, beware possible problems...\n").decode('unicode_escape').encode(terminal_charset)) - closure.add((y,x)) + if not y in gtThan.get(x,{}): + _cmpP += 1 # problem; try this for now: + if not y in gtThan: gtThan[y] = set() + gtThan[y].add(x) return -1 r = [] ; sys.stderr.write("%d words\n" % len(mdwSet)) - for w in sorted(mdwSet,cmpFunc): - r.append((w,1+max([existingFreqs.get(w,1)-1]+[r[i][1] for i in xrange(len(r)) if (w,r[i][0]) in trueClosure]))) + mdwSet = list(mdwSet) ; mdwSet.sort(cmpFunc) if _cmpW: sys.stderr.write("\n") + del gtThan,lessThan + _cmpW=False + for w in mdwSet: + if time.time() > _cmpT + 2: + sys.stderr.write("Finalising: %d/%d%s" % (len(r),len(mdwSet),clear_eol)) + _cmpT=time.time() + _cmpW=True + r.append((w,1+max([existingFreqs.get(w,1)-1]+[r[i][1] for i in xrange(len(r)) if (w,r[i][0]) in trueClosure]))) + if _cmpW: sys.stderr.write("Finalising: done%s\n" % clear_eol) return sorted(r) if mreverse: mdStart,mdEnd,aoStart,aoEnd = markupMid,markupEnd,markupStart,markupMid @@ -4794,16 +4808,16 @@ if main and not compile_only: if os.path.exists(priority_list): sys.stderr.write("Reading "+priority_list+"\n") def getFreq(line): - word,freq = line.rstrip().rsplit(None,1) + word,freq = line.decode(outcode).rstrip().rsplit(None,1) try: return word,int(freq) except: return word,float(freq) - existingFreqs={getFreq(l) for l in openfile(priority_list)} + existingFreqs=dict(getFreq(l) for l in openfile(priority_list) if len(l.strip().split())>=2) else: existingFreqs = {} sys.stderr.write("Parsing...") i=[[markDown(w) for w in splitWords(phrase)] for phrase in splitWords(corpus_unistr,phrases=True)] del corpus_unistr sys.stderr.write(" calling PairPriorities...\n") - out="".join(w+"\t"+str(f)+"\n" for w,f in PairPriorities(i,existingFreqs)) + out="".join(w+"\t"+str(f)+"\n" for w,f in PairPriorities(i,existingFreqs)).encode(outcode) # (don't open the output before here, in case exception) if existingFreqs: sys.stderr.write("Updating "+priority_list+"...") else: sys.stderr.write("Writing "+priority_list+"...")