From 31fb225cc78012bbb1e6ef6b18ab9a9888e5a309 Mon Sep 17 00:00:00 2001 From: "Silas S. Brown" <ssb22@cam.ac.uk> Date: Mon, 3 Jun 2019 07:20:24 +0100 Subject: [PATCH] Update Annotator Generator --- annogen.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/annogen.py b/annogen.py index 5b3a36e..12e2e5c 100755 --- a/annogen.py +++ b/annogen.py @@ -3637,15 +3637,17 @@ def orRegexes(escaped_keys): def PairPriorities(markedDown_Phrases,existingFreqs={}): markedDown_Phrases = list(markedDown_Phrases) assert all(type(p)==list for p in markedDown_Phrases) - markedDown_Words = reduce(lambda x,y:x+[0]+y,markedDown_Phrases) ; del markedDown_Phrases - assert all((w==0 or type(w)==unicode) for w in markedDown_Words) - mdwSet = set(markedDown_Words + existingFreqs.keys()) - try: mdwSet.remove(0) - except: pass # only one phrase - votes = {} - for x in xrange(len(markedDown_Words)-1): - a,b = markedDown_Words[x:x+2] - if 0 in [a,b]: continue + mdwSet = set(existingFreqs.keys()) + for p in markedDown_Phrases: mdwSet.update(p) + assert all(type(w)==unicode for w in mdwSet) + votes = {} ; lastT = time.time() + for pi in xrange(len(markedDown_Phrases)): + P=p[pi] + if time.time() > lastT+2: + sys.stderr.write("PairPriorities: %d/%d%s",pi,len(markedDown_Phrases),clear_eol) + lastT = time.time() + for x in xrange(len(P)-1): + a,b = P[x:x+2] combined = a+b for i in xrange(1,len(combined)): if not i==len(a): @@ -3656,7 +3658,7 @@ def PairPriorities(markedDown_Phrases,existingFreqs={}): if k[0]==prefer: direction = 1 else: direction = -1 votes[k]=votes.get(k,0)+direction - del markedDown_Words + del markedDown_Phrases closure = set() def addToClosure(a,b): candidate = set([(a,b)]+[(a,c) for x,c in closure if x==b]+[(c,b) for c,x in closure if c==a]) @@ -4779,11 +4781,11 @@ if main and not compile_only: sys.stderr.write("Parsing...") i=[[markDown(w) for w in splitWords(phrase)] for phrase in splitWords(corpus_unistr,phrases=True)] del corpus_unistr - sys.stderr.write(" getting word priorities...") + sys.stderr.write(" calling PairPriorities...\n") out="".join(w+"\t"+str(f)+"\n" for w,f in PairPriorities(i,existingFreqs)) # (don't open the output before here, in case exception) - if existingFreqs: sys.stderr.write(" updating "+priority_list+"...") - else: sys.stderr.write(" writing "+priority_list+"...") + if existingFreqs: sys.stderr.write("Updating "+priority_list+"...") + else: sys.stderr.write("Writing "+priority_list+"...") openfile(priority_list,'w').write(out) sys.stderr.write(" done\n") sys.exit() -- GitLab