From fffa7b8e6a76c6f100baa3058b360ab808764166 Mon Sep 17 00:00:00 2001
From: "Silas S. Brown" <ssb22@cam.ac.uk>
Date: Thu, 2 Apr 2015 20:05:34 +0000
Subject: [PATCH] Update adjuster/annogen/termlayout

git-svn-id: http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster@2150 29193198-4895-4776-b068-10539e920549
---
 annogen.py | 36 +++++++++---------------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/annogen.py b/annogen.py
index eb530b3..88a01b8 100755
--- a/annogen.py
+++ b/annogen.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-program_name = "Annotator Generator v0.582 (c) 2012-14 Silas S. Brown"
+program_name = "Annotator Generator v0.583 (c) 2012-15 Silas S. Brown"
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1817,33 +1817,14 @@ static void topLevelMatch() {
 def splitWords(text,phrases=False):
     # split text into words, ignoring anything between markupStart and markupEnd
     # if phrases = True, instead of words, split on any non-whitespace char outside markupStart..markupEnd
-    i=start=0
-    text = text.replace(markupEnd+markupStart, markupEnd+' '+markupStart) # force at least one breakpoint between each marked-up phrase (otherwise get problems later - much code assumes each item returned by splitWords contains at most 1 markup)
-    def isSplitpoint():
-        isspace = not text[i].split()
-        if phrases: return not isspace
-        else: return isspace
-    while i<len(text):
-        if text[i:i+len(markupStart)]==markupStart:
-            i = text.find(markupEnd,i+len(markupStart))
-            if i==-1: i=len(text)
-            else: i += len(markupEnd)
-        elif isSplitpoint():
-            if i>start: yield text[start:i]
-            if phrases:
-                # can skip to text markupStart
-                i=text.find(markupStart,i)
-                if i==-1: i=len(text)
-                start = i
-            else:
-                i += 1 # just after the 1st splitter
-                while i<len(text) and isSplitpoint(): i += 1
-                start = i # 1st non-splitter char
-        else: i += 1
-    if i>start: yield text[start:i]
+    if phrases: it=re.finditer(phrasePattern,text) # phrases: each match is a run of adjacent marked-up items, including the whitespace after each item
+    else: it=re.finditer(wordPattern,text) # words: each match is exactly one markupStart..markupEnd item
+    for i in it: yield i.group() # only the regex matches are yielded, so text that is outside any markup is not returned
 
 markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd))
 whitespacePattern = re.compile(r"\s+")
+phrasePattern = re.compile('('+re.escape(markupStart)+'.*?'+re.escape(markupEnd)+r'\s*)+') # one or more consecutive marked-up items, each followed by optional whitespace (raw string for the \s part, as in whitespacePattern, because \s is not a valid escape in a plain string literal)
+wordPattern = re.compile(re.escape(markupStart)+'.*?'+re.escape(markupEnd)) # exactly one marked-up item; non-greedy so the match ends at the first markupEnd
 
 def annotationOnly(text):
     ret = []
@@ -2395,8 +2376,9 @@ def generate_map():
         if e==-1: e=len(corpus_unistr)
         for w in splitWords(corpus_unistr[s:e]):
           wd = markDown(w)
-          if wd in yPriorityDic: pass
-          else: yPriorityDic[wd] = w
+          if wd in yPriorityDic: continue
+          if diagnose==wd: sys.stderr.write(("Diagnose: yPriorityDic[%s] = %s\n" % (wd,w)).encode(terminal_charset,'replace'))
+          yPriorityDic[wd] = w
     sys.stderr.write("done\n")
     if checkpoint: pickle.Pickler(open(checkpoint+os.sep+'map','wb'),-1).dump((corpus_to_markedDown_map,c2m_inverse,precalc_sets,yPriorityDic))
     checkpoint_exit()
-- 
GitLab