From 2266e02e7feb56a377c998bfde6c155ce0c302aa Mon Sep 17 00:00:00 2001 From: "Silas S. Brown" <ssb22@cam.ac.uk> Date: Fri, 12 Jan 2018 12:04:17 -0600 Subject: [PATCH] Update Annotator Generator --- annogen.py | 166 +++++++++++++---------------------------------------- 1 file changed, 39 insertions(+), 127 deletions(-) diff --git a/annogen.py b/annogen.py index 393d0e2..037402f 100755 --- a/annogen.py +++ b/annogen.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -program_name = "Annotator Generator v0.6283 (c) 2012-18 Silas S. Brown" +program_name = "Annotator Generator v0.6284 (c) 2012-18 Silas S. Brown" # See http://people.ds.cam.ac.uk/ssb22/adjuster/annogen.html @@ -321,7 +321,7 @@ if zlib: if ios: warn("--zlib with --ios will require -lz to be added to the linker options in XCode, and I don't have instructions for that (it probably differs across XCode versions)") if data_driven and (c_sharp or java or golang): errExit("--data-driven is not yet implemented in C#, Java or Go") elif javascript or python: data_driven = True -additional_compact_opcodes = data_driven and not (python or javascript) # currently implemented only in the C version of the data-driven runtime +additional_compact_opcodes = data_driven and not python # currently implemented only in the C and Javascript versions of the data-driven runtime if java or javascript or python or c_sharp or ios or ndk or golang: c_compiler = None try: @@ -340,8 +340,6 @@ diagnose_limit = int(diagnose_limit) max_words = int(max_words) if single_words: max_words = 1 if no_input and diagnose_manual: errExit("--diagnose-manual is not compatible with --no-input") # it needs the input for diagnostic purposes -needAnnoType = False # HIGHLY EXPERIMENTAL - DO NOT USE -if needAnnoType and (windows_clipboard or ios or (java and not ndk) or c_sharp or golang): errExit("needAnnoType not yet implemented in Windows clipboard, iOS, Java without Android NDK, C# or Go") def nearCall(negate,conds,subFuncs,subFuncL): # returns what to put in the if() for ybytes near() lists @@ -698,11 +696,7 @@ int near(char* string) { return 0; } void matchAll(); -JNIEXPORT jstring JNICALL Java_%PACKAGE%_MainActivity_jniAnnotate(JNIEnv *env, jclass theClass, jstring jIn""" - if needAnnoType: c_defs += ", jint aType" - c_defs += ") {" - if needAnnoType: c_defs += "annotation_type = aType;" - c_defs += r""" +JNIEXPORT jstring JNICALL Java_%PACKAGE%_MainActivity_jniAnnotate(JNIEnv *env, jclass theClass, jstring jIn) { startPtr=(char*)(*env)->GetStringUTFChars(env,jIn,NULL); readPtr = startPtr; writePtr = startPtr; outWriteLen = strlen(startPtr)*5+1; /* initial guess (must include the +1 to ensure it's non-0 for OutWrite...'s *= code) */ @@ -865,7 +859,6 @@ if ios: c_name = "Objective-C" else: c_name = "C" if ndk: c_start = "" # because #!/bin/bash comes next else: c_start = "/* -*- coding: "+outcode+" -*- */\n/* "+c_name+" code "+version_stamp+" */\n" -if needAnnoType: c_defs += "int annotation_type=0;\n" c_start += c_preamble+r""" enum { ybytes = %%YBYTES%% }; /* for Yarowsky-like matching, minimum readahead */ static int nearbytes = ybytes; @@ -1067,9 +1060,6 @@ else: int main(int argc,char*argv[]) { int i; for(i=1; i<argc; i++) { if(!strcmp(argv[i],"--help")) { - printf("%s [options]""" - if needAnnoType: c_end += " [annotation type number]" - c_end += r"""\nOptions:\n",argv[0]); puts("--ruby = output ruby markup (default)"); puts("--raw = output just the annotations without the base text"); puts("--braces = output as {base-text|annotation}"); @@ -1079,11 +1069,7 @@ int main(int argc,char*argv[]) { } else if(!strcmp(argv[i],"--raw")) { annotation_mode = annotations_only; } else if(!strcmp(argv[i],"--braces")) { - annotation_mode = brace_notation;""" - if needAnnoType: c_end += r""" - } else if(sscanf(argv[i],"%d",&annotation_type)==1) { /* pass */ - """ - c_end += r""" + annotation_mode = brace_notation; } else { fprintf(stderr,"Unknown argument '%s'\n(Text should be on standard input)\n",argv[i]); return 1; } @@ -1209,10 +1195,7 @@ public class MainActivity extends Activity {""" if ndk: android_src += r""" static { System.loadLibrary("Annotator"); } - static synchronized native String jniAnnotate(String in""" - if needAnnoType: android_src += ", int aType" - android_src += ');' -android_src += r""" + static synchronized native String jniAnnotate(String in); @SuppressLint("SetJavaScriptEnabled") @android.annotation.TargetApi(19) // 19 for setWebContentsDebuggingEnabled; 7 for setAppCachePath; 3 for setBuiltInZoomControls (but only API 1 is required) @SuppressWarnings("deprecation") // for conditional SDK below @@ -1239,15 +1222,9 @@ android_src += r""" browser.setWebChromeClient(new WebChromeClient()); class A { public A(MainActivity act) { this.act = act; } - MainActivity act;""" -if needAnnoType: android_src += r""" - int annotation_type = 0; - @android.webkit.JavascriptInterface public void setAnnotationType(int t) { annotation_type = t; } - @android.webkit.JavascriptInterface public int getAnnotationType() { return annotation_type; }""" -android_src += r""" String copiedText=""; + MainActivity act; String copiedText=""; @android.webkit.JavascriptInterface public String annotate(String t,boolean inLink) { String r=""" -if ndk and needAnnoType: android_src += 'jniAnnotate(t,annotation_type)' -elif ndk: android_src += 'jniAnnotate(t)' +if ndk: android_src += 'jniAnnotate(t)' else: android_src += 'new %%JPACKAGE%%.Annotator(t).result()' android_src += r"""; if(!inLink) r=r.replaceAll("<ruby","<ruby onclick=\"annotPopAll(this)\""); return r; } // now we have a Copy button, it's convenient to put this on ALL ruby elements, not just ones with title @android.webkit.JavascriptInterface public void alert(String t,String a) { @@ -1360,7 +1337,7 @@ android_src += r"""; if(!inLink) r=r.replaceAll("<ruby","<ruby onclick=\"annotPo WebView browser; } """ -if ndk: c_start = c_start.replace("%%android_src%%",android_src.replace("Put *.java into src/%%JPACK2%%","Optionally edit this file, but beware it will be overwritten if the script to generate it is re-run").replace('%%ANDROID-URL%%',android)) +if ndk: c_start = c_start.replace("%%android_src%%",android_src.replace("Put *.java into src/%%JPACK2%%","Optionally edit this file, but beware it will be overwritten if the script to generate it is re-run").replace('%%ANDROID-URL%%',android).replace("%%JPACKAGE%%",ndk)) android_clipboard = r"""<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body> <script>window.onerror=function(msg,url,line){ssb_local_annotator.alert('Error!',''+msg); return true}</script> <h3>Clipboard</h3> @@ -1733,6 +1710,8 @@ func Annotate(src io.Reader, dest io.Writer) { class BytecodeAssembler: # Bytecode for a virtual machine run by the Javascript version etc opcodes = { + # 0-19 RESERVED for short switchbyte + # 128-255 RESERVED for short jumps 'jump': 50, # '2' params: address 'call': 51, # '3' params: function address 'return': 52, # '4' (or 'end program' if top level) @@ -1741,9 +1720,6 @@ class BytecodeAssembler: 'savepos':80, # 'P', local to the function 'restorepos':81, # 'Q' 'neartest':90, # 'Z' params: true-label, false-label, byte nbytes, addresses of conds strings until first of the 2 labels is reached (normally true-label, unless the whole neartest is negated) - 'typetest':100, # 'd' for needAnnoType; params: false-label, byte num of acceptable type numbers, (string of) bytes acceptable type numbers - # RESERVED by additional_compact_opcodes: - # 128-255 for short jumps, 0-19 for short switchbyte (1 to 20 items), 20-31 for short typetest (1 to 12 types) } def __init__(self): self.l = [] @@ -1775,10 +1751,6 @@ class BytecodeAssembler: self.addBytes(len(byteArray)-1) # num of bytes in list - 1 (so all 256 values can be accounted for if needed) self.addBytes("".join(byteArray)) for i in labelArray: self.addRef(i) - def addTypetest(self,okATypeList,falseLabel): - self.addOpcode('typetest') ; self.addRef(falseLabel) - self.addBytes(len(okATypeList)) - self.addBytes("".join(chr(n) for n in okATypeList)) def addActions(self,actionList): # assert type(actionList) in [list,tuple], repr(actionList) for a in actionList: @@ -1931,21 +1903,13 @@ class BytecodeAssembler: if 1 <= numItems <= 20: numLabels = numItems+1 # there's an extra default label at the end origOperandsLen = 1+numItems+numLabels*addrSize # number + N bytes + the labels - if all(0 <= LGet(src[count+N],origOperandsLen) <= 0xFF for N in xrange(3,3+numLabels)): - src[count] = i = src[count+1]+src[count+2]+''.join(chr(LGet(src[count+N],origOperandsLen)) for N in xrange(3,3+numLabels)) # opcode_including_nItems, string of bytes, offsets + if LGet(src[count+3],origOperandsLen)==0 and all(0 <= LGet(src[count+N],origOperandsLen) <= 0xFF for N in xrange(4,3+numLabels)): + src[count] = i = src[count+1]+src[count+2]+''.join(chr(LGet(src[count+N],origOperandsLen)) for N in xrange(4,3+numLabels)) # opcode_including_nItems, string of bytes, offsets (assume 1st offset is 0 so not listed) del src[count+1:count+3+numLabels] - newOperandsLen = numItems*2+1 # for each byte, the byte itself and an offset, + 1 more offset + newOperandsLen = numItems*2 # for each byte, the byte itself and an offset, + 1 more offset as default, - 1 because first is not given compacted += origOperandsLen-newOperandsLen bytesFromEnd -= origOperandsLen # will add new opCode + operands below compaction_types.add(opcode) - elif opcode=="typetest" and 1<=src[count+1]<=12 and 0 <= LGet(src[count+1],addrSize+1+ord(src[count+2])) <= 0xFF: # similarly with the short version of typetest: - numItems = ord(src[count+2]) - instrLen = numItems+2 # N acceptable types + short-jump + opcode-including-N - src[count] = i = chr(numItems+19)+chr(LGet(src[count+1],addrSize))+src[count+3] # we assume all acceptable annotation-type numbers are in one string at count+3 (after count+1 is falseLabel and count+2 is number of acceptable annotation types) - compacted += addrSize # as full instruction is opcode + falseLabel + byte for N + n bytes, compacted instruction is opcode_including_N + 1 (shortened falseLabel) + n bytes, so difference is (addrSize-1) + 1 = addrSize - bytesFromEnd -= addrSize+1+ord(src[count+2]) - compaction_types.add(opcode) - del src[count+1:count+4] # jumpIfFalse, numItems, itemString elif type(i) in [int,tuple]: # labels if type(i)==int: i2 = i else: i2 = i[0] @@ -2008,9 +1972,7 @@ js_start = '/* Javascript '+version_stamp+r""" Usage: - You could just include this code and then call the - annotate() function i.e. var result = annotate(input""" -if needAnnoType: js_start += ",annotation_type" -js_start += r""") + annotate() function i.e. var result = annotate(input) - Or you could use (and perhaps extend) the Annotator object, and call its annotate() method. If you have @@ -2028,11 +1990,7 @@ js_start += r""") var Annotator={ version: '"""+version_stamp+"',\n" js_end = r""" -annotate: function(input""" -if needAnnoType: js_end += ",annotation_type" -js_end += ") {" -if needAnnoType: js_end += "\n if (annotation_type==undefined) annotation_type=0;" -js_end += r""" +annotate: function(input) { /* TODO: if input is a whole html doc, insert css in head (e.g. from annoclip and/or adjuster), and hope there's no stuff that's not to be annotated (form fields...) */ @@ -2064,9 +2022,16 @@ function s() { } function readData() { - var sPos = new Array(); + var sPos = new Array(), c; while(1) { - switch(data.charCodeAt(dPtr++)) { + c = data.charCodeAt(dPtr++); + if (c & 0x80) dPtr += (c&0x7F); + else if (c < 20) { + var i = ((p>=input.length)?-1:data.slice(dPtr,dPtr+(++c)).indexOf(input.charAt(p++))); + if (i==-1) i = c; + if(i) dPtr += data.charCodeAt(dPtr+c+i-1); + dPtr += c+c; + } else switch(c) { case 50: dPtr = readAddr(); break; case 51: { var f = readAddr(); var dO=dPtr; @@ -2117,16 +2082,7 @@ function readData() { var found = 0; while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; } dPtr = found ? tPtr : fPtr; break; - }""" -if needAnnoType: js_end += r""" - case 100: { - var fPtr = readAddr(); - var okbytes = data.charCodeAt(dPtr++); - var found = 0; - while(okbytes--) if(data.charCodeAt(dPtr++)==annotation_type) found=1; - if(!found) dPtr=fPtr; break; - }""" -js_end += r""" + } default: throw("corrupt data table at "+(dPtr-1)+" ("+data.charCodeAt(dPtr-1)+")"); } } @@ -2140,17 +2096,9 @@ if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; } return decodeURIComponent(escape(output.join(""))); // from UTF-8 back to Unicode } // end of annotate function }; -function annotate(input""" -if needAnnoType: js_end += ",aType" -js_end += r""") { return Annotator.annotate(input""" -if needAnnoType: js_end += ",aType" -js_end += r"""); } - -if (typeof Backbone != "undefined" && Backbone.Model) { Annotator = Backbone.Model.extend(Annotator); annotate=function(input""" -if needAnnoType: js_end += ",aType" -js_end += r""") { return new Annotator().annotate(input""" -if needAnnoType: js_end += ",aType" -js_end += r""") } } +function annotate(input) { return Annotator.annotate(input); } + +if (typeof Backbone != "undefined" && Backbone.Model) { Annotator = Backbone.Model.extend(Annotator); annotate=function(input) { return new Annotator().annotate(input) } } if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) { // Node.js command-line test fs=require('fs'); @@ -2170,13 +2118,10 @@ py_start = '# Python '+version_stamp+r""" # 'ruby' (default), 'raw' (annotation only) or 'braces'. """ -if needAnnoType: py_start += "# Optional third argument is annotation type number.\n\n" py_end = r""" class Annotator: version="""+'"'+version_stamp+r"""" - def __call__(self,inStr,aFormat""" -if needAnnoType: py_end += ",annotation_type" -py_end += r"""): + def __call__(self,inStr,aFormat): if aFormat=="ruby": self.startA,self.midA,self.endA = "<ruby><rb>","</rb><rt>","</rt></ruby>" elif aFormat=="raw": self.startA=self.midA=self.endA = "" elif aFormat=="braces": self.startA,self.midA,self.endA = "{","|","}" @@ -2188,9 +2133,7 @@ py_end += r"""): self.p = 0 # read-ahead pointer self.copyP = 0 # copy pointer self.output = [] - self.needSpace = 0 ; out = self.output""" -if needAnnoType: py_end += " ; self.annotation_type = annotation_type" -py_end += r""" + self.needSpace = 0 ; out = self.output while self.p < self.inputLength: oldPos = self.p self.dPtr = 1 ; self.readData() @@ -2271,34 +2214,15 @@ py_end += r""" if self.readRefStr() in tStr: found = 1 ; break if found: self.dPtr = tPtr - else: self.dPtr = fPtr""" -if needAnnoType: py_end += r""" - elif d==100: - fPtr = self.readAddr() - nOK = ord(data[self.dPtr]) ; self.dPtr += 1 - if chr(self.annotation_type) in data[self.dPtr:self.dPtr+nOK]: self.dPtr += nOK else: self.dPtr = fPtr -""" -py_end += r""" else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1]))+")") -def annotate(inStr,p="ruby" """[:-1] -if needAnnoType: py_end += ",annotation_type=0" -py_end += r"""): return Annotator()(inStr,p""" -if needAnnoType: py_end += ",annotation_type" -py_end += r""") +def annotate(inStr,p="ruby"): return Annotator()(inStr,p) def main(): - import sys ; aFormat = 'ruby'""" -if needAnnoType: py_end += " ; aType = 0" -py_end += r""" + import sys ; aFormat = 'ruby' for a in sys.argv[1:]: - if a.startswith("--"): aFormat=a[2:]""" -if needAnnoType: py_end += r""" - else: aType = int(a)""" -py_end += r""" - sys.stdout.write(annotate(sys.stdin.read(),aFormat""" -if needAnnoType: py_end += ",aType" -py_end += r""")) + if a.startswith("--"): aFormat=a[2:] + sys.stdout.write(annotate(sys.stdin.read(),aFormat)) if __name__=="__main__": main() """ # TODO: annotation-type option from command line in py @@ -2333,13 +2257,7 @@ static void readData() { unsigned char byte=(unsigned char)NEXTBYTE; int i; for (i=0; i<c; i++) if(byte==dPtr[i]) break; - dPtr += c+c+1 + dPtr[c+i]; // relative from end of switch (after all bytes, 1-byte addresses and the 1-byte default address: up to 256 bytes after)""" -if needAnnoType: c_datadrive += r""" - } else if(c < 32) { // typetest with short jumps - unsigned char falseOffset=*dPtr++; int found=0; - for(c-=19;c--;) if(annotation_type==*dPtr++) { found=1; dPtr+=c; break; } - if(!found) dPtr += falseOffset;""" -c_datadrive += r""" + if(i) dPtr += dPtr[c+i-1]; dPtr += c+c; // relative from end of switch (after all bytes, 1-byte addresses (except 1st) and the 1-byte default address) } else switch(c) { case 50: /* jump */ dPtr = readAddr(); break; case 51: /* call */ { @@ -2382,15 +2300,7 @@ c_datadrive += r""" unsigned char *falsePtr = readAddr(); setnear(*dPtr++); int found=0; while(dPtr < truePtr && dPtr < falsePtr) if(near((char*)readAddr())) { found = 1; break; } - dPtr = found ? truePtr : falsePtr; break; }""" -if needAnnoType: c_datadrive += r""" - case 100: /* typetest */ { - unsigned char *falsePtr = readAddr(); - int nOK = *dPtr++, found=0; - while(nOK--) if(*dPtr++==annotation_type) { - found=1; dPtr += nOK; break; - } if(!found) dPtr = falsePtr; break; }""" -c_datadrive += r""" + dPtr = found ? truePtr : falsePtr; break; } // default: TODO: error about corrupt data? } } @@ -3208,6 +3118,8 @@ def c_escapeRawBytes(s): # as it won't be valid outcode; don't want to crash any if s.endswith(chr(0)): s=s[:-1] # as the C compiler will add a terminating 0 anyway return re.sub(r"(?<!\\)((?:\\\\)*\\x..)([0-9a-fA-F])",r'\1""\2',zapTrigraphs(s.replace('\\','\\\\').decode('unicode_escape').encode('unicode_escape').replace('"','\\"'))) +def js_escapeRawBytes(s): return re.sub("[\x00-\x1f\x7f-\xff]",lambda m:"\\x%02x"%ord(m.group()),re.sub(chr(0)+r"(?![0-9])",r"\\0",s.replace("\\",r"\\").replace('"',r'\"').replace(chr(8),r"\b").replace(chr(9),r"\t").replace(chr(10),r"\n").replace(chr(12),r"\f"))) # TODO: could also convert chars 1-7 (without following digits) to single-digit octal and 11 + 13-31 (without following digits) to double-digit octal, but deprecated in ECMAScript 5 (errors in strict mode); 11 = \v but not in MSIE 8 or below + def c_length(unistr): return len(unistr.encode(outcode)) if java or c_sharp or golang: @@ -3350,7 +3262,7 @@ def outputParser(rulesAndConds): outfile.write(js_start) b = BytecodeAssembler() b.addActionDictSwitch(byteSeq_to_action_dict,False) - outfile.write("data: "+repr(b.link())+",\n") + outfile.write("data: \""+js_escapeRawBytes(b.link())+"\",\n") del b ; outfile.write(js_end+"\n") return # skip all of below (including no_summary etc) if python: -- GitLab