FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
annogen.py 378 KiB
Newer Older
else: js_6bit_offset = 0

class BytecodeAssembler:
  # Bytecode for a virtual machine run by the Javascript version etc
  # The program is built up in self.l with the add* methods
  # below, and link() is then called (once) to turn it into the
  # final output.  The table below gives the byte value that
  # link() emits for each named opcode; the quoted character in
  # each comment is the ASCII character with that value.  Byte
  # values listed as RESERVED are not in the table because the
  # opcode-compaction stage of link() generates them directly.
  opcodes = {
    # 0-19    RESERVED for short switchbyte (C,Java,Py)
    'jump': 50, # '2' params: address
    'call': 51, # '3' params: function address
    'return': 52, # '4' (or 'end program' if top level)
    'switchbyte': 60, # '<' switch(NEXTBYTE) (params: numBytes-1, bytes (sorted, TODO take advantage of this), addresses, default address)
    's0':70, # 'F'
    'copyBytes':71,'o':72,'o2':73, # 'G','H','I' (don't change these numbers, they're hard-coded below)
    # 74-76 ('J','K','L') reserved for 'above + return'
    'savepos':80, # 'P', local to the function
    'restorepos':81, # 'Q'
    'neartest':90, # 'Z' params: true-label, false-label, byte nbytes, addresses of conds strings until first of the 2 labels is reached (normally true-label, unless the whole neartest is negated)
    # 91-107 RESERVED for short switchbyte (JS, UTF-8 printability optimisation for 6bit)
    # 108-127 RESERVED for short switchbyte (JS,Dart, more in the printable range to reduce escaping a bit)
    # 128-255 RESERVED for short jumps
  }
  def __init__(self):
    self.l = [] # code list
    self.d2l = {} # definition to label
    self.lastLabelNo = 0
    self.addingPosStack = []
  def addOpcode(self,opcode): self.l.append((opcode,))
  def addBytes(self,bStr):
      if type(bStr)==int: self.l.append(B(chr(bStr)))
      elif type(bStr)==bytes: self.l.append(bStr)
      else: raise Exception("unspported bytes type")
  def startAddingFunction(self):
      self.addingPosStack.append((len(self.l),self.lastLabelNo))
      self.lastLabelNo = 0
  def finishFunctionAndAddCall(self):
      # make sure to add a return instruction before this!
      fPtr, self.lastLabelNo = self.addingPosStack[-1]
      del self.addingPosStack[-1]
      fBody = tuple(self.l[fPtr:]) ; self.l=self.l[:fPtr]
      if not fBody in self.d2l: # not a duplicate
          self.d2l[fBody] = (-len(self.d2l)-1,)
      self.addOpcode('call')
      self.l.append(self.d2l[fBody])
  def addByteswitch(self,byteArray,labelArray):
      assert len(byteArray) + 1 == len(labelArray)
      # labelArray has the default case added also (TODO: could re-organize code so the bytes immediately after the switch are either the default or one of the items, saving 1 address)
      if not len(byteArray): return # empty switch = no-op
      self.addOpcode('switchbyte')
      self.addBytes(len(byteArray)-1) # num of bytes in list - 1 (so all 256 values can be accounted for if needed)
      self.addBytes(b"".join(byteArray))
      for i in labelArray: self.addRef(i)
  def addActions(self,actionList):
    # assert type(actionList) in [list,tuple], repr(actionList)
    for a in actionList:
      if a==b's0':
        self.addOpcode('s0') ; continue
      assert 1 <= len(a) <= 3 and type(a[0])==int and all(type(b)==bytes for b in a[1:]), repr(a)
      assert 1 <= a[0] <= 255, "bytecode currently supports markup or copy between 1 and 255 bytes only, not %d (but 0 is reserved for expansion)" % a[0]
      self.addBytes(70+len(a)) # 71=copyBytes 72=o() 73=o2
      if js_6bit:
        self.addBytes((a[0]+(js_6bit_offset-1))&0xFF)
      else: self.addBytes(a[0]) # num i/p bytes to copy
      for i in a[1:]: self.addRefToString(i)
  def addActionDictSwitch(self,byteSeq_to_action_dict,isFunc=True,labelToJump=None):
    """Emit bytecode that switches on the next input bytes and performs the matching actions.

    byteSeq_to_action_dict maps byte sequences to lists of
    (action, conds) pairs.  The entry under b"" (if any) is what
    applies at this point without reading further bytes; the
    other keys are handled by switching on their first byte and
    recursing on the remainder.  Each action is an action list
    as taken by addActions.  conds is either false (the action
    is unconditional), a tuple (negate, conds, nbytes), or a
    collection of strings (in which case nbytes is ybytes_max).

    If isFunc, the code is added as a function (identical
    functions are shared) and a call to it is emitted.
    See below for labelToJump.
    """
    # a modified stringSwitch for the bytecode
    # Actions aren't strings: they list tuples of either
    # 1, 2 or 3 items for copyBytes, o(), o2()
    # labelToJump is a jump to insert afterwards if not isFunc and if we don't emit an unconditional 'return'.  Otherwise, will ALWAYS end up with a 'return' (even if not isFunc i.e. the main program)
    # the set of distinct first bytes of the non-empty keys
    allBytes = set(b[:1] for b in iterkeys(byteSeq_to_action_dict) if b)
    # savePos is the index in self.l of the 'savepos' opcode if one was added, else None
    if isFunc:
        self.startAddingFunction()
        savePos = len(self.l)
        self.addOpcode('savepos')
    elif (b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1) or not labelToJump: # ('not labelToJump' and 'not isFunc' == main program)
        savePos = len(self.l)
        self.addOpcode('savepos')
    else: savePos = None
    # Special case: the b"" entry is a single unconditional action list, and every entry is a single unconditional action list that starts with it.  Emit that common start once, then handle what is left of the longer entries in a subfunction.
    if b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[b""])==1 and not byteSeq_to_action_dict[b""][0][1] and all((len(a)==1 and a[0][0][:len(byteSeq_to_action_dict[b""][0][0])]==byteSeq_to_action_dict[b""][0][0] and not a[0][1]) for a in itervalues(byteSeq_to_action_dict)):
        self.addActions(byteSeq_to_action_dict[b""][0][0])
        l = len(byteSeq_to_action_dict[b""][0][0])
        byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in iteritems(byteSeq_to_action_dict))
        # take the 'savepos' added above back out (so no 'restorepos' is added below either)
        del self.l[savePos] ; savePos = None
        del byteSeq_to_action_dict[b""]
        self.addActionDictSwitch(byteSeq_to_action_dict) # as a subfunction (ends up adding the call to it, which should be replaced by a jump during compaction; TODO: auto-inline if it turns out there's only this one call to it?  other calls might happen if it's merged with an identical one)
        byteSeq_to_action_dict[b""] = [(b"",[])] # for the end of this func
        self.addOpcode('return')
    elif allBytes:
      # General case: one switchbyte over the first bytes, then the code for each case in turn (added inline, not as functions); the last label is the default case, which each case is told to jump to if it does not return
      allBytes = sorted(list(allBytes))
      labels = [self.makeLabel() for b in allBytes+[0]]
      self.addByteswitch(allBytes,labels)
      for case in allBytes:
        self.addLabelHere(labels[0]) ; del labels[0]
        self.addActionDictSwitch(dict([(k[1:],v) for k,v in iteritems(byteSeq_to_action_dict) if k[:1]==case]),False,labels[-1])
      self.addLabelHere(labels[0])
    if not savePos==None: self.addOpcode('restorepos')
    if isFunc:
        self.addOpcode('return')
        if self.l[-1]==self.l[-2]: del self.l[-1] # double return
        return self.finishFunctionAndAddCall()
    elif b"" in byteSeq_to_action_dict:
        # Not a function: emit the b"" entry's actions here.  Conditional ones each become a 'neartest' that performs the action and returns if its condition holds; an unconditional one (if any) is kept for after all the tests.
        default_action = b""
        for action,conds in byteSeq_to_action_dict[b""]:
            if conds:
                if type(conds)==tuple: negate,conds,nbytes = conds
                else: negate,nbytes = False,ybytes_max
                assert 1 <= nbytes <= 255, "bytecode supports only single-byte nbytes (but nbytes=0 is reserved for expansion)"
                trueLabel,falseLabel = self.makeLabel(),self.makeLabel()
                self.addOpcode('neartest')
                self.addRef(trueLabel)
                self.addRef(falseLabel)
                assert type(nbytes)==int
                self.addBytes(nbytes)
                for c in conds: self.addRefToString(c.encode(outcode)) # TODO: how much bytecode could we save by globally merging equivalent lists of string-list references ?  (zlib helps anyway but...)
                # the references above were emitted in (true, false) order; for a negated test it is the label positions that are swapped, so the action code sits at the 'false' target
                if negate: trueLabel,falseLabel = falseLabel,trueLabel
                self.addLabelHere(trueLabel)
                self.addActions(action)
                self.addOpcode('return')
                self.addLabelHere(falseLabel)
            else: default_action = action
        if default_action or not byteSeq_to_action_dict[b""]:
            self.addActions(default_action)
            self.addOpcode('return') ; return
    # No unconditional return was emitted above: go on to labelToJump if we were given one, otherwise return
    if labelToJump:
        self.addOpcode('jump')
        self.addRef(labelToJump)
    else: self.addOpcode('return')
  def makeLabel(self):
      self.lastLabelNo += 1
      return self.lastLabelNo
  def addLabelHere(self,labelNo):
      assert type(labelNo)==int
      assert labelNo, "label 0 not allowed"
      self.l.append(labelNo)
  def addRef(self,labelNo):
      assert type(labelNo)==int
      self.l.append(-labelNo)
  def addRefToString(self,string):
    """Append a reference to a data string.

    string must be bytes.  It is first framed in the way the
    reader for the target language expects: a length prefix
    where the length can be represented, otherwise a
    termination character placed before and after it, or (for
    C) just a trailing NUL.  The framed string is stored once in
    self.d2l, so identical strings share a reference, and that
    reference is what gets appended to the code list.

    Raises Exception if js_6bit framing is in use and a string
    too long for a length prefix contains every candidate
    termination character (previously such a string was stored
    with no framing at all, which the reader cannot decode).
    """
    assert type(string)==bytes, repr(string)
    l = len(string)
    if python or java or javascript or dart:
      # prepends with a length hint if possible (or if not
      # prepends with 0 and null-terminates it)
      if js_utf8:
        string = unicodedata.normalize("NFC",string.decode('utf-8')) # NFC very important for browser_extension: some browsers seem to do it anyway, throwing off data addresses if we haven't accounted for that
        l = len(string) # we count in UCS-2 characters
        assert all((ord(c) <= 0xFFFF) for c in string), "js_utf8 addressing will be confused by non UCS-2: "+repr(string) # TODO: put surrogate pairs? (and increase l by num pairs; ensure Python will emit separate UTF-8 sequences for each part of the surrogate, if needed by JS; might need to escape the pairs after all addresses computed) + what if we're in dart ?
        # Have checked browsers + Node count combining characters separately, so len(string) should be correct (e.g. u'Moc\u0306nik')
        if 1 <= l < 0x02B0: # can use length-first unichr (avoid combining and modifier marks just in case; also avoid 0xD800+ surrogates)
          string = unichr(l) + string
        else: string = unichr(0)+string+unichr(0)
      elif js_6bit:
        string = re.sub(b"%(?=[0-9A-Fa-f])|[\x7f-\xff]",lambda m:urllib.quote(m.group()),string) # for JS 'unescape' in readRefStr, which is applied (without encodeURIComponent) if js_6bit and not js_utf8 so we can use %-encoding
        l = len(string) # length is needed BEFORE %-decode
        if 1 <= l <= 91: # use 32-122 inclusive
          string = B(chr(l+31))+string
        else: # try to avoid using \x00 for termination
          for termChar in '{|}~\x00': # 123-126 + nul
            termChar=B(termChar)
            if not termChar in string:
              string = termChar + string + termChar
              break
          else:
            # every candidate occurs inside the string, so there is no way to mark where it ends: fail now rather than emit data that cannot be read back
            raise Exception("js_6bit: no termination character is available for string "+repr(string))
      elif 1 <= l < 256: # length byte + string
        string = B(chr(l))+string
      else: string = B(chr(0))+string+B(chr(0))
    else: string += b'\x00' # just null-termination for C
    # file the framed string under the next reference number unless we already have it
    if not string in self.d2l:
      self.d2l[string] = (-len(self.d2l)-1,)
    self.l.append(self.d2l[string])
  def link(self): # returns resulting bytes
    """Assemble everything that has been added and return the result.

    The functions and strings filed in self.d2l are appended
    after the main code, then the whole list is assembled with
    the smallest address size (in units of 8 bits, or 6 bits if
    js_6bit) that turns out to be wide enough, optionally after
    replacing some instructions with shorter forms
    (compact_opcodes).  The first unit of output is the address
    size.  The result is joined into a unicode string if js_utf8
    is set and into bytes otherwise, and is passed through
    zlib.compress if zlib is set.  Progress is reported on
    standard error.

    May be called only once: self.link is replaced by a function
    that raises Exception, and self.d2l is deleted.
    """
    # (add an 'end program' instruction before calling)
    def f(*args): raise Exception("Must call link() only once")
    self.link = f
    sys.stderr.write("Linking... ") ; sys.stderr.flush()
    for dat,ref in sorted(iteritems(self.d2l)): # the functions and data to add to the end of self.l, sorted so we can optimise for overlaps
        assert type(ref)==tuple and type(ref[0])==int
        self.l.append((-ref[0],)) # the label
        if type(dat) in [bytes,unicode]:
            if type(self.l[-2])==type(dat) and self.l[-2][-1]==dat[0]: # overlap of termination-byte indicators (TODO: look for longer overlaps? unlikely to occur)
              self.l[-2] = self.l[-2][:-1]
            self.l.append(dat) ; continue
        # otherwise it's a function, and non-reserved labels are local, so we need to rename them
        l2l = {}
        for i in dat:
            if type(i)==int:
                if i>0: j=i
                else: j=-i
                if not j in l2l:
                    l2l[j] = self.makeLabel()
                if i>0: self.addLabelHere(l2l[j])
                else: self.addRef(l2l[j])
            else: self.l.append(i) # str or tuple just cp
    del self.d2l
    if post_normalise: # must be AFTER d2l, as EOF is used to end it
      normLabel = self.makeLabel()
      self.l.insert(0,-normLabel)
      self.l.append(normLabel)
      bmp = [(k,v) for k,v in sorted(post_normalise.items())]
      # the table is a run of 2-byte values: one no greater than maxRLE means 'skip this many', anything above it is a mapping
      maxRLE = min(bmp[0][0],min(v for k,v in bmp))-1
      # maxRLE must be at least 1: with 0 the 'while delta' loop below could only ever skip 0 at a time and so would never finish
      assert maxRLE > 0, "can't have a mapping to 0 (or a mapping to or from 1, which leaves no values free for skip counts)"
      curPtr = 0
      def lsbmsb(i): return B(chr(i&0xFF)+chr(i>>8))
      for i in xrange(len(bmp)):
        delta = bmp[i][0]-curPtr
        while delta:
          skip = min(delta,maxRLE)
          self.l.append(lsbmsb(skip))
          delta -= skip ; curPtr += skip
        self.l.append(lsbmsb(bmp[i][1]))
        curPtr += 1
    # elements of self.l are now:
    # - (byte) strings (just copied in)
    # - positive integers (labels for code)
    # - negative integers (references to labels)
    # - +ve or -ve integers in tuples (labels for functions and text strings: different 'namespace')
    # strings in tuples: opcodes
    # 1st byte of o/p is num bytes needed per address
    class TooNarrow(Exception): pass
    if js_6bit: aBits,aMask = 6,0x3F
    else: aBits,aMask = 8,0xFF
    # try each address size in turn, starting again with the next one whenever TooNarrow is raised
    for addrSize in xrange(1,256):
        sys.stderr.write("(%d-bit) " % (aBits*addrSize))
        sys.stderr.flush()
        src = self.l[:] # must start with fresh copy, because compaction modifies src and we don't want a false start with wrong addrSize to affect us
        try:
          compacted = 0 ; compaction_types = set()
          if compact_opcodes:
            # The compact opcodes all rely on relative addressing (relative to AFTER the compact instruction) that goes only forward.  Easiest way to deal with that is to work backwards from the end, inlining the compactions, before running a conventional 2-pass assembly.
            # TODO: Could move the below loop into this one in its entirety, and just assemble backwards.  Most within-function label references point forwards anyway.  (Would still need some backward refs for functions though)
            bytesFromEnd = 0
            lDic = {} # labelNo -> bytesFromEnd
            def LGet(lRef,origOperandsLen):
              # return the number of bytes between the end of the new instruction and the label.  Since bytesFromEnd includes origOperandsLen, we need to subtract that out, which would then leave bytes from end of code to end of new instruction (no matter what the length of the new instruction will be)
              if not -lRef in lDic: return -1
              return bytesFromEnd-origOperandsLen-lDic[-lRef]
            counts_to_del = set()
            for count in xrange(len(src)-1,-1,-1):
                i = src[count]
                if type(i) in [bytes,unicode] and len(i)==1 and 71<=ord(i)<=73 and src[count+ord(i)-70+1]==('return',):
                  # (74 to 76 = 71 to 73 + return)
                  src[count] = B(chr(ord(i)+3))
                  counts_to_del.add(count+ord(i)-70+1)
                  compacted += 1 ; bytesFromEnd -= 1
                  compaction_types.add('return')
                elif type(i)==tuple and type(i[0])==str:
                    opcode = i[0]
                    i = "-" # for len() at end of block
                    if opcode=='call' and src[count+2]==('return',):
                      src[count] = ('jump',)
                      counts_to_del.add(count+2)
                      compacted += 1 ; bytesFromEnd -= 1
                      compaction_types.add(opcode)
                      # can't fall through by setting opcode='jump', as the address will be in the function namespace (integer in tuple, LGet would need adjusting) and is highly unlikely to be within range (TODO: unless we try to arrange the functions to make it so for some cross-calls)
                    if opcode=='jump' and 0 <= LGet(src[count+1],addrSize) < 0x80: # we can use a 1-byte relative forward jump (up to 128 bytes), useful for 'break;' in a small switch
                      offset = LGet(src[count+1],addrSize)
                      if offset == 0:
                        # can remove this jump completely
                        i = "" # for len() at end of block
                        compacted += 1
                        counts_to_del.add(count) # zap jmp
                      else: src[count] = i = B(chr(0x80 | offset)) # new instr: 0x80|offset
                      counts_to_del.add(count+1) # zap the label
                      compacted += addrSize # as we're having a single byte instead of byte + address
                      bytesFromEnd -= addrSize
                      compaction_types.add(opcode)
                    elif opcode=='switchbyte':
                      numItems = len(src[count+2]) # = ord(src[count+1]) + 1
                      if 1 <= numItems <= 20:
                       numLabels = numItems+1 # there's an extra default label at the end
                       origOperandsLen = 1+numItems+numLabels*addrSize # number + N bytes + the labels
                       if LGet(src[count+3],origOperandsLen)==0 and all(0 <= LGet(src[count+N],origOperandsLen) <= 0xFF-js_6bit_offset for N in xrange(4,3+numLabels)): # 1st label is immediately after the switchbyte, and all others are in range
                        if javascript or dart: # use printable range
                          if js_6bit and numItems<=17 and all(0x80<=ord(x)<=0xBF or 0xD4<=ord(x)<=0xEF for x in S(src[count+2])): # if bytes being switched on are all from UTF-8 representations of U+0500 through U+FFFF, move to printable range (in one test this saved 780k for the continuation bytes and another 200k for the rest)
                            def mv(x):
                              if x>=0xD4: x -= 20 # or, equivalently, if (x-93)>118, which is done to the input byte in JS before searching on these
                              return B(chr(x-93))
                            src[count+2]=b''.join(mv(ord(x)) for x in S(src[count+2]))
                            i = B(chr(ord(src[count+1])+91)) # and a printable opcode
                          else: i = B(chr(ord(src[count+1])+108)) # can't make the match bytes printable, but at least we can have a printable opcode 108-127 for short switchbyte in Javascript or Dart
                        else: i = B(src[count+1]) # 0-19 for short switchbyte in C,Java,Python
                        src[count] = i = i+src[count+2]+b''.join(B(chr(LGet(src[count+N],origOperandsLen)+js_6bit_offset)) for N in xrange(4,3+numLabels)) # opcode_including_nItems, string of bytes, offsets (assume 1st offset at count+3 is 0 so not listed)
                        for ctd in xrange(count+1,count+3+numLabels): counts_to_del.add(ctd)
                        newOperandsLen = numItems*2 # for each byte, the byte itself and an offset, + 1 more offset as default, - 1 because first is not given
                        compacted += origOperandsLen-newOperandsLen
                        bytesFromEnd -= origOperandsLen # will add new opCode + operands below
                        compaction_types.add(opcode)
                elif type(i) in [int,tuple]: # labels
                    if type(i)==int: i2 = i
                    else: i2 = i[0]
                    assert type(i2)==int
                    if i2 > 0:
                        lDic[i] = bytesFromEnd ; i = ""
                        if bytesFromEnd >> (aBits*addrSize+1): raise TooNarrow() # fair assumption (but do this every label, not every instruction)
                    else: i = "-"*addrSize # a reference
                bytesFromEnd += len(i)
            src=[s for s,i in zip(src,xrange(len(src))) if not i in counts_to_del] # batched up because del is O(n)
          # End of compact_opcodes
          lDic = {} # label dictionary: labelNo -> address
          # two passes over src: the first finds the address of every label, the second has them all available when writing the references
          for P in [1,2]:
            r = [B(chr(addrSize))] # List to hold the output bytecode, initialised with a byte indicating how long our addresses will be.
            ll = 1 # cumulative length of output list, normally in bytes, but if js_utf8 then we count in Javascript (UCS-2) characters
            count = 0 # reading through src opcodes etc
            while count < len(src):
                i = src[count] ; count += 1
                if type(i)==tuple and type(i[0])==str: i = B(chr(BytecodeAssembler.opcodes[i[0]]))
                elif type(i) in [int,tuple]: # labels
                    if type(i)==int: i2,iKey = i,-i # +ve integers are labels, -ve integers are references to them
                    else: i2,iKey = i[0],(-i[0],) # reserved labels (a different counter)
                    assert type(i2)==int
                    # At this point, if i2<0 then iKey will be the lDic key for looking up the label.
                    if i2 > 0: # label going in here: set lDic etc (without outputting any bytes of course)
                        if (ll >> (aBits*addrSize)): raise TooNarrow() # on the assumption that somebody will reference this label, figure out early that we need more bits
                        if i in lDic:
                          assert lDic[i] == ll, "%s moved %d->%d" % (repr(i),lDic[i],ll)
                        lDic[i] = ll ; i = ""
                    elif iKey in lDic: # known label
                        i = lDic[iKey] # the address to convert to MSB-LSB bytes and output:
                        shift = aBits*addrSize
                        if (i >> shift): raise TooNarrow()
                        j = []
                        for b in xrange(addrSize):
                            # MSB-LSB (easier to do in JS)
                            shift -= aBits
                            j.append(B(chr(((i>>shift)&aMask)+js_6bit_offset)))
                        i = b"".join(j)
                        assert len(i)==addrSize
                    else: # ref to as-yet unknown label
                        assert P==1, "undefined label %d" % -i
                        i = B("-"*addrSize) # placeholder (well we could just advance ll, but setting this makes things easier if you ever want to inspect partial results)
                if len(i): # bytes or Unicode
                  r.append(i) ; ll += len(i)
            sys.stderr.write(".") ; sys.stderr.flush()
          if js_utf8: # normalise all before join
            for i in xrange(len(r)):
              if type(r[i])==bytes:
                r[i]=unicode(r[i],'latin1')
            r = u"".join(r)
          else: r = b"".join(r)
          if zlib:
            self.origLen = ll # needed for efficient malloc in the C code later
            oR,r = r,zlib.compress(r,9)
            if compact_opcodes: sys.stderr.write("%d bytes (%s compressed from %d after opcode compaction saved %d on %s)\n" % (len(r),zlib_name,ll,compacted,','.join(sorted(list(compaction_types)))))
            else: sys.stderr.write("%d bytes (%s compressed from %d)\n" % (len(r),zlib_name,ll))
          elif compact_opcodes: sys.stderr.write("%d bytes (opcode compaction saved %d on %s)\n" % (ll,compacted,','.join(sorted(list(compaction_types)))))
          else: sys.stderr.write("%d bytes\n" % ll)
          return r
        except TooNarrow: pass
    assert 0, "can't even assemble it with 255-byte addressing !?!"

# Start of the generated Javascript: a comment explaining how to use it (not included for browser extensions).
# Everything added to js_start has to be bytes: adding a str literal to it raises TypeError on Python 3.
if not browser_extension:
  js_start = b'/* Javascript '+version_stamp+br"""

Usage:

 - You could just include this code and then call the
   annotate() function i.e. var result = annotate(input"""
  if sharp_multi: js_start += b", annotation_type_number"
  if glossfile: js_start += b", lines=2"
  js_start += b")"
  if not os.environ.get("JS_OMIT_DOM",""):
    js_start += br"""

   or, if you're in a browser and have loaded a page,
   annotate_page("""
    if sharp_multi:
      js_start += b"annotation_type_number"
      if glossfile: js_start += b","
    if glossfile: js_start += b"lines=2"
    js_start += br""")
   (run annogen with JS_OMIT_DOM environment variable set
   if you want to omit the annotate_page code)"""
  js_start += br"""

 - Or you could use (and perhaps extend) the Annotator
   object, and call its annotate() method.  If you have
   Backbone.JS, Annotator will instead be a generator
   (extending Backbone.Model) which you will have to
   instantiate yourself (possibly after extending it).
   The Annotator object/class is also what will be
   exported by this module if you're using Common.JS.

 - On Unix systems with Node.JS, you can run this file in
   "node" to annotate standard input as a simple test.
"""
  if zlib:
    js_start += br"""
   zlib'd version uses Uint8Array so has minimum browser requirements
   (Chrome 7, Ffx 4, IE10, Op11.6, Safari5.1, 4.2 on iOS)
   - generate without --zlib to support older browsers.
*/"""
  else: js_start += b"*/"
# Open the Annotator object and its annotate() method.
# NOTE(review): when browser_extension is set, nothing visible before this point assigns js_start, so it is presumably initialised earlier in the file - confirm.
js_start += b"var Annotator={\n"
if not browser_extension:
  js_start += b" version: '"+version_stamp+b"',\n"
  if glossfile: js_start += b"numLines: 2 /* override to 1 or 3 if you must, but not recommended for learning */,\n"
if sharp_multi: js_start += b"annotate: function(input,aType) { if(aType==undefined) aType=0;"
else: js_start += b"annotate: function(input) {"
if removeSpace: js_start += br" input=input.replace(/\B +\B/g,'');" # TODO: document that we do this (currently only in JS annotator here, and Android app via jsAnnot, although Web Adjuster does it separately in Python before calling the filter).  It deals with software that adds ASCII spaces between Chinese characters of the same word, without deleting spaces between embedded English words (TODO: this 'JS + app' version may still delete spaces between punctuation characters, which may be an issue for consecutive quoted words e.g. 'so-called "word1" "word2"').  If doing it at the nextbyte level, we'd have to update prevbyte; if this or doing it at switchbyte level (e.g. recurse) we'd have to do something about the copy pointer (skip the spaces?) and the near-call distance (and associated buffer sizes in C) so they're best pre-removed, but only from between characters we annotate.
js_start += br"""
input = unescape(encodeURIComponent(input)); // to UTF-8
var data = this.data""" # TODO: if input is a whole html doc, insert css in head (e.g. from annoclip and/or adjuster), and hope there's no stuff that's not to be annotated (form fields etc).  But really want them to be using browser_extension or annotate_page if doing this (TODO add css to annotate_page, already there in browser_extension)
if glossfile: js_start += b", numLines = this.numLines"
js_start += br""";
var addrLen = data.charCodeAt(0);
var dPtr, inputLength = input.length;
var p = 0; // read-ahead pointer
var copyP = 0; // copy pointer
var output = new Array(), needSpace = 0;

function readAddr() {
  var i,addr=0;
  for (i=addrLen; i; i--) addr=(addr << """
# readAddr() undoes the address encoding done in BytecodeAssembler.link: most significant unit first, 6 bits per character (less js_6bit_offset) if js_6bit, otherwise 8
if js_6bit: js_start += b"6) | (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset))+b");"
else: js_start += b"8) | data.charCodeAt(dPtr++);"
js_start += br"""
  
  return addr;
}

function readRefStr() {
  var a = readAddr(); var l=data.charCodeAt(a);"""
# readRefStr() undoes the string framing done in BytecodeAssembler.addRefToString: with js_6bit (and not js_utf8) a first character below 123 is the length plus 31 and anything else is the termination character; otherwise a non-zero first character is the length and 0 means the string is terminated by 0
if js_6bit and not js_utf8:
  js_start += br"""
  if(l && l<123) a = data.slice(a+1,a+l-30);
  else a = data.slice(a+1,data.indexOf(data.charAt(a),a+1));"""
elif zlib: js_start += br"""
  if (l != 0) a = data.slice(a+1,a+l+1);
  else a = data.slice(a+1,data.indexOf(0,a+1));"""
else: js_start += br"""
  if (l != 0) a = data.slice(a+1,a+l+1);
  else a = data.slice(a+1,data.indexOf('\x00',a+1));"""
# how readRefStr() turns what it sliced out into the string to return
if zlib: js_start += b"return String.fromCharCode.apply(null,a)" # gets UTF-8 from Uint8array
elif js_utf8: js_start += b"return unescape(encodeURIComponent(a))" # Unicode to UTF-8 (TODO: or keep as Unicode? but copyP things will be in UTF-8, as will the near tests)
elif js_6bit: js_start += b"return unescape(a)" # %-encoding
else: js_start += b"return a"
# s() handles the space between annotated words; readData() is the bytecode interpreter, starting with the short forward jumps (opcodes of 128 and above: the low 7 bits are the distance)
js_start += br"""}
function s() {
  if (needSpace) output.push(" ");
  else needSpace=1; // for after the word we're about to write (if no intervening bytes cause needSpace=0)
}

function readData() {
    var sPos = new Array(), c;
    while(1) {
        c = data.charCodeAt(dPtr++);
        if (c & 0x80) dPtr += (c&0x7F);"""
# The 'short switchbyte' instructions generated by opcode compaction in BytecodeAssembler.link: the opcode gives the number of bytes to match, and is followed by those bytes and then by 1-byte forward offsets (none for the first byte, whose code follows immediately, and the last one being for the default case).
# With js_6bit the opcodes are 91 upwards and the input byte is moved into the printable range, in the same way as the match bytes were, before it is looked up; otherwise the opcodes are 108 upwards.
# js_6bit_offset has to go through B() like everywhere else it is added to js_start, as str cannot be added to bytes on Python 3.
if js_6bit: js_start += br"""
        else if (c > 90) { c-=90; 
            var i=-1;if(p<input.length){var cc=input.charCodeAt(p++)-93; if(cc>118)cc-=20; i=data.slice(dPtr,dPtr+c).indexOf(String.fromCharCode(cc))}
            if (i==-1) i = c;
            if(i) dPtr += data.charCodeAt(dPtr+c+i-1)-"""+B(str(js_6bit_offset))+br""";
            dPtr += c+c }"""
else: js_start += br"""
        else if (c > 107) { c-=107;
            var i = ((p>=input.length)?-1:data.slice(dPtr,dPtr+c).indexOf(input.charAt(p++)));
            if (i==-1) i = c;
            if(i) dPtr += data.charCodeAt(dPtr+c+i-1);
            dPtr += c+c;
        }"""
js_start += br""" else switch(c) {
            case 50: dPtr = readAddr(); break;
            case 51: {
              var f = readAddr(); var dO=dPtr;
              dPtr = f; readData() ; dPtr = dO;
              break; }
            case 52: return;
            case 60: {
              var nBytes = data.charCodeAt(dPtr++)+1;
              var i = ((p>=input.length)?-1:data.slice(dPtr,dPtr+nBytes).indexOf(input.charAt(p++)));
              if (i==-1) i = nBytes;
              dPtr += (nBytes + i * addrLen);
              dPtr = readAddr(); break; }
            case 70: if(needSpace) { output.push(' '); needSpace=0; } break;
            case 71: case 74: {
              var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
              var base = input.slice(copyP, copyP + numBytes);
              output.push(base);
              copyP += numBytes;
              if(c==74) return; break; }
            case 72: case 75: {
              var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
              var annot = readRefStr();
              var base = input.slice(copyP, copyP + numBytes); copyP += numBytes;
              s();"""
if glossfile: js_start += br"""
              switch (numLines) {
                case 1:
                  output.push("<ruby><rb>");
                  output.push(base);
                  output.push("</rb></ruby>");
                  break;
                case 3:
                  output.push("<ruby><rt>&nbsp;</rt><rb>");
                  output.push(annot); output.push("</rb><rb>");
                  output.push(base);
                  output.push("</rb></ruby>");
                  break;
                default:"""
js_start += br"""
                  output.push("<ruby><rb>");
                  output.push(base);
                  output.push("</rb><rt>");
                  output.push(annot);
                  output.push("</rt></ruby>")"""
if glossfile: js_start += b"}"
else: js_start += b";"
js_start += br"""
              if(c==75) return; break; }"""
if glossfile: js_start += br"""
            case 73: case 76: {
              var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
              var annot = readRefStr();
              var title = readRefStr();
              var base = input.slice(copyP, copyP + numBytes); copyP += numBytes;
              s();
              switch (numLines) {
                case 1:
                  output.push("<ruby title=\"");
                  output.push(title);
                  output.push("\"><rb>");
                  output.push(base);
                  output.push("</rb>");
                  output.push("</ruby>");
                  break;
                case 3:
                  output.push("<ruby title=\"");
                  output.push(title);
                  output.push("\"><rt>");
                  output.push(title.match(/[^/(;]*/)[0]);
                  output.push("</rt><rb>");
                  output.push(annot);
                  output.push("</rb><rb>");
                  output.push(base);
                  output.push("</rb></ruby>");
                  break;
                default:
                  output.push("<ruby title=\"");
                  output.push(title);
                  output.push("\"><rb>");
                  output.push(base);
                  output.push("</rb><rt>");
                  output.push(annot);
                  output.push("</rt></ruby>") }
              if(c==76) return; break; }"""
# Without 6-bit encoding, rewrite the offset-and-mask byte read in the JS
# emitted so far into a plain charCodeAt.  This runs before the text below
# is appended, so it only affects the earlier part of js_start.
if not js_6bit: js_start = js_start.replace(b"(data.charCodeAt(dPtr++)-34)&0xFF",b"data.charCodeAt(dPtr++)")
# Remaining interpreter cases (80 savepos, 81 restorepos, 90 neartest), the
# "corrupt data table" default, and the top-level loop that copies one input
# character straight through whenever readData() leaves p unchanged.
js_start += br"""
            case 80: sPos.push(p); break;
            case 81: p=sPos.pop(); break;
            case 90: {
                var tPtr = readAddr();
                var fPtr = readAddr();
                var nearbytes = data.charCodeAt(dPtr++);
  var o=p;
  if (o > nearbytes) o -= nearbytes; else o = 0;
  var max = p + nearbytes;
  if (max > inputLength) max = inputLength;
  var tStr = input.slice(o,max);
                var found = 0;
                while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; }
                dPtr = found ? tPtr : fPtr; break;
                }
        default: throw("corrupt data table at "+(dPtr-1)+"/"+data.length+" ("+c+")");
            }
        }
    }

while(p < inputLength) {
var oldPos=p;
dPtr=1;readData();
if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; }
}
return decodeURIComponent(escape(output.join("")))"""
# 6-bit mode: every "var numBytes = data.charCodeAt(dPtr++);" read in the
# whole of js_start gets (js_6bit_offset-1) subtracted and is masked to a byte.
if js_6bit: js_start = js_start.replace(b"var numBytes = data.charCodeAt(dPtr++);",b"var numBytes = (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset-1))+b")&0xFF;")
if sharp_multi: js_start += br""".replace(new RegExp("(</r[bt]><r[bt]>)"+"[^#]*#".repeat(aType)+"(.*?)(#.*?)?</r","g"),"$1$2</r")""" # normally <rt>, but this regexp will also work if someone changes the generated code to put annotation into second <rb> and title into <rt> as long as annotation is not given first.  Cannot put [^#<] as there might be <sup> etc in the annotation, and .*?# still matches across ...</rb><rt>... :-(
# Terminate the return statement and emit one closing brace
# (presumably the end of the annotate method; its opening is above this view).
js_start += br"""; // from UTF-8 back to Unicode
}"""
# Outside the browser extension more text follows the annotate method inside
# the same generated object (per the trailing comment, a "data: ..." member),
# so the separating comma has to be appended to js_start.  A bare bytes
# literal in statement position is evaluated and discarded, emitting nothing.
if not browser_extension: js_start += b", // end of annotate method\n" # data: ... \n goes here
# Close the Annotator object and emit the global annotate() wrapper.
# aType is passed through only for sharp_multi; numLines (defaulting to 2 in
# the generated JS) is accepted and stored on Annotator only with a glossfile.
js_end = b"".join([
  br"""};
function annotate(input""",
  b",aType" if sharp_multi else b"",
  b",numLines" if glossfile else b"",
  b") { ",
  b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; " if glossfile else b"",
  b"return Annotator.annotate(input",
  b",aType" if sharp_multi else b"",
  b")}",
])
if browser_extension:
  # Extension glue: keeps aType/numLines in localStorage and defines
  # handleMessage().  A number request changes a setting (negative = numLines,
  # otherwise aType) and re-runs annotWalk in every tab; a boolean request is
  # the popup's status query; anything else is text to annotate (null = read
  # the clipboard via getClip()).
  js_end += b"""
if(localStorage.aType===undefined) localStorage.aType=0;
if(localStorage.numLines===undefined) localStorage.numLines=2;
var aType=localStorage.aType,numLines=localStorage.numLines;
function handleMessage(request, sender, sendResponse) {
  if(typeof request=='number') {
    if(request<0) localStorage.numLines=numLines=-request; else {localStorage.aType=aType=request;if(numLines==1)localStorage.numLines=numLines=2}
    (chrome.tabs && chrome.tabs.query?chrome.tabs.query:browser.tabs.query)({},(T)=>{for (let t of T)(chrome.tabs && chrome.tabs.executeScript?chrome.tabs.executeScript:browser.tabs.executeScript)(t.id,{allFrames: true, code: 'for(let c of Array.prototype.slice.call(document.getElementsByClassName("_adjust0")))if(c.oldTxt)c.parentNode.replaceChild(document.createTextNode(c.oldTxt),c); annotWalk(document,document)'})})
  } else if(typeof request=='boolean') sendResponse(request?(numLines==1?-1:aType):numLines); // popup status query
  else { if(request==null) request=getClip();
  sendResponse(numLines>1?annotate(request""" # (we DO need the extra call to annotWalk above: the MutationObserver will NOT pick up on changes we made from here)
  # Extra arguments to the annotate() call opened above must match the
  # wrapper's signature: aType for sharp_multi, numLines for a glossfile.
  if sharp_multi: js_end += b",aType"
  if glossfile: js_end += b",numLines"
  # Finish handleMessage, define getClip(), then fetch annotate-dat.txt into
  # Annotator.data and only afterwards register the message listener.
  js_end += br"""):request)} }
function getClip(){var area=document.createElement("textarea"); document.body.appendChild(area); area.focus();area.value='';document.execCommand("Paste");var txt=area.value; document.body.removeChild(area); return txt?txt:"Failed to read clipboard"}
fetch(chrome.extension.getURL("annotate-dat.txt")).then((r)=>{r.text().then((r)=>{Annotator.data=r;chrome.runtime.onMessage.addListener(handleMessage)})})""" # if not js_utf8, having to encode latin1 as utf8 adds about 25% to the file size, but text() supports only utf8; could use arrayBuffer() instead, but inefficient to read w. DataView(buf,offset,1), or could reinstate zlib (probably using base64 read in from file: would probably need to include a versioned unzip library instead of inline-minified subset)
elif not os.environ.get("JS_OMIT_DOM",""):
  # Not an extension, and JS_OMIT_DOM is unset or empty in the environment:
  # emit annotate_page(), whose parameter list mirrors annotate() minus the
  # input argument.
  js_end += br"""
function annotate_page("""
  if sharp_multi:
    js_end += b"aType"
    if glossfile: js_end += b"," # separator needed only when both parameters are present
  if glossfile: js_end += b"numLines"
  js_end += b") { "
  if glossfile: js_end += b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; "
  # jsAnnot() is defined elsewhere in this file; presumably it returns the JS
  # that defines annotWalk -- confirm there.
  js_end += jsAnnot(False) + br"""return annotWalk(document,document)
}"""
if not browser_extension:
  # Optional integrations for the standalone script.  If Backbone is loaded,
  # Annotator becomes a Backbone.Model subclass and annotate() is redefined to
  # instantiate it.
  js_end += br"""

if (typeof Backbone != "undefined" && Backbone.Model) {
  Annotator = Backbone.Model.extend(Annotator);"""
  # The Backbone-mode wrapper forwards aType only for sharp_multi
  # (numLines is not forwarded in either variant).
  if sharp_multi: js_end += br"""
  annotate=function(input,aType) { return new Annotator().annotate(input,aType) }"""
  else: js_end += br"""
  annotate=function(input) { return new Annotator().annotate(input) }"""
  # Run as the Node.js main module: annotate /dev/stdin to stdout.
  # Loaded as a CommonJS module: export Annotator.
  js_end += br"""
}
if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) {
  // Node.js command-line test
  fs=require('fs');
  process.stdout.write(annotate(fs.readFileSync('/dev/stdin').toString()));
} else if (typeof module != "undefined" && module.exports) { // Common.js
  module.exports = Annotator;
}
"""

# Opening of the generated JS function expression taking (dat,expandLen):
# it allocates a Uint8Array of expandLen bytes and turns the base64 string
# dat into a byte array.  Later statements append the rest of the function.
if browser_extension:
  # we can assume window.atob
  js_inflate = br"""((dat,expandLen)=>{var buf=new Uint8Array(expandLen);dat=((r)=>{for(var e=new Uint8Array(r.length),t=0,n=e.length;t<n;t++)e[t]=r.charCodeAt(t);return e})(atob(dat));"""
# Otherwise choose a base64 decoder at run time: window.atob if present,
# else Node's Buffer, else the inline decoder on the last line.
else: js_inflate = br"""(function(dat,expandLen){
  var buf=new Uint8Array(expandLen); dat=
  "undefined"!=typeof window && window.atob ?
    function(r){for(var e=new Uint8Array(r.length),t=0,n=e.length;t<n;t++)e[t]=r.charCodeAt(t);return e}(atob(dat))
  :"undefined"!=typeof Buffer ? new Buffer(dat,"base64")
  :function(r){var e,t,n={},f=65,a=0,o=0,i=new Uint8Array(r.length),d=0,l=String.fromCharCode,v=r.length;for(e="";f<91;)e+=l(f++);for(e+=e.toLowerCase()+"0123456789+/",f=0;f<64;f++)n[e.charAt(f)]=f;for(e=0;e<v;e++)for(a=(a<<6)+(f=n[r.charAt(e)]),o+=6;8<=o;)((t=a>>>(o-=8)&255)||e<v-2)&&(i[d++]=t);return i}(dat);"""
js_inflate += br"""
/* Inflate code taken from UZip.js (c) 2019 "Photopea" (MIT-licensed), cut down and JSCompress'd: */
function inflate(e,r){var t,n,E={iR:function(e,r){return E.F.inflate(e,r)},inflate:function(e,r){e[0],e[1];return E.iR(new Uint8Array(e.buffer,e.byteOffset+2,e.length-6),r)}};return E.F={},E.F.inflate=function(e,r){var t=Uint8Array;if(3==e[0]&&0==e[1])return r||new t(0);var n=E.F,f=n._F,i=n._E,o=n.dT,a=n.mC,u=n.cm,l=n.g7,d=n.U;for(var s,h,v,F,_,w,p=0,g=0,b=0,U=0,m=0;0==p;)if(p=f(e,m,1),s=f(e,m+1,2),m+=3,0!=s){if(1==s&&(_=d.flm,w=d.fm,g=511,b=31),2==s){h=i(e,m,5)+257,v=i(e,m+5,5)+1,F=i(e,m+10,4)+4;m+=14;for(var y=0;y<38;y+=2)d.it[y]=0,d.it[y+1]=0;for(var C=1,y=0;y<F;y++){var A=i(e,m+3*y,3);C<(d.it[1+(d.ordr[y]<<1)]=A)&&(C=A)}m+=3*F,a(d.it,C),u(d.it,C,d.im),_=d.lm,w=d.dm,m=o(d.im,(1<<C)-1,h+v,e,m,d.tt);var x=n.cO(d.tt,0,h,d.lt),g=(1<<x)-1,T=n.cO(d.tt,h,v,d.dt),b=(1<<T)-1;a(d.lt,x),u(d.lt,x,_),a(d.dt,T),u(d.dt,T,w)}for(;;){var k=_[l(e,m)&g];m+=15&k;k=k>>>4;if(k>>>8==0)r[U++]=k;else{if(256==k)break;var z=U+k-254;264<k&&(z=U+((M=d.ldef[k-257])>>>3)+i(e,m,7&M),m+=7&M);var M=w[l(e,m)&b];m+=15&M;var M=M>>>4,M=d.ddef[M],S=(M>>>4)+f(e,m,15&M);for(m+=15&M;U<z;)r[U]=r[U++-S],r[U]=r[U++-S],r[U]=r[U++-S],r[U]=r[U++-S];U=z}}}else{0!=(7&m)&&(m+=8-(7&m));x=4+(m>>>3),T=e[x-4]|e[x-3]<<8;r.set(new t(e.buffer,e.byteOffset+x,T),U),m=x+T<<3,U+=T}return r.length==U?r:r.slice(0,U)},E.F.dT=function(e,r,t,n,f,i){for(var o=E.F._E,a=E.F.g7,u=0;u<t;){var l=e[a(n,f)&r];f+=15&l;var d=l>>>4;if(d<=15)i[u]=d,u++;else{var c=0,l=0;16==d?(l=3+o(n,f,2),f+=2,c=i[u-1]):17==d?(l=3+o(n,f,3),f+=3):18==d&&(l=11+o(n,f,7),f+=7);for(var s=u+l;u<s;)i[u]=c,u++}}return f},E.F.cO=function(e,r,t,n){for(var f=0,i=0,o=n.length>>>1;i<t;){var a=e[i+r];n[i<<1]=0,f<(n[1+(i<<1)]=a)&&(f=a),i++}for(;i<o;)n[i<<1]=0,n[1+(i<<1)]=0,i++;return f},E.F.mC=function(e,r){for(var t,n,f,i=E.F.U,o=e.length,a=i.bl_count,u=0;u<=r;u++)a[u]=0;for(u=1;u<o;u+=2)a[e[u]]++;var l=i.next_code,d=0;for(a[0]=0,t=1;t<=r;t++)d=d+a[t-1]<<1,l[t]=d;for(n=0;n<o;n+=2)0!=(f=e[n+1])&&(e[n]=l[f],l[f]++)},E.F.cm=function(e,r,t){for(var 
n=e.length,f=E.F.U.r5,i=0;i<n;i+=2)if(0!=e[i+1])for(var o=i>>1,a=e[i+1],u=o<<4|a,a=r-a,l=e[i]<<a,d=l+(1<<a);l!=d;)t[f[l]>>>15-r]=u,l++},E.F.rC=function(e,r){for(var t=E.F.U.r5,n=15-r,f=0;f<e.length;f+=2){var i=e[f]<<r-e[f+1];e[f]=t[i]>>>n}},E.F._E=function(e,r,t){return(e[r>>>3]|e[1+(r>>>3)]<<8)>>>(7&r)&(1<<t)-1},E.F._F=function(e,r,t){return(e[r>>>3]|e[1+(r>>>3)]<<8|e[2+(r>>>3)]<<16)>>>(7&r)&(1<<t)-1},E.F.g7=function(e,r){return(e[r>>>3]|e[1+(r>>>3)]<<8|e[2+(r>>>3)]<<16)>>>(7&r)},E.F.U=(t=Uint16Array,n=Uint32Array,{next_code:new t(16),bl_count:new t(16),ordr:[16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15],of0:[3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258,999,999,999],exb:[0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0,0],ldef:new t(32),df0:[1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,65535,65535],dxb:[0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0],ddef:new n(32),flm:new t(512),flt:[],fm:new t(32),fdt:[],lm:new t(32768),lt:[],tt:[],dm:new t(32768),dt:[],im:new t(512),it:[],r5:new t(32768),lhst:new n(286),dhst:new n(30),ihst:new n(19),lits:new n(15e3),strt:new t(65536),prev:new t(32768)}),function(){for(var e=E.F.U,r=0;r<32768;r++){var t=r;t=(4278255360&(t=(4042322160&(t=(3435973836&(t=(2863311530&t)>>>1|(1431655765&t)<<1))>>>2|(858993459&t)<<2))>>>4|(252645135&t)<<4))>>>8|(16711935&t)<<8,e.r5[r]=(t>>>16|t<<16)>>>17}function n(e,r,t){for(;0!=r--;)e.push(0,t)}for(r=0;r<32;r++)e.ldef[r]=e.of0[r]<<3|e.exb[r],e.ddef[r]=e.df0[r]<<4|e.dxb[r];n(e.flt,144,8),n(e.flt,112,9),n(e.flt,24,7),n(e.flt,8,8),E.F.mC(e.flt,9),E.F.cm(e.flt,9,e.flm),E.F.rC(e.flt,9),n(e.fdt,32,5),E.F.mC(e.fdt,5),E.F.cm(e.fdt,5,e.fm),E.F.rC(e.fdt,5),n(e.it,19,0),n(e.lt,286,0),n(e.dt,30,0),n(e.tt,320,0)}(),E.inflate(e,r)}
return inflate(dat,buf) })
"""
# Stylesheet for annotated text: every rule is scoped to span._adjust0 and
# lays <ruby> out as an inline table, with rt as the header group and rb as
# the row group.
extension_rubycss = b"span._adjust0 ruby{display:inline-table !important;vertical-align:bottom !important;-webkit-border-vertical-spacing:1px !important;padding-top:0.5ex !important;margin:0px !important;} span._adjust0 ruby *{display: inline !important;vertical-align:top !important;line-height:1.0 !important;text-indent:0 !important;text-align:center !important;white-space:nowrap !important;padding-left:0px !important;padding-right:0px !important;} span._adjust0 rb{display:table-row-group !important;font-size:100% !important; opacity: 1.0 !important;} span._adjust0 rt{display:table-header-group !important;font-size:100% !important;line-height:1.1 !important; opacity: 1.0 !important;font-family: FreeSerif, Lucida Sans Unicode, Times New Roman, serif !important;}"
# Popup page: the same ruby rules with the span._adjust0 scope removed,
# followed by the mode buttons.
extension_config=br"""<html><head><meta charset="utf-8">
<style>#cr{width:100%;border:thin dotted grey;max-width:15em;max-height:10em;overflow:auto} #cr:empty{padding:0.5ex}
button{background:#ededed;color:inherit}
"""+extension_rubycss.replace(b"span._adjust0 ",b"")+br"""</style>
</head><body>
<nobr><button id="-1">Off</button> <button id="-2">2-line</button>"""
# Button ids: negative ids are minus the number of lines (the 3-line button
# exists only with a glossfile); ids from 0 up are annotation numbers
# (sharp-multi).  rangeStart/rangeEnd bound the ids that exist.
rangeStart = -3 if glossfile else -2
if rangeStart == -3: extension_config += b' <button id="-3">3-line</button>'
else: extension_config = extension_config.replace(b'2-line',b'On') # only one annotated mode, so label it "On"
extension_config += b'</nobr>'
if sharp_multi and annotation_names and ',' in annotation_names:
  rangeEnd = annotation_names.count(',') + 1 # one button per comma-separated name
  extension_config += b"".join(b'<br><button id="%d">%s</button>' % (idx,B(label)) for idx,label in enumerate(annotation_names.split(',')))
else: rangeEnd = 0
extension_config += b'<div id="cr"></div><button id="c">Clipboard</button><script src="config.js"></script></body></html>'
# Don't want Clipboard button to auto-refresh (and hide the button) in the desktop extension version, since would need to stop the refresh when view is no longer visible + is it really a good idea to timer-paste the clipboard on a desktop when conversion to text could be costly etc + many desktops would dismiss the extension box before letting you switch to another window to change the clipboard (unless it's in a VM)
# config.js for the popup.  updateClip() sends a null message and renders the
# annotated reply into #cr, making each titled <ruby> alert its title on
# click.  update() sends the two status queries and disables the button for
# the current mode: false -> numLines (ids rangeStart..-1), true -> aType
# (ids 0..rangeEnd-1).
extension_confjs = br"""function updateClip() {
    chrome.runtime.sendMessage(null,((cr)=>{
        var v=document.getElementById("cr");
        v.textContent = ''; // clear
        if(cr) {
            try {
                for(const t of new DOMParser().parseFromString('<span> '+cr+' </span>','text/html').body.firstChild.childNodes) v.appendChild(t.cloneNode(true));
                var a=v.getElementsByTagName('ruby'),i; for(i=0; i < a.length; i++) if(a[i].title) ((e)=>{e.addEventListener('click',(()=>{alert(e.title)}))})(a[i])
            } catch(err) { console.log(err.message) }
        }
    }))}
function update() {
chrome.runtime.sendMessage(false,function(r) {var i;for(i=%d;i;i++){var e=document.getElementById(""+i);if(i==-r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})"""  % rangeStart
# NOTE(review): as received, the next sendMessage call sat outside any Python
# string (a syntax error) and function update() was never closed in the
# generated JS.  The "if rangeEnd:" opener here, and the closing brace plus
# initial update() call further down, are reconstructions -- confirm against
# upstream annogen before relying on them.
if rangeEnd: extension_confjs += br""";
chrome.runtime.sendMessage(true,function(r) {for(var i=0;i<%d;i++){var e=document.getElementById(""+i);if(i==r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})"""  % rangeEnd
extension_confjs += b';\nif(document.getElementById("cr").firstChild) updateClip()\n'
# Close update() before the listeners are registered (otherwise they would be
# re-added on every update), and run it once to show the current state.
extension_confjs += b'}\nupdate();\n'
# One click handler per existing button id: send the id, then refresh state.
extension_confjs += b';'.join((b'document.getElementById("%d").addEventListener("click",function(){chrome.runtime.sendMessage(%d,update)})' % (n,n)) for n in xrange(rangeStart,rangeEnd))
extension_confjs += b';document.getElementById("c").addEventListener("click",updateClip)'

# Dart output: opens with a usage comment for the generated annotator.dart.
dart_src = br"""

/* Usage
   -----
   If this file is saved as annotator.dart,
   you can import 'annotator.dart';
   and then call the annotate() function."""
# With a separate data file the generated annotate() is async, so the usage
# text says to await it and names the file.
if dart_datafile: dart_src += br"""
   E.g. String result = await annotate(...);
   (make your function async.)  Will read """+B(dart_datafile)
dart_src += br"""
*/

import 'dart:convert';"""
# dart:io is imported only with zlib (the zlib decoder is substituted into
# %%DATA_INIT%% later in this file).
if zlib: dart_src += b"import 'dart:io';"
dart_src += br"""
class _Annotator {
  static const version="""+b'"'+version_stamp+br"""";
  int numLines = 2;  // override to 1 or 3 if you must, but not recommended for learning"""
# The data table is either assigned later (datafile case: starts null) or
# initialised in place from the %%DATA_INIT%% placeholder.
if dart_datafile: dart_src+=b"\n  static String data=null;"
else: dart_src+=b"\n  static final String data=%%DATA_INIT%%;"
# Interpreter state fields and the start of annotate()'s signature.
dart_src += br"""
  int addrLen=data.codeUnitAt(0),dPtr;
  bool needSpace; StringBuffer output;
  int p, copyP; List<int> inBytes; int inputLength;
  String annotate(String input"""
if sharp_multi: dart_src += br""",[int aType=0]"""
# Body of the generated Dart annotate(): encode the input as UTF-8, run the
# bytecode from address 1 at each position, and copy one byte through whenever
# _readData() consumed nothing.
# p, copyP and every inBytes access are UTF-8 byte indexes, so inputLength
# must be inBytes.length.  input.length counts UTF-16 code units, which is
# smaller for non-ASCII text and made the loop stop before the end of input.
dart_src += br""") {
    inBytes=utf8.encode(input); dPtr=0;
    inputLength=inBytes.length;
    p=0; copyP=0;
    output = StringBuffer(); needSpace = false;
    while(p < inputLength) {
      int oldPos=p;
      dPtr=1;_readData();
      if (oldPos==p) { needSpace=false; output.write(String.fromCharCode(inBytes[p++])); copyP++; }
    }
    return Utf8Decoder().convert(output.toString().codeUnits)"""
# sharp_multi: post-process the result with a regexp that keeps only the
# aType'th '#'-separated field of the text after each </r?><r?> boundary.
if sharp_multi: dart_src += br""".replaceAllMapped(new RegExp("(</r[bt]><r[bt]>)"+"[^#]*#"*aType+"(.*?)(#.*?)?</r"),(Match m)=>"${m[1]}${m[2]}</r")"""
# End of annotate(); then _readAddr() (addrLen bytes, most significant first)
# and the start of _readRefStr(): a non-zero first byte is the string's
# length, zero means the string runs up to the next \u0000.
dart_src += br""";
  }
  int _readAddr() { int addr=0; for (int i=addrLen; i>0; i--) addr=(addr << 8) | data.codeUnitAt(dPtr++); return addr; }
  String _readRefStr() {
    int a=_readAddr();
    int l=data.codeUnitAt(a);
    String r;
    if (l != 0) r=data.substring(a+1,a+l+1);
    else r=data.substring(a+1,data.indexOf("\u0000",a+1));"""
# With js_utf8 the referenced string is re-encoded to UTF-8 code units before
# being returned; otherwise it is returned as read.
if js_utf8: dart_src += br"""
    return String.fromCharCodes(Utf8Encoder().convert(r));"""
else: dart_src += b"return r;"
# Rest of the Dart class: the end of _readRefStr(), _s() (space handling
# between words) and _readData(), the bytecode interpreter.  Opcodes handled:
# high bit set = short jump, >107 = short switchbyte, 50 jump, 51 call,
# 52 return, 60 switchbyte, 70 s0, 71/74 copyBytes, 72/75 o, 73/76 o2
# (74-76 are the "+ return" forms), 80 savepos, 81 restorepos, 90 neartest.
dart_src += br"""
  }
  void _s() {
    if(needSpace) output.write(" ");
    else needSpace=true; // for after the word we're about to write (if no intervening bytes cause needSpace=false)
  }
  void _readData() {
    List<int> sPos=List<int>();
    while(true) {
      int c=data.codeUnitAt(dPtr++);
      if ((c & 0x80)!=0) dPtr += (c&0x7F); // short jump
      else if (c > 107) { // short switchbyte
        c-=107;
        var i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+c).indexOf(String.fromCharCode(inBytes[p++])));
        if (i==-1) i = c;
        if(i>0) dPtr += data.codeUnitAt(dPtr+c+i-1);
        dPtr += c+c;
      } else switch(c) {
        case 50: dPtr = _readAddr(); break;
        case 51: {
          int f = _readAddr(); int dO=dPtr;
          dPtr = f; _readData() ; dPtr = dO;
          break; }
        case 52: return;
        case 60: {
          int nBytes = data.codeUnitAt(dPtr++)+1;
          int i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+nBytes).indexOf(String.fromCharCode(inBytes[p++])));
          if (i==-1) i = nBytes;
          dPtr += (nBytes + i * addrLen);
          dPtr = _readAddr(); break; }
        case 70: if(needSpace) { output.write(" "); needSpace=false; } break;
        case 71: case 74: {
          int numBytes = data.codeUnitAt(dPtr++);
  output.write(String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)));
  copyP += numBytes; if(c==74) return; break; }
        case 72: case 75: {
          int numBytes = data.codeUnitAt(dPtr++);
          String annot = _readRefStr();
          String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes;
          _s();
          switch (numLines) {
            case 1:
              output.write("<ruby><rb>");
              output.write(base);
              output.write("</rb></ruby>");
              break;
            case 3:
              output.write("<ruby><rt>&nbsp;</rt><rb>");
              output.write(annot);
              output.write("</rb><rb>");
              output.write(base);
              output.write("</rb></ruby>");
              break;
            default:
              output.write("<ruby><rb>");
              output.write(base);
              output.write("</rb><rt>");
              output.write(annot);
              output.write("</rt></ruby>");
            } if(c==75) return; break; }
        case 73: case 76: {
          int numBytes = data.codeUnitAt(dPtr++);
          String annot = _readRefStr();
          String title = _readRefStr();
          String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes;
          _s();
          switch (numLines) {
            case 1:
              output.write("<ruby title=\"");
              output.write(title);
              output.write("\"><rb>");
              output.write(base);
              output.write("</rb></ruby>");
              break;
            case 3:
              output.write("<ruby title=\"");
              output.write(title);
              output.write("\"><rt>");
              output.write(RegExp("[^/(;]*").matchAsPrefix(title).group(0));
              output.write("</rt><rb>");
              output.write(annot);
              output.write("</rb><rb>");
              output.write(base);
              output.write("</rb></ruby>");
              break;
            default:
              output.write("<ruby title=\"");
              output.write(title);
              output.write("\"><rb>");
              output.write(base);
              output.write("</rb><rt>");
              output.write(annot);
              output.write("</rt></ruby>");
          } if(c==76) return; break; }
        case 80: sPos.add(p); break;
        case 81: p=sPos.removeLast(); break;
        case 90: {
          int tPtr = _readAddr();
          int fPtr = _readAddr();
          int nearbytes = data.codeUnitAt(dPtr++);
  int o=p;
  if (o > nearbytes) o -= nearbytes; else o = 0;
  var max = p + nearbytes;
  if (max > inputLength) max = inputLength;
  String tStr = String.fromCharCodes(inBytes.sublist(o,max));
                bool found = false;
                while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(_readRefStr()) != -1) { found = true; break; }
                dPtr = found ? tPtr : fPtr; break;
                }
        default: throw("corrupt data table at ${dPtr-1}/${data.length} (${c})");
      }
    }
  }
}

"""
# Top-level Dart annotate(): creates a fresh _Annotator per call, sets
# numLines and delegates.  With a data file the function is async and fills
# _Annotator.data on first use; aType is accepted and forwarded only for
# sharp_multi.
dart_src += b"".join([
  b"Future<String> annotate(String s,[" if dart_datafile else b"String annotate(String s,[",
  b"int aType=0," if sharp_multi else b"",
  b"int numLines=2]) ",
  b"async " if dart_datafile else b"",
  b"{ ",
  b"if(_Annotator.data==null) _Annotator.data=await %%DATA_INIT%%;" if dart_datafile else b"",
  b"var a=_Annotator(); a.numLines=numLines; return a.annotate(s",
  b",aType" if sharp_multi else b"",
  b"); }\n",
])
# With zlib, wrap the (still unexpanded) data placeholder in a decompression call.
if zlib: dart_src = dart_src.replace(b"%%DATA_INIT%%",b"String.fromCharCodes(zlib.decoder.convert(%%DATA_INIT%%))")

# Header of the generated Python module: a "# Python <version_stamp>" line
# followed by usage notes for whoever imports or runs the generated file.
py_start = b'# Python '+version_stamp+br"""

# You can import this module and call annotate(utf8 bytes)
# (from multiple threads if desired),
# or you can run from the command line on standard input.

# annotate has an optional second argument, which can be
# 'ruby' (default), 'raw' (annotation only) or 'braces'.

# This module is compatible with both Python 2.7 and Python 3.

"""
py_end = br"""
class Annotator:
 version="""+b'"'+version_stamp+br""""
 def __call__(self,inStr,aFormat):
  if aFormat=="ruby": self.startA,self.midA,self.endA = b"<ruby><rb>",b"</rb><rt>",b"</rt></ruby>"
  elif aFormat=="raw": self.startA=self.midA=self.endA = b""
  elif aFormat=="braces": self.startA,self.midA,self.endA = b"{",b"|",b"}"
  else: raise Exception("Unrecognised annotation format "+repr(aFormat))
  assert type(inStr)==bytes
  self.inStr = inStr
  self.addrLen = ord(data[:1])
  self.inputLength = len(inStr)
  self.p = 0 # read-ahead pointer
  self.copyP = 0 # copy pointer
  self.output = []
  self.needSpace = 0
  while self.p < self.inputLength:
    oldPos = self.p
    self.dPtr = 1 ; self.readData()
    if oldPos == self.p:
      self.needSpace=0
      self.output.append(inStr[self.p:self.p+1])
      self.p += 1 ; self.copyP += 1
  return b"".join(self.output)
 def readAddr(self):
  addr = 0
  for i in range(self.addrLen):
    addr=(addr << 8) | ord(data[self.dPtr:self.dPtr+1])
    self.dPtr += 1
  return addr
 def readRefStr(self):
  a = self.readAddr(); l=ord(data[a:a+1])
  if l: return data[a+1:a+l+1]
  else: return data[a+1:data.index(b'\x00',a+1)]
 def s(self):
  if self.needSpace: self.output.append(b" ")
  else: self.needSpace=1
 def readData(self):
  sPos = [] ; out = self.output
  while True:
    d = ord(data[self.dPtr:self.dPtr+1]) ; self.dPtr += 1
    if d==50: self.dPtr = self.readAddr()
    elif d==51:
      func = self.readAddr() ; dO = self.dPtr
      self.dPtr = func ; self.readData() ; self.dPtr = dO
    elif d==52: return
    elif d==60 or d<20:
      if d<20: nBytes=d+1
      else:
        nBytes = ord(data[self.dPtr:self.dPtr+1])+1
        self.dPtr += 1
      if self.p>=len(self.inStr): i = -1
      else: i = data[self.dPtr:self.dPtr+nBytes].find(self.inStr[self.p:self.p+1]) ; self.p += 1
      if i==-1: i = nBytes
      if d<20:
        if i>0: self.dPtr += ord(data[self.dPtr+nBytes+i-1:self.dPtr+nBytes+i])
        self.dPtr += nBytes * 2
      else:
        self.dPtr += (nBytes + i * self.addrLen)
        self.dPtr = self.readAddr()
    elif d==70:
      if self.needSpace:
        out.append(b' ') ; self.needSpace=0
    elif d==71 or d==74:
      numBytes = ord(data[self.dPtr:self.dPtr+1])
      self.dPtr += 1
      out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
      if d==74: return
    elif d==72 or d==75:
      numBytes = ord(data[self.dPtr:self.dPtr+1])
      self.dPtr += 1
      annot = self.readRefStr()
      self.s()
      if self.startA:
        out.append(self.startA)
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
      out.append(self.midA) ; out.append(annot)
      out.append(self.endA)
      if d==75: return
    elif d==73 or d==76:
      numBytes = ord(data[self.dPtr:self.dPtr+1])
      self.dPtr += 1
      annot = self.readRefStr()
      title = self.readRefStr()
      self.s()
      if self.startA==b"{": # omit title in braces mode
        out.append(self.startA)
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      elif self.startA:
        out.append(b"<ruby title=\"");out.append(title)
        out.append(b"\"><rb>");
        out.append(self.inStr[self.copyP:self.copyP+numBytes])
      self.copyP += numBytes
      out.append(self.midA) ; out.append(annot)
      out.append(self.endA)
      if d==76: return
    elif d==80: sPos.append(self.p)
    elif d==81: self.p = sPos.pop()
    elif d==90:
      tPtr = self.readAddr()
      fPtr = self.readAddr()
      nearbytes = ord(data[self.dPtr:self.dPtr+1])
      self.dPtr += 1
      o = max(self.p-nearbytes,0)
      maxx = min(self.p+nearbytes,self.inputLength)
      tStr = self.inStr[o:maxx]
      found = False
      while self.dPtr < tPtr and self.dPtr < fPtr:
        if self.readRefStr() in tStr:
          found = True ; break
      if found: self.dPtr = tPtr
      else: self.dPtr = fPtr
    elif d>0x80: self.dPtr += d-0x80
    else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1:self.dPtr]))+")")

def annotate(inStr,p="ruby"): return Annotator()(inStr,p)
def main():