Newer
Older

Silas S. Brown
committed
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
else: js_6bit_offset = 0
class BytecodeAssembler:
# Bytecode for a virtual machine run by the Javascript version etc
# Instances accumulate opcodes, raw bytes, labels and label references in
# a list (self.l); link() then turns that list into the final byte string.
opcodes = {
# Maps each opcode mnemonic to its numeric byte value; link() looks
# mnemonics up here when emitting.  The quoted character in each comment
# below is that byte value as an ASCII character.
# 0-19 RESERVED for short switchbyte (C,Java,Py)
'jump': 50, # '2' params: address
'call': 51, # '3' params: function address
'return': 52, # '4' (or 'end program' if top level)
'switchbyte': 60, # '<' switch(NEXTBYTE) (params: numBytes-1, bytes (sorted, TODO take advantage of this), addresses, default address)
's0':70, # 'F'
'copyBytes':71,'o':72,'o2':73, # 'G','H','I' (don't change these numbers, they're hard-coded below)
# 74-76 ('J','K','L') reserved for 'above + return'
'savepos':80, # 'P', local to the function
'restorepos':81, # 'Q'
'neartest':90, # 'Z' params: true-label, false-label, byte nbytes, addresses of conds strings until first of the 2 labels is reached (normally true-label, unless the whole neartest is negated)
# 91-107 RESERVED for short switchbyte (JS, UTF-8 printability optimisation for 6bit)
# 108-127 RESERVED for short switchbyte (JS,Dart, more in the printable range to reduce escaping a bit)
# 128-255 RESERVED for short jumps
}
def __init__(self):
self.l = [] # code list
self.d2l = {} # definition to label
self.lastLabelNo = 0
self.addingPosStack = []
def addOpcode(self,opcode):
    """Append the mnemonic *opcode* to the code list.

    The mnemonic is wrapped in a 1-tuple, which is how opcodes are
    distinguished from raw bytes and integer labels in self.l.
    """
    instruction = (opcode,)
    self.l.append(instruction)
def addBytes(self,bStr):
    """Append raw data to the code list.

    bStr -- either an int, stored as the single byte B(chr(bStr)),
            or a bytes object, stored unchanged.

    Raises TypeError for any other type (TypeError is an Exception, so
    callers that caught the previous bare Exception still catch it).
    """
    # Exact type checks are kept deliberately (so e.g. bool is rejected)
    if type(bStr)==int: self.l.append(B(chr(bStr)))
    elif type(bStr)==bytes: self.l.append(bStr)
    else: raise TypeError("unsupported bytes type")
def startAddingFunction(self):
    """Mark the start of a new function body.

    Remembers where in the code list the body begins, together with the
    current label counter, then restarts label numbering from zero.
    """
    resumePoint = (len(self.l), self.lastLabelNo)
    self.addingPosStack.append(resumePoint)
    self.lastLabelNo = 0
def finishFunctionAndAddCall(self):
    """Cut the current function body out of the code list and call it.

    Everything appended since the matching startAddingFunction() becomes
    the function body.  Identical bodies share one entry in self.d2l;
    a 'call' opcode followed by that entry's label is appended in place
    of the body, and the label counter saved at the start is restored.
    """
    # make sure to add a return instruction before this!
    start, savedLabelNo = self.addingPosStack.pop()
    self.lastLabelNo = savedLabelNo
    body = tuple(self.l[start:])
    self.l = self.l[:start]
    if body not in self.d2l:
        # not a duplicate: give it the next (negative) function label
        self.d2l[body] = (-len(self.d2l)-1,)
    self.addOpcode('call')
    self.l.append(self.d2l[body])
def addByteswitch(self,byteArray,labelArray):
    """Emit a 'switchbyte' instruction.

    byteArray  -- list of single-byte bytes objects to switch on
    labelArray -- one label per byte, plus the default label at the end
                  (so it is one longer than byteArray)

    Nothing is emitted for an empty byteArray.
    """
    nBytes = len(byteArray)
    assert nBytes + 1 == len(labelArray)
    # labelArray has the default case added also (TODO: could re-organize code so the bytes immediately after the switch are either the default or one of the items, saving 1 address)
    if nBytes == 0:
        return # empty switch = no-op
    self.addOpcode('switchbyte')
    # num of bytes in list - 1 (so all 256 values can be accounted for if needed)
    self.addBytes(nBytes-1)
    self.addBytes(b"".join(byteArray))
    for label in labelArray:
        self.addRef(label)
def addActions(self,actionList):
    """Emit the instructions for each action in actionList.

    An action is either b's0' (emits the 's0' opcode) or a sequence of
    1 to 3 items: an int byte count (1-255) followed by up to two bytes
    strings.  The instruction byte is 70 plus the number of items, then
    the count (offset when js_6bit is set), then a string reference for
    each bytes item.
    """
    # assert type(actionList) in [list,tuple], repr(actionList)
    for action in actionList:
        if action == b's0':
            self.addOpcode('s0')
        else:
            assert 1 <= len(action) <= 3 and type(action[0])==int and all(type(s)==bytes for s in action[1:]), repr(action)
            assert 1 <= action[0] <= 255, "bytecode currently supports markup or copy between 1 and 255 bytes only, not %d (but 0 is reserved for expansion)" % action[0]
            self.addBytes(70+len(action)) # 71=copyBytes 72=o() 73=o2
            if js_6bit:
                count = (action[0]+(js_6bit_offset-1))&0xFF
            else:
                count = action[0] # num i/p bytes to copy
            self.addBytes(count)
            for s in action[1:]:
                self.addRefToString(s)
def addActionDictSwitch(self,byteSeq_to_action_dict,isFunc=True,labelToJump=None):
# a modified stringSwitch for the bytecode
# Actions aren't strings: they list tuples of either
# 1, 2 or 3 items for copyBytes, o(), o2()
# labelToJump is a jump to insert afterwards if not isFunc and if we don't emit an unconditional 'return'. Otherwise, will ALWAYS end up with a 'return' (even if not isFunc i.e. the main program)
# byteSeq_to_action_dict maps a bytes key to a list of (action, conds)
# pairs; the b"" key holds what to do when no further byte is matched.
# NOTE(review): this copy of the code has lost its indentation, so the
# nesting of the if/elif chain below cannot be read off the text here;
# confirm against the upstream file before restructuring anything.
# Set of distinct first bytes over all non-empty keys:
allBytes = set(b[:1] for b in iterkeys(byteSeq_to_action_dict) if b)
if isFunc:
self.startAddingFunction()
savePos = len(self.l)
self.addOpcode('savepos')
elif (b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1) or not labelToJump: # ('not labelToJump' and 'not isFunc' == main program)
savePos = len(self.l)
self.addOpcode('savepos')
else: savePos = None
# The test below holds when the b"" entry is a single unconditional
# action list and every entry's single, unconditional action list
# begins with that same list: the shared prefix is then emitted once
# and stripped from every entry before recursing.
if b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[b""])==1 and not byteSeq_to_action_dict[b""][0][1] and all((len(a)==1 and a[0][0][:len(byteSeq_to_action_dict[b""][0][0])]==byteSeq_to_action_dict[b""][0][0] and not a[0][1]) for a in itervalues(byteSeq_to_action_dict)):
self.addActions(byteSeq_to_action_dict[b""][0][0])
l = len(byteSeq_to_action_dict[b""][0][0])
byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in iteritems(byteSeq_to_action_dict))
# Drop the 'savepos' opcode that was added at index savePos above
del self.l[savePos] ; savePos = None
del byteSeq_to_action_dict[b""]
self.addActionDictSwitch(byteSeq_to_action_dict) # as a subfunction (ends up adding the call to it, which should be replaced by a jump during compaction; TODO: auto-inline if it turns out there's only this one call to it? other calls might happen if it's merged with an identical one)
byteSeq_to_action_dict[b""] = [(b"",[])] # for the end of this func
self.addOpcode('return')
elif allBytes:
allBytes = sorted(list(allBytes))
# One label per case byte plus one more (the default) at the end
labels = [self.makeLabel() for b in allBytes+[0]]
self.addByteswitch(allBytes,labels)
for case in allBytes:
self.addLabelHere(labels[0]) ; del labels[0]
# Recurse on the keys starting with this byte, with that byte removed;
# labels[-1] (the default label) is passed as the place to jump to
self.addActionDictSwitch(dict([(k[1:],v) for k,v in iteritems(byteSeq_to_action_dict) if k[:1]==case]),False,labels[-1])
self.addLabelHere(labels[0])
if not savePos==None: self.addOpcode('restorepos')
if isFunc:
self.addOpcode('return')
if self.l[-1]==self.l[-2]: del self.l[-1] # double return
return self.finishFunctionAndAddCall()
elif b"" in byteSeq_to_action_dict:
default_action = b""
for action,conds in byteSeq_to_action_dict[b""]:
if conds:
# conds is either a (negate, conds, nbytes) tuple or a plain
# collection of strings, in which case ybytes_max is the distance
if type(conds)==tuple: negate,conds,nbytes = conds
else: negate,nbytes = False,ybytes_max
assert 1 <= nbytes <= 255, "bytecode supports only single-byte nbytes (but nbytes=0 is reserved for expansion)"
trueLabel,falseLabel = self.makeLabel(),self.makeLabel()
self.addOpcode('neartest')
self.addRef(trueLabel)
self.addRef(falseLabel)
assert type(nbytes)==int
self.addBytes(nbytes)
for c in conds: self.addRefToString(c.encode(outcode)) # TODO: how much bytecode could we save by globally merging equivalent lists of string-list references ? (zlib helps anyway but...)
# For a negated test the two labels simply swap places
if negate: trueLabel,falseLabel = falseLabel,trueLabel
self.addLabelHere(trueLabel)
self.addActions(action)
self.addOpcode('return')
self.addLabelHere(falseLabel)
else: default_action = action
if default_action or not byteSeq_to_action_dict[b""]:
self.addActions(default_action)
self.addOpcode('return') ; return
if labelToJump:
self.addOpcode('jump')
self.addRef(labelToJump)
else: self.addOpcode('return')
def makeLabel(self):
    """Issue a new label number: one more than the last, which is then remembered."""
    newLabel = self.lastLabelNo + 1
    self.lastLabelNo = newLabel
    return newLabel
def addLabelHere(self,labelNo):
    """Place label *labelNo* at the current end of the code list.

    Labels are stored as the bare integer; zero is rejected.
    """
    assert type(labelNo)==int
    assert labelNo != 0, "label 0 not allowed"
    self.l += [labelNo]
def addRef(self,labelNo):
    """Append a reference to label *labelNo*.

    References are stored as the negated label number.
    """
    assert type(labelNo)==int
    reference = -labelNo
    self.l.append(reference)
def addRefToString(self,string):
    """Append a reference to the data string *string* (bytes).

    The string is first framed for the output language, then registered
    in self.d2l if that framed form has not been seen before; the
    reference appended to the code list is the d2l label tuple.

    Framing:
      - python/java/javascript/dart: a leading length unit when the
        length fits, otherwise a leading and trailing terminator
        (js_utf8 works in Unicode characters, js_6bit in printable
        bytes with %-encoding, the rest in plain bytes);
      - otherwise (C): a trailing null byte only.

    Raises Exception if js_6bit needs a terminator and the string
    already contains every candidate (previously this case silently
    left the string without any framing).
    """
    assert type(string)==bytes, repr(string)
    l = len(string)
    if python or java or javascript or dart:
        # prepends with a length hint if possible (or if not
        # prepends with 0 and null-terminates it)
        if js_utf8:
            string = unicodedata.normalize("NFC",string.decode('utf-8')) # NFC very important for browser_extension: some browsers seem to do it anyway, throwing off data addresses if we haven't accounted for that
            l = len(string) # we count in UCS-2 characters
            assert all((ord(c) <= 0xFFFF) for c in string), "js_utf8 addressing will be confused by non UCS-2: "+repr(string) # TODO: put surrogate pairs? (and increase l by num pairs; ensure Python will emit separate UTF-8 sequences for each part of the surrogate, if needed by JS; might need to escape the pairs after all addresses computed) + what if we're in dart ?
            # Have checked browsers + Node count combining characters separately, so len(string) should be correct (e.g. u'Moc\u0306nik')
            if 1 <= l < 0x02B0: # can use length-first unichr (avoid combining and modifier marks just in case; also avoid 0xD800+ surrogates)
                string = unichr(l) + string
            else: string = unichr(0)+string+unichr(0)
        elif js_6bit:
            string = re.sub(b"%(?=[0-9A-Fa-f])|[\x7f-\xff]",lambda m:urllib.quote(m.group()),string) # for JS 'unescape' in readRefStr, which is applied (without encodeURIComponent) if js_6bit and not js_utf8 so we can use %-encoding
            l = len(string) # length is needed BEFORE %-decode
            if 1 <= l <= 91: # use 32-122 inclusive
                string = B(chr(l+31))+string
            else: # try to avoid using \x00 for termination
                for termChar in '{|}~\x00': # 123-126 + nul
                    termChar=B(termChar)
                    if termChar not in string:
                        string = termChar + string + termChar
                        break
                else:
                    # Every candidate occurs inside the string, so none of
                    # them can delimit it: fail now instead of emitting
                    # data that cannot be read back
                    raise Exception("js_6bit: no unused terminator character for "+repr(string))
        elif 1 <= l < 256: # length byte + string
            string = B(chr(l))+string
        else: string = B(chr(0))+string+B(chr(0))
    else: string += b'\x00' # just null-termination for C
    if string not in self.d2l:
        self.d2l[string] = (-len(self.d2l)-1,)
    self.l.append(self.d2l[string])
def link(self): # returns resulting bytes
# (add an 'end program' instruction before calling)
# Appends the shared functions and data strings from self.d2l to the
# code list, then assembles the list into one string, trying address
# widths from 1 upwards until every address fits.  The result is zlib-
# compressed when zlib is set, and is Unicode rather than bytes when
# js_utf8 is set.  May be called once only: self.link is replaced by a
# function that raises.
# NOTE(review): this copy of the code has lost its indentation; confirm
# nesting against the upstream file before changing any statement.
def f(*args): raise Exception("Must call link() only once")
self.link = f
sys.stderr.write("Linking... ") ; sys.stderr.flush()
for dat,ref in sorted(iteritems(self.d2l)): # the functions and data to add to the end of self.l, sorted so we can optimise for overlaps
assert type(ref)==tuple and type(ref[0])==int
self.l.append((-ref[0],)) # the label
if type(dat) in [bytes,unicode]:
if type(self.l[-2])==type(dat) and self.l[-2][-1]==dat[0]: # overlap of termination-byte indicators (TODO: look for longer overlaps? unlikely to occur)
self.l[-2] = self.l[-2][:-1]
self.l.append(dat) ; continue
# otherwise it's a function, and non-reserved labels are local, so we need to rename them
l2l = {}
for i in dat:
if type(i)==int:
if i>0: j=i
else: j=-i
if not j in l2l:
l2l[j] = self.makeLabel()
if i>0: self.addLabelHere(l2l[j])
else: self.addRef(l2l[j])
else: self.l.append(i) # str or tuple just cp
del self.d2l
if post_normalise: # must be AFTER d2l, as EOF is used to end it
# A reference to normLabel goes at the very start of the code list and
# the label itself just before the table appended below
normLabel = self.makeLabel()
self.l.insert(0,-normLabel)
self.l.append(normLabel)
bmp = [(k,v) for k,v in sorted(post_normalise.items())]
maxRLE = min(bmp[0][0],min(v for k,v in bmp))-1
assert maxRLE >= 0, "can't have a mapping to 0"
curPtr = 0
# Encode an integer as two bytes, low byte first
def lsbmsb(i): return B(chr(i&0xFF)+chr(i>>8))
for i in xrange(len(bmp)):
delta = bmp[i][0]-curPtr
# NOTE(review): if maxRLE is 0 while delta is nonzero, skip is 0 and
# this loop never ends - confirm that combination cannot arise
while delta:
skip = min(delta,maxRLE)
self.l.append(lsbmsb(skip))
delta -= skip ; curPtr += skip
self.l.append(lsbmsb(bmp[i][1]))
curPtr += 1
# elements of self.l are now:
# - (byte) strings (just copied in)
# - positive integers (labels for code)
# - negative integers (references to labels)
# - +ve or -ve integers in tuples (labels for functions and text strings: different 'namespace')
# strings in tuples: opcodes
# 1st byte of o/p is num bytes needed per address
# Raised to abandon the current addrSize attempt and move on to the next
class TooNarrow(Exception): pass
# Each address unit carries 6 bits when js_6bit is set, else 8
if js_6bit: aBits,aMask = 6,0x3F
else: aBits,aMask = 8,0xFF
for addrSize in xrange(1,256):
sys.stderr.write("(%d-bit) " % (aBits*addrSize))
sys.stderr.flush()
src = self.l[:] # must start with fresh copy, because compaction modifies src and we don't want a false start with wrong addrSize to affect us
try:
compacted = 0 ; compaction_types = set()
if compact_opcodes:
# The compact opcodes all rely on relative addressing (relative to AFTER the compact instruction) that goes only forward. Easiest way to deal with that is to work backwards from the end, inlining the compactions, before running a conventional 2-pass assembly.
# TODO: Could move the below loop into this one in its entirety, and just assemble backwards. Most within-function label references point forwards anyway. (Would still need some backward refs for functions though)
bytesFromEnd = 0
lDic = {} # labelNo -> bytesFromEnd
def LGet(lRef,origOperandsLen):
# return the number of bytes between the end of the new instruction and the label. Since bytesFromEnd includes origOperandsLen, we need to subtract that out, which would then leave bytes from end of code to end of new instruction (no matter what the length of the new instruction will be)
if not -lRef in lDic: return -1
return bytesFromEnd-origOperandsLen-lDic[-lRef]
counts_to_del = set()
for count in xrange(len(src)-1,-1,-1):
i = src[count]
if type(i) in [bytes,unicode] and len(i)==1 and 71<=ord(i)<=73 and src[count+ord(i)-70+1]==('return',):
# (74 to 76 = 71 to 73 + return)
src[count] = B(chr(ord(i)+3))
counts_to_del.add(count+ord(i)-70+1)
compacted += 1 ; bytesFromEnd -= 1
compaction_types.add('return')
elif type(i)==tuple and type(i[0])==str:
opcode = i[0]
i = "-" # for len() at end of block
if opcode=='call' and src[count+2]==('return',):
src[count] = ('jump',)
counts_to_del.add(count+2)
compacted += 1 ; bytesFromEnd -= 1
compaction_types.add(opcode)
# can't fall through by setting opcode='jump', as the address will be in the function namespace (integer in tuple, LGet would need adjusting) and is highly unlikely to be within range (TODO: unless we try to arrange the functions to make it so for some cross-calls)
if opcode=='jump' and 0 <= LGet(src[count+1],addrSize) < 0x80: # we can use a 1-byte relative forward jump (up to 128 bytes), useful for 'break;' in a small switch
offset = LGet(src[count+1],addrSize)
if offset == 0:
# can remove this jump completely
i = "" # for len() at end of block
compacted += 1
counts_to_del.add(count) # zap jmp
else: src[count] = i = B(chr(0x80 | offset)) # new instr: 0x80|offset
counts_to_del.add(count+1) # zap the label
compacted += addrSize # as we're having a single byte instead of byte + address
bytesFromEnd -= addrSize
compaction_types.add(opcode)
elif opcode=='switchbyte':
numItems = len(src[count+2]) # = ord(src[count+1]) + 1
if 1 <= numItems <= 20:
numLabels = numItems+1 # there's an extra default label at the end
origOperandsLen = 1+numItems+numLabels*addrSize # number + N bytes + the labels
if LGet(src[count+3],origOperandsLen)==0 and all(0 <= LGet(src[count+N],origOperandsLen) <= 0xFF-js_6bit_offset for N in xrange(4,3+numLabels)): # 1st label is immediately after the switchbyte, and all others are in range
if javascript or dart: # use printable range
if js_6bit and numItems<=17 and all(0x80<=ord(x)<=0xBF or 0xD4<=ord(x)<=0xEF for x in S(src[count+2])): # if bytes being switched on are all from UTF-8 representations of U+0500 through U+FFFF, move to printable range (in one test this saved 780k for the continuation bytes and another 200k for the rest)
def mv(x):
if x>=0xD4: x -= 20 # or, equivalently, if (x-93)>118, which is done to the input byte in JS before searching on these
return B(chr(x-93))
src[count+2]=b''.join(mv(ord(x)) for x in S(src[count+2]))
i = B(chr(ord(src[count+1])+91)) # and a printable opcode
else: i = B(chr(ord(src[count+1])+108)) # can't make the match bytes printable, but at least we can have a printable opcode 108-127 for short switchbyte in Javascript or Dart
else: i = B(src[count+1]) # 0-19 for short switchbyte in C,Java,Python
src[count] = i = i+src[count+2]+b''.join(B(chr(LGet(src[count+N],origOperandsLen)+js_6bit_offset)) for N in xrange(4,3+numLabels)) # opcode_including_nItems, string of bytes, offsets (assume 1st offset at count+3 is 0 so not listed)
for ctd in xrange(count+1,count+3+numLabels): counts_to_del.add(ctd)
newOperandsLen = numItems*2 # for each byte, the byte itself and an offset, + 1 more offset as default, - 1 because first is not given
compacted += origOperandsLen-newOperandsLen
bytesFromEnd -= origOperandsLen # will add new opCode + operands below
compaction_types.add(opcode)
elif type(i) in [int,tuple]: # labels
if type(i)==int: i2 = i
else: i2 = i[0]
assert type(i2)==int
if i2 > 0:
lDic[i] = bytesFromEnd ; i = ""
if bytesFromEnd >> (aBits*addrSize+1): raise TooNarrow() # fair assumption (but do this every label, not every instruction)
else: i = "-"*addrSize # a reference
bytesFromEnd += len(i)
src=[s for s,i in zip(src,xrange(len(src))) if not i in counts_to_del] # batched up because del is O(n)
# End of compact_opcodes
lDic = {} # label dictionary: labelNo -> address
# Two passes over src: the first may meet references to labels not yet
# placed (a placeholder is written), the second must resolve them all
for P in [1,2]:
r = [B(chr(addrSize))] # List to hold the output bytecode, initialised with a byte indicating how long our addresses will be.
ll = 1 # cumulative length of output list, normally in bytes, but if js_utf8 then we count in Javascript (UCS-2) characters
count = 0 # reading through src opcodes etc
while count < len(src):
i = src[count] ; count += 1
if type(i)==tuple and type(i[0])==str: i = B(chr(BytecodeAssembler.opcodes[i[0]]))
elif type(i) in [int,tuple]: # labels
if type(i)==int: i2,iKey = i,-i # +ve integers are labels, -ve integers are references to them
else: i2,iKey = i[0],(-i[0],) # reserved labels (a different counter)
assert type(i2)==int
# At this point, if i2<0 then iKey will be the lDic key for looking up the label.
if i2 > 0: # label going in here: set lDic etc (without outputting any bytes of course)
if (ll >> (aBits*addrSize)): raise TooNarrow() # on the assumption that somebody will reference this label, figure out early that we need more bits
if i in lDic:
assert lDic[i] == ll, "%s moved %d->%d" % (repr(i),lDic[i],ll)
lDic[i] = ll ; i = ""
elif iKey in lDic: # known label
i = lDic[iKey] # the address to convert to MSB-LSB bytes and output:
shift = aBits*addrSize
if (i >> shift): raise TooNarrow()
j = []
for b in xrange(addrSize):
# MSB-LSB (easier to do in JS)
shift -= aBits
j.append(B(chr(((i>>shift)&aMask)+js_6bit_offset)))
i = b"".join(j)
assert len(i)==addrSize
else: # ref to as-yet unknown label
assert P==1, "undefined label %d" % -i
i = B("-"*addrSize) # placeholder (well we could just advance ll, but setting this makes things easier if you ever want to inspect partial results)
if len(i): # bytes or Unicode
r.append(i) ; ll += len(i)
sys.stderr.write(".") ; sys.stderr.flush()
if js_utf8: # normalise all before join
for i in xrange(len(r)):
if type(r[i])==bytes:
r[i]=unicode(r[i],'latin1')
r = u"".join(r)
else: r = b"".join(r)
if zlib:
self.origLen = ll # needed for efficient malloc in the C code later
oR,r = r,zlib.compress(r,9)
if compact_opcodes: sys.stderr.write("%d bytes (%s compressed from %d after opcode compaction saved %d on %s)\n" % (len(r),zlib_name,ll,compacted,','.join(sorted(list(compaction_types)))))
else: sys.stderr.write("%d bytes (%s compressed from %d)\n" % (len(r),zlib_name,ll))
elif compact_opcodes: sys.stderr.write("%d bytes (opcode compaction saved %d on %s)\n" % (ll,compacted,','.join(sorted(list(compaction_types)))))
else: sys.stderr.write("%d bytes\n" % ll)
return r
except TooNarrow: pass
assert 0, "can't even assemble it with 255-byte addressing !?!"
# Start js_start, the text placed in front of the generated Javascript.
# Outside browser-extension mode it opens with a usage comment whose
# wording follows the options in force (sharp_multi, glossfile, zlib, and
# the JS_OMIT_DOM environment variable); in browser-extension mode it
# starts empty.  Every piece appended here is a bytes literal, because
# js_start itself is bytes.
if not browser_extension:
    js_start = b'/* Javascript '+version_stamp+br"""
Usage:
- You could just include this code and then call the
annotate() function i.e. var result = annotate(input"""
    if sharp_multi: js_start += b", annotation_type_number"
    if glossfile: js_start += b", lines=2"
    js_start += b")" # bytes, not str: bytes += str fails on Python 3
    if not os.environ.get("JS_OMIT_DOM",""):
        js_start += br"""
or, if you're in a browser and have loaded a page,
annotate_page("""
        if sharp_multi:
            js_start += b"annotation_type_number"
            if glossfile: js_start += b","
        if glossfile: js_start += b"lines=2"
        js_start += br""")
(run annogen with JS_OMIT_DOM environment variable set
if you want to omit the annotate_page code)"""
    js_start += br"""
- Or you could use (and perhaps extend) the Annotator
object, and call its annotate() method. If you have
Backbone.JS, Annotator will instead be a generator
(extending Backbone.Model) which you will have to
instantiate yourself (possibly after extending it).
The Annotator object/class is also what will be
exported by this module if you're using Common.JS.
- On Unix systems with Node.JS, you can run this file in
"node" to annotate standard input as a simple test.
"""
    if zlib:
        js_start += br"""
zlib'd version uses Uint8Array so has minimum browser requirements
(Chrome 7, Ffx 4, IE10, Op11.6, Safari5.1, 4.2 on iOS)
- generate without --zlib to support older browsers.
*/"""
    else: js_start += b"*/"
    js_start += b"\n\n"
else: js_start = b"" # browser_extension

Silas S. Brown
committed
js_start += b"var Annotator={\n"
if not browser_extension:
js_start += b" version: '"+version_stamp+b"',\n"
if glossfile: js_start += b"numLines: 2 /* override to 1 or 3 if you must, but not recommended for learning */,\n"

Silas S. Brown
committed
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
# Append the opening of the Javascript annotate() method to js_start:
# the function header, the input conversion to UTF-8, the readAddr(),
# readRefStr() and s() helpers, and the start of readData() up to the
# js_6bit form of the short switchbyte.  The Javascript emitted varies
# with sharp_multi, removeSpace, glossfile, js_6bit, js_utf8 and zlib.
if sharp_multi: js_start += b"annotate: function(input,aType) { if(aType==undefined) aType=0;"
else: js_start += b"annotate: function(input) {"
if removeSpace: js_start += br" input=input.replace(/\B +\B/g,'');" # TODO: document that we do this (currently only in JS annotator here, and Android app via jsAnnot, although Web Adjuster does it separately in Python before calling the filter). It deals with software that adds ASCII spaces between Chinese characters of the same word, without deleting spaces between embedded English words (TODO: this 'JS + app' version may still delete spaces between punctuation characters, which may be an issue for consecutive quoted words e.g. 'so-called "word1" "word2"'). If doing it at the nextbyte level, we'd have to update prevbyte; if this or doing it at switchbyte level (e.g. recurse) we'd have to do something about the copy pointer (skip the spaces?) and the near-call distance (and associated buffer sizes in C) so they're best pre-removed, but only from between characters we annotate.
js_start += br"""
input = unescape(encodeURIComponent(input)); // to UTF-8
var data = this.data""" # TODO: if input is a whole html doc, insert css in head (e.g. from annoclip and/or adjuster), and hope there's no stuff that's not to be annotated (form fields etc). But really want them to be using browser_extension or annotate_page if doing this (TODO add css to annotate_page, already there in browser_extension)
if glossfile: js_start += b", numLines = this.numLines"
js_start += br""";
var addrLen = data.charCodeAt(0);
var dPtr, inputLength = input.length;
var p = 0; // read-ahead pointer
var copyP = 0; // copy pointer
var output = new Array(), needSpace = 0;
function readAddr() {
var i,addr=0;
for (i=addrLen; i; i--) addr=(addr << """
# Each address unit holds 6 bits (less the printable offset) or 8 bits
if js_6bit: js_start += b"6) | (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset))+b");"
else: js_start += b"8) | data.charCodeAt(dPtr++);"
js_start += br"""
return addr;
}
function readRefStr() {
var a = readAddr(); var l=data.charCodeAt(a);"""
if js_6bit and not js_utf8:
    js_start += br"""
if(l && l<123) a = data.slice(a+1,a+l-30);
else a = data.slice(a+1,data.indexOf(data.charAt(a),a+1));"""
elif zlib: js_start += br"""
if (l != 0) a = data.slice(a+1,a+l+1);
else a = data.slice(a+1,data.indexOf(0,a+1));"""
else: js_start += br"""
if (l != 0) a = data.slice(a+1,a+l+1);
else a = data.slice(a+1,data.indexOf('\x00',a+1));"""
if zlib: js_start += b"return String.fromCharCode.apply(null,a)" # gets UTF-8 from Uint8array
elif js_utf8: js_start += b"return unescape(encodeURIComponent(a))" # Unicode to UTF-8 (TODO: or keep as Unicode? but copyP things will be in UTF-8, as will the near tests)
elif js_6bit: js_start += b"return unescape(a)" # %-encoding
else: js_start += b"return a"
js_start += br"""}
function s() {
if (needSpace) output.push(" ");
else needSpace=1; // for after the word we're about to write (if no intervening bytes cause needSpace=0)
}
function readData() {
var sPos = new Array(), c;
while(1) {
c = data.charCodeAt(dPtr++);
if (c & 0x80) dPtr += (c&0x7F);"""
# The offset is wrapped in B() so that it is bytes like the literals around
# it (bytes + str fails on Python 3), as is already done for readAddr() above
if js_6bit: js_start += br"""
else if (c > 90) { c-=90;
var i=-1;if(p<input.length){var cc=input.charCodeAt(p++)-93; if(cc>118)cc-=20; i=data.slice(dPtr,dPtr+c).indexOf(String.fromCharCode(cc))}
if (i==-1) i = c;
if(i) dPtr += data.charCodeAt(dPtr+c+i-1)-"""+B(str(js_6bit_offset))+br""";
dPtr += c+c }"""

Silas S. Brown
committed
else if (c > 107) { c-=107;
var i = ((p>=input.length)?-1:data.slice(dPtr,dPtr+c).indexOf(input.charAt(p++)));
if (i==-1) i = c;
if(i) dPtr += data.charCodeAt(dPtr+c+i-1);
dPtr += c+c;
}"""
js_start += br""" else switch(c) {

Silas S. Brown
committed
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
case 50: dPtr = readAddr(); break;
case 51: {
var f = readAddr(); var dO=dPtr;
dPtr = f; readData() ; dPtr = dO;
break; }
case 52: return;
case 60: {
var nBytes = data.charCodeAt(dPtr++)+1;
var i = ((p>=input.length)?-1:data.slice(dPtr,dPtr+nBytes).indexOf(input.charAt(p++)));
if (i==-1) i = nBytes;
dPtr += (nBytes + i * addrLen);
dPtr = readAddr(); break; }
case 70: if(needSpace) { output.push(' '); needSpace=0; } break;
case 71: case 74: {
var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
var base = input.slice(copyP, copyP + numBytes);
output.push(base);
copyP += numBytes;
if(c==74) return; break; }
case 72: case 75: {
var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
var annot = readRefStr();
var base = input.slice(copyP, copyP + numBytes); copyP += numBytes;
s();"""
if glossfile: js_start += br"""
switch (numLines) {
case 1:
output.push("<ruby><rb>");
output.push(base);
output.push("</rb></ruby>");
break;
case 3:
output.push("<ruby><rt> </rt><rb>");
output.push(annot); output.push("</rb><rb>");
output.push(base);
output.push("</rb></ruby>");
break;
default:"""
js_start += br"""
output.push("<ruby><rb>");
output.push(base);
output.push("</rb><rt>");
output.push(annot);
output.push("</rt></ruby>")"""
if glossfile: js_start += b"}"
else: js_start += b";"
js_start += br"""
if(c==75) return; break; }"""
if glossfile: js_start += br"""
case 73: case 76: {
var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF;
var annot = readRefStr();
var title = readRefStr();
var base = input.slice(copyP, copyP + numBytes); copyP += numBytes;
s();
switch (numLines) {
case 1:
output.push("<ruby title=\"");
output.push(title);
output.push("\"><rb>");
output.push(base);
output.push("</rb>");
output.push("</ruby>");
break;
case 3:
output.push("<ruby title=\"");
output.push(title);
output.push("\"><rt>");
output.push(title.match(/[^/(;]*/)[0]);
output.push("</rt><rb>");
output.push(annot);
output.push("</rb><rb>");
output.push(base);
output.push("</rb></ruby>");
break;
default:
output.push("<ruby title=\"");
output.push(title);
output.push("\"><rb>");
output.push(base);
output.push("</rb><rt>");
output.push(annot);
output.push("</rt></ruby>") }
if(c==76) return; break; }"""
if not js_6bit: js_start = js_start.replace(b"(data.charCodeAt(dPtr++)-34)&0xFF",b"data.charCodeAt(dPtr++)")
js_start += br"""
case 80: sPos.push(p); break;
case 81: p=sPos.pop(); break;
case 90: {
var tPtr = readAddr();
var fPtr = readAddr();
var nearbytes = data.charCodeAt(dPtr++);
var o=p;
if (o > nearbytes) o -= nearbytes; else o = 0;
var max = p + nearbytes;
if (max > inputLength) max = inputLength;
var tStr = input.slice(o,max);
var found = 0;
while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; }
dPtr = found ? tPtr : fPtr; break;
}
default: throw("corrupt data table at "+(dPtr-1)+"/"+data.length+" ("+c+")");
}
}
}
while(p < inputLength) {
var oldPos=p;
dPtr=1;readData();
if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; }
}
return decodeURIComponent(escape(output.join("")))"""
if js_6bit: js_start = js_start.replace(b"var numBytes = data.charCodeAt(dPtr++);",b"var numBytes = (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset-1))+b")&0xFF;")
if sharp_multi: js_start += br""".replace(new RegExp("(</r[bt]><r[bt]>)"+"[^#]*#".repeat(aType)+"(.*?)(#.*?)?</r","g"),"$1$2</r")""" # normally <rt>, but this regexp will also work if someone changes the generated code to put annotation into second <rb> and title into <rt> as long as annotation is not given first. Cannot put [^#<] as there might be <sup> etc in the annotation, and .*?# still matches across ...</rb><rt>... :-(
js_start += br"""; // from UTF-8 back to Unicode

Silas S. Brown
committed
}"""
# Outside browser-extension mode, follow the annotate method with a comma.
# The original line was a bare bytes expression with no effect, so the
# comma was never appended to js_start.
if not browser_extension: js_start += b", // end of annotate method\n" # data: ... \n goes here

Silas S. Brown
committed
# Begin js_end, the text placed after the generated data: it closes the
# Annotator object literal and defines a global annotate() function that
# forwards to Annotator.annotate(), passing aType and/or setting numLines
# when sharp_multi and/or glossfile are in force.
js_end = b"".join([
    br"""};
function annotate(input""",
    b",aType" if sharp_multi else b"",
    b",numLines" if glossfile else b"",
    b") { ",
    b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; " if glossfile else b"",
    b"return Annotator.annotate(input",
    b",aType" if sharp_multi else b"",
    b")}",
])
if browser_extension:
js_end += b"""

Silas S. Brown
committed
if(localStorage.aType===undefined) localStorage.aType=0;
if(localStorage.numLines===undefined) localStorage.numLines=2;
var aType=localStorage.aType,numLines=localStorage.numLines;

Silas S. Brown
committed
function handleMessage(request, sender, sendResponse) {
if(typeof request=='number') {

Silas S. Brown
committed
if(request<0) localStorage.numLines=numLines=-request; else {localStorage.aType=aType=request;if(numLines==1)localStorage.numLines=numLines=2}

Silas S. Brown
committed
(chrome.tabs && chrome.tabs.query?chrome.tabs.query:browser.tabs.query)({},(T)=>{for (let t of T)(chrome.tabs && chrome.tabs.executeScript?chrome.tabs.executeScript:browser.tabs.executeScript)(t.id,{allFrames: true, code: 'for(let c of Array.prototype.slice.call(document.getElementsByClassName("_adjust0")))if(c.oldTxt)c.parentNode.replaceChild(document.createTextNode(c.oldTxt),c); annotWalk(document,document)'})})

Silas S. Brown
committed
} else if(typeof request=='boolean') sendResponse(request?(numLines==1?-1:aType):numLines); // popup status query

Silas S. Brown
committed
else { if(request==null) request=getClip();
sendResponse(numLines>1?annotate(request""" # (we DO need the extra call to annotWalk above: the MutationObserver will NOT pick up on changes we made from here)

Silas S. Brown
committed
if sharp_multi: js_end += b",aType"
if glossfile: js_end += b",numLines"
js_end += br"""):request)} }

Silas S. Brown
committed
function getClip(){var area=document.createElement("textarea"); document.body.appendChild(area); area.focus();area.value='';document.execCommand("Paste");var txt=area.value; document.body.removeChild(area); return txt?txt:"Failed to read clipboard"}

Silas S. Brown
committed
fetch(chrome.extension.getURL("annotate-dat.txt")).then((r)=>{r.text().then((r)=>{Annotator.data=r;chrome.runtime.onMessage.addListener(handleMessage)})})""" # if not js_utf8, having to encode latin1 as utf8 adds about 25% to the file size, but text() supports only utf8; could use arrayBuffer() instead, but inefficient to read w. DataView(buf,offset,1), or could reinstate zlib (probably using base64 read in from file: would probably need to include a versioned unzip library instead of inline-minified subset)

Silas S. Brown
committed
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
elif not os.environ.get("JS_OMIT_DOM",""):
js_end += br"""
function annotate_page("""
if sharp_multi:
js_end += b"aType"
if glossfile: js_end += b","
if glossfile: js_end += b"numLines"
js_end += b") { "
if glossfile: js_end += b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; "
js_end += jsAnnot(False) + br"""return annotWalk(document,document)
}"""
if not browser_extension:
    # Glue for non-extension builds: optional Backbone.Model wrapping (with an
    # annotate() whose signature depends on sharp_multi), a Node.js command-line
    # test that annotates standard input, and a CommonJS export.
    js_end += br"""
if (typeof Backbone != "undefined" && Backbone.Model) {
Annotator = Backbone.Model.extend(Annotator);""" + (br"""
annotate=function(input,aType) { return new Annotator().annotate(input,aType) }""" if sharp_multi else br"""
annotate=function(input) { return new Annotator().annotate(input) }""") + br"""
}
if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) {
// Node.js command-line test
fs=require('fs');
process.stdout.write(annotate(fs.readFileSync('/dev/stdin').toString()));
} else if (typeof module != "undefined" && module.exports) { // Common.js
module.exports = Annotator;
}
"""
if browser_extension:
# we can assume window.atob
js_inflate = br"""((dat,expandLen)=>{var buf=new Uint8Array(expandLen);dat=((r)=>{for(var e=new Uint8Array(r.length),t=0,n=e.length;t<n;t++)e[t]=r.charCodeAt(t);return e})(atob(dat));"""
else: js_inflate = br"""(function(dat,expandLen){
var buf=new Uint8Array(expandLen); dat=
"undefined"!=typeof window && window.atob ?
function(r){for(var e=new Uint8Array(r.length),t=0,n=e.length;t<n;t++)e[t]=r.charCodeAt(t);return e}(atob(dat))
:"undefined"!=typeof Buffer ? new Buffer(dat,"base64")
:function(r){var e,t,n={},f=65,a=0,o=0,i=new Uint8Array(r.length),d=0,l=String.fromCharCode,v=r.length;for(e="";f<91;)e+=l(f++);for(e+=e.toLowerCase()+"0123456789+/",f=0;f<64;f++)n[e.charAt(f)]=f;for(e=0;e<v;e++)for(a=(a<<6)+(f=n[r.charAt(e)]),o+=6;8<=o;)((t=a>>>(o-=8)&255)||e<v-2)&&(i[d++]=t);return i}(dat);"""
js_inflate += br"""
/* Inflate code taken from UZip.js (c) 2019 "Photopea" (MIT-licensed), cut down and JSCompress'd: */
function inflate(e,r){var t,n,E={iR:function(e,r){return E.F.inflate(e,r)},inflate:function(e,r){e[0],e[1];return E.iR(new Uint8Array(e.buffer,e.byteOffset+2,e.length-6),r)}};return E.F={},E.F.inflate=function(e,r){var t=Uint8Array;if(3==e[0]&&0==e[1])return r||new t(0);var n=E.F,f=n._F,i=n._E,o=n.dT,a=n.mC,u=n.cm,l=n.g7,d=n.U;for(var s,h,v,F,_,w,p=0,g=0,b=0,U=0,m=0;0==p;)if(p=f(e,m,1),s=f(e,m+1,2),m+=3,0!=s){if(1==s&&(_=d.flm,w=d.fm,g=511,b=31),2==s){h=i(e,m,5)+257,v=i(e,m+5,5)+1,F=i(e,m+10,4)+4;m+=14;for(var y=0;y<38;y+=2)d.it[y]=0,d.it[y+1]=0;for(var C=1,y=0;y<F;y++){var A=i(e,m+3*y,3);C<(d.it[1+(d.ordr[y]<<1)]=A)&&(C=A)}m+=3*F,a(d.it,C),u(d.it,C,d.im),_=d.lm,w=d.dm,m=o(d.im,(1<<C)-1,h+v,e,m,d.tt);var x=n.cO(d.tt,0,h,d.lt),g=(1<<x)-1,T=n.cO(d.tt,h,v,d.dt),b=(1<<T)-1;a(d.lt,x),u(d.lt,x,_),a(d.dt,T),u(d.dt,T,w)}for(;;){var k=_[l(e,m)&g];m+=15&k;k=k>>>4;if(k>>>8==0)r[U++]=k;else{if(256==k)break;var z=U+k-254;264<k&&(z=U+((M=d.ldef[k-257])>>>3)+i(e,m,7&M),m+=7&M);var M=w[l(e,m)&b];m+=15&M;var M=M>>>4,M=d.ddef[M],S=(M>>>4)+f(e,m,15&M);for(m+=15&M;U<z;)r[U]=r[U++-S],r[U]=r[U++-S],r[U]=r[U++-S],r[U]=r[U++-S];U=z}}}else{0!=(7&m)&&(m+=8-(7&m));x=4+(m>>>3),T=e[x-4]|e[x-3]<<8;r.set(new t(e.buffer,e.byteOffset+x,T),U),m=x+T<<3,U+=T}return r.length==U?r:r.slice(0,U)},E.F.dT=function(e,r,t,n,f,i){for(var o=E.F._E,a=E.F.g7,u=0;u<t;){var l=e[a(n,f)&r];f+=15&l;var d=l>>>4;if(d<=15)i[u]=d,u++;else{var c=0,l=0;16==d?(l=3+o(n,f,2),f+=2,c=i[u-1]):17==d?(l=3+o(n,f,3),f+=3):18==d&&(l=11+o(n,f,7),f+=7);for(var s=u+l;u<s;)i[u]=c,u++}}return f},E.F.cO=function(e,r,t,n){for(var f=0,i=0,o=n.length>>>1;i<t;){var a=e[i+r];n[i<<1]=0,f<(n[1+(i<<1)]=a)&&(f=a),i++}for(;i<o;)n[i<<1]=0,n[1+(i<<1)]=0,i++;return f},E.F.mC=function(e,r){for(var t,n,f,i=E.F.U,o=e.length,a=i.bl_count,u=0;u<=r;u++)a[u]=0;for(u=1;u<o;u+=2)a[e[u]]++;var l=i.next_code,d=0;for(a[0]=0,t=1;t<=r;t++)d=d+a[t-1]<<1,l[t]=d;for(n=0;n<o;n+=2)0!=(f=e[n+1])&&(e[n]=l[f],l[f]++)},E.F.cm=function(e,r,t){for(var 
n=e.length,f=E.F.U.r5,i=0;i<n;i+=2)if(0!=e[i+1])for(var o=i>>1,a=e[i+1],u=o<<4|a,a=r-a,l=e[i]<<a,d=l+(1<<a);l!=d;)t[f[l]>>>15-r]=u,l++},E.F.rC=function(e,r){for(var t=E.F.U.r5,n=15-r,f=0;f<e.length;f+=2){var i=e[f]<<r-e[f+1];e[f]=t[i]>>>n}},E.F._E=function(e,r,t){return(e[r>>>3]|e[1+(r>>>3)]<<8)>>>(7&r)&(1<<t)-1},E.F._F=function(e,r,t){return(e[r>>>3]|e[1+(r>>>3)]<<8|e[2+(r>>>3)]<<16)>>>(7&r)&(1<<t)-1},E.F.g7=function(e,r){return(e[r>>>3]|e[1+(r>>>3)]<<8|e[2+(r>>>3)]<<16)>>>(7&r)},E.F.U=(t=Uint16Array,n=Uint32Array,{next_code:new t(16),bl_count:new t(16),ordr:[16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15],of0:[3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258,999,999,999],exb:[0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0,0],ldef:new t(32),df0:[1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,65535,65535],dxb:[0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0],ddef:new n(32),flm:new t(512),flt:[],fm:new t(32),fdt:[],lm:new t(32768),lt:[],tt:[],dm:new t(32768),dt:[],im:new t(512),it:[],r5:new t(32768),lhst:new n(286),dhst:new n(30),ihst:new n(19),lits:new n(15e3),strt:new t(65536),prev:new t(32768)}),function(){for(var e=E.F.U,r=0;r<32768;r++){var t=r;t=(4278255360&(t=(4042322160&(t=(3435973836&(t=(2863311530&t)>>>1|(1431655765&t)<<1))>>>2|(858993459&t)<<2))>>>4|(252645135&t)<<4))>>>8|(16711935&t)<<8,e.r5[r]=(t>>>16|t<<16)>>>17}function n(e,r,t){for(;0!=r--;)e.push(0,t)}for(r=0;r<32;r++)e.ldef[r]=e.of0[r]<<3|e.exb[r],e.ddef[r]=e.df0[r]<<4|e.dxb[r];n(e.flt,144,8),n(e.flt,112,9),n(e.flt,24,7),n(e.flt,8,8),E.F.mC(e.flt,9),E.F.cm(e.flt,9,e.flm),E.F.rC(e.flt,9),n(e.fdt,32,5),E.F.mC(e.fdt,5),E.F.cm(e.fdt,5,e.fm),E.F.rC(e.fdt,5),n(e.it,19,0),n(e.lt,286,0),n(e.dt,30,0),n(e.tt,320,0)}(),E.inflate(e,r)}
return inflate(dat,buf) })
"""

Silas S. Brown
committed
extension_rubycss = b"span._adjust0 ruby{display:inline-table !important;vertical-align:bottom !important;-webkit-border-vertical-spacing:1px !important;padding-top:0.5ex !important;margin:0px !important;} span._adjust0 ruby *{display: inline !important;vertical-align:top !important;line-height:1.0 !important;text-indent:0 !important;text-align:center !important;white-space:nowrap !important;padding-left:0px !important;padding-right:0px !important;} span._adjust0 rb{display:table-row-group !important;font-size:100% !important; opacity: 1.0 !important;} span._adjust0 rt{display:table-header-group !important;font-size:100% !important;line-height:1.1 !important; opacity: 1.0 !important;font-family: FreeSerif, Lucida Sans Unicode, Times New Roman, serif !important;}"
extension_config=br"""<html><head><meta charset="utf-8">
<style>#cr{width:100%;border:thin dotted grey;max-width:15em;max-height:10em;overflow:auto} #cr:empty{padding:0.5ex}

Silas S. Brown
committed
button{background:#ededed;color:inherit}
"""+extension_rubycss.replace(b"span._adjust0 ",b"")+br"""</style>
</head><body>

Silas S. Brown
committed
<nobr><button id="-1">Off</button> <button id="-2">2-line</button>"""
# Button ids: negative = number of lines (if glossfile), 0 and up = annotNo (if sharp-multi).
# rangeStart / rangeEnd record the half-open range of ids that have buttons.
if glossfile:
    rangeStart = -3
    extension_config += b' <button id="-3">3-line</button>'
else:
    # without a 3-line mode the "2-line" button is relabelled as a plain On switch
    rangeStart = -2
    extension_config = extension_config.replace(b'2-line', b'On')
extension_config += b'</nobr>'
rangeEnd = 0
if sharp_multi and annotation_names and ',' in annotation_names:
    # one button per comma-separated annotation name, numbered from 0
    rangeEnd = annotation_names.count(',') + 1
    extension_config += b"".join(
        (b'<br><button id="%d">%s</button>' % (num, B(name)))
        for num, name in enumerate(annotation_names.split(',')))
extension_config += b'<div id="cr"></div><button id="c">Clipboard</button><script src="config.js"></script></body></html>'
# Don't want Clipboard button to auto-refresh (and hide the button) in the desktop extension version, since would need to stop the refresh when view is no longer visible + is it really a good idea to timer-paste the clipboard on a desktop when conversion to text could be costly etc + many desktops would dismiss the extension box before letting you switch to another window to change the clipboard (unless it's in a VM)
extension_confjs = br"""function updateClip() {

Silas S. Brown
committed
chrome.runtime.sendMessage(null,((cr)=>{
var v=document.getElementById("cr");
v.textContent = ''; // clear
if(cr) {
try {
for(const t of new DOMParser().parseFromString('<span> '+cr+' </span>','text/html').body.firstChild.childNodes) v.appendChild(t.cloneNode(true));
var a=v.getElementsByTagName('ruby'),i; for(i=0; i < a.length; i++) if(a[i].title) ((e)=>{e.addEventListener('click',(()=>{alert(e.title)}))})(a[i])
} catch(err) { console.log(err.message) }
}
}))}
function update() {
chrome.runtime.sendMessage(false,function(r) {var i;for(i=%d;i;i++){var e=document.getElementById(""+i);if(i==-r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})""" % rangeStart

Silas S. Brown
committed
if rangeEnd: extension_confjs += br""";
chrome.runtime.sendMessage(true,function(r) {for(var i=0;i<%d;i++){var e=document.getElementById(""+i);if(i==r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})""" % rangeEnd
extension_confjs += b';\nif(document.getElementById("cr").firstChild) updateClip()\n'

Silas S. Brown
committed
# Close update(), call it once, then attach a click listener to every numbered
# button (each sends its own id as the message and refreshes via update) and to
# the Clipboard button.
extension_confjs += (
    b"} update();\n"
    + b';'.join(
        (b'document.getElementById("%d").addEventListener("click",function(){chrome.runtime.sendMessage(%d,update)})' % (n, n))
        for n in xrange(rangeStart, rangeEnd))
    + b';document.getElementById("c").addEventListener("click",updateClip)')

Silas S. Brown
committed
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
dart_src = br"""
/* Usage
-----
If this file is saved as annotator.dart,
you can import 'annotator.dart';
and then call the annotate() function."""
if dart_datafile: dart_src += br"""
E.g. String result = await annotate(...);
(make your function async.) Will read """+B(dart_datafile)
dart_src += br"""
*/
import 'dart:convert';"""
if zlib: dart_src += b"import 'dart:io';"
dart_src += br"""
class _Annotator {
static const version="""+b'"'+version_stamp+br"""";
int numLines = 2; // override to 1 or 3 if you must, but not recommended for learning"""
if dart_datafile: dart_src+=b"\n static String data=null;"
else: dart_src+=b"\n static final String data=%%DATA_INIT%%;"
dart_src += br"""
int addrLen=data.codeUnitAt(0),dPtr;
bool needSpace; StringBuffer output;
int p, copyP; List<int> inBytes; int inputLength;
String annotate(String input"""
if sharp_multi: dart_src += br""",[int aType=0]"""
# Body of the generated Dart _Annotator.annotate(): runs _readData() from data offset 1
# at each input position and copies one byte through unchanged whenever that
# consumed nothing.  p, copyP and every inputLength comparison index the UTF-8
# byte list inBytes, so inputLength must be the byte count: input.length counts
# UTF-16 code units and would cut non-ASCII input short.
dart_src += br""") {
inBytes=utf8.encode(input); dPtr=0;
inputLength=inBytes.length;
p=0; copyP=0;
output = StringBuffer(); needSpace = false;
while(p < inputLength) {
int oldPos=p;
dPtr=1;_readData();
if (oldPos==p) { needSpace=false; output.write(String.fromCharCode(inBytes[p++])); copyP++; }
}
return Utf8Decoder().convert(output.toString().codeUnits)"""
if sharp_multi: dart_src += br""".replaceAllMapped(new RegExp("(</r[bt]><r[bt]>)"+"[^#]*#"*aType+"(.*?)(#.*?)?</r"),(Match m)=>"${m[1]}${m[2]}</r")"""
dart_src += br""";
}
int _readAddr() { int addr=0; for (int i=addrLen; i>0; i--) addr=(addr << 8) | data.codeUnitAt(dPtr++); return addr; }
String _readRefStr() {
int a=_readAddr();
int l=data.codeUnitAt(a);
String r;
if (l != 0) r=data.substring(a+1,a+l+1);
else r=data.substring(a+1,data.indexOf("\u0000",a+1));"""
if js_utf8: dart_src += br"""
return String.fromCharCodes(Utf8Encoder().convert(r));"""
else: dart_src += b"return r;"
dart_src += br"""
}
void _s() {
if(needSpace) output.write(" ");
else needSpace=true; // for after the word we're about to write (if no intervening bytes cause needSpace=false)
}
void _readData() {
List<int> sPos=List<int>();
while(true) {
int c=data.codeUnitAt(dPtr++);
if ((c & 0x80)!=0) dPtr += (c&0x7F); // short jump
else if (c > 107) { // short switchbyte
c-=107;
var i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+c).indexOf(String.fromCharCode(inBytes[p++])));
if (i==-1) i = c;
if(i>0) dPtr += data.codeUnitAt(dPtr+c+i-1);
dPtr += c+c;
} else switch(c) {
case 50: dPtr = _readAddr(); break;
case 51: {
int f = _readAddr(); int dO=dPtr;
dPtr = f; _readData() ; dPtr = dO;
break; }
case 52: return;
case 60: {
int nBytes = data.codeUnitAt(dPtr++)+1;
int i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+nBytes).indexOf(String.fromCharCode(inBytes[p++])));
if (i==-1) i = nBytes;
dPtr += (nBytes + i * addrLen);
dPtr = _readAddr(); break; }
case 70: if(needSpace) { output.write(" "); needSpace=false; } break;
case 71: case 74: {
int numBytes = data.codeUnitAt(dPtr++);
output.write(String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)));
copyP += numBytes; if(c==74) return; break; }
case 72: case 75: {
int numBytes = data.codeUnitAt(dPtr++);
String annot = _readRefStr();
String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes;
_s();
switch (numLines) {
case 1:
output.write("<ruby><rb>");
output.write(base);
output.write("</rb></ruby>");
break;
case 3:
output.write("<ruby><rt> </rt><rb>");
output.write(annot);
output.write("</rb><rb>");
output.write(base);
output.write("</rb></ruby>");
break;
default:
output.write("<ruby><rb>");
output.write(base);
output.write("</rb><rt>");
output.write(annot);
output.write("</rt></ruby>");
} if(c==75) return; break; }
case 73: case 76: {
int numBytes = data.codeUnitAt(dPtr++);
String annot = _readRefStr();
String title = _readRefStr();
String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes;
_s();
switch (numLines) {
case 1:
output.write("<ruby title=\"");
output.write(title);
output.write("\"><rb>");
output.write(base);
output.write("</rb></ruby>");
break;
case 3:
output.write("<ruby title=\"");
output.write(title);
output.write("\"><rt>");
output.write(RegExp("[^/(;]*").matchAsPrefix(title).group(0));
output.write("</rt><rb>");
output.write(annot);
output.write("</rb><rb>");
output.write(base);
output.write("</rb></ruby>");
break;
default:
output.write("<ruby title=\"");
output.write(title);
output.write("\"><rb>");
output.write(base);
output.write("</rb><rt>");
output.write(annot);
output.write("</rt></ruby>");
} if(c==76) return; break; }
case 80: sPos.add(p); break;
case 81: p=sPos.removeLast(); break;
case 90: {
int tPtr = _readAddr();
int fPtr = _readAddr();
int nearbytes = data.codeUnitAt(dPtr++);
int o=p;
if (o > nearbytes) o -= nearbytes; else o = 0;
var max = p + nearbytes;
if (max > inputLength) max = inputLength;
String tStr = String.fromCharCodes(inBytes.sublist(o,max));
bool found = false;
while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(_readRefStr()) != -1) { found = true; break; }
dPtr = found ? tPtr : fPtr; break;
}
default: throw("corrupt data table at ${dPtr-1}/${data.length} (${c})");
}
}
}
}
"""
# Top-level Dart annotate() wrapper around _Annotator.  With dart_datafile it is
# async, returns a Future and awaits %%DATA_INIT%% on first use; the aType
# parameter is present only with sharp_multi.
dart_src += (b"Future<String> annotate(String s,[" if dart_datafile
             else b"String annotate(String s,[")
dart_src += (b"int aType=0," if sharp_multi else b"") + b"int numLines=2]) "
dart_src += (b"async " if dart_datafile else b"") + b"{ "
if dart_datafile:
    dart_src += b"if(_Annotator.data==null) _Annotator.data=await %%DATA_INIT%%;"
dart_src += b"var a=_Annotator(); a.numLines=numLines; return a.annotate(s" + (b",aType" if sharp_multi else b"") + b"); }\n"
if zlib:
    # every data-initialiser placeholder gets wrapped in a zlib decode
    dart_src = dart_src.replace(b"%%DATA_INIT%%", b"String.fromCharCodes(zlib.decoder.convert(%%DATA_INIT%%))")
# Leading comment block for the generated Python module (presumably written before
# the data and py_end): a version line followed by usage notes.
py_start = b'# Python '+version_stamp+br"""
# You can import this module and call annotate(utf8 bytes)
# (from multiple threads if desired),
# or you can run from the command line on standard input.
# annotate has an optional second argument, which can be
# 'ruby' (default), 'raw' (annotation only) or 'braces'.
# This module is compatible with both Python 2.7 and Python 3.
"""
py_end = br"""
class Annotator:
version="""+b'"'+version_stamp+br""""
def __call__(self,inStr,aFormat):
if aFormat=="ruby": self.startA,self.midA,self.endA = b"<ruby><rb>",b"</rb><rt>",b"</rt></ruby>"
elif aFormat=="raw": self.startA=self.midA=self.endA = b""
elif aFormat=="braces": self.startA,self.midA,self.endA = b"{",b"|",b"}"
else: raise Exception("Unrecognised annotation format "+repr(aFormat))
assert type(inStr)==bytes
self.inStr = inStr
self.addrLen = ord(data[:1])
self.inputLength = len(inStr)
self.p = 0 # read-ahead pointer
self.copyP = 0 # copy pointer
self.output = []
self.needSpace = 0
while self.p < self.inputLength:
oldPos = self.p
self.dPtr = 1 ; self.readData()
if oldPos == self.p:
self.needSpace=0
self.output.append(inStr[self.p:self.p+1])
self.p += 1 ; self.copyP += 1
return b"".join(self.output)
def readAddr(self):
addr = 0
for i in range(self.addrLen):
addr=(addr << 8) | ord(data[self.dPtr:self.dPtr+1])
self.dPtr += 1
return addr
def readRefStr(self):
a = self.readAddr(); l=ord(data[a:a+1])
if l: return data[a+1:a+l+1]
else: return data[a+1:data.index(b'\x00',a+1)]
def s(self):
if self.needSpace: self.output.append(b" ")
else: self.needSpace=1
def readData(self):
sPos = [] ; out = self.output
while True:
d = ord(data[self.dPtr:self.dPtr+1]) ; self.dPtr += 1
if d==50: self.dPtr = self.readAddr()
elif d==51:
func = self.readAddr() ; dO = self.dPtr
self.dPtr = func ; self.readData() ; self.dPtr = dO
elif d==52: return
elif d==60 or d<20:
if d<20: nBytes=d+1
else:
nBytes = ord(data[self.dPtr:self.dPtr+1])+1
self.dPtr += 1
if self.p>=len(self.inStr): i = -1
else: i = data[self.dPtr:self.dPtr+nBytes].find(self.inStr[self.p:self.p+1]) ; self.p += 1
if i==-1: i = nBytes
if d<20:
if i>0: self.dPtr += ord(data[self.dPtr+nBytes+i-1:self.dPtr+nBytes+i])
self.dPtr += nBytes * 2
else:
self.dPtr += (nBytes + i * self.addrLen)
self.dPtr = self.readAddr()
elif d==70:
if self.needSpace:
out.append(b' ') ; self.needSpace=0
elif d==71 or d==74:
numBytes = ord(data[self.dPtr:self.dPtr+1])
self.dPtr += 1
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
if d==74: return
elif d==72 or d==75:
numBytes = ord(data[self.dPtr:self.dPtr+1])
self.dPtr += 1
annot = self.readRefStr()
self.s()
if self.startA:
out.append(self.startA)
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
out.append(self.midA) ; out.append(annot)
out.append(self.endA)
if d==75: return
elif d==73 or d==76:
numBytes = ord(data[self.dPtr:self.dPtr+1])
self.dPtr += 1
annot = self.readRefStr()
title = self.readRefStr()
self.s()
if self.startA==b"{": # omit title in braces mode
out.append(self.startA)
out.append(self.inStr[self.copyP:self.copyP+numBytes])
elif self.startA:
out.append(b"<ruby title=\"");out.append(title)
out.append(b"\"><rb>");
out.append(self.inStr[self.copyP:self.copyP+numBytes])
self.copyP += numBytes
out.append(self.midA) ; out.append(annot)
out.append(self.endA)
if d==76: return
elif d==80: sPos.append(self.p)
elif d==81: self.p = sPos.pop()
elif d==90:
tPtr = self.readAddr()
fPtr = self.readAddr()
nearbytes = ord(data[self.dPtr:self.dPtr+1])
self.dPtr += 1
o = max(self.p-nearbytes,0)
maxx = min(self.p+nearbytes,self.inputLength)
tStr = self.inStr[o:maxx]
found = False
while self.dPtr < tPtr and self.dPtr < fPtr:
if self.readRefStr() in tStr:
found = True ; break
if found: self.dPtr = tPtr
else: self.dPtr = fPtr
elif d>0x80: self.dPtr += d-0x80
else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1:self.dPtr]))+")")
def annotate(inStr,p="ruby"): return Annotator()(inStr,p)
def main():