    except: pass
    # Due to the way we handle overlaps, it's better to process the shortest phrases first, as the longer phrases will yield more rule options and are therefore more likely to be able to work around any "no-overlap" constraints imposed by already-processed examples.  Something like:
    p2 = []
    for p in splitWords(corpus_unistr,phrases=True):
      p2.append((min([len(p.split(markupStart)),len(p.split(markupMid)),len(p.split(markupEnd))]),len(p2),p)) # no need for splitWords(phrase) just to get len, but we do need the min-of-3 for robustness against the occasional markup error
    p2.sort() # by length, then by original position (note: if removing this sort, remove wordsThisPhrase from status_update)
    phrases = [] ; wordLen = None
    for p in p2:
      if not wordLen == p[0]:
        wordLen = p[0]
        phrases.append(wordLen-1) # because it's a .split length (really want an actual count, but it only has to be roughly right in this instance and splitLen-1 will do for speed)
      phrases.append(p[-1])
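    # phrases now interleaves int markers with the phrases themselves, roughly
    #   [1, u"word", ..., 2, u"two-word phrase", ...]
    # where each int is the approximate word count of the phrases after it;
    # analyse() switches wordLen whenever it reaches one of these markers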
    _gp_cache = phrases ; return phrases

def setup_other_globals():
    global corpus_markedDown, bigramCache
    corpus_markedDown = markDown(corpus_unistr)
    if not ybytes: return
    bigramCache=dict((k,[]) for k in set(corpus_markedDown[i:i+2] for i in xrange(len(corpus_markedDown)-1)))
    for i in xrange(len(corpus_markedDown)-1):
      k=corpus_markedDown[i:i+2]
      if k in bigramCache:
        bigramCache[k].append(i)
        if len(bigramCache[k]) > 100: del bigramCache[k]
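    # bigramCache now maps each bigram of corpus_markedDown to a list of its
    # offsets, with bigrams of over 100 occurrences dropped (presumably too
    # common to usefully narrow down a search)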
    global markedUp_unichars
    if yarowsky_all: markedUp_unichars = None
    else: markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int)))
def check_globals_are_set_up(): # for use during parallelism
  global corpus_unistr
  try: corpus_unistr # if we fork()d, we may already have it
  except NameError:
    normalise() # should get corpus_unistr from checkpoint,
    try: corpus_unistr # unless we're NOT normalising,
    except: corpus_unistr = openfile(infile).read().decode(incode) # in which case we have to load the corpus from scratch (it won't be stdin)
    generate_map() # similarly this should just be a read
    setup_other_globals() # might do a bit more work, but probably faster than copying if we're not on the same machine

def analyse():
    accum = RulesAccumulator()
    covered = 0 # number of phrases we managed to 'cover' with our rules
    toCover = 0 # number of phrases we TRIED to cover (==covered if 100%)
    phraseNo = 0 ; wordLen = None
    if checkpoint:
      try: phraseNo,wordLen,covered,toCover,accum.__dict__ = read_checkpoint()
      except: pass
    phraseLastUpdate = phraseLastCheckpoint = phraseNo
    lastUpdate = lastCheckpoint = startTime = time.time()
    backgrounded = [] ; phrases = get_phrases()
    while phraseNo < len(phrases):
        if type(phrases[phraseNo])==int:
          wordLen = phrases[phraseNo]
          for b in backgrounded: # flush (TODO: duplicate code)
            coveredA,toCoverA = getNext(b)
            covered += coveredA ; toCover += toCoverA
          backgrounded = []
          phraseNo += 1 ; continue
        if toCover:
          if checkpoint and (checkpoint_exit(0) or (checkpoint_period and time.time() >= lastCheckpoint + checkpoint_period)):
            sys.stderr.write("Checkpointing..."+clear_eol)
            sys.stderr.flush()
            for b in backgrounded: # flush (TODO: duplicate code)
              coveredA,toCoverA = getNext(b)
              covered += coveredA ; toCover += toCoverA
            backgrounded = []
            write_checkpoint((phraseNo,wordLen,covered,toCover,accum.__dict__))
            lastCheckpoint = time.time() ; phraseLastCheckpoint = phraseNo
        if time.time() >= lastUpdate + 2:
          if toCover: cov=int(100.0*covered/toCover)
          else: cov = 0
          status_update(phraseNo,len(phrases),wordLen,len(accum.rules),phraseLastUpdate,lastUpdate,phraseLastCheckpoint,lastCheckpoint,cov,len(accum.rejectedRules),startTime)
          lastUpdate = time.time() ; phraseLastUpdate = phraseNo
        aRules = accum.addRulesForPhrase(phrases[phraseNo],wordLen==1) # TODO: we're saying canBackground only if wordLen==1 because longer phrases can be backgrounded only if they're guaranteed not to have mutual effects; do we want to look into when we can do that?  (and update the help text for --single-core if changing)
        arr = getNext(aRules)
        if arr=="backgrounded": backgrounded.append(aRules)
        else:
          coveredA,toCoverA = arr
          covered += coveredA ; toCover += toCoverA
        phraseNo += 1
    if backgrounded:
      sys.stderr.write("Collecting backgrounded results... "+clear_eol) ; sys.stderr.flush()
      for b in backgrounded: getNext(b)
      del backgrounded
      sys.stderr.write("done\n")
    if rulesFile: accum.save()
    if diagnose_manual: test_manual_rules()
    return sorted(accum.rulesAndConds()) # sorting it makes the order stable across Python implementations and insertion histories: useful for diff when using concurrency etc (can affect order of otherwise-equal Yarowsky-like comparisons in the generated code)

def read_manual_rules():
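  # yields each line of the --manualrules file as a rule: a phrase in the same
  # markup as the corpus (e.g. annotated text in the input's ruby-style markup)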
  if not manualrules: return
  for l in openfile(manualrules):
    if not l.strip(): continue
    l=l.decode(incode).strip() # TODO: manualrulescode ?
    if removeSpace: l=re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),(markupEnd+markupStart).replace('\\',r'\\'),l,flags=re.UNICODE)
    yield l

def test_manual_rules():
    for l in read_manual_rules():
      words = list(splitWords(l))
      # Prevent KeyError in getOkStarts:
      for w in words:
        if w not in precalc_sets: precalc_sets[w]=set()
      # Call test_rule:
      yb = []
      if not getNext(test_rule(l,yb)) or len(yb):
        getBuf(sys.stderr).write(("\nWARNING: Manual rule '%s' may contradict the examples. " % l).encode(terminal_charset))
        if len(words)==1:
          global diagnose,diagnose_limit,ybytes
          od,odl,oy,diagnose,diagnose_limit,ybytes = diagnose,diagnose_limit,ybytes,markDown(l),0,ybytes_max
          getNext(test_rule(l,[]))
          diagnose,diagnose_limit,ybytes = od,odl,oy

def java_escape(unistr):
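  # a quick illustration: java_escape(u'say "\u4e2d"') == br'say \"\u4e2d\"'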
  ret = []
  for c in unistr:
    if c=='"': ret.append(br'\"')
    elif c=='\\': ret.append(br'\\')
    elif ord(' ') <= ord(c) <= 127: ret.append(B(c))
    elif c=='\n': ret.append(br'\n')
    else: ret.append(br'\u%04x' % ord(c))
  return b''.join(ret)

def golang_escape(unistr):
  return unistr.replace('\\','\\\\').replace('"','\\"').replace('\n',r'\n').encode(outcode)

def c_escape(unistr):
    # returns unistr encoded as outcode and escaped so can be put in C in "..."s
    return zapTrigraphs(unistr.encode(outcode).replace(b'\\',b'\\\\').replace(b'"',b'\\"').replace(b'\n',b'\\n').replace(b'\r',b'\\r')) # TODO: \r shouldn't occur, error if it does?
def zapTrigraphs(x): return re.sub(br"\?\?([=/'()<>!-])",br'?""?\1',x) # to get rid of trigraph warnings (TODO: might get a marginal efficiency increase if we did this to the entire C file at once instead)
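# (e.g. zapTrigraphs(b'??=') == b'?""?=' : splitting the string literal stops
# the C preprocessor from seeing a ??= trigraph, and the adjacent literals are
# concatenated back together at compile time)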

def c_escapeRawBytes(s): # as it won't be valid outcode; don't want to crash any editors/viewers of the C file
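  # (the re.sub below inserts "" after any \xNN escape that is followed by a
  # further hex digit, e.g. \x01A -> \x01""A, because C's \x escapes would
  # otherwise greedily swallow the extra digits)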
  if s.endswith(b'\x00'): s=s[:-1] # as the C compiler will add a terminating 0 anyway
  return re.sub(br"(?<!\\)((?:\\\\)*\\x..)([0-9a-fA-F])",br'\1""\2',zapTrigraphs(s.replace(b'\\',b'\\\\').decode('unicode_escape').encode('unicode_escape').replace(b'"',b'\\"')))

def js_escapeRawBytes(s):
  assert not zlib # js_utf8 etc not relevant if base64
  if js_utf8: # then treat the data as Unicode, so typeof(s)==typeof(u"")
    s = s.decode('utf-8')
    s = s.replace("\\",r"\\").replace('"',r'\"').replace(chr(8),r"\b").replace(chr(9),r"\t").replace(chr(10),r"\n").replace(chr(12),r"\f").replace(chr(13),r"\r")
    if ignore_ie8: s = s.replace(chr(11),r"\v")
    if js_octal: s = re.sub("[\x00-\x1f](?![0-9])",lambda m:r"\%o"%ord(m.group()),s)
    else: s = re.sub(chr(0)+r"(?![0-9])",r"\\0",s) # \0 is allowed even if not js_octal (and we need \\ because we're in a regexp replacement)
    return re.sub(b"[\x00-\x1f\x7f]",lambda m:br"\x%02x"%ord(m.group()),s.encode('utf-8'))
  # otherwise typeof(s)==typeof(b"")
  s = s.replace(b"\\",br"\\").replace(b'"',br'\"').replace(B(chr(8)),br"\b").replace(B(chr(9)),br"\t").replace(B(chr(10)),br"\n").replace(B(chr(12)),br"\f").replace(B(chr(13)),br"\r")
  if ignore_ie8: s = s.replace(B(chr(11)),br"\v")
  if js_octal: s = re.sub(b"[\x00-\x1f](?![0-9])",lambda m:br"\%o"%ord(m.group()),s)
  else: s = re.sub(b'\x00'+br"(?![0-9])",br"\\0",s) # \0 is allowed even if not js_octal (and we need \\ because we're in a regexp replacement)
  return re.sub(b"[\x00-\x1f\x7f-\xff]",lambda m:br"\x%02x"%ord(m.group()),s)

def txt_escapeRawBytes(s): # for browser_extension
  if js_utf8: return s.encode('utf-8')
  else: return s.decode('latin1').encode('utf-8')

def dart_escapeRawBytes(s):
  if js_utf8: return re.sub(b"[\x00-\x1f\"\\\\$\x7f]",lambda m:br"\u{%x}"%ord(m.group()),s.encode('utf-8'))
  else: return re.sub(b"[\x00-\x1f\"\\\\$\x7f-\xff]",lambda m:br"\u{%x}"%ord(m.group()),s)

def c_length(unistr): return len(unistr.encode(outcode))

if java or c_sharp or golang:
  if golang: outLang_escape = golang_escape
  else: outLang_escape = java_escape
  if java: outLang_bool = b"boolean"
  else: outLang_bool = b"bool"
  outLang_true = b"true"
  outLang_false = b"false"
else:
  outLang_escape = c_escape
  outLang_bool = b"int"
  outLang_true = b"1"
  outLang_false = b"0"

def allVars(u):
  global cjk_cLookup
  try: cjk_cLookup
  except NameError:
    sys.stderr.write("(checking CJK closures for missing glosses)\n")
    global stderr_newline ; stderr_newline = True
    from cjklib.characterlookup import CharacterLookup
    cjk_cLookup = CharacterLookup("C") # param doesn't matter for getCharacterVariants, so just put "C" for now
    cjk_cLookup.varCache = {} # because getCharacterVariants can be slow if it uses SQL queries
  def lookupVar(u,t):
    if (u,t) not in cjk_cLookup.varCache: cjk_cLookup.varCache[(u,t)] = cjk_cLookup.getCharacterVariants(u,t)
    return cjk_cLookup.varCache[(u,t)]
  done = set([u])
  for t in "STCMZ":
    for var in lookupVar(u,t):
      if not var in done: yield var
      done.add(var)
      # In at least some versions of the data, U+63B3 needs to go via T (U+64C4) and M (U+865C) and S to get to U+864F (instead of having a direct M variant to 864F), so we need to take (S/T)/M/(S/T) variants also:
      if t in "ST":
        for var in lookupVar(var,'M'):
          if var in done: continue
          yield var ; done.add(var)
          for t2 in "ST":
            for var in lookupVar(var,t2):
              if var in done: continue
              yield var ; done.add(var)

def allVarsW(unistr):
  vRest = []
  for i in xrange(len(unistr)):
    got_vRest = False
    for v in allVars(unistr[i]):
      yield unistr[:i]+v+unistr[i+1:]
      if got_vRest:
        for vr in vRest: yield unistr[:i]+v+vr
      else:
        vRest = [] ; got_vRest = True
        for vr in allVarsW(unistr[i+1:]):
          yield unistr[:i]+v+vr ; vRest.append(vr)

def matchingAction(rule,glossDic,glossMiss,glosslist,omitlist):
  # called by addRule in outputParser, returns (actionList, did-we-actually-annotate).  Also applies reannotator and compression (both of which will require 2 passes if present)
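  # (each item of the returned actionList is a (bytesToCopy,annotation[,gloss])
  # tuple when data_driven, otherwise an o()/o2() statement in the output
  # language; see the end of the loop below)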
  action = []
  gotAnnot = False
  for w in splitWords(rule):
    wStart = w.index(markupStart)+len(markupStart)
    wEnd = w.index(markupMid,wStart)
    text_unistr = w[wStart:wEnd]
    mStart = wEnd+len(markupMid)
    annotation_unistr = w[mStart:w.index(markupEnd,mStart)]
    if mreverse: text_unistr,annotation_unistr = annotation_unistr,text_unistr
    if glosslist and not text_unistr in glosslist:
      return text_unistr+" not glosslisted",None
    elif text_unistr in omitlist:
      return text_unistr+" omitlisted",None
    gloss = glossDic.get((text_unistr,annotation_unistr),glossDic.get(text_unistr,None))
    if gloss_closure and not gloss and not w in glossMiss:
      for t2 in allVarsW(text_unistr):
        gloss = glossDic.get((t2,annotation_unistr),glossDic.get(t2,None))
        if gloss:
          glossDic[text_unistr] = gloss ; break
    if gloss: gloss = gloss.replace('&','&amp;').replace('"','&quot;').replace('\n','&#10;') # because it'll be in a title= attribute
    if reannotator:
      if reannotator.startswith('##'): toAdd = text_unistr + '#' + annotation_unistr
      elif reannotator[0]=='#': toAdd=annotation_unistr
      else: toAdd = text_unistr
      if toAdd in reannotateDict:
        au = reannotateDict[toAdd]
        if au and reannotate_caps and annotation_unistr and not annotation_unistr[0]==annotation_unistr[0].lower():
          if sharp_multi: au='#'.join((w[0].upper()+w[1:]) for w in au.split('#'))
          else: au=au[0].upper()+au[1:]
        annotation_unistr = au
      else: toReannotateSet.add(toAdd)
    if compress:
      annotation_bytes0=annotation_unistr.encode(outcode)
      annotation_bytes = squash(annotation_bytes0)
      if gloss:
        gloss_bytes0 = gloss.encode(outcode)
        gloss_bytes = squash(gloss_bytes0)
      else: gloss_bytes0 = gloss_bytes = None
      if not data_driven:
        if annotation_bytes == annotation_bytes0: annotation_bytes = outLang_escape(annotation_unistr) # (if compress didn't do anything, might as well write a readable string to the C)
        else: annotation_bytes = c_escapeRawBytes(annotation_bytes)
        if gloss and gloss_bytes == gloss_bytes0: gloss_bytes = outLang_escape(gloss)
        elif gloss_bytes: gloss_bytes = c_escapeRawBytes(gloss_bytes)
    elif data_driven: # data-driven w. no compression:
      annotation_bytes = annotation_unistr.encode(outcode)
      if gloss: gloss_bytes = gloss.encode(outcode)
      else: gloss_bytes = None
    else: # non data-driven, no compression:
      annotation_bytes = outLang_escape(annotation_unistr)
      if gloss: gloss_bytes = outLang_escape(gloss)
      else: gloss_bytes = None
    if java: adot = b"a." # not used if data_driven
    else: adot = b""
    bytesToCopy = c_length(text_unistr)
    if gloss:
        if data_driven: action.append((bytesToCopy,annotation_bytes,gloss_bytes))
        else: action.append(adot+b'o2(%d,"%s","%s");' % (bytesToCopy,annotation_bytes,gloss_bytes))
    else:
        glossMiss.add(w)
        if data_driven: action.append((bytesToCopy,annotation_bytes))
        else: action.append(adot+b'o(%d,"%s");' % (bytesToCopy,annotation_bytes))
    if annotation_unistr or gloss: gotAnnot = True
  return action,gotAnnot

def readGlossfile():
    glossDic = {} ; glossMiss = set() ; glosslist = set()
    if glossfile:
        for l in openfile(glossfile):
            if not l.strip(): continue
            l=l.decode(incode,errors='replace') # TODO: glosscode ? (errors=replace because we said it's not necessary to completely debug glossfile; we don't want this to be brought down by one bad UTF8 sequence or whatever)
            try: word,annot,gloss = l.split("\t",2)
            except: # not enough tabs
              word = l.split("\t",1)[0] ; annot = gloss = ""
              if glossmiss_omit: pass # they can list words without glosses; no error if missing \t
              else: getBuf(sys.stderr).write(("Gloss: Ignoring incorrectly-formatted line "+l.strip()+"\n").encode(terminal_charset))
            word,annot,gloss = word.strip(),annot.strip(),gloss.strip().replace("\t","\n")
            if glossmiss_omit and word: glosslist.add(word)
            if not word or not gloss: continue
            if annot: glossDic[(word,annot)] = gloss
            else: glossDic[word] = gloss
    return glossDic,glossMiss,glosslist

def copyBytes(n,checkNeedspace=False): # needSpace unchanged for ignoreNewlines etc; checkNeedspace for open quotes
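    # (data-driven: returns a small action list in which a bare (n,) tuple
    # means "copy n bytes" and b's0' means "do the needSpace check first";
    # otherwise returns the equivalent statement in the output language)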
    if checkNeedspace:
      if data_driven: return [b's0',(n,)] # copyBytes(n)
      elif java: return br"a.s0(); a.c(%d);" % n
      else: return br"s0(); c(%d);" % n
    if data_driven: return [(n,)] # copyBytes(n)
    elif java: return br"a.c(%d);" % n
    else: return br"c(%d);" % n

def outputParser(rulesAndConds):
    glossDic, glossMiss, glosslist = readGlossfile()
    if words_omit:
      omitlist=set(w.strip() for w in openfile(words_omit).read().decode(incode).split('\n')) # TODO: glosscode?
      if diagnose and diagnose in omitlist: diagnose_write(diagnose+" is in words_omit file")
    else: omitlist = []
    sys.stderr.write("Generating byte cases...\n")
    byteSeq_to_action_dict = {}
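    # byteSeq_to_action_dict maps each rule's marked-down byte sequence (in
    # outcode) to a list of (action,conds) alternatives to be compiled in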
    if ignoreNewlines: # \n shouldn't affect needSpace
      byteSeq_to_action_dict[b'\n'] = [(copyBytes(1),[])]
    for closeQuote in u'\u2019\u201d\u300b\u300d)\u3015\uff09\u3017\u3011]\uff3d':
      # close quotes should not affect needSpace
      try: closeQuote = closeQuote.encode(outcode)
      except: continue # can't do this one
      byteSeq_to_action_dict[closeQuote] = [(copyBytes(len(closeQuote)),[])]
    for openQuote in u'\u2018\u201c\u300a\u300c(\u3014\uff08\u3016\u3010[\uff3b':
      # open quotes should activate needSpace first
      try: openQuote = openQuote.encode(outcode)
      except: continue # can't do this one
      byteSeq_to_action_dict[openQuote] = [(copyBytes(len(openQuote),checkNeedspace=True),[])]
    def addRule(rule,conds,byteSeq_to_action_dict,manualOverride=False):
      md = md2 = markDown(rule)
      if post_normalise:
        md2 = post_normalise_translate(md)
        byteSeq = md2.encode(outcode)
        if type(conds)==tuple: conds=(conds[0],map(post_normalise_translate,conds[1]),conds[2])
        else: conds=map(post_normalise_translate,conds)
      else: byteSeq = md.encode(outcode)
      action,gotAnnot = matchingAction(rule,glossDic,glossMiss,glosslist,omitlist)
      if not gotAnnot: return # not glosslisted, or some spurious o("{","") rule that got in due to markup corruption
      if manualOverride or not byteSeq in byteSeq_to_action_dict:
        byteSeq_to_action_dict[byteSeq] = []
      elif post_normalise:
        if (action,conds) in byteSeq_to_action_dict[byteSeq]: return # exact duplicate after post-normalisation
        elif any((x[0]==action or x[1]==conds) for x in byteSeq_to_action_dict[byteSeq]): # near-duplicate: same conds, different action (will definitely need to prioritise one, can't do both), or same action, different conds (will probably need to prioritise one, especially if one of the conds of the non-normalised action is IN the normalised action, which could short-circuit the conds)
          if md==md2: # this is the rule that DIDN'T have to be post-normalised, so its action should take priority
            byteSeq_to_action_dict[byteSeq] = [x for x in byteSeq_to_action_dict[byteSeq] if not x[1]==conds]
          else: return # other one probably has priority
      if not data_driven: action = b' '.join(action)
      byteSeq_to_action_dict[byteSeq].append((action,conds))
    def dryRun(clearReannotator=True): # to prime the reannotator or compressor
      global toReannotateSet, reannotateDict
      toReannotateSet = set()
      if clearReannotator: reannotateDict = {} # (not if we've run the reannotator and are just doing it for the compressor)
      dummyDict = {}
      for rule,conds in rulesAndConds: addRule(rule,conds,dummyDict)
      for l in read_manual_rules(): addRule(l,[],dummyDict)
    if reannotator:
      global stderr_newline ; stderr_newline = False
      sys.stderr.write("Reannotating... ")
      sys.stderr.flush()
      dryRun()
      # Setting buffer size is not enough on all systems.
      # To ensure the pipe does not fill its output while
      # we are still writing its input, we use threads and
      # don't start writing its input until we've already
      # started reading from its output.
      global toReannotateSet, reannotateDict
      l = [ll for ll in toReannotateSet if ll and not "\n" in ll] # TODO: handle the case where "\n" is in ll?  (shouldn't happen in 'sensible' annotators)
      def reader_thread(comms):
        comms[0] = True
        comms[1] = cout.read().decode(outcode).splitlines() # TODO: reannotatorCode instead of outcode?
      if reannotator.startswith('##'): cmd=reannotator[2:]
      elif reannotator[0]=='#': cmd=reannotator[1:]
      else: cmd = reannotator
      import thread ; sys.setcheckinterval(100)
      sp=subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,close_fds=True)
      global cout ; cin,cout = sp.stdin,sp.stdout
      comms = [False,False]
      thread.start_new_thread(reader_thread,(comms,))
      while comms[0] == False: time.sleep(0.1)
      # NOW ready to start writing:
      cin.write("\n".join(l).encode(outcode)+b"\n") ; cin.close() # TODO: reannotatorCode instead of outcode?
      while comms[1] == False: time.sleep(1)
      l2 = comms[1]
      sys.setcheckinterval(32767)
      del cin,cout,cmd,comms,sp
      while len(l2)>len(l) and not l2[-1]: del l2[-1] # don't mind extra blank line(s) at end of output
      if not len(l)==len(l2):
        open('reannotator-debug-in.txt','wb').write(os.linesep.join(l).encode(outcode)+B(os.linesep))
        open('reannotator-debug-out.txt','wb').write(os.linesep.join(l2).encode(outcode)+B(os.linesep))
        errExit("Reannotator command didn't output the same number of lines as we gave it (gave %d, got %d).  Input and output have been written to reannotator-debug-in.txt and reannotator-debug-out.txt for inspection.  Bailing out." % (len(l),len(l2)))
      if stderr_newline: sys.stderr.write("Reannotated %d items\n" % len(l))
      else: sys.stderr.write("(%d items)\n" % len(l))
      toReannotateSet = set() ; reannotateDict = dict(zip(l,l2)) ; del l,l2
    if compress:
      global squashStrings ; squashStrings = set() # discard any that were made in any reannotator dry-run
      dryRun(False) # redo with the new annotation strings (or do for the first time if no reannotator)
      pairs = squashFinish()
    else: pairs = b""
    for rule,conds in rulesAndConds: addRule(rule,conds,byteSeq_to_action_dict)
    for l in read_manual_rules(): addRule(l,[],byteSeq_to_action_dict,True)
    write_glossMiss(glossMiss)
    longest_rule_len = max(len(b) for b in iterkeys(byteSeq_to_action_dict))
    longest_rule_len += ybytes_max # because buffer len is 2*longest_rule_len, we shift half of it when (readPtr-bufStart +ybytes >= bufLen) and we don't want this shift to happen when writePtr-bufStart = Half_Bufsize-1 and readPtr = writePtr + Half_Bufsize-1 (TODO: could we get away with max(0,ybytes_max-1) instead? but check how this interacts with the line below; things should be safe as they are now).  This line's correction was missing in Annogen v0.599 and below, which could therefore occasionally emit code that, when running from stdin, occasionally replaced one of the document's bytes with an undefined byte (usually 0) while emitting correct annotation for the original byte.  (This could result in bad UTF-8 that crashed the bookmarklet feature of Web Adjuster v0.21 and below.)
    longest_rule_len = max(ybytes_max*2, longest_rule_len) # make sure the half-bufsize is at least ybytes_max*2, so that a read-ahead when pos is ybytes_max from the end, resulting in a shift back to the 1st half of the buffer, will still leave ybytes_max from the beginning, so yar() can look ybytes_max-wide in both directions
    if data_driven:
      b = BytecodeAssembler()
      b.addActionDictSwitch(byteSeq_to_action_dict,False)
      ddrivn = b.link()
      if zlib: origLen = b.origLen
      del b
    else: ddrivn = None
    if javascript:
      if zlib:
        import base64
        return outfile.write(re.sub(br"data\.charCodeAt\(([^)]*)\)",br"data[\1]",js_start).replace(b"indexOf(input.charAt",b"indexOf(input.charCodeAt")+b"data: "+js_inflate+b"(\""+base64.b64encode(ddrivn)+b"\","+B(str(origLen))+b")\n"+js_end+b"\n")
      elif browser_extension: return outfile.write(txt_escapeRawBytes(ddrivn))
      else: return outfile.write(js_start+b"data: \""+js_escapeRawBytes(ddrivn)+b"\",\n"+js_end+b"\n") # not Uint8Array (even if browser compatibility is known): besides taking more source space, it's typically ~25% slower to load than string, even from RAM
    elif dart:
      if dart_datafile:
        if os.sep in c_filename: d=c_filename[:c_filename.rindex(os.sep)]+os.sep
        else: d = ""
        if os.sep in dart_datafile: d += dart_datafile[dart_datafile.rindex(os.sep)+1:]
        else: d += dart_datafile
        open(d,'wb').write(ddrivn)
        sys.stderr.write("Wrote "+d+" (ensure this ships as "+dart_datafile+")\n")
      if dart_datafile and zlib: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"await(File('"+B(dart_datafile)+b"').readAsBytes())"))
      elif zlib: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"\""+dart_escapeRawBytes(ddrivn)+b"\".codeUnits"))
      elif dart_datafile: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"String.fromCharCodes(await(File('"+B(dart_datafile)+b"').readAsBytes()))"))
      else: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"\""+B(dart_escapeRawBytes(ddrivn))+b"\""))
    elif python:
      dd2 = repr(ddrivn)
      if not dd2.startswith('b'): dd2='b'+dd2 # (if we're generating in Python 2, we still want 2+3 compatibility)
      outfile.write(py_start+b"\ndata="+B(dd2)+b"\n")
      if zlib: outfile.write(b"import zlib; data=zlib.decompress(data)\n")
      return outfile.write(py_end+b"\n")
    elif java:
      start = java_src.replace(b"%%JPACKAGE%%",B(jPackage))
      if data_driven:
        a = android_loadData.replace(b"%%DLEN%%",B(str(len(ddrivn))))
        if zlib: a = a.replace(b"%%ULEN%%",B(str(origLen)))
        start = start.replace(b"() { %%JDATA%%",b"(android.content.Context context) throws java.io.IOException { "+a) # Annotator c'tor needs a context argument if it's data-driven, to load annotate.dat
        if zlib: start = start.replace(b"context) throws java.io.IOException {",b"context) throws java.io.IOException,java.util.zip.DataFormatException {")
      else: start = start.replace(b"%%JDATA%%",b"")
    elif c_sharp: start = cSharp_start
    elif golang: start = golang_start
    else: start = c_start
    outfile.write(start.replace(b'%%LONGEST_RULE_LEN%%',B(str(longest_rule_len))).replace(b"%%YBYTES%%",B(str(ybytes_max))).replace(b"%%PAIRS%%",pairs)+b"\n")
    if data_driven:
      if zlib: dataName = "origData"
      else: dataName = "data"
      if java: open(jSrc+"/../assets/annotate.dat","wb").write(ddrivn)
      else:
        outfile.write(b"static unsigned char "+dataName+b"[]=\""+c_escapeRawBytes(ddrivn)+b'\";\n')
        if zlib: outfile.write(c_zlib.replace(b'%%ORIGLEN%%',B(str(origLen))).replace(b'%%ZLIBLEN%%',B(str(len(ddrivn))))+b"\n") # rather than using sizeof() because we might or might not want to include the compiler's terminating nul byte
        outfile.write(c_datadrive+b"\n")
      del ddrivn
    else: # not data_driven
      subFuncL = []
      ret = stringSwitch(byteSeq_to_action_dict,subFuncL)
      if java:
        for f in subFuncL: open(java+os.sep+S(f[f.index(b"class ")+6:].split(None,1)[0])+".java","wb").write(f)
        open(java+os.sep+"topLevelMatch.java","wb").write(b"\n".join(ret))
      elif golang: outfile.write(b"\n".join(subFuncL + ret).replace(b';\n',b'\n')+b"\n") # (this 'elif' line is not really necessary but it might save someone getting worried about too many semicolons)
      else: outfile.write(b"\n".join(subFuncL+ret)+b"\n")
      del subFuncL,ret
    if android:
      open(java+os.sep+"MainActivity.java","wb").write(android_src.replace(b"%%JPACKAGE%%",B(jPackage)).replace(b'%%ANDROID-URL%%',B(android)))
      open(java+os.sep+"BringToFront.java","wb").write(android_bringToFront.replace(b"%%JPACKAGE%%",B(jPackage)))
      open(jSrc+"/../assets/clipboard.html",'wb').write(android_clipboard)
      if android_template:
        aStamp = android_version_stamp
        try: versionName = re.findall(B(re.escape("versionName")+r'\s*=\s*"([^"]*)"'),open(jSrc+"/../AndroidManifest.xml",'rb').read())[0]
        except: versionName = None
        if versionName: aStamp = aStamp.replace(b"%%DATE%% version",b"%%DATE%% version "+versionName)
        aTemp = android_template.replace(b"</body",aStamp.replace(b"%%DATE%%",b"%d-%02d-%02d" % time.localtime()[:3]).replace(b"%%TIME%%",b"%d:%02d" % time.localtime()[3:5])+b"</body")
        open(jSrc+"/../assets/index.html",'wb').write(aTemp)
      update_android_manifest()
      open(jSrc+"/../res/layout/activity_main.xml","wb").write(android_layout)
      open(jSrc+"/../res/menu/main.xml","wb").write(b'<menu xmlns:android="http://schemas.android.com/apk/res/android" ></menu>\n') # TODO: is this file even needed at all?
      open(jSrc+"/../res/values/dimens.xml","wb").write(b'<resources><dimen name="activity_horizontal_margin">16dp</dimen><dimen name="activity_vertical_margin">16dp</dimen></resources>\n')
      open(jSrc+"/../res/values/styles.xml","wb").write(b'<resources><style name="AppBaseTheme" parent="android:Theme.Light"></style><style name="AppTheme" parent="AppBaseTheme"><item name="android:forceDarkAllowed">true</item></style></resources>\n')
      open(jSrc+"/../res/values/strings.xml","wb").write(B('<?xml version="1.0" encoding="utf-8"?>\n<resources><string name="app_name">'+app_name.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')+'</string></resources>\n'))
      open(jSrc+"/../res/xml/network_security_config.xml","wb").write(b'<?xml version="1.0" encoding="utf-8"?>\n<network-security-config><base-config cleartextTrafficPermitted="true" /></network-security-config>\n')
    elif c_sharp: outfile.write(cSharp_end)
    elif golang: outfile.write(golang_end)
    elif not java: outfile.write(c_end)
    outfile.write(b"\n")
    del byteSeq_to_action_dict
    if no_summary or not rulesAndConds: return
    if reannotator:
        outfile.write(b"\n/* Tab-delimited rules summary not yet implemented with reannotator option */\n")
        return
    outfile.write(b"\n/* Tab-delimited summary of the rules: (total %d)\n" % len(rulesAndConds))
    outputRulesSummary(rulesAndConds)
    outfile.write(b"*/\n")

def update_android_manifest():
  try: manifest = old_manifest = open(jSrc+"/../AndroidManifest.xml",'rb').read()
  except IOError: manifest,old_manifest = android_manifest,None
  def readAttr(aName):
    allVals = re.findall(B(re.escape(aName)+r'\s*=\s*"([^"]*)"'),manifest)
    assert len(allVals)==1, "AndroidManifest.xml has %d instances of %s, should be 1" % (len(allVals),aName)
    return allVals[0]
  versionCode,versionName = readAttr("android:versionCode"),readAttr("android:versionName")
  if b"android:sharedUserId" in manifest: sharedUID = readAttr("android:sharedUserId")
  else: sharedUID = b""
  if android_upload:
    sys.stderr.write("AndroidManifest.xml: bumping versionCode for upload\n (assuming you've taken care of versionName separately, if needed)\n") # (might not be needed if the previous upload wasn't actually released for example)
    versionCode = B(str(int(versionCode)+1))
  def pathQ(x):
    x = urlparse.urlparse(x)
    if x.query: x=x.path+"?"+x.query
    else: x=x.path
    if ".*" in x: return B('android:pathPattern="%s"' % (x,))
    else: return B('android:pathPrefix="%s"' % (x,))
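  # e.g. pathQ("http://example.org/app?x=1") == b'android:pathPrefix="/app?x=1"'
  # (illustrative URL; pathPattern is used instead whenever the path has ".*")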
  manifest = android_manifest.replace(b'%%JPACKAGE%%',B(jPackage)).replace(b'android:versionCode="1"',b'android:versionCode="'+versionCode+b'"').replace(b'android:versionName="1.0"',b'android:versionName="'+versionName+b'"').replace(b'android:sharedUserId=""',b'android:sharedUserId="'+sharedUID+b'"').replace(b'android:sharedUserId="" ',b'') + b''.join((b'\n<intent-filter><action android:name="android.intent.action.VIEW" /><category android:name="android.intent.category.DEFAULT" /><category android:name="android.intent.category.BROWSABLE" /><data android:scheme="%s" android:host="%s" %s /></intent-filter>'%(B(urlparse.urlparse(x).scheme),B(urlparse.urlparse(x).netloc),B(pathQ(x)))) for x in android_urls.split()) + b"\n</activity></application></manifest>\n"
  if not manifest==old_manifest:
    open(jSrc+"/../AndroidManifest.xml","wb").write(manifest)
  else: assert not android_upload, "Couldn't bump version code in "+repr(manifest)

def setup_browser_extension():
  dirToUse = browser_extension.replace(' ','')
  sys.stderr.write("Writing to "+dirToUse+"\n")
  try: os.mkdir(dirToUse)
  except: pass
  def icons(key,sizes):
    if any(os.path.isfile(dirToUse+os.sep+s+".png") for s in sizes):
      return b',"'+B(key)+'":{'+b",".join(('"%s":"%s.png"' % (s,s)) for s in sizes if os.path.isfile(dirToUse+os.sep+s+".png"))+b"}"
    else: return b""
  try: # increment existing version if present
    versionName = re.search(b'"version": *"([^"]*)"',open(dirToUse+"/manifest.json","rb").read()).group(1)
    versionName = versionName.split(b'.')
    versionName[-1] = B(str(int(versionName[-1])+1))
    versionName = b'.'.join(versionName)
  except: versionName = b"0.1"
  open(dirToUse+"/manifest.json","wb").write(br"""{
  "manifest_version": 2,
  "name": "%s",
  "background": { "scripts": ["background.js"] },
  "content_scripts": [{"matches": ["<all_urls>"], "js": ["content.js"], "css": ["ruby.css"]}],
  "browser_action":{"default_title":"Annotate","default_popup":"config.html","browser_style": true%s},
  "permissions": ["<all_urls>","clipboardRead"]%s}""" % (B(browser_extension),versionName,icons("default_icon",["16","32"]),icons("icons",["16","32","48","96"])))
  open(dirToUse+"/background.js","wb").write(js_start+js_end)
  open(dirToUse+"/content.js","wb").write(jsAnnot(False,True))
  open(dirToUse+"/config.html","wb").write(extension_config)
  open(dirToUse+"/config.js","wb").write(extension_confjs)
  open(dirToUse+"/ruby.css","wb").write(extension_rubycss)
  c_filename = dirToUse+"/annotate-dat.txt"

def write_glossMiss(glossMiss):
  if not glossmiss: return
  sys.stderr.write("Writing glossmiss (norefs=%s) to %s...\n" % (repr(norefs),glossmiss))
  gm = openfile(glossmiss,'w')
  count = 1 ; t = time.time() ; prndProg=False
  for w in sorted(list(glossMiss)):
    try: num = str(len(getOkStarts(w)))+'\t'
    except: num = '?\t' # num occurrences in e.g.s
    a,b,r = markDown(w),annotationOnly(w),refs(w,True)
    if a and b and not r=="\t": gm.write((num+a+"\t"+b+r+os.linesep).encode(incode)) # TODO: glosscode ? glossMissCode ??
    if time.time() >= t + 2:
      sys.stderr.write(("(%d of %d)" % (count,len(glossMiss)))+clear_eol) ; sys.stderr.flush()
      t = time.time() ; prndProg = True
    count += 1
  if prndProg: sys.stderr.write("done"+clear_eol+"\n")

if norefs:
  def refs(*args): return ""
else:
  def refs(rule,omit=False):
    try: okStarts = getOkStarts(rule)
    except: return "" # KeyError can happen in some incremental-run glossMiss situations: just omit that reference in the debug file
    return starts2refs(okStarts,omit)

def starts2refs(startSet,omit=False):
    # assumes generate_map() has been called
    global refMap
    try: refMap
    except:
      refMap = [(m.end(),m.group(1)) for m in re.finditer(re.escape(reference_sep)+"(.*?)"+re.escape(ref_name_end), corpus_unistr, flags=re.DOTALL)]
      i = 0
      while True:
        if i+1 >= len(refMap): break
        if refMap[i][1] == refMap[i+1][1]: del refMap[i+1]
        else: i += 1
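      # refMap is now an ordered list of (offset just past each reference
      # marker, reference name) with consecutive duplicate names removed;
      # the binary search below maps annotation offsets onto these entries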
    rmPos = 0 ; ret = []
    while len(ret) < maxrefs and rmPos < len(refMap):
      s = refMap[rmPos][0] ; i = -1
      while i < s and startSet:
        i = min(startSet) ; startSet.remove(i)
        i = m2c_map[i]
      if i < s: break
      rmE = len(refMap)-1
      while refMap[rmE][0] > i:
        mid = int((rmPos+rmE)/2)
        if mid==rmPos or refMap[mid][0] > i: rmE = mid
        else: rmPos = mid
      rmPos = rmE
      app=refMap[rmPos][1]
      if not app in ret: ret.append(app)
      rmPos += 1
    if not ret: return ""
    elif not omit: return "\t"+"; ".join(ret)
    else: return "\t"+"; ".join(r for r in ret if not r in glossmiss_hide and (not glossmiss_match or re.match(glossmiss_match,r))) # (if all in omit, still return the \t to indicate we did find some)

def outputRulesSummary(rulesAndConds):
    # (called "summary" because we don't here specify which part
    # of the annotation goes with which part of the text, plus
    # we remove /* and */ so it can be placed into a C comment)
    sys.stderr.write("Writing rules summary...\n")
    if summary_omit: omit=set(openfile(summary_omit).read().splitlines())
    else: omit=[]
    count = 1 ; t = time.time()
    # If incremental or manualrules, some rules might now have been overridden by newer ones.  Rules listed later take priority in byteSeq_to_action_dict.  This should remove earlier duplicate (markedDown,conds) combinations from the summary:
    d = {}
    for r,c in rulesAndConds:
      d[(markDown(r),repr(c))] = (r,c)
    # Now sort so diff is possible between 2 summaries
    # (case-insensitive because capitalisation may change)
    d = sorted(((annotationOnly(r),markDown(r),r,c) for r,c in d.values()),key=lambda x:(x[0].lower(),)+x[1:])
    # Can now do the summary:
    for annot,orig,rule,conditions in d:
        if time.time() >= t + 2:
          sys.stderr.write(("(%d of %d)" % (count,len(rulesAndConds)))+clear_eol) ; sys.stderr.flush()
          t = time.time()
        count += 1
        def code(x):
          if not x.strip(): return repr(x)
          else: return x.encode(outcode).replace(b'\n',br'\n').replace(b'\t',br'\t')
        toPrn = code(orig)+b"\t"+code(annot)
        if ybytes:
            toPrn += b"\t"
            if conditions:
                if type(conditions)==tuple:
                  negate,conds,nbytes = conditions[:3]
                  if negate: negate=b" not"
                  else: negate=b""
                  toPrn += b"if"+negate+b" within "+B(str(nbytes))+b" bytes of "+b" or ".join(code(c) for c in conds)
                else: toPrn += b"if near "+b" or ".join(code(c) for c in conditions)
        if not toPrn in omit: outfile.write((toPrn+refs(rule).encode(outcode)).replace(b'/*',b'').replace(b'*/',b'')+b"\n")
    if ybytes: extraTab='\t'
    else: extraTab = ''
    for l in read_manual_rules(): outfile.write((markDown(l)+'\t'+annotationOnly(l)+extraTab+'\t--manualrules '+manualrules).encode(outcode)+b"\n")
    sys.stderr.write("done"+clear_eol+"\n")

if isatty(sys.stdout):
    if summary_only:
        warn("Rules summary will be written to STANDARD OUTPUT\nYou might want to redirect it to a file or a pager such as 'less'")
        c_filename = None
    elif not java and main and not priority_list and not normalise_only and not browser_extension: sys.stderr.write("Will write to "+c_filename+"\n") # will open it later (avoid having a 0-length file sitting around during the analyse() run so you don't rm it by mistake)

def openfile(fname,mode='r'):
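    # (transparently handles .gz/.bz2/.xz files and http/https/ftp URLs, e.g.
    # openfile("corpus.txt.gz"); compressed URLs are decompressed in memory)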
    lzma = bz2 = None
    mode += 'b' # Python 2+3 compatibility: always binary
    if fname.endswith(".xz"): import lzma # 'pip install lzma' or 'apt-get install python2.7-lzma' may be required for .xz files
    elif fname.endswith(".bz2"): import bz2
    if re.match("https?://",fname) or fname.startswith("ftp://"):
        assert mode=='rb', "cannot write to "+fname
        try: from urllib2 import urlopen # Python 2
        except: from urllib.request import urlopen # Py3
        sys.stderr.write("Fetching "+fname+"\n")
        fileobj = urlopen(fname)
        # If it's bz2 or xz, we'd better decompress in one operation.  (gz library can stream)
        if fname.endswith(".bz2"):
            from cStringIO import StringIO
            return StringIO(bz2.decompress(fileobj.read()))
        elif fname.endswith(".xz"):
            from cStringIO import StringIO
            return StringIO(lzma.decompress(fileobj.read()))
    elif fname.endswith(".bz2"):
        return bz2.BZ2File(fname,mode)
    elif fname.endswith(".xz"):
        return lzma.LZMAFile(fname,mode)
    else: fileobj = open(fname,mode)
    # if get this far, we can use fileobj
    if fname.endswith(".gz"):
        import gzip ; return gzip.GzipFile(fileobj=fileobj,mode=mode)
    else: return fileobj
def open_try_bz2(fname,mode='r'): # use .bz2 iff available (for checkpoints)
  try: return openfile(fname+".bz2",mode)
  except: return openfile(fname,mode)
def rm_f(fname):
  try: os.remove(fname)
  except OSError: pass

import atexit
def set_title(t):
  if not isatty(sys.stderr): return
  if t: atexit.register(set_title,"")
  is_screen = (term=="screen" and os.environ.get("STY",""))
  is_tmux = (term=="screen" and os.environ.get("TMUX",""))
  if is_xterm or is_tmux: sys.stderr.write("\033]0;%s\007" % (t,)) # ("0;" sets both title and minimised title, "1;" sets minimised title, "2;" sets title.  Tmux takes its pane title from title (but doesn't display it in the titlebar))
  elif is_screen: os.system("screen -X title \"%s\"" % (t,))
def diagnose_write(s): getBuf(sys.stderr).write(bold_on+"Diagnose: "+bold_off+s.encode(terminal_charset,'replace')+clear_eol+'\n')
try: screenWidth = int(os.environ['COLUMNS'])
except:
  import struct, fcntl, termios
  try: screenWidth = struct.unpack('hh',fcntl.ioctl(sys.stderr,termios.TIOCGWINSZ,'xxxx'))[1]
  except: screenWidth = 45 # conservative

if main and not compile_only:
 set_title("annogen")
 if checkpoint:
  try: os.mkdir(checkpoint)
  except: pass
 if no_input:
   rulesAndConds = RulesAccumulator().rulesAndConds() # should load rulesFile
 if read_input:
  if infile: infile=openfile(infile)
  else:
    infile = sys.stdin
    if isatty(infile): sys.stderr.write("Reading from standard input\n(If that's not what you wanted, press Ctrl-C and run again with --help)\n")
  corpus_unistr = getBuf(infile).read().decode(incode)
  if diagnose and not diagnose in corpus_unistr:
    diagnose_write(diagnose+" is not present in the corpus, even before normalisation")
    suppress = True
  else: suppress = False
  loaded_from_checkpoint = normalise()
  if diagnose and not suppress and not diagnose in corpus_unistr:
    diagnose_write(diagnose+" was in the corpus before normalisation, but not after")
    if loaded_from_checkpoint: diagnose_write("You might want to remove "+checkpoint+os.sep+'normalised* and redo the diagnose')
  if normalise_only: sys.exit()
  if priority_list:
    if os.path.exists(priority_list):
      sys.stderr.write("Reading "+priority_list+"\n")
      def getFreq(line):
        word,freq = line.decode(outcode).rstrip().rsplit(None,1)
        try: return word,int(freq)
        except: return word,float(freq)
      existingFreqs=dict(getFreq(l) for l in openfile(priority_list) if len(l.strip().split())>=2)
    else: existingFreqs = {}
    sys.stderr.write("Parsing...") ; sys.stderr.flush()
    i=[[markDown(w) for w in splitWords(phrase)] for phrase in splitWords(corpus_unistr,phrases=True)]
    del corpus_unistr
    sys.stderr.write(" calling PairPriorities...\n")
    out="".join(w+"\t"+str(f)+os.linesep for w,f in PairPriorities(i,existingFreqs) if f).encode(outcode)
    # (don't open the output before here, in case exception)
    if existingFreqs: sys.stderr.write("Updating "+priority_list+"...")
    else: sys.stderr.write("Writing "+priority_list+"...")
    sys.stderr.flush()
    openfile(priority_list,'w').write(out)
    sys.stderr.write(" done\n")
    sys.exit()
  generate_map() ; setup_other_globals()
  if not no_input:
    executor = setup_parallelism()
    if executor and capitalisation and annot_whitespace and infile==sys.stdin: open_try_bz2(checkpoint+os.sep+'normalised','w').write(corpus_unistr.encode('utf-8')) # normalise won't have done it and the other nodes will need it (TODO: unless we're doing concurrent.futures with fork)
    try: rulesAndConds = analyse()
    finally: sys.stderr.write("\n") # so status line is not overwritten by 1st part of traceback on interrupt etc
  del _gp_cache

def cmd_or_exit(cmd):
  sys.stderr.write(cmd+"\n")
  r = os.system(cmd)
  if not r: return
  if r&0xFF == 0: r >>= 8 # POSIX
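  # (os.system gives a POSIX wait status: when the low "signal" byte is 0, the
  # high byte holds the real exit code, hence the shift before sys.exit)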
  sys.exit(r)

if main and not compile_only:
 if browser_extension: setup_browser_extension()
 if c_filename: outfile = openfile(c_filename,'w')
 else: outfile = getBuf(sys.stdout)
 if summary_only: outputRulesSummary(rulesAndConds)
 else: outputParser(rulesAndConds)
 del rulesAndConds
 outfile.close() ; sys.stderr.write("Output complete\n")
if main:
 if android:
   can_compile_android = all(x in os.environ for x in ["SDK","PLATFORM","BUILD_TOOLS"])
   can_track_android = (can_compile_android and android_upload) or ("GOOGLE_PLAY_TRACK" in os.environ and "SERVICE_ACCOUNT_KEY" in os.environ)
   if can_compile_android and compile_only and android_upload: update_android_manifest() # AndroidManifest.xml will not have been updated, so we'd better do it now
   if can_compile_android or can_track_android:
     os.chdir(jSrc+"/..")
     dirName0 = S(getoutput("pwd|sed -e s,.*./,,"))
     dirName = shell_escape(dirName0)
   if can_compile_android: # TODO: use aapt2 and figure out how to make a 'bundle' with it so Play Store can accept new apps after August 2021 ?  (which requires giving them your signing keys, and I don't see the point in enforcing the 'bundle' format for a less than 1k saving due to not having to package multiple launcher icons on each device, and you'd probably have to compile non-Store apks separately.)  Don't know if/when updates to pre-Aug2021 apps will be required to be in Bundle format.
     cmd_or_exit("$BUILD_TOOLS/aapt package -0 '' -v -f -I $PLATFORM/android.jar -M AndroidManifest.xml -A assets -S res -m -J gen -F bin/resources.ap_") # (the -0 '' (no compression) is required if targetSdkVersion=30 or above, and shouldn't make much size difference on earlier versions as annotate.dat is itself compressed)
     cmd_or_exit("find src/"+jRest+" -type f -name '*.java' > argfile && javac -Xlint:deprecation -classpath $PLATFORM/android.jar -sourcepath 'src;gen' -d bin gen/"+jRest+"/R.java @argfile && rm argfile") # as *.java likely too long (-type f needed though, in case any *.java files are locked for editing in emacs)
     a = " -JXmx4g --force-jumbo" # -J option must go first
     if "min-sdk-version" in getoutput("$BUILD_TOOLS/dx --help"):
       a += " --min-sdk-version=1" # older versions of dx don't have that flag, but will be min-sdk=1 anyway
     cmd_or_exit("$BUILD_TOOLS/dx"+a+" --dex --output=bin/classes.dex bin/")
     cmd_or_exit("cp bin/resources.ap_ bin/"+dirName+".ap_")
     cmd_or_exit("cd bin && $BUILD_TOOLS/aapt add -0 '' "+dirName+".ap_ classes.dex")
     rm_f("bin/"+dirName0+".apk") ; cmd_or_exit("$BUILD_TOOLS/zipalign 4 bin/"+dirName+".ap_ bin/"+dirName+".apk")
     rm_f("../"+dirName0+".apk")
     if all(x in os.environ for x in ["KEYSTORE_FILE","KEYSTORE_USER","KEYSTORE_PASS"]): cmd_or_exit("$BUILD_TOOLS/apksigner sign --ks \"$KEYSTORE_FILE\" --v1-signer-name \"$KEYSTORE_USER\" --ks-pass env:KEYSTORE_PASS --key-pass env:KEYSTORE_PASS --out ../"+dirName+".apk bin/"+dirName+".apk")
     else: cmd_or_exit("$BUILD_TOOLS/apksigner sign --ks \"$HOME\"/.android/debug.keystore --v1-signer-name androiddebugkey --ks-pass pass:android --key-pass pass:android --out ../"+dirName+".apk bin/"+dirName+".apk") # if KEYSTORE_FILE not provided, try to use debug.keystore generated by Eclipse/Studio (TODO: file may not be present if you haven't created/tried any projects yet)
     rm_f("bin/"+dirName0+".ap_")
     rm_f("bin/"+dirName0+".apk")
     if not can_track_android: cmd_or_exit("du -h ../"+dirName+".apk")
   if can_track_android:
     import httplib2,googleapiclient.discovery,oauth2client.service_account # pip install google-api-python-client (or pip install --upgrade google-api-python-client if yours is too old).  Might need pip install oauth2client also.
     trackToUse = os.environ.get("GOOGLE_PLAY_TRACK","").strip()
     if not trackToUse: trackToUse='beta'
     sys.stderr.write("Logging in... ")
     service = googleapiclient.discovery.build('androidpublisher', 'v3', http=oauth2client.service_account.ServiceAccountCredentials.from_json_keyfile_name(os.environ['SERVICE_ACCOUNT_KEY'],'https://www.googleapis.com/auth/androidpublisher').authorize(httplib2.Http()))
     eId = service.edits().insert(body={},packageName=jPackage).execute()['id']
     if android_upload:
       sys.stderr.write("uploading... ")
       sys.stderr.flush()
       v = service.edits().apks().upload(editId=eId,packageName=jPackage,media_body="../"+dirName+".apk").execute()['versionCode'] ; sys.stderr.write("\rUploaded "+dirName+".apk (version code "+str(v)+")\n")
       open(jSrc+"/../.last-versionCode","w").write(str(v))
     else: v = int(open(jSrc+"/../.last-versionCode").read().strip()) # if this fails, you probably didn't run annogen v0.691+ to compile the APK before trying to change track (see instructions printed when GOOGLE_PLAY_TRACK environment variable is not set)
     if os.environ.get("GOOGLE_PLAY_CHANGELOG",""): service.edits().tracks().update(editId=eId,track=trackToUse,packageName=jPackage,body={u'releases':[{u'versionCodes':[v],u"releaseNotes":[{u"language":u"en-US",u"text":T(os.environ["GOOGLE_PLAY_CHANGELOG"])}],u'status':u'completed'}],u'track':trackToUse}).execute() # needs to be "en-US" as just "en" is dropped by the Store, although it does say you can "add as supported language in your app's Store Listing"
     else:
       service.edits().tracks().update(editId=eId,track=trackToUse,packageName=jPackage,body={u'releases':[{u'versionCodes':[v],u'status':u'completed'}],u'track':trackToUse}).execute()
       if not android_upload: sys.stderr.write("Warning: GOOGLE_PLAY_CHANGELOG not set, any release notes will be deleted\n")
     sys.stderr.write("Committing... ")
     sys.stderr.flush()
     sys.stderr.write("\rCommitted edit %s: %s.apk v%s to %s\n" % (service.edits().commit(editId=eId,packageName=jPackage).execute()['id'],dirName,v,trackToUse))
   if not can_compile_android and not can_track_android: sys.stderr.write("Android source has been written to "+jSrc[:-3]+"""
To have Annogen build it for you, set these environment variables
before the Annogen run (change the examples obviously) :
   export SDK=/home/example/Android/Sdk
   export PLATFORM=$SDK/platforms/android-19
   export BUILD_TOOLS=$SDK/build-tools/21.0.2
   # To get a release build, additionally set:
   export KEYSTORE_FILE=/path/to/keystore
   export KEYSTORE_USER='your user name'
   export KEYSTORE_PASS='your password'
   # You can upload this to Google Play before August 2021
   # (or after that for updates to older apps).  In August
   # Google Play will enforce a different 'bundle' format
   # for new apps, which I don't yet know how to make.  It
   # should be possible to update existing apps in the old
   # format for some time after though.
   # To upload the release to Google Play, additionally set:
   export SERVICE_ACCOUNT_KEY=/path/to/api-*.json
   # and optionally:
   export GOOGLE_PLAY_CHANGELOG="Updated annotator"
   export GOOGLE_PLAY_TRACK=alpha # default beta (please don't put production); however sending yourself the APK file is usually faster than using the alpha track if it's just to test on your own devices
   # If the above variables including SERVICE_ACCOUNT_KEY are set (and you haven't set ANDROID_NO_UPLOAD, below), then you'll also get an openPlayStore() function added to the Javascript interface for use in 'check for updates' links.
   # After testing, you can change the track of an existing APK by setting ANDROID_NO_UPLOAD=1 but still setting SERVICE_ACCOUNT_KEY and GOOGLE_PLAY_TRACK, and run with --compile-only.  You will need to set GOOGLE_PLAY_CHANGELOG again when doing this, as the Google API now discards changelogs on track-changes unless they are re-specified.

You may also wish to create some icons in res/drawable*
   (using Android Studio or the earlier ADT tools).

On Google Play you may wish to set Release management
   - Pre-launch report - Settings - Enable pre-launch
   reports to OFF, or it'll report issues on the websites
   you link to (and maybe crashes due to Firebase issues),
   which (if you don't want them) is wasting resources.
""") # TODO: try if("true".equals(android.provider.Settings.System.getString(getContentResolver(),"firebase.test.lab"))) browser.loadUrl("about:blank"); (but turning off unwanted reports is better)
 elif c_filename and c_compiler:
    cmd = c_compiler # should include any -o option
    if zlib: cmd += " -lz" # TODO: is this always correct on all platforms? (although user can always simply redirect the C to a file and compile separately)
    cmd_or_exit(cmd + " " + shell_escape(c_filename))
 elif compile_only: errExit("Don't know what compiler to run for this set of options")