# Name/version banner; written to standard error when the program starts.
program_name = "Annotator Generator v0.552 (c) 2012-14 Silas S. Brown"

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# If you want to compare this code to old versions, the old
# versions are being kept on SourceForge's E-GuideDog SVN repository
# http://sourceforge.net/p/e-guidedog/code/HEAD/tree/ssb22/adjuster/
# although some early ones are missing.
# To check out the repository, you can do:
# svn co http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster

from optparse import OptionParser
parser = OptionParser()
import sys,os,os.path,tempfile,time,re

# Suffix for the default compiled-annotator filename.  "darwin" contains the
# substring "win", so Mac platforms have to be ruled out before testing for it.
if "mac" in sys.platform or "darwin" in sys.platform: exe=""
elif "win" in sys.platform or "mingw32" in sys.platform: exe=".exe" # Windows, Cygwin, etc
else: exe=""

# Prefer the C pickle implementation where it exists; fall back to the
# pure-Python module, and to None if neither can be imported.  Only
# ImportError is caught so that unrelated failures are not hidden.
try: import cPickle as pickle
except ImportError:
  try: import pickle
  except ImportError: pickle = None

#  =========== INPUT OPTIONS ==============

# Each option below becomes an attribute of the parsed options object, named
# after its dest (or after the long option with dashes turned into underscores).

parser.add_option("--infile",
                  help="Filename of a text file (or a compressed .gz or .bz2 file) to read the input examples from. If this is not specified, standard input is used.")

parser.add_option("--incode",default="utf-8",
                  help="Character encoding of the input file (default %default)")

parser.add_option("--mstart",
                  dest="markupStart",
                  default="<ruby><rb>",
                  help="The string that starts a piece of text with annotation markup in the input examples; default %default")

parser.add_option("--mmid",
                  dest="markupMid",
                  default="</rb><rt>",
                  help="The string that occurs in the middle of a piece of markup in the input examples, with the word on its left and the added markup on its right (or the other way around if mreverse is set); default %default")

parser.add_option("--mend",
                  dest="markupEnd",
                  default="</rt></ruby>",
                  help="The string that ends a piece of annotation markup in the input examples; default %default")

parser.add_option("--mreverse",
                  action="store_true",default=False,
                  help="Specifies that the annotation markup is reversed, so the text BEFORE mmid is the annotation and the text AFTER it is the base text")

parser.add_option("--reference-sep",
                  help="Reference separator code used in the example input.  If you want to keep example source references for each rule, you can label the input with 'references' (chapter and section numbers or whatever), and use this option to specify what keyword or other markup the input will use between each 'reference'.  The name of the next reference will be whatever text immediately follows this string.  Note that the reference separator, and the reference name that follows it, should not be part of the text itself and should therefore not be part of any annotation markup.  If this option is not set then references will not be tracked.")

parser.add_option("--ref-name-end",default=" ",
                  help="Sets what the input uses to END a reference name.  The default is a single space, so that the first space after the reference-sep string will end the reference name.")

parser.add_option("--ref-pri",
                  help="Name of a reference to be considered \"high priority\" for Yarowsky-like seed collocations (if these are in use).  Normally the Yarowsky-like logic tries to identify a \"default\" annotation based on what is most common in the examples, with the exceptions indicated by collocations.  If however a word is found in a high priority reference then the first annotation found in that reference will be considered the ideal \"default\" even if it's in a minority in the examples; everything else will be considered as an exception.  In languages without spaces, this override should normally be used only for one-character words; if used with longer words it might have unexpected effects on rule-overlap ambiguities.")

parser.add_option("-s", "--spaces",
                  action="store_false",
                  dest="removeSpace",
                  default=True,
                  help="Set this if you are working with a language that uses whitespace in its non-markedup version (not fully tested).  The default is to assume that there will not be any whitespace in the language, which is correct for Chinese and Japanese.")

parser.add_option("-c", "--capitalisation",
                  action="store_true",
                  default=False,
                  help="Don't try to normalise capitalisation in the input.  Normally, to simplify the rules, the analyser will try to remove start-of-sentence capitals in annotations, so that the only remaining words with capital letters are the ones that are ALWAYS capitalised such as names.  (That's not perfect: it's possible that some words will always be capitalised just because they happen to never occur mid-sentence in the examples.)  If this option is used, the analyser will instead try to \"learn\" how to predict the capitalisation of ALL words (including start of sentence words) from their contexts.") # TODO: make the C program put the sentence capitals back

parser.add_option("-w", "--annot-whitespace",
                  action="store_true",
                  default=False,
                  help="Don't try to normalise the use of whitespace and hyphenation in the example annotations.  Normally the analyser will try to do this, to reduce the risk of missing possible rules due to minor typographical variations.") # TODO: can this be extended to the point where the words 'try to' can be deleted ?  see comments

# The declaration line of this option was missing, leaving its help= argument
# orphaned; the option name is the one the help texts refer to.
parser.add_option("--glossfile",
                  help="Filename of an optional text file (or compressed .gz or .bz2 file) to read auxiliary \"gloss\" information.  Each line of this should be of the form: word (tab) annotation (tab) gloss.  When the compiled annotator generates ruby markup, it will add the gloss string as a popup title whenever that word is used with that annotation.  The annotation field may be left blank to indicate that the gloss will appear for any annotation of that word.  The entries in glossfile do NOT affect the annotation process itself, so it's not necessary to completely debug glossfile's word segmentation etc.")

parser.add_option("--manualrules",
                  help="Filename of an optional text file (or compressed .gz or .bz2 file) to read extra, manually-written rules.  Each line of this should be a marked-up phrase (in the input format) which is to be unconditionally added as a rule.  Use this sparingly, because these rules are not taken into account when generating the others and they will be applied regardless of context (although a manual rule might fail to activate if the annotator is part-way through processing a different rule).") # (or if there's a longer automatic match)

#  =========== OUTPUT OPTIONS ==============

# Options controlling what is generated (C by default, or another language)
# and where it is written.

parser.add_option("--rulesFile",help="Filename of an optional auxiliary binary file to hold the accumulated rules. Adding .gz or .bz2 for compression is acceptable. If this is set then the rules will be written to it (in binary format) as well as to the output. Additionally, if the file already exists then rules will first of all be read from it before generating any new rules. This might be useful if you have made some small additions to the examples and would like these to be incorporated without a complete re-run. It might not work as well as a re-run but it should be faster. If using a rulesFile then you must keep the same input (you may make small additions etc, but it won't work properly if you delete many examples or change the format between runs) and you must keep the same ybytes-related options if any.") # You may however change whether or not a --single-words / --max-words option applies to the new examples (but hopefully shouldn't have to)

parser.add_option("--no-input",
                  action="store_true",default=False,
                  help="Don't actually read the input, just use the rules that were previously stored in rulesFile. This can be used to increase speed if the only changes made are to the output options. You should still specify the input formatting options (which should not change), and any glossfile or manualrules options (which may change).")

parser.add_option("--c-filename",default=tempfile.gettempdir()+os.sep+"annotator.c",help="Where to write the C program if standard output is not connected to a pipe. Defaults to annotator.c in the system temporary directory (the program might be large, especially if Yarowsky indicators are not used, so it's best not to use a server home directory where you might have limited quota). If standard output is connected to a pipe, then this option is ignored and C code is written to the pipe instead.")

parser.add_option("--c-compiler",default="cc -o annotator"+exe,help="The C compiler to run if standard output is not connected to a pipe. The default is to use the \"cc\" command which usually redirects to your \"normal\" compiler. You can add options (remembering to enclose this whole parameter in quotes if it contains spaces), but if the C program is large then adding optimisation options may make the compile take a LONG time. If standard output is connected to a pipe, then this option is ignored because the C code will simply be written to the pipe. Default: %default")
# If compiling an experimental annotator quickly, you might try tcc as it compiles fast. If tcc is not available on your system then clang might compile faster than gcc.
# (BUT tcc can have problems on Raspberry Pi see http://www.raspberrypi.org/phpBB3/viewtopic.php?t=30036&p=263213; can be best to cross-compile, e.g. from a Mac use https://github.com/UnhandledException/ARMx/wiki/Sourcery-G---Lite-for-ARM-GNU-Linux-(2009q3-67)-for-Mac-OS-X and arm-none-linux-gnueabi-gcc)
# In large rulesets with --max-or-length=0 and --nested-switch, gcc takes time and gcc -Os can take a LOT longer, and CINT, Ch and picoc run out of memory.  Without these options the overhead of gcc's -Os isn't so bad (and does save some room).
# clang with --max-or-length=100 and --nested-switch=0 is not slowed much by -Os (slowed considerably by -O3). -Os and -Oz gave same size in my tests.
# on 64-bit distros -m32 won't always work and won't necessarily give a smaller program

parser.add_option("--max-or-length",default=100,help="The maximum number of items allowed in an OR-expression in C and Java code (used when ybytes is in effect). When an OR-expression becomes larger than this limit, it will be made into a function. 0 means unlimited, which works for tcc and gcc; many other compilers have limits. Default: %default")

parser.add_option("--nested-switch",default=0,
                  help="Allow C and Java switch() constructs to be nested to about this depth.  Default 0 tries to avoid nesting, as it slows down most C compilers for little gain in executable size.  Setting 1 nests 1 level deeper which can occasionally help get around memory problems with Java compilers.  -1 means nest to unlimited depth, which is not recommended.") # tcc is still fast (although that doesn't generate the smallest executables anyway)

parser.add_option("--outcode",default="utf-8",
                  help="Character encoding to use in the generated parser and rules summary (default %default, must be ASCII-compatible i.e. not utf-16)")

parser.add_option("-S", "--summary-only",
                  action="store_true",default=False,
                  help="Don't generate a parser, just write the rules summary to standard output")

parser.add_option("--no-summary",
                  action="store_true",default=False,
                  help="Don't add a large rules-summary comment at the end of the parser code")

parser.add_option("-O", "--summary-omit",
                  help="Filename of a text file (or a compressed .gz or .bz2 file) specifying what should be omitted from the rules summary.  Each line should be a word or phrase, a tab, and its annotation (without the mstart/mmid/mend markup).  If any rule in the summary exactly matches any of the lines in this text file, then that rule will be omitted from the summary (but still included in the parser).  Use for example to take out of the summary any entries that correspond to things you already have in your dictionary, so you can see what's new.")

parser.add_option("--maxrefs",default=3,
                  help="The maximum number of example references to record in each summary line, if references are being recorded (0 means unlimited).  Default is %default.")

parser.add_option("--norefs",
                  action="store_true",default=False,
                  help="Don't write references in the rules summary.  Use this if you need to specify reference-sep and ref-name-end for the ref-pri option but you don't actually want references in the summary (omitting references makes summary generation faster).  This option is automatically turned on if --no-input is specified.")

parser.add_option("--newlines-reset",
                  action="store_false",
                  dest="ignoreNewlines",
                  default=True,
                  help="Have the annotator reset its state on every newline byte. By default newlines do not affect state such as whether a space is required before the next word, so that if the annotator is used with Web Adjuster's htmlText option (which defaults to using newline separators) the spacing should be handled sensibly when there is HTML markup in mid-sentence.")

parser.add_option("--obfuscate",
                  action="store_true",default=False,
                  help="Obfuscate annotation strings in C code, as a deterrent to casual snooping of the compiled binary with tools like 'strings' (does NOT stop determined reverse engineering)")

parser.add_option("--javascript",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate JavaScript.  This might be useful if you want to run an annotator on a device that has a JS interpreter but doesn't let you run native code.  The JS will be table-driven to make it load faster (and --no-summary will also be set).  See comments at the start for usage.") # but it's better to use the C version if you're in an environment where 'standard input' makes sense

parser.add_option("--python",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate a Python module.  Similar to the Javascript option, this is for when you can't run native code.")

# The declaration lines of the next two options were missing (only their help=
# arguments remained); the option-processing code reads them as the globals
# "java" and "android", which is what these option names produce.
parser.add_option("--java",
                  help="Instead of generating C code, generate Java, and place the *.java files in the directory specified by this option, removing any existing *.java files.  See --android for example use.  The last part of the directory should be made up of the package name; a double slash (//) should separate the rest of the path from the package name, e.g. --java=/path/to/wherever//org/example/package and the main class will be called Annotator.")

parser.add_option("--android",
                  help="URL for an Android app to browse.  If this is set, code is generated for an Android app which starts a browser with that URL as the start page, and annotates the text on every page it loads.  You will need the Android SDK to compile the app (see comments in MainActivity.java for details).")

parser.add_option("--reannotator",
                  help="Shell command through which to pipe each word of the original text to obtain new annotation for that word.  This might be useful as a quick way of generating a new annotator (e.g. for a different topolect) while keeping the information about word separation and/or glosses from the previous annotator, but it is limited to commands that don't need to look beyond the boundaries of each word.  (If the command is prefixed by a # character, it will be given the word's existing annotation instead of its original text.)  The command should treat each line of its input independently, and both its input and its output should be in the encoding specified by --outcode.") # TODO: reannotatorCode instead? (see other 'reannotatorCode' TODOs)
# (Could just get the reannotator to post-process the 1st annotator's output, but that might be slower than generating an altered annotator with it)

#  =========== ANALYSIS OPTIONS ==============

# Options controlling how rules are derived from the examples, plus
# checkpointing and diagnostics.

parser.add_option("-o", "--allow-overlaps",
                  action="store_true",default=False,
                  help="Normally, the analyser avoids generating rules that could overlap with each other in a way that would leave the program not knowing which one to apply.  If a short rule would cause overlaps, the analyser will prefer to generate a longer rule that uses more context, and if even the entire phrase cannot be made into a rule without causing overlaps then the analyser will give up on trying to cover that phrase.  This option allows the analyser to generate rules that could overlap, as long as none of the overlaps would cause actual problems in the example phrases. Thus more of the examples can be covered, at the expense of a higher risk of ambiguity problems when applying the rules to other texts.")

parser.add_option("-P", "--primitive",
                  action="store_true",default=False,
                  help="Don't bother with any overlap or conflict checks at all, just make a rule for each word. The resulting parser is not likely to be useful, but the summary might be.")

parser.add_option("-y","--ybytes",default=0,
                  help="Look for candidate Yarowsky seed-collocations within this number of bytes of the end of a word.  If this is set then overlaps and rule conflicts will be allowed if the seed collocations can be used to distinguish between them.  Markup examples that are completely separate (e.g. sentences from different sources) must have at least this number of bytes between them.")

parser.add_option("--ybytes-max",default=0,
                  help="Extend the Yarowsky seed-collocation search to check over larger ranges up to this maximum.  If this is set then several ranges will be checked in an attempt to determine the best one for each word, but see also ymax-threshold.")

parser.add_option("--ymax-threshold",default=1,
                  help="Limits the length of word that receives the narrower-range Yarowsky search when ybytes-max is in use. For words longer than this, the search will go directly to ybytes-max. This is for languages where the likelihood of a word's annotation being influenced by its immediate neighbours more than its distant collocations increases for shorter words, and less is to be gained by comparing different ranges when processing longer words. Setting this to 0 means no limit, i.e. the full range will be explored on ALL Yarowsky checks.")

parser.add_option("--ybytes-step",default=3,
                  help="The increment value for the loop between ybytes and ybytes-max")

parser.add_option("--warn-yarowsky",
                  action="store_true",default=False,
                  help="Warn when absolutely no distinguishing Yarowsky seed collocations can be found for a word in the examples")

parser.add_option("--yarowsky-all",
                  action="store_true",default=False,
                  help="Accept Yarowsky seed collocations even from input characters that never occur in annotated words (this might include punctuation and example-separation markup)")

parser.add_option("--yarowsky-debug",default=1,
                  help="Report the details of seed-collocation false positives if there are a large number of matches and at most this number of false positives (default %default). Occasionally these might be due to typos in the corpus, so it might be worth a check.")

# The declaration line of this option was missing (only its help= argument
# remained).  The option-processing code tests "single_words" as a flag, so it
# is declared as a boolean switch.
parser.add_option("--single-words",
                  action="store_true",default=False,
                  help="Do not consider any rule longer than 1 word, although it can still have Yarowsky seed collocations if -y is set. This speeds up the search, but at the expense of thoroughness. You might want to use this in conjuction with -y to make a parser quickly. It is like -P (primitive) but without removing the conflict checks.")

parser.add_option("--max-words",default=0,
                  help="Limits the number of words in a rule; rules longer than this are not considered.  0 means no limit.  --single-words is equivalent to --max-words=1.  If you need to limit the search time, and are using -y, it should suffice to use --single-words for a quick annotator or --max-words=5 for a more thorough one.")

# TODO: optionally (especially if NOT using Yarowsky) do an additional pass (after discovering all other rules) and turn whole phrases that are not completely covered by other rules into whole-phrase rules, if it doesn't conflict 1 phrase w. anothr of equal priority; shld be ok if no overlap, overlaps wld *sometimes* be ok suggest a len threshold

parser.add_option("--checkpoint",help="Periodically save checkpoint files in the specified directory.  These files can save time when starting again after a reboot (and it's easier than setting up Condor etc).  As well as a protection against random reboots, this can be used for scheduled reboots: if file called ExitASAP appears in the checkpoint directory, annogen will checkpoint, remove the ExitASAP file, and exit.  After a run has completed, the checkpoint directory should be removed, unless you want to re-do the last part of the run for some reason.")
# (Condor can checkpoint an application on Win/Mac/Linux but is awkward to set up.  Various Linux and BSD application checkpoint approaches also exist; another option is virtualisation.)

parser.add_option("-d","--diagnose",help="Output some diagnostics for the specified word. Use this option to help answer \"why doesn't it have a rule for...?\" issues. This option expects the word without markup and uses the system locale (UTF-8 if it cannot be detected).")

parser.add_option("--diagnose-limit",default=10,help="Maximum number of phrases to print diagnostics for (0 means unlimited); can be useful when trying to diagnose a common word in rulesFile without re-evaluating all phrases that contain it. Default: %default")

parser.add_option("--time-estimate",
                  action="store_true",default=False,
                  help="Estimate time to completion.  The code to do this is unreliable and is prone to underestimate.  If you turn it on, its estimate is displayed at the end of the status line as days, hours or minutes.") # Unreliable because the estimate assumes 'phrases per minute' will remain constant on average, whereas actually it will decrease because the more complex phrases are processed last

sys.stderr.write(program_name+"\n") # not sys.stdout, because may or may not be showing --help (and anyway might want to process the help text for website etc)
options, args = parser.parse_args()
# Make every parsed option available as a module-level global named after its dest
globals().update(options.__dict__)

sys.setcheckinterval(32767) # won't be using threads or signals, so don't have to check for them very often
import gc ; gc.disable() # should be OK if we don't create cycles (TODO: run gc.collect() manually after init, just in case?)

# These options are declared without a type, so a value given on the command
# line arrives as a string while the default stays an int.  Convert BEFORE any
# truth test: the string "0" is true, which previously gave a spurious warning
# for "-y 0" and stopped "--ybytes-max=0" from falling back to ybytes.
ybytes = int(ybytes or 0)
ybytes_max = int(ybytes_max or 0)
if not ybytes_max: ybytes_max = ybytes
if primitive and ybytes: sys.stderr.write("Warning: primitive will override ybytes\n")
if yarowsky_debug: yarowsky_debug=int(yarowsky_debug)
else: yarowsky_debug = 0
def errExit(msg):
  """Report a fatal error: write msg and a newline to standard error, then exit with status 1."""
  sys.stderr.write(msg+"\n")
  raise SystemExit(1)
# Cross-check option combinations and derive the settings that depend on them.
if ref_pri and not (reference_sep and ref_name_end): errExit("ref-pri option requires reference-sep and ref-name-end to be set")
if android and not java: errExit('You must set --java=/path/to/src//name/of/package when using --android')
jPackage = None
if nested_switch: nested_switch=int(nested_switch) # TODO: if java, override it?  or just rely on the help text for --nested-switch (TODO cross-reference it from --java?)
if java:
  if not '//' in java: errExit("--java must include a // to separate the first part of the path from the package name")
  # everything after the last // is the package path; turn it into dotted form
  jPackage=java.rsplit('//',1)[1].replace('/','.')
  if 'NewFunc' in jPackage: errExit("Currently unable to include the string 'NewFunc' in your package due to an implementation detail in annogen's search/replace operations")
if java or javascript or python:
    if sum(1 for x in [java,javascript,python] if x) > 1:
      errExit("Outputting more than one programming language on the same run is not yet implemented")
    if not outcode=="utf-8": errExit("outcode must be utf-8 when using Java, Javascript or Python")
    if obfuscate: errExit("obfuscate not yet implemented for the Java, Javascript or Python versions") # (and it would probably slow down the JS far too much if it were)
    if java:
      # clear out any previously generated *.java files before writing new ones
      for f in os.listdir(java):
        if f.endswith(".java"): os.remove(java+os.sep+f)
      c_filename = java+os.sep+"Annotator.java"
    elif c_filename.endswith(".c"):
      # keep the chosen output path but give it the right extension
      if javascript: c_filename = c_filename[:-2]+".js"
      else: c_filename = c_filename[:-2]+".py"
# Work out the terminal's character set (used to decode --diagnose).  This is
# deliberately best-effort: getdefaultlocale() can raise, and it can also
# return None for the encoding, so both cases fall back to UTF-8 as the
# --diagnose help text promises.
try:
  import locale
  terminal_charset = locale.getdefaultlocale()[1]
except Exception: terminal_charset = None
if not terminal_charset: terminal_charset = "utf-8"
if diagnose: diagnose=diagnose.decode(terminal_charset)
max_words = int(max_words)
if single_words: max_words = 1

def nearCall(conds,subFuncs,subFuncL):
  """Return the expression to put in the if() for a ybytes near() list.

  conds is a sequence of indicator strings (each is passed through
  c_or_java_escape).  If there are no more of them than the max_or_length
  setting allows (0 means unlimited), the result is a single "||" chain of
  near("...") calls (a.n("...") for Java).  Otherwise the list is cut into
  pieces of at most max_or_length items, a helper function testing each
  piece in turn is generated, and the result is a call to that helper as
  returned by subFuncCall.

  subFuncs and subFuncL are handed on to subFuncCall unchanged.
  """
  # The option is declared without a type, so a value given on the command
  # line is a string; int() is a no-op if it has already been converted.
  limit = int(max_or_length)
  if not limit or len(conds) <= limit:
    if java: f="a.n"
    else: f="near"
    return " || ".join(f+"(\""+c_or_java_escape(c,0)+"\")" for c in conds)
  if java: fStart,fEnd = "package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {","} }" # put functions in separate classes to try to save the constants table of the main class
  else: fStart,fEnd = "int NewFunc() {","}"
  # consecutive slices of at most 'limit' conditions (the last may be shorter);
  # each is short enough for the recursive call to take the simple branch above
  chunks = [conds[i:i+limit] for i in range(0,len(conds),limit)]
  tests = "\n".join("if("+nearCall(chunk,subFuncs,subFuncL)+") return "+c_or_java_true+";" for chunk in chunks)
  return subFuncCall(fStart+tests+"\nreturn "+c_or_java_false+";"+fEnd,subFuncs,subFuncL)

def subFuncCall(newFunc,subFuncs,subFuncL):
  """Register a generated helper function and return the code that calls it.

  newFunc is the helper's complete source text with the placeholder name
  "NewFunc" where its real name belongs.
  subFuncs maps source text already registered to the name it was given; it
  is used both to reuse identical helpers and (via its size) to number new
  ones, and is updated in place.
  subFuncL is the list of helper definitions to be output; a new definition
  (with the placeholder's first occurrence replaced by the real name) is
  appended to it.

  Returns the call expression, without a semicolon.
  """
  if newFunc in subFuncs:
    # we generated an identical one before
    subFuncName=subFuncs[newFunc]
  else:
    if java: subFuncName="z%X" % len(subFuncs) # (try to save as many bytes as possible because it won't be compiled out and we also have to watch the compiler's footprint; start with z so MainActivity.java etc appear before rather than among this lot in IDE listings)
    else: subFuncName="match%d" % len(subFuncs)
    # Record the new helper: without this the lookup above never succeeds and
    # len(subFuncs) never grows, so every helper would be given the same name.
    subFuncs[newFunc]=subFuncName
    if java: static=""
    else: static="static "
    subFuncL.append(static+newFunc.replace("NewFunc",subFuncName,1))
  if java: return jPackage+"."+subFuncName+".f(a)"
  return subFuncName+"()" # the call (without a semicolon)

def stringSwitch(byteSeq_to_action_dict,subFuncL,funcName="topLevelMatch",subFuncs={},java_localvar_counter=None,nestingsLeft=None): # ("topLevelMatch" is also mentioned in the C code)
    # make a function to switch on a large number of variable-length string cases without repeated lookahead for each case
    # (may still backtrack if no words or no suffices match)
    # byteSeq_to_action_dict is really a byte sequence to [(action, OR-list of Yarowsky-like indicators which are still in Unicode)], the latter will be c_escape()d
    # can also be byte seq to [(action,(OR-list,nbytes))] but only if OR-list is not empty, so value[1] will always be false if OR-list is empty
    # subFuncL and subFuncs are handed on to subFuncCall() and nearCall(), and to the recursive calls made below.
    # funcName, when true, is the name of the function to wrap the code in; the recursive calls for nested levels pass None, or "NewFunc" for code that is then given to subFuncCall().
    # java_localvar_counter is a 1-item list, so the count it holds is shared with the recursive calls it is passed to.
    # nestingsLeft defaults to nested_switch; 0 means an inner switch may not be nested directly at this level.
    # NOTE(review): subFuncs={} is a mutable default, so one dict is shared by every call that omits it - presumably intended, TODO confirm.
    # NOTE(review): this copy of the function looks damaged: ret, savePos, use_if, called_subswitch, stat (when funcName is not "topLevelMatch") and the non-Java NEXTBYTE are read but never assigned in the lines shown, some indented lines have lost the statement that introduces them, no result is returned although the recursive calls index their result, and the "Silas S. Brown's avatar" / "Silas S. Brown committed" lines are not Python.  Restore from a pristine copy rather than patching by hand.
Silas S. Brown's avatar
Silas S. Brown committed
    if nestingsLeft==None: nestingsLeft=nested_switch
    canNestNow = not nestingsLeft==0 # (-1 = unlimited)
    if java: NEXTBYTE = 'a.nB()'
    # the set of first bytes over all non-empty keys = the cases of this level's switch
    allBytes = set(b[0] for b in byteSeq_to_action_dict.iterkeys() if b)
    if not java_localvar_counter: # unlike C, Java doesn't allow shadowing of local variable names, so we'll need to uniquify them
      java_localvar_counter=[0]
    olvc = "%X" % java_localvar_counter[0] # old localvar counter
Silas S. Brown's avatar
Silas S. Brown committed
        if funcName=="topLevelMatch": stat="static " # because we won't call subFuncCall on our result
Silas S. Brown's avatar
Silas S. Brown committed
        if java: ret.append("package "+jPackage+";\npublic class "+funcName+" { public static void f("+jPackage+".Annotator a) {")
        else: ret.append(stat+"void %s() {" % funcName)
Silas S. Brown's avatar
Silas S. Brown committed
        if java: ret.append("{ int oldPos=a.inPtr;")
    elif "" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1:
        # no funcName, but might still want to come back here as there's a possible action at this level
        savePos = len(ret)
Silas S. Brown's avatar
Silas S. Brown committed
          ret.append("{ int oP"+olvc+"=a.inPtr;")
          java_localvar_counter[0] += 1
        else: ret.append("{ SAVEPOS;")
Silas S. Brown's avatar
Silas S. Brown committed
    # restorePos: emits the code that puts the read position back to where the save (if any) recorded it
    def restorePos():
      if not savePos==None:
        if len(' '.join(ret).split(NEXTBYTE))==2 and not called_subswitch:
            # only 1 NEXTBYTE after the SAVEPOS - just
            # do a PREVBYTE instead
            # (note however that splitting on NEXTBYTE
            # does not necessarily give a reliable value
            # for max amount of lookahead required if
            # there's more than 1.  We use max rule len
            # as an upper bound for that instead.)
            del ret[savePos]
            if java: ret.append("a.inPtr--;")
            else: ret.append("PREVBYTE;")
        elif java:
          if funcName: ret.append("a.inPtr=oldPos; }")
          else: ret.append("a.inPtr=oP"+olvc+"; }")
        else: ret.append("RESTOREPOS; }")
    if "" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[""])==1 and not byteSeq_to_action_dict[""][0][1] and all((len(a)==1 and a[0][0].startswith(byteSeq_to_action_dict[""][0][0]) and not a[0][1]) for a in byteSeq_to_action_dict.itervalues()):
        # there's an action in common for this and all subsequent matches, and no Yarowsky-like indicators, so we can do the common action up-front
        ret.append(byteSeq_to_action_dict[""][0][0])
        l = len(byteSeq_to_action_dict[""][0][0])
        # strip the common prefix (now already emitted) from every action
        byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in byteSeq_to_action_dict.iteritems())
        # and, since we'll be returning no matter what,
        # we can put the inner switch in a new function
        # (even if not re-used, this helps compiler speed)
        # + DON'T save/restore pos around it (it itself
        # will do any necessary save/restore pos)
        del byteSeq_to_action_dict[""]
Silas S. Brown's avatar
Silas S. Brown committed
        if java and (canNestNow or len(byteSeq_to_action_dict)==1): # hang on - better nest (might be using --nested-switch to get around a Java compiler-memory problem; the len condition allows us to always nest a single 'if' rather than creating a new function+class for it)
          ret += ["  "+x for x in stringSwitch(byteSeq_to_action_dict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)]
          restorePos()
          ret.append("return;")
        else: # ok, new function
          newFunc = "\n".join(stringSwitch(byteSeq_to_action_dict,subFuncL,"NewFunc",subFuncs))
          ret.append(subFuncCall(newFunc,subFuncs,subFuncL)+"; return;")
          del ret[savePos] # will be set to None below
        byteSeq_to_action_dict[""] = [("",[])] # for the end of this func
Silas S. Brown's avatar
Silas S. Brown committed
        savePos = None # as setting funcName on stringSwitch implies it'll give us a savePos, and if we didn't set funcName then we called restorePos already above
    elif allBytes:
      # deal with all actions except "" first
Silas S. Brown's avatar
Silas S. Brown committed
      if not use_if:
        if nestingsLeft > 0: nestingsLeft -= 1
        ret.append("switch("+NEXTBYTE+") {")
      for case in sorted(allBytes):
        # printable ASCII (other than the quote itself) is written as a character literal, anything else as its number
        if 32<=ord(case)<127 and case!="'": cstr="'%c'" % case
        else:
          cstr=str(ord(case))
          if java: cstr = "(byte)"+cstr
        if use_if: ret.append("if("+NEXTBYTE+"=="+cstr+") {")
Silas S. Brown's avatar
Silas S. Brown committed
        # the entries that start with this byte, keyed by what follows it
        subDict = dict([(k[1:],v) for k,v in byteSeq_to_action_dict.iteritems() if k and k[0]==case])
        inner = stringSwitch(subDict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)
        if canNestNow or not inner[0].startswith("switch"): ret += ["  "+x for x in inner]
        else:
          # Put the inner switch into a different function
          # which returns 1 if we should return.
          # (TODO: this won't catch cases where there's a SAVEPOS before the inner switch; will still nest in that case.  But it shouldn't lead to big nesting in practice.)
Silas S. Brown's avatar
Silas S. Brown committed
          if nested_switch: inner = stringSwitch(subDict,subFuncL,None,subFuncs,None,None) # re-do it with full nesting counter
          if java: myFunc,funcEnd = ["package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {"], "}}"
          else: myFunc,funcEnd=["int NewFunc() {"],"}"
            if x.endswith("return;"): x=x[:-len("return;")]+"return "+c_or_java_true+";"
Silas S. Brown's avatar
Silas S. Brown committed
          ret.append("  if("+subFuncCall("\n".join(myFunc)+"  return "+c_or_java_false+";\n"+funcEnd,subFuncs,subFuncL)+") return;")
          called_subswitch=True # as it'll include more NEXTBYTE calls which are invisible to the code below
Silas S. Brown's avatar
Silas S. Brown committed
        if not (use_if or inner[-1].endswith("return;")): ret.append("  break;")
Silas S. Brown's avatar
Silas S. Brown committed
    restorePos()
    if funcName:
      # close the function (and, for Java, the class it was wrapped in)
      if java: ret.append("} }")
      else: ret.append("}")
    elif "" in byteSeq_to_action_dict:
        # if the C code gets to this point, no return; happened - no suffices
        # so execute one of the "" actions and return
        # (which one, if any, depends on the Yarowsky-like indicators; there should be at most one "default" action without indicators)
        default_action = ""
        for action,conds in byteSeq_to_action_dict[""]:
            if conds:
                assert action, "conds without action in "+repr(byteSeq_to_action_dict[""])
                if type(conds)==tuple:
                    # (OR-list,nbytes) form: emit a call that sets the near-distance first
                    conds,nbytes = conds
Silas S. Brown's avatar
Silas S. Brown committed
                    if java: ret.append("a.sn(%d);" % nbytes)
                    else: ret.append("setnear(%d);" % nbytes)
                ret.append("if ("+nearCall(conds,subFuncs,subFuncL)+") {")
                ret.append((action+" return;").strip())
                ret.append("}")
            else:
Silas S. Brown's avatar
Silas S. Brown committed
                if default_action: sys.stderr.write("WARNING! More than one default action in "+repr(byteSeq_to_action_dict[""])+" - earlier one discarded!\n(This might indicate invalid markup in the corpus)\n") # see TODO in yarowsky_indicators. (TODO: we sometimes also get this if an incremental run has updated the annotation; why doesn't remove_old_rules prevent this? see also the TODO there re yBytesRet)
                default_action = action
        if default_action or not byteSeq_to_action_dict[""]: ret.append((default_action+" return;").strip()) # (return only if there was a default action, OR if an empty "" was in the dict with NO conditional actions (e.g. from the common-case optimisation above).  Otherwise, if there were conditional actions but no default, we didn't "match" anything if none of the conditions were satisfied.)
if obfuscate:
  import random
  # 32 random high-bit byte values make up the repeating XOR pad
  pad = [random.randint(128,255) for i in xrange(32)]
  # C-side decoder, with the pad values written into its table
  unobfusc_func=r"""

void OutWriteDecode(const char *s) {
static const char pad[]={%s}; int i=0;
while(*s) {
int t=pad[i++]; if(i==sizeof(pad)) i=0;
if(*s==t) OutWriteByte(t); else OutWriteByte((*s)^t); s++;
}
}""" % repr(pad)[1:-1]
  def encodeOutstr(s):
    # Python-side counterpart of OutWriteDecode: returns s XORed against
    # the pad, written out as the inside of a C string literal
    encoded = []
    for pos,ch in enumerate(s):
      padByte = pad[pos % len(pad)]
      code = ord(ch)
      if code != padByte: code ^= padByte # a byte equal to its pad byte is left as it is (XOR would give 0, which is where the decoder's while(*s) stops)
      if code==ord("\\"): piece = r'\\'
      elif code==ord('"'): piece = r'\"'
      elif code&0x80 or code<32: piece = r'\x%x" "' % code # hex escape, then close and re-open the literal
      else: piece = chr(code)
      encoded.append(piece)
    return ''.join(encoded)
else: unobfusc_func = ""

# Template for the start of the generated C program: the lookahead
# buffer and the NEXTBYTE-style macros (by default reading stdin and
# writing stdout), the annotation_mode enum, and the s() / o() / o2()
# output helpers.  %%YBYTES%% and %%LONGEST_RULE_LEN%% are placeholders
# (presumably substituted later in this file - not visible from here).
# NOTE(review): in this copy the literal only closes after the text of
# main()'s option loop, so the "if not obfuscate: c_start = ..." line
# below is *inside* the string instead of being executed, and several C
# function headers and closing braces appear to be missing (e.g. nothing
# opens the body that begins "if (readPtr-bufStart +ybytes >= bufLen)").
# Compare against a pristine copy before relying on this text.
c_start = "/* -*- coding: "+outcode+r""" -*- */
#include <stdio.h>
#include <string.h>

/* To include this code in another program,
   define the ifndef'd macros below + define Omit_main */
enum { ybytes = %%YBYTES%% }; /* for Yarowsky matching, minimum readahead */
static int nearbytes = ybytes;
#define setnear(n) (nearbytes = (n))
/* Default definition of NEXTBYTE etc is to read input
   from stdin and write output to stdout.  */
enum { Half_Bufsize = %%LONGEST_RULE_LEN%% };
static unsigned char lookahead[Half_Bufsize*2];
static size_t readPtr=0,writePtr=0,bufStart=0,bufLen=0;
  if (readPtr-bufStart +ybytes >= bufLen) {
    if (bufLen == Half_Bufsize * 2) {
      memmove(lookahead,lookahead+Half_Bufsize,Half_Bufsize);
      bufStart += Half_Bufsize; bufLen -= Half_Bufsize;
    }
    bufLen += fread(lookahead+bufLen,1,Half_Bufsize*2-bufLen,stdin);
    if (readPtr-bufStart == bufLen) return EOF;
  return lookahead[(readPtr++)-bufStart];
  size_t offset = readPtr-bufStart, l=strlen(string),
  if (maxPos >= l) maxPos -= l; else return 0; // can't possibly start after maxPos-l
  if (offset+nearbytes>l) {
    if (maxPos > offset+nearbytes-l)
      maxPos = offset+nearbytes-l;
  } else maxPos = 0; // (don't let it go below 0, as size_t is usually unsigned)
  if (offset>nearbytes) offset-=nearbytes; else offset = 0;
  // can use strnstr(haystack,needle,n) if on a BSD system
    if(!strncmp((char*)lookahead+offset,string,l)) return 1;
#define NEXT_COPY_BYTE lookahead[(writePtr++)-bufStart]
#define COPY_BYTE_SKIP writePtr++
#define THEPOS readPtr /* or get it via a function */
#define SAVEPOS POSTYPE oldPos=THEPOS
#define RESTOREPOS readPtr=oldPos /* or set via a func */
#define PREVBYTE readPtr--
#define FINISHED (feof(stdin) && readPtr-bufStart == bufLen)
#define OutWriteStr(s) fputs(s,stdout)
#define OutWriteByte(c) putchar(c)
#endif

#ifndef Default_Annotation_Mode
#define Default_Annotation_Mode ruby_markup
#endif

enum {
  annotations_only,
  ruby_markup,
  brace_notation} annotation_mode = Default_Annotation_Mode;

static int needSpace=0;
static void s() {
  if (needSpace) OutWriteByte(' ');
  else needSpace=1; /* for after the word we're about to write (if no intervening bytes cause needSpace=0) */
static void o(int numBytes,const char *annot) {
  case annotations_only: OutWriteDecode(annot); break;
  case ruby_markup:
    OutWriteStr("<ruby><rb>");
    for(;numBytes;numBytes--)
      OutWriteByte(NEXT_COPY_BYTE);
    OutWriteStr("</rb><rt>"); OutWriteDecode(annot);
    OutWriteStr("</rt></ruby>"); break;
  case brace_notation:
    OutWriteByte('{');
    for(;numBytes;numBytes--)
      OutWriteByte(NEXT_COPY_BYTE);
    OutWriteByte('|'); OutWriteDecode(annot);
    OutWriteByte('}'); break;
static void o2(int numBytes,const char *annot,const char *title) {
    OutWriteStr("<ruby title=\""); OutWriteDecode(title);
    OutWriteStr("\"><rb>");
    for(;numBytes;numBytes--)
      OutWriteByte(NEXT_COPY_BYTE);
    OutWriteStr("</rb><rt>"); OutWriteDecode(annot);
    OutWriteStr("</rt></ruby>");
  } else o(numBytes,annot);
if not obfuscate: c_start = c_start.replace("OutWriteDecode","OutWriteStr")

void matchAll() {
  while(!FINISHED) {
    POSTYPE oldPos=THEPOS;
    if (oldPos==THEPOS) { needSpace=0; OutWriteByte(NEXTBYTE); COPY_BYTE_SKIP; }
  int i; for(i=1; i<argc; i++) {
    if(!strcmp(argv[i],"--help")) {
      puts("Use --ruby to output ruby markup (default)");
      puts("Use --raw to output just the annotations without the base text");
      puts("Use --braces to output as {base-text|annotation}");
      return 0;
    } else if(!strcmp(argv[i],"--ruby")) {
      annotation_mode = ruby_markup;
    } else if(!strcmp(argv[i],"--raw")) {
      annotation_mode = annotations_only;
    } else if(!strcmp(argv[i],"--braces")) {
      annotation_mode = brace_notation;
    }
  }
  matchAll();
}
#endif
"""

# ANDROID: setDefaultTextEncodingName("utf-8") is included as it might be needed if you include file:///android_asset/ URLs in your app (files put into assets/) as well as remote URLs.  (If including ONLY file URLs then you don't need to set the INTERNET permission in Manifest, but then you might as well pre-annotate the files and use a straightforward static HTML app like http://people.ds.cam.ac.uk/ssb22/gradint/html2apk.html )
# Also we get shouldOverrideUrlLoading to return true for URLs that end with .apk .pdf .epub .mp3 etc so the phone's normal browser can handle those (search code below for ".apk" for the list)
# Template for the generated Android MainActivity.java: build
# instructions in a leading Java comment, then the activity that shows a
# WebView and annotates each page through the "ssb_local_annotator"
# Javascript interface.  %%JPACKAGE%%, %%JPACK2%% and %%ANDROID-URL%% are
# placeholders (presumably substituted elsewhere in this file).
# Repairs made to this copy: page-scrape lines ("... avatar" /
# "... committed") that had ended up inside the emitted text are gone,
# the instructions comment is closed before the Java code starts,
# class A is closed, and the Java string passed to loadUrl() is back on
# one line (a Java string literal may not contain a line break).
# NOTE(review): other original lines may still be missing from this
# template - compare against a pristine copy.
android_src = r"""
/* COMPILING
   ---------

   1.  Install the Android Developer Tools (ADT)
       - You might need to increase the amount of RAM it's
         allowed to use, e.g. put -Xmx2g into eclipse.ini
         (be sure to remove any existing -Xmx settings
          otherwise they might override your new setting)
   2.  Go to File / New / Android application project
   3.  Application name = anything you want (for the phone's app menu)
       Project name = anything you want (unique on your development machine)
       Package name = %%JPACKAGE%%
       Minimum Required SDK = API 1: Android 1.0
       Leave everything else as default
       but make a note of the project directory
       (usually on the second setup screen as "location")
    4. Put *.java into src/%%JPACK2%%
    5. Edit project.properties and add the line
        dex.force.jumbo=true
    6. Edit AndroidManifest.xml and make it look like:
       (you might need to change targetSdkVersion="18" if
       your SDK has a different targetSdkVersion setting)
---------------------- cut here ----------------------
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android" package="%%JPACKAGE%%" android:versionCode="1" android:versionName="1.0" >
<uses-permission android:name="android.permission.INTERNET" />
<uses-sdk android:minSdkVersion="1" android:targetSdkVersion="18" />
<application android:icon="@drawable/ic_launcher" android:label="@string/app_name" android:theme="@style/AppTheme" >
<activity android:configChanges="orientation|screenSize|keyboardHidden" android:name="%%JPACKAGE%%.MainActivity" android:label="@string/app_name" >
<intent-filter><action android:name="android.intent.action.MAIN" /><category android:name="android.intent.category.LAUNCHER" /></intent-filter>
</activity></application></manifest>
---------------------- cut here ----------------------
    7. Copy new AndroidManifest.xml to the bin/ directory
       (so there will be 2 copies, one in the top level
        and the other in bin/ )
    8. Edit res/layout/activity_main.xml and make it like:
---------------------- cut here ----------------------
<?xml version="1.0" encoding="utf-8"?>
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android" android:layout_height="fill_parent" android:layout_width="fill_parent" android:orientation="vertical">
  <TextView android:layout_height="wrap_content" android:layout_width="fill_parent" />
  <WebView android:id="@+id/browser" android:layout_height="fill_parent" android:layout_width="fill_parent" />
</LinearLayout>
---------------------- cut here ----------------------
    9. Restart ADT, do Run / Run As / Android application
   10. Watch ADT's Console window until it says the app
       has started, then interact with the Android virtual
       device to test.  (If install fails, try again.)
   11. .apk file should now be in the bin subdirectory.
       On a real phone go to "Application settings" or
       "Security" and enable "Unknown sources".  Or if
       you're ready to ship your .apk, select it in
       Eclipse's Package Explorer (left-hand pane) and
       do File / Export / Export Android Application (it
       lets you create a keystore and private signing key)
*/
package %%JPACKAGE%%;
import android.webkit.WebView;
import android.webkit.WebChromeClient;
import android.webkit.WebViewClient;
import android.content.Intent;
import android.app.Activity;
import android.os.Bundle;
import android.view.KeyEvent;
public class MainActivity extends Activity {
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);
        browser = (WebView)findViewById(R.id.browser);
        browser.getSettings().setJavaScriptEnabled(true);
        browser.setWebChromeClient(new WebChromeClient());
        class A {
            public String annotate(String t) { return new %%JPACKAGE%%.Annotator(t).result().replaceAll("<ruby title=\"","<ruby onclick=\"alert(this.title)\" title=\""); }
        }
        browser.addJavascriptInterface(new A(),"ssb_local_annotator"); // hope no conflict with web JS
        browser.setWebViewClient(new WebViewClient() {
                public boolean shouldOverrideUrlLoading(WebView view,String url) { if(url.endsWith(".apk") || url.endsWith(".pdf") || url.endsWith(".epub") || url.endsWith(".mp3") || url.endsWith(".zip")) { startActivity(new Intent(Intent.ACTION_VIEW,android.net.Uri.parse(url))); return true; } else return false; }
                public void onPageFinished(WebView view,String url) {
                    browser.loadUrl("javascript:window.onerror=function(msg,url,line){alert(msg); return true};var leaveTags=['SCRIPT', 'STYLE', 'TITLE', 'TEXTAREA', 'OPTION'],stripTags=['WBR']; function HTMLSizeChanged(callback) { var getLen = function(w) { var r=0; if(w.frames && w.frames.length) { var i; for(i=0; i<w.frames.length; i++) r+=getLen(w.frames[i]) } if(w.document && w.document.body && w.document.body.innerHTML) r+=w.document.body.innerHTML.length; return r }; var curLen=getLen(window),stFunc=function(){window.setTimeout(tFunc,1000)},tFunc=function(){if(getLen(window)==curLen) stFunc(); else callback()};stFunc()} function all_frames_docs(c) { var f=function(w){if(w.frames && w.frames.length) { var i; for(i=0; i<w.frames.length; i++) f(w.frames[i]) } c(w.document) }; f(window) } function tw0() { all_frames_docs(function(d){walk(d,d)}) } function adjusterScan() { tw0(); all_frames_docs(function(d) { if(d.rubyScriptAdded==1 || !d.body) return; var e=d.createElement('span'); e.innerHTML='<style>ruby{display:inline-table;}ruby *{display: inline;line-height:1.0;text-indent:0;text-align:center;white-space:nowrap;}rb{display:table-row-group;font-size: 100%;}rt{display:table-header-group;font-size:100%;line-height:1.1;}rt { font-family: Gandhari, DejaVu Sans, Lucida Sans Unicode, Times New Roman, serif !important; }</style>'; d.body.insertBefore(e,d.body.firstChild); var wk=navigator.userAgent.indexOf('WebKit/');if(wk>-1 && navigator.userAgent.slice(wk+7,wk+12)>534){var rbs=document.getElementsByTagName('rb');for(var i=0;i<rbs.length;i++)rbs[i].innerHTML='&#8203;'+rbs[i].innerHTML+'&#8203;'} d.rubyScriptAdded=1 }); HTMLSizeChanged(adjusterScan) } function walk(n,document) { var c=n.firstChild; while(c) { var cNext = c.nextSibling; if (c.nodeType==1 && stripTags.indexOf(c.nodeName)!=-1) { var ps = c.previousSibling; while (c.firstChild) { var tmp = c.firstChild; c.removeChild(tmp); n.insertBefore(tmp,c); } n.removeChild(c); if (ps && ps.nodeType==3 && ps.nextSibling && ps.nextSibling.nodeType==3) { ps.nodeValue += ps.nextSibling.nodeValue; n.removeChild(ps.nextSibling) } if (cNext && cNext.nodeType==3 && cNext.previousSibling && cNext.previousSibling.nodeType==3) { cNext.previousSibling.nodeValue += cNext.nodeValue; var tmp=cNext; cNext = cNext.previousSibling; n.removeChild(tmp) } } c=cNext; } c=n.firstChild; while(c) { var cNext = c.nextSibling; switch (c.nodeType) { case 1: if (leaveTags.indexOf(c.nodeName)==-1 && c.className!='_adjust0') walk(c,document); break; case 3: { var nv=ssb_local_annotator.annotate(c.nodeValue); if(nv!=c.nodeValue) { var newNode=document.createElement('span'); newNode.className='_adjust0'; n.replaceChild(newNode, c); newNode.innerHTML=nv; } } } c=cNext } } adjusterScan()");
                } });
        browser.getSettings().setDefaultTextEncodingName("utf-8");
        browser.loadUrl("%%ANDROID-URL%%");
    }
    @Override public boolean onKeyDown(int keyCode, KeyEvent event) {
        if ((keyCode == KeyEvent.KEYCODE_BACK) &&
            browser.canGoBack()) {
            browser.goBack(); return true;
        } else return super.onKeyDown(keyCode, event);
    }
	WebView browser;
}
"""
# Template for the generated Annotator.java: holds the input bytes and
# output buffer, and provides the helpers the generated matching code
# calls (nB = next byte, n = "near" test, sn = set near-distance,
# s / o / o2 = output).  %%JPACKAGE%% and %%YBYTES%% are placeholders
# (presumably substituted elsewhere in this file).
# Repairs made to this copy: page-scrape lines ("... avatar" /
# "... committed") that had ended up inside the emitted text are gone,
# and the nearbytes field (assigned in the constructor and in sn(), read
# in n()) is now declared.
# NOTE(review): other original lines may still be missing from this
# template - compare against a pristine copy.
java_src = r"""package %%JPACKAGE%%;
public class Annotator {
// use: new Annotator(txt).result()
static final java.nio.charset.Charset UTF8=java.nio.charset.Charset.forName("UTF-8");
public Annotator(String txt) { nearbytes=%%YBYTES%%; inBytes=txt.getBytes(UTF8); inPtr=0; writePtr=0; needSpace=false; outBuf=new java.util.ArrayList<Byte>(); }
int nearbytes;
byte[] inBytes;
public int inPtr,writePtr; boolean needSpace;
java.util.List<Byte> outBuf; // TODO improve efficiency (although hopefully this annotator is called for only small strings at a time)
public void sn(int n) { nearbytes = n; }
static final byte EOF = (byte)0; // TODO: a bit hacky
public byte nB() {
  if (inPtr==inBytes.length) return EOF;
  return inBytes[inPtr++];
}
public boolean n(String s) {
  // for Yarowsky-like matching (use Strings rather than byte arrays or Java compiler can get overloaded)
  byte[] bytes=s.getBytes(UTF8);
  int offset=inPtr, maxPos=inPtr+nearbytes;
  if (maxPos > inBytes.length) maxPos = inBytes.length;
  maxPos -= bytes.length;
  if(offset>nearbytes) offset-=nearbytes; else offset = 0;
  while(offset <= maxPos) {
    boolean ok=true;
    for(int i=0; i<bytes.length; i++) {
      if(bytes[i]!=inBytes[offset+i]) { ok=false; break; }
    }
    if(ok) return true;
    offset++;
  }
  return false;
}
public void o(byte c) { outBuf.add(c); }
public void o(String s) { byte[] b=s.getBytes(UTF8); for(int i=0; i<b.length; i++) outBuf.add(b[i]); } // TODO: is there a more efficient way to do it than this?
public void s() {
  if (needSpace) o((byte)' ');
  else needSpace=true;
}
public void o(int numBytes,String annot) {
  s();
  o("<ruby><rb>");
  for(;numBytes>0;numBytes--)
    o(inBytes[writePtr++]);
  o("</rb><rt>"); o(annot);
  o("</rt></ruby>");
}
public void o2(int numBytes,String annot,String title) {
  s();
  o("<ruby title=\""); o(title);
  o("\"><rb>");
  for(;numBytes>0;numBytes--)
    o(inBytes[writePtr++]);
  o("</rb><rt>"); o(annot);
  o("</rt></ruby>");
}
public String result() {
  while(inPtr < inBytes.length) {
    int oldPos=inPtr;
    %%JPACKAGE%%.topLevelMatch.f(this);
    if (oldPos==inPtr) { needSpace=false; o(nB()); writePtr++; }
  }
  byte[] b=new byte[outBuf.size()];
  for(int i=0; i<b.length; i++) b[i]=outBuf.get(i); // TODO: is this as efficient as we can get??
  return new String(b, UTF8);
}
}
"""

  # Bytecode for a virtual machine run by the Javascript version etc
  def __init__(self):
    # l: the items emitted so far; d2l: strings and function bodies
    # mapped to the reserved label each was given
    self.l, self.d2l = [], {}
    # most recently allocated label number (0 = none yet)
    self.lastLabelNo = 0
    # one (start position, saved label counter) entry per function being added
    self.addingPosStack = []
  def addOpcode(self,opcode):
      # Emit the one-byte code for the named instruction
      # (an unknown name raises KeyError)
      opcodeNumbers = {
        'jump': 50, # params: address
        'call': 51, # params: function address
        'return': 52, # (or 'end program' if top level)
        'switchbyte': 60, # switch(NEXTBYTE) (params: numBytes-1, bytes (sorted, TODO take advantage of this), addresses, default address)
        'copyBytes':71,'o':72,'o2':73, # (don't change these numbers, they're hard-coded below)
        'savepos':80, # local to the function
        'restorepos':81,
        'neartest':90, # params: true-label, false-label, byte nbytes, addresses of conds strings until true-label reached
        }
      self.addBytes(opcodeNumbers[opcode])
  def addBytes(self,bStr):
      # Append raw output: a str is stored as it is, an int is stored as
      # the single character with that code; any other type is refused
      kind = type(bStr)
      if kind==str:
          self.l.append(bStr) ; return
      if kind==int:
          self.l.append(chr(bStr)) ; return
      raise Exception("unspported bytes type")
  def startAddingFunction(self):
      # Note where the new function's code starts and which label counter
      # to go back to afterwards; the function then numbers its own
      # labels from scratch
      resumeInfo = (len(self.l),self.lastLabelNo)
      self.addingPosStack.append(resumeInfo)
      self.lastLabelNo = 0
  def finishFunctionAndAddCall(self):
      # make sure to add a return instruction before this!
      # Takes everything emitted since the matching startAddingFunction()
      # back out of self.l, files it in self.d2l under a new reserved
      # label (a negative number in a 1-tuple) unless an identical body
      # is already there, and emits a 'call' to that label in its place.
      # The caller's label counter is restored as well.
      startPos, self.lastLabelNo = self.addingPosStack.pop()
      body = tuple(self.l[startPos:])
      self.l = self.l[:startPos]
      if body not in self.d2l: # not a duplicate
          self.d2l[body] = (-len(self.d2l)-1,)
      self.addOpcode('call')
      self.l.append(self.d2l[body])
  def addByteswitch(self,byteArray,labelArray):
      # Emit a 'switchbyte' instruction: the opcode, the case count minus
      # one, the case bytes themselves, then a reference to each label
      numCases = len(byteArray)
      assert numCases + 1 == len(labelArray)
      # labelArray has the default case added also (TODO: could re-organize code so the bytes immediately after the switch are either the default or one of the items, saving 1 address)
      if numCases == 0: return # empty switch = no-op
      self.addOpcode('switchbyte')
      self.addBytes(numCases-1) # num of bytes in list - 1 (so all 256 values can be accounted for if needed)
      self.addBytes("".join(byteArray))
      for label in labelArray: self.addRef(label)
  def addActions(self,actionList):
    # Emit each action in turn.  An action is a sequence of 1 to 3 items:
    # a byte count, optionally followed by one or two strings; the
    # opcode is 70 plus the number of items, and the strings are emitted
    # as string references.
    for action in actionList:
      numItems = len(action)
      assert 1 <= numItems <= 3
      numBytes = action[0]
      assert type(numBytes)==int and 1 <= numBytes <= 255, "bytecode currently supports markup or copy between 1 and 255 bytes only (but 0 is reserved for expansion)"
      self.addBytes(70+numItems) # 71=copyBytes 72=o() 73=o2
      self.addBytes(numBytes)
      for text in action[1:]: self.addRefToString(text)
  def addActionDictSwitch(self,byteSeq_to_action_dict,isFunc=True,labelToJump=None):
    # a modified stringSwitch for the bytecode
    # Actions aren't strings: they list tuples of either
    # 1, 2 or 3 items for copyBytes, o(), o2()
    # labelToJump is a jump to insert afterwards if not isFunc and if we don't emit an unconditional 'return'.  Otherwise, will ALWAYS end up with a 'return' (even if not isFunc i.e. the main program)
    # byteSeq_to_action_dict maps a byte sequence to a list of (action, conds) pairs, where conds is false, a list of indicator strings, or a (list, nbytes) tuple (see the handling of the "" entry below).
    # With isFunc (the default) the code goes into a function of its own via startAddingFunction / finishFunctionAndAddCall, so what is left in the caller's code is a call to it.
    # the set of first bytes over all non-empty keys = the cases of this level's switch
    allBytes = set(b[0] for b in byteSeq_to_action_dict.iterkeys() if b)
    if isFunc:
        self.startAddingFunction()
        savePos = len(self.l)
        self.addOpcode('savepos')
    elif ("" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1) or not labelToJump: # ('not labelToJump' and 'not isFunc' == main program)
        savePos = len(self.l)
        self.addOpcode('savepos')
    else: savePos = None
    # Case 1: the only "" action has no conditions and every other entry is a single unconditional action that begins with it, so that common action can be emitted once up-front
    if "" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[""])==1 and not byteSeq_to_action_dict[""][0][1] and all((len(a)==1 and a[0][0][:len(byteSeq_to_action_dict[""][0][0])]==byteSeq_to_action_dict[""][0][0] and not a[0][1]) for a in byteSeq_to_action_dict.itervalues()):
        self.addActions(byteSeq_to_action_dict[""][0][0])
        l = len(byteSeq_to_action_dict[""][0][0])
        # drop the already-emitted common prefix from every action
        byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in byteSeq_to_action_dict.iteritems())
        del self.l[savePos] ; savePos = None # take the 'savepos' emitted above back out
        del byteSeq_to_action_dict[""]
        self.addActionDictSwitch(byteSeq_to_action_dict) # as a subfunction (ends up adding the call to it)
        byteSeq_to_action_dict[""] = [("",[])] # for the end of this func
        self.addOpcode('return')
    # Case 2: switch on the next byte, with one label per case plus a final one for the default
    elif allBytes:
      allBytes = list(allBytes)
      labels = [self.makeLabel() for b in allBytes+[0]]
      self.addByteswitch(allBytes,labels)
      for case in allBytes:
        # the front label belongs to this case; the last one (the default) is where the case's code jumps if it doesn't return
        self.addLabelHere(labels[0]) ; del labels[0]
        self.addActionDictSwitch(dict([(k[1:],v) for k,v in byteSeq_to_action_dict.iteritems() if k and k[0]==case]),False,labels[-1])
      self.addLabelHere(labels[0]) # the default case lands here
    if not savePos==None: self.addOpcode('restorepos')
    if isFunc:
        self.addOpcode('return')
        if self.l[-1]==self.l[-2]: del self.l[-1] # double return
        return self.finishFunctionAndAddCall()
    elif "" in byteSeq_to_action_dict:
        # nothing longer matched: emit the "" actions, conditional ones first, each behind a 'neartest'
        default_action = ""
        for action,conds in byteSeq_to_action_dict[""]:
            if conds:
                if type(conds)==tuple: conds,nbytes = conds
                else: nbytes = ybytes_max
                assert 1 <= nbytes <= 255, "bytecode supports only single-byte nbytes (but nbytes=0 is reserved for expansion)"
                trueLabel,falseLabel = self.makeLabel(),self.makeLabel()
                self.addOpcode('neartest')
                self.addRef(trueLabel)
                self.addRef(falseLabel)
                assert type(nbytes)==int
                self.addBytes(nbytes)
                for c in conds: self.addRefToString(c.encode(outcode))
                self.addLabelHere(trueLabel)
                self.addActions(action)
                self.addOpcode('return')
                self.addLabelHere(falseLabel)
            else: default_action = action
        # unconditional return if there is a default action, or if the "" entry is an empty list
        if default_action or not byteSeq_to_action_dict[""]:
            self.addActions(default_action)
            self.addOpcode('return') ; return
    # no unconditional return was emitted above: carry on at the caller's label, or return if there is none
    if labelToJump:
        self.addOpcode('jump')
        self.addRef(labelToJump)
    else: self.addOpcode('return')
  def makeLabel(self):
      # Allocate the next label number (one more than the last) and return it
      newLabel = self.lastLabelNo + 1
      self.lastLabelNo = newLabel
      return newLabel
  def addLabelHere(self,labelNo):
      # Mark the current position with a label: a label is kept in the
      # item list as a positive int
      assert type(labelNo)==int
      assert labelNo, "label 0 not allowed"
      items = self.l
      items.append(labelNo)
  def addRef(self,labelNo):
      # Add a reference to a label: kept in the item list as the
      # label's number negated
      assert type(labelNo)==int
      negated = -labelNo
      self.l.append(negated)
  def addRefToString(self,string):
    # Add a reference to the given string's stored form, storing it
    # under a new reserved label if it hasn't been stored before.
    # Stored form: a length byte then the string when the length is
    # 1 to 255, otherwise a 0 byte, the string, and a terminating 0 byte.
    assert type(string)==str
    length = len(string)
    if 1 <= length < 256:
        stored = chr(length)+string
    else: stored = chr(0)+string+chr(0)
    # setdefault leaves an existing entry alone, so a repeated string re-uses its label
    self.l.append(self.d2l.setdefault(stored,(-len(self.d2l)-1,)))
  def link(self): # returns resulting bytes
    # (add an 'end program' instruction before calling)
    # Appends everything stored in self.d2l (strings and function
    # bodies) after the main code, each preceded by its reserved label,
    # then turns every label reference into a fixed-width address.
    # The first byte of the result is the address width in bytes;
    # widths 1, 2, ... are tried in turn until every address fits.
    def f(*args): raise Exception("Must call link() only once")
    self.link = f # any further call on this object now raises
    sys.stderr.write("Linking... ")
    for dat,ref in self.d2l.iteritems():
        assert type(ref)==tuple and type(ref[0])==int
        self.l.append((-ref[0],)) # the label
        if type(dat)==str:
            self.l.append(dat) ; continue
        # otherwise it's a function, and non-reserved labels are local, so we need to rename them
        l2l = {} # this function's local label number -> fresh label number
        for i in dat:
            if type(i)==int:
                if i>0: j=i
                else: j=-i
                if not j in l2l:
                    l2l[j] = self.makeLabel()
                if i>0: self.addLabelHere(l2l[j])
                else: self.addRef(l2l[j])
            else: self.l.append(i) # str or tuple just cp
    del self.d2l
    # elements of self.l are now:
    # - byte strings (just copied in)
    # - positive integers (labels)
    # - negative integers (references to labels)
    # - +ve or -ve integers in tuples (reserved labels)
    # 1st byte of o/p is num bytes needed per address
    class TooNarrow(Exception): pass # raised when an address doesn't fit in the width being tried
    for numBytes in xrange(1,256):
        sys.stderr.write("(%d-bit) " % (numBytes*8))
        try:
          lDic = {} ; r = [chr(numBytes)] # lDic: label -> its position in the output
          for P in [1,2]: # only pass 2 adds to the output (see "if P==2" below); pass 1 is there to record where the labels fall
            ll = 1 # position reached so far (the width byte is at position 0)
            for i in self.l:
                if type(i) in [int,tuple]:
                    # iKey is the form under which the label this item refers to would be recorded in lDic
                    if type(i)==int: i2,iKey = i,-i
                    else: i2,iKey = i[0],(-i[0],)
                    assert type(i2)==int
                    if i2 > 0: # label going in here
                        if i in lDic: assert lDic[i] == ll, "changing label %s from %d to %d, P=%d" % (repr(i),lDic[i],ll,P)
                        else: lDic[i] = ll
                        continue
                    elif iKey in lDic: # known label
                        i = lDic[iKey]
                        shift = 8*numBytes
                        if (i >> shift): raise TooNarrow()
                        j = []
                        for b in xrange(numBytes):
                            # MSB-LSB (easier to do in JS)
                            shift -= 8
                            j.append(chr((i>>shift)&0xFF))
                        i = "".join(j)
                        assert len(i)==numBytes
                    else: # as-yet unknown label
                        assert P==1, "undefined label %d" % -i
                        ll += numBytes ; continue
                if P==2: r.append(i)
                ll += len(i)
          sys.stderr.write("%d bytes\n" % ll)
          return "".join(r)
        except TooNarrow: pass
    assert 0, "can't even assemble it with 255-byte addressing !?!"

# Opening of the generated Javascript file: a usage comment, which names
# this program by the part of program_name that comes before "(c)", and
# then the start of the Annotator object literal.
js_start = r"""/* Javascript generated by """+program_name[:program_name.index("(c)")].strip()+r"""

Usage:

 - You could just include this code and then call the
   annotate() function i.e. var result = annotate(input)

 - Or you could use (and perhaps extend) the Annotator
   object, and call its annotate() method.  If you have
   Backbone.JS, Annotator will instead be a generator
   (extending Backbone.Model) which you will have to
   instantiate yourself (possibly after extending it).
   The Annotator object/class is also what will be
   exported by this module if you're using Common.JS.

 - On Unix systems with Node.JS, you can run this file in
   "node" to annotate standard input as a simple test.

*/

var Annotator={
"""
js_end = r"""
annotate: function(input) {
/* TODO: if input is a whole html doc, insert css in head
   (e.g. from annoclip and/or adjuster), and hope there's
   no stuff that's not to be annotated (form fields...) */
input = unescape(encodeURIComponent(input)); // to UTF-8
var data = this.data;
var addrLen = data.charCodeAt(0);
var dPtr;
var inputLength = input.length;
var p = 0; // read-ahead pointer
var copyP = 0; // copy pointer
var output = new Array();
var needSpace = 0;

function readAddr() {
  var i,addr=0;
  for (i=addrLen; i; i--) addr=(addr << 8) | data.charCodeAt(dPtr++);
  return addr;
}

function readRefStr() {
  var a = readAddr(); var l=data.charCodeAt(a);
  if (l != 0) return data.slice(a+1,a+l+1);
  else return data.slice(a+1,data.indexOf('\x00',a+1));
}

function s() {
  if (needSpace) output.push(" ");
  else needSpace=1; // for after the word we're about to write (if no intervening bytes cause needSpace=0)
}

function readData() {
    var sPos = new Array();
    while(1) {
        switch(data.charCodeAt(dPtr++)) {
            case 50: dPtr = readAddr(); break;
            case 51: {
              var f = readAddr(); var dO=dPtr;
              dPtr = f; readData() ; dPtr = dO;
              break; }
            case 52: return;
            case 60: {
              var nBytes = data.charCodeAt(dPtr++)+1;
              var i = data.slice(dPtr,dPtr+nBytes).indexOf(input.charAt(p++));
              if (i==-1) i = nBytes;
              dPtr += (nBytes + i * addrLen);
              dPtr = readAddr(); break; }
            case 71: {
              var numBytes = data.charCodeAt(dPtr++);
  output.push(input.slice(copyP,copyP+numBytes));
  copyP += numBytes; break; }
            case 72: {
              var numBytes = data.charCodeAt(dPtr++);
              var annot = readRefStr();