#!/usr/bin/env python
# (compatible with both Python 2.7 and Python 3)

"Annotator Generator v3.161 (c) 2012-21 Silas S. Brown"

# See http://ssb22.user.srcf.net/adjuster/annogen.html

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# If you want to compare this code to old versions, the old
# versions are being kept in the E-GuideDog SVN repository on
# http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster
# and on GitHub at https://github.com/ssb22/adjuster
# and on GitLab at https://gitlab.com/ssb22/adjuster
# and on BitBucket https://bitbucket.org/ssb22/adjuster
# and at https://gitlab.developers.cam.ac.uk/ssb22/adjuster
# and in China: https://gitee.com/ssb22/adjuster
# although some early ones are missing.

from optparse import OptionParser
parser = OptionParser()
import sys,os,os.path,tempfile,time,re,subprocess,unicodedata
try: from subprocess import getoutput
except: from commands import getoutput
if not "mac" in sys.platform and not "darwin" in sys.platform and ("win" in sys.platform or "mingw32" in sys.platform): exe=".exe" # Windows, Cygwin, etc
else: exe=""

#  =========== INPUT OPTIONS ==============

parser.add_option("--infile",
                  help="Filename of a text file (or a compressed .gz, .bz2 or .xz file or URL) to read the input examples from. If this is not specified, standard input is used.")

parser.add_option("--incode",default="utf-8",
                  help="Character encoding of the input file (default %default)")

parser.add_option("--mstart",
                  dest="markupStart",
                  default="<ruby><rb>",
                  help="The string that starts a piece of text with annotation markup in the input examples; default %default")

parser.add_option("--mmid",
                  dest="markupMid",
                  default="</rb><rt>",
                  help="The string that occurs in the middle of a piece of markup in the input examples, with the word on its left and the added markup on its right (or the other way around if mreverse is set); default %default")

parser.add_option("--mend",
                  dest="markupEnd",
                  default="</rt></ruby>",
                  help="The string that ends a piece of annotation markup in the input examples; default %default")

parser.add_option("-r","--mreverse",
                  action="store_true",default=False,
                  help="Specifies that the annotation markup is reversed, so the text BEFORE mmid is the annotation and the text AFTER it is the base text")
def cancelOpt(opt,act="store_false",dst=None):
  if not dst: dst=opt.replace("-","_")
  parser.add_option("--no-"+opt,action=act,dest=dst,help="Cancels any earlier --"+opt+" option in Makefile variables etc")
cancelOpt("mreverse")

parser.add_option("--reference-sep",
                  help="Reference separator code used in the example input.  If you want to keep example source references for each rule, you can label the input with 'references' (chapter and section numbers or whatever), and use this option to specify what keyword or other markup the input will use between each 'reference'.  The name of the next reference will be whatever text immediately follows this string.  Note that the reference separator, and the reference name that follows it, should not be part of the text itself and should therefore not be part of any annotation markup.  If this option is not set then references will not be tracked.")

parser.add_option("--ref-name-end",default=" ",
                  help="Sets what the input uses to END a reference name.  The default is a single space, so that the first space after the reference-sep string will end the reference name.")

parser.add_option("--ref-pri",
                  help="Name of a reference to be considered \"high priority\" for Yarowsky-like seed collocations (if these are in use).  Normally the Yarowsky-like logic tries to identify a \"default\" annotation based on what is most common in the examples, with the exceptions indicated by collocations.  If however a word is found in a high priority reference then the first annotation found in that reference will be considered the ideal \"default\" even if it's in a minority in the examples; everything else will be considered as an exception.")

parser.add_option("-s", "--spaces",
                  action="store_false",
                  dest="removeSpace",
                  default=True,
                  help="Set this if you are working with a language that uses whitespace in its non-markedup version (not fully tested).  The default is to assume that there will not be any whitespace in the language, which is correct for Chinese and Japanese.")
cancelOpt("spaces","store_true","removeSpace")

parser.add_option("-c", "--capitalisation",
                  action="store_true",
                  default=False,
                  help="Don't try to normalise capitalisation in the input.  Normally, to simplify the rules, the analyser will try to remove start-of-sentence capitals in annotations, so that the only remaining words with capital letters are the ones that are ALWAYS capitalised such as names.  (That's not perfect: some words might always be capitalised just because they never occur mid-sentence in the examples.)  If this option is used, the analyser will instead try to \"learn\" how to predict the capitalisation of ALL words (including start of sentence words) from their contexts.") # TODO: make the C program put the sentence capitals back
cancelOpt("capitalisation")

parser.add_option("-w", "--annot-whitespace",
                  action="store_true",
                  default=False,
                  help="Don't try to normalise the use of whitespace and hyphenation in the example annotations.  Normally the analyser will try to do this, to reduce the risk of missing possible rules due to minor typographical variations.") # TODO: can this be extended to the point where the words 'try to' can be deleted ?  see comments
cancelOpt("annot-whitespace")
parser.add_option("--keep-whitespace",
                  help="Comma-separated list of words (without annotation markup) for which whitespace and hyphenation should always be kept even without the --annot-whitespace option.  Use when you know the variation is legitimate. This option expects words to be encoded using the system locale (UTF-8 if it cannot be detected).")

parser.add_option("--normalised-file",
                  help="Filename of an optional text file (or compressed .gz, .bz2 or .xz file) to write a copy of the normalised input for diagnostic purposes.  If this is set to the same as --infile then it will be assumed the input file has already been normalised (use with care).")
parser.add_option("--normalise-only",
                  action="store_true",
                  default=False,
                  help="Exit after normalising the input")
cancelOpt("normalise-only")

parser.add_option("--post-normalise",
                  help="Filename of an optional Python module defining a dictionary called 'table' mapping integers to integers for arbitrary single-character normalisation on the Unicode BMP.  This is meant for reducing the size of generated Android apps, and is applied in post-processing (does not affect rules generation itself).  For example this can be used to merge the recognition of Full, Simplified and Variant forms of the same Chinese character in cases where this can be done without ambiguity, if it is acceptable for the generated annotator to recognise mixed-script words should they occur.")

parser.add_option("--glossfile",
                  help="Filename of an optional text file (or compressed .gz, .bz2 or .xz file or URL) to read auxiliary \"gloss\" information.  Each line of this should be of the form: word (tab) annotation (tab) gloss.  Extra tabs in the gloss will be converted to newlines (useful if you want to quote multiple dictionaries).  When the compiled annotator generates ruby markup, it will add the gloss string as a popup title whenever that word is used with that annotation (before any reannotator option is applied).  The annotation field may be left blank to indicate that the gloss will appear for all other annotations of that word.  The entries in glossfile do NOT affect the annotation process itself, so it's not necessary to completely debug glossfile's word segmentation etc.")
parser.add_option("-C", "--gloss-closure",
                  action="store_true",
                  default=False,
                  help="If any Chinese, Japanese or Korean word is missing from glossfile, search its closure of variant characters also. This option requires the cjklib package.") # TODO: option to put variant closures into the annotator itself? (generate new rules if not already exist + closure the 'near' tests) but that could unnecessarily increase the annotator size (with --data-driven the increase could be significant unless we implement shared-substringVariants optimisations, and even then it's unclear how this would interact with the space-saving of common-prefix multibyte sequences), + it might not be correct in all cases, e.g. U+91CC in jianti SHOULDN'T be translated to U+88E1/U+88CF in fanti if it's part of a name, although recognising a 'messed-up' name with that substitution might be acceptable. Anyway, using these closures to fill in a missing gloss should be tolerable.
cancelOpt("gloss-closure")
parser.add_option("--glossmiss",
                  help="Name of an optional file to which to write information about words recognised by the annotator that are missing in glossfile (along with frequency counts and references, if available)") # (default sorted alphabetically, but you can pipe through sort -rn to get most freq 1st)
parser.add_option("--glossmiss-hide",
                  help="Comma-separated list of references to hide from the glossmiss file (does not affect the glossmiss-omit option)")
parser.add_option("--glossmiss-match",
                  help="If supplied, any references not matching this regular expression will be hidden from the glossmiss file (does not affect the glossmiss-omit option)")
parser.add_option("-M","--glossmiss-omit",
                  action="store_true",
                  default=False,
                  help="Omit rules containing any word not mentioned in glossfile.  Might be useful if you want to train on a text that uses proprietary terms and don't want to accidentally 'leak' those terms (assuming they're not accidentally included in glossfile also).  Words may also be listed in glossfile with an empty gloss field to indicate that no gloss is available but rules using this word needn't be omitted.")
cancelOpt("glossmiss-omit")

parser.add_option("--words-omit",
                  help="File (or compressed .gz, .bz2 or .xz file or URL) containing words (one per line, without markup) to omit from the annotator.  Use this to make an annotator smaller if for example if you're working from a rules file that contains long lists of place names you don't need this particular annotator to recognise but you still want to keep them as rules for other annotators, but be careful because any word on such a list gets omitted even if it also has other meanings (some place names are also normal words).")

parser.add_option("--manualrules",
                  help="Filename of an optional text file (or compressed .gz, .bz2 or .xz file or URL) to read extra, manually-written rules.  Each line of this should be a marked-up phrase (in the input format) which is to be unconditionally added as a rule.  Use this sparingly, because these rules are not taken into account when generating the others and they will be applied regardless of context (although a manual rule might fail to activate if the annotator is part-way through processing a different rule); try checking messages from --diagnose-manual.") # (or if there's a longer automatic match)

#  =========== OUTPUT OPTIONS ==============

parser.add_option("--rulesFile",help="Filename of an optional auxiliary binary file to hold the accumulated rules. Adding .gz, .bz2 or .xz for compression is acceptable. If this is set then the rules will be written to it (in binary format) as well as to the output. Additionally, if the file already exists then rules will be read from it and incrementally updated. This might be useful if you have made some small additions to the examples and would like these to be incorporated without a complete re-run. It might not work as well as a re-run but it should be faster. If using a rulesFile then you must keep the same input (you may make small additions etc, but it won't work properly if you delete many examples or change the format between runs) and you must keep the same ybytes-related options if any.") # You may however change whether or not a --single-words / --max-words option applies to the new examples (but hopefully shouldn't have to)

parser.add_option("-n","--no-input",
                  action="store_true",default=False,
                  help="Don't process new input, just use the rules that were previously stored in rulesFile. This can be used to increase speed if the only changes made are to the output options. You should still specify the input formatting options (which should not change), and any glossfile or manualrules options (which may change). For the glossmiss and summary options to work correctly, unchanged input should be provided.")
cancelOpt("no-input")

parser.add_option("--c-filename",default="",help="Where to write the C, C#, Python, Javascript, Go or Dart program. Defaults to standard output, or annotator.c in the system temporary directory if standard output seems to be the terminal (the program might be large, especially if Yarowsky-like indicators are not used, so it's best not to use a server home directory where you might have limited quota). If MPI is in use then the default will always be standard output.") # because the main program might not be running on the launch node

parser.add_option("--c-compiler",default="cc -o annotator"+exe,help="The C compiler to run if generating C and standard output is not connected to a pipe. The default is to use the \"cc\" command which usually redirects to your \"normal\" compiler. You can add options (remembering to enclose this whole parameter in quotes if it contains spaces), but if the C program is large then adding optimisation options may make the compile take a LONG time. If standard output is connected to a pipe, then this option is ignored because the C code will simply be written to the pipe. You can also set this option to an empty string to skip compilation. Default: %default")
# If compiling an experimental annotator quickly (and don't want to use --data-driven for some reason), then you might try tcc as it compiles fast. If tcc is not available on your system then clang might compile faster than gcc.
# (BUT tcc can have problems on Raspberry Pi see http://www.raspberrypi.org/phpBB3/viewtopic.php?t=30036&p=263213; can be best to cross-compile, e.g. from a Mac use https://github.com/UnhandledException/ARMx/wiki/Sourcery-G---Lite-for-ARM-GNU-Linux-(2009q3-67)-for-Mac-OS-X and arm-none-linux-gnueabi-gcc)
# In large rulesets with --max-or-length=0 and --nested-switch, gcc takes time and gcc -Os can take a LOT longer, and CINT, Ch and picoc run out of memory.  Without these options the overhead of gcc's -Os isn't so bad (and does save some room).
# clang with --max-or-length=100 and --nested-switch=0 is not slowed much by -Os (slowed considerably by -O3). -Os and -Oz gave same size in my tests.
# on 64-bit distros -m32 won't always work and won't necessarily give a smaller program

parser.add_option("--max-or-length",default=100,help="The maximum number of items allowed in an OR-expression in non table-driven code (used when ybytes is in effect). When an OR-expression becomes larger than this limit, it will be made into a function. 0 means unlimited, which works for tcc and gcc; many other compilers have limits. Default: %default")

parser.add_option("--nested-if",
                  action="store_true",default=False,
                  help="Allow C/C#/Java/Go if() blocks (but not switch() constructs) to be nested to unlimited depth.  This probably increases the workload of the compiler's optimiser when reducing size, but may help when optimising for speed.")
cancelOpt("nested-if")
parser.add_option("--nested-switch",default=0,
                  help="Allow C/C#/Java/Go switch() constructs to be nested to about this depth.  Default 0 tries to avoid nesting, as it slows down most C compilers for small savings in executable size.  Setting 1 nests 1 level deeper which can occasionally help get around memory problems with Java compilers.  -1 means nest to unlimited depth, which is not recommended.  Setting this to anything other than 0 implies --nested-if also.") # tcc is still fast (although that doesn't generate the smallest executables anyway)

parser.add_option("--outcode",default="utf-8",
                  help="Character encoding to use in the generated parser and rules summary (default %default, must be ASCII-compatible i.e. not utf-16)")

parser.add_option("-S", "--summary-only",
                  action="store_true",default=False,
                  help="Don't generate a parser, just write the rules summary to standard output")
cancelOpt("summary-only")

parser.add_option("-N","--no-summary",
                  action="store_true",default=False,
                  help="Don't add a large rules-summary comment at the end of the parser code")
cancelOpt("no-summary")

parser.add_option("-O", "--summary-omit",
                  help="Filename of a text file (or a compressed .gz, .bz2 or .xz file or URL) specifying what should be omitted from the rules summary.  Each line should be a word or phrase, a tab, and its annotation (without the mstart/mmid/mend markup).  If any rule in the summary exactly matches any of the lines in this text file, then that rule will be omitted from the summary (but still included in the parser).  Use for example to take out of the summary any entries that correspond to things you already have in your dictionary, so you can see what's new.")

parser.add_option("--maxrefs",default=3,
                  help="The maximum number of example references to record in each summary line, if references are being recorded (0 means unlimited).  Default is %default.")

parser.add_option("-R","--norefs",
                  action="store_true",default=False,
                  help="Don't write references in the rules summary (or the glossmiss file).  Use this if you need to specify reference-sep and ref-name-end for the ref-pri option but you don't actually want references in the summary (which speeds up summary generation slightly).  This option is automatically turned on if --no-input is specified.") # the speed difference is not so great as of v0.593, but needed anyway if --no-input is set
cancelOpt("norefs")

parser.add_option("-E","--newlines-reset",
                  action="store_false",
                  dest="ignoreNewlines",
                  default=True,
                  help="Have the annotator reset its state on every newline byte. By default newlines do not affect state such as whether a space is required before the next word, so that if the annotator is used with Web Adjuster's htmlText option (which defaults to using newline separators) the spacing should be handled sensibly when there is HTML markup in mid-sentence.")
cancelOpt("newlines-reset","store_true","ignoreNewlines")

parser.add_option("-z","--compress",
                  action="store_true",default=False,
                  help="Compress annotation strings in the C code.  This compression is designed for fast on-the-fly decoding, so it saves only a limited amount of space (typically 10-20%) but might help if RAM is short; see also --data-driven.")
cancelOpt("compress")

parser.add_option("-D","--data-driven",
                  action="store_true",default=False,
                  help="Generate a program that works by interpreting embedded data tables for comparisons, instead of writing these as code.  This can take some load off the compiler (so try it if you get errors like clang's \"section too large\"), as well as compiling faster and reducing the resulting binary's RAM size (by 35-40% is typical), at the expense of a small reduction in execution speed.  Javascript, Python and Dart output is always data-driven anyway.") # If the resulting binary is compressed (e.g. in an APK), its compressed size will likely not change much (same information content), so I'm specifically saying "RAM size" i.e. when decompressed
cancelOpt("data-driven")
parser.add_option("-F","--fast-assemble",
                  action="store_true",default=False,
                  help="Skip opcode compaction when using data-driven (slightly speeds up compilation, at the expense of larger code size)") # TODO: consider removing this option now it's no longer very slow anyway
cancelOpt("fast-assemble")

parser.add_option("-Z","--zlib",
                  action="store_true",default=False,
                  help="Enable --data-driven and compress the embedded data table using zlib (or pyzopfli if available), and include code to call zlib to decompress it on load.  Useful if the runtime machine has the zlib library and you need to save disk space but not RAM (the decompressed table is stored separately in RAM, unlike --compress which, although giving less compression, at least works 'in place').  Once --zlib is in use, specifying --compress too will typically give an additional disk space saving of less than 1% (and a runtime RAM saving that's greater but more than offset by zlib's extraction RAM).  If generating a Javascript annotator with zlib, the decompression code is inlined so there's no runtime zlib dependency, but startup can be ~50% slower so this option is not recommended in situations where the annotator is frequently reloaded from source (unless you're running on Node.js in which case loading is faster due to the use of Node's \"Buffer\" class).") # compact_opcodes typically still helps no matter what the other options are
cancelOpt("zlib")

parser.add_option("-l","--library",
                  action="store_true",default=False,
                  help="Instead of generating C code that reads and writes standard input/output, generate a C library suitable for loading into Python via ctypes.  This can be used for example to preload a filter into Web Adjuster to cut process-startup delays.")
cancelOpt("library")

parser.add_option("-W","--windows-clipboard",
                  action="store_true",default=False,
                  help="Include C code to read the clipboard on Windows or Windows Mobile and to write an annotated HTML file and launch a browser, instead of using the default cross-platform command-line C wrapper.  See the start of the generated C file for instructions on how to compile for Windows or Windows Mobile.")
cancelOpt("windows-clipboard")

parser.add_option("-#","--c-sharp",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate C# (not quite as efficient as the C code but close; might be useful for adding an annotator to a C# project; see comments at the start for usage)")
cancelOpt("c-sharp")

parser.add_option("--java",
                  help="Instead of generating C code, generate Java, and place the *.java files in the directory specified by this option.  See --android for example use.  The last part of the directory should be made up of the package name; a double slash (//) should separate the rest of the path from the package name, e.g. --java=/path/to/wherever//org/example/package and the main class will be called Annotator.")
parser.add_option("--android",
                  help="URL for an Android app to browse.  If this is set, code is generated for an Android app which starts a browser with that URL as the start page, and annotates the text on every page it loads.  Use file:///android_asset/index.html for local HTML files in the assets directory; a clipboard viewer is placed in clipboard.html, and the app will also be able to handle shared text.  If certain environment variables are set, this option can also compile and sign the app using Android SDK command-line tools (SDK 24 or higher is required on the build machine, but the resulting app will be compatible with all versions of Android back to 1.x); if the necessary environment variables are not set, this option will just write the files and print a message on stderr explaining what needs to be set for automated command-line building.")
# SDK 24 was released mid-2016.  If your main OS cannot be upgraded, you should be able to install a newer SDK on a virtual machine, e.g. on a 2011 Mac stuck on MacOS 10.7, I used VirtualBox 4.3.4, Vagrant 1.9.5, Debian 8 Jessie and SSH with X11 forwarding to install Android Studio 3.5 from 2019, although for apksigner to work I also had to add 'deb http://archive.debian.org/debian/ jessie-backports main' to /etc/apt/sources.list and do 'sudo apt-get -o Acquire::Check-Valid-Until=false update' and 'sudo apt-get install -t jessie-backports openjdk-8-jdk openjdk-8-jre openjdk-8-jre-headless ca-certificates-java' and 'sudo apt-get --purge remove openjdk-7-jre-headless'
parser.add_option("--android-template",
                  help="File to use as a template for Android start HTML.  This option implies --android=file:///android_asset/index.html and generates that index.html from the file specified (or from a built-in default if the special filename 'blank' is used).  The template file may include URL_BOX_GOES_HERE to show a URL entry box and related items (offline-clipboard link etc) in the page, in which case you can optionally define a Javascript function 'annotUrlTrans' to pre-convert some URLs from shortcuts etc and 'desktopURL' if you have a page about how to get a desktop version (e.g. via browser extension); also enables better zoom controls on Android 4+, a mode selector if you use --annotation-names, a selection scope control on recent-enough WebKit, and a visible version stamp (which, if the device is in 'developer mode', you may double-tap on to show missing glosses). If you do include URL_BOX_GOES_HERE you'll have an annotating Web browser app that allows the user to navigate to arbitrary URLs: as of 2020, this is acceptable on Google Play and Huawei AppGallery, but NOT Amazon AppStore as they don't want 'competition' to their Silk browser.") # but some devices allow APKs to be 'side-loaded'.  annotUrlTrans returns undefined = uses original
parser.add_option("-L","--pleco-hanping",
                  action="store_true",default=False,
                  help="In the Android app, make popup definitions link to Pleco or Hanping if installed")
cancelOpt("pleco-hanping")

parser.add_option("--bookmarks",
                  help="Android bookmarks: comma-separated list of package names that share our bookmarks. If this is not specified, the browser will not be given a bookmarks function. If it is set to the same value as the package specified in --java, bookmarks are kept in just this Android app. If it is set to a comma-separated list of packages that have also been generated by annogen (presumably with different annotation types), and if each one has the same android:sharedUserId attribute in AndroidManifest.xml's 'manifest' tag (you'll need to add this manually), and if the same certificate is used to sign all of them, then bookmarks can be shared across the set of browser apps.  But beware the following two issues: (1) adding an android:sharedUserId attribute to an app that has already been released without one causes some devices to refuse the update with a 'cannot install' message (details via adb logcat; affected users would need to uninstall and reinstall instead of update, and some of them may not notice the instruction to do so); (2) this has not been tested with Google's new \"App Bundle\" arrangement, and may be broken if the Bundle results in APKs being signed by a different key.  In June 2019 Play Console started issuing warnings if you release an APK instead of a Bundle, even though the \"size savings\" they mention are under 1% for annogen-generated apps.") # (the only resource that might vary by device is the launcher icon)
parser.add_option("-e","--epub",
                  action="store_true",default=False,
                  help="When generating an Android browser, make it also respond to requests to open EPUB files. This results in an app that requests the 'read external storage' permission on Android versions below 6, so if you have already released a version without EPUB support then devices running Android 5.x or below will not auto-update past this change until the user notices the update notification and approves the extra permission.") # see comments around READ_EXTERNAL_STORAGE below
cancelOpt("epub")
parser.add_option("--android-print",
                  action="store_true",default=False,
                  help="When generating an Android browser, include code to provide a Print option (usually print to PDF) and a simple highlight-selection option. The Print option will require Android 4.4, but the app should still run without it on earlier versions of Android.")
cancelOpt("android-print")
parser.add_option("--android-audio",help="When generating an Android browser, include an option to convert the selection to audio using this URL as a prefix, e.g. https://example.org/speak.cgi?text= (use for languages not likely to be supported by the device itself). Optionally follow the URL with a space (quote carefully) and a maximum number of words to read in each user request. Setting a limit is recommended, or somebody somewhere will likely try 'Select All' on a whole book or something and create load problems. You should set a limit server-side too of course.") # do need https if we're Android 5+ and will be viewing HTTPS pages, or Chrome will block (OK if using EPUB-etc or http-only pages)
parser.add_option("--android-urls",
                  help="Whitespace-separated list of URL prefixes to offer to be a browser for, when a matching URL is opened by another Android application. If any path (but not scheme or domain) contains .* then it is treated as a pattern instead of a prefix, but Android cannot filter on query strings (i.e. text after question-mark).")
parser.add_option("--extra-js",help="Extra Javascript to inject into sites to fix things in the Android browser app. The snippet will be run before each scan for new text to annotate. You may also specify a file to read: --extra-js=@file.js (do not use // comments, only /* ... */ because newlines will be replaced)")
parser.add_option("--existing-ruby-js-fixes",help="Extra Javascript to run in the Android browser app whenever existing RUBY elements are encountered; the DOM node above these elements will be in the variable n, which your code can manipulate to fix known problems with sites' existing ruby (such as common two-syllable words being split when they shouldn't be). Use with caution. You may also specify a file to read: --existing-ruby-js-fixes=@file.js")
parser.add_option("--delete-existing-ruby",action="store_true",default=False,help="Set the Android app or browser extension to completely remove existing ruby elements. Use this when you expect to replace a site's own annotation with a completely different type of annotation. If you also supply --existing-ruby-js-fixes and/or --existing-ruby-shortcut-yarowsky, then --delete-existing-ruby specifies that only the first --sharp-multi option should have existing ruby preserved.")
parser.add_option("--existing-ruby-shortcut-yarowsky",action="store_true",default=False,help="Set the Android browser app to 'shortcut' Yarowsky-like collocation decisions when adding glosses to existing ruby over 2 or more characters, so that words normally requiring context to be found are more likely to be found without context (this may be needed because adding glosses to existing ruby is done without regard to context)") # (an alternative approach would be to collapse the existing ruby markup to provide the context, but that could require modifying the inner functions to 'see' context outside the part they're annotating)
parser.add_option("--extra-css",help="Extra CSS to inject into sites to fix things in the Android browser app. You may also specify a file to read --extra-css=@file.css")
parser.add_option("--app-name",default="Annotating browser",
                  help="User-visible name of the Android app")

parser.add_option("--compile-only",
                  action="store_true",default=False,
                  help="Assume the code has already been generated by a previous run, and just run the compiler")
cancelOpt("compile-only")

parser.add_option("-j","--javascript",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate JavaScript.  This might be useful if you want to run an annotator on a device that has a JS interpreter but doesn't let you run native code.  The JS will be table-driven to make it load faster (and --no-summary will also be set).  See comments at the start for usage.") # but it's better to use the C version if you're in an environment where 'standard input' makes sense
cancelOpt("javascript")

parser.add_option("-6","--js-6bit",
                  action="store_true",default=False,
                  help="When generating a Javascript annotator, use a 6-bit format for many addresses to reduce escape codes in the data string by making more of it ASCII. Not relevant if using zlib.") # May result in marginally slower JS, but it should be smaller and parse more quickly on initial load, which is normally the dominant factor if you have to reload it on every page.
cancelOpt("js-6bit")

parser.add_option("-8","--js-octal",
                  action="store_true",default=False,
                  help="When generating a Javascript annotator, use octal instead of hexadecimal codes in the data string when doing so would save space. This does not comply with ECMAScript 5 and may give errors in its strict mode. Not relevant if using zlib.")
cancelOpt("js-octal")

parser.add_option("-9","--ignore-ie8",
                  action="store_true",default=False,
                  help="When generating a Javascript annotator, do not make it backward-compatible with Microsoft Internet Explorer 8 and below. This may save a few bytes. Not relevant if using zlib.")
cancelOpt("ignore-ie8")

parser.add_option("-u","--js-utf8",
                  action="store_true",default=False,
                  help="When generating a Javascript annotator, assume the script can use UTF-8 encoding directly and not via escape sequences. In some browsers this might work only on UTF-8 websites, and/or if your annotation can be expressed without the use of Unicode combining characters.")
cancelOpt("js-utf8")

parser.add_option("--browser-extension", help="Name of a Chrome or Firefox browser extension to generate.  The extension will be placed in a directory of the same name (without spaces), which may optionally already exist and contain icons like 32.png and 48.png to be used.")
# To test the resulting extension locally:
# Firefox: about:debugging - 'this firefox' - load temporary add-on - manifest.json
# Chrome: chrome://extensions - Developer mode - Load unpacked - select the directory
# Chrome bug: browser_style true gives unreadable text in Chromium 89 with enable-force-dark set to "Enabled with selective inversion of everything" (and possibly other settings)

parser.add_option("--dart",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate Dart.  This might be useful if you want to run an annotator in a Flutter application.")
cancelOpt("dart")

parser.add_option("--dart-datafile",
                  help="When generating Dart code, put annotator data into a separate file and open it using this pathname. Not compatible with Dart's \"Web app\" option, but might save space in a Flutter app (especially along with --zlib)")

parser.add_option("-Y","--python",
                  action="store_true",default=False,
                  help="Instead of generating C code, generate a Python module.  Similar to the Javascript option, this is for when you can't run native code, and it is table-driven for fast loading.")
cancelOpt("python")

parser.add_option("--golang",
                  help="[DEPRECATED] Package name for a Go library to generate instead of C code.  See comments in the generated file for how to run this on old AppEngine with Go 1.11 or below.  Deprecated because newer AppEngine runtimes work differently (and the \"flexible\" environment can run C code); this option will probably be removed if they shut down the old free-tier runtimes.")

parser.add_option("--reannotator",
                  help="Shell command through which to pipe each word of the original text to obtain new annotation for that word.  This might be useful as a quick way of generating a new annotator (e.g. for a different topolect) while keeping the information about word separation and/or glosses from the previous annotator, but it is limited to commands that don't need to look beyond the boundaries of each word.  If the command is prefixed by a # character, it will be given the word's existing annotation instead of its original text, and if prefixed by ## it will be given text#annotation.  The command should treat each line of its input independently, and both its input and its output should be in the encoding specified by --outcode.") # TODO: reannotatorCode instead? (see other 'reannotatorCode' TODOs)
# (Could just get the reannotator to post-process the 1st annotator's output, but that might be slower than generating an altered annotator with it)
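# Illustrative reannotator sketch (hypothetical script): --reannotator='#python3 tonemarks.py'
# pipes each word's EXISTING annotation (because of the leading '#') through tonemarks.py,
# one word per line in the --outcode encoding; the script's output becomes the new annotation.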

parser.add_option("-A","--reannotate-caps",
                  action="store_true",default=False,
                  help="When using --reannotator, make sure to capitalise any word it returns that began with a capital on input")
cancelOpt("reannotate-caps")

parser.add_option("--sharp-multi",
                  action="store_true",default=False,
                  help="Assume annotation (or reannotator output) contains multiple alternatives separated by # (e.g. pinyin#Yale) and include code to select one by number at runtime (starting from 0). This is to save on total space when shipping multiple annotators that share the same word grouping and gloss data, differing only in the transcription of each word.")
cancelOpt("sharp-multi")
parser.add_option("--annotation-names",help="Comma-separated list of annotation types supplied to sharp-multi (e.g. Pinyin,Yale), if you want the Android app etc to be able to name them.  You can also set just one annotation names here if you are not using sharp-multi.")

#  =========== ANALYSIS OPTIONS ==============

parser.add_option("-o", "--allow-overlaps",
                  action="store_true",default=False,
                  help="Normally, the analyser avoids generating rules that could overlap with each other in a way that would leave the program not knowing which one to apply.  If a short rule would cause overlaps, the analyser will prefer to generate a longer rule that uses more context, and if even the entire phrase cannot be made into a rule without causing overlaps then the analyser will give up on trying to cover that phrase.  This option allows the analyser to generate rules that could overlap, as long as none of the overlaps would cause actual problems in the example phrases. Thus more of the examples can be covered, at the expense of a higher risk of ambiguity problems when applying the rules to other texts.  See also the -y option.")
cancelOpt("allow-overlaps")

parser.add_option("-P", "--primitive",
                  action="store_true",default=False,
                  help="Don't bother with any overlap or conflict checks at all, just make a rule for each word. The resulting parser is not likely to be useful, but the summary might be.")
cancelOpt("primitive")

parser.add_option("-y","--ybytes",default=0,
                  help="Look for candidate Yarowsky seed-collocations within this number of bytes of the end of a word.  If this is set then overlaps and rule conflicts will be allowed when seed collocations can be used to distinguish between them, and the analysis is likely to be faster.  Markup examples that are completely separate (e.g. sentences from different sources) must have at least this number of (non-whitespace) bytes between them.")
parser.add_option("--ybytes-max",default=0,
                  help="Extend the Yarowsky seed-collocation search to check over larger ranges up to this maximum.  If this is set then several ranges will be checked in an attempt to determine the best one for each word, but see also ymax-threshold.")
parser.add_option("--ymax-threshold",default=1,
                  help="Limits the length of word that receives the narrower-range Yarowsky search when ybytes-max is in use. For words longer than this, the search will go directly to ybytes-max. This is for languages where the likelihood of a word's annotation being influenced by its immediate neighbours more than its distant collocations increases for shorter words, and less is to be gained by comparing different ranges when processing longer words. Setting this to 0 means no limit, i.e. the full range will be explored on ALL Yarowsky checks.") # TODO: see TODO below re temporary recommendation of --ymax-threshold=0
parser.add_option("--ybytes-step",default=3,
                  help="The increment value for the loop between ybytes and ybytes-max")
parser.add_option("-k","--warn-yarowsky",
                  action="store_true",default=False,
                  help="Warn when absolutely no distinguishing Yarowsky seed collocations can be found for a word in the examples")
cancelOpt("warn-yarowsky")
parser.add_option("-K","--yarowsky-all",
                  action="store_true",default=False,
                  help="Accept Yarowsky seed collocations even from input characters that never occur in annotated words (this might include punctuation and example-separation markup)")
cancelOpt("yarowsky-all")
parser.add_option("--yarowsky-debug",default=1,
                  help="Report the details of seed-collocation false positives if there are a large number of matches and at most this number of false positives (default %default). Occasionally these might be due to typos in the corpus, so it might be worth a check.")
parser.add_option("--normalise-debug",default=1,
                  help="When --capitalisation is not in effect. report words that are usually capitalised but that have at most this number of lower-case exceptions (default %default) for investigation of possible typos in the corpus")

parser.add_option("-1","--single-words",
                  action="store_true",default=False,
                  help="Do not consider any rule longer than 1 word, although it can still have Yarowsky seed collocations if -y is set. This speeds up the search, but at the expense of thoroughness. You might want to use this in conjuction with -y to make a parser quickly. It is like -P (primitive) but without removing the conflict checks.")
cancelOpt("single-words")
parser.add_option("--max-words",default=0,
                  help="Limits the number of words in a rule; rules longer than this are not considered.  0 means no limit.  --single-words is equivalent to --max-words=1.  If you need to limit the search time, and are using -y, it should suffice to use --single-words for a quick annotator or --max-words=5 for a more thorough one.")  # (There was a bug in annogen versions before 0.58 that caused --max-words to additionally limit how far away from the start of its phrase a rule-example must be placed; this has now been fixed.  There was also a bug that resulted in too many extra rules being tested over already-catered-for phrases; as this has now been fixed, the additional benefit of a --max-words limit is now reduced, but you might want to put one in anyway.  That second bug also had the effect of the coverage % being far too low in the progress stats.)

# TODO: optionally (especially if NOT using Yarowsky) do an additional pass (after discovering all other rules) and turn whole phrases that are not completely covered by other rules into whole-phrase rules, if it doesn't conflict 1 phrase w. anothr of equal priority; shld be ok if no overlap, overlaps wld *sometimes* be ok suggest a len threshold

parser.add_option("--checkpoint",help="Periodically save checkpoint files in the specified directory.  These files can save time when starting again after a reboot (and it's easier than setting up Condor etc).  As well as a protection against random reboots, this can be used for scheduled reboots: if file called ExitASAP appears in the checkpoint directory, annogen will checkpoint, remove the ExitASAP file, and exit.  After a run has completed, the checkpoint directory should be removed, unless you want to re-do the last part of the run for some reason.")
# (Condor can checkpoint an application on Win/Mac/Linux but is awkward to set up.  Various Linux and BSD application checkpoint approaches also exist, and virtual machines can have their state saved.  On the other hand the physical machine might have a 'hibernate' option which is easier.)
parser.add_option("--checkpoint-period",default=1000,help="Approximate number of seconds between checkpoints (default %default).  Setting this to 0 disables periodic checkpoints but still allows use of checkpoint directory for concurrency or ExitASAP processing.")

parser.add_option("-d","--diagnose",help="Output some diagnostics for the specified word. Use this option to help answer \"why doesn't it have a rule for...?\" issues. This option expects the word without markup and uses the system locale (UTF-8 if it cannot be detected).")
parser.add_option("--diagnose-limit",default=10,help="Maximum number of phrases to print diagnostics for (0 means unlimited); can be useful when trying to diagnose a common word in rulesFile without re-evaluating all phrases that contain it. Default: %default")
parser.add_option("-m","--diagnose-manual",
                  action="store_true",default=False,
                  help="Check and diagnose potential failures of --manualrules")
cancelOpt("diagnose-manual")
parser.add_option("-q","--diagnose-quick",
                  action="store_true",default=False,
                  help="Ignore all phrases that do not contain the word specified by the --diagnose option, for getting a faster (but possibly less accurate) diagnostic.  The generated annotator is not likely to be useful when this option is present.  You may get quick diagnostics WITHOUT these disadvantages by loading a --rulesFile instead.")
cancelOpt("diagnose-quick")

parser.add_option("--priority-list",help="Instead of generating an annotator, use the input examples to generate a list of (non-annotated) words with priority numbers, a higher number meaning the word should have greater preferential treatment in ambiguities, and write it to this file (or compressed .gz, .bz2 or .xz file).  If the file provided already exists, it will be updated, thus you can amend an existing usage-frequency list or similar (although the final numbers are priorities and might no longer match usage-frequency exactly).  The purpose of this option is to help if you have an existing word-priority-based text segmenter and wish to update its data from the examples; this approach might not be as good as the Yarowsky-like one (especially when the same word has multiple readings to choose from), but when there are integration issues with existing code you might at least be able to improve its word-priority data.")

parser.add_option("-t","--time-estimate",
                  action="store_true",default=False,
                  help="Estimate time to completion.  The code to do this is unreliable and is prone to underestimate.  If you turn it on, its estimate is displayed at the end of the status line as days, hours or minutes.") # Unreliable because the estimate assumes 'phrases per minute' will remain constant on average, whereas actually it will decrease because the more complex phrases are processed last
cancelOpt("time-estimate")

parser.add_option("-0","--single-core",
                  action="store_true",default=False,
                  help="Use only one CPU core even when others are available. If this option is not set, multiple cores are used if a 'futures' package is installed or if run under MPI or SCOOP; this currently requires --checkpoint + shared filespace, and is currently used only for large collocation checks in limited circumstances. Single-core saves on CPU power consumption, but if the computer is set to switch itself off at the end of the run then TOTAL energy used is generally less if you allow it to run multicore and reach that switchoff sooner.") # limited circumstances: namely, words that occur in length-1 phrases. TODO: Linux cpusets can reduce the number of CPUs actually available, so we might start too many processes unless run with -0 (especially in a virtual environment).
# Consider a Mac Mini that idles at 15W and maxes-out at 85W when running 2-core 4-thread i5.  The 70W difference is probably 35W for the CPU at 50% power-supply efficiency, give or take some extras.  Running 1-core should very roughly halve that 70W (below half if non-use of SMT saves a bit of power, but above if there's constant overheads and/or TurboBoost adding up to 25% to the clock when running single-core), so maybe about 50W.  One corpus ran multicore for about 40mins of its total runtime, and changing it to single-core added about 30mins to that total runtime.  So if the machine is set to halt at the end of the run, the single-core option saves 35W x 40mins at the expense of 50W x 30mins.  That's a negative saving.  On the other hand if the computer is NOT to be powered off at the end of the run then single-core does save power.
cancelOpt("single-core")

parser.add_option("-p","--status-prefix",help="Label to add at the start of the status line, for use if you batch-run annogen in multiple configurations and want to know which one is currently running")

main = (__name__ == "__main__" and not os.environ.get("OMPI_COMM_WORLD_RANK","0").replace("0",""))
term = os.environ.get("TERM","")
is_xterm = "xterm" in term
ansi_escapes = is_xterm or term in ["screen","linux"]
def isatty(f): return hasattr(f,"isatty") and f.isatty()
if ansi_escapes and isatty(sys.stderr): clear_eol,reverse_on,reverse_off,bold_on,bold_off="\x1b[K\r","\x1b[7m","\x1b[0m","\x1b[1m","\x1b[0m"
else: clear_eol,reverse_on,reverse_off,bold_on,bold_off="  \r"," **","** ","",""
if main: sys.stderr.write(bold_on+__doc__+bold_off+"\n") # not sys.stdout: may or may not be showing --help (and anyway might want to process the help text for website etc)
# else (if not main), STILL parse options (if we're being imported for parallel processing)
options, args = parser.parse_args()
globals().update(options.__dict__)

if type("")==type(u""): sys.setcheckinterval=lambda x:x # don't bother doing this on Python 3 (TODO: setswitchinterval?)
sys.setcheckinterval(32767) # won't be using threads or signals, so don't have to check for them very often
import gc ; gc.disable() # should be OK if we don't create cycles (TODO: run gc.collect() manually after init, just in case?)

def warn(msg):
  if main: sys.stderr.write("Warning: "+msg+"\n")
  # else it should have already been written
if "PyPy" in sys.version: warn("with annogen, PyPy is likely to run 60% slower than python") # (not to mention concurrent.futures being less likely to be available)

if checkpoint_period: checkpoint_period=int(checkpoint_period)
if primitive and ybytes: warn("primitive will override ybytes")
if ybytes: ybytes=int(ybytes)
if ybytes_max: ybytes_max=int(ybytes_max)
else: ybytes_max = ybytes
if yarowsky_debug: yarowsky_debug=int(yarowsky_debug)
else: yarowsky_debug = 0
if normalise_debug: normalise_debug=int(normalise_debug)
else: normalise_debug = 0
ybytes_step = int(ybytes_step)
maxrefs = int(maxrefs)
ymax_threshold = int(ymax_threshold)
if not golang: golang = ""
if nested_switch: nested_if = True
def errExit(msg):
  assert main # bad news if this happens in non-main module
  try:
    if not outfile==getBuf(sys.stdout):
      outfile.close() ; rm_f(c_filename)
  except: pass # works only if got past outfile opening
  sys.stderr.write(msg+"\n") ; sys.exit(1)
if args: errExit("Unknown argument "+repr(args[0]))
if ref_pri and not (reference_sep and ref_name_end): errExit("ref-pri option requires reference-sep and ref-name-end to be set")
if browser_extension:
  javascript = True
  if sharp_multi and not annotation_names: errExit("--sharp-multi requires --annotation-names to be set when using --browser-extension")
  if zlib: errExit("--zlib not currently supported with --browser-extension") # would need to ensure it's decompressed after being read in from annotate-dat.txt
if android_template:
  android = "file:///android_asset/index.html"
if android and not java: errExit('You must set --java=/path/to/src//name/of/package when using --android')
if bookmarks and not android: errExit("--bookmarks requires --android, e.g. --android=file:///android_asset/index.html")
if android_print and not bookmarks: errExit("The current implementation of --android-print requires --bookmarks to be set as well")
if android_audio:
  if not android_print: errExit("The current implementation of --android-audio requires --android-print to be set as well") # for the highlighting (and TODO: I'm not sure about the HTML5-Audio support of Android 2.x devices etc, so should we check a minimum Android version before making the audio option available? as highlight option can be done pre-4.4 just no way to save the result)
  if "'" in android_audio or '"' in android_audio or '\\' in android_audio: errExit("The current implementation of --android-audio requires the URL not to contain any quotes or backslashes, please percent-encode them")
  if ' ' in android_audio:
    android_audio,android_audio_maxWords = android_audio.split()
    android_audio_maxWords = int(android_audio_maxWords)
  else: android_audio_maxWords=None
if (extra_js or extra_css or existing_ruby_js_fixes) and not android: errExit("--extra-js, --extra-css and --existing-ruby-js-fixes require --android") # browser-extension: existing_ruby_js_fixes would require aType to be known by content.js (can do via handleMessage) + oldTxt no longer sufficient for restoring page for reannotate.  TODO: even with delete_existing_ruby, oldTxt is not sufficient to restore page for annotation off (it currently needs reload after turn off if it had existing ruby, and we don't do that automatically, nor should we as they might have unsaved changes), due to nfOld/nfNew further up the DOM, and it's no good replacing it with a list of DOM objects to replaceChild on, because anything more than text does not persist in the DOM after content.js runs, nor does it persist in the content.js variable space.
if delete_existing_ruby and not (android or javascript): errExit("--delete-existing-ruby requires --android or --javascript") # (or --browser-extension, which implies --javascript)
if not extra_css: extra_css = ""
if not extra_js: extra_js = ""
if not existing_ruby_js_fixes: existing_ruby_js_fixes = ""
if extra_css.startswith("@"): extra_css = open(extra_css[1:],"rb").read()
if extra_js.startswith("@"):
  extra_js = extra_js[1:]
  if not os.system("which node 2>/dev/null >/dev/null"):
    # we can check the syntax
    import pipes
    if os.system("node -c "+pipes.quote(extra_js)): errExit("Syntax check failed for extra-js file "+extra_js)
  extra_js = open(extra_js,"rb").read()
if type("")==type(u""): # Python 3
  def B(s):
    try: return s.encode('latin1')
    except: return s
  def S(b):
    try: return b.decode('latin1')
    except: return b
  def getBuf(f):
    try: return f.buffer
    except: return f
else: # Python 2: pass through as quickly as possible
  def B(s): return s # (and as this particular script shouldn't need to run on a Python 2 below 2.7, we also use b"" inline for literals)
  def S(s): return s
  def getBuf(f): return f
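# (latin1 maps code points 0-255 one-to-one onto byte values, so B() and S()
# can round-trip arbitrary binary data between str and bytes; values that
# don't fit latin1 are passed through unchanged by the except clauses)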
if extra_js.rstrip() and not B(extra_js.rstrip()[-1:]) in b';}': errExit("--extra-js must end with a semicolon or a closing brace")
if existing_ruby_js_fixes.startswith("@"): existing_ruby_js_fixes = open(existing_ruby_js_fixes[1:],"rb").read()
jPackage = None
if nested_switch: nested_switch=int(nested_switch) # TODO: if java, override it?  or just rely on the help text for --nested-switch (TODO cross-reference it from --java?)
if java:
  if not '//' in java: errExit("--java must include a // to separate the first part of the path from the package name")
  jSrc,jRest=java.rsplit('//',1)
  if '.' in jRest: errExit("--java must be ...src//org/example/package not ...src//org.example.package") # (TODO: fix it automatically in both jRest and java? only on the right-hand side of the //)
  jPackage = jRest.replace('/','.')
  if 'NewFunc' in jPackage: errExit("Currently unable to include the string 'NewFunc' in your package due to an implementation detail in annogen's search/replace operations")
if not c_filename and isatty(sys.stdout): # assumed false when run under MPI
  c_filename = tempfile.gettempdir()+os.sep+"annotator.c"
def shell_escape(arg):
  if re.match("^[A-Za-z0-9_=/.%+,:@-]*$",arg): return arg
  return "'"+arg.replace("'",r"'\''")+"'"
if sharp_multi:
  if c_sharp or python or golang: errExit("sharp-multi not yet implemented in C#, Python or Go")
  elif windows_clipboard: errExit("sharp-multi not yet implemented for windows-clipboard") # would need a way to select the annotator, probably necessitating a GUI on Windows
if java or javascript or python or c_sharp or golang or dart:
    def cOnly(param,lang="C"): errExit(param+" not yet implemented in any language other than "+lang+", so cannot be used with --java, --javascript, --python, --c-sharp, --golang or --dart")
    if windows_clipboard: cOnly("--windows-clipboard")
    if library: cOnly("--library")
    if not outcode=="utf-8": cOnly("Non utf-8 outcode")
    if compress: cOnly("--compress")
    if sum(1 for x in [java,javascript,python,c_sharp,golang,dart] if x) > 1:
      errExit("Outputting more than one programming language on the same run is not yet implemented")
    if java:
      if android and not "/src//" in java: errExit("When using --android, the last thing before the // in --java must be 'src' e.g. --java=/workspace/MyProject/src//org/example/package")
      if main and not compile_only: # (delete previous files, only if we're not an MPI-etc subprocess)
       os.system("mkdir -p "+shell_escape(java))
       for f in os.listdir(java):
        if f.endswith(".java") and f.startswith("z"): os.remove(java+os.sep+f)
      c_filename = java+os.sep+"Annotator.java"
      if main and android:
        os.system("rm -rf "+shell_escape(jSrc+"/../bin")) # needed to get rid of old *.class files that might be no longer used
        for d in ["assets","bin","gen","res/layout","res/menu","res/values","res/xml"]: os.system("mkdir -p "+shell_escape(jSrc+"/../"+d))
    elif c_filename.endswith(".c"):
      if javascript: c_filename = c_filename[:-2]+".js"
      elif dart: c_filename = c_filename[:-2]+".dart"
      elif c_sharp: c_filename = c_filename[:-2]+".cs"
      elif golang: c_filename = c_filename[:-2]+".go"
      else: c_filename = c_filename[:-2]+".py"
elif windows_clipboard:
  if library: errExit("Support for having both --windows-clipboard and --library at the same time is not yet implemented") # ditto
  if c_compiler=="cc -o annotator": c_compiler="i386-mingw32-gcc -o annoclip.exe"
  if not outcode=="utf-8": errExit("outcode must be utf-8 when using --windows-clipboard")
elif library:
  if c_compiler=="cc -o annotator": c_compiler="gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 -lc"
if js_6bit:
  if not javascript: errExit("--js-6bit requires --javascript") # or just set js_6bit=False in these circumstances?
  import urllib
if dart:
  js_utf8 = not dart_datafile
  if dart_datafile and any(x in dart_datafile for x in "'\\$"): errExit("Current implementation cannot cope with ' or \\ or $ in dart_datafile")
elif dart_datafile: errExit("--dart-datafile requires --dart")
if zlib:
  if javascript: warn("--zlib with --javascript has been known to cause string corruption on some browsers and is deprecated")
  js_6bit = js_utf8 = False
  del zlib
  try:
    from zopfli import zlib # pip install zopfli
    zlib._orig_compress = zlib.compress
    zlib.compress = lambda s,level: zlib._orig_compress(s) # delete level
    zlib_name = "zopfli"
  except:
    import zlib
    zlib_name = "zlib"
  data_driven = True
  if windows_clipboard: warn("--zlib with --windows-clipboard is inadvisable because ZLib is not typically present on Windows platforms. If you really want it, you'll need to figure out the compiler options and library setup for it.")
  if dart and not dart_datafile: warn("--zlib without --dart-datafile might not be as efficient as you'd hope (and --zlib prevents the resulting Dart code from being compiled to a \"Web app\" anyway)") # as it requires dart:io
if data_driven:
  if c_sharp or golang: errExit("--data-driven and --zlib are not yet implemented in C# or Go")
  elif java and not android: errExit("In Java, --data-driven and --zlib currently require --android as we need to know where to store the data file") # TODO: option to specify path in 'pure' Java? (in which case also update the 'compress' errExit above so it doesn't check for android before suggesting zlib)
elif javascript or python or dart: data_driven = True
compact_opcodes = data_driven and not fast_assemble
if java or javascript or python or c_sharp or golang or dart: c_compiler = None
try: xrange # Python 2
except: xrange,unichr,unicode = range,chr,str # Python 3
if post_normalise:
  if not (java and data_driven): errExit('--post-normalise currently requires --java and --data-driven')
  if type("")==type(u""): # Python 3 (this requires 3.5+, TODO: support 3.3/3.4 ?)
    import importlib.util as iu
    s = iu.spec_from_file_location("post.normalise", post_normalise)
    post_normalise = iu.module_from_spec(s) ; s.loader.exec_module(post_normalise)
  else: # Python 2
    import imp
    post_normalise = imp.load_source('post.normalise', post_normalise)
  post_normalise = post_normalise.table
  for k,v in list(post_normalise.items()):
    if not (k<=0xFFFF and v<=0xFFFF and len(unichr(k).encode('utf-8'))==len(unichr(v).encode('utf-8'))): del post_normalise[k] # BMP only for now, and only mappings that don't change UTF-8 length so inBytes / origInBytes are sync'd
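  # e.g. (illustrative) a mapping 0x4E00->0x4E8C (each 3 bytes in UTF-8) is kept,
  # but 0xFF01->0x21 would be dropped because it changes the UTF-8 length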
  if type(u"")==type(""): post_normalise_translate = lambda x:x.translate(post_normalise) # Python 3 can use the dictionary as-is
  else: post_normalise_translate = lambda u: u''.join(unichr(post_normalise.get(ord(i),ord(i))) for i in u) # Python 2's unicode.translate is documented to accept only a 256-entry table (some versions accept more, but not all have been tested), so we write out the mapping ourselves
try:
  import locale
  terminal_charset = locale.getdefaultlocale()[1]
except: terminal_charset = None
if not terminal_charset: terminal_charset = "utf-8"
if android_urls:
  if not android: errExit("--android-urls requires --android (you need to set a default URL for direct launch)")
  try: import urlparse
  except:
    try: import urllib.parse as urlparse
    except: errExit("--android-urls requires urlparse module") # unless we re-implement
  if "?" in android_urls: errExit("You cannot include a ? in any of your --android-urls (Android does not count query-string as part of the path)")
else: android_urls = "" # so it can still be .split()
if existing_ruby_shortcut_yarowsky:
  if not (android and ybytes and glossfile): errExit("--existing-ruby-shortcut-yarowsky makes sense only when generating an Android app with both ybytes and glossfile set")
  if not data_driven: errExit("Current implementation of --existing-ruby-shortcut-yarowsky requires --data-driven") # (it doesn't have to, but doing so without would require it to be put into the non-datadriven n() test, and as we're probably turning on zlib for Android apps anyway, we might as well implement only for data-driven)
def T(s):
  if type(s)==type(u""): return s # Python 3
  return s.decode(terminal_charset)
if keep_whitespace: keep_whitespace = set(T(keep_whitespace).split(','))
if glossmiss_hide: glossmiss_hide = set(T(glossmiss_hide).split(','))
if status_prefix: status_prefix += ": "
else: status_prefix = ""
if diagnose: diagnose=T(diagnose)
diagnose_limit = int(diagnose_limit)
max_words = int(max_words)
if single_words: max_words = 1
read_input = not no_input
if not reference_sep: norefs=True
if not read_input:
  def f():
    if diagnose_manual: return "--diagnose-manual is set"
    if normalise_only: return "--normalise-only is set"
    if not norefs:
      if not no_summary: return "summary is required (and without norefs)"
      if glossmiss: return "--glossmiss is set (and without norefs)"
  msg=f()
  if msg:
    warn("Reading input despite --no-input because "+msg)
    read_input = True

def nearCall(negate,conds,subFuncs,subFuncL):
  # returns what to put in the if() for ybytes near() lists
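  # If there are more than max_or_length conditions, they are split into
  # chunks of max_or_length, each tested by a generated helper function
  # that returns early on the first matching chunk (see recursion below)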
  if not max_or_length or len(conds) <= max_or_length:
    if java: f=b"a.n"
    else: f=b"near"
    ret = b" || ".join(f+b"(\""+B(outLang_escape(c))+b"\")" for c in conds)
    if negate:
      if b" || " in ret: ret = b" ! ("+ret+b")"
      else: ret = b"!"+ret
    return ret
  if java: fStart,fEnd = B("package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {"),b"} }" # put functions in separate classes to try to save the constants table of the main class
  elif golang: fStart,fEnd = b"func NewFunc() bool {",b"}"
  else: fStart,fEnd = outLang_bool+b" NewFunc() {",b"}"
  if negate: rTrue,rFalse = outLang_false,outLang_true
  else: rTrue,rFalse = outLang_true,outLang_false
  return subFuncCall(fStart+b"\n".join(outLang_shortIf(nearCall(False,conds[i:j],subFuncs,subFuncL),b"return "+rTrue+b";") for i,j in zip(range(0,len(conds),max_or_length),list(range(max_or_length,len(conds),max_or_length))+[len(conds)]))+b"\nreturn "+rFalse+b";"+fEnd,subFuncs,subFuncL) # list() needed so the concatenation with [len(conds)] also works on Python 3

def outLang_shortIf(cond,statement):
  if golang: return b"if "+cond+b" {\n  "+statement+b"\n}"
  else: return b"if("+cond+b") "+statement

def subFuncCall(newFunc,subFuncs,subFuncL):
  if newFunc in subFuncs:
    # we generated an identical one before
    subFuncName=subFuncs[newFunc]
  else:
    if java: subFuncName=b"z%X" % len(subFuncs) # (try to save as many bytes as possible because it won't be compiled out and we also have to watch the compiler's footprint; start with z so MainActivity.java etc appear before rather than among this lot in IDE listings)
    else: subFuncName=b"match%d" % len(subFuncs)
    subFuncs[newFunc]=subFuncName
    if java or c_sharp or golang: static=b""
    else: static=b"static "
    subFuncL.append(static+newFunc.replace(b"NewFunc",subFuncName,1))
  if java: return B(jPackage)+b"."+subFuncName+b".f(a)"
  return subFuncName+b"()" # the call (without a semicolon)

def stringSwitch(byteSeq_to_action_dict,subFuncL,funcName=b"topLevelMatch",subFuncs={},java_localvar_counter=None,nestingsLeft=None): # ("topLevelMatch" is also mentioned in the C code)
    # make a function to switch on a large number of variable-length string cases without repeated lookahead for each case
    # (may still backtrack if no words or no suffixes match)
    # byteSeq_to_action_dict is really a byte sequence to [(action, OR-list of Yarowsky-like indicators which are still in Unicode)], the latter will be c_escape()d
    # can also be byte seq to [(action,(OR-list,nbytes))] but only if OR-list is not empty, so value[1] will always be false if OR-list is empty
    # so byteSeq_to_action_dict[k][0][0] is 1st action,
    #    byteSeq_to_action_dict[k][0][1] is conditions,
    #    byteSeq_to_action_dict[k][1][0] is 2nd action, &c
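    # e.g. (illustrative values only) {b"ab":[(b"OutWriteStr(\"X\");",[])],
    # b"ac":[(b"OutWriteStr(\"Y\");",[])]} shares the first byte b"a", so the
    # generated code reads one byte, branches on 'a', then switches on 'b'/'c'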
    if nestingsLeft==None: nestingsLeft=nested_switch
    canNestNow = not nestingsLeft==0 # (-1 = unlimited)
    if java: adot = b"a."
    else: adot = b""
    if java or c_sharp or golang: NEXTBYTE = adot+b'nB()'
    else: NEXTBYTE = b'NEXTBYTE'
    allBytes = set(b[:1] for b in iterkeys(byteSeq_to_action_dict) if b)
    ret = []
    if not java_localvar_counter: # Java and C# don't allow shadowing of local variable names, so we'll need to uniquify them
      java_localvar_counter=[0]
    olvc = b"%X" % java_localvar_counter[0] # old localvar counter
    if funcName:
        if java: ret.append(b"package "+B(jPackage)+b";\npublic class "+funcName+b" { public static void f("+B(jPackage)+b".Annotator a) {")
        else:
          if funcName==b"topLevelMatch" and not c_sharp: stat=b"static " # because we won't call subFuncCall on our result
          else: stat=b""
          if golang: ret.append(b"func %s() {" % funcName)
          else: ret.append(stat+b"void %s() {" % funcName)
        savePos = len(ret)
        if java or c_sharp: ret.append(b"{ int oldPos="+adot+b"inPtr;")
        elif golang: ret.append(b"{ oldPos := inPtr;")
        else: ret.append(b"{ POSTYPE oldPos=THEPOS;")
    elif b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1:
        # no funcName, but might still want to come back here as there's a possible action at this level
        savePos = len(ret)
        if java or c_sharp:
          ret.append(b"{ int oP"+olvc+b"="+adot+b"inPtr;")
          java_localvar_counter[0] += 1
        elif golang: ret.append(b"{ oldPos := inPtr;")
        else: ret.append(b"{ POSTYPE oldPos=THEPOS;")
    else: savePos = None
    def restorePos():
      if not savePos==None:
        if len(b' '.join(ret).split(NEXTBYTE))==2 and not called_subswitch:
            # only 1 NEXTBYTE after the savePos - just
            # do a PREVBYTE instead
            # (note however that splitting on NEXTBYTE
            # does not necessarily give a reliable value
            # for max amount of lookahead required if
            # there's more than 1.  We use max rule len
            # as an upper bound for that instead.)
            del ret[savePos]
            if java: ret.append(b"a.inPtr--;")
            elif c_sharp or golang: ret.append(b"inPtr--;")
            else: ret.append(b"PREVBYTE;")
        elif java or c_sharp:
          if funcName: ret.append(adot+b"inPtr=oldPos; }")
          else: ret.append(adot+b"inPtr=oP"+olvc+b"; }")
        elif golang: ret.append(b"inPtr=oldPos; }")
        else: ret.append(b"SETPOS(oldPos); }") # restore
    called_subswitch = False
    if b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[b""])==1 and not byteSeq_to_action_dict[b""][0][1] and all((len(a)==1 and a[0][0].startswith(byteSeq_to_action_dict[b""][0][0]) and not a[0][1]) for a in itervalues(byteSeq_to_action_dict)):
        # there's an action in common for this and all subsequent matches, and no Yarowsky-like indicators, so we can do the common action up-front
        ret.append(byteSeq_to_action_dict[b""][0][0])
        l = len(byteSeq_to_action_dict[b""][0][0])
        byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in iteritems(byteSeq_to_action_dict))
        # and, since we'll be returning no matter what,
        # we can put the inner switch in a new function
        # (even if not re-used, this helps compiler speed)
        # + DON'T save/restore pos around it (it itself
        # will do any necessary save/restore pos)
        del byteSeq_to_action_dict[b""]
        if java and (canNestNow or len(byteSeq_to_action_dict)==1): # hang on - better nest (might be using --nested-switch to get around a Java compiler-memory problem; the len condition allows us to always nest a single 'if' rather than creating a new function+class for it)
          ret += [b"  "+x for x in stringSwitch(byteSeq_to_action_dict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)]
          restorePos()
          ret.append(b"return;")
        else: # ok, new function
          newFunc = b"\n".join(stringSwitch(byteSeq_to_action_dict,subFuncL,b"NewFunc",subFuncs))
          ret.append(subFuncCall(newFunc,subFuncs,subFuncL)+b"; return;")
          del ret[savePos] # will be set to None below
        byteSeq_to_action_dict[b""] = [(b"",[])] # for the end of this func
        savePos = None # as setting funcName on stringSwitch implies it'll give us a savePos, and if we didn't set funcName then we called restorePos already above
    elif allBytes:
      # deal with all actions except "" first
      use_if = (len(allBytes)==1)
      if not use_if:
        if nestingsLeft > 0: nestingsLeft -= 1
        ret.append(b"switch("+NEXTBYTE+b") {")
      for case in sorted(allBytes):
        if not c_sharp and 32<=ord(case)<127 and case!=b"'": cstr=b"'%c'" % case
        else:
          cstr=B(str(ord(case)))
          if java: cstr = b"(byte)"+cstr
        if use_if: ret.append(b"if("+NEXTBYTE+b"=="+cstr+b") {")
        else: ret.append(b"case %s:" % cstr)
        subDict = dict([(k[1:],v) for k,v in iteritems(byteSeq_to_action_dict) if k and k[:1]==case])
        inner = stringSwitch(subDict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)
        if canNestNow or not (inner[0].startswith(b"switch") or (inner[0].startswith(b"if(") and not nested_if)): ret += [b"  "+x for x in inner]
        else:
          # Put the inner switch into a different function
          # which returns 1 if we should return.
          # (TODO: this won't catch cases where there's a savePos before the inner switch; will still nest in that case.  But it shouldn't lead to big nesting in practice.)
          if nested_switch: inner = stringSwitch(subDict,subFuncL,None,subFuncs,None,None) # re-do it with full nesting counter
          if java: myFunc,funcEnd = [B("package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {")], b"}}"
          elif golang: myFunc,funcEnd=[b"func NewFunc() bool {"],b"}"
          else: myFunc,funcEnd=[outLang_bool+b" NewFunc() {"],b"}"
          for x in inner:
            if x.endswith(b"return;"): x=x[:-len(b"return;")]+b"return "+outLang_true+b";"
            myFunc.append(b"  "+x)
          ret += (b"  "+outLang_shortIf(subFuncCall(b"\n".join(myFunc)+b"\n  return "+outLang_false+b";\n"+funcEnd,subFuncs,subFuncL),b"return;")).split(b'\n') # if golang, MUST have the \n before the 1st return there (optional for other languages); also must split outLang_shortIf o/p into \n for the above 'for x in inner' rewrite to work
          called_subswitch=True # as it'll include more NEXTBYTE calls which are invisible to the code below
        if not (use_if or inner[-1].endswith(b"return;")): ret.append(b"  break;")
      ret.append(b"}") # end of switch or if
    restorePos()
    if funcName:
      if java: ret.append(b"} }")
      else: ret.append(b"}")
    elif b"" in byteSeq_to_action_dict:
        # if the C code gets to this point, no "return;" was executed - no suffixes matched
        # so execute one of the "" actions and return
        # (which one, if any, depends on the Yarowsky-like indicators; there should be at most one "default" action without indicators)
        default_action = b""
        for action,conds in byteSeq_to_action_dict[b""]:
            if conds:
                assert action, "conds without action in "+repr(byteSeq_to_action_dict[""])
                if type(conds)==tuple:
                    negate,conds,nbytes = conds
                    if java: ret.append(b"a.sn(%d);" % nbytes)
                    elif c_sharp or golang: ret.append(b"nearbytes=%d;" % nbytes)
                    else: ret.append(b"setnear(%d);" % nbytes)
                else: negate = False
                ret.append(b"if ("+nearCall(negate,conds,subFuncs,subFuncL)+b") {")
                ret.append((action+b" return;").strip())
                ret.append(b"}")
            else: # no conds
                if default_action:
                  sys.stderr.write("WARNING! More than one default action in "+repr(byteSeq_to_action_dict[""])+" - earlier one discarded!\n")
                  if rulesFile: sys.stderr.write("(This might indicate invalid markup in the corpus, but it might just be due to a small change or capitalisation update during an incremental run, which can be ignored.)\n") # TODO: don't write this warning at all if accum.amend_rules was set at the end of analyse() ?
                  else: sys.stderr.write("(This might indicate invalid markup in the corpus)\n")
                default_action = action
        if default_action or not byteSeq_to_action_dict[b""]: ret.append((default_action+b" return;").strip()) # (return only if there was a default action, OR if an empty "" was in the dict with NO conditional actions (e.g. from the common-case optimisation above).  Otherwise, if there were conditional actions but no default, we didn't "match" anything if none of the conditions were satisfied.)
    return ret # caller does '\n'.join

if compress:
  squashStrings = set() ; squashReplacements = []
  def squashFinish():
    assert main, "squashFinish sets globals"
    global squashStrings # so can set it to "done" at end
    tokens = set()
    for s in squashStrings: tokens.update(list(S(s)))
    totSaved = 0
    tokens = [chr(t) for t in range(1,256) if not chr(t) in tokens] ; orig_tokens = set(tokens)
    pairs = [chr(0)] * 512
    while tokens and squashStrings:
      t = tokens.pop()
      counts = {}
      for s in squashStrings:
        # To make decompression as fast and compact as possible, each 1-byte token represents exactly 2 bytes.  In practice, allowing tokens to represent variable lengths of up to 4 whole bytes is unlikely to improve compression by more than 3.2% (that's 3.2% of the 10-20% it achieves, i.e. around 0.5% overall), and lengths up to 9 do little better, so we might as well stick with this simpler scheme unless we implement real LZMA or similar.
          for i in range(0,len(s)-1):
            k = s[i:i+2]
            if S(k[:1]) in orig_tokens or S(k[1:]) in orig_tokens: continue # keep the decoder simple: never create pairs that contain tokens, so it need not recurse (recursion within the 2-byte expansion is very unlikely to save anything in practice anyway - it didn't on my annotators - so it's not worth complicating the decoder)
            counts[k] = counts.get(k,0) + 1
      bSaved, k = max((v,k) for k,v in counts.items())
      pairs[ord(t)] = k[:1]
      pairs[ord(t)+256] = k[1:]
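      # e.g. (illustrative) if t==chr(1) and the commonest pair k==b"th", then
      # pairs[0x01]=b"t" and pairs[0x101]=b"h", so b"\x01" now stands for b"th"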
      squashReplacements.append((k,B(t))) # this assumes we won't be doing things like 'if ALL instances of a byte end up in our tokens, add the byte's original value as an extra token'
      for s in list(squashStrings): # iterate over a snapshot: the set is mutated below
        s2 = s.replace(k,B(t))
        if not s2==s:
          squashStrings.remove(s) ; squashStrings.add(s2)
      totSaved += bSaved
      sys.stderr.write("Compress: %d/%d tokens, %d bytes saved%s" % (len(orig_tokens)-len(tokens),len(orig_tokens),totSaved,clear_eol)) ; sys.stderr.flush()
    squashStrings = "done"
    while len(pairs) > 256 and pairs[-1]==chr(0): pairs = pairs[:-1]
    sys.stderr.write("\n")
    if totSaved < len(pairs)+50: sys.stderr.write("Warning: --compress on this data made it bigger!  Consider dropping --compress\n") # 50 as rough guess for OutWriteDecompress binary (probably about 12 instructions at 4+ bytes each)
    return c_escapeRawBytes(b"".join(B(p) for p in pairs))
  decompress_func=br"""

static unsigned char pairs[]="%%PAIRS%%";
static void OutWriteDecompress(const char *s) {
while(*s) {
  int i=(unsigned char)*s;
  if (pairs[i]) { OutWriteByte(pairs[i]); OutWriteByte(pairs[i|0x100]); } else OutWriteByte(*s);
  s++;
}
}"""
  if sharp_multi: decompress_func += br"""
static int ns; static void OutWriteNSB(int b) {
  if(b=='#') ns++; else if(ns==numSharps) OutWriteByte(b);
}
static void OutWriteDecompressP(const char *s) {
ns=0; while(*s && ns<=numSharps) {
  int i=(unsigned char)*s;
  if (pairs[i]) { OutWriteNSB(pairs[i]); OutWriteNSB(pairs[i|0x100]); } else OutWriteNSB(*s);
  s++;
}
}"""
  def squash(byteStr):
    if squashStrings == "done":
      for k,v in squashReplacements:
        byteStr = byteStr.replace(k,v)
    else: squashStrings.add(byteStr) # for the dry run
    return byteStr
elif sharp_multi: decompress_func = br"""
static void OutWriteStrP(const char *annot) {
    int ns = numSharps;
    while(ns--) {
        annot = strchr(annot,'#');
        if (!annot) return; else annot++;
    }
    char* m = strchr(annot,'#');
    if(m) OutWriteStrN(annot,m-annot); else OutWriteStr(annot);
}
"""
else: decompress_func = b""

if c_filename and os.sep in c_filename: cfn = c_filename[c_filename.rindex(os.sep)+1:]
else: cfn = c_filename
if library:
  c_preamble = br"""
  /*
     This library is NOT thread safe.  But you can use it
     with single-threaded or multiprocess code like Web Adjuster
     (not in WSGI mode).
    
     To wrap this library in Python (2 or 3), you can do:

from ctypes import CDLL,c_char_p,c_int
alib = CDLL("./libannotator.so.1")
_annotate,_afree = alib.annotate,alib.afree
_annotate.restype = c_char_p
_annotate.argtypes = [c_char_p"""
  if sharp_multi: c_preamble += b",c_int"
  c_preamble += b",c_int]"
  if outcode=="utf-8":
    c_preamble += br"""
_annotateRL = alib.annotateRawLatinize
_annotateRL.restype = c_char_p
_annotateRL.argtypes = [c_char_p"""
    if sharp_multi: c_preamble += b",c_int"
    c_preamble += b"]\ndef annotR(txt"
    if sharp_multi: c_preamble += b",aType=0"
    c_preamble += br"""):
    if type(txt)==type(u''): txt = txt.encode('utf-8')
    r = _annotateRL(txt"""
    if sharp_multi: c_preamble += b",aType"
    c_preamble += br""")
    _afree() ; return r"""
  c_preamble += b"\ndef annotate(txt"
  if sharp_multi: c_preamble += b",aType=0"
  c_preamble += br""",aMode=1):
    "aMode: 0 = raw, 1 = ruby (default), 2 = braces"
    if type(txt)==type(u''): txt = txt.encode('"""+outcode+r"""')
    r = _annotate(txt"""
  if sharp_multi: c_preamble += b",aType"
  c_preamble += br""",aMode)
    _afree() ; return r
# then for Web Adjuster you can do, for example,
# adjuster.annotFunc1 = lambda t:annotate(t"""
  if sharp_multi: c_preamble += b",1"
  c_preamble += b",1)\n"
  if outcode=="utf-8":
    if sharp_multi: c_preamble += b"# adjuster.annotFunc1R = lambda t:annotR(t,1)"
    else: c_preamble += b"# adjuster.annotFunc1R = annotR"
    c_preamble += br"""
# adjuster.options.htmlFilter = "*annotFunc1#*annotFunc1R"
# adjuster.options.htmlFilterName = "ruby#annot-only"
"""
  else: c_preamble += br"""
# adjuster.options.htmlFilter = "*annotFunc1"
"""
  if not outcode=="utf-8": c_preamble += br"""
# but BEWARE Web Adjuster assumes UTF-8; you'd better write a wrapper to re-code it
""" # (TODO: automate this?)
  c_preamble += br"""
    Compile with:
    gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 annotator.c -lc

  */
  """
  if cfn: c_preamble=c_preamble.replace(b"annotator.c",B(cfn))
  c_preamble += br"""
#include <stdlib.h>
#include <string.h>
"""
  c_defs = br"""static const unsigned char *readPtr, *writePtr, *startPtr;
static char *outBytes;
static size_t outWriteLen,outWritePtr;
#define NEXTBYTE (*readPtr++)
#define NEXT_COPY_BYTE (*writePtr++)
#define COPY_BYTE_SKIP writePtr++
#define COPY_BYTE_SKIPN(n) writePtr += (n)
#define POSTYPE const unsigned char*
#define THEPOS readPtr
#define SETPOS(p) (readPtr=(p))
#define PREVBYTE readPtr--
#define FINISHED (!(*readPtr))
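/* The macros above adapt the generated matcher (written in terms of NEXTBYTE,
   THEPOS, SETPOS etc.) to plain pointer arithmetic over the in-memory input */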

static void OutWriteStrN(const char *s,size_t l) {
  size_t newLen = outWriteLen;
  while (outWritePtr+l > newLen) newLen *= 2;
  if (newLen > outWriteLen) {
    char *ob2 = realloc(outBytes,newLen);
    if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
    outBytes = ob2; outWriteLen = newLen;
  }
  memcpy(outBytes+outWritePtr, s, l);
  outWritePtr += l;
}
static void OutWriteStr(const char *s) {
  OutWriteStrN(s,strlen(s));
}
static void OutWriteByte(char c) {
  if (outWritePtr >= outWriteLen) {
    size_t newLen = outWriteLen * 2;
    char *ob2 = realloc(outBytes,newLen);
    if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
    outBytes = ob2; outWriteLen = newLen;
  }
  outBytes[outWritePtr++] = c;
}
int near(char* string) {
    const unsigned char *startFrom = readPtr-nearbytes,
                     *end = readPtr+nearbytes;
    if (startFrom < startPtr) startFrom = startPtr;
    size_t l=strlen(string); end -= l;
    while (*startFrom && startFrom <= end) {
      if(!strncmp((const char*)startFrom,string,l)) return 1;
      startFrom++;
    }
    return 0;
}
void matchAll();"""
  c_defs += br"""
void afree() { if(outBytes) free(outBytes); outBytes=NULL; }
char *annotate(const char *input"""
  if sharp_multi: c_defs += b", int annotNo"
  c_defs += br""",int aMode) {
  readPtr=writePtr=startPtr=(const unsigned char*)input;
  outWriteLen = strlen((const char*)startPtr)*5+1; /* initial guess (must include the +1 to ensure it's non-0 for OutWrite...'s *= code) */
  afree(); outBytes = malloc(outWriteLen);"""
  if sharp_multi: c_defs += b" numSharps=annotNo;"
  c_defs += br""" annotation_mode = aMode;
  if(outBytes) { outWritePtr = 0; matchAll(); }
  if(outBytes) OutWriteByte(0);
  return outBytes;
}
"""