#!/usr/bin/env python
program_name = "Annotator Generator v0.626 (c) 2012-17 Silas S. Brown"

Silas S. Brown
committed
# See http://people.ds.cam.ac.uk/ssb22/adjuster/annogen.html

Silas S. Brown
committed
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# If you want to compare this code to old versions, the old
# versions are being kept in the E-GuideDog SVN repository on
# http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster

Silas S. Brown
committed
from optparse import OptionParser
parser = OptionParser()

Silas S. Brown
committed
import sys,os,os.path,tempfile,time,re

Silas S. Brown
committed
# Filename suffix for the compiled annotator: ".exe" on Windows-like
# platforms (Windows, Cygwin, etc), empty elsewhere.  "darwin" contains the
# substring "win", so Mac platforms have to be ruled out explicitly.
if ("win" in sys.platform or "mingw32" in sys.platform) and not ("mac" in sys.platform or "darwin" in sys.platform):
    exe = ".exe"
else:
    exe = ""
# =========== INPUT OPTIONS ==============
parser.add_option("--infile",
help="Filename of a text file (or a compressed .gz or .bz2 file) to read the input examples from. If this is not specified, standard input is used.")
parser.add_option("--incode",default="utf-8",
help="Character encoding of the input file (default %default)")
parser.add_option("--mstart",
dest="markupStart",
default="<ruby><rb>",
help="The string that starts a piece of text with annotation markup in the input examples; default %default")
parser.add_option("--mmid",
dest="markupMid",
default="</rb><rt>",

Silas S. Brown
committed
help="The string that occurs in the middle of a piece of markup in the input examples, with the word on its left and the added markup on its right (or the other way around if mreverse is set); default %default")

Silas S. Brown
committed
parser.add_option("--mend",
dest="markupEnd",
default="</rt></ruby>",
help="The string that ends a piece of annotation markup in the input examples; default %default")

Silas S. Brown
committed
parser.add_option("--mreverse",
action="store_true",default=False,
help="Specifies that the annotation markup is reversed, so the text BEFORE mmid is the annotation and the text AFTER it is the base text")

Silas S. Brown
committed
parser.add_option("--reference-sep",
help="Reference separator code used in the example input. If you want to keep example source references for each rule, you can label the input with 'references' (chapter and section numbers or whatever), and use this option to specify what keyword or other markup the input will use between each 'reference'. The name of the next reference will be whatever text immediately follows this string. Note that the reference separator, and the reference name that follows it, should not be part of the text itself and should therefore not be part of any annotation markup. If this option is not set then references will not be tracked.")
parser.add_option("--ref-name-end",default=" ",
help="Sets what the input uses to END a reference name. The default is a single space, so that the first space after the reference-sep string will end the reference name.")

Silas S. Brown
committed
parser.add_option("--ref-pri",
help="Name of a reference to be considered \"high priority\" for Yarowsky-like seed collocations (if these are in use). Normally the Yarowsky-like logic tries to identify a \"default\" annotation based on what is most common in the examples, with the exceptions indicated by collocations. If however a word is found in a high priority reference then the first annotation found in that reference will be considered the ideal \"default\" even if it's in a minority in the examples; everything else will be considered as an exception. In languages without spaces, this override should normally be used only for one-character words; if used with longer words it might have unexpected effects on rule-overlap ambiguities.")

Silas S. Brown
committed
parser.add_option("-s", "--spaces",
action="store_false",
dest="removeSpace",
default=True,
help="Set this if you are working with a language that uses whitespace in its non-markedup version (not fully tested). The default is to assume that there will not be any whitespace in the language, which is correct for Chinese and Japanese.")
parser.add_option("-c", "--capitalisation",
action="store_true",
default=False,
help="Don't try to normalise capitalisation in the input. Normally, to simplify the rules, the analyser will try to remove start-of-sentence capitals in annotations, so that the only remaining words with capital letters are the ones that are ALWAYS capitalised such as names. (That's not perfect: some words might always be capitalised just because they never occur mid-sentence in the examples.) If this option is used, the analyser will instead try to \"learn\" how to predict the capitalisation of ALL words (including start of sentence words) from their contexts.") # TODO: make the C program put the sentence capitals back

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("-w", "--annot-whitespace",
action="store_true",
default=False,

Silas S. Brown
committed
help="Don't try to normalise the use of whitespace and hyphenation in the example annotations. Normally the analyser will try to do this, to reduce the risk of missing possible rules due to minor typographical variations.") # TODO: can this be extended to the point where the words 'try to' can be deleted ? see comments
parser.add_option("--keep-whitespace",
help="Comma-separated list of words (without annotation markup) for which whitespace and hyphenation should always be kept even without the --annot-whitespace option. Use when you know the variation is legitimate. This option expects words to be encoded using the system locale (UTF-8 if it cannot be detected).")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("--glossfile",

Silas S. Brown
committed
help="Filename of an optional text file (or compressed .gz or .bz2 file) to read auxiliary \"gloss\" information. Each line of this should be of the form: word (tab) annotation (tab) gloss. When the compiled annotator generates ruby markup, it will add the gloss string as a popup title whenever that word is used with that annotation. The annotation field may be left blank to indicate that the gloss will appear for any annotation of that word. The entries in glossfile do NOT affect the annotation process itself, so it's not necessary to completely debug glossfile's word segmentation etc.")
help="Name of an optional file to which to write information about words recognised by the annotator that are missing in glossfile (along with frequency counts and references, if available)") # (default sorted alphabetically, but you can pipe through sort -rn to get most freq 1st)

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("--manualrules",
help="Filename of an optional text file (or compressed .gz or .bz2 file) to read extra, manually-written rules. Each line of this should be a marked-up phrase (in the input format) which is to be unconditionally added as a rule. Use this sparingly, because these rules are not taken into account when generating the others and they will be applied regardless of context (although a manual rule might fail to activate if the annotator is part-way through processing a different rule); try checking messages from --diagnose-manual.") # (or if there's a longer automatic match)

Silas S. Brown
committed

Silas S. Brown
committed
# =========== OUTPUT OPTIONS ==============
parser.add_option("--rulesFile",help="Filename of an optional auxiliary binary file to hold the accumulated rules. Adding .gz or .bz2 for compression is acceptable. If this is set then the rules will be written to it (in binary format) as well as to the output. Additionally, if the file already exists then rules will be read from it and incrementally updated. This might be useful if you have made some small additions to the examples and would like these to be incorporated without a complete re-run. It might not work as well as a re-run but it should be faster. If using a rulesFile then you must keep the same input (you may make small additions etc, but it won't work properly if you delete many examples or change the format between runs) and you must keep the same ybytes-related options if any.") # You may however change whether or not a --single-words / --max-words option applies to the new examples (but hopefully shouldn't have to)

Silas S. Brown
committed
parser.add_option("--no-input",
action="store_true",default=False,

Silas S. Brown
committed
help="Don't actually read the input, just use the rules that were previously stored in rulesFile. This can be used to increase speed if the only changes made are to the output options. You should still specify the input formatting options (which should not change), and any glossfile or manualrules options (which may change).")

Silas S. Brown
committed
parser.add_option("--c-filename",default="",help="Where to write the C program. Defaults to standard output, or annotator.c in the system temporary directory if standard output seems to be the terminal (the program might be large, especially if Yarowsky indicators are not used, so it's best not to use a server home directory where you might have limited quota). If MPI is in use then the default will always be standard output.") # because the main program might not be running on the launch node

Silas S. Brown
committed
parser.add_option("--c-compiler",default="cc -o annotator"+exe,help="The C compiler to run if standard output is not connected to a pipe. The default is to use the \"cc\" command which usually redirects to your \"normal\" compiler. You can add options (remembering to enclose this whole parameter in quotes if it contains spaces), but if the C program is large then adding optimisation options may make the compile take a LONG time. If standard output is connected to a pipe, then this option is ignored because the C code will simply be written to the pipe. You can also set this option to an empty string to skip compilation. Default: %default")

Silas S. Brown
committed
# If compiling an experimental annotator quickly, you might try tcc as it compiles fast. If tcc is not available on your system then clang might compile faster than gcc.
# (BUT tcc can have problems on Raspberry Pi see http://www.raspberrypi.org/phpBB3/viewtopic.php?t=30036&p=263213; can be best to cross-compile, e.g. from a Mac use https://github.com/UnhandledException/ARMx/wiki/Sourcery-G---Lite-for-ARM-GNU-Linux-(2009q3-67)-for-Mac-OS-X and arm-none-linux-gnueabi-gcc)
# In large rulesets with --max-or-length=0 and --nested-switch, gcc takes time and gcc -Os can take a LOT longer, and CINT, Ch and picoc run out of memory. Without these options the overhead of gcc's -Os isn't so bad (and does save some room).
# clang with --max-or-length=100 and --nested-switch=0 is not slowed much by -Os (slowed considerably by -O3). -Os and -Oz gave same size in my tests.

Silas S. Brown
committed
# on 64-bit distros -m32 won't always work and won't necessarily give a smaller program
parser.add_option("--max-or-length",default=100,help="The maximum number of items allowed in an OR-expression in non table-driven code (used when ybytes is in effect). When an OR-expression becomes larger than this limit, it will be made into a function. 0 means unlimited, which works for tcc and gcc; many other compilers have limits. Default: %default")

Silas S. Brown
committed
help="Allow C/C#/Java/Go switch() constructs to be nested to about this depth. Default 0 tries to avoid nesting, as it slows down most C compilers for small savings in executable size. Setting 1 nests 1 level deeper which can occasionally help get around memory problems with Java compilers. -1 means nest to unlimited depth, which is not recommended.") # tcc is still fast (although that doesn't generate the smallest executables anyway)

Silas S. Brown
committed
parser.add_option("--outcode",default="utf-8",
help="Character encoding to use in the generated parser and rules summary (default %default, must be ASCII-compatible i.e. not utf-16)")
parser.add_option("-S", "--summary-only",
action="store_true",default=False,
help="Don't generate a parser, just write the rules summary to standard output")

Silas S. Brown
committed
parser.add_option("--no-summary",
action="store_true",default=False,
help="Don't add a large rules-summary comment at the end of the parser code")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("-O", "--summary-omit",
help="Filename of a text file (or a compressed .gz or .bz2 file) specifying what should be omitted from the rules summary. Each line should be a word or phrase, a tab, and its annotation (without the mstart/mmid/mend markup). If any rule in the summary exactly matches any of the lines in this text file, then that rule will be omitted from the summary (but still included in the parser). Use for example to take out of the summary any entries that correspond to things you already have in your dictionary, so you can see what's new.")
parser.add_option("--maxrefs",default=3,
help="The maximum number of example references to record in each summary line, if references are being recorded (0 means unlimited). Default is %default.")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("--norefs",
action="store_true",default=False,
help="Don't write references in the rules summary (or the glossmiss file). Use this if you need to specify reference-sep and ref-name-end for the ref-pri option but you don't actually want references in the summary (which speeds up summary generation slightly). This option is automatically turned on if --no-input is specified.") # the speed difference is not so great as of v0.593, but needed anyway if --no-input is set

Silas S. Brown
committed
parser.add_option("--newlines-reset",
action="store_false",
dest="ignoreNewlines",
default=True,
help="Have the annotator reset its state on every newline byte. By default newlines do not affect state such as whether a space is required before the next word, so that if the annotator is used with Web Adjuster's htmlText option (which defaults to using newline separators) the spacing should be handled sensibly when there is HTML markup in mid-sentence.")

Silas S. Brown
committed

Silas S. Brown
committed
action="store_true",default=False,
help="Compress annotation strings in the C code. This compression is designed for fast on-the-fly decoding, so it saves only a limited amount of space (typically 10-20%) but that might help if memory is short; see also --data-driven.")

Silas S. Brown
committed
help="Include Objective-C code for an iOS app that opens a web-browser component and annotates the text on every page it loads. The initial page is specified by this option: it can be a URL, or a markup fragment starting with < to hard-code the contents of the page. Also provided is a custom URL scheme to annotate the local clipboard. You will need Xcode to compile the app (see the start of the generated C file for instructions); if it runs out of space, try using --data-driven")
parser.add_option("--data-driven",
action="store_true",default=False,
help="Generate a program that works by interpreting embedded data tables for comparisons, instead of writing these as code. This can take some load off the compiler (so try it if you get errors like clang's \"section too large\"), as well as compiling faster and reducing the resulting binary's RAM size (by 35-40% is typical), at the expense of a small reduction in execution speed. Javascript and Python output is always data-driven anyway.") # If the resulting binary is compressed (e.g. in an APK), its compressed size will likely not change much (same information content), so I'm specifically saying "RAM size" i.e. when decompressed
parser.add_option("--zlib",
action="store_true",default=False,
help="Enable --data-driven and compress the embedded data table using zlib, and include code to call zlib to decompress it on load. Useful if the runtime machine has the zlib library and you need to save disk space but not RAM (the decompressed table is stored separately in RAM, unlike --compress which, although giving less compression, at least works 'in place'). Once --zlib is in use, specifying --compress too will typically give an additional disk space saving of less than 1% (and a runtime RAM saving that's greater but more than offset by zlib's extraction RAM).") # and additional_compact_opcodes typically still helps no matter what the other options are

Silas S. Brown
committed
action="store_true",default=False,
help="Include C code to read the clipboard on Windows or Windows Mobile and to write an annotated HTML file and launch a browser, instead of using the default cross-platform command-line C wrapper. See the start of the generated C file for instructions on how to compile for Windows or Windows Mobile.")

Silas S. Brown
committed

Silas S. Brown
committed
action="store_true",default=False,
help="Instead of generating C code, generate C# (not quite as efficient as the C code but close; might be useful for adding an annotator to a C# project; see comments at the start for usage)")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("--java",
help="Instead of generating C code, generate Java, and place the *.java files in the directory specified by this option, removing any existing *.java files. See --android for example use. The last part of the directory should be made up of the package name; a double slash (//) should separate the rest of the path from the package name, e.g. --java=/path/to/wherever//org/example/package and the main class will be called Annotator.")

Silas S. Brown
committed
parser.add_option("--android",
help="URL for an Android app to browse. If this is set, code is generated for an Android app which starts a browser with that URL as the start page, and annotates the text on every page it loads. A function to annotate the local clipboard is also provided. You will need the Android SDK to compile the app; see comments in MainActivity.java for details.")
help="Android NDK: make a C annotator and use ndk-build to compile it into an Android JNI library. This is a more complex setup than a Java-based annotator, but it improves speed and size. The --ndk option should be set to the name of the package that will use the library, and --android should be set to the initial URL. See comments in the output file for details.")

Silas S. Brown
committed
parser.add_option("--javascript",
action="store_true",default=False,
help="Instead of generating C code, generate JavaScript. This might be useful if you want to run an annotator on a device that has a JS interpreter but doesn't let you run native code. The JS will be table-driven to make it load faster (and --no-summary will also be set). See comments at the start for usage.") # but it's better to use the C version if you're in an environment where 'standard input' makes sense
parser.add_option("--python",
action="store_true",default=False,
help="Instead of generating C code, generate a Python module. Similar to the Javascript option, this is for when you can't run native code, and it is table-driven for fast loading.")
parser.add_option("--golang",
help="Package name for a Go library to generate instead of C code. See comments in the generated file for how to run this on AppEngine.")

Silas S. Brown
committed
parser.add_option("--reannotator",
help="Shell command through which to pipe each word of the original text to obtain new annotation for that word. This might be useful as a quick way of generating a new annotator (e.g. for a different topolect) while keeping the information about word separation and/or glosses from the previous annotator, but it is limited to commands that don't need to look beyond the boundaries of each word. If the command is prefixed by a # character, it will be given the word's existing annotation instead of its original text, and if prefixed by ## it will be given text#annotation. The command should treat each line of its input independently, and both its input and its output should be in the encoding specified by --outcode.") # TODO: reannotatorCode instead? (see other 'reannotatorCode' TODOs)

Silas S. Brown
committed
# (Could just get the reannotator to post-process the 1st annotator's output, but that might be slower than generating an altered annotator with it)

Silas S. Brown
committed
# =========== ANALYSIS OPTIONS ==============
parser.add_option("-o", "--allow-overlaps",
action="store_true",default=False,
help="Normally, the analyser avoids generating rules that could overlap with each other in a way that would leave the program not knowing which one to apply. If a short rule would cause overlaps, the analyser will prefer to generate a longer rule that uses more context, and if even the entire phrase cannot be made into a rule without causing overlaps then the analyser will give up on trying to cover that phrase. This option allows the analyser to generate rules that could overlap, as long as none of the overlaps would cause actual problems in the example phrases. Thus more of the examples can be covered, at the expense of a higher risk of ambiguity problems when applying the rules to other texts. See also the -y option.")

Silas S. Brown
committed
parser.add_option("-P", "--primitive",
action="store_true",default=False,
help="Don't bother with any overlap or conflict checks at all, just make a rule for each word. The resulting parser is not likely to be useful, but the summary might be.")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("-y","--ybytes",default=0,
help="Look for candidate Yarowsky seed-collocations within this number of bytes of the end of a word. If this is set then overlaps and rule conflicts will be allowed if the seed collocations can be used to distinguish between them. Markup examples that are completely separate (e.g. sentences from different sources) must have at least this number of (non-whitespace) bytes between them.")

Silas S. Brown
committed
parser.add_option("--ybytes-max",default=0,
help="Extend the Yarowsky seed-collocation search to check over larger ranges up to this maximum. If this is set then several ranges will be checked in an attempt to determine the best one for each word, but see also ymax-threshold.")
parser.add_option("--ymax-threshold",default=1,
help="Limits the length of word that receives the narrower-range Yarowsky search when ybytes-max is in use. For words longer than this, the search will go directly to ybytes-max. This is for languages where the likelihood of a word's annotation being influenced by its immediate neighbours more than its distant collocations increases for shorter words, and less is to be gained by comparing different ranges when processing longer words. Setting this to 0 means no limit, i.e. the full range will be explored on ALL Yarowsky checks.") # TODO: see TODO below re temporary recommendation of --ymax-threshold=0

Silas S. Brown
committed
parser.add_option("--ybytes-step",default=3,
help="The increment value for the loop between ybytes and ybytes-max")
parser.add_option("--warn-yarowsky",
action="store_true",default=False,
help="Warn when absolutely no distinguishing Yarowsky seed collocations can be found for a word in the examples")
parser.add_option("--yarowsky-all",
action="store_true",default=False,
help="Accept Yarowsky seed collocations even from input characters that never occur in annotated words (this might include punctuation and example-separation markup)")
parser.add_option("--yarowsky-debug",default=1,
help="Report the details of seed-collocation false positives if there are a large number of matches and at most this number of false positives (default %default). Occasionally these might be due to typos in the corpus, so it might be worth a check.")

Silas S. Brown
committed
parser.add_option("--single-words",

Silas S. Brown
committed
action="store_true",default=False,

Silas S. Brown
committed
help="Do not consider any rule longer than 1 word, although it can still have Yarowsky seed collocations if -y is set. This speeds up the search, but at the expense of thoroughness. You might want to use this in conjuction with -y to make a parser quickly. It is like -P (primitive) but without removing the conflict checks.")

Silas S. Brown
committed
parser.add_option("--max-words",default=0,
help="Limits the number of words in a rule; rules longer than this are not considered. 0 means no limit. --single-words is equivalent to --max-words=1. If you need to limit the search time, and are using -y, it should suffice to use --single-words for a quick annotator or --max-words=5 for a more thorough one.") # (There was a bug in annogen versions before 0.58 that caused --max-words to additionally limit how far away from the start of its phrase a rule-example must be placed; this has now been fixed. There was also a bug that resulted in too many extra rules being tested over already-catered-for phrases; as this has now been fixed, the additional benefit of a --max-words limit is now reduced, but you might want to put one in anyway. That second bug also had the effect of the coverage % being far too low in the progress stats.)

Silas S. Brown
committed
# TODO: optionally (especially if NOT using Yarowsky) do an additional pass (after discovering all other rules) and turn whole phrases that are not completely covered by other rules into whole-phrase rules, if it doesn't conflict one phrase with another of equal priority; should be OK if no overlap, overlaps would *sometimes* be OK; suggest a length threshold

Silas S. Brown
committed
parser.add_option("--checkpoint",help="Periodically save checkpoint files in the specified directory. These files can save time when starting again after a reboot (and it's easier than setting up Condor etc). As well as a protection against random reboots, this can be used for scheduled reboots: if file called ExitASAP appears in the checkpoint directory, annogen will checkpoint, remove the ExitASAP file, and exit. After a run has completed, the checkpoint directory should be removed, unless you want to re-do the last part of the run for some reason.")
# (Condor can checkpoint an application on Win/Mac/Linux but is awkward to set up. Various Linux and BSD application checkpoint approaches also exist, and virtual machines can have their state saved. On the other hand the physical machine might have a 'hibernate' option which is easier.)

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("-d","--diagnose",help="Output some diagnostics for the specified word. Use this option to help answer \"why doesn't it have a rule for...?\" issues. This option expects the word without markup and uses the system locale (UTF-8 if it cannot be detected).")

Silas S. Brown
committed
parser.add_option("--diagnose-limit",default=10,help="Maximum number of phrases to print diagnostics for (0 means unlimited); can be useful when trying to diagnose a common word in rulesFile without re-evaluating all phrases that contain it. Default: %default")
parser.add_option("--diagnose-manual",
action="store_true",default=False,
help="Check and diagnose potential failures of --manualrules")
parser.add_option("--diagnose-quick",
action="store_true",default=False,
help="Ignore all phrases that do not contain the word specified by the --diagnose option, for getting a faster (but possibly less accurate) diagnostic. The generated annotator is not likely to be useful when this option is present. You may get quick diagnostics WITHOUT these disadvantages by loading a --rulesFile instead.")

Silas S. Brown
committed

Silas S. Brown
committed
parser.add_option("--time-estimate",
action="store_true",default=False,
help="Estimate time to completion. The code to do this is unreliable and is prone to underestimate. If you turn it on, its estimate is displayed at the end of the status line as days, hours or minutes.") # Unreliable because the estimate assumes 'phrases per minute' will remain constant on average, whereas actually it will decrease because the more complex phrases are processed last
parser.add_option("--single-core",
action="store_true",default=False,
help="Use only one CPU core even when others are available. (If this option is not set, multiple cores are used if a 'futures' package is installed or if run under MPI or SCOOP; this currently requires --checkpoint + shared filespace, and is currently used only for large collocation checks in limited circumstances.)") # namely, words that occur in length-1 phrases
parser.add_option("-p","--status-prefix",help="Label to add at the start of the status line, for use if you batch-run annogen in multiple configurations and want to know which one is currently running")
# "main" is true only when this file is run as a script AND the
# OMPI_COMM_WORLD_RANK environment variable is unset or consists solely of
# "0" characters (stripping every "0" must leave an empty string).
main = False
if __name__ == "__main__":
    main = not os.environ.get("OMPI_COMM_WORLD_RANK","0").replace("0","")
if main:
    sys.stderr.write(program_name+"\n") # not sys.stdout: may or may not be showing --help (and anyway might want to process the help text for website etc)
# else STILL parse options (if we're being imported for parallel processing)

Silas S. Brown
committed
options, args = parser.parse_args()
globals().update(options.__dict__)

Silas S. Brown
committed
sys.setcheckinterval(32767) # won't be using threads or signals, so don't have to check for them very often
import gc ; gc.disable() # should be OK if we don't create cycles (TODO: run gc.collect() manually after init, just in case?)
def warn(msg):
    """Write "Warning: " + msg (plus a newline) to standard error.

    Nothing is written unless the module-level flag ``main`` is true.
    """
    if not main:
        return # else it should have already been written
    sys.stderr.write("Warning: "+msg+"\n")
# Advisory warnings (warn() only prints when the "main" flag is set).
if "PyPy" in sys.version:
    warn("PyPy is likely to run 60% slower than python with annogen") # (not to mention concurrent.futures being less likely to be available)
if primitive and ybytes:
    warn("primitive will override ybytes")

# Coerce the numeric options to int.  The options above are declared without
# a type, so values supplied on the command line are presumably strings,
# while untouched defaults are already ints.
if ybytes:
    ybytes = int(ybytes)
ybytes_max = int(ybytes_max) if ybytes_max else ybytes # no separate maximum given: use ybytes itself
yarowsky_debug = int(yarowsky_debug) if yarowsky_debug else 0
ybytes_step = int(ybytes_step)
maxrefs = int(maxrefs)
ymax_threshold = int(ymax_threshold)

Silas S. Brown
committed
def errExit(msg):
    """Write msg (plus a newline) to standard error, then exit with status 1.

    Asserts that the module-level flag ``main`` is true before doing so.
    """
    assert main # bad news if this happens in non-main module
    sys.stderr.write(msg+"\n")
    sys.exit(1)
# Reject stray positional arguments and inconsistent option combinations.
if args:
    errExit("Unknown argument "+repr(args[0]))
if ref_pri and (not reference_sep or not ref_name_end):
    errExit("ref-pri option requires reference-sep and ref-name-end to be set")
if android and not java and not ndk:
    errExit('You must set --java=/path/to/src//name/of/package or --ndk=name.of.package when using --android')
if ndk and not android:
    errExit("You must set --android=URL when using --ndk. E.g. --android=file:///android_asset/index.html")
if nested_switch:
    nested_switch = int(nested_switch) # TODO: if java, override it? or just rely on the help text for --nested-switch (TODO cross-reference it from --java?)

# Java package name: whatever follows the last '//' in --java, with the
# remaining slashes turned into dots.  Stays None when --java is not set.
jPackage = None
if java:
    if '//' not in java:
        errExit("--java must include a // to separate the first part of the path from the package name")
    jPackage = java.rsplit('//',1)[1].replace('/','.')
    if 'NewFunc' in jPackage:
        errExit("Currently unable to include the string 'NewFunc' in your package due to an implementation detail in annogen's search/replace operations")
def isatty(f):
    """Return False if f has no `isatty` attribute, otherwise whatever f.isatty() returns."""
    if not hasattr(f,"isatty"):
        return False
    return f.isatty()
# Output-target validation: checks the combination of output-language and
# platform options, adjusts c_filename's extension to match, and normalises
# a few remaining options.
# NOTE(review): this listing has lost its indentation and some of its lines
# (several branch/loop headers are not visible), so the nesting of the
# statements below cannot be read off directly - compare with the upstream
# file before relying on it.
# With no output filename given and stdout being a terminal, default to
# "annotator.c" in the system temporary directory.
if not c_filename and isatty(sys.stdout): # assumed false when run under MPI
c_filename = tempfile.gettempdir()+os.sep+"annotator.c"
# Restrictions that apply when the output language is not C.
if java or javascript or python or c_sharp or golang:
if ios: errExit("--ios not yet implemented in C#, Java, JS, Python or Go; please use C (it becomes Objective-C)")
if ndk: errExit("--ndk requires the output language to be C")
if windows_clipboard: errExit("--windows-clipboard not yet implemented in C#, Java, JS, Python or Go; please use C")
# Count how many of the output-language options are set.
if sum(1 for x in [java,javascript,python,c_sharp,golang] if x) > 1:

Silas S. Brown
committed
errExit("Outputting more than one programming language on the same run is not yet implemented")
if not outcode=="utf-8": errExit("outcode must be utf-8 when using Java, Javascript, Python, C# or Go")
if compress: errExit("compress not yet implemented for the Java, Javascript, Python, C# or Go versions") # (and it would probably slow down JS/Python too much if it were)
# NOTE(review): `f` is not bound anywhere in this listing; presumably a loop
# over the files in the `java` directory (and a branch on `java`) precedes
# this line - confirm against upstream.
if f.endswith(".java"): os.remove(java+os.sep+f)
c_filename = java+os.sep+"Annotator.java"
# Swap a trailing ".c" on the output filename for the chosen language's extension.
elif c_filename.endswith(".c"):

Silas S. Brown
committed
if javascript: c_filename = c_filename[:-2]+".js"
elif c_sharp: c_filename = c_filename[:-2]+".cs"
elif golang: c_filename = c_filename[:-2]+".go"

Silas S. Brown
committed
else: c_filename = c_filename[:-2]+".py"
# NOTE(review): judging by the messages, the next four lines belong to a
# --windows-clipboard branch whose header is not visible here.
if ios: errExit("Support for having both --ios and --windows-clipboard at the same time is not yet implemented") # (I suppose you could make a single output file that will compile as either C+MS-stuff or Objective-C depending on preprocessor tests)
if ndk: errExit("Support for having both --ndk and --windows-clipboard at the same time is not yet implemented")
if c_compiler=="cc -o annotator": c_compiler="i386-mingw32-gcc -o annoclip.exe"
if not outcode=="utf-8": errExit("outcode must be utf-8 when using --windows-clipboard")
# NOTE(review): likewise, the next three lines look like the body of an
# --ios branch whose header is not visible here.
if ndk: errExit("Support for having both --ios and --ndk at the same time is not yet implemented")
if not outcode=="utf-8": errExit("outcode must be utf-8 when using --ios")
if c_filename.endswith(".c"): c_filename = c_filename[:-2]+".m" # (if the instructions are followed, it'll be ViewController.m, but no need to enforce that here)
elif ndk:
if not outcode=="utf-8": errExit("outcode must be utf-8 when using --ndk")
# --zlib: the option name `zlib` is deleted and rebound to the imported
# zlib module, and data_driven is switched on.
if zlib:
del zlib ; import zlib ; data_driven = True
if javascript: errExit("--zlib is not yet implemented in Javascript") # C or Python for now
if windows_clipboard: warn("--zlib with --windows-clipboard is inadvisable because ZLib is not typically present on Windows platforms. If you really want it, you'll need to figure out the compiler options and library setup for it.")
if ios: warn("--zlib with --ios will require -lz to be added to the linker options in XCode, and I don't have instructions for that (it probably differs across XCode versions)")
if data_driven and (c_sharp or java or golang): errExit("--data-driven is not yet implemented in C#, Java or Go")
additional_compact_opcodes = data_driven and not (python or javascript) # currently implemented only in the C version of the data-driven runtime
# NOTE(review): the body of the following `if` is not visible in this listing.
if java or javascript or python or c_sharp or ios or ndk or golang:

Silas S. Brown
committed
# Work out the terminal's character set from the locale, falling back to
# utf-8 when it cannot be determined (any exception is swallowed here).
try:
import locale
terminal_charset = locale.getdefaultlocale()[1]
except: terminal_charset = None
if not terminal_charset: terminal_charset = "utf-8"
# urlparse is optional unless the ANNOGEN_ANDROID_URLS environment variable is set.
try: import urlparse
except:
if os.environ.get("ANNOGEN_ANDROID_URLS"): errExit("Need urlparse module for ANNOGEN_ANDROID_URLS") # unless we re-implement
# keep_whitespace: decode with the terminal charset and split on commas into a set.
if keep_whitespace: keep_whitespace = set(keep_whitespace.decode(terminal_charset).split(','))
if status_prefix: status_prefix += ": "
else: status_prefix = ""

Silas S. Brown
committed
if diagnose: diagnose=diagnose.decode(terminal_charset)

Silas S. Brown
committed
diagnose_limit = int(diagnose_limit)

Silas S. Brown
committed
max_words = int(max_words)
# single_words forces max_words to 1 regardless of the value converted above.
if single_words: max_words = 1
if no_input and diagnose_manual: errExit("--diagnose-manual is not compatible with --no-input") # it needs the input for diagnostic purposes

Silas S. Brown
committed
# nearCall: build the boolean expression (as output-language source text)
# that tests whether any of the indicator strings in `conds` is "near".
#   negate   - if true, the generated test is inverted
#   conds    - list of indicator strings; each is passed through outLang_escape()
#   subFuncs, subFuncL - handed on to subFuncCall() / recursive nearCall() when
#                        a helper function has to be generated
# Returns the expression text (for the long-list case, a call to a generated helper).
# NOTE(review): indentation and at least one line are missing from this
# listing (see below), so nesting is inferred from the statements themselves.
def nearCall(negate,conds,subFuncs,subFuncL):

Silas S. Brown
committed
# returns what to put in the if() for ybytes near() lists
# Short list (or no max_or_length limit): emit one f("...") call per
# condition, joined with " || ".
if not max_or_length or len(conds) <= max_or_length:

Silas S. Brown
committed
# NOTE(review): the branch that this `else` pairs with (it must assign `f`
# for some other output language) is not visible in this listing.
else: f="near"
ret = " || ".join(f+"(\""+outLang_escape(c)+"\")" for c in conds)
if negate:
if " || " in ret: ret = " ! ("+ret+")"
else: ret = "!"+ret
return ret
# Long list: generate a helper function (registered via subFuncCall) that
# tests the conditions in chunks of at most max_or_length and returns as
# soon as one chunk matches.  fStart/fEnd are the helper's header/footer
# for the selected output language.
if java: fStart,fEnd = "package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {","} }" # put functions in separate classes to try to save the constants table of the main class
elif golang: fStart,fEnd = "func NewFunc() bool {","}"
else: fStart,fEnd = outLang_bool+" NewFunc() {","}"
# With negate set, the helper's true/false return values are swapped
# rather than negating each chunk's test.
if negate: rTrue,rFalse = outLang_false,outLang_true
else: rTrue,rFalse = outLang_true,outLang_false
# Note: range(...)+[len(conds)] concatenates lists, so it relies on range() returning a list (Python 2).
return subFuncCall(fStart+"\n".join(outLang_shortIf(nearCall(False,conds[i:j],subFuncs,subFuncL),"return "+rTrue+";") for i,j in zip(range(0,len(conds),max_or_length),range(max_or_length,len(conds),max_or_length)+[len(conds)]))+"\nreturn "+rFalse+";"+fEnd,subFuncs,subFuncL)
def outLang_shortIf(cond,statement):
    """Return source text for a one-statement `if` in the output language.

    When `golang` is set the statement is wrapped in braces on its own
    line; otherwise the C-style `if(cond) statement` form is returned.
    """
    if not golang:
        return "if("+cond+") "+statement
    return "if "+cond+" {\n "+statement+"\n}"

Silas S. Brown
committed
def subFuncCall(newFunc,subFuncs,subFuncL):
    """Register generated helper source `newFunc` and return the text that calls it.

    newFunc  -- helper source text using the placeholder name "NewFunc"
    subFuncs -- dict mapping helper source text to the name allocated for
                it; identical source is emitted only once and re-used
    subFuncL -- list that receives each new helper's source, with the
                first "NewFunc" replaced by the allocated name

    Returns the call expression (without a semicolon).
    """
    if newFunc in subFuncs:
        # we generated an identical one before
        subFuncName = subFuncs[newFunc]
    else:
        if java:
            subFuncName = "z%X" % len(subFuncs) # (try to save as many bytes as possible because it won't be compiled out and we also have to watch the compiler's footprint; start with z so MainActivity.java etc appear before rather than among this lot in IDE listings)
        else:
            subFuncName = "match%d" % len(subFuncs)
        subFuncs[newFunc] = subFuncName
        if java or c_sharp or golang:
            static = ""
        else:
            static = "static "
        subFuncL.append(static+newFunc.replace("NewFunc",subFuncName,1))
    if java:
        return jPackage+"."+subFuncName+".f(a)"
    return subFuncName+"()" # the call (without a semicolon)

Silas S. Brown
committed
# stringSwitch: generate the source lines of a switch/if tree that
# dispatches on successive input bytes and runs the action attached to the
# matched byte sequence.
#   byteSeq_to_action_dict - maps byte-sequence keys to a list of
#       (action, conds) pairs (see the comments just inside the body);
#       the "" key holds what to do when no further bytes match
#   subFuncL - list collecting generated helper-function source (passed on
#       to subFuncCall / nearCall and to recursive calls)
#   funcName - when set, the output is wrapped as a complete function of
#       that name; None is used for nested fragments
#   subFuncs - dict of helper source -> allocated name, for re-use of
#       identical helpers
#   java_localvar_counter - one-element list used as a shared counter so
#       nested fragments get distinct local variable names
#   nestingsLeft - remaining permitted switch nesting depth; None means
#       start from the global nested_switch
# Returns a list of output lines (caller does '\n'.join).
# NOTE(review): subFuncs={} is a mutable default, so calls that omit it
# share one dict across the whole run - confirm every top-level caller
# passes its own dict or that sharing is intended.
# NOTE(review): `==None` comparisons below would more usually be `is None`.
# NOTE(review): this listing has lost its indentation and some lines; the
# .iterkeys()/.itervalues()/.iteritems() calls are Python 2 dict methods.
def stringSwitch(byteSeq_to_action_dict,subFuncL,funcName="topLevelMatch",subFuncs={},java_localvar_counter=None,nestingsLeft=None): # ("topLevelMatch" is also mentioned in the C code)

Silas S. Brown
committed
# make a function to switch on a large number of variable-length string cases without repeated lookahead for each case
# (may still backtrack if no words or no suffices match)

Silas S. Brown
committed
# byteSeq_to_action_dict is really a byte sequence to [(action, OR-list of Yarowsky-like indicators which are still in Unicode)], the latter will be c_escape()d
# can also be byte seq to [(action,(OR-list,nbytes))] but only if OR-list is not empty, so value[1] will always be false if OR-list is empty
if nestingsLeft==None: nestingsLeft=nested_switch
canNestNow = not nestingsLeft==0 # (-1 = unlimited)
# NEXTBYTE is the output-language expression that reads the next input byte.
if java or c_sharp or golang: NEXTBYTE = adot + 'nB()'

Silas S. Brown
committed
else: NEXTBYTE = 'NEXTBYTE'

Silas S. Brown
committed
# The distinct first bytes of all non-empty keys: one case per byte.
allBytes = set(b[0] for b in byteSeq_to_action_dict.iterkeys() if b)

Silas S. Brown
committed
ret = []
if not java_localvar_counter: # Java and C# don't allow shadowing of local variable names, so we'll need to uniquify them

Silas S. Brown
committed
java_localvar_counter=[0]
olvc = "%X" % java_localvar_counter[0] # old localvar counter

Silas S. Brown
committed
# Emit the function header (when funcName is set) and the statement that
# saves the input position; savePos records that statement's index in ret
# so restorePos() can later delete it.
if funcName:
if java: ret.append("package "+jPackage+";\npublic class "+funcName+" { public static void f("+jPackage+".Annotator a) {")
else:
if funcName=="topLevelMatch" and not c_sharp: stat="static " # because we won't call subFuncCall on our result
else: stat=""
if golang: ret.append("func %s() {" % funcName)
else: ret.append(stat+"void %s() {" % funcName)

Silas S. Brown
committed
savePos = len(ret)
if java or c_sharp: ret.append("{ int oldPos="+adot+"inPtr;")
elif golang: ret.append("{ oldPos := inPtr;")

Silas S. Brown
committed
elif "" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1:
# no funcName, but might still want to come back here as there's a possible action at this level
savePos = len(ret)
if java or c_sharp:
ret.append("{ int oP"+olvc+"="+adot+"inPtr;")

Silas S. Brown
committed
java_localvar_counter[0] += 1
elif golang: ret.append("{ oldPos := inPtr;")

Silas S. Brown
committed
else: savePos = None
# restorePos: append the code that puts the input position back to what
# was saved at ret[savePos].  If exactly one NEXTBYTE appears in the output
# so far and no sub-switch helper was called, the save statement is deleted
# and a single step back is emitted instead.
# NOTE(review): some branches of this helper (e.g. the C-output cases) do
# not appear in this listing.
def restorePos():
if not savePos==None:
if len(' '.join(ret).split(NEXTBYTE))==2 and not called_subswitch:
# do a PREVBYTE instead
# (note however that splitting on NEXTBYTE
# does not necessarily give a reliable value
# for max amount of lookahead required if
# there's more than 1. We use max rule len
# as an upper bound for that instead.)
del ret[savePos]
if java: ret.append("a.inPtr--;")
elif c_sharp or golang: ret.append("inPtr--;")
elif java or c_sharp:
if funcName: ret.append(adot+"inPtr=oldPos; }")
else: ret.append(adot+"inPtr=oP"+olvc+"; }")
elif golang: ret.append("inPtr=oldPos; }")
else: ret.append("SETPOS(oldPos); }") # restore

Silas S. Brown
committed
called_subswitch = False

Silas S. Brown
committed
# Common-action case: the "" entry has a single unconditional action and
# every other entry's single unconditional action starts with that same text.
if "" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[""])==1 and not byteSeq_to_action_dict[""][0][1] and all((len(a)==1 and a[0][0].startswith(byteSeq_to_action_dict[""][0][0]) and not a[0][1]) for a in byteSeq_to_action_dict.itervalues()):
# there's an action in common for this and all subsequent matches, and no Yarowsky-like indicators, so we can do the common action up-front
ret.append(byteSeq_to_action_dict[""][0][0])
l = len(byteSeq_to_action_dict[""][0][0])
# Strip the common prefix (length l) from every entry's action text.
byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in byteSeq_to_action_dict.iteritems())

Silas S. Brown
committed
# and, since we'll be returning no matter what,
# we can put the inner switch in a new function

Silas S. Brown
committed
# (even if not re-used, this helps compiler speed)

Silas S. Brown
committed
# + DON'T save/restore pos around it (it itself
# will do any necessary save/restore pos)
del byteSeq_to_action_dict[""]
if java and (canNestNow or len(byteSeq_to_action_dict)==1): # hang on - better nest (might be using --nested-switch to get around a Java compiler-memory problem; the len condition allows us to always nest a single 'if' rather than creating a new function+class for it)
ret += [" "+x for x in stringSwitch(byteSeq_to_action_dict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)]
restorePos()
ret.append("return;")
else: # ok, new function
newFunc = "\n".join(stringSwitch(byteSeq_to_action_dict,subFuncL,"NewFunc",subFuncs))
ret.append(subFuncCall(newFunc,subFuncs,subFuncL)+"; return;")
del ret[savePos] # will be set to None below

Silas S. Brown
committed
byteSeq_to_action_dict[""] = [("",[])] # for the end of this func
savePos = None # as setting funcName on stringSwitch implies it'll give us a savePos, and if we didn't set funcName then we called restorePos already above

Silas S. Brown
committed
elif allBytes:
# deal with all actions except "" first

Silas S. Brown
committed
# A single possible next byte is emitted as an `if`; otherwise a `switch`
# is opened, which uses up one level of the nesting allowance.
use_if = (len(allBytes)==1)
if not use_if:
if nestingsLeft > 0: nestingsLeft -= 1
ret.append("switch("+NEXTBYTE+") {")

Silas S. Brown
committed
for case in sorted(allBytes):
# cstr is the byte written as a literal in the output language: a quoted
# character for printable ASCII other than the apostrophe (not in C#), a
# negative number for bytes above 127 under ios, otherwise the plain
# ordinal (with a (byte) cast for Java).
if not c_sharp and 32<=ord(case)<127 and case!="'": cstr="'%c'" % case
elif ios and ord(case)>127: cstr=str(ord(case)-256)

Silas S. Brown
committed
else:
cstr=str(ord(case))
if java: cstr = "(byte)"+cstr
if use_if: ret.append("if("+NEXTBYTE+"=="+cstr+") {")

Silas S. Brown
committed
else: ret.append("case %s:" % cstr)
# Recurse on the entries that start with this byte, with that byte removed from their keys.
subDict = dict([(k[1:],v) for k,v in byteSeq_to_action_dict.iteritems() if k and k[0]==case])
inner = stringSwitch(subDict,subFuncL,None,subFuncs,java_localvar_counter,nestingsLeft)
if canNestNow or not inner[0].startswith("switch"): ret += [" "+x for x in inner]

Silas S. Brown
committed
else:
# Put the inner switch into a different function
# which returns 1 if we should return.
# (TODO: this won't catch cases where there's a savePos before the inner switch; will still nest in that case. But it shouldn't lead to big nesting in practice.)
if nested_switch: inner = stringSwitch(subDict,subFuncL,None,subFuncs,None,None) # re-do it with full nesting counter
if java: myFunc,funcEnd = ["package "+jPackage+";\npublic class NewFunc { public static boolean f("+jPackage+".Annotator a) {"], "}}"
elif golang: myFunc,funcEnd=["func NewFunc() bool {"],"}"
else: myFunc,funcEnd=[outLang_bool+" NewFunc() {"],"}"

Silas S. Brown
committed
# Rewrite each line-final "return;" of the inner fragment so that the
# helper reports "caller should return" as a true result.
for x in inner:
if x.endswith("return;"): x=x[:-len("return;")]+"return "+outLang_true+";"

Silas S. Brown
committed
myFunc.append(" "+x)
ret += (" "+outLang_shortIf(subFuncCall("\n".join(myFunc)+"\n return "+outLang_false+";\n"+funcEnd,subFuncs,subFuncL),"return;")).split('\n') # if golang, MUST have the \n before the 1st return there (optional for other languages); also must split outLang_shortIf o/p into \n for the above 'for x in inner' rewrite to work

Silas S. Brown
committed
called_subswitch=True # as it'll include more NEXTBYTE calls which are invisible to the code below
if not (use_if or inner[-1].endswith("return;")): ret.append(" break;")

Silas S. Brown
committed
ret.append("}") # end of switch or if
restorePos()
if funcName:
if java: ret.append("} }")
else: ret.append("}")

Silas S. Brown
committed
elif "" in byteSeq_to_action_dict:
# if the C code gets to this point, no return; happened - no suffices

Silas S. Brown
committed
# so execute one of the "" actions and return
# (which one, if any, depends on the Yarowsky-like indicators; there should be at most one "default" action without indicators)
default_action = ""
for action,conds in byteSeq_to_action_dict[""]:
if conds:
assert action, "conds without action in "+repr(byteSeq_to_action_dict[""])
# NOTE(review): the statements of this tuple branch (where `negate`,
# `conds` and `nbytes` would be unpacked) are not visible in this listing.
if type(conds)==tuple:
elif c_sharp or golang: ret.append("nearbytes=%d;" % nbytes)

Silas S. Brown
committed
else: ret.append("setnear(%d);" % nbytes)
else: negate = False
ret.append("if ("+nearCall(negate,conds,subFuncs,subFuncL)+") {")

Silas S. Brown
committed
ret.append((action+" return;").strip())
ret.append("}")
else:
# Unconditional action: remember it as the default, warning on stderr if
# it replaces an earlier one.
if default_action:
sys.stderr.write("WARNING! More than one default action in "+repr(byteSeq_to_action_dict[""])+" - earlier one discarded!\n")
if rulesFile: sys.stderr.write("(This might indicate invalid markup in the corpus, but it might just be due to a small change or capitalisation update during an incremental run, which can be ignored.)\n") # TODO: don't write this warning at all if accum.amend_rules was set at the end of analyse() ?
else: sys.stderr.write("(This might indicate invalid markup in the corpus)\n")

Silas S. Brown
committed
default_action = action
if default_action or not byteSeq_to_action_dict[""]: ret.append((default_action+" return;").strip()) # (return only if there was a default action, OR if an empty "" was in the dict with NO conditional actions (e.g. from the common-case optimisation above). Otherwise, if there were conditional actions but no default, we didn't "match" anything if none of the conditions were satisfied.)

Silas S. Brown
committed
return ret # caller does '\n'.join
if compress:
squashStrings = set() ; squashReplacements = []
def squashFinish():
    """Finish the --compress dry run and return the escaped byte-pair table.

    Every byte value 1..255 that occurs in none of the collected
    squashStrings becomes a token.  Each token in turn is assigned the
    most frequent remaining 2-byte sequence, the (sequence, token) pair
    is appended to squashReplacements, and the collected strings are
    rewritten to use it.  Afterwards squashStrings is set to "done", so
    that squash() starts applying the replacements, and the 512-entry
    pair table (first bytes at [token], second bytes at [token+256],
    trimmed of trailing zeros down to a minimum of 256 entries) is
    returned via c_escapeRawBytes().

    Progress and warnings are written to standard error.
    """
    global squashStrings # so can set it to "done" at end
    used = set()
    for s in squashStrings: used.update(s)
    totSaved = 0
    tokens = [chr(t) for t in range(1,256) if chr(t) not in used]
    orig_tokens = set(tokens)
    pairs = [chr(0)] * 512
    while tokens:
        t = tokens.pop()
        counts = {}
        for s in squashStrings:
            # To make decompression as fast and compact as possible, each 1-byte token represents 2 bytes exactly. In practice allowing it to represent variable lengths of whole bytes up to 4 is not likely to improve the compression by more than 3.2% (that's 3.2% of the 10-20% it achieves, so it's around 0.5%), and not very much better for length 9, so we might as well stick with this simpler scheme unless we do real LZMA or whatever.
            for i in range(len(s)-1):
                k = s[i:i+2]
                if k[0] in orig_tokens or k[1] in orig_tokens: continue # to keep the decoder simple, don't set things up so it needs to recurse (being able to recurse within the 2-byte expansion is very unlikely to save anything in practice anyway - it didn't on my annotators - so not worth implementing the decoder for)
                counts[k] = counts.get(k,0) + 1
        if not counts:
            # No eligible 2-byte sequence is left (e.g. every string is
            # shorter than 2 bytes or already consists of tokens), so
            # max() below would raise ValueError; stop allocating tokens.
            break
        k = max((v,k) for k,v in counts.items())[1]
        pairs[ord(t)] = k[0]
        pairs[ord(t)+256] = k[1]
        squashReplacements.append((k,t)) # this assumes we won't be doing things like 'if ALL instances of a byte end up in our tokens, add the byte's original value as an extra token'
        # Rewrite the collected strings from a snapshot: removing from and
        # adding to the set while iterating over it can skip or revisit
        # entries, and raises RuntimeError if two strings collapse into one.
        before = list(squashStrings)
        after = [s.replace(k,t) for s in before]
        squashStrings.clear()
        squashStrings.update(after)
        # Count the bytes actually removed; the raw pair count above also
        # counts overlapping occurrences that replace() cannot all use.
        totSaved += sum(len(s) for s in before) - sum(len(s) for s in after)
        sys.stderr.write("Compress: %d/%d tokens, %d bytes saved%s" % (len(orig_tokens)-len(tokens),len(orig_tokens),totSaved,clear_eol))
    squashStrings = "done"
    while len(pairs) > 256 and pairs[-1]==chr(0): pairs = pairs[:-1]
    sys.stderr.write("\n")
    if totSaved < len(pairs)+50: sys.stderr.write("Warning: --compress on this data made it bigger! Consider dropping --compress\n") # 50 as rough guess for OutWriteDecompress binary (probably about 12 instructions at 4+ bytes each)
    return c_escapeRawBytes("".join(pairs))
# C source for the run-time decompressor.  The %%PAIRS%% placeholder marks
# where the pair table goes.  In the C code, a byte i whose pairs[i] entry
# is non-zero is written out as the two bytes pairs[i] and pairs[i|0x100];
# any other byte is written unchanged.
decompress_func=r"""
static unsigned char pairs[]="%%PAIRS%%";
static void OutWriteDecompress(const char *s) {
  while(*s) {
    int i=(unsigned char)*s;
    if (pairs[i]) { OutWriteByte(pairs[i]); OutWriteByte(pairs[i|0x100]); } else OutWriteByte(*s);
    s++;
  }
}"""
def squash(byteStr):
    """Record byteStr during the dry run, or return it compressed afterwards.

    While squashStrings is still a collection, byteStr is added to it
    and returned unchanged.  Once squashStrings is "done", every
    (sequence, token) pair in squashReplacements is applied in order.
    """
    if not squashStrings == "done":
        squashStrings.add(byteStr) # for the dry run
        return byteStr
    for sequence, token in squashReplacements:
        byteStr = byteStr.replace(sequence, token)
    return byteStr
else: decompress_func = ""

Silas S. Brown
committed
additional_js_instructions = r"""
If you need to inject additional Javascript into sites to
fix things, set the ANNOGEN_EXTRA_JS environment variable
before running Annotator Generator to (re)generate this
file. Make sure it ends with a semicolon, or the closing
brace of an 'if', 'for', 'while' or 'try..catch' (the
latter is probably a good idea). The snippet will be run
before each scan for new text to annotate.
Similarly you can set ANNOGEN_EXTRA_CSS for CSS 'kludges'.
if ios:
c_preamble = r"""/*
To compile this, go into Xcode and do File > New > Project
and under iOS / Application choose Single View Application.
Fill in the dialogue box as you like, then use this file
to replace the generated ViewController.m file. You should
then be able to press the Run button on the toolbar.
Tested on an iOS 6.1 simulator in Xcode 4.6 on Mac OS 10.7
(hopefully compatible with later versions too)
Swipe left to go back (as in Safari).
If your pages refer to clip://anything then that
link will show and annotate the local clipboard.
*/
#import <UIKit/UIKit.h>
#include <string.h>
"""
c_defs = r"""static const char *readPtr, *writePtr, *startPtr;
static NSMutableData *outBytes;
#define NEXTBYTE (*readPtr++)
#define NEXT_COPY_BYTE (*writePtr++)
#define COPY_BYTE_SKIP writePtr++
#define COPY_BYTE_SKIPN(n) writePtr += (n)
#define POSTYPE const char*
#define THEPOS readPtr
#define SETPOS(p) (readPtr=(p))
#define PREVBYTE readPtr--
#define FINISHED (!(*readPtr))
static void OutWriteStr(const char *s) { [outBytes appendBytes:s length:strlen(s)]; }
static void OutWriteByte(char c) { [outBytes appendBytes:(&(c)) length:1]; }
static int near(char* string) {
const char *startFrom = readPtr-nearbytes;
size_t n=2*nearbytes;
if (startFrom < startPtr) {
n -= startPtr-startFrom;
startFrom = startPtr; }
return strnstr(startFrom,string,n) != NULL;
}
""" # (strnstr is BSD-specific, but that's OK on iOS. TODO: might be nice if all loops over outWriteByte could be reduced to direct calls of appendBytes with appropriate lengths, but it wouldn't be a major speedup)
c_switch1=c_switch2=c_switch3=c_switch4="" # only ruby is needed by the iOS code
elif ndk:
c_preamble = r"""#!/bin/bash
#
# Run this script in the Android workspace to set up the
# JNI folder and compile the library (requires ndk-build).
# Then see comments in src/%%PACKAGE%%/MainActivity.java
#
mkdir -p jni src/%%PACKAGE%%
cat > jni/Android.mk <<"EOF"
LOCAL_PATH:= $(call my-dir)
LOCAL_SRC_FILES := annotator.c
LOCAL_MODULE := Annotator
LOCAL_MODULE_FILENAME := Annotator
include $(BUILD_SHARED_LIBRARY)
EOF
cat > jni/Application.mk <<"EOF"
APP_PLATFORM := android-1
APP_ABI := armeabi
EOF
cat > src/%%PACKAGE%%/MainActivity.java <<"EOF"
%%android_src%%
EOF
cat > assets/clipboard.html <<"EOF"
%%android_clipboard%%
EOF
cat > jni/annotator.c <<"EOF"
#include <stdlib.h>
#include <jni.h>
""".replace('%%PACKAGE%%',ndk.replace('.','/'))
if zlib: c_preamble=c_preamble.replace("LOCAL_PATH","LOCAL_LDLIBS := -lz\nLOCAL_PATH",1)
c_defs = r"""static const char *readPtr, *writePtr, *startPtr;
static char *outBytes;
static size_t outWriteLen,outWritePtr;
#define NEXTBYTE (*readPtr++)
#define NEXT_COPY_BYTE (*writePtr++)
#define COPY_BYTE_SKIP writePtr++
#define COPY_BYTE_SKIPN(n) writePtr += (n)
#define POSTYPE const char*
#define THEPOS readPtr
#define SETPOS(p) (readPtr=(p))
#define PREVBYTE readPtr--
#define FINISHED (!(*readPtr))
static void OutWriteStr(const char *s) {
size_t l = strlen(s), newLen = outWriteLen;
while (outWritePtr+l > newLen) newLen *= 2;
if (newLen > outWriteLen) {
char *ob2 = realloc(outBytes,newLen);
if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
outBytes = ob2; outWriteLen = newLen;
}
memcpy(outBytes+outWritePtr, s, l);
outWritePtr += l;
}
static void OutWriteByte(char c) {
if (outWritePtr >= outWriteLen) {
size_t newLen = outWriteLen * 2;
char *ob2 = realloc(outBytes,newLen);
if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
outBytes = ob2; outWriteLen = newLen;
}
outBytes[outWritePtr++] = c;
}
int near(char* string) {
const char *startFrom = readPtr-nearbytes,
*end = readPtr+nearbytes;
if (startFrom < startPtr) startFrom = startPtr;
size_t l=strlen(string); end -= l;
while (*startFrom && startFrom <= end) {
if(!strncmp(startFrom,string,l)) return 1;
startFrom++;
}
return 0;
}
void matchAll();
JNIEXPORT jstring JNICALL Java_%PACKAGE%_MainActivity_jniAnnotate(JNIEnv *env, jclass theClass, jstring jIn) {
startPtr=(char*)(*env)->GetStringUTFChars(env,jIn,NULL);
readPtr = startPtr; writePtr = startPtr;
outWriteLen = strlen(startPtr)*5+1; /* initial guess (must include the +1 to ensure it's non-0 for OutWrite...'s *= code) */
outBytes = malloc(outWriteLen);
if(outBytes) { outWritePtr = 0; matchAll(); }
(*env)->ReleaseStringUTFChars(env,jIn,startPtr);
if(outBytes) OutWriteByte(0);
else return (*env)->NewStringUTF(env,"out of memory"); /* which it might or might not be able to do. This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
jstring ret=(*env)->NewStringUTF(env,outBytes);
free(outBytes); return ret;
}
""".replace("%PACKAGE%",ndk.replace('.','_'))
c_switch1=c_switch2=c_switch3=c_switch4="" # only ruby is needed by the Android code
c_preamble = r"""/*
For running on Windows desktop or WINE, compile with:
i386-mingw32-gcc annoclip.c -o annoclip.exe
For running on Windows Mobile 2003SE, 5, 6, 6.1 or 6.5,
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
compile with:
arm-cegcc-gcc annoclip.c -D_WINCE -Os -o annoclip-WM.exe
or (if you have MSVC 2008 on a Windows machine),
set PATH=%VCINSTALLDIR%\ce\bin\x86_arm;%PATH%
set lib=%VCINSTALLDIR%\ce\lib\armv4
set include=%VSINSTALLDIR%\SmartDevices\SDK\Smartphone2003\Include;%VCINSTALLDIR%\ce\include;%VCINSTALLDIR%\include
set CL=/TP /EHsc /D "_WIN32_WCE=0x420" /D UNDER_CE /D WIN32_PLATFORM_PSPC /D _WINCE /D _WINDOWS /D ARM /D _ARM_ /D _UNICODE /D UNICODE /D POCKETPC2003_UI_MODEL
set LINK=/force:multiple /NODEFAULTLIB:oldnames.lib /SUBSYSTEM:WINDOWSCE /LIBPATH:"C:\Program Files\Windows Mobile 5.0 SDK R2\PocketPC\Lib\ARMV4I" /OUT:annoclip-WM.exe /MANIFEST:NO /STACK:65536,4096 /DYNAMICBASE:NO aygshell.lib coredll.lib corelibc.lib ole32.lib oleaut32.lib uuid.lib commctrl.lib
cl /D_WIN32_IE=0x0400 /D_WIN32_WCE=0x0400 /Os /Og annoclip.c
(you could try omitting /Os /Og for faster compilation,
but RAM is likely important on the Windows Mobile device)
*/
#include <stdio.h>
#include <string.h>
#define UNICODE 1 /* for TCHAR to be defined correctly */
#include <windows.h>
#ifdef near
#undef near
#endif
FILE* outFile = NULL;
unsigned char *p, *copyP, *pOrig;
#define OutWriteStr(s) fputs(s,outFile)
#define OutWriteByte(c) fputc(c,outFile)
#define NEXTBYTE (*p++)
#define NEXT_COPY_BYTE (*copyP++)
#define COPY_BYTE_SKIP copyP++
#define COPY_BYTE_SKIPN(n) copyP += (n)
#define POSTYPE unsigned char*
#define THEPOS p
#define PREVBYTE p--
#define FINISHED (!*p && !p[1])
"""
if c_filename and os.sep in c_filename: cfn = c_filename[c_filename.rindex(os.sep)+1:]
else: cfn = c_filename
if cfn: c_preamble=c_preamble.replace("annoclip.c",cfn)
c_defs = r"""static int near(char* string) {
POSTYPE o=p; if(p>pOrig+nearbytes) o-=nearbytes; else o=pOrig;
size_t l=strlen(string);
POSTYPE max=p+nearbytes-l;
while (*o && o <= max) {
if(!strncmp((char*)o,(char*)string,l)) return 1;
o++;
}
return 0;
}
"""
c_switch1=c_switch2=c_switch3=c_switch4="" # only ruby is needed by the windows_clipboard code
else:
c_preamble = r"""

Silas S. Brown
committed
#include <stdio.h>
#include <string.h>
/* To include this code in another program,
define the ifndef'd macros below + define Omit_main */

Silas S. Brown
committed
/* Default definition of NEXTBYTE etc is to read input
from stdin and write output to stdout. */

Silas S. Brown
committed
enum { Half_Bufsize = %%LONGEST_RULE_LEN%% };
static unsigned char lookahead[Half_Bufsize*2];

Silas S. Brown
committed
static size_t readPtr=0,writePtr=0,bufStart=0,bufLen=0;

Silas S. Brown
committed
static int nextByte() {

Silas S. Brown
committed
if (readPtr-bufStart +ybytes >= bufLen) {

Silas S. Brown
committed
if (bufLen == Half_Bufsize * 2) {
memmove(lookahead,lookahead+Half_Bufsize,Half_Bufsize);
bufStart += Half_Bufsize; bufLen -= Half_Bufsize;
}
bufLen += fread(lookahead+bufLen,1,Half_Bufsize*2-bufLen,stdin);

Silas S. Brown
committed
if (readPtr-bufStart == bufLen) return EOF;

Silas S. Brown
committed
}

Silas S. Brown
committed
return lookahead[(readPtr++)-bufStart];

Silas S. Brown
committed
}
static int near(char* string) {

Silas S. Brown
committed
/* for Yarowsky-like matching */

Silas S. Brown
committed
size_t offset = readPtr-bufStart, l=strlen(string),

Silas S. Brown
committed
maxPos = bufLen;

Silas S. Brown
committed
if (maxPos >= l) maxPos -= l; else return 0; // can't possibly start after maxPos-l
if (offset+nearbytes>l) {
if (maxPos > offset+nearbytes-l)
maxPos = offset+nearbytes-l;
} else maxPos = 0; // (don't let it go below 0, as size_t is usually unsigned)

Silas S. Brown
committed
if (offset>nearbytes) offset-=nearbytes; else offset = 0;

Silas S. Brown
committed
while (offset <= maxPos) {

Silas S. Brown
committed
if(!strncmp((char*)lookahead+offset,string,l)) return 1;

Silas S. Brown
committed
offset++;
}
return 0;
}
#define NEXTBYTE nextByte()

Silas S. Brown
committed
#define NEXT_COPY_BYTE lookahead[(writePtr++)-bufStart]
#define COPY_BYTE_SKIP writePtr++
#define COPY_BYTE_SKIPN(n) writePtr += (n)

Silas S. Brown
committed
#define POSTYPE size_t

Silas S. Brown
committed
#define THEPOS readPtr /* or get it via a function */
#define SETPOS(p) (readPtr=(p)) /* or set via a func */

Silas S. Brown
committed
#define PREVBYTE readPtr--
#define FINISHED (feof(stdin) && readPtr-bufStart == bufLen)

Silas S. Brown
committed
#define OutWriteStr(s) fputs(s,stdout)
#define OutWriteByte(c) putchar(c)
#endif
#ifndef Default_Annotation_Mode
#define Default_Annotation_Mode ruby_markup
#endif
enum {
annotations_only,
ruby_markup,
brace_notation} annotation_mode = Default_Annotation_Mode;
"""
c_switch1=r"""switch (annotation_mode) {
case annotations_only: OutWriteDecompress(annot); COPY_BYTE_SKIPN(numBytes); break;
case ruby_markup:"""
c_switch2=r"""break;
case brace_notation:
OutWriteByte('{');
for(;numBytes;numBytes--)
OutWriteByte(NEXT_COPY_BYTE);
OutWriteByte('|'); OutWriteDecompress(annot);
OutWriteByte('}'); break;
}"""
c_switch3 = "if (annotation_mode == ruby_markup) {"
c_switch4 = "} else o(numBytes,annot);"

Silas S. Brown
committed
if data_driven and not ndk: c_preamble += '#include <stdlib.h>\n' # for malloc (ndk includes it anyway, above)
if zlib: c_preamble += '#include "zlib.h"\n'
version_stamp = time.strftime("generated %Y-%m-%d by ")+program_name[:program_name.index("(c)")].strip()
if ios: c_name = "Objective-C"
else: c_name = "C"
c_start = "/* -*- coding: "+outcode+" -*- */\n/* "+c_name+" code "+version_stamp+" */\n"
enum { ybytes = %%YBYTES%% }; /* for Yarowsky matching, minimum readahead */
static int nearbytes = ybytes;
#define setnear(n) (nearbytes = (n))
""" + c_defs + r"""static int needSpace=0;

Silas S. Brown
committed
static void s() {
if (needSpace) OutWriteByte(' ');
else needSpace=1; /* for after the word we're about to write (if no intervening bytes cause needSpace=0) */

Silas S. Brown
committed

Silas S. Brown
committed
static void o(int numBytes,const char *annot) {

Silas S. Brown
committed
OutWriteStr("<ruby><rb>");
for(;numBytes;numBytes--)
OutWriteByte(NEXT_COPY_BYTE);
OutWriteStr("</rb><rt>"); OutWriteDecompress(annot);
OutWriteStr("</rt></ruby>"); """+c_switch2+r""" }
static void o2(int numBytes,const char *annot,const char *title) {"""+c_switch3+r"""

Silas S. Brown
committed
s();
OutWriteStr("<ruby title=\""); OutWriteDecompress(title);

Silas S. Brown
committed
OutWriteStr("\"><rb>");
for(;numBytes;numBytes--)
OutWriteByte(NEXT_COPY_BYTE);
OutWriteStr("</rb><rt>"); OutWriteDecompress(annot);

Silas S. Brown
committed
if not compress: c_start = c_start.replace("OutWriteDecompress","OutWriteStr")

Silas S. Brown
committed

Silas S. Brown
committed
c_end = r"""
void matchAll() {"""
if zlib: c_end += " if(!data) init();\n"
c_end += r""" while(!FINISHED) {

Silas S. Brown
committed
POSTYPE oldPos=THEPOS;

Silas S. Brown
committed
topLevelMatch();

Silas S. Brown
committed
if (oldPos==THEPOS) { needSpace=0; OutWriteByte(NEXTBYTE); COPY_BYTE_SKIP; }

Silas S. Brown
committed
}
# Javascript snippet: for every frame's document that has a body and is not
# already marked, insert a <span> holding the ruby <style> rules at the
# start of the body and mark the document with rubyScriptAdded=1.  The
# ANNOGEN_EXTRA_CSS environment variable, if set, is appended to the rules
# with its double and single quotes written as \\42 and \\47.
jsAddRubyCss = (
    "all_frames_docs(function(d) { if(d.rubyScriptAdded==1 || !d.body) return; var e=d.createElement('span'); e.innerHTML='<style>"
    "ruby{display:inline-table;vertical-align:bottom;-webkit-border-vertical-spacing:1px;padding-top:0.5ex;}"
    "ruby *{display: inline;vertical-align:top;line-height:1.0;text-indent:0;text-align:center;white-space:nowrap;}"
    "rb{display:table-row-group;font-size: 100%;}"
    "rt{display:table-header-group;font-size:100%;line-height:1.1;font-family: Gandhari, DejaVu Sans, Lucida Sans Unicode, Times New Roman, serif !important; }"
    + os.environ.get('ANNOGEN_EXTRA_CSS','').replace('"',r"\\42").replace("'",r"\\47")
    + "</style>'; d.body.insertBefore(e,d.body.firstChild); d.rubyScriptAdded=1 })"
)
def jsAnnot(alertStr,xtra1,xtra2,annotScan,case3):
    """Return the JavaScript source used to annotate the text of a web page.

    The platform-specific pieces are spliced into a fixed skeleton:

    alertStr  -- statements ending the body of annotPopAll(e); they follow the
                 local helper f(c), which concatenates the text of c's
                 descendant nodes.
    xtra1     -- code placed at top level, between annotPopAll and
                 all_frames_docs.
    xtra2     -- statements placed at the start of tw0(), before it calls
                 walk() on every frame's document.
    annotScan -- body of the annotScan() function; the contents of the
                 ANNOGEN_EXTRA_JS environment variable (if any) are put
                 in front of it.
    case3     -- statements run by walk() for each text node, after `cnv`
                 has been set to the node's text with U+200B removed.

    The generated walk() first drops WBR elements and SPAN elements that
    are empty or hold only whitespace when they sit between two
    non-blank text nodes, merging those text nodes; it then recurses into
    elements other than those named in leaveTags or having class '_adjust0'.
    """
    # JS regex literal for "blank text"; the backslash is doubled on purpose
    blankRe = "/^" + r"\\" + "s*$/"
    extraJs = os.environ.get("ANNOGEN_EXTRA_JS","")
    popAll = (
        "var leaveTags=['SCRIPT', 'STYLE', 'TITLE', 'TEXTAREA', 'OPTION'];"
        "function annotPopAll(e) { function f(c) { var i=0,r='',cn=c.childNodes; for(;i < cn.length;i++) r+=(cn[i].firstChild?f(cn[i]):(cn[i].nodeValue?cn[i].nodeValue:'')); return r; } "
        + alertStr + " }; ")
    framesFn = " function all_frames_docs(c) { var f=function(w){if(w.frames && w.frames.length) { var i; for(i=0; i<w.frames.length; i++) f(w.frames[i]) } c(w.document) }; f(window) }; "
    tw0Fn = "function tw0() { " + xtra2 + "all_frames_docs(function(d){walk(d,d,false)}) }; "
    scanFn = "function annotScan() {" + extraJs + annotScan + "}; "
    # first pass of walk(): join text nodes split by WBR / blank SPAN
    mergePass = (
        "function walk(n,document,inLink) { var c=n.firstChild; while(c) { var ps = c.previousSibling, cNext = c.nextSibling; "
        "function isTxt(n) {return n && n.nodeType==3 && n.nodeValue && !n.nodeValue.match(" + blankRe + ")}; "
        "if (c.nodeType==1 && (c.nodeName=='WBR' || (c.nodeName=='SPAN' && c.childNodes.length<=1 && (!c.firstChild || (c.firstChild.nodeValue && c.firstChild.nodeValue.match(" + blankRe + "))))) && isTxt(cNext) && isTxt(ps)) "
        "{ n.removeChild(c); cNext.previousSibling.nodeValue += cNext.nodeValue; n.removeChild(cNext); cNext = ps } c=cNext; } ")
    # second pass of walk(): recurse into elements, hand text nodes to case3
    annotatePass = (
        "c=n.firstChild; while(c) { var cNext = c.nextSibling; switch (c.nodeType) { "
        "case 1: if (leaveTags.indexOf(c.nodeName)==-1 && c.className!='_adjust0') walk(c,document,inLink||(c.nodeName=='A'&&!!c.href)); break; "
        "case 3: {var cnv=c.nodeValue.replace(/\u200b/g,'');" + case3 + "} } c=cNext } }")
    return popAll + xtra1 + framesFn + tw0Fn + scanFn + mergePass + annotatePass
if ios:
c_end += r"""
@interface ViewController : UIViewController <UIWebViewDelegate>
@property (nonatomic,retain) UIWebView *myWebView;
@end
@implementation ViewController
- (void)viewDidLoad {
[super viewDidLoad];
self.myWebView = [[UIWebView alloc] initWithFrame:CGRectMake(10, 20, 300,500)];
self.myWebView.backgroundColor = [UIColor whiteColor];
self.myWebView.scalesPageToFit = YES;
self.myWebView.autoresizingMask = (UIViewAutoresizingFlexibleWidth | UIViewAutoresizingFlexibleHeight);
self.myWebView.delegate = self;
[self.view addGestureRecognizer:[[UISwipeGestureRecognizer alloc] initWithTarget:self action:@selector(swipeBack:)]];
[self loadInitialPage];
}
- (void)loadInitialPage {
"""
ios=ios.replace('\\','\\\\').replace('"','\\"').replace('\n','\\n')
if ios.startswith('<'): c_end += '[self.myWebView loadHTMLString:@"'+ios+'" baseURL:nil];'
# TODO: 'file from local project' option? for now, anything that doesn't start with < is taken as URL
else:
assert "://" in ios, "not an HTML fragment and doesn't look like a URL"
c_end += '[self.myWebView loadRequest:[[NSURLRequest alloc] initWithURL:[[NSURL alloc] initWithString:@"'+ios+'"]]];'
c_end += r"""
}
-(void)swipeBack:(UISwipeGestureRecognizer *)recognizer {
if (recognizer.state == UIGestureRecognizerStateEnded) {
if ([self.myWebView canGoBack]) [self.myWebView goBack];
else [self loadInitialPage];
}
}
- (void)webViewDidFinishLoad:(UIWebView *)webView
{
[webView stringByEvaluatingJavaScriptFromString:@" """+jsAnnot("window.alertTitle=f(e.firstChild)+' '+f(e.firstChild.nextSibling); window.alertMessage=e.title; window.location='alert:a'","var texts,tLen,oldTexts,otPtr,replacements; ","texts = new Array(); tLen=0; otPtr=0; ","oldTexts = new Array(); replacements = new Array(); tw0(); window.location='scan:a'",r"""var i=otPtr;while (i<oldTexts.length && oldTexts[i]!=cnv) i++;if(i<replacements.length) {var newNode=document.createElement('span');newNode.className='_adjust0';n.replaceChild(newNode, c);var r=replacements[i]; if(!inLink) r=r.replace(/<ruby title=/g,'<ruby onclick=\"annotPopAll(this)\" title=');newNode.innerHTML=r; otPtr=i;} else if (tLen < 1024) { texts[texts.length]=cnv;tLen += cnv.length;} else return""")+r"""annotScan()"];
}
- (BOOL)webView:(UIWebView*)webView shouldStartLoadWithRequest:(NSURLRequest*)request navigationType:(UIWebViewNavigationType)navigationType {
NSURL *URL = [request URL];
if ([[URL scheme] isEqualToString:@"alert"]) {
[[[UIAlertView alloc] initWithTitle:[self.myWebView stringByEvaluatingJavaScriptFromString:@"window.alertTitle"] message:[self.myWebView stringByEvaluatingJavaScriptFromString:@"window.alertMessage"] delegate: self cancelButtonTitle: nil otherButtonTitles: @"OK",nil, nil] show];
return NO;
} else if ([[URL scheme] isEqualToString:@"clip"]) {
[self.myWebView loadHTMLString:[@"<html><head><meta name=\"mobileoptimized\" content=\"0\"><meta name=\"viewport\" content=\"width=device-width\"></head><body>" stringByAppendingString:[UIPasteboard generalPasteboard].string] baseURL:nil]; // TODO: make the string HTML-safe and refresh it if clipboard changes, like the Android version does via JS
} else if ([[URL scheme] isEqualToString:@"scan"]) {
NSString *texts=[self.myWebView stringByEvaluatingJavaScriptFromString:@"texts.join('/@@---------@@/')"];
startPtr = [texts UTF8String]; readPtr = startPtr; writePtr = startPtr;
outBytes = [NSMutableData alloc]; matchAll(); OutWriteByte(0);
if([texts length]>0) [self.myWebView stringByEvaluatingJavaScriptFromString:[@"replacements=\"" stringByAppendingString:[[[[[[NSString alloc] initWithUTF8String:[outBytes bytes]] stringByReplacingOccurrencesOfString:@"\\" withString:@"\\\\"] stringByReplacingOccurrencesOfString:@"\"" withString:@"\\\""] stringByReplacingOccurrencesOfString:@"\n" withString:@"\\n"] stringByAppendingString:@"\".split('/@@---------@@/');oldTexts=texts;tw0();"""+jsAddRubyCss+r""""]]];

Silas S. Brown
committed
[self.myWebView stringByEvaluatingJavaScriptFromString:@"if(typeof window.sizeChangedLoop=='undefined') window.sizeChangedLoop=0; var me=++window.sizeChangedLoop; var getLen = function(w) { var r=0; if(w.frames && w.frames.length) { var i; for(i=0; i<w.frames.length; i++) r+=getLen(w.frames[i]) } if(w.document && w.document.body && w.document.body.innerHTML) r+=w.document.body.innerHTML.length; return r }; var curLen=getLen(window), stFunc=function(){window.setTimeout(tFunc,1000)}, tFunc=function(){if(window.sizeChangedLoop==me){if(getLen(window)==curLen) stFunc(); else annotScan()}}; stFunc(); var m=window.MutationObserver||window.WebKitMutationObserver; if(m) new m(function(mut,obs){if(mut[0].type=="childList"){obs.disconnect();if(window.sizeChangedLoop==me)annotScan()}}).observe(document.body,{childList:true,subtree:true})"]; // HTMLSizeChanged(annotScan)
return NO;
}
return YES;
}
@end
"""
elif ndk: c_end += """
EOF
ndk-build
mv -f libs/armeabi/Annotator.so libs/armeabi/libAnnotator.so >/dev/null 2>/dev/null || true
"""
#ifdef _WINCE
#define CMD_LINE_T LPWSTR
#else
#define CMD_LINE_T LPSTR
#endif
static void errorExit(char* text) {
TCHAR msg[500];
DWORD e = GetLastError();
wsprintf(msg,TEXT("%s: %d"),text,e);
MessageBox(NULL, msg, TEXT("Error"), 0);
exit(1);

Silas S. Brown
committed
}
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, CMD_LINE_T cmdLinePSTR, int iCmdShow)
{
TCHAR *className = TEXT("annogen");
WNDCLASS wndclass;
memset(&wndclass, 0, sizeof(wndclass));
wndclass.hInstance = hInstance;
wndclass.lpfnWndProc = DefWindowProc;
wndclass.lpszClassName = className;
if (!RegisterClass(&wndclass)) errorExit("RegisterClass");
#ifndef WS_OVERLAPPEDWINDOW
#define WS_OVERLAPPEDWINDOW (WS_OVERLAPPED | \
WS_CAPTION | \
WS_SYSMENU | \
WS_THICKFRAME | \
WS_MINIMIZEBOX | \
WS_MAXIMIZEBOX)
#endif
HWND win = CreateWindow(className,className, WS_OVERLAPPEDWINDOW,CW_USEDEFAULT, CW_USEDEFAULT,CW_USEDEFAULT,CW_USEDEFAULT, NULL,NULL,hInstance, NULL);
if (!win) errorExit("CreateWindow");
// ShowWindow(win, SW_SHOW); // not needed
HANDLE hClipMemory;
if (!OpenClipboard(win)) errorExit("OpenClipboard");
hClipMemory = GetClipboardData(CF_UNICODETEXT);
if(!hClipMemory) errorExit("GetClipboardData");
TCHAR*u16 = (TCHAR*)GlobalLock(hClipMemory);