program_name = "Web Adjuster v0.273 (c) 2012-18 Silas S. Brown"

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# If you want to compare this code to old versions, the old
# versions are being kept in the E-GuideDog SVN repository on
# http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster
# and on GitHub at https://github.com/ssb22/adjuster
# and on GitLab at https://gitlab.com/ssb22/adjuster
# and on BitBucket https://bitbucket.org/ssb22/adjuster
# although some early ones are missing.

twoline_program_name = program_name+"\nLicensed under the Apache License, Version 2.0"
# --------------------------------------------------
# Basic Tornado import (or not if generating my website)
# --------------------------------------------------

import sys

if '--version' in sys.argv:
    print twoline_program_name ; raise SystemExit # no imports needed
elif '--html-options' in sys.argv: # for updating the website (this option is not included in the help text)
    tornado=inDL=False
    print "<h3>Options for "+program_name[:program_name.index("(c)")].strip()+"</h3>"
    def heading(h):
        global inDL
        if inDL: print "</dl>"
        print "<h4>"+h+"</h4>"
        print "<dl>"
        inDL = True
    def define(name,default=None,help="",multiple=False):
        if default or default==False:
            if type(default)==type(""): default=default.replace(",",", ").replace("  "," ")
            else: default=repr(default)
            default=" (default "+default+")"
        else: default=""
        def amp(h): return h.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
        help = amp(help)
        for ttify in ["option=\"value\"","option='value'","\"\"\"","--"]: help=help.replace(ttify,"<nobr><kbd>"+ttify+"</kbd></nobr>")
        print "<dt><kbd>--"+name+"</kbd>"+amp(default)+"</dt><dd>"+help.replace(" - ","---")+"</dd>"
else: # normal run: go ahead with the import
    from tornado.httpclient import AsyncHTTPClient,HTTPClient,HTTPError
    try: from tornado.httpserver import HTTPServer
    except: HTTPServer = None # may happen in WSGI mode (e.g. AppEngine can have trouble importing this)
    from tornado.web import Application, RequestHandler, StaticFileHandler, asynchronous
    import tornado.options, tornado.iostream
    from tornado.options import define,options
    def heading(h): pass
    if 'port' in options:
        # Looks like we're being imported by an extension
        # Some Tornado versions don't compile if 'define' is run twice
        def define(*args,**kwargs): pass
getfqdn_default = "is the machine's domain name" # default is ... (avoid calling getfqdn unnecessarily, as the server might be offline/experimental and we don't want to block on an nslookup with every adjuster start)
# --------------------------------------------------
# Options and help text
# --------------------------------------------------

heading("General options")
define("config",help="Name of the configuration file to read, if any. The process's working directory will be set to that of the configuration file so that relative pathnames can be used inside it. Any option that would otherwise have to be set on the command line may be placed in this file as an option=\"value\" or option='value' line (without any double-hyphen prefix). Multi-line values are possible if you quote them in \"\"\"...\"\"\", and you can use standard \\ escapes. You can also set config= in the configuration file itself to import another configuration file (for example if you have per-machine settings and global settings). If you want there to be a default configuration file without having to set it on the command line every time, an alternative option is to set the ADJUSTER_CFG environment variable.")
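# For illustration, a small configuration file might look like this
# (the domain names and values are hypothetical, not defaults):
#   host_suffix="adjuster.example.org"
#   default_site="example.com"
#   headAppendCSS="http://example.org/extra.css"
# and could then be loaded with:  python adjuster.py --config=adjuster.conf
# (or by setting the ADJUSTER_CFG environment variable to adjuster.conf).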
define("version",help="Just print program version and exit")

heading("Network listening and security settings")
define("port",default=28080,help="The port to listen on. Setting this to 80 will make it the main Web server on the machine (which will likely require root access on Unix); setting it to 0 disables request-processing entirely (for if you want to use only the Dynamic DNS and watchdog options); setting it to -1 selects a local port in the ephemeral port range, in which case address and port will be written in plain form to standard output if it's not a terminal and --background is set (see also --just-me).")
# e.g. to run over an SSH tunnel, where you can't reserve a port number on the remote machine but can use a known port on the local machine:
# ssh -N -L 28080:$(ssh MachineName python adjuster.py --background --port=-1 --publicPort=28080 --just-me --restart --pidfile=adjuster.pid) MachineName
# This can be combined with --one-request-only (inefficient!) if you don't want the process to hang around afterwards, e.g. from an inetd script on your local port 28080:
# ssh MachineName 'python adjuster.py --background --port=-1 --publicPort=28080 --just-me --one-request-only --seconds=60 --stdio 2>/dev/null'
# You probably want to set up a ControlPath if repeatedly SSH'ing.
# 
define("publicPort",default=0,help="The port to advertise in URLs etc, if different from 'port' (the default of 0 means no difference). Used for example if a firewall prevents direct access to our port but some other server has been configured to forward incoming connections.")
define("user",help="The user name to run as, instead of root. This is for Unix machines where port is less than 1024 (e.g. port=80) - you can run as root to open the privileged port, and then drop privileges. Not needed if you are running as an ordinary user.")
define("address",default="",help="The address to listen on. If unset, will listen on all IP addresses of the machine. You could for example set this to localhost if you want only connections from the local machine to be received, which might be useful in conjunction with --real_proxy.")
define("password",help="The password. If this is set, nobody can connect without specifying ?p= followed by this password. It will then be sent to them as a cookie so they don't have to enter it every time. Notes: (1) If wildcard_dns is False and you have multiple domains in host_suffix, then the password cookie will have to be set on a per-domain basis. (2) On a shared server you probably don't want to specify this on the command line where it can be seen by process-viewing tools; use a configuration file instead.")
define("password_domain",help="The domain entry in host_suffix to which the password applies. For use when wildcard_dns is False and you have several domains in host_suffix, and only one of them (perhaps the one with an empty default_site) is to be password-protected, with the others public. If this option is used then prominentNotice (if set) will not apply to the passworded domain. You may put the password on two or more domains by separating them with slash (/).") # prominentNotice not apply: on the assumption that those who know the password understand what the tool is.  DOES apply anyway if =="htmlFilter".
define("auth_error",default="Authentication error",help="What to say when password protection is in use and a correct password has not been entered. HTML markup is allowed in this message. As a special case, if this begins with http:// or https:// then it is assumed to be the address of a Web site to which the browser should be redirected; if it is set to http:// and nothing else, the request will be passed to the server specified by own_server (if set). If the markup begins with a * then the * is ignored and the page is returned with code 200 (OK) instead of 401 (authorisation required).") # TODO: basic password form? or would that encourage guessing
define("open_proxy",default=False,help="Whether or not to allow running with no password. Off by default as a safeguard against accidentally starting an open proxy.")
define("prohibit",multiple=True,default="wiki.*action=edit",help="Comma-separated list of regular expressions specifying URLs that are not allowed to be fetched unless --real_proxy is in effect. Browsers requesting a URL that contains any of these will be redirected to the original site. Use for example if you want people to go direct when posting their own content to a particular site (this is of only limited use if your server also offers access to any other site on the Web, but it might be useful when that's not the case). Include ^https in the list to prevent Web Adjuster from fetching HTTPS pages for adjustment and return over normal HTTP. This access is enabled by default now that many sites use HTTPS for public pages that don't really need to be secure, just to get better placement on some search engines, but if sending confidential information to the site then beware you are trusting the Web Adjuster machine and your connection to it, plus its certificate verification might not be as thorough as your browser's.")
define("real_proxy",default=False,help="Whether or not to accept requests with original domains like a \"real\" HTTP proxy.  Warning: this bypasses the password and implies open_proxy.  Off by default.")
define("via",default=True,help="Whether or not to update the Via: and X-Forwarded-For: HTTP headers when forwarding requests") # (Via is "must" in RFC 2616)
define("uavia",default=True,help="Whether or not to add to the User-Agent HTTP header when forwarding requests, as a courtesy to site administrators who wonder what's happening in their logs (and don't log Via: etc)")
define("robots",default=False,help="Whether or not to pass on requests for /robots.txt.  If this is False then all robots will be asked not to crawl the site; if True then the original site's robots settings will be mirrored.  The default of False is recommended.") # TODO: do something about badly-behaved robots ignoring robots.txt? (they're usually operated by email harvesters etc, and start crawling the web via the proxy if anyone "deep links" to a page through it, see comments in request_no_external_referer)
define("just_me",default=False,help="Listen on localhost only, and check incoming connections with an ident server (which must be running on port 113) to ensure they are coming from the same user.  This is for experimental setups on shared Unix machines; might be useful in conjunction with --real_proxy.")
define("one_request_only",default=False,help="Shut down after handling one request.  This is for use in inefficient CGI-like environments where you cannot leave a server running permanently, but still want to start one for something that's unsupported in WSGI mode (e.g. js_reproxy): run with --one_request_only and forward the request to its port.  You may also wish to set --seconds if using this.")
define("seconds",default=0,help="The maximum number of seconds for which to run the server (0 for unlimited).  If a time limit is set, the server will shut itself down after the specified length of time.")
define("stdio",default=False,help="Forward standard input and output to our open port, in addition to being open to normal TCP connections.  This might be useful in conjunction with --one-request-only and --port=-1.")
define("upstream_proxy",help="address:port of a proxy to send our requests through. This can be used to adapt existing proxy-only mediators to domain rewriting, or for a caching proxy. Not used for ip_query_url options, own_server or fasterServer. If address is left blank (just :port) then localhost is assumed and https URLs will be rewritten into http with altered domains; you'll then need to set the upstream proxy to send its requests back through the adjuster (which will listen on localhost:port+1 for this purpose) to undo that rewrite. This can be used to make an existing HTTP-only proxy process HTTPS pages.") # The upstream_proxy option requires pycurl (will refuse to start if not present). Does not set X-Real-Ip because Via should be enough for upstream proxies. The ":port"-only option rewrites URLs in requests but NOT ones referred to in documents: we assume the proxy can cope with that.
define("ip_messages",help="Messages or blocks for specific IP address ranges (IPv4 only).  Format is ranges|message|ranges|message etc, where ranges are separated by commas; can be individual IPs, or ranges in either 'network/mask' or 'min-max' format; the first matching range-set is selected.  If a message starts with * then its ranges are blocked completely (rest of message, if any, is sent as the only reply to any request), otherwise message is shown on a 'click-through' page (requires Javascript and cookies).  If the message starts with a hyphen (-) then it is considered a minor edit of earlier messages and is not shown to people who selected `do not show again' even if they did this on a different version of the message.  Messages may include HTML.")
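# A hypothetical example of the ip_messages format (addresses illustrative):
#   --ip_messages="192.0.2.0/24,198.51.100.1-198.51.100.20|This service is experimental|203.0.113.5|*Access denied"
# shows a click-through notice to the first two ranges, and blocks
# 203.0.113.5 completely with "Access denied" as the only reply.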

define("host_suffix",default=getfqdn_default,help="The last part of the domain name. For example, if the user wishes to change www.example.com and should do so by visiting www.example.com.adjuster.example.org, then host_suffix is adjuster.example.org. If you do not have a wildcard domain then you can still adjust one site by setting wildcard_dns to False, host_suffix to your non-wildcard domain, and default_site to the site you wish to adjust. If you have more than one non-wildcard domain, you can set wildcard_dns to False, host_suffix to all your domains separated by slash (/), and default_site to the sites these correspond to, again separated by slash (/); if two or more domains share the same default_site then the first is preferred in links and the others are assumed to be for backward compatibility. If wildcard_dns is False and default_site is empty (or if it's a /-separated list and one of its items is empty), then the corresponding host_suffix gives a URL box and sets its domain in a cookie (and adds a link at the bottom of pages to clear this and return to the URL box), but this should be done only as a last resort: you can browse only one domain at a time at that host_suffix; most links and HTTP redirects to other domains will leave the adjuster when not in HTML-only mode, which can negatively affect sites that use auxiliary domains for scripts etc and check Referer (unless you ensure these auxiliary domains are listed elsewhere in default_site). Also, the sites you visit at that host_suffix might be able to see some of each other's cookies etc (leaking privacy) although the URL box page will try to clear site cookies.")
# ("preferred" / "backward compatibility" thing: can be useful if old domain has become unreliable, or if "preferred" domain is actually a URL-path-forwarding service with a memorable name which redirects browsers to an actual domain that's less memorable, and you want the memorable domain to be used in links etc, although in this case you might still get the less-memorable domain in the address bar)
# TODO: (two or more domains pointing to the same default_site) "preferred" / "backward compatibility" thing above: or, add an option to periodically check which of our domains are actually 'up' and move them to the front of the host_suffix / default_site list; that way we don't have to guess ahead of time which one is more reliable and should be preferred.
# Could also do 'use the currently-requested host if it's appropriate', but what if there's a *set* of sites we adjust and we need to try to rewrite cross-site links to be in the same set of domains as the one the browser is requesting - maybe it's best to leave the "preferred" DNS to the config or the periodic check.
# TODO at lower priority: empty (item in) host_suffix to match ALL (unknown) hosts, including IP hosts and no Host: header.  Fetch the corresponding default_site (empty means use cookies), and adjust it USING THE HOST SPECIFIED BY THE BROWSER to rewrite the links.  This could be useful if setting up an adjuster with NO domain name (IP only).  Could periodically upload our public IP to a separate static website via FTP/SSH/etc in case dynamic DNS is not reliable.  But if IP address has to change then all cookies would be 'lost'.  Also, if no password is set then IP-based "webserver probes" could cause us to send malicious-looking traffic to default_site.
# TODO: Could do different hosts on different ports, which might also be useful if you have only one domain name.  Would have to check for cookie sharing (or just say "do this only if you don't mind it"); fasterServer would have to forward to same as incoming port.  Might be a problem if some users' firewalls disallow outgoing Web traffic to non-standard ports.
# (In the current code, setting host_suffix to a public IP address should work: most browsers set Host: to the IP if requesting a URL by IP, and then the IP will be used in rewrites if it's the first thing specified for its corresponding default_site.  But adjuster will need to be reconfigured and restarted on every change of the public IP.)
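# For illustration, a two-domain non-wildcard setup (hypothetical domains)
# could be run as:
#   python adjuster.py --wildcard_dns=False --host_suffix=adjuster.example.org/adjuster2.example.org --default_site=example.com/example.net
# so that example.com is adjusted when visiting adjuster.example.org and
# example.net when visiting adjuster2.example.org.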
define("default_site",help="The site to fetch from if nothing is specified before host_suffix, e.g. example.org (add .0 at the end to specify an HTTPS connection, but see the 'prohibit' option). If default_site is omitted then the user is given a URL box when no site is specified; if it is 'error' then an error is shown in place of the URL box (the text of the error depends on the settings of wildcard_dns and real_proxy).") # using .0 here rather than https:// prefix because / is a separator: see the host_suffix help text (TODO: change the separator? but don't break existing installations)
define("own_server",help="Where to find your own web server. This can be something like localhost:1234 or 192.168.0.2:1234. If it is set, then any request that does not match host_suffix will be passed to that server to deal with, unless real_proxy is in effect. You can use this option to put your existing server on the same public port without much reconfiguration. Note: the password option will NOT password-protect your own_server. (You might gain a little responsiveness if you instead set up nginx or similar to direct incoming requests appropriately; see comments in adjuster.py for example nginx settings.)")
# without much reconfiguration: might just need to change which port number it listens on.
# Alternatively you could set nginx (or similar) to reverse-proxy the host_suffix domains to the adjuster, e.g.:
# location / {
#   proxy_set_header X-Real-Ip $remote_addr;
#   proxy_set_header Host $host;
#   proxy_pass_header Server;
#   access_log off;
#   proxy_pass http://localhost:<YOUR-ADJUSTER-PORT-HERE>;
#   proxy_max_temp_file_size 0;
#   proxy_read_timeout 130s;  # or whatever; default 60s
#        # - may need to be longer, especially if using
#        #    file conversion with waitpage=False on a
#        #    low-powered server and there are big files
# }
# inside a "server" block with appropriate server_name(s)
# (and set ipTrustReal to 127.0.0.1 in Adjuster's config,
# and set publicPort to the port nginx runs on e.g. 80),
# but if you're not already using nginx then you either
# have to port your existing server to nginx or get
# nginx to reverse-proxy for your other server, so for
# small installations it might be simpler just to set
# own_server, unless it's vitally important that
# own_server is not held up in any way when the adjuster
# is under heavy CPU load.

define("ownServer_regexp",help="If own_server is set, you can set ownServer_regexp to a regular expression to match URL prefixes which should always be handled by your own server even if they match host_suffix. This can be used for example to add extra resources to any site, or to serve additional pages from the same domain, as long as the URLs used are not likely to occur on the sites being adjusted. The regular expression is matched against the requested host and the requested URL, so for example [^/]*/xyz will match any URL starting with /xyz on any host, whereas example.org/xyz will match these on your example.org domain. You can match multiple hosts and URLs by using regular expression grouping.")
define("ownServer_if_not_root",default=True,help="When trying to access an empty default_site, if the path requested is not / then redirect to own_server (if set) instead of providing a URL box. If this is False then the URL box will be provided no matter what path was requested.") # TODO: "ownServer even if root" option, i.e. option to make host_suffix by itself go to own_server?  Or make ownServer_if_not_root permanent?  The logic that deals with off-site Location: redirects assumes the URL box will normally be at / (TODO document this?)
define('search_sites',multiple=True,help="Comma-separated list of search sites to be made available when the URL box is displayed (if default_site is empty). Each item in the list should be a URL (which will be prepended to the search query), then a space, then a short description of the site. The first item on the list is used by default; the user can specify other items by making the first word of their query equal to the first word of the short description. Additionally, if some of the letters of that first word are in parentheses, the user may specify just those letters. So for example if you have an entry http://search.example.com/?q= (e)xample, and the user types 'example test' or 'e test', it will use http://search.example.com/?q=test")
define("urlbox_extra_html",help="Any extra HTML you want to place after the URL box (when shown), such as a paragraph explaining what your filters do etc.")
define("urlboxPath",default="/",help="The path of the URL box for use in links to it. This might be useful for wrapper configurations, but a URL box can be served from any path on the default domain. If however urlboxPath is set to something other than / then efforts are made to rewrite links to use it more often when in HTML-only mode with cookie domain, which might be useful for limited-server situations. You can force HTML-only mode to always be on by prefixing urlboxPath with *")
define("wildcard_dns",default=True,help="Set this to False if you do NOT have a wildcard domain and want to process only default_site. Setting this to False does not actually prevent other sites from being processed (for example, a user could override their local DNS resolver to make up for your lack of wildcard domain); if you want to really prevent other sites from being processed then you could also set own_server to deal with unrecognised domains. Setting wildcard_dns to False does stop the automatic re-writing of links to sites other than default_site. Leave it set to True to have ALL sites' links rewritten on the assumption that you have a wildcard domain.") # will then say "(default True)"

heading("General adjustment options")
define("default_cookies",help="Semicolon-separated list of name=value cookies to send to all remote sites, for example to set preferences. Any cookies that the browser itself sends will take priority over cookies in this list. Note that these cookies are sent to ALL sites. You can set a cookie only on a specific browser by putting (browser-string) before the cookie name, e.g. (iPad)x=y will set x=y only if 'iPad' occurs in the browser string (to match more than one browser-string keyword, you have to specify the cookie multiple times).") # TODO: site-specific option
# TODO: sets of adjustments can be switched on and off at a /__settings URL ?  or leave it to the injected JS
define("headAppend",help="Code to append to the HEAD section of every HTML document that has a BODY. Use for example to add your own stylesheet links and scripts. Not added to documents that lack a BODY such as framesets.")
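# For example (hypothetical URL), a configuration file could contain
#   headAppend="""<script src="http://example.org/my-tools.js"></script>"""
# to load that script on every adjusted page that has a BODY.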
define("headAppendCSS",help="URL of a stylesheet to add to the HEAD section of every HTML document that has a BODY.  This option automatically generates the LINK REL=... markup for it, and also tries to delete the string '!important' from other stylesheets, to emulate setting this stylesheet as a user CSS.  Additionally, it is not affected by --js-upstream as headAppend is.  You can also include one or more 'fields' in the URL, by marking them with %s and following the URL with options e.g. http://example.org/style%s-%s.css;1,2,3;A,B will allow combinations like style1-A.css or style3-B.css; in this case appropriate selectors are provided with the URL box (values may optionally be followed by = and a description), and any visitors who have not set their options will be redirected to the URL box to do so.")
define("protectedCSS",help="A regular expression matching URLs of stylesheets which are \"protected\" from having their '!important' strings deleted by headAppendCSS's logic. This can be used for example if you are adding scripts to allow the user to choose alternate CSS files in place of headAppendCSS, and you wish the alternate CSS files to have the same status as the one supplied in headAppendCSS.")
define("cssName",help="A name for the stylesheet specified in headAppendCSS, such as \"High Contrast\".  If cssName is set, then the headAppendCSS stylesheet will be marked as \"alternate\", with Javascript links at the bottom of the page for browsers that lack their own CSS switching options.  If cssName begins with a * then the stylesheet is switched on by default; if cssName is not set then the stylesheet (if any) is always on.")
define("cssNameReload",multiple=True,default="IEMobile 6,IEMobile 7,IEMobile 8,Opera Mini,Opera Mobi,rekonq",help="List of (old) browsers that require alternate code for the cssName option, which is slower as it involves reloading the page on CSS switches.  Use this if the CSS switcher provided by cssName does nothing on your browser.") # Opera Mini sometimes worked and sometimes didn't; maybe there were regressions at their proxy; JS switcher needs network traffic anyway on Opera Mini so we almost might as well use the reloading version (but in Spring 2014 they started having trouble with reload() AS WELL, see cssReload_cookieSuffix below)
# Opera Mobile 10 on WM6.1 is fine with CSS switcher but it needs cssHtmlAttrs, TODO we might be able to have a list of browsers that require cssHtmlAttrs but not cssNameReload, add cssHtmlAttrs only if CSS is selected at time of page load, and make the 'off' switch remove them
# TODO: Opera/9.5 on WM6.1 document.write can corrupt the display with EITHER script; page might also display for some time before the document.writes take effect.  Suggest those users upgrade to version 10 (= Opera/9.8) ?
cssReload_cookieSuffix = "&&_adjuster_setCookie:" # enables code that works better on Opera Mini's transcoder (Spring 2014) by setting the cookie server-side. (Set to blank to use the old code. TODO: browser-dependent? make it a 'define' option?)
define("cssHtmlAttrs",help="Attributes to add to the BODY element of an HTML document when cssNameReload is in effect (or when it would be in effect if cssName were set). This is for old browsers that try to render the document first and apply CSS later. Example: 'text=\"yellow\" bgcolor=\"black\"' (not as flexible as CSS but can still make the rendering process less annoying). If headAppendCSS has \"fields\" then cssHtmlAttrs can list multiple sets of attributes separated by ; and each set corresponds with an option in the last field of headAppendCSS.") # e.g. IEMobile 7 (or Opera 10) on WM 6.1
define("headAppendRuby",default=False,help="Convenience option which adds CSS and Javascript code to the HTML body that tries to ensure simple RUBY markup displays legibly across all modern browsers; this might be useful if you used Annotator Generator to make the htmlFilter program. (The option is named 'head' because it used to add markup to the HEAD; this was moved to the BODY to work around browser bugs.)") # IEMobile 6 drops whitespace after closing tags if document HEAD contains any STYLE element, even an empty one, except via link rel=Stylesheet. Style element works OK if placed at start of body.
define("bodyAppend",help="Code to append to the BODY section of every HTML document that has one. Use for example to add a script that needs to be run after the rest of the body has been read, or to add a footer explaining how the page has been modified. See also prominentNotice.") # TODO: note that it will go at the bottom of IFRAMEs also, and suggest using something similar to prominentNotice's iframe-detection code?
define("bodyAppendGoesAfter",help="If this is set to a regular expression matching some text or HTML code that appears verbatim in the body section, the code in bodyAppend will be inserted after the last instance of this regular expression (case sensitive) instead of at the end of the body. Use for example if a site styles its pages such that the end of the body is not a legible place for a footer.") # (e.g. it would overprint some position=fixed stuff)
define("bodyPrepend",help="Code to place at the start of the BODY section of every HTML document that has one.") # May be a useful place to put some scripts. For example, a script that changes a low-vision stylesheet according to screen size might be better in the BODY than in the HEAD, because some Webkit-based browsers do not make screen size available when processing the HEAD of the starting page. # but sometimes it still goes wrong on Chromium startup; probably a race condition; might be worth re-running the script at end of page load just to make sure
define("prominentNotice",help="Text to add as a prominent notice to processed sites (may include HTML). If the browser has sufficient Javascript support, this will float relative to the browser window and will contain an 'acknowledge' button to hide it (for the current site in the current browsing session). Use prominentNotice if you need to add important information about how the page has been modified. If you set prominentNotice to the special value \"htmlFilter\", then the output of the htmlFilter option (if any) will be placed as a prominent notice; this can be used if you want to provide extra information or links derived from the content of the page. Note: if you include Javascript document.write() code in prominentNotice, check that document.readyState is not 'complete' or you might find the document is erased on some website/browser combinations when a site script somehow causes your script to be re-run after the document stream is closed. In some rare cases you might also need to verify that document.cookie does not contain _WA_warnOK=1") # e.g. if the site does funny things with the browser cache.  Rewriting the innerHTML manipulation to appendChild doesn't fix the need to check document.readyState
define("staticDocs",help="url#path of static documents to add to every website, e.g. /_myStatic/#/var/www (make sure the first part is something not likely to be used by the websites you visit). This can be used to supply extra Javascript (e.g. for bodyPrepend to load) if it needs to be served from the same domain. Note: staticDocs currently overrides the password and own_server options.")
define("delete",multiple=True,help="Comma-separated list of regular expressions to delete from HTML documents. Can be used to delete selected items of Javascript and other code if it is causing trouble for your browser. Will also delete from the text of pages; use with caution.")
define("delete_css",multiple=True,help="Comma-separated list of regular expressions to delete from CSS documents (but not inline CSS in HTML); can be used to remove, for example, dimension limits that conflict with annotations you add, as an alternative to inserting CSS overrides.  In rare cases you might want to replace the deleted regexp with another, in which case you can use @@ to separate the two, and a second @@ can be used to specify a string in the CSS URL that must be present for the operation to take effect (this could be combined with a codeChanges to add query parameters to the URL if you want the change to occur only when the CSS is loaded from specific HTML pages).")
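# A hypothetical example of the regexp@@replacement@@URL-string form:
#   --delete_css="max-width: ?70ex;@@max-width:none;@@theme.css"
# replaces that max-width rule with max-width:none, but only in CSS files
# whose URL contains theme.css.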
define("delete_doctype",default=False,help="Delete the DOCTYPE declarations from HTML pages. This option is needed to get some old Webkit browsers to apply multiple CSS files consistently.")
define("deleteOmit",multiple=True,default="iPhone,iPad,Android,Macintosh",help="A list of browsers that do not need the delete and delete-doctype options to be applied. If any of these strings occur in the user-agent then these options are disabled for that request, on the assumption that these browsers are capable enough to cope with the \"problem\" code. Any delete-css option is still applied however.")
define("cacheOmit",multiple=True,default="IEMobile",help="A list of browsers that cannot be trusted to provide correct Cache-Control headers. Use this if your browser fails to renew data when you press Reload.") # e.g. IE6 on WM6.1 sets Cache-Control to "max-age=259200" (3 days) even if you press Reload, which can result in upstream caching proxies (e.g. on AppEngine) failing to re-query the original servers on a reload (e.g. for NextBuses, frustrating if you're trying to decide whether or not you have to run!)
define("codeChanges",help="Several lines of text specifying changes that are to be made to all HTML and Javascript code files on certain sites; use as a last resort for fixing a site's scripts. This option is best set in the configuration file and surrounded by r\"\"\"...\"\"\". The first line is a URL prefix (just \"http\" matches all); append a # to match an exact URL instead of a prefix, and #+number (e.g. #1 or #2) to match an exact URL and perform the change only that number of times in the page.  The second line is a string of code to search for, and the third is a string to replace it with. Further groups of URL/search/replace lines may follow; blank lines and lines starting with # are ignored. If the 'URL prefix' starts with a * then it is instead a string to search for within the code of the document body; any documents containing this code will match; thus it's possible to write rules of the form 'if the code contains A, then replace B with C'. This processing takes place before any 'delete' option takes effect so it's possible to pick up on things that will be deleted, and it occurs after the domain rewriting so it's possible to change rewritten domains in the search/replace strings (but the URL prefix above should use the non-adjusted version).")
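# A hypothetical codeChanges value for a configuration file (the URL and
# the search/replace strings are illustrative only):
#   codeChanges=r"""
#   http://www.example.com/broken
#   if(navigator.onLine)
#   if(true)
#   """
# which, on pages whose URL starts with http://www.example.com/broken,
# replaces the first string with the second in HTML and Javascript code.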
define("boxPrompt",default="Website to adjust",help="What to say before the URL box (when shown); may include HTML; for example if you've configured Web Adjuster to perform a single specialist change that can be described more precisely with some word other than 'adjust', you might want to set this.")
define("viewsource",default=False,help="Provide a \"view source\" option. If set, you can see a page's pre-adjustment source code, plus client and server headers, by adding \".viewsource\" to the end of a URL (after any query parameters etc)")
define("htmlonly_mode",default=True,help="Provide a checkbox allowing the user to see pages in \"HTML-only mode\", stripping out images, scripts and CSS; this might be a useful fallback for very slow connections if a site's pages bring in many external files and the browser cannot pipeline its requests. The checkbox is displayed by the URL box, not at the bottom of every page.") # if no pipeline, a slow UPLINK can be a problem, especially if many cookies have to be sent with each request for a js/css/gif/etc.
# (and if wildcard_dns=False and we're domain multiplexing, our domain can accumulate a lot of cookies, causing requests to take more uplink bandwidth, TODO: do something about this?)
define("htmlonly_css",default=False,help="Leave images and CSS in the page when in \"HTML-only mode\", removing only scripts")
define("mailtoPath",default="/@mail@to@__",help="A location on every adjusted website to put a special redirection page to handle mailto: links, showing the user the contents of the link first (in case a mail client is not set up). This must be made up of URL-safe characters starting with a / and should be a path that is unlikely to occur on normal websites and that does not conflict with renderPath. If this option is empty, mailto: links are not changed. (Currently, only plain HTML mailto: links are changed by this function; Javascript-computed ones are not.)")
define("mailtoSMS",multiple=True,default="Opera Mini,Opera Mobi,Android,Phone,Mobile",help="When using mailtoPath, you can set a comma-separated list of platforms that understand sms: links. If any of these strings occur in the user-agent then an SMS link will be provided on the mailto redirection page, to place the suggested subject and/or body into a draft SMS message instead of an email.")
define("htmlFilter",help="External program(s) to run to filter every HTML document. If more than one program is specified separated by # then the user will be given a choice (see htmlFilterName option). Any shell command can be used; its standard input will get the HTML (or the plain text if htmlText is set), and it should send the new version to standard output. Multiple copies of each program might be run at the same time to serve concurrent requests. UTF-8 character encoding is used. If you are not able to run external programs then you could use a back-end server (specify an http:// or https:// URL and input is POSTed in the request body; if this back-end server is another Web Adjuster with submitPath and submitBookmarklet set then give its submitPath plus uA for its 1st filter, uB for its 2nd, etc), or use a Python function: specify * followed by the function name, and inject the function into the adjuster module from a wrapper script (which imports adjuster, sets adjuster.options.htmlFilter etc, injects the function and calls adjuster.main). The function is run in the serving thread.") # (so try to make it fast, although this is not quite so essential in WSGI mode; if you're in WSGI mode then I suggest getting the function to import any large required modules on-demand)
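# A minimal sketch of the wrapper-script approach described above (the
# function name and its one-line behaviour are illustrative; it assumes
# the filter receives the document text as input and returns the result):
#   import adjuster
#   def my_filter(text): return text.replace("colour","color")
#   adjuster.my_filter = my_filter
#   adjuster.options.htmlFilter = "*my_filter"
#   adjuster.options.htmlText = True
#   adjuster.main()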
define("htmlFilterName",help="A name for the task performed by htmlFilter. If this is set, the user will be able to switch it on and off from the browser via a cookie and some Javascript links at the bottom of HTML pages. If htmlFilter lists two or more options, htmlFilterName should list the same number plus one (again separated by #); the first is the name of the entire category (for example \"filters\"), and the user can choose between any one of them or none at all (hence the number of options is one more than the number of filters); if this yields more than 3 options then all but the first two are hidden behind a \"More\" option on some browsers.") # TODO: non-Javascript fallback for the switcher
define("htmlJson",default=False,help="Try to detect HTML strings in JSON responses and feed them to htmlFilter. This can help when using htmlFilter with some AJAX-driven sites. IMPORTANT: Unless you also set the 'separator' option, the external program must preserve all newline characters, because multiple HTML strings in the same JSON response will be given to it separated by newlines, and the newlines of the output determine which fragment to put back where. (If you combine htmlJson with htmlText, the external program will see text in HTML in JSON as well as text in HTML, but it won't see text in HTML in JSON in HTML.)")
define("htmlText",default=False,help="Causes the HTML to be parsed, and only the text parts (not the markup) will be sent to htmlFilter. Useful to save doing HTML parsing in the external program. The external program is still allowed to include HTML markup in its output. IMPORTANT: Unless you also set the 'separator' option, the external program must preserve all newline characters, because multiple text strings will be given to it separated by newlines, and the newlines of the output determine which modified string to put back where.")
define("separator",help="If you are using htmlFilter with htmlJson and/or htmlText, you can set separator to any text string to be used as a separator between multiple items of data when passing them to the external program. By default, newlines are used for this, but you can set it to any other character or sequence of characters that cannot be added or removed by the program. (It does not matter if a website's text happens to use the separator characters.) If separator is set, not only will it be used as a separator BETWEEN items of data but also it will be added before the first and after the last item, thus allowing you to use an external program that outputs extra text before the first and after the last item. The extra text will be discarded. If however you do not set separator then the external program should not add anything extra before/after the document.")
define("leaveTags",multiple=True,default="script,style,title,textarea,option",help="When using htmlFilter with htmlText, you can set a comma-separated list of HTML tag names whose enclosed text should NOT be sent to the external program for modification. For this to work, the website must properly close these tags and must not nest them. (This list is also used for character-set rendering.)") # not including 'option' can break pages that need character-set rendering
define("stripTags",multiple=True,default="wbr",help="When using htmlFilter with htmlText, you can set a comma-separated list of HTML tag names which should be deleted if they occur in any section of running text. For example, \"wbr\" (word-break opportunity) tags (listed by default) might cause problems with phrase-based annotators.") # TODO: <span class="whatever">&nbsp;</span> (c.f. annogen's JS) ?  have already added to the bookmarklet JS (undocumented! see 'awkwardSpan') but not to the proxy version (the two find_text_in_HTML functions)
define("htmlUrl",default=False,help="Add a line containing the document's URL to the start of what gets sent to htmlFilter (useful for writing filters that behave differently for some sites; not yet implemented for submitBookmarklet, which will show a generic URL). The URL line must not be included in the filter's response.")
define("submitPath",help="If set, accessing this path (on any domain) will give a form allowing the user to enter their own text for processing with htmlFilter. The path should be one that websites are not likely to use (even as a prefix), and must begin with a slash (/). If you prefix this with a * then the * is ignored and any password set in the 'password' option does not apply to submitPath. Details of the text entered on this form is not logged by Web Adjuster, but short texts are converted to compressed GET requests which might be logged by proxies etc.") # (see comments in serve_submitPage; "with htmlFilter" TODO: do we add "(or --render)" to this? but charset submit not entirely tested with all old browsers; TODO: consider use of chardet.detect(buf) in python-chardet)
define("submitPrompt",default="Type or paste in some text to adjust",help="What to say before the form allowing users to enter their own text when submitPath is set (compare boxPrompt)")
define("submitBookmarklet",default=True,help="If submitPath and htmlFilter are set, and if browser Javascript support seems sufficient, then add one or more 'bookmarklets' to the 'Upload Text' page (named after htmlFilterName if provided), allowing the user to quickly upload text from other sites. This might be useful if for some reason those sites cannot be made to go through Web Adjuster directly. The bookmarklets should work on modern desktop browsers and on iOS and Android; they should cope with frames and with Javascript-driven changes to a page, and on some browsers an option is provided to additionally place the page into a frameset so that links to other pages on the same site can be followed without explicitly reactivating the bookmarklet (but this does have disadvantages - page must be reloaded + URL display gets 'stuck' - so it's left to the user to choose).") # (and if the other pages check their top.location, things could break there as well)
define("submitBookmarkletFilterJS",default=r"!c.nodeValue.match(/^[ -~\s]*$/)",help="A Javascript expression that evaluates true if a DOM text node 'c' should be processed by the 'bookmarklet' Javascript when submitPath and submitBookmarklet are set. To process ALL text, set this option to c.nodeValue.length, but if your htmlFilter will not change certain kinds of text then you can make the Javascript run more efficiently by not processing these (quote the expression carefully). The default setting will not process text that is all ASCII.") # + whitespace.  TODO: add non-ascii 'smart punctuation'? entered as Unicode escapes, or rely on serving the script as utf-8. (Previously said "To process ALL text, simply set this option to 'true'", but that can have odd effects on some sites' empty nodes. Saying c.nodeValue.length for now; c.nodeValue.match(/[^\s]/) might be better but needs more quoting explanation. Could change bookmarkletMainScript so it alters the DOM only if replacements[i] != oldTexts[i], c.f. annogen's android code, but that would mean future passes would re-send all the unchanged nodes cluttering the XMLHttpRequests especially if they fill a chunk - annogen version has the advantage of immediate local processing)
define("submitBookmarkletChunkSize",default=1024,help="Specifies the approximate number of characters at a time that the 'bookmarklet' Javascript will send to the server if submitPath and submitBookmarklet are set. Setting this too high could impair browser responsiveness, but too low will be inefficient with bandwidth and pages will take longer to finish.")
define("submitBookmarkletDomain",help="If set, specifies a domain to which the 'bookmarklet' Javascript should send its XMLHttpRequests, and ensures that they are sent over HTTPS if the 'bookmarklet' is activated from an HTTPS page (this is needed by some browsers to prevent blocking the XMLHttpRequest).  submitBookmarkletDomain should be a domain for which the adjuster (or an identically-configured copy) can receive requests on both HTTP and HTTPS, and which has a correctly-configured HTTPS front-end with valid certificate.") # e.g. example.rhcloud.com (although that does introduce the disadvantage of tying bookmarklet installations to the current URLs of the OpenShift service rather than your own domain)
heading("Javascript execution options")
define("js_interpreter",default="",help="Execute Javascript on the server for users who choose \"HTML-only mode\". You can set js_interpreter to PhantomJS, HeadlessChrome or HeadlessFirefox, and must have the appropriate one installed along with an appropriate version of Selenium (and ChromeDriver if you're using HeadlessChrome).  If you have multiple users, beware logins etc may be shared!  If a URL box cannot be displayed (no wildcard_dns and default_site is full, or processing a \"real\" proxy request) then htmlonly_mode auto-activates when js_interpreter is set, thus providing a way to partially Javascript-enable browsers like Lynx.  If --viewsource is enabled then js_interpreter URLs may also be followed by .screenshot")
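# Example invocation (port and instance count are illustrative; the chosen
# browser and a matching Selenium must be installed as described above):
#   python adjuster.py --port=28080 --js_interpreter=HeadlessChrome --js_instances=2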
define("js_upstream",default=False,help="Handle --headAppend, --bodyPrepend, --bodyAppend and --codeChanges upstream of our Javascript interpreter instead of making these changes as code is sent to the client, and make --staticDocs available to our interpreter as well as to the client.  This is for running experimental 'bookmarklets' etc with browsers like Lynx.") # TODO: what of delay? (or wait for XHRs to finish, call executeJavascript instead?)
define("js_frames",default=False,help="When using js_interpreter, append the content of all frames and iframes to the main document. This might help with bandwidth reduction and with sites that have complex cross-frame dependencies that can be broken by sending separate requests through the adjuster.")
define("js_instances",default=1,help="The number of virtual browsers to load when js_interpreter is in use. Increasing it will take more RAM but may aid responsiveness if you're loading multiple sites at once.")
define("js_429",default=True,help="Return HTTP error 429 (too many requests) if js_interpreter queue is too long at page-prefetch time. When used with --multicore, additionally close to new requests any core that's currently processing its full share of js_instances.") # Even if some of those new requests won't immediately require js_interpreter work.  But it's better than having an excessively uneven distribution under load.  HTTP 429 is from RFC 6585, April 2012.  Without multicore, 'too long' = 'longer than 2*js_instances', but the queue can grow longer due to items already in prefetch: not all prefetches end up being queued for JS interpretation, so we can't count them prematurely. TODO: close even *before* reached full share of js_instances? as there may be other pages in prefetch, which will then have to wait for instances on this core even though there might already be spare instances on other cores.
define("js_restartAfter",default=10,help="When js_interpreter is in use, restart each virtual browser after it has been used this many times (0=unlimited); might help work around excessive RAM usage in PhantomJS v2.1.1. If you have many --js-instances (and hardware to match) you could also try --js-restartAfter=1 (restart after every request) to work around runaway or unresponsive PhantomJS processes. If you have Headless Chrome you can probably set this to 0.") # (js-restartAfter=1 precludes a faster response when a js_interpreter instance is already loaded with the page requested, although faster response is checked for only AFTER selecting an instance and is therefore less likely to work with multiple instances under load, and is in any event unlikely to work if running multicore with many cores); TODO: check if PhantomJS 2.1.1 RAM usage is a regression from 2.0.1 ? but it's getting less relevant now there's Headless Chrome
define("js_restartMins",default=10,help="Restart an idle js_interpreter instance after about this number of minutes (0=unlimited); use this to stop the last-loaded page from consuming CPU etc indefinitely if no more requests arrive at that instance.  Not applicable when --js-restartAfter=1.") # Setting it low does have the disadvantage of not being able to use an already-loaded page, see above
define("js_timeout1",default=30,help="When js_interpreter is in use, tell it to allow this number of seconds for initial page load. More time is allowed for XMLHttpRequest etc to finish (unless our client cuts the connection in the meantime).")
define("js_timeout2",default=100,help="When js_interpreter is in use, this value in seconds is treated as a 'hard timeout': if a webdriver process does not respond at all within this time, it is assumed hung and emergency restarted.")
define("js_retry",default=True,help="If a js_interpreter fails, restart it and try the same fetch again while the remote client is still waiting")
define("js_fallback",default="X-Js-Fallback",help="If this is set to a non-empty string and a js_interpreter fails (even after js_retry if set), serve the page without Javascript processing instead of serving an error. The HTTP header specified by this option can tell the client whether or not Javascript was processed when a page is served.")
define("js_reproxy",default=True,help="When js_interpreter is in use, have it send its upstream requests back through the adjuster on a different port. This allows js_interpreter to be used for POST forms, fixes its Referer headers when not using real_proxy, monitors AJAX for early completion, prevents problems with file downloads, and prefetches main pages to avoid holding up a js_interpreter instance if the remote server is down.") # and works around issue #13114 in PhantomJS 2.x.  Only real reason to turn it off is if we're running in WSGI mode (which isn't recommended with js_interpreter) as we haven't yet implemented 'find spare port and run separate IO loop behind the WSGI process' logic
define("js_UA",help="Custom user-agent string for js_interpreter requests, if for some reason you don't want to use the JS browser's default. If you prefix this with a * then the * is ignored and the user-agent string is set by the upstream proxy (--js_reproxy) so scripts running in the JS browser itself will see its original user-agent.")
define("js_images",default=True,help="When js_interpreter is in use, instruct it to fetch images just for the benefit of Javascript execution. Setting this to False saves bandwidth but misses out image onload events.") # plus some versions of Webkit leak memory (PhantomJS issue 12903), TODO: return a fake image if js_reproxy? (will need to send a HEAD request first to verify it is indeed an image, as PhantomJS's Accept header is probably */*) but height/width will be wrong
define("js_size",default="1024x768",help="The virtual screen dimensions of the browser when js_interpreter is in use (changing it might be useful for screenshots)")
define("js_links",default=True,help="When js_interpreter is in use, handle some Javascript links via special suffixes on href URLs. Turn this off if you don't mind such links not working and you want to ensure URLs are unchanged modulo domain-rewriting.")
define("js_multiprocess",default=True,help="When js_interpreter is in use, handle the webdriver instances in completely separate processes (not just separate threads) when the multiprocessing module is available. Recommended: if a webdriver instance gets 'stuck' in a way that somehow hangs its controlling process, we can detect and restart it.")
define("ssl_fork",default=False,help="Run SSL-helper proxies as separate processes (Unix only) to stop the main event loop from being stalled by buggy SSL libraries. This costs RAM, but adding --multicore too will limit the number of helpers to one per core instead of one per port, so --ssl-fork --multicore is recommended if you want more js_interpreter instances than cores.")
heading("Server control options")
define("background",default=False,help="If True, fork to the background as soon as the server has started (Unix only). You might want to enable this if you will be running it from crontab, to avoid long-running cron processes.")
define("restart",default=False,help="If True, try to terminate any other process listening on our port number before we start (Unix only). Useful if Web Adjuster is running in the background and you want to quickly restart it with new options. Note that no check is made to make sure the other process is a copy of Web Adjuster; whatever it is, if it has our port open, it is asked to stop.")
define("stop",default=False,help="Like 'restart', but don't replace the other process after stopping it. This option can be used to stop a background server (if it's configured with the same port number) without starting a new one. Unix only.") # "stop" overrides "restart", so if "restart" is set in a configuration file then you can still use "stop" on the command line
define("install",default=False,help="Try to install the program in the current user's Unix crontab as an @reboot entry, unless it's already there.  The arguments of the cron entry will be the same as the command line, with no directory changes, so make sure you are in the home directory before doing this.  The program will continue to run normally after the installation attempt.  (If you are on Cygwin then you might need to run cron-config also.)")
define("pidfile",default="",help="Write our process ID to this file when running in the background, so you can set up a systemd service with Type=forking and PIDFile=this instead of using crontab. (Alternatively use 'pip install sdnotify' and run in the foreground with Type=notify.)")
define("watchdog",default=0,help="(Linux only) Ping the system's watchdog every this number of seconds, so the watchdog can reboot the system if for any reason Web Adjuster stops functioning. The default value of 0 means do not ping the watchdog. If your machine's unattended boot is no longer reliable, beware of unnecessary reboot if you remotely stop the adjuster and are unable to restart it.") # e.g. some old Raspberry Pis no longer boot 100% of the time and have watchdogs that cannot be cleanly closed with 'V'
define("watchdogWait",default=0,help="When the watchdog option is set, wait this number of seconds before stopping the watchdog pings. This causes the watchdog pings to be sent from a separate thread and therefore not stopped when the main thread is busy; they are stopped only when the main thread has not responded for watchdogWait seconds. This can be used to work around the limitations of a hardware watchdog that cannot be set to wait that long.") # such as the Raspberry Pi's Broadcom chip which defaults to 10 seconds and has max 15; you could say watchdog=5 and watchdogWait=60 (if you have an RPi which actually reboots when the watchdog goes off, see above)
define("watchdogDevice",default="/dev/watchdog",help="The watchdog device to use (set this to /dev/null to check main-thread responsiveness without actually pinging the watchdog)")
define("browser",help="The Web browser command to run. If this is set, Web Adjuster will run the specified command (which is assumed to be a web browser), and will exit when this browser exits. This is useful in conjunction with --real_proxy to have a personal proxy run with the browser. You still need to set the browser to use the proxy; this can sometimes be done via browser command line or environment variables.")
define("run",help="A command to run that is not a browser. If set, Web Adjuster will run the specified command and will restart it if it stops. The command will be stopped when Web Adjuster is shut down. This could be useful, for example, to run an upstream proxy.")
define("runWait",default=1,help="The number of seconds to wait before restarting the 'run' command if it fails")
define("ssh_proxy",help="host[:port][,URL] which, if set, can help to proxy SSH connections over HTTP if you need to perform server administration from a place with port restrictions.  See comments in adjuster.py for details.")
# - If set host (and optional port, defaults to 22), then CONNECT requests for that server are accepted even without real_proxy.  Use (e.g.) ssh -o ProxyCommand "nc -X connect -x adjuster.example.org:80 %h %p" ssh-host
# - This however won't work if the adjuster is running on a virtual hosting provider (like OpenShift) which doesn't support CONNECT (and many of them don't even support streaming 1-way connections like proxy2ssh, even if we modify Tornado to do that).  But you can set ,URL and write a ProxyCommand like this:
"""# ---------- cut here ----------
#!/usr/bin/env python
host_name = host_name_or_IP = "you need to set this"
path_part_of_URL = "/you need to set this too"
import sys,socket,select,time,os ; lastPostTime = 0
def connect():
  global s ; s=socket.socket() ; s.connect((host_name_or_IP,80))
connect()
def post(dat):
  global lastPostTime
  if not lastPostTime: dat="new connection"
  s.sendall('POST %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\nContent-Length: %d\r\n\r\n%s' % (path_part_of_URL,host_name,len(dat),dat)) ; r="" ; rx = True
  while rx and not "\r\n\r\n" in r:
    try: rx = s.recv(1024)
    except socket.error: break
    r += rx
  if not "\r\n\r\n" in r: # probably keep-alive interrupted by virtualiser
    connect() ; return post(dat)
  cl=r[r.index(':',r.index("\nContent-Length:"))+1:].lstrip() ; cl=cl[:cl.index('\r')] ; cl=int(cl) ; r=r[r.index("\r\n\r\n")+4:]
  while len(r) < cl:
    rx = s.recv(1024) ; assert rx ; r += rx
  r = r[:cl] ; sys.stdout.write(r) ; sys.stdout.flush()
  lastPostTime = time.time()
interval = 1
while True:
  read = []
  while 0 in select.select([0], [], [], 0)[0]:
    rx = os.read(0,1) ; assert rx ; read.append(rx)
  if read or time.time() > lastPostTime+interval: post("".join(read))
  if read: interval = 1
  elif interval < 30: interval *= 2
  time.sleep(0.1)
# ---------- cut here ---------- """
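# (Usage sketch: if the above is saved as, say, proxy2ssh.py - the filename is
# illustrative - made executable, and host_name/path_part_of_URL are filled in,
# then you can connect with something like:
#   ssh -o ProxyCommand=/path/to/proxy2ssh.py ssh-host
# where ssh-host is the server named in the adjuster's ssh_proxy option.)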
# and if you then need to forward to the adjuster from a CGI
# script (for example because the adjuster itself can't be
# run on port 80) then try something like this:
"""# ---------- cut here ----------
#!/bin/bash
export URL=http://localhost:28080/LetMeIn # or whatever
export T=$(mktemp /dev/shm/XXXXXX) ; cat > $T
export T2=$(mktemp /dev/shm/XXXXXX)
wget --post-file $T -q -O - "$URL" > $T2
echo "Content-Length: $(wc -c < $T2)" # please don't "chunk" it
echo ; cat $T2 ; rm $T $T2
# ---------- cut here ---------- """

heading("Media conversion options")
define("bitrate",default=0,help="Audio bitrate for MP3 files, or 0 to leave them unchanged. If this is set to anything other than 0 then the 'lame' program must be present. Bitrate is normally a multiple of 8. If your mobile device has a slow link, try 16 for speech.")
define("askBitrate",default=False,help="If True, instead of recoding MP3 files unconditionally, try to add links to \"lo-fi\" versions immediately after each original link so you have a choice.")
define("pdftotext",default=False,help="If True, add links to run PDF files through the 'pdftotext' program (which must be present if this is set). A text link will be added just after any PDF link that is found, so that you have a choice of downloading PDF or text; note that pdftotext does not always manage to extract all text (you can use --pdfomit to specify URL patterns that should not get text links). The htmlJson setting will also be applied to the PDF link finder, and see also the guessCMS option.")
define("pdfomit",help="A comma-separated list of regular expressions which, if any are found in a PDF link's URL, will result in a text link not being generated for that PDF link (although a conversion can still be attempted if a user manually enters the modified URL).  Use this to avoid confusion for PDF files you know cannot be converted.")
define("epubtotext",default=False,help="If True, add links to run EPUB files through Calibre's 'ebook-convert' program (which must be present), to produce a text-only option (or a MOBI option if a Kindle is in use). A text link will be added just after any EPUB link that is found, so that you have a choice of downloading EPUB or text. The htmlJson setting will also be applied to the EPUB link finder, and see also the guessCMS option.")
# pdftotext and epubtotext both use temporary files, which are created in the system default temp directory unless overridden by environment variables TMPDIR, TEMP or TMP, TODO: do we want an override for NamedTemporaryFile's dir= option ourselves?  (/dev/shm might make more sense on some Flash-based systems, although filling the RAM and writing to swap might do more damage than writing files in /tmp if it gets big; also hopefully some OS's won't actually write anything if the file has been deleted before the buffer needed to be flushed (TODO: check this))
define("epubtozip",default=False,help="If True, add links to download EPUB files renamed to ZIP, as a convenience for platforms that don't have EPUB readers but can open them as ZIP archives and display the XHTML files they contain. The htmlJson setting will also be applied to the EPUB link finder, and see also the guessCMS option.") # TODO: option to cache the epub file and serve its component files individually, so other transforms can be applied and for platforms without ZIP capabilities
define("guessCMS",default=False,help="If True, then the pdftotext, epubtotext and epubtozip options attempt to guess if a link is pointing to a PDF or EPUB file via a Content Management System (i.e. the URL does not end in .pdf or .epub, but contains something like ?format=PDF)") # (doesn't seem to work very well with the askBitrate option)
define("pdfepubkeep",default=200,help="Number of seconds to keep any generated text files from PDF and EPUB.  If this is 0, the files will be deleted immediately, but that might be undesirable: if a mobile phone browser has a timeout that takes effect before ebook-convert has finished (this can sometimes be the case with Opera Mini for example), it might be best to allow the user to wait a short time and re-submit the request, this time getting a cached response.") # Opera Mini's opera:config can set the loading timeout to longer, default is 30 seconds.
define("waitpage",default=True,help="If the browser seems to be an interactive one, generate a 'please wait' page while converting PDF or EPUB files to text. Not effective if pdfepubkeep is set too low.") # TODO: mp3 also? (would need to add MP3s to pdfepubkeep)

heading("Character rendering options")
# TODO: option to add a switch at top of page ?
define("render",default=False,help="Whether to enable the character-set renderer. This functionality requires the Python Imaging Library and suitable fonts. The settings of htmlJson and leaveTags will also be applied to the renderer. Text from computed Javascript writes might not be rendered as images.") # ("computed" as in not straight from a JSON document.  TODO: could write a piece of JS that goes through the DOM finding them? ditto any JS alterations that haven't been through htmlFilter, although you'd have to mark the ones that have and this could be filter-dependent)
define("renderFont",help="The font file to use for the character-set renderer (if enabled). This should be a font containing all the characters you want to render, and it should be in .TTF, .OTF or other Freetype-supported format (.PCF is sometimes possible if renderSize is set correctly, e.g. 16 for wenquanyi_12pt.pcf)") # TODO: different fonts for different Unicode ranges? (might be hard to auto-detect missing characters)
define("renderInvert",default=False,help="If True, the character-set renderer (if enabled) will use a black background. Useful when you are also adding a stylesheet with a dark background.")
define("renderSize",default=20,help="The height (in pixels) to use for the character-set renderer if it is enabled.")
define("renderPath",default="/@_",help="The location on every adjusted website to put the character-set renderer's images, if enabled. This must be made up of URL-safe characters starting with a / and should be a short path that is unlikely to occur on normal websites.")
define("renderFormat",default="png",help="The file format of the images to be created by the character-set renderer if it is enabled, for example 'png' or 'jpeg'.")
define("renderRange",multiple=True,help="The lowest and highest Unicode values to be given to the character-set renderer if it is enabled. For example 3000:A6FF for most Chinese characters. Multiple ranges are allowed. Any characters NOT in one of the ranges will be passed to the browser to render. If the character-set renderer is enabled without renderRange being set, then ALL text will be rendered to images.")
define("renderOmit",multiple=True,default="iPhone,iPad,Android,Macintosh,Windows NT 6,Windows NT 10,Windows Phone OS,Lynx/2",help="A list of platforms that do not need the character-set renderer. If any of these strings occur in the user-agent then the character set renderer is turned off even if it is otherwise enabled, on the assumption that these platforms either have enough fonts already, or wouldn't show the rendered images anyway.") # (Win: Vista=6.0 7=6.1 8=6.2 reportedly don't need language packs for display) (Lynx: being careful by specifying /2 to try to avoid false positives; don't list w3m as some versions can do graphics; not sure about Links/ELinks etc)
define("renderOmitGoAway",default=False,help="If set, any browsers that match renderOmit will not be allowed to use the adjuster. This is for servers that are set to do character rendering only and do not have enough bandwidth for people who don't need this function and just want a proxy.") # (See also the extended syntax of the headAppendCSS option, which forces all users to choose a stylesheet, especially if cssName is not set; that might be useful if the server's sole purpose is to add stylesheets and you don't want to provide a straight-through service for non-stylesheet users.)
define("renderCheck",help="If renderOmit does not apply to the browser, it might still be possible to check for native character-set support via Javascript. renderCheck can be set to the Unicode value of a character to be checked (try 802F for complete Chinese support); if the browser reports its width differently from known unprintable characters, we assume it won't need our renderer.") # 802F shouldn't create false positives in environments that support only GB2312, only Big5, only SJIS or only KSC instead of all Chinese. It does have GB+ and Big5+ codes (and also demonstrates that we want a hex number). If browser's "unprintable character" glyph happens to be the same width as renderCheck anyway then we could have a false negative, but that's better than a false positive and the user can still switch it off manually if renderName is left set.
define("renderNChar",default=1,help="The maximum number of characters per image to be given to the character-set renderer if it is enabled. Keeping this low means the browser cache is more likely to be able to re-use images, but some browsers might struggle if there are too many separate images. Don't worry about Unicode \"combining diacritic\" codes: any found after a character that is to be rendered will be included with it without counting toward the renderNChar limit and without needing to be in renderRange.")
define("renderWidth",default=0,help="The maximum pixel width of a 'word' when using the character-set renderer. If you are rendering a language that uses space to separate words, but are using only one or two characters per image, then the browser might split some words in the middle. Setting renderWidth to some value other than 0 can help to prevent this: any word narrower than renderWidth will be enclosed in a <nobr> element. (This will however be ineffective if your stylesheet overrides the behaviour of <nobr>.) You should probably not set renderWidth if you intend to render languages that do not separate words with spaces.")
define("renderDebug",default=False,help="If the character-set renderer is having problems, try to insert comments in the HTML source to indicate why.  The resulting HTML is not guaranteed to be well-formed, but it might help you debug a misbehaving htmlFilter.  This option may also insert comments in bad HTML before the htmlFilter stage even when the renderer is turned off.")
define("renderName",default="Fonts",help="A name for a switch that allows the user to toggle character set rendering on and off from the browser (via a cookie and Javascript links at the bottom of HTML pages); if set to the empty string then no switch is displayed. At any rate none is displayed when renderOmit applies.") # TODO: non-Javascript fallback for the switcher

heading("Dynamic DNS options")
define("ip_change_command",help="An optional script or other shell command to launch whenever the public IP address changes. The new IP address will be added as a parameter; ip_query_url must be set to make this work. The script can for example update any Dynamic DNS services that point to the server.")
define("ip_query_url",help="URL that will return your current public IP address, as a line of text with no markup added. Used for the ip_change_command option. You can set up a URL by placing a CGI script on a server outside your network and having it do: echo Content-type: text/plain;echo;echo $REMOTE_ADDR (but if you want your IPv4 address, ensure the adjuster machine and the outside server are not both configured for IPv6)")
define("ip_query_url2",help="Optional additional URL that might sometimes return your public IP address along with other information. This can for example be a status page served by a local router (http://user:password@192.168... is accepted, and if the password is the name of an existing file then its contents are read instead). If set, the following behaviour occurs: Once ip_query_interval has passed since the last ip_query_url check, ip_query_url2 will be queried at an interval of ip_query_interval2 (which can be short), to check that the known IP is still present in its response. Once the known IP is no longer present, ip_query_url will be queried again. This arrangement can reduce the load on ip_query_url as well as providing a faster response to IP changes, while not completely trusting the local router to report the correct IP at all times. See also ip_query_aggressive if the router might report an IP change before connectivity is restored. You may also set ip_query_url2 to the special value 'upnp' if you want it to query a router via UPnP (miniupnpc package required).") # (If using filename then its contents will be re-read every time the URL is used; this might be useful for example if the router password can change)
define("ip_check_interval",default=8000,help="Number of seconds between checks of ip_query_url for the ip_change_command option")
define("ip_check_interval2",default=60,help="Number of seconds between checks of ip_query_url2 (if set), for the ip_change_command option")
define("ip_query_aggressive",default=False,help="If a query to ip_query_url fails with a connection error or similar, keep trying again until we get a response. This is useful if the most likely reason for the error is that our ISP is down: we want to get the new IP just as soon as we're back online. However, if the error is caused by a problem with ip_query_url itself then this option can lead to excessive traffic, so use with caution. (Log entries are written when this option takes effect, and checking the logs is advisable.)")
define("ip_force_interval",default=7*24*3600,help="Number of seconds before ip_change_command (if set) is run even if there was no IP change.  This is to let Dynamic DNS services know that we are still around.  Set to 0 to disable forced updates (a forced update will occur on server startup anyway), otherwise an update will occur on the next IP check after ip_force_interval has elapsed.")
define("useLXML",default=False,help="Use the LXML library for parsing HTML documents. This is usually faster, but it can fail if your system does not have a good installation of LXML and its dependencies. Use of LXML libraries may also result in more changes to all HTML markup: this should be harmless for browsers, but beware when using options like bodyAppendGoesAfter then you might or might not be dealing with the original HTML depending on which filters are switched on.") # (hence bodyAppendGoesAfter now takes regexps as of adjuster 0.1836) / dependencies: did have ", or if the websites you visit are badly broken" but it turns out some breakages are actually better handled by LXML than by HTMLParser, e.g. <div id=something">
define("usepycurl",default=True,help="Use the pycurl library if available (setting this to False might save a little RAM at the expense of remote-server tolerance)")
define("renderBlocks",default=False,help="Treat all characters rendered by the character-set renderer as \"blocks\" that are guaranteed to have the same dimensions (true for example if you are using the renderer for Chinese characters only). This is faster than checking words individually, but it may produce incorrect HEIGHT and WIDTH attributes if given a range of characters whose dimensions do differ.") # TODO: blocksRange option for if want to render some that do and some that don't? (but profile it: PIL's getsize just might turn out to be quicker than the high-level range-check code)
define("fasterServer",help="Address:port of another instance of Web Adjuster to which we forward all traffic whenever it is available. When the other instance is not available, traffic will be handled by this one. Use for example if you have a slower always-on machine and a faster not-always-on machine and you want the slower machine to delegate to the faster machine when available. See also ipTrustReal.")
define("ipTrustReal",help="IP address of a machine that we trust, for example a machine that is using us as fasterServer. Any traffic coming from this machine with an X-Real-Ip header will be logged as though it originated at the value of its X-Real-Ip header. Setting this to * will cause X-Real-Ip to be trusted from ANY connection.") # , which might be useful in an environment where you know the adjuster can be reached only via a proxy but the proxy's address can change; see also trust_XForwardedFor. (TODO: multiple IPs option like ip_messages?  but might need to make it ipv6 ready)
define("trust_XForwardedFor",default=False,help="Like ipTrustReal but trusts X-Forwarded-For header from any IP if set to True (use this in an environment where the adjuster can be reached only via a load balancer etc)")
define("fasterServerNew",default=True,help="If fasterServer is set, assume it is running Web Adjuster v0.17 or later and use a more lightweight method of checking its availability. You might need to set this to False if for some reason you can't upgrade the fasterServer first.") # (don't do auto-fallback as that creates unnecessary extra traffic, plus sending an unrecognized ping2 could clutter logs)
define("machineName",help="A name for the current machine to insert into the \"Server\" HTTP header for adjusted requests, for example to let users know if it's your faster or your slower machine that's currently serving them (although they'd need to inspect the headers to find out)")
define("redirectFiles",default=False,help="If, when not functioning as a \"real\" HTTP proxy, a URL is received that looks like it requires no processing on our part (e.g. an image or downloadable file that the user does not want converted), and if this is confirmed via a HEAD request to the remote server, then redirect the browser to fetch it directly and not via Web Adjuster. This takes bandwidth off the adjuster server, and should mean faster downloads, especially from sites that are better connected than the adjuster machine. However it might not work with sites that restrict \"deep linking\". (As a precaution, the confirmatory HEAD request is sent with a non-adjusted Referer header to simulate what the browser would send if fetching directly. If this results in an HTML \"Referer denied\" message then Web Adjuster will proxy the request in the normal way. This precaution might not detect ALL means of deep-linking denial though.)") # e.g. cookie-based, or serving an image but not the real one.  But it works with Akamai-based assets servers as of 2013-09 (but in some cases you might be able to use codeChanges to point these requests back to the site's original server instead of the Akamai one, if the latter just mirrors the former which is still available, and therefore save having to proxy the images.  TODO: what if you can't do that but you can run another service on a higher bandwidth machine that can cache them, but can't run the adjuster on the higher-bandwidth machine; can we redirect?)
# If adjuster machine is running on a home broadband connection, don't forget the "uplink" speed of that broadband is likely to be lower than the "downlink" speed; the same should not be the case of a site running at a well-connected server farm.  There's also extra delay if Web Adjuster has to download files first (which might be reduced by implementing streaming).  Weighed against this is the extra overhead the browser has of repeating its request elsewhere, which could be an issue if the file is small and the browser's uplink is slow; in that case fetching it ourselves might be quicker than having the browser repeat the request; see TODO comment elsewhere about minimum content length before redirectFiles.
# TODO: for Referer problems in redirectFiles, if we're not on HTTPS, could redirect to an HTTPS page (on a separate private https server, or https://www.google.com/url/?q= but they might add checks) which then redirects to the target HTTP page, but that might not strip Referer on MSIE 7 etc, may have to whitelist browsers+versions for it, or test per-request but that would lead to 4 redirects per image instead of 2 although could cache (non-empty) ok-browser-strings (and hold up other requests from same browser until we know or have timed out ??); do this only if sendHead returns false but sendHead with proper referer returns ok (and cache a few sites where this is the case so don't have to re-test) ??  also it might not work in places where HTTPS is forbidden
# TODO: redirectFiles could call request_no_external_referer and test with blank Referer instead of non-adjusted Referer, but we'd have to figure out some way of verifying that the browser actually supports 'Referrer-Policy: same-origin' before doing this

define("upstream_guard",default=True,help="Modify scripts and cookies sent by upstream sites so they do not refer to the cookie names that our own scripts use. This is useful if you chain together multiple instances of Web Adjuster, such as for testing another installation without coming out of your usual proxy. If however you know that this instance will not be pointed to another, you can set upstream_guard to False to save some processing.")
define("skipLinkCheck",multiple=True,help="Comma-separated list of regular expressions specifying URLs to which we won't try to add or modify links for the pdftotext, epubtotext, epubtozip, askBitrate or mailtoPath options.  This processing can take some time on large index pages with thousands of links; if you know that none of them are PDF, EPUB, MP3 or email links, or if you don't mind not processing any that are, then it saves time to skip this step for those pages.") # TODO: it would be nice to have a 'max links on the page' limit as an alternative to a list of URL patterns

define("extensions",help="Name of a custom Python module to load to handle certain requests; this might be more efficient than setting up a separate Tornado-based server. The module's handle() function will be called with the URL and RequestHandler instance as arguments, and should return True if it processed the request, but anyway it should return as fast as possible. This module does NOT take priority over forwarding the request to fasterServer.")

define("loadBalancer",default=False,help="Set this to True if you have a default_site set and you are behind any kind of \"load balancer\" that works by issuing a GET / with no browser string. This option will detect such requests and avoid passing them to the remote site.")
define("multicore",default=False,help="(Linux only) On multi-core CPUs, fork enough processes for all cores to participate in handling incoming requests. This increases RAM usage, but can help with high-load situations. Disabled on BSD/Mac due to unreliability (other cores can still be used for htmlFilter etc)") # and --ssl-fork if there's not TOO many instances taking up the RAM; if you really want multiple cores to handle incoming requests on Mac/BSD you could run GNU/Linux in a virtual machine (or use a WSGI server)
define("internalPort",default=0,help="The first port number to use for internal purposes when ssl_fork is in effect.  Internal ports needed by real_proxy (for SSL) and js_reproxy are normally allocated from the ephemeral port range, but if ssl_fork delegates to independent processes then some of them need to be at known numbers. The default of 0 means one higher than 'port'; several unused ports may be needed starting at this number. If your Tornado is modern enough to support reuse_port then you can have multiple Adjuster instances listening on the same port (e.g. for one_request_only) provided they have different internalPort settings when run with ssl_fork.  Note however that the --stop and --restart options will NOT distinguish between different internalPort settings, only 'port'.") # If running on Openshift in non-WSGI mode, you'd better not use real_proxy or js_reproxy because Openshift won't let you open ports other than OPENSHIFT_PYTHON_PORT (TODO: find some way to multiplex everything on one port? how to authenticate our JS-interpreter connections if the load-balancer makes remote connections to that port also seem to come from our IP?)
define("compress_responses",default=True,help="Use gzip to compress responses for clients that indicate they are compatible with it. You may want to turn this off if your server's CPU is more important than your network bandwidth (e.g. browser on same machine).")

# THIS MUST BE THE LAST SECTION because it continues into
# the note below about Tornado logging options.  (The order
# of define()s affects the HTML order only; --help will be
# sorted alphabetically by Tornado.)
heading("Logging options")
define("profile",default=0,help="Log timing statistics every N seconds (only when not idle)")
define("profile_lines",default=5,help="Number of lines to log when profile option is in use (not applicable if using --multicore)")
define("renderLog",default=False,help="Whether or not to log requests for character-set renderer images. Note that this can generate a LOT of log entries on some pages.")
define("logUnsupported",default=False,help="Whether or not to log attempts at requests using unsupported HTTP methods. Note that this can sometimes generate nearly as many log entries as renderLog if some browser (or malware) tries to do WebDAV PROPFIND requests on each of the images.")
define("logRedirectFiles",default=True,help="Whether or not to log requests that result in the browser being simply redirected to the original site when the redirectFiles option is on.") # (Since this still results in a HEAD request being sent to the remote site, this option defaults to True in case you need it to diagnose "fair use of remote site" problems)
define("ownServer_useragent_ip",default=False,help="If own_server is set, and that server cannot be configured to log the X-Real-Ip header we set when we proxy for it, you can if you wish turn on this option, which will prepend the real IP to the User-Agent header on the first request of each connection (most servers can log User-Agent). This is slightly dangerous: fake IPs can be inserted into the log if keep-alive is used.") # (and it might break some user-agent detection)
define("ipNoLog",multiple=True,help="A comma-separated list of IP addresses which can use the adjuster without being logged. If your network has a \"friendly probing\" service then you might want to use this to stop it filling up the logs.  (Any tracebacks it causes will still be logged however.)")
define("squashLogs",default=True,help="Try to remove some duplicate information from consecutive log entries, to make logs easier to check. You might want to set this to False if you plan to use automatic search tools on the logs. Currently not supported with multicore, and will automatically be set to False if multicore is enabled.") # (word 'some' is important as not all duplicate info is guaranteed to be removed. TODO: move BrowserLogger to the collection process so can collate for multicore?)
define("whois",default=False,help="Try to log the Internet service provider for each IP address in the logs.  Requires the 'whois' program.  The extra information is written as separate log entries when it becomes available, and not for recent duplicate IPs or IPs that do not submit valid requests.")
define("errorHTML",default="Adjuster error has been logged",help="What to say when an uncaught exception (due to a misconfiguration or programming error) has been logged. HTML markup is allowed in this message. If for some reason you have trouble accessing the log files, the traceback can usually be included in the page itself by placing {traceback} in the message.") # TODO: this currently requires Tornado 2.1+ (document this? see TODO in write_error)
define("logDebug",default=False,help="Write debugging messages (to standard error if in the foreground, or to the logs if in the background). Use as an alternative to --logging=debug if you don't also want debug messages from other Tornado modules. On Unix you may also toggle this at runtime by sending SIGUSR1 to the process(es).") # see debuglog()
# and continuing into the note below:
if not tornado:
    print "</dl>"
    print "Tornado-provided logging options are not listed above because they might vary across Tornado versions; run <kbd>python adjuster.py --help</kbd> to see a full list of the ones available on your setup. They typically include <kbd>log_file_max_size</kbd>, <kbd>log_file_num_backups</kbd>, <kbd>log_file_prefix</kbd> and <kbd>log_to_stderr</kbd>." # and --logging=debug but that may generate a lot of entries from curl_httpclient
# --------------------------------------------------
# Further imports
# --------------------------------------------------

import time,os,commands,string,urllib,urlparse,re,socket,logging,subprocess,threading,base64,htmlentitydefs,signal,traceback
try: import simplejson as json # Python 2.5, and faster?
except: import json # Python 2.6
from HTMLParser import HTMLParser,HTMLParseError
try: import psutil
except ImportError: psutil = None
try: # make --help output start with our program name, and page it if a pager is available
    # (Tornado 2 just calls the module-level print_help, but Tornado 3 includes some direct calls to the object's method, so we have to override the latter.  Have to use __dict__ because they override __setattr__.)
    import pydoc,cStringIO ; pydoc.pager # ensure present
    def new_top(*args):
        dat = cStringIO.StringIO()
        dat.write(twoline_program_name+"\n")
        tornado.options.options.__dict__['old_top'](dat) # the original print_help, written into our buffer
        pydoc.pager(dat.getvalue()) # then page the combined text
    tornado.options.options.__dict__['old_top'] = tornado.options.options.print_help
    tornado.options.options.__dict__['print_help'] = new_top
except: pass # no pager or options object not patchable: keep the default --help behaviour
# --------------------------------------------------
# Domain-rewriting service routines
# --------------------------------------------------

def hostSuffix(n=0):
    if options.host_suffix:
        return options.host_suffix.split("/")[n]
    return ""
def defaultSite(n=0):
    return options.default_site.split("/")[n]

def convert_to_real_host(requested_host,cookie_host=None):
    # Converts the host name requested by the user into the
    # actual host that we should request, or returns "" if
    # we should display the URL entry box etc.
    # Returns -1 if we should pass to options.own_server.
    if requested_host:
      port=":"+str(options.publicPort) # might or might not be present in the user's request
      orig_requested_host = requested_host
      if requested_host.endswith(port): requested_host=requested_host[:-len(port)]
      n=0
      for h in options.host_suffix.split("/"):
        if requested_host.endswith("."+h): return redot(requested_host[:-len(h)-1])
        if requested_host == h:
            d = defaultSite(n)
            if d: return d
            elif cookie_host==h: return 0 # special type of (false) value to tell the code that we're handling this request ourselves but possibly via ownServer_if_not_root
            else: return cookie_host
        n += 1
      if options.real_proxy: return orig_requested_host
    if options.own_server: return -1
    else: return defaultSite()
def convert_to_via_host(requested_host):
    if not requested_host: requested_host = "" # ?
    port=":"+str(options.publicPort) # the port to advertise
    orig_requested_host = requested_host
    if requested_host.endswith(port): requested_host=requested_host[:-len(port)]
    if options.publicPort==80: port=""
    for h in options.host_suffix.split("/"):
      if (requested_host == h and options.default_site) or requested_host.endswith("."+h): return h+port
    return options.host_suffix+port
def publicPortStr():
    if options.publicPort==80: return ""
    else: return ":"+str(options.publicPort)
def convert_to_requested_host(real_host,cookie_host=None):
    # Converts the actual host name into the host name that
    # the user should request to get it through us
    if not real_host: return ""
    port = publicPortStr() # the (possibly empty) ":port" suffix to advertise
    if options.default_site:
      n=0
      for i in options.default_site.split("/"):
        if not i: i=cookie_host
        if real_host == i:
            return hostSuffix(n)+port
        n += 1
    elif not options.wildcard_dns and real_host == cookie_host:
        return hostSuffix(0)+port # no default_site, cookie_host everywhere
    if not options.wildcard_dns: return real_host # leave the proxy
    else: return dedot(real_host)+"."+hostSuffix()+port

# RFC 2109: A Set-Cookie from request-host y.x.example.com for Domain=.example.com would be rejected, because H is y.x and contains a dot.
# That means (especially if a password is set) we'd better make sure our domain-rewrites don't contain dots.  If requested with dot, relocate to without dot.  (But see below re RFC 1035 limitation.)
def dedot(domain):
    # - means . but -- is a real - (OK as 2 dots can't come together and a - can't come immediately after a dot in domain names, so --- = -., ---- = --, ----- = --. etc)
    d2 = domain.replace("-","--").replace(".","-")
    if len(d2) > 63: return domain # because RFC 1035 puts a 63-byte limit on each label (so our cross-domain preferences cookies can't work on very long domains, TODO document this?)
    else: return d2
def redot(domain): return domain.replace("--","@MINUS@").replace("-",".").replace("@MINUS@","-")
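# e.g. dedot("www.example.com") == "www-example-com" and
# dedot("my-site.example.com") == "my--site-example-com"; redot() reverses both.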
def protocolAndHost(realHost):
    # HTTPS hack: host ends with .0 = use HTTPS instead of HTTP
    # (the dot will be represented as a hyphen by dedot/redot,
    # but some servers e.g. GAE can't cope with any part of the
    # wildcard domain ending with a hyphen, so add the 0;
    # TODO: what about fetching from IP addresses, although it's rare to get a server with IP ending .0 because it used to represent "the network")
    if realHost.endswith(".0"): return "https://",realHost[:-2]
    else: return "http://",realHost
def protocolWithHost(realHost):
    x,y = protocolAndHost(realHost) ; return x+y
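# e.g. protocolWithHost("example.com.0") == "https://example.com"
# and  protocolWithHost("example.com")   == "http://example.com"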

def domain_process(text,cookieHost=None,stopAtOne=False,https=None,isProxyRequest=False,isSslUpstream=False):
    if isProxyRequest: # called for Location: headers etc (not for document bodies)
        if upstream_rewrite_ssl and not isSslUpstream:
            # Although we don't need a full domain_process when the client is sending us a proxy request, we still have to beware of our UPstream proxy saying .0 in a Location: URL due to upstream_rewrite_ssl: take it out
            m = re.match(r"http(://[A-Za-z0-9.-]*)\.0(?![A-Za-z0-9.-])",text)
            if m: return "https"+m.group(1)
        return text
    # Change the domains on appropriate http:// and https:// URLs.
    # Also on // URLs using 'https' as default (if it's not None).
    # Hope that there aren't any JS-computed links where
    # the domain is part of the computation.
    # TODO: what of links to alternate ports or user:password links, currently we leave them unchanged (could use .<portNo> as an extension of the 'HTTPS hack' of .0, but allowing the public to request connects to any port could be a problem, and IP addresses would have to be handled carefully: can no longer rely on ".0 used to mean the network" sort-of saving us)
    # TODO: leave alone URLs in HTML text/comments and JS comments? but script overload can make it hard to judge what is and isn't text. (NB this function is also called for Location headers)
    if "<!DOCTYPE" in text:
        # don't touch URLs inside the doctype!
        dtStart = text.index("<!DOCTYPE")
        dtEnd = text.find(">",dtStart)
    else: dtStart = dtEnd = -1
    def mFunc(m):
        if dtStart<m.start()<dtEnd: return m.group() # avoid doctype
        i = m.start()
        if i and text[i-1].split() and text[:i].rsplit(None,1)[-1].startswith("xmlns"): return m.group() # avoid xmlns="... xmlns:elementname='... etc
        protocol,oldhost = m.groups()
        if oldhost[-1] in ".-": return m.group() # omit links ending with . or - because they're likely to be part of a domain computation; such things are tricky but might be more likely to work if we DON'T touch them if it has e.g. "'test.'+domain" where "domain" is a variable that we've previously intercepted
        if protocol=="//":
            if https: protocol = "https://"
            else: protocol = "http://"
        if protocol=="https://": oldhost += ".0" # HTTPS hack (see protocolAndHost)
        newHP = "http://" + convert_to_requested_host(oldhost,cookieHost) # TODO: unless using https to communicate with the adjuster itself, in which case would either have to run a server with certificates set up or make it a WSGI-etc script running on one, and if that's the case then might wish to check through the rest of the code (search http://) to ensure this would always work well
        if newHP.endswith(".0"): return m.group() # undo HTTPS hack if we have no wildcard_dns and convert_to_requested_host sent that URL off-site
        return newHP
    if stopAtOne: count=1
    else: count=0
    return re.sub(r"((?:https?://)|(?:(?<=['"+'"'+r"])//))([A-Za-z0-9.-]+)(?=[/?'"+'"'+r"]|$)",mFunc,text,count) # http:// https:// or "// in scripts (but TODO: it won't pick up things like host="www.example.com"; return "https://"+host, also what about embedded IPv6 addresses i.e. \[[0-9a-fA-F:]*\] in place of hostnames (and what should we rewrite them to?)  Hopefully IPv6-embedding is rare as such sites wouldn't be usable by IPv4-only users (although somebody might have IPv6-specific versions of their pages/servers); if making Web Adjuster IPv6 ready, also need to check all instances of using ':' to split host from port as this won't be the case if host is '[' + IPv6 + ']'.  Splitting off hostname from protocol is more common though, e.g. used in Google advertising iframes 2017-06)
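# Illustration (assuming host_suffix=adjuster.example.org with wildcard_dns on,
# publicPort 80 and no default_site - illustrative settings, not necessarily defaults):
#   http://www.example.com/page  -> http://www-example-com.adjuster.example.org/page
#   https://www.example.com/page -> http://www-example-com-0.adjuster.example.org/page
# (the trailing -0 being the dedot()'d form of the ".0" HTTPS hack described above)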

def cookie_domain_process(text,cookieHost=None):
    start=0
    while True:
        i = text.lower().find("; domain=",start)
        if i==-1: break
        i += len("; domain=")
        if text[i]=='.': i += 1 # leading . on the cookie (TODO: what if we're not wildcard_dns?)
        j = i
        while j<len(text) and not text[j]==';': j += 1
        newhost = convert_to_requested_host(text[i:j],cookieHost)
        if ':' in newhost: newhost=newhost[:newhost.index(':')] # apparently you don't put the port number, see comment in authenticates_ok
        if newhost==text[i:j] and cookieHost and cookieHost.endswith(text[i:j]): newhost = convert_to_requested_host(cookieHost,cookieHost) # cookie set server.example.org instead of www.server.example.org; we can deal with that
        text = text[:i] + newhost + text[j:]
        j=i+len(newhost)
        start = j
    return text
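# Under the same illustrative settings as above, a response header like
#   Set-Cookie: sess=1; domain=.example.com; path=/
# would come out with  ; domain=.example-com.adjuster.example.org  (leading dot kept)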

def can_do_cookie_host():
    return "" in options.default_site.split("/")

def url_is_ours(url,cookieHost="cookie-host\n"):
    # check if url has been through domain_process
    if not url.startswith("http://"): return False
    url=url[len("http://"):]
    if '/' in url:
        url,rest=url.split('/',1)
        rest = '/'+rest
    else: rest = ""
    if '?' in url:
        url,r2=url.split('?',1)
        rest = '?'+r2+rest
    rh = convert_to_real_host(url,cookieHost)
    if rh and type(rh)==type("") and not rh==url:
        # (exact value is used by RewriteExternalLinks)
        if rh.endswith(".0"): r="https://"+rh[:-2]
        else: r="http://"+rh
        return r + rest
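# e.g. under the illustrative settings above,
# url_is_ours("http://www-example-com.adjuster.example.org/page") would give back
# "http://www.example.com/page", while a URL whose host is not one of ours
# returns a false value.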

# --------------------------------------------------
# Reading configuration files etc
# --------------------------------------------------

def changeConfigDirectory(fname):
    fdir,ffile = os.path.split(fname)
    def tryDir(d):
        d2 = d
        if d2 and not d2.endswith(os.sep): d2 += os.sep
        if os.path.isfile(d2+fname):
            if d: os.chdir(d)
            if fdir: os.chdir(fdir)
            return True # found it
    if tryDir("") or not (os.sep in sys.argv[0] or (os.sep=='\\' and '/' in sys.argv[0])): return ffile
    if os.sep=="\\" and '/' in sys.argv[0] and os.path.isfile(sys.argv[0].replace('/','\\')): sys.argv[0]=sys.argv[0].replace('/','\\') # hack for some Windows Python builds accepting slash in command line but reporting os.sep as backslash
    if tryDir(sys.argv[0][:sys.argv[0].rfind(os.sep)]):
        return ffile
    return fname

def errExit(msg):
    # Exit with an error message BEFORE server start
    # usually due to a configuration problem
    try:
        if not istty(): logging.error(msg)
        # in case run from crontab w/out output (and e.g. PATH not set properly)
        # (but don't do this if not options.background, as log_to_stderr is likely True and it'll be more cluttered than the simple sys.stderr.write below)
    except: pass # options or logging not configured yet
    sys.stderr.write(msg+"\n")
def warn(msg):
    msg = "WARNING: "+msg
    try:
        if not istty(): logging.error(msg)
    except: pass
    sys.stderr.write(msg+"\n\n")

def parse_command_line(final):
  try:
    if len(tornado.options.parse_command_line.func_defaults)==1: # Tornado 2.x
        rest = tornado.options.parse_command_line()
    else:
        rest=tornado.options.parse_command_line(final=final)
    if rest: errExit("Unrecognised command-line argument '%s'" % rest[0]) # maybe they missed a '--' at the start of an option: don't want result to be ignored without anyone noticing
  except tornado.options.Error,e: optErr(e.message)
def optErr(m):
    if "PhantomJS" in m: m += " (try --js_interpreter=PhantomJS instead?)" # old option was --PhantomJS
    errExit(m)
def parse_config_file(cfg):
  try:
    check_config_file(cfg)
    if not tornado.options.parse_config_file.func_defaults: # Tornado 2.x
        tornado.options.parse_config_file(cfg)
    else: tornado.options.parse_config_file(cfg,final=False)
  except tornado.options.Error,e: optErr(e.message)
def check_config_file(cfg):
    # (why doesn't Tornado do this by default?  catch
    # capitalisation and spelling errors etc)
    try:
        options = tornado.options.options._options
        from tornado.util import exec_in
    except: return
    d = {} ; exec_in(open(cfg,'rb').read(),d,d)
    for k in d.keys():
        if not k in options and not k.replace('_','-') in options and type(d[k]) in [str,unicode,list,bool,int]: # (allow functions etc)
            errExit("Unrecognised global '%s' in configuration file '%s'" % (k,cfg))
def readOptions():
    # Reads options from command line and/or config files
    parse_command_line(final=False)
    configsDone = [] ; cDir = []
    if not options.config: options.config=os.environ.get("ADJUSTER_CFG","") # must do HERE rather than setting default= in the define() call, or options.config=None below might not work
    while options.config and (options.config,os.getcwd()) not in configsDone:
        config = options.config ; options.config=None
        oldDir = os.getcwd()
        config2 = changeConfigDirectory(config)
        try: open(config2)
        except: errExit("Cannot open configuration file %s (current directory is %s)" % (config2,os.getcwd()))
        parse_config_file(config2)
        configsDone.append((config,oldDir))
        cDir.append(os.getcwd())
    configsDone.reverse() # we want config= within a config file to mean the outermost config overrides anything set in the innermost config, so read them in reverse order:
    for (config,_),cd in zip(configsDone,cDir):
        os.chdir(cd) ; parse_config_file(config)
    parse_command_line(True) # need to do this again to ensure logging is set up for the *current* directory (after any chdir's while reading config files) + ensure command-line options override config files
def preprocessOptions():
    if hasattr(signal,"SIGUSR1") and not wsgi_mode:
        signal.signal(signal.SIGUSR1, toggleLogDebug)
        if hasattr(signal,"SIGUSR2"):
            signal.signal(signal.SIGUSR2, requestStatusDump)
    if options.version: errExit("--version is for the command line only, not for config files") # to save confusion.  (If it were on the command line, we wouldn't get here: we process it before loading Tornado.  TODO: if they DO try to put it in a config file, they might set some type other than string and get a less clear error message from tornado.options.)
    if options.one_request_only:
        if options.multicore or options.fasterServer or options.whois or options.own_server or options.ssh_proxy: errExit("--one-request-only is not compatible with multicore, fasterServer, whois, own_server or ssh_proxy") # (TODO: it could be MADE compatible with fasterServer, whois, etc, but that would need more work.  watchdog works in theory but is inadvisable unless you're running this in some kind of loop)
        if (options.pdftotext or options.epubtotext or options.epubtozip) and (options.pdfepubkeep or options.waitpage):
            warn("pdfepubkeep and waitpage won't work with --one-request-only: clearing them")
            options.pdfepubkeep = options.waitpage = False
        if options.js_interpreter and not options.js_instances==1: errExit("--one-request-only doesn't make sense with a js_instances value other than 1") # (well we could start N instances if you like, but what's the point? - this probably indicates 'wrong config= option' or something, so flag it)
    if options.restart and options.watchdog and options.watchdogDevice=="/dev/watchdog" and options.user and os.getuid(): errExit("This configuration looks like it should be run as root.") # if the process we're restarting has the watchdog open, and the watchdog is writable only by root (which is probably at least one of the reasons why options.user is set), there's no guarantee that stopping that other process will properly terminate the watchdog, and we won't be able to take over, = sudden reboot
    if options.host_suffix==getfqdn_default: options.host_suffix = socket.getfqdn()
    if type(options.mailtoSMS)==type(""): options.mailtoSMS=options.mailtoSMS.split(',')
    if type(options.leaveTags)==type(""): options.leaveTags=options.leaveTags.split(',')
    if type(options.stripTags)==type(""): options.stripTags=options.stripTags.split(',')
    if options.render:
        try: import PIL
        except ImportError: errExit("render requires PIL")
    global force_htmlonly_mode
    if options.urlboxPath.startswith("*"):
        options.urlboxPath = options.urlboxPath[1:]
        force_htmlonly_mode = True
    else: force_htmlonly_mode = False
    if not options.urlboxPath.startswith("/"): options.urlboxPath = "/" + options.urlboxPath
    if options.stdio:
        if options.background: errExit("stdio is not compatible with background")
        if not options.port: errExit("stdio requires a port to be listening (haven't yet implemented processing a request on stdio without a port to forward it to; you could try --just-me etc in the meantime)")
    global tornado
    if options.js_interpreter:
      if options.js_instances < 1: errExit("js_interpreter requires positive js_instances")
      global webdriver
      try: from selenium import webdriver
      except: errExit("js_interpreter requires selenium")
      if not options.js_interpreter in ["PhantomJS","HeadlessChrome","HeadlessFirefox"]: errExit("js_interpreter (if set) must be PhantomJS, HeadlessChrome or HeadlessFirefox")
      if not multiprocessing: options.js_multiprocess = False
      if options.js_429 and options.multicore:
        if int(tornado.version.split('.')[0]) > 4: errExit("js_429 with multicore not yet working on Tornado versions above 4.\nTornado "+tornado.version+" detected.\nPlease downgrade to 4.x, e.g.: pip install tornado==4.5.3 --upgrade")
        elif not multiprocessing: errExit("js_429 with multicore requires the multiprocessing module to be available (Python 2.6+)")
    elif options.js_upstream: errExit("js_upstream requires a js_interpreter to be set")
    if options.js_timeout2 <= options.js_timeout1: errExit("js_timeout2 must be greater than js_timeout1")
    assert not (options.js_upstream and set_window_onerror), "Must have set_window_onerror==False when using options.js_upstream"
    create_inRenderRange_function(options.renderRange)
    if type(options.renderOmit)==type(""): options.renderOmit=options.renderOmit.split(',')
    if type(options.cacheOmit)==type(""): options.cacheOmit=options.cacheOmit.split(',')
    if options.renderOmitGoAway:
        if options.renderCheck: errExit("Setting both renderOmitGoAway and renderCheck is not yet implemented (renderOmitGoAway assumes all testing is done by renderOmit only).  Please unset either renderOmitGoAway or renderCheck.")
        options.renderName = "" # so it can't be switched on/off (because there's not a lot of point in switching it off if we're renderOmitGoAway; TODO: document this behaviour?)
    if type(options.deleteOmit)==type(""): options.deleteOmit=options.deleteOmit.split(',')
    if type(options.cssName)==type(""): options.cssName=options.cssName.replace('"',"&quot;") # for embedding in JS
    if type(options.cssNameReload)==type(""): options.cssNameReload=options.cssNameReload.split(',')
    if type(options.search_sites)==type(""): options.search_sites=options.search_sites.split(',')
    if type(options.ipNoLog)==type(""): options.ipNoLog=options.ipNoLog.split(',')
    if type(options.delete)==type(""): options.delete=options.delete.split(',')
    if type(options.delete_css)==type(""): options.delete_css=options.delete_css.split(',')
    if type(options.prohibit)==type(""): options.prohibit=options.prohibit.split(',')
    if type(options.skipLinkCheck)==type(""): options.skipLinkCheck=options.skipLinkCheck.split(',')
    global viaName,serverName,serverName_html
    viaName = program_name[:program_name.index("(c)")].strip() # Web Adjuster vN.NN
    if options.machineName: serverName = viaName + " on "+options.machineName
    else: serverName = viaName
    serverName_html = re.sub(r"([0-9])([0-9])",r"\1<span></span>\2",serverName) # stop mobile browsers interpreting the version number as a telephone number
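    # (illustration: "v0.273" would become "v0.2<span></span>73" - adjacent digit pairs are broken up)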
    global upstream_proxy_host, upstream_proxy_port
    upstream_proxy_host = upstream_proxy_port = None
    global upstream_rewrite_ssl ; upstream_rewrite_ssl=False
    global cores ; cores = 1
    if options.multicore:
        options.squashLogs = False
        if not 'linux' in sys.platform: errExit("multicore option not supported on this platform") # it does work on BSD/Mac, but some incoming connections get 'lost' so it's not a good idea
        import tornado.process
        cores = tornado.process.cpu_count()
        if cores==1: options.multicore = False
        elif options.js_interpreter and options.js_instances % cores:
            old = options.js_instances
            options.js_instances += (cores - (options.js_instances % cores))
            sys.stderr.write("multicore: changing js_instances %d -> %d (%d per core x %d cores)\n" % (old,options.js_instances,options.js_instances/cores,cores))
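            # (illustration: js_instances=5 on a 4-core machine would be rounded up to 8, i.e. 2 per core)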
    if options.js_interpreter=="HeadlessChrome":
        try: maxI=int(open("/proc/sys/fs/inotify/max_user_instances").read()) # Linux only
        except: maxI = -1
        if not maxI==-1 and options.js_instances > maxI*20: warn("This system might run out of inotify instances with that number of Headless Chrome processes.  Try:\nsudo sysctl -n -w fs.inotify.max_user_watches=%d\nsudo sysctl -n -w fs.inotify.max_user_instances=%d" % (options.js_instances*40,options.js_instances*20))
    global js_per_core
    js_per_core = options.js_instances/cores
    if options.upstream_proxy:
        maxCurls = 30*js_per_core
        if options.ssl_fork: maxCurls /= 2
        if not options.usepycurl: errExit("upstream_proxy is not compatible with --usepycurl=False")
        setupCurl(maxCurls,"upstream_proxy requires pycurl (try sudo pip install pycurl)")
        if not ':' in options.upstream_proxy: options.upstream_proxy += ":80"
        upstream_proxy_host,upstream_proxy_port = options.upstream_proxy.split(':') # TODO: IPv6 ?
        if not upstream_proxy_host:
            upstream_proxy_host = "127.0.0.1"
            if wsgi_mode: warn("Can't do SSL-rewrite for upstream proxy when in WSGI mode")
            else: upstream_rewrite_ssl = True
        upstream_proxy_port = int(upstream_proxy_port)
    elif options.usepycurl and not options.submitPath=='/': setupCurl(3*js_per_core) # and no error if not there
    global codeChanges ; codeChanges = []
    if options.codeChanges:
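      # codeChanges is parsed in groups of three non-blank, non-comment lines;
      # each group becomes one entry in the codeChanges list (see --help for what the three lines mean)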
      ccLines = [x for x in [x.strip() for x in options.codeChanges.split("\n")] if x and not x.startswith("#")]
      while ccLines:
        if len(ccLines)<3: errExit("codeChanges must be a multiple of 3 lines (see --help)")
        codeChanges.append(tuple(ccLines[:3]))
        ccLines = ccLines[3:]
    if options.real_proxy:
        options.open_proxy=True
        if options.browser and "lynx" in options.browser and not "I_PROMISE_NOT_TO_LYNX_DUMP_SSL" in os.environ and not "-stdin" in options.browser and ("-dump" in options.browser or "-source" in options.browser or "-mime_header" in options.browser): errExit("Don't do that.  If Lynx wants to ask you about our self-signed certificates, it'll assume the answer is No when running non-interactively, and this will cause it to fetch the page directly (not via our proxy) which could confuse you into thinking the adjuster's not working.  If you know what you're doing, put I_PROMISE_NOT_TO_LYNX_DUMP_SSL in the environment to suppress this message (but if using js_interpreter beware of redirect to SSL).  Or you can use wget --no-check-certificate -O - | lynx -dump -stdin") # TODO: could we configure Lynx to always accept when running non-interactively?
    if options.htmlFilter and '#' in options.htmlFilter and not len(options.htmlFilter.split('#'))+1 == len(options.htmlFilterName.split('#')): errExit("Wrong number of #s in htmlFilterName for this htmlFilter setting")
    if options.port == -1:
        if wsgi_mode:
            warn("port=-1 won't work in WSGI mode, assuming 80")
            options.port = 80
        elif options.ssl_fork or options.background: errExit("Can't run in background or ssl-fork with an ephemeral main port, as that requires fork-before-listen so won't be able to report the allocated port number")
        else:
            port_randomise[options.port] = True
            if not options.internalPort:
                # DON'T set it to -1 + 1 = 0
                options.internalPort = 1024
    elif options.port < 0 or options.port > 65535:
        errExit("port out of range")
    elif not options.port:
        if wsgi_mode:
            warn("port=0 won't work in WSGI mode, assuming 80")
            options.port = 80
        else:
            options.real_proxy=options.js_reproxy=False ; options.fasterServer=""
            options.open_proxy = True # bypass the check
    if not options.publicPort:
        options.publicPort = options.port
    if not options.internalPort:
        options.internalPort = options.port + 1
    if options.internalPort in [options.publicPort,options.port]: errExit("--internalPort cannot match --port or --publicPort")
    if options.just_me:
        options.address = "localhost"
        try: socket.socket().connect(('localhost',113))
        except: errExit("--just_me requires an ident server to be running on port 113")
        import getpass ; global myUsername ; myUsername = getpass.getuser()
    elif not options.password and not options.open_proxy and not options.submitPath=='/' and not options.stop: errExit("Please set a password (or --just_me), or use --open_proxy.\n(Try --help for help; did you forget a --config=file?)") # (as a special case, if submitPath=/ then we're serving nothing but submit-your-own-text and bookmarklets, which means we won't be proxying anything anyway and don't need this check)
    if options.submitBookmarkletDomain and not options.publicPort==80: warn("You will need to run another copy on "+options.submitBookmarkletDomain+" ports 80/443 for bookmarklets to work (submitBookmarkletDomain without publicPort=80)")
    if options.pdftotext and not "pdftotext version" in os.popen4("pdftotext -h")[1].read(): errExit("pdftotext command does not seem to be usable\nPlease install it, or unset the pdftotext option")
    if options.epubtotext and not "calibre" in os.popen4("ebook-convert -h")[1].read(): errExit("ebook-convert command does not seem to be usable\nPlease install calibre, or unset the epubtotext option")
    global extensions
    if options.extensions:
        extensions = __import__(options.extensions)
    else:
        class E:
            def handle(*args): return False
        extensions = E()
    global ownServer_regexp
    if options.ownServer_regexp:
        if not options.own_server: errExit("Cannot set ownServer_regexp if own_server is not set")
        ownServer_regexp = re.compile(options.ownServer_regexp)
    else: ownServer_regexp = None
    global ipMatchingFunc
    if options.ip_messages: ipMatchingFunc=ipv4ranges_func(options.ip_messages)
    else: ipMatchingFunc = None
    global submitPathIgnorePassword, submitPathForTest
    if options.submitPath and options.submitPath.startswith('*'):
        submitPathIgnorePassword = True
        options.submitPath = options.submitPath[1:]
    else: submitPathIgnorePassword = False
    submitPathForTest = options.submitPath
    if submitPathForTest and submitPathForTest[-1]=="?": submitPathForTest = submitPathForTest[:-1] # for CGI mode: putting the ? in tells adjuster to ADD a ? before any parameters, but does not require it to be there for the base submit URL (but don't do this if not submitPathForTest because it might not be a string)
    if options.submitPath and not options.htmlText: errExit("submitPath only really makes sense if htmlText is set (or do you want users to submit actual HTML?)") # TODO: allow this? also with submitBookmarklet ??
    if options.separator and '\xe2\x80\x8b' in options.separator: errExit("U+200B in separator not supported (see code)")
    if options.prominentNotice=="htmlFilter":
        if not options.htmlFilter: errExit("prominentNotice=\"htmlFilter\" requires htmlFilter to be set")
        if options.htmlJson or options.htmlText: errExit("prominentNotice=\"htmlFilter\" does not work with the htmlJson or htmlText options")
    if not (options.submitPath and options.htmlFilter): options.submitBookmarklet = False # TODO: bookmarklet for character rendering? (as an additional bookmarklet if there are filters as well, and update submitBookmarklet help text) although it's rare to find a machine that lacks fonts but has a bookmarklet-capable browser
    if options.submitBookmarklet and '_IHQ_' in options.submitPath: errExit("For implementation reasons, you cannot have the string _IHQ_ in submitPath when submitBookmarklet is on.") # Sorry.  See TODO in 'def bookmarklet'
    global upstreamGuard, cRecogniseAny, cRecognise1
    upstreamGuard = set() ; cRecogniseAny = set() ; cRecognise1 = set() # cRecognise = cookies to NOT clear at url box when serving via adjust_domain_cookieName; upstreamGuard = cookies to not pass to upstream (and possibly rename if upstream sets them)
    if options.password:
        upstreamGuard.add(password_cookie_name)
        cRecogniseAny.add(password_cookie_name)
    if options.cssName:
        upstreamGuard.add("adjustCssSwitch")
        cRecognise1.add("adjustCssSwitch")
    if options.htmlFilterName:
        upstreamGuard.add("adjustNoFilter")
        cRecognise1.add("adjustNoFilter")
    if options.renderName:
        upstreamGuard.add("adjustNoRender")
        cRecognise1.add("adjustNoRender")
    if options.prominentNotice:
        upstreamGuard.add("_WA_warnOK")
        cRecognise1.add("_WA_warnOK")
    if options.htmlonly_mode:
        upstreamGuard.add(htmlmode_cookie_name)
        cRecognise1.add(htmlmode_cookie_name)
    if options.ip_messages:
        upstreamGuard.add(seen_ipMessage_cookieName)
        cRecognise1.add(seen_ipMessage_cookieName)
    h = options.headAppendCSS
    if h and '%s' in h:
        if not ';' in h: errExit("If putting %s in headAppendCSS, must also put ; with options (please read the help text)")
        if options.default_site: errExit("Cannot set default_site when headAppendCSS contains options, because we need the URL box to show those options") # TODO: unless we implement some kind of inline setting, or special options URL ?
        if options.cssHtmlAttrs and ';' in options.cssHtmlAttrs and not len(options.cssHtmlAttrs.split(';'))==len(h.rsplit(';',1)[1].split(',')): errExit("Number of choices in headAppendCSS last field does not match number of choices in cssHtmlAttrs")
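        # (hypothetical illustration: headAppendCSS="http://example.org/style-%s-%s.css;small,large;red,blue"
        # has two option fields after the URL part, so cookies adjustCss0s and adjustCss1s
        # are registered below, presumably to remember the user's choices)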
        for n in range(len(h.split(';'))-1):
            upstreamGuard.add("adjustCss"+str(n)+"s")
            cRecogniseAny.add("adjustCss"+str(n)+"s")
    if options.useLXML: check_LXML()
    global allowConnectHost,allowConnectPort,allowConnectURL
    allowConnectHost=allowConnectPort=allowConnectURL=None
    if options.ssh_proxy:
        if ',' in options.ssh_proxy: sp,allowConnectURL = options.ssh_proxy.split(',',1)
        else: sp = options.ssh_proxy
        if ':' in sp: allowConnectHost,allowConnectPort=sp.rsplit(':',1)
        else: allowConnectHost,allowConnectPort = sp,"22"
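        # (hypothetical illustration: --ssh_proxy=gateway.example.org:2222,https://gateway.example.org/
        # gives allowConnectHost="gateway.example.org", allowConnectPort="2222" and
        # allowConnectURL="https://gateway.example.org/"; with no ":port" the port defaults to 22)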
    if not options.default_site: options.default_site = ""
    # (so we can .split it even if it's None or something)
    if not options.js_interpreter:
        options.js_reproxy=options.js_frames=False
    elif not options.htmlonly_mode: errExit("js_interpreter requires htmlonly_mode")

def check_injected_globals():
    # for making sure we're used correctly when imported
    # as a module by a wrapper script
    try: defined_globals
    except: return
    for s in set(globals().keys()).difference(defined_globals):
        if s in options: errExit("Error: adjuster.%s should be adjuster.options.%s" % (s,s)) # (tell them off, don't try to patch up: this could go more subtly wrong if they do it again with something we happened to have defined in our module before)
        elif type(eval(s)) in [str,bool,int]: errExit("Don't understand injected %s %s (misspelled option?)" % (repr(type(eval(s))),s))
def setup_defined_globals(): # see above
    global defined_globals
    defined_globals = True # so included in itself
    defined_globals = set(globals().keys())

# --------------------------------------------------
# Logging and busy-signalling (especially multicore)
# --------------------------------------------------

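# CrossProcessLogging: when we are going to fork (multicore, ssl_fork, or js_interpreter
# with js_multiprocess) and are logging to files, child processes push their log records
# onto a multiprocessing.Queue and a single listener process writes them out, so the
# --log-file-prefix output isn't garbled by several processes writing at once.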
class CrossProcessLogging(logging.Handler):
    def needed(self): return (options.multicore or options.ssl_fork or (options.js_interpreter and options.js_multiprocess)) and options.log_file_prefix # (not needed if stderr-only or if won't fork)
    def init(self):
        "Called by initLogging before forks.  Starts the separate logListener process."
        if not self.needed(): return
        try: logging.getLogger().handlers
        except: errExit("The logging module on this system is not suitable for --log-file-prefix with --ssl-fork or --js-multiprocess") # because we won't know how to clear its handlers and start again in the child processes
        if not multiprocessing: return # we'll have to open multiple files in initChild instead
        self.loggingQ=multiprocessing.Queue()
        def logListener():
          try:
            while True: logging.getLogger().handle(logging.makeLogRecord(self.loggingQ.get()))
          except KeyboardInterrupt: pass
        self.p = multiprocessing.Process(target=logListener) ; self.p.start()
        logging.getLogger().handlers = [] # clear what Tornado has already put in place when it read the configuration
        logging.getLogger().addHandler(self)
    def initChild(self,toAppend=""):
        "Called after a fork.  toAppend helps to describe the child for logfile naming when multiprocessing is not available."
        if not options.log_file_prefix: return # stderr is OK
        if multiprocessing:
            try: multiprocessing.process.current_process()._children.clear() # so it doesn't try to join() to children it doesn't have (multiprocessing wasn't really designed for the parent to fork() outside of multiprocessing later on)
            except: pass # probably wrong version
            return # should be OK now
        logging.getLogger().handlers = [] # clear Tornado's
        if toAppend: options.log_file_prefix += "-"+toAppend
        else: options.log_file_prefix += "-"+str(os.getpid())
        # and get Tornado to (re-)initialise logging with these parameters:
        if hasattr(tornado.options,"enable_pretty_logging"): tornado.options.enable_pretty_logging() # Tornado 2
        else: # Tornado 4
            import tornado.log
            tornado.log.enable_pretty_logging()
    def shutdown(self):
        try: self.p.terminate() # in case KeyboardInterrupt hasn't already stopped it
        except: pass
    def emit(self, record): # simplified from Python 3.2 (but put just the dictionary, not the record obj itself, to make pickling errors less likely)
        try:
            if record.exc_info:
                dummy = self.format(record) # record.exc_text
                record.exc_info = None
            d = record.__dict__
            d['msg'],d['args'] = record.getMessage(),None
            self.loggingQ.put(d)
        except (KeyboardInterrupt, SystemExit): raise
        except: self.handleError(record)

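# CrossProcess429: used when multicore and js_429 are both set.  Each core reports its
# busy status onto a shared Queue; a listener thread tracks these and, when every core
# becomes busy (or when one frees up again), schedules reallyPauseOrRestartMainServer
# on the IOLoop so new requests can be answered with 429 responses.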
class CrossProcess429:
    def needed(self): return options.multicore and options.js_429
    def init(self): self.q = multiprocessing.Queue()
    def startThread(self):
        if not self.needed(): return
        self.b = [False]*cores
        def listener():
            allServersBusy = False
            while True:
                coreToSet, busyStatus = self.q.get()
                self.b[coreToSet] = busyStatus
                newASB = all(self.b)
                if not newASB == allServersBusy:
                    allServersBusy = newASB
                    if allServersBusy: IOLoop.instance().add_callback(lambda *args:reallyPauseOrRestartMainServer(True)) # run it just to serve the 429s, but don't set mainServerPaused=False or add an event to the queue
                    else: IOLoop.instance().add_callback(lambda *args:reallyPauseOrRestartMainServer("IfNotPaused")) # stop it if and only if it hasn't been restarted by the main thread before this callback
        threading.Thread(target=listener,args=()).start()

def initLogging(): # MUST be after unixfork() if background
    global CrossProcessLogging
    CrossProcessLogging = CrossProcessLogging()
    CrossProcessLogging.init()

def init429():
    global CrossProcess429
    CrossProcess429 = CrossProcess429()
    if CrossProcess429.needed(): CrossProcess429.init()
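# (note: initLogging and init429 rebind the CrossProcessLogging and CrossProcess429
# class names to singleton instances, so the rest of the code calls methods on those
# instances directly)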

# --------------------------------------------------
# Profiling and process naming
# --------------------------------------------------

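# When --profile is set, a cProfile session is re-started every `profile' seconds;
# if anything happened in the interval, a summary of the busiest calls (limited to
# profile_lines of output) plus js_interpreter queue statistics is reported.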
profile_forks_too = False # TODO: configurable
def open_profile():
    if options.profile:
        global cProfile,pstats,cStringIO,profileIdle
        import cProfile, pstats, cStringIO
        setProfile() ; profileIdle = False
        global reqsInFlight,origReqInFlight
        reqsInFlight = set() ; origReqInFlight = set()
def open_profile_pjsOnly(): # TODO: combine with above
    if options.profile:
        global profileIdle
        setProfile_pjsOnly() ; profileIdle = False
        global reqsInFlight,origReqInFlight
        reqsInFlight = set() ; origReqInFlight = set()
def setProfile():
    global theProfiler, profileIdle
    theProfiler = cProfile.Profile()
    IOLoop.instance().add_timeout(time.time()+options.profile,lambda *args:pollProfile())
    profileIdle = True ; theProfiler.enable()
def setProfile_pjsOnly():
    IOLoop.instance().add_timeout(time.time()+options.profile,lambda *args:pollProfile_pjsOnly())
    global profileIdle ; profileIdle = True
def pollProfile():
    theProfiler.disable()
    if not profileIdle: showProfile()
    setProfile()
def pollProfile_pjsOnly():
    if not profileIdle: showProfile(pjsOnly=True)
    setProfile_pjsOnly()
def showProfile(pjsOnly=False):
    global _doneShowProfile
    try: _doneShowProfile
    except: _doneShowProfile = False
    if pjsOnly: pr = ""
    else:
        s = cStringIO.StringIO()
        pstats.Stats(theProfiler,stream=s).sort_stats('cumulative').print_stats()
        pr = "\n".join([x for x in s.getvalue().split("\n") if x and not "Ordered by" in x][:options.profile_lines])
    if options.js_interpreter and len(webdriver_runner):
        global webdriver_lambda,webdriver_mu,webdriver_maxBusy
        stillUsed = sum(1 for i in webdriver_runner if i.wd_threadStart)
        maybeStuck = set()
        for i in webdriver_runner:
            ms,tr = i.maybe_stuck,i.wd_threadStart
            if ms and ms == tr and tr+30 < time.time():
                maybeStuck.add(ms)
            i.maybe_stuck = tr
        webdriver_maxBusy = max(webdriver_maxBusy,stillUsed)
        if pr: pr += "\n"
        elif not options.background: pr += ": "
        pr += "js_interpreter"
        if options.multicore: pr += "%d" % (webdriver_runner[0].start/js_per_core,)
        pr += " "
        if not webdriver_maxBusy: pr += "idle"
        else:
            try: # NameError unless js_429 and multicore
                if mainServerPaused: pr += "closed, "
                else: pr += "open, "
            except NameError: pass
            served = "%d served" % webdriver_mu
            if webdriver_lambda==webdriver_mu==len(webdriver_queue)==0: queue = "" # "; queue unused"
            elif not webdriver_queue: queue="; queue empty: "+served
            else: queue = "; queue %d: %d arrived, %s" % (len(webdriver_queue),webdriver_lambda,served)
            if not _doneShowProfile:
                if pjsOnly: stuck = ", next SIGUSR2 checks stuck;"
                else: stuck = ";"
            elif maybeStuck:
                stuck = ", %d stuck for " % len(maybeStuck)