From f2356dd58cbb257afe5ef3af54a09dfde589d484 Mon Sep 17 00:00:00 2001 From: "Silas S. Brown" <ssb22@cam.ac.uk> Date: Mon, 24 Feb 2014 17:50:23 +0000 Subject: [PATCH] Web Adjuster 0.181 (an old version I happened to have 'knocking around' from August 2013 which would probably be better off in SVN than in my home directory) git-svn-id: http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster@1794 29193198-4895-4776-b068-10539e920549 --- adjuster.py | 1173 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 878 insertions(+), 295 deletions(-) diff --git a/adjuster.py b/adjuster.py index fa790fa..ac9a03b 100755 --- a/adjuster.py +++ b/adjuster.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -program_name = "Web Adjuster v0.1684 (c) 2012-13 Silas S. Brown" +program_name = "Web Adjuster v0.181 (c) 2012-13 Silas S. Brown" # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,20 +53,56 @@ define("config",help="Name of the configuration file to read, if any. The proces heading("Network listening and security settings") define("port",default=28080,help="The port to listen on. Setting this to 80 will make it the main Web server on the machine (which will likely require root access on Unix).") +define("publicPort",default=0,help="The port to advertise in URLs etc, if different from 'port' (the default of 0 means no difference). Used for example if a firewall prevents direct access to our port but some other server has been configured to forward incoming connections.") define("user",help="The user name to run as, instead of root. This is for Unix machines where port is less than 1024 (e.g. port=80) - you can run as root to open the privileged port, and then drop privileges. Not needed if you are running as an ordinary user.") define("address",default="",help="The address to listen on. If unset, will listen on all IP addresses of the machine. 
You could for example set this to localhost if you want only connections from the local machine to be received, which might be useful in conjunction with real_proxy.") define("password",help="The password. If this is set, nobody can connect without specifying ?p= followed by this password. It will then be sent to them as a cookie so they don't have to enter it every time. Notes: (1) If wildcard_dns is False and you have multiple domains in host_suffix, then the password cookie will have to be set on a per-domain basis. (2) On a shared server you probably don't want to specify this on the command line where it can be seen by process-viewing tools; use a configuration file instead.") -define("password_domain",help="The domain entry in host_suffix to which the password applies. For use when wildcard_dns is False and you have several domains in host_suffix, and only one of them (perhaps the one with an empty default_site) is to be password-protected, with the others public. If this option is used then prominentNotice (if set) will not apply to the passworded domain.") # on the assumption that those who know the password understand what the tool is +define("password_domain",help="The domain entry in host_suffix to which the password applies. For use when wildcard_dns is False and you have several domains in host_suffix, and only one of them (perhaps the one with an empty default_site) is to be password-protected, with the others public. If this option is used then prominentNotice (if set) will not apply to the passworded domain. You may put the password on two or more domains by separating them with slash (/).") # prominentNotice not apply: on the assumption that those who know the password understand what the tool is define("auth_error",default="Authentication error",help="What to say when password protection is in use and a correct password has not been entered. HTML markup is allowed in this message. 
As a special case, if this begins with http:// then it is assumed to be the address of a Web site to which the browser should be redirected; if it is set to http:// and nothing else, the request will be passed to the server specified by own_server (if set).") # TODO: basic password form? or would that encourage guessing define("open_proxy",default=False,help="Whether or not to allow running with no password. Off by default as a safeguard against accidentally starting an open proxy.") -define("real_proxy",default=False,help="Whether or not to accept requests in real HTTP 'proxy' format with original domains. Warning: this bypasses the password and implies open_proxy. Off by default.") +define("prohibit",multiple=True,default="wikipedia.*action=edit",help="Comma-separated list of regular expressions specifying URLs that are not allowed to be fetched unless --real-proxy is in effect. Browsers requesting a URL that contains any of these will be redirected to the original site. Use for example if you want people to go direct when posting their own content to a particular site (this is of only limited use if your server also offers access to any other site on the Web, but it might be useful when that's not the case).") +define("real_proxy",default=False,help="Whether or not to accept requests with original domains like a \"real\" HTTP proxy. Warning: this bypasses the password and implies open_proxy. Off by default.") define("via",default=True,help="Whether or not to update the Via: and X-Forwarded-For: HTTP headers when forwarding requests") # (Via is "must" in RFC 2616) define("robots",default=False,help="Whether or not to pass on requests for /robots.txt. If this is False then all robots will be asked not to crawl the site; if True then the original site's robots settings will be mirrored. 
The default of False is recommended.") +define("upstream_proxy",help="address:port of a proxy to send our requests through, such as a caching proxy to reduce load on websites (putting this upstream of the adjuster should save the site from having to re-serve pages when adjuster settings are changed). This proxy (if set) is used for normal requests, but not for ip_query_url options, own_server, fasterServer or HTTPS requests.") # The upstream_proxy option requires pycurl (will refuse to start if not present). Does not set X-Real-Ip because Via should be enough for upstream proxies. + heading("DNS and website settings") -define("host_suffix",default=getfqdn(),help="The last part of the domain name. For example, if the user wishes to change www.example.com and should do so by visiting www.example.com.adjuster.example.org, then host_suffix is adjuster.example.org. If you do not have a wildcard domain then you can still adjust one site by setting wildcard_dns to False, host_suffix to your non-wildcard domain, and default_site to the site you wish to adjust. If you have more than one non-wildcard domain, you can set wildcard_dns to False, host_suffix to all your domains separated by slash (/), and default_site to the sites these correspond to, again separated by slash (/). If wildcard_dns is False and default_site is empty (or if it's a /-separated list and one of its items is empty), then the corresponding host_suffix gives a URL box and sets its domain in a cookie (and adds a link at the bottom of pages to clear this and return to the URL box), but this should be done only as a last resort: you can browse only one domain at a time at that host_suffix (links and HTTP redirects to other domains will leave the adjuster), and the sites you visit at that host_suffix might be able to see some of each other's cookies etc (leaking privacy) although the URL box page will try to clear site cookies.") +define("host_suffix",default=getfqdn(),help="The last part of the domain name. 
For example, if the user wishes to change www.example.com and should do so by visiting www.example.com.adjuster.example.org, then host_suffix is adjuster.example.org. If you do not have a wildcard domain then you can still adjust one site by setting wildcard_dns to False, host_suffix to your non-wildcard domain, and default_site to the site you wish to adjust. If you have more than one non-wildcard domain, you can set wildcard_dns to False, host_suffix to all your domains separated by slash (/), and default_site to the sites these correspond to, again separated by slash (/); if two or more domains share the same default_site then the first is preferred in links and the others are assumed to be for backward compatibility. If wildcard_dns is False and default_site is empty (or if it's a /-separated list and one of its items is empty), then the corresponding host_suffix gives a URL box and sets its domain in a cookie (and adds a link at the bottom of pages to clear this and return to the URL box), but this should be done only as a last resort: you can browse only one domain at a time at that host_suffix (links and HTTP redirects to other domains will leave the adjuster), and the sites you visit at that host_suffix might be able to see some of each other's cookies etc (leaking privacy) although the URL box page will try to clear site cookies.") +# ("preferred" / "backward compatibility" thing: can be useful if old domain has become unreliable, or if "preferred" domain is actually a URL-path-forwarding service with a memorable name which redirects browsers to an actual domain that's less memorable, and you want the memorable domain to be used in links etc, although in this case you might still get the less-memorable domain in the address bar) +# TODO: (two or more domains pointing to the same default_site) "preferred" / "backward compatibility" thing above: or, add an option to periodically check which of our domains are actually 'up' and move them to the front of the 
host_suffix / default_site list; that way we don't have to guess ahead of time which one is more reliable and should be preferred. +# Could also do 'use the currently-requested host if it's appropriate', but what if there's a *set* of sites we adjust and we need to try to rewrite cross-site links to be in the same set of domains as the one the browser is requesting - maybe it's best to leave the "preferred" DNS to the config or the periodic check. +# TODO at lower priority: empty (item in) host_suffix to match ALL (unknown) hosts, including IP hosts and no Host: header. Fetch the corresponding default_site (empty means use cookies), and adjust it USING THE HOST SPECIFIED BY THE BROWSER to rewrite the links. This could be useful if setting up an adjuster with NO domain name (IP only). Could periodically upload our public IP to a separate static website via FTP/SSH/etc in case dynamic DNS is not reliable. But if IP address has to change then all cookies would be 'lost'. Also, if no password is set then IP-based "webserver probes" could cause us to send malicious-looking traffic to default_site. +# TODO: Could do different hosts on different ports, which might also be useful if you have a domain name but only one. Would have to check for cookie sharing (or just say "do this only if you don't mind it"); fasterServer would have to forward to same as incoming port. Might be a problem if some users' firewalls disallow outgoing Web traffic to non-standard ports. +# (In the current code, setting host_suffix to a public IP address should work: most browsers set Host: to the IP if requesting a URL by IP, and then the IP will be used in rewrites if it's the first thing specified for its corresponding default_site. But adjuster will need to be reconfigured and restarted on every change of the public IP.) define("default_site",help="The site to fetch from if nothing is specified before host_suffix. 
If this is omitted then the user is given a URL box when that happens.") -define("own_server",help="Where to find your own web server. This can be something like localhost:1234 or 192.168.0.2:1234. If it is set, then any request that does not match host_suffix will be passed to that server to deal with, unless real_proxy is in effect. You can use this option as a quick way to put your existing server on the same public port if you don't want to go via nginx or whatever. Note: the password option will NOT password-protect your own_server.") +define("own_server",help="Where to find your own web server. This can be something like localhost:1234 or 192.168.0.2:1234. If it is set, then any request that does not match host_suffix will be passed to that server to deal with, unless real_proxy is in effect. You can use this option to put your existing server on the same public port without much reconfiguration. Note: the password option will NOT password-protect your own_server. (You might gain a little responsiveness if you instead set up nginx or similar to direct incoming requests appropriately; see comments in adjuster.py for example nginx settings.)") +# without much reconfiguration: might just need to change which port number it listens on. +# Alternatively you could set nginx (or similar) to reverse-proxy the host_suffix domains to the adjuster, e.g.: +# location / { +# proxy_set_header X-Real-Ip $remote_addr; +# proxy_set_header Host $host; +# proxy_pass_header Server; +# access_log off; +# proxy_pass http://localhost:<YOUR-ADJUSTER-PORT-HERE>; +# proxy_max_temp_file_size 0; +# proxy_read_timeout 130s; # or whatever; default 60s +# # - may need to be longer, especially if using +# # file conversion with waitpage=False on a +# # low-powered server and there are big files +# } +# inside a "server" block with appropriate server_name(s) +# (and set ipTrustReal to 127.0.0.1 in Adjuster's config, +# and set publicPort to the port nginx runs on e.g. 
80), +# but if you're not already using nginx then you +# have to either port your existing server to nginx or get +# nginx to reverse-proxy for your other server, so for +# small installations it might be simpler just to set +# own_server, unless it's vitally important that +# own_server is not held up in any way when the adjuster +# is under heavy CPU load. + +define("ownServer_regexp",help="If own_server is set, you can set ownServer_regexp to a regular expression to match URL prefixes which should always be handled by your own server even if they match host_suffix. This can be used for example to add extra resources to any site, or to serve additional pages from the same domain, as long as the URLs used are not likely to occur on the sites being adjusted. The regular expression is matched against the requested host and the requested URL, so for example [^/]*/xyz will match any URL starting with /xyz on any host, whereas example.org/xyz will match these on your example.org domain. You can match multiple hosts and URLs by using regular expression grouping.") define("ownServer_if_not_root",default=True,help="When trying to access an empty default_site, if the path requested is not / then redirect to own_server (if set) instead of providing a URL box. If this is False then the URL box will be provided no matter what path was requested.") # TODO: "ownServer even if root" option, i.e. option to make host_suffix by itself go to own_server? Or make ownServer_if_not_root permanent? The logic that deals with off-site Location: redirects assumes the URL box will normally be at / (TODO document this?) define('search_sites',multiple=True,help="Comma-separated list of search sites to be made available when the URL box is displayed (if default_site is empty). Each item in the list should be a URL (which will be prepended to the search query), then a space, then a short description of the site. 
The first item on the list is used by default; the user can specify other items by making the first word of their query equal to the first word of the short description. Additionally, if some of the letters of that first word are in parentheses, the user may specify just those letters. So for example if you have an entry http://search.example.com?q= (e)xample, and the user types 'example test' or 'e test', it will use http://search.example.com?q=test") define("wildcard_dns",default=True,help="Set this to False if you do NOT have a wildcard domain and want to process only default_site. Setting this to False does not actually prevent other sites from being processed (for example, a user could override their local DNS resolver to make up for your lack of wildcard domain); if you want to really prevent other sites from being processed then you could also set own_server to deal with unrecognised domains. Setting wildcard_dns to False does stop the automatic re-writing of links to sites other than default_site. Leave it set to True to have ALL sites' links rewritten on the assumption that you have a wildcard domain.") # will then say "(default True)" @@ -76,29 +112,39 @@ define("default_cookies",help="Semicolon-separated list of name=value cookies to # TODO: sets of adjustments can be switched on and off at a /__settings URL ? or leave it to the injected JS define("headAppend",help="Code to append to the HEAD section of every HTML document that has a BODY. Use for example to add your own stylesheet links and scripts. Not added to documents that lack a BODY such as framesets.") define("headAppendCSS",help="URL of a stylesheet for headAppend. This option automatically generates the LINK REL=... markup for it, and also tries to delete the string '!important' from other stylesheets, to emulate setting this stylesheet as a user CSS.") -define("cssName",help="A name for the stylesheet specified in headAppendCSS, such as \"High Contrast\". 
If cssName is set, then the headAppendCSS stylesheet will be marked as \"alternate\", with Javascript links at the bottom of the page for browsers that lack their own CSS switching options. If cssName is not set (default) then any stylesheet specified in headAppendCSS will be always on.") # TODO: non-Javascript fallback for the switcher -define("cssNameReload",multiple=True,default="IEMobile 6,Opera Mini,rekonq",help="List of (old) browsers that require alternate code for the cssName option, which is slower as it involves reloading the page on CSS switches. Use this if the CSS switcher provided by cssName does nothing on your browser.") # Opera Mini sometimes worked and sometimes didn't; maybe there were regressions at their proxy. JS switcher needs network traffic anyway on Opera Mini so we almost might as well use the non-JS version -define("headAppendRuby",default=False,help="Convenience option which appends CSS and Javascript code to the HEAD that tries to ensure simple RUBY markup displays legibly across all modern browsers; this might be useful if you used Annotator Generator to make the htmlFilter program.") +define("protectedCSS",help="A regular expression matching URLs of stylesheets which are \"protected\" from having their '!important' strings deleted by headAppendCSS's logic. This can be used for example if you are adding scripts to allow the user to choose alternate CSS files in place of headAppendCSS, and you wish the alternate CSS files to have the same status as the one supplied in headAppendCSS.") +define("cssName",help="A name for the stylesheet specified in headAppendCSS, such as \"High Contrast\". If cssName is set, then the headAppendCSS stylesheet will be marked as \"alternate\", with Javascript links at the bottom of the page for browsers that lack their own CSS switching options. 
If cssName begins with a * then the stylesheet is switched on by default; if cssName is not set then the stylesheet (if any) is always on.") +define("cssNameReload",multiple=True,default="IEMobile 6,IEMobile 7,IEMobile 8,Opera Mini,Opera Mobi,rekonq",help="List of (old) browsers that require alternate code for the cssName option, which is slower as it involves reloading the page on CSS switches. Use this if the CSS switcher provided by cssName does nothing on your browser.") # Opera Mini sometimes worked and sometimes didn't; maybe there were regressions at their proxy; JS switcher needs network traffic anyway on Opera Mini so we almost might as well use the non-JS version +# Opera Mobile 10 on WM6.1 is fine with CSS switcher but it needs cssHtmlAttrs, TODO we might be able to have a list of browsers that require cssHtmlAttrs but not cssNameReload, add cssHtmlAttrs only if CSS is selected at time of page load, and make the 'off' switch remove them +# TODO: Opera/9.5 on WM6.1 document.write can corrupt the display with EITHER script; page might also display for some time before the document.writes take effect. Suggest those users upgrade to version 10 (= Opera/9.8) ? +define("cssHtmlAttrs",help="Attributes to add to the BODY element of an HTML document when cssNameReload is in effect. This is for old browsers that try to render the document first and apply CSS later. Example: 'text=\"yellow\" bgcolor=\"black\"' (not as flexible as CSS but can still make the rendering process less annoying)") # e.g. IEMobile 7 on WM 6.1 +define("headAppendRuby",default=False,help="Convenience option which adds CSS and Javascript code to the HTML body that tries to ensure simple RUBY markup displays legibly across all modern browsers; this might be useful if you used Annotator Generator to make the htmlFilter program. 
(The option is named 'head' because it used to add markup to the HEAD; this was moved to the BODY to work around browser bugs.)") # IEMobile 6 drops whitespace after closing tags if document HEAD contains any STYLE element, even an empty one, except via link rel=Stylesheet. Style element works OK if placed at start of body. define("bodyAppend",help="Code to append to the BODY section of every HTML document that has one. Use for example to add a script that needs to be run after the rest of the body has been read, or to add a footer explaining how the page has been modified. See also prominentNotice.") # TODO: note that it will go at the bottom of IFRAMEs also, and suggest using something similar to prominentNotice's iframe-detection code? define("bodyAppendGoesAfter",help="If this is set to some text or HTML code that appears verbatim in the body section, the code in bodyAppend will be inserted after the last instance of this text (case sensitive) instead of at the end of the body. Use for example if a site styles its pages such that the end of the body is not a legible place for a footer.") # (e.g. it would overprint some position=fixed stuff) define("bodyPrepend",help="Code to place at the start of the BODY section of every HTML document that has one.") # May be a useful place to put some scripts. For example, a script that changes a low-vision stylesheet according to screen size might be better in the BODY than in the HEAD, because some Webkit-based browsers do not make screen size available when processing the HEAD of the starting page. # but sometimes it still goes wrong on Chromium startup; probably a race condition; might be worth re-running the script at end of page load just to make sure -define("prominentNotice",help="Text to add as a brief prominent notice to processed sites (may include HTML). 
If the browser has sufficient Javascript support, this will float relative to the browser window and will contain an 'acknowledge' button to hide it (for the current site in the current browsing session). Use prominentNotice if you need to add important information about how the page has been modified.") +define("prominentNotice",help="Text to add as a brief prominent notice to processed sites (may include HTML). If the browser has sufficient Javascript support, this will float relative to the browser window and will contain an 'acknowledge' button to hide it (for the current site in the current browsing session). Use prominentNotice if you need to add important information about how the page has been modified. Note: if you include Javascript document.write() code in prominentNotice, check that document.readyState is not 'complete' or you might find the document is erased on some website/browser combinations when a site script somehow causes your script to be re-run after the document stream is closed. In some rare cases you might also need to verify that document.cookie.indexOf('_WA_warnOK=1')==-1.") # e.g. if the site does funny things with the browser cache. Rewriting the innerHTML manipulation to appendChild doesn't fix the need to check document.readyState define("delete",multiple=True,help="Comma-separated list of regular expressions to delete from HTML documents. Can be used to delete selected items of Javascript and other code if it is causing trouble for your browser. Will also delete from the text of pages; use with caution.") +define("delete_css",multiple=True,help="Comma-separated list of regular expressions to delete from CSS documents (but not inline CSS in HTML); can be used to remove, for example, dimension limits that conflict with annotations you add, as an alternative to inserting CSS overrides.") define("delete_doctype",default=False,help="Delete the DOCTYPE declarations from HTML pages. 
This option is needed to get some old Webkit browsers to apply multiple CSS files consistently.") -define("deleteOmit",multiple=True,default="iPhone,iPad,Android,Macintosh",help="A list of browsers that do not need the delete and delete-doctype options to be applied. If any of these strings occur in the user-agent then these options are disabled for that request, on the assumption that these browsers are capable enough to cope with the \"problem\" code.") +define("deleteOmit",multiple=True,default="iPhone,iPad,Android,Macintosh",help="A list of browsers that do not need the delete and delete-doctype options to be applied. If any of these strings occur in the user-agent then these options are disabled for that request, on the assumption that these browsers are capable enough to cope with the \"problem\" code. Any delete-css option is still applied however.") define("codeChanges",help="Several lines of text specifying changes that are to be made to all HTML and Javascript code files on certain sites; use as a last resort for fixing a site's scripts. This option is best set in the configuration file and surrounded by r\"\"\"...\"\"\". The first line is a URL prefix, the second is a string of code to search for, and the third is a string to replace it with. Further groups of URL/search/replace lines may follow; blank lines and lines starting with # are ignored.") define("viewsource",default=False,help="Provide a \"view source\" option. If set, you can see a page's pre-adjustment source code, plus client and server headers, by adding \".viewsource\" to the end of a URL (after any query parameters etc)") define("htmlonly_mode",default=True,help="Provide a checkbox allowing the user to see pages in \"HTML-only mode\", stripping out most images, scripts and CSS; this might be a useful fallback for very slow connections if a site's pages bring in many external files and the browser cannot pipeline its requests. 
The checkbox is displayed by the URL box, not at the bottom of every page.") # if no pipeline, a slow UPLINK can be a problem, especially if many cookies have to be sent with each request for a js/css/gif/etc. # (and if wildcard_dns=False and we're domain multiplexing, our domain can accumulate a lot of cookies, causing requests to take more uplink bandwidth, TODO: do something about this?) # Above says "most" not "all" because some stripping not finished (see TODO comments) and because some scripts/CSS added by Web Adjuster itself are not stripped +define("mailtoPath",default="/@mail@to@__",help="A location on every adjusted website to put a special redirection page to handle mailto: links, showing the user the contents of the link first (in case a mail client is not set up). This must be made up of URL-safe characters starting with a / and should be a path that is unlikely to occur on normal websites and that does not conflict with renderPath. If this option is empty, mailto: links are not changed. (Currently, only plain HTML mailto: links are changed by this function; Javascript-computed ones are not.)") +define("mailtoSMS",multiple=True,default="Opera Mini,Opera Mobi,Android,Phone,Mobile",help="When using mailtoPath, you can set a comma-separated list of platforms that understand sms: links. If any of these strings occur in the user-agent then an SMS link will be provided on the mailto redirection page.") heading("External processing options") -define("htmlFilter",help="External program to run to filter every HTML document. This can be any shell command; its standard input will get the HTML (or the plain text if htmlText is set), and it should send the new version to standard output. Multiple copies of the program might be run at the same time to serve concurrent requests. UTF-8 character encoding is used.") -define("htmlFilterName",help="A name for the task performed by htmlFilter. 
If this is set, the user will be able to switch it on and off from the browser via a cookie and some Javascript links at the bottom of HTML pages.") # TODO: non-Javascript fallback for the switcher +define("htmlFilter",help="External program(s) to run to filter every HTML document. If more than one program is specified separated by # then the user will be given a choice (see htmlFilterName option). Any shell command can be used; its standard input will get the HTML (or the plain text if htmlText is set), and it should send the new version to standard output. Multiple copies of each program might be run at the same time to serve concurrent requests. UTF-8 character encoding is used.") +define("htmlFilterName",help="A name for the task performed by htmlFilter. If this is set, the user will be able to switch it on and off from the browser via a cookie and some Javascript links at the bottom of HTML pages. If htmlFilter lists two or more options, htmlFilterName should list the same number plus one (again separated by #); the first is the name of the entire category (for example \"filters\"), and the user can choose between any one of them or none at all (hence the number of options is one more than the number of filters); if this yields more than 3 options then all but the first two are hidden behind a \"More\" option on some browsers.") # TODO: non-Javascript fallback for the switcher define("htmlJson",default=False,help="Try to detect HTML strings in JSON responses and feed them to htmlFilter. This can help when using htmlFilter with some AJAX-driven sites. IMPORTANT: Unless you also set the 'separator' option, the external program must preserve all newline characters, because multiple HTML strings in the same JSON response will be given to it separated by newlines, and the newlines of the output determine which fragment to put back where. 
(If you combine htmlJson with htmlText, the external program will see text in HTML in JSON as well as text in HTML, but it won't see text in HTML in JSON in HTML.)") define("htmlText",default=False,help="Causes the HTML to be parsed, and only the text parts (not the markup) will be sent to htmlFilter. Useful to save doing HTML parsing in the external program. The external program is still allowed to include HTML markup in its output. IMPORTANT: Unless you also set the 'separator' option, the external program must preserve all newline characters, because multiple text strings will be given to it separated by newlines, and the newlines of the output determine which modified string to put back where.") -define("separator",help="If you are using htmlFilter with htmlJson and/or htmlText, you can set separator to any text string to be used as a separator between multiple items of data when passing them to the external program. By default, newlines are used for this, but you can set it to any other character or sequence of characters that cannot be added or removed by the program. (It does not matter if a website's text happens to use the separator characters.) If you set separator, not only will it be used as a separator BETWEEN items of data but also it will be added before the first and after the last item, thus allowing you to use an external program that outputs extra text before the first and after the last item. The extra text will be discarded. If however you do not set separator then the external program should not add anything extra before/after the document.") +define("separator",help="If you are using htmlFilter with htmlJson and/or htmlText, you can set separator to any text string to be used as a separator between multiple items of data when passing them to the external program. By default, newlines are used for this, but you can set it to any other character or sequence of characters that cannot be added or removed by the program. 
(It does not matter if a website's text happens to use the separator characters.) If separator is set, not only will it be used as a separator BETWEEN items of data but also it will be added before the first and after the last item, thus allowing you to use an external program that outputs extra text before the first and after the last item. The extra text will be discarded. If however you do not set separator then the external program should not add anything extra before/after the document.") define("leaveTags",multiple=True,default="script,style,title,textarea,option",help="When using htmlFilter with htmlText, you can set a comma-separated list of HTML tag names whose enclosed text should NOT be sent to the external program for modification. For this to work, the website must properly close these tags and must not nest them. (This list is also used for character-set rendering.)") # not including 'option' can break pages that need character-set rendering +define("stripTags",multiple=True,default="wbr",help="When using htmlFilter with htmlText, you can set a comma-separated list of HTML tag names which should be deleted if they occur in any section of running text. For example, \"wbr\" (word-break opportunity) tags (listed by default) might cause problems with phrase-based annotators.") + +define("submitPath",help="If set, accessing this path (on any domain) will give a form allowing the user to enter their own text for processing with htmlFilter. The path should be one that websites are not likely to use (even as a prefix), and must begin with a slash (/). Details of the text entered on this form is not logged by Web Adjuster, but short texts are converted to compressed GET requests which might be logged by proxies etc.") # (see comments in serve_submitPage) heading("Server control options") define("background",default=False,help="If True, fork to the background as soon as the server has started (Unix only). 
You might want to enable this if you will be running it from crontab, to avoid long-running cron processes.") @@ -112,23 +158,25 @@ define("browser",help="The Web browser command to run. If this is set, Web Adjus heading("Media conversion options") define("bitrate",default=0,help="Audio bitrate for MP3 files, or 0 to leave them unchanged. If this is set to anything other than 0 then the 'lame' program must be present. Bitrate is normally a multiple of 8. If your mobile device has a slow link, try 16 for speech.") define("askBitrate",default=False,help="If True, instead of recoding MP3 files unconditionally, try to add links to \"lo-fi\" versions immediately after each original link so you have a choice.") -define("pdftotext",default=False,help="If True, add links to run PDF files through the 'pdftotext' program (which must be present if this is set). A text link will be added just after any PDF link that is found, so that you have a choice of downloading PDF or text; note that pdftotext does not always manage to extract all text. The htmlJson setting will also be applied to the PDF link finder, and see also the guessCMS option.") +define("pdftotext",default=False,help="If True, add links to run PDF files through the 'pdftotext' program (which must be present if this is set). A text link will be added just after any PDF link that is found, so that you have a choice of downloading PDF or text; note that pdftotext does not always manage to extract all text (you can use --pdfomit to specify URL patterns that should not get text links). The htmlJson setting will also be applied to the PDF link finder, and see also the guessCMS option.") +define("pdfomit",help="A comma-separated list of regular expressions which, if any are found in a PDF link's URL, will result in a text link not being generated for that PDF link (although a conversion can still be attempted if a user manually enters the modified URL). 
Use this to avoid confusion for PDF files you know cannot be converted.") define("epubtotext",default=False,help="If True, add links to run EPUB files through Calibre's 'ebook-convert' program (which must be present), to produce a text-only option. A text link will be added just after any EPUB link that is found, so that you have a choice of downloading EPUB or text. The htmlJson setting will also be applied to the EPUB link finder, and see also the guessCMS option.") # pdftotext and epubtotext both use temporary files, which are created in the system default temp directory unless overridden by environment variables TMPDIR, TEMP or TMP, TODO: do we want an override for NamedTemporaryFile's dir= option ourselves? (/dev/shm might make more sense on some Flash-based systems, although filling the RAM and writing to swap might do more damage than writing files in /tmp if it gets big; also hopefully some OS's won't actually write anything if the file has been deleted before the buffer needed to be flushed (TODO: check this)) define("epubtozip",default=False,help="If True, add links to download EPUB files renamed to ZIP, as a convenience for platforms that don't have EPUB readers but can open them as ZIP archives and display the XHTML files they contain. The htmlJson setting will also be applied to the EPUB link finder, and see also the guessCMS option.") # TODO: option to cache the epub file and serve its component files individually, so other transforms can be applied and for platforms without ZIP capabilities define("guessCMS",default=False,help="If True, then the pdftotext, epubtotext and epubtozip options attempt to guess if a link is pointing to a PDF or EPUB file via a Content Management System (i.e. the URL does not end in .pdf or .epub, but contains something like ?format=PDF)") -define("pdfepubkeep",default=200,help="Number of seconds to keep any generated text files from PDF and EPUB. 
If this is 0, the files will be deleted immediately, but that might be undesirable: if a mobile phone browser has a timeout that takes effect before ebook-convert has finished (this can sometimes be the case with Opera Mini for example), it might be best to allow the user to wait a short time and re-submit the request, this time getting a cached response.") # Opera Mini's opera:config can set the loading timeout to longer, default is 30 seconds +define("pdfepubkeep",default=200,help="Number of seconds to keep any generated text files from PDF and EPUB. If this is 0, the files will be deleted immediately, but that might be undesirable: if a mobile phone browser has a timeout that takes effect before ebook-convert has finished (this can sometimes be the case with Opera Mini for example), it might be best to allow the user to wait a short time and re-submit the request, this time getting a cached response.") # Opera Mini's opera:config can set the loading timeout to longer, default is 30 seconds. +define("waitpage",default=True,help="If the browser seems to be an interactive one, generate a 'please wait' page while converting PDF or EPUB files to text. Not effective if pdfepubkeep is set too low.") # TODO: mp3 also? (would need to add MP3s to pdfepubkeep) heading("Character rendering options") # TODO: option to add a switch at top of page ? define("render",default=False,help="Whether to enable the character-set renderer. This functionality requires the Python Imaging Library and suitable fonts. The settings of htmlJson and leaveTags will also be applied to the renderer. Text from computed Javascript writes might not be rendered as images.") # ("computed" as in not straight from a JSON document. TODO: could write a piece of JS that goes through the DOM finding them? 
ditto any JS alterations that haven't been through htmlFilter, although you'd have to mark the ones that have and this could be filter-dependent) -define("renderFont",help="The font file to use for the character-set renderer (if enabled). This should be a font containing all the characters you want to render, and it should be in .TTF, .OTF or other Freetype-supported format (.PCF is sometimes possible if you set renderSize correctly, e.g. 16 for wenquanyi_12pt.pcf)") # TODO: different fonts for different Unicode ranges? (might be hard to auto-detect missing characters) +define("renderFont",help="The font file to use for the character-set renderer (if enabled). This should be a font containing all the characters you want to render, and it should be in .TTF, .OTF or other Freetype-supported format (.PCF is sometimes possible if renderSize is set correctly, e.g. 16 for wenquanyi_12pt.pcf)") # TODO: different fonts for different Unicode ranges? (might be hard to auto-detect missing characters) define("renderInvert",default=False,help="If True, the character-set renderer (if enabled) will use a black background. Useful when you are also adding a stylesheet with a dark background.") define("renderSize",default=20,help="The height (in pixels) to use for the character-set renderer if it is enabled.") define("renderPath",default="/@_",help="The location on every adjusted website to put the character-set renderer's images, if enabled. This must be made up of URL-safe characters starting with a / and should be a short path that is unlikely to occur on normal websites.") define("renderFormat",default="png",help="The file format of the images to be created by the character-set renderer if it is enabled, for example 'png' or 'jpeg'.") define("renderRange",multiple=True,help="The lowest and highest Unicode values to be given to the character-set renderer if it is enabled. For example 3000:A6FF for most Chinese characters. Multiple ranges are allowed. 
Any characters NOT in one of the ranges will be passed to the browser to render. If the character-set renderer is enabled without renderRange being set, then ALL text will be rendered to images.") -define("renderOmit",multiple=True,default="iPhone,iPad,Android,Macintosh",help="A list of browsers that do not need the character-set renderer. If any of these strings occur in the user-agent then the character set renderer is turned off even if it is otherwise enabled. The assumption is that these browsers can always do their own character-set rendering.") +define("renderOmit",multiple=True,default="iPhone,iPad,Android,Macintosh,Windows NT 6,Windows Phone OS",help="A list of platforms that do not need the character-set renderer. If any of these strings occur in the user-agent then the character set renderer is turned off even if it is otherwise enabled, on the assumption that these platforms will always have enough fonts.") # (Win: Vista=6.0 7=6.1 8=6.2 reportedly don't need language packs for display) define("renderCheck",help="If renderOmit does not apply to the browser, it might still be possible to check for native character-set support via Javascript. renderCheck can be set to the Unicode value of a character to be checked (try 802F for complete Chinese support); if the browser reports its width differently from known unprintable characters, we assume it won't need our renderer.") # 802F shouldn't create false positives in environments that support only GB2312, only Big5, only SJIS or only KSC instead of all Chinese. It does have GB+ and Big5+ codes (and also demonstrates that we want a hex number). If browser's "unprintable character" glyph happens to be the same width as renderCheck anyway then we could have a false negative, but that's better than a false positive and the user can still switch it off manually if renderName is left set. 
define("renderNChar",default=1,help="The maximum number of characters per image to be given to the character-set renderer if it is enabled. Keeping this low means the browser cache is more likely to be able to re-use images, but some browsers might struggle if there are too many separate images. Don't worry about Unicode \"combining diacritic\" codes: any found after a character that is to be rendered will be included with it without counting toward the renderNChar limit and without needing to be in renderRange.") define("renderWidth",default=0,help="The maximum pixel width of a 'word' when using the character-set renderer. If you are rendering a language that uses space to separate words, but are using only one or two characters per image, then the browser might split some words in the middle. Setting renderWidth to some value other than 0 can help to prevent this: any word narrower than renderWidth will be enclosed in a <nobr> element. (This will however be ineffective if your stylesheet overrides the behaviour of <nobr>.) You should probably not set renderWidth if you intend to render languages that do not separate words with spaces.") @@ -136,19 +184,25 @@ define("renderDebug",default=False,help="If the character-set renderer is having define("renderName",default="Fonts",help="A name for a switch that allows the user to toggle character set rendering on and off from the browser (via a cookie and Javascript links at the bottom of HTML pages); if set to the empty string then no switch is displayed. At any rate none is displayed when renderOmit applies.") # TODO: non-Javascript fallback for the switcher heading("Dynamic DNS options") -define("dynamic_dns_api",help="URL (http or https) that will cause one of your dynamic DNS entries to be updated to a new IP address. If this is set, it will be used to automatically update the domains listed in host_suffix. 
The URL should contain two instances of %s; the first will be substituted with the domain name and the second with its new IP address.") -define("ddns_api_user",help="The user name to supply to the dynamic_dns_api URL (Basic authentication over HTTP or HTTPS)") -define("ddns_api_pwd",help="The password to supply to the dynamic_dns_api URL (Basic authentication over HTTP or HTTPS). Best not placed on the command line on a shared machine where it can be seen by process-viewing tools; use a configuration file instead.") -define("ip_query_url",help="URL that will return your current public IP address, as a line of text with no markup added. Used for the dynamic_dns_api option. You can set up a URL by placing a CGI script on a server outside your network and having it do: echo Content-type: text/plain;echo;echo $REMOTE_ADDR") -define("ip_check_interval",default=8000,help="Number of seconds between checks of ip_query_url for the dynamic_dns_api option") -define("ip_force_interval",default=7*24*3600,help="Number of seconds before dynamic_dns_api (if set) is forced to update even if there was no IP change. This is to let the Dynamic DNS system know that we are still around. Set to 0 to disable forced updates (a forced update will occur on server startup anyway), otherwise an update will occur on the next ip_check_interval after ip_force_interval has elapsed.") -define("ip_change_command",help="An optional shell command to run (in a separate thread) whenever your IP changes. Use instead of, or in addition to, dynamic_dns_api. The new IP address will be appended to this command.") +define("ip_change_command",help="An optional script or other shell command to launch whenever the public IP address changes. The new IP address will be added as a parameter; ip_query_url must be set to make this work. 
The script can for example update any Dynamic DNS services that point to the server.") +define("ip_query_url",help="URL that will return your current public IP address, as a line of text with no markup added. Used for the ip_change_command option. You can set up a URL by placing a CGI script on a server outside your network and having it do: echo Content-type: text/plain;echo;echo $REMOTE_ADDR") +define("ip_query_url2",help="Optional additional URL that might sometimes return your public IP address along with other information. This can for example be a status page served by a local router (http://user:password@192.168... is accepted). If set, the following behaviour occurs: Once ip_check_interval has passed since the last ip_query_url check, ip_query_url2 will be queried at an interval of ip_check_interval2 (which can be short), to check that the known IP is still present in its response. Once the known IP is no longer present, ip_query_url will be queried again. This arrangement can reduce the load on ip_query_url as well as providing a faster response to IP changes, while not completely trusting the local router to report the correct IP at all times. See also ip_query_aggressive if the router might report an IP change before connectivity is restored.") +define("ip_check_interval",default=8000,help="Number of seconds between checks of ip_query_url for the ip_change_command option") +define("ip_check_interval2",default=60,help="Number of seconds between checks of ip_query_url2 (if set), for the ip_change_command option") +define("ip_query_aggressive",default=False,help="If a query to ip_query_url fails with a connection error or similar, keep trying again until we get a response. This is useful if the most likely reason for the error is that our ISP is down: we want to get the new IP just as soon as we're back online. However, if the error is caused by a problem with ip_query_url itself then this option can lead to excessive traffic, so use with caution. 
(Log entries are written when this option takes effect, and checking the logs is advisable.)") +define("ip_force_interval",default=7*24*3600,help="Number of seconds before ip_change_command (if set) is run even if there was no IP change. This is to let Dynamic DNS services know that we are still around. Set to 0 to disable forced updates (a forced update will occur on server startup anyway), otherwise an update will occur on the next IP check after ip_force_interval has elapsed.") heading("Speedup options") define("useLXML",default=False,help="Use the LXML library for parsing HTML documents. This is usually faster, but it can fail if your system does not have a good installation of LXML and its dependencies, or if the websites you visit are badly broken. Use of LXML libraries may also result in more changes to all HTML markup, although this should be harmless.") define("renderBlocks",default=False,help="Treat all characters rendered by the character-set renderer as \"blocks\" that are guaranteed to have the same dimensions (true for example if you are using the renderer for Chinese characters only). This is faster than checking words individually, but it may produce incorrect HEIGHT and WIDTH attributes if given a range of characters whose dimensions do differ.") # TODO: blocksRange option for if want to render some that do and some that don't? (but profile it: PIL's getsize just might turn out to be quicker than the high-level range-check code) define("fasterServer",help="Address:port of another instance of Web Adjuster to which we forward all traffic whenever it is available. When the other instance is not available, traffic will be handled by this one. Use for example if you have a slower always-on machine and a faster not-always-on machine and you want the slower machine to delegate to the faster machine when available. 
See also ipTrustReal.") define("ipTrustReal",help="IP address of a machine that we trust, for example a machine that is using us as fasterServer. Any traffic coming from this machine with an X-Real-Ip header will be logged as though it originated at the value of its X-Real-Ip header.") # (TODO: multiple IPs option?) +define("fasterServerNew",default=True,help="If fasterServer is set, assume it is running Web Adjuster v0.17 or later and use a more lightweight method of checking its availability. You might need to set this to False if for some reason you can't upgrade the fasterServer first.") # (don't do auto-fallback as that creates unnecessary extra traffic, plus sending an unrecognized ping2 could clutter logs) +define("machineName",help="A name for the current machine to insert into the \"Server\" HTTP header for adjusted requests, for example to let users know if it's your faster or your slower machine that's currently serving them (although they'd need to inspect the headers to find out)") +define("redirectFiles",default=False,help="If, when not functioning as a \"real\" HTTP proxy, a URL is received that looks like it requires no processing on our part (e.g. an image or downloadable file that the user does not want converted), and if this is confirmed via a HEAD request to the remote server, then redirect the browser to fetch it directly and not via Web Adjuster. This takes bandwidth off the adjuster server, and should mean faster downloads, especially from sites that are better connected than the adjuster machine. However it might not work with sites that restrict \"deep linking\".") # (TODO: option to list sites that are OK for redirectFiles, and/or list sites that aren't and that we're willing to full-proxy for) +# If adjuster machine is running on a home broadband connection, don't forget the "uplink" speed of that broadband is likely to be lower than the "downlink" speed; the same should not be the case of a site running at a well-connected server farm. 
There's also extra delay if Web Adjuster has to download files first (which might be reduced by implementing streaming). Weighed against this is the extra overhead the browser has of repeating its request elsewhere, which could be an issue if the file is small and the browser's uplink is slow; in that case fetching it ourselves might be quicker than having the browser repeat the request; see TODO comment elsewhere about minimum content length before redirectFiles. + +define("upstream_guard",default=True,help="Modify scripts and cookies sent by upstream sites so they do not refer to the cookie names that our own scripts use. This is useful if you chain together multiple instances of Web Adjuster, such as for testing another installation without coming out of your usual proxy. If however you know that this instance will not be pointed to another, you can set upstream_guard to False to save some processing.") # THIS MUST BE THE LAST SECTION because it continues into # the note below about Tornado logging options. (The order @@ -157,6 +211,8 @@ define("ipTrustReal",help="IP address of a machine that we trust, for example a heading("Logging options") define("renderLog",default=False,help="Whether or not to log requests for character-set renderer images. Note that this can generate a LOT of log entries on some pages.") define("logUnsupported",default=False,help="Whether or not to log attempts at requests using unsupported HTTP methods. 
Note that this can sometimes generate nearly as many log entries as renderLog if some browser (or malware) tries to do WebDAV PROPFIND requests on each of the images.") +define("logRedirectFiles",default=True,help="Whether or not to log requests that result in the browser being simply redirected to the original site when the redirectFiles option is on.") # (Since this still results in a HEAD request being sent to the remote site, this option defaults to True in case you need it to diagnose "fair use of remote site" problems) +define("ownServer_useragent_ip",default=False,help="If own_server is set, and that server cannot be configured to log the X-Real-Ip header we set when we proxy for it, you can if you wish turn on this option, which will prepend the real IP to the User-Agent header on the first request of each connection (most servers can log User-Agent). This is slightly dangerous: fake IPs can be inserted into the log if keep-alive is used.") # (and it might break some user-agent detection) define("ipNoLog",multiple=True,help="A comma-separated list of IP addresses which can use the adjuster without being logged. If your network has a \"friendly probing\" service then you might want to use this to stop it filling up the logs. (Any tracebacks it causes will still be logged however.)") define("squashLogs",default=True,help="Try to remove some duplicate information from consecutive log entries, to make logs easier to check. You might want to set this to False if you plan to use automatic search tools on the logs.") # (word 'some' is important as not all duplicate info is guaranteed to be removed) define("whois",default=False,help="Try to log the Internet service provider for each IP address in the logs. Requires the 'whois' program. 
The extra information is written as separate log entries when it becomes available, and not for recent duplicate IPs or IPs that do not submit valid requests.") @@ -172,13 +228,15 @@ import time,os,commands,string,urllib,urlparse,re,socket,logging,subprocess,thre from HTMLParser import HTMLParser,HTMLParseError try: # can we page the help text? + # (Tornado 2 just calls the module-level print_help, but Tornado 3 includes some direct calls to the object's method, so we have to override the latter. Have to use __dict__ because they override __setattr__.) import pydoc,cStringIO ; pydoc.pager # ensure present - old_top = tornado.options.print_help - def new_top(): - dat = cStringIO.StringIO() ; old_top(dat) + def new_top(*args): + dat = cStringIO.StringIO() + tornado.options.options.old_top(dat) pydoc.pager(dat.getvalue()) - tornado.options.print_help = new_top -except: pass + tornado.options.options.__dict__['old_top'] = tornado.options.options.print_help + tornado.options.options.__dict__['print_help'] = new_top +except: raise def hostSuffix(n=0): if options.host_suffix: @@ -195,7 +253,7 @@ def convert_to_real_host(requested_host,cookie_host=None): # we should display the URL entry box etc. # Returns -1 if we should pass to options.own_server. 
if requested_host: - port=":"+str(options.port) + port=":"+str(options.publicPort) # might or might not be present in the user's request orig_requested_host = requested_host if requested_host.endswith(port): requested_host=requested_host[:-len(port)] n=0 @@ -212,10 +270,10 @@ def convert_to_real_host(requested_host,cookie_host=None): else: return defaultSite() def convert_to_via_host(requested_host): if requested_host: - port=":"+str(options.port) + port=":"+str(options.publicPort) # the port to advertise orig_requested_host = requested_host if requested_host.endswith(port): requested_host=requested_host[:-len(port)] - if options.port==80: port="" + if options.publicPort==80: port="" for h in options.host_suffix.split("/"): if (requested_host == h and options.default_site) or requested_host.endswith("."+h): return h+port if options.wildcard_dns and not '/' in options.host_suffix: return options.host_suffix+port @@ -224,8 +282,8 @@ def convert_to_requested_host(real_host,cookie_host=None): # Converts the actual host name into the host name that # the user should request to get it through us if not real_host: return "" - if options.port==80: port="" - else: port=":"+str(options.port) + if options.publicPort==80: port="" + else: port=":"+str(options.publicPort) if options.default_site: n=0 for i in options.default_site.split("/"): @@ -258,6 +316,13 @@ def changeConfigDirectory(fname): return ffile return fname +def errExit(msg): + # Exit with an error message BEFORE server start + try: logging.error(msg) # in case run from crontab w/out output (and e.g. 
PATH not set properly) + except: pass + sys.stderr.write(msg) + sys.exit(1) + def readOptions(): tornado.options.parse_command_line() configsDone = set() @@ -268,13 +333,13 @@ def readOptions(): oldDir = os.getcwd() config2 = changeConfigDirectory(config) try: open(config2) - except: - sys.stderr.write("Cannot open configuration file %s (current directory is %s)\n" % (config2,os.getcwd())) - sys.exit(1) + except: errExit("Cannot open configuration file %s (current directory is %s)\n" % (config2,os.getcwd())) tornado.options.parse_config_file(config2) configsDone.add((config,oldDir)) tornado.options.parse_command_line() # need to do this again to ensure logging is set up for the *current* directory (after any chdir's while reading config files) + if type(options.mailtoSMS)==type(""): options.mailtoSMS=options.mailtoSMS.split(',') if type(options.leaveTags)==type(""): options.leaveTags=options.leaveTags.split(',') + if type(options.stripTags)==type(""): options.stripTags=options.stripTags.split(',') create_inRenderRange_function(options.renderRange) if type(options.renderOmit)==type(""): options.renderOmit=options.renderOmit.split(',') if type(options.deleteOmit)==type(""): options.deleteOmit=options.deleteOmit.split(',') @@ -283,19 +348,40 @@ def readOptions(): if type(options.search_sites)==type(""): options.search_sites=options.search_sites.split(',') if type(options.ipNoLog)==type(""): options.ipNoLog=options.ipNoLog.split(',') if type(options.delete)==type(""): options.delete=options.delete.split(',') + if type(options.delete_css)==type(""): options.delete_css=options.delete_css.split(',') + if type(options.prohibit)==type(""): options.prohibit=options.prohibit.split(',') + global viaName,serverName,serverName_html + viaName = program_name[:program_name.index("(c)")].strip() # Web Adjuster vN.NN + if options.machineName: serverName = viaName + " on "+options.machineName + else: serverName = viaName + serverName_html = 
re.sub(r"([0-9])([0-9])",r"\1<span></span>\2",serverName) # stop mobile browsers interpreting the version number as a telephone number + global upstream_proxy_host, upstream_proxy_port + upstream_proxy_host = upstream_proxy_port = None + if options.upstream_proxy: + try: import pycurl + except ImportError: errExit("upstream_proxy requires pycurl (try sudo pip install pycurl)\n") + if not ':' in options.upstream_proxy: options.upstream_proxy += ":80" + upstream_proxy_host,upstream_proxy_port = options.upstream_proxy.split(':') + upstream_proxy_port = int(upstream_proxy_port) global codeChanges ; codeChanges = [] if options.codeChanges: ccLines = [x for x in options.codeChanges.split("\n") if x and not x.startswith("#")] while ccLines: - if len(ccLines)<3: - sys.stderr.write("codeChanges must be a multiple of 3 lines (see --help)\n") - sys.exit(1) + if len(ccLines)<3: errExit("codeChanges must be a multiple of 3 lines (see --help)\n") codeChanges.append(tuple(ccLines[:3])) ccLines = ccLines[3:] if options.real_proxy: options.open_proxy=True - if not options.password and not options.open_proxy: - stderr.write("Please set a password, or use --open_proxy.\n(Try --help for help)\n") - sys.exit(1) + if not options.password and not options.open_proxy: errExit("Please set a password, or use --open_proxy.\n(Try --help for help; did you forget a --config=file?)\n") + if options.htmlFilter and '#' in options.htmlFilter and not len(options.htmlFilter.split('#'))+1 == len(options.htmlFilterName.split('#')): errExit("Wrong number of #s in htmlFilterName for this htmlFilter setting") + if not options.publicPort: + options.publicPort = options.port + if options.pdftotext and not "pdftotext version" in os.popen4("pdftotext -h")[1].read(): errExit("pdftotext command does not seem to be usable\nPlease install it, or unset the pdftotext option\n") + if options.epubtotext and not "calibre" in os.popen4("ebook-convert -h")[1].read(): errExit("ebook-convert command does not seem to be 
usable\nPlease install calibre, or unset the epubtotext option\n") + global ownServer_regexp + if options.ownServer_regexp: + if not options.own_server: errExit("Cannot set ownServer_regexp if own_server is not set\n") + ownServer_regexp = re.compile(options.ownServer_regexp) + else: ownServer_regexp = None if options.install: current_crontab = commands.getoutput("crontab -l 2>/dev/null") new_cmd = "@reboot python "+" ".join(sys.argv) # TODO: crontab-friendly quoting of special characters @@ -308,25 +394,24 @@ def readOptions(): if options.stop: if not pidFound: sys.stderr.write("Could not find which PID to stop (maybe nothing was running?)\n") sys.exit(0) - elif pidFound: time.sleep(0.5) # give it time to stop def main(): readOptions() - handlers = [ - (r"/(.*)", RequestForwarder, {}) - ] - if options.real_proxy: handlers.append((r"(.*)", RequestForwarder, {})) # doesn't have to start with / - application = Application(handlers,log_function=accessLog,gzip=True) + application = Application([(r"(.*)",RequestForwarder,{})],log_function=accessLog,gzip=True) # tornado.web.Application.__init__(self, transforms=[ChunkedTransferEncoding], gzip=True) - if not hasattr(application,"listen"): - sys.stderr.write("Your version of Tornado is too old. Please install version 2.x.\n") - sys.exit(1) + if not hasattr(application,"listen"): errExit("Your version of Tornado is too old. 
Please install version 2.x.\n") if options.useLXML: check_LXML() if fork_before_listen and options.background: sys.stderr.write("%s\nLicensed under the Apache License, Version 2.0\nChild will listen on port %d\n(can't report errors here as this system needs early fork)\n" % (program_name,options.port)) # (need some other way of checking it really started) unixfork() - try: application.listen(options.port,options.address) - except: + for portTry in [5,4,3,2,1,0]: + try: + application.listen(options.port,options.address) + break + except: + if portTry: + time.sleep(0.5) ; continue + # tried 6 times over 3 seconds, can't open the port (either the other process is taking a long time to stop or something) if options.browser: # there's probably another adjuster instance, in which case we probably want to let the browser open a new window and let our listen() fail dropPrivileges() @@ -341,29 +426,81 @@ def main(): if options.watchdogWait: sys.stderr.write("(abort if unresponsive for %d seconds)\n" % options.watchdogWait) if options.background and not fork_before_listen: unixfork() + try: os.setpgrp() # for killpg later + except: pass if options.browser: IOLoop.instance().add_callback(runBrowser) if options.watchdog: WatchdogPings(watchdog) - if options.fasterServer: IOLoop.instance().add_callback(checkServer) - if options.ip_query_url and (options.dynamic_dns_api or options.ip_change_command): Dynamic_DNS_updater() + if options.fasterServer: + if not ':' in options.fasterServer: options.fasterServer += ":80" # needed for the new code + logging.getLogger("tornado.general").disabled=1 # needed to suppress socket-level 'connection refused' messages from ping2 code in Tornado 3 + class NoConErrors: + def filter(self,record): return not record.getMessage().startswith("Connect error on fd") + logging.getLogger().addFilter(NoConErrors()) # ditto in Tornado 2 (which uses the root logger) (don't be tempted to combine this by setting tornado.general to a filter, as the message 
might change in future Tornado 3 releases) + IOLoop.instance().add_callback(checkServer) + if options.ip_query_url and options.ip_change_command: + # check for user:password@ in ip_query_url2 + global ip_query_url2,ip_query_url2_user,ip_query_url2_pwd + ip_query_url2 = options.ip_query_url2 + ip_query_url2_user=ip_query_url2_pwd=None + if ip_query_url2: + netloc = urlparse.urlparse(ip_query_url2).netloc + if '@' in netloc: + auth,rest = netloc.split('@',1) + ip_query_url2 = ip_query_url2.replace(netloc,rest,1) + ip_query_url2_user,ip_query_url2_pwd = auth.split(':',1) + # and start the updater + Dynamic_DNS_updater() try: import signal signal.signal(signal.SIGTERM, stopServer) except: pass # signal not supported on this platform? if options.background: logging.info("Server starting") - IOLoop.instance().start() + else: set_title("adjuster") + try: IOLoop.instance().start() + except KeyboardInterrupt: + if options.background: logging.info("SIGINT received") + else: sys.stderr.write("\nKeyboard interrupt\n") # gets here after stopServer (e.g. got SIGTERM from a --stop, or options.browser and the browser finished) if options.background: logging.info("Server shutdown") if options.watchdog: + options.watchdog = 0 # tell any separate_thread() to stop (that thread is not counted in helper_thread_count) watchdog.write('V') # this MIGHT be clean exit, IF the watchdog supports it (not all of them do, so it might not be advisable to use the watchdog option if you plan to stop the server without restarting it) watchdog.close() if not options.background: sys.stderr.write("Adjuster shutdown\n") + if helper_thread_count: + msg = "Terminating %d runaway helper threads" % (helper_thread_count,) + # in case someone needs our port quickly. + # Most likely "runaway" thread is ip_change_command if you did a --restart shortly after the server started. 
+ # TODO it would be nice if the port can be released at the IOLoop.instance.stop, and make sure os.system doesn't dup any /dev/watchdog handle we might need to release, so that it's not necessary to stop the threads + if options.background: logging.info(msg) + else: sys.stderr.write(msg) + try: + import signal + signal.signal(signal.SIGTERM, signal.SIG_DFL) + os.killpg(os.getpgrp(),signal.SIGTERM) + except: pass + os.abort() + +def set_title(t): + if not (hasattr(sys.stderr,"isatty") and sys.stderr.isatty()): return + import atexit + if t: atexit.register(set_title,"") + term = os.environ.get("TERM","") + is_xterm = "xterm" in term + is_screen = (term=="screen" and os.environ.get("STY","")) + is_tmux = (term=="screen" and os.environ.get("TMUX","")) + if is_xterm or is_tmux: sys.stderr.write("\033]0;%s\007" % (t,)) # ("0;" sets both title and minimised title, "1;" sets minimised title, "2;" sets title. Tmux takes its pane title from title (but doesn't display it in the titlebar)) + elif is_screen: os.system("screen -X title \"%s\"" % (t,)) def dropPrivileges(): if options.user and not os.getuid(): # need to drop privileges - import pwd - os.setuid(pwd.getpwnam(options.user)[2]) + import pwd ; pwd=pwd.getpwnam(options.user) + os.setuid(pwd[2]) + # and help our external programs: + os.environ['HOME'] = pwd[5] # (so they don't try to load root's preferences etc) + os.environ['USER']=os.environ['LOGNAME']=options.user fork_before_listen = not 'linux' in sys.platform @@ -376,7 +513,7 @@ def unixfork(): def stopOther(): import commands,signal - out = commands.getoutput("lsof -iTCP:"+str(options.port)+" -sTCP:LISTEN") + out = commands.getoutput("lsof -iTCP:"+str(options.port)+" -sTCP:LISTEN") # TODO: lsof can hang if ANY programs have files open on stuck remote mounts etc, even if this is nothing to do with TCP connections. -S 2 might help a BIT but it's not a solution. Linux's netstat -tlp needs root, and BSD's can't show PIDs. 
Might be better to write files or set something in the process name. if out.startswith("lsof: unsupported"): # lsof 4.81 has -sTCP:LISTEN but lsof 4.78 does not. However, not including -sTCP:LISTEN can cause lsof to make unnecessary hostname queries for established connections. So fall back only if have to. out = commands.getoutput("lsof -iTCP:"+str(options.port)+" -Ts") # -Ts ensures will say LISTEN on the pid that's listening @@ -406,7 +543,7 @@ the_supported_methods = ("GET", "HEAD", "POST", "PUT", "DELETE", "PATCH", "OPTIO class BrowserLogger: def __init__(self): # Do NOT read options here - they haven't been read yet - self.lastBrowser = self.lastDate = None + self.lastBrowser = None self.lastIp = self.lastMethodStuff = None self.whoisLogger = WhoisLogger() def __call__(self,req): @@ -433,10 +570,7 @@ class BrowserLogger: browser=" "+browser else: self.lastBrowser,browser = None," -" if options.squashLogs: - # Time will already be included in Tornado logging format (a format we don't want to override, especially as it has 'start of log string syntax highlighting' on some platforms), so don't add it here. Just add the date if different. 
- t = time.strftime("[%d/%b/%Y] ") - if t==self.lastDate: t="" - else: self.lastDate = t + # Date (as YYMMDD) and time are already be included in Tornado logging format, a format we don't want to override, especially as it has 'start of log string syntax highlighting' on some platforms if req.remote_ip == self.lastIp: ip="" else: @@ -449,11 +583,13 @@ class BrowserLogger: else: r='"%s %s%s %s"' % (req.method, host, req.uri, req.version) self.lastMethodStuff = methodStuff - msg = t+ip+r+browser - else: msg = '%s - - [%s] "%s %s%s %s" - - - %s' % (req.remote_ip, time.strftime("%d/%b/%Y:%X"), req.method, host, req.uri, req.version, browser) + msg = ip+r+browser + else: msg = '%s "%s %s%s %s" %s' % (req.remote_ip, req.method, host, req.uri, req.version, browser) # could add "- - [%s]" with time.strftime("%d/%b/%Y:%X") if don't like Tornado-logs date-time format (and - - - before the browser %s) logging.info(msg) if options.whois and hasattr(req,"valid_for_whois"): self.whoisLogger(req.remote_ip) +helper_thread_count = 0 + class WhoisLogger: def __init__(self): # Do NOT read options here - haven't been read yet @@ -484,9 +620,12 @@ def getWhois(ip): if field in checkList or (field=="country:" and ret) and not value in ret: ret.append(value) # omit 1st country: from RIPE/APNIC/&c, and de-dup return ", ".join(ret) def whois_thread(ip,logger): + global helper_thread_count + helper_thread_count += 1 address = getWhois(ip) logger.thread_running = False if address: IOLoop.instance().add_callback(lambda *args:logging.info("whois "+ip+": "+address)) + helper_thread_count -= 1 accessLog = BrowserLogger() @@ -495,11 +634,25 @@ try: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") except: pass # fall back to the pure-Python one +try: import zlib +except: # Windows? 
+ class zlib: + def compress(self,s,level): return s + def decompressobj(): + class o: + def decompress(self,s,maxlen): return s + return o() + zlib = zlib() + cookieExpires = "Tue Jan 19 03:14:07 2038" # TODO: S2G +set_window_onerror = False # for debugging Javascript on some mobile browsers (TODO make this a config option? but will have to check which browsers do and don't support window.onerror) + def writeAndClose(stream,data): # This helper function is needed for CONNECT and own_server handling because, contrary to Tornado docs, some Tornado versions (e.g. 2.3) send the last data packet in the FIRST callback of IOStream's read_until_close - if data: stream.write(data) + if data: + try: stream.write(data) + except IOError: pass # probably client disconnected, don't fill logs with tracebacks if not stream.closed(): stream.close() # Domain-setting cookie for when we have no wildcard_dns and no default_site: @@ -510,6 +663,8 @@ enable_adjustDomainCookieName_URL_override = True # TODO: document this! (Allow htmlmode_cookie_name = "_adjustZJCG_" # zap JS, CSS and Graphics password_cookie_name = "_pxyAxsP_" # "proxy access password". 
have to pick something that's unlikely to collide with a site's cookie +redirectFiles_Extensions=set("pdf epub mp3 aac zip gif png jpeg jpg exe tar tgz tbz".split()) # TODO: make this list configurable + maybe add a "minimum content length before it's worth re-directing" option + class RequestForwarder(RequestHandler): def get_error_html(self,status,**kwargs): return "<html><body>"+options.errorHTML+"</body></html>" @@ -577,7 +732,7 @@ class RequestForwarder(RequestHandler): def authenticates_ok(self,host): if not options.password: return True - if options.password_domain and host and not (host==options.password_domain or host.endswith("."+options.password_domain)): return True + if options.password_domain and host and not any((host==p or host.endswith("."+p)) for p in options.password_domain.split('/')): return True if options.password_domain: self.is_password_domain=True # if they said ?p=(password), it's OK and we can # give them a cookie with it @@ -623,38 +778,84 @@ class RequestForwarder(RequestHandler): def myfinish(self): if hasattr(self,"_finished") and self._finished: return # try to avoid "connection closed" exceptions if browser has already gone away - try: self.finish() + try: + self.finish() + self._finished = 1 # (just in case) except: pass # belt and braces (depends on Tornado version?) 
- def redirect(self,redir): - self.set_status(301) + def redirect(self,redir,status=301): + self.set_status(status) + for h in ["Location","Content-Location","Content-Type","Content-Language"]: self.clear_header(h) # so redirect() can be called AFTER a site's headers are copied in self.add_header("Location",redir) - self.write('<html><body><a href="%s">Redirect</a></body></html>' % redir) + self.add_header("Content-Type","text/html") + self.write('<html><body><a href="%s">Redirect</a></body></html>' % redir.replace('&','&').replace('"','"')) self.myfinish() + def inProgress(self): + # If appropriate, writes a "conversion in progress" page and returns True, and then self.inProgress_run() should return True. + # Not on wget or curl (TODO: configurable?) + if not options.waitpage or not options.pdfepubkeep: return False + ua = " "+self.request.headers.get("User-Agent","") + if " curl/" in ua or " Wget/" in ua: return False # (but don't return false for libcurl/) + self.set_status(200) + self.add_header("Pragma","no-cache") ; self.add_header("Vary","*") ; self.add_header("Expires","Thu Jan 01 00:00:00 1970") ; self.add_header("Cache-Control","no-cache, no-store, must-revalidate, max-stale=0, post-check=0, pre-check=0") # don't ANYBODY even THINK about caching this! + self.add_header("Refresh","10") # TODO: configurable? 
and make sure it does not exceed options.pdfepubkeep + self.clear_header("Content-Disposition") + self.clear_header("Content-Type") + self.add_header("Content-Type","text/html") + self.inProgress_has_run = True # doResponse2 may set a callback for render, so can't set _finished yet, but do need to set something so txtCallback knows not to write the actual text into this response (TODO could do a "first one there gets it" approach, but it's unlikely to be needed) + if self.checkBrowser(["IEMobile 6","IEMobile 7","Opera Mobi"]): warn="<h3>WARNING: Your browser might not save this file</h3>You are using a browser which has been known to try to display text attachments in its own window using very small print, giving no option to save to a file. You might get better results in IEMobile 8+ or Opera Mini (although the latter may have a more limited range of font sizes in the browser itself)." # TODO: make this warning configurable? See comment after set_header("Content-Disposition",...) below for details + else: warn="" + self.doResponse2(("""<html><head><title>File conversion in progress</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body><h1>File conversion in progress</h1>The result should start downloading soon. 
If it does not, try <script><!-- +document.write('<a href="javascript:location.reload(true)">refreshing this page</a>') +//--></script><noscript>refreshing this page</noscript>.%s%s<hr>This is %s</body></html>""" % (backScript,warn,serverName_html)),True,False) + # TODO: if (and only if) refreshing from this page, might then need a final 'conversion finished' page before serving the attachment, so as not to leave an 'in progress' page up afterwards + return True + def inProgress_run(self): return hasattr(self,"inProgress_has_run") and self.inProgress_has_run + def addToHeader(self,header,toAdd): val = self.request.headers.get(header,"") if val: val += ", " self.request.headers[header] = val+toAdd - def proxyFor(self,server): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0) - upstream = tornado.iostream.IOStream(s) + def forwardFor(self,server): + if server==options.own_server and options.ownServer_useragent_ip: + r = self.request.headers.get("User-Agent","") + if r: r=" "+r + r="("+self.request.remote_ip+")"+r + self.request.headers["User-Agent"]=r + upstream = tornado.iostream.IOStream(socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)) client = self.request.connection.stream if ':' in server: host, port = server.split(':') else: host, port = server, 80 upstream.connect((host, int(port)),lambda *args:(upstream.read_until_close(lambda data:writeAndClose(client,data),lambda data:client.write(data)),client.read_until_close(lambda data:writeAndClose(upstream,data),lambda data:upstream.write(data)))) + try: self.request.uri = self.request.original_uri + except: pass upstream.write(self.request.method+" "+self.request.uri+" "+self.request.version+"\r\n"+"\r\n".join(("%s: %s" % (k,v)) for k,v in (list(h for h in self.request.headers.get_all() if not h[0].lower()=="x-real-ip")+[("X-Real-Ip",self.request.remote_ip)]))+"\r\n\r\n"+self.request.body) - def answerPing(self): + def answerPing(self,newVersion): # answer a "ping" request from another machine that's 
using us as a fasterServer # Need to make the response short, but still allow keepalive self.request.suppress_logging = True - for h in ["Server","Content-Type"]: + for h in ["Server","Content-Type","Date"]: try: self.clear_header(h) except: pass + # (Date is added by Tornado 3, which can also add "Vary: Accept-Encoding" but that's done after we get here, TODO: option to ping via a connect and low-level TCP keepalive bytes?) self.set_header("Etag","0") # shorter than Tornado's computed one (clear_header won't work with Etag) - self.write("1") ; self.myfinish() + if newVersion: + # Forget the headers, just write one byte per second for as long as the connection is open + stream = self.request.connection.stream + stream.socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1) + def writeBytes(): + try: + stream.write("1") + IOLoop.instance().add_timeout(time.time()+1,lambda *args:writeBytes()) + except: + # logging.info("ping2 disconnected") + self.myfinish() + writeBytes() + else: + self.write("1") ; self.myfinish() def find_real_IP(self): if not self.request.remote_ip == options.ipTrustReal: return @@ -678,6 +879,7 @@ class RequestForwarder(RequestHandler): self.add_header("Content-Type","image/"+options.renderFormat) self.add_header("Last-Modified","Sun, 06 Jul 2008 13:20:05 GMT") self.add_header("Expires","Wed, 1 Dec 2036 23:59:59 GMT") # TODO: S2G + # self.clear_header("Server") # save bytes if possible as we could be serving a LOT of these images .. but is this really needed? (TODO) self.write(img) ; self.myfinish() def set_htmlonly_cookie(self): @@ -715,25 +917,121 @@ class RequestForwarder(RequestHandler): else: v2=None # not needed if wildcard_dns self.redirect(domain_process(v,v2,True)) + def handleFullLocation(self): + # HTTP 1.1 spec says ANY request can be of form http://...., not just a proxy request. The differentiation of proxy/not-proxy depends on what host is requested. So rewrite all http://... requests to HTTP1.0-style host+uri requests. 
+ if self.request.uri.startswith("http://"): + self.request.original_uri = self.request.uri + parsed = urlparse.urlparse(self.request.uri) + self.request.host = self.request.headers["Host"] = parsed.netloc + self.request.uri = urlparse.urlunparse(("","")+parsed[2:]) + if not self.request.uri: self.request.uri="/" + elif not self.request.uri.startswith("/"): # invalid + self.set_status(400) ; self.myfinish() ; return True + def serve_URLbox(self): if not options.wildcard_dns: self.clearUnrecognisedCookies() # TODO: optional? self.addCookieFromURL() self.doResponse2(urlbox_html(self.htmlOnlyMode()),True,False) # TODO: run htmlFilter on it also? (render etc will be done by doResponse2) - + + def serve_mailtoPage(self): + uri = self.request.uri[len(options.mailtoPath):] + if '?' in uri: addr=uri[:uri.index('?')] + else: addr=uri + addr = urllib.unquote(addr) + body = self.request.arguments.get("body",None) + if body and type(body)==type([]): body=body[0] + subj = self.request.arguments.get("subject",None) + if subj and type(subj)==type([]): subj=subj[0] + r = [] ; smsLink = "" + if addr: r.append("To: "+ampEncode(addr)) + if subj: r.append("Subject: "+ampEncode(subj)) + if body: + r.append("Body: "+ampEncode(body)) + if self.checkBrowser(options.mailtoSMS): + if subj and not body.startswith(subj): smsLink = subj+" "+body + else: smsLink = body + if '&' in smsLink: smsLink="[Before sending this text, replace -amp- with an ampersand. This substitution has been done in case your phone isn't compliant with RFC 5724.] "+smsLink.replace('&',' -amp- ') + # RFC 5724 shows we ought to get away with ampersands encoded as %26, but on Windows Mobile (Opera or IE) we don't; the SMS is truncated at that point. TODO: whitelist some other platforms? 
(test with <a href="sms:?body=test1%26test2">this</a>) + smsLink = '<br><a href="sms:?body=%s">Send as SMS (text message)</a>' % urllib.quote(rm_u8punc(smsLink)) + if len(r)==1: # different format if only 1 item is specified + if addr: r=["The email will be sent to "+ampEncode(addr)] + elif subj: r=["The email's Subject will be: "+ampEncode(subj)] + else: r=["The email's Body will be: "+ampEncode(body)] + elif not r: r.append("The link does not specify any recognised email details") + else: r.insert(0,"The following information will be sent to the email client:") + self.doResponse2(('<html><head><title>mailto: link - Web Adjuster</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body><h3>mailto: link</h3>This link is meant to open an email client.<br>%s<br><a href=\"mailto:%s\">Open in email client</a> (if set up)%s%s<hr>This is %s</body></html>' % ("<br>".join(r),uri,smsLink,backScript,serverName_html)),True,False) + + def serve_submitPage(self): + if len(self.request.uri) > len(options.submitPath): + txt = zlib.decompressobj().decompress(base64.b64decode(self.request.uri[len(options.submitPath):]),16834) # limit to 16k to avoid zip bombs (limit is also in the compress below) + self.request.uri = "%s (input not logged, len=%d)" % (options.submitPath,len(txt)) + else: txt = self.request.arguments.get("i",None) + if not txt: + self.is_password_domain=True # no prominentNotice needed + # In the markup below, body's height=100% is needed to ensure we can set a percentage height on the textarea consistently across many browsers (otherwise e.g. 
Safari 6 without user CSS might start making the textarea larger as soon as it contains input, overprinting the rest of the document) + return self.doResponse2(("""<html><head><title>Upload Text - Web Adjuster</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body style="height:100%%;overflow:auto"><form method="post" action="%s"><h3 style="float:left;padding:0px;margin:0px">Upload Text</h3><span style="float:right"><input type="submit"><script><!-- +document.write(' (Ctrl-Enter) | <a href="javascript:history.go(-1)">Back</a>') +//--></script></span><br><textarea name="i" style="width:100%%;clear:both;height:60%%" rows="5" cols="20" placeholder="Type or paste your text here" +onKeyDown="if((event.ctrlKey||event.metaKey) && (event.keyCode==13 || event.which==13)) document.forms[0].submit(); else return true;"> +</textarea> +</form> +<script><!-- +document.forms[0].i.focus() +//--></script></body></html>""" % (options.submitPath,)),True,False) + if type(txt) == list: # came from the POST form + txt = txt[0].strip() + # On at least some browsers (e.g. some Safari versions), clicking one of our JS reload links after the POST text has been shown will reload the form (instead of re-submitting the POST text) and can scroll to an awkward position whether the code below calls focus() or not. 
Could at least translate to GET if it's short enough (don't want to start storing things on the adjuster machine - that would require a shared database if load-balancing) + if len(txt) <= 16384: # (else we wouldn't decompress all; see comment above) + enc = base64.b64encode(zlib.compress(txt,9)) + if 0 < len(enc) < 2000: return self.redirect(options.submitPath+enc,303) # POST to GET + + # pretend it was served by a remote site; go through everything including filters (TODO: could bypass most of doResponse instead of rigging it up like this; alternatively keep this as it shows how to feed data to doResponse) + self.connection_header = None + self.urlToFetch = "" # for js_process + class H: + def get_all(self): return [("Content-Type","text/html; charset=utf-8")] + class R: + code = 200 + headers = H() + r=R() ; r.body="""<html><head><title>Uploaded Text - Web Adjuster</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body><h3>Your text</h3>%s<hr>This is %s. %s</body></html>""" % (txt2html(txt),serverName_html,backScriptNoBr) # backScriptNoBr AFTER the server notice to save vertical space + self.doResponse(r,[False]*4,False,False) + + def checkTextCache(self): + # check for PDF/EPUB conversion on other threads or cached + if not options.pdfepubkeep: return False # we don'tguarantee to update kept_tempfiles properly if it's 0 (e.g. 
pdf can just pipe, so don't need unlinkOutputLater) + ktkey = (self.request.host, self.request.uri) + if ktkey in kept_tempfiles: + def tryRead(): + try: txt=open(kept_tempfiles[ktkey]).read() + except: txt = None + if txt: + self.write(remove_blanks_add_utf8_BOM(txt)) + self.myfinish() + elif not self.inProgress(): IOLoop.instance().add_timeout(time.time()+1,lambda *args:tryRead()) + tryRead() ; return True + kept_tempfiles[ktkey] = 1 # conversion in progress + return False + def doReq(self): debuglog("doReq "+self.request.uri) - if self.request.uri=="/ping" and self.request.headers.get("User-Agent","")=="ping": return self.answerPing() + if self.request.headers.get("User-Agent","")=="ping": + if self.request.uri=="/ping2": return self.answerPing(True) + elif self.request.uri=="/ping": return self.answerPing(False) + self.find_real_IP() # must do this BEFORE forwarding to fasterServer, because might also be behind nginx etc if fasterServer_up: - return self.proxyFor(options.fasterServer) - self.find_real_IP() + return self.forwardFor(options.fasterServer) # TODO: option to restrict by self.request.remote_ip ? Slow down heavy users? 
+ if self.handleFullLocation(): return + if ownServer_regexp and ownServer_regexp.match(self.request.host+self.request.uri): + self.request.headers["Connection"] = "close" # MUST do this (keepalive can go wrong if it subsequently fetches a URL that DOESN'T match ownServer_regexp, but comes from the same domain, and this goes to ownServer incorrectly), TODO mention it in the help text?, TODO might we occasionally need something similar for ownServer_if_not_root etc?, TODO at lower priority: if we can reasonably repeat the requests then do that insntead of using forwardFor + return self.forwardFor(options.own_server) viewSource = self.checkViewsource() self.cookieViaURL = None realHost = convert_to_real_host(self.request.host,self.cookie_host(checkReal=False)) # don't need checkReal if return value will be passed to convert_to_real_host anyway if realHost == -1: - return self.proxyFor(options.own_server) + return self.forwardFor(options.own_server) # (TODO: what if it's keep-alive and some browser figures out our other domains are on the same IP and tries to fetch them through the same connection? is that supposed to be allowed?) - elif realHost==0 and options.ownServer_if_not_root: realHost=options.own_server # asking by cookie to adjust the same host, so don't proxyFor() it but fetch it normally and adjust it + elif realHost==0 and options.ownServer_if_not_root: realHost=options.own_server # asking by cookie to adjust the same host, so don't forwardFor() it but fetch it normally and adjust it self.request.valid_for_whois = 1 # (if options.whois, don't whois unless it gets this far, e.g. 
don't whois any that didn't even match "/(.*)" etc) @@ -745,18 +1043,18 @@ class RequestForwarder(RequestHandler): # (This is not done if options.real_proxy because we don't want to touch the hostname for that) host = self.request.host if host: - if host.endswith(":"+str(options.port)): host=host[:-len(":"+str(options.port))] + if host.endswith(":"+str(options.publicPort)): host=host[:-len(":"+str(options.publicPort))] for hs in options.host_suffix.split("/"): ohs = "."+hs if host.endswith(ohs) and host.index(".")<len(host)-len(ohs): if maybeRobots: return self.serveRobots() - if options.port==80: colPort="" - else: colPort=":"+str(options.port) + if options.publicPort==80: colPort="" + else: colPort=":"+str(options.publicPort) return self.redirect("http://"+dedot(host[:-len(ohs)])+ohs+colPort+self.request.uri) # Now OK to check authentication: if not self.authenticates_ok(host): if options.auth_error=="http://": - if options.own_server: return self.proxyFor(options.own_server) + if options.own_server: return self.forwardFor(options.own_server) elif maybeRobots: return self.serveRobots() else: options.auth_error = "auth_error set incorrectly (own_server not set)" # see auth_error help (TODO: is it really a good idea to say this HERE?) elif maybeRobots: return self.serveRobots() @@ -765,15 +1063,17 @@ class RequestForwarder(RequestHandler): self.write("<html><body>"+options.auth_error+"</body></html>") self.myfinish() ; return # Authentication is now OK - self.set_header("Server",program_name[:program_name.index("(c)")].strip()) + self.set_header("Server",serverName) # TODO: in "real" proxy mode, "Server" might not be the most appropriate header to set for this + try: self.clear_header("Date") # Date is added by Tornado 3; HTTP 1.1 says it's mandatory but then says don't put it if you're a clockless server (which we might be I suppose) so it seems leaving it out is OK especially if not specifying Age etc, and leaving it out saves bytes. 
But if the REMOTE server specifies a Date then we should probably pass it on (see comments below) + except: pass # (ok if "Date" wasn't there) # Now check if it's an image request: - path = self.request.uri - if path.startswith("http://"): path=urlparse.urlunparse(("","")+urlparse.urlparse(path)[2:]) # (gets here only if options.real_proxy, otherwise we won't have added a handler for paths that don't start with '/') - img = Renderer.getImage(path) + img = Renderer.getImage(self.request.uri) if img: return self.serveImage(img) # Not an image: + if options.mailtoPath and self.request.uri.startswith(options.mailtoPath): return self.serve_mailtoPage() + if options.submitPath and self.request.uri.startswith(options.submitPath): return self.serve_submitPage() if not realHost: # default_site(s) not set - if options.own_server and options.ownServer_if_not_root and len(self.request.path)>1: return self.proxyFor(options.own_server) + if options.own_server and options.ownServer_if_not_root and len(self.request.path)>1: return self.forwardFor(options.own_server) elif maybeRobots: return self.serveRobots() # Serve URL box v=self.request.arguments.get("q","") @@ -819,40 +1119,60 @@ class RequestForwarder(RequestHandler): if self.request.headers.get_list("Cookie"): # some sites require them all in one header ck = "; ".join(self.request.headers.get_list("Cookie")) - self.request.headers["Cookie"]=ck + self.request.old_cookie = ck + def ours(c): # don't forward our own cookies upstream (may confuse some sites, especially if a site uses Web Adjuster their end) + c = c.strip() + if not '=' in c: return 0 + c = c[:c.index('=')] + return (options.htmlonly_mode and c==htmlmode_cookie_name) or (options.cssName and c=="adjustCssSwitch") or (options.htmlFilterName and c=="adjustNoFilter") or (options.renderName and c=="adjustNoRender") or (options.prominentNotice and c=="_WA_warnOK") or (c==adjust_domain_cookieName and self.cookie_host()) + if options.upstream_guard: + def maketheirs(c): + 
if options.cssName: c=c.replace("adjustCssSwitch1","adjustCssSwitch") + if options.htmlFilterName: c=c.replace("adjustNoFilter1","adjustNoFilter") + if options.renderName: c=c.replace("adjustNoRender1","adjustNoRender") + return c + else: maketheirs = lambda x:x + self.request.headers["Cookie"]=";".join(maketheirs(x) for x in ck.split(";") if not ours(x)) for v in self.request.headers.get_list("Referer"): if v: self.request.headers["Referer"] = fixDNS(v) if "http://" in self.request.uri[1:]: # xyz?q=http://... stuff u=self.request.uri.split("http://") for i in range(1,len(u)): u[i]=fixDNS(u[i]) self.request.uri="http://".join(u) - if self.request.uri.startswith("http://"): # will reach here only if options.real_proxy - if not self.request.headers["Host"]: self.request.headers["Host"] = urlparse.urlparse(self.request.uri).netloc - self.request.uri = urlparse.urlunparse(("","")+urlparse.urlparse(self.request.uri)[2:]) - try: del self.request.headers['Proxy-Connection'] - except: pass - else: self.request.headers["Host"]=realHost + try: del self.request.headers['Proxy-Connection'] + except: pass + self.request.headers["Host"]=realHost try: del self.request.headers["Accept-Encoding"] # we'd better re-do that one except: pass if options.via: v = self.request.version if v.startswith("HTTP/"): v=v[5:] - self.addToHeader("Via",v+" "+convert_to_via_host(self.request.host)+" ("+program_name[:program_name.index("(c)")].strip()+")") + self.addToHeader("Via",v+" "+convert_to_via_host(self.request.host)+" ("+viaName+")") self.addToHeader("X-Forwarded-For",self.request.remote_ip) - self.sendRequest(converterFlags,viewSource,isProxyRequest,follow_redirects=False) # (DON'T follow redirects - browser needs to know about them!) 
+ self.urlToFetch = "http://"+self.request.headers["Host"]+self.request.uri + if not isProxyRequest and any(re.search(x,self.urlToFetch) for x in options.prohibit): return self.redirect(self.urlToFetch) + try: + self.connection_header = self.request.headers['Connection'] + del self.request.headers['Connection'] + except: self.connection_header = None + if options.redirectFiles and not (isProxyRequest or any(converterFlags) or viewSource) and '.' in self.request.uri and self.request.uri[self.request.uri.rindex('.')+1:].lower() in redirectFiles_Extensions and not (options.bitrate and not options.askBitrate and self.request.uri[self.request.uri.rindex('.'):].lower()==".mp3"): self.sendHead() + else: self.sendRequest(converterFlags,viewSource,isProxyRequest,follow_redirects=False) # (DON'T follow redirects - browser needs to know about them!) def sendRequest(self,converterFlags,viewSource,isProxyRequest,follow_redirects): http = AsyncHTTPClient() body = self.request.body if not body: body = None # required by some Tornado versions - # TODO: basic authentication? auth_username, auth_password - self.urlToFetch = "http://"+self.request.headers["Host"]+self.request.uri - # TODO: try del self.request.headers['Connection'] ? but check it can't mess up Tornado (may have to put it back before write()s) http.fetch(self.urlToFetch, + connect_timeout=60,request_timeout=120, # Tornado's default is usually something like 20 seconds each; be more generous to slow servers (TODO: customise?) 
+ proxy_host=upstream_proxy_host, proxy_port=upstream_proxy_port, use_gzip=not hasattr(self,"avoid_gzip"), method=self.request.method, headers=self.request.headers, body=body, callback=lambda r:self.doResponse(r,converterFlags,viewSource,isProxyRequest),follow_redirects=follow_redirects) + # (Don't have to worry about auth_username/auth_password: should just work by passing on the headers) + # TODO: header_callback (run with each header line as it is received, and headers will be empty in the final response); streaming_callback (run with each chunk of data as it is received, and body and buffer will be empty in the final response), but how to abort a partial transfer if we realise we don't want it (e.g. large file we don't want to modify on site that doesn't mind client being redirected there directly) def doResponse(self,response,converterFlags,viewSource,isProxyRequest): debuglog("doResponse "+self.request.uri) + if hasattr(self.request,"old_cookie"): self.request.headers["Cookie"] = self.request.old_cookie # put it back as we'll need to refer to our cookies + if self.connection_header: self.request.headers['Connection'] = self.connection_header # put it back before writing the response do_pdftotext,do_epubtotext,do_epubtozip,do_mp3 = converterFlags do_domain_process = do_html_process = do_js_process = True do_json_process = do_css_process = False @@ -865,32 +1185,36 @@ class RequestForwarder(RequestHandler): # Some versions of the GWAN server can send NULL bytes at the end of gzip data. Retry without requesting gzip. 
self.avoid_gzip = True return self.sendRequest(converterFlags,viewSource,isProxyRequest,False) + tryFetch = self.urlToFetch + if options.upstream_proxy: tryFetch += " via "+options.upstream_proxy + logging.error(error+" when fetching "+tryFetch) # better log it for the admin, especially if options.upstream_proxy, because it might be an upstream proxy malfunction + error = """<html><head><title>Error</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body><h1>Error</h1>%s<br>Was trying to fetch %s<hr>This is %s</body></html>""" % (error,ampEncode(tryFetch),serverName_html) self.set_status(504) - return self.doResponse2(("<html><body>%s</body></html>" % error),True,False) + return self.doResponse2(error,True,False) if viewSource: - def txt2html(t): return t.replace("&","&amp;").replace("<","&lt;").replace(">","&gt;").replace("\n","<br>") def h2html(h): return "<br>".join("<b>"+txt2html(k)+"</b>: "+txt2html(v) for k,v in sorted(h.get_all())) - return self.doResponse2("<html><body><a href=\"#1\">Headers sent</a> | <a href=\"#2\">Headers received</a> | <a href=\"#3\">Page source</a><a name=\"1\"></a><h2>Headers sent</h2>"+h2html(self.request.headers)+"<a name=\"2\"></a><h2>Headers received</h2>"+h2html(response.headers)+"<a name=\"3\"></a><h2>Page source</h2>"+txt2html(response.body),True,False) + return self.doResponse2("<html><body><a href=\"#1\">Headers sent</a> | <a href=\"#2\">Headers received</a> | <a href=\"#3\">Page source</a> | <a href=\"#4\">Bottom</a> <a name=\"1\"></a><h2>Headers sent</h2>"+h2html(self.request.headers)+"<a name=\"2\"></a><h2>Headers received</h2>"+h2html(response.headers)+"<a name=\"3\"></a><h2>Page source</h2>"+txt2html(response.body)+"<hr><a name=\"4\"></a>This is "+serverName_html,True,False) headers_to_add = [] if (do_pdftotext or do_epubtotext or do_epubtozip or do_mp3) and not response.headers.get("Location","") and response.headers.get("Content-type","").startswith("text/"): # We thought 
we were going to get a PDF etc that could be converted, but it looks like they just sent more HTML (perhaps a "which version of the PDF did you want" screen) do_pdftotext=do_epubtotext=do_epubtozip=do_mp3=False cookie_host = self.cookie_host() + doRedirect = "" for name,value in response.headers.get_all(): - if name.lower() in ["connection","content-length","content-encoding","transfer-encoding","etag","server","alternate-protocol"]: continue # we'll do our own connection type etc + if name.lower() in ["connection","content-length","content-encoding","transfer-encoding","etag","server","alternate-protocol"]: continue # we'll do our own connection type etc (but don't include "Date" in this list: if the remote server includes a Date it would be useful to propagate that as a reference for its Age headers etc, TODO: unless remote server is broken? see also above comment re having no Date by default) # TODO: WebSocket (and Microsoft SM) gets the client to say 'Connection: Upgrade' with a load of Sec-WebSocket-* headers, check what Tornado does with that if (do_pdftotext or do_epubtotext) and name.lower() in ["content-disposition","content-type"]: continue # we're re-doing these also elif do_epubtozip and name.lower()=="content-disposition" and value.replace('"','').endswith(".epub"): epub = value.rfind(".epub") value=value[:epub]+".zip"+value[epub+5:] elif "location" in name.lower(): - old_value_1 = value + old_value_1 = value # before domain_process if not isProxyRequest: value=domain_process(value,cookie_host,True) - offsite = (value==old_value_1 and "http://" in value) - else: offsite = False - old_value_2 = value - if do_pdftotext: # is it still going to be pdf? + offsite = (value==old_value_1 and value.startswith("http://")) # i.e. 
domain_process didn't change it, and it's not relative + else: offsite = False # proxy requests are never "offsite" + old_value_2 = value # after domain_process but before PDF/EPUB-etc rewrites + if do_pdftotext: # is it still going to be pdf after the redirect? if value.lower().endswith(".pdf") or guessCMS(value,"pdf"): value += pdftotext_suffix if do_epubtotext: if value.lower().endswith(".epub") or guessCMS(value,"epub"): value += epubtotext_suffix @@ -902,16 +1226,22 @@ class RequestForwarder(RequestHandler): # ouch, we're not going to be able to do it this way because it's redirecting to somewhere we can't domain-proxy for. But we could follow the redirects ourselves to do the conversion: return self.sendRequest(converterFlags,viewSource,isProxyRequest,follow_redirects=True) # TODO: if that sendRequest results in HTML, overriding the do_... options, the browser will end up with an incorrect idea of the current address; might want to detect this and give the user the unchanged Location: header - else: do_pdftotext=do_epubtotext=do_epubtozip=do_mp3=False # do not attempt to media-process any body that is sent with this Location: redirect (if it's just a copy of the URL then running it through ebook-convert might hold things up unnecessarily) - if cookie_host and self.request.path=="/" and value.startswith("http") and not value.startswith("http://"+cookie_host+"/"): + # else: do_pdftotext=do_epubtotext=do_epubtozip=do_mp3=False # do not attempt to media-process any body that is sent with this Location: redirect (if it's just a copy of the URL then running it through ebook-convert might hold things up unnecessarily) + # -> actually, don't need to process the body AT ALL (doing so and adding our scripts etc is only bloat), we can do our own brief redirect. But not yet, because we might have to set cookies first. + else: doRedirect = value # TODO: do we need to check if response.code is in [301,302,303,307] before accepting a Location: ? 
+ if cookie_host and self.request.path=="/" and old_value_1.startswith("http") and not old_value_1.startswith("http://"+cookie_host+"/"): # This'll be a problem. If the user is requesting / and the site's trying to redirect off-site, how do we know that the user isn't trying to get back to the URL box (having forgotten to clear the cookie) and now can't possibly do so because / always results in an off-site Location redirect ? - # (The same thing can occur if offsite is False but we're redirecting to one of our other domains, hence we use the value.startswith condition instead of the 'offsite' flag; the latter is true only if NONE of our domains can do it.) + # (The same thing can occur if offsite is False but we're redirecting to one of our other domains, hence we use the old_value_1.startswith condition instead of the 'offsite' flag; the latter is true only if NONE of our domains can do it.) # (DON'T just do this for ANY offsite url when in cookie_host mode - that could mess up images and things. (Could still mess up images etc if they're served from / with query parameters; for now we're assuming path=/ is a good condition to do this. 
The whole cookie_host thing is a compromise anyway; wildcard_dns is better.)) if offsite: reason="which this adjuster is not currently set to adjust" else: reason="which will be adjusted at %s (not here)" % (value[len("http://"):(value+"/").index('/',len("http://"))],) return self.doResponse2(("<html><body>The server is redirecting you to <a href=\"%s\">%s</a> %s.</body></html>" % (value,old_value_1,reason)),True,False) # and 'Back to URL box' link will be added - elif "set-cookie" in name.lower() and not isProxyRequest: - value=cookie_domain_process(value,cookie_host) + elif "set-cookie" in name.lower(): + if not isProxyRequest: value=cookie_domain_process(value,cookie_host) + if options.upstream_guard: + if options.cssName: value=value.replace("adjustCssSwitch","adjustCssSwitch1") + if options.htmlFilterName: value=value.replace("adjustNoFilter","adjustNoFilter1") + if options.renderName: value=value.replace("adjustNoRender","adjustNoRender1") headers_to_add.append((name,value)) if name.lower()=="content-type": if do_epubtozip: value="application/zip" @@ -927,12 +1257,16 @@ class RequestForwarder(RequestHandler): if do_html_process: headers_to_add[-1]=((name,value.replace(charset,"utf-8"))) # we'll be converting it elif do_html_process: headers_to_add[-1]=((name,value+"; charset=utf-8")) # ditto (don't leave as latin-1) # TODO: if there's no content-type header, send one anyway, with a charset - self.set_status(response.code) # (not before here! as might return doResponse2 above which will need status 200. Redirect without Location gets "unknown error 0x80072f76" on Pocket IE 6.) + self.set_status(response.code) # (not before here! as might return doResponse2 above which will need status 200. Redirect without Location gets "unknown error 0x80072f76" on IEMobile 6.) 
added = {} for name,value in headers_to_add: - if name in added: self.add_header(name,value) + value = value.replace("\t"," ") # needed for some servers + if name.lower() in added: self.add_header(name,value) else: self.set_header(name,value) # overriding any Tornado default - added[name]=1 + added[name.lower()]=1 + if doRedirect: + # ignore response.body and put our own in + return self.redirect(doRedirect,response.code) body = response.body if not body: self.myfinish() ; return # might just be a redirect (TODO: if it's not, set type to text/html and report error?) @@ -949,51 +1283,42 @@ class RequestForwarder(RequestHandler): if not charset=="utf-8": body=body.decode(charset,'replace').encode('utf-8') if do_pdftotext or do_epubtotext: self.set_header("Content-Type","text/plain; charset=utf-8") - self.set_header("Content-Disposition","attachment; filename=\"%s\"" % (self.request.uri[self.request.uri.rfind("/")+1:self.request.uri.rfind(".")]+".txt")) - # (Pocket IE on Windows Mobile doesn't always obey Content-Disposition, but Opera Mini etc should do it) + self.set_header("Content-Disposition","attachment; filename="+urllib.quote(self.request.uri[self.request.uri.rfind("/")+1:self.request.uri.rfind(".")]+".txt")) + # IEMobile 6 (and 7) ignores Content-Disposition and just displays the text in the browser using fonts that can't be made much bigger, even if you set Content-Type to application/octet-stream and filename to something ending .doc (or no filename at all), and even if you change the URL extension from TxT to TxQ or something. Even a null-or-random byte or two before the BOM doesn't stop it. Opening a real PDF file causes "Error: This file cannot be viewed on the device" (even if a PDF viewer is installed and associated with PDF files). Serving a text file with Content-Type application/vnd.ms-cab-compressed results in no error but no download either (real CAB files give a download question); same result for application/msword or application/rtf. 
+ # And Opera Mini's not much help on that platform because its fonts can't be enlarged much (registry hacks to do so don't seem to work on non-touchscreen phones), although it could be squinted at to save some text files for later. + # Opera Mobile 10 on Windows Mobile also has trouble recognising Content-Disposition: attachment, even though Opera Mini is fine with it. + # Could show text as HTML, but that wouldn't allow saving multiple files for later (unless they all fit in cache, but even then navigation is restrictive). import tempfile if do_pdftotext: ext="pdf" elif do_epubtotext: ext="epub" else: ext="" # shouldn't get here - def remove_blanks_add_utf8_BOM(out): return '\xef\xbb\xbf'+"\n".join([x for x in out.replace("\r","").split("\n") if x]) - # first, is the result of pdftotext etc cached? - ktkey = (self.request.host, self.request.uri) - if kept_tempfiles.get(ktkey,None)==1: - # This conversion is in progress on another request (TODO: check it didn't somehow fail without updating kept_tempfiles?) - def tryLater(): - try: txt=open(kept_tempfiles[ktkey]).read() - except: txt = None - if txt: - self.write(remove_blanks_add_utf8_BOM(txt)) - self.myfinish() - else: IOLoop.instance().add_timeout(time.time()+1,lambda *args:tryLater()) - return tryLater() - if not ktkey in kept_tempfiles: - kept_tempfiles[ktkey] = 1 # in progress - try: txt=open(kept_tempfiles[ktkey]).read() - except: txt = None - if txt: - self.write(remove_blanks_add_utf8_BOM(txt)) - self.myfinish() ; return - # not cached - need to generate - f=tempfile.NamedTemporaryFile(delete=False,suffix="."+ext) # Python 2.6+ + if self.checkTextCache(): return + + f=tempfile.NamedTemporaryFile(delete=False,suffix="."+ext) # Python 2.6+ (TODO: if doing pdf/epub conversion in a Python 2.5 environment, would need fd,fname = tempfile.mkstemp(suffix=), and use os.write(fd,..) 
and os.close(fd)) f.write(body) ; f.close() - def tryDel(k): - try: del kept_tempfiles[k] - except: pass - def unlinkLater(k,fn): + def unlinkOutputLater(fn): + k = (self.request.host, self.request.uri) kept_tempfiles[k] = fn # it's ready for now - IOLoop.instance().add_timeout(time.time()+options.pdfepubkeep,lambda *args:(tryDel(k),os.unlink(fn))) - if do_pdftotext: runFilter(("pdftotext -enc UTF-8 -nopgbrk \"%s\" -" % f.name),"",(lambda out:(unlinkLater(ktkey,f.name),self.write(remove_blanks_add_utf8_BOM(out)),self.myfinish())), False) - elif do_epubtotext: - def ebookCallback(self,fn): - try: txt = open(fn+".txt").read() - except: txt = "Unable to read ebook-convert's output" - unlinkLater(ktkey,fn+".txt") - unlinkLater(0,fn) - self.write(remove_blanks_add_utf8_BOM(txt)) - self.myfinish() - runFilter(("ebook-convert %s %s.txt" % (f.name,f.name)),"",(lambda out:ebookCallback(self,f.name)), False) + def tryDel(k): + try: del kept_tempfiles[k] + except: pass + IOLoop.instance().add_timeout(time.time()+options.pdfepubkeep,lambda *args:(tryDel(k),unlink(fn))) + def txtCallback(self,fn,cmdname,err): + try: txt = open(fn+".txt").read() + except: # try to diagnose misconfiguration + txt = "Could not read %s's output from %s\n%s\n(This is %s)" % (cmdname,fn+".txt",err,serverName) + try: open(fn+".txt","w").write(txt) # must unconditionally leave a .txt file as there might be other requests waiting on cache + except: txt += "\nCould not write to "+fn+".txt" # TODO: logging.error as well ? 
+ unlinkOutputLater(fn+".txt") + unlink(fn) + if self.inProgress_run(): return + self.write(remove_blanks_add_utf8_BOM(txt)) + self.myfinish() + self.inProgress() # if appropriate + if do_pdftotext: + if options.pdfepubkeep: runFilter(("pdftotext -enc UTF-8 -nopgbrk \"%s\" \"%s.txt\"" % (f.name,f.name)),"",(lambda out,err:txtCallback(self,f.name,"pdftotext",out+err)), False) + else: runFilter(("pdftotext -enc UTF-8 -nopgbrk \"%s\" -" % f.name),"",(lambda out,err:(unlink(f.name),self.write(remove_blanks_add_utf8_BOM(out)),self.myfinish())), False) # (pipe o/p from pdftotext directly, no temp outfile needed) + else: runFilter(("ebook-convert \"%s\" \"%s.txt\"" % (f.name,f.name)),"",(lambda out,err:txtCallback(self,f.name,"ebook-convert",out+err)), False) return if do_domain_process and not isProxyRequest: body = domain_process(body,cookie_host) # first, so filters to run and scripts to add can mention new domains without these being redirected back # Must also do things like 'delete' BEFORE the filters, especially if lxml is in use and might change the code so the delete patterns aren't recognised @@ -1003,37 +1328,56 @@ class RequestForwarder(RequestHandler): if options.delete_doctype: body=re.sub("^<![dD][oO][cC][tT][yY][pP][eE][^>]*>","",body,1) if do_js_process: body = js_process(body,self.urlToFetch) + if do_css_process: + for d in options.delete_css: + body=re.sub(d,"",body) # OK to change the code now: adjustList = [] if self.htmlOnlyMode(): adjustList.append(StripJSEtc()) - if (options.pdftotext or options.epubtotext or options.epubtozip or options.askBitrate) and (do_html_process or (do_json_process and options.htmlJson)): + elif options.upstream_guard: + # don't let upstream scripts get confused by our cookies (e.g. if the site is running Web Adjuster as well) + # TODO: do it in script files also? 
+ if options.cssName: adjustList.append(transform_in_selected_tag("script",lambda s:s.replace("adjustCssSwitch","adjustCssSwitch1"))) + if options.htmlFilterName: adjustList.append(transform_in_selected_tag("script",lambda s:s.replace("adjustNoFilter","adjustNoFilter1"))) + if options.renderName: adjustList.append(transform_in_selected_tag("script",lambda s:s.replace("adjustNoRender","adjustNoRender1"))) + if (options.pdftotext or options.epubtotext or options.epubtozip or options.askBitrate or options.mailtoPath) and (do_html_process or (do_json_process and options.htmlJson)): # Add PDF links BEFORE the external filter, in case the external filter is broken and we have trouble parsing the result if do_html_process: - adjustList.append(AddConversionLinks()) + adjustList.append(AddConversionLinks(options.wildcard_dns or isProxyRequest)) else: ctl = find_HTML_in_JSON(body) for i in range(1,len(ctl),2): - ctl[i] = json_reEscape(add_conversion_links(ctl[i])) + ctl[i] = json_reEscape(add_conversion_links(ctl[i],options.wildcard_dns or isProxyRequest)) body = "".join(ctl) if options.headAppendCSS: # remove !important from other stylesheets important = re.compile("! 
*important") - if (do_html_process or (do_css_process and not self.urlToFetch == options.headAppendCSS)) and re.search(important,body): + if (do_html_process or (do_css_process and not self.urlToFetch == options.headAppendCSS and not (options.protectedCSS and re.search(options.protectedCSS,self.urlToFetch)))) and re.search(important,body): if do_css_process: body=re.sub(important,"",body) else: adjustList.append(transform_in_selected_tag("style",lambda s:re.sub(important,"",s))) if adjustList: body = HTML_adjust_svc(body,adjustList) - callback = lambda out:self.doResponse2(out,do_html_process,do_json_process) - skipFilter = options.htmlFilterName and "adjustNoFilter=1" in ';'.join(self.request.headers.get_list("Cookie")) - if do_html_process and options.htmlFilter and not skipFilter: - if options.htmlText: runFilterOnText(options.htmlFilter,find_text_in_HTML(body),callback) - else: runFilter(options.htmlFilter,body,callback) - elif do_json_process and options.htmlJson and options.htmlFilter and not skipFilter: + callback = lambda out,err:self.doResponse2(out,do_html_process,do_json_process) + htmlFilter = None + if options.htmlFilterName: + anf = self.getCookie("adjustNoFilter") + if not anf: anf = "0" + elif '-' in anf: anf = anf[anf.rindex("-")+1:] + if anf=="1": pass + elif '#' in options.htmlFilter: + htmlFilter = options.htmlFilter.split('#') + if anf=="0": htmlFilter = htmlFilter[0] + else: htmlFilter = htmlFilter[int(anf)-1] + else: htmlFilter = options.htmlFilter + if do_html_process and htmlFilter: + if options.htmlText: runFilterOnText(htmlFilter,find_text_in_HTML(body),callback) + else: runFilter(htmlFilter,body,callback) + elif do_json_process and options.htmlJson and htmlFilter: if options.htmlText: htmlFunc = find_text_in_HTML else: htmlFunc = None - runFilterOnText(options.htmlFilter,find_HTML_in_JSON(body,htmlFunc),callback,True) + runFilterOnText(htmlFilter,find_HTML_in_JSON(body,htmlFunc),callback,True) elif do_mp3 and options.bitrate: 
runFilter("lame --quiet --mp3input -m m --abr %d - -o -" % options.bitrate,body,callback,False) # -m m = mono (TODO: optional?) - else: callback(body) + else: callback(body,"") def doResponse2(self,body,do_html_process,do_json_process): debuglog("doResponse2 "+self.request.uri) # 2nd stage (domain change and external filter @@ -1041,24 +1385,56 @@ class RequestForwarder(RequestHandler): canRender = options.render and (do_html_process or (do_json_process and options.htmlJson)) and not self.checkBrowser(options.renderOmit) jsCookieString = ';'.join(self.request.headers.get_list("Cookie")) if do_html_process: body = html_additions(body,self.checkBrowser(options.cssNameReload),self.cookieHostToSet(),jsCookieString,canRender,self.cookie_host(),self.is_password_domain) - callback = lambda out:self.doResponse3(out) + callback = lambda out,err:self.doResponse3(out) if canRender and not "adjustNoRender=1" in jsCookieString: if do_html_process: func = find_text_in_HTML else: func=lambda body:find_HTML_in_JSON(body,find_text_in_HTML) debuglog("runFilterOnText Renderer") runFilterOnText(lambda t:Renderer.getMarkup(t.decode('utf-8')).encode('utf-8'),func(body),callback,not do_html_process,chr(0)) - else: callback(body) + else: callback(body,"") def doResponse3(self,body): # 3rd stage (rendering has been done) debuglog("doResponse3 (len=%d)" % len(body)) self.write(body) self.myfinish() + def sendHead(self): + # for options.redirectFiles: it looks like we have a "no processing necessary" request that we can tell the browser to get from the real site. But just confirm it's not a mis-named HTML document. 
+ http = AsyncHTTPClient() + body = self.request.body + if not body: body = None + http.fetch(self.urlToFetch, + connect_timeout=60,request_timeout=120, # same TODO as above + proxy_host=upstream_proxy_host, proxy_port=upstream_proxy_port, + method="HEAD", headers=self.request.headers, body=body, + callback=lambda r:self.headResponse(r),follow_redirects=True) + def headResponse(self,response): + if self.connection_header: self.request.headers['Connection'] = self.connection_header # put it back before writing the response + might_need_processing_after_all = True + for name,value in response.headers.get_all(): + if name.lower()=="content-type": + value=value.lower() + might_need_processing_after_all = ("html" in value or "css" in value or "javascript" in value or "json" in value) # these need at least domain processing + if might_need_processing_after_all: self.sendRequest([False]*4,False,False,follow_redirects=False) + else: + if not options.logRedirectFiles: self.request.suppress_logging = True + self.redirect(self.urlToFetch) def checkBrowser(self,blist): ua = self.request.headers.get("User-Agent","") return any(b in ua for b in blist) kept_tempfiles = {} # TODO: delete any outstanding kept_tempfiles.values() on server interrupt +def remove_blanks_add_utf8_BOM(out): + # for writing text files from PDF and EPUB + return '\xef\xbb\xbf'+"\n".join([x for x in out.replace("\r","").split("\n") if x]) + +def rm_u8punc(u8): + # for SMS links, turn some Unicode punctuation into ASCII (helps with some phones) + for k,v in u8punc_d: u8=u8.replace(k,v) + return u8 +u8punc_d=u"\u2013 -- \u2014 -- \u2018 ' \u2019 ' \u201c \" \u201d \" \u2032 ' \u00b4 ' \u00a9 (c) \u00ae (r)".encode('utf-8').split() +u8punc_d = zip(u8punc_d[::2], u8punc_d[1::2]) + def getSearchURL(q): if not options.search_sites: return urllib.quote(q) # ?? 
def site(s,q): return s.split()[0]+urllib.quote(q) @@ -1076,33 +1452,54 @@ def searchHelp(): elif len(options.search_sites)==1: return " (or enter search terms)" else: return " or enter search terms, first word can be "+", ".join([x.split(None,1)[1] for x in options.search_sites]) def urlbox_html(htmlonly_checked): - r = '<html><head><title>Web Adjuster start page</title><meta name="viewport" content="width=device-width"></head><body><form action="/">Website to adjust: <input type="text" name="q"><input type="submit" value="Go">'+searchHelp() + r = '<html><head><title>Web Adjuster start page</title><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head><body><form action="/">Website to adjust: <input type="text" name="q"><input type="submit" value="Go">'+searchHelp() if htmlonly_checked: htmlonly_checked=' checked="checked"' else: htmlonly_checked = "" if options.htmlonly_mode: r += '<br><input type="checkbox" name="pr"'+htmlonly_checked+'> HTML-only mode' - return r+'</form><script language="javascript"><!--\ndocument.forms[0].q.focus();\n//--></script></body></html>' + return r+'</form><script><!--\ndocument.forms[0].q.focus();\n//--></script></body></html>' + +backScript="""<script><!-- +document.write('<br><a href="javascript:history.go(-1)">Back to previous page</a>') +//--></script>""" +backScriptNoBr="""<script><!-- +document.write('<a href="javascript:history.go(-1)">Back to previous page</a>') +//--></script>""" +# (HTML5 defaults type to text/javascript, as do all pre-HTML5 browsers including NN2's 'script language="javascript"' thing, so we might as well save a few bytes) + +def unlink(fn): + try: os.unlink(fn) + except: pass def runFilter(cmd,text,callback,textmode=True): # runs shell command 'cmd' on input 'text' in a new - # thread, then gets Tornado to call callback(output) + # thread, then gets Tornado to call callback(out,err) # If 'cmd' is not a string, assumes it's a function # to call (no new thread 
necessary, TODO: Jython/SMP) # this is for using runFilterOnText with an internal # callable such as the Renderer. if not type(cmd)==type(""): - # return callback(cmd(text)) + # return callback(cmd(text),"") # slightly more roundabout version to give watchdog ping a chance to work between cmd and callback: out = cmd(text) - return IOLoop.instance().add_timeout(time.time(),lambda *args:callback(out)) + return IOLoop.instance().add_timeout(time.time(),lambda *args:callback(out,"")) def subprocess_thread(): - sp=subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,universal_newlines=textmode) # TODO: check shell=True won't throw error on Windows + global helper_thread_count + helper_thread_count += 1 + sp=subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE,universal_newlines=textmode) # TODO: check shell=True won't throw error on Windows out,err = sp.communicate(text) - IOLoop.instance().add_callback(lambda *args:callback(out)) + if not out: out="" + if not err: err="" # TODO: else logging.debug ? 
(some stderr might be harmless; don't want to fill normal logs) + IOLoop.instance().add_callback(lambda *args:callback(out,err)) + helper_thread_count -= 1 threading.Thread(target=subprocess_thread,args=()).start() def runBrowser(*args): def browser_thread(): - os.system(options.browser) ; stopServer() + global helper_thread_count + helper_thread_count += 1 + os.system(options.browser) + helper_thread_count -= 1 + stopServer() threading.Thread(target=browser_thread,args=()).start() def stopServer(*args): IOLoop.instance().add_callback(lambda *args:IOLoop.instance().stop()) @@ -1144,11 +1541,11 @@ def runFilterOnText(cmd,codeTextList,callback,escape=False,separator=None): def countItems(l): return len(separator.join(getText(l)).split(separator)) text = getText(codeTextList) toSend = separator.join(text) - if options.separator: + if separator == options.separator: toSend=separator+toSend+separator sortout = lambda out:out.split(separator)[1:-1] else: sortout = lambda out:out.split(separator) - runFilter(cmd,toSend,lambda out:callback("".join(getText(codeTextList,sortout(out),True)))) + runFilter(cmd,toSend,lambda out,err:callback("".join(getText(codeTextList,sortout(out),True)),err)) def extractCharsetEquals(value): charset=value[value.index("charset=")+len("charset="):] @@ -1187,38 +1584,48 @@ pdftotext_suffix = epubtotext_suffix = ".TxT" # TODO: what if a server uses .pdf mp3lofi_suffix = "-lOfI.mP3" epubtozip_suffix = ".ZiP" # TODO: what if a server uses .epub.ZiP ? 
class AddConversionLinks: + def __init__(self,offsite_ok): self.offsite_ok = offsite_ok def init(self,parser): self.parser = parser self.gotPDF=self.gotEPUB=self.gotMP3=None def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if tag=="a" and "href" in attrs: - l = attrs["href"].lower() + attrsD = dict(attrs) + if tag=="a" and "href" in attrsD: + l = attrsD["href"].lower() if l.startswith("http://"): - if not options.wildcard_dns and not options.real_proxy and not url_is_ours(l): return # "offsite" link, can't process (TODO: unless we send it to ourselves via an alternate syntax) - # TODO: should also check isProxyRequest rather than just relying on options.real_proxy + if not self.offsite_ok and not url_is_ours(l): return # "offsite" link, can't process (TODO: unless we send it to ourselves via an alternate syntax) # TODO: (if don't implement processing the link anyway) insert explanatory text for why an alternate link wasn't provided? + elif options.mailtoPath and l.startswith("mailto:"): + r=['<'+tag+" "] + for k,v in items(attrs): + if k.lower()=="href": v=options.mailtoPath+v[7:] + r.append(k+'="'+v.replace('&','&').replace('"','"').replace('&#','&#')+'"') + r.append('>') + self.parser.addDataFromTagHandler("".join(r),True) + return True # suppress original tag elif ":" in l and l.index(":")<l.find("/"): return # non-HTTP protocol - can't do (TODO: unless we do https, or send the link to ourselves via an alternate syntax) if l.endswith(".pdf") or guessCMS(l,"pdf"): - self.gotPDF = attrs["href"] + self.gotPDF = attrsD["href"] + if options.pdfomit and any(re.search(x,self.gotPDF) for x in options.pdfomit.split(",")): self.gotPDF = None if l.endswith(".epub") or guessCMS(l,"epub"): - self.gotEPUB = attrs["href"] + self.gotEPUB = attrsD["href"] if l.endswith(".mp3"): - self.gotMP3 = attrs["href"] + self.gotMP3 = attrsD["href"] def handle_endtag(self, tag): if tag=="a" and ((self.gotPDF and options.pdftotext) or (self.gotEPUB and (options.epubtozip or 
options.epubtotext)) or (self.gotMP3 and options.bitrate and options.askBitrate)): linksToAdd = [] - if self.gotPDF: linksToAdd.append("<a href=\"%s%s\">text</a>" % (self.gotPDF,pdftotext_suffix)) + linkStart = "<a style=\"display:inline!important;float:none!important\" href=" # adding style in case a site styles the previous link with float, which won't work with our '('...')' stuff + if self.gotPDF: linksToAdd.append("%s\"%s%s\">text</a>" % (linkStart,self.gotPDF,pdftotext_suffix)) elif self.gotEPUB: - if options.epubtotext: linksToAdd.append("<a href=\"%s%s\">text</a>" % (self.gotEPUB,epubtotext_suffix)) - if options.epubtozip: linksToAdd.append("<a href=\"%s%s\">zip</a>" % (self.gotEPUB,epubtozip_suffix)) - elif self.gotMP3: linksToAdd.append("<a href=\"%s%s\">lo-fi</a>" % (self.gotMP3,mp3lofi_suffix)) + if options.epubtotext: linksToAdd.append("%s\"%s%s\">text</a>" % (linkStart,self.gotEPUB,epubtotext_suffix)) + if options.epubtozip: linksToAdd.append("%s\"%s%s\">zip</a>" % (linkStart,self.gotEPUB,epubtozip_suffix)) + elif self.gotMP3: linksToAdd.append("%s\"%s%s\">lo-fi</a>" % (linkStart,self.gotMP3,mp3lofi_suffix)) if linksToAdd: self.parser.addDataFromTagHandler(" ("+", ".join(linksToAdd)+") ") self.gotPDF=self.gotEPUB=self.gotMP3=None def handle_data(self,data): pass -def add_conversion_links(h): +def add_conversion_links(h,offsite_ok): # (wrapper for when we can't avoid doing a special-case HTMLParser for it) - return HTML_adjust_svc(h,[AddConversionLinks()],can_use_LXML=False) # False because we're likely dealing with a fragment inside JSON, not a complete HTML document + return HTML_adjust_svc(h,[AddConversionLinks(offsite_ok)],can_use_LXML=False) # False because we're likely dealing with a fragment inside JSON, not a complete HTML document class StripJSEtc: # TODO: HTML_adjust_svc might need to do handle_entityref and handle_charref to catch those inside scripts etc @@ -1235,6 +1642,9 @@ class StripJSEtc: # TODO: remove style= attribute on other tags? 
(or only if it refers to a URL?) # TODO: what about event handler attributes, and javascript: URLs def handle_endtag(self, tag): + if tag=="head": + self.parser.addDataFromTagHandler('<meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"></head>',True) # TODO: document that htmlonly_mode adds this; might also want to have it when CSS is on + return True # suppress </head> because we've done it ourselves in the above (had to or addDataFromTagHandler would have added it AFTER the closing tag) if tag in ['script','style']: self.suppressing = False ; return True elif tag=='noscript': return True @@ -1379,16 +1789,32 @@ def HTML_adjust_svc_LXML(htmlStr,adjustList): etree.parse(StringIO(htmlStr.decode('utf-8')), lparser) return "".join(parser.out) +def items(maybeDict): + if type(maybeDict)==dict: return maybeDict.items() + else: return maybeDict + def transform_in_selected_tag(intag,transformFunc): - # assumes intag is closed and not nested, e.g. style + # assumes intag is closed and not nested, e.g. style, although small tags appearing inside it MIGHT work # also assumes transformFunc doesn't need to know about entity references etc (it's called for the data between them) class Adjustment: def init(self,parser): self.intag = False + self.parser = parser def handle_starttag(self, tag, attrs): - self.intag=(tag==intag) + if tag==intag: self.intag=True + elif intag=="script": + attrsD = dict(attrs) + if ("onclick" in attrsD and transformFunc(attrsD["onclick"]) != attrsD["onclick"]) or ("id" in attrsD and transformFunc(attrsD["id"]) != attrsD["id"]): # TODO: name as well? 
(shouldn't be needed for our own scripts) + # Re-write the tag ourselves, with that attribute changed + r=['<'+tag+" "] + for k,v in items(attrs): + if k in ["onclick","id"]: v = transformFunc(v) + r.append(k+'="'+v.replace('&','&').replace('"','"')+'"') + r.append('>') + self.parser.addDataFromTagHandler("".join(r),True) + return True def handle_endtag(self, tag): - self.intag = False + if tag==intag: self.intag=False def handle_data(self,data): if self.intag: return transformFunc(data) @@ -1422,25 +1848,35 @@ def find_text_in_HTML(htmlStr): # returns a codeTextList; encodes entities in ut return LXML_find_text_in_HTML(htmlStr) import htmlentitydefs class Parser(HTMLParser): + def shouldStripTag(self,tag): + self.ignoredLastTag = (tag.lower() in options.stripTags and (self.ignoredLastTag or self.getBytePos()==self.lastCodeStart)) + return self.ignoredLastTag def handle_starttag(self, tag, attrs): + if self.shouldStripTag(tag): return if tag in options.leaveTags: self.ignoreData=True def handle_endtag(self, tag): + if self.shouldStripTag(tag): return if tag in options.leaveTags: self.ignoreData=False # doesn't check for nesting or balancing # (documented limitation) - def handle_data(self,data,datalen=None): - if self.ignoreData or not data.strip(): - return # keep treating it as code - if datalen==None: data = latin1decode(data) + def getBytePos(self): # TODO: duplicate code line,offset = self.getpos() while line>self.knownLine: self.knownLine += 1 self.knownLinePos=htmlStr.find('\n',self.knownLinePos)+1 - dataStart = self.knownLinePos + offset - self.codeTextList.append(latin1decode(htmlStr[self.lastCodeStart:dataStart])) - self.codeTextList.append(data) + return self.knownLinePos + offset + def handle_data(self,data,datalen=None): + if self.ignoreData or not data.strip(): + return # keep treating it as code + if datalen==None: data = latin1decode(data) + dataStart = self.getBytePos() + if self.codeTextList and (self.ignoredLastTag or dataStart == 
self.lastCodeStart): # no intervening code, merge (TODO reduce string concatenation?) + self.codeTextList[-1] += data + else: + self.codeTextList.append(latin1decode(htmlStr[self.lastCodeStart:dataStart])) + self.codeTextList.append(data) if datalen==None: datalen = len(data) # otherwise we're overriding it for entity refs etc self.lastCodeStart = dataStart+datalen def handle_entityref(self,name): @@ -1453,7 +1889,7 @@ def find_text_in_HTML(htmlStr): # returns a codeTextList; encodes entities in ut parser = Parser() parser.codeTextList = [] ; parser.lastCodeStart = 0 parser.knownLine = 1 ; parser.knownLinePos = 0 - parser.ignoreData = False + parser.ignoreData = parser.ignoredLastTag = False htmlStr = fixHTML(htmlStr) err="" try: @@ -1473,27 +1909,39 @@ def find_text_in_HTML(htmlStr): # returns a codeTextList; encodes entities in ut def LXML_find_text_in_HTML(htmlStr): import htmlentitydefs class Parser: + def shouldStripTag(self,tag): + self.ignoredLastTag = (tag.lower() in options.stripTags and (self.ignoredLastTag or not self.out)) + return self.ignoredLastTag def start(self, tag, attrs): + sst = self.shouldStripTag(tag) self.out.append(lxmlEncodeTag(tag,dict((k,v.encode('utf-8')) for k,v in dict(attrs).items()))) - if tag in options.leaveTags: + if (not sst) and tag in options.leaveTags: self.ignoreData=True def end(self, tag): + sst = self.shouldStripTag(tag) if tag not in html_tags_not_needing_ends: self.out.append("</"+tag+">") - if tag in options.leaveTags: + if (not sst) and tag in options.leaveTags: self.ignoreData=False def data(self,unidata): data = unidata.encode('utf-8') if self.ignoreData or not data.strip(): self.out.append(data) ; return - self.codeTextList.append("".join(self.out)) - self.codeTextList.append(data) + if self.ignoredLastTag: self.out = [] + out = "".join(self.out) + if self.codeTextList and not out: + # merge (TODO reduce string concatenation?) 
+ self.codeTextList[-1] += data + else: + self.codeTextList.append(out) + self.codeTextList.append(data) self.out = [] def comment(self,text): # TODO: same as above's def comment self.out.append("<!--"+text+"-->") def close(self): pass parser = Parser() ; parser.out = [] - parser.codeTextList = [] ; parser.ignoreData = False + parser.codeTextList = [] + parser.ignoreData = parser.ignoredLastTag = False lparser = etree.HTMLParser(target=parser) etree.parse(StringIO(htmlStr.decode('utf-8')), lparser) if len(parser.codeTextList)%2: parser.codeTextList.append("") @@ -1597,14 +2045,51 @@ def reloadSwitchJS(cookieName,jsCookieString,flipLogic,readableName,cookieHostTo if flipLogic: isOn,setOn,setOff = (not isOn),setOff,setOn if extraCondition: extraCondition = "&&"+extraCondition else: extraCondition = "" - if isOn: return r"""<script type="text/javascript"><!-- -if(!%s%s)document.write("%s: On | "+'<a href="javascript:document.cookie=\'%s=%s;domain=%s;expires=%s;path=/\';document.cookie=\'%s=%s;domain=.%s;expires=%s;path=/\';location.reload()">Off</a> ') + if isOn: return r"""<script><!-- +if(!%s%s&&document.readyState!='complete')document.write("%s: On | "+'<a href="javascript:document.cookie=\'%s=%s;domain=%s;expires=%s;path=/\';document.cookie=\'%s=%s;domain=.%s;expires=%s;path=/\';location.reload(true)">Off<\/a> ') //--></script>""" % (detect_iframe,extraCondition,readableName,cookieName,setOff,cookieHostToSet,cookieExpires,cookieName,setOff,cookieHostToSet,cookieExpires) - else: return r"""<script type="text/javascript"><!-- -if(!%s%s)document.write("%s: "+'<a href="javascript:document.cookie=\'%s=%s;domain=%s;expires=%s;path=/\';document.cookie=\'%s=%s;domain=.%s;expires=%s;path=/\';location.reload()">On</a> | Off ') + else: return r"""<script><!-- +if(!%s%s&&document.readyState!='complete')document.write("%s: "+'<a 
href="javascript:document.cookie=\'%s=%s;domain=%s;expires=%s;path=/\';document.cookie=\'%s=%s;domain=.%s;expires=%s;path=/\';location.reload(true)">On<\/a> | Off ') //--></script>""" % (detect_iframe,extraCondition,readableName,cookieName,setOn,cookieHostToSet,cookieExpires,cookieName,setOn,cookieHostToSet,cookieExpires) -def detect_renderCheck(): return r"""(document.getElementsByTagName && function(){var b=document.getElementsByTagName("BODY")[0],d=document.createElement("DIV"),s=document.createElement("SPAN"); d.appendChild(s); function wid(chr) { s.innerHTML = chr; b.appendChild(d); var width = s.offsetWidth; b.removeChild(d); return width; } var w1=wid("\u%s"),w2=wid("\uffff"),w3=wid("\ufffe"),w4=wid("\u2fdf"); return (w1!=w2 && w1!=w3 && w1!=w4)}())""" % options.renderCheck +def reloadSwitchJSMultiple(cookieName,jsCookieString,flipInitialItems,readableNames,cookieHostToSet,cookieExpires): + # flipInitialItems: for adjustNoFilter compatibility between one and multiple items, 1 means off, 0 (default) means 1st item, 2 means 2nd etc. (Currently, this function is only ever called with flipInitialItems==True) + r = [r"""<script><!-- +if(!%s&&document.readyState!='complete'){document.write("%s: """ % (detect_iframe,readableNames[0])] + spanStart = 0 + for i in range(len(readableNames)): + if i: r.append(" | ") + if i==len(readableNames)-1: + rN = "Off" + if flipInitialItems: chk = "1" + else: chk = "0" + else: + if i==2: + spanStart = len(r) + r.append('<span id=adjustNoFilter>') + # (gets here if len(readableNames)>3; use this as ID because we already have transform_in_selected_tag on it) (NB if quoting the id, use r'\"' because we're in a document.write) + rN = readableNames[i+1] + if flipInitialItems: + if i: chk=str(i+1) + else: chk="0" + else: chk = str(i+1) + if i >= 9: chk="x"+str(len(chk))+"-"+chk # so we can continue to use the 'x in string' code without worrying about common prefixes 1, 10, 100 ... 
+ isOn = (cookieName+"="+chk) in jsCookieString + if chk=="0" and not isOn and not cookieName+"=" in jsCookieString: isOn = 1 # default + if isOn: + r.append(rN) + if 2 <= i < len(readableNames)-1: + # want to keep it unhidden if an option is selected that's not in the first 2 and isn't the "Off" + del r[spanStart] + spanStart = 0 + else: r.append(r""""+'<a href="javascript:document.cookie=\'%s=%s;domain=%s;expires=%s;path=/\';document.cookie=\'%s=%s;domain=.%s;expires=%s;path=/\';location.reload(true)">'+"%s<"+"\/a>""" % (cookieName,chk,cookieHostToSet,cookieExpires,cookieName,chk,cookieHostToSet,cookieExpires,rN)) + if spanStart: r.append('<"+"/span>') + r.append(' ")') + if spanStart: r.append(r';if(document.getElementById){var v=document.getElementById("adjustNoFilter");if(v.innerHTML){v.OIH=v.innerHTML;if(v.OIH==v.innerHTML)v.innerHTML="<a href=\"#adjustNoFilter\" onClick=\"this.parentNode.innerHTML=this.parentNode.OIH;return false\">More<"+"/A>"; }}') # (hide the span by default, if browser has enough JS support to do it) (TODO: could do it with non-innerHTML DOM functionality if necessary, but that's more long-winded and might also need to look out for non-working 'this' functionality) + r.append('}\n//--></script>') + return "".join(r) + +def detect_renderCheck(): return r"""(document.getElementsByTagName && function(){var b=document.getElementsByTagName("BODY")[0],d=document.createElement("DIV"),s=document.createElement("SPAN"); if(!(b.appendChild && b.removeChild && s.innerHTML))return 0; d.appendChild(s); function wid(chr) { s.innerHTML = chr; b.appendChild(d); var width = s.offsetWidth; b.removeChild(d); return width; } var w1=wid("\u%s"),w2=wid("\uffff"),w3=wid("\ufffe"),w4=wid("\u2fdf"); return (w1!=w2 && w1!=w3 && w1!=w4)}())""" % options.renderCheck # ffff, fffe - guaranteed invalid by Unicode, but just might be treated differently by browsers # 2fdf unallocated character at end of Kangxi radicals block, hopefully won't be used # do NOT use fffd, 
it's sometimes displayed differently to other unrenderable characters @@ -1612,41 +2097,57 @@ def detect_renderCheck(): return r"""(document.getElementsByTagName && function( def html_additions(html,slow_CSS_switch,cookieHostToSet,jsCookieString,canRender,cookie_host,is_password_domain): # Additions to make to HTML only (not on HTML embedded in JSON) + # called from doResponse2 if do_html_process is set + if html.startswith("<?xml"): link_close = " /" + else: link_close = "" if not "<body" in html.lower() and not "</body" in html.lower(): # frameset etc (TODO: what about broken HTML that omits the body tag? have tried to check for "</body" as well, but that might be missing also) return html - bodyAppend = options.bodyAppend - if not bodyAppend: bodyAppend = "" + bodyAppend = bodyAppend1 = "" bodyPrepend = options.bodyPrepend if not bodyPrepend: bodyPrepend = "" headAppend = "" + if set_window_onerror: headAppend += r"""<script><!-- +window.onerror=function(msg,url,line){alert(msg); return true} +--></script>""" if options.headAppendCSS: - # do this BEFORE headAppend, because someone might want to refer to it in a script in headAppend (although bodyPrepend is a better place to put 'change the href according to screen size' scripts, as some Webkit-based browsers don't make screen size available when processing the HEAD of the 1st document in the session) + # do this BEFORE options.headAppend, because someone might want to refer to it in a script in options.headAppend (although bodyPrepend is a better place to put 'change the href according to screen size' scripts, as some Webkit-based browsers don't make screen size available when processing the HEAD of the 1st document in the session) if options.cssName: + if options.cssName.startswith("*"): cssName = options.cssName[1:] # omit the * + else: cssName = options.cssName if slow_CSS_switch: # alternate, slower code involving hard HTML coding and page reload (but still requires some JS) - bodyAppend += 
reloadSwitchJS("adjustCssSwitch",jsCookieString,False,options.cssName,cookieHostToSet,cookieExpires) - if "adjustCssSwitch=1" in jsCookieString: - headAppend += '<link rel="stylesheet" type="text/css" href="%s">' % (options.headAppendCSS,) + bodyAppend += reloadSwitchJS("adjustCssSwitch",jsCookieString,False,cssName,cookieHostToSet,cookieExpires) + if options.cssName.startswith("*"): useCss = not "adjustCssSwitch=0" in jsCookieString + else: useCss = "adjustCssSwitch=1" in jsCookieString + if useCss: + headAppend += '<link rel="stylesheet" type="text/css" href="%s"%s>' % (options.headAppendCSS,link_close) + if options.cssHtmlAttrs: + i=html.lower().find("<body") + if i>-1: + i += 5 # after the "<body" + html = html[:i] + " " + options.cssHtmlAttrs + html[i:] else: # client-side only CSS switcher: - headAppend += """<link rel="alternate stylesheet" type="text/css" id="adjustCssSwitch" title="%s" href="%s"> -<script language="Javascript"><!-- -if(document.getElementById) document.getElementById('adjustCssSwitch').disabled=true -//--></script>""" % (options.cssName,options.headAppendCSS,) # (on some Webkit versions, MUST set disabled to true (from JS?) 
before setting it to false will work) -# disabled=} - bodyPrepend += """<script language="Javascript"><!-- -if(document.getElementById && document.cookie.indexOf("adjustCssSwitch=1")>-1) document.getElementById('adjustCssSwitch').disabled=false -//--></script>""" - bodyAppend += r"""<script type="text/javascript"><!-- -if(document.getElementById && !%s) document.write("%s: "+'<a href="javascript:document.getElementById(\'adjustCssSwitch\').disabled=false;document.cookie=\'adjustCssSwitch=1;domain=%s;expires=%s;path=/\';document.cookie=\'adjustCssSwitch=1;domain=.%s;expires=%s;path=/\';window.scrollTo(0,0)">On</a> | <a href="javascript:document.getElementById(\'adjustCssSwitch\').disabled=true;document.cookie=\'adjustCssSwitch=0;domain=%s;expires=%s;path=/\';document.cookie=\'adjustCssSwitch=0;domain=.%s;expires=%s;path=/\';window.scrollTo(0,0)">Off</a> ') -//--></script>""" % (detect_iframe,options.cssName,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires) - else: headAppend += '<link rel="stylesheet" type="text/css" href="%s">' % (options.headAppendCSS,) - if options.htmlFilterName and options.htmlFilter: bodyAppend += reloadSwitchJS("adjustNoFilter",jsCookieString,True,options.htmlFilterName,cookieHostToSet,cookieExpires) + headAppend += """<link rel="alternate stylesheet" type="text/css" id="adjustCssSwitch" title="%s" href="%s"%s>""" % (cssName,options.headAppendCSS,link_close) + # On some Webkit versions, MUST set disabled to true (from JS?) before setting it to false will work. 
And in MSIE9 it seems must do this from the BODY not the HEAD, so merge into the next script (also done the window.onload thing for MSIE; hope it doesn't interfere with any site's use of window.onload) : + if options.cssName.startswith("*"): cond='document.cookie.indexOf("adjustCssSwitch=0")==-1' + else: cond='document.cookie.indexOf("adjustCssSwitch=1")>-1' + bodyPrepend += """<script><!-- +if(document.getElementById) { var a=document.getElementById('adjustCssSwitch'); a.disabled=true; if(%s) {a.disabled=false;window.onload=function(e){a.disabled=true;a.disabled=false}} } +//--></script>""" % cond + bodyAppend += r"""<script><!-- +if(document.getElementById && !%s && document.readyState!='complete') document.write("%s: "+'<a href="#" onclick="document.cookie=\'adjustCssSwitch=1;domain=%s;expires=%s;path=/\';document.cookie=\'adjustCssSwitch=1;domain=.%s;expires=%s;path=/\';window.scrollTo(0,0);document.getElementById(\'adjustCssSwitch\').disabled=false;return false">On<\/a> | <a href="#" onclick="document.cookie=\'adjustCssSwitch=0;domain=%s;expires=%s;path=/\';document.cookie=\'adjustCssSwitch=0;domain=.%s;expires=%s;path=/\';window.scrollTo(0,0);document.getElementById(\'adjustCssSwitch\').disabled=true;return false">Off<\/a> ') +//--></script>""" % (detect_iframe,cssName,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires) # (hope it helps some MSIE versions to set cookies 1st, THEN scroll, and only THEN change the document. 
Also using onclick= rather than javascript: URLs) + #" # (this comment helps XEmacs21's syntax highlighting) + else: headAppend += '<link rel="stylesheet" type="text/css" href="%s"%s>' % (options.headAppendCSS,link_close) + if options.htmlFilterName and options.htmlFilter: + if '#' in options.htmlFilter: bodyAppend1 = reloadSwitchJSMultiple("adjustNoFilter",jsCookieString,True,options.htmlFilterName.split("#"),cookieHostToSet,cookieExpires) # (better put the multi-switch at the start of the options; it might be the most-used option. Put it into bodyAppend1: we don't want the word "Off" to be misread as part of the next option string, seeing as the word before it was probably not "On", unlike normal reloadSwitchJS switches) + else: bodyAppend += reloadSwitchJS("adjustNoFilter",jsCookieString,True,options.htmlFilterName,cookieHostToSet,cookieExpires) # (after the CSS if it's only an on/off) if canRender: # TODO: make the script below set a cookie to stop itself from being served on subsequent pages if detect_renderCheck failed? 
but this might be a false economy if upload bandwidth is significantly smaller than download bandwidth (and making it external could have similar issues) # TODO: if cookies are not supported, the script below could go into an infinite reload loop - if options.renderCheck and not "adjustNoRender=1" in jsCookieString: bodyPrepend += r"""<script type="text/javascript"><!-- -if(!%s && %s) { document.cookie='adjustNoRender=1;domain=%s;expires=%s;path=/';document.cookie='adjustNoRender=1;domain=.%s;expires=%s;path=/';location.reload() + if options.renderCheck and not "adjustNoRender=1" in jsCookieString: bodyPrepend += r"""<script><!-- +if(!%s && %s) { document.cookie='adjustNoRender=1;domain=%s;expires=%s;path=/';document.cookie='adjustNoRender=1;domain=.%s;expires=%s;path=/';location.reload(true) } //--></script>""" % (detect_iframe,detect_renderCheck(),cookieHostToSet,cookieExpires,cookieHostToSet,cookieExpires) if options.renderName: @@ -1654,13 +2155,14 @@ if(!%s && %s) { document.cookie='adjustNoRender=1;domain=%s;expires=%s;path=/';d else: extraCondition=None bodyAppend += reloadSwitchJS("adjustNoRender",jsCookieString,True,options.renderName,cookieHostToSet,cookieExpires,extraCondition) if cookie_host: - if enable_adjustDomainCookieName_URL_override: bodyAppend += r"""<script type="text/javascript"><!-- -if(!%s)document.write('<a href="http://%s/?%s=%s">Back to URL box</a>') + if enable_adjustDomainCookieName_URL_override: bodyAppend += r"""<script><!-- +if(!%s&&document.readyState!='complete')document.write('<a href="http://%s/?%s=%s">Back to URL box<\/a>') //--></script><noscript><a href="http://%s/?%s=%s">Back to URL box</a></noscript>""" % (detect_iframe,cookieHostToSet,adjust_domain_cookieName,adjust_domain_none,cookieHostToSet,adjust_domain_cookieName,adjust_domain_none) - else: bodyAppend += r"""<script type="text/javascript"><!-- -if(!%s)document.write('<a 
href="javascript:document.cookie=\'%s=%s;expires=%s;path=/\';if(location.href==\'http://%s/\')location.reload();else location.href=\'http://%s/?nocache=\'+Math.random()">Back to URL box</a>') + else: bodyAppend += r"""<script><!-- +if(!%s&&document.readyState!='complete')document.write('<a href="javascript:document.cookie=\'%s=%s;expires=%s;path=/\';if(location.href==\'http://%s/\')location.reload(true);else location.href=\'http://%s/?nocache=\'+Math.random()">Back to URL box<\/a>') //--></script>""" % (detect_iframe,adjust_domain_cookieName,adjust_domain_none,cookieExpires,cookieHostToSet,cookieHostToSet) # (we should KNOW if location.href is already that, and can write the conditional here not in that 'if', but they might bookmark the link or something) - if options.headAppendRuby: headAppend += """ + if options.headAppend: headAppend += options.headAppend + if options.headAppendRuby: bodyPrepend += """ <style id="ruby">ruby { display: inline-table; vertical-align: top; } ruby * { display: inline; line-height:1.0; text-indent:0; text-align:center; @@ -1670,24 +2172,24 @@ rt { display: table-header-group; font-size: 100%; line-height: 1.1; }</style> <!--[if !IE]>--> <style>rt { font-family: Gandhari, DejaVu Sans, Lucida Sans Unicode, Times New Roman, serif !important; }</style> <!--<![endif]--> -<script language="JavaScript"><!-- +<script><!-- var wk=navigator.userAgent.indexOf('WebKit/'); if(wk>-1){var v=document.getElementById('ruby');v.innerHTML=v.innerHTML.replace(/display[^;]*;/g,''); -v=navigator.userAgent.slice(wk+7,wk+12);if(v>=534.3&&v<535.7)document.write('<style>rt{padding-left:1ex;padding-right:1ex;}<'+'/style>')} +v=navigator.userAgent.slice(wk+7,wk+12);if(v>=534.3&&v<535.7&&document.readyState!='complete')document.write('<style>rt{padding-left:1ex;padding-right:1ex;}<\/style>')} //--></script> """ # (I sent the WebKit hack to Wenlin Institute as well) - if options.headAppend: headAppend += options.headAppend if options.prominentNotice and not 
is_password_domain: # if JS is available, use fixed positioning (so it still works on sites that do that, in case we're not overriding it via user CSS) and a JS acknowledge button styleAttrib="style=\"width: 80% !important; margin: 10%; border: red solid !important; background: black !important; color: white !important; text-align: center !important; display: block !important; left:0px; top:0px; z-index:99999; -moz-opacity: 1 !important; filter: none !important; opacity: 1 !important; visibility: visible !important;\"" if slow_CSS_switch: # use a slow version for this as well (TODO document that we do this?) (TODO the detect_iframe exclusion of the whole message) - if not "_WA_warnOK=1" in jsCookieString: bodyPrepend += "<div id=_WA_warn0 "+styleAttrib+">"+options.prominentNotice+r"""<script language="JavaScript"><!-- -document.write("<br><button style=\"color: black !important;background:#c0c0c0 !important;border: white solid !important\" onClick=\"document.cookie='_WA_warnOK=1;path=/';location.reload()\">Acknowledge</button>") -//--></script></div><script language="JavaScript"><!-- + if not "_WA_warnOK=1" in jsCookieString: bodyPrepend += "<div id=_WA_warn0 "+styleAttrib+">"+options.prominentNotice+r"""<script><!-- +if(document.readyState!='complete'&&document.cookie.indexOf("_WA_warnOK=1")==-1)document.write("<br><button style=\"color: black !important;background:#c0c0c0 !important;border: white solid !important\" onClick=\"document.cookie='_WA_warnOK=1;path=/';location.reload(true)\">Acknowledge<\/button>") +//--></script></div><script><!-- if(document.getElementById) document.getElementById('_WA_warn0').style.position="fixed" } //--></script>""" - else: bodyPrepend += "<div id=_WA_warn0 "+styleAttrib+">"+options.prominentNotice+r"""</div><script language="JavaScript"><!-- + #" # (this comment helps XEmacs21's syntax highlighting) + else: bodyPrepend += "<div id=_WA_warn0 "+styleAttrib+">"+options.prominentNotice+r"""</div><script><!-- 
if(document.getElementById) { var w=document.getElementById('_WA_warn0'); if(w.innerHTML) { @@ -1696,13 +2198,15 @@ if(document.getElementById) { var f="""+detect_iframe+r"""; if(!f) { var c=document.cookie.split(";"); for (i=0;i<c.length;i++) if (c[i].substr(0,c[i].indexOf("=")).replace(/\s/g,"") == "_WA_warnOK") { f=1;break; } } if(f) document.body.removeChild(document.getElementById('_WA_warn0')); - else w.innerHTML += "<br><button style=\"color: black !important;background:#c0c0c0 !important;border: white solid !important\" onClick=\"document.body.removeChild(document.getElementById('_WA_warn0'));document.cookie='_WA_warnOK=1;path=/'\">Acknowledge</button>"; + else w.innerHTML += "<br><button style=\"color: black !important;background:#c0c0c0 !important;border: white solid !important\" onClick=\"document.cookie='_WA_warnOK=1;path=/';document.body.removeChild(document.getElementById('_WA_warn0'))\">Acknowledge</button>"; }} //--></script>""" + #" # (this comment helps XEmacs21's syntax highlighting) + # (Above code works around a bug in MSIE 9 by setting the cookie BEFORE doing the removeChild. Otherwise the cookie does not persist.) 
if options.headAppendRuby: bodyAppend += """ -<script language="javascript"><!-- +<script><!-- if(navigator.userAgent.indexOf('WebKit/')>-1 && navigator.userAgent.slice(wk+7,wk+12)>534){var rbs=document.getElementsByTagName('rb');for(var i=0;i<rbs.length;i++)rbs[i].innerHTML='​'+rbs[i].innerHTML+'​'} -function treewalk(n) { var c=n.firstChild; while(c) { if (c.nodeType==1 && c.nodeName!="SCRIPT" && c.nodeName!="TEXTAREA" && !(c.nodeName=="A" && c.hasAttribute("href"))) { treewalk(c); if(c.nodeName=="RUBY" && c.hasAttribute("title") && !c.hasAttribute("onclick")) c.onclick=Function("alert(this.title)") } c=c.nextSibling; } } function tw() { treewalk(document.body); window.setTimeout(tw,5000); } treewalk(document.body); window.setTimeout(tw,1500); +function treewalk(n) { var c=n.firstChild; while(c) { if (c.nodeType==1 && c.nodeName!="SCRIPT" && c.nodeName!="TEXTAREA" && !(c.nodeName=="A" && c.href)) { treewalk(c); if(c.nodeName=="RUBY" && c.title && !c.onclick) c.onclick=Function("alert(this.title)") } c=c.nextSibling; } } function tw() { treewalk(document.body); window.setTimeout(tw,5000); } treewalk(document.body); window.setTimeout(tw,1500); //--></script>""" if headAppend: i=html.lower().find("</head") @@ -1718,6 +2222,9 @@ function treewalk(n) { var c=n.firstChild; while(c) { if (c.nodeType==1 && c.nod if i>-1: i=html.find(">",i) if i>-1: html=html[:i+1]+bodyPrepend+html[i+1:] + if bodyAppend1 and bodyAppend: bodyAppend = '<span style="float:left">' + bodyAppend1 + '</span><span style="float:left;width:1em"><br></span><span style="float: right">'+bodyAppend+'</span><span style="clear:both"></span>' # (the <br> is in case CSS is off or overrides float) + elif bodyAppend1: bodyAppend = bodyAppend1 + if options.bodyAppend: bodyAppend = options.bodyAppend + bodyAppend if bodyAppend: if options.bodyAppendGoesAfter: i=html.rfind(options.bodyAppendGoesAfter) @@ -1729,7 +2236,9 @@ function treewalk(n) { var c=n.firstChild; while(c) { if (c.nodeType==1 && c.nod html = 
html[:i]+bodyAppend+html[i:] return html -def ampEncode(unitxt): return unitxt.replace("&","&").replace("<","<").replace(">",">") # needed because these entities will be in cleartext to the renderer +def ampEncode(t): return t.replace("&","&").replace("<","<").replace(">",">") +# (needed below because these entities will be in cleartext to the renderer; also used by serve_mailtoPage to avoid cross-site scripting) +def txt2html(t): return ampEncode(t).replace("\n","<br>") class Renderer: def __init__(self): @@ -1746,11 +2255,12 @@ class Renderer: i=0 ; import unicodedata width = 0 ; imgStrStart = -1 ret = [] ; copyFrom = 0 - def doImgEnd(): + if options.renderWidth==0: doImgEnd=lambda:None + else: + def doImgEnd(): if imgStrStart >= 0 and width <= options.renderWidth and len(ret) > imgStrStart + 1: ret.insert(imgStrStart,'<nobr>') ret.append('</nobr>') - if options.renderWidth==0: doImgEnd=lambda:None if options.renderBlocks: combining=lambda x:False else: combining=unicodedata.combining checkAhead = (options.renderNChar>1 or not options.renderBlocks) @@ -1782,7 +2292,7 @@ class Renderer: if options.renderBlocks: self.hanziW = w/len(unitext) self.hanziH = h - return ('<img src="%s" width=%d height=%d>' % (options.renderPath+imgEncode(unitext),w,h)), w + return ('<img src="%s%s" width=%s height=%s>' % (options.renderPath,imgEncode(unitext),w,h)), w # (%s is faster than %d apparently, and format strings are faster than ''.join) def getImage(self,uri): if not options.render or not uri.startswith(options.renderPath): return False try: import ImageDraw,Image @@ -1819,25 +2329,60 @@ def imgEncode(unitext): # Encode unitext to something URL-safe, try to be efficient especially in small cases # Normally base64-encoded UTF-8 (output will be a multiple of 4 bytes) # but some single characters will be OK as-is, and 2 or 3 bytes could hex a unichr under U+1000 + # This function needs to be FAST - it can be called thousands of times during a page render if len(unitext)==1: - if 
unitext in string.letters+string.digits+"_.-": return unitext - elif 0xf<ord(unitext)<0x1000: return hex(ord(unitext))[2:] + o = ord(unitext) + if o < 0x1000: + # TODO: create_inRenderRange_function can also re-create this function to omit the above test if we don't have any ranges under 0x1000 ? (but it should be quite quick) + if unitext in string.letters+string.digits+"_.-": return unitext + elif 0xf<ord(unitext): return hex(ord(unitext))[2:] + elif o <= 0xFFFF: # (TODO: don't need that test if true for all our render ranges) + # TODO: make this optional? hex(ord(u))[-4:] is nearly 5x faster than b64encode(u.encode('utf-8')) in the case of 1 BMP character (it's faster than even just the .encode('utf-8') part), but result could also decode with base64, so we have to add an extra '_' byte to disambiguate, which adds to the traffic (although only a small amount compared to IMG markup anyway) + return '_'+hex(o)[-4:] return base64.b64encode(unitext.encode('utf-8')) def imgDecode(code): if len(code)==1: return code elif len(code) <= 3: return unichr(int(code,16)) + elif code.startswith("_"): return unichr(int(code[1:],16)) # (see TODO above) else: return base64.b64decode(code).decode('utf-8') class Dynamic_DNS_updater: def __init__(self): self.currentIP = None self.forceTime=0 + self.aggressive_mode = False IOLoop.instance().add_callback(lambda *args:self.queryIP()) + def queryLocalIP(self): + # Queries ip_query_url2 (if set, and if we know current IP). Depending on the response/situation, either passes control to queryIP (which sets the next timeout itself), or sets an ip_check_interval2 timeout. 
+ if not ip_query_url2 or not self.currentIP: + return self.queryIP() + debuglog("queryLocalIP") + def handleResponse(r): + if r.error or not self.currentIP in r.body: + return self.queryIP() + # otherwise it looks like the IP is unchanged: + self.newIP(self.currentIP) # in case forceTime is up + IOLoop.instance().add_timeout(time.time()+options.ip_check_interval2,lambda *args:self.queryLocalIP()) + if ip_query_url2_user: + # some routers etc insist we send the non-auth'd request first, and the credentials only when prompted (that's what Lynx does with the -auth command line), TODO do we really need to do this every 60secs? (do it only if the other way gets an error??) but low-priority as this is all local-net stuff (and probably a dedicated link to the switch at that) + callback = lambda r:AsyncHTTPClient().fetch(ip_query_url2, callback=handleResponse, auth_username=ip_query_url2_user,auth_password=ip_query_url2_pwd) + else: callback = handleResponse + AsyncHTTPClient().fetch(ip_query_url2, callback=callback) def queryIP(self): + # Queries ip_query_url, and, after receiving a response (optionally via retries if ip_query_aggressive), sets a timeout to go back to queryLocalIP after ip_check_interval (not ip_check_interval2) debuglog("queryIP") def handleResponse(r): - if not r.error: self.newIP(r.body.strip()) - IOLoop.instance().add_timeout(time.time()+options.ip_check_interval,lambda *args:self.queryIP()) + if not r.error: + self.newIP(r.body.strip()) + if self.aggressive_mode: + logging.info("ip_query_url got response, stopping ip_query_aggressive") + self.aggressive_mode = False + elif options.ip_query_aggressive: + if not self.aggressive_mode: + logging.info("ip_query_url got error, starting ip_query_aggressive") + self.aggressive_mode = True + return self.queryIP() + IOLoop.instance().add_timeout(time.time()+options.ip_check_interval,lambda *args:self.queryLocalIP()) AsyncHTTPClient().fetch(options.ip_query_url, callback=handleResponse) def newIP(self,ip): 
debuglog("newIP "+ip) @@ -1847,15 +2392,14 @@ class Dynamic_DNS_updater: try: socket.inet_pton(socket.AF_INET6,ip) except socket.error: return # illegal IP, maybe a temporary error from the server self.currentIP = ip - if options.ip_change_command: - cmd = options.ip_change_command+" "+ip - logging.info("ip_change_command: "+cmd) - threading.Thread(target=os.system,args=(cmd,)).start() - if options.dynamic_dns_api: - # send the API updates one domain at a time - def upHost(hostList): - if hostList: AsyncHTTPClient().fetch(options.dynamic_dns_api % (hostList[0],ip), callback=lambda r:(logging.info("Dynamic DNS: update %s to %s gave error %s and body %s" % (hostList[0],ip,repr(r.error),repr(r.body.strip()))),upHost(hostList[1:])), auth_username=options.ddns_api_user, auth_password=options.ddns_api_pwd) - upHost(options.host_suffix.split("/")) + cmd = options.ip_change_command+" "+ip + logging.info("ip_change_command: "+cmd) + def runCmd(cmd): + global helper_thread_count + helper_thread_count += 1 + os.system(cmd) + helper_thread_count -= 1 + threading.Thread(target=runCmd,args=(cmd,)).start() self.forceTime=time.time()+options.ip_force_interval class WatchdogPings: @@ -1866,21 +2410,27 @@ class WatchdogPings: thread.start_new_thread((lambda *args:self.separate_thread()),()) self.ping() def separate_thread(self): # version for watchdogWait + # (does not adjust helper_thread_count / can't be "runaway") + global watchdog_mainServerResponded # a flag. 
Do NOT timestamp with time.time() - it can go wrong if NTP comes along and re-syncs the clock by a large amount def respond(*args): global watchdog_mainServerResponded - watchdog_mainServerResponded = time.time() - respond() ; stopped = 0 - while True: - if watchdog_mainServerResponded + options.watchdogWait >= time.time(): + watchdog_mainServerResponded = True + respond() ; stopped = 0 ; sleptSinceResponse = 0 + while options.watchdog: + if watchdog_mainServerResponded: self.ping() if stopped: logging.info("Main thread responded, restarting watchdog ping") stopped = 0 + watchdog_mainServerResponded = False + sleptSinceResponse = 0 IOLoop.instance().add_callback(respond) + elif sleptSinceResponse < options.watchdogWait: self.ping() # keep waiting for it elif not stopped: logging.info("Main thread unresponsive, stopping watchdog ping") stopped = 1 # but don't give up (it might respond just in time) time.sleep(options.watchdog) + sleptSinceResponse += options.watchdog # "dead reckoning" to avoid time.time() def ping(self): debuglog("pinging watchdog",logRepeats=False) self.wFile.write('a') ; self.wFile.flush() @@ -1889,25 +2439,58 @@ class WatchdogPings: # else one ping only (see separate_thread) fasterServer_up = False +def FSU_set(new_FSU,interval): + # sets new fasterServer_up state, and returns interval to next check + global fasterServer_up + fsu_old = fasterServer_up + fasterServer_up = new_FSU + if not fasterServer_up == fsu_old: + if fasterServer_up: logging.info("fasterServer %s came up - forwarding traffic to it" % options.fasterServer) + else: logging.info("fasterServer %s went down - handling traffic ourselves" % options.fasterServer) + # debuglog("fasterServer_up="+repr(fasterServer_up)+" (err="+repr(r.error)+")",logRepeats=False) + if fasterServer_up: return 1 # TODO: configurable? fallback if timeout when we try to connect to it as well? + elif interval < 60: interval *= 2 # TODO: configurable? 
+ return interval class checkServer: - def __init__(self): self.client=None ; self.interval=1 + def __init__(self): + self.client = self.pendingClient = None + self.count = 0 + self.interval=1 def __call__(self): - def callback(r): - global fasterServer_up - fsu_old = fasterServer_up - fasterServer_up = not r.error - if not fasterServer_up == fsu_old: - if fasterServer_up: logging.info("fasterServer %s came up - forwarding traffic to it" % options.fasterServer) - else: logging.info("fasterServer %s went down - handling traffic ourselves" % options.fasterServer) - # debuglog("fasterServer_up="+repr(fasterServer_up)+" (err="+repr(r.error)+")",logRepeats=False) - if fasterServer_up: self.interval = 1 # TODO: configurable? fallback if timeout when we try to connect to it as well? - else: - if self.interval < 60: # TODO: configurable? - self.interval *= 2 - self.client = None - IOLoop.instance().add_timeout(time.time()+self.interval,lambda *args:checkServer()) - if not self.client: self.client=AsyncHTTPClient() - self.client.fetch("http://"+options.fasterServer+"/ping",connect_timeout=1,request_timeout=1,user_agent="ping",callback=callback,use_gzip=False) + if options.fasterServerNew: + # TODO: might be bytes in the queue if this server somehow gets held up. 
Could try read_until_close(close,stream) + if (self.client and self.count >= 2) or self.pendingClient: # it didn't call serverOK on 2 consecutive seconds (TODO: customizable?), or didn't connect within 1sec - give up + try: self.pendingClient.close() + except: pass + try: self.client.close() + except: pass + self.pendingClient = self.client = None + self.interval = FSU_set(False,self.interval) + return IOLoop.instance().add_timeout(time.time()+self.interval,lambda *args:checkServer()) + elif self.client: self.count += 1 + else: # create new self.pendingClient + server,port = options.fasterServer.rsplit(':',1) + self.pendingClient = tornado.iostream.IOStream(socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)) + def send_request(*args): + try: + self.pendingClient.write('GET /ping2 HTTP/1.0\r\nUser-Agent: ping\r\n\r\n') + self.client = self.pendingClient + self.pendingClient = None + self.client.read_until_close(lambda *args:True,lambda *args:self.serverOK()) + except: pass + self.pendingClient.connect((server,int(port)),send_request) + IOLoop.instance().add_timeout(time.time()+1,lambda *args:checkServer()) # check back in 1sec to see if it connected OK (should do if it's local) + else: # old version - issue HTTP requests to /ping + def callback(r): + self.interval = FSU_set(not r.error,self.interval) + if not fasterServer_up: self.client = None + IOLoop.instance().add_timeout(time.time()+self.interval,lambda *args:checkServer()) + if not self.client: self.client=AsyncHTTPClient() + self.client.fetch("http://"+options.fasterServer+"/ping",connect_timeout=1,request_timeout=1,user_agent="ping",callback=callback,use_gzip=False) + def serverOK(self): + # called when any chunk is available from the stream (normally once a second, but might catch up a few bytes if we've delayed for some reason) + self.interval = FSU_set(True,0) + self.count = 0 checkServer=checkServer() def debuglog(msg,logRepeats=True): -- GitLab