From fde73653bae8dc133784da61aeec99f3e9598e5a Mon Sep 17 00:00:00 2001
From: "Silas S. Brown" <ssb22@cam.ac.uk>
Date: Tue, 7 Nov 2017 17:13:29 +0000
Subject: [PATCH] Update Web Adjuster

git-svn-id: http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster@2965 29193198-4895-4776-b068-10539e920549
---
 adjuster.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/adjuster.py b/adjuster.py
index ed3b5d7..ea8d6eb 100755
--- a/adjuster.py
+++ b/adjuster.py
@@ -417,7 +417,7 @@ def protocolAndHost(realHost):
     # (the dot will be represented as a hyphen by dedot/redot,
     # but some servers e.g. GAE can't cope with any part of the
     # wildcard domain ending with a hyphen, so add the 0;
-    # TODO: what about fetching from IP addresses)
+    # TODO: what about fetching from IP addresses, although it's rare to get a server with IP ending .0 because it used to represent "the network")
     if realHost.endswith(".0"): return "https://",realHost[:-2]
     else: return "http://",realHost
 def protocolWithHost(realHost):
@@ -2101,7 +2101,7 @@ class RequestForwarder(RequestHandler):
         except: pass
     def redirect(self,redir,status=301):
-        debuglog("redirect ("+repr(status)+" to "+repr(redir)+")"+self.debugExtras())
+        debuglog("Serving redirect ("+repr(status)+" to "+repr(redir)+")"+self.debugExtras())
         self.set_status(status)
         for h in ["Location","Content-Type","Content-Language"]: self.clear_header(h) # so redirect() can be called AFTER a site's headers are copied in
         self.add_header("Location",redir)
@@ -2633,6 +2633,7 @@ document.forms[0].i.focus()
     def debugExtras(self):
         r = " for "+self.request.method+" "+self.request.uri
+        if not self.request.uri.startswith("http"): r += " host="+str(self.request.host)
         if self.WA_UseSSL or (hasattr(self.request,"connection") and hasattr(self.request.connection.stream,"isFromSslHelper")): r += " WA_UseSSL"
         if self.isPjsUpstream: r += " isPjsUpstream instance "+str(self.WA_PjsIndex+self.WA_PjsStart)
         if self.isSslUpstream: r += " isSslUpstream"
@@ -2772,8 +2773,7 @@ document.forms[0].i.focus()
                 self.request.uri = self.request.uri[:-len(suffix)]
                 converterFlags.append(True)
             else: converterFlags.append(False)
-        if upstream_rewrite_ssl and not self.isSslUpstream:
-            protocol = "http://" # keep the .0 in and call protocolAndHost again on the isSslUpstream pass
+        if upstream_rewrite_ssl and not self.isSslUpstream and not (options.js_interpreter and not self.isPjsUpstream): protocol = "http://" # keep the .0 in and call protocolAndHost again on the isSslUpstream pass
         else: protocol,realHost = protocolAndHost(realHost)
         self.change_request_headers(realHost,isProxyRequest)
         self.urlToFetch = protocol+self.request.headers["Host"]+self.request.uri
@@ -3011,15 +3011,11 @@ document.forms[0].i.focus()
                     "access-control-allow-origin", # better rewrite this for JSON responses to scripts that are used on a site's other domains
                     "link", # RFC 5988 equivalent to link elements in body; includes preloads; might want to adjust the resulting CSS or scripts (especially if the server won't support a fetch from a browser that supplies us as Referer)
                     # "x-associated-content" # see comment in rmServerHeaders
-                ]: value=domain_process(value,cookie_host,True,https=self.urlToFetch.startswith("https"))
+                ]: value=domain_process(value,cookie_host,True,https=self.urlToFetch.startswith("https"),isProxyRequest=isProxyRequest,isSslUpstream=self.isSslUpstream)
             elif name.lower()=="location": # TODO: do we need to delete this header if response.code not in [301,302,303,307] ?
                 old_value_1 = value # before domain_process
-                if not isProxyRequest:
-                    value=domain_process(value,cookie_host,True,https=self.urlToFetch.startswith("https"))
-                    offsite = (value==old_value_1 and (value.startswith("http://") or value.startswith("https://"))) # i.e. domain_process didn't change it, and it's not relative
-                else: # isProxyRequest
-                    offsite = False # proxy requests are never "offsite"
-                if upstream_rewrite_ssl and not self.isSslUpstream and re.match("http://[^/]*.0",value): value="https"+value[4:].replace(".0","",1) # if the upstream proxy's saying .0 due to upstream_rewrite_ssl, then we have to take it out if we want https URLs unchanged in a client-side proxy request
+                value=domain_process(value,cookie_host,True,https=self.urlToFetch.startswith("https"),isProxyRequest=isProxyRequest,isSslUpstream=self.isSslUpstream)
+                offsite = (not isProxyRequest and value==old_value_1 and (value.startswith("http://") or value.startswith("https://"))) # i.e. domain_process didn't change it, and it's not relative
                 old_value_2 = value # after domain_process but before PDF/EPUB-etc rewrites
                 if do_pdftotext: # is it still going to be pdf after the redirect?
                     if value.lower().endswith(".pdf") or guessCMS(value,"pdf"): value += pdftotext_suffix
@@ -3047,7 +3043,7 @@ document.forms[0].i.focus()
                 elif cookie_host and offsite and self.htmlOnlyMode() and not options.htmlonly_css: # in HTML-only mode, it should never be an embedded image etc, so we should be able to change the current cookie domain unconditionally
                     value = "http://" + convert_to_requested_host(cookie_host,cookie_host) + "/?q=" + urllib.quote(old_value_1) + "&" + adjust_domain_cookieName + "=0&pr=on" # as above
             elif "set-cookie" in name.lower():
-                if not isProxyRequest: value=cookie_domain_process(value,cookie_host)
+                if not isProxyRequest: value=cookie_domain_process(value,cookie_host) # (never doing this if isProxyRequest, therefore don't have to worry about the upstream_rewrite_ssl exception that applies to normal domain_process isProxyRequest)
             for ckName in upstreamGuard: value=value.replace(ckName,ckName+"1")
             headers_to_add.append((name,value))
             if name.lower()=="content-type":
@@ -4241,12 +4237,18 @@ def find_HTML_in_JSON(jsonStr,htmlListFunc=None):
         codeTextList.append(jsonStr[i:])
     return codeTextList
 
-def domain_process(text,cookieHost=None,stopAtOne=False,https=None):
+def domain_process(text,cookieHost=None,stopAtOne=False,https=None,isProxyRequest=False,isSslUpstream=False):
+    if isProxyRequest: # called for Location: headers etc (not for document bodies)
+        if upstream_rewrite_ssl and not isSslUpstream:
+            # Although we don't need a full domain_process when the client is sending us a proxy request, we still have to beware of our UPstream proxy saying .0 in a Location: URL due to upstream_rewrite_ssl: take it out (but keep any path etc after the host)
+            m = re.match(r"http(://[A-Za-z0-9.-]*)\.0(?![A-Za-z0-9.-])",text)
+            if m: return "https"+m.group(1)+text[m.end():]
+        return text
     # Change the domains on appropriate http:// and https:// URLs.
     # Also on // URLs using 'https' as default (if it's not None).
    # Hope that there aren't any JS-computed links where
     # the domain is part of the computation.
-    # TODO: what of links to alternate ports or user:password links, currently we leave them unchanged (could use .<portNo> as an extension of the 'HTTPS hack' of .0, but allowing the public to request connects to any port could be a problem)
+    # TODO: what of links to alternate ports or user:password links, currently we leave them unchanged (could use .<portNo> as an extension of the 'HTTPS hack' of .0, but allowing the public to request connects to any port could be a problem, and IP addresses would have to be handled carefully: can no longer rely on ".0 used to mean the network" sort-of saving us)
     # TODO: leave alone URLs in HTML text/comments and JS comments? but script overload can make it hard to judge what is and isn't text. (NB this function is also called for Location headers)
     if "<!DOCTYPE" in text: # don't touch URLs inside the doctype!
-- 
GitLab
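For reference, the 'HTTPS hack' that several hunks lean on works as follows: when "fetch this host over https" has to be signalled through an http-only hop (such as an upstream_rewrite_ssl helper), the hostname carries a trailing ".0" label, and protocolAndHost() decodes that mark when choosing the fetch protocol. Below is a minimal standalone sketch of the decode step, mirroring the function in the first hunk; the encoding side (where the ".0" gets appended) is assumed from context and is not part of this patch.

def protocolAndHost(realHost):
    # A trailing ".0" marks a host that must be fetched over https:
    # strip the marker and switch scheme; otherwise use plain http.
    if realHost.endswith(".0"): return "https://", realHost[:-2]
    else: return "http://", realHost

assert protocolAndHost("example.com.0") == ("https://", "example.com")
assert protocolAndHost("example.com") == ("http://", "example.com")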
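The new isProxyRequest branch in domain_process() exists only to undo that ".0" marker when it leaks out of the UPstream proxy in a Location: header. The following self-contained approximation shows the rewrite in isolation; note that the real function reads upstream_rewrite_ssl and isSslUpstream from module state rather than taking the URL alone, and the helper name here is invented for illustration.

import re

def undo_upstream_dot0(location):
    # Rewrite http://host.0[/path...] -> https://host[/path...],
    # preserving everything after the host.  The negative lookahead
    # stops ".0" matching inside a longer label such as ".01".
    m = re.match(r"http(://[A-Za-z0-9.-]*)\.0(?![A-Za-z0-9.-])", location)
    if m: return "https" + m.group(1) + location[m.end():]
    return location

assert undo_upstream_dot0("http://example.com.0/a?b=1") == "https://example.com/a?b=1"
assert undo_upstream_dot0("http://example.com.01/x") == "http://example.com.01/x"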