FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
adjuster.py 489 KiB
Newer Older
Silas S. Brown's avatar
Silas S. Brown committed
    sslRetries = []
def IOLoopInstance():
    """Return the cached Tornado IOLoop, populating the cache on first use.

    Better to call this from the main thread first: IOLoop.current()
    (Tornado 5+) is thread-sensitive, whereas IOLoop.instance() (Tornado 4
    and older) can be called on-demand from any thread.  Either way the
    result is kept in a module-level global for forward-compatibility."""
    global ioLoopInstance
    try:
        return ioLoopInstance
    except NameError: # first call: cache not yet populated
        maker = getattr(IOLoop, "current", None) # Tornado 5+
        if maker is None: maker = IOLoop.instance # Tornado 4 and older
        ioLoopInstance = maker()
        return ioLoopInstance

    workaround_tornado_fd_issue()
    for core,sList in list(theServers.items()):
        if core == "all" or core == coreNo:
Silas S. Brown's avatar
Silas S. Brown committed
            for port,s in sList: s.start()
Silas S. Brown's avatar
Silas S. Brown committed
#@file: overload.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Multicore: pause/restart when a core is overloaded
# --------------------------------------------------

Silas S. Brown's avatar
Silas S. Brown committed
mainServerPaused = mainServerReallyPaused = False
def pauseOrRestartMainServer(shouldRun=True):
    """Ask this core to stop (shouldRun=False) or resume (shouldRun=True)
    accepting connections on the main port, and tell the CrossProcess429
    coordinator about the new state.  No-op unless running --multicore
    with js_429, or if we're already in the requested state."""
    global mainServerPaused
    if not (options.multicore and options.js_429):
        return
    if (not shouldRun) == mainServerPaused:
        return # already in the requested state
    # if shouldRun: return # uncomment this 'once closed, stay closed' line to demonstrate the OS forwards to open cores only
    reallyPauseOrRestartMainServer(shouldRun)
    mainServerPaused = not mainServerPaused
    debuglog("Paused=%s on core %s" % (repr(mainServerPaused),repr(coreNo)))
    CrossProcess429.q.put((coreNo,mainServerPaused))
def reallyPauseOrRestartMainServer(shouldRun):
    """Actually detach (or re-attach) the main-port listening sockets of
    this core's servers from the IOLoop.  shouldRun may also be the string
    "IfNotPaused", used by CrossProcess429 to re-pause if and only if the
    outer level hasn't reopened us in the meantime."""
    global mainServerReallyPaused
    if shouldRun == "IfNotPaused":
        shouldRun = mainServerPaused
    if (not shouldRun) == mainServerReallyPaused:
        return # nothing to change
    for core,serverList in theServers.items():
        if core != "all" and core != coreNo:
            continue # some other core's servers
        for port,server in serverList:
            if port != options.port:
                continue # only the main port is paused/resumed
            if not hasattr(server,"_sockets"):
                logging.error("Cannot pause server: wrong Tornado version?")
                return
            if shouldRun:
                server.add_sockets(server._sockets.values())
            else:
                for fd, _sock in server._sockets.items():
                    if hasattr(server,"io_loop"): server.io_loop.remove_handler(fd) # Tornado 4
                    else: IOLoopInstance().remove_handler(fd) # Tornado 5, not tested (TODO)
    mainServerReallyPaused = not mainServerReallyPaused
    debuglog("reallyPaused=%s on core %s" % (repr(mainServerReallyPaused),repr(coreNo)))
Silas S. Brown's avatar
Silas S. Brown committed
#@file: workarounds.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Miscellaneous bug workarounds
# --------------------------------------------------

def workaround_raspbian7_IPv6_bug():
    """Old Debian 7 based versions of Raspbian can boot with IPv6 enabled but later fail to configure it, hence tornado/netutil.py's AI_ADDRCONFIG flag is ineffective and socket.socket raises "Address family not supported by protocol" when it tries to listen on IPv6.  If that happens, we'll need to set address="0.0.0.0" for IPv4 only.  However, if we tried IPv6 and got the error, then at that point Tornado's bind_sockets will likely have ALREADY bound an IPv4 socket but not returned it; the socket does NOT get closed on dealloc, so a retry would get "Address already in use" unless we quit and re-run the application (or somehow try to figure out the socket number so it can be closed).  Instead of that, let's try to detect the situation in advance so we can set options.address to IPv4-only the first time."""
    if options.address: return # listening on an explicit address: nothing to probe
    aiFlags = socket.AI_PASSIVE | getattr(socket, "AI_ADDRCONFIG", 0)
    addrList = socket.getaddrinfo(None,options.port,socket.AF_UNSPEC,socket.SOCK_STREAM,0,aiFlags)
    for family,sockType,proto,_canonName,_sockAddr in addrList:
        try:
            socket.socket(family,sockType,proto) # probe: can we even create a socket of this family?
        except socket.error as e:
            if "family not supported" in e.strerror:
                options.address = "0.0.0.0" # use IPv4 only
                return

Silas S. Brown's avatar
Silas S. Brown committed
def workaround_timeWait_problem():
    """Work around listen-port failing to bind when there are still TIME_WAIT connections from the previous run.  This at least seems to work around the problem MOST of the time."""
    global bind_sockets
    bind_sockets = tornado.netutil.bind_sockets
    if "win" in sys.platform and not sys.platform=="darwin":
        # Don't do this on MS-Windows.  It can result in
        # 'stealing' a port from another server even while
        # that other server is still running.
        return
    if not hasattr(socket, "SO_REUSEPORT"):
        return # OS doesn't support it
    if getargspec==None:
        return # can't introspect Tornado's signature
    if 'reuse_port' not in getargspec(tornado.netutil.bind_sockets).args:
        return # Tornado version too old
    # Re-bind the (module-global, via the 'global' above) bind_sockets to a
    # wrapper that switches reuse_port on:
    def bind_sockets(*args,**kwargs):
        if not args[0]: pass # we're doing port_randomise
        elif len(args) < 6: kwargs['reuse_port'] = True
        else: args=tuple(args[:6])+(True,) # reuse_port passed positionally
        return tornado.netutil.bind_sockets(*args,**kwargs)

def workaround_tornado_fd_issue(): # TODO: is this still needed post-v0.3 now we fixed start-order bug?
    """Wrap IOLoop.handle_callback_exception so that when the loop keeps
    delivering events for an fd it no longer has a handler for (KeyError
    from self._handlers[fd]), we close that fd instead of busy-looping."""
    if not hasattr(IOLoopInstance(),'handle_callback_exception'):
        return # Tornado 6 doesn't have this, let's hope it's not needed
    cxFunc = IOLoopInstance().handle_callback_exception
    def newCx(callback):
        if callback: return cxFunc(callback)
        # self._handlers[fd] raised KeyError.  This means
        # we don't want to keep being told about the fd.
        fr = sys.exc_info()[2]
        while fr.tb_next: fr = fr.tb_next # innermost frame is where 'fd' lives
        fd = fr.tb_frame.f_locals.get("fd",None)
        if not fd: return cxFunc("callback="+repr(callback)+" and newCx couldn't get fd from stack")
        logging.info("IOLoop has no handler left for fd "+repr(fd)+" but is still getting events from it.  Attempting low-level close to avoid loop.")
        try: IOLoopInstance().remove_handler(fd) # NOTE(review): reconstructed 'try' target -- the pasted source had an orphan 'except: pass' here with no preceding try; de-registering the fd before closing matches the log message's intent.  TODO confirm against upstream.
        except: pass
        try: os.close(fd)
        except: pass
    IOLoopInstance().handle_callback_exception = newCx
Silas S. Brown's avatar
Silas S. Brown committed
def check_LXML():
    """Sanity-check the lxml installation; if unusable, write a warning to
    stderr and clear options.useLXML.  (Might not find ALL problems with
    lxml installations, but at least we can check some basics.)  Returns a
    parser object on success, None otherwise."""
    complaint = None
    try:
        from lxml import etree
        return etree.HTMLParser(target=None) # works on lxml 2.3.2
    except ImportError:
        complaint = "LXML library not found - ignoring useLXML option\n"
    except TypeError: # no target= option in 1.x
        complaint = "LXML library too old - ignoring useLXML option\n"
    sys.stderr.write(complaint)
    options.useLXML = False

Silas S. Brown's avatar
Silas S. Brown committed
#@file: unix.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# More setup: Unix forking, privileges etc
# --------------------------------------------------

def dropPrivileges():
    """Become the --user account if we're root, daemonise with a double
    fork, point the standard fds at /dev/null, and write the pidfile
    (best-effort) if one was requested."""
    if options.user and not os.getuid():
        # we're root and asked to become someone else
        import pwd
        pw = pwd.getpwnam(options.user)
        os.setuid(pw[2])
        # and help our external programs so they
        # don't try to load root's preferences etc
        os.environ['HOME'] = pw[5]
        os.environ['USER'] = os.environ['LOGNAME'] = options.user
    # classic Unix daemonisation: parent exits at each of two forks
    if os.fork(): sys.exit()
    if os.fork(): sys.exit()
    devnull = os.open("/dev/null", os.O_RDWR)
    for stdFd in range(3): os.dup2(devnull,stdFd) # commenting out this loop will let you see stderr after the fork (TODO debug option?)
    if options.pidfile:
        try: open(options.pidfile,"w").write(str(os.getpid()))
        except: pass # best-effort only

def notifyReady():
    """Tell systemd we're up, if the sdnotify module is installed (no-op otherwise)."""
    try: import sdnotify # sudo pip install sdnotify
    except ImportError: return
    # we send READY=1 so you can do an adjuster.service (w/out --background) with Type=notify and ExecStart=/usr/bin/python /path/to/adjuster.py --config=...
    sdnotify.SystemdNotifier().notify("READY=1")
# TODO: also send "WATCHDOG=1" so can use WatchdogSec ? (but multicore / js_interpreter could be a problem)

Silas S. Brown's avatar
Silas S. Brown committed
#@file: curl-setup.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# cURL client setup
# --------------------------------------------------

Silas S. Brown's avatar
Silas S. Brown committed
def MyAsyncHTTPClient(): return AsyncHTTPClient() # plain-Tornado client factory (setupCurl defines a same-named pycurl-aware variant)
Silas S. Brown's avatar
Silas S. Brown committed
def curlFinished(): pass # no-op placeholder (setupCurl defines a same-named version that decrements the in-use-clients count)
def setupCurl(maxCurls,error=None):
  """Try to configure Tornado to use libcurl (pycurl) as its HTTP client,
  with at most maxCurls simultaneous connections (clamped to 10..1000).
  If pycurl is absent or unsuitable: when 'error' is set we warn (for a
  suboptimal curl) or errExit (no usable curl at all); otherwise we fall
  back silently to Tornado's pure-Python client."""
  global pycurl
  global MyAsyncHTTPClient, curlFinished, curl_max_clients, curl_inUse_clients # NOTE(review): reconstructed -- without these, the nested re-definitions below would never replace the module-level placeholder versions; confirm against upstream
  try:
    import pycurl # NOTE(review): reconstructed -- pycurl is used below (and del'd on failure) but no import of it survived in this paste
    curl_async = pycurl.version_info()[4] & (1 << 7) # CURL_VERSION_ASYNCHDNS
    if not curl_async: curl_async = ('c-ares' in pycurl.version or 'threaded' in pycurl.version) # older
    if not curl_async:
        if error: warn("The libcurl on this system might hold up our main thread while it resolves DNS (try building curl with ./configure --enable-ares)")
        else:
            del pycurl ; return # TODO: and say 'not using'?
    if float('.'.join(pycurl.version.split()[0].split('/')[1].rsplit('.')[:2])) < 7.5:
        if error: warn("The curl on this system is old and might hang when fetching certain SSL sites") # strace -p (myPID) shows busy looping on poll (TODO: option to not use it if we're not using upstream_proxy)
        else:
            del pycurl ; return # TODO: as above
    # Wrap Curl.setopt so that setting a PROXY also forces HTTP/1.0:
    _oldCurl = pycurl.Curl
    def _newCurl(*args,**kwargs):
        c = _oldCurl(*args,**kwargs)
        so = c.setopt
        def mySetopt(k,v):
            so(k,v)
            if k==pycurl.PROXY: so(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_0) # workaround 599 "transfer closed with outstanding read data remaining" in Curl 7.55.1 with polipo2 as upstream proxy (TODO: curl-version dependent? 7.43.0 seemed OK in this aspect, although it had the above problem)
        c.setopt = mySetopt
        return c
    pycurl.Curl = _newCurl
    curl_max_clients = min(max(maxCurls,10),1000) # constrain curl_max_clients to between 10 and 1000 to work around Tornado issue 2127, and we'll warn about the issue ourselves if we go over:
    curl_inUse_clients = 0
    try: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient",max_clients=curl_max_clients)
    except: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") # will try in MyAsyncHTTPClient too (different versions of Tornado and all that...) (TODO: if that one also falls back to no max_clients, we might be reduced to 10 and should set curl_max_clients accordingly in order to get appropriate warning messages)
    def MyAsyncHTTPClient():
        # Track how many clients are in use, so we can warn (or flag a
        # possible deadlock with upstream_rewrite_ssl) when the pool runs out
        try: problem = not len(AsyncHTTPClient()._free_list)
        except:
            global curl_inUse_clients
            curl_inUse_clients += 1
            problem = curl_inUse_clients >= curl_max_clients
        if problem:
            if upstream_rewrite_ssl and not options.ssl_fork: logging.error("curl_max_clients too low; AsyncHTTPClient will queue requests and COULD DEADLOCK due to upstream_rewrite_ssl (try --ssl-fork if you can't increase curl_max_clients)")
            else: logging.info("curl_max_clients too low; AsyncHTTPClient will queue requests")
        try: return AsyncHTTPClient(max_clients=curl_max_clients)
        except: return AsyncHTTPClient()
    def curlFinished(): # for callbacks to call
        global curl_inUse_clients
        curl_inUse_clients -= 1
        if curl_inUse_clients < 0:
            # This shouldn't happen.  But if it does, don't let the effect 'run away'.
            curl_inUse_clients = 0
  except: # fall back to the pure-Python one
      if error: errExit(error) # (unless it won't do)
    # NOTE(review): this fragment appears to have lost its enclosing context
    # in the paste (the 4-space indent attaches to no visible block -- it
    # presumably sat inside a try/except around importing the real zlib).
    # The class below is a pass-through stub: compress/decompress return the
    # data unchanged.  TODO: restore the missing context from upstream.
    enable_gzip = True # for fetching remote sites
    class zlib:
        def compress(self,s,level): return s
        def decompressobj():
            class o:
                def decompress(self,s,maxlen): return s
            return o()
    zlib = zlib()

Silas S. Brown's avatar
Silas S. Brown committed
#@file: message-user.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Support for showing messages to specific IP addresses
# --------------------------------------------------

# Pick a cookie-hashing function: md5 (via base64, truncated) when hashlib
# is available, else Python's built-in hash() as a last resort.
try:
    import hashlib # Python 2.5+, platforms?
    hashlib.md5
except: hashlib = None # (TODO: does this ever happen on a platform that supports Tornado?  Cygwin has hashlib with md5)
if hashlib:
    def cookieHash(msg): return base64.b64encode(hashlib.md5(B(msg)).digest())[:10]
else:
    def cookieHash(msg): return hex(hash(msg))[2:] # this fallback is not portable across different Python versions etc, so no good if you're running a fasterServer

Silas S. Brown's avatar
Silas S. Brown committed
ipv4_regexp = re.compile(r'^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$')
def ipv4_to_int(ip):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value,
    or return None if 'ip' is not a dotted quad."""
    m = ipv4_regexp.match(ip) # NOTE(review): reconstructed -- the line binding 'm' was lost from this paste; matching the regexp defined above is the only plausible source
    if m: return (int(m.group(1))<<24) | (int(m.group(2))<<16) | (int(m.group(3))<<8) | int(m.group(4))
    else: return None
Silas S. Brown's avatar
Silas S. Brown committed
def ipv4range_to_ints(ip):
    """Parse an IPv4 range spec into an inclusive (low,high) integer pair.
    Accepts a plain address, a 'low-high' pair, or CIDR 'base/bits'."""
    if '-' in ip:
        return tuple(ipv4_to_int(bound) for bound in ip.split('-'))
    if '/' in ip:
        base,maskBits = ip.split('/')
        low = ipv4_to_int(base)
        return low, low | ~(-1 << (32-int(maskBits))) # set all host bits for the top of the range
    single = ipv4_to_int(ip)
    return single,single
def ipv4ranges_func(ipRanges_and_results):
    """Compile a 'ranges|result|ranges|result|...' spec (ranges being
    comma-separated ipv4range_to_ints specs) into a function mapping an
    IPv4 string to the result of the first range-list containing it, or
    None if no range matches."""
    expectingRanges = True ; currentRanges = None ; pairs = []
    for field in S(ipRanges_and_results).split('|'):
        if expectingRanges:
            currentRanges = [ipv4range_to_ints(spec) for spec in field.split(',')]
        else:
            pairs.append((currentRanges,field))
        expectingRanges = not expectingRanges # fields alternate ranges/result
    def lookup(ip):
        ipInt = ipv4_to_int(ip)
        for ranges,result in pairs:
            if any((low<=ipInt<=high) for low,high in ranges):
                return result # else None
    return lookup
Silas S. Brown's avatar
Silas S. Brown committed
#@file: connect-ssl.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Service routines for CONNECT passing to SSL terminator
# --------------------------------------------------
Silas S. Brown's avatar
Silas S. Brown committed
debug_connections = False
def myRepr(d):
    """Compact repr of connection data for debug logging: binary payloads
    become a byte count, long text is truncated to 500 bytes."""
    if re.search(B("[\x00-\x09\x0e-\x1f]"),B(d)):
        return "%d bytes" % len(d) # contains control characters: treat as binary
    if len(d) >= 512:
        return repr(d[:500]+"...")
    return repr(d)
def peerName(socket):
    """Best-effort remote-address label for a socket, for debug messages."""
    try:
        return socket.getpeername()
    except:
        return "(no socket??)"
        if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket)+" and closing it")
        try: stream.write(data,lambda *args:True)
        except: pass # ignore errors like client disconnected
    if not stream.closed():
        try: stream.close()
        except: pass
Silas S. Brown's avatar
Silas S. Brown committed
def writeOrError(opposite,name,stream,data):
    """Write 'data' to 'stream'; on failure, log once per stream (if 'name'
    is given) and close both this stream and its 'opposite' partner (so the
    server stream we're reading is closed if the client has gone away, and
    vice versa)."""
    if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket))
    try: stream.write(data) # NOTE(review): reconstructed 'try' target -- the pasted source jumped straight from the debug print to 'except:'; writing 'data' is the operation this handler guards.  TODO confirm against upstream.
    except:
        if name and not hasattr(stream,"writeOrError_already_complained"): logging.error("Error writing data to "+name)
        stream.writeOrError_already_complained = True
        try: stream.close()
        except: pass
        try: opposite.close() # (try to close the server stream we're reading if the client has gone away, and vice versa)
        except: pass
Silas S. Brown's avatar
Silas S. Brown committed
#@file: misc.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Miscellaneous variables
# --------------------------------------------------

# Far-future expiry string (2038) -- presumably attached to Set-Cookie headers elsewhere in the file
cookieExpires = "Tue Jan 19 03:14:07 2038" # TODO: S2G (may have to switch to Max-Age and drop support for ~IE8)
Silas S. Brown's avatar
Silas S. Brown committed

set_window_onerror = False # for debugging Javascript on some mobile browsers (TODO make this a config option? but will have to check which browsers do and don't support window.onerror)

# Domain-setting cookie for when we have no wildcard_dns and no default_site:
adjust_domain_cookieName = "_adjusterDN_"
adjust_domain_none = B("0") # not a valid top-level domain (TODO hopefully no user wants this as a local domain...)
enable_adjustDomainCookieName_URL_override = True # TODO: document this!  (Allow &_adjusterDN_=0 or &_adjusterDN_=wherever in bookmark URLs, so it doesn't matter what setting the cookie has when the bookmark is activated)

# Cookie / marker names are deliberately obscure to reduce the chance of colliding with a site's own names:
htmlmode_cookie_name = "_adjustZJCG_" # zap JS, CSS and Graphics
password_cookie_name = "_pxyAxsP_" # "proxy access password". have to pick something that's unlikely to collide with a site's cookie
webdriver_click_code = "._adjustPJSC_"
redirectFiles_Extensions=set("pdf epub mp3 aac zip gif png jpeg jpg exe tar tgz tbz ttf woff swf txt doc rtf midi mid wav ly c h py".split()) # TODO: make this list configurable + maybe add a "minimum content length before it's worth re-directing" option

Silas S. Brown's avatar
Silas S. Brown committed
#@file: js-webdriver.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Server-side Javascript execution support
# --------------------------------------------------
Silas S. Brown's avatar
Silas S. Brown committed
class WebdriverWrapper:
    "Wrapper for webdriver that might or might not be in a separate process without shared memory"
    def __init__(self): self.theWebDriver = self.tempDirToDelete = None # fix: was 'tmpDirToDelete', but every other method reads 'tempDirToDelete' (getTmp/quit would hit AttributeError if called before new())
    def new(self,*args):
        # Start a fresh webdriver; args are passed through to get_new_webdriver
        # (args[0] also makes our per-helper TMPDIR unique).
        try:
            # No coredump, for emergency_zap_pid_and_children
            import resource ; resource.setrlimit(resource.RLIMIT_CORE,(0,0))
        except: pass # oh well, have coredumps then :-(
        if options.js_multiprocess:
            # we have a whole process to ourselves (not just a thread)
            # so we can set the environment here.
            # Selenium doesn't always clean up temporary files on exit
            # (especially with Firefox), so let's set TMPDIR uniquely so we
            # can clean them ourselves.
            tmp = os.environ.get("TMPDIR",None)
            self.tempDirToDelete=os.environ['TMPDIR']=os.environ.get("TMPDIR","/tmp")+"/"+str(os.getpid())+"."+str(args[0])
            try: os.mkdir(self.tempDirToDelete)
            except: pass
        else: tmp = self.tempDirToDelete = None
        self.theWebDriver = get_new_webdriver(*args)
        if tmp: os.environ["TMPDIR"] = tmp # restore previous value
        elif options.js_multiprocess: del os.environ["TMPDIR"]
    def getTmp(self,*args): return self.tempDirToDelete
    def quit(self,*args):
        # Shut down the webdriver, then forcibly zap its process tree and temp dir
        if not self.theWebDriver: return
        try: pid = self.theWebDriver.service.process.pid
        except: pid = debuglog("WebdriverWrapper: Unable to get self.theWebDriver.service.process.pid")
        try: self.theWebDriver.quit()
        except: debuglog("WebdriverWrapper: exception on quit") # e.g. sometimes get 'bad fd' in selenium's send_remote_shutdown_command _cookie_temp_file_handle
        # Try zapping the process ourselves anyway (even if theWebDriver.quit DIDN'T return error: seems it's sometimes still left around.  TODO: this could have unexpected consequences if the system's pid-reuse rate is excessively high.)
        self.theWebDriver = None
        emergency_zap_pid_and_children(pid)
        if self.tempDirToDelete: emergency_zap_tempdir(self.tempDirToDelete)
    def current_url(self):
        try: return self.theWebDriver.current_url
        except: return "" # PhantomJS Issue #13114: unconditional reload for now
    def get(self,url):
        # NOTE(review): the actual page-load call (presumably self.theWebDriver.get(url)) appears to have been lost from this paste -- as written this method only dumps the browser log.  TODO restore from upstream.
        if options.logDebug:
          try:
            for e in self.theWebDriver.get_log('browser'):
                print ("webdriver log: "+e['message'])
          except Exception as e: print ("webdriver get_log exception: "+repr(e))
    def execute_script(self,script): self.theWebDriver.execute_script(S(script))
    def click_id(self,clickElementID): self.theWebDriver.find_element_by_id(S(clickElementID)).click()
    def click_xpath(self,xpath): self.theWebDriver.find_element_by_xpath(S(xpath)).click()
    def click_linkText(self,clickLinkText): self.theWebDriver.find_element_by_link_text(S(clickLinkText)).click()
    def getu8(self):
        # Page source, recursively including frame/iframe sources if js_frames
        def f(switchBack):
            src = self.theWebDriver.find_element_by_xpath("//*").get_attribute("outerHTML")
            if options.js_frames:
                for el in ['frame','iframe']:
                    for frame in self.theWebDriver.find_elements_by_tag_name(el):
                        self.theWebDriver.switch_to.frame(frame)
                        src += f(switchBack+[frame])
                        self.theWebDriver.switch_to.default_content()
                        for fr in switchBack: self.theWebDriver.switch_to.frame(fr)
            return src
        return f([]) # NOTE(review): reconstructed -- the paste defined f but lost the line that calls it (the method returned None); upstream may additionally encode to UTF-8 as the name suggests, TODO confirm
    def getpng(self):
        # Screenshot as PNG bytes, optimised via PIL when available
        if options.js_interpreter=="HeadlessChrome": # resize not needed for PhantomJS (but PhantomJS is worse at font configuration and is no longer maintained)
            self.theWebDriver.set_window_size(js_size[0],min(16000,int(self.theWebDriver.execute_script("return document.body.parentNode.scrollHeight")))) # TODO: check the 16000: what is Selenium's limit? (confirmed over 8000)
            time.sleep(1)
        png = self.theWebDriver.get_screenshot_as_png()
        if options.js_interpreter=="HeadlessChrome": self.theWebDriver.set_window_size(*js_size)
        try: # can we optimise the screenshot image size?
            from PIL import Image
            s = BytesIO() ; Image.open(StringIO(png)).save(s,'png',optimize=True)
            png = s.getvalue()
        except: pass # just return non-optimized
        return png
Silas S. Brown's avatar
Silas S. Brown committed
def emergency_zap_pid_and_children(pid):
    """Forcibly kill 'pid' and (if psutil is available) all its descendants.
    Safe to call with pid falsy or with an already-dead process."""
    if not pid: return
    try: # descendants first, if psutil is around to enumerate them
        for child in psutil.Process(pid).children(recursive=True):
            try: child.kill(9)
            except: pass
    except: pass # no psutil, or process already gone
    try:
        os.kill(pid,9)
        os.waitpid(pid, 0) # waitpid is necessary to clear it from the process table, but we should NOT use os.WNOHANG, as if we do, there's a race condition with the os.kill taking effect (even -9 isn't instant)
    except OSError: pass # maybe pid already gone
def emergency_zap_tempdir(d):
    try:
        for f in os.listdir(d): emergency_zap_tempdir(d+os.sep+f)
        os.rmdir(d)
    except: unlink(d)
# Use Selenium's TimeoutException when available, else a stand-in so that
# 'except TimeoutException' clauses elsewhere still work:
try: from selenium.common.exceptions import TimeoutException
except: # no Selenium or wrong version
    class TimeoutException(Exception): pass # placeholder
class SeriousTimeoutException(Exception):
    "Raised when the webdriver helper process itself stops responding"
Silas S. Brown's avatar
Silas S. Brown committed
def webdriverWrapper_receiver(pipe,timeoutLock):
    "Command receiver for WebdriverWrapper for when it's running over IPC (--js-multiprocess).  Receives (command,args) and sends (return,exception), releasing the timeoutLock whenever it's ready to return."
    setProcName("adjusterWDhelp")
    CrossProcessLogging.initChild()
    try: wrapper = WebdriverWrapper()
    except KeyboardInterrupt: return
    while True:
        try:
            command,cmdArgs = pipe.recv()
        except KeyboardInterrupt: # all shutting down
            try: wrapper.quit()
            except: pass
            try: timeoutLock.release()
            except ValueError: pass # wasn't held
            pipe.send(("INT","INT"))
            return pipe.close()
        if command=="EOF": return pipe.close()
        try:
            result,excToSend = getattr(wrapper,command)(*cmdArgs), None
        except Exception as err:
            context = find_adjuster_in_traceback()
            if context: # see if we can add it to the message (it will start with ", " so no need for a space before it)
              try:
                  if hasattr(err,"msg") and err.msg: err.msg += context # should work with WebDriverException
                  elif type(err.args[0])==str: err.args=(repr(err.args[0])+context,) + tuple(err.args[1:]) # should work with things like httplib.BadStatusLine that are fussy about the number of arguments they get
                  else: err.args += (context,) # works with things like KeyError (although so should the above)
              except: err.message += context # works with base Exception
            result,excToSend = None,err
        try: timeoutLock.release()
        except: pass # (may fail if controller's timeoutLock is turned off during quit_wd_atexit)
        try: pipe.send((result,excToSend))
        except: pass # if they closed it, we'll get EOFError on next iteration
Silas S. Brown's avatar
Silas S. Brown committed
class WebdriverWrapperController:
    "Proxy for WebdriverWrapper if it's running over IPC"
    def __init__(self):
        # One pipe + one lock per helper process; the child releases the
        # lock whenever it's ready to return (see webdriverWrapper_receiver)
        self.pipe, cPipe = multiprocessing.Pipe()
        self.timeoutLock = multiprocessing.Lock()
        self.process = multiprocessing.Process(target=webdriverWrapper_receiver,args=(cPipe,self.timeoutLock))
        self.process.start()
    def send(self,cmd,args=()):
        "Send a command to a WebdriverWrapper over IPC, and either return its result or raise its exception in this process.  Also handle the raising of SeriousTimeoutException if needed, in which case the WebdriverWrapper should be stopped."
        try:
          if not self.timeoutLock.acquire(timeout=0):
            logging.error("REALLY serious SeriousTimeout (should never happen). Lock unavailable before sending command.")
            raise SeriousTimeoutException()
        except AttributeError: pass # self.timeoutLock==None because quit(final=True) called from another thread
        try: self.pipe.send((cmd,args))
        except IOError: return # already closed
        if cmd=="EOF":
            return self.pipe.close() # no return code
        try: # NOTE(review): reconstructed 'try' -- this line was lost from the paste, but the 'except AttributeError' below requires it (same quit(final=True) race as handled above)
            if not self.timeoutLock.acquire(timeout=options.js_timeout2): # fallback in case Selenium timeout doesn't catch it (signal.alarm in the child process isn't guaranteed to help, so catch it here)
                try: logging.error("SeriousTimeout: WebdriverWrapper process took over "+str(options.js_timeout2)+"s to respond to "+repr((cmd,args))+". Emergency restarting this process.")
                except: pass # absolutely do not throw anything except SeriousTimeoutException from this branch
                raise SeriousTimeoutException()
            self.timeoutLock.release()
        except AttributeError: return # self.timeoutLock==None because quit(final=True) called from another thread
        ret,exc = self.pipe.recv()
        if ret==exc=="INT": return self.pipe.close()
        if exc: raise exc
        else: return ret
    def new(self,*args):
        self.send("new",args)
        self.tempDirToDelete=self.send("getTmp")
    def quit(self,final=False):
        if final: self.timeoutLock = None # quit_wd_atexit could plausibly run while another thread's still processing its last command, so allow these commands to be queued in the pipe from another thread without worrying about timeout when that happens
        self.send("quit")
        if final: self.send("EOF")
    def current_url(self): return self.send("current_url")
    def get(self,url): return self.send("get",(url,))
    def execute_script(self,script): self.send("execute_script",(script,))
    def click_id(self,clickElementID): self.send("click_id",(clickElementID,))
    def click_xpath(self,xpath): self.send("click_xpath",(xpath,)) # fix: was '(xpath),' -- a parenthesised string, not a 1-tuple, so the receiver's *args would have splatted the string's characters as separate arguments
    def click_linkText(self,clickLinkText): self.send("click_linkText",(clickLinkText,))
    def getu8(self): return self.send("getu8")
    def getpng(self): return self.send("getpng")
try: import multiprocessing # Python 2.6
except: multiprocessing = None # platform without it; NOTE(review): presumably disables the IPC (--js-multiprocess) code paths above -- confirm with callers
Silas S. Brown's avatar
Silas S. Brown committed
class WebdriverRunner:
    "Manage a WebdriverWrapperController (or a WebdriverWrapper if we're not using IPC) from a thread of the main process"

    def __init__(self, start=0, index=0):
        self.start = start
        self.index = index
        if options.js_multiprocess:
            self.wrapper = WebdriverWrapperController()
        else:
            self.wrapper = WebdriverWrapper()
        self.renew_webdriver_newThread(True)  # sets wd_threadStart

    def renew_controller(self):  # for SeriousTimeoutException recovery
        # Hard-kill the stuck helper process and its children, remove its
        # temporary directory, and start a completely fresh controller.
        emergency_zap_pid_and_children(self.wrapper.process.pid)
        emergency_zap_tempdir(self.wrapper.tempDirToDelete)
        self.wrapper = WebdriverWrapperController()

    def renew_webdriver_sameThread(self, firstTime=False):
        # Quit and re-create the underlying webdriver, retrying until it
        # works; runs in the calling (worker) thread.
        self.usageCount = 0
        self.maybe_stuck = False
        while True:
            try:
                self.wrapper.quit()
                self.wrapper.new(self.start + self.index, not firstTime)
                break
            except SeriousTimeoutException:  # already logged
                self.renew_controller()
            except:
                logging.error("Exception "+exc_logStr()+" while renewing webdriver, retrying")
                time.sleep(1)  # just in case
        self.usageCount = 0
        self.maybe_stuck = False

    def renew_webdriver_newThread(self, firstTime=False):
        # Kick off the renewal on a fresh thread; wd_threadStart marks us
        # busy, and is cleared in _renew_wd after renew_webdriver_sameThread
        # returns (it loops on exception)
        self.wd_threadStart = time.time()
        threading.Thread(target=_renew_wd, args=(self, firstTime)).start()

    def quit_webdriver(self):
        self.wrapper.quit(final=True)

    def fetch(self, url, prefetched, clickElementID, clickLinkText, asScreenshot, callback, tooLate):
        assert not self.wd_threadStart  # webdriver_checkServe
        # wd_threadStart is cleared in wd_fetch after _wd_fetch returns or
        # throws + possible renew-loop (TODO: if wd_fetch ITSELF somehow
        # throws an exception, should be logged but this JS instance gets
        # tied up until next adjuster restart)
        self.wd_threadStart = time.time()
        self.maybe_stuck = False
        threading.Thread(target=wd_fetch, args=(url, prefetched, clickElementID, clickLinkText, asScreenshot, callback, self, tooLate)).start()

    # Thin delegations to the wrapper:
    def current_url(self): return self.wrapper.current_url()
    def get(self, url): return self.wrapper.get(url)
    def execute_script(self, script): self.wrapper.execute_script(script)
    def click_id(self, clickElementID): self.wrapper.click_id(clickElementID)
    def click_xpath(self, xpath): self.wrapper.click_xpath(xpath)
    def click_linkText(self, clickLinkText): self.wrapper.click_linkText(clickLinkText)
    def getu8(self): return self.wrapper.getu8()
    def getpng(self): return self.wrapper.getpng()
def _renew_wd(wd, firstTime):
    "Thread body for WebdriverRunner.renew_webdriver_newThread: renew the driver, clear the busy marker, then poke webdriver_checkServe on the main IOLoop"
    wd.renew_webdriver_sameThread(firstTime)
    wd.wd_threadStart = False
    IOLoopInstance().add_callback(webdriver_checkServe)
Silas S. Brown's avatar
Silas S. Brown committed
def find_adjuster_in_traceback():
    "Return a string locating the current exception within this file (', adjuster line N'), or a pre-formatted location carried in the exception's last argument (for webdriverWrapper_receiver), or '' if nothing can be determined"
    ei = sys.exc_info()
    try:
        hint = ei[1].args[-1]
        if "adjuster line" in hint: return hint # for webdriverWrapper_receiver
    except: pass
    try: __file__
    except: return "" # sometimes not defined ??
    # scan the traceback innermost-first for a frame belonging to this file
    for frame in reversed(traceback.extract_tb(ei[2])):
        if __file__ in frame[0]:
            return ", adjuster line "+str(frame[1])
    return ""
Silas S. Brown's avatar
Silas S. Brown committed
def wd_fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,manager,tooLate):
    """Worker-thread entry point for one webdriver fetch.

    Runs _wd_fetch on 'manager' (a WebdriverRunner), schedules 'callback'
    on the main IOLoop with the resulting response object, and handles
    retry / renewal of the webdriver when things go wrong.  'tooLate' is
    a callable saying whether the client has already given up (so a retry
    would be pointless)."""
    helper_threads.append('wd_fetch')
    need_restart = False
    def errHandle(error,extraMsg,prefetched):
        # Decide what to return when webdriver failed: fall back to the
        # prefetched non-webdriver page if js_fallback is set (adding a
        # header naming the error), otherwise a wrapped error message.
        if not options.js_fallback: prefetched=None
        if prefetched: toRet = "non-webdriver page (js_fallback set)"
        else:
            toRet = "error"
            prefetched = wrapResponse("webdriver "+error)
        logging.error(extraMsg+" returning "+toRet)
        if options.js_fallback:
            try:
                prefetched.headers.add(options.js_fallback,error)
            except: logging.error("Could not add "+repr(options.js_fallback)+" to error response")
        return prefetched
    try:
        r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
        try:
            if options.js_fallback: r.headers.add(options.js_fallback,"OK")
        except: pass
    except TimeoutException:
        r = errHandle("timeout","webdriver "+str(manager.start+manager.index)+" timeout fetching "+url+find_adjuster_in_traceback()+"; no partial result, so",prefetched) # "webdriver timeout" sent to browser (can't include url here: domain gets rewritten)
    except SeriousTimeoutException:
        r = errHandle("serious timeout","lost communication with webdriver "+str(manager.start+manager.index)+" when fetching "+url+"; no partial result, so",prefetched)
        need_restart = "serious"
    except: # any other exception from _wd_fetch, usually a BadStatusLine from a crashed webdriver
        # NOTE(review): this bare 'except:' line was missing from the
        # corrupted copy reviewed; without it the retry logic below ran
        # inside the SeriousTimeoutException handler and any other
        # exception escaped the thread, permanently tying up this
        # instance (the retry log message below clearly belongs to a
        # generic handler).  Confirm against upstream adjuster.py.
        if options.js_retry and not tooLate():
            logging.info("webdriver error fetching "+url+" ("+exc_logStr()+"); restarting webdriver "+str(manager.start+manager.index)+" for retry") # usually a BadStatusLine
            manager.renew_webdriver_sameThread()
            if tooLate(): r = errHandle("err","too late",None) # NOTE(review): original called errHandle with only 2 args (a TypeError); passing None takes the plain-error path, which is harmless since the client has given up anyway
            else:
              try:
                  r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
                  try:
                      if options.js_fallback: r.headers.add(options.js_fallback,"OK")
                  except: pass
              except SeriousTimeoutException:
                r = errHandle("serious timeout","webdriver serious timeout on "+url+" after restart, so re-restarting and",prefetched)
                need_restart = "serious"
              except:
                r = errHandle("error","webdriver error on "+url+" even after restart, so re-restarting and",prefetched)
                need_restart = True
        else: # no retry
            r = errHandle("error","webdriver error on "+url+", so restarting and",prefetched)
            need_restart = True
    IOLoopInstance().add_callback(lambda *args:callback(r))
    manager.usageCount += 1
    if need_restart or (options.js_restartAfter and manager.usageCount >= options.js_restartAfter):
        # renew now, while still in this worker thread (wd_threadStart is
        # still set, so webdriver_checkServe won't hand us new work yet)
        if need_restart=="serious":manager.renew_controller()
        manager.renew_webdriver_sameThread()
    else: manager.finishTime = time.time()
    manager.wd_threadStart = manager.maybe_stuck = False
    IOLoopInstance().add_callback(webdriver_checkServe)
    helper_threads.remove('wd_fetch')
Silas S. Brown's avatar
Silas S. Brown committed
def exc_logStr():
    "Summarise the current exception for logging, preferring its .msg attribute when present (as on WebDriverException), and appending the adjuster line number if findable"
    errType, errVal = sys.exc_info()[:2]
    if hasattr(errVal,"msg") and errVal.msg:
        toLog = (errType, errVal.msg) # for WebDriverException
    else:
        toLog = (errType, errVal)
    return repr(toLog)+find_adjuster_in_traceback()
Silas S. Brown's avatar
Silas S. Brown committed
def _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot): # single-user only! (and relies on being called only in htmlOnlyMode so leftover Javascript is removed and doesn't double-execute on JS-enabled browsers)
    """Drive 'manager' (a WebdriverRunner) to load url, optionally click an
    element, and return a wrapped HTML (or PNG screenshot) response.

    NOTE(review): this copy of the function is corrupted: 'timed_out' is
    read below but never assigned in the visible code (its assignment,
    presumably set when manager.get(url) raised TimeoutException with a
    partial result, has been lost), and the 'except:' in the click section
    has no matching 'try:' (also lost).  Code kept byte-for-byte; confirm
    against upstream adjuster.py before relying on this version."""
    import tornado.httputil ; url = S(url)
    currentUrl = S(manager.current_url())
    # Skip the navigation entirely if the driver is already sitting on url
    # (ignoring any #fragment) and we have nothing prefetched to serve:
    if prefetched or not re.sub('#.*','',currentUrl) == url:
        if prefetched:
            debuglog("webdriver %d get about:blank" % (manager.start+manager.index))
            manager.get("about:blank") # ensure no race condition with current page's XMLHttpRequests
            webdriver_prefetched[manager.index] = prefetched
        webdriver_inProgress[manager.index].clear() # race condition with start of next 'get' if we haven't done about:blank, but worst case is we'll wait a bit too long for page to finish
        debuglog(("webdriver %d get " % (manager.start+manager.index))+url)
        try: manager.get(url) # waits for onload
        except TimeoutException:
            # we might have got SOMEthing (e.g. on a page bringing in hundreds of scripts from a slow server, but still running some of them before the timeout)
            # May also be "Received error page"
            if currentUrl == S(manager.current_url()):
                debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - URL unchanged at "+currentUrl)
                raise # treat as "no partial result"
            debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - extracting partial")
        # NOTE(review): 'timed_out' should have been set False before the
        # try above and True in the TimeoutException handler -- those
        # assignments are missing from this copy.
        if not timed_out:
         debuglog(("webdriver %d loaded " % (manager.start+manager.index))+url)
         # we want to double-check XMLHttpRequests have gone through (TODO: low-value setTimeout as well? TODO: abort this early if currentUrl has changed and we're just going to issue a redirect? but would then need to ensure it's finished if client comes back to same instance that's still running after it follows the redirect)
         if options.js_reproxy:
          wasActive = True
          for _ in xrange(40): # up to 8+ seconds in steps of 0.2 (on top of the inital load)
            time.sleep(0.2) # unconditional first-wait hopefully long enough to catch XMLHttpRequest delayed-send, very-low-value setTimeout etc, but we don't want to wait a whole second if the page isn't GOING to make any requests (TODO: monitor the js going through the upstream proxy to see if it contains any calls to this? but we'll have to deal with js_interpreter's cache, unless set it to not cache and we cache upstream)
            active = webdriver_inProgress[manager.index]
            if not active and not wasActive: break # TODO: wait longer than 0.2-0.4 to see if it restarts another request?
            wasActive = active
         else: time.sleep(1) # can't do much if we're not reproxying, so just sleep 1sec and hope for the best
        currentUrl = None
    if (clickElementID or clickLinkText) and not timed_out:
        # NOTE(review): a 'try:' wrapping the click actions below (matching
        # the orphan 'except:' further down) has been lost from this copy.
        manager.execute_script("window.open = window.confirm = function(){return true;}") # in case any link has a "Do you really want to follow this link?" confirmation (webdriver default is usually Cancel), or has 'pop-under' window (TODO: switch to pop-up?)
        if clickElementID: manager.click_id(clickElementID)
        if clickLinkText:
            if not type(clickLinkText)==type(u""): clickLinkText=clickLinkText.decode('utf-8')
            # prefer an exact-text XPath match; fall back to link_text only
            # when the text contains both quote characters:
            if not '"' in clickLinkText: manager.click_xpath(u'//a[text()="'+clickLinkText+'"]')
            elif not "'" in clickLinkText: manager.click_xpath(u"//a[text()='"+clickLinkText+"']")
            else: manager.click_linkText(clickLinkText) # least reliable
        time.sleep(0.2) # TODO: more? what if the click results in fetching a new URL, had we better wait for XMLHttpRequests to finish?  (loop as above but how do we know when they've started?)  currentUrl code below should at least show us the new URL even if it hasn't finished loading, and then there's a delay while the client browser is told to fetch it, but that might not be enough
      except: debuglog("js_links find_element exception ignored",False)
      currentUrl = None
    if currentUrl == None: # we need to ask for it again
        currentUrl = manager.current_url()
        if not currentUrl: currentUrl = url # PhantomJS Issue #13114: relative links after a redirect are not likely to work now
    if S(currentUrl) == "about:blank":
        debuglog("got about:blank instead of "+S(url))
        return wrapResponse("webdriver failed to load") # don't return an actual redirect to about:blank, which breaks some versions of Lynx
    debuglog("Getting data from webdriver %d (current_url=%s)" % (manager.start+manager.index,S(currentUrl)))
    if asScreenshot: return wrapResponse(manager.getpng(),tornado.httputil.HTTPHeaders.parse("Content-type: image/png"),200)
    body = get_and_remove_httpequiv_charset(manager.getu8())[1]
    if timed_out: manager.get("about:blank") # as the timeout might have been due to a hard-locked script, so interrupting it should save some CPU
    if not re.sub(B('#.*'),B(''),B(currentUrl)) == B(url): # we have to ignore anything after a # in this comparison because we have no way of knowing (here) whether the user's browser already includes the # or not: might send it into a redirect loop
        # If we redirect, and if we have more than one user session active (and especially if we're multicore) then the second request might not come back to the same webdriver instance (or even the same adjuster process, so we can't even cache it unless shared), and reload is bad, so try to avoid redirect if possible.
        # We could set 'base href' instead, seeing as 'document.location' does not have to be right on the user's side as we've already executed the site's scripts here (unless the user has any extensions that require it to be right).  Don't use Content-Location header: not all browsers support + might cause caches to tread POST requests as invariant.
        # Any in-document "#" links will cause a reload if 'base href' is set, but at least we won't have to reload UNLESS the user follows such a link.
        if htmlFind(body,"<base href=") >= 0:
            pass # if it already has a <base href> we can leave it as-is, since it won't matter from which URL it was served
        else: return wrapResponse(addToHead(body,B('<base href="')+re.sub(B('#.*'),B(''),B(currentUrl))+B('">')),tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
    return wrapResponse(body,tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
Silas S. Brown's avatar
Silas S. Brown committed
def get_new_webdriver(index,renewing=False):
    "Instantiate webdriver number 'index' of whichever type options.js_interpreter selects (Chrome, Firefox or PhantomJS, optionally headless)"
    interpreter = options.js_interpreter
    if interpreter in ("HeadlessChrome","Chrome"):
        return get_new_Chrome(index,renewing,interpreter=="HeadlessChrome")
    if interpreter in ("HeadlessFirefox","Firefox"):
        return get_new_Firefox(index,renewing,interpreter=="HeadlessFirefox")
    return get_new_PhantomJS(index,renewing)
Silas S. Brown's avatar
Silas S. Brown committed
    # NOTE(review): the enclosing 'def get_new_Chrome(index,renewing,headless):'
    # header is missing from this copy of the file (get_new_webdriver above
    # calls it with exactly those three arguments); the lines below are its
    # body, kept byte-for-byte with comments added.
    log_complaints = (index==0 and not renewing) # only the first, non-renewal instance warns, to avoid log spam
    from selenium.webdriver.chrome.options import Options
    opts = Options() ; dc = None # dc = DesiredCapabilities dict, set only when js_reproxy needs acceptInsecureCerts
    # TODO: can set opts.binary_location if needed (e.g. for chromium, if distro's linking doesn't work)
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
    # Specify user-data-dir ourselves, further to Chromium bug 795 comment 12.  Include username and port (in case others are running or have run adjuster) as well as index.
    global myUsername
    try: myUsername
    except NameError:
        # first call: work out a username for the user-data-dir path
        try: import getpass
        except ImportError: getpass = None
        if getpass: myUsername = S(getpass.getuser())
        else: myUsername = ""
    extra = ""
    while True: # might be restarting from a corrupted user-data-dir state; in worst case might not even be able to cleanly remove it (TODO: what if some processes associated with an older instance somehow took a while to go away and still have named referenc to previous path: increment counter unconditionally?  still rm the old one)
        path = writable_tmpdir()+"hChrome-"+myUsername+str(options.port)+"."+str(index)+extra
        if not os.path.exists(path): break
        import shutil ; shutil.rmtree(path,True)
        if not os.path.exists(path): break
        # could not remove the old dir: pick a new suffix instead
        if extra: extra="-"+str(int(extra[1:])+1)
        else: extra = "-0"
    opts.add_argument("--user-data-dir="+path)
    opts.add_argument("--incognito") # reduce space taken up by above
    if options.js_reproxy:
        opts.add_argument("--proxy-server=127.0.0.1:%d" % proxyPort(index))
        opts.add_argument("--ignore-certificate-errors") # --ignore-certificate-errors is ignored by Chrome 59 (which was the first version to support Headless) and possibly some earlier versions, but we'll put it in just in case somebody runs an ancient non-headless Chrome in an offline experiment
        opts.add_argument("--allow-insecure-localhost") # Chrome 62+ can at least do *.localhost & 127.* but we'd need to domain-rewrite for this to help (proxy-host doesn't count)
        # Chrome 65 and chromedriver 2.35/2.36? can do:
        dc = wd_DesiredCapabilities(log_complaints)
        if dc:
            dc = dc.CHROME.copy()
            dc['acceptInsecureCerts'] = True
    elif options.upstream_proxy: opts.add_argument('--proxy-server='+options.upstream_proxy)
    if options.logDebug: opts.add_argument("--verbose")
    if options.js_UA and not options.js_UA.startswith("*"): opts.add_argument("--user-agent="+options.js_UA)
    if not options.js_images: opts.add_experimental_option("prefs",{"profile.managed_default_content_settings.images":2})
    # TODO: do we need to disable Javascript's ability to open new windows and tabs, plus target="_blank" etc, especially if using clickElementID?
    if options.via and not options.js_reproxy and log_complaints:
        # Oops: how can we put in a Via: header if we don't
        # have an upstream proxy to do so?  unless you want
        # to implement a Chrome extension to do it (TODO?)
        warn("--via ignored when running Chrome without --js-reproxy")
    if js_size: opts.add_argument("--window-size=%d,%d" % js_size)
    if dc: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts,desired_capabilities=dc)
    else: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts)
    if options.js_reproxy:
        # Warn about Chrome/chromedriver version combinations known to hang
        # with our https reproxy:
        chromeVersion = int(p.capabilities['version'].split(".")[0])
        if 59 <= chromeVersion < 65:
            if [int(x) for x in p.capabilities['chrome']['chromedriverVersion'].split(".",2)[:2]] < [2,35]: extrawarn = " (and chromedriver 2.35+)"
            else: extrawarn = ""
            warn("This version of Chrome will hang when used with js_reproxy on https pages. Try upgrading to Chrome 65+"+extrawarn) # TODO: is 59 really the first version to drop --ignore-certificate-errors ?
        elif chromeVersion >= 65 and not p.capabilities.get('acceptInsecureCerts',False): warn("This version of chromedriver will hang when used with js_reproxy on https pages. Your Chrome is new enough, but your chromedriver is not. Try downloading chromedriver 2.35/36+")
    try: p.set_page_load_timeout(options.js_timeout1)
    except: logging.info("Couldn't set HeadlessChrome page load timeout")
    return p
def get_new_Firefox(index,renewing,headless):
    """Create and configure a Firefox webdriver instance number 'index'
    (optionally headless), honouring the js_reproxy / upstream_proxy /
    js_UA / js_images / js_size options, and set its page-load timeout.
    Improvement: removed the unused local 'op = None'."""
    if headless:
        os.environ['MOZ_HEADLESS'] = '1' # in case -headless not yet working
    from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
    from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
    profile = FirefoxProfile() ; caps = None
    log_complaints = (index==0 and not renewing) # only the first, non-renewal instance warns
    if options.js_reproxy:
        from selenium.webdriver.common.proxy import Proxy,ProxyType
        import warnings
        warnings.filterwarnings("ignore","This method has been deprecated. Please pass in the proxy object to the Driver Object") # set_proxy deprecated, but documentation unclear how it should be replaced
        profile.set_proxy(Proxy({'proxyType':ProxyType.MANUAL,'httpProxy':"127.0.0.1:%d" % proxyPort(index),'sslProxy':"127.0.0.1:%d" % proxyPort(index),'ftpProxy':'','noProxy':''}))
        profile.accept_untrusted_certs = True # needed for some older versions?
        caps = wd_DesiredCapabilities(log_complaints)
        if caps:
            caps = caps.FIREFOX.copy()
            caps['acceptInsecureCerts'] = True
            caps['acceptSslCerts'] = True # older versions
    elif options.upstream_proxy: profile.set_proxy(options.upstream_proxy)
    if options.js_UA and not options.js_UA.startswith("*"): profile.set_preference("general.useragent.override",options.js_UA)
    if not options.js_images: profile.set_preference("permissions.default.image", 2)
    if options.via and not options.js_reproxy and log_complaints:
        # Oops: how can we put in a Via: header if we don't
        # have an upstream proxy to do so?  unless you want
        # to implement a Firefox extension to do it (TODO?)
        warn("--via ignored when running Firefox without --js-reproxy")
    # TODO: do any other options need to be set?  disable plugins, Firefox-update prompts, new windows/tabs with JS, etc?  or does Selenium do that?
    if options.logDebug: binary=FirefoxBinary(log_file=sys.stderr) # TODO: support logDebug to a file as well
    else: binary=FirefoxBinary()
    if headless: cmdL = ('-headless','-no-remote')
    else: cmdL = ('-no-remote',)
    if js_size: cmdL += ("-width",str(js_size[0]),"-height",str(js_size[1]))
    cmdL += ("about:blank",) # not Firefox start page
    binary.add_command_line_options(*cmdL) # cannot call this more than once
    if caps: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary,capabilities=caps)
    else: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary)
    try: p.set_page_load_timeout(options.js_timeout1)
    except: logging.info("Couldn't set Firefox page load timeout")
    return p
block_headless_firefox = [
    # servers that Firefox tries to CONNECT to on startup
    # (telemetry / update / geolocation "phone-home" hosts, not
    # user-requested content; presumably consulted by the reproxy code
    # elsewhere in the file to refuse such requests -- usage is outside
    # this chunk)
    "push.services.mozilla.com","snippets.cdn.mozilla.net","firefox.settings.services.mozilla.com","location.services.mozilla.com","shavar.services.mozilla.com",
    "aus5.mozilla.org","ftp.mozilla.org",
    "fonts.googleapis.com", # Fedora version of Firefox connects to this
    # "start.fedoraproject.org","fedoraproject.org", # Fedora version of Firefox does this (but what if user actually wants to view one of those pages?)
]
Silas S. Brown's avatar
Silas S. Brown committed
def wd_DesiredCapabilities(log_complaints):
    "Return Selenium's DesiredCapabilities class, or None (optionally logging a warning) if the installed Selenium is too old to provide it"
    try:
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    except:
        if log_complaints: warn("Your Selenium installation is too old to set DesiredCapabilities.\nThis is likely to stop some js options from working properly.")
        return None
    return DesiredCapabilities
def wd_instantiateLoop(wdClass,index,renewing,**kw):
    """Instantiate wdClass (a selenium webdriver class) with keyword
    arguments kw, normalising the 'version' capability across Selenium
    2/3 and retrying every 2 seconds on failure (except for the very
    first instance, where failure is re-raised so misconfiguration is
    reported at startup).

    NOTE(review): this copy is corrupted: the 'try:'/'except:' pair that
    must wrap the body of the 'while True:' loop has been lost (the bare
    'raise' and the retry logging below clearly sit inside an exception
    handler, and the dedented 'if p: break' does not line up with any
    visible level).  Code kept byte-for-byte; confirm against upstream."""
    debuglog("Instantiating "+wdClass.__name__+" "+repr(kw))
    if 'chrome_options' in kw:
        # newer chromedriver bindings renamed chrome_options -> options
        try: newChromedriver = 'options' in getargspec(webdriver.chrome.webdriver.WebDriver.__init__).args
        except: newChromedriver = False
        if newChromedriver:
            kw['options'] = kw['chrome_options']
            del kw['chrome_options']
    if not renewing: time.sleep(min(2*(index % js_per_core),int(options.js_timeout2 / 2))) # try not to start them all at once at the beginning (may reduce chance of failure)
    while True:
            if wdClass==webdriver.Chrome: p = wdClass(getoutput("which chromedriver 2>/dev/null"),**kw) # some versions need to be told explicitly where chromedriver is, rather than looking in PATH themselves, in order to get "wrong version" errors etc (otherwise errors ignored, Selenium looks for a different chromedriver and gives a slightly confusing error about 'none found' rather than the error you should have seen about 'wrong version')
            else: p = wdClass(**kw)
            if not p.capabilities: raise Exception("Didn't seem to get a p.capabilities")
            elif 'browserVersion' in p.capabilities:
                # Selenium 2.x calls it version, but Selenium
                # 3.x calls it browserVersion.  Map this back
                # to 'version' for our other code.
                p.capabilities['version'] = p.capabilities['browserVersion']
            elif not 'version' in p.capabilities: raise Exception("capabilities has no version: "+repr(p.capabilities.items()))
            if index==0 and not renewing: raise # NOTE(review): bare re-raise -- must originally have been inside the lost 'except:' block
            logging.error("Unhandled exception "+exc_logStr()+" when instantiating webdriver %d, retrying in 2sec" % index)
            time.sleep(2) ; p = None
        if p: break
    debuglog(wdClass.__name__+" instantiated")
    return p
Silas S. Brown's avatar
Silas S. Brown committed
def _get_new_PhantomJS(index,renewing):
    "Build and return a raw PhantomJS webdriver instance (window size, timeouts and version warnings are handled by get_new_PhantomJS)"
    log_complaints = (index==0 and not renewing)
    os.environ["QT_QPA_PLATFORM"]="offscreen"
    service_args = ['--ssl-protocol=any']
    # if options.logDebug: service_args.append("--debug=true") # doesn't work: we don't see the debug output on stdout or stderr
    if options.js_reproxy:
        service_args.append('--ignore-ssl-errors=true')
        service_args.append('--proxy=127.0.0.1:%d' % proxyPort(index))
    elif options.upstream_proxy:
        service_args.append('--proxy='+options.upstream_proxy)
    dc = wd_DesiredCapabilities(log_complaints)
    if not dc: # old Selenium: no capabilities to customise
        return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,service_args=service_args)
    dc = dict(dc.PHANTOMJS)
    if options.js_UA and not options.js_UA.startswith("*"): dc["phantomjs.page.settings.userAgent"]=options.js_UA
    if not options.js_images: dc["phantomjs.page.settings.loadImages"]=False
    dc["phantomjs.page.settings.javascriptCanOpenWindows"]=dc["phantomjs.page.settings.javascriptCanCloseWindows"]=False # TODO: does this cover target="_blank" in clickElementID etc (which could have originated via DOM manipulation, so stripping them on the upstream proxy is insufficient; close/restart the driver every so often?)
    if options.via and not options.js_reproxy: dc["phantomjs.page.customHeaders.Via"]="1.0 "+convert_to_via_host("")+" ("+viaName+")" # customHeaders works in PhantomJS 1.5+ (TODO: make it per-request so can include old Via headers & update protocol version, via_host + X-Forwarded-For; will webdriver.DesiredCapabilities.PHANTOMJS[k]=v work before a request?) (don't have to worry about this if js_reproxy)
    return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,desired_capabilities=dc,service_args=service_args)
Silas S. Brown's avatar
Silas S. Brown committed
def get_new_PhantomJS(index,renewing=False):
    "Create a PhantomJS webdriver and apply the window size, page-load timeout and (first instance only) a PhantomJS-2 redirect warning"
    wd = _get_new_PhantomJS(index,renewing)
    log_complaints = (index==0 and not renewing)
    if log_complaints and not options.js_reproxy:
        version2 = False
        try: version2 = wd.capabilities['version'].startswith("2.")
        except: pass
        if version2: warn("You may be affected by PhantomJS issue #13114.\nRelative links may be wrong after a redirect if the site sets Content-Security-Policy.\nTry --js_reproxy, or downgrade your PhantomJS to version 1.9.8")
    try: wd.set_window_size(*js_size)
    except: logging.info("Couldn't set PhantomJS window size")
    try: wd.set_page_load_timeout(options.js_timeout1)
    except: logging.info("Couldn't set PhantomJS page load timeout")
    return wd
Silas S. Brown's avatar
Silas S. Brown committed
def proxyPort(index):
    # The local helper-proxy port for webdriver 'index', honouring any
    # port randomisation that may be in effect for it.
    configuredPort = js_proxy_port[index]
    return port_randomise.get(configuredPort, configuredPort)
Silas S. Brown's avatar
Silas S. Brown committed
# Module-level state for the pool of webdriver instances; index i of
# each per-instance list corresponds to webdriver_runner[i].
webdriver_runner = [] ; webdriver_prefetched = [] # runner objects; prefetched response (or None) per instance
webdriver_via = [] ; webdriver_UA = [] # per-instance Via-header info and User-Agent string for the current fetch
webdriver_inProgress = [] ; webdriver_queue = [] # per-instance set of URLs being fetched; global queue of waiting fetch requests
webdriver_lambda = webdriver_mu = 0 # counters: requests queued (incremented in webdriver_fetch) and dispatched (incremented in webdriver_checkServe)
Silas S. Brown's avatar
Silas S. Brown committed
def test_init_webdriver():
    """Check that we CAN start a webdriver, before forking to background and starting all of them"""
    sys.stderr.write("Checking webdriver configuration... ")
    sys.stderr.flush()
    probe = get_new_webdriver(0)
    probe.quit()
    sys.stderr.write("OK\n")
quitFuncToCall = None # set by init_webdrivers; invoked by our own shutdown logic in place of atexit (see comment below)
def init_webdrivers(start,N):
    """Create N WebdriverRunner instances (globally numbered from
    'start'), initialise the parallel per-instance state lists, and
    register the pool's shutdown function in quitFuncToCall."""
    informing = not options.background and not start and not (options.multicore and options.ssl_fork) # (if ssl_fork, we don't want the background 'starting N processes' messages to be interleaved with this)
    if informing:
        sys.stderr.write("Starting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
    for i in xrange(N):
        webdriver_runner.append(WebdriverRunner(start,len(webdriver_runner)))
        webdriver_prefetched.append(None)
        webdriver_inProgress.append(set())
        webdriver_via.append(None) ; webdriver_UA.append("")
    def quit_wd_atexit(*args):
      if informing: sys.stderr.write("Quitting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
      # NOTE(review): the loop header and 'except' below were missing
      # (evidently lost in extraction): 'try: i.quit_webdriver()' had no
      # 'i' in scope and no handler.  Restored as a best-effort loop.
      for i in webdriver_runner:
            try: i.quit_webdriver()
            except: pass # best-effort: instance may already be dead
      if informing: sys.stderr.write("done\n")
    global quitFuncToCall ; quitFuncToCall = quit_wd_atexit # don't use the real atexit, as we have our own thread-stop logic which might kick in first, leaving a stuck adjusterWDhelp process if js_multiprocess==True, and additionally holding up calling process if --stdio is in use (fixed in v0.2795)
    if options.js_restartMins and not options.js_restartAfter==1: IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
    if informing: sys.stderr.write("done\n")
Silas S. Brown's avatar
Silas S. Brown committed
def webdriver_allBusy():
    """Return True if every webdriver instance currently has a fetch
    thread running.  Side effect: updates the webdriver_maxBusy
    high-water mark (callers rely on this - see webdriver_checkServe)."""
    global webdriver_maxBusy # was missing: without it, the assignment below makes webdriver_maxBusy local and max() raises UnboundLocalError
    busyNow = sum(1 for i in webdriver_runner if i.wd_threadStart)
    webdriver_maxBusy = max(webdriver_maxBusy,busyNow)
    return busyNow == len(webdriver_runner)
def webdriver_checkServe(*args):
    # Dispatch as many waiting webdriver_queue items as there are idle
    # webdriver instances, then pause or reopen this core's main
    # server according to whether every instance is now busy.
    # how many queue items can be served right now?
    # (called on IOLoop thread when new item added, or when
    # a server is finished)
    debuglog("Entering webdriver_checkServe, runners=%d" % len(webdriver_runner))
    for i in xrange(len(webdriver_runner)):
        if not webdriver_queue: break # just to save a little
        if not webdriver_runner[i].wd_threadStart: # instance i is idle
            while webdriver_queue: # keep popping until we find one still worth serving
                url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate = webdriver_queue.pop(0)
                if tooLate(): # requester has given up; drop the item
                    debuglog("tooLate() for "+url)
                    continue
                debuglog("Starting fetch of "+url+" on webdriver instance "+str(i+webdriver_runner[i].start))
                webdriver_via[i],webdriver_UA[i] = via,ua
                webdriver_runner[i].fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,tooLate)
                global webdriver_mu ; webdriver_mu += 1 # count of fetches dispatched (pairs with webdriver_lambda in webdriver_fetch)
                break # at most one fetch started per idle instance
    if webdriver_allBusy(): pauseOrRestartMainServer(0) # we're "paused" anyway when not in the poll wait, so might as well call this only at end, to depend on the final status (and make sure to call webdriver_allBusy() no matter what, as it has the side-effect of updating webdriver_maxBusy)
    else: pauseOrRestartMainServer(1)
    debuglog("Finishing webdriver_checkServe, webdriver_queue len=%d" % len(webdriver_queue))
def webdriver_checkRenew(*args):
    # Runs on the IOLoop thread every 60 seconds: restart any webdriver
    # that is idle, has been used at least once, and finished its last
    # fetch more than js_restartMins ago.  This is race-free because
    # wd_thread is only ever started from the IOLoop thread, so nothing
    # can start one between our wd_threadStart test and the renew call.
    for runner in webdriver_runner:
        if runner.wd_threadStart: continue # currently fetching - leave it alone
        if not runner.usageCount: continue # fresh instance, nothing to renew
        if runner.finishTime + options.js_restartMins < time.time():
            runner.renew_webdriver_newThread()
    IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
Silas S. Brown's avatar
Silas S. Brown committed
def webdriver_fetch(url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate):
    """Request a webdriver render of 'url' (IOLoop thread).  Serves it
    immediately in wsgi_mode, short-circuits useless requests, and
    otherwise queues it for webdriver_checkServe to dispatch."""
    if tooLate(): return # probably webdriver_queue overload (which will also be logged)
    uselessPrefetch = prefetched and (not hasattr(prefetched,"code") or prefetched.code >= 500)
    if uselessPrefetch:
        # upstream already failed (timeout, DNS error etc): don't waste a webdriver on it
        return callback(prefetched)
    if wsgi_mode:
        # no reproxy in wsgi_mode, so via/ua cannot be used; serve synchronously on instance 0
        return callback(_wd_fetch(webdriver_runner[0],url,prefetched,clickElementID,clickLinkText,asScreenshot))
    webdriver_queue.append((url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate))
    global webdriver_lambda
    webdriver_lambda += 1 # count of fetches queued (pairs with webdriver_mu in webdriver_checkServe)
    debuglog("webdriver_queue len=%d after adding %s" % (len(webdriver_queue),url))
    webdriver_checkServe() # safe as we're IOLoop thread
Silas S. Brown's avatar
Silas S. Brown committed
#@file: http-rewrite.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Service routines for basic HTTP header rewriting
# --------------------------------------------------

def fixServerHeader(i):
    # Stamp our own name into the response's "Server" header.
    # (TODO: in "real" proxy mode, "Server" might not be the most appropriate header to set for this)
    i.set_header("Server",serverName)
    # Tornado 3 adds a Date header.  HTTP 1.1 makes Date mandatory but
    # exempts clockless servers (which we could claim to be), and
    # omitting it saves bytes - though if the REMOTE server sends a
    # Date we should pass that on (see comments in doResponse).
    try:
        i.clear_header("Date")
    except:
        pass # fine if "Date" wasn't there in the first place
rmServerHeaders = set([
    # server headers to remove.  We'll do our own connection type etc (but don't include "Date" in this list: if the remote server includes a Date it would be useful to propagate that as a reference for its Age headers etc, TODO: unless remote server is broken? see also comment in fixServerHeader re having no Date by default).  Many servers' Content-Location is faulty; it DOESN'T necessarily provide the new base href; it might be relative; it might be identical to the actual URL fetched; many browsers ignore it anyway
    "connection","content-length","content-encoding","transfer-encoding","etag","content-md5","server","alternate-protocol","strict-transport-security","content-location",
    "x-associated-content", # should NOT be sent to browser (should be interpreted by a server's SPDY/push module) but somebody might misread the specs (at least one Wikipedia editor did)
    "x-host","x-http-reason", # won't necessarily be the same
    "content-security-policy","x-webkit-csp","x-content-security-policy","x-frame-options", # sorry but if we're adjusting the site by adding our own scripts/styles we are likely to be broken by a CSP that restricts which of these we're allowed to do. (Even if we adjust the domains listed on those headers, what if our scripts rely on injecting inline code?)  Sites shouldn't *depend* on CSP to prevent XSS: it's just a belt-and-braces that works only in recent browsers.  Hopefully our added styles etc will break the XSS-introduced ones if we hit a lazy site.
    "vary", # we modify this (see code)
    "alt-svc",
    "public-key-pins","public-key-pins-report-only",
    ]) # NOTE: this closing of the set literal was missing (lost line) - without it the module is a SyntaxError
# TODO: WebSocket (and Microsoft SM) gets the client to say 'Connection: Upgrade' with a load of Sec-WebSocket-* headers, check what Tornado does with that
rmClientHeaders = ['Connection','Proxy-Connection','Accept-Charset','Accept-Encoding','X-Forwarded-Host','X-Forwarded-Port','X-Forwarded-Server','X-Forwarded-Proto','X-Request-Start','TE','Upgrade',
                   'Upgrade-Insecure-Requests', # we'd better remove this from the client headers if we're removing Content-Security-Policy etc from the server's
                   'Range', # TODO: we can pass Range to remote server if and only if we guarantee not to need to change anything  (could also add If-Range and If-None-Match to the list, but these should be harmless to pass to the remote server and If-None-Match might actually help a bit in the case where the document doesn't change)
]
Silas S. Brown's avatar
Silas S. Brown committed
#@file: request-forwarder.py
Silas S. Brown's avatar
Silas S. Brown committed
# --------------------------------------------------
# Our main RequestForwarder class.  Handles incoming
# HTTP requests, generates requests to upstream servers
# and handles responses.  Sorry it's got a bit big :-(
# --------------------------------------------------

Silas S. Brown's avatar
Silas S. Brown committed
# HTTP methods our request forwarder will handle; anything else is rejected.
the_supported_methods = ("GET", "HEAD", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "CONNECT")
# Don't support PROPFIND (from WebDAV) unless be careful about how to handle image requests with it
# TODO: image requests with OPTIONS ?
    def get_error_html(self,status,**kwargs):
        # Error-page hook for Tornado 2.0 (later Tornado versions call
        # write_error below, which delegates back to this).
        return htmlhead("Web Adjuster error") + options.errorHTML + "</body></html>"
    def write_error(self,status,**kwargs):
        # Error-page hook for Tornado 2.1+.  Builds the page from
        # get_error_html, optionally substituting a formatted traceback
        # for any "{traceback}" placeholder in the configured errorHTML.
        if getattr(self,"_finished",False): return # response already completed
        msg = self.get_error_html(status,**kwargs)
        if "{traceback}" in msg and 'exc_info' in kwargs:
            formatted = "".join(traceback.format_exception(*kwargs["exc_info"]))
            msg = msg.replace("{traceback}","<pre>"+ampEncode(formatted)+"</pre>")
            # TODO: what about substituting for {traceback} on pre-2.1 versions of Tornado that relied on get_error_html and put the error into sys.exc_info()?  (need to check their source to see how reliable the traceback is in this case; post-2.1 versions re-raise it from write_error itself)
        if self.canWriteBody(): self.write(msg)
        self.finish()

    def cookie_host(self,checkReal=True,checkURL=True):
        # Work out which host the user has asked us to adjust, from the
        # adjust_domain cookie or its URL-argument override.  Returns
        # False for internal (PhantomJS/SSL) upstream requests, None if
        # no cookie-host applies, otherwise the requested host.
        if self.isPjsUpstream or self.isSslUpstream:
            return False
        if checkReal and convert_to_real_host(self.request.host,None): return # if we can get a real host without the cookie, the cookie does not apply to this host
        if enable_adjustDomainCookieName_URL_override and checkURL:
            if self.cookieViaURL: v = self.cookieViaURL # already extracted earlier in this request
            else:
                v = self.request.arguments.get(adjust_domain_cookieName,None)
                if type(v)==type([]): v=v[-1] # Tornado can give a list of values; use the last
                if v: self.removeArgument(adjust_domain_cookieName,quote(v)) # strip the override from the URL before further processing
            if v:
                self.cookieViaURL = v # cache so a repeat call needn't re-parse arguments
                if v==adjust_domain_none: return None # explicit "no adjusted domain" marker
                else: return v
        return self.getCookie(adjust_domain_cookieName,adjust_domain_none)

    def readCookies(self):
        """Parse the request's Cookie headers into the dict
        self._adjuster_cookies_ (name -> value).  Idempotent: does
        nothing if the dict already exists."""
        if hasattr(self,"_adjuster_cookies_"): return # already OK
        self._adjuster_cookies_ = {}
        for c in self.request.headers.get_list("Cookie"):
            for cc in c.split(';'):
                if not '=' in cc: continue # (e.g. Maxthon has been known to append a ; to the end of the cookie string)
                n,v = cc.strip().split('=',1) # restored: this name/value split was missing, leaving n and v undefined below
                self._adjuster_cookies_[n] = v
    def getCookie(self,cookieName,zeroValue=None):
        # Fetch one of our cookies, treating zeroValue as "not set".
        # (zeroValue exists because some browsers don't reliably run JS
        # that deletes a cookie, so we "clear" by setting it to a known
        # zero value instead.)
        self.readCookies() # make sure the jar has been parsed
        value = self._adjuster_cookies_.get(cookieName,None)
        if value==zeroValue: return None
        return value
    def setCookie(self,cookieName,val):
        # Record an ADJUSTER OPTIONS cookie in our local jar ONLY -
        # this does NOT propagate anything to the remote server.
        self.readCookies() # ensure the jar exists before writing to it
        self._adjuster_cookies_[cookieName] = val
    
    def clearUnrecognisedCookies(self):
        # When serving via adjust_domain_cookieName, on URL box try to save browser memory (and request time) and improve security by deleting cookies set by previous sites.  But can cover only the path=/ ones from here.
        self.readCookies()
        for n,v in self._adjuster_cookies_.items():
            if n in cRecogniseAny or n==adjust_domain_cookieName: continue # don't do adjust_domain_cookieName unless take into account addCookieFromURL (TODO: but would we ever GET here if that happens?)
            elif n in cRecognise1 and v=="1": continue
            for dot in ["","."]: self.add_header("Set-Cookie",n+"="+v+"; Domain="+dot+self.cookieHostToSet()+"; Path=/; Expires=Thu Jan 01 00:00:00 1970") # to clear it