def IOLoopInstance():
global ioLoopInstance
try: return ioLoopInstance
except: # better call this from the main thread first:
if hasattr(IOLoop,"current"): ioLoopInstance = IOLoop.current() # for Tornado 5+ to work
else: ioLoopInstance = IOLoop.instance() # in Tornado 4 and older, this can be called on-demand from any thread, but we're putting it in a global for forward-compatibility with the above
return ioLoopInstance
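# (Call IOLoopInstance() once from the main thread during startup, as the
# comment above notes: on Tornado 5+, IOLoop.current() binds to the calling
# thread's loop, so worker threads must reuse the main thread's instance.)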
def startServers():
workaround_tornado_fd_issue()
for core,sList in list(theServers.items()):
if core == "all" or core == coreNo:
    for port,s in sList: s.listen(port) # (assumed: the servers were created unstarted and each begins listening on its port here; the original body of this 'if' was lost)
# --------------------------------------------------
# Multicore: pause/restart when a core is overloaded
# --------------------------------------------------
mainServerPaused = mainServerReallyPaused = False
def pauseOrRestartMainServer(shouldRun=True):
if not (options.multicore and options.js_429): return
global mainServerPaused
if (not shouldRun) == mainServerPaused: return
# if shouldRun: return # uncomment this 'once closed, stay closed' line to demonstrate the OS forwards to open cores only
reallyPauseOrRestartMainServer(shouldRun)
mainServerPaused = not mainServerPaused
debuglog("Paused=%s on core %s" % (repr(mainServerPaused),repr(coreNo)))
CrossProcess429.q.put((coreNo,mainServerPaused))
def reallyPauseOrRestartMainServer(shouldRun):
global mainServerReallyPaused
if shouldRun == "IfNotPaused": # called by CrossProcess429 to re-pause if and only if hasn't been reopened by the outer level in the meantime
shouldRun = mainServerPaused
if (not shouldRun) == mainServerReallyPaused: return
for core,sList in theServers.items():
if not (core == "all" or core == coreNo): continue
for port,s in sList:
if not port==options.port: continue
if not hasattr(s,"_sockets"):
logging.error("Cannot pause server: wrong Tornado version?")
return
if shouldRun: s.add_sockets(s._sockets.values())
else:
for fd, sock in s._sockets.items():
if hasattr(s,"io_loop"): s.io_loop.remove_handler(fd) # Tornado 4
else: IOLoopInstance().remove_handler(fd) # Tornado 5, not tested (TODO)
mainServerReallyPaused = not mainServerReallyPaused
debuglog("reallyPaused=%s on core %s" % (repr(mainServerReallyPaused),repr(coreNo)))
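# (Removing the listening sockets' IOLoop handlers above, rather than closing
# the sockets, means the OS stops delivering new connections to this core and
# forwards them to cores that are still accepting: see the 'once closed, stay
# closed' demonstration comment in pauseOrRestartMainServer.)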
# --------------------------------------------------
# Miscellaneous bug workarounds
# --------------------------------------------------
def workaround_raspbian7_IPv6_bug():
"""Old Debian 7 based versions of Raspbian can boot with IPv6 enabled but later fail to configure it, hence tornado/netutil.py's AI_ADDRCONFIG flag is ineffective and socket.socket raises "Address family not supported by protocol" when it tries to listen on IPv6. If that happens, we'll need to set address="0.0.0.0" for IPv4 only. However, if we tried IPv6 and got the error, then at that point Tornado's bind_sockets will likely have ALREADY bound an IPv4 socket but not returned it; the socket does NOT get closed on dealloc, so a retry would get "Address already in use" unless we quit and re-run the application (or somehow try to figure out the socket number so it can be closed). Instead of that, let's try to detect the situation in advance so we can set options.address to IPv4-only the first time."""
if options.address: return # don't need to do this if we're listening on a specific address
flags = socket.AI_PASSIVE
if hasattr(socket, "AI_ADDRCONFIG"): flags |= socket.AI_ADDRCONFIG
for af,socktype,proto,r1,r2 in socket.getaddrinfo(None,options.port,socket.AF_UNSPEC,socket.SOCK_STREAM,0,flags):
try: socket.socket(af,socktype,proto)
except socket.error as e: # probing for the Raspbian bug described above
    if "family not supported" in e.strerror:
        options.address = "0.0.0.0" # use IPv4 only
        return
def workaround_timeWait_problem():
"""Work around listen-port failing to bind when there are still TIME_WAIT connections from the previous run. This at least seems to work around the problem MOST of the time."""
global bind_sockets
bind_sockets = tornado.netutil.bind_sockets
if "win" in sys.platform and not sys.platform=="darwin":
# Don't do this on MS-Windows. It can result in
# 'stealing' a port from another server even while
# that other server is still running.
return
if not hasattr(socket, "SO_REUSEPORT"): return
if getargspec==None: return
if not 'reuse_port' in getargspec(tornado.netutil.bind_sockets).args: return # Tornado version too old
def bind_sockets(*args,**kwargs):
if not args[0]: pass # we're doing port_randomise
elif len(args) < 6: kwargs['reuse_port'] = True
else: args=tuple(args[:6])+(True,)
return tornado.netutil.bind_sockets(*args,**kwargs)
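# After this runs, bind_sockets() is a drop-in for tornado.netutil.bind_sockets
# that forces reuse_port=True, so e.g. bind_sockets(8080,"0.0.0.0") can rebind
# a port still held by TIME_WAIT connections from a previous run.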
def workaround_tornado_fd_issue(): # TODO: is this still needed post-v0.3 now we fixed start-order bug?
if not hasattr(IOLoopInstance(),'handle_callback_exception'):
return # Tornado 6 doesn't have this, let's hope it's not needed
cxFunc = IOLoopInstance().handle_callback_exception
def newCx(callback):
if callback: return cxFunc(callback)
# self._handlers[fd] raised KeyError. This means
# we don't want to keep being told about the fd.
fr = sys.exc_info()[2]
while fr.tb_next: fr = fr.tb_next
fd = fr.tb_frame.f_locals.get("fd",None)
if not fd: return cxFunc("callback="+repr(callback)+" and newCx couldn't get fd from stack")
logging.info("IOLoop has no handler left for fd "+repr(fd)+" but is still getting events from it. Attempting low-level close to avoid loop.")
try: IOLoopInstance().remove_handler(fd)
except: pass
try: os.close(fd)
except: pass
IOLoopInstance().handle_callback_exception = newCx
def check_LXML():
# Might not find ALL problems with lxml installations, but at least we can check some basics
global etree
try:
from lxml import etree
return etree.HTMLParser(target=None) # works on lxml 2.3.2
except ImportError: sys.stderr.write("LXML library not found - ignoring useLXML option\n")
except TypeError: sys.stderr.write("LXML library too old - ignoring useLXML option\n") # no target= option in 1.x
options.useLXML = False
# --------------------------------------------------
# More setup: Unix forking, privileges etc
# --------------------------------------------------
def dropPrivileges():
if options.user and not os.getuid():
# need to drop privileges
import pwd ; pwd=pwd.getpwnam(options.user)
os.setuid(pwd[2])
# and help our external programs so they
# don't try to load root's preferences etc
os.environ['HOME'] = pwd[5]
os.environ['USER']=os.environ['LOGNAME']=options.user
def unixfork():
if os.fork(): sys.exit() # standard Unix daemonisation: first fork (assumed; the fork lines around setsid were lost)
os.setsid()
if os.fork(): sys.exit() # second fork so we can't reacquire a controlling tty (assumed)
devnull = os.open("/dev/null", os.O_RDWR)
for fd in range(3): os.dup2(devnull,fd) # commenting out this loop will let you see stderr after the fork (TODO debug option?)
if options.pidfile:
try: open(options.pidfile,"w").write(str(os.getpid()))
except: pass
def notifyReady():
try: import sdnotify # sudo pip install sdnotify
except ImportError: return
sdnotify.SystemdNotifier().notify("READY=1") # we send READY=1 so you can do an adjuster.service (w/out --background) with Type=notify and ExecStart=/usr/bin/python /path/to/adjuster.py --config=...
# TODO: also send "WATCHDOG=1" so can use WatchdogSec ? (but multicore / js_interpreter could be a problem)
# --------------------------------------------------
# cURL client setup
# --------------------------------------------------
def setupCurl(maxCurls,error=None):
try:
import pycurl # check it's there
curl_async = pycurl.version_info()[4] & (1 << 7) # CURL_VERSION_ASYNCHDNS
if not curl_async: curl_async = ('c-ares' in pycurl.version or 'threaded' in pycurl.version) # older
if not curl_async:
if error: warn("The libcurl on this system might hold up our main thread while it resolves DNS (try building curl with ./configure --enable-ares)")
else:
del pycurl ; return # TODO: and say 'not using'?
if float('.'.join(pycurl.version.split()[0].split('/')[1].rsplit('.')[:2])) < 7.5:
if error: warn("The curl on this system is old and might hang when fetching certain SSL sites") # strace -p (myPID) shows busy looping on poll (TODO: option to not use it if we're not using upstream_proxy)
_oldCurl = pycurl.Curl
def _newCurl(*args,**kwargs):
c = _oldCurl(*args,**kwargs)
so = c.setopt
def mySetopt(k,v):
so(k,v)
if k==pycurl.PROXY: so(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_0) # workaround 599 "transfer closed with outstanding read data remaining" in Curl 7.55.1 with polipo2 as upstream proxy (TODO: curl-version dependent? 7.43.0 seemed OK in this aspect, although it had the above problem)
c.setopt = mySetopt
return c
pycurl.Curl = _newCurl
curl_max_clients = min(max(maxCurls,10),1000) # constrain curl_max_clients to between 10 and 1000 to work around Tornado issue 2127, and we'll warn about the issue ourselves if we go over:
global curl_inUse_clients ; curl_inUse_clients = 0 # module-level so the nested helpers below can update it
try: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient",max_clients=curl_max_clients)
except: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") # will try in MyAsyncHTTPClient too (different versions of Tornado and all that...) (TODO: if that one also falls back to no max_clients, we might be reduced to 10 and should set curl_max_clients accordingly in order to get appropriate warning messages)
    global MyAsyncHTTPClient,curlFinished # rebind the module-level names to the curl-aware versions below
    def MyAsyncHTTPClient(): # referenced in the comment above: warns when we are about to exhaust curl_max_clients
        try: problem = not len(AsyncHTTPClient()._free_list)
        except:
            global curl_inUse_clients
            curl_inUse_clients += 1
            problem = curl_inUse_clients >= curl_max_clients
        if problem:
            if upstream_rewrite_ssl and not options.ssl_fork: logging.error("curl_max_clients too low; AsyncHTTPClient will queue requests and COULD DEADLOCK due to upstream_rewrite_ssl (try --ssl-fork if you can't increase curl_max_clients)")
            else: logging.info("curl_max_clients too low; AsyncHTTPClient will queue requests")
        try: return AsyncHTTPClient(max_clients=curl_max_clients)
        except: return AsyncHTTPClient() # no max_clients in this Tornado (see comment above)
    def curlFinished(): # for callbacks to call
        global curl_inUse_clients
        curl_inUse_clients -= 1
        if curl_inUse_clients < 0:
            # This shouldn't happen. But if it does, don't let the effect 'run away'.
            curl_inUse_clients = 0
except: # fall back to the pure-Python one
    if error: errExit(error) # (unless it won't do)
try:
    import zlib
    enable_gzip = True # for fetching remote sites
except: # Windows?
    enable_gzip = False
    class zlib:
        def compress(self,s,level): return s
        def decompressobj(self): # (self added: this is called on the instance below)
            class o:
                def decompress(self,s,maxlen): return s
            return o()
    zlib = zlib()
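# (This stub mimics only the zlib calls used elsewhere: zlib.compress(data,level)
# and zlib.decompressobj().decompress(data,maxlen) both become no-ops, so pages
# are simply handled uncompressed when the real zlib is unavailable.)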
# --------------------------------------------------
# Support for showing messages to specific IP addresses
# --------------------------------------------------
try:
import hashlib # Python 2.5+, platforms?
hashlib.md5
except: hashlib = None # (TODO: does this ever happen on a platform that supports Tornado? Cygwin has hashlib with md5)
if hashlib: cookieHash = lambda msg: base64.b64encode(hashlib.md5(B(msg)).digest())[:10]
else: cookieHash = lambda msg: hex(hash(msg))[2:] # this fallback is not portable across different Python versions etc, so no good if you're running a fasterServer
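# (cookieHash only has to produce a short stable token from a message, e.g. for
# the seen-IP-message cookie below, so 10 base64 characters of md5 suffice.)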
ipv4_regexp = re.compile(r'^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$')
def ipv4_to_int(ip):
m = re.match(ipv4_regexp,S(ip))
if m: return (int(m.group(1))<<24) | (int(m.group(2))<<16) | (int(m.group(3))<<8) | int(m.group(4))
else: return None
def ipv4range_to_ints(ip): # (name from the call in ipv4ranges_func below)
ip = S(ip)
if '-' in ip: return tuple(ipv4_to_int(i) for i in ip.split('-'))
elif '/' in ip:
start,bits = ip.split('/')
start = ipv4_to_int(start)
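# (~(-1 << (32-bits)) is the host-bits mask, e.g. 0xff for a /24, so ORing it
# into the start address gives the top of the range)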
return start, start | ~(-1 << (32-int(bits)))
else: return ipv4_to_int(ip),ipv4_to_int(ip)
def ipv4ranges_func(ipRanges_and_results):
isIP = True ; rangeList=None ; fList = []

Silas S. Brown
committed
for field in S(ipRanges_and_results).split('|'):
if isIP: rangeList = [ipv4range_to_ints(i) for i in field.split(',')]
else: fList.append((rangeList,field))
isIP = not isIP
def f(ip):
ipInt = ipv4_to_int(ip)
for rl,result in fList:
if any((l<=ipInt<=h) for l,h in rl):
return result # else None
return f
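# Example: ipv4ranges_func("127.0.0.1|loopback-msg|10.0.0.0/8|intranet-msg")
# returns a function f with f("10.1.2.3")=="intranet-msg" and f("8.8.8.8")==None
# (fields alternate between comma-separated ranges and the result to return).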
# --------------------------------------------------
# Service routines for CONNECT passing to SSL terminator
# --------------------------------------------------
def myRepr(d): # short representation of data for debug_connections messages (name taken from the calls in writeAndClose below)
    if re.search(B("[\x00-\x09\x0e-\x1f]"),B(d)): return "%d bytes" % len(d)
    else: return repr(d)
def peerName(socket):
    try: return socket.getpeername()
    except: return "(no socket??)"
def writeAndClose(stream,data):
if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket)+" and closing it")
try: stream.write(data,lambda *args:True)
except: pass # ignore errors like client disconnected
if not stream.closed():
try: stream.close()
except: pass
def writeOrError(opposite,name,stream,data): # (signature inferred from the body: 'opposite' is the paired stream to shut down on error)
    if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket))
    try: stream.write(data)
    except:
        if name and not hasattr(stream,"writeOrError_already_complained"): logging.error("Error writing data to "+name)
        stream.writeOrError_already_complained = True
        try: stream.close()
        except: pass
        try: opposite.close() # (try to close the server stream we're reading if the client has gone away, and vice versa)
        except: pass
# --------------------------------------------------
# Miscellaneous variables
# --------------------------------------------------
cookieExpires = "Tue Jan 19 03:14:07 2038" # TODO: S2G (may have to switch to Max-Age and drop support for ~IE8)
set_window_onerror = False # for debugging Javascript on some mobile browsers (TODO make this a config option? but will have to check which browsers do and don't support window.onerror)
# Domain-setting cookie for when we have no wildcard_dns and no default_site:
adjust_domain_cookieName = "_adjusterDN_"
adjust_domain_none = B("0") # not a valid top-level domain (TODO hopefully no user wants this as a local domain...)
enable_adjustDomainCookieName_URL_override = True # TODO: document this! (Allow &_adjusterDN_=0 or &_adjusterDN_=wherever in bookmark URLs, so it doesn't matter what setting the cookie has when the bookmark is activated)
seen_ipMessage_cookieName = "_adjusterIPM_"
htmlmode_cookie_name = "_adjustZJCG_" # zap JS, CSS and Graphics
password_cookie_name = "_pxyAxsP_" # "proxy access password". have to pick something that's unlikely to collide with a site's cookie
redirectFiles_Extensions=set("pdf epub mp3 aac zip gif png jpeg jpg exe tar tgz tbz ttf woff swf txt doc rtf midi mid wav ly c h py".split()) # TODO: make this list configurable + maybe add a "minimum content length before it's worth re-directing" option
# --------------------------------------------------
# Server-side Javascript execution support
# --------------------------------------------------
class WebdriverWrapper:
"Wrapper for webdriver that might or might not be in a separate process without shared memory"
def __init__(self): self.theWebDriver = self.tempDirToDelete = None
def new(self,*args): # "new" command (see WebdriverWrapperController.new below); args[0] is an instance number used in the TMPDIR name
    try: # No coredump, for emergency_zap_pid_and_children
        import resource ; resource.setrlimit(resource.RLIMIT_CORE,(0,0))
    except: pass # oh well, have coredumps then :-(

Silas S. Brown
committed
if options.js_multiprocess:
# we have a whole process to ourselves (not just a thread)
# so we can set the environment here.
# Selenium doesn't always clean up temporary files on exit
# (especially with Firefox), so let's set TMPDIR uniquely so we
# can clean them ourselves.
tmp = os.environ.get("TMPDIR",None)
self.tempDirToDelete=os.environ['TMPDIR']=os.environ.get("TMPDIR","/tmp")+"/"+str(os.getpid())+"."+str(args[0])
try: os.mkdir(self.tempDirToDelete)
except: pass
else: tmp = self.tempDirToDelete = None
self.theWebDriver = get_new_webdriver(*args) # (assumed: the driver is created here, between the TMPDIR setup above and its restoration below)
if tmp: os.environ["TMPDIR"] = tmp
elif options.js_multiprocess: del os.environ["TMPDIR"]
def getTmp(self,*args): return self.tempDirToDelete
def quit(self,*args):
if not self.theWebDriver: return
try: pid = self.theWebDriver.service.process.pid
except: pid = debuglog("WebdriverWrapper: Unable to get self.theWebDriver.service.process.pid")
try: self.theWebDriver.quit()
except: debuglog("WebdriverWrapper: exception on quit") # e.g. sometimes get 'bad fd' in selenium's send_remote_shutdown_command _cookie_temp_file_handle
# Try zapping the process ourselves anyway (even if theWebDriver.quit DIDN'T return error: seems it's sometimes still left around. TODO: this could have unexpected consequences if the system's pid-reuse rate is excessively high.)
self.theWebDriver = None
emergency_zap_pid_and_children(pid)
if self.tempDirToDelete: emergency_zap_tempdir(self.tempDirToDelete)
def current_url(self):
try: return self.theWebDriver.current_url
except: return "" # PhantomJS Issue #13114: unconditional reload for now
def get(self,url):
self.theWebDriver.get(S(url))
if options.logDebug:
try:
for e in self.theWebDriver.get_log('browser'): print ("webdriver log: "+repr(e)) # (log-entry format varies by browser, so just repr it)
except Exception as e: print ("webdriver get_log exception: "+repr(e))
def execute_script(self,script): self.theWebDriver.execute_script(S(script))
def click_id(self,clickElementID): self.theWebDriver.find_element_by_id(S(clickElementID)).click()
def click_xpath(self,xpath): self.theWebDriver.find_element_by_xpath(S(xpath)).click()
def click_linkText(self,clickLinkText): self.theWebDriver.find_element_by_link_text(S(clickLinkText)).click()
def getu8(self):
def f(switchBack):
src = self.theWebDriver.find_element_by_xpath("//*").get_attribute("outerHTML")
if options.js_frames:
for el in ['frame','iframe']:
for frame in self.theWebDriver.find_elements_by_tag_name(el):
self.theWebDriver.switch_to.frame(frame)
src += f(switchBack+[frame])
self.theWebDriver.switch_to.default_content()
for fr in switchBack: self.theWebDriver.switch_to.frame(fr)
return src
return B(f([]))
def getpng(self): # "getpng" command (see controller below): full-page screenshot as PNG bytes
    if options.js_interpreter=="HeadlessChrome": # resize not needed for PhantomJS (but PhantomJS is worse at font configuration and is no longer maintained)
        self.theWebDriver.set_window_size(js_size[0],min(16000,int(self.theWebDriver.execute_script("return document.body.parentNode.scrollHeight")))) # TODO: check the 16000: what is Selenium's limit? (confirmed over 8000)
        time.sleep(1)
    png = self.theWebDriver.get_screenshot_as_png() # (assumed: the capture itself, before the window is resized back)
    if options.js_interpreter=="HeadlessChrome": self.theWebDriver.set_window_size(*js_size)
try: # can we optimise the screenshot image size?
from PIL import Image

Silas S. Brown
committed
s = BytesIO() ; Image.open(BytesIO(png)).save(s,'png',optimize=True) # (BytesIO for the input too: png is bytes)
png = s.getvalue()
except: pass # just return non-optimized
return png
def emergency_zap_pid_and_children(pid):
if not pid: return
try:
import psutil # (import assumed to live in this try: the except below says 'no psutil')
for c in psutil.Process(pid).children(recursive=True):
    try: c.kill() # psutil's kill() takes no signal argument; it always sends SIGKILL
    except: pass
except: pass # no psutil, or process already gone
try: os.kill(pid,9),os.waitpid(pid, 0) # waitpid is necessary to clear it from the process table, but we should NOT use os.WNOHANG, as if we do, there's a race condition with the os.kill taking effect (even -9 isn't instant)
except: pass # process may already be gone
def emergency_zap_tempdir(d):
try:
for f in os.listdir(d): emergency_zap_tempdir(d+os.sep+f)
os.rmdir(d)
except: unlink(d)
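# (recursive delete: listdir raises on a plain file, in which case the except
# branch unlinks it; directories are emptied recursively and then rmdir'd)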
try: from selenium.common.exceptions import TimeoutException
except: # no Selenium or wrong version
class TimeoutException(Exception): pass # placeholder
class SeriousTimeoutException(Exception): pass
def webdriverWrapper_receiver(pipe,timeoutLock):
"Command receiver for WebdriverWrapper for when it's running over IPC (--js-multiprocess). Receives (command,args) and sends (return,exception), releasing the timeoutLock whenever it's ready to return."
CrossProcessLogging.initChild()
try: w = WebdriverWrapper()
except KeyboardInterrupt: return
while True: # command-dispatch loop (structure assumed from the (cmd,args)->(ret,exc) protocol in WebdriverWrapperController.send below)
    try: cmd,args = pipe.recv()
    except (EOFError,KeyboardInterrupt): # shutdown
        try: pipe.send(("INT","INT")) # sentinel the controller checks for
        except: pass
        return
    if cmd=="EOF": return pipe.close()
    try: ret,exc = getattr(w,cmd)(*args),None
    except Exception as e:
        p = find_adjuster_in_traceback()
        if p: # see if we can add it to the message (note p will start with ", " so no need to add a space before it)
            try:
                if hasattr(e,"msg") and e.msg: e.msg += p # should work with WebDriverException
                elif type(e.args[0])==str: e.args=(repr(e.args[0])+p,) + tuple(e.args[1:]) # should work with things like httplib.BadStatusLine that are fussy about the number of arguments they get
                else: e.args += (p,) # works with things like KeyError (although so should the above)
            except: e.message += p # works with base Exception
        ret,exc = None,e
    try: timeoutLock.release()
    except: pass # (may fail if controller's timeoutLock is turned off during quit_wd_atexit)
    try: pipe.send((ret,exc))
    except: pass # if they closed it, we'll get EOFError on next iteration
class WebdriverWrapperController:
"Proxy for WebdriverWrapper if it's running over IPC"
def __init__(self):
self.pipe, cPipe = multiprocessing.Pipe()
self.timeoutLock = multiprocessing.Lock()
self.process = multiprocessing.Process(target=webdriverWrapper_receiver,args=(cPipe,self.timeoutLock))
self.process.start()
def send(self,cmd,args=()):
"Send a command to a WebdriverWrapper over IPC, and either return its result or raise its exception in this process. Also handle the raising of SeriousTimeoutException if needed, in which case the WebdriverWrapper should be stopped."
try:
if not self.timeoutLock.acquire(timeout=0):
logging.error("REALLY serious SeriousTimeout (should never happen). Lock unavailable before sending command.")
raise SeriousTimeoutException()
except AttributeError: pass # self.timeoutLock==None because quit(final=True) called from another thread
try: self.pipe.send((cmd,args))
except IOError: return # already closed
if cmd=="EOF":
return self.pipe.close() # no return code
try:
    if not self.timeoutLock.acquire(timeout=options.js_timeout2): # fallback in case Selenium timeout doesn't catch it (signal.alarm in the child process isn't guaranteed to help, so catch it here)
        try: logging.error("SeriousTimeout: WebdriverWrapper process took over "+str(options.js_timeout2)+"s to respond to "+repr((cmd,args))+". Emergency restarting this process.")
        except: pass # absolutely do not throw anything except SeriousTimeoutException from this branch
        raise SeriousTimeoutException()
    self.timeoutLock.release()
except AttributeError: return # self.timeoutLock==None because quit(final=True) called from another thread
ret,exc = self.pipe.recv()
if ret==exc=="INT": return self.pipe.close()
if exc: raise exc
else: return ret
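# Protocol recap: send() writes (cmd,args) down the pipe and blocks on
# timeoutLock until the receiver answers with (ret,exc); "EOF" closes the
# pipe with no reply, and ("INT","INT") signals the child was interrupted.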
def new(self,*args):
self.send("new",args)
self.tempDirToDelete=self.send("getTmp")
def quit(self,final=False):
    if final: self.timeoutLock = None # quit_wd_atexit could plausibly run while another thread's still processing its last command, so allow these commands to be queued in the pipe from another thread without worrying about timeout when that happens
    self.send("quit")
    if final: self.send("EOF") # then close the pipe (assumed order: quit the webdriver first, then end the receiver loop)
def current_url(self): return self.send("current_url")
def get(self,url): return self.send("get",(url,))
def execute_script(self,script): self.send("execute_script",(script,))
def click_id(self,clickElementID): self.send("click_id",(clickElementID,))
def click_xpath(self,xpath): self.send("click_xpath",(xpath,))
def click_linkText(self,clickLinkText): self.send("click_linkText",(clickLinkText,))
def getu8(self): return self.send("getu8")
def getpng(self): return self.send("getpng")
try: import multiprocessing # Python 2.6
except: multiprocessing = None
class WebdriverRunner: # (class name from init_webdrivers below)
    "Manage a WebdriverWrapperController (or a WebdriverWrapper if we're not using IPC) from a thread of the main process"
def __init__(self,start=0,index=0):
self.start,self.index = start,index
if options.js_multiprocess: self.wrapper = WebdriverWrapperController()
else: self.wrapper = WebdriverWrapper()
self.renew_webdriver_newThread(True) # sets wd_threadStart
def renew_controller(self): # SeriousTimeoutException
emergency_zap_pid_and_children(self.wrapper.process.pid)
emergency_zap_tempdir(self.wrapper.tempDirToDelete)
self.wrapper = WebdriverWrapperController() # start a fresh receiver process (assumed: 'renew' implies re-creating the controller after zapping the old one)
def renew_webdriver_sameThread(self,firstTime=False):
while True:
    try:
        self.wrapper.quit(),self.wrapper.new(self.start+self.index,not firstTime)
        break
    except:
        logging.error("Exception "+exc_logStr()+" while renewing webdriver, retrying")
        time.sleep(1) # just in case
self.usageCount = 0 # (assumed: js_restartAfter counts fetches since the last renewal)
def renew_webdriver_newThread(self,firstTime=False):
self.wd_threadStart = time.time() # cleared in _renew_wd after renew_webdriver_sameThread returns (it loops on exception)
threading.Thread(target=_renew_wd,args=(self,firstTime)).start() ; return
def quit_webdriver(self): self.wrapper.quit(final=True)
def fetch(self,url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,tooLate):
assert not self.wd_threadStart # webdriver_checkServe
self.wd_threadStart = time.time() # cleared in wd_fetch after _wd_fetch returns or throws + possible renew-loop (TODO: if wd_fetch ITSELF somehow throws an exception, should be logged but this JS instance gets tied up until next adjuster restart)
self.maybe_stuck = False
threading.Thread(target=wd_fetch,args=(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,self,tooLate)).start()
def current_url(self): return self.wrapper.current_url()
def get(self,url): return self.wrapper.get(url)
def execute_script(self,script): self.wrapper.execute_script(script)
def click_id(self,clickElementID): self.wrapper.click_id(clickElementID)
def click_xpath(self,xpath): self.wrapper.click_xpath(xpath)
def click_linkText(self,clickLinkText): self.wrapper.click_linkText(clickLinkText)
def getu8(self): return self.wrapper.getu8()
def getpng(self): return self.wrapper.getpng()
def _renew_wd(wd,firstTime):
wd.renew_webdriver_sameThread(firstTime)
wd.wd_threadStart = False
IOLoopInstance().add_callback(webdriver_checkServe)
def find_adjuster_in_traceback(): # (name from the calls in wd_fetch and exc_logStr)
    ei = sys.exc_info()
    try:
        p = ei[1].args[-1]
        if "adjuster line" in p: return p # for webdriverWrapper_receiver
    except: pass
    try: __file__
    except: return "" # sometimes not defined ??
    l = traceback.extract_tb(ei[2]) # (assumed: l is the extracted traceback, giving the l[i][0] filename / l[i][1] line number used below)
    for i in xrange(len(l)-1,-1,-1): # innermost adjuster frame first
        if __file__ in l[i][0]: return ", adjuster line "+str(l[i][1])
    return ""
def wd_fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,manager,tooLate):
url = S(url)
need_restart = False
def errHandle(error,extraMsg,prefetched):
    if prefetched: toRet = "non-webdriver page (js_fallback set)"
    else:
        toRet = "error"
        prefetched = wrapResponse("webdriver "+error)
    logging.error(extraMsg+" returning "+toRet) # (assumed: this is where extraMsg and toRet get logged)
    if options.js_fallback:
        try:
            prefetched.headers.add(options.js_fallback,error)
        except: logging.error("Could not add "+repr(options.js_fallback)+" to error response")
    return prefetched
try:
    r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
    try:
        if options.js_fallback: r.headers.add(options.js_fallback,"OK")
    except: pass
except TimeoutException: r = errHandle("timeout","webdriver "+str(manager.start+manager.index)+" timeout fetching "+url+find_adjuster_in_traceback()+"; no partial result, so",prefetched) # "webdriver timeout" sent to browser (can't include url here: domain gets rewritten)
except SeriousTimeoutException:
    r = errHandle("serious timeout","lost communication with webdriver "+str(manager.start+manager.index)+" when fetching "+url+"; no partial result, so",prefetched)
    need_restart = "serious"
except:
    logging.info("webdriver error fetching "+url+" ("+exc_logStr()+"); restarting webdriver "+str(manager.start+manager.index)+" for retry") # usually a BadStatusLine
    if options.js_retry: # (assumed option gating the retry: the 'no retry' branch below needs a matching if)
        manager.renew_webdriver_sameThread() # (assumed: the log line above says we restart before retrying)
        if tooLate(): r = errHandle("err","too late")
        else:
            try:
                r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
                try:
                    if options.js_fallback: r.headers.add(options.js_fallback,"OK")
                except: pass
            except SeriousTimeoutException:
                r = errHandle("serious timeout","webdriver serious timeout on "+url+" after restart, so re-restarting and",prefetched)
                need_restart = "serious"
            except:
                r = errHandle("error","webdriver error on "+url+" even after restart, so re-restarting and",prefetched)
                need_restart = True
    else: # no retry
        r = errHandle("error","webdriver error on "+url+", so restarting and",prefetched)
        need_restart = True
IOLoopInstance().add_callback(lambda *args:callback(r))
if need_restart or (options.js_restartAfter and manager.usageCount >= options.js_restartAfter):
if need_restart=="serious":manager.renew_controller()
manager.renew_webdriver_sameThread()
else: manager.finishTime = time.time()
manager.wd_threadStart = manager.maybe_stuck = False
IOLoopInstance().add_callback(webdriver_checkServe)
def exc_logStr():
toLog = sys.exc_info()[:2]
if hasattr(toLog[1],"msg") and toLog[1].msg: toLog=(toLog[0],toLog[1].msg) # for WebDriverException
return repr(toLog)+find_adjuster_in_traceback()
def _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot): # single-user only! (and relies on being called only in htmlOnlyMode so leftover Javascript is removed and doesn't double-execute on JS-enabled browsers)
import tornado.httputil ; url = S(url)
currentUrl = S(manager.current_url())
manager.usageCount += 1 # (assumed: js_restartAfter and webdriver_checkRenew both read this per-fetch counter)
timed_out = False
if prefetched or not re.sub('#.*','',currentUrl) == url:
if prefetched:
debuglog("webdriver %d get about:blank" % (manager.start+manager.index))
manager.get("about:blank") # ensure no race condition with current page's XMLHttpRequests
webdriver_inProgress[manager.index].clear() # race condition with start of next 'get' if we haven't done about:blank, but worst case is we'll wait a bit too long for page to finish
debuglog(("webdriver %d get " % (manager.start+manager.index))+url)
try: manager.get(url) # waits for onload
except TimeoutException:
# we might have got SOMEthing (e.g. on a page bringing in hundreds of scripts from a slow server, but still running some of them before the timeout)
# May also be "Received error page"
if currentUrl == S(manager.current_url()):
debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - URL unchanged at "+currentUrl)
raise # treat as "no partial result"
debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - extracting partial")
timed_out = True # (needed by the checks below; see 'timed_out = False' above)
if not timed_out:
debuglog(("webdriver %d loaded " % (manager.start+manager.index))+url)
# we want to double-check XMLHttpRequests have gone through (TODO: low-value setTimeout as well? TODO: abort this early if currentUrl has changed and we're just going to issue a redirect? but would then need to ensure it's finished if client comes back to same instance that's still running after it follows the redirect)
if options.js_reproxy:
    wasActive = True # (assumed initial value: require two consecutive quiet polls before breaking)
    for _ in xrange(40): # up to 8+ seconds in steps of 0.2 (on top of the initial load)
        time.sleep(0.2) # unconditional first-wait hopefully long enough to catch XMLHttpRequest delayed-send, very-low-value setTimeout etc, but we don't want to wait a whole second if the page isn't GOING to make any requests (TODO: monitor the js going through the upstream proxy to see if it contains any calls to this? but we'll have to deal with js_interpreter's cache, unless set it to not cache and we cache upstream)
        active = webdriver_inProgress[manager.index]
        if not active and not wasActive: break # TODO: wait longer than 0.2-0.4 to see if it restarts another request?
        wasActive = active
else: time.sleep(1) # can't do much if we're not reproxying, so just sleep 1sec and hope for the best
if (clickElementID or clickLinkText) and not timed_out:
    try: # (a try is needed for the 'find_element exception ignored' handler below)
        manager.execute_script("window.open = window.confirm = function(){return true;}") # in case any link has a "Do you really want to follow this link?" confirmation (webdriver default is usually Cancel), or has 'pop-under' window (TODO: switch to pop-up?)
        if clickElementID: manager.click_id(clickElementID)
        if clickLinkText:
            if not type(clickLinkText)==type(u""): clickLinkText=clickLinkText.decode('utf-8')
            if not '"' in clickLinkText: manager.click_xpath(u'//a[text()="'+clickLinkText+'"]')
            elif not "'" in clickLinkText: manager.click_xpath(u"//a[text()='"+clickLinkText+"']")
            else: manager.click_linkText(clickLinkText) # least reliable
        time.sleep(0.2) # TODO: more? what if the click results in fetching a new URL, had we better wait for XMLHttpRequests to finish? (loop as above but how do we know when they've started?) currentUrl code below should at least show us the new URL even if it hasn't finished loading, and then there's a delay while the client browser is told to fetch it, but that might not be enough
    except: debuglog("js_links find_element exception ignored",False)
    currentUrl = None
if currentUrl == None: # we need to ask for it again
currentUrl = manager.current_url()
if not currentUrl: currentUrl = url # PhantomJS Issue #13114: relative links after a redirect are not likely to work now
if S(currentUrl) == "about:blank":
debuglog("got about:blank instead of "+S(url))
return wrapResponse("webdriver failed to load") # don't return an actual redirect to about:blank, which breaks some versions of Lynx
debuglog("Getting data from webdriver %d (current_url=%s)" % (manager.start+manager.index,S(currentUrl)))
if asScreenshot: return wrapResponse(manager.getpng(),tornado.httputil.HTTPHeaders.parse("Content-type: image/png"),200)
body = get_and_remove_httpequiv_charset(manager.getu8())[1]
if timed_out: manager.get("about:blank") # as the timeout might have been due to a hard-locked script, so interrupting it should save some CPU
if not re.sub(B('#.*'),B(''),B(currentUrl)) == B(url): # we have to ignore anything after a # in this comparison because we have no way of knowing (here) whether the user's browser already includes the # or not: might send it into a redirect loop
# If we redirect, and if we have more than one user session active (and especially if we're multicore) then the second request might not come back to the same webdriver instance (or even the same adjuster process, so we can't even cache it unless shared), and reload is bad, so try to avoid redirect if possible.
# We could set 'base href' instead, seeing as 'document.location' does not have to be right on the user's side as we've already executed the site's scripts here (unless the user has any extensions that require it to be right). Don't use Content-Location header: not all browsers support + might cause caches to tread POST requests as invariant.
# Any in-document "#" links will cause a reload if 'base href' is set, but at least we won't have to reload UNLESS the user follows such a link.
if htmlFind(body,"<base href=") >= 0:
pass # if it already has a <base href> we can leave it as-is, since it won't matter from which URL it was served
else: return wrapResponse(addToHead(body,B('<base href="')+re.sub(B('#.*'),B(''),B(currentUrl))+B('">')),tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
return wrapResponse(body,tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
def get_new_webdriver(index,renewing=False): # (called by test_init_webdriver and WebdriverWrapper.new)
if options.js_interpreter in ["HeadlessChrome","Chrome"]:
return get_new_Chrome(index,renewing,options.js_interpreter=="HeadlessChrome")
elif options.js_interpreter in ["HeadlessFirefox","Firefox"]:
return get_new_Firefox(index,renewing,options.js_interpreter=="HeadlessFirefox")
else: return get_new_PhantomJS(index,renewing)
def get_new_Chrome(index,renewing,headless):
log_complaints = (index==0 and not renewing)
from selenium.webdriver.chrome.options import Options
# TODO: can set opts.binary_location if needed (e.g. for chromium, if distro's linking doesn't work)
opts = Options()
if headless:
opts.add_argument("--headless")
opts.add_argument("--disable-gpu")
# Specify user-data-dir ourselves, further to Chromium bug 795 comment 12. Include username and port (in case others are running or have run adjuster) as well as index.
global myUsername
try: myUsername
except NameError:
try: import getpass
except ImportError: getpass = None
if getpass: myUsername = S(getpass.getuser())
else: myUsername = "" # (fallback assumed so the user-data-dir path below still works)
extra = ""
while True: # might be restarting from a corrupted user-data-dir state; in worst case might not even be able to cleanly remove it (TODO: what if some processes associated with an older instance somehow took a while to go away and still have named referenc to previous path: increment counter unconditionally? still rm the old one)
path = writable_tmpdir()+"hChrome-"+myUsername+str(options.port)+"."+str(index)+extra
if not os.path.exists(path): break
import shutil ; shutil.rmtree(path,True)
if not os.path.exists(path): break
if extra: extra="-"+str(int(extra[1:])+1)
else: extra = "-0"
opts.add_argument("--user-data-dir="+path)
opts.add_argument("--incognito") # reduce space taken up by above
dc = None # (default assumed so the instantiation calls below work when js_reproxy is off)
if options.js_reproxy: # (assumed gate: the elif below parallels the Firefox version)
    opts.add_argument("--proxy-server=127.0.0.1:%d" % proxyPort(index))
    opts.add_argument("--ignore-certificate-errors") # --ignore-certificate-errors is ignored by Chrome 59 (which was the first version to support Headless) and possibly some earlier versions, but we'll put it in just in case somebody runs an ancient non-headless Chrome in an offline experiment
    opts.add_argument("--allow-insecure-localhost") # Chrome 62+ can at least do *.localhost & 127.* but we'd need to domain-rewrite for this to help (proxy-host doesn't count)
    # Chrome 65 and chromedriver 2.35/2.36? can do:
    dc = wd_DesiredCapabilities(log_complaints)
    if dc:
        dc = dc.CHROME.copy()
        dc['acceptInsecureCerts'] = True
elif options.upstream_proxy: opts.add_argument('--proxy-server='+options.upstream_proxy)
if options.logDebug: opts.add_argument("--verbose")
if options.js_UA and not options.js_UA.startswith("*"): opts.add_argument("--user-agent="+options.js_UA)
if not options.js_images: opts.add_experimental_option("prefs",{"profile.managed_default_content_settings.images":2})
# TODO: do we need to disable Javascript's ability to open new windows and tabs, plus target="_blank" etc, especially if using clickElementID?
if options.via and not options.js_reproxy and log_complaints:
# Oops: how can we put in a Via: header if we don't
# have an upstream proxy to do so? unless you want
# to implement a Chrome extension to do it (TODO?)
warn("--via ignored when running Chrome without --js-reproxy")
if js_size: opts.add_argument("--window-size=%d,%d" % js_size)
if dc: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts,desired_capabilities=dc)
else: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts)
if options.js_reproxy:
chromeVersion = int(p.capabilities['version'].split(".")[0])
if 59 <= chromeVersion < 65:
if [int(x) for x in p.capabilities['chrome']['chromedriverVersion'].split(".",2)[:2]] < [2,35]: extrawarn = " (and chromedriver 2.35+)"
else: extrawarn = ""
warn("This version of Chrome will hang when used with js_reproxy on https pages. Try upgrading to Chrome 65+"+extrawarn) # TODO: is 59 really the first version to drop --ignore-certificate-errors ?
elif chromeVersion >= 65 and not p.capabilities.get('acceptInsecureCerts',False): warn("This version of chromedriver will hang when used with js_reproxy on https pages. Your Chrome is new enough, but your chromedriver is not. Try downloading chromedriver 2.35/36+")
try: p.set_page_load_timeout(options.js_timeout1) # (same pattern as the Firefox/PhantomJS versions below)
except: logging.info("Couldn't set HeadlessChrome page load timeout")
return p
def get_new_Firefox(index,renewing,headless):
if headless:
os.environ['MOZ_HEADLESS'] = '1' # in case -headless not yet working
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
log_complaints = (index==0 and not renewing) ; op = None
profile = FirefoxProfile() # (needed by the set_proxy / set_preference calls below)
if options.js_reproxy:
from selenium.webdriver.common.proxy import Proxy,ProxyType
import warnings
warnings.filterwarnings("ignore","This method has been deprecated. Please pass in the proxy object to the Driver Object") # set_proxy deprecated, but documentation unclear how it should be replaced
profile.set_proxy(Proxy({'proxyType':ProxyType.MANUAL,'httpProxy':"127.0.0.1:%d" % proxyPort(index),'sslProxy':"127.0.0.1:%d" % proxyPort(index),'ftpProxy':'','noProxy':''}))
profile.accept_untrusted_certs = True # needed for some older versions?
caps = wd_DesiredCapabilities(log_complaints)
if caps:
caps = caps.FIREFOX.copy()
caps['acceptInsecureCerts'] = True
caps['acceptSslCerts'] = True # older versions
elif options.upstream_proxy: profile.set_proxy(options.upstream_proxy)
if options.js_UA and not options.js_UA.startswith("*"): profile.set_preference("general.useragent.override",options.js_UA)
if not options.js_images: profile.set_preference("permissions.default.image", 2)
if options.via and not options.js_reproxy and log_complaints:
# Oops: how can we put in a Via: header if we don't
# have an upstream proxy to do so? unless you want
# to implement a Firefox extension to do it (TODO?)
warn("--via ignored when running Firefox without --js-reproxy")
# TODO: do any other options need to be set? disable plugins, Firefox-update prompts, new windows/tabs with JS, etc? or does Selenium do that?
if options.logDebug: binary=FirefoxBinary(log_file=sys.stderr) # TODO: support logDebug to a file as well
else: binary=FirefoxBinary()
if headless: cmdL = ('-headless','-no-remote')
else: cmdL = ('-no-remote',)
if js_size: cmdL += ("-width",str(js_size[0]),"-height",str(js_size[1]))
cmdL += ("about:blank",) # not Firefox start page
binary.add_command_line_options(*cmdL) # cannot call this more than once
if caps: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary,capabilities=caps)
else: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary)
try: p.set_page_load_timeout(options.js_timeout1) # (same pattern as the PhantomJS version below)
except: logging.info("Couldn't set Firefox page load timeout")
return p
block_headless_firefox = [
# servers that Firefox tries to CONNECT to on startup
"push.services.mozilla.com","snippets.cdn.mozilla.net","firefox.settings.services.mozilla.com","location.services.mozilla.com","shavar.services.mozilla.com",
"aus5.mozilla.org","ftp.mozilla.org",
"fonts.googleapis.com", # Fedora version of Firefox connects to this
# "start.fedoraproject.org","fedoraproject.org", # Fedora version of Firefox does this (but what if user actually wants to view one of those pages?)
]
def wd_DesiredCapabilities(log_complaints):
try:
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
return DesiredCapabilities
except:
if log_complaints: warn("Your Selenium installation is too old to set DesiredCapabilities.\nThis is likely to stop some js options from working properly.")
return None
def wd_instantiateLoop(wdClass,index,renewing,**kw):
debuglog("Instantiating "+wdClass.__name__+" "+repr(kw))
if 'chrome_options' in kw:
try: newChromedriver = 'options' in getargspec(webdriver.chrome.webdriver.WebDriver.__init__).args
except: newChromedriver = False
if newChromedriver:
kw['options'] = kw['chrome_options']
del kw['chrome_options']
if not renewing: time.sleep(min(2*(index % js_per_core),int(options.js_timeout2 / 2))) # try not to start them all at once at the beginning (may reduce chance of failure)
p = None
while p == None: # retry loop: the error handler below says "retrying in 2sec"
    try:
        if wdClass==webdriver.Chrome: p = wdClass(getoutput("which chromedriver 2>/dev/null"),**kw) # some versions need to be told explicitly where chromedriver is, rather than looking in PATH themselves, in order to get "wrong version" errors etc (otherwise errors ignored, Selenium looks for a different chromedriver and gives a slightly confusing error about 'none found' rather than the error you should have seen about 'wrong version')
        else: p = wdClass(**kw) # (assumed: the non-Chrome case takes just the keyword arguments)
        if not p.capabilities: raise Exception("Didn't seem to get a p.capabilities")
        elif 'browserVersion' in p.capabilities:
            # Selenium 2.x calls it version, but Selenium
            # 3.x calls it browserVersion. Map this back
            # to 'version' for our other code.
            p.capabilities['version'] = p.capabilities['browserVersion']
        elif not 'version' in p.capabilities: raise Exception("capabilities has no version: "+repr(p.capabilities.items()))
    except:
        logging.error("Unhandled exception "+exc_logStr()+" when instantiating webdriver %d, retrying in 2sec" % index)
        time.sleep(2) ; p = None
return p
def _get_new_PhantomJS(index,renewing): # (wrapped by get_new_PhantomJS below)
    log_complaints = (index==0 and not renewing)
    os.environ["QT_QPA_PLATFORM"]="offscreen"
    sa = [] # PhantomJS service_args (assumed to start empty)
    # if options.logDebug: sa.append("--debug=true") # doesn't work: we don't see the debug output on stdout or stderr
    if options.js_reproxy: sa.append('--proxy=127.0.0.1:%d' % proxyPort(index))
    elif options.upstream_proxy: sa.append('--proxy='+options.upstream_proxy)
dc = wd_DesiredCapabilities(log_complaints)
if dc:
dc = dict(dc.PHANTOMJS)
if options.js_UA and not options.js_UA.startswith("*"): dc["phantomjs.page.settings.userAgent"]=options.js_UA
if not options.js_images: dc["phantomjs.page.settings.loadImages"]=False
dc["phantomjs.page.settings.javascriptCanOpenWindows"]=dc["phantomjs.page.settings.javascriptCanCloseWindows"]=False # TODO: does this cover target="_blank" in clickElementID etc (which could have originated via DOM manipulation, so stripping them on the upstream proxy is insufficient; close/restart the driver every so often?)
if options.via and not options.js_reproxy: dc["phantomjs.page.customHeaders.Via"]="1.0 "+convert_to_via_host("")+" ("+viaName+")" # customHeaders works in PhantomJS 1.5+ (TODO: make it per-request so can include old Via headers & update protocol version, via_host + X-Forwarded-For; will webdriver.DesiredCapabilities.PHANTOMJS[k]=v work before a request?) (don't have to worry about this if js_reproxy)
return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,desired_capabilities=dc,service_args=sa)
else: return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,service_args=sa)
def get_new_PhantomJS(index,renewing=False):
wd = _get_new_PhantomJS(index,renewing)
log_complaints = (index==0 and not renewing)
try: is_v2 = wd.capabilities['version'].startswith("2.")
except: is_v2 = False
if is_v2: warn("You may be affected by PhantomJS issue #13114.\nRelative links may be wrong after a redirect if the site sets Content-Security-Policy.\nTry --js_reproxy, or downgrade your PhantomJS to version 1.9.8")
try: wd.set_window_size(*js_size)
except: logging.info("Couldn't set PhantomJS window size")
try: wd.set_page_load_timeout(options.js_timeout1)
except: logging.info("Couldn't set PhantomJS page load timeout")
return wd
def proxyPort(index): return port_randomise.get(js_proxy_port[index],js_proxy_port[index])
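# (port_randomise is consulted so that, when port randomisation is in use, each
# js_proxy_port entry maps to the port actually bound; otherwise the nominal
# port is returned unchanged.)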
def test_init_webdriver():
"Check that we CAN start a webdriver, before forking to background and starting all of them"
sys.stderr.write("Checking webdriver configuration... "),sys.stderr.flush()
get_new_webdriver(0).quit()
sys.stderr.write("OK\n")
def init_webdrivers(start,N):
informing = not options.background and not start and not (options.multicore and options.ssl_fork) # (if ssl_fork, we don't want the background 'starting N processes' messages to be interleaved with this)
if informing:
sys.stderr.write("Starting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
for i in xrange(N):
webdriver_runner.append(WebdriverRunner(start,len(webdriver_runner)))
webdriver_via.append(None) ; webdriver_UA.append("")
def quit_wd_atexit(*args): # (name from the quitFuncToCall assignment below)
    if informing: sys.stderr.write("Quitting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
    try:
        for i in webdriver_runner:
            try: i.quit_webdriver()
            except: pass
    except: pass
    if informing: sys.stderr.write("done\n")
global quitFuncToCall ; quitFuncToCall = quit_wd_atexit # don't use the real atexit, as we have our own thread-stop logic which might kick in first, leaving a stuck adjusterWDhelp process if js_multiprocess==True, and additionally holding up calling process if --stdio is in use (fixed in v0.2795)
if options.js_restartMins and not options.js_restartAfter==1: IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
if informing: sys.stderr.write("done\n")
webdriver_maxBusy = 0
def webdriver_allBusy(): # (name, and the webdriver_maxBusy side-effect, are referenced in webdriver_checkServe below)
    busyNow = sum(1 for i in webdriver_runner if i.wd_threadStart)
    global webdriver_maxBusy
    webdriver_maxBusy = max(webdriver_maxBusy,busyNow)
    return busyNow == len(webdriver_runner)
def webdriver_checkServe(*args):
# how many queue items can be served right now?
# (called on IOLoop thread when new item added, or when
# a server is finished)
debuglog("Entering webdriver_checkServe, runners=%d" % len(webdriver_runner))
for i in xrange(len(webdriver_runner)):
if not webdriver_queue: break # just to save a little
if webdriver_runner[i].wd_threadStart: continue # busy: fetch() asserts wd_threadStart is clear, so skip this instance
url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate = webdriver_queue.pop(0)
if tooLate():
debuglog("tooLate() for "+url)
continue
debuglog("Starting fetch of "+url+" on webdriver instance "+str(i+webdriver_runner[i].start))
webdriver_runner[i].fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,tooLate)
global webdriver_mu ; webdriver_mu += 1
break
if webdriver_allBusy(): pauseOrRestartMainServer(0) # we're "paused" anyway when not in the poll wait, so might as well call this only at end, to depend on the final status (and make sure to call webdriver_allBusy() no matter what, as it has the side-effect of updating webdriver_maxBusy)
else: pauseOrRestartMainServer(1)
debuglog("Finishing webdriver_checkServe, webdriver_queue len=%d" % len(webdriver_queue))
def webdriver_checkRenew(*args):
for i in webdriver_runner:
if not i.wd_threadStart and i.usageCount and i.finishTime + options.js_restartMins < time.time(): i.renew_webdriver_newThread() # safe because we're running in the IOLoop thread, which therefore can't start wd_thread between our test of wd_threadStart and renew_webdriver_newThread
IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
def webdriver_fetch(url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate):
if tooLate(): return # probably webdriver_queue overload (which will also be logged)
elif prefetched and (not hasattr(prefetched,"code") or prefetched.code >= 500): return callback(prefetched) # don't bother allocating a webdriver if we got a timeout or DNS error or something
elif wsgi_mode: return callback(_wd_fetch(webdriver_runner[0],url,prefetched,clickElementID,clickLinkText,asScreenshot)) # (can't reproxy in wsgi_mode, so can't use via and ua) TODO: if *threaded* wsgi, index 0 might already be in use (we said threadsafe:true in AppEngine instructions but AppEngine can't do js_interpreter anyway; where else might we have threaded wsgi? js_interpreter really is better run in non-wsgi mode anyway, so can js_reproxy)
webdriver_queue.append((url,prefetched,ua,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate))
debuglog("webdriver_queue len=%d after adding %s" % (len(webdriver_queue),url))
webdriver_checkServe() # safe as we're IOLoop thread
# --------------------------------------------------
# Service routines for basic HTTP header rewriting
# --------------------------------------------------
def fixServerHeader(i): # (name from the rmServerHeaders comment below)
    i.set_header("Server",serverName)
# TODO: in "real" proxy mode, "Server" might not be the most appropriate header to set for this
try: i.clear_header("Date") # Date is added by Tornado 3; HTTP 1.1 says it's mandatory but then says don't put it if you're a clockless server (which we might be I suppose) so it seems leaving it out is OK especially if not specifying Age etc, and leaving it out saves bytes. But if the REMOTE server specifies a Date then we should probably pass it on (see comments in doResponse below)
except: pass # (ok if "Date" wasn't there)
rmServerHeaders = set([
# server headers to remove. We'll do our own connection type etc (but don't include "Date" in this list: if the remote server includes a Date it would be useful to propagate that as a reference for its Age headers etc, TODO: unless remote server is broken? see also comment in fixServerHeader re having no Date by default). Many servers' Content-Location is faulty; it DOESN'T necessarily provide the new base href; it might be relative; it might be identical to the actual URL fetched; many browsers ignore it anyway
"connection","content-length","content-encoding","transfer-encoding","etag","content-md5","server","alternate-protocol","strict-transport-security","content-location",
"x-associated-content", # should NOT be sent to browser (should be interpreted by a server's SPDY/push module) but somebody might misread the specs (at least one Wikipedia editor did)
"x-host","x-http-reason", # won't necessarily be the same
"content-security-policy","x-webkit-csp","x-content-security-policy","x-frame-options", # sorry but if we're adjusting the site by adding our own scripts/styles we are likely to be broken by a CSP that restricts which of these we're allowed to do. (Even if we adjust the domains listed on those headers, what if our scripts rely on injecting inline code?) Sites shouldn't *depend* on CSP to prevent XSS: it's just a belt-and-braces that works only in recent browsers. Hopefully our added styles etc will break the XSS-introduced ones if we hit a lazy site.
    # TODO: WebSocket (and Microsoft SM) gets the client to say 'Connection: Upgrade' with a load of Sec-WebSocket-* headers, check what Tornado does with that
    ])
rmClientHeaders = ['Connection','Proxy-Connection','Accept-Charset','Accept-Encoding','X-Forwarded-Host','X-Forwarded-Port','X-Forwarded-Server','X-Forwarded-Proto','X-Request-Start','TE','Upgrade',
'Upgrade-Insecure-Requests', # we'd better remove this from the client headers if we're removing Content-Security-Policy etc from the server's
'Range', # TODO: we can pass Range to remote server if and only if we guarantee not to need to change anything (could also add If-Range and If-None-Match to the list, but these should be harmless to pass to the remote server and If-None-Match might actually help a bit in the case where the document doesn't change)
]
# --------------------------------------------------
# Our main RequestForwarder class. Handles incoming
# HTTP requests, generates requests to upstream servers
# and handles responses. Sorry it's got a bit big :-(
# --------------------------------------------------
the_supported_methods = ("GET", "HEAD", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "CONNECT")
# Don't support PROPFIND (from WebDAV) unless be careful about how to handle image requests with it
# TODO: image requests with OPTIONS ?
class RequestForwarder(RequestHandler):
def get_error_html(self,status,**kwargs): return htmlhead("Web Adjuster error")+options.errorHTML+"</body></html>" # Tornado 2.0
def write_error(self,status,**kwargs): # Tornado 2.1+
if hasattr(self,"_finished") and self._finished: return
msg = self.get_error_html(status,**kwargs)
if "{traceback}" in msg and 'exc_info' in kwargs:
msg = msg.replace("{traceback}","<pre>"+ampEncode("".join(traceback.format_exception(*kwargs["exc_info"])))+"</pre>")
# TODO: what about substituting for {traceback} on pre-2.1 versions of Tornado that relied on get_error_html and put the error into sys.exc_info()? (need to check their source to see how reliable the traceback is in this case; post-2.1 versions re-raise it from write_error itself)
def cookie_host(self,checkReal=True,checkURL=True):
# for cookies telling us what host the user wants
if self.isPjsUpstream or self.isSslUpstream:
return False
if checkReal and convert_to_real_host(self.request.host,None): return # if we can get a real host without the cookie, the cookie does not apply to this host
if enable_adjustDomainCookieName_URL_override and checkURL:
if self.cookieViaURL: v = self.cookieViaURL
else:
v = self.request.arguments.get(adjust_domain_cookieName,None)
if type(v)==type([]): v=v[-1]
if v: self.removeArgument(adjust_domain_cookieName,quote(v))
if v:
self.cookieViaURL = v
if v==adjust_domain_none: return None
else: return v
return self.getCookie(adjust_domain_cookieName,adjust_domain_none)
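# (So cookie_host returns False when we're acting as an upstream or SSL helper,
# None when the sentinel "none" value is set, else the domain the user selected
# via the cookie or the &_adjusterDN_= URL override.)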
def readCookies(self):
if hasattr(self,"_adjuster_cookies_"): return # already OK
self._adjuster_cookies_ = {}
for c in self.request.headers.get_list("Cookie"):
for cc in c.split(';'):
if not '=' in cc: continue # (e.g. Maxthon has been known to append a ; to the end of the cookie string)
n,v = cc.strip().split('=',1)
self._adjuster_cookies_[n] = v
def getCookie(self,cookieName,zeroValue=None):
# zeroValue is a value that the cookie can be set to so as to "clear" it (because some browsers don't seem to understand JS that clears a cookie)
self.readCookies()
v = self._adjuster_cookies_.get(cookieName,None)
if v==zeroValue: v = None
return v
def setCookie(self,cookieName,val):
# This is ONLY for ADJUSTER OPTIONS - does NOT propagate to the server
self.readCookies()
self._adjuster_cookies_[cookieName] = val
def clearUnrecognisedCookies(self):
# When serving via adjust_domain_cookieName, on URL box try to save browser memory (and request time) and improve security by deleting cookies set by previous sites. But can cover only the path=/ ones from here.
self.readCookies()
for n,v in self._adjuster_cookies_.items():
if n in cRecogniseAny or n==adjust_domain_cookieName: continue # don't do adjust_domain_cookieName unless take into account addCookieFromURL (TODO: but would we ever GET here if that happens?)
elif n in cRecognise1 and v=="1": continue
for dot in ["","."]: self.add_header("Set-Cookie",n+"="+v+"; Domain="+dot+self.cookieHostToSet()+"; Path=/; Expires=Thu Jan 01 00:00:00 1970") # to clear it
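# (Sending Set-Cookie with an Expires date in the past is the standard way to
# delete a cookie; both the bare host and the dot-prefixed domain forms are
# cleared because we cannot tell which form the site originally used.)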