define("usepycurl",default=True,help="Use the pycurl library if a suitable version is available (setting this to False might save a little RAM at the expense of remote-server tolerance)")
define("renderBlocks",default=False,help="Treat all characters rendered by the character-set renderer as \"blocks\" that are guaranteed to have the same dimensions (true for example if you are using the renderer for Chinese characters only). This is faster than checking words individually, but it may produce misprints if given a range of characters whose dimensions do differ.")
# renderBlocks TODO: blocksRange option for if want to render some that do and some that don't? (but profile it: PIL's getsize just might turn out to be quicker than the high-level range-check code)
define("fasterServer",help="Address:port of another instance of Web Adjuster to which we forward all traffic whenever it is available. When the other instance is not available, traffic will be handled by this one. Use for example if you have a slower always-on machine and a faster not-always-on machine and you want the slower machine to delegate to the faster machine when available. See also ipTrustReal.")
define("ipTrustReal",help="IP address of a machine that we trust, for example a machine that is using us as fasterServer. Any traffic coming from this machine with an X-Real-Ip header will be logged as though it originated at the value of its X-Real-Ip header. Setting this to * will cause X-Real-Ip to be trusted from ANY connection.")
# , which might be useful in an environment where you know the adjuster can be reached only via a proxy but the proxy's address can change; see also trust_XForwardedFor. (TODO: multiple IPs option like ip_messages? but might need to make it ipv6 ready)
define("trust_XForwardedFor",default=False,help="Like ipTrustReal but trusts X-Forwarded-For header from any IP if set to True (use this in an environment where the adjuster can be reached only via a load balancer etc)")
define("fasterServerNew",default=True,help="If fasterServer is set, assume it is running Web Adjuster v0.17 or later and use a more lightweight method of checking its availability. You might need to set this to False if for some reason you can't upgrade the fasterServer first.")
# (fasterServerNew: don't do auto-fallback as that creates unnecessary extra traffic, plus sending an unrecognized ping2 could clutter logs)
define("machineName",help="A name for the current machine to insert into the \"Server\" HTTP header for adjusted requests, for example to let users know if it's your faster or your slower machine that's currently serving them (although they'd need to inspect the headers to find out)")
define("redirectFiles",default=False,help="If, when not functioning as a \"real\" HTTP proxy, a URL is received that looks like it requires no processing on our part (e.g. an image or downloadable file that the user does not want converted), and if this is confirmed via a HEAD request to the remote server, then redirect the browser to fetch it directly and not via Web Adjuster. This takes bandwidth off the adjuster server, and should mean faster downloads, especially from sites that are better connected than the adjuster machine. However it might not work with sites that restrict \"deep linking\". (As a precaution, the confirmatory HEAD request is sent with a non-adjusted Referer header to simulate what the browser would send if fetching directly. If this results in an HTML \"Referer denied\" message then Web Adjuster will proxy the request in the normal way. This precaution might not detect ALL means of deep-linking denial though.)")
# --- e.g. it won't detect cookie-based deep-linking denial, or serving an image but not the real one. But it works with Akamai-based assets servers as of 2013-09 (but in some cases you might be able to use codeChanges to point these requests back to the site's original server instead of the Akamai one, if the latter just mirrors the former which is still available, and therefore save having to proxy the images. TODO: what if you can't do that but you can run another service on a higher bandwidth machine that can cache them, but can't run the adjuster on the higher-bandwidth machine; can we redirect?)
# If the adjuster machine is running on a home broadband connection, don't forget the "uplink" speed of that broadband is likely to be lower than the "downlink" speed; the same should not be the case for a site running at a well-connected server farm. There's also extra delay if Web Adjuster has to download files first (which might be reduced by implementing streaming). Weighed against this is the extra overhead the browser has of repeating its request elsewhere, which could be an issue if the file is small and the browser's uplink is slow; in that case fetching it ourselves might be quicker than having the browser repeat the request; see TODO comment elsewhere about minimum content length before redirectFiles.
# TODO: for Referer problems in redirectFiles, if we're not on HTTPS, could redirect to an HTTPS page (on a separate private https server, or https://www.google.com/url/?q= but they might add checks) which then redirects to the target HTTP page, but that might not strip Referer on MSIE 7 etc; may have to whitelist browsers+versions for it, or test per-request, but that would lead to 4 redirects per image instead of 2, although we could cache (non-empty) ok-browser-strings (and hold up other requests from the same browser until we know or have timed out ??); do this only if sendHead returns false but sendHead with proper referer returns ok (and cache a few sites where this is the case so we don't have to re-test) ?? also it might not work in places where HTTPS is forbidden
# TODO: redirectFiles could call request_no_external_referer and test with blank Referer instead of non-adjusted Referer, but we'd have to figure out some way of verifying that the browser actually supports 'Referrer-Policy: same-origin' before doing this
define("upstream_guard",default=True,help="Modify scripts and cookies sent by upstream sites so they do not refer to the cookie names that our own scripts use. This is useful if you chain together multiple instances of Web Adjuster, such as for testing another installation without coming out of your usual proxy. If however you know that this instance will not be pointed to another, you can set upstream_guard to False to save some processing.")
define("skipLinkCheck",multiple=True,help="Comma-separated list of regular expressions specifying URLs to which we won't try to add or modify links for the pdftotext, epubtotext, epubtozip, askBitrate or mailtoPath options. This processing can take some time on large index pages with thousands of links; if you know that none of them are PDF, EPUB, MP3 or email links, or if you don't mind not processing any that are, then it saves time to skip this step for those pages.")
# skipLinkCheck TODO: it would be nice to have a 'max links on the page' limit as an alternative to a list of URL patterns
define("extensions",help="Name of a custom Python module to load to handle certain requests; this might be more efficient than setting up a separate Tornado-based server. The module's handle() function will be called with the URL and RequestHandler instance as arguments, and should return True if it processed the request, but anyway it should return as fast as possible. This module does NOT take priority over forwarding the request to fasterServer.")
define("loadBalancer",default=False,help="Set this to True if you have a default_site set and you are behind any kind of \"load balancer\" that works by issuing a GET / with no browser string. This option will detect such requests and avoid passing them to the remote site.")
define("multicore",default=False,help="(Linux and BSD) On multi-core CPUs, fork enough processes for all cores to participate in handling incoming requests. This increases RAM usage, but can help with high-load situations. Disabled on Mac due to unreliability (other cores can still be used for htmlFilter etc)")
# --- and --ssl-fork if there aren't TOO many instances taking up the RAM; if you really want multiple cores to handle incoming requests on Mac/BSD you could run GNU/Linux in a virtual machine (or use a WSGI server)
define("num_cores",default=0,help="Set the number of CPU cores for the multicore option (0 for auto-detect)")
define("internalPort",default=0,help="The first port number to use for internal purposes when ssl_fork is in effect. Internal ports needed by real_proxy (for SSL) and js_reproxy are normally allocated from the ephemeral port range, but if ssl_fork delegates to independent processes then some of them need to be at known numbers. The default of 0 means one higher than 'port'; several unused ports may be needed starting at this number. If your Tornado is modern enough to support reuse_port then you can have multiple Adjuster instances listening on the same port (e.g. for one_request_only) provided they have different internalPort settings when run with ssl_fork. Note however that the --stop and --restart options will NOT distinguish between different internalPort settings, only 'port'.")
# Some environments (e.g. old OpenShift 2) can't use real_proxy or js_reproxy because the container won't let us open extra ports even for internal purposes; TODO: find some way to multiplex everything on one port? how to authenticate our JS-interpreter connections if the load-balancer makes remote connections to that port also seem to come from our IP?
define("fixed_ports",default=False,help="Do not allocate ports (even internal ports) from the ephemeral port range even when this is otherwise possible. This option might help if you are firewalling your loopback interface and want to write specific exceptions (although that still won't work if you're using js_interpreter=HeadlessChrome or similar which opens its own ephemeral ports as well: use containers if you're concerned). Fixed ports may result in failures if internal ports are already taken.")
define("compress_responses",default=True,help="Use gzip to compress responses for clients that indicate they are compatible with it. You may want to turn this off if your server's CPU is more important than your network bandwidth (e.g. browser on same machine).")
# THIS MUST BE THE LAST SECTION because it continues into
# the note below about Tornado logging options. (The order
# of define()s affects the HTML order only; --help will be
# sorted alphabetically by Tornado.)
heading("Logging options")
define("profile",default=0,help="Log timing statistics every N seconds (only when not idle)")
define("profile_lines",default=5,help="Number of lines to log when profile option is in use (not applicable if using --multicore)")
define("renderLog",default=False,help="Whether or not to log requests for character-set renderer images. Note that this can generate a LOT of log entries on some pages.")
define("logUnsupported",default=False,help="Whether or not to log attempts at requests using unsupported HTTP methods. Note that this can sometimes generate nearly as many log entries as renderLog if some browser (or malware) tries to do WebDAV PROPFIND requests on each of the images.")
define("logRedirectFiles",default=True,help="Whether or not to log requests that result in the browser being simply redirected to the original site when the redirectFiles option is on.")
# (Since redirectFiles still results in a HEAD request being sent to the remote site, the logRedirectFiles option defaults to True in case you need it to diagnose "fair use of remote site via adjuster" problems)
define("ipNoLog",multiple=True,help="A comma-separated list of IP addresses which can use the adjuster without being logged. If your network has a \"friendly probing\" service then you might want to use this to stop it filling up the logs. (Any tracebacks it causes will still be logged however.)")
define("squashLogs",default=True,help="Try to remove some duplicate information from consecutive log entries, to make logs easier to check. You might want to set this to False if you plan to use automatic search tools on the logs. Currently not supported with multicore, and will automatically be set to False if multicore is enabled.")
# (squashLogs: word 'some' is important as not all duplicate info is guaranteed to be removed. TODO: move BrowserLogger to the collection process so can collate for multicore?)
define("errorHTML",default="Adjuster error has been logged",help="What to say when an uncaught exception (due to a misconfiguration or programming error) has been logged. HTML markup is allowed in this message. If for some reason you have trouble accessing the log files, the traceback can usually be included in the page itself by placing {traceback} in the message.")
# errorHTML TODO: this currently requires Tornado 2.1+ (document this? see TODO in write_error)
define("logDebug",default=False,help="Write debugging messages (to standard error if in the foreground, or to the logs if in the background). Use as an alternative to --logging=debug if you don't also want debug messages from other Tornado modules. On Unix you may also toggle this at runtime by sending SIGUSR1 to the process(es).") # see debuglog()
# and continuing into the note below:
if not tornado:
if html: print ("")
end = "Tornado-provided logging options are not listed above because they might vary across Tornado versions; run
python adjuster.py --help to see a full list of the ones available on your setup. They typically include
log_file_max_size,
log_file_num_backups,
log_file_prefix and
log_to_stderr."
# and --logging=debug, but that may generate a lot of entries from curl_httpclient
if html: print (end)
else: print (end.replace("
","`").replace("","`"))
raise SystemExit
#@file: import2-other.py
# --------------------------------------------------
# Further imports
# --------------------------------------------------
import time,socket,logging,subprocess,threading,base64,signal,traceback,shutil
try: from string import letters,digits # Python 2
except ImportError:
from string import ascii_letters as letters # Python 3
from string import digits
try: import urlparse # Python 2
except ImportError: import urllib.parse as urlparse # Python 3
try: from urllib import quote,unquote # Python 2
except ImportError: from urllib.parse import quote,unquote # Python 3
try: import htmlentitydefs # Python 2
except ImportError: import html.entities as htmlentitydefs # Python 3
try: from urllib2 import build_opener,Request,ProxyHandler,HTTPRedirectHandler,urlopen # Python 2
except ImportError: from urllib.request import build_opener,Request,ProxyHandler,HTTPRedirectHandler,urlopen # Python 3
try: from urllib2 import HTTPError as UL_HTTPError # Python 2
except ImportError: from urllib.error import HTTPError as UL_HTTPError # Python 3
try: from commands import getoutput # Python 2
except ImportError: from subprocess import getoutput # Python 3
try: import simplejson as json # Python 2.5, and faster?
except ImportError: import json # Python 2.6
try: from HTMLParser import HTMLParser,HTMLParseError # Python 2
except ImportError:
from html.parser import HTMLParser as _HTMLParser # Python 3
class HTMLParser(_HTMLParser):
def __init__(self): _HTMLParser.__init__(self,convert_charrefs=False) # please behave as the old one did
try: from html.parser import HTMLParseError # removed in Python 3.5
except ImportError: # we use it only for recognition anyway
class HTMLParseError(Exception): pass
try: import psutil
except ImportError: psutil = None
try: # Python 2
from cStringIO import StringIO as BytesIO
from StringIO import StringIO # for when need Unicode
except ImportError: from io import BytesIO,StringIO # Python 3
try: from inspect import getfullargspec as getargspec # Python 3
except ImportError:
try: from inspect import getargspec # Python 2
except ImportError: getargspec = None
try: xrange # Python 2
except: xrange,unicode,unichr = range,str,chr # Python 3
try: bytes
except: bytes = str
try: # can we page the help text?
# (Tornado 2 just calls the module-level print_help, but Tornado 3 includes some direct calls to the object's method, so we have to override the latter. Have to use __dict__ because they override __setattr__.)
import pydoc ; pydoc.pager # ensure present
def new_top(*args):
dat = StringIO()
dat.write(twoline_program_name+"\n")
old_top(dat)
pydoc.pager(dat.getvalue())
raise SystemExit
old_top = tornado.options.options.print_help
tornado.options.options.__dict__['print_help'] = new_top
except: pass # oh well, can't page the help text
#@file: domain-rewrite.py
# --------------------------------------------------
# Domain-rewriting service routines
# --------------------------------------------------
def hostSuffix(n=0):
if options.host_suffix:
return options.host_suffix.split("/")[n]
return ""
def defaultSite(n=0):
return options.default_site.split("/")[n]
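# Illustration (hypothetical domains): host_suffix and default_site are
# parallel '/'-separated lists, so with
# host_suffix="adjuster.example.org/text.example.org" we get
# hostSuffix(0)=="adjuster.example.org" and hostSuffix(1)=="text.example.org",
# and defaultSite(n) picks the matching entry of default_site.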
def convert_to_real_host(requested_host,cookie_host=None):
# Converts the host name requested by the user into the
# actual host that we should request, or returns "" if
# we should display the URL entry box etc.
if requested_host:
requested_host = B(requested_host)
port=B(":"+str(options.publicPort))
# port might or might not be present in user's request
orig_requested_host = requested_host
if requested_host.endswith(port): requested_host=requested_host[:-len(port)]
n=0
for h in options.host_suffix.split("/"):
if requested_host.endswith(B("."+h)) and options.wildcard_dns: return redot(requested_host[:-len(h)-1])
elif requested_host == B(h):
d = defaultSite(n)
if d: return B(d)
elif B(cookie_host)==B(h): return 0
else: return B(cookie_host)
n += 1
if options.real_proxy: return orig_requested_host
return B(defaultSite())
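# Illustration (hypothetical setup: host_suffix="example.org" with
# wildcard_dns=True): a request for "en-wikipedia-org.example.org" ends with
# ".example.org", so the prefix "en-wikipedia-org" is passed to redot() below
# and the function returns b"en.wikipedia.org" as the real upstream host.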
def convert_to_via_host(requested_host):
if not requested_host: requested_host = "" # ??
else: requested_host = S(requested_host)
port=":"+str(options.publicPort) # the port to advertise
orig_requested_host = requested_host
if requested_host.endswith(port): requested_host=requested_host[:-len(port)]
if options.publicPort==80: port=""
for h in options.host_suffix.split("/"):
if (requested_host == h and options.default_site) or requested_host.endswith("."+h): return h+port
return options.host_suffix+port
def publicPortStr():
if options.publicPort==80: return ""
else: return ":"+str(options.publicPort)
def convert_to_requested_host(real_host,cookie_host=None):
# Converts the actual host name into the host name that
# the user should request to get it through us
if not real_host: return ""
port = publicPortStr()
if options.default_site:
n=0
for i in options.default_site.split("/"):
if not i: i=cookie_host
if B(real_host) == B(i):
return hostSuffix(n)+port
n += 1
elif not options.wildcard_dns and B(real_host) == B(cookie_host):
return hostSuffix(0)+port # no default_site, cookie_host everywhere
if not options.wildcard_dns: return real_host # leave the proxy
else: return dedot(real_host)+"."+hostSuffix()+port
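# Illustration (same hypothetical setup, publicPort 80): the reverse mapping
# turns real host "en.wikipedia.org" into dedot("en.wikipedia.org")+"."+
# hostSuffix(), i.e. "en-wikipedia-org.example.org" (a ":port" suffix is
# appended when publicPort is not 80).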
# RFC 2109: A Set-Cookie from request-host y.x.example.com for Domain=.example.com would be rejected, because H is y.x and contains a dot.
# That means (especially if a password is set) we'd better make sure our domain-rewrites don't contain dots. If requested with dot, relocate to without dot. (But see below re RFC 1035 limitation.)
def dedot(domain):
# - means . but -- is a real - (OK as 2 dots can't come together and a - can't come immediately after a dot in domain names, so --- = -., ---- = --, ----- = --. etc)
domain = S(domain)
d2 = domain.replace("-","--").replace(".","-") # this scheme, which I invented in 2012, was adopted by Google Translate (at the domain 'translate.goog') in 2018
if len(d2) > 63: return domain # We can't do it because RFC 1035 puts a 63-byte limit on each label (so our cross-domain preferences cookies can't work on very long domains, TODO document this?)
else: return d2
def redot(domain): return B(domain).replace(B("--"),B("@MINUS@")).replace(B("-"),B(".")).replace(B("@MINUS@"),B("-"))
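# Worked examples of the encoding above: dedot("en.wikipedia.org") ==
# "en-wikipedia-org", and a real hyphen is doubled first, so
# dedot("web-site.example") == "web--site-example"; redot() inverts both,
# using the temporary @MINUS@ placeholder so that "--" survives the
# "-" -> "." pass.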
def protocolAndHost(realHost):
# HTTPS hack: host ends with .0 = use HTTPS instead of HTTP
# (the dot will be represented as a hyphen by dedot/redot,
# but some servers e.g. GAE can't cope with any part of the
# wildcard domain ending with a hyphen, so add the 0;
# TODO: what about fetching from IP addresses, although it's rare to get a server with IP ending .0 because it used to represent "the network")
if B(realHost).endswith(B(".0")):
return "https://",realHost[:-2]
else: return "http://",realHost
def protocolWithHost(realHost):
x,y = protocolAndHost(realHost) ; return S(x)+S(y)
def domain_process(text,cookieHost=None,stopAtOne=False,https=None,isProxyRequest=False,isSslUpstream=False):
text = B(text)
if isProxyRequest:
# When running as a real proxy, domain_process is
# still called for Location: headers etc (not for
# document bodies), and the only thing we need to
# check is the upstream_rewrite_ssl option: if our
# UPstream proxy says .0 in a Location: URL due to
# upstream_rewrite_ssl, then take it out.
if upstream_rewrite_ssl and not isSslUpstream:
m = re.match(B(r"http(://[A-Za-z0-9.-]*)\.0(?![A-Za-z0-9.-])"),text)
if m: return B("https")+m.group(1)
return text
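        # e.g. if the upstream proxy rewrote an SSL redirect into
        # Location: http://example.com.0/login (hypothetical site), the
        # above hands https://example.com/login back to the client.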
# Change the domains on appropriate http:// and https:// URLs.
# Also on // URLs using 'https' as default (if it's not None).
# Hope that there aren't any JS-computed links where
# the domain is part of the computation.
    # TODO: what of links to alternate ports or user:password links, currently we leave them unchanged (could use .<port> as an extension of the 'HTTPS hack' of .0, but allowing the public to request connects to any port could be a problem, and IP addresses would have to be handled carefully: can no longer rely on ".0 used to mean the network" sort-of saving us)
# TODO: leave alone URLs in HTML text/comments and JS comments? but script overload can make it hard to judge what is and isn't text. (NB this function is also called for Location headers)
if B(""),dtStart)
else: dtStart = dtEnd = -1
def mFunc(m):
if dtStart\2",serverName) # stop mobile browsers interpreting the version number as a telephone number
global upstream_proxy_host, upstream_proxy_port
upstream_proxy_host = upstream_proxy_port = None
global upstream_rewrite_ssl ; upstream_rewrite_ssl=False
global cores ; cores = 1
if options.multicore:
options.squashLogs = False
if not 'linux' in sys.platform and not 'bsd' in sys.platform:
errExit("multicore option not supported on this platform")
# --- it does work on darwin (Mac), but as of 10.7 some incoming connections get 'lost' so it's not a good idea
cores = options.num_cores
if not cores:
import tornado.process
cores = tornado.process.cpu_count()
if cores==1: options.multicore = False
elif options.js_interpreter and options.js_instances % cores:
old = options.js_instances
options.js_instances += (cores - (options.js_instances % cores))
sys.stderr.write("multicore: changing js_instances %d -> %d (%d per core x %d cores)\n" % (old,options.js_instances,int(options.js_instances/cores),cores))
if options.js_interpreter in ["HeadlessChrome","Chrome"]:
try: # check inotify limit (Linux only)
maxI=int(open("/proc/sys/fs/inotify/max_user_instances").read())
except: maxI = -1
if not maxI==-1 and options.js_instances > maxI*20: warn("This system might run out of inotify instances with that number of Chrome processes. Try:\nsudo sysctl -n -w fs.inotify.max_user_watches=%d\nsudo sysctl -n -w fs.inotify.max_user_instances=%d" % (options.js_instances*40,options.js_instances*20))
global js_per_core
js_per_core = int(options.js_instances/cores)
if options.upstream_proxy:
maxCurls = 30*js_per_core
if options.ssl_fork: maxCurls = int(maxCurls/2)
if not options.usepycurl: errExit("upstream_proxy is not compatible with --usepycurl=False")
setupCurl(maxCurls,"upstream_proxy requires pycurl (try sudo pip install pycurl)")
if not ':' in options.upstream_proxy: options.upstream_proxy += ":80"
upstream_proxy_host,upstream_proxy_port = options.upstream_proxy.split(':') # TODO: IPv6 ?
if not upstream_proxy_host:
upstream_proxy_host = "127.0.0.1"
if wsgi_mode: warn("Can't do SSL-rewrite for upstream proxy when in WSGI mode")
else: upstream_rewrite_ssl = True
upstream_proxy_port = int(upstream_proxy_port)
elif options.usepycurl and not options.submitPath=='/': setupCurl(3*js_per_core) # and no error if not there
global codeChanges ; codeChanges = []
if options.codeChanges:
ccLines = [x for x in [x.strip() for x in options.codeChanges.split("\n")] if x and not x.startswith("#")]
while ccLines:
if len(ccLines)<3: errExit("codeChanges must be a multiple of 3 lines (see --help)")
codeChanges.append(tuple(ccLines[:3]))
ccLines = ccLines[3:]
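    # Illustration of the 3-line groups parsed above (per the codeChanges
    # option's documentation each group is: a URL prefix, a string of code to
    # search for, and its replacement; this sample is hypothetical):
    #   http://example.com
    #   brokenFunction();
    #   /* brokenFunction call removed */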
if options.real_proxy:
options.open_proxy=True
if options.browser and "lynx" in options.browser and not "I_PROMISE_NOT_TO_LYNX_DUMP_SSL" in os.environ and not "-stdin" in options.browser and ("-dump" in options.browser or "-source" in options.browser or "-mime_header" in options.browser): errExit("Don't do that. If Lynx wants to ask you about our self-signed certificates, it'll assume the answer is No when running non-interactively, and this will cause it to fetch the page directly (not via our proxy) which could confuse you into thinking the adjuster's not working. If you know what you're doing, put I_PROMISE_NOT_TO_LYNX_DUMP_SSL in the environment to suppress this message (but if using js_interpreter beware of redirect to SSL). Or you can use wget --no-check-certificate -O - | lynx -dump -stdin") # TODO: could we configure Lynx to always accept when running non-interactively?
if options.htmlFilter and '#' in options.htmlFilter and not len(options.htmlFilter.split('#'))+1 == len(options.htmlFilterName.split('#')): errExit("Wrong number of #s in htmlFilterName for this htmlFilter setting")
global port_randomise
if options.fixed_ports:
class NullDict(dict):
def __setitem__(*args): pass
port_randomise = NullDict()
if options.port == -1:
if wsgi_mode:
warn("port=-1 won't work in WSGI mode, assuming 80")
options.port = 80
elif options.ssl_fork or options.background: errExit("Can't run in background or ssl-fork with an ephemeral main port, as that requires fork-before-listen so won't be able to report the allocated port number")
elif options.fixed_ports: errExit("fixed_ports is not compatible with port==-1")
else:
port_randomise[options.port] = True
if not options.internalPort:
# DON'T set it to -1 + 1 = 0
options.internalPort = 1024
elif options.port < 0 or options.port > 65535:
errExit("port out of range")
elif not options.port:
if wsgi_mode:
warn("port=0 won't work in WSGI mode, assuming 80")
options.port = 80
else:
options.real_proxy=options.js_reproxy=False ; options.fasterServer=""
options.open_proxy = True # bypass the check
if not options.publicPort:
options.publicPort = options.port
if not options.internalPort:
options.internalPort = options.port + 1
if options.internalPort in [options.publicPort,options.port]: errExit("--internalPort cannot match --port or --publicPort")
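    # e.g. with --port=28080 and neither publicPort nor internalPort set, the
    # defaults just applied give publicPort=28080 and internalPort=28081
    # (which the check above then confirms do not collide).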
if options.just_me:
options.address = "localhost"
try: socket.socket().connect(('localhost',113))
except:
if not 'linux' in sys.platform or not getoutput("which netstat 2>/dev/null"): errExit("--just_me requires either an ident server to be running on port 113, or the system to be Linux with a netstat command available")
import getpass ; global myUsername ; myUsername = S(getpass.getuser())
elif not options.password and not options.open_proxy and not options.submitPath=='/' and not options.stop: errExit("Please set a password (or --just_me), or use --open_proxy.\n(Try --help for help; did you forget a --config=file?)") # (as a special case, if submitPath=/ then we're serving nothing but submit-your-own-text and bookmarklets, which means we won't be proxying anything anyway and don't need this check)
if options.submitBookmarkletDomain and not options.publicPort==80: warn("You will need to run another copy on "+options.submitBookmarkletDomain+" ports 80/443 for bookmarklets to work (submitBookmarkletDomain without publicPort=80)")
if options.pdftotext and not "pdftotext version" in getoutput("pdftotext -h"): errExit("pdftotext command does not seem to be usable\nPlease install it, or unset the pdftotext option")
if options.epubtotext and not "calibre" in getoutput("ebook-convert -h"): errExit("ebook-convert command does not seem to be usable\nPlease install calibre, or unset the epubtotext option")
global extensions
if options.extensions:
extensions = __import__(options.extensions)
else:
class E:
def handle(*args): return False
extensions = E()
global ipMatchingFunc
if options.ip_messages: ipMatchingFunc=ipv4ranges_func(options.ip_messages)
else: ipMatchingFunc = None
global submitPathIgnorePassword, submitPathForTest
if options.submitPath and options.submitPath.startswith('*'):
submitPathIgnorePassword = True
options.submitPath = options.submitPath[1:]
else: submitPathIgnorePassword = False
submitPathForTest = options.submitPath
if submitPathForTest and submitPathForTest[-1]=="?": submitPathForTest = submitPathForTest[:-1] # for CGI mode: putting the ? in tells adjuster to ADD a ? before any parameters, but does not require it to be there for the base submit URL (but don't do this if not submitPathForTest because it might not be a string)
if options.submitPath and not options.htmlText: errExit("submitPath only really makes sense if htmlText is set (or do you want users to submit actual HTML?)") # TODO: allow this? also with submitBookmarklet ??
if options.prominentNotice=="htmlFilter":
if not options.htmlFilter: errExit("prominentNotice=\"htmlFilter\" requires htmlFilter to be set")
if options.htmlJson or options.htmlText: errExit("prominentNotice=\"htmlFilter\" does not work with the htmlJson or htmlText options")
if not (options.submitPath and options.htmlFilter): options.submitBookmarklet = False # TODO: bookmarklet for character rendering? (as an additional bookmarklet if there are filters as well, and update submitBookmarklet help text) although it's rare to find a machine that lacks fonts but has a bookmarklet-capable browser
if options.submitBookmarklet and '_IHQ_' in options.submitPath: errExit("For implementation reasons, you cannot have the string _IHQ_ in submitPath when submitBookmarklet is on.") # Sorry. See TODO in 'def bookmarklet'
global upstreamGuard, cRecogniseAny, cRecognise1
upstreamGuard = set() ; cRecogniseAny = set() ; cRecognise1 = set() # cRecognise = cookies to NOT clear at url box when serving via adjust_domain_cookieName; upstreamGuard = cookies to not pass to upstream (and possibly rename if upstream sets them)
if options.password:
upstreamGuard.add(password_cookie_name)
cRecogniseAny.add(password_cookie_name)
if options.cssName:
upstreamGuard.add("adjustCssSwitch")
cRecognise1.add("adjustCssSwitch")
if options.htmlFilterName:
upstreamGuard.add("adjustNoFilter")
cRecognise1.add("adjustNoFilter")
if options.renderName:
upstreamGuard.add("adjustNoRender")
cRecognise1.add("adjustNoRender")
if options.prominentNotice:
upstreamGuard.add("_WA_warnOK")
cRecognise1.add("_WA_warnOK")
if options.htmlonly_mode:
upstreamGuard.add(htmlmode_cookie_name)
cRecognise1.add(htmlmode_cookie_name)
if options.ip_messages:
upstreamGuard.add(seen_ipMessage_cookieName)
cRecognise1.add(seen_ipMessage_cookieName)
h = options.headAppendCSS
if h and '%s' in h:
if not ';' in h: errExit("If putting %s in headAppendCSS, must also put ; with options (please read the help text)")
if options.default_site:
errExit("Cannot set default_site when headAppendCSS contains options, because we need the URL box to show those options")
# TODO: unless we implement some kind of inline setting, or special options URL ?
if options.cssHtmlAttrs and ';' in options.cssHtmlAttrs and not len(options.cssHtmlAttrs.split(';'))==len(h.rsplit(';',1)[1].split(',')): errExit("Number of choices in headAppendCSS last field does not match number of choices in cssHtmlAttrs")
for n in range(len(h.split(';'))-1):
upstreamGuard.add("adjustCss"+str(n)+"s")
cRecogniseAny.add("adjustCss"+str(n)+"s")
if options.useLXML: check_LXML()
global allowConnectHost,allowConnectPort,allowConnectURL
allowConnectHost=allowConnectPort=allowConnectURL=None
if not options.default_site: options.default_site = ""
# (so we can .split it even if it's None or something)
if not options.js_interpreter:
options.js_reproxy=options.js_frames=False
elif not options.htmlonly_mode: errExit("js_interpreter requires htmlonly_mode")
def intor0(x):
try: return int(x)
except: return 0
def check_injected_globals():
# for making sure we're used correctly when imported
# as a module by a wrapper script
try: defined_globals
except: return
for s in set(globals().keys()).difference(defined_globals):
if s in options: errExit("Error: adjuster.%s should be adjuster.options.%s" % (s,s)) # (tell them off, don't try to patch up: this could go more subtly wrong if they do it again with something we happened to have defined in our module before)
elif type(eval(s)) in [str,bool,int]: errExit("Don't understand injected %s %s (misspelled option?)" % (repr(type(eval(s))),s))
def setup_defined_globals(): # see above
global defined_globals
defined_globals = True # so included in itself
defined_globals = set(globals().keys())
#@file: log-multi.py
# --------------------------------------------------
# Logging and busy-signalling (especially multicore)
# --------------------------------------------------
class CrossProcessLogging(logging.Handler):
def needed(self): return (options.multicore or options.ssl_fork or (options.js_interpreter and options.js_multiprocess)) and options.log_file_prefix # (not needed if stderr-only or if won't fork)
def init(self):
"Called by initLogging before forks. Starts the separate logListener process."
if not self.needed(): return
try: logging.getLogger().handlers
except:
# Ouch, we won't know how to clear logging's handlers and start again in the child processes
errExit("The logging module on this system is not suitable for --log-file-prefix with --ssl-fork or --js-multiprocess")
if not multiprocessing: return # we'll have to open multiple files in initChild instead
self.loggingQ=multiprocessing.Queue()
def logListener():
try:
while True: logging.getLogger().handle(logging.makeLogRecord(self.loggingQ.get()))
except KeyboardInterrupt: pass
self.p = multiprocessing.Process(target=logListener) ; self.p.start()
logging.getLogger().handlers = [] # clear what Tornado has already put in place when it read the configuration
logging.getLogger().addHandler(self)
def initChild(self,toAppend=""):
"Called after a fork. toAppend helps to describe the child for logfile naming when multiprocessing is not available."
if not options.log_file_prefix: return # stderr is OK
if multiprocessing:
try: multiprocessing.process.current_process()._children.clear() # multiprocessing wasn't really designed for the parent to fork() later on
except: pass # probably wrong version
return # should be OK now
logging.getLogger().handlers = [] # clear Tornado's
if toAppend: options.log_file_prefix += "-"+toAppend
else: options.log_file_prefix += "-"+str(os.getpid())
# and get Tornado to (re-)initialise logging with these parameters:
if hasattr(tornado.options,"enable_pretty_logging"): tornado.options.enable_pretty_logging() # Tornado 2
else: # Tornado 4
import tornado.log
tornado.log.enable_pretty_logging()
def shutdown(self):
try: self.p.terminate() # in case KeyboardInterrupt hasn't already stopped it
except: pass
def emit(self, record): # simplified from Python 3.2 (but put just the dictionary, not the record obj itself, to make pickling errors less likely)
try:
if record.exc_info:
placeholder = self.format(record) # record.exc_text
record.exc_info = None
d = record.__dict__
d['msg'],d['args'] = record.getMessage(),None
self.loggingQ.put(d)
except (KeyboardInterrupt, SystemExit): raise
except: self.handleError(record)
class CrossProcess429:
def needed(self): return options.multicore and options.js_429
def init(self): self.q = multiprocessing.Queue()
def startThread(self):
if not self.needed(): return
self.b = [False]*cores
def listener():
allServersBusy = False
while True:
coreToSet, busyStatus = self.q.get()
if coreToSet=="quit": break
self.b[coreToSet] = busyStatus
newASB = all(self.b)
if not newASB == allServersBusy:
allServersBusy = newASB
if allServersBusy: IOLoopInstance().add_callback(lambda *args:reallyPauseOrRestartMainServer(True)) # run it just to serve the 429s, but don't set mainServerPaused=False or add an event to the queue
else: IOLoopInstance().add_callback(lambda *args:reallyPauseOrRestartMainServer("IfNotPaused")) # stop it if and only if it hasn't been restarted by the main thread before this callback
threading.Thread(target=listener,args=()).start()
def initLogging(): # MUST be after unixfork() if background
try:
import logging, tornado.log
class NoSSLWarnings:
def filter(self,record): return not (record.levelno==logging.WARNING and record.getMessage().startswith("SSL"))
tornado.log.gen_log.addFilter(NoSSLWarnings()) # Tornado 6
except: pass
global CrossProcessLogging
CrossProcessLogging = CrossProcessLogging()
CrossProcessLogging.init()
def init429():
global CrossProcess429
CrossProcess429 = CrossProcess429()
if CrossProcess429.needed(): CrossProcess429.init()
def shutdown429():
try: CrossProcess429.q.put(("quit","quit"))
except: pass
#@file: log-browser.py
# --------------------------------------------------
# browser logging
# --------------------------------------------------
helper_threads = []
class NullLogger:
def __call__(self,req): pass
class BrowserLogger:
def __init__(self):
# Do NOT read options here - they haven't been read yet
self.lastBrowser = None
self.lastIp = self.lastMethodStuff = None
def __call__(self,req):
if req.request.remote_ip in options.ipNoLog: return
try: ch = req.cookie_host()
except: ch = None # shouldn't happen
req=req.request
if hasattr(req,"suppress_logging"): return
if S(req.method) not in the_supported_methods and not options.logUnsupported: return
if S(req.method)=="CONNECT" or B(req.uri).startswith(B("http://")) or B(req.uri).startswith(B("https://")): host="" # URI will have everything
elif hasattr(req,"suppress_logger_host_convert"): host = req.host
else: host=B(convert_to_real_host(req.host,ch))
if host in [-1,B("error")]: host=req.host
elif host: host=protocolWithHost(host)
else: host=""
browser = req.headers.get("User-Agent",None)
if browser:
browser=B('"')+B(browser)+B('"')
if options.squashLogs and browser==self.lastBrowser: browser = ""
else:
self.lastBrowser = browser
browser=B(" ")+B(browser)
else: self.lastBrowser,browser = None," -"
if options.squashLogs:
            # Date (as YYMMDD) and time are already included in the Tornado logging format, which we don't want to override, especially as it has 'start of log string syntax highlighting' on some platforms
if req.remote_ip == self.lastIp:
ip=""
else:
self.lastIp = req.remote_ip
ip=B(req.remote_ip)+B(" ")
self.lastMethodStuff = None # always log method/version anew when IP is different
methodStuff = (req.method, req.version)
if methodStuff == self.lastMethodStuff:
r=host+S(req.uri)
else:
r='"%s %s%s %s"' % (S(req.method), host, S(req.uri), S(req.version))
self.lastMethodStuff = methodStuff
msg = S(ip)+S(r)+S(browser)
else: msg = '%s "%s %s%s %s" %s' % (S(req.remote_ip), S(req.method), host, S(req.uri), S(req.version), S(browser)) # could add "- - [%s]" with time.strftime("%d/%b/%Y:%X") if don't like Tornado-logs date-time format (and - - - before the browser %s)
logging.info(msg.replace('\x1b','[ESC]')) # make sure we are terminal safe, in case of malformed URLs
def initLogging_preListen():
global nullLog, accessLog
nullLog = NullLogger()
accessLog = BrowserLogger()
#@file: profile.py
# --------------------------------------------------
# Profiling and process naming
# --------------------------------------------------
profile_forks_too = False # TODO: configurable
def open_profile():
if options.profile:
global cProfile,pstats,profileIdle
import cProfile, pstats
setProfile() ; profileIdle = False
global reqsInFlight,origReqInFlight
reqsInFlight = set() ; origReqInFlight = set()
def open_profile_pjsOnly(): # TODO: combine with above
if options.profile:
global profileIdle
setProfile_pjsOnly() ; profileIdle = False
global reqsInFlight,origReqInFlight
reqsInFlight = set() ; origReqInFlight = set()
def setProfile():
global theProfiler, profileIdle
theProfiler = cProfile.Profile()
IOLoopInstance().add_timeout(time.time()+options.profile,lambda *args:pollProfile())
profileIdle = True ; theProfiler.enable()
def setProfile_pjsOnly():
IOLoopInstance().add_timeout(time.time()+options.profile,lambda *args:pollProfile_pjsOnly())
global profileIdle ; profileIdle = True
def pollProfile():
theProfiler.disable()
if not profileIdle: showProfile()
setProfile()
def pollProfile_pjsOnly():
if not profileIdle: showProfile(pjsOnly=True)
setProfile_pjsOnly()
def showProfile(pjsOnly=False):
global _doneShowProfile
try: _doneShowProfile
except: _doneShowProfile = False
if pjsOnly: pr = ""
else:
s = StringIO()
pstats.Stats(theProfiler,stream=s).sort_stats('cumulative').print_stats()
pr = "\n".join([x for x in s.getvalue().split("\n") if x and not "Ordered by" in x][:options.profile_lines])
if options.js_interpreter and len(webdriver_runner):
global webdriver_lambda,webdriver_mu,webdriver_maxBusy
stillUsed = sum(1 for i in webdriver_runner if i.wd_threadStart)
maybeStuck = set()
for i in webdriver_runner:
ms,tr = i.maybe_stuck,i.wd_threadStart
if ms and ms == tr and tr+30 < time.time():
maybeStuck.add(ms)
i.maybe_stuck = tr
webdriver_maxBusy = max(webdriver_maxBusy,stillUsed)
if pr: pr += "\n"
elif not options.background: pr += ": "
pr += "js_interpreter"
if options.multicore: pr += "%d" % (int(webdriver_runner[0].start/js_per_core),)
pr += " "
if not webdriver_maxBusy: pr += "idle"
else:
try: # NameError unless js_429 and multicore
if mainServerPaused: pr += "closed, "
else: pr += "open, "
except NameError: pass
served = "%d served" % webdriver_mu
if webdriver_lambda==webdriver_mu==len(webdriver_queue)==0: queue = "" # "; queue unused"
elif not webdriver_queue: queue="; queue empty: "+served
else: queue = "; queue %d: %d arrived, %s" % (len(webdriver_queue),webdriver_lambda,served)
if not _doneShowProfile:
if pjsOnly: stuck = ", next SIGUSR2 checks stuck;"
else: stuck = ";"
elif maybeStuck:
stuck = ", %d stuck for " % len(maybeStuck)
t = time.time()
s1=int(t-max(maybeStuck)); s2=int(t-min(maybeStuck))
if s1==s2: stuck += str(s1)
else: stuck += "%d-%d" % (s1,s2)
stuck += "s?"
else: stuck = ";" # or ", none stuck"
pr += "%d/%d busy%s " % (stillUsed,len(webdriver_runner),stuck)
if not webdriver_maxBusy == stillUsed:
pr += "maxUse=%d" % (webdriver_maxBusy,)
pr += queue
pr = pr.rstrip().replace("; ;",";")
if pr.endswith(";"): pr = pr[:-1]
webdriver_lambda = webdriver_mu = 0
webdriver_maxBusy = stillUsed
# TODO: also measure lambda/mu of other threads e.g. htmlFilter ?
if psutil and not webdriver_runner[0].start: pr += "; system RAM %.1f%% used" % (psutil.virtual_memory().percent)
try: pr2 += "%d requests in flight (%d from clients)" % (len(reqsInFlight),len(origReqInFlight))
except NameError: pr2="" # no reqsInFlight
_doneShowProfile = True
if not pr and not pr2: return
if pr: pr += "\n"
elif not options.background: pr += ": "
pr += pr2
if options.background: logging.info(pr)
elif can_do_ansi_colour: sys.stderr.write("\033[35m"+(time.strftime("%X")+pr).replace("\n","\n\033[35m")+"\033[0m\n")
else: sys.stderr.write(time.strftime("%X")+pr+"\n")
def setProcName(name="adjuster"):
"Try to set the process name for top/ps"
try: # setproctitle works on both Linux and BSD/Mac if installed (but doesn't affect Mac OS 10.7 "Activity Monitor")
import setproctitle # sudo pip install setproctitle or apt-get install python-setproctitle (requires gcc)
return setproctitle.setproctitle(name) # TODO: this also stops 'ps axwww' from displaying command-line arguments; make it optional?
except: pass
try: # ditto but non-Mac BSD not checked (and doesn't always work on Python 3) :
import procname # sudo pip install procname (requires gcc)
return procname.setprocname(name)
except: pass
try: # this works in GNU/Linux for 'top', 'pstree -p' and 'killall', but not 'ps' or 'pidof' (which need argv[0] to be changed in C) :
import ctypes ; name = B(name)
b = ctypes.create_string_buffer(len(name)+1)
b.value = name
ctypes.cdll.LoadLibrary('libc.so.6').prctl(15,ctypes.byref(b),0,0,0)
except: pass # oh well
#@file: server-control.py
# --------------------------------------------------
# Start / stop / install
# --------------------------------------------------
def serverControl():
if options.install:
current_crontab = getoutput("crontab -l 2>/dev/null")
def shell_escape(arg):
if re.match("^[A-Za-z0-9_=/.%+,:@-]*$",arg): return arg # no need to quote if it's entirely safe-characters (including colon: auto-complete escapes : in pathnames but that's probably in case it's used at the START of a command, where it's a built-in alias for 'true')
return "'"+arg.replace("'",r"'\''")+"'"
def cron_escape(arg): return shell_escape(arg).replace('%',r'\%')
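        # e.g. shell_escape("it's") returns 'it'\''s (single-quoted, with the
        # embedded quote spliced in); cron_escape additionally turns % into \%
        # because cron treats an unescaped % in a command line as
        # end-of-command / start-of-stdin.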
new_cmd = "@reboot python "+" ".join(cron_escape(a) for a in sys.argv)
if not new_cmd in current_crontab.replace("\r","\n").split("\n") and not new_cmd in current_crontab.replace("$HOME",os.environ.get("HOME")).replace("\r","\n").split("\n"):
sys.stderr.write("Adding to crontab: "+new_cmd+"\n")
if not current_crontab.endswith("\n"): current_crontab += "\n"
os.popen("crontab -","w").write(current_crontab+new_cmd+"\n")
if options.restart or options.stop:
pidFound = stopOther()
if options.stop:
if not pidFound: sys.stderr.write("Could not find which PID to stop (maybe nothing was running?)\n")
try: CrossProcessLogging.shutdown()
except: pass
sys.exit(0)
def stopOther():
pid = triedStop = None
if options.pidfile:
try: pid = int(open(options.pidfile).read().strip())
except: pass
if not pid==None:
if not psutil or psutil.pid_exists(pid):
tryStop(pid,True) # tryStop will rm pidfile if had permission to send the stop signal
triedStop = pid
else: unlink(options.pidfile) # stale
if not options.port: return
elif not options.port:
# Oops: the listening port is used to identify the other process; without it, we don't know which process to stop
errExit("Cannot use --restart or --stop with --port=0 and no --pidfile")
pids = run_lsof()
if pids==False: # no lsof, or couldn't make sense of it
# Could try "fuser -n tcp "+str(options.port), but it can be slow on a busy system. Try netstat instead.
pids = run_netstat()
if pids==False:
if not options.pidfile: sys.stderr.write("stopOther: can't find understandable 'lsof' or 'netstat' commands on this system\n")
return False
try: pids.remove(os.getpid())
except: pass
for pid in pids:
if not pid==triedStop:
tryStop(pid)
return triedStop or pids
def tryStop(pid,alsoRemovePidfile=False):
if options.stop: other="the"
else: other="other"
try:
os.kill(pid,signal.SIGTERM)
if alsoRemovePidfile: unlink(options.pidfile)
sys.stderr.write("Stopped %s process at PID %d\n" % (other,pid))
except: sys.stderr.write("Failed to stop %s process at PID %d\n" % (other,pid))
def run_lsof():
# TODO: check ssl-fork ports as well as main port ? (also in run_netstat)
out = getoutput("lsof -iTCP:"+str(options.port)+" -sTCP:LISTEN 2>/dev/null") # Redirect lsof's stderr to /dev/null because it sometimes prints warnings, e.g. if something's wrong with Mac FUSE mounts, that won't affect the output we want. TODO: lsof can hang if ANY programs have files open on stuck remote mounts etc, even if this is nothing to do with TCP connections. -S 2 might help a BIT but it's not a solution. Linux's netstat -tlp needs root, and BSD's can't show PIDs. Might be better to write files or set something in the process name.
if out.startswith("lsof: unsupported"):
# lsof 4.81 has -sTCP:LISTEN but lsof 4.78 does not. However, not including -sTCP:LISTEN can cause lsof to make unnecessary hostname queries for established connections. So fall back only if have to.
out = getoutput("lsof -iTCP:"+str(options.port)+" -Ts 2>/dev/null") # lsof -Ts ensures will say LISTEN on the pid that's listening
lines = filter(lambda x:"LISTEN" in x,out.split("\n")[1:])
elif not out.strip() and not getoutput("which lsof 2>/dev/null"): return False
else: lines = out.split("\n")[1:]
pids = set()
for line in lines:
try: pids.add(int(line.split()[1]))
except:
if not pids:
# sys.stderr.write("stopOther: Can't make sense of lsof output %s\n" % repr(line))
return False # lsof not working, use something else
break
return pids
def run_netstat():
if not 'linux' in sys.platform or not getoutput("which netstat 2>/dev/null"): return False
pids = set()
for l in getoutput("netstat -tnlp").split("\n"):
if ':'+str(options.port)+' ' in l:
ps = l.split()[-1]
if '/' in ps:
pids.add(int(ps[:ps.index('/')]))
return pids
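# Illustration: with --port=8080, a "netstat -tnlp" line such as
#   tcp  0  0 0.0.0.0:8080  0.0.0.0:*  LISTEN  1234/python
# contributes pid 1234 (the part of the final field before the '/').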
#@file: ssl-multiprocess.py
# --------------------------------------------------
# Support for SSL termination in separate processes
# --------------------------------------------------
sslforks_to_monitor = [] # list of [pid,callback1,callback2,port]
sslfork_monitor_pid = None
def sslSetup(HelperStarter, ping_portNo, isFixed=False):
if options.ssl_fork: # queue it to be started by monitor
if options.multicore and sslforks_to_monitor: sslforks_to_monitor[0][1] = (lambda c1=HelperStarter,c2=sslforks_to_monitor[0][1]:(c1(),c2())) # chain it, as in multicore mode we'll have {N cores} * {single process handling all SSL ports}, rather than cores * processes (TODO: if one gets stuck but others on the port can still handle requests, do we want to somehow detect the individual stuck one and restart it to reduce wasted CPU load?)
else:
# no multicore, or this is the first SSL helper, so we need to associate it with a (non-SSL) ping responder
sslforks_to_monitor.append([None,HelperStarter,(lambda *_:listen_on_port(Application([(r"(.*)",AliveResponder,{})],log_function=nullLog),ping_portNo,"127.0.0.1",False)),ping_portNo])
return ping_portNo + 1 # where to put the next listener
else: # just run it on the current process, and we can randomise the internal port and keep track of what it is
if not isFixed: port_randomise[ping_portNo-1] = True
HelperStarter()
return ping_portNo # next listener can use what would have been the ping-responder port as we're not using it
sslFork_pingInterval = 10 # TODO: configurable? (if setting this larger, might want to track the helper threads for early termination)
def maybe_sslfork_monitor():
"Returns SIGTERM callback if we're now a child process"
global sslforks_to_monitor
if not sslforks_to_monitor: return
global sslfork_monitor_pid
pid = os.fork()
if pid:
sslfork_monitor_pid = pid ; return
# If background, can't double-fork (our PID is known)
# (TODO: if profile_forks_too, there's no profile loop in this monitor (it starts only when we fork a new helper); unlikely to be useful here though)
try: os.setpgrp() # for stop_threads0 later
except: pass
signal.signal(signal.SIGTERM, terminateSslForks)
signal.signal(signal.SIGINT, terminateSslForks)
setProcName("adjusterSSLmon")
# (15 chars is max for some "top" implementations)
CrossProcessLogging.initChild("SSL")
# (not SSLmon because helper IDs will be appended to it)
global is_sslHelp ; is_sslHelp = True
for i in xrange(len(sslforks_to_monitor)):
if i==len(sslforks_to_monitor)-1: pid = 0 # don't bother to fork for the last one
else: pid = os.fork()
if pid: sslforks_to_monitor[i][0] = pid # for SIGTERM
else: # child
oldI = i
if i < len(sslforks_to_monitor)-1:
sslforks_to_monitor = [sslforks_to_monitor[i]]
i = 0 # we'll monitor only one in the child
# don't use IOLoop for this monitoring: too confusing if we have to restart it on fork
try: urlopen = build_opener(ProxyHandler({})).open # don't use the system proxy if set
except: pass # leave urlopen as default if above not supported
while True:
try: urlopen(("http://localhost:%d/" % sslforks_to_monitor[i][3]),timeout=sslFork_pingInterval)
except: # URLError etc
if restart_sslfork(i,oldI): # child
return lambda *args:stopServer("SIG*")
else: time.sleep(sslFork_pingInterval) # double it after a restart
time.sleep(sslFork_pingInterval)
def restart_sslfork(n,oldN):
global sslforks_to_monitor
if not sslforks_to_monitor[n][0]==None: # not first time
if options.multicore: oldN = "s"
else: oldN = " "+str(oldN)
logging.error("Restarting SSL helper%s via pid %d as not heard from port %d" % (oldN,sslforks_to_monitor[n][0],sslforks_to_monitor[n][3]))
emergency_zap_pid_and_children(sslforks_to_monitor[n][0]) # may have children if multicore
# TODO: if profile_forks_too, do things with profile?
pid = os.fork()
if pid: sslforks_to_monitor[n][0] = pid
else: # child
setProcName("adjusterSSLhelp")
CrossProcessLogging.initChild(str(n)) # TODO: or port number?
sslforks_to_monitor[n][1]() # main listener
sslforks_to_monitor[n][2]() # 'still alive' listener
sslforks_to_monitor = [] # nothing for us to check
return True
def terminateSslForks(*args):
"sslfork_monitor's SIGTERM handler"
global sslforks_to_monitor
for p,_,_,_ in sslforks_to_monitor:
if p==None: continue
try: os.kill(p,signal.SIGTERM)
except OSError: pass # somebody might have 'killall'd them
try: os.waitpid(p, os.WNOHANG)
except OSError: pass
stop_threads0()
class AliveResponder(RequestHandler):
SUPPORTED_METHODS = ("GET",)
def get(self, *args, **kwargs): self.write("1")
#@file: port-listen.py
# --------------------------------------------------
# Port listening - main, SSL-termination and JS-upstream
# --------------------------------------------------
def open_extra_ports():
"Returns the stop function if we're now a child process that shouldn't run anything else"
nextPort = options.internalPort
# don't add any other ports here: NormalRequestForwarder assumes the real_proxy SSL helper will be at internalPort
# All calls to sslSetup and maybe_sslfork_monitor must be made before ANY other calls to listen_on_port (as we don't yet want there to be an IOLoop instance when maybe_sslfork_monitor is called)
if options.real_proxy: nextPort = sslSetup(lambda port=nextPort:listen_on_port(Application([(r"(.*)",SSLRequestForwarder(),{})],log_function=accessLog,gzip=False),port,"127.0.0.1",False,ssl_options={"certfile":duff_certfile()}),nextPort+1) # A modified Application that's 'aware' it's the SSL-helper version (use SSLRequestForwarder & no need for staticDocs listener) - this will respond to SSL requests that have been CONNECT'd via the first port. We set gzip=False because little point if we know the final client is on localhost.
if options.js_reproxy:
# ditto for js_interpreter (saves having to override its user-agent, or add custom headers requiring PhantomJS 1.5+, for us to detect its connections back to us)
global js_proxy_port
js_proxy_port = []
for c in xrange(cores):
for i in xrange(js_per_core):
# PjsRequestForwarder to be done later
js_proxy_port.append(nextPort)
nextPort = sslSetup(lambda port=nextPort,cc=c,ii=i : listen_on_port(Application([(r"(.*)",PjsSslRequestForwarder(cc*js_per_core,ii),{})],log_function=nullLog,gzip=False),port+1,"127.0.0.1",False,ssl_options={"certfile":duff_certfile()}),nextPort+2)
if upstream_rewrite_ssl:
# This one does NOT listen on SSL: it listens on unencrypted HTTP and rewrites .0 into outgoing SSL. But we can still run it in a different process if ssl_fork is enabled, and this will save encountering the curl_max_clients issue as well as possibly offloading *client*-side SSL to a different CPU core (TODO: could also use Tornado's multiprocessing to multi-core the client-side SSL)
sslSetup(lambda port=upstream_proxy_port+1:listen_on_port(Application([(r"(.*)",UpSslRequestForwarder,{})],log_function=nullLog,gzip=False),port,"127.0.0.1",False),upstream_proxy_port+2,True) # TODO: document upstream_proxy_port+2 needs to be reserved if options.ssl_fork and not options.upstream_proxy_host
r = maybe_sslfork_monitor()
if r: return r
# NOW we can start non-sslSetup listen_on_port:
if options.js_reproxy:
for c in xrange(cores):
for i in xrange(js_per_core):
if options.ssl_fork: pass # do NOT port_randomise, because maybe_sslfork_monitor is called ABOVE and the fork will NOT have a copy of our updated port_randomise map for its forwardToOtherPid call
else: port_randomise[js_proxy_port[c*js_per_core+i]]=True
listen_on_port(makePjsApplication(c*js_per_core,i),js_proxy_port[c*js_per_core+i],"127.0.0.1",False,core=c)
def makeMainApplication():
handlers = [(r"(.*)",NormalRequestForwarder(),{})]
if options.staticDocs: handlers.insert(0,static_handler())
return Application(handlers,log_function=accessLog,gzip=options.compress_responses) # TODO: gzip= deprecated in Tornado 4.x (if they remove it, we may have to check Tornado version and send either gzip= or compress_response= as appropriate, in all calls to Application)
def makePjsApplication(x,y):
handlers = [(r"(.*)",PjsRequestForwarder(x,y),{})]
if options.js_upstream and options.staticDocs: handlers.insert(0,static_handler())
return Application(handlers,log_function=nullLog,gzip=False)
def start_multicore(isSSLEtcChild=False):
"Fork child processes, set coreNo unless isSSLEtcChild; parent waits and exits. Call to this must come after unixfork if want to run in the background."
global coreNo
if not options.multicore:
if not isSSLEtcChild: coreNo = 0
return
# Simplified version of Tornado fork_processes with
# added setupRunAndBrowser (must have the terminal)
children = set()
for i in range(cores):
pid = os.fork()
if not pid: # child
if not isSSLEtcChild: coreNo = i
return CrossProcessLogging.initChild()
children.add(pid)
if not isSSLEtcChild:
# Do the equivalent of setupRunAndBrowser() but without the IOLoop. This can start threads, so must be after the above fork() calls.
if options.browser: runBrowser()
if options.run: runRun()
# Now wait for the browser or the children to exit
# (and monitor for SIGTERM: we might be an SSL helper)
def handleTerm(*_):
global interruptReason
interruptReason = "SIGTERM received by multicore helper"
for pid in children: os.kill(pid,signal.SIGTERM)
signal.signal(signal.SIGTERM,handleTerm)
try:
while children:
try: pid, status = os.wait()
except KeyboardInterrupt: raise # see below
except: continue # interrupted system call OK
if pid in children: children.remove(pid)
except KeyboardInterrupt: pass
if children:
try: reason = interruptReason # from handleTerm
except: reason = "keyboard interrupt"
reason = "Adjuster multicore handler: "+reason+", stopping "+str(len(children))+" child processes"
if options.background: logging.info(reason)
else: sys.stderr.write("\n"+reason+"\n")
for pid in children: os.kill(pid,signal.SIGTERM)
while children:
try: pid, status = os.wait()
except KeyboardInterrupt: logging.error("KeyboardInterrupt received while waiting for child-processes to terminate: "+" ".join(str(s) for s in children))
except: continue
if pid in children: children.remove(pid)
if not isSSLEtcChild: announceShutdown0()
stop_threads() # must be the last thing, except for the following
raise SystemExit # (in case there weren't any threads to stop)
def openPortsEtc():
workaround_raspbian7_IPv6_bug()
workaround_timeWait_problem()
early_fork = (options.ssl_fork and options.background)
if early_fork: banner(True),unixfork()
if options.ssl_fork: initLogging() # even if not early_fork (i.e. not background)
stopFunc = open_extra_ports()
if stopFunc: # we're a child process (--ssl-fork)
assert not options.background or early_fork
# can't double-fork (our PID is known), hence early_fork above
start_multicore(True) ; schedule_retries()
if profile_forks_too: open_profile()
else: # we're not a child process of --ssl-fork
try:
if options.port: listen_on_port(makeMainApplication(),options.port,options.address,options.browser)
open_upnp() # make sure package avail if needed
if not early_fork: banner()
if options.background and not early_fork:
if options.js_interpreter: test_init_webdriver()
unixfork() # MUST be before init_webdrivers (js_interpreter does NOT work if you start them before forking)
if not options.ssl_fork: initLogging() # as we hadn't done it before (must be after unixfork)
init429()
if not options.background: notifyReady()
start_multicore() # if multicore, returns iff we're one of the cores
if not options.multicore or profile_forks_too: open_profile()
else: open_profile_pjsOnly()
if options.js_interpreter: init_webdrivers(coreNo*js_per_core,js_per_core)
if not options.multicore: setupRunAndBrowser() # (equivalent is done by start_multicore if multicore)
checkServer.setup() # (TODO: if we're multicore, can we propagate to other processes ourselves instead of having each core check the fasterServer? Low priority because how often will a multicore box need a fasterServer)
if not coreNo:
CrossProcess429.startThread()
Dynamic_DNS_updater()
if options.pimote: pimote_thread() # must be on same core as Dynamic_DNS_updater so it can set pimote_may_need_override
if options.multicore: stopFunc = lambda *_:stopServer("SIG*")
else: stopFunc = lambda *_:stopServer("SIGTERM received")
if options.seconds: IOLoopInstance().add_timeout(time.time()+options.seconds,lambda *args:stopServer("Uptime limit reached"))
if options.stdio and not coreNo: setup_stdio()
except SystemExit: raise
except: # oops, error during startup, stop forks if any
if not sslfork_monitor_pid == None:
time.sleep(0.5) # (it may have only just started: give it a chance to install its signal handler)
try: os.kill(sslfork_monitor_pid,signal.SIGTERM)
except OSError: pass
raise
signal.signal(signal.SIGTERM, stopFunc)
try: os.setpgrp() # for stop_threads0 later
except: pass
def setup_stdio():
# Handle option for request on standard input
# (when used in one-off mode)
global StdinPass,StdinPending
StdinPass,StdinPending = None,[]
def doStdin(fd,events):
l=os.read(fd,1024) # read 1 line or 1024 bytes (TODO: double-check this can never block)
if not l: # EOF (but don't close stdout yet)
IOLoopInstance().remove_handler(sys.stdin.fileno())
return
global StdinPass
if StdinPending: StdinPending.append(l) # connection is still being established
elif StdinPass: StdinPass.write(l) # open
else: # not yet established
StdinPending.append(l)
StdinPass = tornado.iostream.IOStream(socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0))
def ClearPending(): del StdinPending[:]
def WriteOut(s):
try: sys.stdout.buffer.write(s)
except: sys.stdout.write(s)
doCallback(None,StdinPass.connect,lambda *args:(StdinPass.write(B('').join(StdinPending)),ClearPending(),readUntilClose(StdinPass,lambda last:(WriteOut(last),sys.stdout.close()),WriteOut)),(options.address, port_randomise.get(options.port,options.port)))
IOLoopInstance().add_handler(sys.stdin.fileno(), doStdin, IOLoop.READ)
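# Summary of the flow above (not in the original): bytes read from stdin are
# held in StdinPending until the IOStream has connected to our own listening
# port, then streamed through it; the HTTP response is copied to stdout until
# the remote side closes. This is what lets a wrapper script speak a single
# request over stdio.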
#@file: up-down.py
# --------------------------------------------------
# General startup and shutdown tasks
# --------------------------------------------------
def banner(delayed=False):
ret = [twoline_program_name]
if options.port:
if options.port==-1:
ret.append("Listening on 127.0.0.1:%d" % port_randomise[-1])
if not istty(sys.stdout) and options.background: sys.stdout.write("127.0.0.1:%d" % port_randomise[-1]),sys.stdout.flush()
else: ret.append("Listening on port %d" % options.port)
if upstream_rewrite_ssl: ret.append("--upstream-proxy back-connection helper is listening on 127.0.0.1:%d" % (upstream_proxy_port+1,))
if options.stdio: ret.append("Listening on standard input")
else: ret.append("Not listening (--port=0 set)")
if options.ssl_fork and not options.background: ret.append("To inspect processes, use: pstree "+str(os.getpid()))
ret = "\n".join(ret)+"\n"
if delayed: ret=ret.replace("Listening","Will listen").replace("Writing","Will write") # for --ssl-fork --background, need early fork (TODO: unless write a PID somewhere)
sys.stderr.write(ret),sys.stderr.flush()
if not options.background:
# set window title for foreground running
t = "adjuster"
if "SSH_CONNECTION" in os.environ: t += "@"+hostSuffix() # TODO: might want to use socket.getfqdn() to save confusion if several servers are configured with the same host_suffix and/or host_suffix specifies multiple hosts?
set_title(t)
def istty(f=sys.stderr): return hasattr(f,"isatty") and f.isatty()
def set_title(t):
if not istty(): return
term = os.environ.get("TERM","")
is_xterm = "xterm" in term
is_screen = (term=="screen" and os.environ.get("STY",""))
is_tmux = (term=="screen" and os.environ.get("TMUX",""))
if is_xterm or is_tmux:
sys.stderr.write("\033]0;%s\007" % (t,)),sys.stderr.flush()
# ("0;" sets both title and minimised title, "1;" sets minimised title, "2;" sets title. Tmux takes its pane title from title (but doesn't display it in the titlebar))
elif is_screen: os.system("screen -X title \"%s\"" % (t,))
else: return
if not t: return
import atexit
atexit.register(set_title,"")
global can_do_ansi_colour
can_do_ansi_colour = is_xterm or (is_screen and "VT 100/ANSI" in os.environ.get("TERMCAP",""))
# can_do_ansi_colour is used by showProfile (TODO: if profile_forks_too, we'd need to set this earlier than the call to banner / set_title in order to make it available to SSL forks etc, otherwise only the main process has purple profile output. Multicore is already OK, but it keeps only per-core counts.)
can_do_ansi_colour=False
coreNo = "unknown" # want it to be non-False to begin with
def announceInterrupt():
if coreNo or options.multicore: return # we are a silent helper process (coreNo=="unknown"), or we announce interrupts differently in multicore (see start_multicore), so nothing to do here
if options.background: logging.info("SIGINT received"+find_adjuster_in_traceback())
else: sys.stderr.write("\nKeyboard interrupt"+find_adjuster_in_traceback()+"\n")
def announceShutdown():
if coreNo or options.multicore: return # silent helper process (coreNo=="unknown"), or we announce interrupts differently in multicore (see start_multicore)
announceShutdown0()
def announceShutdown0():
global exitting ; exitting = True # so we're not restarted if options.runWait == 0 and the run process was given the same signal (it can be confusing if we get a restart message from the other thread AFTER shutdown has been announced)
if options.background:
logging.info("Server shutdown")
if options.pidfile: unlink(options.pidfile)
else: sys.stderr.write("Adjuster shutdown\n")
def main():
check_injected_globals()
setProcName() ; readOptions() ; preprocessOptions()
serverControl() ; openPortsEtc() ; startServers()
try: IOLoopInstance().start()
# "There seemed a strangeness in the air,
# Vermilion light on the land's lean face;
# I heard a Voice from I knew not where:
# 'The Great Adjustment is taking place!'" - Thomas Hardy
except KeyboardInterrupt: announceInterrupt()
announceShutdown()
options.pimote = "" # so pimote_thread stops
for v in kept_tempfiles.values(): unlink(v)
stop_threads() # must be last thing
def plural(number): return "" if number==1 else "s"
def stop_threads():
shutdown429()
if quitFuncToCall: quitFuncToCall()
if not sslfork_monitor_pid == None:
try: os.kill(sslfork_monitor_pid,signal.SIGTERM) # this should cause it to propagate that signal to the monitored PIDs
except OSError: pass # somebody might have killall'd it
CrossProcessLogging.shutdown()
writeMsg = not options.background and not coreNo
for t in range(10): # wait for helper_threads first (especially if quitFuncToCall above, as if the terminate routine is too forceful it might prevent the EOF from being sent over the pipe (multiprocessing.Pipe has no flush method after we send the EOF, so quitFuncToCall's returning does NOT mean the eof has actually been sent) and we could get a stuck adjusterWDhelp process)
if t: time.sleep(0.2)
if not helper_threads:
if t: sys.stderr.write("Helper threads have stopped\n")
return
if not t and writeMsg: sys.stderr.write("Waiting up to 2 seconds for helper threads to stop...\n")
ht = [(i,1) for i in sorted(helper_threads)]
i = 0
while i < len(ht)-1:
if ht[i][0] == ht[i+1][0]:
ht[i] = (ht[i][0], ht[i][1]+1)
del ht[i+1]
else: i += 1
for i in xrange(len(ht)):
if ht[i][1]==1: ht[i] = ht[i][0]
else: ht[i] = ht[i][0]+"*"+str(ht[i][1])
msg = "Terminating %d helper thread%s (%s)" % (len(ht),plural(len(ht)),", ".join(ht))
# in case someone needs our port quickly.
# Most likely "runaway" thread is ip_change_command if you did a --restart shortly after the server started.
# TODO it would be nice if the port can be released at the IOLoop.instance.stop, so that it's not necessary to stop the threads
if writeMsg: sys.stderr.write(msg+"\n")
stop_threads0()
def stop_threads0():
signal.signal(signal.SIGTERM, signal.SIG_DFL)
if options.run:
try: os.kill(runningPid,signal.SIGTERM)
except NameError: pass # runningPid not set
except OSError: pass # already exited
os.killpg(os.getpgrp(),signal.SIGTERM)
os.abort() # if the above didn't work, this should
#@file: tornado-setup.py
# --------------------------------------------------
# Basic Tornado-server setup
# --------------------------------------------------
def static_handler():
url,path = options.staticDocs.split('#')
if not url.startswith("/"): url="/"+url
if not url.endswith("/"): url += "/"
class OurStaticFileHandler(StaticFileHandler):
def set_extra_headers(self,path): fixServerHeader(self)
return (url+"(.*)",OurStaticFileHandler,{"path":path,"default_filename":"index.html"})
theServers = {}
port_randomise = {} # port -> _ or port -> mappedPort
def listen_on_port(application,port,address,browser,core="all",**kwargs):
# Don't set backlog=0: it's advisory only and is often rounded up to 8; we use CrossProcess429 instead
if port in port_randomise:
s = bind_sockets(0,"127.0.0.1")
# should get len(s)==1 if address=="127.0.0.1" (may get more than one socket, with different ports, if address maps to some mixed IPv4/IPv6 configuration)
port_randomise[port] = s[0].getsockname()[1]
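# (bind_sockets(0,...) asked the OS for any free ephemeral port; record the
# number actually assigned so forwarders can reach this listener later)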
else:
for portTry in [5,4,3,2,1,0]:
try: s = bind_sockets(port,address)
except socket.error as e:
if is_sslHelp:
# We had better not time.sleep() here trying
# to open, especially not if multicore: don't
# want to hold up the OTHER ports being opened
# and get into an infinite-restart loop when
# MOST services are already running:
f = lambda *_:IOLoopInstance().add_timeout(time.time()+1,lambda *args:listen_on_port(application,port,address,browser,core,**kwargs))
if is_sslHelp=="started": f()
else: sslRetries.append(f)
logging.info("Can't open port "+repr(port)+", retry scheduled")
return
if not "already in use" in e.strerror: raise
# Maybe the previous server is taking a while to stop
if portTry:
time.sleep(0.5) ; continue
# tried 6 times over 3 seconds, can't open the port
if browser:
# there's probably another adjuster instance, in which case we probably want to let the browser open a new window and let our listen() fail
runBrowser()
raise Exception("Can't open port "+repr(port)+" (tried for 3 seconds, "+e.strerror+")")
i = len(theServers.setdefault(core,[])) ; c = core
class ServerStarter: # don't construct HTTPServer before fork
def start(self):
h = HTTPServer(application,**kwargs)
h.add_sockets(s)
if port==options.port:
global mainServer ; mainServer = h
theServers[c][i]=(port,h) ; h.start()
theServers[core].append((port,ServerStarter()))
is_sslHelp = False ; sslRetries = []
def schedule_retries():
global is_sslHelp,sslRetries
is_sslHelp = "started"
for s in sslRetries: s()
sslRetries = []
def IOLoopInstance():
global ioLoopInstance
try: return ioLoopInstance
except: # better call this from the main thread first:
if hasattr(IOLoop,"current"): ioLoopInstance = IOLoop.current() # for Tornado 5+ to work
else: ioLoopInstance = IOLoop.instance() # in Tornado 4 and older, this can be called on-demand from any thread, but we're putting it in a global for forward-compatibility with the above
return ioLoopInstance
def startServers():
workaround_tornado_fd_issue()
for core,sList in list(theServers.items()):
if core == "all" or core == coreNo:
for port,s in sList: s.start()
#@file: overload.py
# --------------------------------------------------
# Multicore: pause/restart when a core is overloaded
# --------------------------------------------------
mainServerPaused = mainServerReallyPaused = False
def pauseOrRestartMainServer(shouldRun=True):
if not (options.multicore and options.js_429): return
global mainServerPaused
if (not shouldRun) == mainServerPaused: return
# if shouldRun: return # uncomment this 'once closed, stay closed' line to demonstrate the OS forwards to open cores only
reallyPauseOrRestartMainServer(shouldRun)
mainServerPaused = not mainServerPaused
debuglog("Paused=%s on core %s" % (repr(mainServerPaused),repr(coreNo)))
CrossProcess429.q.put((coreNo,mainServerPaused))
def reallyPauseOrRestartMainServer(shouldRun):
global mainServerReallyPaused
if shouldRun == "IfNotPaused": # called by CrossProcess429 to re-pause if and only if hasn't been reopened by the outer level in the meantime
shouldRun = mainServerPaused
if (not shouldRun) == mainServerReallyPaused: return
for core,sList in theServers.items():
if not (core == "all" or core == coreNo): continue
for port,s in sList:
if not port==options.port: continue
if not hasattr(s,"_sockets"):
logging.error("Cannot pause server: wrong Tornado version?")
return
if shouldRun: s.add_sockets(s._sockets.values())
else:
for fd, sock in s._sockets.items():
if hasattr(s,"io_loop"): s.io_loop.remove_handler(fd) # Tornado 4
else: IOLoopInstance().remove_handler(fd) # Tornado 5, not tested (TODO)
mainServerReallyPaused = not mainServerReallyPaused
debuglog("reallyPaused=%s on core %s" % (repr(mainServerReallyPaused),repr(coreNo)))
#@file: workarounds.py
# --------------------------------------------------
# Miscellaneous bug workarounds
# --------------------------------------------------
def workaround_raspbian7_IPv6_bug():
"""Old Debian 7 based versions of Raspbian can boot with IPv6 enabled but later fail to configure it, hence tornado/netutil.py's AI_ADDRCONFIG flag is ineffective and socket.socket raises "Address family not supported by protocol" when it tries to listen on IPv6. If that happens, we'll need to set address="0.0.0.0" for IPv4 only. However, if we tried IPv6 and got the error, then at that point Tornado's bind_sockets will likely have ALREADY bound an IPv4 socket but not returned it; the socket does NOT get closed on dealloc, so a retry would get "Address already in use" unless we quit and re-run the application (or somehow try to figure out the socket number so it can be closed). Instead of that, let's try to detect the situation in advance so we can set options.address to IPv4-only the first time."""
if options.address: return # don't need to do this if we're listening on a specific address
flags = socket.AI_PASSIVE
if hasattr(socket, "AI_ADDRCONFIG"): flags |= socket.AI_ADDRCONFIG
for af,socktype,proto,r1,r2 in socket.getaddrinfo(None,options.port,socket.AF_UNSPEC,socket.SOCK_STREAM,0,flags):
try: socket.socket(af,socktype,proto)
except socket.error as e:
if "family not supported" in e.strerror:
options.address = "0.0.0.0" # use IPv4 only
return
def workaround_timeWait_problem():
"""Work around listen-port failing to bind when there are still TIME_WAIT connections from the previous run. This at least seems to work around the problem MOST of the time."""
global bind_sockets
bind_sockets = tornado.netutil.bind_sockets
if "win" in sys.platform and not sys.platform=="darwin":
# Don't do this on MS-Windows. It can result in
# 'stealing' a port from another server even while
# that other server is still running.
return
if not hasattr(socket, "SO_REUSEPORT"): return
if getargspec==None: return
if not 'reuse_port' in getargspec(tornado.netutil.bind_sockets).args: return # Tornado version too old
def bind_sockets(*args,**kwargs):
if not args[0]: pass # we're doing port_randomise
elif len(args) < 6: kwargs['reuse_port'] = True
else: args=tuple(args[:5])+(True,)+tuple(args[6:]) # reuse_port is the 6th positional parameter
return tornado.netutil.bind_sockets(*args,**kwargs)
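# Illustrative effect, assuming a reuse_port-capable Tornado: bind_sockets(8080,"")
# now behaves like tornado.netutil.bind_sockets(8080,"",reuse_port=True), so a
# quick restart can rebind even while old connections sit in TIME_WAIT.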
def workaround_tornado_fd_issue(): # TODO: is this still needed post-v0.3 now we fixed start-order bug?
if not hasattr(IOLoopInstance(),'handle_callback_exception'):
return # Tornado 6 doesn't have this, let's hope it's not needed
cxFunc = IOLoopInstance().handle_callback_exception
def newCx(callback):
if callback: return cxFunc(callback)
# self._handlers[fd] raised KeyError. This means
# we don't want to keep being told about the fd.
fr = sys.exc_info()[2]
while fr.tb_next: fr = fr.tb_next
fd = fr.tb_frame.f_locals.get("fd",None)
if not fd: return cxFunc("callback="+repr(callback)+" and newCx couldn't get fd from stack")
logging.info("IOLoop has no handler left for fd "+repr(fd)+" but is still getting events from it. Attempting low-level close to avoid loop.")
try: IOLoopInstance().remove_handler(fd)
except: pass
try: os.close(fd)
except: pass
IOLoopInstance().handle_callback_exception = newCx
def check_LXML():
# Might not find ALL problems with lxml installations, but at least we can check some basics
global etree
try:
from lxml import etree
return etree.HTMLParser(target=None) # works on lxml 2.3.2
except ImportError: sys.stderr.write("LXML library not found - ignoring useLXML option\n")
except TypeError: sys.stderr.write("LXML library too old - ignoring useLXML option\n") # no target= option in 1.x
options.useLXML = False
#@file: unix.py
# --------------------------------------------------
# More setup: Unix forking etc
# --------------------------------------------------
def unixfork():
if os.fork(): sys.exit()
os.setsid()
if os.fork(): sys.exit()
devnull = os.open("/dev/null", os.O_RDWR)
for fd in range(3): os.dup2(devnull,fd) # commenting out this loop will let you see stderr after the fork (TODO debug option?)
if options.pidfile:
try: open(options.pidfile,"w").write(str(os.getpid()))
except: pass
def notifyReady():
try: import sdnotify # sudo pip install sdnotify
except ImportError: return
sdnotify.SystemdNotifier().notify("READY=1") # we send READY=1 so you can do an adjuster.service (w/out --background) with Type=notify and ExecStart=/usr/bin/python /path/to/adjuster.py --config=...
#@file: curl-setup.py
# --------------------------------------------------
# cURL client setup
# --------------------------------------------------
def MyAsyncHTTPClient(): return AsyncHTTPClient()
def curlFinished(): pass
def setupCurl(maxCurls,error=None):
global pycurl
try:
import pycurl # check it's there
curl_async = pycurl.version_info()[4] & (1 << 7) # CURL_VERSION_ASYNCHDNS
if not curl_async: curl_async = ('c-ares' in pycurl.version or 'threaded' in pycurl.version) # older
if not curl_async:
if error: warn("The libcurl on this system might hold up our main thread while it resolves DNS (try building curl with ./configure --enable-ares)")
else:
del pycurl ; return # TODO: and say 'not using'?
if float('.'.join(pycurl.version.split()[0].split('/')[1].rsplit('.')[:2])) < 7.5:
if error: warn("The curl on this system is old and might hang when fetching certain SSL sites") # strace -p (myPID) shows busy looping on poll (TODO: option to not use it if we're not using upstream_proxy)
else:
del pycurl ; return # TODO: as above
_oldCurl = pycurl.Curl
def _newCurl(*args,**kwargs):
c = _oldCurl(*args,**kwargs)
so = c.setopt
def mySetopt(k,v):
so(k,v)
if k==pycurl.PROXY: so(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_0) # workaround 599 "transfer closed with outstanding read data remaining" in Curl 7.55.1 with polipo2 as upstream proxy (TODO: curl-version dependent? 7.43.0 seemed OK in this aspect, although it had the above problem)
c.setopt = mySetopt
return c
pycurl.Curl = _newCurl
curl_max_clients = min(max(maxCurls,10),1000) # constrain curl_max_clients to between 10 and 1000 to work around Tornado issue 2127, and we'll warn about the issue ourselves if we go over:
curl_inUse_clients = 0
try: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient",max_clients=curl_max_clients)
except: AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") # will try in MyAsyncHTTPClient too (different versions of Tornado and all that...) (TODO: if that one also falls back to no max_clients, we might be reduced to 10 and should set curl_max_clients accordingly in order to get appropriate warning messages)
def MyAsyncHTTPClient():
try: problem = not len(AsyncHTTPClient()._free_list)
except:
global curl_inUse_clients
curl_inUse_clients += 1
problem = curl_inUse_clients >= curl_max_clients
if problem:
if upstream_rewrite_ssl and not options.ssl_fork: logging.error("curl_max_clients too low; AsyncHTTPClient will queue requests and COULD DEADLOCK due to upstream_rewrite_ssl (try --ssl-fork if you can't increase curl_max_clients)")
else: logging.info("curl_max_clients too low; AsyncHTTPClient will queue requests")
try: return AsyncHTTPClient(max_clients=curl_max_clients)
except: return AsyncHTTPClient()
def curlFinished(): # for callbacks to call
global curl_inUse_clients
curl_inUse_clients -= 1
if curl_inUse_clients < 0:
# This shouldn't happen. But if it does, don't let the effect 'run away'.
curl_inUse_clients = 0
except: # fall back to the pure-Python one
if error: errExit(error) # (unless it won't do)
try:
import zlib
enable_gzip = True # for fetching remote sites
except: # Windows?
enable_gzip = False
class zlib:
def compress(self,s,level): return s
def decompressobj(self): # needs self: it's called on the zlib instance below
class o:
def decompress(self,s,maxlen): return s
return o()
zlib = zlib()
#@file: message-user.py
# --------------------------------------------------
# Support for showing messages to specific IP addresses
# --------------------------------------------------
try:
import hashlib # Python 2.5+, platforms?
hashlib.md5
except: hashlib = None # (TODO: does this ever happen on a platform that supports Tornado? Cygwin has hashlib with md5)
if hashlib: cookieHash = lambda msg: base64.b64encode(hashlib.md5(B(msg)).digest())[:10]
else: cookieHash = lambda msg: hex(hash(msg))[2:] # this fallback is not portable across different Python versions etc, so no good if you're running a fasterServer
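# Worked example (computed, not from the original): md5(b"hello").digest() is
# 5d41402a...c592, whose base64 starts "XUFAKrxLKn", so cookieHash("hello")
# == b"XUFAKrxLKn" on Python 3 (bytes, because b64encode returns bytes)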
ipv4_regexp = re.compile(r'^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$')
def ipv4_to_int(ip):
m = re.match(ipv4_regexp,S(ip))
if m: return (int(m.group(1))<<24) | (int(m.group(2))<<16) | (int(m.group(3))<<8) | int(m.group(4))
else: return None
def ipv4range_to_ints(ip):
ip = S(ip)
if '-' in ip: return tuple(ipv4_to_int(i) for i in ip.split('-'))
elif '/' in ip:
start,bits = ip.split('/')
start = ipv4_to_int(start)
return start, start | ~(-1 << (32-int(bits)))
else: return ipv4_to_int(ip),ipv4_to_int(ip)
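# Worked examples: ipv4_to_int("10.0.0.1") == 0x0A000001;
# ipv4range_to_ints("10.0.0.0/24") == (0x0A000000, 0x0A0000FF), since
# ~(-1 << (32-24)) == 0xFF; a single "10.0.0.1" maps to (x, x)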
def ipv4ranges_func(ipRanges_and_results):
isIP = True ; rangeList=None ; fList = []
for field in S(ipRanges_and_results).split('|'):
if isIP: rangeList = [ipv4range_to_ints(i) for i in field.split(',')]
else: fList.append((rangeList,field))
isIP = not isIP
def f(ip):
ipInt = ipv4_to_int(ip)
for rl,result in fList:
if any((l<=ipInt<=h) for l,h in rl):
return result # else None
return f
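# Hypothetical usage:
#   f = ipv4ranges_func("10.0.0.0/8,192.168.0.0/16|lan|0.0.0.0/0|other")
#   f("10.1.2.3") == "lan"; f("8.8.8.8") == "other"; first matching range wins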
#@file: connect-ssl.py
# --------------------------------------------------
# Service routines for CONNECT passing to SSL terminator
# --------------------------------------------------
debug_connections = False
def myRepr(d):
if re.search(B("[\x00-\x09\x0e-\x1f]"),B(d)): return "%d bytes" % len(d)
elif len(d) >= 512: return repr(d[:500])+"..." # (keep the ellipsis outside repr: d may be bytes)
else: return repr(d)
def peerName(socket):
try: return socket.getpeername()
except: return "(no socket??)"
def writeAndClose(stream,data):
if data:
if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket)+" and closing it")
try: stream.write(data,lambda *args:True)
except: pass # ignore errors like client disconnected
if not stream.closed():
try: stream.close()
except: pass
def writeOrError(opposite,name,stream,data):
if debug_connections: print ("Writing "+myRepr(data)+" to "+peerName(stream.socket))
try: stream.write(data)
except:
if name and not hasattr(stream,"writeOrError_already_complained"): logging.error("Error writing data to "+name)
stream.writeOrError_already_complained = True
try: stream.close()
except: pass
try: opposite.close() # (try to close the server stream we're reading if the client has gone away, and vice versa)
except: pass
#@file: misc.py
# --------------------------------------------------
# Miscellaneous variables
# --------------------------------------------------
cookieExpires = "Tue Jan 19 03:14:07 2038" # TODO: S2G (may have to switch to Max-Age and drop support for ~IE8)
set_window_onerror = False # for debugging Javascript on some mobile browsers (TODO make this a config option? but will have to check which browsers do and don't support window.onerror)
# Domain-setting cookie for when we have no wildcard_dns and no default_site:
adjust_domain_cookieName = "_adjusterDN_"
adjust_domain_none = B("0") # not a valid top-level domain (TODO hopefully no user wants this as a local domain...)
enable_adjustDomainCookieName_URL_override = True # TODO: document this! (Allow &_adjusterDN_=0 or &_adjusterDN_=wherever in bookmark URLs, so it doesn't matter what setting the cookie has when the bookmark is activated)
seen_ipMessage_cookieName = "_adjusterIPM_"
htmlmode_cookie_name = "_adjustZJCG_" # zap JS, CSS and Graphics
password_cookie_name = "_pxyAxsP_" # "proxy access password". have to pick something that's unlikely to collide with a site's cookie
webdriver_click_code = "._adjustPJSC_"
redirectFiles_Extensions=set("pdf epub mp3 aac zip gif png jpeg jpg exe tar tgz tbz ttf woff swf txt doc rtf midi mid wav ly c h py".split()) # TODO: make this list configurable + maybe add a "minimum content length before it's worth re-directing" option
#@file: js-webdriver.py
# --------------------------------------------------
# Server-side Javascript execution support
# --------------------------------------------------
class WebdriverWrapper:
"Wrapper for webdriver that might or might not be in a separate process without shared memory"
def __init__(self): self.theWebDriver = self.tempDirToDelete = None
def new(self,*args):
try:
# No coredump, for emergency_zap_pid_and_children
import resource ; resource.setrlimit(resource.RLIMIT_CORE,(0,0))
except: pass # oh well, have coredumps then :-(
if options.js_multiprocess:
# we have a whole process to ourselves (not just a thread)
# so we can set the environment here.
# Selenium doesn't always clean up temporary files on exit
# (especially with Firefox), so let's set TMPDIR uniquely so we
# can clean them ourselves.
tmp = os.environ.get("TMPDIR",None)
self.tempDirToDelete=os.environ['TMPDIR']=os.environ.get("TMPDIR","/tmp")+"/"+str(os.getpid())+"."+str(args[0])
try: os.mkdir(self.tempDirToDelete)
except: pass
else: tmp = self.tempDirToDelete = None
self.theWebDriver = get_new_webdriver(*args)
if tmp: os.environ["TMPDIR"] = tmp
elif options.js_multiprocess: del os.environ["TMPDIR"]
def getTmp(self,*args): return self.tempDirToDelete
def quit(self,*args):
if not self.theWebDriver: return
try: pid = self.theWebDriver.service.process.pid
except: pid = debuglog("WebdriverWrapper: Unable to get self.theWebDriver.service.process.pid")
try: self.theWebDriver.quit()
except: debuglog("WebdriverWrapper: exception on quit") # e.g. sometimes get 'bad fd' in selenium's send_remote_shutdown_command _cookie_temp_file_handle
# Try zapping the process ourselves anyway (even if theWebDriver.quit DIDN'T return error: seems it's sometimes still left around. TODO: this could have unexpected consequences if the system's pid-reuse rate is excessively high.)
self.theWebDriver = None
emergency_zap_pid_and_children(pid)
if self.tempDirToDelete: shutil.rmtree(self.tempDirToDelete,True)
def current_url(self):
try: return self.theWebDriver.current_url
except: return "" # PhantomJS Issue #13114: unconditional reload for now
def get(self,url):
self.theWebDriver.get(S(url))
if options.logDebug:
try:
for e in self.theWebDriver.get_log('browser'):
print ("webdriver log: "+e['message'])
except Exception as e: print ("webdriver get_log exception: "+repr(e))
def execute_script(self,script): self.theWebDriver.execute_script(S(script))
def click_id(self,clickElementID): self.theWebDriver.find_element_by_id(S(clickElementID)).click()
def click_xpath(self,xpath): self.theWebDriver.find_element_by_xpath(S(xpath)).click()
def click_linkText(self,clickLinkText): self.theWebDriver.find_element_by_link_text(S(clickLinkText)).click()
def getu8(self):
def f(switchBack):
src = self.theWebDriver.find_element_by_xpath("//*").get_attribute("outerHTML")
if options.js_frames:
for el in ['frame','iframe']:
for frame in self.theWebDriver.find_elements_by_tag_name(el):
self.theWebDriver.switch_to.frame(frame)
src += f(switchBack+[frame])
self.theWebDriver.switch_to.default_content()
for fr in switchBack: self.theWebDriver.switch_to.frame(fr)
return src
return B(f([]))
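# i.e. getu8 returns the serialised top document followed by the serialised
# HTML of every frame/iframe, recursing into nested frames and re-selecting
# the parent chain (switchBack) after each descent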
def getpng(self):
if options.js_interpreter=="HeadlessChrome": # resize not needed for PhantomJS (but PhantomJS is worse at font configuration and is no longer maintained)
self.theWebDriver.set_window_size(js_size[0],min(16000,intor0(self.theWebDriver.execute_script("return document.body.parentNode.scrollHeight")))) # TODO: check the 16000: what is Selenium's limit? (confirmed over 8000)
time.sleep(1)
png = self.theWebDriver.get_screenshot_as_png()
if options.js_interpreter=="HeadlessChrome": self.theWebDriver.set_window_size(*js_size)
try: # can we optimise the screenshot image size?
from PIL import Image
s = BytesIO() ; Image.open(BytesIO(png)).save(s,'png',optimize=True) # (BytesIO: the screenshot is bytes)
png = s.getvalue()
except: pass # just return non-optimized
return png
def getWebdriverWrapper():
if options.js_interpreter=="edbrowse": return EdbrowseWrapper()
else: return WebdriverWrapper()
class EdbrowseWrapper:
"Experimental wrapper for edbrowse that behaves like WebdriverWrapper"
def __init__(self): self.tDir,self.url,self.out = None,"about:blank",b""
def new(self,index,*args): self.tDir,self.edEnv = setup_edbrowse(index)
def getTmp(self,*args): return self.tDir
def quit(self,*args):
if self.tDir: shutil.rmtree(self.tDir,True)
def current_url(self): return self.url
def get(self,url):
self.url = url
out = subprocess.Popen(["edbrowse","-e"],-1,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE,env=self.edEnv).communicate(b"b %s\njdb\ndocument.documentElement.outerHTML\n" % B(url))[0]
try: self.out = re.search(b"\\.browse\n(.*)EOF\\s*$",out,flags=re.DOTALL).group(1)
except: self.out = "Error: "+repr(out)
def execute_script(self,script): return "0" # typically used to get window dimensions etc, not needed for edbrowse
def click_id(self,clickElementID): pass # shouldn't be called if js_links off
def click_xpath(self,xpath): pass
def click_linkText(self,clickLinkText): pass
def getu8(self): return self.out
def getpng(self): return b"" # screenshots don't make sense on edbrowse
def check_edbrowse():
tDir,edEnv = setup_edbrowse()
try: out,err = subprocess.Popen(["edbrowse","-v"],-1,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE,env=edEnv).communicate(b"")
except OSError: errExit("Could not run edbrowse")
shutil.rmtree(tDir)
try: a,b,c = S(out).split('.')
except: errExit("Could not parse format of edbrowse version number "+repr(out))
if (intor0(a),intor0(b),intor0(c)) < (3,7,5): errExit("edbrowse too old: at least 3.7.5 is required")
def setup_edbrowse(index=None):
tDir = getoutput("(TMPDIR=/dev/shm mktemp -d -t ed || mktemp -d -t ed) 2>/dev/null")
edEnv=os.environ.copy()
edEnv["TMPDIR"]=edEnv["HOME"]=tDir
ebrc = "downdir="+tDir+"\n"
ebrc += "jar="+tDir+os.sep+"cookies\n"
ebrc += "cachedir="+tDir+"\ncachesize=0\n" # (dir might not be needed if size=0)
if options.js_UA and not options.js_UA.startswith("*"): ebrc += "agent="+options.js_UA+"\n"
if options.js_reproxy and not index==None: ebrc += "proxy=* * 127.0.0.1:%d\n" % proxyPort(index)
elif options.upstream_proxy: ebrc += "proxy=* * "+options.upstream_proxy+"\n"
open(tDir+os.sep+".ebrc","w").write(ebrc)
return tDir,edEnv
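# A generated .ebrc might look like this (hypothetical paths and port):
#   downdir=/dev/shm/ed.1234
#   jar=/dev/shm/ed.1234/cookies
#   cachedir=/dev/shm/ed.1234
#   cachesize=0
#   proxy=* * 127.0.0.1:5001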
def emergency_zap_pid_and_children(pid):
if not pid: return
try:
for c in psutil.Process(pid).children(recursive=True):
try: c.kill(9)
except: pass
except: pass # no psutil, or process already gone
try: os.kill(pid,9),os.waitpid(pid, 0) # waitpid is necessary to clear it from the process table, but we should NOT use os.WNOHANG, as if we do, there's a race condition with the os.kill taking effect (even -9 isn't instant)
except OSError: pass # maybe pid already gone
try: from selenium.common.exceptions import TimeoutException
except: # no Selenium or wrong version
class TimeoutException(Exception): pass # placeholder
class SeriousTimeoutException(Exception): pass
def webdriverWrapper_receiver(pipe,timeoutLock):
"Command receiver for WebdriverWrapper for when it's running over IPC (--js-multiprocess). Receives (command,args) and sends (return,exception), releasing the timeoutLock whenever it's ready to return."
setProcName("adjusterWDhelp")
CrossProcessLogging.initChild()
try: w = getWebdriverWrapper()
except KeyboardInterrupt: return
while True:
try: cmd,args = pipe.recv()
except KeyboardInterrupt: # all shutting down
try: w.quit()
except: pass
try: timeoutLock.release()
except ValueError: pass
pipe.send(("INT","INT"))
return pipe.close()
if cmd=="EOF": return pipe.close()
try: ret,exc = getattr(w,cmd)(*args), None
except Exception as e:
p = find_adjuster_in_traceback()
if p: # see if we can add it to the message (note p will start with ", " so no need to add a space before it)
try:
if hasattr(e,"msg") and e.msg: e.msg += p # should work with WebDriverException
elif type(e.args[0])==str: e.args=(repr(e.args[0])+p,) + tuple(e.args[1:]) # should work with things like httplib.BadStatusLine that are fussy about the number of arguments they get
else: e.args += (p,) # works with things like KeyError (although so should the above)
except: e.message += p # works with base Exception
ret,exc = None,e
try: timeoutLock.release()
except: pass # (may fail if controller's timeoutLock is turned off during quit_wd_atexit)
try: pipe.send((ret,exc))
except: pass # if they closed it, we'll get EOFError on next iteration
class WebdriverWrapperController:
"Proxy for WebdriverWrapper if it's running over IPC"
def __init__(self):
self.pipe, cPipe = multiprocessing.Pipe()
self.timeoutLock = multiprocessing.Lock()
self.process = multiprocessing.Process(target=webdriverWrapper_receiver,args=(cPipe,self.timeoutLock))
self.process.start()
def send(self,cmd,args=()):
"Send a command to a WebdriverWrapper over IPC, and either return its result or raise its exception in this process. Also handle the raising of SeriousTimeoutException if needed, in which case the WebdriverWrapper should be stopped."
try:
if not self.timeoutLock.acquire(timeout=0):
logging.error("REALLY serious SeriousTimeout (should never happen). Lock unavailable before sending command.")
raise SeriousTimeoutException()
except AttributeError: pass # self.timeoutLock==None because quit(final=True) called from another thread
try: self.pipe.send((cmd,args))
except IOError: return # already closed
if cmd=="EOF":
return self.pipe.close() # no return code
try:
if not self.timeoutLock.acquire(timeout=options.js_timeout2): # fallback in case Selenium timeout doesn't catch it (signal.alarm in the child process isn't guaranteed to help, so catch it here)
try: logging.error("SeriousTimeout: WebdriverWrapper process took over "+str(options.js_timeout2)+"s to respond to "+repr((cmd,args))+". Emergency restarting this process.")
except: pass # absolutely do not throw anything except SeriousTimeoutException from this branch
raise SeriousTimeoutException()
self.timeoutLock.release()
except AttributeError: return # self.timeoutLock==None because quit(final=True) called from another thread
ret,exc = self.pipe.recv()
if ret==exc=="INT": return self.pipe.close()
if exc: raise exc
else: return ret
def new(self,*args):
self.send("new",args)
self.tempDirToDelete=self.send("getTmp")
def quit(self,final=False):
if final: self.timeoutLock = None # quit_wd_atexit could plausibly run while another thread's still processing its last command, so allow these commands to be queued in the pipe from another thread without worrying about timeout when that happens
self.send("quit")
if final: self.send("EOF")
def current_url(self): return self.send("current_url")
def get(self,url): return self.send("get",(url,))
def execute_script(self,script): self.send("execute_script",(script,))
def click_id(self,clickElementID): self.send("click_id",(clickElementID,))
def click_xpath(self,xpath): self.send("click_xpath",(xpath,))
def click_linkText(self,clickLinkText): self.send("click_linkText",(clickLinkText,))
def getu8(self): return self.send("getu8")
def getpng(self): return self.send("getpng")
try:
os.fork # exception if e.g. Windows
import multiprocessing # Python 2.6
if hasattr(multiprocessing,'set_start_method'): multiprocessing.set_start_method('fork')
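# ('fork' is requested explicitly because Python 3.8+ defaults to 'spawn' on
# macOS, which would re-import this module in the child rather than inherit
# the state these wrapper objects rely on)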
except: multiprocessing = None
class WebdriverRunner:
"Manage a WebdriverWrapperController (or a WebdriverWrapper if we're not using IPC) from a thread of the main process"
def __init__(self,start=0,index=0):
self.start,self.index = start,index
if options.js_multiprocess:
self.wrapper = WebdriverWrapperController()
else: self.wrapper = getWebdriverWrapper()
self.renew_webdriver_newThread(True) # sets wd_threadStart
def renew_controller(self): # SeriousTimeoutException
emergency_zap_pid_and_children(self.wrapper.process.pid)
shutil.rmtree(self.wrapper.tempDirToDelete,True)
self.wrapper = WebdriverWrapperController()
def renew_webdriver_sameThread(self,firstTime=False):
self.usageCount = 0 ; self.maybe_stuck = False
while True:
try:
self.wrapper.quit(),self.wrapper.new(self.start+self.index,not firstTime)
break
except SeriousTimeoutException: # already logged
self.renew_controller()
except:
logging.error("Exception "+exc_logStr()+" while renewing webdriver, retrying")
time.sleep(1) # just in case
self.usageCount = 0 ; self.maybe_stuck = False
def renew_webdriver_newThread(self,firstTime=False):
self.wd_threadStart = time.time() # cleared in _renew_wd after renew_webdriver_sameThread returns (it loops on exception)
threading.Thread(target=_renew_wd,args=(self,firstTime)).start() ; return
def quit_webdriver(self): self.wrapper.quit(final=True)
def fetch(self,url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,tooLate):
assert not self.wd_threadStart # webdriver_checkServe
self.wd_threadStart = time.time() # cleared in wd_fetch after _wd_fetch returns or throws + possible renew-loop (TODO: if wd_fetch ITSELF somehow throws an exception, should be logged but this JS instance gets tied up until next adjuster restart)
self.maybe_stuck = False
threading.Thread(target=wd_fetch,args=(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,self,tooLate)).start()
def current_url(self): return self.wrapper.current_url()
def get(self,url): return self.wrapper.get(url)
def execute_script(self,script): self.wrapper.execute_script(script)
def click_id(self,clickElementID): self.wrapper.click_id(clickElementID)
def click_xpath(self,xpath): self.wrapper.click_xpath(xpath)
def click_linkText(self,clickLinkText): self.wrapper.click_linkText(clickLinkText)
def getu8(self): return self.wrapper.getu8()
def getpng(self): return self.wrapper.getpng()
def _renew_wd(wd,firstTime):
wd.renew_webdriver_sameThread(firstTime)
wd.wd_threadStart = False
IOLoopInstance().add_callback(webdriver_checkServe)
def find_adjuster_in_traceback():
ei = sys.exc_info()
try:
p = ei[1].args[-1]
if "adjuster line" in p: return p # for webdriverWrapper_receiver
except: pass
try: __file__
except: return "" # sometimes not defined ??
l = traceback.extract_tb(ei[2])
for i in xrange(len(l)-1,-1,-1):
if __file__ in l[i][0]: return ", adjuster line "+str(l[i][1])
return ""
def wd_fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,manager,tooLate):
url = S(url)
helper_threads.append('wd_fetch')
need_restart = False
def errHandle(error,extraMsg,prefetched):
if not options.js_fallback: prefetched=None
if prefetched: toRet = "non-webdriver page (js_fallback set)"
else:
toRet = "error"
prefetched = wrapResponse("webdriver "+error)
logging.error(extraMsg+" returning "+toRet)
if options.js_fallback:
try:
prefetched.headers.add(options.js_fallback,error)
except: logging.error("Could not add "+repr(options.js_fallback)+" to error response")
return prefetched
try:
r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
try:
if options.js_fallback: r.headers.add(options.js_fallback,"OK")
except: pass
except TimeoutException:
r = errHandle("timeout","webdriver "+str(manager.start+manager.index)+" timeout fetching "+url+find_adjuster_in_traceback()+"; no partial result, so",prefetched) # "webdriver timeout" sent to browser (can't include url here: domain gets rewritten)
except SeriousTimeoutException:
r = errHandle("serious timeout","lost communication with webdriver "+str(manager.start+manager.index)+" when fetching "+url+"; no partial result, so",prefetched)
need_restart = "serious"
except:
if options.js_retry and not tooLate():
logging.info("webdriver error fetching "+url+" ("+exc_logStr()+"); restarting webdriver "+str(manager.start+manager.index)+" for retry") # usually a BadStatusLine
manager.renew_webdriver_sameThread()
if tooLate(): r = errHandle("err","too late, so",prefetched)
else:
try:
r = _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot)
try:
if options.js_fallback: r.headers.add(options.js_fallback,"OK")
except: pass
except SeriousTimeoutException:
r = errHandle("serious timeout","webdriver serious timeout on "+url+" after restart, so re-restarting and",prefetched)
need_restart = "serious"
except:
r = errHandle("error","webdriver error on "+url+" even after restart, so re-restarting and",prefetched)
need_restart = True
else: # no retry
r = errHandle("error","webdriver error on "+url+", so restarting and",prefetched)
need_restart = True
IOLoopInstance().add_callback(lambda *args:callback(r))
manager.usageCount += 1
if need_restart or (options.js_restartAfter and manager.usageCount >= options.js_restartAfter):
if need_restart=="serious":manager.renew_controller()
manager.renew_webdriver_sameThread()
else: manager.finishTime = time.time()
manager.wd_threadStart = manager.maybe_stuck = False
IOLoopInstance().add_callback(webdriver_checkServe)
helper_threads.remove('wd_fetch')
def exc_logStr():
toLog = sys.exc_info()[:2]
if hasattr(toLog[1],"msg") and toLog[1].msg: toLog=(toLog[0],toLog[1].msg) # for WebDriverException
return repr(toLog)+find_adjuster_in_traceback()
def _wd_fetch(manager,url,prefetched,clickElementID,clickLinkText,asScreenshot): # single-user only! (and relies on being called only in htmlOnlyMode so leftover Javascript is removed and doesn't double-execute on JS-enabled browsers)
import tornado.httputil ; url = S(url)
currentUrl = S(manager.current_url())
timed_out = False
if prefetched or not re.sub('#.*','',currentUrl) == url:
if prefetched:
debuglog("webdriver %d get about:blank" % (manager.start+manager.index))
manager.get("about:blank") # ensure no race condition with current page's XMLHttpRequests
webdriver_prefetched[manager.index] = prefetched
webdriver_inProgress[manager.index].clear() # race condition with start of next 'get' if we haven't done about:blank, but worst case is we'll wait a bit too long for page to finish
debuglog(("webdriver %d get " % (manager.start+manager.index))+url)
try: manager.get(url) # waits for onload
except TimeoutException:
# we might have got SOMEthing (e.g. on a page bringing in hundreds of scripts from a slow server, but still running some of them before the timeout)
# May also be "Received error page"
if currentUrl == S(manager.current_url()):
debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - URL unchanged at "+currentUrl)
raise # treat as "no partial result"
debuglog(("webdriver %d get() timeout " % (manager.start+manager.index))+url+" - extracting partial")
if not timed_out:
debuglog(("webdriver %d loaded " % (manager.start+manager.index))+url)
# we want to double-check XMLHttpRequests have gone through (TODO: low-value setTimeout as well? TODO: abort this early if currentUrl has changed and we're just going to issue a redirect? but would then need to ensure it's finished if client comes back to same instance that's still running after it follows the redirect)
if options.js_reproxy:
wasActive = True
for _ in xrange(40): # up to 8+ seconds in steps of 0.2 (on top of the initial load)
time.sleep(0.2) # unconditional first-wait hopefully long enough to catch XMLHttpRequest delayed-send, very-low-value setTimeout etc, but we don't want to wait a whole second if the page isn't GOING to make any requests (TODO: monitor the js going through the upstream proxy to see if it contains any calls to this? but we'll have to deal with js_interpreter's cache, unless set it to not cache and we cache upstream)
active = webdriver_inProgress[manager.index]
if not active and not wasActive: break # TODO: wait longer than 0.2-0.4 to see if it restarts another request?
wasActive = active
else: time.sleep(1) # can't do much if we're not reproxying, so just sleep 1sec and hope for the best
currentUrl = None
if (clickElementID or clickLinkText) and not timed_out:
try:
manager.execute_script("window.open = window.confirm = function(){return true;}") # in case any link has a "Do you really want to follow this link?" confirmation (webdriver default is usually Cancel), or has 'pop-under' window (TODO: switch to pop-up?)
if clickElementID: manager.click_id(clickElementID)
if clickLinkText:
if not type(clickLinkText)==type(u""): clickLinkText=clickLinkText.decode('utf-8')
if not '"' in clickLinkText: manager.click_xpath(u'//a[text()="'+clickLinkText+'"]')
elif not "'" in clickLinkText: manager.click_xpath(u"//a[text()='"+clickLinkText+"']")
else: manager.click_linkText(clickLinkText) # least reliable
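# (XPath 1.0 string literals cannot escape quotes, so we use whichever quote
# character the text lacks; text containing BOTH kinds has to fall back to
# the less-reliable click_linkText)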
time.sleep(0.2) # TODO: more? what if the click results in fetching a new URL, had we better wait for XMLHttpRequests to finish? (loop as above but how do we know when they've started?) currentUrl code below should at least show us the new URL even if it hasn't finished loading, and then there's a delay while the client browser is told to fetch it, but that might not be enough
except: debuglog("js_links find_element exception ignored",False)
currentUrl = None
if currentUrl == None: # we need to ask for it again
currentUrl = manager.current_url()
if not currentUrl: currentUrl = url # PhantomJS Issue #13114: relative links after a redirect are not likely to work now
if S(currentUrl) == "about:blank":
debuglog("got about:blank instead of "+S(url))
return wrapResponse("webdriver failed to load") # don't return an actual redirect to about:blank, which breaks some versions of Lynx
debuglog("Getting data from webdriver %d (current_url=%s)" % (manager.start+manager.index,S(currentUrl)))
if asScreenshot: return wrapResponse(manager.getpng(),tornado.httputil.HTTPHeaders.parse("Content-type: image/png"),200)
body = get_and_remove_httpequiv_charset(manager.getu8())[1]
if timed_out: manager.get("about:blank") # as the timeout might have been due to a hard-locked script, so interrupting it should save some CPU
if not re.sub(B('#.*'),B(''),B(currentUrl)) == B(url): # we have to ignore anything after a # in this comparison because we have no way of knowing (here) whether the user's browser already includes the # or not: might send it into a redirect loop
# If we redirect, and if we have more than one user session active (and especially if we're multicore) then the second request might not come back to the same webdriver instance (or even the same adjuster process, so we can't even cache it unless shared), and reload is bad, so try to avoid redirect if possible.
# We could set 'base href' instead, seeing as 'document.location' does not have to be right on the user's side as we've already executed the site's scripts here (unless the user has any extensions that require it to be right). Don't use a Content-Location header: not all browsers support it, and it might cause caches to treat POST requests as invariant.
# Any in-document "#" links will cause a reload if 'base href' is set, but at least we won't have to reload UNLESS the user follows such a link.
if htmlFind(body,"')),tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
return wrapResponse(body,tornado.httputil.HTTPHeaders.parse("Content-type: text/html; charset=utf-8"),200)
def get_new_webdriver(index,renewing=False):
if options.js_interpreter in ["HeadlessChrome","Chrome"]:
return get_new_Chrome(index,renewing,options.js_interpreter=="HeadlessChrome")
elif options.js_interpreter in ["HeadlessFirefox","Firefox"]:
return get_new_Firefox(index,renewing,options.js_interpreter=="HeadlessFirefox")
else: return get_new_PhantomJS(index,renewing)
def get_new_Chrome(index,renewing,headless):
log_complaints = (index==0 and not renewing)
from selenium.webdriver.chrome.options import Options
opts = Options() ; dc = None
# TODO: can set opts.binary_location if needed (e.g. for chromium, if distro's linking doesn't work)
if headless:
opts.add_argument("--headless")
opts.add_argument("--disable-gpu")
# Specify user-data-dir ourselves, further to Chromium bug 795 comment 12. Include username and port (in case others are running or have run adjuster) as well as index.
global myUsername
try: myUsername
except NameError:
try: import getpass
except ImportError: getpass = None
if getpass: myUsername = S(getpass.getuser())
else: myUsername = ""
extra = ""
while True: # might be restarting from a corrupted user-data-dir state; in the worst case we might not even be able to cleanly remove it (TODO: what if some processes associated with an older instance somehow took a while to go away and still hold a named reference to the previous path: increment the counter unconditionally? still rm the old one)
path = "/tmp/hChrome-"+myUsername+str(options.port)+"."+str(index)+extra # don't use writable_tmpdir() here: some versions of Chromedriver can fail if you try to put a /dev/shm path into --user-data-dir
if not os.path.exists(path): break
shutil.rmtree(path,True)
if not os.path.exists(path): break
if extra: extra="-"+str(int(extra[1:])+1)
else: extra = "-0"
opts.add_argument("--user-data-dir="+path)
opts.add_argument("--incognito") # reduce space taken up by above
if options.js_reproxy:
opts.add_argument("--proxy-server=127.0.0.1:%d" % proxyPort(index))
opts.add_argument("--ignore-certificate-errors") # --ignore-certificate-errors is ignored by Chrome 59 (which was the first version to support Headless) and possibly some earlier versions, but we'll put it in just in case somebody runs an ancient non-headless Chrome in an offline experiment
opts.add_argument("--allow-insecure-localhost") # Chrome 62+ can at least do *.localhost & 127.* but we'd need to domain-rewrite for this to help (proxy-host doesn't count)
# Chrome 65 and chromedriver 2.35/2.36? can do:
dc = wd_DesiredCapabilities(log_complaints)
if dc:
dc = dc.CHROME.copy()
dc['acceptInsecureCerts'] = True
elif options.upstream_proxy: opts.add_argument('--proxy-server='+options.upstream_proxy)
if options.logDebug: opts.add_argument("--verbose")
if options.js_UA and not options.js_UA.startswith("*"): opts.add_argument("--user-agent="+options.js_UA)
if not options.js_images: opts.add_experimental_option("prefs",{"profile.managed_default_content_settings.images":2})
# TODO: do we need to disable Javascript's ability to open new windows and tabs, plus target="_blank" etc, especially if using clickElementID?
if options.via and not options.js_reproxy and log_complaints:
# Oops: how can we put in a Via: header if we don't
# have an upstream proxy to do so? unless you want
# to implement a Chrome extension to do it (TODO?)
warn("--via ignored when running Chrome without --js-reproxy")
if js_size: opts.add_argument("--window-size=%d,%d" % js_size)
if dc: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts,desired_capabilities=dc)
else: p = wd_instantiateLoop(webdriver.Chrome,index,renewing,chrome_options=opts)
if options.js_reproxy:
chromeVersion = int(p.capabilities['version'].split(".")[0])
if 59 <= chromeVersion < 65:
if [int(x) for x in p.capabilities['chrome']['chromedriverVersion'].split(".",2)[:2]] < [2,35]: extrawarn = " (and chromedriver 2.35+)"
else: extrawarn = ""
warn("This version of Chrome will hang when used with js_reproxy on https pages. Try upgrading to Chrome 65+"+extrawarn) # TODO: is 59 really the first version to drop --ignore-certificate-errors ?
elif chromeVersion >= 65 and not p.capabilities.get('acceptInsecureCerts',False): warn("This version of chromedriver will hang when used with js_reproxy on https pages. Your Chrome is new enough, but your chromedriver is not. Try downloading chromedriver 2.35/36+")
try: p.set_page_load_timeout(options.js_timeout1)
except: logging.info("Couldn't set HeadlessChrome page load timeout")
return p
def get_new_Firefox(index,renewing,headless):
if headless:
os.environ['MOZ_HEADLESS'] = '1' # in case -headless not yet working
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
profile = FirefoxProfile() ; caps = None
log_complaints = (index==0 and not renewing) ; op = None
proxyToUse = None
if options.js_reproxy:
from selenium.webdriver.common.proxy import Proxy,ProxyType
proxyToUse = Proxy({'proxyType':ProxyType.MANUAL,'httpProxy':"127.0.0.1:%d" % proxyPort(index),'sslProxy':"127.0.0.1:%d" % proxyPort(index),'ftpProxy':'','noProxy':''})
if hasattr(profile,"set_proxy"):
import warnings
warnings.filterwarnings("ignore","This method has been deprecated. Please pass in the proxy object to the Driver Object")
profile.set_proxy(proxyToUse) ; proxyToUse = None
profile.accept_untrusted_certs = True # needed for some older versions?
caps = wd_DesiredCapabilities(log_complaints)
if caps:
caps = caps.FIREFOX.copy()
caps['acceptInsecureCerts'] = True
caps['acceptSslCerts'] = True # older versions
elif options.upstream_proxy:
if hasattr(profile,"set_proxy"):
import warnings
warnings.filterwarnings("ignore","This method has been deprecated. Please pass in the proxy object to the Driver Object")
profile.set_proxy(options.upstream_proxy)
else: proxyToUse = options.upstream_proxy
if options.js_UA and not options.js_UA.startswith("*"): profile.set_preference("general.useragent.override",options.js_UA)
if not options.js_images: profile.set_preference("permissions.default.image", 2)
if options.via and not options.js_reproxy and log_complaints:
# Problem: we have no way of inserting a Via: header
# when there's no upstream proxy to add it for us,
# short of implementing a Firefox extension (TODO?)
warn("--via ignored when running Firefox without --js-reproxy")
# TODO: do any other options need to be set? disable plugins, Firefox-update prompts, new windows/tabs with JS, etc? or does Selenium do that?
if options.logDebug: binary=FirefoxBinary(log_file=sys.stderr) # TODO: support logDebug to a file as well
else: binary=FirefoxBinary()
if headless: cmdL = ('-headless','-no-remote')
else: cmdL = ('-no-remote',)
if js_size: cmdL += ("-width",str(js_size[0]),"-height",str(js_size[1]))
cmdL += ("about:blank",) # not Firefox start page
binary.add_command_line_options(*cmdL) # cannot call this more than once
if caps and proxyToUse: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary,capabilities=caps,proxy=proxyToUse)
elif caps: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary,capabilities=caps)
elif proxyToUse: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary,proxy=proxyToUse)
else: p = wd_instantiateLoop(webdriver.Firefox,index,renewing,firefox_profile=profile,firefox_binary=binary)
try: p.set_page_load_timeout(options.js_timeout1)
except: logging.info("Couldn't set Firefox page load timeout")
return p
block_headless_firefox = [
# servers that Firefox tries to CONNECT to on startup
"push.services.mozilla.com","snippets.cdn.mozilla.net","firefox.settings.services.mozilla.com","location.services.mozilla.com","shavar.services.mozilla.com",
"aus5.mozilla.org","ftp.mozilla.org",
"fonts.googleapis.com", # Fedora version of Firefox connects to this
# "start.fedoraproject.org","fedoraproject.org", # Fedora version of Firefox does this (but what if user actually wants to view one of those pages?)
]
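# (assumption: the js_reproxy code elsewhere in this file consults block_headless_firefox to refuse requests to these hosts, so headless Firefox's phone-home traffic doesn't tie up the proxy or leak onto the network)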
def wd_DesiredCapabilities(log_complaints):
try:
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
return DesiredCapabilities
except:
if log_complaints: warn("Your Selenium installation is too old to set DesiredCapabilities.\nThis is likely to stop some js options from working properly.")
return None
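# wd_instantiateLoop keeps retrying the instantiation until it succeeds, except for instance 0 on first startup where the exception is re-raised so that obvious misconfiguration stops us immediately instead of being retried forever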
def wd_instantiateLoop(wdClass,index,renewing,**kw):
debuglog("Instantiating "+wdClass.__name__+" "+repr(kw))
if 'chrome_options' in kw:
try: newChromedriver = 'options' in getargspec(webdriver.chrome.webdriver.WebDriver.__init__).args
except: newChromedriver = False
if newChromedriver:
kw['options'] = kw['chrome_options']
del kw['chrome_options']
if not renewing: time.sleep(min(2*(index % js_per_core),int(options.js_timeout2 / 2))) # try not to start them all at once at the beginning (may reduce chance of failure)
while True:
try:
if wdClass==webdriver.Chrome: p = wdClass(getoutput("which chromedriver 2>/dev/null"),**kw) # some versions need to be told explicitly where chromedriver is, rather than looking in PATH themselves, in order to get "wrong version" errors etc (otherwise errors ignored, Selenium looks for a different chromedriver and gives a slightly confusing error about 'none found' rather than the error you should have seen about 'wrong version')
else: p = wdClass(**kw)
if not p.capabilities: raise Exception("Didn't seem to get a p.capabilities")
elif 'browserVersion' in p.capabilities:
# Selenium 2.x calls it version, but Selenium
# 3.x calls it browserVersion. Map this back
# to 'version' for our other code.
p.capabilities['version'] = p.capabilities['browserVersion']
elif 'version' not in p.capabilities: raise Exception("capabilities has no version: "+repr(p.capabilities.items()))
except:
if index==0 and not renewing: raise
logging.error("Unhandled exception "+exc_logStr()+" when instantiating webdriver %d, retrying in 2sec" % index)
time.sleep(2) ; p = None
if p: break
debuglog(wdClass.__name__+" instantiated")
return p
def _get_new_PhantomJS(index,renewing):
log_complaints = (index==0 and not renewing)
os.environ["QT_QPA_PLATFORM"]="offscreen"
sa = ['--ssl-protocol=any']
# if options.logDebug: sa.append("--debug=true") # doesn't work: we don't see the debug output on stdout or stderr
if options.js_reproxy:
sa.append('--ignore-ssl-errors=true')
sa.append('--proxy=127.0.0.1:%d' % proxyPort(index))
elif options.upstream_proxy: sa.append('--proxy='+options.upstream_proxy)
dc = wd_DesiredCapabilities(log_complaints)
if dc:
dc = dict(dc.PHANTOMJS)
if options.js_UA and not options.js_UA.startswith("*"): dc["phantomjs.page.settings.userAgent"]=options.js_UA
if not options.js_images: dc["phantomjs.page.settings.loadImages"]=False
dc["phantomjs.page.settings.javascriptCanOpenWindows"]=dc["phantomjs.page.settings.javascriptCanCloseWindows"]=False # TODO: does this cover target="_blank" in clickElementID etc (which could have originated via DOM manipulation, so stripping them on the upstream proxy is insufficient; close/restart the driver every so often?)
if options.via and not options.js_reproxy: dc["phantomjs.page.customHeaders.Via"]="1.0 "+convert_to_via_host("")+" ("+viaName+")" # customHeaders works in PhantomJS 1.5+ (TODO: make it per-request so can include old Via headers & update protocol version, via_host + X-Forwarded-For; will webdriver.DesiredCapabilities.PHANTOMJS[k]=v work before a request?) (don't have to worry about this if js_reproxy)
return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,desired_capabilities=dc,service_args=sa)
else: return wd_instantiateLoop(webdriver.PhantomJS,index,renewing,service_args=sa)
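# get_new_PhantomJS wraps _get_new_PhantomJS above, adding the settings that can be applied only after instantiation (window size and page-load timeout) plus a version check for PhantomJS issue #13114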
def get_new_PhantomJS(index,renewing=False):
wd = _get_new_PhantomJS(index,renewing)
log_complaints = (index==0 and not renewing)
if log_complaints and not options.js_reproxy:
try: is_v2 = wd.capabilities['version'].startswith("2.")
except: is_v2 = False
if is_v2: warn("You may be affected by PhantomJS issue #13114.\nRelative links may be wrong after a redirect if the site sets Content-Security-Policy.\nTry --js_reproxy, or downgrade your PhantomJS to version 1.9.8")
try: wd.set_window_size(*js_size)
except: logging.info("Couldn't set PhantomJS window size")
try: wd.set_page_load_timeout(options.js_timeout1)
except: logging.info("Couldn't set PhantomJS page load timeout")
return wd
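# proxyPort maps the nominal per-instance proxy port to the randomised port actually bound when port_randomise has an entry for it, falling back to the nominal port otherwise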
def proxyPort(index): return port_randomise.get(js_proxy_port[index],js_proxy_port[index])
webdriver_runner = [] ; webdriver_prefetched = []
webdriver_via = [] ; webdriver_UA = [] ; webdriver_AL = []
webdriver_inProgress = [] ; webdriver_queue = []
webdriver_lambda = webdriver_mu = 0
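# queueing-theory naming (assumption): webdriver_lambda counts arrivals (items added to webdriver_queue elsewhere in this file) and webdriver_mu counts services (fetches dispatched to a runner; incremented in webdriver_checkServe below)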
def test_init_webdriver():
"Check that we CAN start a webdriver, before forking to background and starting all of them"
if options.js_interpreter=="edbrowse": return
sys.stderr.write("Checking webdriver configuration... "),sys.stderr.flush()
get_new_webdriver(0).quit()
sys.stderr.write("OK\n")
quitFuncToCall = None
def init_webdrivers(start,N):
informing = not options.background and not start and not (options.multicore and options.ssl_fork) # (if ssl_fork, we don't want the background 'starting N processes' messages to be interleaved with this)
if informing:
sys.stderr.write("Starting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
for i in xrange(N):
webdriver_runner.append(WebdriverRunner(start,len(webdriver_runner)))
webdriver_prefetched.append(None)
webdriver_inProgress.append(set())
webdriver_via.append(None) ; webdriver_UA.append("") ; webdriver_AL.append("")
def quit_wd_atexit(*args):
if informing: sys.stderr.write("Quitting %d webdriver%s... " % (options.js_instances,plural(options.js_instances))),sys.stderr.flush()
try:
for i in webdriver_runner:
try: i.quit_webdriver()
except: pass
except: pass
if informing: sys.stderr.write("done\n")
global quitFuncToCall ; quitFuncToCall = quit_wd_atexit # don't use the real atexit, as we have our own thread-stop logic which might kick in first, leaving a stuck adjusterWDhelp process if js_multiprocess==True, and additionally holding up calling process if --stdio is in use (fixed in v0.2795)
if options.js_restartMins and not options.js_restartAfter==1: IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
if informing: sys.stderr.write("done\n")
webdriver_maxBusy = 0
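# webdriver_maxBusy records the high-water mark of simultaneously-busy webdriver instances; webdriver_allBusy() updates it as a side-effect, which is why webdriver_checkServe below calls it even when its return value alone wouldn't be needed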
def webdriver_allBusy():
busyNow = sum(1 for i in webdriver_runner if i.wd_threadStart)
global webdriver_maxBusy
webdriver_maxBusy = max(webdriver_maxBusy,busyNow)
return busyNow == len(webdriver_runner)
def webdriver_checkServe(*args):
# how many queue items can be served right now?
# (called on IOLoop thread when new item added, or when
# a server is finished)
debuglog("Entering webdriver_checkServe, runners=%d" % len(webdriver_runner))
for i in xrange(len(webdriver_runner)):
if not webdriver_queue: break # just to save a little
if not webdriver_runner[i].wd_threadStart:
while webdriver_queue:
url,prefetched,ua,acceptLang,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate = webdriver_queue.pop(0)
if tooLate():
debuglog("tooLate() for "+url)
continue
debuglog("Starting fetch of "+url+" on webdriver instance "+str(i+webdriver_runner[i].start))
webdriver_via[i],webdriver_UA[i] = via,ua
webdriver_AL[i] = acceptLang
webdriver_runner[i].fetch(url,prefetched,clickElementID,clickLinkText,asScreenshot,callback,tooLate)
global webdriver_mu ; webdriver_mu += 1
break
if webdriver_allBusy(): pauseOrRestartMainServer(0) # we're "paused" anyway when not in the poll wait, so might as well call this only at end, to depend on the final status (and make sure to call webdriver_allBusy() no matter what, as it has the side-effect of updating webdriver_maxBusy)
else: pauseOrRestartMainServer(1)
debuglog("Finishing webdriver_checkServe, webdriver_queue len=%d" % len(webdriver_queue))
def webdriver_checkRenew(*args):
for i in webdriver_runner:
if not i.wd_threadStart and i.usageCount and i.finishTime + options.js_restartMins < time.time(): i.renew_webdriver_newThread() # safe because we're running in the IOLoop thread, which therefore can't start wd_thread between our test of wd_threadStart and renew_webdriver_newThread
IOLoopInstance().add_timeout(time.time()+60,webdriver_checkRenew)
def webdriver_fetch(url,prefetched,ua,acceptLang,clickElementID,clickLinkText,via,asScreenshot,callback,tooLate):
if tooLate(): return # probably webdriver_queue overload (which will also be logged)
elif prefetched and (not hasattr(prefetched,"code") or prefetched.code >= 500 or not hasattr(prefetched,"body") or not re.search(b'