PK!vapetest/__init__.py"""Smarter-than-monkey testing for web apps. You can read about APE and how to use it on its [home page](https://boxingbeetle.com/tools/ape/). What follows here is a quick tour of the code. Overview ======== APE consists of a core that crawls the web app/site under test to find pages to check. This core already does some checks, for example it reports HTTP errors, checks for inconsistencies in text encoding declarations and reports XML documents that are not well-formed. When a document has been loaded, the core will offer it to all active plugins to check. For example the HTML checker, even though it is core functionality, is implemented as a plugin: `apetest.plugin.checkhtml`. Entry Point =========== `apetest.cmdline.main` parses command line arguments and then calls `apetest.cmdline.run` to start a test run. A test run starts by creating a `apetest.spider.Spider` to crawl the web app/site under test, a `apetest.report.Scribe` to collect the reporting and a `apetest.checker.PageChecker` to load and check the pages. After all pages have been checked, the core writes the test report. Finally, plugins are given a chance to create their final output as well and clean up after themselves. Key Concepts ============ Since APE has no specific knowledge about the app it is testing other than its URL, it cannot test state changes. Therefore it will not attempt to change any server-side state, by making only HTTP `GET` requests, which should be idempotent. The exact resource being requested is determined by the page URL and an optional query; this is modeled by the `apetest.request.Request` class. A request is discovered by crawling other pages. Something that can generate requests is called a *referrer* and is modeled by the `apetest.referrer.Referrer` class. Some referrers, like an HTTP redirect, generate one exact request, while other referrers, like an HTML form, can generate many different requests depending on the values of the form's controls. A request is considered *speculative* if an actual user accessing the web app/site through a web browser would not normally generate it. The server should be robust against such requests, but if it rejects them with an HTTP "client error" (400) status, APE will not report that as an error. """ PK!TDDapetest/checker.py# SPDX-License-Identifier: BSD-3-Clause """Checks a document for problems and finds links to other documents. The `PageChecker` class is where the work is done. """ from collections import defaultdict from enum import Enum from logging import getLogger import re from urllib.parse import urljoin, urlsplit, urlunsplit from lxml import etree from apetest.control import ( Checkbox, FileInput, HiddenInput, RadioButton, RadioButtonGroup, SelectSingle, SelectMultiple, SubmitButton, SubmitButtons, TextArea, TextField ) from apetest.fetch import decode_and_report, encoding_from_bom, load_page from apetest.referrer import Form, LinkSet, Redirect from apetest.request import Request class Accept(Enum): """The types of documents that we tell the server we accept.""" ANY = 1 """Accept both HTML and XHTML.""" HTML = 2 """Accept only HTML.""" _LOG = getLogger(__name__) _RE_XML_DECL = re.compile( r'<\?xml([ \t\r\n\'"\w.\-=]*).*\?>' ) _RE_XML_DECL_ATTR = re.compile( r'[ \t\r\n]+([a-z]+)[ \t\r\n]*=[ \t\r\n]*' r'(?P[\'"])([\w.\-]*)(?P=quote)' ) def strip_xml_decl(text): """Strip the XML declaration from the start of the given text. Returns the given text without XML declaration, or the unmodified text if no XML declaration was found. """ match = _RE_XML_DECL.match(text) return text if match is None else text[match.end():] def encoding_from_xml_decl(text): """Look for an XML declaration with an `encoding` attribute at the start of the given text. Returns: encoding The attribute value, converted to lower case. None If no attribute was found. """ match = _RE_XML_DECL.match(text) if match is not None: decl = match.group(1) for match in _RE_XML_DECL_ATTR.finditer(decl): name, quote_, value = match.groups() if name == 'encoding': return value.lower() return None def normalize_url(url): """Returns a unique string for the given URL. This is required in some places, since different libraries have different opinions whether local URLs should start with `file:/` or `file:///`. """ return urlunsplit(urlsplit(url)) def parse_document(content, is_xml, report): """Parse the given XML or HTML document. Parameters: content Text to be parsed. is_xlm If `True`, parse as XML, otherwise parse as HTML. report: apetest.report.Report Parse errors are logged here. Returns: tree A document `etree`. None If the document is too broken to be parsed. """ parser_factory = etree.XMLParser if is_xml else etree.HTMLParser parser = parser_factory(recover=True) if is_xml: # The lxml parser does not accept encoding in XML declarations # when parsing strings. content = strip_xml_decl(content) try: root = etree.fromstring(content, parser) except etree.XMLSyntaxError: report.error( 'Failed to parse document as %s; ' 'cannot gather references to other documents.', 'XML' if is_xml else 'HTML' ) return None # The lxml HTML parser is an HTML4 parser. HTML5 is similar enough # that it will still be able to produce a document tree, but it will # report errors on for example inline SVG. if is_xml: for error in parser.error_log: if hasattr(error, 'line'): line = error.line elif hasattr(error, 'position'): line = error.position[0] else: line = None message = error.message if line is not None: message += ' (line %d)' % line report.error(message) return None if root is None else root.getroottree() def _parse_input_control(attrib): _LOG.debug('input: %s', attrib) disabled = 'disabled' in attrib if disabled: return None # TODO: Support readonly controls? name = attrib.get('name') ctype = attrib.get('type') value = attrib.get('value') if ctype in ('text', 'password'): return TextField(name, value) elif ctype == 'checkbox': return Checkbox(name, value) elif ctype == 'radio': return RadioButton(name, value) elif ctype == 'file': return FileInput(name, value) elif ctype == 'hidden': return HiddenInput(name, value) elif ctype in ('submit', 'image'): return SubmitButton(name, value) elif ctype in ('button', 'reset'): # Type "button" is used by JavaScript, "reset" by the browser. return None else: # Invalid control type, will already be flagged by the DTD. return None class PageChecker: """Retrieves a page, checks its contents and finds references to other pages. """ def __init__(self, base_url, accept, scribe, plugins): """Initialize page checker. Parameters: base_url Base URL for the web site or app under test. accept: Accept The types of documents that we tell the server we accept. scribe: apetest.report.Scribe Reports will be added here. plugins: apetest.plugin.PluginCollection Plugins to notify of loaded documents. """ self.base_url = normalize_url(base_url) self.accept = accept self.scribe = scribe self.plugins = plugins def short_url(self, url): """Return a shortened version of `url`. This drops the part of the URL that all pages share. """ assert url.startswith(self.base_url), url return url[self.base_url.rindex('/') + 1 : ] def check(self, req): """Check a single `apetest.request.Request`.""" req_url = str(req) _LOG.info('Checking page: %s', self.short_url(req_url)) accept = self.accept accept_header = { # Prefer XHTML to HTML because it is stricter. Accept.ANY: 'text/html; q=0.8, application/xhtml+xml; q=1.0', Accept.HTML: 'text/html; q=1.0' }[accept] report, response, content_bytes = load_page( req_url, req.maybe_bad, accept_header ) referrers = [] if response is not None and response.code is not None \ and 300 <= response.code < 400: content_url = normalize_url(response.url) if content_url != req_url: if content_url.startswith(self.base_url): if not content_url.startswith('file:'): report.info( 'Redirected to: %s', self.short_url(content_url) ) try: referrers.append( Redirect(Request.from_url(content_url)) ) except ValueError as ex: report.warning('%s', ex) else: report.info('Redirected outside: %s', content_url) if content_bytes is None: report.info('Could not get any content to check') skip_content = True elif response.code in (200, None): skip_content = False else: # TODO: This should probably be user-selectable. # A lot of web servers produce error and redirection # notices that are not HTML5 compliant. Checking the # content is likely only useful if the application # under test is producing the content instead. report.info( 'Skipping content check because of HTTP status %d', response.code ) skip_content = True if skip_content: report.checked = True self.scribe.add_report(report) return referrers headers = response.headers content_type_header = headers['Content-Type'] if content_type_header is None: message = 'Missing Content-Type header' _LOG.error(message) report.error(message) self.scribe.add_report(report) return referrers content_type = headers.get_content_type() is_html = content_type in ('text/html', 'application/xhtml+xml') is_xml = content_type.endswith('/xml') or content_type.endswith('+xml') http_encoding = headers.get_content_charset() # Speculatively decode the first 1024 bytes, so we can look inside # the document for encoding clues. bom_encoding = encoding_from_bom(content_bytes) content_head = content_bytes[:1024].decode( bom_encoding or 'ascii', 'replace' ) if not is_xml and content_head.startswith(' tags. # Try possible encodings in order of precedence. # W3C recommends giving the BOM, if present, precedence over HTTP. # http://www.w3.org/International/questions/qa-byte-order-mark try: content, used_encoding = decode_and_report( content_bytes, ((bom_encoding, 'Byte Order Mark'), (decl_encoding, 'XML declaration'), (http_encoding, 'HTTP header')), report ) except UnicodeDecodeError as ex: # All likely encodings failed. report.error('Failed to decode contents') self.scribe.add_report(report) return referrers if req_url.startswith('file:'): # Construct a new header that is likely more accurate. content_type_header = '%s; charset=%s' % ( content_type, used_encoding ) self.plugins.resource_loaded(content_bytes, content_type_header, report) if is_html or is_xml: tree = parse_document(content, is_xml, report) if tree is not None: # Find links to other documents. referrers += self.find_referrers_in_xml(tree, req_url, report) if is_html: referrers += self.find_referrers_in_html(tree, req_url) self.scribe.add_report(report) return referrers _htmlLinkElements = { 'a': 'href', 'link': 'href', 'img': 'src', 'script': 'src', } _xmlLinkElements = { '{http://www.w3.org/1999/xhtml}' + tag_name: attr_name for tag_name, attr_name in _htmlLinkElements.items() } # SVG 1.1 uses XLink, but SVG 2 has native 'href' attributes. # We're only interested in elements that can link to external # resources, not all elements that support 'href'. _xmlLinkElements.update({ '{http://www.w3.org/2000/svg}' + tag_name: 'href' for tag_name in ('a', 'image', 'script') }) _xmlLinkElements.update({ '{http://www.w3.org/2005/Atom}link': 'href' }) # Insert HTML elements without namespace for HTML trees and # with namespace for XHTML trees. _linkElements = dict(_htmlLinkElements) _linkElements.update(_xmlLinkElements) def find_urls(self, tree): """Yield URLs found in the document `tree`. """ get_attr_name = self._linkElements.__getitem__ for node in tree.getroot().iter(): try: yield node.attrib[get_attr_name(node.tag)] except KeyError: pass try: yield node.attrib['{http://www.w3.org/1999/xlink}href'] except KeyError: pass def find_referrers_in_xml(self, tree, tree_url, report): """Yield `apetest.referrer.Referrer` objects for links found in XML tags in the document `tree`. """ links = defaultdict(LinkSet) for url in self.find_urls(tree): _LOG.debug(' Found URL: %s', url) if url.startswith('?'): url = urlsplit(tree_url).path + url url = urljoin(tree_url, url) if url.startswith(self.base_url): try: request = Request.from_url(url) except ValueError as ex: report.warning('%s', ex) else: links[request.page_url].add(request) yield from links.values() def find_referrers_in_html(self, tree, url): """Yield `apetest.referrer.Referrer` objects for links and forms found in HTML tags in the document `tree`. """ root = tree.getroot() ns_prefix = '{%s}' % root.nsmap[None] if None in root.nsmap else '' for form_node in root.getiterator(ns_prefix + 'form'): # TODO: How to handle an empty action? # 1. take current path, erase query (current impl) # 2. take current path, merge query # 3. flag as error (not clearly specced) # I think either flag as error, or mimic the browsers. try: action = form_node.attrib['action'] or urlsplit(url).path method = form_node.attrib['method'].lower() except KeyError: continue if method == 'post': # TODO: Support POST (with flag to enable/disable). continue if method != 'get': # The DTD will already have flagged this as a violation. continue submit_url = urljoin(url, action) if not submit_url.startswith(self.base_url): continue # Note: Disabled controls should not be submitted, so we pretend # they do not even exist. controls = [] radio_buttons = defaultdict(list) submit_buttons = [] for inp in form_node.getiterator(ns_prefix + 'input'): control = _parse_input_control(inp.attrib) if control is None: pass elif isinstance(control, RadioButton): radio_buttons[control.name].append(control) elif isinstance(control, SubmitButton): submit_buttons.append(control) else: controls.append(control) for control in form_node.getiterator(ns_prefix + 'select'): name = control.attrib.get('name') multiple = control.attrib.get('multiple') disabled = 'disabled' in control.attrib if disabled: continue options = [ option.attrib.get('value', option.text) for option in control.getiterator(ns_prefix + 'option') ] if multiple: for option in options: controls.append(SelectMultiple(name, option)) else: controls.append(SelectSingle(name, options)) for control in form_node.getiterator(ns_prefix + 'textarea'): name = control.attrib.get('name') value = control.text disabled = 'disabled' in control.attrib if disabled: continue _LOG.debug('textarea "%s": %s', name, value) controls.append(TextArea(name, value)) # Merge exclusive controls. for buttons in radio_buttons.values(): controls.append(RadioButtonGroup(buttons)) if submit_buttons: controls.append(SubmitButtons(submit_buttons)) # If the form contains no submit buttons, assume it can be # submitted using JavaScript, so continue. yield Form(submit_url, method, controls) PK!fuYapetest/cmdline.py# SPDX-License-Identifier: BSD-3-Clause """Command line interface.""" from argparse import ArgumentParser import logging from os import getcwd from urllib.parse import urljoin, urlparse from apetest.checker import Accept, PageChecker from apetest.plugin import ( PluginCollection, add_plugin_arguments, create_plugins, load_plugins ) from apetest.report import Scribe from apetest.request import Request from apetest.spider import spider_req from apetest.version import VERSION_STRING def detect_url(arg): """Attempt to turn a command line argument into a full URL.""" url = urlparse(arg) if url.scheme in ('http', 'https'): return arg if arg.startswith('/'): # Assume absolute file path. return urljoin('file://', arg) url = urlparse('http://' + arg) idx = url.netloc.find(':') if idx != -1 and url.netloc[idx + 1:].isdigit(): # Host and port without scheme, assume HTTP. return 'http://' + arg # Assume relative file path. return urljoin('file://%s/' % getcwd(), arg) def run(url, report_file_name, accept, plugins=()): """Runs APE with the given arguments. Parameters: url Base URL of the web site or app to check. report_file_name Path to write the HTML report to. accept: apetest.checker.Accept Document types that we tell the server that we accept. plugins: apetest.plugin.Plugin* Plugins to use on this run. Returns: exit_code 0 if successful, non-zero on errors. """ plugins = PluginCollection(plugins) try: try: first_req = Request.from_url(detect_url(url)) except ValueError as ex: print('Bad URL:', ex) return 1 spider, robots_report = spider_req(first_req) base_url = first_req.page_url scribe = Scribe(base_url, spider, plugins) if robots_report is not None: scribe.add_report(robots_report) checker = PageChecker(base_url, accept, scribe, plugins) print('Checking "%s" and below...' % base_url) for request in spider: referrers = checker.check(request) spider.add_requests(request, referrers) print('Done checking') print('Writing report to "%s"...' % report_file_name) with open(report_file_name, 'w', encoding='ascii', errors='xmlcharrefreplace') as out: for node in scribe.present(): out.write(node.flatten()) print('Done reporting') scribe.postprocess() print('Done post processing') return 0 finally: plugins.close() def main(): """Parse command line arguments and call `run` with the results. This is the entry point that gets called by the wrapper script. """ # Register core arguments. parser = ArgumentParser( description='Automated Page Exerciser: ' 'smarter-than-monkey testing for web apps', epilog='This is a test tool; do not use on production sites.' ) parser.add_argument( 'url', metavar='URL|PATH', help='web app/site to check' ) parser.add_argument( '--accept', type=str, choices=('any', 'html'), default='any', help='accept serialization: any (HTML or XHTML; default) or HTML only' ) parser.add_argument( 'report', metavar='REPORT', help='file to write the HTML report to' ) parser.add_argument( '-v', '--verbose', action='count', default=0, help='increase amount of logging, can be passed multiple times' ) parser.add_argument( '-V', '--version', action='version', version='APE %s' % VERSION_STRING ) # Let plugins register their arguments. plugin_modules = tuple(load_plugins()) for module in plugin_modules: add_plugin_arguments(module, parser) args = parser.parse_args() level_map = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} level = level_map.get(args.verbose, logging.DEBUG) logging.basicConfig(level=level, format='%(levelname)s: %(message)s') # Instantiate plugins. plugins = [] for module in plugin_modules: try: plugins += create_plugins(module, args) except Exception: # pylint: disable=broad-except return 1 accept = Accept[args.accept.upper()] return run(args.url, args.report, accept, plugins) PK!nlU  apetest/control.py# SPDX-License-Identifier: BSD-3-Clause """Models form controls. This module contains the `Control` class and its subclasses, which can be used to model input elements in an (HTML) form. """ # TODO: The subclasses currently have an almost 1:1 mapping to HTML, # but I'm not sure that is necessary. For example SelectMultiple # and Checkbox have the same submission functionality. # And SelectSingle and RadioButtonGroup differ only in whether # non-selection is possible; when adding support for the HTML5 # "required" attribute, SelectSingle can become functionally # equivalent to RadioButtonGroup. class Control: """Abstract base class for submittable elements in a form.""" def has_alternative(self, name, value): """Return `True` iff the given name-value combination could be submitted by this control. Note that for free-input controls, it is possible this method returns `True` while the name-value pair is not in the sequence returned by the `Control.alternatives` method. """ raise NotImplementedError def alternatives(self): """Yield alternative name-value pairs of the ways this control can be submitted. For multiple-choice controls all possible alternatives are included. For free-input controls there is an infinite number of alternatives, so we just pick a few. The alternative `(None, None)` represents a control not being part of the submission, for example an unchecked checkbox. """ raise NotImplementedError class SingleValueControl(Control): """Control that produces at most one name-value combination. Note that there can be any number of possible values, but in each submission of the form, no more than one value is submitted for this control. """ def __init__(self, name, value): """Initialize control with the given name-value combination.""" Control.__init__(self) self.name = name """The name under which this control is submitted.""" self.value = value """The default value for this control. Some control types can only submit this value, other control types can submit other values as well. """ def has_alternative(self, name, value): return name == self.name and value == self.value def alternatives(self): yield self.name, self.value class FileInput(SingleValueControl): """Control for selecting and uploading files.""" def has_alternative(self, name, value): # Any text could be submitted, so we only have to check the name. return name == self.name def alternatives(self): # Today's browsers, as a security precaution, will provide an empty # file name input field even if a default value is provided. # Since we have no idea what kind of file should be uploaded, we just # submit the empty string. yield self.name, '' class HiddenInput(SingleValueControl): """Control that is not visible to the user. This control submits its default value. """ class TextField(SingleValueControl): """Single-line text input.""" def has_alternative(self, name, value): # Any text could be submitted, so we only have to check the name. return name == self.name def alternatives(self): yield self.name, '' # empty yield self.name, self.value # default yield self.name, 'ook' # librarian's choice class TextArea(SingleValueControl): """Multi-line text input.""" def has_alternative(self, name, value): # Any text could be submitted, so we only have to check the name. return name == self.name def alternatives(self): yield self.name, '' # empty yield self.name, self.value # default yield self.name, 'Ook.\nOok? Ook!' # librarian's choice class Checkbox(SingleValueControl): """Checkbox. This control can submit its default value (box checked) or nothing (box unchecked). """ def has_alternative(self, name, value): return ( (name is None and value is None) or (name == self.name and value == self.value) ) def alternatives(self): yield None, None # box unchecked yield self.name, self.value # box checked class RadioButton(SingleValueControl): """Single radio button. Radio buttons must be combined in a `RadioButtonGroup` control. """ def has_alternative(self, name, value): assert False, 'radio button "%s" was not merged' % self.name def alternatives(self): assert False, 'radio button "%s" was not merged' % self.name class RadioButtonGroup(Control): """Multiple-choice control containing one or more radio buttons.""" def __init__(self, buttons): """Initialize a radio buttons group control containing `buttons`, which must be a non-empty sequence of `RadioButton` objects. """ # Perform sanity check on input and gather values. name = buttons[0].name values = [] for button in buttons: if not isinstance(button, RadioButton): raise TypeError('expected RadioButton, got %s' % type(button)) if button.name != name: raise ValueError( 'radio button name "%s" differs from ' 'first radio button name "%s"' % (button.name, name) ) values.append(button.value) # Actual construction. Control.__init__(self) self.name = name self.values = values def has_alternative(self, name, value): return name == self.name and value in self.values def alternatives(self): for value in self.values: yield self.name, value class SubmitButton(SingleValueControl): """Single submit button. All submit buttons in a form must be combined in a `SubmitButtons` control. """ class SubmitButtons(Control): """Pseudo-control which contains all submit buttons for a form. Only one submit button can be used for submission; this pseudo-control models the choice between submit buttons. """ def __init__(self, buttons): """Initialize a submit buttons control containing `buttons`, which must be a sequence of `SubmitButton` objects. """ Control.__init__(self) self.buttons = tuple((button.name, button.value) for button in buttons) def has_alternative(self, name, value): return (name, value) in self.buttons def alternatives(self): yield from self.buttons class SelectMultiple(SingleValueControl): """Pseudo-control which represents an option in a `` control where one option can be active at the same time. This type of control is typically shown in a browser as a drop-down list. """ def __init__(self, name, options): """Initialize a single-choice `