1927 lines
63 KiB
Python
1927 lines
63 KiB
Python
# Copyright (c) 2004 Ian Bicking. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in
|
|
# the documentation and/or other materials provided with the
|
|
# distribution.
|
|
#
|
|
# 3. Neither the name of Ian Bicking nor the names of its contributors may
|
|
# be used to endorse or promote products derived from this software
|
|
# without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""The ``lxml.html`` tool set for HTML handling.
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
|
|
# Public API of the module.  Each name is listed exactly once
# ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
|
|
|
|
|
|
import copy
|
|
import sys
|
|
import re
|
|
from functools import partial
|
|
|
|
try:
|
|
# while unnecessary, importing from 'collections.abc' is the right way to do it
|
|
from collections.abc import MutableMapping, MutableSet
|
|
except ImportError:
|
|
from collections import MutableMapping, MutableSet
|
|
|
|
from .. import etree
|
|
from . import defs
|
|
from ._setmixin import SetMixin
|
|
|
|
try:
|
|
from urlparse import urljoin
|
|
except ImportError:
|
|
# Python 3
|
|
from urllib.parse import urljoin
|
|
|
|
# Python 2/3 compatibility aliases: on Python 3 the 'unicode' and
# 'basestring' builtins do not exist, so provide module-level stand-ins.
# On Python 2 the builtins are used as they are.
if sys.version_info[0] >= 3:
    unicode = str
    basestring = (str, bytes)
|
|
|
|
|
|
def __fix_docstring(s):
|
|
if not s:
|
|
return s
|
|
if sys.version_info[0] >= 3:
|
|
sub = re.compile(r"^(\s*)u'", re.M).sub
|
|
else:
|
|
sub = re.compile(r"^(\s*)b'", re.M).sub
|
|
return sub(r"\1'", s)
|
|
|
|
|
|
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Those that take the 'x' prefix match
# both plain HTML tags and their XHTML-namespaced counterparts.
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces=dict(x=XHTML_NAMESPACE))
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces=dict(x=XHTML_NAMESPACE))
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces=dict(x=XHTML_NAMESPACE))
# Alternative (disabled) implementation based on EXSLT regular expressions:
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# Iterators over CSS ``url(...)`` references and ``@import "..."`` rules.
_iter_css_urls = re.compile(
    r'url\(('
    '["][^"]*["]|'   # double-quoted URL
    "['][^']*[']|"   # single-quoted URL
    r'[^)]*)\)',     # unquoted URL
    re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces=dict(x=XHTML_NAMESPACE))

# Matches one space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

# Extracts the URL part of a <meta http-equiv="refresh"> content value.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
|
|
|
|
|
|
def _unquote_match(s, pos):
|
|
if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
|
|
return s[1:-1], pos+1
|
|
else:
|
|
return s,pos
|
|
|
|
|
|
def _transform_result(typ, result):
    """Convert the result back into the input type.

    If ``typ`` is a bytes type, ``result`` is serialised to UTF-8 bytes; if
    it is a unicode type, ``result`` is serialised to a unicode string.  For
    any other type, ``result`` is returned as it is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
|
|
|
|
|
|
def _nons(tag):
    """Return ``tag`` without its XHTML namespace prefix.

    String tags of the form ``{http://www.w3.org/1999/xhtml}name`` are
    reduced to ``name``.  Anything else (other namespaces, plain names,
    non-string tags) is returned unchanged.
    """
    is_xhtml_tag = (
        isinstance(tag, basestring)
        and tag[0] == '{'
        and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE)
    if is_xhtml_tag:
        # keep only what follows the last closing brace
        return tag.rpartition('}')[2]
    return tag
|
|
|
|
|
|
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.

    The set is a live view: every operation reads the current value of the
    'class' entry of the wrapped attribute mapping and writes changes
    straight back to it.

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        self._attributes = attributes
        # Zero-argument callable returning the raw attribute value ('' if unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_class_name(value):
        # A class name must be non-empty and must not contain whitespace.
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.  The 'class' attribute is
        deleted when no class names remain.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        remaining = [name for name in self._get_class_value().split()
                     if name != value]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
            return
        if 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw_value = self._get_class_value()
        # cheap substring test first, exact token test second
        if name not in raw_value:
            return False
        return name in raw_value.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # only touch the attribute if something was actually added
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
|
|
|
|
|
|
class HtmlMixin(object):
    """Mixin that adds the HTML specific API to the lxml.html node classes."""

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE: the super() call is anchored at HtmlElement on purpose.  This
        # method is re-exported there as ``HtmlElement.set``, and the
        # implementation to delegate to comes *after* HtmlElement in its MRO
        # (this mixin is the last base class).
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns the first <label> whose ``for`` attribute equals
        this element's id, or None if the element has no id or no such
        label exists.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # preserve the tail text by attaching it to whatever precedes us
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows our last child once the children move up
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # replace this element by its children, in place
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value (before the document is modified).
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Validate up front so that a bad argument cannot leave the document
        # half-processed (i.e. with <base> tags already resolved and removed).
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Forward the failure policy so that 'ignore'/'discard' are also
            # honoured for URLs that cannot be joined with the <base href>.
            self.resolve_base_href(handle_failures=handle_failures)

        if handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            discard_failures = handle_failures == 'discard'

            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    # None makes rewrite_links() drop the link entirely
                    return None if discard_failures else href

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # all <base> tags are removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice it in at its position
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
|
|
|
|
|
|
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        # default for the per-call 'copy' keyword (element inputs only)
        self.copy = copy
        # expose the wrapped method's documentation on the function object
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # string input: parse it first; copying makes no sense here
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # element input: optionally work on a deep copy
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
|
|
|
|
|
|
# Function forms of the HtmlMixin methods.

# Query helpers work on the element that is passed in.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)

# Rewriting helpers work on a deep copy of an element input by default.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
|
|
|
|
|
|
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that carries the HtmlMixin API."""
|
|
|
|
|
|
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that carries the HtmlMixin API."""

    # etree.ElementBase precedes HtmlMixin in the MRO, so its cssselect()
    # and set() would win; re-bind the mixin versions explicitly
    # (FIXME: change base order?)
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
|
|
|
|
|
|
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction node class that carries the HtmlMixin API."""
|
|
|
|
|
|
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that carries the HtmlMixin API."""
|
|
|
|
|
|
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag_name, mixin_class)`` pairs; the latter allows several mixins for
    the same tag.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept a real mapping (as documented) as well as the sequence
            # of pairs that this loop has always unpacked.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # mix into every tag that has a registered class
                    for n in classes:
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins go first so that they override the element class
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default lookup.

        Elements are looked up by lower-cased tag name, falling back to
        HtmlElement; comments, processing instructions and entities map to
        their Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
|
|
|
|
|
|
################################################################################
|
|
# parsing
|
|
################################################################################
|
|
|
|
# Matchers that detect input starting with an <html> tag or a doctype
# declaration (ignoring leading whitespace and letter case), one for text
# input and one for byte string input.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match
|
|
|
|
|
|
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    Uses the module's default ``html_parser`` unless ``parser`` is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    If ``ensure_head_body`` is true, a missing ``head`` child is inserted
    at the front of the root and a missing ``body`` child is appended.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    root = etree.fromstring(html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
|
|
|
|
|
|
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document gets wrapped in one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    # whitespace-only leading text is silently dropped
    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
|
|
|
|
|
|
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).

    Raises ``etree.ParserError`` when the input does not fit these rules.
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable if there is a parent to hold it
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not bool(create_parent),
        base_url=base_url, **kw)

    if create_parent:
        # any true non-string value selects the default wrapper tag
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # drop any whitespace-only tail
    el.tail = None
    return el
|
|
|
|
|
|
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_like_full_html = _looks_like_full_html_bytes
    else:
        looks_like_full_html = _looks_like_full_html_unicode
    is_full_html = looks_like_full_html(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there can be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    no_leading_text = not body.text or not body.text.strip()
    if (len(body) == 1 and no_leading_text
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
|
|
|
|
|
|
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.

    The module's default ``html_parser`` is used unless ``parser`` is given.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
|
|
|
|
|
|
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it has a block-level tag.

    Block-level tags are those listed in ``defs.block_tags``; an XHTML
    namespace on the tag is ignored.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(candidate.tag) in defs.block_tags
        for candidate in el.iter(etree.Element))
|
|
|
|
|
|
def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments give 'comment', plain strings give 'string', and everything
    else gives its tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
|
|
|
|
|
|
################################################################################
|
|
# form handling
|
|
################################################################################
|
|
|
|
class FormElement(HtmlElement):
|
|
"""
|
|
Represents a <form> element.
|
|
"""
|
|
|
|
@property
def inputs(self):
    """
    Returns an accessor for all the input elements in the form.

    A new `InputGetter` wrapping this form is created on every access;
    see `InputGetter` for more information about the object.
    """
    getter = InputGetter(self)
    return getter
|
|
|
|
@property
|
|
def fields(self):
|
|
"""
|
|
Dictionary-like object that represents all the fields in this
|
|
form. You can set values in this dictionary to effect the
|
|
form.
|
|
"""
|
|
return FieldsDict(self.inputs)
|
|
|
|
@fields.setter
|
|
def fields(self, value):
|
|
fields = self.fields
|
|
prev_keys = fields.keys()
|
|
for key, value in value.items():
|
|
if key in prev_keys:
|
|
prev_keys.remove(key)
|
|
fields[key] = value
|
|
for key in prev_keys:
|
|
if key is None:
|
|
# Case of an unnamed input; these aren't really
|
|
# expressed in form_values() anyway.
|
|
continue
|
|
fields[key] = None
|
|
|
|
def _name(self):
|
|
if self.get('name'):
|
|
return self.get('name')
|
|
elif self.get('id'):
|
|
return '#' + self.get('id')
|
|
iter_tags = self.body.iter
|
|
forms = list(iter_tags('form'))
|
|
if not forms:
|
|
forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
|
|
return str(forms.index(self))
|
|
|
|
def form_values(self):
|
|
"""
|
|
Return a list of tuples of the field values for the form.
|
|
This is suitable to be passed to ``urllib.urlencode()``.
|
|
"""
|
|
results = []
|
|
for el in self.inputs:
|
|
name = el.name
|
|
if not name or 'disabled' in el.attrib:
|
|
continue
|
|
tag = _nons(el.tag)
|
|
if tag == 'textarea':
|
|
results.append((name, el.value))
|
|
elif tag == 'select':
|
|
value = el.value
|
|
if el.multiple:
|
|
for v in value:
|
|
results.append((name, v))
|
|
elif value is not None:
|
|
results.append((name, el.value))
|
|
else:
|
|
assert tag == 'input', (
|
|
"Unexpected tag: %r" % el)
|
|
if el.checkable and not el.checked:
|
|
continue
|
|
if el.type in ('submit', 'image', 'reset', 'file'):
|
|
continue
|
|
value = el.value
|
|
if value is not None:
|
|
results.append((name, el.value))
|
|
return results
|
|
|
|
@property
|
|
def action(self):
|
|
"""
|
|
Get/set the form's ``action`` attribute.
|
|
"""
|
|
base_url = self.base_url
|
|
action = self.get('action')
|
|
if base_url and action is not None:
|
|
return urljoin(base_url, action)
|
|
else:
|
|
return action
|
|
|
|
@action.setter
|
|
def action(self, value):
|
|
self.set('action', value)
|
|
|
|
@action.deleter
|
|
def action(self):
|
|
attrib = self.attrib
|
|
if 'action' in attrib:
|
|
del attrib['action']
|
|
|
|
@property
|
|
def method(self):
|
|
"""
|
|
Get/set the form's method. Always returns a capitalized
|
|
string, and defaults to ``'GET'``
|
|
"""
|
|
return self.get('method', 'GET').upper()
|
|
|
|
@method.setter
|
|
def method(self, value):
|
|
self.set('method', value.upper())
|
|
|
|
|
|
HtmlElementClassLookup._default_element_classes['form'] = FormElement
|
|
|
|
|
|
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping (its ``items()`` are used) or a
    sequence of ``(name, value)`` tuples; they are appended to the
    values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is
    the target URL as a string (the form's action, or its base URL when
    there is no action), and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            pairs = extra_values.items()
        else:
            pairs = extra_values
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's base URL when the form has no action.
    url = form.action or form.base_url
    return opener(form.method, url, values)
|
|
|
|
|
|
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on urllib's ``urlopen()``.

    For ``method == 'GET'`` the URL-encoded ``values`` are appended to
    ``url`` as a query string (using ``&`` if the URL already contains a
    ``?``) and no request body is sent.  For any other method the encoded
    values are passed to ``urlopen()`` as the request data, encoded to
    ASCII bytes if necessary.

    Raises ValueError if ``url`` is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    if not isinstance(encoded, bytes):
        encoded = encoded.encode('ASCII')
    return urlopen(url, encoded)
|
|
|
|
|
|
class FieldsDict(MutableMapping):
    """
    Dictionary-like view of the values of a form's fields.

    Wraps an inputs accessor (``inputs``): reading ``fields[name]``
    returns ``inputs[name].value`` and assigning ``fields[name] = v``
    sets ``inputs[name].value``.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Fields are backed by elements of the form; removing a key is
        # not supported.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the same keys that __iter__() yields.  The wrapped
        # accessor is only required to provide keys(); calling len() on
        # it directly fails when it does not define __len__.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
|
|
|
|
|
|
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    When several checkboxes share that name they are returned together
    as a list-like `CheckboxGroup` (which also allows value setting);
    several radio inputs with the same name come back as a `RadioGroup`.

    Iterating over this object yields every ``select``, ``input`` and
    ``textarea`` element individually, so the result is not the same as
    looking up each name: grouped checkboxes and radios appear one by
    one.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        kind = first.get('type')
        if len(matches) > 1:
            group = None
            if kind == 'radio':
                group = RadioGroup(matches)
            elif kind == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all input elements; unnamed elements (name
        # None) are left out.
        names = set(el.name for el in self)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
|
|
|
|
|
|
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting removes the attribute if it is present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # Only mention the type when the element has a truthy ``type``
        # attribute/property at all.
        declared_type = getattr(self, 'type', None)
        suffix = (' type=%r' % declared_type) if declared_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
|
|
|
|
|
|
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        The value is the element's text followed by the serialisation
        of any child elements (as XML for XHTML-namespaced textareas,
        as HTML otherwise).  Setting or deleting the value removes all
        children.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding='unicode')
        return content

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
|
|
|
|
|
|
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, the last option carrying a ``selected``
        attribute wins; without any, the first option that is not
        ``disabled`` is used, and None is returned if there is none.
        An option without a ``value`` attribute counts with its
        stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = None
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        if chosen is None:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
        if chosen is None:
            return None
        explicit = chosen.get('value')
        if explicit is not None:
            return explicit
        return (chosen.text or '').strip()

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        options = _options_xpath(self)
        target = None
        if value is not None:
            # Locate the option to select before touching anything, so
            # an unknown value leaves the current selection intact.
            for option in options:
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in options:
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        text of options that have no ``value`` attribute).
        """
        possible = []
        for option in _options_xpath(self):
            explicit = option.get('value')
            if explicit is None:
                explicit = (option.text or '').strip()
            possible.append(explicit)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
|
|
|
|
|
|
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    Adding a value to this set-like object selects the matching option;
    removing a value unselects it.  An option without a ``value``
    attribute is identified by its stripped text.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            explicit = option.get('value')
            if explicit is None:
                yield (option.text or '').strip()
            else:
                yield explicit

    def add(self, item):
        for option in self.options:
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            if candidate != item:
                continue
            option.set('selected', '')
            return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        for option in self.options:
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            if candidate != item:
                continue
            # Only the first matching option is considered.
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            shown,
            self.select.name)
|
|
|
|
|
|
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    It behaves like a list of those elements.  In addition, the
    ``.value`` property checks/unchecks the inputs and
    ``.value_options`` lists the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the ``value`` attribute of the first checked
        radio, or None when nothing is checked.  Setting None (or
        deleting) unchecks everything; setting an unknown value raises
        ValueError without changing the group.
        """
        return next(
            (radio.get('value') for radio in self
             if 'checked' in radio.attrib),
            None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            matching = [radio for radio in self
                        if radio.get('value') == value]
            if not matching:
                raise ValueError("There is no radio input with the value %r" % value)
            target = matching[0]
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
|
|
|
|
|
|
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current
        selection; assigning something that is not iterable raises
        ValueError and leaves the checkboxes untouched.  Deleting the
        value unchecks everything.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating: rejecting the argument must not
        # clear the boxes that are currently checked.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
|
|
|
|
|
|
class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.

    Adding a value checks the first checkbox with that value, removing
    a value unchecks it; both raise KeyError for unknown values, and
    removing also raises KeyError if the checkbox is not checked.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the full list first so that iteration works on a
        # snapshot of the currently checked values.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
|
|
|
|
|
|
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checked checkbox or radio and it has no
        value, this defaults to ``'on'``.  If it is a checkbox or radio
        that is not checked, this returns None.

        Setting a checkable input checks it for a true value (storing
        the value in the attribute if it is a string) and unchecks it
        for a false one.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type an AttributeError is raised.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
|
|
|
|
|
|
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Returns None if the
        label has no (non-empty) ``for`` attribute; otherwise the
        element is looked up by id via ``self.body``.

        Setting stores the ``id`` of the given element in the ``for``
        attribute (TypeError if that element has no id); deleting
        removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link to the target lives in the ``for`` attribute, so that
        # is what has to go; the label's own ``id`` must stay untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
|
|
|
|
|
|
############################################################
|
|
## Serialization
|
|
############################################################
|
|
|
|
def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either an element or a tree object (anything with a
    ``getroot()`` method).  Elements whose tag already carries a
    namespace are left alone.  The tree is modified in place.
    """
    root = html
    try:
        root = html.getroot()
    except AttributeError:
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        plain_tag = node.tag
        if plain_tag[0] == '{':
            # already namespaced
            continue
        node.tag = xhtml_prefix + plain_tag
|
|
|
|
|
|
def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts either an element or a tree object (anything with a
    ``getroot()`` method).  Only elements in the XHTML namespace are
    touched.  The tree is modified in place.
    """
    root = xhtml
    try:
        root = xhtml.getroot()
    except AttributeError:
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip = len(xhtml_prefix)
    for node in root.iter(xhtml_prefix + "*"):
        node.tag = node.tag[strip:]
|
|
|
|
|
|
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
#
# Both names below are the bound ``sub`` methods of a compiled pattern,
# so they are called as ``replace(replacement, text)``.  One variant
# works on text strings, the other on byte strings; tostring() picks the
# one matching the type of the serialised output.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
|
|
|
|
|
|
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: with ``method='html'``, any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialised
    output is stripped from the result unless
    ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail,
        doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Pick the substitution helper that matches the output type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
|
|
|
|
|
|
def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or a tree; an element is wrapped in an
    ElementTree first.  The file is written with ``encoding`` if given,
    else with the document's own encoding, else as UTF-8.  The file URL
    is printed before the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    tree = doc
    if not isinstance(tree, etree._ElementTree):
        tree = etree.ElementTree(tree)
    handle, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        tree.write(out, method="html",
                   encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
|
|
|
|
|
|
################################################################################
|
|
# configure Element class lookup
|
|
################################################################################
|
|
|
|
class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on unchanged to
    ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to the element classes
        # registered in HtmlElementClassLookup.
        self.set_element_class_lookup(HtmlElementClassLookup())
|
|
|
|
|
|
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed on unchanged to
    ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Same element class lookup as the HTML parser uses.
        self.set_element_class_lookup(HtmlElementClassLookup())
|
|
|
|
|
|
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are forwarded to the ``makeelement()`` method of the
    module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)
|
|
|
|
|
|
# Module-level default parser instances.  ``html_parser`` is the parser
# used by parse() and Element() when no other parser is supplied.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
|