| author | 2022-11-13 23:46:45 +0530 |
|---|---|
| committer | 2022-11-13 23:46:45 +0530 |
| commit | 9468226a9e2e2ab8cdd599f1d8538e860ca86120 (patch) |
| tree | 0a77ada226d6db80639f96b438bf83e4e756edb5 /env/lib/python3.10/site-packages/lxml/html |
id card generator
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html')
28 files changed, 5024 insertions(+), 0 deletions(-)
diff --git a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py new file mode 100644 index 0000000..c35365d --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py @@ -0,0 +1,10 @@ +__doc__ = """Legacy interface to the BeautifulSoup HTML parser. +""" + +__all__ = ["parse", "convert_tree"] + +from .soupparser import convert_tree, parse as _parse + +def parse(file, beautifulsoup=None, makeelement=None): + root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement) + return root.getroot() diff --git a/env/lib/python3.10/site-packages/lxml/html/__init__.py b/env/lib/python3.10/site-packages/lxml/html/__init__.py new file mode 100644 index 0000000..ef06a40 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__init__.py @@ -0,0 +1,1946 @@ +# Copyright (c) 2004 Ian Bicking. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. Neither the name of Ian Bicking nor the names of its contributors may +# be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""The ``lxml.html`` tool set for HTML handling. +""" + +from __future__ import absolute_import + +__all__ = [ + 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', + 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse'] + + +import copy +import sys +import re +from functools import partial + +try: + from collections.abc import MutableMapping, MutableSet +except ImportError: + from collections import MutableMapping, MutableSet + +from .. import etree +from . 
import defs +from ._setmixin import SetMixin + +try: + from urlparse import urljoin +except ImportError: + # Python 3 + from urllib.parse import urljoin + +try: + unicode +except NameError: + # Python 3 + unicode = str +try: + basestring +except NameError: + # Python 3 + basestring = (str, bytes) + + +def __fix_docstring(s): + if not s: + return s + if sys.version_info[0] >= 3: + sub = re.compile(r"^(\s*)u'", re.M).sub + else: + sub = re.compile(r"^(\s*)b'", re.M).sub + return sub(r"\1'", s) + + +XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" + +_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", + namespaces={'x':XHTML_NAMESPACE}) +_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", + namespaces={'x':XHTML_NAMESPACE}) +_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", + namespaces={'x':XHTML_NAMESPACE}) +#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) +_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") +_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_collect_string_content = etree.XPath("string()") +_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer +_iter_css_imports = re.compile(r'@import "(.*?)"').finditer +_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", + namespaces={'x':XHTML_NAMESPACE}) +_archive_re = re.compile(r'[^ ]+') +_parse_meta_refresh_url = re.compile( + r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search + + +def _unquote_match(s, pos): + if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": + return s[1:-1], pos+1 + else: + return s,pos + + +def _transform_result(typ, result): + """Convert the result back into the input type. + """ + if issubclass(typ, bytes): + return tostring(result, encoding='utf-8') + elif issubclass(typ, unicode): + return tostring(result, encoding='unicode') + else: + return result + + +def _nons(tag): + if isinstance(tag, basestring): + if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: + return tag.split('}')[-1] + return tag + + +class Classes(MutableSet): + """Provides access to an element's class attribute as a set-like collection. + Usage:: + + >>> el = fromstring('<p class="hidden large">Text</p>') + >>> classes = el.classes # or: classes = Classes(el.attrib) + >>> classes |= ['block', 'paragraph'] + >>> el.get('class') + 'hidden large block paragraph' + >>> classes.toggle('hidden') + False + >>> el.get('class') + 'large block paragraph' + >>> classes -= ('some', 'classes', 'block') + >>> el.get('class') + 'large paragraph' + """ + def __init__(self, attributes): + self._attributes = attributes + self._get_class_value = partial(attributes.get, 'class', '') + + def add(self, value): + """ + Add a class. + + This has no effect if the class is already present. + """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = self._get_class_value().split() + if value in classes: + return + classes.append(value) + self._attributes['class'] = ' '.join(classes) + + def discard(self, value): + """ + Remove a class if it is currently present. + + If the class is not present, do nothing. 
+ """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = [name for name in self._get_class_value().split() + if name != value] + if classes: + self._attributes['class'] = ' '.join(classes) + elif 'class' in self._attributes: + del self._attributes['class'] + + def remove(self, value): + """ + Remove a class; it must currently be present. + + If the class is not present, raise a KeyError. + """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + super(Classes, self).remove(value) + + def __contains__(self, name): + classes = self._get_class_value() + return name in classes and name in classes.split() + + def __iter__(self): + return iter(self._get_class_value().split()) + + def __len__(self): + return len(self._get_class_value().split()) + + # non-standard methods + + def update(self, values): + """ + Add all names from 'values'. + """ + classes = self._get_class_value().split() + extended = False + for value in values: + if value not in classes: + classes.append(value) + extended = True + if extended: + self._attributes['class'] = ' '.join(classes) + + def toggle(self, value): + """ + Add a class name if it isn't there yet, or remove it if it exists. + + Returns true if the class was added (and is now enabled) and + false if it was removed (and is now disabled). + """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = self._get_class_value().split() + try: + classes.remove(value) + enabled = False + except ValueError: + classes.append(value) + enabled = True + if classes: + self._attributes['class'] = ' '.join(classes) + else: + del self._attributes['class'] + return enabled + + +class HtmlMixin(object): + + def set(self, key, value=None): + """set(self, key, value=None) + + Sets an element attribute. If no value is provided, or if the value is None, + creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" + for ``form.set('novalidate')``. + """ + super(HtmlMixin, self).set(key, value) + + @property + def classes(self): + """ + A set-like wrapper around the 'class' attribute. + """ + return Classes(self.attrib) + + @classes.setter + def classes(self, classes): + assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. + value = classes._get_class_value() + if value: + self.set('class', value) + elif self.get('class') is not None: + del self.attrib['class'] + + @property + def base_url(self): + """ + Returns the base URL, given when the page was parsed. + + Use with ``urlparse.urljoin(el.base_url, href)`` to get + absolute URLs. + """ + return self.getroottree().docinfo.URL + + @property + def forms(self): + """ + Return a list of all the forms + """ + return _forms_xpath(self) + + @property + def body(self): + """ + Return the <body> element. Can be called from a child element + to get the document's head. + """ + return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def head(self): + """ + Returns the <head> element. Can be called from a child + element to get the document's head. + """ + return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def label(self): + """ + Get or set any <label> element associated with this element. 
+ """ + id = self.get('id') + if not id: + return None + result = _label_xpath(self, id=id) + if not result: + return None + else: + return result[0] + + @label.setter + def label(self, label): + id = self.get('id') + if not id: + raise TypeError( + "You cannot set a label for an element (%r) that has no id" + % self) + if _nons(label.tag) != 'label': + raise TypeError( + "You can only assign label to a label element (not %r)" + % label) + label.set('for', id) + + @label.deleter + def label(self): + label = self.label + if label is not None: + del label.attrib['for'] + + def drop_tree(self): + """ + Removes this element from the tree, including its children and + text. The tail text is joined to the previous element or + parent. + """ + parent = self.getparent() + assert parent is not None + if self.tail: + previous = self.getprevious() + if previous is None: + parent.text = (parent.text or '') + self.tail + else: + previous.tail = (previous.tail or '') + self.tail + parent.remove(self) + + def drop_tag(self): + """ + Remove the tag, but not its children or text. The children and text + are merged into the parent. + + Example:: + + >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') + >>> h.find('.//b').drop_tag() + >>> print(tostring(h, encoding='unicode')) + <div>Hello World!</div> + """ + parent = self.getparent() + assert parent is not None + previous = self.getprevious() + if self.text and isinstance(self.tag, basestring): + # not a Comment, etc. + if previous is None: + parent.text = (parent.text or '') + self.text + else: + previous.tail = (previous.tail or '') + self.text + if self.tail: + if len(self): + last = self[-1] + last.tail = (last.tail or '') + self.tail + elif previous is None: + parent.text = (parent.text or '') + self.tail + else: + previous.tail = (previous.tail or '') + self.tail + index = parent.index(self) + parent[index:index+1] = self[:] + + def find_rel_links(self, rel): + """ + Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. + """ + rel = rel.lower() + return [el for el in _rel_links_xpath(self) + if el.get('rel').lower() == rel] + + def find_class(self, class_name): + """ + Find any elements with the given class name. + """ + return _class_xpath(self, class_name=class_name) + + def get_element_by_id(self, id, *default): + """ + Get the first element in a document with the given id. If none is + found, return the default argument if provided or raise KeyError + otherwise. + + Note that there can be more than one element with the same id, + and this isn't uncommon in HTML documents found in the wild. + Browsers return only the first match, and this function does + the same. + """ + try: + # FIXME: should this check for multiple matches? + # browsers just return the first one + return _id_xpath(self, id=id)[0] + except IndexError: + if default: + return default[0] + else: + raise KeyError(id) + + def text_content(self): + """ + Return the text content of the tag (and the text in any children). + """ + return _collect_string_content(self) + + def cssselect(self, expr, translator='html'): + """ + Run the CSS expression on this element and its children, + returning a list of the results. + + Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) + -- note that pre-compiling the expression can provide a substantial + speedup. + """ + # Do the import here to make the dependency optional. 
+ from lxml.cssselect import CSSSelector + return CSSSelector(expr, translator=translator)(self) + + ######################################## + ## Link functions + ######################################## + + def make_links_absolute(self, base_url=None, resolve_base_href=True, + handle_failures=None): + """ + Make all links in the document absolute, given the + ``base_url`` for the document (the full URL where the document + came from), or if no ``base_url`` is given, then the ``.base_url`` + of the document. + + If ``resolve_base_href`` is true, then any ``<base href>`` + tags in the document are used *and* removed from the document. + If it is false then any such tag is ignored. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. + """ + if base_url is None: + base_url = self.base_url + if base_url is None: + raise TypeError( + "No base_url given, and the document has no base_url") + if resolve_base_href: + self.resolve_base_href() + + if handle_failures == 'ignore': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return href + elif handle_failures == 'discard': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return None + elif handle_failures is None: + def link_repl(href): + return urljoin(base_url, href) + else: + raise ValueError( + "unexpected value for handle_failures: %r" % handle_failures) + + self.rewrite_links(link_repl) + + def resolve_base_href(self, handle_failures=None): + """ + Find any ``<base href>`` tag in the document, and apply its + values to all links found in the document. Also remove the + tag once it has been applied. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. + """ + base_href = None + basetags = self.xpath('//base[@href]|//x:base[@href]', + namespaces={'x': XHTML_NAMESPACE}) + for b in basetags: + base_href = b.get('href') + b.drop_tree() + if not base_href: + return + self.make_links_absolute(base_href, resolve_base_href=False, + handle_failures=handle_failures) + + def iterlinks(self): + """ + Yield (element, attribute, link, pos), where attribute may be None + (indicating the link is in the text). ``pos`` is the position + where the link occurs; often 0, but sometimes something else in + the case of links in stylesheets or style tags. + + Note: <base href> is *not* taken into account in any way. The + link you get is exactly the link in the document. + + Note: multiple links inside of a single text string or + attribute value are returned in reversed order. This makes it + possible to replace or delete them from the text string value + based on their reported text positions. Otherwise, a + modification at one text position can change the positions of + links reported later on. 
+ """ + link_attrs = defs.link_attrs + for el in self.iter(etree.Element): + attribs = el.attrib + tag = _nons(el.tag) + if tag == 'object': + codebase = None + ## <object> tags have attributes that are relative to + ## codebase + if 'codebase' in attribs: + codebase = el.get('codebase') + yield (el, 'codebase', codebase, 0) + for attrib in ('classid', 'data'): + if attrib in attribs: + value = el.get(attrib) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, attrib, value, 0) + if 'archive' in attribs: + for match in _archive_re.finditer(el.get('archive')): + value = match.group(0) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, 'archive', value, match.start()) + else: + for attrib in link_attrs: + if attrib in attribs: + yield (el, attrib, attribs[attrib], 0) + if tag == 'meta': + http_equiv = attribs.get('http-equiv', '').lower() + if http_equiv == 'refresh': + content = attribs.get('content', '') + match = _parse_meta_refresh_url(content) + url = (match.group('url') if match else content).strip() + # unexpected content means the redirect won't work, but we might + # as well be permissive and return the entire string. + if url: + url, pos = _unquote_match( + url, match.start('url') if match else content.find(url)) + yield (el, 'content', url, pos) + elif tag == 'param': + valuetype = el.get('valuetype') or '' + if valuetype.lower() == 'ref': + ## FIXME: while it's fine we *find* this link, + ## according to the spec we aren't supposed to + ## actually change the value, including resolving + ## it. It can also still be a link, even if it + ## doesn't have a valuetype="ref" (which seems to be the norm) + ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype + yield (el, 'value', el.get('value'), 0) + elif tag == 'style' and el.text: + urls = [ + # (start_pos, url) + _unquote_match(match.group(1), match.start(1))[::-1] + for match in _iter_css_urls(el.text) + ] + [ + (match.start(1), match.group(1)) + for match in _iter_css_imports(el.text) + ] + if urls: + # sort by start pos to bring both match sets back into order + # and reverse the list to report correct positions despite + # modifications + urls.sort(reverse=True) + for start, url in urls: + yield (el, None, url, start) + if 'style' in attribs: + urls = list(_iter_css_urls(attribs['style'])) + if urls: + # return in reversed order to simplify in-place modifications + for match in urls[::-1]: + url, start = _unquote_match(match.group(1), match.start(1)) + yield (el, 'style', url, start) + + def rewrite_links(self, link_repl_func, resolve_base_href=True, + base_href=None): + """ + Rewrite all the links in the document. For each link + ``link_repl_func(link)`` will be called, and the return value + will replace the old link. + + Note that links may not be absolute (unless you first called + ``make_links_absolute()``), and may be internal (e.g., + ``'#anchor'``). They can also be values like + ``'mailto:email'`` or ``'javascript:expr'``. + + If you give ``base_href`` then all links passed to + ``link_repl_func()`` will take that into account. + + If the ``link_repl_func`` returns None, the attribute or + tag text will be removed completely. 
+ """ + if base_href is not None: + # FIXME: this can be done in one pass with a wrapper + # around link_repl_func + self.make_links_absolute( + base_href, resolve_base_href=resolve_base_href) + elif resolve_base_href: + self.resolve_base_href() + + for el, attrib, link, pos in self.iterlinks(): + new_link = link_repl_func(link.strip()) + if new_link == link: + continue + if new_link is None: + # Remove the attribute or element content + if attrib is None: + el.text = '' + else: + del el.attrib[attrib] + continue + + if attrib is None: + new = el.text[:pos] + new_link + el.text[pos+len(link):] + el.text = new + else: + cur = el.get(attrib) + if not pos and len(cur) == len(link): + new = new_link # most common case + else: + new = cur[:pos] + new_link + cur[pos+len(link):] + el.set(attrib, new) + + +class _MethodFunc(object): + """ + An object that represents a method on an element as a function; + the function takes either an element or an HTML string. It + returns whatever the function normally returns, or if the function + works in-place (and so returns None) it returns a serialized form + of the resulting document. + """ + def __init__(self, name, copy=False, source_class=HtmlMixin): + self.name = name + self.copy = copy + self.__doc__ = getattr(source_class, self.name).__doc__ + def __call__(self, doc, *args, **kw): + result_type = type(doc) + if isinstance(doc, basestring): + if 'copy' in kw: + raise TypeError( + "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) + doc = fromstring(doc, **kw) + else: + if 'copy' in kw: + make_a_copy = kw.pop('copy') + else: + make_a_copy = self.copy + if make_a_copy: + doc = copy.deepcopy(doc) + meth = getattr(doc, self.name) + result = meth(*args, **kw) + # FIXME: this None test is a bit sloppy + if result is None: + # Then return what we got in + return _transform_result(result_type, doc) + else: + return result + + +find_rel_links = _MethodFunc('find_rel_links', copy=False) +find_class = _MethodFunc('find_class', copy=False) +make_links_absolute = _MethodFunc('make_links_absolute', copy=True) +resolve_base_href = _MethodFunc('resolve_base_href', copy=True) +iterlinks = _MethodFunc('iterlinks', copy=False) +rewrite_links = _MethodFunc('rewrite_links', copy=True) + + +class HtmlComment(HtmlMixin, etree.CommentBase): + pass + + +class HtmlElement(HtmlMixin, etree.ElementBase): + pass + + +class HtmlProcessingInstruction(HtmlMixin, etree.PIBase): + pass + + +class HtmlEntity(HtmlMixin, etree.EntityBase): + pass + + +class HtmlElementClassLookup(etree.CustomElementClassLookup): + """A lookup scheme for HTML Element classes. + + To create a lookup instance with different Element classes, pass a tag + name mapping of Element classes in the ``classes`` keyword argument and/or + a tag name mapping of Mixin classes in the ``mixins`` keyword argument. + The special key '*' denotes a Mixin class that should be mixed into all + Element classes. 
+ """ + _default_element_classes = {} + + def __init__(self, classes=None, mixins=None): + etree.CustomElementClassLookup.__init__(self) + if classes is None: + classes = self._default_element_classes.copy() + if mixins: + mixers = {} + for name, value in mixins: + if name == '*': + for n in classes.keys(): + mixers.setdefault(n, []).append(value) + else: + mixers.setdefault(name, []).append(value) + for name, mix_bases in mixers.items(): + cur = classes.get(name, HtmlElement) + bases = tuple(mix_bases + [cur]) + classes[name] = type(cur.__name__, bases, {}) + self._element_classes = classes + + def lookup(self, node_type, document, namespace, name): + if node_type == 'element': + return self._element_classes.get(name.lower(), HtmlElement) + elif node_type == 'comment': + return HtmlComment + elif node_type == 'PI': + return HtmlProcessingInstruction + elif node_type == 'entity': + return HtmlEntity + # Otherwise normal lookup + return None + + +################################################################################ +# parsing +################################################################################ + +_looks_like_full_html_unicode = re.compile( + unicode(r'^\s*<(?:html|!doctype)'), re.I).match +_looks_like_full_html_bytes = re.compile( + r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match + + +def document_fromstring(html, parser=None, ensure_head_body=False, **kw): + if parser is None: + parser = html_parser + value = etree.fromstring(html, parser, **kw) + if value is None: + raise etree.ParserError( + "Document is empty") + if ensure_head_body and value.find('head') is None: + value.insert(0, Element('head')) + if ensure_head_body and value.find('body') is None: + value.append(Element('body')) + return value + + +def fragments_fromstring(html, no_leading_text=False, base_url=None, + parser=None, **kw): + """Parses several HTML elements, returning a list of elements. + + The first item in the list may be a string. + If no_leading_text is true, then it will be an error if there is + leading text, and it will always be a list of only elements. + + base_url will set the document's base_url attribute + (and the tree's docinfo.URL). + """ + if parser is None: + parser = html_parser + # FIXME: check what happens when you give html with a body, head, etc. + if isinstance(html, bytes): + if not _looks_like_full_html_bytes(html): + # can't use %-formatting in early Py3 versions + html = ('<html><body>'.encode('ascii') + html + + '</body></html>'.encode('ascii')) + else: + if not _looks_like_full_html_unicode(html): + html = '<html><body>%s</body></html>' % html + doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) + assert _nons(doc.tag) == 'html' + bodies = [e for e in doc if _nons(e.tag) == 'body'] + assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) + body = bodies[0] + elements = [] + if no_leading_text and body.text and body.text.strip(): + raise etree.ParserError( + "There is leading text: %r" % body.text) + if body.text and body.text.strip(): + elements.append(body.text) + elements.extend(body) + # FIXME: removing the reference to the parent artificial document + # would be nice + return elements + + +def fragment_fromstring(html, create_parent=False, base_url=None, + parser=None, **kw): + """ + Parses a single HTML element; it is an error if there is more than + one element, or if anything but whitespace precedes or follows the + element. 
+ + If ``create_parent`` is true (or is a tag name) then a parent node + will be created to encapsulate the HTML in a single element. In this + case, leading or trailing text is also allowed, as are multiple elements + as result of the parsing. + + Passing a ``base_url`` will set the document's ``base_url`` attribute + (and the tree's docinfo.URL). + """ + if parser is None: + parser = html_parser + + accept_leading_text = bool(create_parent) + + elements = fragments_fromstring( + html, parser=parser, no_leading_text=not accept_leading_text, + base_url=base_url, **kw) + + if create_parent: + if not isinstance(create_parent, basestring): + create_parent = 'div' + new_root = Element(create_parent) + if elements: + if isinstance(elements[0], basestring): + new_root.text = elements[0] + del elements[0] + new_root.extend(elements) + return new_root + + if not elements: + raise etree.ParserError('No elements found') + if len(elements) > 1: + raise etree.ParserError( + "Multiple elements found (%s)" + % ', '.join([_element_name(e) for e in elements])) + el = elements[0] + if el.tail and el.tail.strip(): + raise etree.ParserError( + "Element followed by text: %r" % el.tail) + el.tail = None + return el + + +def fromstring(html, base_url=None, parser=None, **kw): + """ + Parse the html, returning a single element/document. + + This tries to minimally parse the chunk of text, without knowing if it + is a fragment or a document. + + base_url will set the document's base_url attribute (and the tree's docinfo.URL) + """ + if parser is None: + parser = html_parser + if isinstance(html, bytes): + is_full_html = _looks_like_full_html_bytes(html) + else: + is_full_html = _looks_like_full_html_unicode(html) + doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) + if is_full_html: + return doc + # otherwise, lets parse it out... + bodies = doc.findall('body') + if not bodies: + bodies = doc.findall('{%s}body' % XHTML_NAMESPACE) + if bodies: + body = bodies[0] + if len(bodies) > 1: + # Somehow there are multiple bodies, which is bad, but just + # smash them into one body + for other_body in bodies[1:]: + if other_body.text: + if len(body): + body[-1].tail = (body[-1].tail or '') + other_body.text + else: + body.text = (body.text or '') + other_body.text + body.extend(other_body) + # We'll ignore tail + # I guess we are ignoring attributes too + other_body.drop_tree() + else: + body = None + heads = doc.findall('head') + if not heads: + heads = doc.findall('{%s}head' % XHTML_NAMESPACE) + if heads: + # Well, we have some sort of structure, so lets keep it all + head = heads[0] + if len(heads) > 1: + for other_head in heads[1:]: + head.extend(other_head) + # We don't care about text or tail in a head + other_head.drop_tree() + return doc + if body is None: + return doc + if (len(body) == 1 and (not body.text or not body.text.strip()) + and (not body[-1].tail or not body[-1].tail.strip())): + # The body has just one element, so it was probably a single + # element passed in + return body[0] + # Now we have a body which represents a bunch of tags which have the + # content that was passed in. We will create a fake container, which + # is the body tag, except <body> implies too much structure. + if _contains_block_level_tag(body): + body.tag = 'div' + else: + body.tag = 'span' + return body + + +def parse(filename_or_url, parser=None, base_url=None, **kw): + """ + Parse a filename, URL, or file-like object into an HTML document + tree. Note: this returns a tree, not an element. 
Use + ``parse(...).getroot()`` to get the document root. + + You can override the base URL with the ``base_url`` keyword. This + is most useful when parsing from a file-like object. + """ + if parser is None: + parser = html_parser + return etree.parse(filename_or_url, parser, base_url=base_url, **kw) + + +def _contains_block_level_tag(el): + # FIXME: I could do this with XPath, but would that just be + # unnecessarily slow? + for el in el.iter(etree.Element): + if _nons(el.tag) in defs.block_tags: + return True + return False + + +def _element_name(el): + if isinstance(el, etree.CommentBase): + return 'comment' + elif isinstance(el, basestring): + return 'string' + else: + return _nons(el.tag) + + +################################################################################ +# form handling +################################################################################ + +class FormElement(HtmlElement): + """ + Represents a <form> element. + """ + + @property + def inputs(self): + """ + Returns an accessor for all the input elements in the form. + + See `InputGetter` for more information about the object. + """ + return InputGetter(self) + + @property + def fields(self): + """ + Dictionary-like object that represents all the fields in this + form. You can set values in this dictionary to effect the + form. + """ + return FieldsDict(self.inputs) + + @fields.setter + def fields(self, value): + fields = self.fields + prev_keys = fields.keys() + for key, value in value.items(): + if key in prev_keys: + prev_keys.remove(key) + fields[key] = value + for key in prev_keys: + if key is None: + # Case of an unnamed input; these aren't really + # expressed in form_values() anyway. + continue + fields[key] = None + + def _name(self): + if self.get('name'): + return self.get('name') + elif self.get('id'): + return '#' + self.get('id') + iter_tags = self.body.iter + forms = list(iter_tags('form')) + if not forms: + forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE)) + return str(forms.index(self)) + + def form_values(self): + """ + Return a list of tuples of the field values for the form. + This is suitable to be passed to ``urllib.urlencode()``. + """ + results = [] + for el in self.inputs: + name = el.name + if not name or 'disabled' in el.attrib: + continue + tag = _nons(el.tag) + if tag == 'textarea': + results.append((name, el.value)) + elif tag == 'select': + value = el.value + if el.multiple: + for v in value: + results.append((name, v)) + elif value is not None: + results.append((name, el.value)) + else: + assert tag == 'input', ( + "Unexpected tag: %r" % el) + if el.checkable and not el.checked: + continue + if el.type in ('submit', 'image', 'reset', 'file'): + continue + value = el.value + if value is not None: + results.append((name, el.value)) + return results + + @property + def action(self): + """ + Get/set the form's ``action`` attribute. + """ + base_url = self.base_url + action = self.get('action') + if base_url and action is not None: + return urljoin(base_url, action) + else: + return action + + @action.setter + def action(self, value): + self.set('action', value) + + @action.deleter + def action(self): + attrib = self.attrib + if 'action' in attrib: + del attrib['action'] + + @property + def method(self): + """ + Get/set the form's method. 
Always returns a capitalized + string, and defaults to ``'GET'`` + """ + return self.get('method', 'GET').upper() + + @method.setter + def method(self, value): + self.set('method', value.upper()) + + +HtmlElementClassLookup._default_element_classes['form'] = FormElement + + +def submit_form(form, extra_values=None, open_http=None): + """ + Helper function to submit a form. Returns a file-like object, as from + ``urllib.urlopen()``. This object also has a ``.geturl()`` function, + which shows the URL if there were any redirects. + + You can use this like:: + + form = doc.forms[0] + form.inputs['foo'].value = 'bar' # etc + response = form.submit() + doc = parse(response) + doc.make_links_absolute(response.geturl()) + + To change the HTTP requester, pass a function as ``open_http`` keyword + argument that opens the URL for you. The function must have the following + signature:: + + open_http(method, URL, values) + + The action is one of 'GET' or 'POST', the URL is the target URL as a + string, and the values are a sequence of ``(name, value)`` tuples with the + form data. + """ + values = form.form_values() + if extra_values: + if hasattr(extra_values, 'items'): + extra_values = extra_values.items() + values.extend(extra_values) + if open_http is None: + open_http = open_http_urllib + if form.action: + url = form.action + else: + url = form.base_url + return open_http(form.method, url, values) + + +def open_http_urllib(method, url, values): + if not url: + raise ValueError("cannot submit, no URL provided") + ## FIXME: should test that it's not a relative URL or something + try: + from urllib import urlencode, urlopen + except ImportError: # Python 3 + from urllib.request import urlopen + from urllib.parse import urlencode + if method == 'GET': + if '?' in url: + url += '&' + else: + url += '?' + url += urlencode(values) + data = None + else: + data = urlencode(values) + if not isinstance(data, bytes): + data = data.encode('ASCII') + return urlopen(url, data) + + +class FieldsDict(MutableMapping): + + def __init__(self, inputs): + self.inputs = inputs + def __getitem__(self, item): + return self.inputs[item].value + def __setitem__(self, item, value): + self.inputs[item].value = value + def __delitem__(self, item): + raise KeyError( + "You cannot remove keys from ElementDict") + def keys(self): + return self.inputs.keys() + def __contains__(self, item): + return item in self.inputs + def __iter__(self): + return iter(self.inputs.keys()) + def __len__(self): + return len(self.inputs) + + def __repr__(self): + return '<%s for form %s>' % ( + self.__class__.__name__, + self.inputs.form._name()) + + +class InputGetter(object): + + """ + An accessor that represents all the input fields in a form. + + You can get fields by name from this, with + ``form.inputs['field_name']``. If there are a set of checkboxes + with the same name, they are returned as a list (a `CheckboxGroup` + which also allows value setting). Radio inputs are handled + similarly. Use ``.keys()`` and ``.items()`` to process all fields + in this way. + + You can also iterate over this to get all input elements. This + won't return the same thing as if you get all the names, as + checkboxes and radio elements are returned individually. 
+ """ + + def __init__(self, form): + self.form = form + + def __repr__(self): + return '<%s for form %s>' % ( + self.__class__.__name__, + self.form._name()) + + ## FIXME: there should be more methods, and it's unclear if this is + ## a dictionary-like object or list-like object + + def __getitem__(self, name): + fields = [field for field in self if field.name == name] + if not fields: + raise KeyError("No input element with the name %r" % name) + + input_type = fields[0].get('type') + if input_type == 'radio' and len(fields) > 1: + group = RadioGroup(fields) + group.name = name + return group + elif input_type == 'checkbox' and len(fields) > 1: + group = CheckboxGroup(fields) + group.name = name + return group + else: + # I don't like throwing away elements like this + return fields[0] + + def __contains__(self, name): + for field in self: + if field.name == name: + return True + return False + + def keys(self): + """ + Returns all unique field names, in document order. + + :return: A list of all unique field names. + """ + names = [] + seen = {None} + for el in self: + name = el.name + if name not in seen: + names.append(name) + seen.add(name) + return names + + def items(self): + """ + Returns all fields with their names, similar to dict.items(). + + :return: A list of (name, field) tuples. + """ + items = [] + seen = set() + for el in self: + name = el.name + if name not in seen: + seen.add(name) + items.append((name, self[name])) + return items + + def __iter__(self): + return self.form.iter('select', 'input', 'textarea') + + def __len__(self): + return sum(1 for _ in self) + + +class InputMixin(object): + """ + Mix-in for all input elements (input, select, and textarea) + """ + @property + def name(self): + """ + Get/set the name of the element + """ + return self.get('name') + + @name.setter + def name(self, value): + self.set('name', value) + + @name.deleter + def name(self): + attrib = self.attrib + if 'name' in attrib: + del attrib['name'] + + def __repr__(self): + type_name = getattr(self, 'type', None) + if type_name: + type_name = ' type=%r' % type_name + else: + type_name = '' + return '<%s %x name=%r%s>' % ( + self.__class__.__name__, id(self), self.name, type_name) + + +class TextareaElement(InputMixin, HtmlElement): + """ + ``<textarea>`` element. You can get the name with ``.name`` and + get/set the value with ``.value`` + """ + @property + def value(self): + """ + Get/set the value (which is the contents of this element) + """ + content = self.text or '' + if self.tag.startswith("{%s}" % XHTML_NAMESPACE): + serialisation_method = 'xml' + else: + serialisation_method = 'html' + for el in self: + # it's rare that we actually get here, so let's not use ''.join() + content += etree.tostring( + el, method=serialisation_method, encoding='unicode') + return content + + @value.setter + def value(self, value): + del self[:] + self.text = value + + @value.deleter + def value(self): + self.text = '' + del self[:] + + +HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement + + +class SelectElement(InputMixin, HtmlElement): + """ + ``<select>`` element. You can get the name with ``.name``. + + ``.value`` will be the value of the selected option, unless this + is a multi-select element (``<select multiple>``), in which case + it will be a set-like object. In either case ``.value_options`` + gives the possible values. + + The boolean attribute ``.multiple`` shows if this is a + multi-select. 
+ """ + @property + def value(self): + """ + Get/set the value of this select (the selected option). + + If this is a multi-select, this is a set-like object that + represents all the selected options. + """ + if self.multiple: + return MultipleSelectOptions(self) + options = _options_xpath(self) + + try: + selected_option = next(el for el in reversed(options) if el.get('selected') is not None) + except StopIteration: + try: + selected_option = next(el for el in options if el.get('disabled') is None) + except StopIteration: + return None + value = selected_option.get('value') + if value is None: + value = (selected_option.text or '').strip() + return value + + @value.setter + def value(self, value): + if self.multiple: + if isinstance(value, basestring): + raise TypeError("You must pass in a sequence") + values = self.value + values.clear() + values.update(value) + return + checked_option = None + if value is not None: + for el in _options_xpath(self): + opt_value = el.get('value') + if opt_value is None: + opt_value = (el.text or '').strip() + if opt_value == value: + checked_option = el + break + else: + raise ValueError( + "There is no option with the value of %r" % value) + for el in _options_xpath(self): + if 'selected' in el.attrib: + del el.attrib['selected'] + if checked_option is not None: + checked_option.set('selected', '') + + @value.deleter + def value(self): + # FIXME: should del be allowed at all? + if self.multiple: + self.value.clear() + else: + self.value = None + + @property + def value_options(self): + """ + All the possible values this select can have (the ``value`` + attribute of all the ``<option>`` elements. + """ + options = [] + for el in _options_xpath(self): + value = el.get('value') + if value is None: + value = (el.text or '').strip() + options.append(value) + return options + + @property + def multiple(self): + """ + Boolean attribute: is there a ``multiple`` attribute on this element. + """ + return 'multiple' in self.attrib + + @multiple.setter + def multiple(self, value): + if value: + self.set('multiple', '') + elif 'multiple' in self.attrib: + del self.attrib['multiple'] + + +HtmlElementClassLookup._default_element_classes['select'] = SelectElement + + +class MultipleSelectOptions(SetMixin): + """ + Represents all the selected options in a ``<select multiple>`` element. + + You can add to this set-like option to select an option, or remove + to unselect the option. + """ + + def __init__(self, select): + self.select = select + + @property + def options(self): + """ + Iterator of all the ``<option>`` elements. 
+ """ + return iter(_options_xpath(self.select)) + + def __iter__(self): + for option in self.options: + if 'selected' in option.attrib: + opt_value = option.get('value') + if opt_value is None: + opt_value = (option.text or '').strip() + yield opt_value + + def add(self, item): + for option in self.options: + opt_value = option.get('value') + if opt_value is None: + opt_value = (option.text or '').strip() + if opt_value == item: + option.set('selected', '') + break + else: + raise ValueError( + "There is no option with the value %r" % item) + + def remove(self, item): + for option in self.options: + opt_value = option.get('value') + if opt_value is None: + opt_value = (option.text or '').strip() + if opt_value == item: + if 'selected' in option.attrib: + del option.attrib['selected'] + else: + raise ValueError( + "The option %r is not currently selected" % item) + break + else: + raise ValueError( + "There is not option with the value %r" % item) + + def __repr__(self): + return '<%s {%s} for select name=%r>' % ( + self.__class__.__name__, + ', '.join([repr(v) for v in self]), + self.select.name) + + +class RadioGroup(list): + """ + This object represents several ``<input type=radio>`` elements + that have the same name. + + You can use this like a list, but also use the property + ``.value`` to check/uncheck inputs. Also you can use + ``.value_options`` to get the possible values. + """ + @property + def value(self): + """ + Get/set the value, which checks the radio with that value (and + unchecks any other value). + """ + for el in self: + if 'checked' in el.attrib: + return el.get('value') + return None + + @value.setter + def value(self, value): + checked_option = None + if value is not None: + for el in self: + if el.get('value') == value: + checked_option = el + break + else: + raise ValueError("There is no radio input with the value %r" % value) + for el in self: + if 'checked' in el.attrib: + del el.attrib['checked'] + if checked_option is not None: + checked_option.set('checked', '') + + @value.deleter + def value(self): + self.value = None + + @property + def value_options(self): + """ + Returns a list of all the possible values. + """ + return [el.get('value') for el in self] + + def __repr__(self): + return '%s(%s)' % ( + self.__class__.__name__, + list.__repr__(self)) + + +class CheckboxGroup(list): + """ + Represents a group of checkboxes (``<input type=checkbox>``) that + have the same name. + + In addition to using this like a list, the ``.value`` attribute + returns a set-like object that you can add to or remove from to + check and uncheck checkboxes. You can also use ``.value_options`` + to get the possible values. + """ + @property + def value(self): + """ + Return a set-like object that can be modified to check or + uncheck individual checkboxes according to their value. + """ + return CheckboxValues(self) + + @value.setter + def value(self, value): + values = self.value + values.clear() + if not hasattr(value, '__iter__'): + raise ValueError( + "A CheckboxGroup (name=%r) must be set to a sequence (not %r)" + % (self[0].name, value)) + values.update(value) + + @value.deleter + def value(self): + self.value.clear() + + @property + def value_options(self): + """ + Returns a list of all the possible values. 
+ """ + return [el.get('value') for el in self] + + def __repr__(self): + return '%s(%s)' % ( + self.__class__.__name__, list.__repr__(self)) + + +class CheckboxValues(SetMixin): + """ + Represents the values of the checked checkboxes in a group of + checkboxes with the same name. + """ + + def __init__(self, group): + self.group = group + + def __iter__(self): + return iter([ + el.get('value') + for el in self.group + if 'checked' in el.attrib]) + + def add(self, value): + for el in self.group: + if el.get('value') == value: + el.set('checked', '') + break + else: + raise KeyError("No checkbox with value %r" % value) + + def remove(self, value): + for el in self.group: + if el.get('value') == value: + if 'checked' in el.attrib: + del el.attrib['checked'] + else: + raise KeyError( + "The checkbox with value %r was already unchecked" % value) + break + else: + raise KeyError( + "No checkbox with value %r" % value) + + def __repr__(self): + return '<%s {%s} for checkboxes name=%r>' % ( + self.__class__.__name__, + ', '.join([repr(v) for v in self]), + self.group.name) + + +class InputElement(InputMixin, HtmlElement): + """ + Represents an ``<input>`` element. + + You can get the type with ``.type`` (which is lower-cased and + defaults to ``'text'``). + + Also you can get and set the value with ``.value`` + + Checkboxes and radios have the attribute ``input.checkable == + True`` (for all others it is false) and a boolean attribute + ``.checked``. + + """ + + ## FIXME: I'm a little uncomfortable with the use of .checked + @property + def value(self): + """ + Get/set the value of this element, using the ``value`` attribute. + + Also, if this is a checkbox and it has no value, this defaults + to ``'on'``. If it is a checkbox or radio that is not + checked, this returns None. + """ + if self.checkable: + if self.checked: + return self.get('value') or 'on' + else: + return None + return self.get('value') + + @value.setter + def value(self, value): + if self.checkable: + if not value: + self.checked = False + else: + self.checked = True + if isinstance(value, basestring): + self.set('value', value) + else: + self.set('value', value) + + @value.deleter + def value(self): + if self.checkable: + self.checked = False + else: + if 'value' in self.attrib: + del self.attrib['value'] + + @property + def type(self): + """ + Return the type of this element (using the type attribute). + """ + return self.get('type', 'text').lower() + + @type.setter + def type(self, value): + self.set('type', value) + + @property + def checkable(self): + """ + Boolean: can this element be checked? + """ + return self.type in ('checkbox', 'radio') + + @property + def checked(self): + """ + Boolean attribute to get/set the presence of the ``checked`` + attribute. + + You can only use this on checkable input types. + """ + if not self.checkable: + raise AttributeError('Not a checkable input type') + return 'checked' in self.attrib + + @checked.setter + def checked(self, value): + if not self.checkable: + raise AttributeError('Not a checkable input type') + if value: + self.set('checked', '') + else: + attrib = self.attrib + if 'checked' in attrib: + del attrib['checked'] + + +HtmlElementClassLookup._default_element_classes['input'] = InputElement + + +class LabelElement(HtmlElement): + """ + Represents a ``<label>`` element. + + Label elements are linked to other elements with their ``for`` + attribute. You can access this element with ``label.for_element``. 
+ """ + @property + def for_element(self): + """ + Get/set the element this label points to. Return None if it + can't be found. + """ + id = self.get('for') + if not id: + return None + return self.body.get_element_by_id(id) + + @for_element.setter + def for_element(self, other): + id = other.get('id') + if not id: + raise TypeError( + "Element %r has no id attribute" % other) + self.set('for', id) + + @for_element.deleter + def for_element(self): + attrib = self.attrib + if 'id' in attrib: + del attrib['id'] + + +HtmlElementClassLookup._default_element_classes['label'] = LabelElement + + +############################################################ +## Serialization +############################################################ + +def html_to_xhtml(html): + """Convert all tags in an HTML tree to XHTML by moving them to the + XHTML namespace. + """ + try: + html = html.getroot() + except AttributeError: + pass + prefix = "{%s}" % XHTML_NAMESPACE + for el in html.iter(etree.Element): + tag = el.tag + if tag[0] != '{': + el.tag = prefix + tag + + +def xhtml_to_html(xhtml): + """Convert all tags in an XHTML tree to HTML by removing their + XHTML namespace. + """ + try: + xhtml = xhtml.getroot() + except AttributeError: + pass + prefix = "{%s}" % XHTML_NAMESPACE + prefix_len = len(prefix) + for el in xhtml.iter(prefix + "*"): + el.tag = el.tag[prefix_len:] + + +# This isn't a general match, but it's a match for what libxml2 +# specifically serialises: +__str_replace_meta_content_type = re.compile( + r'<meta http-equiv="Content-Type"[^>]*>').sub +__bytes_replace_meta_content_type = re.compile( + r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub + + +def tostring(doc, pretty_print=False, include_meta_content_type=False, + encoding=None, method="html", with_tail=True, doctype=None): + """Return an HTML string representation of the document. + + Note: if include_meta_content_type is true this will create a + ``<meta http-equiv="Content-Type" ...>`` tag in the head; + regardless of the value of include_meta_content_type any existing + ``<meta http-equiv="Content-Type" ...>`` tag will be removed + + The ``encoding`` argument controls the output encoding (defaults to + ASCII, with &#...; character references for any characters outside + of ASCII). Note that you can pass the name ``'unicode'`` as + ``encoding`` argument to serialise to a Unicode string. + + The ``method`` argument defines the output method. It defaults to + 'html', but can also be 'xml' for xhtml output, or 'text' to + serialise to plain text without markup. + + To leave out the tail text of the top-level element that is being + serialised, pass ``with_tail=False``. + + The ``doctype`` option allows passing in a plain string that will + be serialised before the XML tree. Note that passing in non + well-formed content here will make the XML output non well-formed. + Also, an existing doctype in the document tree will not be removed + when serialising an ElementTree instance. + + Example:: + + >>> from lxml import html + >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>') + + >>> html.tostring(root) + b'<p>Hello<br>world!</p>' + >>> html.tostring(root, method='html') + b'<p>Hello<br>world!</p>' + + >>> html.tostring(root, method='xml') + b'<p>Hello<br/>world!</p>' + + >>> html.tostring(root, method='text') + b'Helloworld!' + + >>> html.tostring(root, method='text', encoding='unicode') + u'Helloworld!' 
+ + >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>') + >>> html.tostring(root[0], method='text', encoding='unicode') + u'Helloworld!TAIL' + + >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False) + u'Helloworld!' + + >>> doc = html.document_fromstring('<p>Hello<br>world!</p>') + >>> html.tostring(doc, method='html', encoding='unicode') + u'<html><body><p>Hello<br>world!</p></body></html>' + + >>> print(html.tostring(doc, method='html', encoding='unicode', + ... doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"' + ... ' "http://www.w3.org/TR/html4/strict.dtd">')) + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> + <html><body><p>Hello<br>world!</p></body></html> + """ + html = etree.tostring(doc, method=method, pretty_print=pretty_print, + encoding=encoding, with_tail=with_tail, + doctype=doctype) + if method == 'html' and not include_meta_content_type: + if isinstance(html, str): + html = __str_replace_meta_content_type('', html) + else: + html = __bytes_replace_meta_content_type(bytes(), html) + return html + + +tostring.__doc__ = __fix_docstring(tostring.__doc__) + + +def open_in_browser(doc, encoding=None): + """ + Open the HTML document in a web browser, saving it to a temporary + file to open it. Note that this does not delete the file after + use. This is mainly meant for debugging. + """ + import os + import webbrowser + import tempfile + if not isinstance(doc, etree._ElementTree): + doc = etree.ElementTree(doc) + handle, fn = tempfile.mkstemp(suffix='.html') + f = os.fdopen(handle, 'wb') + try: + doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") + finally: + # we leak the file itself here, but we should at least close it + f.close() + url = 'file://' + fn.replace(os.path.sep, '/') + print(url) + webbrowser.open(url) + + +################################################################################ +# configure Element class lookup +################################################################################ + +class HTMLParser(etree.HTMLParser): + """An HTML parser that is configured to return lxml.html Element + objects. + """ + def __init__(self, **kwargs): + super(HTMLParser, self).__init__(**kwargs) + self.set_element_class_lookup(HtmlElementClassLookup()) + + +class XHTMLParser(etree.XMLParser): + """An XML parser that is configured to return lxml.html Element + objects. + + Note that this parser is not really XHTML aware unless you let it + load a DTD that declares the HTML entities. To do this, make sure + you have the XHTML DTDs installed in your catalogs, and create the + parser like this:: + + >>> parser = XHTMLParser(load_dtd=True) + + If you additionally want to validate the document, use this:: + + >>> parser = XHTMLParser(dtd_validation=True) + + For catalog support, see http://www.xmlsoft.org/catalog.html. + """ + def __init__(self, **kwargs): + super(XHTMLParser, self).__init__(**kwargs) + self.set_element_class_lookup(HtmlElementClassLookup()) + + +def Element(*args, **kw): + """Create a new HTML Element. + + This can also be used for XHTML documents. 
+ """ + v = html_parser.makeelement(*args, **kw) + return v + + +html_parser = HTMLParser() +xhtml_parser = XHTMLParser() diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..a378207 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..4bc5785 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..fa25497 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..b243408 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..a2de006 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..b915259 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..c343b40 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..8dc2d4b --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..c029ed9 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..049161a --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc diff --git 
a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..6208e67 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..3293704 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc Binary files differnew file mode 100644 index 0000000..d76e7dd --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc diff --git a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py new file mode 100644 index 0000000..e0502c0 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py @@ -0,0 +1,88 @@ +from __future__ import absolute_import + +import optparse +import sys +import re +import os +from .diff import htmldiff + +description = """\ +""" + +parser = optparse.OptionParser( + usage="%prog [OPTIONS] FILE1 FILE2\n" + "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...", + description=description, + ) + +parser.add_option( + '-o', '--output', + metavar="FILE", + dest="output", + default="-", + help="File to write the difference to", + ) + +parser.add_option( + '-a', '--annotation', + action="store_true", + dest="annotation", + help="Do an annotation") + +def main(args=None): + if args is None: + args = sys.argv[1:] + options, args = parser.parse_args(args) + if options.annotation: + return annotate(options, args) + if len(args) != 2: + print('Error: you must give two files') + parser.print_help() + sys.exit(1) + file1, file2 = args + input1 = read_file(file1) + input2 = read_file(file2) + body1 = split_body(input1)[1] + pre, body2, post = split_body(input2) + result = htmldiff(body1, body2) + result = pre + result + post + if options.output == '-': + if not result.endswith('\n'): + result += '\n' + sys.stdout.write(result) + else: + with open(options.output, 'wb') as f: + f.write(result) + +def read_file(filename): + if filename == '-': + c = sys.stdin.read() + elif not os.path.exists(filename): + raise OSError( + "Input file %s does not exist" % filename) + else: + with open(filename, 'rb') as f: + c = f.read() + return c + +body_start_re = re.compile( + r"<body.*?>", re.I|re.S) +body_end_re = re.compile( + r"</body.*?>", re.I|re.S) + +def split_body(html): + pre = post = '' + match = body_start_re.search(html) + if match: + pre = html[:match.end()] + html = html[match.end():] + match = body_end_re.search(html) + if match: + post = html[match.start():] + html = html[:match.start()] + return pre, html, post + +def annotate(options, args): + print("Not yet implemented") + sys.exit(1) + diff --git a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py new file mode 100644 index 0000000..3405c20 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py @@ -0,0 +1,100 @@ +""" +Legacy module - don't use in new code! 
+ +html5lib now has its own proper implementation. + +This module implements a tree builder for html5lib that generates lxml +html element trees. This module uses camelCase as it follows the +html5lib style guide. +""" + +from html5lib.treebuilders import _base, etree as etree_builders +from lxml import html, etree + + +class DocumentType(object): + + def __init__(self, name, publicId, systemId): + self.name = name + self.publicId = publicId + self.systemId = systemId + +class Document(object): + + def __init__(self): + self._elementTree = None + self.childNodes = [] + + def appendChild(self, element): + self._elementTree.getroot().addnext(element._element) + + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = None + commentClass = None + fragmentClass = Document + + def __init__(self, *args, **kwargs): + html_builder = etree_builders.getETreeModule(html, fullTree=False) + etree_builder = etree_builders.getETreeModule(etree, fullTree=False) + self.elementClass = html_builder.Element + self.commentClass = etree_builder.Comment + _base.TreeBuilder.__init__(self, *args, **kwargs) + + def reset(self): + _base.TreeBuilder.reset(self) + self.rootInserted = False + self.initialComments = [] + self.doctype = None + + def getDocument(self): + return self.document._elementTree + + def getFragment(self): + fragment = [] + element = self.openElements[0]._element + if element.text: + fragment.append(element.text) + fragment.extend(element.getchildren()) + if element.tail: + fragment.append(element.tail) + return fragment + + def insertDoctype(self, name, publicId, systemId): + doctype = self.doctypeClass(name, publicId, systemId) + self.doctype = doctype + + def insertComment(self, data, parent=None): + if not self.rootInserted: + self.initialComments.append(data) + else: + _base.TreeBuilder.insertComment(self, data, parent) + + def insertRoot(self, name): + buf = [] + if self.doctype and self.doctype.name: + buf.append('<!DOCTYPE %s' % self.doctype.name) + if self.doctype.publicId is not None or self.doctype.systemId is not None: + buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId, + self.doctype.systemId)) + buf.append('>') + buf.append('<html></html>') + root = html.fromstring(''.join(buf)) + + # Append the initial comments: + for comment in self.initialComments: + root.addprevious(etree.Comment(comment)) + + # Create the root document and add the ElementTree to it + self.document = self.documentClass() + self.document._elementTree = root.getroottree() + + # Add the root element to the internal child/open data structures + root_element = self.elementClass(name) + root_element._element = root + self.document.childNodes.append(root_element) + self.openElements.append(root_element) + + self.rootInserted = True diff --git a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py new file mode 100644 index 0000000..c99738e --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py @@ -0,0 +1,56 @@ +try: + from collections.abc import MutableSet +except ImportError: + from collections import MutableSet + + +class SetMixin(MutableSet): + + """ + Mix-in for sets. 
You must define __iter__, add, remove + """ + + def __len__(self): + length = 0 + for item in self: + length += 1 + return length + + def __contains__(self, item): + for has_item in self: + if item == has_item: + return True + return False + + issubset = MutableSet.__le__ + issuperset = MutableSet.__ge__ + + union = MutableSet.__or__ + intersection = MutableSet.__and__ + difference = MutableSet.__sub__ + symmetric_difference = MutableSet.__xor__ + + def copy(self): + return set(self) + + def update(self, other): + self |= other + + def intersection_update(self, other): + self &= other + + def difference_update(self, other): + self -= other + + def symmetric_difference_update(self, other): + self ^= other + + def discard(self, item): + try: + self.remove(item) + except KeyError: + pass + + @classmethod + def _from_iterable(cls, it): + return set(it) diff --git a/env/lib/python3.10/site-packages/lxml/html/builder.py b/env/lib/python3.10/site-packages/lxml/html/builder.py new file mode 100644 index 0000000..8a074ec --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/builder.py @@ -0,0 +1,133 @@ +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# Copyright (c) 1999-2004 by Fredrik Lundh +# -------------------------------------------------------------------- + +""" +A set of HTML generator tags for building HTML documents. + +Usage:: + + >>> from lxml.html.builder import * + >>> html = HTML( + ... HEAD( TITLE("Hello World") ), + ... BODY( CLASS("main"), + ... H1("Hello World !") + ... ) + ... ) + + >>> import lxml.etree + >>> print lxml.etree.tostring(html, pretty_print=True) + <html> + <head> + <title>Hello World</title> + </head> + <body class="main"> + <h1>Hello World !</h1> + </body> + </html> + +""" + +from lxml.builder import ElementMaker +from lxml.html import html_parser + +E = ElementMaker(makeelement=html_parser.makeelement) + +# elements +A = E.a #: anchor +ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.) 
+ACRONYM = E.acronym #: +ADDRESS = E.address #: information on author +APPLET = E.applet #: Java applet (DEPRECATED) +AREA = E.area #: client-side image map area +B = E.b #: bold text style +BASE = E.base #: document base URI +BASEFONT = E.basefont #: base font size (DEPRECATED) +BDO = E.bdo #: I18N BiDi over-ride +BIG = E.big #: large text style +BLOCKQUOTE = E.blockquote #: long quotation +BODY = E.body #: document body +BR = E.br #: forced line break +BUTTON = E.button #: push button +CAPTION = E.caption #: table caption +CENTER = E.center #: shorthand for DIV align=center (DEPRECATED) +CITE = E.cite #: citation +CODE = E.code #: computer code fragment +COL = E.col #: table column +COLGROUP = E.colgroup #: table column group +DD = E.dd #: definition description +DEL = getattr(E, 'del') #: deleted text +DFN = E.dfn #: instance definition +DIR = E.dir #: directory list (DEPRECATED) +DIV = E.div #: generic language/style container +DL = E.dl #: definition list +DT = E.dt #: definition term +EM = E.em #: emphasis +FIELDSET = E.fieldset #: form control group +FONT = E.font #: local change to font (DEPRECATED) +FORM = E.form #: interactive form +FRAME = E.frame #: subwindow +FRAMESET = E.frameset #: window subdivision +H1 = E.h1 #: heading +H2 = E.h2 #: heading +H3 = E.h3 #: heading +H4 = E.h4 #: heading +H5 = E.h5 #: heading +H6 = E.h6 #: heading +HEAD = E.head #: document head +HR = E.hr #: horizontal rule +HTML = E.html #: document root element +I = E.i #: italic text style +IFRAME = E.iframe #: inline subwindow +IMG = E.img #: Embedded image +INPUT = E.input #: form control +INS = E.ins #: inserted text +ISINDEX = E.isindex #: single line prompt (DEPRECATED) +KBD = E.kbd #: text to be entered by the user +LABEL = E.label #: form field label text +LEGEND = E.legend #: fieldset legend +LI = E.li #: list item +LINK = E.link #: a media-independent link +MAP = E.map #: client-side image map +MENU = E.menu #: menu list (DEPRECATED) +META = E.meta #: generic metainformation +NOFRAMES = E.noframes #: alternate content container for non frame-based rendering +NOSCRIPT = E.noscript #: alternate content container for non script-based rendering +OBJECT = E.object #: generic embedded object +OL = E.ol #: ordered list +OPTGROUP = E.optgroup #: option group +OPTION = E.option #: selectable choice +P = E.p #: paragraph +PARAM = E.param #: named property value +PRE = E.pre #: preformatted text +Q = E.q #: short inline quotation +S = E.s #: strike-through text style (DEPRECATED) +SAMP = E.samp #: sample program output, scripts, etc. 
+SCRIPT = E.script #: script statements +SELECT = E.select #: option selector +SMALL = E.small #: small text style +SPAN = E.span #: generic language/style container +STRIKE = E.strike #: strike-through text (DEPRECATED) +STRONG = E.strong #: strong emphasis +STYLE = E.style #: style info +SUB = E.sub #: subscript +SUP = E.sup #: superscript +TABLE = E.table #: +TBODY = E.tbody #: table body +TD = E.td #: table data cell +TEXTAREA = E.textarea #: multi-line text field +TFOOT = E.tfoot #: table footer +TH = E.th #: table header cell +THEAD = E.thead #: table header +TITLE = E.title #: document title +TR = E.tr #: table row +TT = E.tt #: teletype or monospaced text style +U = E.u #: underlined text style (DEPRECATED) +UL = E.ul #: unordered list +VAR = E.var #: instance of a variable or program argument + +# attributes (only reserved words are included here) +ATTR = dict +def CLASS(v): return {'class': v} +def FOR(v): return {'for': v} diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so Binary files differnew file mode 100755 index 0000000..31087ea --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py new file mode 100644 index 0000000..e6b0543 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/clean.py @@ -0,0 +1,786 @@ +# cython: language_level=3str + +"""A cleanup tool for HTML. + +Removes unwanted tags and content. See the `Cleaner` class for +details. +""" + +from __future__ import absolute_import + +import copy +import re +import sys +try: + from urlparse import urlsplit + from urllib import unquote_plus +except ImportError: + # Python 3 + from urllib.parse import urlsplit, unquote_plus +from lxml import etree +from lxml.html import defs +from lxml.html import fromstring, XHTML_NAMESPACE +from lxml.html import xhtml_to_html, _transform_result + +try: + unichr +except NameError: + # Python 3 + unichr = chr +try: + unicode +except NameError: + # Python 3 + unicode = str +try: + basestring +except NameError: + basestring = (str, bytes) + + +__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', + 'word_break', 'word_break_html'] + +# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# Particularly the CSS cleaning; most of the tag cleaning is integrated now +# I have multiple kinds of schemes searched; but should schemes be +# whitelisted instead? +# max height? +# remove images? Also in CSS? background attribute? +# Some way to whitelist object, iframe, etc (e.g., if you want to +# allow *just* embedded YouTube movies) +# Log what was deleted and why? +# style="behavior: ..." might be bad in IE? +# Should we have something for just <meta http-equiv>? That's the worst of the +# metas. +# UTF-7 detections? Example: +# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- +# you don't always have to have the charset set, if the page has no charset +# and there's UTF7-like code in it. 
+# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php + + +# This is an IE-specific construct you can have in a stylesheet to +# run some Javascript: +_replace_css_javascript = re.compile( + r'expression\s*\(.*?\)', re.S|re.I).sub + +# Do I have to worry about @\nimport? +_replace_css_import = re.compile( + r'@\s*import', re.I).sub + +_looks_like_tag_content = re.compile( + r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', + *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search + +# All kinds of schemes besides just javascript: that can cause +# execution: +_find_image_dataurls = re.compile( + r'data:image/(.+);base64,', re.I).findall +_possibly_malicious_schemes = re.compile( + r'(javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).findall +# SVG images can contain script content +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search + +def _has_javascript_scheme(s): + safe_image_urls = 0 + for image_type in _find_image_dataurls(s): + if _is_unsafe_image_type(image_type): + return True + safe_image_urls += 1 + return len(_possibly_malicious_schemes(s)) > safe_image_urls + +_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub + +# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx +_conditional_comment_re = re.compile( + r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) + +_find_styled_elements = etree.XPath( + "descendant-or-self::*[@style]") + +_find_external_links = etree.XPath( + ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" + "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), + namespaces={'x':XHTML_NAMESPACE}) + + +class Cleaner(object): + """ + Instances cleans the document of each of the possible offending + elements. The cleaning is controlled by attributes; you can + override attributes in a subclass, or set them in the constructor. + + ``scripts``: + Removes any ``<script>`` tags. + + ``javascript``: + Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets + as they could contain Javascript. + + ``comments``: + Removes any comments. + + ``style``: + Removes any style tags. + + ``inline_style`` + Removes any style attributes. Defaults to the value of the ``style`` option. + + ``links``: + Removes any ``<link>`` tags + + ``meta``: + Removes any ``<meta>`` tags + + ``page_structure``: + Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. + + ``processing_instructions``: + Removes any processing instructions. + + ``embedded``: + Removes any embedded objects (flash, iframes) + + ``frames``: + Removes any frame-related tags + + ``forms``: + Removes any form tags + + ``annoying_tags``: + Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` + + ``remove_tags``: + A list of tags to remove. Only the tags will be removed, + their content will get pulled up into the parent tag. + + ``kill_tags``: + A list of tags to kill. Killing also removes the tag's content, + i.e. the whole subtree, not just the tag itself. + + ``allow_tags``: + A list of tags to include (default include all). + + ``remove_unknown_tags``: + Remove any tags that aren't standard parts of HTML. + + ``safe_attrs_only``: + If true, only include 'safe' attributes (specifically the list + from the feedparser HTML sanitisation web site). + + ``safe_attrs``: + A set of attribute names to override the default list of attributes + considered 'safe' (when safe_attrs_only=True). 
+ + ``add_nofollow``: + If true, then any <a> tags will have ``rel="nofollow"`` added to them. + + ``host_whitelist``: + A list or set of hosts that you can use for embedded content + (for content like ``<object>``, ``<link rel="stylesheet">``, etc). + You can also implement/override the method + ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to + implement more complex rules for what can be embedded. + Anything that passes this test will be shown, regardless of + the value of (for instance) ``embedded``. + + Note that this parameter might not work as intended if you do not + make the links absolute before doing the cleaning. + + Note that you may also need to set ``whitelist_tags``. + + ``whitelist_tags``: + A set of tags that can be included with ``host_whitelist``. + The default is ``iframe`` and ``embed``; you may wish to + include other tags like ``script``, or you may want to + implement ``allow_embedded_url`` for more control. Set to None to + include all tags. + + This modifies the document *in place*. + """ + + scripts = True + javascript = True + comments = True + style = False + inline_style = None + links = True + meta = True + page_structure = True + processing_instructions = True + embedded = True + frames = True + forms = True + annoying_tags = True + remove_tags = None + allow_tags = None + kill_tags = None + remove_unknown_tags = True + safe_attrs_only = True + safe_attrs = defs.safe_attrs + add_nofollow = False + host_whitelist = () + whitelist_tags = {'iframe', 'embed'} + + def __init__(self, **kw): + not_an_attribute = object() + for name, value in kw.items(): + default = getattr(self, name, not_an_attribute) + if (default is not None and default is not True and default is not False + and not isinstance(default, (frozenset, set, tuple, list))): + raise TypeError( + "Unknown parameter: %s=%r" % (name, value)) + setattr(self, name, value) + if self.inline_style is None and 'inline_style' not in kw: + self.inline_style = self.style + + if kw.get("allow_tags"): + if kw.get("remove_unknown_tags"): + raise ValueError("It does not make sense to pass in both " + "allow_tags and remove_unknown_tags") + self.remove_unknown_tags = False + + # Used to lookup the primary URL for a given tag that is up for + # removal: + _tag_link_attrs = dict( + script='src', + link='href', + # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html + # From what I can tell, both attributes can contain a link: + applet=['code', 'object'], + iframe='src', + embed='src', + layer='src', + # FIXME: there doesn't really seem like a general way to figure out what + # links an <object> tag uses; links often go in <param> tags with values + # that we don't really know. You'd have to have knowledge about specific + # kinds of plugins (probably keyed off classid), and match against those. + ##object=?, + # FIXME: not looking at the action currently, because it is more complex + # than than -- if you keep the form, you should keep the form controls. + ##form='action', + a='href', + ) + + def __call__(self, doc): + """ + Cleans the document. + """ + try: + getroot = doc.getroot + except AttributeError: + pass # Element instance + else: + doc = getroot() # ElementTree instance, instead of an element + # convert XHTML to HTML + xhtml_to_html(doc) + # Normalize a case that IE treats <image> like <img>, and that + # can confuse either this step or later steps. 
+ for el in doc.iter('image'): + el.tag = 'img' + if not self.comments: + # Of course, if we were going to kill comments anyway, we don't + # need to worry about this + self.kill_conditional_comments(doc) + + kill_tags = set(self.kill_tags or ()) + remove_tags = set(self.remove_tags or ()) + allow_tags = set(self.allow_tags or ()) + + if self.scripts: + kill_tags.add('script') + if self.safe_attrs_only: + safe_attrs = set(self.safe_attrs) + for el in doc.iter(etree.Element): + attrib = el.attrib + for aname in attrib.keys(): + if aname not in safe_attrs: + del attrib[aname] + if self.javascript: + if not (self.safe_attrs_only and + self.safe_attrs == defs.safe_attrs): + # safe_attrs handles events attributes itself + for el in doc.iter(etree.Element): + attrib = el.attrib + for aname in attrib.keys(): + if aname.startswith('on'): + del attrib[aname] + doc.rewrite_links(self._remove_javascript_link, + resolve_base_href=False) + # If we're deleting style then we don't have to remove JS links + # from styles, otherwise... + if not self.inline_style: + for el in _find_styled_elements(doc): + old = el.get('style') + new = _replace_css_javascript('', old) + new = _replace_css_import('', new) + if self._has_sneaky_javascript(new): + # Something tricky is going on... + del el.attrib['style'] + elif new != old: + el.set('style', new) + if not self.style: + for el in list(doc.iter('style')): + if el.get('type', '').lower().strip() == 'text/javascript': + el.drop_tree() + continue + old = el.text or '' + new = _replace_css_javascript('', old) + # The imported CSS can do anything; we just can't allow: + new = _replace_css_import('', new) + if self._has_sneaky_javascript(new): + # Something tricky is going on... + el.text = '/* deleted */' + elif new != old: + el.text = new + if self.comments: + kill_tags.add(etree.Comment) + if self.processing_instructions: + kill_tags.add(etree.ProcessingInstruction) + if self.style: + kill_tags.add('style') + if self.inline_style: + etree.strip_attributes(doc, 'style') + if self.links: + kill_tags.add('link') + elif self.style or self.javascript: + # We must get rid of included stylesheets if Javascript is not + # allowed, as you can put Javascript in them + for el in list(doc.iter('link')): + if 'stylesheet' in el.get('rel', '').lower(): + # Note this kills alternate stylesheets as well + if not self.allow_element(el): + el.drop_tree() + if self.meta: + kill_tags.add('meta') + if self.page_structure: + remove_tags.update(('head', 'html', 'title')) + if self.embedded: + # FIXME: is <layer> really embedded? + # We should get rid of any <param> tags not inside <applet>; + # These are not really valid anyway. + for el in list(doc.iter('param')): + parent = el.getparent() + while parent is not None and parent.tag not in ('applet', 'object'): + parent = parent.getparent() + if parent is None: + el.drop_tree() + kill_tags.update(('applet',)) + # The alternate contents that are in an iframe are a good fallback: + remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) + if self.frames: + # FIXME: ideally we should look at the frame links, but + # generally frames don't mix properly with an HTML + # fragment anyway. 
+ kill_tags.update(defs.frame_tags) + if self.forms: + remove_tags.add('form') + kill_tags.update(('button', 'input', 'select', 'textarea')) + if self.annoying_tags: + remove_tags.update(('blink', 'marquee')) + + _remove = [] + _kill = [] + for el in doc.iter(): + if el.tag in kill_tags: + if self.allow_element(el): + continue + _kill.append(el) + elif el.tag in remove_tags: + if self.allow_element(el): + continue + _remove.append(el) + + if _remove and _remove[0] == doc: + # We have to drop the parent-most tag, which we can't + # do. Instead we'll rewrite it: + el = _remove.pop(0) + el.tag = 'div' + el.attrib.clear() + elif _kill and _kill[0] == doc: + # We have to drop the parent-most element, which we can't + # do. Instead we'll clear it: + el = _kill.pop(0) + if el.tag != 'html': + el.tag = 'div' + el.clear() + + _kill.reverse() # start with innermost tags + for el in _kill: + el.drop_tree() + for el in _remove: + el.drop_tag() + + if self.remove_unknown_tags: + if allow_tags: + raise ValueError( + "It does not make sense to pass in both allow_tags and remove_unknown_tags") + allow_tags = set(defs.tags) + if allow_tags: + # make sure we do not remove comments/PIs if users want them (which is rare enough) + if not self.comments: + allow_tags.add(etree.Comment) + if not self.processing_instructions: + allow_tags.add(etree.ProcessingInstruction) + + bad = [] + for el in doc.iter(): + if el.tag not in allow_tags: + bad.append(el) + if bad: + if bad[0] is doc: + el = bad.pop(0) + el.tag = 'div' + el.attrib.clear() + for el in bad: + el.drop_tag() + if self.add_nofollow: + for el in _find_external_links(doc): + if not self.allow_follow(el): + rel = el.get('rel') + if rel: + if ('nofollow' in rel + and ' nofollow ' in (' %s ' % rel)): + continue + rel = '%s nofollow' % rel + else: + rel = 'nofollow' + el.set('rel', rel) + + def allow_follow(self, anchor): + """ + Override to suppress rel="nofollow" on some anchors. + """ + return False + + def allow_element(self, el): + """ + Decide whether an element is configured to be accepted or rejected. + + :param el: an element. + :return: true to accept the element or false to reject/discard it. + """ + if el.tag not in self._tag_link_attrs: + return False + attr = self._tag_link_attrs[el.tag] + if isinstance(attr, (list, tuple)): + for one_attr in attr: + url = el.get(one_attr) + if not url: + return False + if not self.allow_embedded_url(el, url): + return False + return True + else: + url = el.get(attr) + if not url: + return False + return self.allow_embedded_url(el, url) + + def allow_embedded_url(self, el, url): + """ + Decide whether a URL that was found in an element's attributes or text + if configured to be accepted or rejected. + + :param el: an element. + :param url: a URL found on the element. + :return: true to accept the URL and false to reject it. + """ + if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: + return False + scheme, netloc, path, query, fragment = urlsplit(url) + netloc = netloc.lower().split(':', 1)[0] + if scheme not in ('http', 'https'): + return False + if netloc in self.host_whitelist: + return True + return False + + def kill_conditional_comments(self, doc): + """ + IE conditional comments basically embed HTML that the parser + doesn't normally see. We can't allow anything like that, so + we'll kill any comments that could be conditional. 
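        For example, a comment like
        ``<!--[if IE]><script>alert(1)</script><![endif]-->`` matches the
        conditional pattern above and is dropped along with everything
        inside it.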
+ """ + has_conditional_comment = _conditional_comment_re.search + self._kill_elements( + doc, lambda el: has_conditional_comment(el.text), + etree.Comment) + + def _kill_elements(self, doc, condition, iterate=None): + bad = [] + for el in doc.iter(iterate): + if condition(el): + bad.append(el) + for el in bad: + el.drop_tree() + + def _remove_javascript_link(self, link): + # links like "j a v a s c r i p t:" might be interpreted in IE + new = _substitute_whitespace('', unquote_plus(link)) + if _has_javascript_scheme(new): + # FIXME: should this be None to delete? + return '' + return link + + _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub + + def _has_sneaky_javascript(self, style): + """ + Depending on the browser, stuff like ``e x p r e s s i o n(...)`` + can get interpreted, or ``expre/* stuff */ssion(...)``. This + checks for attempt to do stuff like this. + + Typically the response will be to kill the entire style; if you + have just a bit of Javascript in the style another rule will catch + that and remove only the Javascript from the style; this catches + more sneaky attempts. + """ + style = self._substitute_comments('', style) + style = style.replace('\\', '') + style = _substitute_whitespace('', style) + style = style.lower() + if _has_javascript_scheme(style): + return True + if 'expression(' in style: + return True + if '@import' in style: + return True + if '</noscript' in style: + # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">' + return True + if _looks_like_tag_content(style): + # e.g. '<math><style><img src=x onerror=alert(1)></style></math>' + return True + return False + + def clean_html(self, html): + result_type = type(html) + if isinstance(html, basestring): + doc = fromstring(html) + else: + doc = copy.deepcopy(html) + self(doc) + return _transform_result(result_type, doc) + +clean = Cleaner() +clean_html = clean.clean_html + +############################################################ +## Autolinking +############################################################ + +_link_regexes = [ + re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), + # This is conservative, but autolinking can be a bit conservative: + re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I), + ] + +_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] + +_avoid_hosts = [ + re.compile(r'^localhost', re.I), + re.compile(r'\bexample\.(?:com|org|net)$', re.I), + re.compile(r'^127\.0\.0\.1$'), + ] + +_avoid_classes = ['nolink'] + +def autolink(el, link_regexes=_link_regexes, + avoid_elements=_avoid_elements, + avoid_hosts=_avoid_hosts, + avoid_classes=_avoid_classes): + """ + Turn any URLs into links. + + It will search for links identified by the given regular + expressions (by default mailto and http(s) links). + + It won't link text in an element in avoid_elements, or an element + with a class in avoid_classes. It won't link to anything with a + host that matches one of the regular expressions in avoid_hosts + (default localhost and 127.0.0.1). + + If you pass in an element, the element's tail will not be + substituted, only the contents of the element. 
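    A rough sketch using the string-based wrapper ``autolink_html`` defined
    below (hosts matching ``avoid_hosts``, e.g. ``example.com``, are left
    untouched, so a different host is used here)::

        >>> result = autolink_html('<p>See http://lxml.de/ for more</p>')
        >>> 'href="http://lxml.de/"' in result
        True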
+ """ + if el.tag in avoid_elements: + return + class_name = el.get('class') + if class_name: + class_name = class_name.split() + for match_class in avoid_classes: + if match_class in class_name: + return + for child in list(el): + autolink(child, link_regexes=link_regexes, + avoid_elements=avoid_elements, + avoid_hosts=avoid_hosts, + avoid_classes=avoid_classes) + if child.tail: + text, tail_children = _link_text( + child.tail, link_regexes, avoid_hosts, factory=el.makeelement) + if tail_children: + child.tail = text + index = el.index(child) + el[index+1:index+1] = tail_children + if el.text: + text, pre_children = _link_text( + el.text, link_regexes, avoid_hosts, factory=el.makeelement) + if pre_children: + el.text = text + el[:0] = pre_children + +def _link_text(text, link_regexes, avoid_hosts, factory): + leading_text = '' + links = [] + last_pos = 0 + while 1: + best_match, best_pos = None, None + for regex in link_regexes: + regex_pos = last_pos + while 1: + match = regex.search(text, pos=regex_pos) + if match is None: + break + host = match.group('host') + for host_regex in avoid_hosts: + if host_regex.search(host): + regex_pos = match.end() + break + else: + break + if match is None: + continue + if best_pos is None or match.start() < best_pos: + best_match = match + best_pos = match.start() + if best_match is None: + # No more matches + if links: + assert not links[-1].tail + links[-1].tail = text + else: + assert not leading_text + leading_text = text + break + link = best_match.group(0) + end = best_match.end() + if link.endswith('.') or link.endswith(','): + # These punctuation marks shouldn't end a link + end -= 1 + link = link[:-1] + prev_text = text[:best_match.start()] + if links: + assert not links[-1].tail + links[-1].tail = prev_text + else: + assert not leading_text + leading_text = prev_text + anchor = factory('a') + anchor.set('href', link) + body = best_match.group('body') + if not body: + body = link + if body.endswith('.') or body.endswith(','): + body = body[:-1] + anchor.text = body + links.append(anchor) + text = text[end:] + return leading_text, links + +def autolink_html(html, *args, **kw): + result_type = type(html) + if isinstance(html, basestring): + doc = fromstring(html) + else: + doc = copy.deepcopy(html) + autolink(doc, *args, **kw) + return _transform_result(result_type, doc) + +autolink_html.__doc__ = autolink.__doc__ + +############################################################ +## Word wrapping +############################################################ + +_avoid_word_break_elements = ['pre', 'textarea', 'code'] +_avoid_word_break_classes = ['nobreak'] + +def word_break(el, max_width=40, + avoid_elements=_avoid_word_break_elements, + avoid_classes=_avoid_word_break_classes, + break_character=unichr(0x200b)): + """ + Breaks any long words found in the body of the text (not attributes). + + Doesn't effect any of the tags in avoid_elements, by default + ``<textarea>`` and ``<pre>`` + + Breaks words by inserting ​, which is a unicode character + for Zero Width Space character. This generally takes up no space + in rendering, but does copy as a space, and in monospace contexts + usually takes up space. 
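    A rough sketch with the string-based helper ``word_break_html`` defined
    below; the break character is only inserted into words longer than
    ``max_width``, so short text passes through unchanged::

        >>> word_break_html('<p>hello world</p>') == '<p>hello world</p>'
        True
        >>> word_break_html('<p>%s</p>' % ('x' * 60)) != '<p>%s</p>' % ('x' * 60)
        True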
+ + See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion + """ + # Character suggestion of ​ comes from: + # http://www.cs.tut.fi/~jkorpela/html/nobr.html + if el.tag in _avoid_word_break_elements: + return + class_name = el.get('class') + if class_name: + dont_break = False + class_name = class_name.split() + for avoid in avoid_classes: + if avoid in class_name: + dont_break = True + break + if dont_break: + return + if el.text: + el.text = _break_text(el.text, max_width, break_character) + for child in el: + word_break(child, max_width=max_width, + avoid_elements=avoid_elements, + avoid_classes=avoid_classes, + break_character=break_character) + if child.tail: + child.tail = _break_text(child.tail, max_width, break_character) + +def word_break_html(html, *args, **kw): + result_type = type(html) + doc = fromstring(html) + word_break(doc, *args, **kw) + return _transform_result(result_type, doc) + +def _break_text(text, max_width, break_character): + words = text.split() + for word in words: + if len(word) > max_width: + replacement = _insert_break(word, max_width, break_character) + text = text.replace(word, replacement) + return text + +_break_prefer_re = re.compile(r'[^a-z]', re.I) + +def _insert_break(word, width, break_character): + orig_word = word + result = '' + while len(word) > width: + start = word[:width] + breaks = list(_break_prefer_re.finditer(start)) + if breaks: + last_break = breaks[-1] + # Only walk back up to 10 characters to find a nice break: + if last_break.end() > width-10: + # FIXME: should the break character be at the end of the + # chunk, or the beginning of the next chunk? + start = word[:last_break.end()] + result += start + break_character + word = word[len(start):] + result += word + return result + diff --git a/env/lib/python3.10/site-packages/lxml/html/defs.py b/env/lib/python3.10/site-packages/lxml/html/defs.py new file mode 100644 index 0000000..2058ea3 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/defs.py @@ -0,0 +1,135 @@ +# FIXME: this should all be confirmed against what a DTD says +# (probably in a test; this may not match the DTD exactly, but we +# should document just how it differs). + +""" +Data taken from https://www.w3.org/TR/html401/index/elements.html +and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements +for html5_tags. 
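The names below are plain ``frozenset`` objects, so the usual membership
tests and set operations apply; for instance::

    >>> 'br' in empty_tags
    True
    >>> 'div' in block_tags and 'td' in table_tags
    True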
+""" + +empty_tags = frozenset([ + 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', + 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track']) + +deprecated_tags = frozenset([ + 'applet', 'basefont', 'center', 'dir', 'font', 'isindex', + 'menu', 's', 'strike', 'u']) + +# archive actually takes a space-separated list of URIs +link_attrs = frozenset([ + 'action', 'archive', 'background', 'cite', 'classid', + 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', + 'usemap', + # Not standard: + 'dynsrc', 'lowsrc', + # HTML5 formaction + 'formaction' + ]) + +# Not in the HTML 4 spec: +# onerror, onresize +event_attrs = frozenset([ + 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', + 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', + 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', + 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit', + 'onunload', + ]) + +safe_attrs = frozenset([ + 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', + 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', + 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', + 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', + 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', + 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', + 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', + 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', + 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', + 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) + +# From http://htmlhelp.com/reference/html40/olist.html +top_level_tags = frozenset([ + 'html', 'head', 'body', 'frameset', + ]) + +head_tags = frozenset([ + 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title', + ]) + +general_block_tags = frozenset([ + 'address', + 'blockquote', + 'center', + 'del', + 'div', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'hr', + 'ins', + 'isindex', + 'noscript', + 'p', + 'pre', + ]) + +list_tags = frozenset([ + 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul', + ]) + +table_tags = frozenset([ + 'table', 'caption', 'colgroup', 'col', + 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th', + ]) + +# just this one from +# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm +block_tags = general_block_tags | list_tags | table_tags | frozenset([ + # Partial form tags + 'fieldset', 'form', 'legend', 'optgroup', 'option', + ]) + +form_tags = frozenset([ + 'form', 'button', 'fieldset', 'legend', 'input', 'label', + 'select', 'optgroup', 'option', 'textarea', + ]) + +special_inline_tags = frozenset([ + 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe', + 'img', 'map', 'area', 'object', 'param', 'q', 'script', + 'span', 'sub', 'sup', + ]) + +phrase_tags = frozenset([ + 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em', + 'ins', 'kbd', 'samp', 'strong', 'var', + ]) + +font_style_tags = frozenset([ + 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u', + ]) + +frame_tags = frozenset([ + 'frameset', 'frame', 'noframes', + ]) + +html5_tags = frozenset([ + 'article', 'aside', 'audio', 'canvas', 'command', 'datalist', + 'details', 'embed', 'figcaption', 'figure', 'footer', 'header', + 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output', + 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary', + 'svg', 'time', 'track', 'video', 'wbr' + ]) + +# These tags aren't standard 
+nonstandard_tags = frozenset(['blink', 'marquee']) + + +tags = (top_level_tags | head_tags | general_block_tags | list_tags + | table_tags | form_tags | special_inline_tags | phrase_tags + | font_style_tags | nonstandard_tags | html5_tags) diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so Binary files differnew file mode 100755 index 0000000..0c11b90 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py new file mode 100644 index 0000000..39bec78 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/diff.py @@ -0,0 +1,884 @@ +# cython: language_level=3 + +from __future__ import absolute_import + +import difflib +from lxml import etree +from lxml.html import fragment_fromstring +import re + +__all__ = ['html_annotate', 'htmldiff'] + +try: + from html import escape as html_escape +except ImportError: + from cgi import escape as html_escape +try: + _unicode = unicode +except NameError: + # Python 3 + _unicode = str +try: + basestring +except NameError: + # Python 3 + basestring = str + +############################################################ +## Annotation +############################################################ + +def default_markup(text, version): + return '<span title="%s">%s</span>' % ( + html_escape(_unicode(version), 1), text) + +def html_annotate(doclist, markup=default_markup): + """ + doclist should be ordered from oldest to newest, like:: + + >>> version1 = 'Hello World' + >>> version2 = 'Goodbye World' + >>> print(html_annotate([(version1, 'version 1'), + ... (version2, 'version 2')])) + <span title="version 2">Goodbye</span> <span title="version 1">World</span> + + The documents must be *fragments* (str/UTF8 or unicode), not + complete documents + + The markup argument is a function to markup the spans of words. + This function is called like markup('Hello', 'version 2'), and + returns HTML. The first argument is text and never includes any + markup. The default uses a span with a title: + + >>> print(default_markup('Some Text', 'by Joe')) + <span title="by Joe">Some Text</span> + """ + # The basic strategy we have is to split the documents up into + # logical tokens (which are words with attached markup). We then + # do diffs of each of the versions to track when a token first + # appeared in the document; the annotation attached to the token + # is the version where it first appeared. 
+ tokenlist = [tokenize_annotated(doc, version) + for doc, version in doclist] + cur_tokens = tokenlist[0] + for tokens in tokenlist[1:]: + html_annotate_merge_annotations(cur_tokens, tokens) + cur_tokens = tokens + + # After we've tracked all the tokens, we can combine spans of text + # that are adjacent and have the same annotation + cur_tokens = compress_tokens(cur_tokens) + # And finally add markup + result = markup_serialize_tokens(cur_tokens, markup) + return ''.join(result).strip() + +def tokenize_annotated(doc, annotation): + """Tokenize a document and add an annotation attribute to each token + """ + tokens = tokenize(doc, include_hrefs=False) + for tok in tokens: + tok.annotation = annotation + return tokens + +def html_annotate_merge_annotations(tokens_old, tokens_new): + """Merge the annotations from tokens_old into tokens_new, when the + tokens in the new document already existed in the old document. + """ + s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) + commands = s.get_opcodes() + + for command, i1, i2, j1, j2 in commands: + if command == 'equal': + eq_old = tokens_old[i1:i2] + eq_new = tokens_new[j1:j2] + copy_annotations(eq_old, eq_new) + +def copy_annotations(src, dest): + """ + Copy annotations from the tokens listed in src to the tokens in dest + """ + assert len(src) == len(dest) + for src_tok, dest_tok in zip(src, dest): + dest_tok.annotation = src_tok.annotation + +def compress_tokens(tokens): + """ + Combine adjacent tokens when there is no HTML between the tokens, + and they share an annotation + """ + result = [tokens[0]] + for tok in tokens[1:]: + if (not result[-1].post_tags and + not tok.pre_tags and + result[-1].annotation == tok.annotation): + compress_merge_back(result, tok) + else: + result.append(tok) + return result + +def compress_merge_back(tokens, tok): + """ Merge tok into the last element of tokens (modifying the list of + tokens in-place). """ + last = tokens[-1] + if type(last) is not token or type(tok) is not token: + tokens.append(tok) + else: + text = _unicode(last) + if last.trailing_whitespace: + text += last.trailing_whitespace + text += tok + merged = token(text, + pre_tags=last.pre_tags, + post_tags=tok.post_tags, + trailing_whitespace=tok.trailing_whitespace) + merged.annotation = last.annotation + tokens[-1] = merged + +def markup_serialize_tokens(tokens, markup_func): + """ + Serialize the list of tokens into a list of text chunks, calling + markup_func around text to add annotations. + """ + for token in tokens: + for pre in token.pre_tags: + yield pre + html = token.html() + html = markup_func(html, token.annotation) + if token.trailing_whitespace: + html += token.trailing_whitespace + yield html + for post in token.post_tags: + yield post + + +############################################################ +## HTML Diffs +############################################################ + +def htmldiff(old_html, new_html): + ## FIXME: this should take parsed documents too, and use their body + ## or other content. + """ Do a diff of the old and new document. The documents are HTML + *fragments* (str/UTF8 or unicode), they are not complete documents + (i.e., no <html> tag). + + Returns HTML with <ins> and <del> tags added around the + appropriate text. + + Markup is generally ignored, with the markup from new_html + preserved, and possibly some markup from old_html (though it is + considered acceptable to lose some of the old markup). Only the + words in the HTML are diffed. 
The exception is <img> tags, which + are treated like words, and the href attribute of <a> tags, which + are noted inside the tag itself when there are changes. + """ + old_html_tokens = tokenize(old_html) + new_html_tokens = tokenize(new_html) + result = htmldiff_tokens(old_html_tokens, new_html_tokens) + result = ''.join(result).strip() + return fixup_ins_del_tags(result) + +def htmldiff_tokens(html1_tokens, html2_tokens): + """ Does a diff on the tokens themselves, returning a list of text + chunks (not tokens). + """ + # There are several passes as we do the differences. The tokens + # isolate the portion of the content we care to diff; difflib does + # all the actual hard work at that point. + # + # Then we must create a valid document from pieces of both the old + # document and the new document. We generally prefer to take + # markup from the new document, and only do a best effort attempt + # to keep markup from the old document; anything that we can't + # resolve we throw away. Also we try to put the deletes as close + # to the location where we think they would have been -- because + # we are only keeping the markup from the new document, it can be + # fuzzy where in the new document the old text would have gone. + # Again we just do a best effort attempt. + s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) + commands = s.get_opcodes() + result = [] + for command, i1, i2, j1, j2 in commands: + if command == 'equal': + result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) + continue + if command == 'insert' or command == 'replace': + ins_tokens = expand_tokens(html2_tokens[j1:j2]) + merge_insert(ins_tokens, result) + if command == 'delete' or command == 'replace': + del_tokens = expand_tokens(html1_tokens[i1:i2]) + merge_delete(del_tokens, result) + # If deletes were inserted directly as <del> then we'd have an + # invalid document at this point. Instead we put in special + # markers, and when the complete diffed document has been created + # we try to move the deletes around and resolve any problems. + result = cleanup_delete(result) + + return result + +def expand_tokens(tokens, equal=False): + """Given a list of tokens, return a generator of the chunks of + text for the data in the tokens. + """ + for token in tokens: + for pre in token.pre_tags: + yield pre + if not equal or not token.hide_when_equal: + if token.trailing_whitespace: + yield token.html() + token.trailing_whitespace + else: + yield token.html() + for post in token.post_tags: + yield post + +def merge_insert(ins_chunks, doc): + """ doc is the already-handled document (as a list of text chunks); + here we add <ins>ins_chunks</ins> to the end of that. """ + # Though we don't throw away unbalanced_start or unbalanced_end + # (we assume there is accompanying markup later or earlier in the + # document), we only put <ins> around the balanced portion. 
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) + doc.extend(unbalanced_start) + if doc and not doc[-1].endswith(' '): + # Fix up the case where the word before the insert didn't end with + # a space + doc[-1] += ' ' + doc.append('<ins>') + if balanced and balanced[-1].endswith(' '): + # We move space outside of </ins> + balanced[-1] = balanced[-1][:-1] + doc.extend(balanced) + doc.append('</ins> ') + doc.extend(unbalanced_end) + +# These are sentinels to represent the start and end of a <del> +# segment, until we do the cleanup phase to turn them into proper +# markup: +class DEL_START: + pass +class DEL_END: + pass + +class NoDeletes(Exception): + """ Raised when the document no longer contains any pending deletes + (DEL_START/DEL_END) """ + +def merge_delete(del_chunks, doc): + """ Adds the text chunks in del_chunks to the document doc (another + list of text chunks) with marker to show it is a delete. + cleanup_delete later resolves these markers into <del> tags.""" + doc.append(DEL_START) + doc.extend(del_chunks) + doc.append(DEL_END) + +def cleanup_delete(chunks): + """ Cleans up any DEL_START/DEL_END markers in the document, replacing + them with <del></del>. To do this while keeping the document + valid, it may need to drop some tags (either start or end tags). + + It may also move the del into adjacent tags to try to move it to a + similar location where it was originally located (e.g., moving a + delete into preceding <div> tag, if the del looks like (DEL_START, + 'Text</div>', DEL_END)""" + while 1: + # Find a pending DEL_START/DEL_END, splitting the document + # into stuff-preceding-DEL_START, stuff-inside, and + # stuff-following-DEL_END + try: + pre_delete, delete, post_delete = split_delete(chunks) + except NoDeletes: + # Nothing found, we've cleaned up the entire doc + break + # The stuff-inside-DEL_START/END may not be well balanced + # markup. First we figure out what unbalanced portions there are: + unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) + # Then we move the span forward and/or backward based on these + # unbalanced portions: + locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) + locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) + doc = pre_delete + if doc and not doc[-1].endswith(' '): + # Fix up case where the word before us didn't have a trailing space + doc[-1] += ' ' + doc.append('<del>') + if balanced and balanced[-1].endswith(' '): + # We move space outside of </del> + balanced[-1] = balanced[-1][:-1] + doc.extend(balanced) + doc.append('</del> ') + doc.extend(post_delete) + chunks = doc + return chunks + +def split_unbalanced(chunks): + """Return (unbalanced_start, balanced, unbalanced_end), where each is + a list of text and tag chunks. + + unbalanced_start is a list of all the tags that are opened, but + not closed in this span. Similarly, unbalanced_end is a list of + tags that are closed but were not opened. 
Extracting these might + mean some reordering of the chunks.""" + start = [] + end = [] + tag_stack = [] + balanced = [] + for chunk in chunks: + if not chunk.startswith('<'): + balanced.append(chunk) + continue + endtag = chunk[1] == '/' + name = chunk.split()[0].strip('<>/') + if name in empty_tags: + balanced.append(chunk) + continue + if endtag: + if tag_stack and tag_stack[-1][0] == name: + balanced.append(chunk) + name, pos, tag = tag_stack.pop() + balanced[pos] = tag + elif tag_stack: + start.extend([tag for name, pos, tag in tag_stack]) + tag_stack = [] + end.append(chunk) + else: + end.append(chunk) + else: + tag_stack.append((name, len(balanced), chunk)) + balanced.append(None) + start.extend( + [chunk for name, pos, chunk in tag_stack]) + balanced = [chunk for chunk in balanced if chunk is not None] + return start, balanced, end + +def split_delete(chunks): + """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, + stuff_after_DEL_END). Returns the first case found (there may be + more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if + there's no DEL_START found. """ + try: + pos = chunks.index(DEL_START) + except ValueError: + raise NoDeletes + pos2 = chunks.index(DEL_END) + return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] + +def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete): + """ pre_delete and post_delete implicitly point to a place in the + document (where the two were split). This moves that point (by + popping items from one and pushing them onto the other). It moves + the point to try to find a place where unbalanced_start applies. + + As an example:: + + >>> unbalanced_start = ['<div>'] + >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] + >>> pre, post = doc[:3], doc[3:] + >>> pre, post + (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) + >>> locate_unbalanced_start(unbalanced_start, pre, post) + >>> pre, post + (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) + + As you can see, we moved the point so that the dangling <div> that + we found will be effectively replaced by the div in the original + document. If this doesn't work out, we just throw away + unbalanced_start without doing anything. + """ + while 1: + if not unbalanced_start: + # We have totally succeeded in finding the position + break + finding = unbalanced_start[0] + finding_name = finding.split()[0].strip('<>') + if not post_delete: + break + next = post_delete[0] + if next is DEL_START or not next.startswith('<'): + # Reached a word, we can't move the delete text forward + break + if next[1] == '/': + # Reached a closing tag, can we go further? Maybe not... + break + name = next.split()[0].strip('<>') + if name == 'ins': + # Can't move into an insert + break + assert name != 'del', ( + "Unexpected delete tag: %r" % next) + if name == finding_name: + unbalanced_start.pop(0) + pre_delete.append(post_delete.pop(0)) + else: + # Found a tag that doesn't match + break + +def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete): + """ like locate_unbalanced_start, except handling end tags and + possibly moving the point earlier in the document. 
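    A sketch mirroring the example given above for the start case::

        >>> unbalanced_end = ['</div>']
        >>> doc = ['<div>', 'More Text', '</div>', '<p>', 'Text', '</p>']
        >>> pre, post = doc[:3], doc[3:]
        >>> locate_unbalanced_end(unbalanced_end, pre, post)
        >>> pre, post
        (['<div>', 'More Text'], ['</div>', '<p>', 'Text', '</p>'])

    Here the dangling ``</div>`` inside the delete is effectively replaced
    by the closing tag already present in the document.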
""" + while 1: + if not unbalanced_end: + # Success + break + finding = unbalanced_end[-1] + finding_name = finding.split()[0].strip('<>/') + if not pre_delete: + break + next = pre_delete[-1] + if next is DEL_END or not next.startswith('</'): + # A word or a start tag + break + name = next.split()[0].strip('<>/') + if name == 'ins' or name == 'del': + # Can't move into an insert or delete + break + if name == finding_name: + unbalanced_end.pop() + post_delete.insert(0, pre_delete.pop()) + else: + # Found a tag that doesn't match + break + +class token(_unicode): + """ Represents a diffable token, generally a word that is displayed to + the user. Opening tags are attached to this token when they are + adjacent (pre_tags) and closing tags that follow the word + (post_tags). Some exceptions occur when there are empty tags + adjacent to a word, so there may be close tags in pre_tags, or + open tags in post_tags. + + We also keep track of whether the word was originally followed by + whitespace, even though we do not want to treat the word as + equivalent to a similar word that does not have a trailing + space.""" + + # When this is true, the token will be eliminated from the + # displayed diff if no change has occurred: + hide_when_equal = False + + def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""): + obj = _unicode.__new__(cls, text) + + if pre_tags is not None: + obj.pre_tags = pre_tags + else: + obj.pre_tags = [] + + if post_tags is not None: + obj.post_tags = post_tags + else: + obj.post_tags = [] + + obj.trailing_whitespace = trailing_whitespace + + return obj + + def __repr__(self): + return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, + self.post_tags, self.trailing_whitespace) + + def html(self): + return _unicode(self) + +class tag_token(token): + + """ Represents a token that is actually a tag. Currently this is just + the <img> tag, which takes up visible space just like a word but + is only represented in a document by a tag. """ + + def __new__(cls, tag, data, html_repr, pre_tags=None, + post_tags=None, trailing_whitespace=""): + obj = token.__new__(cls, "%s: %s" % (type, data), + pre_tags=pre_tags, + post_tags=post_tags, + trailing_whitespace=trailing_whitespace) + obj.tag = tag + obj.data = data + obj.html_repr = html_repr + return obj + + def __repr__(self): + return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % ( + self.tag, + self.data, + self.html_repr, + self.pre_tags, + self.post_tags, + self.trailing_whitespace) + def html(self): + return self.html_repr + +class href_token(token): + + """ Represents the href in an anchor tag. Unlike other words, we only + show the href when it changes. """ + + hide_when_equal = True + + def html(self): + return ' Link: %s' % self + +def tokenize(html, include_hrefs=True): + """ + Parse the given HTML and returns token objects (words with attached tags). + + This parses only the content of a page; anything in the head is + ignored, and the <head> and <body> elements are themselves + optional. The content is then parsed by lxml, which ensures the + validity of the resulting parsed document (though lxml may make + incorrect guesses when the markup is particular bad). + + <ins> and <del> tags are also eliminated from the document, as + that gets confusing. 
+ + If include_hrefs is true, then the href attribute of <a> tags is + included as a special kind of diffable token.""" + if etree.iselement(html): + body_el = html + else: + body_el = parse_html(html, cleanup=True) + # Then we split the document into text chunks for each tag, word, and end tag: + chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs) + # Finally re-joining them into token objects: + return fixup_chunks(chunks) + +def parse_html(html, cleanup=True): + """ + Parses an HTML fragment, returning an lxml element. Note that the HTML will be + wrapped in a <div> tag that was not in the original document. + + If cleanup is true, make sure there's no <head> or <body>, and get + rid of any <ins> and <del> tags. + """ + if cleanup: + # This removes any extra markup or structure like <head>: + html = cleanup_html(html) + return fragment_fromstring(html, create_parent=True) + +_body_re = re.compile(r'<body.*?>', re.I|re.S) +_end_body_re = re.compile(r'</body.*?>', re.I|re.S) +_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) + +def cleanup_html(html): + """ This 'cleans' the HTML, meaning that any page structure is removed + (only the contents of <body> are used, if there is any <body). + Also <ins> and <del> tags are removed. """ + match = _body_re.search(html) + if match: + html = html[match.end():] + match = _end_body_re.search(html) + if match: + html = html[:match.start()] + html = _ins_del_re.sub('', html) + return html + + +end_whitespace_re = re.compile(r'[ \t\n\r]$') + +def split_trailing_whitespace(word): + """ + This function takes a word, such as 'test\n\n' and returns ('test','\n\n') + """ + stripped_length = len(word.rstrip()) + return word[0:stripped_length], word[stripped_length:] + + +def fixup_chunks(chunks): + """ + This function takes a list of chunks and produces a list of tokens. 
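    A rough sketch of the transformation, shown with Python 3 reprs
    (illustrative only)::

        >>> fixup_chunks(['<p>', 'Hello ', 'world', '</p>'])   # doctest: +SKIP
        [token('Hello', ['<p>'], [], ' '), token('world', [], ['</p>'], '')]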
+ """ + tag_accum = [] + cur_word = None + result = [] + for chunk in chunks: + if isinstance(chunk, tuple): + if chunk[0] == 'img': + src = chunk[1] + tag, trailing_whitespace = split_trailing_whitespace(chunk[2]) + cur_word = tag_token('img', src, html_repr=tag, + pre_tags=tag_accum, + trailing_whitespace=trailing_whitespace) + tag_accum = [] + result.append(cur_word) + + elif chunk[0] == 'href': + href = chunk[1] + cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ") + tag_accum = [] + result.append(cur_word) + continue + + if is_word(chunk): + chunk, trailing_whitespace = split_trailing_whitespace(chunk) + cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) + tag_accum = [] + result.append(cur_word) + + elif is_start_tag(chunk): + tag_accum.append(chunk) + + elif is_end_tag(chunk): + if tag_accum: + tag_accum.append(chunk) + else: + assert cur_word, ( + "Weird state, cur_word=%r, result=%r, chunks=%r of %r" + % (cur_word, result, chunk, chunks)) + cur_word.post_tags.append(chunk) + else: + assert False + + if not result: + return [token('', pre_tags=tag_accum)] + else: + result[-1].post_tags.extend(tag_accum) + + return result + + +# All the tags in HTML that don't require end tags: +empty_tags = ( + 'param', 'img', 'area', 'br', 'basefont', 'input', + 'base', 'meta', 'link', 'col') + +block_level_tags = ( + 'address', + 'blockquote', + 'center', + 'dir', + 'div', + 'dl', + 'fieldset', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'hr', + 'isindex', + 'menu', + 'noframes', + 'noscript', + 'ol', + 'p', + 'pre', + 'table', + 'ul', + ) + +block_level_container_tags = ( + 'dd', + 'dt', + 'frameset', + 'li', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ) + + +def flatten_el(el, include_hrefs, skip_tag=False): + """ Takes an lxml element el, and generates all the text chunks for + that tag. Each start tag is a chunk, each word is a chunk, and each + end tag is a chunk. + + If skip_tag is true, then the outermost container tag is + not returned (just its contents).""" + if not skip_tag: + if el.tag == 'img': + yield ('img', el.get('src'), start_tag(el)) + else: + yield start_tag(el) + if el.tag in empty_tags and not el.text and not len(el) and not el.tail: + return + start_words = split_words(el.text) + for word in start_words: + yield html_escape(word) + for child in el: + for item in flatten_el(child, include_hrefs=include_hrefs): + yield item + if el.tag == 'a' and el.get('href') and include_hrefs: + yield ('href', el.get('href')) + if not skip_tag: + yield end_tag(el) + end_words = split_words(el.tail) + for word in end_words: + yield html_escape(word) + +split_words_re = re.compile(r'\S+(?:\s+|$)', re.U) + +def split_words(text): + """ Splits some text into words. Includes trailing whitespace + on each word when appropriate. """ + if not text or not text.strip(): + return [] + + words = split_words_re.findall(text) + return words + +start_whitespace_re = re.compile(r'^[ \t\n\r]') + +def start_tag(el): + """ + The text representation of the start tag for a tag. + """ + return '<%s%s>' % ( + el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True)) + for name, value in el.attrib.items()])) + +def end_tag(el): + """ The text representation of an end tag for a tag. Includes + trailing whitespace when appropriate. 
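    For instance (an illustrative sketch; ``fragment_fromstring`` comes from
    ``lxml.html`` and is already used elsewhere in this module)::

        >>> el = fragment_fromstring('<div><b>word</b> tail</div>')   # doctest: +SKIP
        >>> end_tag(el[0])                                            # doctest: +SKIP
        '</b> '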
""" + if el.tail and start_whitespace_re.search(el.tail): + extra = ' ' + else: + extra = '' + return '</%s>%s' % (el.tag, extra) + +def is_word(tok): + return not tok.startswith('<') + +def is_end_tag(tok): + return tok.startswith('</') + +def is_start_tag(tok): + return tok.startswith('<') and not tok.startswith('</') + +def fixup_ins_del_tags(html): + """ Given an html string, move any <ins> or <del> tags inside of any + block-level elements, e.g. transform <ins><p>word</p></ins> to + <p><ins>word</ins></p> """ + doc = parse_html(html, cleanup=False) + _fixup_ins_del_tags(doc) + html = serialize_html_fragment(doc, skip_outer=True) + return html + +def serialize_html_fragment(el, skip_outer=False): + """ Serialize a single lxml element as HTML. The serialized form + includes the elements tail. + + If skip_outer is true, then don't serialize the outermost tag + """ + assert not isinstance(el, basestring), ( + "You should pass in an element, not a string like %r" % el) + html = etree.tostring(el, method="html", encoding=_unicode) + if skip_outer: + # Get rid of the extra starting tag: + html = html[html.find('>')+1:] + # Get rid of the extra end tag: + html = html[:html.rfind('<')] + return html.strip() + else: + return html + +def _fixup_ins_del_tags(doc): + """fixup_ins_del_tags that works on an lxml document in-place + """ + for tag in ['ins', 'del']: + for el in doc.xpath('descendant-or-self::%s' % tag): + if not _contains_block_level_tag(el): + continue + _move_el_inside_block(el, tag=tag) + el.drop_tag() + #_merge_element_contents(el) + +def _contains_block_level_tag(el): + """True if the element contains any block-level elements, like <p>, <td>, etc. + """ + if el.tag in block_level_tags or el.tag in block_level_container_tags: + return True + for child in el: + if _contains_block_level_tag(child): + return True + return False + +def _move_el_inside_block(el, tag): + """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags + and moves them inside any block-level tags. 
""" + for child in el: + if _contains_block_level_tag(child): + break + else: + # No block-level tags in any child + children_tag = etree.Element(tag) + children_tag.text = el.text + el.text = None + children_tag.extend(list(el)) + el[:] = [children_tag] + return + for child in list(el): + if _contains_block_level_tag(child): + _move_el_inside_block(child, tag) + if child.tail: + tail_tag = etree.Element(tag) + tail_tag.text = child.tail + child.tail = None + el.insert(el.index(child)+1, tail_tag) + else: + child_tag = etree.Element(tag) + el.replace(child, child_tag) + child_tag.append(child) + if el.text: + text_tag = etree.Element(tag) + text_tag.text = el.text + el.text = None + el.insert(0, text_tag) + +def _merge_element_contents(el): + """ + Removes an element, but merges its contents into its place, e.g., + given <p>Hi <i>there!</i></p>, if you remove the <i> element you get + <p>Hi there!</p> + """ + parent = el.getparent() + text = el.text or '' + if el.tail: + if not len(el): + text += el.tail + else: + if el[-1].tail: + el[-1].tail += el.tail + else: + el[-1].tail = el.tail + index = parent.index(el) + if text: + if index == 0: + previous = None + else: + previous = parent[index-1] + if previous is None: + if parent.text: + parent.text += text + else: + parent.text = text + else: + if previous.tail: + previous.tail += text + else: + previous.tail = text + parent[index:index+1] = el.getchildren() + +class InsensitiveSequenceMatcher(difflib.SequenceMatcher): + """ + Acts like SequenceMatcher, but tries not to find very small equal + blocks amidst large spans of changes + """ + + threshold = 2 + + def get_matching_blocks(self): + size = min(len(self.b), len(self.b)) + threshold = min(self.threshold, size / 4) + actual = difflib.SequenceMatcher.get_matching_blocks(self) + return [item for item in actual + if item[2] > threshold + or not item[2]] + +if __name__ == '__main__': + from lxml.html import _diffcommand + _diffcommand.main() + diff --git a/env/lib/python3.10/site-packages/lxml/html/formfill.py b/env/lib/python3.10/site-packages/lxml/html/formfill.py new file mode 100644 index 0000000..2499a8e --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/formfill.py @@ -0,0 +1,299 @@ +from lxml.etree import XPath, ElementBase +from lxml.html import fromstring, XHTML_NAMESPACE +from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result +from lxml.html import defs +import copy + +try: + basestring +except NameError: + # Python 3 + basestring = str + +__all__ = ['FormNotFound', 'fill_form', 'fill_form_html', + 'insert_errors', 'insert_errors_html', + 'DefaultErrorCreator'] + +class FormNotFound(LookupError): + """ + Raised when no form can be found + """ + +_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE}) +_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]), + namespaces={'x':XHTML_NAMESPACE}) +_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]', + namespaces={'x':XHTML_NAMESPACE}) +_name_xpath = XPath('descendant-or-self::*[@name=$name]') + +def fill_form( + el, + values, + form_id=None, + form_index=None, + ): + el = _find_form(el, form_id=form_id, form_index=form_index) + _fill_form(el, values) + +def fill_form_html(html, values, form_id=None, form_index=None): + result_type = type(html) + if isinstance(html, basestring): + doc = fromstring(html) + else: + doc = 
copy.deepcopy(html) + fill_form(doc, values, form_id=form_id, form_index=form_index) + return _transform_result(result_type, doc) + +def _fill_form(el, values): + counts = {} + if hasattr(values, 'mixed'): + # For Paste request parameters + values = values.mixed() + inputs = _input_xpath(el) + for input in inputs: + name = input.get('name') + if not name: + continue + if _takes_multiple(input): + value = values.get(name, []) + if not isinstance(value, (list, tuple)): + value = [value] + _fill_multiple(input, value) + elif name not in values: + continue + else: + index = counts.get(name, 0) + counts[name] = index + 1 + value = values[name] + if isinstance(value, (list, tuple)): + try: + value = value[index] + except IndexError: + continue + elif index > 0: + continue + _fill_single(input, value) + +def _takes_multiple(input): + if _nons(input.tag) == 'select' and input.get('multiple'): + # FIXME: multiple="0"? + return True + type = input.get('type', '').lower() + if type in ('radio', 'checkbox'): + return True + return False + +def _fill_multiple(input, value): + type = input.get('type', '').lower() + if type == 'checkbox': + v = input.get('value') + if v is None: + if not value: + result = False + else: + result = value[0] + if isinstance(value, basestring): + # The only valid "on" value for an unnamed checkbox is 'on' + result = result == 'on' + _check(input, result) + else: + _check(input, v in value) + elif type == 'radio': + v = input.get('value') + _check(input, v in value) + else: + assert _nons(input.tag) == 'select' + for option in _options_xpath(input): + v = option.get('value') + if v is None: + # This seems to be the default, at least on IE + # FIXME: but I'm not sure + v = option.text_content() + _select(option, v in value) + +def _check(el, check): + if check: + el.set('checked', '') + else: + if 'checked' in el.attrib: + del el.attrib['checked'] + +def _select(el, select): + if select: + el.set('selected', '') + else: + if 'selected' in el.attrib: + del el.attrib['selected'] + +def _fill_single(input, value): + if _nons(input.tag) == 'textarea': + input.text = value + else: + input.set('value', value) + +def _find_form(el, form_id=None, form_index=None): + if form_id is None and form_index is None: + forms = _forms_xpath(el) + for form in forms: + return form + raise FormNotFound( + "No forms in page") + if form_id is not None: + form = el.get_element_by_id(form_id) + if form is not None: + return form + forms = _form_name_xpath(el, name=form_id) + if forms: + return forms[0] + else: + raise FormNotFound( + "No form with the name or id of %r (forms: %s)" + % (id, ', '.join(_find_form_ids(el)))) + if form_index is not None: + forms = _forms_xpath(el) + try: + return forms[form_index] + except IndexError: + raise FormNotFound( + "There is no form with the index %r (%i forms found)" + % (form_index, len(forms))) + +def _find_form_ids(el): + forms = _forms_xpath(el) + if not forms: + yield '(no forms)' + return + for index, form in enumerate(forms): + if form.get('id'): + if form.get('name'): + yield '%s or %s' % (form.get('id'), + form.get('name')) + else: + yield form.get('id') + elif form.get('name'): + yield form.get('name') + else: + yield '(unnamed form %s)' % index + +############################################################ +## Error filling +############################################################ + +class DefaultErrorCreator(object): + insert_before = True + block_inside = True + error_container_tag = 'div' + error_message_class = 'error-message' + 
error_block_class = 'error-block' + default_message = "Invalid" + + def __init__(self, **kw): + for name, value in kw.items(): + if not hasattr(self, name): + raise TypeError( + "Unexpected keyword argument: %s" % name) + setattr(self, name, value) + + def __call__(self, el, is_block, message): + error_el = el.makeelement(self.error_container_tag) + if self.error_message_class: + error_el.set('class', self.error_message_class) + if is_block and self.error_block_class: + error_el.set('class', error_el.get('class', '')+' '+self.error_block_class) + if message is None or message == '': + message = self.default_message + if isinstance(message, ElementBase): + error_el.append(message) + else: + assert isinstance(message, basestring), ( + "Bad message; should be a string or element: %r" % message) + error_el.text = message or self.default_message + if is_block and self.block_inside: + if self.insert_before: + error_el.tail = el.text + el.text = None + el.insert(0, error_el) + else: + el.append(error_el) + else: + parent = el.getparent() + pos = parent.index(el) + if self.insert_before: + parent.insert(pos, error_el) + else: + error_el.tail = el.tail + el.tail = None + parent.insert(pos+1, error_el) + +default_error_creator = DefaultErrorCreator() + + +def insert_errors( + el, + errors, + form_id=None, + form_index=None, + error_class="error", + error_creator=default_error_creator, + ): + el = _find_form(el, form_id=form_id, form_index=form_index) + for name, error in errors.items(): + if error is None: + continue + for error_el, message in _find_elements_for_name(el, name, error): + assert isinstance(message, (basestring, type(None), ElementBase)), ( + "Bad message: %r" % message) + _insert_error(error_el, message, error_class, error_creator) + +def insert_errors_html(html, values, **kw): + result_type = type(html) + if isinstance(html, basestring): + doc = fromstring(html) + else: + doc = copy.deepcopy(html) + insert_errors(doc, values, **kw) + return _transform_result(result_type, doc) + +def _insert_error(el, error, error_class, error_creator): + if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea': + is_block = False + else: + is_block = True + if _nons(el.tag) != 'form' and error_class: + _add_class(el, error_class) + if el.get('id'): + labels = _label_for_xpath(el, for_id=el.get('id')) + if labels: + for label in labels: + _add_class(label, error_class) + error_creator(el, is_block, error) + +def _add_class(el, class_name): + if el.get('class'): + el.set('class', el.get('class')+' '+class_name) + else: + el.set('class', class_name) + +def _find_elements_for_name(form, name, error): + if name is None: + # An error for the entire form + yield form, error + return + if name.startswith('#'): + # By id + el = form.get_element_by_id(name[1:]) + if el is not None: + yield el, error + return + els = _name_xpath(form, name=name) + if not els: + # FIXME: should this raise an exception? + return + if not isinstance(error, (list, tuple)): + yield els[0], error + return + # FIXME: if error is longer than els, should it raise an error? + for el, err in zip(els, error): + if err is None: + continue + yield el, err diff --git a/env/lib/python3.10/site-packages/lxml/html/html5parser.py b/env/lib/python3.10/site-packages/lxml/html/html5parser.py new file mode 100644 index 0000000..2f7be15 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/html5parser.py @@ -0,0 +1,260 @@ +""" +An interface to html5lib that mimics the lxml.html interface. 
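A minimal usage sketch (assumes the ``html5lib`` package is installed; by
default the parser typically places elements in the XHTML namespace)::

    >>> from lxml.html import html5parser
    >>> root = html5parser.fromstring('<p>Hello world</p>')                      # doctest: +SKIP
    >>> doc = html5parser.document_fromstring('<html><body>Hi</body></html>')    # doctest: +SKIP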
+""" +import sys +import string + +from html5lib import HTMLParser as _HTMLParser +from html5lib.treebuilders.etree_lxml import TreeBuilder +from lxml import etree +from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag + +# python3 compatibility +try: + _strings = basestring +except NameError: + _strings = (bytes, str) +try: + from urllib2 import urlopen +except ImportError: + from urllib.request import urlopen +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse + + +class HTMLParser(_HTMLParser): + """An html5lib HTML parser with lxml as tree.""" + + def __init__(self, strict=False, **kwargs): + _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) + + +try: + from html5lib import XHTMLParser as _XHTMLParser +except ImportError: + pass +else: + class XHTMLParser(_XHTMLParser): + """An html5lib XHTML Parser with lxml as tree.""" + + def __init__(self, strict=False, **kwargs): + _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) + + xhtml_parser = XHTMLParser() + + +def _find_tag(tree, tag): + elem = tree.find(tag) + if elem is not None: + return elem + return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag)) + + +def document_fromstring(html, guess_charset=None, parser=None): + """ + Parse a whole document into a string. + + If `guess_charset` is true, or if the input is not Unicode but a + byte string, the `chardet` library will perform charset guessing + on the string. + """ + if not isinstance(html, _strings): + raise TypeError('string required') + + if parser is None: + parser = html_parser + + options = {} + if guess_charset is None and isinstance(html, bytes): + # html5lib does not accept useChardet as an argument, if it + # detected the html argument would produce unicode objects. + guess_charset = True + if guess_charset is not None: + options['useChardet'] = guess_charset + return parser.parse(html, **options).getroot() + + +def fragments_fromstring(html, no_leading_text=False, + guess_charset=None, parser=None): + """Parses several HTML elements, returning a list of elements. + + The first item in the list may be a string. If no_leading_text is true, + then it will be an error if there is leading text, and it will always be + a list of only elements. + + If `guess_charset` is true, the `chardet` library will perform charset + guessing on the string. + """ + if not isinstance(html, _strings): + raise TypeError('string required') + + if parser is None: + parser = html_parser + + options = {} + if guess_charset is None and isinstance(html, bytes): + # html5lib does not accept useChardet as an argument, if it + # detected the html argument would produce unicode objects. + guess_charset = False + if guess_charset is not None: + options['useChardet'] = guess_charset + children = parser.parseFragment(html, 'div', **options) + if children and isinstance(children[0], _strings): + if no_leading_text: + if children[0].strip(): + raise etree.ParserError('There is leading text: %r' % + children[0]) + del children[0] + return children + + +def fragment_fromstring(html, create_parent=False, + guess_charset=None, parser=None): + """Parses a single HTML element; it is an error if there is more than + one element, or if anything but whitespace precedes or follows the + element. + + If 'create_parent' is true (or is a tag name) then a parent node + will be created to encapsulate the HTML in a single element. In + this case, leading or trailing text is allowed. 
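    A hedged sketch of the ``create_parent`` behaviour::

        >>> el = fragment_fromstring('leading <span>text</span>', create_parent=True)  # doctest: +SKIP
        >>> el.tag, el.text                                                             # doctest: +SKIP
        ('div', 'leading ')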
+ + If `guess_charset` is true, the `chardet` library will perform charset + guessing on the string. + """ + if not isinstance(html, _strings): + raise TypeError('string required') + + accept_leading_text = bool(create_parent) + + elements = fragments_fromstring( + html, guess_charset=guess_charset, parser=parser, + no_leading_text=not accept_leading_text) + + if create_parent: + if not isinstance(create_parent, _strings): + create_parent = 'div' + new_root = Element(create_parent) + if elements: + if isinstance(elements[0], _strings): + new_root.text = elements[0] + del elements[0] + new_root.extend(elements) + return new_root + + if not elements: + raise etree.ParserError('No elements found') + if len(elements) > 1: + raise etree.ParserError('Multiple elements found') + result = elements[0] + if result.tail and result.tail.strip(): + raise etree.ParserError('Element followed by text: %r' % result.tail) + result.tail = None + return result + + +def fromstring(html, guess_charset=None, parser=None): + """Parse the html, returning a single element/document. + + This tries to minimally parse the chunk of text, without knowing if it + is a fragment or a document. + + 'base_url' will set the document's base_url attribute (and the tree's + docinfo.URL) + + If `guess_charset` is true, or if the input is not Unicode but a + byte string, the `chardet` library will perform charset guessing + on the string. + """ + if not isinstance(html, _strings): + raise TypeError('string required') + doc = document_fromstring(html, parser=parser, + guess_charset=guess_charset) + + # document starts with doctype or <html>, full document! + start = html[:50] + if isinstance(start, bytes): + # Allow text comparison in python3. + # Decode as ascii, that also covers latin-1 and utf-8 for the + # characters we need. + start = start.decode('ascii', 'replace') + + start = start.lstrip().lower() + if start.startswith('<html') or start.startswith('<!doctype'): + return doc + + head = _find_tag(doc, 'head') + + # if the head is not empty we have a full document + if len(head): + return doc + + body = _find_tag(doc, 'body') + + # The body has just one element, so it was probably a single + # element passed in + if (len(body) == 1 and (not body.text or not body.text.strip()) + and (not body[-1].tail or not body[-1].tail.strip())): + return body[0] + + # Now we have a body which represents a bunch of tags which have the + # content that was passed in. We will create a fake container, which + # is the body tag, except <body> implies too much structure. + if _contains_block_level_tag(body): + body.tag = 'div' + else: + body.tag = 'span' + return body + + +def parse(filename_url_or_file, guess_charset=None, parser=None): + """Parse a filename, URL, or file-like object into an HTML document + tree. Note: this returns a tree, not an element. Use + ``parse(...).getroot()`` to get the document root. + + If ``guess_charset`` is true, the ``useChardet`` option is passed into + html5lib to enable character detection. This option is on by default + when parsing from URLs, off by default when parsing from file(-like) + objects (which tend to return Unicode more often than not), and on by + default when parsing from a file path (which is read in binary mode). 
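    A brief sketch of typical use (the file name here is hypothetical)::

        >>> tree = parse('mypage.html')      # doctest: +SKIP
        >>> root = tree.getroot()            # doctest: +SKIP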
+ """ + if parser is None: + parser = html_parser + if not isinstance(filename_url_or_file, _strings): + fp = filename_url_or_file + if guess_charset is None: + # assume that file-like objects return Unicode more often than bytes + guess_charset = False + elif _looks_like_url(filename_url_or_file): + fp = urlopen(filename_url_or_file) + if guess_charset is None: + # assume that URLs return bytes + guess_charset = True + else: + fp = open(filename_url_or_file, 'rb') + if guess_charset is None: + guess_charset = True + + options = {} + # html5lib does not accept useChardet as an argument, if it + # detected the html argument would produce unicode objects. + if guess_charset: + options['useChardet'] = guess_charset + return parser.parse(fp, **options) + + +def _looks_like_url(str): + scheme = urlparse(str)[0] + if not scheme: + return False + elif (sys.platform == 'win32' and + scheme in string.ascii_letters + and len(scheme) == 1): + # looks like a 'normal' absolute path + return False + else: + return True + + +html_parser = HTMLParser() diff --git a/env/lib/python3.10/site-packages/lxml/html/soupparser.py b/env/lib/python3.10/site-packages/lxml/html/soupparser.py new file mode 100644 index 0000000..e0cf3a0 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/soupparser.py @@ -0,0 +1,314 @@ +"""External interface to the BeautifulSoup HTML parser. +""" + +__all__ = ["fromstring", "parse", "convert_tree"] + +import re +from lxml import etree, html + +try: + from bs4 import ( + BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, + Declaration, Doctype) + _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) +except ImportError: + from BeautifulSoup import ( + BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, + Declaration) + _DECLARATION_OR_DOCTYPE = Declaration + + +def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): + """Parse a string of HTML data into an Element tree using the + BeautifulSoup parser. + + Returns the root ``<html>`` Element of the tree. + + You can pass a different BeautifulSoup parser through the + `beautifulsoup` keyword, and a diffent Element factory function + through the `makeelement` keyword. By default, the standard + ``BeautifulSoup`` class and the default factory of `lxml.html` are + used. + """ + return _parse(data, beautifulsoup, makeelement, **bsargs) + + +def parse(file, beautifulsoup=None, makeelement=None, **bsargs): + """Parse a file into an ElemenTree using the BeautifulSoup parser. + + You can pass a different BeautifulSoup parser through the + `beautifulsoup` keyword, and a diffent Element factory function + through the `makeelement` keyword. By default, the standard + ``BeautifulSoup`` class and the default factory of `lxml.html` are + used. + """ + if not hasattr(file, 'read'): + file = open(file) + root = _parse(file, beautifulsoup, makeelement, **bsargs) + return etree.ElementTree(root) + + +def convert_tree(beautiful_soup_tree, makeelement=None): + """Convert a BeautifulSoup tree to a list of Element trees. + + Returns a list instead of a single root Element to support + HTML-like soup with more than one root element. + + You can pass a different Element factory through the `makeelement` + keyword. 
+ """ + root = _convert_tree(beautiful_soup_tree, makeelement) + children = root.getchildren() + for child in children: + root.remove(child) + return children + + +# helpers + +def _parse(source, beautifulsoup, makeelement, **bsargs): + if beautifulsoup is None: + beautifulsoup = BeautifulSoup + if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 + if 'convertEntities' not in bsargs: + bsargs['convertEntities'] = 'html' + if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 + if 'features' not in bsargs: + bsargs['features'] = 'html.parser' # use Python html parser + tree = beautifulsoup(source, **bsargs) + root = _convert_tree(tree, makeelement) + # from ET: wrap the document in a html root element, if necessary + if len(root) == 1 and root[0].tag == "html": + return root[0] + root.tag = "html" + return root + + +_parse_doctype_declaration = re.compile( + r'(?:\s|[<!])*DOCTYPE\s*HTML' + r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' + r'(?:\s+(\'[^\']*\'|"[^"]*"))?', + re.IGNORECASE).match + + +class _PseudoTag: + # Minimal imitation of BeautifulSoup.Tag + def __init__(self, contents): + self.name = 'html' + self.attrs = [] + self.contents = contents + + def __iter__(self): + return self.contents.__iter__() + + +def _convert_tree(beautiful_soup_tree, makeelement): + if makeelement is None: + makeelement = html.html_parser.makeelement + + # Split the tree into three parts: + # i) everything before the root element: document type + # declaration, comments, processing instructions, whitespace + # ii) the root(s), + # iii) everything after the root: comments, processing + # instructions, whitespace + first_element_idx = last_element_idx = None + html_root = declaration = None + for i, e in enumerate(beautiful_soup_tree): + if isinstance(e, Tag): + if first_element_idx is None: + first_element_idx = i + last_element_idx = i + if html_root is None and e.name and e.name.lower() == 'html': + html_root = e + elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): + declaration = e + + # For a nice, well-formatted document, the variable roots below is + # a list consisting of a single <html> element. However, the document + # may be a soup like '<meta><head><title>Hello</head><body>Hi + # all<\p>'. In this example roots is a list containing meta, head + # and body elements. + if first_element_idx is None: + pre_root = post_root = [] + roots = beautiful_soup_tree.contents + else: + pre_root = beautiful_soup_tree.contents[:first_element_idx] + roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] + post_root = beautiful_soup_tree.contents[last_element_idx+1:] + + # Reorganize so that there is one <html> root... + if html_root is not None: + # ... use existing one if possible, ... + i = roots.index(html_root) + html_root.contents = roots[:i] + html_root.contents + roots[i+1:] + else: + # ... otherwise create a new one. 
+ html_root = _PseudoTag(roots) + + convert_node = _init_node_converters(makeelement) + + # Process pre_root + res_root = convert_node(html_root) + prev = res_root + for e in reversed(pre_root): + converted = convert_node(e) + if converted is not None: + prev.addprevious(converted) + prev = converted + + # ditto for post_root + prev = res_root + for e in post_root: + converted = convert_node(e) + if converted is not None: + prev.addnext(converted) + prev = converted + + if declaration is not None: + try: + # bs4 provides full Doctype string + doctype_string = declaration.output_ready() + except AttributeError: + doctype_string = declaration.string + + match = _parse_doctype_declaration(doctype_string) + if not match: + # Something is wrong if we end up in here. Since soupparser should + # tolerate errors, do not raise Exception, just let it pass. + pass + else: + external_id, sys_uri = match.groups() + docinfo = res_root.getroottree().docinfo + # strip quotes and update DOCTYPE values (any of None, '', '...') + docinfo.public_id = external_id and external_id[1:-1] + docinfo.system_url = sys_uri and sys_uri[1:-1] + + return res_root + + +def _init_node_converters(makeelement): + converters = {} + ordered_node_types = [] + + def converter(*types): + def add(handler): + for t in types: + converters[t] = handler + ordered_node_types.append(t) + return handler + return add + + def find_best_converter(node): + for t in ordered_node_types: + if isinstance(node, t): + return converters[t] + return None + + def convert_node(bs_node, parent=None): + # duplicated in convert_tag() below + try: + handler = converters[type(bs_node)] + except KeyError: + handler = converters[type(bs_node)] = find_best_converter(bs_node) + if handler is None: + return None + return handler(bs_node, parent) + + def map_attrs(bs_attrs): + if isinstance(bs_attrs, dict): # bs4 + attribs = {} + for k, v in bs_attrs.items(): + if isinstance(v, list): + v = " ".join(v) + attribs[k] = unescape(v) + else: + attribs = dict((k, unescape(v)) for k, v in bs_attrs) + return attribs + + def append_text(parent, text): + if len(parent) == 0: + parent.text = (parent.text or '') + text + else: + parent[-1].tail = (parent[-1].tail or '') + text + + # converters are tried in order of their definition + + @converter(Tag, _PseudoTag) + def convert_tag(bs_node, parent): + attrs = bs_node.attrs + if parent is not None: + attribs = map_attrs(attrs) if attrs else None + res = etree.SubElement(parent, bs_node.name, attrib=attribs) + else: + attribs = map_attrs(attrs) if attrs else {} + res = makeelement(bs_node.name, attrib=attribs) + + for child in bs_node: + # avoid double recursion by inlining convert_node(), see above + try: + handler = converters[type(child)] + except KeyError: + pass + else: + if handler is not None: + handler(child, res) + continue + convert_node(child, res) + return res + + @converter(Comment) + def convert_comment(bs_node, parent): + res = html.HtmlComment(bs_node) + if parent is not None: + parent.append(res) + return res + + @converter(ProcessingInstruction) + def convert_pi(bs_node, parent): + if bs_node.endswith('?'): + # The PI is of XML style (<?as df?>) but BeautifulSoup + # interpreted it as being SGML style (<?as df>). Fix. 
+ bs_node = bs_node[:-1] + res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) + if parent is not None: + parent.append(res) + return res + + @converter(NavigableString) + def convert_text(bs_node, parent): + if parent is not None: + append_text(parent, unescape(bs_node)) + return None + + return convert_node + + +# copied from ET's ElementSoup + +try: + from html.entities import name2codepoint # Python 3 +except ImportError: + from htmlentitydefs import name2codepoint + + +handle_entities = re.compile(r"&(\w+);").sub + + +try: + unichr +except NameError: + # Python 3 + unichr = chr + + +def unescape(string): + if not string: + return '' + # work around oddities in BeautifulSoup's entity handling + def unescape_entity(m): + try: + return unichr(name2codepoint[m.group(1)]) + except KeyError: + return m.group(0) # use as is + return handle_entities(unescape_entity, string) diff --git a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py new file mode 100644 index 0000000..f352a1c --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py @@ -0,0 +1,13 @@ +"""Doctest module for HTML comparison. + +Usage:: + + >>> import lxml.html.usedoctest + >>> # now do your HTML doctests ... + +See `lxml.doctestcompare`. +""" + +from lxml import doctestcompare + +doctestcompare.temp_install(html=True, del_module=__name__) |