author Biswakalyan Bhuyan <biswa@surgot.in> 2022-11-13 23:46:45 +0530
committer Biswakalyan Bhuyan <biswa@surgot.in> 2022-11-13 23:46:45 +0530
commit 9468226a9e2e2ab8cdd599f1d8538e860ca86120 (patch)
tree 0a77ada226d6db80639f96b438bf83e4e756edb5 /env/lib/python3.10/site-packages/lxml/html
id card generator
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html')
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/ElementSoup.py | 10
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__init__.py | 1946
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc | bin 0 -> 511 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc | bin 0 -> 56397 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc | bin 0 -> 2297 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc | bin 0 -> 3603 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc | bin 0 -> 2096 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc | bin 0 -> 2938 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc | bin 0 -> 17699 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc | bin 0 -> 2812 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc | bin 0 -> 24074 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc | bin 0 -> 7384 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc | bin 0 -> 6414 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc | bin 0 -> 8014 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc | bin 0 -> 437 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/_diffcommand.py | 88
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/_html5builder.py | 100
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/_setmixin.py | 56
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/builder.py | 133
-rwxr-xr-x env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so | bin 0 -> 564824 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/clean.py | 786
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/defs.py | 135
-rwxr-xr-x env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so | bin 0 -> 787752 bytes
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/diff.py | 884
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/formfill.py | 299
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/html5parser.py | 260
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/soupparser.py | 314
-rw-r--r-- env/lib/python3.10/site-packages/lxml/html/usedoctest.py | 13
28 files changed, 5024 insertions, 0 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py
new file mode 100644
index 0000000..c35365d
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py
@@ -0,0 +1,10 @@
+__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["parse", "convert_tree"]
+
+from .soupparser import convert_tree, parse as _parse
+
+def parse(file, beautifulsoup=None, makeelement=None):
+ root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
+ return root.getroot()
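+
+# A minimal usage sketch (assumes BeautifulSoup is installed; 'page.html'
+# is an illustrative file name, not part of this module):
+#
+# from lxml.html.ElementSoup import parse
+# root = parse(open('page.html'))
+# root.tag # -> 'html'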
diff --git a/env/lib/python3.10/site-packages/lxml/html/__init__.py b/env/lib/python3.10/site-packages/lxml/html/__init__.py
new file mode 100644
index 0000000..ef06a40
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__init__.py
@@ -0,0 +1,1946 @@
+# Copyright (c) 2004 Ian Bicking. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. Neither the name of Ian Bicking nor the names of its contributors may
+# be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""The ``lxml.html`` tool set for HTML handling.
+"""
+
+from __future__ import absolute_import
+
+__all__ = [
+ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
+ 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
+ 'find_rel_links', 'find_class', 'make_links_absolute',
+ 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
+
+
+import copy
+import sys
+import re
+from functools import partial
+
+try:
+ from collections.abc import MutableMapping, MutableSet
+except ImportError:
+ from collections import MutableMapping, MutableSet
+
+from .. import etree
+from . import defs
+from ._setmixin import SetMixin
+
+try:
+ from urlparse import urljoin
+except ImportError:
+ # Python 3
+ from urllib.parse import urljoin
+
+try:
+ unicode
+except NameError:
+ # Python 3
+ unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = (str, bytes)
+
+
+def __fix_docstring(s):
+ if not s:
+ return s
+ if sys.version_info[0] >= 3:
+ sub = re.compile(r"^(\s*)u'", re.M).sub
+ else:
+ sub = re.compile(r"^(\s*)b'", re.M).sub
+ return sub(r"\1'", s)
+
+
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
+ namespaces={'x':XHTML_NAMESPACE})
+_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
+ namespaces={'x':XHTML_NAMESPACE})
+_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
+ namespaces={'x':XHTML_NAMESPACE})
+#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
+_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_collect_string_content = etree.XPath("string()")
+_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
+_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
+_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
+ namespaces={'x':XHTML_NAMESPACE})
+_archive_re = re.compile(r'[^ ]+')
+_parse_meta_refresh_url = re.compile(
+ r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
+
+
+def _unquote_match(s, pos):
+ if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
+ return s[1:-1], pos+1
+ else:
+ return s,pos
+
+
+def _transform_result(typ, result):
+ """Convert the result back into the input type.
+ """
+ if issubclass(typ, bytes):
+ return tostring(result, encoding='utf-8')
+ elif issubclass(typ, unicode):
+ return tostring(result, encoding='unicode')
+ else:
+ return result
+
+
+def _nons(tag):
+ if isinstance(tag, basestring):
+ if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
+ return tag.split('}')[-1]
+ return tag
+
+
+class Classes(MutableSet):
+ """Provides access to an element's class attribute as a set-like collection.
+ Usage::
+
+ >>> el = fromstring('<p class="hidden large">Text</p>')
+ >>> classes = el.classes # or: classes = Classes(el.attrib)
+ >>> classes |= ['block', 'paragraph']
+ >>> el.get('class')
+ 'hidden large block paragraph'
+ >>> classes.toggle('hidden')
+ False
+ >>> el.get('class')
+ 'large block paragraph'
+ >>> classes -= ('some', 'classes', 'block')
+ >>> el.get('class')
+ 'large paragraph'
+ """
+ def __init__(self, attributes):
+ self._attributes = attributes
+ self._get_class_value = partial(attributes.get, 'class', '')
+
+ def add(self, value):
+ """
+ Add a class.
+
+ This has no effect if the class is already present.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = self._get_class_value().split()
+ if value in classes:
+ return
+ classes.append(value)
+ self._attributes['class'] = ' '.join(classes)
+
+ def discard(self, value):
+ """
+ Remove a class if it is currently present.
+
+ If the class is not present, do nothing.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = [name for name in self._get_class_value().split()
+ if name != value]
+ if classes:
+ self._attributes['class'] = ' '.join(classes)
+ elif 'class' in self._attributes:
+ del self._attributes['class']
+
+ def remove(self, value):
+ """
+ Remove a class; it must currently be present.
+
+ If the class is not present, raise a KeyError.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ super(Classes, self).remove(value)
+
+ def __contains__(self, name):
+ classes = self._get_class_value()
+ return name in classes and name in classes.split()
+
+ def __iter__(self):
+ return iter(self._get_class_value().split())
+
+ def __len__(self):
+ return len(self._get_class_value().split())
+
+ # non-standard methods
+
+ def update(self, values):
+ """
+ Add all names from 'values'.
+ """
+ classes = self._get_class_value().split()
+ extended = False
+ for value in values:
+ if value not in classes:
+ classes.append(value)
+ extended = True
+ if extended:
+ self._attributes['class'] = ' '.join(classes)
+
+ def toggle(self, value):
+ """
+ Add a class name if it isn't there yet, or remove it if it exists.
+
+ Returns true if the class was added (and is now enabled) and
+ false if it was removed (and is now disabled).
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = self._get_class_value().split()
+ try:
+ classes.remove(value)
+ enabled = False
+ except ValueError:
+ classes.append(value)
+ enabled = True
+ if classes:
+ self._attributes['class'] = ' '.join(classes)
+ else:
+ del self._attributes['class']
+ return enabled
+
+
+class HtmlMixin(object):
+
+ def set(self, key, value=None):
+ """set(self, key, value=None)
+
+ Sets an element attribute. If no value is provided, or if the value is None,
+ creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
+ for ``form.set('novalidate')``.
+ """
+ super(HtmlMixin, self).set(key, value)
+
+ @property
+ def classes(self):
+ """
+ A set-like wrapper around the 'class' attribute.
+ """
+ return Classes(self.attrib)
+
+ @classes.setter
+ def classes(self, classes):
+ assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc.
+ value = classes._get_class_value()
+ if value:
+ self.set('class', value)
+ elif self.get('class') is not None:
+ del self.attrib['class']
+
+ @property
+ def base_url(self):
+ """
+ Returns the base URL, given when the page was parsed.
+
+ Use with ``urlparse.urljoin(el.base_url, href)`` to get
+ absolute URLs.
+ """
+ return self.getroottree().docinfo.URL
+
+ @property
+ def forms(self):
+ """
+ Return a list of all the forms.
+ """
+ return _forms_xpath(self)
+
+ @property
+ def body(self):
+ """
+ Return the <body> element. Can be called from a child element
+ to get the document's body.
+ """
+ return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
+
+ @property
+ def head(self):
+ """
+ Returns the <head> element. Can be called from a child
+ element to get the document's head.
+ """
+ return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
+
+ @property
+ def label(self):
+ """
+ Get or set any <label> element associated with this element.
+ """
+ id = self.get('id')
+ if not id:
+ return None
+ result = _label_xpath(self, id=id)
+ if not result:
+ return None
+ else:
+ return result[0]
+
+ @label.setter
+ def label(self, label):
+ id = self.get('id')
+ if not id:
+ raise TypeError(
+ "You cannot set a label for an element (%r) that has no id"
+ % self)
+ if _nons(label.tag) != 'label':
+ raise TypeError(
+ "You can only assign label to a label element (not %r)"
+ % label)
+ label.set('for', id)
+
+ @label.deleter
+ def label(self):
+ label = self.label
+ if label is not None:
+ del label.attrib['for']
+
+ def drop_tree(self):
+ """
+ Removes this element from the tree, including its children and
+ text. The tail text is joined to the previous element or
+ parent.
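+
+ A sketch with illustrative markup::
+
+ >>> doc = fromstring('<div>one <span>two </span>three</div>')
+ >>> doc.find('.//span').drop_tree()
+ >>> tostring(doc, encoding='unicode')
+ '<div>one three</div>'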
+ """
+ parent = self.getparent()
+ assert parent is not None
+ if self.tail:
+ previous = self.getprevious()
+ if previous is None:
+ parent.text = (parent.text or '') + self.tail
+ else:
+ previous.tail = (previous.tail or '') + self.tail
+ parent.remove(self)
+
+ def drop_tag(self):
+ """
+ Remove the tag, but not its children or text. The children and text
+ are merged into the parent.
+
+ Example::
+
+ >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
+ >>> h.find('.//b').drop_tag()
+ >>> print(tostring(h, encoding='unicode'))
+ <div>Hello World!</div>
+ """
+ parent = self.getparent()
+ assert parent is not None
+ previous = self.getprevious()
+ if self.text and isinstance(self.tag, basestring):
+ # not a Comment, etc.
+ if previous is None:
+ parent.text = (parent.text or '') + self.text
+ else:
+ previous.tail = (previous.tail or '') + self.text
+ if self.tail:
+ if len(self):
+ last = self[-1]
+ last.tail = (last.tail or '') + self.tail
+ elif previous is None:
+ parent.text = (parent.text or '') + self.tail
+ else:
+ previous.tail = (previous.tail or '') + self.tail
+ index = parent.index(self)
+ parent[index:index+1] = self[:]
+
+ def find_rel_links(self, rel):
+ """
+ Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
+ """
+ rel = rel.lower()
+ return [el for el in _rel_links_xpath(self)
+ if el.get('rel').lower() == rel]
+
+ def find_class(self, class_name):
+ """
+ Find any elements with the given class name.
+ """
+ return _class_xpath(self, class_name=class_name)
+
+ def get_element_by_id(self, id, *default):
+ """
+ Get the first element in a document with the given id. If none is
+ found, return the default argument if provided or raise KeyError
+ otherwise.
+
+ Note that there can be more than one element with the same id,
+ and this isn't uncommon in HTML documents found in the wild.
+ Browsers return only the first match, and this function does
+ the same.
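+
+ A sketch with illustrative markup::
+
+ >>> doc = fromstring('<div id="main">text</div>')
+ >>> doc.get_element_by_id('main').tag
+ 'div'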
+ """
+ try:
+ # FIXME: should this check for multiple matches?
+ # browsers just return the first one
+ return _id_xpath(self, id=id)[0]
+ except IndexError:
+ if default:
+ return default[0]
+ else:
+ raise KeyError(id)
+
+ def text_content(self):
+ """
+ Return the text content of the tag (and the text in any children).
+ """
+ return _collect_string_content(self)
+
+ def cssselect(self, expr, translator='html'):
+ """
+ Run the CSS expression on this element and its children,
+ returning a list of the results.
+
+ Equivalent to ``lxml.cssselect.CSSSelector(expr, translator='html')(self)``
+ -- note that pre-compiling the expression can provide a substantial
+ speedup.
+ """
+ # Do the import here to make the dependency optional.
+ from lxml.cssselect import CSSSelector
+ return CSSSelector(expr, translator=translator)(self)
+
+ ########################################
+ ## Link functions
+ ########################################
+
+ def make_links_absolute(self, base_url=None, resolve_base_href=True,
+ handle_failures=None):
+ """
+ Make all links in the document absolute, given the
+ ``base_url`` for the document (the full URL where the document
+ came from), or if no ``base_url`` is given, then the ``.base_url``
+ of the document.
+
+ If ``resolve_base_href`` is true, then any ``<base href>``
+ tags in the document are used *and* removed from the document.
+ If it is false then any such tag is ignored.
+
+ If ``handle_failures`` is None (default), a failure to process
+ a URL will abort the processing. If set to 'ignore', errors
+ are ignored. If set to 'discard', failing URLs will be removed.
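+
+ A sketch, with an assumed base URL::
+
+ >>> doc = fromstring('<a href="/about">about</a>',
+ ... base_url='http://example.com/index.html')
+ >>> doc.make_links_absolute()
+ >>> doc.get('href')
+ 'http://example.com/about'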
+ """
+ if base_url is None:
+ base_url = self.base_url
+ if base_url is None:
+ raise TypeError(
+ "No base_url given, and the document has no base_url")
+ if resolve_base_href:
+ self.resolve_base_href()
+
+ if handle_failures == 'ignore':
+ def link_repl(href):
+ try:
+ return urljoin(base_url, href)
+ except ValueError:
+ return href
+ elif handle_failures == 'discard':
+ def link_repl(href):
+ try:
+ return urljoin(base_url, href)
+ except ValueError:
+ return None
+ elif handle_failures is None:
+ def link_repl(href):
+ return urljoin(base_url, href)
+ else:
+ raise ValueError(
+ "unexpected value for handle_failures: %r" % handle_failures)
+
+ self.rewrite_links(link_repl)
+
+ def resolve_base_href(self, handle_failures=None):
+ """
+ Find any ``<base href>`` tag in the document, and apply its
+ values to all links found in the document. Also remove the
+ tag once it has been applied.
+
+ If ``handle_failures`` is None (default), a failure to process
+ a URL will abort the processing. If set to 'ignore', errors
+ are ignored. If set to 'discard', failing URLs will be removed.
+ """
+ base_href = None
+ basetags = self.xpath('//base[@href]|//x:base[@href]',
+ namespaces={'x': XHTML_NAMESPACE})
+ for b in basetags:
+ base_href = b.get('href')
+ b.drop_tree()
+ if not base_href:
+ return
+ self.make_links_absolute(base_href, resolve_base_href=False,
+ handle_failures=handle_failures)
+
+ def iterlinks(self):
+ """
+ Yield (element, attribute, link, pos), where attribute may be None
+ (indicating the link is in the text). ``pos`` is the position
+ where the link occurs; often 0, but sometimes something else in
+ the case of links in stylesheets or style tags.
+
+ Note: <base href> is *not* taken into account in any way. The
+ link you get is exactly the link in the document.
+
+ Note: multiple links inside of a single text string or
+ attribute value are returned in reversed order. This makes it
+ possible to replace or delete them from the text string value
+ based on their reported text positions. Otherwise, a
+ modification at one text position can change the positions of
+ links reported later on.
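+
+ A sketch with illustrative markup::
+
+ >>> doc = fromstring('<a href="one.html">1</a><a href="two.html">2</a>')
+ >>> [(el.tag, attr, link) for el, attr, link, pos in doc.iterlinks()]
+ [('a', 'href', 'one.html'), ('a', 'href', 'two.html')]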
+ """
+ link_attrs = defs.link_attrs
+ for el in self.iter(etree.Element):
+ attribs = el.attrib
+ tag = _nons(el.tag)
+ if tag == 'object':
+ codebase = None
+ ## <object> tags have attributes that are relative to
+ ## codebase
+ if 'codebase' in attribs:
+ codebase = el.get('codebase')
+ yield (el, 'codebase', codebase, 0)
+ for attrib in ('classid', 'data'):
+ if attrib in attribs:
+ value = el.get(attrib)
+ if codebase is not None:
+ value = urljoin(codebase, value)
+ yield (el, attrib, value, 0)
+ if 'archive' in attribs:
+ for match in _archive_re.finditer(el.get('archive')):
+ value = match.group(0)
+ if codebase is not None:
+ value = urljoin(codebase, value)
+ yield (el, 'archive', value, match.start())
+ else:
+ for attrib in link_attrs:
+ if attrib in attribs:
+ yield (el, attrib, attribs[attrib], 0)
+ if tag == 'meta':
+ http_equiv = attribs.get('http-equiv', '').lower()
+ if http_equiv == 'refresh':
+ content = attribs.get('content', '')
+ match = _parse_meta_refresh_url(content)
+ url = (match.group('url') if match else content).strip()
+ # unexpected content means the redirect won't work, but we might
+ # as well be permissive and return the entire string.
+ if url:
+ url, pos = _unquote_match(
+ url, match.start('url') if match else content.find(url))
+ yield (el, 'content', url, pos)
+ elif tag == 'param':
+ valuetype = el.get('valuetype') or ''
+ if valuetype.lower() == 'ref':
+ ## FIXME: while it's fine we *find* this link,
+ ## according to the spec we aren't supposed to
+ ## actually change the value, including resolving
+ ## it. It can also still be a link, even if it
+ ## doesn't have a valuetype="ref" (which seems to be the norm)
+ ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
+ yield (el, 'value', el.get('value'), 0)
+ elif tag == 'style' and el.text:
+ urls = [
+ # (start_pos, url)
+ _unquote_match(match.group(1), match.start(1))[::-1]
+ for match in _iter_css_urls(el.text)
+ ] + [
+ (match.start(1), match.group(1))
+ for match in _iter_css_imports(el.text)
+ ]
+ if urls:
+ # sort by start pos to bring both match sets back into order
+ # and reverse the list to report correct positions despite
+ # modifications
+ urls.sort(reverse=True)
+ for start, url in urls:
+ yield (el, None, url, start)
+ if 'style' in attribs:
+ urls = list(_iter_css_urls(attribs['style']))
+ if urls:
+ # return in reversed order to simplify in-place modifications
+ for match in urls[::-1]:
+ url, start = _unquote_match(match.group(1), match.start(1))
+ yield (el, 'style', url, start)
+
+ def rewrite_links(self, link_repl_func, resolve_base_href=True,
+ base_href=None):
+ """
+ Rewrite all the links in the document. For each link
+ ``link_repl_func(link)`` will be called, and the return value
+ will replace the old link.
+
+ Note that links may not be absolute (unless you first called
+ ``make_links_absolute()``), and may be internal (e.g.,
+ ``'#anchor'``). They can also be values like
+ ``'mailto:email'`` or ``'javascript:expr'``.
+
+ If you give ``base_href`` then all links passed to
+ ``link_repl_func()`` will take that into account.
+
+ If the ``link_repl_func`` returns None, the attribute or
+ tag text will be removed completely.
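+
+ A sketch (the host names are illustrative)::
+
+ >>> doc = fromstring('<a href="http://old.example.com/x">x</a>')
+ >>> doc.rewrite_links(lambda link: link.replace('old.', 'new.'))
+ >>> doc.get('href')
+ 'http://new.example.com/x'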
+ """
+ if base_href is not None:
+ # FIXME: this can be done in one pass with a wrapper
+ # around link_repl_func
+ self.make_links_absolute(
+ base_href, resolve_base_href=resolve_base_href)
+ elif resolve_base_href:
+ self.resolve_base_href()
+
+ for el, attrib, link, pos in self.iterlinks():
+ new_link = link_repl_func(link.strip())
+ if new_link == link:
+ continue
+ if new_link is None:
+ # Remove the attribute or element content
+ if attrib is None:
+ el.text = ''
+ else:
+ del el.attrib[attrib]
+ continue
+
+ if attrib is None:
+ new = el.text[:pos] + new_link + el.text[pos+len(link):]
+ el.text = new
+ else:
+ cur = el.get(attrib)
+ if not pos and len(cur) == len(link):
+ new = new_link # most common case
+ else:
+ new = cur[:pos] + new_link + cur[pos+len(link):]
+ el.set(attrib, new)
+
+
+class _MethodFunc(object):
+ """
+ An object that represents a method on an element as a function;
+ the function takes either an element or an HTML string. It
+ returns whatever the function normally returns, or if the function
+ works in-place (and so returns None) it returns a serialized form
+ of the resulting document.
+ """
+ def __init__(self, name, copy=False, source_class=HtmlMixin):
+ self.name = name
+ self.copy = copy
+ self.__doc__ = getattr(source_class, self.name).__doc__
+ def __call__(self, doc, *args, **kw):
+ result_type = type(doc)
+ if isinstance(doc, basestring):
+ if 'copy' in kw:
+ raise TypeError(
+ "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
+ doc = fromstring(doc, **kw)
+ else:
+ if 'copy' in kw:
+ make_a_copy = kw.pop('copy')
+ else:
+ make_a_copy = self.copy
+ if make_a_copy:
+ doc = copy.deepcopy(doc)
+ meth = getattr(doc, self.name)
+ result = meth(*args, **kw)
+ # FIXME: this None test is a bit sloppy
+ if result is None:
+ # Then return what we got in
+ return _transform_result(result_type, doc)
+ else:
+ return result
+
+
+find_rel_links = _MethodFunc('find_rel_links', copy=False)
+find_class = _MethodFunc('find_class', copy=False)
+make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
+resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
+iterlinks = _MethodFunc('iterlinks', copy=False)
+rewrite_links = _MethodFunc('rewrite_links', copy=True)
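+
+# A sketch of the module-level wrappers defined above: each accepts either
+# an element or a raw HTML string and returns a result of the same type
+# (the markup and URL below are illustrative):
+#
+# >>> make_links_absolute('<a href="/x">x</a>', 'http://example.com/')
+# '<a href="http://example.com/x">x</a>'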
+
+
+class HtmlComment(HtmlMixin, etree.CommentBase):
+ pass
+
+
+class HtmlElement(HtmlMixin, etree.ElementBase):
+ pass
+
+
+class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
+ pass
+
+
+class HtmlEntity(HtmlMixin, etree.EntityBase):
+ pass
+
+
+class HtmlElementClassLookup(etree.CustomElementClassLookup):
+ """A lookup scheme for HTML Element classes.
+
+ To create a lookup instance with different Element classes, pass a tag
+ name mapping of Element classes in the ``classes`` keyword argument and/or
+ a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
+ The special key '*' denotes a Mixin class that should be mixed into all
+ Element classes.
+ """
+ _default_element_classes = {}
+
+ def __init__(self, classes=None, mixins=None):
+ etree.CustomElementClassLookup.__init__(self)
+ if classes is None:
+ classes = self._default_element_classes.copy()
+ if mixins:
+ mixers = {}
+ for name, value in mixins:
+ if name == '*':
+ for n in classes.keys():
+ mixers.setdefault(n, []).append(value)
+ else:
+ mixers.setdefault(name, []).append(value)
+ for name, mix_bases in mixers.items():
+ cur = classes.get(name, HtmlElement)
+ bases = tuple(mix_bases + [cur])
+ classes[name] = type(cur.__name__, bases, {})
+ self._element_classes = classes
+
+ def lookup(self, node_type, document, namespace, name):
+ if node_type == 'element':
+ return self._element_classes.get(name.lower(), HtmlElement)
+ elif node_type == 'comment':
+ return HtmlComment
+ elif node_type == 'PI':
+ return HtmlProcessingInstruction
+ elif node_type == 'entity':
+ return HtmlEntity
+ # Otherwise normal lookup
+ return None
+
+
+################################################################################
+# parsing
+################################################################################
+
+_looks_like_full_html_unicode = re.compile(
+ unicode(r'^\s*<(?:html|!doctype)'), re.I).match
+_looks_like_full_html_bytes = re.compile(
+ r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
+
+
+def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
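+ """Parse a whole HTML document from a string and return the root
+ <html> element. If ``ensure_head_body`` is true, missing <head>
+ or <body> elements are added.
+
+ A sketch with illustrative markup::
+
+ >>> root = document_fromstring('<p>Hello</p>')
+ >>> root.tag, root[0].tag
+ ('html', 'body')
+ """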
+ if parser is None:
+ parser = html_parser
+ value = etree.fromstring(html, parser, **kw)
+ if value is None:
+ raise etree.ParserError(
+ "Document is empty")
+ if ensure_head_body and value.find('head') is None:
+ value.insert(0, Element('head'))
+ if ensure_head_body and value.find('body') is None:
+ value.append(Element('body'))
+ return value
+
+
+def fragments_fromstring(html, no_leading_text=False, base_url=None,
+ parser=None, **kw):
+ """Parses several HTML elements, returning a list of elements.
+
+ The first item in the list may be a string.
+ If no_leading_text is true, then it will be an error if there is
+ leading text, and it will always be a list of only elements.
+
+ base_url will set the document's base_url attribute
+ (and the tree's docinfo.URL).
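+
+ A sketch with illustrative markup::
+
+ >>> parts = fragments_fromstring('Hello <b>world</b>!')
+ >>> parts[0]
+ 'Hello '
+ >>> parts[1].tag
+ 'b'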
+ """
+ if parser is None:
+ parser = html_parser
+ # FIXME: check what happens when you give html with a body, head, etc.
+ if isinstance(html, bytes):
+ if not _looks_like_full_html_bytes(html):
+ # can't use %-formatting in early Py3 versions
+ html = ('<html><body>'.encode('ascii') + html +
+ '</body></html>'.encode('ascii'))
+ else:
+ if not _looks_like_full_html_unicode(html):
+ html = '<html><body>%s</body></html>' % html
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+ assert _nons(doc.tag) == 'html'
+ bodies = [e for e in doc if _nons(e.tag) == 'body']
+ assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
+ body = bodies[0]
+ elements = []
+ if no_leading_text and body.text and body.text.strip():
+ raise etree.ParserError(
+ "There is leading text: %r" % body.text)
+ if body.text and body.text.strip():
+ elements.append(body.text)
+ elements.extend(body)
+ # FIXME: removing the reference to the parent artificial document
+ # would be nice
+ return elements
+
+
+def fragment_fromstring(html, create_parent=False, base_url=None,
+ parser=None, **kw):
+ """
+ Parses a single HTML element; it is an error if there is more than
+ one element, or if anything but whitespace precedes or follows the
+ element.
+
+ If ``create_parent`` is true (or is a tag name) then a parent node
+ will be created to encapsulate the HTML in a single element. In this
+ case, leading or trailing text is also allowed, as are multiple elements
+ as result of the parsing.
+
+ Passing a ``base_url`` will set the document's ``base_url`` attribute
+ (and the tree's docinfo.URL).
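+
+ A sketch with illustrative markup::
+
+ >>> el = fragment_fromstring('<span>one</span> two', create_parent=True)
+ >>> tostring(el, encoding='unicode')
+ '<div><span>one</span> two</div>'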
+ """
+ if parser is None:
+ parser = html_parser
+
+ accept_leading_text = bool(create_parent)
+
+ elements = fragments_fromstring(
+ html, parser=parser, no_leading_text=not accept_leading_text,
+ base_url=base_url, **kw)
+
+ if create_parent:
+ if not isinstance(create_parent, basestring):
+ create_parent = 'div'
+ new_root = Element(create_parent)
+ if elements:
+ if isinstance(elements[0], basestring):
+ new_root.text = elements[0]
+ del elements[0]
+ new_root.extend(elements)
+ return new_root
+
+ if not elements:
+ raise etree.ParserError('No elements found')
+ if len(elements) > 1:
+ raise etree.ParserError(
+ "Multiple elements found (%s)"
+ % ', '.join([_element_name(e) for e in elements]))
+ el = elements[0]
+ if el.tail and el.tail.strip():
+ raise etree.ParserError(
+ "Element followed by text: %r" % el.tail)
+ el.tail = None
+ return el
+
+
+def fromstring(html, base_url=None, parser=None, **kw):
+ """
+ Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+
+ base_url will set the document's base_url attribute (and the tree's docinfo.URL)
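+
+ A sketch of the possible return types (illustrative markup)::
+
+ >>> fromstring('<html><body><p>x</p></body></html>').tag
+ 'html'
+ >>> fromstring('<p>x</p>').tag
+ 'p'
+ >>> fromstring('<p>one</p><p>two</p>').tag
+ 'div'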
+ """
+ if parser is None:
+ parser = html_parser
+ if isinstance(html, bytes):
+ is_full_html = _looks_like_full_html_bytes(html)
+ else:
+ is_full_html = _looks_like_full_html_unicode(html)
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+ if is_full_html:
+ return doc
+ # otherwise, lets parse it out...
+ bodies = doc.findall('body')
+ if not bodies:
+ bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
+ if bodies:
+ body = bodies[0]
+ if len(bodies) > 1:
+ # Somehow there are multiple bodies, which is bad, but just
+ # smash them into one body
+ for other_body in bodies[1:]:
+ if other_body.text:
+ if len(body):
+ body[-1].tail = (body[-1].tail or '') + other_body.text
+ else:
+ body.text = (body.text or '') + other_body.text
+ body.extend(other_body)
+ # We'll ignore tail
+ # I guess we are ignoring attributes too
+ other_body.drop_tree()
+ else:
+ body = None
+ heads = doc.findall('head')
+ if not heads:
+ heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
+ if heads:
+ # Well, we have some sort of structure, so lets keep it all
+ head = heads[0]
+ if len(heads) > 1:
+ for other_head in heads[1:]:
+ head.extend(other_head)
+ # We don't care about text or tail in a head
+ other_head.drop_tree()
+ return doc
+ if body is None:
+ return doc
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ # The body has just one element, so it was probably a single
+ # element passed in
+ return body[0]
+ # Now we have a body which represents a bunch of tags which have the
+ # content that was passed in. We will create a fake container, which
+ # is the body tag, except <body> implies too much structure.
+ if _contains_block_level_tag(body):
+ body.tag = 'div'
+ else:
+ body.tag = 'span'
+ return body
+
+
+def parse(filename_or_url, parser=None, base_url=None, **kw):
+ """
+ Parse a filename, URL, or file-like object into an HTML document
+ tree. Note: this returns a tree, not an element. Use
+ ``parse(...).getroot()`` to get the document root.
+
+ You can override the base URL with the ``base_url`` keyword. This
+ is most useful when parsing from a file-like object.
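+
+ A sketch, parsing from an in-memory file object::
+
+ >>> from io import StringIO
+ >>> tree = parse(StringIO('<html><body><p>hi</p></body></html>'))
+ >>> tree.getroot().tag
+ 'html'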
+ """
+ if parser is None:
+ parser = html_parser
+ return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
+
+
+def _contains_block_level_tag(el):
+ # FIXME: I could do this with XPath, but would that just be
+ # unnecessarily slow?
+ for el in el.iter(etree.Element):
+ if _nons(el.tag) in defs.block_tags:
+ return True
+ return False
+
+
+def _element_name(el):
+ if isinstance(el, etree.CommentBase):
+ return 'comment'
+ elif isinstance(el, basestring):
+ return 'string'
+ else:
+ return _nons(el.tag)
+
+
+################################################################################
+# form handling
+################################################################################
+
+class FormElement(HtmlElement):
+ """
+ Represents a <form> element.
+ """
+
+ @property
+ def inputs(self):
+ """
+ Returns an accessor for all the input elements in the form.
+
+ See `InputGetter` for more information about the object.
+ """
+ return InputGetter(self)
+
+ @property
+ def fields(self):
+ """
+ Dictionary-like object that represents all the fields in this
+ form. You can set values in this dictionary to affect the
+ form.
+ """
+ return FieldsDict(self.inputs)
+
+ @fields.setter
+ def fields(self, value):
+ fields = self.fields
+ prev_keys = fields.keys()
+ for key, value in value.items():
+ if key in prev_keys:
+ prev_keys.remove(key)
+ fields[key] = value
+ for key in prev_keys:
+ if key is None:
+ # Case of an unnamed input; these aren't really
+ # expressed in form_values() anyway.
+ continue
+ fields[key] = None
+
+ def _name(self):
+ if self.get('name'):
+ return self.get('name')
+ elif self.get('id'):
+ return '#' + self.get('id')
+ iter_tags = self.body.iter
+ forms = list(iter_tags('form'))
+ if not forms:
+ forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
+ return str(forms.index(self))
+
+ def form_values(self):
+ """
+ Return a list of tuples of the field values for the form.
+ This is suitable to be passed to ``urllib.urlencode()``.
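+
+ A sketch with an illustrative form::
+
+ >>> form = fromstring('<form><input name="user" value="me">'
+ ... '<input type="checkbox" name="ok" checked></form>')
+ >>> form.form_values()
+ [('user', 'me'), ('ok', 'on')]
+ >>> form.fields['user'] = 'you'
+ >>> form.form_values()
+ [('user', 'you'), ('ok', 'on')]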
+ """
+ results = []
+ for el in self.inputs:
+ name = el.name
+ if not name or 'disabled' in el.attrib:
+ continue
+ tag = _nons(el.tag)
+ if tag == 'textarea':
+ results.append((name, el.value))
+ elif tag == 'select':
+ value = el.value
+ if el.multiple:
+ for v in value:
+ results.append((name, v))
+ elif value is not None:
+ results.append((name, el.value))
+ else:
+ assert tag == 'input', (
+ "Unexpected tag: %r" % el)
+ if el.checkable and not el.checked:
+ continue
+ if el.type in ('submit', 'image', 'reset', 'file'):
+ continue
+ value = el.value
+ if value is not None:
+ results.append((name, el.value))
+ return results
+
+ @property
+ def action(self):
+ """
+ Get/set the form's ``action`` attribute.
+ """
+ base_url = self.base_url
+ action = self.get('action')
+ if base_url and action is not None:
+ return urljoin(base_url, action)
+ else:
+ return action
+
+ @action.setter
+ def action(self, value):
+ self.set('action', value)
+
+ @action.deleter
+ def action(self):
+ attrib = self.attrib
+ if 'action' in attrib:
+ del attrib['action']
+
+ @property
+ def method(self):
+ """
+ Get/set the form's method. Always returns an upper-cased
+ string, and defaults to ``'GET'``.
+ """
+ return self.get('method', 'GET').upper()
+
+ @method.setter
+ def method(self, value):
+ self.set('method', value.upper())
+
+
+HtmlElementClassLookup._default_element_classes['form'] = FormElement
+
+
+def submit_form(form, extra_values=None, open_http=None):
+ """
+ Helper function to submit a form. Returns a file-like object, as from
+ ``urllib.urlopen()``. This object also has a ``.geturl()`` function,
+ which shows the URL if there were any redirects.
+
+ You can use this like::
+
+ form = doc.forms[0]
+ form.inputs['foo'].value = 'bar' # etc
+ response = form.submit()
+ doc = parse(response)
+ doc.make_links_absolute(response.geturl())
+
+ To change the HTTP requester, pass a function as ``open_http`` keyword
+ argument that opens the URL for you. The function must have the following
+ signature::
+
+ open_http(method, URL, values)
+
+ The method is one of 'GET' or 'POST', the URL is the target URL as a
+ string, and the values are a sequence of ``(name, value)`` tuples with the
+ form data.
+ """
+ values = form.form_values()
+ if extra_values:
+ if hasattr(extra_values, 'items'):
+ extra_values = extra_values.items()
+ values.extend(extra_values)
+ if open_http is None:
+ open_http = open_http_urllib
+ if form.action:
+ url = form.action
+ else:
+ url = form.base_url
+ return open_http(form.method, url, values)
+
+
+def open_http_urllib(method, url, values):
+ if not url:
+ raise ValueError("cannot submit, no URL provided")
+ ## FIXME: should test that it's not a relative URL or something
+ try:
+ from urllib import urlencode, urlopen
+ except ImportError: # Python 3
+ from urllib.request import urlopen
+ from urllib.parse import urlencode
+ if method == 'GET':
+ if '?' in url:
+ url += '&'
+ else:
+ url += '?'
+ url += urlencode(values)
+ data = None
+ else:
+ data = urlencode(values)
+ if not isinstance(data, bytes):
+ data = data.encode('ASCII')
+ return urlopen(url, data)
+
+
+class FieldsDict(MutableMapping):
+
+ def __init__(self, inputs):
+ self.inputs = inputs
+ def __getitem__(self, item):
+ return self.inputs[item].value
+ def __setitem__(self, item, value):
+ self.inputs[item].value = value
+ def __delitem__(self, item):
+ raise KeyError(
+ "You cannot remove keys from ElementDict")
+ def keys(self):
+ return self.inputs.keys()
+ def __contains__(self, item):
+ return item in self.inputs
+ def __iter__(self):
+ return iter(self.inputs.keys())
+ def __len__(self):
+ return len(self.inputs)
+
+ def __repr__(self):
+ return '<%s for form %s>' % (
+ self.__class__.__name__,
+ self.inputs.form._name())
+
+
+class InputGetter(object):
+
+ """
+ An accessor that represents all the input fields in a form.
+
+ You can get fields by name from this, with
+ ``form.inputs['field_name']``. If there is a set of checkboxes
+ with the same name, they are returned as a list (a `CheckboxGroup`
+ which also allows value setting). Radio inputs are handled
+ similarly. Use ``.keys()`` and ``.items()`` to process all fields
+ in this way.
+
+ You can also iterate over this to get all input elements. This
+ won't return the same thing as if you get all the names, as
+ checkboxes and radio elements are returned individually.
+ """
+
+ def __init__(self, form):
+ self.form = form
+
+ def __repr__(self):
+ return '<%s for form %s>' % (
+ self.__class__.__name__,
+ self.form._name())
+
+ ## FIXME: there should be more methods, and it's unclear if this is
+ ## a dictionary-like object or list-like object
+
+ def __getitem__(self, name):
+ fields = [field for field in self if field.name == name]
+ if not fields:
+ raise KeyError("No input element with the name %r" % name)
+
+ input_type = fields[0].get('type')
+ if input_type == 'radio' and len(fields) > 1:
+ group = RadioGroup(fields)
+ group.name = name
+ return group
+ elif input_type == 'checkbox' and len(fields) > 1:
+ group = CheckboxGroup(fields)
+ group.name = name
+ return group
+ else:
+ # I don't like throwing away elements like this
+ return fields[0]
+
+ def __contains__(self, name):
+ for field in self:
+ if field.name == name:
+ return True
+ return False
+
+ def keys(self):
+ """
+ Returns all unique field names, in document order.
+
+ :return: A list of all unique field names.
+ """
+ names = []
+ seen = {None}
+ for el in self:
+ name = el.name
+ if name not in seen:
+ names.append(name)
+ seen.add(name)
+ return names
+
+ def items(self):
+ """
+ Returns all fields with their names, similar to dict.items().
+
+ :return: A list of (name, field) tuples.
+ """
+ items = []
+ seen = set()
+ for el in self:
+ name = el.name
+ if name not in seen:
+ seen.add(name)
+ items.append((name, self[name]))
+ return items
+
+ def __iter__(self):
+ return self.form.iter('select', 'input', 'textarea')
+
+ def __len__(self):
+ return sum(1 for _ in self)
+
+
+class InputMixin(object):
+ """
+ Mix-in for all input elements (input, select, and textarea)
+ """
+ @property
+ def name(self):
+ """
+ Get/set the name of the element
+ """
+ return self.get('name')
+
+ @name.setter
+ def name(self, value):
+ self.set('name', value)
+
+ @name.deleter
+ def name(self):
+ attrib = self.attrib
+ if 'name' in attrib:
+ del attrib['name']
+
+ def __repr__(self):
+ type_name = getattr(self, 'type', None)
+ if type_name:
+ type_name = ' type=%r' % type_name
+ else:
+ type_name = ''
+ return '<%s %x name=%r%s>' % (
+ self.__class__.__name__, id(self), self.name, type_name)
+
+
+class TextareaElement(InputMixin, HtmlElement):
+ """
+ ``<textarea>`` element. You can get the name with ``.name`` and
+ get/set the value with ``.value``
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value (which is the contents of this element)
+ """
+ content = self.text or ''
+ if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
+ serialisation_method = 'xml'
+ else:
+ serialisation_method = 'html'
+ for el in self:
+ # it's rare that we actually get here, so let's not use ''.join()
+ content += etree.tostring(
+ el, method=serialisation_method, encoding='unicode')
+ return content
+
+ @value.setter
+ def value(self, value):
+ del self[:]
+ self.text = value
+
+ @value.deleter
+ def value(self):
+ self.text = ''
+ del self[:]
+
+
+HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
+
+
+class SelectElement(InputMixin, HtmlElement):
+ """
+ ``<select>`` element. You can get the name with ``.name``.
+
+ ``.value`` will be the value of the selected option, unless this
+ is a multi-select element (``<select multiple>``), in which case
+ it will be a set-like object. In either case ``.value_options``
+ gives the possible values.
+
+ The boolean attribute ``.multiple`` shows if this is a
+ multi-select.
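+
+ A sketch with illustrative markup::
+
+ >>> s = fromstring('<select name="c"><option value="a" selected>A'
+ ... '</option><option value="b">B</option></select>')
+ >>> s.value
+ 'a'
+ >>> s.value = 'b'
+ >>> s.value
+ 'b'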
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value of this select (the selected option).
+
+ If this is a multi-select, this is a set-like object that
+ represents all the selected options.
+ """
+ if self.multiple:
+ return MultipleSelectOptions(self)
+ options = _options_xpath(self)
+
+ try:
+ selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
+ except StopIteration:
+ try:
+ selected_option = next(el for el in options if el.get('disabled') is None)
+ except StopIteration:
+ return None
+ value = selected_option.get('value')
+ if value is None:
+ value = (selected_option.text or '').strip()
+ return value
+
+ @value.setter
+ def value(self, value):
+ if self.multiple:
+ if isinstance(value, basestring):
+ raise TypeError("You must pass in a sequence")
+ values = self.value
+ values.clear()
+ values.update(value)
+ return
+ checked_option = None
+ if value is not None:
+ for el in _options_xpath(self):
+ opt_value = el.get('value')
+ if opt_value is None:
+ opt_value = (el.text or '').strip()
+ if opt_value == value:
+ checked_option = el
+ break
+ else:
+ raise ValueError(
+ "There is no option with the value of %r" % value)
+ for el in _options_xpath(self):
+ if 'selected' in el.attrib:
+ del el.attrib['selected']
+ if checked_option is not None:
+ checked_option.set('selected', '')
+
+ @value.deleter
+ def value(self):
+ # FIXME: should del be allowed at all?
+ if self.multiple:
+ self.value.clear()
+ else:
+ self.value = None
+
+ @property
+ def value_options(self):
+ """
+ All the possible values this select can have (the ``value``
+ attribute of all the ``<option>`` elements).
+ """
+ options = []
+ for el in _options_xpath(self):
+ value = el.get('value')
+ if value is None:
+ value = (el.text or '').strip()
+ options.append(value)
+ return options
+
+ @property
+ def multiple(self):
+ """
+ Boolean attribute: is there a ``multiple`` attribute on this element.
+ """
+ return 'multiple' in self.attrib
+
+ @multiple.setter
+ def multiple(self, value):
+ if value:
+ self.set('multiple', '')
+ elif 'multiple' in self.attrib:
+ del self.attrib['multiple']
+
+
+HtmlElementClassLookup._default_element_classes['select'] = SelectElement
+
+
+class MultipleSelectOptions(SetMixin):
+ """
+ Represents all the selected options in a ``<select multiple>`` element.
+
+ You can add to this set-like object to select an option, or remove
+ from it to unselect the option.
+ """
+
+ def __init__(self, select):
+ self.select = select
+
+ @property
+ def options(self):
+ """
+ Iterator of all the ``<option>`` elements.
+ """
+ return iter(_options_xpath(self.select))
+
+ def __iter__(self):
+ for option in self.options:
+ if 'selected' in option.attrib:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ yield opt_value
+
+ def add(self, item):
+ for option in self.options:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ if opt_value == item:
+ option.set('selected', '')
+ break
+ else:
+ raise ValueError(
+ "There is no option with the value %r" % item)
+
+ def remove(self, item):
+ for option in self.options:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ if opt_value == item:
+ if 'selected' in option.attrib:
+ del option.attrib['selected']
+ else:
+ raise ValueError(
+ "The option %r is not currently selected" % item)
+ break
+ else:
+ raise ValueError(
+ "There is not option with the value %r" % item)
+
+ def __repr__(self):
+ return '<%s {%s} for select name=%r>' % (
+ self.__class__.__name__,
+ ', '.join([repr(v) for v in self]),
+ self.select.name)
+
+
+class RadioGroup(list):
+ """
+ This object represents several ``<input type=radio>`` elements
+ that have the same name.
+
+ You can use this like a list, but also use the property
+ ``.value`` to check/uncheck inputs. Also you can use
+ ``.value_options`` to get the possible values.
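+
+ A sketch with illustrative markup::
+
+ >>> form = fromstring('<form><input type="radio" name="r" value="a">'
+ ... '<input type="radio" name="r" value="b" checked></form>')
+ >>> group = form.inputs['r']
+ >>> group.value
+ 'b'
+ >>> group.value = 'a'
+ >>> group.value
+ 'a'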
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value, which checks the radio with that value (and
+ unchecks any other value).
+ """
+ for el in self:
+ if 'checked' in el.attrib:
+ return el.get('value')
+ return None
+
+ @value.setter
+ def value(self, value):
+ checked_option = None
+ if value is not None:
+ for el in self:
+ if el.get('value') == value:
+ checked_option = el
+ break
+ else:
+ raise ValueError("There is no radio input with the value %r" % value)
+ for el in self:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+ if checked_option is not None:
+ checked_option.set('checked', '')
+
+ @value.deleter
+ def value(self):
+ self.value = None
+
+ @property
+ def value_options(self):
+ """
+ Returns a list of all the possible values.
+ """
+ return [el.get('value') for el in self]
+
+ def __repr__(self):
+ return '%s(%s)' % (
+ self.__class__.__name__,
+ list.__repr__(self))
+
+
+class CheckboxGroup(list):
+ """
+ Represents a group of checkboxes (``<input type=checkbox>``) that
+ have the same name.
+
+ In addition to using this like a list, the ``.value`` attribute
+ returns a set-like object that you can add to or remove from to
+ check and uncheck checkboxes. You can also use ``.value_options``
+ to get the possible values.
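+
+ A sketch with illustrative markup::
+
+ >>> form = fromstring('<form><input type="checkbox" name="t" value="a">'
+ ... '<input type="checkbox" name="t" value="b" checked></form>')
+ >>> group = form.inputs['t']
+ >>> list(group.value)
+ ['b']
+ >>> group.value = ['a', 'b']
+ >>> list(group.value)
+ ['a', 'b']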
+ """
+ @property
+ def value(self):
+ """
+ Return a set-like object that can be modified to check or
+ uncheck individual checkboxes according to their value.
+ """
+ return CheckboxValues(self)
+
+ @value.setter
+ def value(self, value):
+ values = self.value
+ values.clear()
+ if not hasattr(value, '__iter__'):
+ raise ValueError(
+ "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
+ % (self[0].name, value))
+ values.update(value)
+
+ @value.deleter
+ def value(self):
+ self.value.clear()
+
+ @property
+ def value_options(self):
+ """
+ Returns a list of all the possible values.
+ """
+ return [el.get('value') for el in self]
+
+ def __repr__(self):
+ return '%s(%s)' % (
+ self.__class__.__name__, list.__repr__(self))
+
+
+class CheckboxValues(SetMixin):
+ """
+ Represents the values of the checked checkboxes in a group of
+ checkboxes with the same name.
+ """
+
+ def __init__(self, group):
+ self.group = group
+
+ def __iter__(self):
+ return iter([
+ el.get('value')
+ for el in self.group
+ if 'checked' in el.attrib])
+
+ def add(self, value):
+ for el in self.group:
+ if el.get('value') == value:
+ el.set('checked', '')
+ break
+ else:
+ raise KeyError("No checkbox with value %r" % value)
+
+ def remove(self, value):
+ for el in self.group:
+ if el.get('value') == value:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+ else:
+ raise KeyError(
+ "The checkbox with value %r was already unchecked" % value)
+ break
+ else:
+ raise KeyError(
+ "No checkbox with value %r" % value)
+
+ def __repr__(self):
+ return '<%s {%s} for checkboxes name=%r>' % (
+ self.__class__.__name__,
+ ', '.join([repr(v) for v in self]),
+ self.group.name)
+
+
+class InputElement(InputMixin, HtmlElement):
+ """
+ Represents an ``<input>`` element.
+
+ You can get the type with ``.type`` (which is lower-cased and
+ defaults to ``'text'``).
+
+ Also you can get and set the value with ``.value``
+
+ Checkboxes and radios have the attribute ``input.checkable ==
+ True`` (for all others it is false) and a boolean attribute
+ ``.checked``.
+
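+ A sketch with an illustrative checkbox::
+
+ >>> i = fromstring('<input type="checkbox" name="opt">')
+ >>> i.checkable, i.checked
+ (True, False)
+ >>> i.checked = True
+ >>> i.value
+ 'on'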
+ """
+
+ ## FIXME: I'm a little uncomfortable with the use of .checked
+ @property
+ def value(self):
+ """
+ Get/set the value of this element, using the ``value`` attribute.
+
+ Also, if this is a checkbox and it has no value, this defaults
+ to ``'on'``. If it is a checkbox or radio that is not
+ checked, this returns None.
+ """
+ if self.checkable:
+ if self.checked:
+ return self.get('value') or 'on'
+ else:
+ return None
+ return self.get('value')
+
+ @value.setter
+ def value(self, value):
+ if self.checkable:
+ if not value:
+ self.checked = False
+ else:
+ self.checked = True
+ if isinstance(value, basestring):
+ self.set('value', value)
+ else:
+ self.set('value', value)
+
+ @value.deleter
+ def value(self):
+ if self.checkable:
+ self.checked = False
+ else:
+ if 'value' in self.attrib:
+ del self.attrib['value']
+
+ @property
+ def type(self):
+ """
+ Return the type of this element (using the type attribute).
+ """
+ return self.get('type', 'text').lower()
+
+ @type.setter
+ def type(self, value):
+ self.set('type', value)
+
+ @property
+ def checkable(self):
+ """
+ Boolean: can this element be checked?
+ """
+ return self.type in ('checkbox', 'radio')
+
+ @property
+ def checked(self):
+ """
+ Boolean attribute to get/set the presence of the ``checked``
+ attribute.
+
+ You can only use this on checkable input types.
+ """
+ if not self.checkable:
+ raise AttributeError('Not a checkable input type')
+ return 'checked' in self.attrib
+
+ @checked.setter
+ def checked(self, value):
+ if not self.checkable:
+ raise AttributeError('Not a checkable input type')
+ if value:
+ self.set('checked', '')
+ else:
+ attrib = self.attrib
+ if 'checked' in attrib:
+ del attrib['checked']
+
+
+HtmlElementClassLookup._default_element_classes['input'] = InputElement
+
+
+class LabelElement(HtmlElement):
+ """
+ Represents a ``<label>`` element.
+
+ Label elements are linked to other elements with their ``for``
+ attribute. You can access this element with ``label.for_element``.
+ """
+ @property
+ def for_element(self):
+ """
+ Get/set the element this label points to. Return None if it
+ can't be found.
+ """
+ id = self.get('for')
+ if not id:
+ return None
+ return self.body.get_element_by_id(id)
+
+ @for_element.setter
+ def for_element(self, other):
+ id = other.get('id')
+ if not id:
+ raise TypeError(
+ "Element %r has no id attribute" % other)
+ self.set('for', id)
+
+ @for_element.deleter
+ def for_element(self):
+ attrib = self.attrib
+ # removing the association means removing the 'for' attribute
+ if 'for' in attrib:
+ del attrib['for']
+
+
+HtmlElementClassLookup._default_element_classes['label'] = LabelElement
+
+
+############################################################
+## Serialization
+############################################################
+
+def html_to_xhtml(html):
+ """Convert all tags in an HTML tree to XHTML by moving them to the
+ XHTML namespace.
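+
+ A round-trip sketch::
+
+ >>> doc = document_fromstring('<p>x</p>')
+ >>> html_to_xhtml(doc)
+ >>> doc.tag
+ '{http://www.w3.org/1999/xhtml}html'
+ >>> xhtml_to_html(doc)
+ >>> doc.tag
+ 'html'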
+ """
+ try:
+ html = html.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ for el in html.iter(etree.Element):
+ tag = el.tag
+ if tag[0] != '{':
+ el.tag = prefix + tag
+
+
+def xhtml_to_html(xhtml):
+ """Convert all tags in an XHTML tree to HTML by removing their
+ XHTML namespace.
+ """
+ try:
+ xhtml = xhtml.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ prefix_len = len(prefix)
+ for el in xhtml.iter(prefix + "*"):
+ el.tag = el.tag[prefix_len:]
+
+
+# This isn't a general match, but it's a match for what libxml2
+# specifically serialises:
+__str_replace_meta_content_type = re.compile(
+ r'<meta http-equiv="Content-Type"[^>]*>').sub
+__bytes_replace_meta_content_type = re.compile(
+ r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
+
+
+def tostring(doc, pretty_print=False, include_meta_content_type=False,
+ encoding=None, method="html", with_tail=True, doctype=None):
+ """Return an HTML string representation of the document.
+
+ Note: if include_meta_content_type is true this will create a
+ ``<meta http-equiv="Content-Type" ...>`` tag in the head;
+ regardless of the value of include_meta_content_type any existing
+ ``<meta http-equiv="Content-Type" ...>`` tag will be removed
+
+ The ``encoding`` argument controls the output encoding (defaults to
+ ASCII, with &#...; character references for any characters outside
+ of ASCII). Note that you can pass the name ``'unicode'`` as
+ ``encoding`` argument to serialise to a Unicode string.
+
+ The ``method`` argument defines the output method. It defaults to
+ 'html', but can also be 'xml' for xhtml output, or 'text' to
+ serialise to plain text without markup.
+
+ To leave out the tail text of the top-level element that is being
+ serialised, pass ``with_tail=False``.
+
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in non
+ well-formed content here will make the XML output non well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
+ Example::
+
+ >>> from lxml import html
+ >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')
+
+ >>> html.tostring(root)
+ b'<p>Hello<br>world!</p>'
+ >>> html.tostring(root, method='html')
+ b'<p>Hello<br>world!</p>'
+
+ >>> html.tostring(root, method='xml')
+ b'<p>Hello<br/>world!</p>'
+
+ >>> html.tostring(root, method='text')
+ b'Helloworld!'
+
+ >>> html.tostring(root, method='text', encoding='unicode')
+ u'Helloworld!'
+
+ >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
+ >>> html.tostring(root[0], method='text', encoding='unicode')
+ u'Helloworld!TAIL'
+
+ >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
+ u'Helloworld!'
+
+ >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
+ >>> html.tostring(doc, method='html', encoding='unicode')
+ u'<html><body><p>Hello<br>world!</p></body></html>'
+
+ >>> print(html.tostring(doc, method='html', encoding='unicode',
+ ... doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
+ ... ' "http://www.w3.org/TR/html4/strict.dtd">'))
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ <html><body><p>Hello<br>world!</p></body></html>
+ """
+ html = etree.tostring(doc, method=method, pretty_print=pretty_print,
+ encoding=encoding, with_tail=with_tail,
+ doctype=doctype)
+ if method == 'html' and not include_meta_content_type:
+ if isinstance(html, str):
+ html = __str_replace_meta_content_type('', html)
+ else:
+ html = __bytes_replace_meta_content_type(bytes(), html)
+ return html
+
+
+tostring.__doc__ = __fix_docstring(tostring.__doc__)
+
+
+def open_in_browser(doc, encoding=None):
+ """
+ Open the HTML document in a web browser by first saving it to a
+ temporary file. Note that the file is not deleted after use.
+ This is mainly meant for debugging.
+ """
+ import os
+ import webbrowser
+ import tempfile
+ if not isinstance(doc, etree._ElementTree):
+ doc = etree.ElementTree(doc)
+ handle, fn = tempfile.mkstemp(suffix='.html')
+ f = os.fdopen(handle, 'wb')
+ try:
+ doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
+ finally:
+ # we leak the file itself here, but we should at least close it
+ f.close()
+ url = 'file://' + fn.replace(os.path.sep, '/')
+ print(url)
+ webbrowser.open(url)
+
+
+################################################################################
+# configure Element class lookup
+################################################################################
+
+class HTMLParser(etree.HTMLParser):
+ """An HTML parser that is configured to return lxml.html Element
+ objects.
+ """
+ def __init__(self, **kwargs):
+ super(HTMLParser, self).__init__(**kwargs)
+ self.set_element_class_lookup(HtmlElementClassLookup())
+
+
+class XHTMLParser(etree.XMLParser):
+ """An XML parser that is configured to return lxml.html Element
+ objects.
+
+ Note that this parser is not really XHTML aware unless you let it
+ load a DTD that declares the HTML entities. To do this, make sure
+ you have the XHTML DTDs installed in your catalogs, and create the
+ parser like this::
+
+ >>> parser = XHTMLParser(load_dtd=True)
+
+ If you additionally want to validate the document, use this::
+
+ >>> parser = XHTMLParser(dtd_validation=True)
+
+ For catalog support, see http://www.xmlsoft.org/catalog.html.
+ """
+ def __init__(self, **kwargs):
+ super(XHTMLParser, self).__init__(**kwargs)
+ self.set_element_class_lookup(HtmlElementClassLookup())
+
+
+def Element(*args, **kw):
+ """Create a new HTML Element.
+
+ This can also be used for XHTML documents.
+ """
+ v = html_parser.makeelement(*args, **kw)
+ return v
+
+
+html_parser = HTMLParser()
+xhtml_parser = XHTMLParser()
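+
+# A short usage sketch (assumed behaviour of the standard lxml
+# ``makeelement`` API; the attribute dict is optional):
+#
+#     >>> el = Element('div', {'class': 'container'})
+#     >>> el.tag, el.get('class')
+#     ('div', 'container')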
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc
new file mode 100644
index 0000000..a378207
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..4bc5785
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc
new file mode 100644
index 0000000..fa25497
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc
new file mode 100644
index 0000000..b243408
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc
new file mode 100644
index 0000000..a2de006
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc
new file mode 100644
index 0000000..b915259
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc
new file mode 100644
index 0000000..c343b40
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc
new file mode 100644
index 0000000..8dc2d4b
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc
new file mode 100644
index 0000000..c029ed9
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc
new file mode 100644
index 0000000..049161a
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc
new file mode 100644
index 0000000..6208e67
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc
new file mode 100644
index 0000000..3293704
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc
new file mode 100644
index 0000000..d76e7dd
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py
new file mode 100644
index 0000000..e0502c0
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+
+import optparse
+import sys
+import re
+import os
+from .diff import htmldiff
+
+description = """\
+"""
+
+parser = optparse.OptionParser(
+ usage="%prog [OPTIONS] FILE1 FILE2\n"
+ "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
+ description=description,
+ )
+
+parser.add_option(
+ '-o', '--output',
+ metavar="FILE",
+ dest="output",
+ default="-",
+ help="File to write the difference to",
+ )
+
+parser.add_option(
+ '-a', '--annotation',
+ action="store_true",
+ dest="annotation",
+ help="Do an annotation")
+
+def main(args=None):
+ if args is None:
+ args = sys.argv[1:]
+ options, args = parser.parse_args(args)
+ if options.annotation:
+ return annotate(options, args)
+ if len(args) != 2:
+ print('Error: you must give two files')
+ parser.print_help()
+ sys.exit(1)
+ file1, file2 = args
+ input1 = read_file(file1)
+ input2 = read_file(file2)
+ body1 = split_body(input1)[1]
+ pre, body2, post = split_body(input2)
+ result = htmldiff(body1, body2)
+ result = pre + result + post
+ if options.output == '-':
+ if not result.endswith('\n'):
+ result += '\n'
+ sys.stdout.write(result)
+ else:
+        # htmldiff returns text, so write the result in text mode
+        with open(options.output, 'w') as f:
+            f.write(result)
+
+def read_file(filename):
+ if filename == '-':
+ c = sys.stdin.read()
+ elif not os.path.exists(filename):
+ raise OSError(
+ "Input file %s does not exist" % filename)
+ else:
+        # read as text so the body regexes (str patterns) can search it
+        with open(filename, 'r') as f:
+            c = f.read()
+ return c
+
+body_start_re = re.compile(
+ r"<body.*?>", re.I|re.S)
+body_end_re = re.compile(
+ r"</body.*?>", re.I|re.S)
+
+def split_body(html):
+ pre = post = ''
+ match = body_start_re.search(html)
+ if match:
+ pre = html[:match.end()]
+ html = html[match.end():]
+ match = body_end_re.search(html)
+ if match:
+ post = html[match.start():]
+ html = html[:match.start()]
+ return pre, html, post
+
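+# For illustration, a sketch of the expected behaviour: everything up to
+# and including the opening <body> tag lands in ``pre``, everything from
+# </body> onwards in ``post``:
+#
+#     >>> split_body('<html><body class="x">Hi</body></html>')
+#     ('<html><body class="x">', 'Hi', '</body></html>')
+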
+def annotate(options, args):
+ print("Not yet implemented")
+ sys.exit(1)
+
diff --git a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py
new file mode 100644
index 0000000..3405c20
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py
@@ -0,0 +1,100 @@
+"""
+Legacy module - don't use in new code!
+
+html5lib now has its own proper implementation.
+
+This module implements a tree builder for html5lib that generates lxml
+html element trees. This module uses camelCase as it follows the
+html5lib style guide.
+"""
+
+from html5lib.treebuilders import _base, etree as etree_builders
+from lxml import html, etree
+
+
+class DocumentType(object):
+
+ def __init__(self, name, publicId, systemId):
+ self.name = name
+ self.publicId = publicId
+ self.systemId = systemId
+
+class Document(object):
+
+ def __init__(self):
+ self._elementTree = None
+ self.childNodes = []
+
+ def appendChild(self, element):
+ self._elementTree.getroot().addnext(element._element)
+
+
+class TreeBuilder(_base.TreeBuilder):
+ documentClass = Document
+ doctypeClass = DocumentType
+ elementClass = None
+ commentClass = None
+ fragmentClass = Document
+
+ def __init__(self, *args, **kwargs):
+ html_builder = etree_builders.getETreeModule(html, fullTree=False)
+ etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
+ self.elementClass = html_builder.Element
+ self.commentClass = etree_builder.Comment
+ _base.TreeBuilder.__init__(self, *args, **kwargs)
+
+ def reset(self):
+ _base.TreeBuilder.reset(self)
+ self.rootInserted = False
+ self.initialComments = []
+ self.doctype = None
+
+ def getDocument(self):
+ return self.document._elementTree
+
+ def getFragment(self):
+ fragment = []
+ element = self.openElements[0]._element
+ if element.text:
+ fragment.append(element.text)
+ fragment.extend(element.getchildren())
+ if element.tail:
+ fragment.append(element.tail)
+ return fragment
+
+ def insertDoctype(self, name, publicId, systemId):
+ doctype = self.doctypeClass(name, publicId, systemId)
+ self.doctype = doctype
+
+ def insertComment(self, data, parent=None):
+ if not self.rootInserted:
+ self.initialComments.append(data)
+ else:
+ _base.TreeBuilder.insertComment(self, data, parent)
+
+ def insertRoot(self, name):
+ buf = []
+ if self.doctype and self.doctype.name:
+ buf.append('<!DOCTYPE %s' % self.doctype.name)
+ if self.doctype.publicId is not None or self.doctype.systemId is not None:
+ buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
+ self.doctype.systemId))
+ buf.append('>')
+ buf.append('<html></html>')
+ root = html.fromstring(''.join(buf))
+
+ # Append the initial comments:
+ for comment in self.initialComments:
+ root.addprevious(etree.Comment(comment))
+
+ # Create the root document and add the ElementTree to it
+ self.document = self.documentClass()
+ self.document._elementTree = root.getroottree()
+
+ # Add the root element to the internal child/open data structures
+ root_element = self.elementClass(name)
+ root_element._element = root
+ self.document.childNodes.append(root_element)
+ self.openElements.append(root_element)
+
+ self.rootInserted = True
diff --git a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py
new file mode 100644
index 0000000..c99738e
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py
@@ -0,0 +1,56 @@
+try:
+ from collections.abc import MutableSet
+except ImportError:
+ from collections import MutableSet
+
+
+class SetMixin(MutableSet):
+
+ """
+    Mix-in for sets. You must define ``__iter__``, ``add`` and ``remove``.
+ """
+
+ def __len__(self):
+ length = 0
+ for item in self:
+ length += 1
+ return length
+
+ def __contains__(self, item):
+ for has_item in self:
+ if item == has_item:
+ return True
+ return False
+
+ issubset = MutableSet.__le__
+ issuperset = MutableSet.__ge__
+
+ union = MutableSet.__or__
+ intersection = MutableSet.__and__
+ difference = MutableSet.__sub__
+ symmetric_difference = MutableSet.__xor__
+
+ def copy(self):
+ return set(self)
+
+ def update(self, other):
+ self |= other
+
+ def intersection_update(self, other):
+ self &= other
+
+ def difference_update(self, other):
+ self -= other
+
+ def symmetric_difference_update(self, other):
+ self ^= other
+
+ def discard(self, item):
+ try:
+ self.remove(item)
+ except KeyError:
+ pass
+
+ @classmethod
+ def _from_iterable(cls, it):
+ return set(it)
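+
+
+# A minimal sketch of a concrete subclass (hypothetical, for illustration):
+# only ``__iter__``, ``add`` and ``remove`` need to be supplied, and the
+# mixin derives the rest of the set API.
+#
+#     class ListBackedSet(SetMixin):
+#         def __init__(self):
+#             self._items = []
+#         def __iter__(self):
+#             return iter(self._items)
+#         def add(self, item):
+#             if item not in self._items:
+#                 self._items.append(item)
+#         def remove(self, item):
+#             try:
+#                 self._items.remove(item)
+#             except ValueError:
+#                 # discard() expects a KeyError, as with real sets
+#                 raise KeyError(item)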
diff --git a/env/lib/python3.10/site-packages/lxml/html/builder.py b/env/lib/python3.10/site-packages/lxml/html/builder.py
new file mode 100644
index 0000000..8a074ec
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/builder.py
@@ -0,0 +1,133 @@
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+# Copyright (c) 1999-2004 by Fredrik Lundh
+# --------------------------------------------------------------------
+
+"""
+A set of HTML generator tags for building HTML documents.
+
+Usage::
+
+ >>> from lxml.html.builder import *
+ >>> html = HTML(
+ ... HEAD( TITLE("Hello World") ),
+ ... BODY( CLASS("main"),
+ ... H1("Hello World !")
+ ... )
+ ... )
+
+ >>> import lxml.etree
+    >>> print(lxml.etree.tostring(html, pretty_print=True).decode())
+ <html>
+ <head>
+ <title>Hello World</title>
+ </head>
+ <body class="main">
+ <h1>Hello World !</h1>
+ </body>
+ </html>
+
+"""
+
+from lxml.builder import ElementMaker
+from lxml.html import html_parser
+
+E = ElementMaker(makeelement=html_parser.makeelement)
+
+# elements
+A = E.a #: anchor
+ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
+ACRONYM = E.acronym #: acronym
+ADDRESS = E.address #: information on author
+APPLET = E.applet #: Java applet (DEPRECATED)
+AREA = E.area #: client-side image map area
+B = E.b #: bold text style
+BASE = E.base #: document base URI
+BASEFONT = E.basefont #: base font size (DEPRECATED)
+BDO = E.bdo #: I18N BiDi over-ride
+BIG = E.big #: large text style
+BLOCKQUOTE = E.blockquote #: long quotation
+BODY = E.body #: document body
+BR = E.br #: forced line break
+BUTTON = E.button #: push button
+CAPTION = E.caption #: table caption
+CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
+CITE = E.cite #: citation
+CODE = E.code #: computer code fragment
+COL = E.col #: table column
+COLGROUP = E.colgroup #: table column group
+DD = E.dd #: definition description
+DEL = getattr(E, 'del') #: deleted text
+DFN = E.dfn #: instance definition
+DIR = E.dir #: directory list (DEPRECATED)
+DIV = E.div #: generic language/style container
+DL = E.dl #: definition list
+DT = E.dt #: definition term
+EM = E.em #: emphasis
+FIELDSET = E.fieldset #: form control group
+FONT = E.font #: local change to font (DEPRECATED)
+FORM = E.form #: interactive form
+FRAME = E.frame #: subwindow
+FRAMESET = E.frameset #: window subdivision
+H1 = E.h1 #: heading
+H2 = E.h2 #: heading
+H3 = E.h3 #: heading
+H4 = E.h4 #: heading
+H5 = E.h5 #: heading
+H6 = E.h6 #: heading
+HEAD = E.head #: document head
+HR = E.hr #: horizontal rule
+HTML = E.html #: document root element
+I = E.i #: italic text style
+IFRAME = E.iframe #: inline subwindow
+IMG = E.img #: Embedded image
+INPUT = E.input #: form control
+INS = E.ins #: inserted text
+ISINDEX = E.isindex #: single line prompt (DEPRECATED)
+KBD = E.kbd #: text to be entered by the user
+LABEL = E.label #: form field label text
+LEGEND = E.legend #: fieldset legend
+LI = E.li #: list item
+LINK = E.link #: a media-independent link
+MAP = E.map #: client-side image map
+MENU = E.menu #: menu list (DEPRECATED)
+META = E.meta #: generic metainformation
+NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
+NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
+OBJECT = E.object #: generic embedded object
+OL = E.ol #: ordered list
+OPTGROUP = E.optgroup #: option group
+OPTION = E.option #: selectable choice
+P = E.p #: paragraph
+PARAM = E.param #: named property value
+PRE = E.pre #: preformatted text
+Q = E.q #: short inline quotation
+S = E.s #: strike-through text style (DEPRECATED)
+SAMP = E.samp #: sample program output, scripts, etc.
+SCRIPT = E.script #: script statements
+SELECT = E.select #: option selector
+SMALL = E.small #: small text style
+SPAN = E.span #: generic language/style container
+STRIKE = E.strike #: strike-through text (DEPRECATED)
+STRONG = E.strong #: strong emphasis
+STYLE = E.style #: style info
+SUB = E.sub #: subscript
+SUP = E.sup #: superscript
+TABLE = E.table #: table
+TBODY = E.tbody #: table body
+TD = E.td #: table data cell
+TEXTAREA = E.textarea #: multi-line text field
+TFOOT = E.tfoot #: table footer
+TH = E.th #: table header cell
+THEAD = E.thead #: table header
+TITLE = E.title #: document title
+TR = E.tr #: table row
+TT = E.tt #: teletype or monospaced text style
+U = E.u #: underlined text style (DEPRECATED)
+UL = E.ul #: unordered list
+VAR = E.var #: instance of a variable or program argument
+
+# attributes (only reserved words are included here)
+ATTR = dict
+def CLASS(v): return {'class': v}
+def FOR(v): return {'for': v}
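+
+# For example (a sketch): CLASS and FOR exist because ``class`` and ``for``
+# are Python keywords and cannot be passed as keyword arguments.
+#
+#     >>> from lxml import etree
+#     >>> etree.tostring(LABEL(CLASS('hint'), FOR('name'), 'Name'))
+#     b'<label class="hint" for="name">Name</label>'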
diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so
new file mode 100755
index 0000000..31087ea
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py
new file mode 100644
index 0000000..e6b0543
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/clean.py
@@ -0,0 +1,786 @@
+# cython: language_level=3str
+
+"""A cleanup tool for HTML.
+
+Removes unwanted tags and content. See the `Cleaner` class for
+details.
+"""
+
+from __future__ import absolute_import
+
+import copy
+import re
+import sys
+try:
+ from urlparse import urlsplit
+ from urllib import unquote_plus
+except ImportError:
+ # Python 3
+ from urllib.parse import urlsplit, unquote_plus
+from lxml import etree
+from lxml.html import defs
+from lxml.html import fromstring, XHTML_NAMESPACE
+from lxml.html import xhtml_to_html, _transform_result
+
+try:
+ unichr
+except NameError:
+ # Python 3
+ unichr = chr
+try:
+ unicode
+except NameError:
+ # Python 3
+ unicode = str
+try:
+ basestring
+except NameError:
+ basestring = (str, bytes)
+
+
+__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
+ 'word_break', 'word_break_html']
+
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# Particularly the CSS cleaning; most of the tag cleaning is integrated now
+# I have multiple kinds of schemes searched; but should schemes be
+# whitelisted instead?
+# max height?
+# remove images? Also in CSS? background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+# allow *just* embedded YouTube movies)
+# Log what was deleted and why?
+# style="behavior: ..." might be bad in IE?
+# Should we have something for just <meta http-equiv>? That's the worst of the
+# metas.
+# UTF-7 detections? Example:
+# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
+# you don't always have to have the charset set, if the page has no charset
+# and there's UTF7-like code in it.
+# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
+
+
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_replace_css_javascript = re.compile(
+ r'expression\s*\(.*?\)', re.S|re.I).sub
+
+# Do I have to worry about @\nimport?
+_replace_css_import = re.compile(
+ r'@\s*import', re.I).sub
+
+_looks_like_tag_content = re.compile(
+ r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
+ *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
+
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_find_image_dataurls = re.compile(
+ r'data:image/(.+);base64,', re.I).findall
+_possibly_malicious_schemes = re.compile(
+ r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
+ re.I).findall
+# SVG images can contain script content
+_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
+
+def _has_javascript_scheme(s):
+ safe_image_urls = 0
+ for image_type in _find_image_dataurls(s):
+ if _is_unsafe_image_type(image_type):
+ return True
+ safe_image_urls += 1
+ return len(_possibly_malicious_schemes(s)) > safe_image_urls
+
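+# Illustrative behaviour (a sketch; the exact regexes above decide):
+# a javascript: URL and an SVG/XML data URL count as unsafe, while a
+# plain base64 PNG data URL does not.
+#
+#     >>> _has_javascript_scheme('javascript:alert(1)')
+#     True
+#     >>> _has_javascript_scheme('data:image/svg+xml;base64,PHN2Zz4=')
+#     True
+#     >>> _has_javascript_scheme('data:image/png;base64,iVBORw0KGgo=')
+#     False
+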
+_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
+
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
+_conditional_comment_re = re.compile(
+ r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+
+_find_styled_elements = etree.XPath(
+ "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(
+ ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+ "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+ namespaces={'x':XHTML_NAMESPACE})
+
+
+class Cleaner(object):
+ """
+ Instances cleans the document of each of the possible offending
+ elements. The cleaning is controlled by attributes; you can
+ override attributes in a subclass, or set them in the constructor.
+
+ ``scripts``:
+ Removes any ``<script>`` tags.
+
+ ``javascript``:
+ Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
+ as they could contain Javascript.
+
+ ``comments``:
+ Removes any comments.
+
+ ``style``:
+ Removes any style tags.
+
+ ``inline_style``
+ Removes any style attributes. Defaults to the value of the ``style`` option.
+
+ ``links``:
+ Removes any ``<link>`` tags
+
+ ``meta``:
+ Removes any ``<meta>`` tags
+
+ ``page_structure``:
+        Removes the structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
+
+ ``processing_instructions``:
+ Removes any processing instructions.
+
+ ``embedded``:
+ Removes any embedded objects (flash, iframes)
+
+ ``frames``:
+ Removes any frame-related tags
+
+ ``forms``:
+ Removes any form tags
+
+ ``annoying_tags``:
+        Removes tags that aren't *wrong*, but are annoying: ``<blink>`` and ``<marquee>``.
+
+ ``remove_tags``:
+ A list of tags to remove. Only the tags will be removed,
+ their content will get pulled up into the parent tag.
+
+ ``kill_tags``:
+ A list of tags to kill. Killing also removes the tag's content,
+ i.e. the whole subtree, not just the tag itself.
+
+ ``allow_tags``:
+        A list of tags to include (default: include all).
+
+ ``remove_unknown_tags``:
+ Remove any tags that aren't standard parts of HTML.
+
+ ``safe_attrs_only``:
+ If true, only include 'safe' attributes (specifically the list
+ from the feedparser HTML sanitisation web site).
+
+ ``safe_attrs``:
+ A set of attribute names to override the default list of attributes
+ considered 'safe' (when safe_attrs_only=True).
+
+ ``add_nofollow``:
+ If true, then any <a> tags will have ``rel="nofollow"`` added to them.
+
+ ``host_whitelist``:
+ A list or set of hosts that you can use for embedded content
+ (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
+ You can also implement/override the method
+ ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
+ implement more complex rules for what can be embedded.
+ Anything that passes this test will be shown, regardless of
+ the value of (for instance) ``embedded``.
+
+ Note that this parameter might not work as intended if you do not
+ make the links absolute before doing the cleaning.
+
+ Note that you may also need to set ``whitelist_tags``.
+
+ ``whitelist_tags``:
+ A set of tags that can be included with ``host_whitelist``.
+ The default is ``iframe`` and ``embed``; you may wish to
+ include other tags like ``script``, or you may want to
+ implement ``allow_embedded_url`` for more control. Set to None to
+ include all tags.
+
+ This modifies the document *in place*.
+ """
+
+ scripts = True
+ javascript = True
+ comments = True
+ style = False
+ inline_style = None
+ links = True
+ meta = True
+ page_structure = True
+ processing_instructions = True
+ embedded = True
+ frames = True
+ forms = True
+ annoying_tags = True
+ remove_tags = None
+ allow_tags = None
+ kill_tags = None
+ remove_unknown_tags = True
+ safe_attrs_only = True
+ safe_attrs = defs.safe_attrs
+ add_nofollow = False
+ host_whitelist = ()
+ whitelist_tags = {'iframe', 'embed'}
+
+ def __init__(self, **kw):
+ not_an_attribute = object()
+ for name, value in kw.items():
+ default = getattr(self, name, not_an_attribute)
+ if (default is not None and default is not True and default is not False
+ and not isinstance(default, (frozenset, set, tuple, list))):
+ raise TypeError(
+ "Unknown parameter: %s=%r" % (name, value))
+ setattr(self, name, value)
+ if self.inline_style is None and 'inline_style' not in kw:
+ self.inline_style = self.style
+
+ if kw.get("allow_tags"):
+ if kw.get("remove_unknown_tags"):
+ raise ValueError("It does not make sense to pass in both "
+ "allow_tags and remove_unknown_tags")
+ self.remove_unknown_tags = False
+
+ # Used to lookup the primary URL for a given tag that is up for
+ # removal:
+ _tag_link_attrs = dict(
+ script='src',
+ link='href',
+ # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
+ # From what I can tell, both attributes can contain a link:
+ applet=['code', 'object'],
+ iframe='src',
+ embed='src',
+ layer='src',
+ # FIXME: there doesn't really seem like a general way to figure out what
+ # links an <object> tag uses; links often go in <param> tags with values
+ # that we don't really know. You'd have to have knowledge about specific
+ # kinds of plugins (probably keyed off classid), and match against those.
+ ##object=?,
+ # FIXME: not looking at the action currently, because it is more complex
+    # than that -- if you keep the form, you should keep the form controls.
+ ##form='action',
+ a='href',
+ )
+
+ def __call__(self, doc):
+ """
+ Cleans the document.
+ """
+ try:
+ getroot = doc.getroot
+ except AttributeError:
+ pass # Element instance
+ else:
+ doc = getroot() # ElementTree instance, instead of an element
+ # convert XHTML to HTML
+ xhtml_to_html(doc)
+ # Normalize a case that IE treats <image> like <img>, and that
+ # can confuse either this step or later steps.
+ for el in doc.iter('image'):
+ el.tag = 'img'
+ if not self.comments:
+ # Of course, if we were going to kill comments anyway, we don't
+ # need to worry about this
+ self.kill_conditional_comments(doc)
+
+ kill_tags = set(self.kill_tags or ())
+ remove_tags = set(self.remove_tags or ())
+ allow_tags = set(self.allow_tags or ())
+
+ if self.scripts:
+ kill_tags.add('script')
+ if self.safe_attrs_only:
+ safe_attrs = set(self.safe_attrs)
+ for el in doc.iter(etree.Element):
+ attrib = el.attrib
+ for aname in attrib.keys():
+ if aname not in safe_attrs:
+ del attrib[aname]
+ if self.javascript:
+ if not (self.safe_attrs_only and
+ self.safe_attrs == defs.safe_attrs):
+ # safe_attrs handles events attributes itself
+ for el in doc.iter(etree.Element):
+ attrib = el.attrib
+ for aname in attrib.keys():
+ if aname.startswith('on'):
+ del attrib[aname]
+ doc.rewrite_links(self._remove_javascript_link,
+ resolve_base_href=False)
+ # If we're deleting style then we don't have to remove JS links
+ # from styles, otherwise...
+ if not self.inline_style:
+ for el in _find_styled_elements(doc):
+ old = el.get('style')
+ new = _replace_css_javascript('', old)
+ new = _replace_css_import('', new)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ del el.attrib['style']
+ elif new != old:
+ el.set('style', new)
+ if not self.style:
+ for el in list(doc.iter('style')):
+ if el.get('type', '').lower().strip() == 'text/javascript':
+ el.drop_tree()
+ continue
+ old = el.text or ''
+ new = _replace_css_javascript('', old)
+ # The imported CSS can do anything; we just can't allow:
+ new = _replace_css_import('', new)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ el.text = '/* deleted */'
+ elif new != old:
+ el.text = new
+ if self.comments:
+ kill_tags.add(etree.Comment)
+ if self.processing_instructions:
+ kill_tags.add(etree.ProcessingInstruction)
+ if self.style:
+ kill_tags.add('style')
+ if self.inline_style:
+ etree.strip_attributes(doc, 'style')
+ if self.links:
+ kill_tags.add('link')
+ elif self.style or self.javascript:
+ # We must get rid of included stylesheets if Javascript is not
+ # allowed, as you can put Javascript in them
+ for el in list(doc.iter('link')):
+ if 'stylesheet' in el.get('rel', '').lower():
+ # Note this kills alternate stylesheets as well
+ if not self.allow_element(el):
+ el.drop_tree()
+ if self.meta:
+ kill_tags.add('meta')
+ if self.page_structure:
+ remove_tags.update(('head', 'html', 'title'))
+ if self.embedded:
+ # FIXME: is <layer> really embedded?
+ # We should get rid of any <param> tags not inside <applet>;
+ # These are not really valid anyway.
+ for el in list(doc.iter('param')):
+ parent = el.getparent()
+ while parent is not None and parent.tag not in ('applet', 'object'):
+ parent = parent.getparent()
+ if parent is None:
+ el.drop_tree()
+ kill_tags.update(('applet',))
+ # The alternate contents that are in an iframe are a good fallback:
+ remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
+ if self.frames:
+ # FIXME: ideally we should look at the frame links, but
+ # generally frames don't mix properly with an HTML
+ # fragment anyway.
+ kill_tags.update(defs.frame_tags)
+ if self.forms:
+ remove_tags.add('form')
+ kill_tags.update(('button', 'input', 'select', 'textarea'))
+ if self.annoying_tags:
+ remove_tags.update(('blink', 'marquee'))
+
+ _remove = []
+ _kill = []
+ for el in doc.iter():
+ if el.tag in kill_tags:
+ if self.allow_element(el):
+ continue
+ _kill.append(el)
+ elif el.tag in remove_tags:
+ if self.allow_element(el):
+ continue
+ _remove.append(el)
+
+ if _remove and _remove[0] == doc:
+ # We have to drop the parent-most tag, which we can't
+ # do. Instead we'll rewrite it:
+ el = _remove.pop(0)
+ el.tag = 'div'
+ el.attrib.clear()
+ elif _kill and _kill[0] == doc:
+ # We have to drop the parent-most element, which we can't
+ # do. Instead we'll clear it:
+ el = _kill.pop(0)
+ if el.tag != 'html':
+ el.tag = 'div'
+ el.clear()
+
+ _kill.reverse() # start with innermost tags
+ for el in _kill:
+ el.drop_tree()
+ for el in _remove:
+ el.drop_tag()
+
+ if self.remove_unknown_tags:
+ if allow_tags:
+ raise ValueError(
+ "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+ allow_tags = set(defs.tags)
+ if allow_tags:
+ # make sure we do not remove comments/PIs if users want them (which is rare enough)
+ if not self.comments:
+ allow_tags.add(etree.Comment)
+ if not self.processing_instructions:
+ allow_tags.add(etree.ProcessingInstruction)
+
+ bad = []
+ for el in doc.iter():
+ if el.tag not in allow_tags:
+ bad.append(el)
+ if bad:
+ if bad[0] is doc:
+ el = bad.pop(0)
+ el.tag = 'div'
+ el.attrib.clear()
+ for el in bad:
+ el.drop_tag()
+ if self.add_nofollow:
+ for el in _find_external_links(doc):
+ if not self.allow_follow(el):
+ rel = el.get('rel')
+ if rel:
+ if ('nofollow' in rel
+ and ' nofollow ' in (' %s ' % rel)):
+ continue
+ rel = '%s nofollow' % rel
+ else:
+ rel = 'nofollow'
+ el.set('rel', rel)
+
+ def allow_follow(self, anchor):
+ """
+ Override to suppress rel="nofollow" on some anchors.
+ """
+ return False
+
+ def allow_element(self, el):
+ """
+ Decide whether an element is configured to be accepted or rejected.
+
+ :param el: an element.
+ :return: true to accept the element or false to reject/discard it.
+ """
+ if el.tag not in self._tag_link_attrs:
+ return False
+ attr = self._tag_link_attrs[el.tag]
+ if isinstance(attr, (list, tuple)):
+ for one_attr in attr:
+ url = el.get(one_attr)
+ if not url:
+ return False
+ if not self.allow_embedded_url(el, url):
+ return False
+ return True
+ else:
+ url = el.get(attr)
+ if not url:
+ return False
+ return self.allow_embedded_url(el, url)
+
+ def allow_embedded_url(self, el, url):
+ """
+ Decide whether a URL that was found in an element's attributes or text
+        is configured to be accepted or rejected.
+
+ :param el: an element.
+ :param url: a URL found on the element.
+ :return: true to accept the URL and false to reject it.
+ """
+ if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
+ return False
+ scheme, netloc, path, query, fragment = urlsplit(url)
+ netloc = netloc.lower().split(':', 1)[0]
+ if scheme not in ('http', 'https'):
+ return False
+ if netloc in self.host_whitelist:
+ return True
+ return False
+
+ def kill_conditional_comments(self, doc):
+ """
+ IE conditional comments basically embed HTML that the parser
+ doesn't normally see. We can't allow anything like that, so
+ we'll kill any comments that could be conditional.
+ """
+ has_conditional_comment = _conditional_comment_re.search
+ self._kill_elements(
+ doc, lambda el: has_conditional_comment(el.text),
+ etree.Comment)
+
+ def _kill_elements(self, doc, condition, iterate=None):
+ bad = []
+ for el in doc.iter(iterate):
+ if condition(el):
+ bad.append(el)
+ for el in bad:
+ el.drop_tree()
+
+ def _remove_javascript_link(self, link):
+ # links like "j a v a s c r i p t:" might be interpreted in IE
+ new = _substitute_whitespace('', unquote_plus(link))
+ if _has_javascript_scheme(new):
+ # FIXME: should this be None to delete?
+ return ''
+ return link
+
+ _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
+
+ def _has_sneaky_javascript(self, style):
+ """
+ Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+ can get interpreted, or ``expre/* stuff */ssion(...)``. This
+        checks for attempts to do stuff like this.
+
+ Typically the response will be to kill the entire style; if you
+ have just a bit of Javascript in the style another rule will catch
+ that and remove only the Javascript from the style; this catches
+ more sneaky attempts.
+ """
+ style = self._substitute_comments('', style)
+ style = style.replace('\\', '')
+ style = _substitute_whitespace('', style)
+ style = style.lower()
+ if _has_javascript_scheme(style):
+ return True
+ if 'expression(' in style:
+ return True
+ if '@import' in style:
+ return True
+ if '</noscript' in style:
+ # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ return True
+ if _looks_like_tag_content(style):
+ # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
+ return True
+ return False
+
+ def clean_html(self, html):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ self(doc)
+ return _transform_result(result_type, doc)
+
+clean = Cleaner()
+clean_html = clean.clean_html
+
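+# Typical usage (a sketch; the exact serialisation can vary):
+#
+#     >>> clean_html('<p onclick="evil()">text<script>bad()</script></p>')
+#     '<p>text</p>'
+#
+# or with customised options:
+#
+#     >>> Cleaner(style=True).clean_html('<p style="color:red">text</p>')
+#     '<p>text</p>'
+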
+############################################################
+## Autolinking
+############################################################
+
+_link_regexes = [
+ re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
+    # This is conservative, but autolinking can afford to be conservative:
+ re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
+ ]
+
+_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
+
+_avoid_hosts = [
+ re.compile(r'^localhost', re.I),
+ re.compile(r'\bexample\.(?:com|org|net)$', re.I),
+ re.compile(r'^127\.0\.0\.1$'),
+ ]
+
+_avoid_classes = ['nolink']
+
+def autolink(el, link_regexes=_link_regexes,
+ avoid_elements=_avoid_elements,
+ avoid_hosts=_avoid_hosts,
+ avoid_classes=_avoid_classes):
+ """
+ Turn any URLs into links.
+
+ It will search for links identified by the given regular
+ expressions (by default mailto and http(s) links).
+
+ It won't link text in an element in avoid_elements, or an element
+ with a class in avoid_classes. It won't link to anything with a
+    host that matches one of the regular expressions in avoid_hosts
+    (by default localhost, 127.0.0.1 and example.com/.org/.net).
+
+ If you pass in an element, the element's tail will not be
+ substituted, only the contents of the element.
+ """
+ if el.tag in avoid_elements:
+ return
+ class_name = el.get('class')
+ if class_name:
+ class_name = class_name.split()
+ for match_class in avoid_classes:
+ if match_class in class_name:
+ return
+ for child in list(el):
+ autolink(child, link_regexes=link_regexes,
+ avoid_elements=avoid_elements,
+ avoid_hosts=avoid_hosts,
+ avoid_classes=avoid_classes)
+ if child.tail:
+ text, tail_children = _link_text(
+ child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
+ if tail_children:
+ child.tail = text
+ index = el.index(child)
+ el[index+1:index+1] = tail_children
+ if el.text:
+ text, pre_children = _link_text(
+ el.text, link_regexes, avoid_hosts, factory=el.makeelement)
+ if pre_children:
+ el.text = text
+ el[:0] = pre_children
+
+def _link_text(text, link_regexes, avoid_hosts, factory):
+ leading_text = ''
+ links = []
+ last_pos = 0
+ while 1:
+ best_match, best_pos = None, None
+ for regex in link_regexes:
+ regex_pos = last_pos
+ while 1:
+ match = regex.search(text, pos=regex_pos)
+ if match is None:
+ break
+ host = match.group('host')
+ for host_regex in avoid_hosts:
+ if host_regex.search(host):
+ regex_pos = match.end()
+ break
+ else:
+ break
+ if match is None:
+ continue
+ if best_pos is None or match.start() < best_pos:
+ best_match = match
+ best_pos = match.start()
+ if best_match is None:
+ # No more matches
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = text
+ else:
+ assert not leading_text
+ leading_text = text
+ break
+ link = best_match.group(0)
+ end = best_match.end()
+ if link.endswith('.') or link.endswith(','):
+ # These punctuation marks shouldn't end a link
+ end -= 1
+ link = link[:-1]
+ prev_text = text[:best_match.start()]
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = prev_text
+ else:
+ assert not leading_text
+ leading_text = prev_text
+ anchor = factory('a')
+ anchor.set('href', link)
+ body = best_match.group('body')
+ if not body:
+ body = link
+ if body.endswith('.') or body.endswith(','):
+ body = body[:-1]
+ anchor.text = body
+ links.append(anchor)
+ text = text[end:]
+ return leading_text, links
+
+def autolink_html(html, *args, **kw):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ autolink(doc, *args, **kw)
+ return _transform_result(result_type, doc)
+
+autolink_html.__doc__ = autolink.__doc__
+
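+# A short example (a sketch; the host is arbitrary and the output
+# serialisation is indicative):
+#
+#     >>> autolink_html('<p>See https://lxml.de/ for details</p>')
+#     '<p>See <a href="https://lxml.de/">https://lxml.de/</a> for details</p>'
+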
+############################################################
+## Word wrapping
+############################################################
+
+_avoid_word_break_elements = ['pre', 'textarea', 'code']
+_avoid_word_break_classes = ['nobreak']
+
+def word_break(el, max_width=40,
+ avoid_elements=_avoid_word_break_elements,
+ avoid_classes=_avoid_word_break_classes,
+ break_character=unichr(0x200b)):
+ """
+ Breaks any long words found in the body of the text (not attributes).
+
+    Doesn't affect any of the tags in avoid_elements, by default
+    ``<pre>``, ``<textarea>`` and ``<code>``.
+
+    Breaks words by inserting &#8203;, the Unicode zero-width space
+    character (U+200B). This generally takes up no space
+ in rendering, but does copy as a space, and in monospace contexts
+ usually takes up space.
+
+    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion.
+ """
+ # Character suggestion of &#8203 comes from:
+ # http://www.cs.tut.fi/~jkorpela/html/nobr.html
+    if el.tag in avoid_elements:
+ return
+ class_name = el.get('class')
+ if class_name:
+ dont_break = False
+ class_name = class_name.split()
+ for avoid in avoid_classes:
+ if avoid in class_name:
+ dont_break = True
+ break
+ if dont_break:
+ return
+ if el.text:
+ el.text = _break_text(el.text, max_width, break_character)
+ for child in el:
+ word_break(child, max_width=max_width,
+ avoid_elements=avoid_elements,
+ avoid_classes=avoid_classes,
+ break_character=break_character)
+ if child.tail:
+ child.tail = _break_text(child.tail, max_width, break_character)
+
+def word_break_html(html, *args, **kw):
+ result_type = type(html)
+ doc = fromstring(html)
+ word_break(doc, *args, **kw)
+ return _transform_result(result_type, doc)
+
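+# For example (a sketch; output is indicative): with no natural break
+# points, a zero-width space (U+200B) is inserted every max_width
+# characters:
+#
+#     >>> word_break_html('<p>antidisestablishmentarianism</p>', max_width=10)
+#     '<p>antidisest\u200bablishment\u200barianism</p>'
+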
+def _break_text(text, max_width, break_character):
+ words = text.split()
+ for word in words:
+ if len(word) > max_width:
+ replacement = _insert_break(word, max_width, break_character)
+ text = text.replace(word, replacement)
+ return text
+
+_break_prefer_re = re.compile(r'[^a-z]', re.I)
+
+def _insert_break(word, width, break_character):
+ orig_word = word
+ result = ''
+ while len(word) > width:
+ start = word[:width]
+ breaks = list(_break_prefer_re.finditer(start))
+ if breaks:
+ last_break = breaks[-1]
+ # Only walk back up to 10 characters to find a nice break:
+ if last_break.end() > width-10:
+ # FIXME: should the break character be at the end of the
+ # chunk, or the beginning of the next chunk?
+ start = word[:last_break.end()]
+ result += start + break_character
+ word = word[len(start):]
+ result += word
+ return result
+
diff --git a/env/lib/python3.10/site-packages/lxml/html/defs.py b/env/lib/python3.10/site-packages/lxml/html/defs.py
new file mode 100644
index 0000000..2058ea3
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/defs.py
@@ -0,0 +1,135 @@
+# FIXME: this should all be confirmed against what a DTD says
+# (probably in a test; this may not match the DTD exactly, but we
+# should document just how it differs).
+
+"""
+Data taken from https://www.w3.org/TR/html401/index/elements.html
+and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
+for html5_tags.
+"""
+
+empty_tags = frozenset([
+ 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+ 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
+
+deprecated_tags = frozenset([
+ 'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
+ 'menu', 's', 'strike', 'u'])
+
+# archive actually takes a space-separated list of URIs
+link_attrs = frozenset([
+ 'action', 'archive', 'background', 'cite', 'classid',
+ 'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
+ 'usemap',
+ # Not standard:
+ 'dynsrc', 'lowsrc',
+ # HTML5 formaction
+ 'formaction'
+ ])
+
+# Not in the HTML 4 spec:
+# onerror, onresize
+event_attrs = frozenset([
+ 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
+ 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
+ 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
+ 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
+ 'onunload',
+ ])
+
+safe_attrs = frozenset([
+ 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
+ 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
+ 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
+ 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
+ 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
+ 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
+ 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
+
+# From http://htmlhelp.com/reference/html40/olist.html
+top_level_tags = frozenset([
+ 'html', 'head', 'body', 'frameset',
+ ])
+
+head_tags = frozenset([
+ 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
+ ])
+
+general_block_tags = frozenset([
+ 'address',
+ 'blockquote',
+ 'center',
+ 'del',
+ 'div',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'ins',
+ 'isindex',
+ 'noscript',
+ 'p',
+ 'pre',
+ ])
+
+list_tags = frozenset([
+ 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
+ ])
+
+table_tags = frozenset([
+ 'table', 'caption', 'colgroup', 'col',
+ 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
+ ])
+
+# just this one from
+# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
+block_tags = general_block_tags | list_tags | table_tags | frozenset([
+ # Partial form tags
+ 'fieldset', 'form', 'legend', 'optgroup', 'option',
+ ])
+
+form_tags = frozenset([
+ 'form', 'button', 'fieldset', 'legend', 'input', 'label',
+ 'select', 'optgroup', 'option', 'textarea',
+ ])
+
+special_inline_tags = frozenset([
+ 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
+ 'img', 'map', 'area', 'object', 'param', 'q', 'script',
+ 'span', 'sub', 'sup',
+ ])
+
+phrase_tags = frozenset([
+ 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
+ 'ins', 'kbd', 'samp', 'strong', 'var',
+ ])
+
+font_style_tags = frozenset([
+ 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
+ ])
+
+frame_tags = frozenset([
+ 'frameset', 'frame', 'noframes',
+ ])
+
+html5_tags = frozenset([
+ 'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
+ 'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
+ 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
+ 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
+ 'svg', 'time', 'track', 'video', 'wbr'
+ ])
+
+# These tags aren't standard
+nonstandard_tags = frozenset(['blink', 'marquee'])
+
+
+tags = (top_level_tags | head_tags | general_block_tags | list_tags
+ | table_tags | form_tags | special_inline_tags | phrase_tags
+ | font_style_tags | nonstandard_tags | html5_tags)
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so
new file mode 100755
index 0000000..0c11b90
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py
new file mode 100644
index 0000000..39bec78
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/diff.py
@@ -0,0 +1,884 @@
+# cython: language_level=3
+
+from __future__ import absolute_import
+
+import difflib
+from lxml import etree
+from lxml.html import fragment_fromstring
+import re
+
+__all__ = ['html_annotate', 'htmldiff']
+
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+try:
+ _unicode = unicode
+except NameError:
+ # Python 3
+ _unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+############################################################
+## Annotation
+############################################################
+
+def default_markup(text, version):
+ return '<span title="%s">%s</span>' % (
+ html_escape(_unicode(version), 1), text)
+
+def html_annotate(doclist, markup=default_markup):
+ """
+ doclist should be ordered from oldest to newest, like::
+
+ >>> version1 = 'Hello World'
+ >>> version2 = 'Goodbye World'
+ >>> print(html_annotate([(version1, 'version 1'),
+ ... (version2, 'version 2')]))
+ <span title="version 2">Goodbye</span> <span title="version 1">World</span>
+
+ The documents must be *fragments* (str/UTF8 or unicode), not
+    complete documents.
+
+ The markup argument is a function to markup the spans of words.
+ This function is called like markup('Hello', 'version 2'), and
+ returns HTML. The first argument is text and never includes any
+ markup. The default uses a span with a title:
+
+ >>> print(default_markup('Some Text', 'by Joe'))
+ <span title="by Joe">Some Text</span>
+ """
+ # The basic strategy we have is to split the documents up into
+ # logical tokens (which are words with attached markup). We then
+ # do diffs of each of the versions to track when a token first
+ # appeared in the document; the annotation attached to the token
+ # is the version where it first appeared.
+ tokenlist = [tokenize_annotated(doc, version)
+ for doc, version in doclist]
+ cur_tokens = tokenlist[0]
+ for tokens in tokenlist[1:]:
+ html_annotate_merge_annotations(cur_tokens, tokens)
+ cur_tokens = tokens
+
+ # After we've tracked all the tokens, we can combine spans of text
+ # that are adjacent and have the same annotation
+ cur_tokens = compress_tokens(cur_tokens)
+ # And finally add markup
+ result = markup_serialize_tokens(cur_tokens, markup)
+ return ''.join(result).strip()
+
+def tokenize_annotated(doc, annotation):
+ """Tokenize a document and add an annotation attribute to each token
+ """
+ tokens = tokenize(doc, include_hrefs=False)
+ for tok in tokens:
+ tok.annotation = annotation
+ return tokens
+
+def html_annotate_merge_annotations(tokens_old, tokens_new):
+ """Merge the annotations from tokens_old into tokens_new, when the
+ tokens in the new document already existed in the old document.
+ """
+ s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
+ commands = s.get_opcodes()
+
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ eq_old = tokens_old[i1:i2]
+ eq_new = tokens_new[j1:j2]
+ copy_annotations(eq_old, eq_new)
+
+def copy_annotations(src, dest):
+ """
+ Copy annotations from the tokens listed in src to the tokens in dest
+ """
+ assert len(src) == len(dest)
+ for src_tok, dest_tok in zip(src, dest):
+ dest_tok.annotation = src_tok.annotation
+
+def compress_tokens(tokens):
+ """
+ Combine adjacent tokens when there is no HTML between the tokens,
+ and they share an annotation
+ """
+ result = [tokens[0]]
+ for tok in tokens[1:]:
+ if (not result[-1].post_tags and
+ not tok.pre_tags and
+ result[-1].annotation == tok.annotation):
+ compress_merge_back(result, tok)
+ else:
+ result.append(tok)
+ return result
+
+def compress_merge_back(tokens, tok):
+ """ Merge tok into the last element of tokens (modifying the list of
+ tokens in-place). """
+ last = tokens[-1]
+ if type(last) is not token or type(tok) is not token:
+ tokens.append(tok)
+ else:
+ text = _unicode(last)
+ if last.trailing_whitespace:
+ text += last.trailing_whitespace
+ text += tok
+ merged = token(text,
+ pre_tags=last.pre_tags,
+ post_tags=tok.post_tags,
+ trailing_whitespace=tok.trailing_whitespace)
+ merged.annotation = last.annotation
+ tokens[-1] = merged
+
+def markup_serialize_tokens(tokens, markup_func):
+ """
+ Serialize the list of tokens into a list of text chunks, calling
+ markup_func around text to add annotations.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ html = token.html()
+ html = markup_func(html, token.annotation)
+ if token.trailing_whitespace:
+ html += token.trailing_whitespace
+ yield html
+ for post in token.post_tags:
+ yield post
+
+
+############################################################
+## HTML Diffs
+############################################################
+
+def htmldiff(old_html, new_html):
+ ## FIXME: this should take parsed documents too, and use their body
+ ## or other content.
+ """ Do a diff of the old and new document. The documents are HTML
+    *fragments* (str/UTF8 or unicode); they are not complete documents
+ (i.e., no <html> tag).
+
+ Returns HTML with <ins> and <del> tags added around the
+ appropriate text.
+
+ Markup is generally ignored, with the markup from new_html
+ preserved, and possibly some markup from old_html (though it is
+ considered acceptable to lose some of the old markup). Only the
+    words in the HTML are diffed. The exceptions are <img> tags, which
+ are treated like words, and the href attribute of <a> tags, which
+ are noted inside the tag itself when there are changes.
+ """
+ old_html_tokens = tokenize(old_html)
+ new_html_tokens = tokenize(new_html)
+ result = htmldiff_tokens(old_html_tokens, new_html_tokens)
+ result = ''.join(result).strip()
+ return fixup_ins_del_tags(result)
+
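+# A small example (a sketch; the exact placement of <ins>/<del> and of
+# whitespace is indicative):
+#
+#     >>> htmldiff('<p>Hello world</p>', '<p>Hello there</p>')
+#     '<p>Hello <ins>there</ins> <del>world</del></p>'
+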
+def htmldiff_tokens(html1_tokens, html2_tokens):
+ """ Does a diff on the tokens themselves, returning a list of text
+ chunks (not tokens).
+ """
+ # There are several passes as we do the differences. The tokens
+ # isolate the portion of the content we care to diff; difflib does
+ # all the actual hard work at that point.
+ #
+ # Then we must create a valid document from pieces of both the old
+ # document and the new document. We generally prefer to take
+ # markup from the new document, and only do a best effort attempt
+ # to keep markup from the old document; anything that we can't
+ # resolve we throw away. Also we try to put the deletes as close
+ # to the location where we think they would have been -- because
+ # we are only keeping the markup from the new document, it can be
+ # fuzzy where in the new document the old text would have gone.
+ # Again we just do a best effort attempt.
+ s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
+ commands = s.get_opcodes()
+ result = []
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
+ continue
+ if command == 'insert' or command == 'replace':
+ ins_tokens = expand_tokens(html2_tokens[j1:j2])
+ merge_insert(ins_tokens, result)
+ if command == 'delete' or command == 'replace':
+ del_tokens = expand_tokens(html1_tokens[i1:i2])
+ merge_delete(del_tokens, result)
+ # If deletes were inserted directly as <del> then we'd have an
+ # invalid document at this point. Instead we put in special
+ # markers, and when the complete diffed document has been created
+ # we try to move the deletes around and resolve any problems.
+ result = cleanup_delete(result)
+
+ return result
+
+def expand_tokens(tokens, equal=False):
+ """Given a list of tokens, return a generator of the chunks of
+ text for the data in the tokens.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ if not equal or not token.hide_when_equal:
+ if token.trailing_whitespace:
+ yield token.html() + token.trailing_whitespace
+ else:
+ yield token.html()
+ for post in token.post_tags:
+ yield post
+
+def merge_insert(ins_chunks, doc):
+ """ doc is the already-handled document (as a list of text chunks);
+ here we add <ins>ins_chunks</ins> to the end of that. """
+ # Though we don't throw away unbalanced_start or unbalanced_end
+ # (we assume there is accompanying markup later or earlier in the
+ # document), we only put <ins> around the balanced portion.
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
+ doc.extend(unbalanced_start)
+ if doc and not doc[-1].endswith(' '):
+ # Fix up the case where the word before the insert didn't end with
+ # a space
+ doc[-1] += ' '
+ doc.append('<ins>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </ins>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</ins> ')
+ doc.extend(unbalanced_end)
+
+# These are sentinels to represent the start and end of a <del>
+# segment, until we do the cleanup phase to turn them into proper
+# markup:
+class DEL_START:
+ pass
+class DEL_END:
+ pass
+
+class NoDeletes(Exception):
+ """ Raised when the document no longer contains any pending deletes
+ (DEL_START/DEL_END) """
+
+def merge_delete(del_chunks, doc):
+ """ Adds the text chunks in del_chunks to the document doc (another
+ list of text chunks) with marker to show it is a delete.
+ cleanup_delete later resolves these markers into <del> tags."""
+ doc.append(DEL_START)
+ doc.extend(del_chunks)
+ doc.append(DEL_END)
+
+def cleanup_delete(chunks):
+ """ Cleans up any DEL_START/DEL_END markers in the document, replacing
+ them with <del></del>. To do this while keeping the document
+ valid, it may need to drop some tags (either start or end tags).
+
+    It may also move the del into adjacent tags to try to put it close
+    to where it was originally located (e.g., moving a delete into a
+    preceding <div> tag, if the del looks like (DEL_START,
+ 'Text</div>', DEL_END)"""
+ while 1:
+ # Find a pending DEL_START/DEL_END, splitting the document
+ # into stuff-preceding-DEL_START, stuff-inside, and
+ # stuff-following-DEL_END
+ try:
+ pre_delete, delete, post_delete = split_delete(chunks)
+ except NoDeletes:
+ # Nothing found, we've cleaned up the entire doc
+ break
+ # The stuff-inside-DEL_START/END may not be well balanced
+ # markup. First we figure out what unbalanced portions there are:
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
+ # Then we move the span forward and/or backward based on these
+ # unbalanced portions:
+ locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
+ locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
+ doc = pre_delete
+ if doc and not doc[-1].endswith(' '):
+ # Fix up case where the word before us didn't have a trailing space
+ doc[-1] += ' '
+ doc.append('<del>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </del>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</del> ')
+ doc.extend(post_delete)
+ chunks = doc
+ return chunks
+
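+# Illustrative sketch (values are hypothetical) of the marker resolution
+# performed by cleanup_delete:
+#
+#     >>> cleanup_delete(['<p>', 'Hi ', DEL_START, 'old ', DEL_END, '</p>'])
+#     ['<p>', 'Hi ', '<del>', 'old', '</del> ', '</p>']
+#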
+def split_unbalanced(chunks):
+ """Return (unbalanced_start, balanced, unbalanced_end), where each is
+ a list of text and tag chunks.
+
+ unbalanced_start is a list of all the tags that are opened, but
+ not closed in this span. Similarly, unbalanced_end is a list of
+ tags that are closed but were not opened. Extracting these might
+ mean some reordering of the chunks."""
+ start = []
+ end = []
+ tag_stack = []
+ balanced = []
+ for chunk in chunks:
+ if not chunk.startswith('<'):
+ balanced.append(chunk)
+ continue
+ endtag = chunk[1] == '/'
+ name = chunk.split()[0].strip('<>/')
+ if name in empty_tags:
+ balanced.append(chunk)
+ continue
+ if endtag:
+ if tag_stack and tag_stack[-1][0] == name:
+ balanced.append(chunk)
+ name, pos, tag = tag_stack.pop()
+ balanced[pos] = tag
+ elif tag_stack:
+ start.extend([tag for name, pos, tag in tag_stack])
+ tag_stack = []
+ end.append(chunk)
+ else:
+ end.append(chunk)
+ else:
+ tag_stack.append((name, len(balanced), chunk))
+ balanced.append(None)
+ start.extend(
+ [chunk for name, pos, chunk in tag_stack])
+ balanced = [chunk for chunk in balanced if chunk is not None]
+ return start, balanced, end
+
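+# Illustrative sketch (values are hypothetical) of how split_unbalanced
+# partitions a list of chunks:
+#
+#     >>> split_unbalanced(['<p>', 'Hi ', '</p>', '</div>'])
+#     ([], ['<p>', 'Hi ', '</p>'], ['</div>'])
+#     >>> split_unbalanced(['<div>', 'Text '])
+#     (['<div>'], ['Text '], [])
+#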
+def split_delete(chunks):
+ """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
+ stuff_after_DEL_END). Returns the first case found (there may be
+ more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
+ there's no DEL_START found. """
+ try:
+ pos = chunks.index(DEL_START)
+ except ValueError:
+ raise NoDeletes
+ pos2 = chunks.index(DEL_END)
+ return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
+
+def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
+ """ pre_delete and post_delete implicitly point to a place in the
+ document (where the two were split). This moves that point (by
+ popping items from one and pushing them onto the other). It moves
+ the point to try to find a place where unbalanced_start applies.
+
+ As an example::
+
+ >>> unbalanced_start = ['<div>']
+ >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
+ >>> pre, post = doc[:3], doc[3:]
+ >>> pre, post
+ (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
+ >>> locate_unbalanced_start(unbalanced_start, pre, post)
+ >>> pre, post
+ (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
+
+ As you can see, we moved the point so that the dangling <div> that
+ we found will be effectively replaced by the div in the original
+ document. If this doesn't work out, we just throw away
+ unbalanced_start without doing anything.
+ """
+ while 1:
+ if not unbalanced_start:
+ # We have totally succeeded in finding the position
+ break
+ finding = unbalanced_start[0]
+ finding_name = finding.split()[0].strip('<>')
+ if not post_delete:
+ break
+ next = post_delete[0]
+ if next is DEL_START or not next.startswith('<'):
+ # Reached a word, we can't move the delete text forward
+ break
+ if next[1] == '/':
+ # Reached a closing tag, can we go further? Maybe not...
+ break
+ name = next.split()[0].strip('<>')
+ if name == 'ins':
+ # Can't move into an insert
+ break
+ assert name != 'del', (
+ "Unexpected delete tag: %r" % next)
+ if name == finding_name:
+ unbalanced_start.pop(0)
+ pre_delete.append(post_delete.pop(0))
+ else:
+ # Found a tag that doesn't match
+ break
+
+def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
+ """ like locate_unbalanced_start, except handling end tags and
+ possibly moving the point earlier in the document. """
+ while 1:
+ if not unbalanced_end:
+ # Success
+ break
+ finding = unbalanced_end[-1]
+ finding_name = finding.split()[0].strip('<>/')
+ if not pre_delete:
+ break
+ next = pre_delete[-1]
+ if next is DEL_END or not next.startswith('</'):
+ # A word or a start tag
+ break
+ name = next.split()[0].strip('<>/')
+ if name == 'ins' or name == 'del':
+ # Can't move into an insert or delete
+ break
+ if name == finding_name:
+ unbalanced_end.pop()
+ post_delete.insert(0, pre_delete.pop())
+ else:
+ # Found a tag that doesn't match
+ break
+
+class token(_unicode):
+ """ Represents a diffable token, generally a word that is displayed
+ to the user. Opening tags that are adjacent to the word are
+ attached to it (pre_tags), as are closing tags that follow the
+ word (post_tags). Some exceptions occur when there are empty tags
+ adjacent to a word, so there may be close tags in pre_tags, or
+ open tags in post_tags.
+
+ We also keep track of whether the word was originally followed by
+ whitespace, without treating the word as different from a similar
+ word that lacks the trailing space."""
+
+ # When this is true, the token will be eliminated from the
+ # displayed diff if no change has occurred:
+ hide_when_equal = False
+
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
+ obj = _unicode.__new__(cls, text)
+
+ if pre_tags is not None:
+ obj.pre_tags = pre_tags
+ else:
+ obj.pre_tags = []
+
+ if post_tags is not None:
+ obj.post_tags = post_tags
+ else:
+ obj.post_tags = []
+
+ obj.trailing_whitespace = trailing_whitespace
+
+ return obj
+
+ def __repr__(self):
+ return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+ self.post_tags, self.trailing_whitespace)
+
+ def html(self):
+ return _unicode(self)
+
+class tag_token(token):
+
+ """ Represents a token that is actually a tag. Currently this is just
+ the <img> tag, which takes up visible space just like a word but
+ is only represented in a document by a tag. """
+
+ def __new__(cls, tag, data, html_repr, pre_tags=None,
+ post_tags=None, trailing_whitespace=""):
+ obj = token.__new__(cls, "%s: %s" % (tag, data),
+ pre_tags=pre_tags,
+ post_tags=post_tags,
+ trailing_whitespace=trailing_whitespace)
+ obj.tag = tag
+ obj.data = data
+ obj.html_repr = html_repr
+ return obj
+
+ def __repr__(self):
+ return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
+ self.tag,
+ self.data,
+ self.html_repr,
+ self.pre_tags,
+ self.post_tags,
+ self.trailing_whitespace)
+
+ def html(self):
+ return self.html_repr
+
+class href_token(token):
+
+ """ Represents the href in an anchor tag. Unlike other words, we only
+ show the href when it changes. """
+
+ hide_when_equal = True
+
+ def html(self):
+ return ' Link: %s' % self
+
+def tokenize(html, include_hrefs=True):
+ """
+ Parse the given HTML and return token objects (words with attached tags).
+
+ This parses only the content of a page; anything in the head is
+ ignored, and the <head> and <body> elements are themselves
+ optional. The content is then parsed by lxml, which ensures the
+ validity of the resulting parsed document (though lxml may make
+ incorrect guesses when the markup is particularly bad).
+
+ <ins> and <del> tags are also eliminated from the document, as
+ that gets confusing.
+
+ If include_hrefs is true, then the href attribute of <a> tags is
+ included as a special kind of diffable token."""
+ if etree.iselement(html):
+ body_el = html
+ else:
+ body_el = parse_html(html, cleanup=True)
+ # Then we split the document into text chunks for each tag, word, and end tag:
+ chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
+ # Finally, we rejoin the chunks into token objects:
+ return fixup_chunks(chunks)
+
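+# A small illustrative example of tokenize output (the markup is
+# hypothetical; the reprs come from the token classes defined below):
+#
+#     >>> tokenize('<p>Hello, world!</p>')
+#     [token('Hello,', ['<p>'], [], ' '), token('world!', [], ['</p>'], '')]
+#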
+def parse_html(html, cleanup=True):
+ """
+ Parses an HTML fragment, returning an lxml element. Note that the HTML will be
+ wrapped in a <div> tag that was not in the original document.
+
+ If cleanup is true, make sure there's no <head> or <body>, and get
+ rid of any <ins> and <del> tags.
+ """
+ if cleanup:
+ # This removes any extra markup or structure like <head>:
+ html = cleanup_html(html)
+ return fragment_fromstring(html, create_parent=True)
+
+_body_re = re.compile(r'<body.*?>', re.I|re.S)
+_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
+_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+def cleanup_html(html):
+ """ This 'cleans' the HTML, meaning that any page structure is removed
+ (only the contents of <body> are used, if there is a <body>).
+ Also <ins> and <del> tags are removed. """
+ match = _body_re.search(html)
+ if match:
+ html = html[match.end():]
+ match = _end_body_re.search(html)
+ if match:
+ html = html[:match.start()]
+ html = _ins_del_re.sub('', html)
+ return html
+
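+# Illustrative sketch (input is hypothetical) of cleanup_html:
+#
+#     >>> cleanup_html('<html><body><p>Hi <ins>new</ins></p></body></html>')
+#     '<p>Hi new</p>'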
+
+end_whitespace_re = re.compile(r'[ \t\n\r]$')
+
+def split_trailing_whitespace(word):
+ """
+ This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+ """
+ stripped_length = len(word.rstrip())
+ return word[0:stripped_length], word[stripped_length:]
+
+
+def fixup_chunks(chunks):
+ """
+ This function takes a list of chunks and produces a list of tokens.
+ """
+ tag_accum = []
+ cur_word = None
+ result = []
+ for chunk in chunks:
+ if isinstance(chunk, tuple):
+ if chunk[0] == 'img':
+ src = chunk[1]
+ tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
+ cur_word = tag_token('img', src, html_repr=tag,
+ pre_tags=tag_accum,
+ trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif chunk[0] == 'href':
+ href = chunk[1]
+ cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
+ tag_accum = []
+ result.append(cur_word)
+ continue
+
+ if is_word(chunk):
+ chunk, trailing_whitespace = split_trailing_whitespace(chunk)
+ cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif is_start_tag(chunk):
+ tag_accum.append(chunk)
+
+ elif is_end_tag(chunk):
+ if tag_accum:
+ tag_accum.append(chunk)
+ else:
+ assert cur_word, (
+ "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
+ % (cur_word, result, chunk, chunks))
+ cur_word.post_tags.append(chunk)
+ else:
+ assert False
+
+ if not result:
+ return [token('', pre_tags=tag_accum)]
+ else:
+ result[-1].post_tags.extend(tag_accum)
+
+ return result
+
+
+# All the tags in HTML that don't require end tags:
+empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+block_level_tags = (
+ 'address',
+ 'blockquote',
+ 'center',
+ 'dir',
+ 'div',
+ 'dl',
+ 'fieldset',
+ 'form',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'isindex',
+ 'menu',
+ 'noframes',
+ 'noscript',
+ 'ol',
+ 'p',
+ 'pre',
+ 'table',
+ 'ul',
+ )
+
+block_level_container_tags = (
+ 'dd',
+ 'dt',
+ 'frameset',
+ 'li',
+ 'tbody',
+ 'td',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ )
+
+
+def flatten_el(el, include_hrefs, skip_tag=False):
+ """ Takes an lxml element el, and generates all the text chunks for
+ that tag. Each start tag is a chunk, each word is a chunk, and each
+ end tag is a chunk.
+
+ If skip_tag is true, then the outermost container tag is
+ not returned (just its contents)."""
+ if not skip_tag:
+ if el.tag == 'img':
+ yield ('img', el.get('src'), start_tag(el))
+ else:
+ yield start_tag(el)
+ if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
+ return
+ start_words = split_words(el.text)
+ for word in start_words:
+ yield html_escape(word)
+ for child in el:
+ for item in flatten_el(child, include_hrefs=include_hrefs):
+ yield item
+ if el.tag == 'a' and el.get('href') and include_hrefs:
+ yield ('href', el.get('href'))
+ if not skip_tag:
+ yield end_tag(el)
+ end_words = split_words(el.tail)
+ for word in end_words:
+ yield html_escape(word)
+
+split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
+
+def split_words(text):
+ """ Splits some text into words. Includes trailing whitespace
+ on each word when appropriate. """
+ if not text or not text.strip():
+ return []
+
+ words = split_words_re.findall(text)
+ return words
+
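+# Illustrative sketch of split_words: leading whitespace is dropped,
+# trailing whitespace stays attached to each word:
+#
+#     >>> split_words('  Hello   world\n')
+#     ['Hello   ', 'world\n']
+#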
+start_whitespace_re = re.compile(r'^[ \t\n\r]')
+
+def start_tag(el):
+ """
+ The text representation of the start tag for a tag.
+ """
+ return '<%s%s>' % (
+ el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
+ for name, value in el.attrib.items()]))
+
+def end_tag(el):
+ """ The text representation of an end tag for a tag. Includes
+ trailing whitespace when appropriate. """
+ if el.tail and start_whitespace_re.search(el.tail):
+ extra = ' '
+ else:
+ extra = ''
+ return '</%s>%s' % (el.tag, extra)
+
+def is_word(tok):
+ return not tok.startswith('<')
+
+def is_end_tag(tok):
+ return tok.startswith('</')
+
+def is_start_tag(tok):
+ return tok.startswith('<') and not tok.startswith('</')
+
+def fixup_ins_del_tags(html):
+ """ Given an html string, move any <ins> or <del> tags inside of any
+ block-level elements, e.g. transform <ins><p>word</p></ins> to
+ <p><ins>word</ins></p> """
+ doc = parse_html(html, cleanup=False)
+ _fixup_ins_del_tags(doc)
+ html = serialize_html_fragment(doc, skip_outer=True)
+ return html
+
+def serialize_html_fragment(el, skip_outer=False):
+ """ Serialize a single lxml element as HTML. The serialized form
+ includes the element's tail.
+
+ If skip_outer is true, then don't serialize the outermost tag
+ """
+ assert not isinstance(el, basestring), (
+ "You should pass in an element, not a string like %r" % el)
+ html = etree.tostring(el, method="html", encoding=_unicode)
+ if skip_outer:
+ # Get rid of the extra starting tag:
+ html = html[html.find('>')+1:]
+ # Get rid of the extra end tag:
+ html = html[:html.rfind('<')]
+ return html.strip()
+ else:
+ return html
+
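+# Illustrative sketch (markup is hypothetical) of serialize_html_fragment,
+# using the fragment_fromstring helper imported above:
+#
+#     >>> el = fragment_fromstring('<p>Hi <b>there</b></p>')
+#     >>> serialize_html_fragment(el, skip_outer=True)
+#     'Hi <b>there</b>'
+#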
+def _fixup_ins_del_tags(doc):
+ """fixup_ins_del_tags that works on an lxml document in-place
+ """
+ for tag in ['ins', 'del']:
+ for el in doc.xpath('descendant-or-self::%s' % tag):
+ if not _contains_block_level_tag(el):
+ continue
+ _move_el_inside_block(el, tag=tag)
+ el.drop_tag()
+ #_merge_element_contents(el)
+
+def _contains_block_level_tag(el):
+ """True if the element contains any block-level elements, like <p>, <td>, etc.
+ """
+ if el.tag in block_level_tags or el.tag in block_level_container_tags:
+ return True
+ for child in el:
+ if _contains_block_level_tag(child):
+ return True
+ return False
+
+def _move_el_inside_block(el, tag):
+ """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
+ and moves them inside any block-level tags. """
+ for child in el:
+ if _contains_block_level_tag(child):
+ break
+ else:
+ # No block-level tags in any child
+ children_tag = etree.Element(tag)
+ children_tag.text = el.text
+ el.text = None
+ children_tag.extend(list(el))
+ el[:] = [children_tag]
+ return
+ for child in list(el):
+ if _contains_block_level_tag(child):
+ _move_el_inside_block(child, tag)
+ if child.tail:
+ tail_tag = etree.Element(tag)
+ tail_tag.text = child.tail
+ child.tail = None
+ el.insert(el.index(child)+1, tail_tag)
+ else:
+ child_tag = etree.Element(tag)
+ el.replace(child, child_tag)
+ child_tag.append(child)
+ if el.text:
+ text_tag = etree.Element(tag)
+ text_tag.text = el.text
+ el.text = None
+ el.insert(0, text_tag)
+
+def _merge_element_contents(el):
+ """
+ Removes an element, but merges its contents into its place, e.g.,
+ given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
+ <p>Hi there!</p>
+ """
+ parent = el.getparent()
+ text = el.text or ''
+ if el.tail:
+ if not len(el):
+ text += el.tail
+ else:
+ if el[-1].tail:
+ el[-1].tail += el.tail
+ else:
+ el[-1].tail = el.tail
+ index = parent.index(el)
+ if text:
+ if index == 0:
+ previous = None
+ else:
+ previous = parent[index-1]
+ if previous is None:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+ else:
+ if previous.tail:
+ previous.tail += text
+ else:
+ previous.tail = text
+ parent[index:index+1] = el.getchildren()
+
+class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+ """
+ Acts like SequenceMatcher, but tries not to find very small equal
+ blocks amidst large spans of changes
+ """
+
+ threshold = 2
+
+ def get_matching_blocks(self):
+ size = min(len(self.a), len(self.b))
+ threshold = min(self.threshold, size / 4)
+ actual = difflib.SequenceMatcher.get_matching_blocks(self)
+ return [item for item in actual
+ if item[2] > threshold
+ or not item[2]]
+
+if __name__ == '__main__':
+ from lxml.html import _diffcommand
+ _diffcommand.main()
+
diff --git a/env/lib/python3.10/site-packages/lxml/html/formfill.py b/env/lib/python3.10/site-packages/lxml/html/formfill.py
new file mode 100644
index 0000000..2499a8e
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/formfill.py
@@ -0,0 +1,299 @@
+from lxml.etree import XPath, ElementBase
+from lxml.html import fromstring, XHTML_NAMESPACE
+from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
+from lxml.html import defs
+import copy
+
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
+ 'insert_errors', 'insert_errors_html',
+ 'DefaultErrorCreator']
+
+class FormNotFound(LookupError):
+ """
+ Raised when no form can be found
+ """
+
+_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
+_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
+ namespaces={'x':XHTML_NAMESPACE})
+_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
+ namespaces={'x':XHTML_NAMESPACE})
+_name_xpath = XPath('descendant-or-self::*[@name=$name]')
+
+def fill_form(
+ el,
+ values,
+ form_id=None,
+ form_index=None,
+ ):
+ el = _find_form(el, form_id=form_id, form_index=form_index)
+ _fill_form(el, values)
+
+def fill_form_html(html, values, form_id=None, form_index=None):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ fill_form(doc, values, form_id=form_id, form_index=form_index)
+ return _transform_result(result_type, doc)
+
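+# Usage sketch (markup is hypothetical; attribute order in the serialized
+# output may vary slightly):
+#
+#     >>> fill_form_html('<form><input type="text" name="user"></form>',
+#     ...                {'user': 'ada'})
+#     '<form><input type="text" name="user" value="ada"></form>'
+#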
+def _fill_form(el, values):
+ counts = {}
+ if hasattr(values, 'mixed'):
+ # For Paste request parameters
+ values = values.mixed()
+ inputs = _input_xpath(el)
+ for input in inputs:
+ name = input.get('name')
+ if not name:
+ continue
+ if _takes_multiple(input):
+ value = values.get(name, [])
+ if not isinstance(value, (list, tuple)):
+ value = [value]
+ _fill_multiple(input, value)
+ elif name not in values:
+ continue
+ else:
+ index = counts.get(name, 0)
+ counts[name] = index + 1
+ value = values[name]
+ if isinstance(value, (list, tuple)):
+ try:
+ value = value[index]
+ except IndexError:
+ continue
+ elif index > 0:
+ continue
+ _fill_single(input, value)
+
+def _takes_multiple(input):
+ if _nons(input.tag) == 'select' and input.get('multiple'):
+ # FIXME: multiple="0"?
+ return True
+ type = input.get('type', '').lower()
+ if type in ('radio', 'checkbox'):
+ return True
+ return False
+
+def _fill_multiple(input, value):
+ type = input.get('type', '').lower()
+ if type == 'checkbox':
+ v = input.get('value')
+ if v is None:
+ if not value:
+ result = False
+ else:
+ result = value[0]
+ if isinstance(value, basestring):
+ # The only valid "on" value for an unnamed checkbox is 'on'
+ result = result == 'on'
+ _check(input, result)
+ else:
+ _check(input, v in value)
+ elif type == 'radio':
+ v = input.get('value')
+ _check(input, v in value)
+ else:
+ assert _nons(input.tag) == 'select'
+ for option in _options_xpath(input):
+ v = option.get('value')
+ if v is None:
+ # This seems to be the default, at least on IE
+ # FIXME: but I'm not sure
+ v = option.text_content()
+ _select(option, v in value)
+
+def _check(el, check):
+ if check:
+ el.set('checked', '')
+ else:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+
+def _select(el, select):
+ if select:
+ el.set('selected', '')
+ else:
+ if 'selected' in el.attrib:
+ del el.attrib['selected']
+
+def _fill_single(input, value):
+ if _nons(input.tag) == 'textarea':
+ input.text = value
+ else:
+ input.set('value', value)
+
+def _find_form(el, form_id=None, form_index=None):
+ if form_id is None and form_index is None:
+ forms = _forms_xpath(el)
+ for form in forms:
+ return form
+ raise FormNotFound(
+ "No forms in page")
+ if form_id is not None:
+ form = el.get_element_by_id(form_id)
+ if form is not None:
+ return form
+ forms = _form_name_xpath(el, name=form_id)
+ if forms:
+ return forms[0]
+ else:
+ raise FormNotFound(
+ "No form with the name or id of %r (forms: %s)"
+ % (form_id, ', '.join(_find_form_ids(el))))
+ if form_index is not None:
+ forms = _forms_xpath(el)
+ try:
+ return forms[form_index]
+ except IndexError:
+ raise FormNotFound(
+ "There is no form with the index %r (%i forms found)"
+ % (form_index, len(forms)))
+
+def _find_form_ids(el):
+ forms = _forms_xpath(el)
+ if not forms:
+ yield '(no forms)'
+ return
+ for index, form in enumerate(forms):
+ if form.get('id'):
+ if form.get('name'):
+ yield '%s or %s' % (form.get('id'),
+ form.get('name'))
+ else:
+ yield form.get('id')
+ elif form.get('name'):
+ yield form.get('name')
+ else:
+ yield '(unnamed form %s)' % index
+
+############################################################
+## Error filling
+############################################################
+
+class DefaultErrorCreator(object):
+ insert_before = True
+ block_inside = True
+ error_container_tag = 'div'
+ error_message_class = 'error-message'
+ error_block_class = 'error-block'
+ default_message = "Invalid"
+
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ if not hasattr(self, name):
+ raise TypeError(
+ "Unexpected keyword argument: %s" % name)
+ setattr(self, name, value)
+
+ def __call__(self, el, is_block, message):
+ error_el = el.makeelement(self.error_container_tag)
+ if self.error_message_class:
+ error_el.set('class', self.error_message_class)
+ if is_block and self.error_block_class:
+ error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
+ if message is None or message == '':
+ message = self.default_message
+ if isinstance(message, ElementBase):
+ error_el.append(message)
+ else:
+ assert isinstance(message, basestring), (
+ "Bad message; should be a string or element: %r" % message)
+ error_el.text = message or self.default_message
+ if is_block and self.block_inside:
+ if self.insert_before:
+ error_el.tail = el.text
+ el.text = None
+ el.insert(0, error_el)
+ else:
+ el.append(error_el)
+ else:
+ parent = el.getparent()
+ pos = parent.index(el)
+ if self.insert_before:
+ parent.insert(pos, error_el)
+ else:
+ error_el.tail = el.tail
+ el.tail = None
+ parent.insert(pos+1, error_el)
+
+default_error_creator = DefaultErrorCreator()
+
+
+def insert_errors(
+ el,
+ errors,
+ form_id=None,
+ form_index=None,
+ error_class="error",
+ error_creator=default_error_creator,
+ ):
+ el = _find_form(el, form_id=form_id, form_index=form_index)
+ for name, error in errors.items():
+ if error is None:
+ continue
+ for error_el, message in _find_elements_for_name(el, name, error):
+ assert isinstance(message, (basestring, type(None), ElementBase)), (
+ "Bad message: %r" % message)
+ _insert_error(error_el, message, error_class, error_creator)
+
+def insert_errors_html(html, values, **kw):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ insert_errors(doc, values, **kw)
+ return _transform_result(result_type, doc)
+
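+# Usage sketch for insert_errors_html (markup is hypothetical; the exact
+# serialization may differ slightly):
+#
+#     >>> insert_errors_html('<form><input name="email"></form>',
+#     ...                    {'email': 'Required'})
+#     '<form><div class="error-message">Required</div><input name="email" class="error"></form>'
+#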
+def _insert_error(el, error, error_class, error_creator):
+ if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
+ is_block = False
+ else:
+ is_block = True
+ if _nons(el.tag) != 'form' and error_class:
+ _add_class(el, error_class)
+ if el.get('id'):
+ labels = _label_for_xpath(el, for_id=el.get('id'))
+ if labels:
+ for label in labels:
+ _add_class(label, error_class)
+ error_creator(el, is_block, error)
+
+def _add_class(el, class_name):
+ if el.get('class'):
+ el.set('class', el.get('class')+' '+class_name)
+ else:
+ el.set('class', class_name)
+
+def _find_elements_for_name(form, name, error):
+ if name is None:
+ # An error for the entire form
+ yield form, error
+ return
+ if name.startswith('#'):
+ # By id
+ el = form.get_element_by_id(name[1:])
+ if el is not None:
+ yield el, error
+ return
+ els = _name_xpath(form, name=name)
+ if not els:
+ # FIXME: should this raise an exception?
+ return
+ if not isinstance(error, (list, tuple)):
+ yield els[0], error
+ return
+ # FIXME: if error is longer than els, should it raise an error?
+ for el, err in zip(els, error):
+ if err is None:
+ continue
+ yield el, err
diff --git a/env/lib/python3.10/site-packages/lxml/html/html5parser.py b/env/lib/python3.10/site-packages/lxml/html/html5parser.py
new file mode 100644
index 0000000..2f7be15
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/html5parser.py
@@ -0,0 +1,260 @@
+"""
+An interface to html5lib that mimics the lxml.html interface.
+"""
+import sys
+import string
+
+from html5lib import HTMLParser as _HTMLParser
+from html5lib.treebuilders.etree_lxml import TreeBuilder
+from lxml import etree
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
+
+# python3 compatibility
+try:
+ _strings = basestring
+except NameError:
+ _strings = (bytes, str)
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+try:
+ from urlparse import urlparse
+except ImportError:
+ from urllib.parse import urlparse
+
+
+class HTMLParser(_HTMLParser):
+ """An html5lib HTML parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+
+try:
+ from html5lib import XHTMLParser as _XHTMLParser
+except ImportError:
+ pass
+else:
+ class XHTMLParser(_XHTMLParser):
+ """An html5lib XHTML Parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+ xhtml_parser = XHTMLParser()
+
+
+def _find_tag(tree, tag):
+ elem = tree.find(tag)
+ if elem is not None:
+ return elem
+ return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
+
+
+def document_fromstring(html, guess_charset=None, parser=None):
+ """
+ Parse a whole HTML document from a string.
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # html5lib does not accept useChardet as an argument when the
+ # input would already produce unicode objects, so only default
+ # to charset guessing for byte input.
+ guess_charset = True
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+ guess_charset=None, parser=None):
+ """Parses several HTML elements, returning a list of elements.
+
+ The first item in the list may be a string. If no_leading_text is true,
+ then it will be an error if there is leading text, and it will always be
+ a list of only elements.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # html5lib does not accept useChardet as an argument when the
+ # input would already produce unicode objects.
+ guess_charset = False
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ children = parser.parseFragment(html, 'div', **options)
+ if children and isinstance(children[0], _strings):
+ if no_leading_text:
+ if children[0].strip():
+ raise etree.ParserError('There is leading text: %r' %
+ children[0])
+ del children[0]
+ return children
+
+
+def fragment_fromstring(html, create_parent=False,
+ guess_charset=None, parser=None):
+ """Parses a single HTML element; it is an error if there is more than
+ one element, or if anything but whitespace precedes or follows the
+ element.
+
+ If 'create_parent' is true (or is a tag name) then a parent node
+ will be created to encapsulate the HTML in a single element. In
+ this case, leading or trailing text is allowed.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ accept_leading_text = bool(create_parent)
+
+ elements = fragments_fromstring(
+ html, guess_charset=guess_charset, parser=parser,
+ no_leading_text=not accept_leading_text)
+
+ if create_parent:
+ if not isinstance(create_parent, _strings):
+ create_parent = 'div'
+ new_root = Element(create_parent)
+ if elements:
+ if isinstance(elements[0], _strings):
+ new_root.text = elements[0]
+ del elements[0]
+ new_root.extend(elements)
+ return new_root
+
+ if not elements:
+ raise etree.ParserError('No elements found')
+ if len(elements) > 1:
+ raise etree.ParserError('Multiple elements found')
+ result = elements[0]
+ if result.tail and result.tail.strip():
+ raise etree.ParserError('Element followed by text: %r' % result.tail)
+ result.tail = None
+ return result
+
+
+def fromstring(html, guess_charset=None, parser=None):
+ """Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+ doc = document_fromstring(html, parser=parser,
+ guess_charset=guess_charset)
+
+ # document starts with doctype or <html>, full document!
+ start = html[:50]
+ if isinstance(start, bytes):
+ # Allow text comparison in python3.
+ # Decode as ascii, that also covers latin-1 and utf-8 for the
+ # characters we need.
+ start = start.decode('ascii', 'replace')
+
+ start = start.lstrip().lower()
+ if start.startswith('<html') or start.startswith('<!doctype'):
+ return doc
+
+ head = _find_tag(doc, 'head')
+
+ # if the head is not empty we have a full document
+ if len(head):
+ return doc
+
+ body = _find_tag(doc, 'body')
+
+ # The body has just one element, so it was probably a single
+ # element passed in
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ return body[0]
+
+ # Now we have a body which represents a bunch of tags which have the
+ # content that was passed in. We will create a fake container, which
+ # is the body tag, except <body> implies too much structure.
+ if _contains_block_level_tag(body):
+ body.tag = 'div'
+ else:
+ body.tag = 'span'
+ return body
+
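+# Sketch of the wrapping behaviour described above (markup is
+# hypothetical; requires html5lib to be installed):
+#
+#     >>> el = fromstring('<p>one</p><p>two</p>')
+#     >>> el.tag       # block-level content gets a <div> container
+#     'div'
+#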
+
+def parse(filename_url_or_file, guess_charset=None, parser=None):
+ """Parse a filename, URL, or file-like object into an HTML document
+ tree. Note: this returns a tree, not an element. Use
+ ``parse(...).getroot()`` to get the document root.
+
+ If ``guess_charset`` is true, the ``useChardet`` option is passed into
+ html5lib to enable character detection. This option is on by default
+ when parsing from URLs, off by default when parsing from file(-like)
+ objects (which tend to return Unicode more often than not), and on by
+ default when parsing from a file path (which is read in binary mode).
+ """
+ if parser is None:
+ parser = html_parser
+ if not isinstance(filename_url_or_file, _strings):
+ fp = filename_url_or_file
+ if guess_charset is None:
+ # assume that file-like objects return Unicode more often than bytes
+ guess_charset = False
+ elif _looks_like_url(filename_url_or_file):
+ fp = urlopen(filename_url_or_file)
+ if guess_charset is None:
+ # assume that URLs return bytes
+ guess_charset = True
+ else:
+ fp = open(filename_url_or_file, 'rb')
+ if guess_charset is None:
+ guess_charset = True
+
+ options = {}
+ # html5lib does not accept useChardet as an argument when the
+ # input would already produce unicode objects.
+ if guess_charset:
+ options['useChardet'] = guess_charset
+ return parser.parse(fp, **options)
+
+
+def _looks_like_url(s):
+ scheme = urlparse(s)[0]
+ if not scheme:
+ return False
+ elif (sys.platform == 'win32' and
+ scheme in string.ascii_letters
+ and len(scheme) == 1):
+ # looks like a 'normal' absolute path
+ return False
+ else:
+ return True
+
+
+html_parser = HTMLParser()
diff --git a/env/lib/python3.10/site-packages/lxml/html/soupparser.py b/env/lib/python3.10/site-packages/lxml/html/soupparser.py
new file mode 100644
index 0000000..e0cf3a0
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/soupparser.py
@@ -0,0 +1,314 @@
+"""External interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["fromstring", "parse", "convert_tree"]
+
+import re
+from lxml import etree, html
+
+try:
+ from bs4 import (
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
+ Declaration, Doctype)
+ _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
+except ImportError:
+ from BeautifulSoup import (
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
+ Declaration)
+ _DECLARATION_OR_DOCTYPE = Declaration
+
+
+def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
+ """Parse a string of HTML data into an Element tree using the
+ BeautifulSoup parser.
+
+ Returns the root ``<html>`` Element of the tree.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ return _parse(data, beautifulsoup, makeelement, **bsargs)
+
+
+def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
+ """Parse a file into an ElementTree using the BeautifulSoup parser.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ if not hasattr(file, 'read'):
+ file = open(file)
+ root = _parse(file, beautifulsoup, makeelement, **bsargs)
+ return etree.ElementTree(root)
+
+
+def convert_tree(beautiful_soup_tree, makeelement=None):
+ """Convert a BeautifulSoup tree to a list of Element trees.
+
+ Returns a list instead of a single root Element to support
+ HTML-like soup with more than one root element.
+
+ You can pass a different Element factory through the `makeelement`
+ keyword.
+ """
+ root = _convert_tree(beautiful_soup_tree, makeelement)
+ children = list(root)
+ for child in children:
+ root.remove(child)
+ return children
+
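+# Usage sketch (markup is hypothetical; requires BeautifulSoup/bs4):
+#
+#     >>> from bs4 import BeautifulSoup
+#     >>> tree = BeautifulSoup('<p>one</p><p>two</p>', 'html.parser')
+#     >>> [el.tag for el in convert_tree(tree)]
+#     ['p', 'p']
+#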
+
+# helpers
+
+def _parse(source, beautifulsoup, makeelement, **bsargs):
+ if beautifulsoup is None:
+ beautifulsoup = BeautifulSoup
+ if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
+ if 'convertEntities' not in bsargs:
+ bsargs['convertEntities'] = 'html'
+ if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
+ if 'features' not in bsargs:
+ bsargs['features'] = 'html.parser' # use Python html parser
+ tree = beautifulsoup(source, **bsargs)
+ root = _convert_tree(tree, makeelement)
+ # from ET: wrap the document in a html root element, if necessary
+ if len(root) == 1 and root[0].tag == "html":
+ return root[0]
+ root.tag = "html"
+ return root
+
+
+_parse_doctype_declaration = re.compile(
+ r'(?:\s|[<!])*DOCTYPE\s*HTML'
+ r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
+ r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
+ re.IGNORECASE).match
+
+
+class _PseudoTag:
+ # Minimal imitation of BeautifulSoup.Tag
+ def __init__(self, contents):
+ self.name = 'html'
+ self.attrs = []
+ self.contents = contents
+
+ def __iter__(self):
+ return self.contents.__iter__()
+
+
+def _convert_tree(beautiful_soup_tree, makeelement):
+ if makeelement is None:
+ makeelement = html.html_parser.makeelement
+
+ # Split the tree into three parts:
+ # i) everything before the root element: document type
+ # declaration, comments, processing instructions, whitespace
+ # ii) the root(s),
+ # iii) everything after the root: comments, processing
+ # instructions, whitespace
+ first_element_idx = last_element_idx = None
+ html_root = declaration = None
+ for i, e in enumerate(beautiful_soup_tree):
+ if isinstance(e, Tag):
+ if first_element_idx is None:
+ first_element_idx = i
+ last_element_idx = i
+ if html_root is None and e.name and e.name.lower() == 'html':
+ html_root = e
+ elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
+ declaration = e
+
+ # For a nice, well-formatted document, the variable roots below is
+ # a list consisting of a single <html> element. However, the document
+ # may be a soup like '<meta><head><title>Hello</head><body>Hi
+ # all</p>'. In this example roots is a list containing meta, head
+ # and body elements.
+ if first_element_idx is None:
+ pre_root = post_root = []
+ roots = beautiful_soup_tree.contents
+ else:
+ pre_root = beautiful_soup_tree.contents[:first_element_idx]
+ roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
+ post_root = beautiful_soup_tree.contents[last_element_idx+1:]
+
+ # Reorganize so that there is one <html> root...
+ if html_root is not None:
+ # ... use existing one if possible, ...
+ i = roots.index(html_root)
+ html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
+ else:
+ # ... otherwise create a new one.
+ html_root = _PseudoTag(roots)
+
+ convert_node = _init_node_converters(makeelement)
+
+ # Process pre_root
+ res_root = convert_node(html_root)
+ prev = res_root
+ for e in reversed(pre_root):
+ converted = convert_node(e)
+ if converted is not None:
+ prev.addprevious(converted)
+ prev = converted
+
+ # ditto for post_root
+ prev = res_root
+ for e in post_root:
+ converted = convert_node(e)
+ if converted is not None:
+ prev.addnext(converted)
+ prev = converted
+
+ if declaration is not None:
+ try:
+ # bs4 provides full Doctype string
+ doctype_string = declaration.output_ready()
+ except AttributeError:
+ doctype_string = declaration.string
+
+ match = _parse_doctype_declaration(doctype_string)
+ if not match:
+ # Something is wrong if we end up in here. Since soupparser should
+ # tolerate errors, do not raise Exception, just let it pass.
+ pass
+ else:
+ external_id, sys_uri = match.groups()
+ docinfo = res_root.getroottree().docinfo
+ # strip quotes and update DOCTYPE values (any of None, '', '...')
+ docinfo.public_id = external_id and external_id[1:-1]
+ docinfo.system_url = sys_uri and sys_uri[1:-1]
+
+ return res_root
+
+
+def _init_node_converters(makeelement):
+ converters = {}
+ ordered_node_types = []
+
+ def converter(*types):
+ def add(handler):
+ for t in types:
+ converters[t] = handler
+ ordered_node_types.append(t)
+ return handler
+ return add
+
+ def find_best_converter(node):
+ for t in ordered_node_types:
+ if isinstance(node, t):
+ return converters[t]
+ return None
+
+ def convert_node(bs_node, parent=None):
+ # duplicated in convert_tag() below
+ try:
+ handler = converters[type(bs_node)]
+ except KeyError:
+ handler = converters[type(bs_node)] = find_best_converter(bs_node)
+ if handler is None:
+ return None
+ return handler(bs_node, parent)
+
+ def map_attrs(bs_attrs):
+ if isinstance(bs_attrs, dict): # bs4
+ attribs = {}
+ for k, v in bs_attrs.items():
+ if isinstance(v, list):
+ v = " ".join(v)
+ attribs[k] = unescape(v)
+ else:
+ attribs = dict((k, unescape(v)) for k, v in bs_attrs)
+ return attribs
+
+ def append_text(parent, text):
+ if len(parent) == 0:
+ parent.text = (parent.text or '') + text
+ else:
+ parent[-1].tail = (parent[-1].tail or '') + text
+
+ # converters are tried in order of their definition
+
+ @converter(Tag, _PseudoTag)
+ def convert_tag(bs_node, parent):
+ attrs = bs_node.attrs
+ if parent is not None:
+ attribs = map_attrs(attrs) if attrs else None
+ res = etree.SubElement(parent, bs_node.name, attrib=attribs)
+ else:
+ attribs = map_attrs(attrs) if attrs else {}
+ res = makeelement(bs_node.name, attrib=attribs)
+
+ for child in bs_node:
+ # avoid double recursion by inlining convert_node(), see above
+ try:
+ handler = converters[type(child)]
+ except KeyError:
+ pass
+ else:
+ if handler is not None:
+ handler(child, res)
+ continue
+ convert_node(child, res)
+ return res
+
+ @converter(Comment)
+ def convert_comment(bs_node, parent):
+ res = html.HtmlComment(bs_node)
+ if parent is not None:
+ parent.append(res)
+ return res
+
+ @converter(ProcessingInstruction)
+ def convert_pi(bs_node, parent):
+ if bs_node.endswith('?'):
+ # The PI is of XML style (<?as df?>) but BeautifulSoup
+ # interpreted it as being SGML style (<?as df>). Fix.
+ bs_node = bs_node[:-1]
+ res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
+ if parent is not None:
+ parent.append(res)
+ return res
+
+ @converter(NavigableString)
+ def convert_text(bs_node, parent):
+ if parent is not None:
+ append_text(parent, unescape(bs_node))
+ return None
+
+ return convert_node
+
+
+# copied from ET's ElementSoup
+
+try:
+ from html.entities import name2codepoint # Python 3
+except ImportError:
+ from htmlentitydefs import name2codepoint
+
+
+handle_entities = re.compile(r"&(\w+);").sub
+
+
+try:
+ unichr
+except NameError:
+ # Python 3
+ unichr = chr
+
+
+def unescape(string):
+ if not string:
+ return ''
+ # work around oddities in BeautifulSoup's entity handling
+ def unescape_entity(m):
+ try:
+ return unichr(name2codepoint[m.group(1)])
+ except KeyError:
+ return m.group(0) # use as is
+ return handle_entities(unescape_entity, string)
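+
+# Behaviour sketch for unescape (values are hypothetical): known entities
+# are replaced, unknown ones are left untouched:
+#
+#     >>> unescape('AT&amp;T')
+#     'AT&T'
+#     >>> unescape('&bogus;')
+#     '&bogus;'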
diff --git a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py
new file mode 100644
index 0000000..f352a1c
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py
@@ -0,0 +1,13 @@
+"""Doctest module for HTML comparison.
+
+Usage::
+
+ >>> import lxml.html.usedoctest
+ >>> # now do your HTML doctests ...
+
+See `lxml.doctestcompare`.
+"""
+
+from lxml import doctestcompare
+
+doctestcompare.temp_install(html=True, del_module=__name__)