diff options
author | 2022-11-14 16:43:12 +0530 | |
---|---|---|
committer | 2022-11-14 16:43:12 +0530 | |
commit | d47f8b48935d258f4c5c3e2267911753bebd5214 (patch) | |
tree | 3ed04e75bc3fc7c8e4ce618f527565da1df630a1 /env/lib/python3.10/site-packages/lxml/html | |
parent | 9468226a9e2e2ab8cdd599f1d8538e860ca86120 (diff) | |
download | idcard-d47f8b48935d258f4c5c3e2267911753bebd5214.tar.gz idcard-d47f8b48935d258f4c5c3e2267911753bebd5214.tar.bz2 idcard-d47f8b48935d258f4c5c3e2267911753bebd5214.zip |
id card
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html')
28 files changed, 0 insertions, 5024 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py deleted file mode 100644 index c35365d..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py +++ /dev/null @@ -1,10 +0,0 @@ -__doc__ = """Legacy interface to the BeautifulSoup HTML parser. -""" - -__all__ = ["parse", "convert_tree"] - -from .soupparser import convert_tree, parse as _parse - -def parse(file, beautifulsoup=None, makeelement=None): - root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement) - return root.getroot() diff --git a/env/lib/python3.10/site-packages/lxml/html/__init__.py b/env/lib/python3.10/site-packages/lxml/html/__init__.py deleted file mode 100644 index ef06a40..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__init__.py +++ /dev/null @@ -1,1946 +0,0 @@ -# Copyright (c) 2004 Ian Bicking. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# -# 3. Neither the name of Ian Bicking nor the names of its contributors may -# be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""The ``lxml.html`` tool set for HTML handling. -""" - -from __future__ import absolute_import - -__all__ = [ - 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', - 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', - 'find_rel_links', 'find_class', 'make_links_absolute', - 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse'] - - -import copy -import sys -import re -from functools import partial - -try: - from collections.abc import MutableMapping, MutableSet -except ImportError: - from collections import MutableMapping, MutableSet - -from .. import etree -from . import defs -from ._setmixin import SetMixin - -try: - from urlparse import urljoin -except ImportError: - # Python 3 - from urllib.parse import urljoin - -try: - unicode -except NameError: - # Python 3 - unicode = str -try: - basestring -except NameError: - # Python 3 - basestring = (str, bytes) - - -def __fix_docstring(s): - if not s: - return s - if sys.version_info[0] >= 3: - sub = re.compile(r"^(\s*)u'", re.M).sub - else: - sub = re.compile(r"^(\s*)b'", re.M).sub - return sub(r"\1'", s) - - -XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" - -_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", - namespaces={'x':XHTML_NAMESPACE}) -_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", - namespaces={'x':XHTML_NAMESPACE}) -_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", - namespaces={'x':XHTML_NAMESPACE}) -#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) -_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") -_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") -_collect_string_content = etree.XPath("string()") -_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer -_iter_css_imports = re.compile(r'@import "(.*?)"').finditer -_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", - namespaces={'x':XHTML_NAMESPACE}) -_archive_re = re.compile(r'[^ ]+') -_parse_meta_refresh_url = re.compile( - r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search - - -def _unquote_match(s, pos): - if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": - return s[1:-1], pos+1 - else: - return s,pos - - -def _transform_result(typ, result): - """Convert the result back into the input type. - """ - if issubclass(typ, bytes): - return tostring(result, encoding='utf-8') - elif issubclass(typ, unicode): - return tostring(result, encoding='unicode') - else: - return result - - -def _nons(tag): - if isinstance(tag, basestring): - if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: - return tag.split('}')[-1] - return tag - - -class Classes(MutableSet): - """Provides access to an element's class attribute as a set-like collection. - Usage:: - - >>> el = fromstring('<p class="hidden large">Text</p>') - >>> classes = el.classes # or: classes = Classes(el.attrib) - >>> classes |= ['block', 'paragraph'] - >>> el.get('class') - 'hidden large block paragraph' - >>> classes.toggle('hidden') - False - >>> el.get('class') - 'large block paragraph' - >>> classes -= ('some', 'classes', 'block') - >>> el.get('class') - 'large paragraph' - """ - def __init__(self, attributes): - self._attributes = attributes - self._get_class_value = partial(attributes.get, 'class', '') - - def add(self, value): - """ - Add a class. - - This has no effect if the class is already present. - """ - if not value or re.search(r'\s', value): - raise ValueError("Invalid class name: %r" % value) - classes = self._get_class_value().split() - if value in classes: - return - classes.append(value) - self._attributes['class'] = ' '.join(classes) - - def discard(self, value): - """ - Remove a class if it is currently present. - - If the class is not present, do nothing. - """ - if not value or re.search(r'\s', value): - raise ValueError("Invalid class name: %r" % value) - classes = [name for name in self._get_class_value().split() - if name != value] - if classes: - self._attributes['class'] = ' '.join(classes) - elif 'class' in self._attributes: - del self._attributes['class'] - - def remove(self, value): - """ - Remove a class; it must currently be present. - - If the class is not present, raise a KeyError. - """ - if not value or re.search(r'\s', value): - raise ValueError("Invalid class name: %r" % value) - super(Classes, self).remove(value) - - def __contains__(self, name): - classes = self._get_class_value() - return name in classes and name in classes.split() - - def __iter__(self): - return iter(self._get_class_value().split()) - - def __len__(self): - return len(self._get_class_value().split()) - - # non-standard methods - - def update(self, values): - """ - Add all names from 'values'. - """ - classes = self._get_class_value().split() - extended = False - for value in values: - if value not in classes: - classes.append(value) - extended = True - if extended: - self._attributes['class'] = ' '.join(classes) - - def toggle(self, value): - """ - Add a class name if it isn't there yet, or remove it if it exists. - - Returns true if the class was added (and is now enabled) and - false if it was removed (and is now disabled). - """ - if not value or re.search(r'\s', value): - raise ValueError("Invalid class name: %r" % value) - classes = self._get_class_value().split() - try: - classes.remove(value) - enabled = False - except ValueError: - classes.append(value) - enabled = True - if classes: - self._attributes['class'] = ' '.join(classes) - else: - del self._attributes['class'] - return enabled - - -class HtmlMixin(object): - - def set(self, key, value=None): - """set(self, key, value=None) - - Sets an element attribute. If no value is provided, or if the value is None, - creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" - for ``form.set('novalidate')``. - """ - super(HtmlMixin, self).set(key, value) - - @property - def classes(self): - """ - A set-like wrapper around the 'class' attribute. - """ - return Classes(self.attrib) - - @classes.setter - def classes(self, classes): - assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. - value = classes._get_class_value() - if value: - self.set('class', value) - elif self.get('class') is not None: - del self.attrib['class'] - - @property - def base_url(self): - """ - Returns the base URL, given when the page was parsed. - - Use with ``urlparse.urljoin(el.base_url, href)`` to get - absolute URLs. - """ - return self.getroottree().docinfo.URL - - @property - def forms(self): - """ - Return a list of all the forms - """ - return _forms_xpath(self) - - @property - def body(self): - """ - Return the <body> element. Can be called from a child element - to get the document's head. - """ - return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] - - @property - def head(self): - """ - Returns the <head> element. Can be called from a child - element to get the document's head. - """ - return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] - - @property - def label(self): - """ - Get or set any <label> element associated with this element. - """ - id = self.get('id') - if not id: - return None - result = _label_xpath(self, id=id) - if not result: - return None - else: - return result[0] - - @label.setter - def label(self, label): - id = self.get('id') - if not id: - raise TypeError( - "You cannot set a label for an element (%r) that has no id" - % self) - if _nons(label.tag) != 'label': - raise TypeError( - "You can only assign label to a label element (not %r)" - % label) - label.set('for', id) - - @label.deleter - def label(self): - label = self.label - if label is not None: - del label.attrib['for'] - - def drop_tree(self): - """ - Removes this element from the tree, including its children and - text. The tail text is joined to the previous element or - parent. - """ - parent = self.getparent() - assert parent is not None - if self.tail: - previous = self.getprevious() - if previous is None: - parent.text = (parent.text or '') + self.tail - else: - previous.tail = (previous.tail or '') + self.tail - parent.remove(self) - - def drop_tag(self): - """ - Remove the tag, but not its children or text. The children and text - are merged into the parent. - - Example:: - - >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') - >>> h.find('.//b').drop_tag() - >>> print(tostring(h, encoding='unicode')) - <div>Hello World!</div> - """ - parent = self.getparent() - assert parent is not None - previous = self.getprevious() - if self.text and isinstance(self.tag, basestring): - # not a Comment, etc. - if previous is None: - parent.text = (parent.text or '') + self.text - else: - previous.tail = (previous.tail or '') + self.text - if self.tail: - if len(self): - last = self[-1] - last.tail = (last.tail or '') + self.tail - elif previous is None: - parent.text = (parent.text or '') + self.tail - else: - previous.tail = (previous.tail or '') + self.tail - index = parent.index(self) - parent[index:index+1] = self[:] - - def find_rel_links(self, rel): - """ - Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. - """ - rel = rel.lower() - return [el for el in _rel_links_xpath(self) - if el.get('rel').lower() == rel] - - def find_class(self, class_name): - """ - Find any elements with the given class name. - """ - return _class_xpath(self, class_name=class_name) - - def get_element_by_id(self, id, *default): - """ - Get the first element in a document with the given id. If none is - found, return the default argument if provided or raise KeyError - otherwise. - - Note that there can be more than one element with the same id, - and this isn't uncommon in HTML documents found in the wild. - Browsers return only the first match, and this function does - the same. - """ - try: - # FIXME: should this check for multiple matches? - # browsers just return the first one - return _id_xpath(self, id=id)[0] - except IndexError: - if default: - return default[0] - else: - raise KeyError(id) - - def text_content(self): - """ - Return the text content of the tag (and the text in any children). - """ - return _collect_string_content(self) - - def cssselect(self, expr, translator='html'): - """ - Run the CSS expression on this element and its children, - returning a list of the results. - - Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) - -- note that pre-compiling the expression can provide a substantial - speedup. - """ - # Do the import here to make the dependency optional. - from lxml.cssselect import CSSSelector - return CSSSelector(expr, translator=translator)(self) - - ######################################## - ## Link functions - ######################################## - - def make_links_absolute(self, base_url=None, resolve_base_href=True, - handle_failures=None): - """ - Make all links in the document absolute, given the - ``base_url`` for the document (the full URL where the document - came from), or if no ``base_url`` is given, then the ``.base_url`` - of the document. - - If ``resolve_base_href`` is true, then any ``<base href>`` - tags in the document are used *and* removed from the document. - If it is false then any such tag is ignored. - - If ``handle_failures`` is None (default), a failure to process - a URL will abort the processing. If set to 'ignore', errors - are ignored. If set to 'discard', failing URLs will be removed. - """ - if base_url is None: - base_url = self.base_url - if base_url is None: - raise TypeError( - "No base_url given, and the document has no base_url") - if resolve_base_href: - self.resolve_base_href() - - if handle_failures == 'ignore': - def link_repl(href): - try: - return urljoin(base_url, href) - except ValueError: - return href - elif handle_failures == 'discard': - def link_repl(href): - try: - return urljoin(base_url, href) - except ValueError: - return None - elif handle_failures is None: - def link_repl(href): - return urljoin(base_url, href) - else: - raise ValueError( - "unexpected value for handle_failures: %r" % handle_failures) - - self.rewrite_links(link_repl) - - def resolve_base_href(self, handle_failures=None): - """ - Find any ``<base href>`` tag in the document, and apply its - values to all links found in the document. Also remove the - tag once it has been applied. - - If ``handle_failures`` is None (default), a failure to process - a URL will abort the processing. If set to 'ignore', errors - are ignored. If set to 'discard', failing URLs will be removed. - """ - base_href = None - basetags = self.xpath('//base[@href]|//x:base[@href]', - namespaces={'x': XHTML_NAMESPACE}) - for b in basetags: - base_href = b.get('href') - b.drop_tree() - if not base_href: - return - self.make_links_absolute(base_href, resolve_base_href=False, - handle_failures=handle_failures) - - def iterlinks(self): - """ - Yield (element, attribute, link, pos), where attribute may be None - (indicating the link is in the text). ``pos`` is the position - where the link occurs; often 0, but sometimes something else in - the case of links in stylesheets or style tags. - - Note: <base href> is *not* taken into account in any way. The - link you get is exactly the link in the document. - - Note: multiple links inside of a single text string or - attribute value are returned in reversed order. This makes it - possible to replace or delete them from the text string value - based on their reported text positions. Otherwise, a - modification at one text position can change the positions of - links reported later on. - """ - link_attrs = defs.link_attrs - for el in self.iter(etree.Element): - attribs = el.attrib - tag = _nons(el.tag) - if tag == 'object': - codebase = None - ## <object> tags have attributes that are relative to - ## codebase - if 'codebase' in attribs: - codebase = el.get('codebase') - yield (el, 'codebase', codebase, 0) - for attrib in ('classid', 'data'): - if attrib in attribs: - value = el.get(attrib) - if codebase is not None: - value = urljoin(codebase, value) - yield (el, attrib, value, 0) - if 'archive' in attribs: - for match in _archive_re.finditer(el.get('archive')): - value = match.group(0) - if codebase is not None: - value = urljoin(codebase, value) - yield (el, 'archive', value, match.start()) - else: - for attrib in link_attrs: - if attrib in attribs: - yield (el, attrib, attribs[attrib], 0) - if tag == 'meta': - http_equiv = attribs.get('http-equiv', '').lower() - if http_equiv == 'refresh': - content = attribs.get('content', '') - match = _parse_meta_refresh_url(content) - url = (match.group('url') if match else content).strip() - # unexpected content means the redirect won't work, but we might - # as well be permissive and return the entire string. - if url: - url, pos = _unquote_match( - url, match.start('url') if match else content.find(url)) - yield (el, 'content', url, pos) - elif tag == 'param': - valuetype = el.get('valuetype') or '' - if valuetype.lower() == 'ref': - ## FIXME: while it's fine we *find* this link, - ## according to the spec we aren't supposed to - ## actually change the value, including resolving - ## it. It can also still be a link, even if it - ## doesn't have a valuetype="ref" (which seems to be the norm) - ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype - yield (el, 'value', el.get('value'), 0) - elif tag == 'style' and el.text: - urls = [ - # (start_pos, url) - _unquote_match(match.group(1), match.start(1))[::-1] - for match in _iter_css_urls(el.text) - ] + [ - (match.start(1), match.group(1)) - for match in _iter_css_imports(el.text) - ] - if urls: - # sort by start pos to bring both match sets back into order - # and reverse the list to report correct positions despite - # modifications - urls.sort(reverse=True) - for start, url in urls: - yield (el, None, url, start) - if 'style' in attribs: - urls = list(_iter_css_urls(attribs['style'])) - if urls: - # return in reversed order to simplify in-place modifications - for match in urls[::-1]: - url, start = _unquote_match(match.group(1), match.start(1)) - yield (el, 'style', url, start) - - def rewrite_links(self, link_repl_func, resolve_base_href=True, - base_href=None): - """ - Rewrite all the links in the document. For each link - ``link_repl_func(link)`` will be called, and the return value - will replace the old link. - - Note that links may not be absolute (unless you first called - ``make_links_absolute()``), and may be internal (e.g., - ``'#anchor'``). They can also be values like - ``'mailto:email'`` or ``'javascript:expr'``. - - If you give ``base_href`` then all links passed to - ``link_repl_func()`` will take that into account. - - If the ``link_repl_func`` returns None, the attribute or - tag text will be removed completely. - """ - if base_href is not None: - # FIXME: this can be done in one pass with a wrapper - # around link_repl_func - self.make_links_absolute( - base_href, resolve_base_href=resolve_base_href) - elif resolve_base_href: - self.resolve_base_href() - - for el, attrib, link, pos in self.iterlinks(): - new_link = link_repl_func(link.strip()) - if new_link == link: - continue - if new_link is None: - # Remove the attribute or element content - if attrib is None: - el.text = '' - else: - del el.attrib[attrib] - continue - - if attrib is None: - new = el.text[:pos] + new_link + el.text[pos+len(link):] - el.text = new - else: - cur = el.get(attrib) - if not pos and len(cur) == len(link): - new = new_link # most common case - else: - new = cur[:pos] + new_link + cur[pos+len(link):] - el.set(attrib, new) - - -class _MethodFunc(object): - """ - An object that represents a method on an element as a function; - the function takes either an element or an HTML string. It - returns whatever the function normally returns, or if the function - works in-place (and so returns None) it returns a serialized form - of the resulting document. - """ - def __init__(self, name, copy=False, source_class=HtmlMixin): - self.name = name - self.copy = copy - self.__doc__ = getattr(source_class, self.name).__doc__ - def __call__(self, doc, *args, **kw): - result_type = type(doc) - if isinstance(doc, basestring): - if 'copy' in kw: - raise TypeError( - "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) - doc = fromstring(doc, **kw) - else: - if 'copy' in kw: - make_a_copy = kw.pop('copy') - else: - make_a_copy = self.copy - if make_a_copy: - doc = copy.deepcopy(doc) - meth = getattr(doc, self.name) - result = meth(*args, **kw) - # FIXME: this None test is a bit sloppy - if result is None: - # Then return what we got in - return _transform_result(result_type, doc) - else: - return result - - -find_rel_links = _MethodFunc('find_rel_links', copy=False) -find_class = _MethodFunc('find_class', copy=False) -make_links_absolute = _MethodFunc('make_links_absolute', copy=True) -resolve_base_href = _MethodFunc('resolve_base_href', copy=True) -iterlinks = _MethodFunc('iterlinks', copy=False) -rewrite_links = _MethodFunc('rewrite_links', copy=True) - - -class HtmlComment(HtmlMixin, etree.CommentBase): - pass - - -class HtmlElement(HtmlMixin, etree.ElementBase): - pass - - -class HtmlProcessingInstruction(HtmlMixin, etree.PIBase): - pass - - -class HtmlEntity(HtmlMixin, etree.EntityBase): - pass - - -class HtmlElementClassLookup(etree.CustomElementClassLookup): - """A lookup scheme for HTML Element classes. - - To create a lookup instance with different Element classes, pass a tag - name mapping of Element classes in the ``classes`` keyword argument and/or - a tag name mapping of Mixin classes in the ``mixins`` keyword argument. - The special key '*' denotes a Mixin class that should be mixed into all - Element classes. - """ - _default_element_classes = {} - - def __init__(self, classes=None, mixins=None): - etree.CustomElementClassLookup.__init__(self) - if classes is None: - classes = self._default_element_classes.copy() - if mixins: - mixers = {} - for name, value in mixins: - if name == '*': - for n in classes.keys(): - mixers.setdefault(n, []).append(value) - else: - mixers.setdefault(name, []).append(value) - for name, mix_bases in mixers.items(): - cur = classes.get(name, HtmlElement) - bases = tuple(mix_bases + [cur]) - classes[name] = type(cur.__name__, bases, {}) - self._element_classes = classes - - def lookup(self, node_type, document, namespace, name): - if node_type == 'element': - return self._element_classes.get(name.lower(), HtmlElement) - elif node_type == 'comment': - return HtmlComment - elif node_type == 'PI': - return HtmlProcessingInstruction - elif node_type == 'entity': - return HtmlEntity - # Otherwise normal lookup - return None - - -################################################################################ -# parsing -################################################################################ - -_looks_like_full_html_unicode = re.compile( - unicode(r'^\s*<(?:html|!doctype)'), re.I).match -_looks_like_full_html_bytes = re.compile( - r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match - - -def document_fromstring(html, parser=None, ensure_head_body=False, **kw): - if parser is None: - parser = html_parser - value = etree.fromstring(html, parser, **kw) - if value is None: - raise etree.ParserError( - "Document is empty") - if ensure_head_body and value.find('head') is None: - value.insert(0, Element('head')) - if ensure_head_body and value.find('body') is None: - value.append(Element('body')) - return value - - -def fragments_fromstring(html, no_leading_text=False, base_url=None, - parser=None, **kw): - """Parses several HTML elements, returning a list of elements. - - The first item in the list may be a string. - If no_leading_text is true, then it will be an error if there is - leading text, and it will always be a list of only elements. - - base_url will set the document's base_url attribute - (and the tree's docinfo.URL). - """ - if parser is None: - parser = html_parser - # FIXME: check what happens when you give html with a body, head, etc. - if isinstance(html, bytes): - if not _looks_like_full_html_bytes(html): - # can't use %-formatting in early Py3 versions - html = ('<html><body>'.encode('ascii') + html + - '</body></html>'.encode('ascii')) - else: - if not _looks_like_full_html_unicode(html): - html = '<html><body>%s</body></html>' % html - doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) - assert _nons(doc.tag) == 'html' - bodies = [e for e in doc if _nons(e.tag) == 'body'] - assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) - body = bodies[0] - elements = [] - if no_leading_text and body.text and body.text.strip(): - raise etree.ParserError( - "There is leading text: %r" % body.text) - if body.text and body.text.strip(): - elements.append(body.text) - elements.extend(body) - # FIXME: removing the reference to the parent artificial document - # would be nice - return elements - - -def fragment_fromstring(html, create_parent=False, base_url=None, - parser=None, **kw): - """ - Parses a single HTML element; it is an error if there is more than - one element, or if anything but whitespace precedes or follows the - element. - - If ``create_parent`` is true (or is a tag name) then a parent node - will be created to encapsulate the HTML in a single element. In this - case, leading or trailing text is also allowed, as are multiple elements - as result of the parsing. - - Passing a ``base_url`` will set the document's ``base_url`` attribute - (and the tree's docinfo.URL). - """ - if parser is None: - parser = html_parser - - accept_leading_text = bool(create_parent) - - elements = fragments_fromstring( - html, parser=parser, no_leading_text=not accept_leading_text, - base_url=base_url, **kw) - - if create_parent: - if not isinstance(create_parent, basestring): - create_parent = 'div' - new_root = Element(create_parent) - if elements: - if isinstance(elements[0], basestring): - new_root.text = elements[0] - del elements[0] - new_root.extend(elements) - return new_root - - if not elements: - raise etree.ParserError('No elements found') - if len(elements) > 1: - raise etree.ParserError( - "Multiple elements found (%s)" - % ', '.join([_element_name(e) for e in elements])) - el = elements[0] - if el.tail and el.tail.strip(): - raise etree.ParserError( - "Element followed by text: %r" % el.tail) - el.tail = None - return el - - -def fromstring(html, base_url=None, parser=None, **kw): - """ - Parse the html, returning a single element/document. - - This tries to minimally parse the chunk of text, without knowing if it - is a fragment or a document. - - base_url will set the document's base_url attribute (and the tree's docinfo.URL) - """ - if parser is None: - parser = html_parser - if isinstance(html, bytes): - is_full_html = _looks_like_full_html_bytes(html) - else: - is_full_html = _looks_like_full_html_unicode(html) - doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) - if is_full_html: - return doc - # otherwise, lets parse it out... - bodies = doc.findall('body') - if not bodies: - bodies = doc.findall('{%s}body' % XHTML_NAMESPACE) - if bodies: - body = bodies[0] - if len(bodies) > 1: - # Somehow there are multiple bodies, which is bad, but just - # smash them into one body - for other_body in bodies[1:]: - if other_body.text: - if len(body): - body[-1].tail = (body[-1].tail or '') + other_body.text - else: - body.text = (body.text or '') + other_body.text - body.extend(other_body) - # We'll ignore tail - # I guess we are ignoring attributes too - other_body.drop_tree() - else: - body = None - heads = doc.findall('head') - if not heads: - heads = doc.findall('{%s}head' % XHTML_NAMESPACE) - if heads: - # Well, we have some sort of structure, so lets keep it all - head = heads[0] - if len(heads) > 1: - for other_head in heads[1:]: - head.extend(other_head) - # We don't care about text or tail in a head - other_head.drop_tree() - return doc - if body is None: - return doc - if (len(body) == 1 and (not body.text or not body.text.strip()) - and (not body[-1].tail or not body[-1].tail.strip())): - # The body has just one element, so it was probably a single - # element passed in - return body[0] - # Now we have a body which represents a bunch of tags which have the - # content that was passed in. We will create a fake container, which - # is the body tag, except <body> implies too much structure. - if _contains_block_level_tag(body): - body.tag = 'div' - else: - body.tag = 'span' - return body - - -def parse(filename_or_url, parser=None, base_url=None, **kw): - """ - Parse a filename, URL, or file-like object into an HTML document - tree. Note: this returns a tree, not an element. Use - ``parse(...).getroot()`` to get the document root. - - You can override the base URL with the ``base_url`` keyword. This - is most useful when parsing from a file-like object. - """ - if parser is None: - parser = html_parser - return etree.parse(filename_or_url, parser, base_url=base_url, **kw) - - -def _contains_block_level_tag(el): - # FIXME: I could do this with XPath, but would that just be - # unnecessarily slow? - for el in el.iter(etree.Element): - if _nons(el.tag) in defs.block_tags: - return True - return False - - -def _element_name(el): - if isinstance(el, etree.CommentBase): - return 'comment' - elif isinstance(el, basestring): - return 'string' - else: - return _nons(el.tag) - - -################################################################################ -# form handling -################################################################################ - -class FormElement(HtmlElement): - """ - Represents a <form> element. - """ - - @property - def inputs(self): - """ - Returns an accessor for all the input elements in the form. - - See `InputGetter` for more information about the object. - """ - return InputGetter(self) - - @property - def fields(self): - """ - Dictionary-like object that represents all the fields in this - form. You can set values in this dictionary to effect the - form. - """ - return FieldsDict(self.inputs) - - @fields.setter - def fields(self, value): - fields = self.fields - prev_keys = fields.keys() - for key, value in value.items(): - if key in prev_keys: - prev_keys.remove(key) - fields[key] = value - for key in prev_keys: - if key is None: - # Case of an unnamed input; these aren't really - # expressed in form_values() anyway. - continue - fields[key] = None - - def _name(self): - if self.get('name'): - return self.get('name') - elif self.get('id'): - return '#' + self.get('id') - iter_tags = self.body.iter - forms = list(iter_tags('form')) - if not forms: - forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE)) - return str(forms.index(self)) - - def form_values(self): - """ - Return a list of tuples of the field values for the form. - This is suitable to be passed to ``urllib.urlencode()``. - """ - results = [] - for el in self.inputs: - name = el.name - if not name or 'disabled' in el.attrib: - continue - tag = _nons(el.tag) - if tag == 'textarea': - results.append((name, el.value)) - elif tag == 'select': - value = el.value - if el.multiple: - for v in value: - results.append((name, v)) - elif value is not None: - results.append((name, el.value)) - else: - assert tag == 'input', ( - "Unexpected tag: %r" % el) - if el.checkable and not el.checked: - continue - if el.type in ('submit', 'image', 'reset', 'file'): - continue - value = el.value - if value is not None: - results.append((name, el.value)) - return results - - @property - def action(self): - """ - Get/set the form's ``action`` attribute. - """ - base_url = self.base_url - action = self.get('action') - if base_url and action is not None: - return urljoin(base_url, action) - else: - return action - - @action.setter - def action(self, value): - self.set('action', value) - - @action.deleter - def action(self): - attrib = self.attrib - if 'action' in attrib: - del attrib['action'] - - @property - def method(self): - """ - Get/set the form's method. Always returns a capitalized - string, and defaults to ``'GET'`` - """ - return self.get('method', 'GET').upper() - - @method.setter - def method(self, value): - self.set('method', value.upper()) - - -HtmlElementClassLookup._default_element_classes['form'] = FormElement - - -def submit_form(form, extra_values=None, open_http=None): - """ - Helper function to submit a form. Returns a file-like object, as from - ``urllib.urlopen()``. This object also has a ``.geturl()`` function, - which shows the URL if there were any redirects. - - You can use this like:: - - form = doc.forms[0] - form.inputs['foo'].value = 'bar' # etc - response = form.submit() - doc = parse(response) - doc.make_links_absolute(response.geturl()) - - To change the HTTP requester, pass a function as ``open_http`` keyword - argument that opens the URL for you. The function must have the following - signature:: - - open_http(method, URL, values) - - The action is one of 'GET' or 'POST', the URL is the target URL as a - string, and the values are a sequence of ``(name, value)`` tuples with the - form data. - """ - values = form.form_values() - if extra_values: - if hasattr(extra_values, 'items'): - extra_values = extra_values.items() - values.extend(extra_values) - if open_http is None: - open_http = open_http_urllib - if form.action: - url = form.action - else: - url = form.base_url - return open_http(form.method, url, values) - - -def open_http_urllib(method, url, values): - if not url: - raise ValueError("cannot submit, no URL provided") - ## FIXME: should test that it's not a relative URL or something - try: - from urllib import urlencode, urlopen - except ImportError: # Python 3 - from urllib.request import urlopen - from urllib.parse import urlencode - if method == 'GET': - if '?' in url: - url += '&' - else: - url += '?' - url += urlencode(values) - data = None - else: - data = urlencode(values) - if not isinstance(data, bytes): - data = data.encode('ASCII') - return urlopen(url, data) - - -class FieldsDict(MutableMapping): - - def __init__(self, inputs): - self.inputs = inputs - def __getitem__(self, item): - return self.inputs[item].value - def __setitem__(self, item, value): - self.inputs[item].value = value - def __delitem__(self, item): - raise KeyError( - "You cannot remove keys from ElementDict") - def keys(self): - return self.inputs.keys() - def __contains__(self, item): - return item in self.inputs - def __iter__(self): - return iter(self.inputs.keys()) - def __len__(self): - return len(self.inputs) - - def __repr__(self): - return '<%s for form %s>' % ( - self.__class__.__name__, - self.inputs.form._name()) - - -class InputGetter(object): - - """ - An accessor that represents all the input fields in a form. - - You can get fields by name from this, with - ``form.inputs['field_name']``. If there are a set of checkboxes - with the same name, they are returned as a list (a `CheckboxGroup` - which also allows value setting). Radio inputs are handled - similarly. Use ``.keys()`` and ``.items()`` to process all fields - in this way. - - You can also iterate over this to get all input elements. This - won't return the same thing as if you get all the names, as - checkboxes and radio elements are returned individually. - """ - - def __init__(self, form): - self.form = form - - def __repr__(self): - return '<%s for form %s>' % ( - self.__class__.__name__, - self.form._name()) - - ## FIXME: there should be more methods, and it's unclear if this is - ## a dictionary-like object or list-like object - - def __getitem__(self, name): - fields = [field for field in self if field.name == name] - if not fields: - raise KeyError("No input element with the name %r" % name) - - input_type = fields[0].get('type') - if input_type == 'radio' and len(fields) > 1: - group = RadioGroup(fields) - group.name = name - return group - elif input_type == 'checkbox' and len(fields) > 1: - group = CheckboxGroup(fields) - group.name = name - return group - else: - # I don't like throwing away elements like this - return fields[0] - - def __contains__(self, name): - for field in self: - if field.name == name: - return True - return False - - def keys(self): - """ - Returns all unique field names, in document order. - - :return: A list of all unique field names. - """ - names = [] - seen = {None} - for el in self: - name = el.name - if name not in seen: - names.append(name) - seen.add(name) - return names - - def items(self): - """ - Returns all fields with their names, similar to dict.items(). - - :return: A list of (name, field) tuples. - """ - items = [] - seen = set() - for el in self: - name = el.name - if name not in seen: - seen.add(name) - items.append((name, self[name])) - return items - - def __iter__(self): - return self.form.iter('select', 'input', 'textarea') - - def __len__(self): - return sum(1 for _ in self) - - -class InputMixin(object): - """ - Mix-in for all input elements (input, select, and textarea) - """ - @property - def name(self): - """ - Get/set the name of the element - """ - return self.get('name') - - @name.setter - def name(self, value): - self.set('name', value) - - @name.deleter - def name(self): - attrib = self.attrib - if 'name' in attrib: - del attrib['name'] - - def __repr__(self): - type_name = getattr(self, 'type', None) - if type_name: - type_name = ' type=%r' % type_name - else: - type_name = '' - return '<%s %x name=%r%s>' % ( - self.__class__.__name__, id(self), self.name, type_name) - - -class TextareaElement(InputMixin, HtmlElement): - """ - ``<textarea>`` element. You can get the name with ``.name`` and - get/set the value with ``.value`` - """ - @property - def value(self): - """ - Get/set the value (which is the contents of this element) - """ - content = self.text or '' - if self.tag.startswith("{%s}" % XHTML_NAMESPACE): - serialisation_method = 'xml' - else: - serialisation_method = 'html' - for el in self: - # it's rare that we actually get here, so let's not use ''.join() - content += etree.tostring( - el, method=serialisation_method, encoding='unicode') - return content - - @value.setter - def value(self, value): - del self[:] - self.text = value - - @value.deleter - def value(self): - self.text = '' - del self[:] - - -HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement - - -class SelectElement(InputMixin, HtmlElement): - """ - ``<select>`` element. You can get the name with ``.name``. - - ``.value`` will be the value of the selected option, unless this - is a multi-select element (``<select multiple>``), in which case - it will be a set-like object. In either case ``.value_options`` - gives the possible values. - - The boolean attribute ``.multiple`` shows if this is a - multi-select. - """ - @property - def value(self): - """ - Get/set the value of this select (the selected option). - - If this is a multi-select, this is a set-like object that - represents all the selected options. - """ - if self.multiple: - return MultipleSelectOptions(self) - options = _options_xpath(self) - - try: - selected_option = next(el for el in reversed(options) if el.get('selected') is not None) - except StopIteration: - try: - selected_option = next(el for el in options if el.get('disabled') is None) - except StopIteration: - return None - value = selected_option.get('value') - if value is None: - value = (selected_option.text or '').strip() - return value - - @value.setter - def value(self, value): - if self.multiple: - if isinstance(value, basestring): - raise TypeError("You must pass in a sequence") - values = self.value - values.clear() - values.update(value) - return - checked_option = None - if value is not None: - for el in _options_xpath(self): - opt_value = el.get('value') - if opt_value is None: - opt_value = (el.text or '').strip() - if opt_value == value: - checked_option = el - break - else: - raise ValueError( - "There is no option with the value of %r" % value) - for el in _options_xpath(self): - if 'selected' in el.attrib: - del el.attrib['selected'] - if checked_option is not None: - checked_option.set('selected', '') - - @value.deleter - def value(self): - # FIXME: should del be allowed at all? - if self.multiple: - self.value.clear() - else: - self.value = None - - @property - def value_options(self): - """ - All the possible values this select can have (the ``value`` - attribute of all the ``<option>`` elements. - """ - options = [] - for el in _options_xpath(self): - value = el.get('value') - if value is None: - value = (el.text or '').strip() - options.append(value) - return options - - @property - def multiple(self): - """ - Boolean attribute: is there a ``multiple`` attribute on this element. - """ - return 'multiple' in self.attrib - - @multiple.setter - def multiple(self, value): - if value: - self.set('multiple', '') - elif 'multiple' in self.attrib: - del self.attrib['multiple'] - - -HtmlElementClassLookup._default_element_classes['select'] = SelectElement - - -class MultipleSelectOptions(SetMixin): - """ - Represents all the selected options in a ``<select multiple>`` element. - - You can add to this set-like option to select an option, or remove - to unselect the option. - """ - - def __init__(self, select): - self.select = select - - @property - def options(self): - """ - Iterator of all the ``<option>`` elements. - """ - return iter(_options_xpath(self.select)) - - def __iter__(self): - for option in self.options: - if 'selected' in option.attrib: - opt_value = option.get('value') - if opt_value is None: - opt_value = (option.text or '').strip() - yield opt_value - - def add(self, item): - for option in self.options: - opt_value = option.get('value') - if opt_value is None: - opt_value = (option.text or '').strip() - if opt_value == item: - option.set('selected', '') - break - else: - raise ValueError( - "There is no option with the value %r" % item) - - def remove(self, item): - for option in self.options: - opt_value = option.get('value') - if opt_value is None: - opt_value = (option.text or '').strip() - if opt_value == item: - if 'selected' in option.attrib: - del option.attrib['selected'] - else: - raise ValueError( - "The option %r is not currently selected" % item) - break - else: - raise ValueError( - "There is not option with the value %r" % item) - - def __repr__(self): - return '<%s {%s} for select name=%r>' % ( - self.__class__.__name__, - ', '.join([repr(v) for v in self]), - self.select.name) - - -class RadioGroup(list): - """ - This object represents several ``<input type=radio>`` elements - that have the same name. - - You can use this like a list, but also use the property - ``.value`` to check/uncheck inputs. Also you can use - ``.value_options`` to get the possible values. - """ - @property - def value(self): - """ - Get/set the value, which checks the radio with that value (and - unchecks any other value). - """ - for el in self: - if 'checked' in el.attrib: - return el.get('value') - return None - - @value.setter - def value(self, value): - checked_option = None - if value is not None: - for el in self: - if el.get('value') == value: - checked_option = el - break - else: - raise ValueError("There is no radio input with the value %r" % value) - for el in self: - if 'checked' in el.attrib: - del el.attrib['checked'] - if checked_option is not None: - checked_option.set('checked', '') - - @value.deleter - def value(self): - self.value = None - - @property - def value_options(self): - """ - Returns a list of all the possible values. - """ - return [el.get('value') for el in self] - - def __repr__(self): - return '%s(%s)' % ( - self.__class__.__name__, - list.__repr__(self)) - - -class CheckboxGroup(list): - """ - Represents a group of checkboxes (``<input type=checkbox>``) that - have the same name. - - In addition to using this like a list, the ``.value`` attribute - returns a set-like object that you can add to or remove from to - check and uncheck checkboxes. You can also use ``.value_options`` - to get the possible values. - """ - @property - def value(self): - """ - Return a set-like object that can be modified to check or - uncheck individual checkboxes according to their value. - """ - return CheckboxValues(self) - - @value.setter - def value(self, value): - values = self.value - values.clear() - if not hasattr(value, '__iter__'): - raise ValueError( - "A CheckboxGroup (name=%r) must be set to a sequence (not %r)" - % (self[0].name, value)) - values.update(value) - - @value.deleter - def value(self): - self.value.clear() - - @property - def value_options(self): - """ - Returns a list of all the possible values. - """ - return [el.get('value') for el in self] - - def __repr__(self): - return '%s(%s)' % ( - self.__class__.__name__, list.__repr__(self)) - - -class CheckboxValues(SetMixin): - """ - Represents the values of the checked checkboxes in a group of - checkboxes with the same name. - """ - - def __init__(self, group): - self.group = group - - def __iter__(self): - return iter([ - el.get('value') - for el in self.group - if 'checked' in el.attrib]) - - def add(self, value): - for el in self.group: - if el.get('value') == value: - el.set('checked', '') - break - else: - raise KeyError("No checkbox with value %r" % value) - - def remove(self, value): - for el in self.group: - if el.get('value') == value: - if 'checked' in el.attrib: - del el.attrib['checked'] - else: - raise KeyError( - "The checkbox with value %r was already unchecked" % value) - break - else: - raise KeyError( - "No checkbox with value %r" % value) - - def __repr__(self): - return '<%s {%s} for checkboxes name=%r>' % ( - self.__class__.__name__, - ', '.join([repr(v) for v in self]), - self.group.name) - - -class InputElement(InputMixin, HtmlElement): - """ - Represents an ``<input>`` element. - - You can get the type with ``.type`` (which is lower-cased and - defaults to ``'text'``). - - Also you can get and set the value with ``.value`` - - Checkboxes and radios have the attribute ``input.checkable == - True`` (for all others it is false) and a boolean attribute - ``.checked``. - - """ - - ## FIXME: I'm a little uncomfortable with the use of .checked - @property - def value(self): - """ - Get/set the value of this element, using the ``value`` attribute. - - Also, if this is a checkbox and it has no value, this defaults - to ``'on'``. If it is a checkbox or radio that is not - checked, this returns None. - """ - if self.checkable: - if self.checked: - return self.get('value') or 'on' - else: - return None - return self.get('value') - - @value.setter - def value(self, value): - if self.checkable: - if not value: - self.checked = False - else: - self.checked = True - if isinstance(value, basestring): - self.set('value', value) - else: - self.set('value', value) - - @value.deleter - def value(self): - if self.checkable: - self.checked = False - else: - if 'value' in self.attrib: - del self.attrib['value'] - - @property - def type(self): - """ - Return the type of this element (using the type attribute). - """ - return self.get('type', 'text').lower() - - @type.setter - def type(self, value): - self.set('type', value) - - @property - def checkable(self): - """ - Boolean: can this element be checked? - """ - return self.type in ('checkbox', 'radio') - - @property - def checked(self): - """ - Boolean attribute to get/set the presence of the ``checked`` - attribute. - - You can only use this on checkable input types. - """ - if not self.checkable: - raise AttributeError('Not a checkable input type') - return 'checked' in self.attrib - - @checked.setter - def checked(self, value): - if not self.checkable: - raise AttributeError('Not a checkable input type') - if value: - self.set('checked', '') - else: - attrib = self.attrib - if 'checked' in attrib: - del attrib['checked'] - - -HtmlElementClassLookup._default_element_classes['input'] = InputElement - - -class LabelElement(HtmlElement): - """ - Represents a ``<label>`` element. - - Label elements are linked to other elements with their ``for`` - attribute. You can access this element with ``label.for_element``. - """ - @property - def for_element(self): - """ - Get/set the element this label points to. Return None if it - can't be found. - """ - id = self.get('for') - if not id: - return None - return self.body.get_element_by_id(id) - - @for_element.setter - def for_element(self, other): - id = other.get('id') - if not id: - raise TypeError( - "Element %r has no id attribute" % other) - self.set('for', id) - - @for_element.deleter - def for_element(self): - attrib = self.attrib - if 'id' in attrib: - del attrib['id'] - - -HtmlElementClassLookup._default_element_classes['label'] = LabelElement - - -############################################################ -## Serialization -############################################################ - -def html_to_xhtml(html): - """Convert all tags in an HTML tree to XHTML by moving them to the - XHTML namespace. - """ - try: - html = html.getroot() - except AttributeError: - pass - prefix = "{%s}" % XHTML_NAMESPACE - for el in html.iter(etree.Element): - tag = el.tag - if tag[0] != '{': - el.tag = prefix + tag - - -def xhtml_to_html(xhtml): - """Convert all tags in an XHTML tree to HTML by removing their - XHTML namespace. - """ - try: - xhtml = xhtml.getroot() - except AttributeError: - pass - prefix = "{%s}" % XHTML_NAMESPACE - prefix_len = len(prefix) - for el in xhtml.iter(prefix + "*"): - el.tag = el.tag[prefix_len:] - - -# This isn't a general match, but it's a match for what libxml2 -# specifically serialises: -__str_replace_meta_content_type = re.compile( - r'<meta http-equiv="Content-Type"[^>]*>').sub -__bytes_replace_meta_content_type = re.compile( - r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub - - -def tostring(doc, pretty_print=False, include_meta_content_type=False, - encoding=None, method="html", with_tail=True, doctype=None): - """Return an HTML string representation of the document. - - Note: if include_meta_content_type is true this will create a - ``<meta http-equiv="Content-Type" ...>`` tag in the head; - regardless of the value of include_meta_content_type any existing - ``<meta http-equiv="Content-Type" ...>`` tag will be removed - - The ``encoding`` argument controls the output encoding (defaults to - ASCII, with &#...; character references for any characters outside - of ASCII). Note that you can pass the name ``'unicode'`` as - ``encoding`` argument to serialise to a Unicode string. - - The ``method`` argument defines the output method. It defaults to - 'html', but can also be 'xml' for xhtml output, or 'text' to - serialise to plain text without markup. - - To leave out the tail text of the top-level element that is being - serialised, pass ``with_tail=False``. - - The ``doctype`` option allows passing in a plain string that will - be serialised before the XML tree. Note that passing in non - well-formed content here will make the XML output non well-formed. - Also, an existing doctype in the document tree will not be removed - when serialising an ElementTree instance. - - Example:: - - >>> from lxml import html - >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>') - - >>> html.tostring(root) - b'<p>Hello<br>world!</p>' - >>> html.tostring(root, method='html') - b'<p>Hello<br>world!</p>' - - >>> html.tostring(root, method='xml') - b'<p>Hello<br/>world!</p>' - - >>> html.tostring(root, method='text') - b'Helloworld!' - - >>> html.tostring(root, method='text', encoding='unicode') - u'Helloworld!' - - >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>') - >>> html.tostring(root[0], method='text', encoding='unicode') - u'Helloworld!TAIL' - - >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False) - u'Helloworld!' - - >>> doc = html.document_fromstring('<p>Hello<br>world!</p>') - >>> html.tostring(doc, method='html', encoding='unicode') - u'<html><body><p>Hello<br>world!</p></body></html>' - - >>> print(html.tostring(doc, method='html', encoding='unicode', - ... doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"' - ... ' "http://www.w3.org/TR/html4/strict.dtd">')) - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> - <html><body><p>Hello<br>world!</p></body></html> - """ - html = etree.tostring(doc, method=method, pretty_print=pretty_print, - encoding=encoding, with_tail=with_tail, - doctype=doctype) - if method == 'html' and not include_meta_content_type: - if isinstance(html, str): - html = __str_replace_meta_content_type('', html) - else: - html = __bytes_replace_meta_content_type(bytes(), html) - return html - - -tostring.__doc__ = __fix_docstring(tostring.__doc__) - - -def open_in_browser(doc, encoding=None): - """ - Open the HTML document in a web browser, saving it to a temporary - file to open it. Note that this does not delete the file after - use. This is mainly meant for debugging. - """ - import os - import webbrowser - import tempfile - if not isinstance(doc, etree._ElementTree): - doc = etree.ElementTree(doc) - handle, fn = tempfile.mkstemp(suffix='.html') - f = os.fdopen(handle, 'wb') - try: - doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") - finally: - # we leak the file itself here, but we should at least close it - f.close() - url = 'file://' + fn.replace(os.path.sep, '/') - print(url) - webbrowser.open(url) - - -################################################################################ -# configure Element class lookup -################################################################################ - -class HTMLParser(etree.HTMLParser): - """An HTML parser that is configured to return lxml.html Element - objects. - """ - def __init__(self, **kwargs): - super(HTMLParser, self).__init__(**kwargs) - self.set_element_class_lookup(HtmlElementClassLookup()) - - -class XHTMLParser(etree.XMLParser): - """An XML parser that is configured to return lxml.html Element - objects. - - Note that this parser is not really XHTML aware unless you let it - load a DTD that declares the HTML entities. To do this, make sure - you have the XHTML DTDs installed in your catalogs, and create the - parser like this:: - - >>> parser = XHTMLParser(load_dtd=True) - - If you additionally want to validate the document, use this:: - - >>> parser = XHTMLParser(dtd_validation=True) - - For catalog support, see http://www.xmlsoft.org/catalog.html. - """ - def __init__(self, **kwargs): - super(XHTMLParser, self).__init__(**kwargs) - self.set_element_class_lookup(HtmlElementClassLookup()) - - -def Element(*args, **kw): - """Create a new HTML Element. - - This can also be used for XHTML documents. - """ - v = html_parser.makeelement(*args, **kw) - return v - - -html_parser = HTMLParser() -xhtml_parser = XHTMLParser() diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc Binary files differdeleted file mode 100644 index a378207..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc Binary files differdeleted file mode 100644 index 4bc5785..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc Binary files differdeleted file mode 100644 index fa25497..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc Binary files differdeleted file mode 100644 index b243408..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc Binary files differdeleted file mode 100644 index a2de006..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc Binary files differdeleted file mode 100644 index b915259..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc Binary files differdeleted file mode 100644 index c343b40..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc Binary files differdeleted file mode 100644 index 8dc2d4b..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc Binary files differdeleted file mode 100644 index c029ed9..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc Binary files differdeleted file mode 100644 index 049161a..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc Binary files differdeleted file mode 100644 index 6208e67..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc Binary files differdeleted file mode 100644 index 3293704..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc Binary files differdeleted file mode 100644 index d76e7dd..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py deleted file mode 100644 index e0502c0..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import absolute_import - -import optparse -import sys -import re -import os -from .diff import htmldiff - -description = """\ -""" - -parser = optparse.OptionParser( - usage="%prog [OPTIONS] FILE1 FILE2\n" - "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...", - description=description, - ) - -parser.add_option( - '-o', '--output', - metavar="FILE", - dest="output", - default="-", - help="File to write the difference to", - ) - -parser.add_option( - '-a', '--annotation', - action="store_true", - dest="annotation", - help="Do an annotation") - -def main(args=None): - if args is None: - args = sys.argv[1:] - options, args = parser.parse_args(args) - if options.annotation: - return annotate(options, args) - if len(args) != 2: - print('Error: you must give two files') - parser.print_help() - sys.exit(1) - file1, file2 = args - input1 = read_file(file1) - input2 = read_file(file2) - body1 = split_body(input1)[1] - pre, body2, post = split_body(input2) - result = htmldiff(body1, body2) - result = pre + result + post - if options.output == '-': - if not result.endswith('\n'): - result += '\n' - sys.stdout.write(result) - else: - with open(options.output, 'wb') as f: - f.write(result) - -def read_file(filename): - if filename == '-': - c = sys.stdin.read() - elif not os.path.exists(filename): - raise OSError( - "Input file %s does not exist" % filename) - else: - with open(filename, 'rb') as f: - c = f.read() - return c - -body_start_re = re.compile( - r"<body.*?>", re.I|re.S) -body_end_re = re.compile( - r"</body.*?>", re.I|re.S) - -def split_body(html): - pre = post = '' - match = body_start_re.search(html) - if match: - pre = html[:match.end()] - html = html[match.end():] - match = body_end_re.search(html) - if match: - post = html[match.start():] - html = html[:match.start()] - return pre, html, post - -def annotate(options, args): - print("Not yet implemented") - sys.exit(1) - diff --git a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py deleted file mode 100644 index 3405c20..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Legacy module - don't use in new code! - -html5lib now has its own proper implementation. - -This module implements a tree builder for html5lib that generates lxml -html element trees. This module uses camelCase as it follows the -html5lib style guide. -""" - -from html5lib.treebuilders import _base, etree as etree_builders -from lxml import html, etree - - -class DocumentType(object): - - def __init__(self, name, publicId, systemId): - self.name = name - self.publicId = publicId - self.systemId = systemId - -class Document(object): - - def __init__(self): - self._elementTree = None - self.childNodes = [] - - def appendChild(self, element): - self._elementTree.getroot().addnext(element._element) - - -class TreeBuilder(_base.TreeBuilder): - documentClass = Document - doctypeClass = DocumentType - elementClass = None - commentClass = None - fragmentClass = Document - - def __init__(self, *args, **kwargs): - html_builder = etree_builders.getETreeModule(html, fullTree=False) - etree_builder = etree_builders.getETreeModule(etree, fullTree=False) - self.elementClass = html_builder.Element - self.commentClass = etree_builder.Comment - _base.TreeBuilder.__init__(self, *args, **kwargs) - - def reset(self): - _base.TreeBuilder.reset(self) - self.rootInserted = False - self.initialComments = [] - self.doctype = None - - def getDocument(self): - return self.document._elementTree - - def getFragment(self): - fragment = [] - element = self.openElements[0]._element - if element.text: - fragment.append(element.text) - fragment.extend(element.getchildren()) - if element.tail: - fragment.append(element.tail) - return fragment - - def insertDoctype(self, name, publicId, systemId): - doctype = self.doctypeClass(name, publicId, systemId) - self.doctype = doctype - - def insertComment(self, data, parent=None): - if not self.rootInserted: - self.initialComments.append(data) - else: - _base.TreeBuilder.insertComment(self, data, parent) - - def insertRoot(self, name): - buf = [] - if self.doctype and self.doctype.name: - buf.append('<!DOCTYPE %s' % self.doctype.name) - if self.doctype.publicId is not None or self.doctype.systemId is not None: - buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId, - self.doctype.systemId)) - buf.append('>') - buf.append('<html></html>') - root = html.fromstring(''.join(buf)) - - # Append the initial comments: - for comment in self.initialComments: - root.addprevious(etree.Comment(comment)) - - # Create the root document and add the ElementTree to it - self.document = self.documentClass() - self.document._elementTree = root.getroottree() - - # Add the root element to the internal child/open data structures - root_element = self.elementClass(name) - root_element._element = root - self.document.childNodes.append(root_element) - self.openElements.append(root_element) - - self.rootInserted = True diff --git a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py deleted file mode 100644 index c99738e..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py +++ /dev/null @@ -1,56 +0,0 @@ -try: - from collections.abc import MutableSet -except ImportError: - from collections import MutableSet - - -class SetMixin(MutableSet): - - """ - Mix-in for sets. You must define __iter__, add, remove - """ - - def __len__(self): - length = 0 - for item in self: - length += 1 - return length - - def __contains__(self, item): - for has_item in self: - if item == has_item: - return True - return False - - issubset = MutableSet.__le__ - issuperset = MutableSet.__ge__ - - union = MutableSet.__or__ - intersection = MutableSet.__and__ - difference = MutableSet.__sub__ - symmetric_difference = MutableSet.__xor__ - - def copy(self): - return set(self) - - def update(self, other): - self |= other - - def intersection_update(self, other): - self &= other - - def difference_update(self, other): - self -= other - - def symmetric_difference_update(self, other): - self ^= other - - def discard(self, item): - try: - self.remove(item) - except KeyError: - pass - - @classmethod - def _from_iterable(cls, it): - return set(it) diff --git a/env/lib/python3.10/site-packages/lxml/html/builder.py b/env/lib/python3.10/site-packages/lxml/html/builder.py deleted file mode 100644 index 8a074ec..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/builder.py +++ /dev/null @@ -1,133 +0,0 @@ -# -------------------------------------------------------------------- -# The ElementTree toolkit is -# Copyright (c) 1999-2004 by Fredrik Lundh -# -------------------------------------------------------------------- - -""" -A set of HTML generator tags for building HTML documents. - -Usage:: - - >>> from lxml.html.builder import * - >>> html = HTML( - ... HEAD( TITLE("Hello World") ), - ... BODY( CLASS("main"), - ... H1("Hello World !") - ... ) - ... ) - - >>> import lxml.etree - >>> print lxml.etree.tostring(html, pretty_print=True) - <html> - <head> - <title>Hello World</title> - </head> - <body class="main"> - <h1>Hello World !</h1> - </body> - </html> - -""" - -from lxml.builder import ElementMaker -from lxml.html import html_parser - -E = ElementMaker(makeelement=html_parser.makeelement) - -# elements -A = E.a #: anchor -ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.) -ACRONYM = E.acronym #: -ADDRESS = E.address #: information on author -APPLET = E.applet #: Java applet (DEPRECATED) -AREA = E.area #: client-side image map area -B = E.b #: bold text style -BASE = E.base #: document base URI -BASEFONT = E.basefont #: base font size (DEPRECATED) -BDO = E.bdo #: I18N BiDi over-ride -BIG = E.big #: large text style -BLOCKQUOTE = E.blockquote #: long quotation -BODY = E.body #: document body -BR = E.br #: forced line break -BUTTON = E.button #: push button -CAPTION = E.caption #: table caption -CENTER = E.center #: shorthand for DIV align=center (DEPRECATED) -CITE = E.cite #: citation -CODE = E.code #: computer code fragment -COL = E.col #: table column -COLGROUP = E.colgroup #: table column group -DD = E.dd #: definition description -DEL = getattr(E, 'del') #: deleted text -DFN = E.dfn #: instance definition -DIR = E.dir #: directory list (DEPRECATED) -DIV = E.div #: generic language/style container -DL = E.dl #: definition list -DT = E.dt #: definition term -EM = E.em #: emphasis -FIELDSET = E.fieldset #: form control group -FONT = E.font #: local change to font (DEPRECATED) -FORM = E.form #: interactive form -FRAME = E.frame #: subwindow -FRAMESET = E.frameset #: window subdivision -H1 = E.h1 #: heading -H2 = E.h2 #: heading -H3 = E.h3 #: heading -H4 = E.h4 #: heading -H5 = E.h5 #: heading -H6 = E.h6 #: heading -HEAD = E.head #: document head -HR = E.hr #: horizontal rule -HTML = E.html #: document root element -I = E.i #: italic text style -IFRAME = E.iframe #: inline subwindow -IMG = E.img #: Embedded image -INPUT = E.input #: form control -INS = E.ins #: inserted text -ISINDEX = E.isindex #: single line prompt (DEPRECATED) -KBD = E.kbd #: text to be entered by the user -LABEL = E.label #: form field label text -LEGEND = E.legend #: fieldset legend -LI = E.li #: list item -LINK = E.link #: a media-independent link -MAP = E.map #: client-side image map -MENU = E.menu #: menu list (DEPRECATED) -META = E.meta #: generic metainformation -NOFRAMES = E.noframes #: alternate content container for non frame-based rendering -NOSCRIPT = E.noscript #: alternate content container for non script-based rendering -OBJECT = E.object #: generic embedded object -OL = E.ol #: ordered list -OPTGROUP = E.optgroup #: option group -OPTION = E.option #: selectable choice -P = E.p #: paragraph -PARAM = E.param #: named property value -PRE = E.pre #: preformatted text -Q = E.q #: short inline quotation -S = E.s #: strike-through text style (DEPRECATED) -SAMP = E.samp #: sample program output, scripts, etc. -SCRIPT = E.script #: script statements -SELECT = E.select #: option selector -SMALL = E.small #: small text style -SPAN = E.span #: generic language/style container -STRIKE = E.strike #: strike-through text (DEPRECATED) -STRONG = E.strong #: strong emphasis -STYLE = E.style #: style info -SUB = E.sub #: subscript -SUP = E.sup #: superscript -TABLE = E.table #: -TBODY = E.tbody #: table body -TD = E.td #: table data cell -TEXTAREA = E.textarea #: multi-line text field -TFOOT = E.tfoot #: table footer -TH = E.th #: table header cell -THEAD = E.thead #: table header -TITLE = E.title #: document title -TR = E.tr #: table row -TT = E.tt #: teletype or monospaced text style -U = E.u #: underlined text style (DEPRECATED) -UL = E.ul #: unordered list -VAR = E.var #: instance of a variable or program argument - -# attributes (only reserved words are included here) -ATTR = dict -def CLASS(v): return {'class': v} -def FOR(v): return {'for': v} diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so Binary files differdeleted file mode 100755 index 31087ea..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py deleted file mode 100644 index e6b0543..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/clean.py +++ /dev/null @@ -1,786 +0,0 @@ -# cython: language_level=3str - -"""A cleanup tool for HTML. - -Removes unwanted tags and content. See the `Cleaner` class for -details. -""" - -from __future__ import absolute_import - -import copy -import re -import sys -try: - from urlparse import urlsplit - from urllib import unquote_plus -except ImportError: - # Python 3 - from urllib.parse import urlsplit, unquote_plus -from lxml import etree -from lxml.html import defs -from lxml.html import fromstring, XHTML_NAMESPACE -from lxml.html import xhtml_to_html, _transform_result - -try: - unichr -except NameError: - # Python 3 - unichr = chr -try: - unicode -except NameError: - # Python 3 - unicode = str -try: - basestring -except NameError: - basestring = (str, bytes) - - -__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', - 'word_break', 'word_break_html'] - -# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# Particularly the CSS cleaning; most of the tag cleaning is integrated now -# I have multiple kinds of schemes searched; but should schemes be -# whitelisted instead? -# max height? -# remove images? Also in CSS? background attribute? -# Some way to whitelist object, iframe, etc (e.g., if you want to -# allow *just* embedded YouTube movies) -# Log what was deleted and why? -# style="behavior: ..." might be bad in IE? -# Should we have something for just <meta http-equiv>? That's the worst of the -# metas. -# UTF-7 detections? Example: -# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- -# you don't always have to have the charset set, if the page has no charset -# and there's UTF7-like code in it. -# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php - - -# This is an IE-specific construct you can have in a stylesheet to -# run some Javascript: -_replace_css_javascript = re.compile( - r'expression\s*\(.*?\)', re.S|re.I).sub - -# Do I have to worry about @\nimport? -_replace_css_import = re.compile( - r'@\s*import', re.I).sub - -_looks_like_tag_content = re.compile( - r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', - *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search - -# All kinds of schemes besides just javascript: that can cause -# execution: -_find_image_dataurls = re.compile( - r'data:image/(.+);base64,', re.I).findall -_possibly_malicious_schemes = re.compile( - r'(javascript|jscript|livescript|vbscript|data|about|mocha):', - re.I).findall -# SVG images can contain script content -_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search - -def _has_javascript_scheme(s): - safe_image_urls = 0 - for image_type in _find_image_dataurls(s): - if _is_unsafe_image_type(image_type): - return True - safe_image_urls += 1 - return len(_possibly_malicious_schemes(s)) > safe_image_urls - -_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub - -# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx -_conditional_comment_re = re.compile( - r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) - -_find_styled_elements = etree.XPath( - "descendant-or-self::*[@style]") - -_find_external_links = etree.XPath( - ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" - "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), - namespaces={'x':XHTML_NAMESPACE}) - - -class Cleaner(object): - """ - Instances cleans the document of each of the possible offending - elements. The cleaning is controlled by attributes; you can - override attributes in a subclass, or set them in the constructor. - - ``scripts``: - Removes any ``<script>`` tags. - - ``javascript``: - Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets - as they could contain Javascript. - - ``comments``: - Removes any comments. - - ``style``: - Removes any style tags. - - ``inline_style`` - Removes any style attributes. Defaults to the value of the ``style`` option. - - ``links``: - Removes any ``<link>`` tags - - ``meta``: - Removes any ``<meta>`` tags - - ``page_structure``: - Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. - - ``processing_instructions``: - Removes any processing instructions. - - ``embedded``: - Removes any embedded objects (flash, iframes) - - ``frames``: - Removes any frame-related tags - - ``forms``: - Removes any form tags - - ``annoying_tags``: - Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` - - ``remove_tags``: - A list of tags to remove. Only the tags will be removed, - their content will get pulled up into the parent tag. - - ``kill_tags``: - A list of tags to kill. Killing also removes the tag's content, - i.e. the whole subtree, not just the tag itself. - - ``allow_tags``: - A list of tags to include (default include all). - - ``remove_unknown_tags``: - Remove any tags that aren't standard parts of HTML. - - ``safe_attrs_only``: - If true, only include 'safe' attributes (specifically the list - from the feedparser HTML sanitisation web site). - - ``safe_attrs``: - A set of attribute names to override the default list of attributes - considered 'safe' (when safe_attrs_only=True). - - ``add_nofollow``: - If true, then any <a> tags will have ``rel="nofollow"`` added to them. - - ``host_whitelist``: - A list or set of hosts that you can use for embedded content - (for content like ``<object>``, ``<link rel="stylesheet">``, etc). - You can also implement/override the method - ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to - implement more complex rules for what can be embedded. - Anything that passes this test will be shown, regardless of - the value of (for instance) ``embedded``. - - Note that this parameter might not work as intended if you do not - make the links absolute before doing the cleaning. - - Note that you may also need to set ``whitelist_tags``. - - ``whitelist_tags``: - A set of tags that can be included with ``host_whitelist``. - The default is ``iframe`` and ``embed``; you may wish to - include other tags like ``script``, or you may want to - implement ``allow_embedded_url`` for more control. Set to None to - include all tags. - - This modifies the document *in place*. - """ - - scripts = True - javascript = True - comments = True - style = False - inline_style = None - links = True - meta = True - page_structure = True - processing_instructions = True - embedded = True - frames = True - forms = True - annoying_tags = True - remove_tags = None - allow_tags = None - kill_tags = None - remove_unknown_tags = True - safe_attrs_only = True - safe_attrs = defs.safe_attrs - add_nofollow = False - host_whitelist = () - whitelist_tags = {'iframe', 'embed'} - - def __init__(self, **kw): - not_an_attribute = object() - for name, value in kw.items(): - default = getattr(self, name, not_an_attribute) - if (default is not None and default is not True and default is not False - and not isinstance(default, (frozenset, set, tuple, list))): - raise TypeError( - "Unknown parameter: %s=%r" % (name, value)) - setattr(self, name, value) - if self.inline_style is None and 'inline_style' not in kw: - self.inline_style = self.style - - if kw.get("allow_tags"): - if kw.get("remove_unknown_tags"): - raise ValueError("It does not make sense to pass in both " - "allow_tags and remove_unknown_tags") - self.remove_unknown_tags = False - - # Used to lookup the primary URL for a given tag that is up for - # removal: - _tag_link_attrs = dict( - script='src', - link='href', - # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html - # From what I can tell, both attributes can contain a link: - applet=['code', 'object'], - iframe='src', - embed='src', - layer='src', - # FIXME: there doesn't really seem like a general way to figure out what - # links an <object> tag uses; links often go in <param> tags with values - # that we don't really know. You'd have to have knowledge about specific - # kinds of plugins (probably keyed off classid), and match against those. - ##object=?, - # FIXME: not looking at the action currently, because it is more complex - # than than -- if you keep the form, you should keep the form controls. - ##form='action', - a='href', - ) - - def __call__(self, doc): - """ - Cleans the document. - """ - try: - getroot = doc.getroot - except AttributeError: - pass # Element instance - else: - doc = getroot() # ElementTree instance, instead of an element - # convert XHTML to HTML - xhtml_to_html(doc) - # Normalize a case that IE treats <image> like <img>, and that - # can confuse either this step or later steps. - for el in doc.iter('image'): - el.tag = 'img' - if not self.comments: - # Of course, if we were going to kill comments anyway, we don't - # need to worry about this - self.kill_conditional_comments(doc) - - kill_tags = set(self.kill_tags or ()) - remove_tags = set(self.remove_tags or ()) - allow_tags = set(self.allow_tags or ()) - - if self.scripts: - kill_tags.add('script') - if self.safe_attrs_only: - safe_attrs = set(self.safe_attrs) - for el in doc.iter(etree.Element): - attrib = el.attrib - for aname in attrib.keys(): - if aname not in safe_attrs: - del attrib[aname] - if self.javascript: - if not (self.safe_attrs_only and - self.safe_attrs == defs.safe_attrs): - # safe_attrs handles events attributes itself - for el in doc.iter(etree.Element): - attrib = el.attrib - for aname in attrib.keys(): - if aname.startswith('on'): - del attrib[aname] - doc.rewrite_links(self._remove_javascript_link, - resolve_base_href=False) - # If we're deleting style then we don't have to remove JS links - # from styles, otherwise... - if not self.inline_style: - for el in _find_styled_elements(doc): - old = el.get('style') - new = _replace_css_javascript('', old) - new = _replace_css_import('', new) - if self._has_sneaky_javascript(new): - # Something tricky is going on... - del el.attrib['style'] - elif new != old: - el.set('style', new) - if not self.style: - for el in list(doc.iter('style')): - if el.get('type', '').lower().strip() == 'text/javascript': - el.drop_tree() - continue - old = el.text or '' - new = _replace_css_javascript('', old) - # The imported CSS can do anything; we just can't allow: - new = _replace_css_import('', new) - if self._has_sneaky_javascript(new): - # Something tricky is going on... - el.text = '/* deleted */' - elif new != old: - el.text = new - if self.comments: - kill_tags.add(etree.Comment) - if self.processing_instructions: - kill_tags.add(etree.ProcessingInstruction) - if self.style: - kill_tags.add('style') - if self.inline_style: - etree.strip_attributes(doc, 'style') - if self.links: - kill_tags.add('link') - elif self.style or self.javascript: - # We must get rid of included stylesheets if Javascript is not - # allowed, as you can put Javascript in them - for el in list(doc.iter('link')): - if 'stylesheet' in el.get('rel', '').lower(): - # Note this kills alternate stylesheets as well - if not self.allow_element(el): - el.drop_tree() - if self.meta: - kill_tags.add('meta') - if self.page_structure: - remove_tags.update(('head', 'html', 'title')) - if self.embedded: - # FIXME: is <layer> really embedded? - # We should get rid of any <param> tags not inside <applet>; - # These are not really valid anyway. - for el in list(doc.iter('param')): - parent = el.getparent() - while parent is not None and parent.tag not in ('applet', 'object'): - parent = parent.getparent() - if parent is None: - el.drop_tree() - kill_tags.update(('applet',)) - # The alternate contents that are in an iframe are a good fallback: - remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) - if self.frames: - # FIXME: ideally we should look at the frame links, but - # generally frames don't mix properly with an HTML - # fragment anyway. - kill_tags.update(defs.frame_tags) - if self.forms: - remove_tags.add('form') - kill_tags.update(('button', 'input', 'select', 'textarea')) - if self.annoying_tags: - remove_tags.update(('blink', 'marquee')) - - _remove = [] - _kill = [] - for el in doc.iter(): - if el.tag in kill_tags: - if self.allow_element(el): - continue - _kill.append(el) - elif el.tag in remove_tags: - if self.allow_element(el): - continue - _remove.append(el) - - if _remove and _remove[0] == doc: - # We have to drop the parent-most tag, which we can't - # do. Instead we'll rewrite it: - el = _remove.pop(0) - el.tag = 'div' - el.attrib.clear() - elif _kill and _kill[0] == doc: - # We have to drop the parent-most element, which we can't - # do. Instead we'll clear it: - el = _kill.pop(0) - if el.tag != 'html': - el.tag = 'div' - el.clear() - - _kill.reverse() # start with innermost tags - for el in _kill: - el.drop_tree() - for el in _remove: - el.drop_tag() - - if self.remove_unknown_tags: - if allow_tags: - raise ValueError( - "It does not make sense to pass in both allow_tags and remove_unknown_tags") - allow_tags = set(defs.tags) - if allow_tags: - # make sure we do not remove comments/PIs if users want them (which is rare enough) - if not self.comments: - allow_tags.add(etree.Comment) - if not self.processing_instructions: - allow_tags.add(etree.ProcessingInstruction) - - bad = [] - for el in doc.iter(): - if el.tag not in allow_tags: - bad.append(el) - if bad: - if bad[0] is doc: - el = bad.pop(0) - el.tag = 'div' - el.attrib.clear() - for el in bad: - el.drop_tag() - if self.add_nofollow: - for el in _find_external_links(doc): - if not self.allow_follow(el): - rel = el.get('rel') - if rel: - if ('nofollow' in rel - and ' nofollow ' in (' %s ' % rel)): - continue - rel = '%s nofollow' % rel - else: - rel = 'nofollow' - el.set('rel', rel) - - def allow_follow(self, anchor): - """ - Override to suppress rel="nofollow" on some anchors. - """ - return False - - def allow_element(self, el): - """ - Decide whether an element is configured to be accepted or rejected. - - :param el: an element. - :return: true to accept the element or false to reject/discard it. - """ - if el.tag not in self._tag_link_attrs: - return False - attr = self._tag_link_attrs[el.tag] - if isinstance(attr, (list, tuple)): - for one_attr in attr: - url = el.get(one_attr) - if not url: - return False - if not self.allow_embedded_url(el, url): - return False - return True - else: - url = el.get(attr) - if not url: - return False - return self.allow_embedded_url(el, url) - - def allow_embedded_url(self, el, url): - """ - Decide whether a URL that was found in an element's attributes or text - if configured to be accepted or rejected. - - :param el: an element. - :param url: a URL found on the element. - :return: true to accept the URL and false to reject it. - """ - if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: - return False - scheme, netloc, path, query, fragment = urlsplit(url) - netloc = netloc.lower().split(':', 1)[0] - if scheme not in ('http', 'https'): - return False - if netloc in self.host_whitelist: - return True - return False - - def kill_conditional_comments(self, doc): - """ - IE conditional comments basically embed HTML that the parser - doesn't normally see. We can't allow anything like that, so - we'll kill any comments that could be conditional. - """ - has_conditional_comment = _conditional_comment_re.search - self._kill_elements( - doc, lambda el: has_conditional_comment(el.text), - etree.Comment) - - def _kill_elements(self, doc, condition, iterate=None): - bad = [] - for el in doc.iter(iterate): - if condition(el): - bad.append(el) - for el in bad: - el.drop_tree() - - def _remove_javascript_link(self, link): - # links like "j a v a s c r i p t:" might be interpreted in IE - new = _substitute_whitespace('', unquote_plus(link)) - if _has_javascript_scheme(new): - # FIXME: should this be None to delete? - return '' - return link - - _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub - - def _has_sneaky_javascript(self, style): - """ - Depending on the browser, stuff like ``e x p r e s s i o n(...)`` - can get interpreted, or ``expre/* stuff */ssion(...)``. This - checks for attempt to do stuff like this. - - Typically the response will be to kill the entire style; if you - have just a bit of Javascript in the style another rule will catch - that and remove only the Javascript from the style; this catches - more sneaky attempts. - """ - style = self._substitute_comments('', style) - style = style.replace('\\', '') - style = _substitute_whitespace('', style) - style = style.lower() - if _has_javascript_scheme(style): - return True - if 'expression(' in style: - return True - if '@import' in style: - return True - if '</noscript' in style: - # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">' - return True - if _looks_like_tag_content(style): - # e.g. '<math><style><img src=x onerror=alert(1)></style></math>' - return True - return False - - def clean_html(self, html): - result_type = type(html) - if isinstance(html, basestring): - doc = fromstring(html) - else: - doc = copy.deepcopy(html) - self(doc) - return _transform_result(result_type, doc) - -clean = Cleaner() -clean_html = clean.clean_html - -############################################################ -## Autolinking -############################################################ - -_link_regexes = [ - re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), - # This is conservative, but autolinking can be a bit conservative: - re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I), - ] - -_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] - -_avoid_hosts = [ - re.compile(r'^localhost', re.I), - re.compile(r'\bexample\.(?:com|org|net)$', re.I), - re.compile(r'^127\.0\.0\.1$'), - ] - -_avoid_classes = ['nolink'] - -def autolink(el, link_regexes=_link_regexes, - avoid_elements=_avoid_elements, - avoid_hosts=_avoid_hosts, - avoid_classes=_avoid_classes): - """ - Turn any URLs into links. - - It will search for links identified by the given regular - expressions (by default mailto and http(s) links). - - It won't link text in an element in avoid_elements, or an element - with a class in avoid_classes. It won't link to anything with a - host that matches one of the regular expressions in avoid_hosts - (default localhost and 127.0.0.1). - - If you pass in an element, the element's tail will not be - substituted, only the contents of the element. - """ - if el.tag in avoid_elements: - return - class_name = el.get('class') - if class_name: - class_name = class_name.split() - for match_class in avoid_classes: - if match_class in class_name: - return - for child in list(el): - autolink(child, link_regexes=link_regexes, - avoid_elements=avoid_elements, - avoid_hosts=avoid_hosts, - avoid_classes=avoid_classes) - if child.tail: - text, tail_children = _link_text( - child.tail, link_regexes, avoid_hosts, factory=el.makeelement) - if tail_children: - child.tail = text - index = el.index(child) - el[index+1:index+1] = tail_children - if el.text: - text, pre_children = _link_text( - el.text, link_regexes, avoid_hosts, factory=el.makeelement) - if pre_children: - el.text = text - el[:0] = pre_children - -def _link_text(text, link_regexes, avoid_hosts, factory): - leading_text = '' - links = [] - last_pos = 0 - while 1: - best_match, best_pos = None, None - for regex in link_regexes: - regex_pos = last_pos - while 1: - match = regex.search(text, pos=regex_pos) - if match is None: - break - host = match.group('host') - for host_regex in avoid_hosts: - if host_regex.search(host): - regex_pos = match.end() - break - else: - break - if match is None: - continue - if best_pos is None or match.start() < best_pos: - best_match = match - best_pos = match.start() - if best_match is None: - # No more matches - if links: - assert not links[-1].tail - links[-1].tail = text - else: - assert not leading_text - leading_text = text - break - link = best_match.group(0) - end = best_match.end() - if link.endswith('.') or link.endswith(','): - # These punctuation marks shouldn't end a link - end -= 1 - link = link[:-1] - prev_text = text[:best_match.start()] - if links: - assert not links[-1].tail - links[-1].tail = prev_text - else: - assert not leading_text - leading_text = prev_text - anchor = factory('a') - anchor.set('href', link) - body = best_match.group('body') - if not body: - body = link - if body.endswith('.') or body.endswith(','): - body = body[:-1] - anchor.text = body - links.append(anchor) - text = text[end:] - return leading_text, links - -def autolink_html(html, *args, **kw): - result_type = type(html) - if isinstance(html, basestring): - doc = fromstring(html) - else: - doc = copy.deepcopy(html) - autolink(doc, *args, **kw) - return _transform_result(result_type, doc) - -autolink_html.__doc__ = autolink.__doc__ - -############################################################ -## Word wrapping -############################################################ - -_avoid_word_break_elements = ['pre', 'textarea', 'code'] -_avoid_word_break_classes = ['nobreak'] - -def word_break(el, max_width=40, - avoid_elements=_avoid_word_break_elements, - avoid_classes=_avoid_word_break_classes, - break_character=unichr(0x200b)): - """ - Breaks any long words found in the body of the text (not attributes). - - Doesn't effect any of the tags in avoid_elements, by default - ``<textarea>`` and ``<pre>`` - - Breaks words by inserting ​, which is a unicode character - for Zero Width Space character. This generally takes up no space - in rendering, but does copy as a space, and in monospace contexts - usually takes up space. - - See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion - """ - # Character suggestion of ​ comes from: - # http://www.cs.tut.fi/~jkorpela/html/nobr.html - if el.tag in _avoid_word_break_elements: - return - class_name = el.get('class') - if class_name: - dont_break = False - class_name = class_name.split() - for avoid in avoid_classes: - if avoid in class_name: - dont_break = True - break - if dont_break: - return - if el.text: - el.text = _break_text(el.text, max_width, break_character) - for child in el: - word_break(child, max_width=max_width, - avoid_elements=avoid_elements, - avoid_classes=avoid_classes, - break_character=break_character) - if child.tail: - child.tail = _break_text(child.tail, max_width, break_character) - -def word_break_html(html, *args, **kw): - result_type = type(html) - doc = fromstring(html) - word_break(doc, *args, **kw) - return _transform_result(result_type, doc) - -def _break_text(text, max_width, break_character): - words = text.split() - for word in words: - if len(word) > max_width: - replacement = _insert_break(word, max_width, break_character) - text = text.replace(word, replacement) - return text - -_break_prefer_re = re.compile(r'[^a-z]', re.I) - -def _insert_break(word, width, break_character): - orig_word = word - result = '' - while len(word) > width: - start = word[:width] - breaks = list(_break_prefer_re.finditer(start)) - if breaks: - last_break = breaks[-1] - # Only walk back up to 10 characters to find a nice break: - if last_break.end() > width-10: - # FIXME: should the break character be at the end of the - # chunk, or the beginning of the next chunk? - start = word[:last_break.end()] - result += start + break_character - word = word[len(start):] - result += word - return result - diff --git a/env/lib/python3.10/site-packages/lxml/html/defs.py b/env/lib/python3.10/site-packages/lxml/html/defs.py deleted file mode 100644 index 2058ea3..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/defs.py +++ /dev/null @@ -1,135 +0,0 @@ -# FIXME: this should all be confirmed against what a DTD says -# (probably in a test; this may not match the DTD exactly, but we -# should document just how it differs). - -""" -Data taken from https://www.w3.org/TR/html401/index/elements.html -and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements -for html5_tags. -""" - -empty_tags = frozenset([ - 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track']) - -deprecated_tags = frozenset([ - 'applet', 'basefont', 'center', 'dir', 'font', 'isindex', - 'menu', 's', 'strike', 'u']) - -# archive actually takes a space-separated list of URIs -link_attrs = frozenset([ - 'action', 'archive', 'background', 'cite', 'classid', - 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', - 'usemap', - # Not standard: - 'dynsrc', 'lowsrc', - # HTML5 formaction - 'formaction' - ]) - -# Not in the HTML 4 spec: -# onerror, onresize -event_attrs = frozenset([ - 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', - 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', - 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', - 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit', - 'onunload', - ]) - -safe_attrs = frozenset([ - 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', - 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', - 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', - 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', - 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', - 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', - 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', - 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) - -# From http://htmlhelp.com/reference/html40/olist.html -top_level_tags = frozenset([ - 'html', 'head', 'body', 'frameset', - ]) - -head_tags = frozenset([ - 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title', - ]) - -general_block_tags = frozenset([ - 'address', - 'blockquote', - 'center', - 'del', - 'div', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'hr', - 'ins', - 'isindex', - 'noscript', - 'p', - 'pre', - ]) - -list_tags = frozenset([ - 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul', - ]) - -table_tags = frozenset([ - 'table', 'caption', 'colgroup', 'col', - 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th', - ]) - -# just this one from -# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm -block_tags = general_block_tags | list_tags | table_tags | frozenset([ - # Partial form tags - 'fieldset', 'form', 'legend', 'optgroup', 'option', - ]) - -form_tags = frozenset([ - 'form', 'button', 'fieldset', 'legend', 'input', 'label', - 'select', 'optgroup', 'option', 'textarea', - ]) - -special_inline_tags = frozenset([ - 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe', - 'img', 'map', 'area', 'object', 'param', 'q', 'script', - 'span', 'sub', 'sup', - ]) - -phrase_tags = frozenset([ - 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em', - 'ins', 'kbd', 'samp', 'strong', 'var', - ]) - -font_style_tags = frozenset([ - 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u', - ]) - -frame_tags = frozenset([ - 'frameset', 'frame', 'noframes', - ]) - -html5_tags = frozenset([ - 'article', 'aside', 'audio', 'canvas', 'command', 'datalist', - 'details', 'embed', 'figcaption', 'figure', 'footer', 'header', - 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output', - 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary', - 'svg', 'time', 'track', 'video', 'wbr' - ]) - -# These tags aren't standard -nonstandard_tags = frozenset(['blink', 'marquee']) - - -tags = (top_level_tags | head_tags | general_block_tags | list_tags - | table_tags | form_tags | special_inline_tags | phrase_tags - | font_style_tags | nonstandard_tags | html5_tags) diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so Binary files differdeleted file mode 100755 index 0c11b90..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so +++ /dev/null diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py deleted file mode 100644 index 39bec78..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/diff.py +++ /dev/null @@ -1,884 +0,0 @@ -# cython: language_level=3 - -from __future__ import absolute_import - -import difflib -from lxml import etree -from lxml.html import fragment_fromstring -import re - -__all__ = ['html_annotate', 'htmldiff'] - -try: - from html import escape as html_escape -except ImportError: - from cgi import escape as html_escape -try: - _unicode = unicode -except NameError: - # Python 3 - _unicode = str -try: - basestring -except NameError: - # Python 3 - basestring = str - -############################################################ -## Annotation -############################################################ - -def default_markup(text, version): - return '<span title="%s">%s</span>' % ( - html_escape(_unicode(version), 1), text) - -def html_annotate(doclist, markup=default_markup): - """ - doclist should be ordered from oldest to newest, like:: - - >>> version1 = 'Hello World' - >>> version2 = 'Goodbye World' - >>> print(html_annotate([(version1, 'version 1'), - ... (version2, 'version 2')])) - <span title="version 2">Goodbye</span> <span title="version 1">World</span> - - The documents must be *fragments* (str/UTF8 or unicode), not - complete documents - - The markup argument is a function to markup the spans of words. - This function is called like markup('Hello', 'version 2'), and - returns HTML. The first argument is text and never includes any - markup. The default uses a span with a title: - - >>> print(default_markup('Some Text', 'by Joe')) - <span title="by Joe">Some Text</span> - """ - # The basic strategy we have is to split the documents up into - # logical tokens (which are words with attached markup). We then - # do diffs of each of the versions to track when a token first - # appeared in the document; the annotation attached to the token - # is the version where it first appeared. - tokenlist = [tokenize_annotated(doc, version) - for doc, version in doclist] - cur_tokens = tokenlist[0] - for tokens in tokenlist[1:]: - html_annotate_merge_annotations(cur_tokens, tokens) - cur_tokens = tokens - - # After we've tracked all the tokens, we can combine spans of text - # that are adjacent and have the same annotation - cur_tokens = compress_tokens(cur_tokens) - # And finally add markup - result = markup_serialize_tokens(cur_tokens, markup) - return ''.join(result).strip() - -def tokenize_annotated(doc, annotation): - """Tokenize a document and add an annotation attribute to each token - """ - tokens = tokenize(doc, include_hrefs=False) - for tok in tokens: - tok.annotation = annotation - return tokens - -def html_annotate_merge_annotations(tokens_old, tokens_new): - """Merge the annotations from tokens_old into tokens_new, when the - tokens in the new document already existed in the old document. - """ - s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) - commands = s.get_opcodes() - - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - eq_old = tokens_old[i1:i2] - eq_new = tokens_new[j1:j2] - copy_annotations(eq_old, eq_new) - -def copy_annotations(src, dest): - """ - Copy annotations from the tokens listed in src to the tokens in dest - """ - assert len(src) == len(dest) - for src_tok, dest_tok in zip(src, dest): - dest_tok.annotation = src_tok.annotation - -def compress_tokens(tokens): - """ - Combine adjacent tokens when there is no HTML between the tokens, - and they share an annotation - """ - result = [tokens[0]] - for tok in tokens[1:]: - if (not result[-1].post_tags and - not tok.pre_tags and - result[-1].annotation == tok.annotation): - compress_merge_back(result, tok) - else: - result.append(tok) - return result - -def compress_merge_back(tokens, tok): - """ Merge tok into the last element of tokens (modifying the list of - tokens in-place). """ - last = tokens[-1] - if type(last) is not token or type(tok) is not token: - tokens.append(tok) - else: - text = _unicode(last) - if last.trailing_whitespace: - text += last.trailing_whitespace - text += tok - merged = token(text, - pre_tags=last.pre_tags, - post_tags=tok.post_tags, - trailing_whitespace=tok.trailing_whitespace) - merged.annotation = last.annotation - tokens[-1] = merged - -def markup_serialize_tokens(tokens, markup_func): - """ - Serialize the list of tokens into a list of text chunks, calling - markup_func around text to add annotations. - """ - for token in tokens: - for pre in token.pre_tags: - yield pre - html = token.html() - html = markup_func(html, token.annotation) - if token.trailing_whitespace: - html += token.trailing_whitespace - yield html - for post in token.post_tags: - yield post - - -############################################################ -## HTML Diffs -############################################################ - -def htmldiff(old_html, new_html): - ## FIXME: this should take parsed documents too, and use their body - ## or other content. - """ Do a diff of the old and new document. The documents are HTML - *fragments* (str/UTF8 or unicode), they are not complete documents - (i.e., no <html> tag). - - Returns HTML with <ins> and <del> tags added around the - appropriate text. - - Markup is generally ignored, with the markup from new_html - preserved, and possibly some markup from old_html (though it is - considered acceptable to lose some of the old markup). Only the - words in the HTML are diffed. The exception is <img> tags, which - are treated like words, and the href attribute of <a> tags, which - are noted inside the tag itself when there are changes. - """ - old_html_tokens = tokenize(old_html) - new_html_tokens = tokenize(new_html) - result = htmldiff_tokens(old_html_tokens, new_html_tokens) - result = ''.join(result).strip() - return fixup_ins_del_tags(result) - -def htmldiff_tokens(html1_tokens, html2_tokens): - """ Does a diff on the tokens themselves, returning a list of text - chunks (not tokens). - """ - # There are several passes as we do the differences. The tokens - # isolate the portion of the content we care to diff; difflib does - # all the actual hard work at that point. - # - # Then we must create a valid document from pieces of both the old - # document and the new document. We generally prefer to take - # markup from the new document, and only do a best effort attempt - # to keep markup from the old document; anything that we can't - # resolve we throw away. Also we try to put the deletes as close - # to the location where we think they would have been -- because - # we are only keeping the markup from the new document, it can be - # fuzzy where in the new document the old text would have gone. - # Again we just do a best effort attempt. - s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) - commands = s.get_opcodes() - result = [] - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) - continue - if command == 'insert' or command == 'replace': - ins_tokens = expand_tokens(html2_tokens[j1:j2]) - merge_insert(ins_tokens, result) - if command == 'delete' or command == 'replace': - del_tokens = expand_tokens(html1_tokens[i1:i2]) - merge_delete(del_tokens, result) - # If deletes were inserted directly as <del> then we'd have an - # invalid document at this point. Instead we put in special - # markers, and when the complete diffed document has been created - # we try to move the deletes around and resolve any problems. - result = cleanup_delete(result) - - return result - -def expand_tokens(tokens, equal=False): - """Given a list of tokens, return a generator of the chunks of - text for the data in the tokens. - """ - for token in tokens: - for pre in token.pre_tags: - yield pre - if not equal or not token.hide_when_equal: - if token.trailing_whitespace: - yield token.html() + token.trailing_whitespace - else: - yield token.html() - for post in token.post_tags: - yield post - -def merge_insert(ins_chunks, doc): - """ doc is the already-handled document (as a list of text chunks); - here we add <ins>ins_chunks</ins> to the end of that. """ - # Though we don't throw away unbalanced_start or unbalanced_end - # (we assume there is accompanying markup later or earlier in the - # document), we only put <ins> around the balanced portion. - unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) - doc.extend(unbalanced_start) - if doc and not doc[-1].endswith(' '): - # Fix up the case where the word before the insert didn't end with - # a space - doc[-1] += ' ' - doc.append('<ins>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </ins> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</ins> ') - doc.extend(unbalanced_end) - -# These are sentinels to represent the start and end of a <del> -# segment, until we do the cleanup phase to turn them into proper -# markup: -class DEL_START: - pass -class DEL_END: - pass - -class NoDeletes(Exception): - """ Raised when the document no longer contains any pending deletes - (DEL_START/DEL_END) """ - -def merge_delete(del_chunks, doc): - """ Adds the text chunks in del_chunks to the document doc (another - list of text chunks) with marker to show it is a delete. - cleanup_delete later resolves these markers into <del> tags.""" - doc.append(DEL_START) - doc.extend(del_chunks) - doc.append(DEL_END) - -def cleanup_delete(chunks): - """ Cleans up any DEL_START/DEL_END markers in the document, replacing - them with <del></del>. To do this while keeping the document - valid, it may need to drop some tags (either start or end tags). - - It may also move the del into adjacent tags to try to move it to a - similar location where it was originally located (e.g., moving a - delete into preceding <div> tag, if the del looks like (DEL_START, - 'Text</div>', DEL_END)""" - while 1: - # Find a pending DEL_START/DEL_END, splitting the document - # into stuff-preceding-DEL_START, stuff-inside, and - # stuff-following-DEL_END - try: - pre_delete, delete, post_delete = split_delete(chunks) - except NoDeletes: - # Nothing found, we've cleaned up the entire doc - break - # The stuff-inside-DEL_START/END may not be well balanced - # markup. First we figure out what unbalanced portions there are: - unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) - # Then we move the span forward and/or backward based on these - # unbalanced portions: - locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) - locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) - doc = pre_delete - if doc and not doc[-1].endswith(' '): - # Fix up case where the word before us didn't have a trailing space - doc[-1] += ' ' - doc.append('<del>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </del> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</del> ') - doc.extend(post_delete) - chunks = doc - return chunks - -def split_unbalanced(chunks): - """Return (unbalanced_start, balanced, unbalanced_end), where each is - a list of text and tag chunks. - - unbalanced_start is a list of all the tags that are opened, but - not closed in this span. Similarly, unbalanced_end is a list of - tags that are closed but were not opened. Extracting these might - mean some reordering of the chunks.""" - start = [] - end = [] - tag_stack = [] - balanced = [] - for chunk in chunks: - if not chunk.startswith('<'): - balanced.append(chunk) - continue - endtag = chunk[1] == '/' - name = chunk.split()[0].strip('<>/') - if name in empty_tags: - balanced.append(chunk) - continue - if endtag: - if tag_stack and tag_stack[-1][0] == name: - balanced.append(chunk) - name, pos, tag = tag_stack.pop() - balanced[pos] = tag - elif tag_stack: - start.extend([tag for name, pos, tag in tag_stack]) - tag_stack = [] - end.append(chunk) - else: - end.append(chunk) - else: - tag_stack.append((name, len(balanced), chunk)) - balanced.append(None) - start.extend( - [chunk for name, pos, chunk in tag_stack]) - balanced = [chunk for chunk in balanced if chunk is not None] - return start, balanced, end - -def split_delete(chunks): - """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, - stuff_after_DEL_END). Returns the first case found (there may be - more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if - there's no DEL_START found. """ - try: - pos = chunks.index(DEL_START) - except ValueError: - raise NoDeletes - pos2 = chunks.index(DEL_END) - return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] - -def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete): - """ pre_delete and post_delete implicitly point to a place in the - document (where the two were split). This moves that point (by - popping items from one and pushing them onto the other). It moves - the point to try to find a place where unbalanced_start applies. - - As an example:: - - >>> unbalanced_start = ['<div>'] - >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] - >>> pre, post = doc[:3], doc[3:] - >>> pre, post - (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) - >>> locate_unbalanced_start(unbalanced_start, pre, post) - >>> pre, post - (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) - - As you can see, we moved the point so that the dangling <div> that - we found will be effectively replaced by the div in the original - document. If this doesn't work out, we just throw away - unbalanced_start without doing anything. - """ - while 1: - if not unbalanced_start: - # We have totally succeeded in finding the position - break - finding = unbalanced_start[0] - finding_name = finding.split()[0].strip('<>') - if not post_delete: - break - next = post_delete[0] - if next is DEL_START or not next.startswith('<'): - # Reached a word, we can't move the delete text forward - break - if next[1] == '/': - # Reached a closing tag, can we go further? Maybe not... - break - name = next.split()[0].strip('<>') - if name == 'ins': - # Can't move into an insert - break - assert name != 'del', ( - "Unexpected delete tag: %r" % next) - if name == finding_name: - unbalanced_start.pop(0) - pre_delete.append(post_delete.pop(0)) - else: - # Found a tag that doesn't match - break - -def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete): - """ like locate_unbalanced_start, except handling end tags and - possibly moving the point earlier in the document. """ - while 1: - if not unbalanced_end: - # Success - break - finding = unbalanced_end[-1] - finding_name = finding.split()[0].strip('<>/') - if not pre_delete: - break - next = pre_delete[-1] - if next is DEL_END or not next.startswith('</'): - # A word or a start tag - break - name = next.split()[0].strip('<>/') - if name == 'ins' or name == 'del': - # Can't move into an insert or delete - break - if name == finding_name: - unbalanced_end.pop() - post_delete.insert(0, pre_delete.pop()) - else: - # Found a tag that doesn't match - break - -class token(_unicode): - """ Represents a diffable token, generally a word that is displayed to - the user. Opening tags are attached to this token when they are - adjacent (pre_tags) and closing tags that follow the word - (post_tags). Some exceptions occur when there are empty tags - adjacent to a word, so there may be close tags in pre_tags, or - open tags in post_tags. - - We also keep track of whether the word was originally followed by - whitespace, even though we do not want to treat the word as - equivalent to a similar word that does not have a trailing - space.""" - - # When this is true, the token will be eliminated from the - # displayed diff if no change has occurred: - hide_when_equal = False - - def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""): - obj = _unicode.__new__(cls, text) - - if pre_tags is not None: - obj.pre_tags = pre_tags - else: - obj.pre_tags = [] - - if post_tags is not None: - obj.post_tags = post_tags - else: - obj.post_tags = [] - - obj.trailing_whitespace = trailing_whitespace - - return obj - - def __repr__(self): - return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, - self.post_tags, self.trailing_whitespace) - - def html(self): - return _unicode(self) - -class tag_token(token): - - """ Represents a token that is actually a tag. Currently this is just - the <img> tag, which takes up visible space just like a word but - is only represented in a document by a tag. """ - - def __new__(cls, tag, data, html_repr, pre_tags=None, - post_tags=None, trailing_whitespace=""): - obj = token.__new__(cls, "%s: %s" % (type, data), - pre_tags=pre_tags, - post_tags=post_tags, - trailing_whitespace=trailing_whitespace) - obj.tag = tag - obj.data = data - obj.html_repr = html_repr - return obj - - def __repr__(self): - return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % ( - self.tag, - self.data, - self.html_repr, - self.pre_tags, - self.post_tags, - self.trailing_whitespace) - def html(self): - return self.html_repr - -class href_token(token): - - """ Represents the href in an anchor tag. Unlike other words, we only - show the href when it changes. """ - - hide_when_equal = True - - def html(self): - return ' Link: %s' % self - -def tokenize(html, include_hrefs=True): - """ - Parse the given HTML and returns token objects (words with attached tags). - - This parses only the content of a page; anything in the head is - ignored, and the <head> and <body> elements are themselves - optional. The content is then parsed by lxml, which ensures the - validity of the resulting parsed document (though lxml may make - incorrect guesses when the markup is particular bad). - - <ins> and <del> tags are also eliminated from the document, as - that gets confusing. - - If include_hrefs is true, then the href attribute of <a> tags is - included as a special kind of diffable token.""" - if etree.iselement(html): - body_el = html - else: - body_el = parse_html(html, cleanup=True) - # Then we split the document into text chunks for each tag, word, and end tag: - chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs) - # Finally re-joining them into token objects: - return fixup_chunks(chunks) - -def parse_html(html, cleanup=True): - """ - Parses an HTML fragment, returning an lxml element. Note that the HTML will be - wrapped in a <div> tag that was not in the original document. - - If cleanup is true, make sure there's no <head> or <body>, and get - rid of any <ins> and <del> tags. - """ - if cleanup: - # This removes any extra markup or structure like <head>: - html = cleanup_html(html) - return fragment_fromstring(html, create_parent=True) - -_body_re = re.compile(r'<body.*?>', re.I|re.S) -_end_body_re = re.compile(r'</body.*?>', re.I|re.S) -_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) - -def cleanup_html(html): - """ This 'cleans' the HTML, meaning that any page structure is removed - (only the contents of <body> are used, if there is any <body). - Also <ins> and <del> tags are removed. """ - match = _body_re.search(html) - if match: - html = html[match.end():] - match = _end_body_re.search(html) - if match: - html = html[:match.start()] - html = _ins_del_re.sub('', html) - return html - - -end_whitespace_re = re.compile(r'[ \t\n\r]$') - -def split_trailing_whitespace(word): - """ - This function takes a word, such as 'test\n\n' and returns ('test','\n\n') - """ - stripped_length = len(word.rstrip()) - return word[0:stripped_length], word[stripped_length:] - - -def fixup_chunks(chunks): - """ - This function takes a list of chunks and produces a list of tokens. - """ - tag_accum = [] - cur_word = None - result = [] - for chunk in chunks: - if isinstance(chunk, tuple): - if chunk[0] == 'img': - src = chunk[1] - tag, trailing_whitespace = split_trailing_whitespace(chunk[2]) - cur_word = tag_token('img', src, html_repr=tag, - pre_tags=tag_accum, - trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - - elif chunk[0] == 'href': - href = chunk[1] - cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ") - tag_accum = [] - result.append(cur_word) - continue - - if is_word(chunk): - chunk, trailing_whitespace = split_trailing_whitespace(chunk) - cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - - elif is_start_tag(chunk): - tag_accum.append(chunk) - - elif is_end_tag(chunk): - if tag_accum: - tag_accum.append(chunk) - else: - assert cur_word, ( - "Weird state, cur_word=%r, result=%r, chunks=%r of %r" - % (cur_word, result, chunk, chunks)) - cur_word.post_tags.append(chunk) - else: - assert False - - if not result: - return [token('', pre_tags=tag_accum)] - else: - result[-1].post_tags.extend(tag_accum) - - return result - - -# All the tags in HTML that don't require end tags: -empty_tags = ( - 'param', 'img', 'area', 'br', 'basefont', 'input', - 'base', 'meta', 'link', 'col') - -block_level_tags = ( - 'address', - 'blockquote', - 'center', - 'dir', - 'div', - 'dl', - 'fieldset', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'hr', - 'isindex', - 'menu', - 'noframes', - 'noscript', - 'ol', - 'p', - 'pre', - 'table', - 'ul', - ) - -block_level_container_tags = ( - 'dd', - 'dt', - 'frameset', - 'li', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ) - - -def flatten_el(el, include_hrefs, skip_tag=False): - """ Takes an lxml element el, and generates all the text chunks for - that tag. Each start tag is a chunk, each word is a chunk, and each - end tag is a chunk. - - If skip_tag is true, then the outermost container tag is - not returned (just its contents).""" - if not skip_tag: - if el.tag == 'img': - yield ('img', el.get('src'), start_tag(el)) - else: - yield start_tag(el) - if el.tag in empty_tags and not el.text and not len(el) and not el.tail: - return - start_words = split_words(el.text) - for word in start_words: - yield html_escape(word) - for child in el: - for item in flatten_el(child, include_hrefs=include_hrefs): - yield item - if el.tag == 'a' and el.get('href') and include_hrefs: - yield ('href', el.get('href')) - if not skip_tag: - yield end_tag(el) - end_words = split_words(el.tail) - for word in end_words: - yield html_escape(word) - -split_words_re = re.compile(r'\S+(?:\s+|$)', re.U) - -def split_words(text): - """ Splits some text into words. Includes trailing whitespace - on each word when appropriate. """ - if not text or not text.strip(): - return [] - - words = split_words_re.findall(text) - return words - -start_whitespace_re = re.compile(r'^[ \t\n\r]') - -def start_tag(el): - """ - The text representation of the start tag for a tag. - """ - return '<%s%s>' % ( - el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True)) - for name, value in el.attrib.items()])) - -def end_tag(el): - """ The text representation of an end tag for a tag. Includes - trailing whitespace when appropriate. """ - if el.tail and start_whitespace_re.search(el.tail): - extra = ' ' - else: - extra = '' - return '</%s>%s' % (el.tag, extra) - -def is_word(tok): - return not tok.startswith('<') - -def is_end_tag(tok): - return tok.startswith('</') - -def is_start_tag(tok): - return tok.startswith('<') and not tok.startswith('</') - -def fixup_ins_del_tags(html): - """ Given an html string, move any <ins> or <del> tags inside of any - block-level elements, e.g. transform <ins><p>word</p></ins> to - <p><ins>word</ins></p> """ - doc = parse_html(html, cleanup=False) - _fixup_ins_del_tags(doc) - html = serialize_html_fragment(doc, skip_outer=True) - return html - -def serialize_html_fragment(el, skip_outer=False): - """ Serialize a single lxml element as HTML. The serialized form - includes the elements tail. - - If skip_outer is true, then don't serialize the outermost tag - """ - assert not isinstance(el, basestring), ( - "You should pass in an element, not a string like %r" % el) - html = etree.tostring(el, method="html", encoding=_unicode) - if skip_outer: - # Get rid of the extra starting tag: - html = html[html.find('>')+1:] - # Get rid of the extra end tag: - html = html[:html.rfind('<')] - return html.strip() - else: - return html - -def _fixup_ins_del_tags(doc): - """fixup_ins_del_tags that works on an lxml document in-place - """ - for tag in ['ins', 'del']: - for el in doc.xpath('descendant-or-self::%s' % tag): - if not _contains_block_level_tag(el): - continue - _move_el_inside_block(el, tag=tag) - el.drop_tag() - #_merge_element_contents(el) - -def _contains_block_level_tag(el): - """True if the element contains any block-level elements, like <p>, <td>, etc. - """ - if el.tag in block_level_tags or el.tag in block_level_container_tags: - return True - for child in el: - if _contains_block_level_tag(child): - return True - return False - -def _move_el_inside_block(el, tag): - """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags - and moves them inside any block-level tags. """ - for child in el: - if _contains_block_level_tag(child): - break - else: - # No block-level tags in any child - children_tag = etree.Element(tag) - children_tag.text = el.text - el.text = None - children_tag.extend(list(el)) - el[:] = [children_tag] - return - for child in list(el): - if _contains_block_level_tag(child): - _move_el_inside_block(child, tag) - if child.tail: - tail_tag = etree.Element(tag) - tail_tag.text = child.tail - child.tail = None - el.insert(el.index(child)+1, tail_tag) - else: - child_tag = etree.Element(tag) - el.replace(child, child_tag) - child_tag.append(child) - if el.text: - text_tag = etree.Element(tag) - text_tag.text = el.text - el.text = None - el.insert(0, text_tag) - -def _merge_element_contents(el): - """ - Removes an element, but merges its contents into its place, e.g., - given <p>Hi <i>there!</i></p>, if you remove the <i> element you get - <p>Hi there!</p> - """ - parent = el.getparent() - text = el.text or '' - if el.tail: - if not len(el): - text += el.tail - else: - if el[-1].tail: - el[-1].tail += el.tail - else: - el[-1].tail = el.tail - index = parent.index(el) - if text: - if index == 0: - previous = None - else: - previous = parent[index-1] - if previous is None: - if parent.text: - parent.text += text - else: - parent.text = text - else: - if previous.tail: - previous.tail += text - else: - previous.tail = text - parent[index:index+1] = el.getchildren() - -class InsensitiveSequenceMatcher(difflib.SequenceMatcher): - """ - Acts like SequenceMatcher, but tries not to find very small equal - blocks amidst large spans of changes - """ - - threshold = 2 - - def get_matching_blocks(self): - size = min(len(self.b), len(self.b)) - threshold = min(self.threshold, size / 4) - actual = difflib.SequenceMatcher.get_matching_blocks(self) - return [item for item in actual - if item[2] > threshold - or not item[2]] - -if __name__ == '__main__': - from lxml.html import _diffcommand - _diffcommand.main() - diff --git a/env/lib/python3.10/site-packages/lxml/html/formfill.py b/env/lib/python3.10/site-packages/lxml/html/formfill.py deleted file mode 100644 index 2499a8e..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/formfill.py +++ /dev/null @@ -1,299 +0,0 @@ -from lxml.etree import XPath, ElementBase -from lxml.html import fromstring, XHTML_NAMESPACE -from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result -from lxml.html import defs -import copy - -try: - basestring -except NameError: - # Python 3 - basestring = str - -__all__ = ['FormNotFound', 'fill_form', 'fill_form_html', - 'insert_errors', 'insert_errors_html', - 'DefaultErrorCreator'] - -class FormNotFound(LookupError): - """ - Raised when no form can be found - """ - -_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE}) -_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]), - namespaces={'x':XHTML_NAMESPACE}) -_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]', - namespaces={'x':XHTML_NAMESPACE}) -_name_xpath = XPath('descendant-or-self::*[@name=$name]') - -def fill_form( - el, - values, - form_id=None, - form_index=None, - ): - el = _find_form(el, form_id=form_id, form_index=form_index) - _fill_form(el, values) - -def fill_form_html(html, values, form_id=None, form_index=None): - result_type = type(html) - if isinstance(html, basestring): - doc = fromstring(html) - else: - doc = copy.deepcopy(html) - fill_form(doc, values, form_id=form_id, form_index=form_index) - return _transform_result(result_type, doc) - -def _fill_form(el, values): - counts = {} - if hasattr(values, 'mixed'): - # For Paste request parameters - values = values.mixed() - inputs = _input_xpath(el) - for input in inputs: - name = input.get('name') - if not name: - continue - if _takes_multiple(input): - value = values.get(name, []) - if not isinstance(value, (list, tuple)): - value = [value] - _fill_multiple(input, value) - elif name not in values: - continue - else: - index = counts.get(name, 0) - counts[name] = index + 1 - value = values[name] - if isinstance(value, (list, tuple)): - try: - value = value[index] - except IndexError: - continue - elif index > 0: - continue - _fill_single(input, value) - -def _takes_multiple(input): - if _nons(input.tag) == 'select' and input.get('multiple'): - # FIXME: multiple="0"? - return True - type = input.get('type', '').lower() - if type in ('radio', 'checkbox'): - return True - return False - -def _fill_multiple(input, value): - type = input.get('type', '').lower() - if type == 'checkbox': - v = input.get('value') - if v is None: - if not value: - result = False - else: - result = value[0] - if isinstance(value, basestring): - # The only valid "on" value for an unnamed checkbox is 'on' - result = result == 'on' - _check(input, result) - else: - _check(input, v in value) - elif type == 'radio': - v = input.get('value') - _check(input, v in value) - else: - assert _nons(input.tag) == 'select' - for option in _options_xpath(input): - v = option.get('value') - if v is None: - # This seems to be the default, at least on IE - # FIXME: but I'm not sure - v = option.text_content() - _select(option, v in value) - -def _check(el, check): - if check: - el.set('checked', '') - else: - if 'checked' in el.attrib: - del el.attrib['checked'] - -def _select(el, select): - if select: - el.set('selected', '') - else: - if 'selected' in el.attrib: - del el.attrib['selected'] - -def _fill_single(input, value): - if _nons(input.tag) == 'textarea': - input.text = value - else: - input.set('value', value) - -def _find_form(el, form_id=None, form_index=None): - if form_id is None and form_index is None: - forms = _forms_xpath(el) - for form in forms: - return form - raise FormNotFound( - "No forms in page") - if form_id is not None: - form = el.get_element_by_id(form_id) - if form is not None: - return form - forms = _form_name_xpath(el, name=form_id) - if forms: - return forms[0] - else: - raise FormNotFound( - "No form with the name or id of %r (forms: %s)" - % (id, ', '.join(_find_form_ids(el)))) - if form_index is not None: - forms = _forms_xpath(el) - try: - return forms[form_index] - except IndexError: - raise FormNotFound( - "There is no form with the index %r (%i forms found)" - % (form_index, len(forms))) - -def _find_form_ids(el): - forms = _forms_xpath(el) - if not forms: - yield '(no forms)' - return - for index, form in enumerate(forms): - if form.get('id'): - if form.get('name'): - yield '%s or %s' % (form.get('id'), - form.get('name')) - else: - yield form.get('id') - elif form.get('name'): - yield form.get('name') - else: - yield '(unnamed form %s)' % index - -############################################################ -## Error filling -############################################################ - -class DefaultErrorCreator(object): - insert_before = True - block_inside = True - error_container_tag = 'div' - error_message_class = 'error-message' - error_block_class = 'error-block' - default_message = "Invalid" - - def __init__(self, **kw): - for name, value in kw.items(): - if not hasattr(self, name): - raise TypeError( - "Unexpected keyword argument: %s" % name) - setattr(self, name, value) - - def __call__(self, el, is_block, message): - error_el = el.makeelement(self.error_container_tag) - if self.error_message_class: - error_el.set('class', self.error_message_class) - if is_block and self.error_block_class: - error_el.set('class', error_el.get('class', '')+' '+self.error_block_class) - if message is None or message == '': - message = self.default_message - if isinstance(message, ElementBase): - error_el.append(message) - else: - assert isinstance(message, basestring), ( - "Bad message; should be a string or element: %r" % message) - error_el.text = message or self.default_message - if is_block and self.block_inside: - if self.insert_before: - error_el.tail = el.text - el.text = None - el.insert(0, error_el) - else: - el.append(error_el) - else: - parent = el.getparent() - pos = parent.index(el) - if self.insert_before: - parent.insert(pos, error_el) - else: - error_el.tail = el.tail - el.tail = None - parent.insert(pos+1, error_el) - -default_error_creator = DefaultErrorCreator() - - -def insert_errors( - el, - errors, - form_id=None, - form_index=None, - error_class="error", - error_creator=default_error_creator, - ): - el = _find_form(el, form_id=form_id, form_index=form_index) - for name, error in errors.items(): - if error is None: - continue - for error_el, message in _find_elements_for_name(el, name, error): - assert isinstance(message, (basestring, type(None), ElementBase)), ( - "Bad message: %r" % message) - _insert_error(error_el, message, error_class, error_creator) - -def insert_errors_html(html, values, **kw): - result_type = type(html) - if isinstance(html, basestring): - doc = fromstring(html) - else: - doc = copy.deepcopy(html) - insert_errors(doc, values, **kw) - return _transform_result(result_type, doc) - -def _insert_error(el, error, error_class, error_creator): - if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea': - is_block = False - else: - is_block = True - if _nons(el.tag) != 'form' and error_class: - _add_class(el, error_class) - if el.get('id'): - labels = _label_for_xpath(el, for_id=el.get('id')) - if labels: - for label in labels: - _add_class(label, error_class) - error_creator(el, is_block, error) - -def _add_class(el, class_name): - if el.get('class'): - el.set('class', el.get('class')+' '+class_name) - else: - el.set('class', class_name) - -def _find_elements_for_name(form, name, error): - if name is None: - # An error for the entire form - yield form, error - return - if name.startswith('#'): - # By id - el = form.get_element_by_id(name[1:]) - if el is not None: - yield el, error - return - els = _name_xpath(form, name=name) - if not els: - # FIXME: should this raise an exception? - return - if not isinstance(error, (list, tuple)): - yield els[0], error - return - # FIXME: if error is longer than els, should it raise an error? - for el, err in zip(els, error): - if err is None: - continue - yield el, err diff --git a/env/lib/python3.10/site-packages/lxml/html/html5parser.py b/env/lib/python3.10/site-packages/lxml/html/html5parser.py deleted file mode 100644 index 2f7be15..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/html5parser.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -An interface to html5lib that mimics the lxml.html interface. -""" -import sys -import string - -from html5lib import HTMLParser as _HTMLParser -from html5lib.treebuilders.etree_lxml import TreeBuilder -from lxml import etree -from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag - -# python3 compatibility -try: - _strings = basestring -except NameError: - _strings = (bytes, str) -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse - - -class HTMLParser(_HTMLParser): - """An html5lib HTML parser with lxml as tree.""" - - def __init__(self, strict=False, **kwargs): - _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) - - -try: - from html5lib import XHTMLParser as _XHTMLParser -except ImportError: - pass -else: - class XHTMLParser(_XHTMLParser): - """An html5lib XHTML Parser with lxml as tree.""" - - def __init__(self, strict=False, **kwargs): - _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) - - xhtml_parser = XHTMLParser() - - -def _find_tag(tree, tag): - elem = tree.find(tag) - if elem is not None: - return elem - return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag)) - - -def document_fromstring(html, guess_charset=None, parser=None): - """ - Parse a whole document into a string. - - If `guess_charset` is true, or if the input is not Unicode but a - byte string, the `chardet` library will perform charset guessing - on the string. - """ - if not isinstance(html, _strings): - raise TypeError('string required') - - if parser is None: - parser = html_parser - - options = {} - if guess_charset is None and isinstance(html, bytes): - # html5lib does not accept useChardet as an argument, if it - # detected the html argument would produce unicode objects. - guess_charset = True - if guess_charset is not None: - options['useChardet'] = guess_charset - return parser.parse(html, **options).getroot() - - -def fragments_fromstring(html, no_leading_text=False, - guess_charset=None, parser=None): - """Parses several HTML elements, returning a list of elements. - - The first item in the list may be a string. If no_leading_text is true, - then it will be an error if there is leading text, and it will always be - a list of only elements. - - If `guess_charset` is true, the `chardet` library will perform charset - guessing on the string. - """ - if not isinstance(html, _strings): - raise TypeError('string required') - - if parser is None: - parser = html_parser - - options = {} - if guess_charset is None and isinstance(html, bytes): - # html5lib does not accept useChardet as an argument, if it - # detected the html argument would produce unicode objects. - guess_charset = False - if guess_charset is not None: - options['useChardet'] = guess_charset - children = parser.parseFragment(html, 'div', **options) - if children and isinstance(children[0], _strings): - if no_leading_text: - if children[0].strip(): - raise etree.ParserError('There is leading text: %r' % - children[0]) - del children[0] - return children - - -def fragment_fromstring(html, create_parent=False, - guess_charset=None, parser=None): - """Parses a single HTML element; it is an error if there is more than - one element, or if anything but whitespace precedes or follows the - element. - - If 'create_parent' is true (or is a tag name) then a parent node - will be created to encapsulate the HTML in a single element. In - this case, leading or trailing text is allowed. - - If `guess_charset` is true, the `chardet` library will perform charset - guessing on the string. - """ - if not isinstance(html, _strings): - raise TypeError('string required') - - accept_leading_text = bool(create_parent) - - elements = fragments_fromstring( - html, guess_charset=guess_charset, parser=parser, - no_leading_text=not accept_leading_text) - - if create_parent: - if not isinstance(create_parent, _strings): - create_parent = 'div' - new_root = Element(create_parent) - if elements: - if isinstance(elements[0], _strings): - new_root.text = elements[0] - del elements[0] - new_root.extend(elements) - return new_root - - if not elements: - raise etree.ParserError('No elements found') - if len(elements) > 1: - raise etree.ParserError('Multiple elements found') - result = elements[0] - if result.tail and result.tail.strip(): - raise etree.ParserError('Element followed by text: %r' % result.tail) - result.tail = None - return result - - -def fromstring(html, guess_charset=None, parser=None): - """Parse the html, returning a single element/document. - - This tries to minimally parse the chunk of text, without knowing if it - is a fragment or a document. - - 'base_url' will set the document's base_url attribute (and the tree's - docinfo.URL) - - If `guess_charset` is true, or if the input is not Unicode but a - byte string, the `chardet` library will perform charset guessing - on the string. - """ - if not isinstance(html, _strings): - raise TypeError('string required') - doc = document_fromstring(html, parser=parser, - guess_charset=guess_charset) - - # document starts with doctype or <html>, full document! - start = html[:50] - if isinstance(start, bytes): - # Allow text comparison in python3. - # Decode as ascii, that also covers latin-1 and utf-8 for the - # characters we need. - start = start.decode('ascii', 'replace') - - start = start.lstrip().lower() - if start.startswith('<html') or start.startswith('<!doctype'): - return doc - - head = _find_tag(doc, 'head') - - # if the head is not empty we have a full document - if len(head): - return doc - - body = _find_tag(doc, 'body') - - # The body has just one element, so it was probably a single - # element passed in - if (len(body) == 1 and (not body.text or not body.text.strip()) - and (not body[-1].tail or not body[-1].tail.strip())): - return body[0] - - # Now we have a body which represents a bunch of tags which have the - # content that was passed in. We will create a fake container, which - # is the body tag, except <body> implies too much structure. - if _contains_block_level_tag(body): - body.tag = 'div' - else: - body.tag = 'span' - return body - - -def parse(filename_url_or_file, guess_charset=None, parser=None): - """Parse a filename, URL, or file-like object into an HTML document - tree. Note: this returns a tree, not an element. Use - ``parse(...).getroot()`` to get the document root. - - If ``guess_charset`` is true, the ``useChardet`` option is passed into - html5lib to enable character detection. This option is on by default - when parsing from URLs, off by default when parsing from file(-like) - objects (which tend to return Unicode more often than not), and on by - default when parsing from a file path (which is read in binary mode). - """ - if parser is None: - parser = html_parser - if not isinstance(filename_url_or_file, _strings): - fp = filename_url_or_file - if guess_charset is None: - # assume that file-like objects return Unicode more often than bytes - guess_charset = False - elif _looks_like_url(filename_url_or_file): - fp = urlopen(filename_url_or_file) - if guess_charset is None: - # assume that URLs return bytes - guess_charset = True - else: - fp = open(filename_url_or_file, 'rb') - if guess_charset is None: - guess_charset = True - - options = {} - # html5lib does not accept useChardet as an argument, if it - # detected the html argument would produce unicode objects. - if guess_charset: - options['useChardet'] = guess_charset - return parser.parse(fp, **options) - - -def _looks_like_url(str): - scheme = urlparse(str)[0] - if not scheme: - return False - elif (sys.platform == 'win32' and - scheme in string.ascii_letters - and len(scheme) == 1): - # looks like a 'normal' absolute path - return False - else: - return True - - -html_parser = HTMLParser() diff --git a/env/lib/python3.10/site-packages/lxml/html/soupparser.py b/env/lib/python3.10/site-packages/lxml/html/soupparser.py deleted file mode 100644 index e0cf3a0..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/soupparser.py +++ /dev/null @@ -1,314 +0,0 @@ -"""External interface to the BeautifulSoup HTML parser. -""" - -__all__ = ["fromstring", "parse", "convert_tree"] - -import re -from lxml import etree, html - -try: - from bs4 import ( - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, - Declaration, Doctype) - _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) -except ImportError: - from BeautifulSoup import ( - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, - Declaration) - _DECLARATION_OR_DOCTYPE = Declaration - - -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a string of HTML data into an Element tree using the - BeautifulSoup parser. - - Returns the root ``<html>`` Element of the tree. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - return _parse(data, beautifulsoup, makeelement, **bsargs) - - -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a file into an ElemenTree using the BeautifulSoup parser. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - if not hasattr(file, 'read'): - file = open(file) - root = _parse(file, beautifulsoup, makeelement, **bsargs) - return etree.ElementTree(root) - - -def convert_tree(beautiful_soup_tree, makeelement=None): - """Convert a BeautifulSoup tree to a list of Element trees. - - Returns a list instead of a single root Element to support - HTML-like soup with more than one root element. - - You can pass a different Element factory through the `makeelement` - keyword. - """ - root = _convert_tree(beautiful_soup_tree, makeelement) - children = root.getchildren() - for child in children: - root.remove(child) - return children - - -# helpers - -def _parse(source, beautifulsoup, makeelement, **bsargs): - if beautifulsoup is None: - beautifulsoup = BeautifulSoup - if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 - if 'convertEntities' not in bsargs: - bsargs['convertEntities'] = 'html' - if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 - if 'features' not in bsargs: - bsargs['features'] = 'html.parser' # use Python html parser - tree = beautifulsoup(source, **bsargs) - root = _convert_tree(tree, makeelement) - # from ET: wrap the document in a html root element, if necessary - if len(root) == 1 and root[0].tag == "html": - return root[0] - root.tag = "html" - return root - - -_parse_doctype_declaration = re.compile( - r'(?:\s|[<!])*DOCTYPE\s*HTML' - r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' - r'(?:\s+(\'[^\']*\'|"[^"]*"))?', - re.IGNORECASE).match - - -class _PseudoTag: - # Minimal imitation of BeautifulSoup.Tag - def __init__(self, contents): - self.name = 'html' - self.attrs = [] - self.contents = contents - - def __iter__(self): - return self.contents.__iter__() - - -def _convert_tree(beautiful_soup_tree, makeelement): - if makeelement is None: - makeelement = html.html_parser.makeelement - - # Split the tree into three parts: - # i) everything before the root element: document type - # declaration, comments, processing instructions, whitespace - # ii) the root(s), - # iii) everything after the root: comments, processing - # instructions, whitespace - first_element_idx = last_element_idx = None - html_root = declaration = None - for i, e in enumerate(beautiful_soup_tree): - if isinstance(e, Tag): - if first_element_idx is None: - first_element_idx = i - last_element_idx = i - if html_root is None and e.name and e.name.lower() == 'html': - html_root = e - elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): - declaration = e - - # For a nice, well-formatted document, the variable roots below is - # a list consisting of a single <html> element. However, the document - # may be a soup like '<meta><head><title>Hello</head><body>Hi - # all<\p>'. In this example roots is a list containing meta, head - # and body elements. - if first_element_idx is None: - pre_root = post_root = [] - roots = beautiful_soup_tree.contents - else: - pre_root = beautiful_soup_tree.contents[:first_element_idx] - roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] - post_root = beautiful_soup_tree.contents[last_element_idx+1:] - - # Reorganize so that there is one <html> root... - if html_root is not None: - # ... use existing one if possible, ... - i = roots.index(html_root) - html_root.contents = roots[:i] + html_root.contents + roots[i+1:] - else: - # ... otherwise create a new one. - html_root = _PseudoTag(roots) - - convert_node = _init_node_converters(makeelement) - - # Process pre_root - res_root = convert_node(html_root) - prev = res_root - for e in reversed(pre_root): - converted = convert_node(e) - if converted is not None: - prev.addprevious(converted) - prev = converted - - # ditto for post_root - prev = res_root - for e in post_root: - converted = convert_node(e) - if converted is not None: - prev.addnext(converted) - prev = converted - - if declaration is not None: - try: - # bs4 provides full Doctype string - doctype_string = declaration.output_ready() - except AttributeError: - doctype_string = declaration.string - - match = _parse_doctype_declaration(doctype_string) - if not match: - # Something is wrong if we end up in here. Since soupparser should - # tolerate errors, do not raise Exception, just let it pass. - pass - else: - external_id, sys_uri = match.groups() - docinfo = res_root.getroottree().docinfo - # strip quotes and update DOCTYPE values (any of None, '', '...') - docinfo.public_id = external_id and external_id[1:-1] - docinfo.system_url = sys_uri and sys_uri[1:-1] - - return res_root - - -def _init_node_converters(makeelement): - converters = {} - ordered_node_types = [] - - def converter(*types): - def add(handler): - for t in types: - converters[t] = handler - ordered_node_types.append(t) - return handler - return add - - def find_best_converter(node): - for t in ordered_node_types: - if isinstance(node, t): - return converters[t] - return None - - def convert_node(bs_node, parent=None): - # duplicated in convert_tag() below - try: - handler = converters[type(bs_node)] - except KeyError: - handler = converters[type(bs_node)] = find_best_converter(bs_node) - if handler is None: - return None - return handler(bs_node, parent) - - def map_attrs(bs_attrs): - if isinstance(bs_attrs, dict): # bs4 - attribs = {} - for k, v in bs_attrs.items(): - if isinstance(v, list): - v = " ".join(v) - attribs[k] = unescape(v) - else: - attribs = dict((k, unescape(v)) for k, v in bs_attrs) - return attribs - - def append_text(parent, text): - if len(parent) == 0: - parent.text = (parent.text or '') + text - else: - parent[-1].tail = (parent[-1].tail or '') + text - - # converters are tried in order of their definition - - @converter(Tag, _PseudoTag) - def convert_tag(bs_node, parent): - attrs = bs_node.attrs - if parent is not None: - attribs = map_attrs(attrs) if attrs else None - res = etree.SubElement(parent, bs_node.name, attrib=attribs) - else: - attribs = map_attrs(attrs) if attrs else {} - res = makeelement(bs_node.name, attrib=attribs) - - for child in bs_node: - # avoid double recursion by inlining convert_node(), see above - try: - handler = converters[type(child)] - except KeyError: - pass - else: - if handler is not None: - handler(child, res) - continue - convert_node(child, res) - return res - - @converter(Comment) - def convert_comment(bs_node, parent): - res = html.HtmlComment(bs_node) - if parent is not None: - parent.append(res) - return res - - @converter(ProcessingInstruction) - def convert_pi(bs_node, parent): - if bs_node.endswith('?'): - # The PI is of XML style (<?as df?>) but BeautifulSoup - # interpreted it as being SGML style (<?as df>). Fix. - bs_node = bs_node[:-1] - res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) - if parent is not None: - parent.append(res) - return res - - @converter(NavigableString) - def convert_text(bs_node, parent): - if parent is not None: - append_text(parent, unescape(bs_node)) - return None - - return convert_node - - -# copied from ET's ElementSoup - -try: - from html.entities import name2codepoint # Python 3 -except ImportError: - from htmlentitydefs import name2codepoint - - -handle_entities = re.compile(r"&(\w+);").sub - - -try: - unichr -except NameError: - # Python 3 - unichr = chr - - -def unescape(string): - if not string: - return '' - # work around oddities in BeautifulSoup's entity handling - def unescape_entity(m): - try: - return unichr(name2codepoint[m.group(1)]) - except KeyError: - return m.group(0) # use as is - return handle_entities(unescape_entity, string) diff --git a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py deleted file mode 100644 index f352a1c..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Doctest module for HTML comparison. - -Usage:: - - >>> import lxml.html.usedoctest - >>> # now do your HTML doctests ... - -See `lxml.doctestcompare`. -""" - -from lxml import doctestcompare - -doctestcompare.temp_install(html=True, del_module=__name__) |