aboutsummaryrefslogtreecommitdiffstats
path: root/env/lib/python3.10/site-packages/lxml/html
diff options
context:
space:
mode:
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html')
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/ElementSoup.py10
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__init__.py1946
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pycbin511 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pycbin56397 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pycbin2297 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pycbin3603 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pycbin2096 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pycbin2938 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pycbin17699 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pycbin2812 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pycbin24074 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pycbin7384 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pycbin6414 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pycbin8014 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pycbin437 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/_diffcommand.py88
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/_html5builder.py100
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/_setmixin.py56
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/builder.py133
-rwxr-xr-xenv/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.sobin564824 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/clean.py786
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/defs.py135
-rwxr-xr-xenv/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.sobin787752 -> 0 bytes
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/diff.py884
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/formfill.py299
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/html5parser.py260
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/soupparser.py314
-rw-r--r--env/lib/python3.10/site-packages/lxml/html/usedoctest.py13
28 files changed, 0 insertions, 5024 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py b/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py
deleted file mode 100644
index c35365d..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/ElementSoup.py
+++ /dev/null
@@ -1,10 +0,0 @@
-__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["parse", "convert_tree"]
-
-from .soupparser import convert_tree, parse as _parse
-
-def parse(file, beautifulsoup=None, makeelement=None):
- root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
- return root.getroot()
diff --git a/env/lib/python3.10/site-packages/lxml/html/__init__.py b/env/lib/python3.10/site-packages/lxml/html/__init__.py
deleted file mode 100644
index ef06a40..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__init__.py
+++ /dev/null
@@ -1,1946 +0,0 @@
-# Copyright (c) 2004 Ian Bicking. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-#
-# 3. Neither the name of Ian Bicking nor the names of its contributors may
-# be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-"""The ``lxml.html`` tool set for HTML handling.
-"""
-
-from __future__ import absolute_import
-
-__all__ = [
- 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
- 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
- 'find_rel_links', 'find_class', 'make_links_absolute',
- 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
-
-
-import copy
-import sys
-import re
-from functools import partial
-
-try:
- from collections.abc import MutableMapping, MutableSet
-except ImportError:
- from collections import MutableMapping, MutableSet
-
-from .. import etree
-from . import defs
-from ._setmixin import SetMixin
-
-try:
- from urlparse import urljoin
-except ImportError:
- # Python 3
- from urllib.parse import urljoin
-
-try:
- unicode
-except NameError:
- # Python 3
- unicode = str
-try:
- basestring
-except NameError:
- # Python 3
- basestring = (str, bytes)
-
-
-def __fix_docstring(s):
- if not s:
- return s
- if sys.version_info[0] >= 3:
- sub = re.compile(r"^(\s*)u'", re.M).sub
- else:
- sub = re.compile(r"^(\s*)b'", re.M).sub
- return sub(r"\1'", s)
-
-
-XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
-
-_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
- namespaces={'x':XHTML_NAMESPACE})
-_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
- namespaces={'x':XHTML_NAMESPACE})
-_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
- namespaces={'x':XHTML_NAMESPACE})
-#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
-_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
-_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
-_collect_string_content = etree.XPath("string()")
-_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
-_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
-_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
- namespaces={'x':XHTML_NAMESPACE})
-_archive_re = re.compile(r'[^ ]+')
-_parse_meta_refresh_url = re.compile(
- r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
-
-
-def _unquote_match(s, pos):
- if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
- return s[1:-1], pos+1
- else:
- return s,pos
-
-
-def _transform_result(typ, result):
- """Convert the result back into the input type.
- """
- if issubclass(typ, bytes):
- return tostring(result, encoding='utf-8')
- elif issubclass(typ, unicode):
- return tostring(result, encoding='unicode')
- else:
- return result
-
-
-def _nons(tag):
- if isinstance(tag, basestring):
- if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
- return tag.split('}')[-1]
- return tag
-
-
-class Classes(MutableSet):
- """Provides access to an element's class attribute as a set-like collection.
- Usage::
-
- >>> el = fromstring('<p class="hidden large">Text</p>')
- >>> classes = el.classes # or: classes = Classes(el.attrib)
- >>> classes |= ['block', 'paragraph']
- >>> el.get('class')
- 'hidden large block paragraph'
- >>> classes.toggle('hidden')
- False
- >>> el.get('class')
- 'large block paragraph'
- >>> classes -= ('some', 'classes', 'block')
- >>> el.get('class')
- 'large paragraph'
- """
- def __init__(self, attributes):
- self._attributes = attributes
- self._get_class_value = partial(attributes.get, 'class', '')
-
- def add(self, value):
- """
- Add a class.
-
- This has no effect if the class is already present.
- """
- if not value or re.search(r'\s', value):
- raise ValueError("Invalid class name: %r" % value)
- classes = self._get_class_value().split()
- if value in classes:
- return
- classes.append(value)
- self._attributes['class'] = ' '.join(classes)
-
- def discard(self, value):
- """
- Remove a class if it is currently present.
-
- If the class is not present, do nothing.
- """
- if not value or re.search(r'\s', value):
- raise ValueError("Invalid class name: %r" % value)
- classes = [name for name in self._get_class_value().split()
- if name != value]
- if classes:
- self._attributes['class'] = ' '.join(classes)
- elif 'class' in self._attributes:
- del self._attributes['class']
-
- def remove(self, value):
- """
- Remove a class; it must currently be present.
-
- If the class is not present, raise a KeyError.
- """
- if not value or re.search(r'\s', value):
- raise ValueError("Invalid class name: %r" % value)
- super(Classes, self).remove(value)
-
- def __contains__(self, name):
- classes = self._get_class_value()
- return name in classes and name in classes.split()
-
- def __iter__(self):
- return iter(self._get_class_value().split())
-
- def __len__(self):
- return len(self._get_class_value().split())
-
- # non-standard methods
-
- def update(self, values):
- """
- Add all names from 'values'.
- """
- classes = self._get_class_value().split()
- extended = False
- for value in values:
- if value not in classes:
- classes.append(value)
- extended = True
- if extended:
- self._attributes['class'] = ' '.join(classes)
-
- def toggle(self, value):
- """
- Add a class name if it isn't there yet, or remove it if it exists.
-
- Returns true if the class was added (and is now enabled) and
- false if it was removed (and is now disabled).
- """
- if not value or re.search(r'\s', value):
- raise ValueError("Invalid class name: %r" % value)
- classes = self._get_class_value().split()
- try:
- classes.remove(value)
- enabled = False
- except ValueError:
- classes.append(value)
- enabled = True
- if classes:
- self._attributes['class'] = ' '.join(classes)
- else:
- del self._attributes['class']
- return enabled
-
-
-class HtmlMixin(object):
-
- def set(self, key, value=None):
- """set(self, key, value=None)
-
- Sets an element attribute. If no value is provided, or if the value is None,
- creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
- for ``form.set('novalidate')``.
- """
- super(HtmlMixin, self).set(key, value)
-
- @property
- def classes(self):
- """
- A set-like wrapper around the 'class' attribute.
- """
- return Classes(self.attrib)
-
- @classes.setter
- def classes(self, classes):
- assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc.
- value = classes._get_class_value()
- if value:
- self.set('class', value)
- elif self.get('class') is not None:
- del self.attrib['class']
-
- @property
- def base_url(self):
- """
- Returns the base URL, given when the page was parsed.
-
- Use with ``urlparse.urljoin(el.base_url, href)`` to get
- absolute URLs.
- """
- return self.getroottree().docinfo.URL
-
- @property
- def forms(self):
- """
- Return a list of all the forms
- """
- return _forms_xpath(self)
-
- @property
- def body(self):
- """
- Return the <body> element. Can be called from a child element
- to get the document's head.
- """
- return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
-
- @property
- def head(self):
- """
- Returns the <head> element. Can be called from a child
- element to get the document's head.
- """
- return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
-
- @property
- def label(self):
- """
- Get or set any <label> element associated with this element.
- """
- id = self.get('id')
- if not id:
- return None
- result = _label_xpath(self, id=id)
- if not result:
- return None
- else:
- return result[0]
-
- @label.setter
- def label(self, label):
- id = self.get('id')
- if not id:
- raise TypeError(
- "You cannot set a label for an element (%r) that has no id"
- % self)
- if _nons(label.tag) != 'label':
- raise TypeError(
- "You can only assign label to a label element (not %r)"
- % label)
- label.set('for', id)
-
- @label.deleter
- def label(self):
- label = self.label
- if label is not None:
- del label.attrib['for']
-
- def drop_tree(self):
- """
- Removes this element from the tree, including its children and
- text. The tail text is joined to the previous element or
- parent.
- """
- parent = self.getparent()
- assert parent is not None
- if self.tail:
- previous = self.getprevious()
- if previous is None:
- parent.text = (parent.text or '') + self.tail
- else:
- previous.tail = (previous.tail or '') + self.tail
- parent.remove(self)
-
- def drop_tag(self):
- """
- Remove the tag, but not its children or text. The children and text
- are merged into the parent.
-
- Example::
-
- >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
- >>> h.find('.//b').drop_tag()
- >>> print(tostring(h, encoding='unicode'))
- <div>Hello World!</div>
- """
- parent = self.getparent()
- assert parent is not None
- previous = self.getprevious()
- if self.text and isinstance(self.tag, basestring):
- # not a Comment, etc.
- if previous is None:
- parent.text = (parent.text or '') + self.text
- else:
- previous.tail = (previous.tail or '') + self.text
- if self.tail:
- if len(self):
- last = self[-1]
- last.tail = (last.tail or '') + self.tail
- elif previous is None:
- parent.text = (parent.text or '') + self.tail
- else:
- previous.tail = (previous.tail or '') + self.tail
- index = parent.index(self)
- parent[index:index+1] = self[:]
-
- def find_rel_links(self, rel):
- """
- Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
- """
- rel = rel.lower()
- return [el for el in _rel_links_xpath(self)
- if el.get('rel').lower() == rel]
-
- def find_class(self, class_name):
- """
- Find any elements with the given class name.
- """
- return _class_xpath(self, class_name=class_name)
-
- def get_element_by_id(self, id, *default):
- """
- Get the first element in a document with the given id. If none is
- found, return the default argument if provided or raise KeyError
- otherwise.
-
- Note that there can be more than one element with the same id,
- and this isn't uncommon in HTML documents found in the wild.
- Browsers return only the first match, and this function does
- the same.
- """
- try:
- # FIXME: should this check for multiple matches?
- # browsers just return the first one
- return _id_xpath(self, id=id)[0]
- except IndexError:
- if default:
- return default[0]
- else:
- raise KeyError(id)
-
- def text_content(self):
- """
- Return the text content of the tag (and the text in any children).
- """
- return _collect_string_content(self)
-
- def cssselect(self, expr, translator='html'):
- """
- Run the CSS expression on this element and its children,
- returning a list of the results.
-
- Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self)
- -- note that pre-compiling the expression can provide a substantial
- speedup.
- """
- # Do the import here to make the dependency optional.
- from lxml.cssselect import CSSSelector
- return CSSSelector(expr, translator=translator)(self)
-
- ########################################
- ## Link functions
- ########################################
-
- def make_links_absolute(self, base_url=None, resolve_base_href=True,
- handle_failures=None):
- """
- Make all links in the document absolute, given the
- ``base_url`` for the document (the full URL where the document
- came from), or if no ``base_url`` is given, then the ``.base_url``
- of the document.
-
- If ``resolve_base_href`` is true, then any ``<base href>``
- tags in the document are used *and* removed from the document.
- If it is false then any such tag is ignored.
-
- If ``handle_failures`` is None (default), a failure to process
- a URL will abort the processing. If set to 'ignore', errors
- are ignored. If set to 'discard', failing URLs will be removed.
- """
- if base_url is None:
- base_url = self.base_url
- if base_url is None:
- raise TypeError(
- "No base_url given, and the document has no base_url")
- if resolve_base_href:
- self.resolve_base_href()
-
- if handle_failures == 'ignore':
- def link_repl(href):
- try:
- return urljoin(base_url, href)
- except ValueError:
- return href
- elif handle_failures == 'discard':
- def link_repl(href):
- try:
- return urljoin(base_url, href)
- except ValueError:
- return None
- elif handle_failures is None:
- def link_repl(href):
- return urljoin(base_url, href)
- else:
- raise ValueError(
- "unexpected value for handle_failures: %r" % handle_failures)
-
- self.rewrite_links(link_repl)
-
- def resolve_base_href(self, handle_failures=None):
- """
- Find any ``<base href>`` tag in the document, and apply its
- values to all links found in the document. Also remove the
- tag once it has been applied.
-
- If ``handle_failures`` is None (default), a failure to process
- a URL will abort the processing. If set to 'ignore', errors
- are ignored. If set to 'discard', failing URLs will be removed.
- """
- base_href = None
- basetags = self.xpath('//base[@href]|//x:base[@href]',
- namespaces={'x': XHTML_NAMESPACE})
- for b in basetags:
- base_href = b.get('href')
- b.drop_tree()
- if not base_href:
- return
- self.make_links_absolute(base_href, resolve_base_href=False,
- handle_failures=handle_failures)
-
- def iterlinks(self):
- """
- Yield (element, attribute, link, pos), where attribute may be None
- (indicating the link is in the text). ``pos`` is the position
- where the link occurs; often 0, but sometimes something else in
- the case of links in stylesheets or style tags.
-
- Note: <base href> is *not* taken into account in any way. The
- link you get is exactly the link in the document.
-
- Note: multiple links inside of a single text string or
- attribute value are returned in reversed order. This makes it
- possible to replace or delete them from the text string value
- based on their reported text positions. Otherwise, a
- modification at one text position can change the positions of
- links reported later on.
- """
- link_attrs = defs.link_attrs
- for el in self.iter(etree.Element):
- attribs = el.attrib
- tag = _nons(el.tag)
- if tag == 'object':
- codebase = None
- ## <object> tags have attributes that are relative to
- ## codebase
- if 'codebase' in attribs:
- codebase = el.get('codebase')
- yield (el, 'codebase', codebase, 0)
- for attrib in ('classid', 'data'):
- if attrib in attribs:
- value = el.get(attrib)
- if codebase is not None:
- value = urljoin(codebase, value)
- yield (el, attrib, value, 0)
- if 'archive' in attribs:
- for match in _archive_re.finditer(el.get('archive')):
- value = match.group(0)
- if codebase is not None:
- value = urljoin(codebase, value)
- yield (el, 'archive', value, match.start())
- else:
- for attrib in link_attrs:
- if attrib in attribs:
- yield (el, attrib, attribs[attrib], 0)
- if tag == 'meta':
- http_equiv = attribs.get('http-equiv', '').lower()
- if http_equiv == 'refresh':
- content = attribs.get('content', '')
- match = _parse_meta_refresh_url(content)
- url = (match.group('url') if match else content).strip()
- # unexpected content means the redirect won't work, but we might
- # as well be permissive and return the entire string.
- if url:
- url, pos = _unquote_match(
- url, match.start('url') if match else content.find(url))
- yield (el, 'content', url, pos)
- elif tag == 'param':
- valuetype = el.get('valuetype') or ''
- if valuetype.lower() == 'ref':
- ## FIXME: while it's fine we *find* this link,
- ## according to the spec we aren't supposed to
- ## actually change the value, including resolving
- ## it. It can also still be a link, even if it
- ## doesn't have a valuetype="ref" (which seems to be the norm)
- ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
- yield (el, 'value', el.get('value'), 0)
- elif tag == 'style' and el.text:
- urls = [
- # (start_pos, url)
- _unquote_match(match.group(1), match.start(1))[::-1]
- for match in _iter_css_urls(el.text)
- ] + [
- (match.start(1), match.group(1))
- for match in _iter_css_imports(el.text)
- ]
- if urls:
- # sort by start pos to bring both match sets back into order
- # and reverse the list to report correct positions despite
- # modifications
- urls.sort(reverse=True)
- for start, url in urls:
- yield (el, None, url, start)
- if 'style' in attribs:
- urls = list(_iter_css_urls(attribs['style']))
- if urls:
- # return in reversed order to simplify in-place modifications
- for match in urls[::-1]:
- url, start = _unquote_match(match.group(1), match.start(1))
- yield (el, 'style', url, start)
-
- def rewrite_links(self, link_repl_func, resolve_base_href=True,
- base_href=None):
- """
- Rewrite all the links in the document. For each link
- ``link_repl_func(link)`` will be called, and the return value
- will replace the old link.
-
- Note that links may not be absolute (unless you first called
- ``make_links_absolute()``), and may be internal (e.g.,
- ``'#anchor'``). They can also be values like
- ``'mailto:email'`` or ``'javascript:expr'``.
-
- If you give ``base_href`` then all links passed to
- ``link_repl_func()`` will take that into account.
-
- If the ``link_repl_func`` returns None, the attribute or
- tag text will be removed completely.
- """
- if base_href is not None:
- # FIXME: this can be done in one pass with a wrapper
- # around link_repl_func
- self.make_links_absolute(
- base_href, resolve_base_href=resolve_base_href)
- elif resolve_base_href:
- self.resolve_base_href()
-
- for el, attrib, link, pos in self.iterlinks():
- new_link = link_repl_func(link.strip())
- if new_link == link:
- continue
- if new_link is None:
- # Remove the attribute or element content
- if attrib is None:
- el.text = ''
- else:
- del el.attrib[attrib]
- continue
-
- if attrib is None:
- new = el.text[:pos] + new_link + el.text[pos+len(link):]
- el.text = new
- else:
- cur = el.get(attrib)
- if not pos and len(cur) == len(link):
- new = new_link # most common case
- else:
- new = cur[:pos] + new_link + cur[pos+len(link):]
- el.set(attrib, new)
-
-
-class _MethodFunc(object):
- """
- An object that represents a method on an element as a function;
- the function takes either an element or an HTML string. It
- returns whatever the function normally returns, or if the function
- works in-place (and so returns None) it returns a serialized form
- of the resulting document.
- """
- def __init__(self, name, copy=False, source_class=HtmlMixin):
- self.name = name
- self.copy = copy
- self.__doc__ = getattr(source_class, self.name).__doc__
- def __call__(self, doc, *args, **kw):
- result_type = type(doc)
- if isinstance(doc, basestring):
- if 'copy' in kw:
- raise TypeError(
- "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
- doc = fromstring(doc, **kw)
- else:
- if 'copy' in kw:
- make_a_copy = kw.pop('copy')
- else:
- make_a_copy = self.copy
- if make_a_copy:
- doc = copy.deepcopy(doc)
- meth = getattr(doc, self.name)
- result = meth(*args, **kw)
- # FIXME: this None test is a bit sloppy
- if result is None:
- # Then return what we got in
- return _transform_result(result_type, doc)
- else:
- return result
-
-
-find_rel_links = _MethodFunc('find_rel_links', copy=False)
-find_class = _MethodFunc('find_class', copy=False)
-make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
-resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
-iterlinks = _MethodFunc('iterlinks', copy=False)
-rewrite_links = _MethodFunc('rewrite_links', copy=True)
-
-
-class HtmlComment(HtmlMixin, etree.CommentBase):
- pass
-
-
-class HtmlElement(HtmlMixin, etree.ElementBase):
- pass
-
-
-class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
- pass
-
-
-class HtmlEntity(HtmlMixin, etree.EntityBase):
- pass
-
-
-class HtmlElementClassLookup(etree.CustomElementClassLookup):
- """A lookup scheme for HTML Element classes.
-
- To create a lookup instance with different Element classes, pass a tag
- name mapping of Element classes in the ``classes`` keyword argument and/or
- a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
- The special key '*' denotes a Mixin class that should be mixed into all
- Element classes.
- """
- _default_element_classes = {}
-
- def __init__(self, classes=None, mixins=None):
- etree.CustomElementClassLookup.__init__(self)
- if classes is None:
- classes = self._default_element_classes.copy()
- if mixins:
- mixers = {}
- for name, value in mixins:
- if name == '*':
- for n in classes.keys():
- mixers.setdefault(n, []).append(value)
- else:
- mixers.setdefault(name, []).append(value)
- for name, mix_bases in mixers.items():
- cur = classes.get(name, HtmlElement)
- bases = tuple(mix_bases + [cur])
- classes[name] = type(cur.__name__, bases, {})
- self._element_classes = classes
-
- def lookup(self, node_type, document, namespace, name):
- if node_type == 'element':
- return self._element_classes.get(name.lower(), HtmlElement)
- elif node_type == 'comment':
- return HtmlComment
- elif node_type == 'PI':
- return HtmlProcessingInstruction
- elif node_type == 'entity':
- return HtmlEntity
- # Otherwise normal lookup
- return None
-
-
-################################################################################
-# parsing
-################################################################################
-
-_looks_like_full_html_unicode = re.compile(
- unicode(r'^\s*<(?:html|!doctype)'), re.I).match
-_looks_like_full_html_bytes = re.compile(
- r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
-
-
-def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
- if parser is None:
- parser = html_parser
- value = etree.fromstring(html, parser, **kw)
- if value is None:
- raise etree.ParserError(
- "Document is empty")
- if ensure_head_body and value.find('head') is None:
- value.insert(0, Element('head'))
- if ensure_head_body and value.find('body') is None:
- value.append(Element('body'))
- return value
-
-
-def fragments_fromstring(html, no_leading_text=False, base_url=None,
- parser=None, **kw):
- """Parses several HTML elements, returning a list of elements.
-
- The first item in the list may be a string.
- If no_leading_text is true, then it will be an error if there is
- leading text, and it will always be a list of only elements.
-
- base_url will set the document's base_url attribute
- (and the tree's docinfo.URL).
- """
- if parser is None:
- parser = html_parser
- # FIXME: check what happens when you give html with a body, head, etc.
- if isinstance(html, bytes):
- if not _looks_like_full_html_bytes(html):
- # can't use %-formatting in early Py3 versions
- html = ('<html><body>'.encode('ascii') + html +
- '</body></html>'.encode('ascii'))
- else:
- if not _looks_like_full_html_unicode(html):
- html = '<html><body>%s</body></html>' % html
- doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
- assert _nons(doc.tag) == 'html'
- bodies = [e for e in doc if _nons(e.tag) == 'body']
- assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
- body = bodies[0]
- elements = []
- if no_leading_text and body.text and body.text.strip():
- raise etree.ParserError(
- "There is leading text: %r" % body.text)
- if body.text and body.text.strip():
- elements.append(body.text)
- elements.extend(body)
- # FIXME: removing the reference to the parent artificial document
- # would be nice
- return elements
-
-
-def fragment_fromstring(html, create_parent=False, base_url=None,
- parser=None, **kw):
- """
- Parses a single HTML element; it is an error if there is more than
- one element, or if anything but whitespace precedes or follows the
- element.
-
- If ``create_parent`` is true (or is a tag name) then a parent node
- will be created to encapsulate the HTML in a single element. In this
- case, leading or trailing text is also allowed, as are multiple elements
- as result of the parsing.
-
- Passing a ``base_url`` will set the document's ``base_url`` attribute
- (and the tree's docinfo.URL).
- """
- if parser is None:
- parser = html_parser
-
- accept_leading_text = bool(create_parent)
-
- elements = fragments_fromstring(
- html, parser=parser, no_leading_text=not accept_leading_text,
- base_url=base_url, **kw)
-
- if create_parent:
- if not isinstance(create_parent, basestring):
- create_parent = 'div'
- new_root = Element(create_parent)
- if elements:
- if isinstance(elements[0], basestring):
- new_root.text = elements[0]
- del elements[0]
- new_root.extend(elements)
- return new_root
-
- if not elements:
- raise etree.ParserError('No elements found')
- if len(elements) > 1:
- raise etree.ParserError(
- "Multiple elements found (%s)"
- % ', '.join([_element_name(e) for e in elements]))
- el = elements[0]
- if el.tail and el.tail.strip():
- raise etree.ParserError(
- "Element followed by text: %r" % el.tail)
- el.tail = None
- return el
-
-
-def fromstring(html, base_url=None, parser=None, **kw):
- """
- Parse the html, returning a single element/document.
-
- This tries to minimally parse the chunk of text, without knowing if it
- is a fragment or a document.
-
- base_url will set the document's base_url attribute (and the tree's docinfo.URL)
- """
- if parser is None:
- parser = html_parser
- if isinstance(html, bytes):
- is_full_html = _looks_like_full_html_bytes(html)
- else:
- is_full_html = _looks_like_full_html_unicode(html)
- doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
- if is_full_html:
- return doc
- # otherwise, lets parse it out...
- bodies = doc.findall('body')
- if not bodies:
- bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
- if bodies:
- body = bodies[0]
- if len(bodies) > 1:
- # Somehow there are multiple bodies, which is bad, but just
- # smash them into one body
- for other_body in bodies[1:]:
- if other_body.text:
- if len(body):
- body[-1].tail = (body[-1].tail or '') + other_body.text
- else:
- body.text = (body.text or '') + other_body.text
- body.extend(other_body)
- # We'll ignore tail
- # I guess we are ignoring attributes too
- other_body.drop_tree()
- else:
- body = None
- heads = doc.findall('head')
- if not heads:
- heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
- if heads:
- # Well, we have some sort of structure, so lets keep it all
- head = heads[0]
- if len(heads) > 1:
- for other_head in heads[1:]:
- head.extend(other_head)
- # We don't care about text or tail in a head
- other_head.drop_tree()
- return doc
- if body is None:
- return doc
- if (len(body) == 1 and (not body.text or not body.text.strip())
- and (not body[-1].tail or not body[-1].tail.strip())):
- # The body has just one element, so it was probably a single
- # element passed in
- return body[0]
- # Now we have a body which represents a bunch of tags which have the
- # content that was passed in. We will create a fake container, which
- # is the body tag, except <body> implies too much structure.
- if _contains_block_level_tag(body):
- body.tag = 'div'
- else:
- body.tag = 'span'
- return body
-
-
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document tree.

    The result is a tree, not an element; call ``parse(...).getroot()``
    to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when parsing from a file-like object.  When ``parser`` is
    None, the module-level ``html_parser`` is used.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
-
-
def _contains_block_level_tag(el):
    # True as soon as any element yielded by ``el.iter(etree.Element)`` has
    # a ``_nons``-normalised tag that is listed in ``defs.block_tags``.
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
-
-
def _element_name(el):
    # Classify ``el``: comment nodes give 'comment', plain strings give
    # 'string', everything else gives its ``_nons``-normalised tag.
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
-
-
-################################################################################
-# form handling
-################################################################################
-
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An accessor (`InputGetter`) over all the input elements of
        this form.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view (`FieldsDict`) of all the fields in this
        form.  Assigning into it changes the form's values.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that exist in the form but are not mentioned in ``value``
        # are reset to None once the given values have been applied.
        leftover = current.keys()
        for key, new_value in value.items():
            if key in leftover:
                leftover.remove(key)
            current[key] = new_value
        for key in leftover:
            if key is not None:
                # None would be an unnamed input; these aren't really
                # expressed in form_values() anyway.
                current[key] = None

    def _name(self):
        # Prefer the ``name`` attribute, then '#' + ``id``, and finally
        # the position of this form among the forms found under the body.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return the form's field values as a list of ``(name, value)``
        tuples, suitable for passing to ``urlencode()``.

        Unnamed and disabled controls are skipped, as are unchecked
        checkable inputs and inputs of type submit, image, reset or file.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # one tuple per selected option
                    results.extend((name, v) for v in value)
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an action, the action is
        returned joined onto the base URL.
        """
        action = self.get('action')
        base_url = self.base_url
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  The getter always returns an
        upper-case string and falls back to ``'GET'``.
        """
        raw_method = self.get('method', 'GET')
        return raw_method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
-
-
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object as from
    ``urlopen()``, whose ``.geturl()`` shows the final URL after any
    redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    pairs) is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        pairs = extra_values.items() if hasattr(extra_values, 'items') else extra_values
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's base URL when the form has no action.
    url = form.action or form.base_url
    return opener(form.method, url, values)
-
-
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on urllib.

    For 'GET' the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request
    body, encoded to ASCII bytes.  Raises ValueError when ``url`` is
    empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # extend an existing query string, or start a new one
        separator = '&' if '?' in url else '?'
        url = url + separator + encoded
        data = None
    else:
        data = encoded if isinstance(encoded, bytes) else encoded.encode('ASCII')
    return urlopen(url, data)
-
-
class FieldsDict(MutableMapping):
    """
    Mutable mapping of field name to field value, backed by an inputs
    accessor (``form.inputs``).

    Reading and writing an item reads and writes the ``.value`` of the
    input found under that name.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Deleting a field is not supported; the message names this class
        # (it previously referred to a non-existent "ElementDict").
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
-
-
class InputGetter(object):

    """
    Accessor over all the input fields of a form.

    Look a field up by name with ``form.inputs['field_name']``.  When
    several checkboxes share that name they come back as a
    `CheckboxGroup` (a list that also supports value setting); radio
    inputs are grouped the same way into a `RadioGroup`.  Use
    ``.keys()`` and ``.items()`` to process the fields in this grouped
    form.

    Iterating over the accessor yields every individual input element
    instead, so checkboxes and radios appear one by one.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        kind = first.get('type')
        if len(matches) > 1 and kind in ('radio', 'checkbox'):
            group_class = RadioGroup if kind == 'radio' else CheckboxGroup
            group = group_class(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.

        :return: A list of all unique field names.
        """
        ordered = []
        # seeding with None keeps unnamed inputs out of the result
        known = {None}
        for field in self:
            field_name = field.name
            if field_name in known:
                continue
            known.add(field_name)
            ordered.append(field_name)
        return ordered

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        pairs = []
        known = set()
        for field in self:
            field_name = field.name
            if field_name in known:
                continue
            known.add(field_name)
            pairs.append((field_name, self[field_name]))
        return pairs

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count
-
-
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # deleting an absent name is a no-op
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # The type is only shown for elements that expose a non-empty
        # ``type`` attribute.
        element_type = getattr(self, 'type', None)
        type_part = ' type=%r' % element_type if element_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
-
-
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and written through ``.value``.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        # Child elements are rare here; when present they are serialised
        # and appended after the leading text.
        pieces = [self.text or '']
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # drop all children, then store the new text
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
-
-
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  Either way ``.value_options`` lists the possible
    values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # The last option carrying ``selected`` wins; without any, fall
        # back to the first option that is not disabled.
        chosen = None
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        if chosen is None:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
        if chosen is None:
            return None
        value = chosen.get('value')
        if value is None:
            # no value attribute: use the stripped option text
            value = (chosen.text or '').strip()
        return value

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # unselect everything, then select the matching option (if any)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of each ``<option>`` element, or its stripped text
        when it has no ``value`` attribute).
        """
        collected = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            if option_value is None:
                option_value = (option.text or '').strip()
            collected.append(option_value)
        return collected

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
-
-
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yields the value of every option carrying ``selected``; an
        # option without a ``value`` attribute contributes its stripped
        # text instead.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not currently selected, or
        if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # wording fixed ("not option" -> "no option") to match add()
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
-
-
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        # value of the first checked radio, or None when none is checked
        return next(
            (radio.get('value') for radio in self if 'checked' in radio.attrib),
            None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # uncheck everything first, then check the requested radio
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))
-
-
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching any checkbox: previously the group was
        # cleared first, so a rejected assignment still unchecked
        # everything.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
-
-
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes that share one name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # iterate over a snapshot list of the currently checked values
        checked_values = [
            box.get('value')
            for box in self.group
            if 'checked' in box.attrib]
        return iter(checked_values)

    def add(self, value):
        # check the first checkbox carrying this value
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # uncheck the first checkbox carrying this value; it must
        # currently be checked
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
-
-
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, defaulting to
    ``'text'``), and ``.value`` reads and writes the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for every other type) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs a falsy value only unchecks; a truthy
        # value checks, and is stored when it is a string.
        self.checked = bool(value)
        if value and isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
-
-
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Returns None when the
        label has no ``for`` attribute.
        """
        id = self.get('for')
        if not id:
            return None
        # NOTE(review): what happens when no element carries this id is
        # decided by ``get_element_by_id`` -- confirm whether it returns
        # None or raises without an explicit default.
        return self.body.get_element_by_id(id)

    @for_element.setter
    def for_element(self, other):
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    @for_element.deleter
    def for_element(self):
        # Unlink the label by dropping its ``for`` attribute.  This used
        # to delete the label's own ``id`` attribute, which left the link
        # in place and destroyed an unrelated attribute.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
-
-
-############################################################
-## Serialization
-############################################################
-
def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    Accepts either an element or an object with ``getroot()``.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # already an element
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        if node.tag[0] == '{':
            # already namespaced; leave it alone
            continue
        node.tag = xhtml_prefix + node.tag
-
-
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place, turning it into plain HTML.

    Accepts either an element or an object with ``getroot()``.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # already an element
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    # only elements in the XHTML namespace are visited
    for node in root.iter(xhtml_prefix + "*"):
        node.tag = node.tag[cut:]
-
-
# Not a general-purpose matcher: it only targets the exact form of the
# Content-Type meta tag that libxml2 itself serialises.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Pass the name ``'unicode'`` as ``encoding`` to serialise
    to a Unicode string instead.

    The ``method`` argument selects the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    Pass ``with_tail=False`` to leave out the tail text of the
    top-level element that is being serialised.

    The ``doctype`` option takes a plain string that is serialised
    before the XML tree.  Passing non well-formed content here makes
    the XML output non well-formed.  An existing doctype in the
    document tree is not removed when serialising an ElementTree
    instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail,
        doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Strip the Content-Type meta tag, using the helper that matches the
    # type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
-
-
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards; this is
    mainly meant for debugging.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    output_encoding = encoding or doc.docinfo.encoding or "UTF-8"
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=output_encoding)
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
-
-
-################################################################################
-# configure Element class lookup
-################################################################################
-
class HTMLParser(etree.HTMLParser):
    """An HTML parser whose element class lookup is set to
    ``HtmlElementClassLookup``, so that it returns lxml.html Element
    objects.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
-
-
class XHTMLParser(etree.XMLParser):
    """An XML parser whose element class lookup is set to
    ``HtmlElementClassLookup``, so that it returns lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
-
-
def Element(*args, **kw):
    """Create a new HTML Element through the default ``html_parser``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level default parsers.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc
deleted file mode 100644
index a378207..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/ElementSoup.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index 4bc5785..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/__init__.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc
deleted file mode 100644
index fa25497..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_diffcommand.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc
deleted file mode 100644
index b243408..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_html5builder.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc
deleted file mode 100644
index a2de006..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/_setmixin.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc
deleted file mode 100644
index b915259..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/builder.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc
deleted file mode 100644
index c343b40..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/clean.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc
deleted file mode 100644
index 8dc2d4b..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/defs.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc
deleted file mode 100644
index c029ed9..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/diff.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc
deleted file mode 100644
index 049161a..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/formfill.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc
deleted file mode 100644
index 6208e67..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/html5parser.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc
deleted file mode 100644
index 3293704..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/soupparser.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc b/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc
deleted file mode 100644
index d76e7dd..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/__pycache__/usedoctest.cpython-310.pyc
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py b/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py
deleted file mode 100644
index e0502c0..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/_diffcommand.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from __future__ import absolute_import
-
-import optparse
-import sys
-import re
-import os
-from .diff import htmldiff
-
-description = """\
-"""
-
-parser = optparse.OptionParser(
- usage="%prog [OPTIONS] FILE1 FILE2\n"
- "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
- description=description,
- )
-
-parser.add_option(
- '-o', '--output',
- metavar="FILE",
- dest="output",
- default="-",
- help="File to write the difference to",
- )
-
-parser.add_option(
- '-a', '--annotation',
- action="store_true",
- dest="annotation",
- help="Do an annotation")
-
-def main(args=None):
- if args is None:
- args = sys.argv[1:]
- options, args = parser.parse_args(args)
- if options.annotation:
- return annotate(options, args)
- if len(args) != 2:
- print('Error: you must give two files')
- parser.print_help()
- sys.exit(1)
- file1, file2 = args
- input1 = read_file(file1)
- input2 = read_file(file2)
- body1 = split_body(input1)[1]
- pre, body2, post = split_body(input2)
- result = htmldiff(body1, body2)
- result = pre + result + post
- if options.output == '-':
- if not result.endswith('\n'):
- result += '\n'
- sys.stdout.write(result)
- else:
- with open(options.output, 'wb') as f:
- f.write(result)
-
-def read_file(filename):
- if filename == '-':
- c = sys.stdin.read()
- elif not os.path.exists(filename):
- raise OSError(
- "Input file %s does not exist" % filename)
- else:
- with open(filename, 'rb') as f:
- c = f.read()
- return c
-
-body_start_re = re.compile(
- r"<body.*?>", re.I|re.S)
-body_end_re = re.compile(
- r"</body.*?>", re.I|re.S)
-
-def split_body(html):
- pre = post = ''
- match = body_start_re.search(html)
- if match:
- pre = html[:match.end()]
- html = html[match.end():]
- match = body_end_re.search(html)
- if match:
- post = html[match.start():]
- html = html[:match.start()]
- return pre, html, post
-
-def annotate(options, args):
- print("Not yet implemented")
- sys.exit(1)
-
diff --git a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py b/env/lib/python3.10/site-packages/lxml/html/_html5builder.py
deleted file mode 100644
index 3405c20..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/_html5builder.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""
-Legacy module - don't use in new code!
-
-html5lib now has its own proper implementation.
-
-This module implements a tree builder for html5lib that generates lxml
-html element trees. This module uses camelCase as it follows the
-html5lib style guide.
-"""
-
-from html5lib.treebuilders import _base, etree as etree_builders
-from lxml import html, etree
-
-
-class DocumentType(object):
-
- def __init__(self, name, publicId, systemId):
- self.name = name
- self.publicId = publicId
- self.systemId = systemId
-
-class Document(object):
-
- def __init__(self):
- self._elementTree = None
- self.childNodes = []
-
- def appendChild(self, element):
- self._elementTree.getroot().addnext(element._element)
-
-
-class TreeBuilder(_base.TreeBuilder):
- documentClass = Document
- doctypeClass = DocumentType
- elementClass = None
- commentClass = None
- fragmentClass = Document
-
- def __init__(self, *args, **kwargs):
- html_builder = etree_builders.getETreeModule(html, fullTree=False)
- etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
- self.elementClass = html_builder.Element
- self.commentClass = etree_builder.Comment
- _base.TreeBuilder.__init__(self, *args, **kwargs)
-
- def reset(self):
- _base.TreeBuilder.reset(self)
- self.rootInserted = False
- self.initialComments = []
- self.doctype = None
-
- def getDocument(self):
- return self.document._elementTree
-
- def getFragment(self):
- fragment = []
- element = self.openElements[0]._element
- if element.text:
- fragment.append(element.text)
- fragment.extend(element.getchildren())
- if element.tail:
- fragment.append(element.tail)
- return fragment
-
- def insertDoctype(self, name, publicId, systemId):
- doctype = self.doctypeClass(name, publicId, systemId)
- self.doctype = doctype
-
- def insertComment(self, data, parent=None):
- if not self.rootInserted:
- self.initialComments.append(data)
- else:
- _base.TreeBuilder.insertComment(self, data, parent)
-
- def insertRoot(self, name):
- buf = []
- if self.doctype and self.doctype.name:
- buf.append('<!DOCTYPE %s' % self.doctype.name)
- if self.doctype.publicId is not None or self.doctype.systemId is not None:
- buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
- self.doctype.systemId))
- buf.append('>')
- buf.append('<html></html>')
- root = html.fromstring(''.join(buf))
-
- # Append the initial comments:
- for comment in self.initialComments:
- root.addprevious(etree.Comment(comment))
-
- # Create the root document and add the ElementTree to it
- self.document = self.documentClass()
- self.document._elementTree = root.getroottree()
-
- # Add the root element to the internal child/open data structures
- root_element = self.elementClass(name)
- root_element._element = root
- self.document.childNodes.append(root_element)
- self.openElements.append(root_element)
-
- self.rootInserted = True
diff --git a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py b/env/lib/python3.10/site-packages/lxml/html/_setmixin.py
deleted file mode 100644
index c99738e..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/_setmixin.py
+++ /dev/null
@@ -1,56 +0,0 @@
-try:
- from collections.abc import MutableSet
-except ImportError:
- from collections import MutableSet
-
-
-class SetMixin(MutableSet):
-
- """
- Mix-in for sets. You must define __iter__, add, remove
- """
-
- def __len__(self):
- length = 0
- for item in self:
- length += 1
- return length
-
- def __contains__(self, item):
- for has_item in self:
- if item == has_item:
- return True
- return False
-
- issubset = MutableSet.__le__
- issuperset = MutableSet.__ge__
-
- union = MutableSet.__or__
- intersection = MutableSet.__and__
- difference = MutableSet.__sub__
- symmetric_difference = MutableSet.__xor__
-
- def copy(self):
- return set(self)
-
- def update(self, other):
- self |= other
-
- def intersection_update(self, other):
- self &= other
-
- def difference_update(self, other):
- self -= other
-
- def symmetric_difference_update(self, other):
- self ^= other
-
- def discard(self, item):
- try:
- self.remove(item)
- except KeyError:
- pass
-
- @classmethod
- def _from_iterable(cls, it):
- return set(it)
diff --git a/env/lib/python3.10/site-packages/lxml/html/builder.py b/env/lib/python3.10/site-packages/lxml/html/builder.py
deleted file mode 100644
index 8a074ec..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/builder.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# --------------------------------------------------------------------
-# The ElementTree toolkit is
-# Copyright (c) 1999-2004 by Fredrik Lundh
-# --------------------------------------------------------------------
-
-"""
-A set of HTML generator tags for building HTML documents.
-
-Usage::
-
- >>> from lxml.html.builder import *
- >>> html = HTML(
- ... HEAD( TITLE("Hello World") ),
- ... BODY( CLASS("main"),
- ... H1("Hello World !")
- ... )
- ... )
-
- >>> import lxml.etree
- >>> print lxml.etree.tostring(html, pretty_print=True)
- <html>
- <head>
- <title>Hello World</title>
- </head>
- <body class="main">
- <h1>Hello World !</h1>
- </body>
- </html>
-
-"""
-
-from lxml.builder import ElementMaker
-from lxml.html import html_parser
-
-E = ElementMaker(makeelement=html_parser.makeelement)
-
-# elements
-A = E.a #: anchor
-ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
-ACRONYM = E.acronym #:
-ADDRESS = E.address #: information on author
-APPLET = E.applet #: Java applet (DEPRECATED)
-AREA = E.area #: client-side image map area
-B = E.b #: bold text style
-BASE = E.base #: document base URI
-BASEFONT = E.basefont #: base font size (DEPRECATED)
-BDO = E.bdo #: I18N BiDi over-ride
-BIG = E.big #: large text style
-BLOCKQUOTE = E.blockquote #: long quotation
-BODY = E.body #: document body
-BR = E.br #: forced line break
-BUTTON = E.button #: push button
-CAPTION = E.caption #: table caption
-CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
-CITE = E.cite #: citation
-CODE = E.code #: computer code fragment
-COL = E.col #: table column
-COLGROUP = E.colgroup #: table column group
-DD = E.dd #: definition description
-DEL = getattr(E, 'del') #: deleted text
-DFN = E.dfn #: instance definition
-DIR = E.dir #: directory list (DEPRECATED)
-DIV = E.div #: generic language/style container
-DL = E.dl #: definition list
-DT = E.dt #: definition term
-EM = E.em #: emphasis
-FIELDSET = E.fieldset #: form control group
-FONT = E.font #: local change to font (DEPRECATED)
-FORM = E.form #: interactive form
-FRAME = E.frame #: subwindow
-FRAMESET = E.frameset #: window subdivision
-H1 = E.h1 #: heading
-H2 = E.h2 #: heading
-H3 = E.h3 #: heading
-H4 = E.h4 #: heading
-H5 = E.h5 #: heading
-H6 = E.h6 #: heading
-HEAD = E.head #: document head
-HR = E.hr #: horizontal rule
-HTML = E.html #: document root element
-I = E.i #: italic text style
-IFRAME = E.iframe #: inline subwindow
-IMG = E.img #: Embedded image
-INPUT = E.input #: form control
-INS = E.ins #: inserted text
-ISINDEX = E.isindex #: single line prompt (DEPRECATED)
-KBD = E.kbd #: text to be entered by the user
-LABEL = E.label #: form field label text
-LEGEND = E.legend #: fieldset legend
-LI = E.li #: list item
-LINK = E.link #: a media-independent link
-MAP = E.map #: client-side image map
-MENU = E.menu #: menu list (DEPRECATED)
-META = E.meta #: generic metainformation
-NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
-NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
-OBJECT = E.object #: generic embedded object
-OL = E.ol #: ordered list
-OPTGROUP = E.optgroup #: option group
-OPTION = E.option #: selectable choice
-P = E.p #: paragraph
-PARAM = E.param #: named property value
-PRE = E.pre #: preformatted text
-Q = E.q #: short inline quotation
-S = E.s #: strike-through text style (DEPRECATED)
-SAMP = E.samp #: sample program output, scripts, etc.
-SCRIPT = E.script #: script statements
-SELECT = E.select #: option selector
-SMALL = E.small #: small text style
-SPAN = E.span #: generic language/style container
-STRIKE = E.strike #: strike-through text (DEPRECATED)
-STRONG = E.strong #: strong emphasis
-STYLE = E.style #: style info
-SUB = E.sub #: subscript
-SUP = E.sup #: superscript
-TABLE = E.table #:
-TBODY = E.tbody #: table body
-TD = E.td #: table data cell
-TEXTAREA = E.textarea #: multi-line text field
-TFOOT = E.tfoot #: table footer
-TH = E.th #: table header cell
-THEAD = E.thead #: table header
-TITLE = E.title #: document title
-TR = E.tr #: table row
-TT = E.tt #: teletype or monospaced text style
-U = E.u #: underlined text style (DEPRECATED)
-UL = E.ul #: unordered list
-VAR = E.var #: instance of a variable or program argument
-
-# attributes (only reserved words are included here)
-ATTR = dict
-def CLASS(v): return {'class': v}
-def FOR(v): return {'for': v}
diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so
deleted file mode 100755
index 31087ea..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py
deleted file mode 100644
index e6b0543..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/clean.py
+++ /dev/null
@@ -1,786 +0,0 @@
-# cython: language_level=3str
-
-"""A cleanup tool for HTML.
-
-Removes unwanted tags and content. See the `Cleaner` class for
-details.
-"""
-
-from __future__ import absolute_import
-
-import copy
-import re
-import sys
-try:
- from urlparse import urlsplit
- from urllib import unquote_plus
-except ImportError:
- # Python 3
- from urllib.parse import urlsplit, unquote_plus
-from lxml import etree
-from lxml.html import defs
-from lxml.html import fromstring, XHTML_NAMESPACE
-from lxml.html import xhtml_to_html, _transform_result
-
-try:
- unichr
-except NameError:
- # Python 3
- unichr = chr
-try:
- unicode
-except NameError:
- # Python 3
- unicode = str
-try:
- basestring
-except NameError:
- basestring = (str, bytes)
-
-
-__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
- 'word_break', 'word_break_html']
-
-# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-# Particularly the CSS cleaning; most of the tag cleaning is integrated now
-# I have multiple kinds of schemes searched; but should schemes be
-# whitelisted instead?
-# max height?
-# remove images? Also in CSS? background attribute?
-# Some way to whitelist object, iframe, etc (e.g., if you want to
-# allow *just* embedded YouTube movies)
-# Log what was deleted and why?
-# style="behavior: ..." might be bad in IE?
-# Should we have something for just <meta http-equiv>? That's the worst of the
-# metas.
-# UTF-7 detections? Example:
-# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
-# you don't always have to have the charset set, if the page has no charset
-# and there's UTF7-like code in it.
-# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
-
-
-# This is an IE-specific construct you can have in a stylesheet to
-# run some Javascript:
-_replace_css_javascript = re.compile(
- r'expression\s*\(.*?\)', re.S|re.I).sub
-
-# Do I have to worry about @\nimport?
-_replace_css_import = re.compile(
- r'@\s*import', re.I).sub
-
-_looks_like_tag_content = re.compile(
- r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
- *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
-
-# All kinds of schemes besides just javascript: that can cause
-# execution:
-_find_image_dataurls = re.compile(
- r'data:image/(.+);base64,', re.I).findall
-_possibly_malicious_schemes = re.compile(
- r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
- re.I).findall
-# SVG images can contain script content
-_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
-
-def _has_javascript_scheme(s):
- safe_image_urls = 0
- for image_type in _find_image_dataurls(s):
- if _is_unsafe_image_type(image_type):
- return True
- safe_image_urls += 1
- return len(_possibly_malicious_schemes(s)) > safe_image_urls
-
-_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
-
-# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
-_conditional_comment_re = re.compile(
- r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
-
-_find_styled_elements = etree.XPath(
- "descendant-or-self::*[@style]")
-
-_find_external_links = etree.XPath(
- ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
- "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
- namespaces={'x':XHTML_NAMESPACE})
-
-
-class Cleaner(object):
- """
- Instances cleans the document of each of the possible offending
- elements. The cleaning is controlled by attributes; you can
- override attributes in a subclass, or set them in the constructor.
-
- ``scripts``:
- Removes any ``<script>`` tags.
-
- ``javascript``:
- Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
- as they could contain Javascript.
-
- ``comments``:
- Removes any comments.
-
- ``style``:
- Removes any style tags.
-
- ``inline_style``
- Removes any style attributes. Defaults to the value of the ``style`` option.
-
- ``links``:
- Removes any ``<link>`` tags
-
- ``meta``:
- Removes any ``<meta>`` tags
-
- ``page_structure``:
- Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
-
- ``processing_instructions``:
- Removes any processing instructions.
-
- ``embedded``:
- Removes any embedded objects (flash, iframes)
-
- ``frames``:
- Removes any frame-related tags
-
- ``forms``:
- Removes any form tags
-
- ``annoying_tags``:
- Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
-
- ``remove_tags``:
- A list of tags to remove. Only the tags will be removed,
- their content will get pulled up into the parent tag.
-
- ``kill_tags``:
- A list of tags to kill. Killing also removes the tag's content,
- i.e. the whole subtree, not just the tag itself.
-
- ``allow_tags``:
- A list of tags to include (default include all).
-
- ``remove_unknown_tags``:
- Remove any tags that aren't standard parts of HTML.
-
- ``safe_attrs_only``:
- If true, only include 'safe' attributes (specifically the list
- from the feedparser HTML sanitisation web site).
-
- ``safe_attrs``:
- A set of attribute names to override the default list of attributes
- considered 'safe' (when safe_attrs_only=True).
-
- ``add_nofollow``:
- If true, then any <a> tags will have ``rel="nofollow"`` added to them.
-
- ``host_whitelist``:
- A list or set of hosts that you can use for embedded content
- (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
- You can also implement/override the method
- ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
- implement more complex rules for what can be embedded.
- Anything that passes this test will be shown, regardless of
- the value of (for instance) ``embedded``.
-
- Note that this parameter might not work as intended if you do not
- make the links absolute before doing the cleaning.
-
- Note that you may also need to set ``whitelist_tags``.
-
- ``whitelist_tags``:
- A set of tags that can be included with ``host_whitelist``.
- The default is ``iframe`` and ``embed``; you may wish to
- include other tags like ``script``, or you may want to
- implement ``allow_embedded_url`` for more control. Set to None to
- include all tags.
-
- This modifies the document *in place*.
- """
-
- scripts = True
- javascript = True
- comments = True
- style = False
- inline_style = None
- links = True
- meta = True
- page_structure = True
- processing_instructions = True
- embedded = True
- frames = True
- forms = True
- annoying_tags = True
- remove_tags = None
- allow_tags = None
- kill_tags = None
- remove_unknown_tags = True
- safe_attrs_only = True
- safe_attrs = defs.safe_attrs
- add_nofollow = False
- host_whitelist = ()
- whitelist_tags = {'iframe', 'embed'}
-
- def __init__(self, **kw):
- not_an_attribute = object()
- for name, value in kw.items():
- default = getattr(self, name, not_an_attribute)
- if (default is not None and default is not True and default is not False
- and not isinstance(default, (frozenset, set, tuple, list))):
- raise TypeError(
- "Unknown parameter: %s=%r" % (name, value))
- setattr(self, name, value)
- if self.inline_style is None and 'inline_style' not in kw:
- self.inline_style = self.style
-
- if kw.get("allow_tags"):
- if kw.get("remove_unknown_tags"):
- raise ValueError("It does not make sense to pass in both "
- "allow_tags and remove_unknown_tags")
- self.remove_unknown_tags = False
-
- # Used to lookup the primary URL for a given tag that is up for
- # removal:
- _tag_link_attrs = dict(
- script='src',
- link='href',
- # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
- # From what I can tell, both attributes can contain a link:
- applet=['code', 'object'],
- iframe='src',
- embed='src',
- layer='src',
- # FIXME: there doesn't really seem like a general way to figure out what
- # links an <object> tag uses; links often go in <param> tags with values
- # that we don't really know. You'd have to have knowledge about specific
- # kinds of plugins (probably keyed off classid), and match against those.
- ##object=?,
- # FIXME: not looking at the action currently, because it is more complex
- # than than -- if you keep the form, you should keep the form controls.
- ##form='action',
- a='href',
- )
-
- def __call__(self, doc):
- """
- Cleans the document.
- """
- try:
- getroot = doc.getroot
- except AttributeError:
- pass # Element instance
- else:
- doc = getroot() # ElementTree instance, instead of an element
- # convert XHTML to HTML
- xhtml_to_html(doc)
- # Normalize a case that IE treats <image> like <img>, and that
- # can confuse either this step or later steps.
- for el in doc.iter('image'):
- el.tag = 'img'
- if not self.comments:
- # Of course, if we were going to kill comments anyway, we don't
- # need to worry about this
- self.kill_conditional_comments(doc)
-
- kill_tags = set(self.kill_tags or ())
- remove_tags = set(self.remove_tags or ())
- allow_tags = set(self.allow_tags or ())
-
- if self.scripts:
- kill_tags.add('script')
- if self.safe_attrs_only:
- safe_attrs = set(self.safe_attrs)
- for el in doc.iter(etree.Element):
- attrib = el.attrib
- for aname in attrib.keys():
- if aname not in safe_attrs:
- del attrib[aname]
- if self.javascript:
- if not (self.safe_attrs_only and
- self.safe_attrs == defs.safe_attrs):
- # safe_attrs handles events attributes itself
- for el in doc.iter(etree.Element):
- attrib = el.attrib
- for aname in attrib.keys():
- if aname.startswith('on'):
- del attrib[aname]
- doc.rewrite_links(self._remove_javascript_link,
- resolve_base_href=False)
- # If we're deleting style then we don't have to remove JS links
- # from styles, otherwise...
- if not self.inline_style:
- for el in _find_styled_elements(doc):
- old = el.get('style')
- new = _replace_css_javascript('', old)
- new = _replace_css_import('', new)
- if self._has_sneaky_javascript(new):
- # Something tricky is going on...
- del el.attrib['style']
- elif new != old:
- el.set('style', new)
- if not self.style:
- for el in list(doc.iter('style')):
- if el.get('type', '').lower().strip() == 'text/javascript':
- el.drop_tree()
- continue
- old = el.text or ''
- new = _replace_css_javascript('', old)
- # The imported CSS can do anything; we just can't allow:
- new = _replace_css_import('', new)
- if self._has_sneaky_javascript(new):
- # Something tricky is going on...
- el.text = '/* deleted */'
- elif new != old:
- el.text = new
- if self.comments:
- kill_tags.add(etree.Comment)
- if self.processing_instructions:
- kill_tags.add(etree.ProcessingInstruction)
- if self.style:
- kill_tags.add('style')
- if self.inline_style:
- etree.strip_attributes(doc, 'style')
- if self.links:
- kill_tags.add('link')
- elif self.style or self.javascript:
- # We must get rid of included stylesheets if Javascript is not
- # allowed, as you can put Javascript in them
- for el in list(doc.iter('link')):
- if 'stylesheet' in el.get('rel', '').lower():
- # Note this kills alternate stylesheets as well
- if not self.allow_element(el):
- el.drop_tree()
- if self.meta:
- kill_tags.add('meta')
- if self.page_structure:
- remove_tags.update(('head', 'html', 'title'))
- if self.embedded:
- # FIXME: is <layer> really embedded?
- # We should get rid of any <param> tags not inside <applet>;
- # These are not really valid anyway.
- for el in list(doc.iter('param')):
- parent = el.getparent()
- while parent is not None and parent.tag not in ('applet', 'object'):
- parent = parent.getparent()
- if parent is None:
- el.drop_tree()
- kill_tags.update(('applet',))
- # The alternate contents that are in an iframe are a good fallback:
- remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
- if self.frames:
- # FIXME: ideally we should look at the frame links, but
- # generally frames don't mix properly with an HTML
- # fragment anyway.
- kill_tags.update(defs.frame_tags)
- if self.forms:
- remove_tags.add('form')
- kill_tags.update(('button', 'input', 'select', 'textarea'))
- if self.annoying_tags:
- remove_tags.update(('blink', 'marquee'))
-
- _remove = []
- _kill = []
- for el in doc.iter():
- if el.tag in kill_tags:
- if self.allow_element(el):
- continue
- _kill.append(el)
- elif el.tag in remove_tags:
- if self.allow_element(el):
- continue
- _remove.append(el)
-
- if _remove and _remove[0] == doc:
- # We have to drop the parent-most tag, which we can't
- # do. Instead we'll rewrite it:
- el = _remove.pop(0)
- el.tag = 'div'
- el.attrib.clear()
- elif _kill and _kill[0] == doc:
- # We have to drop the parent-most element, which we can't
- # do. Instead we'll clear it:
- el = _kill.pop(0)
- if el.tag != 'html':
- el.tag = 'div'
- el.clear()
-
- _kill.reverse() # start with innermost tags
- for el in _kill:
- el.drop_tree()
- for el in _remove:
- el.drop_tag()
-
- if self.remove_unknown_tags:
- if allow_tags:
- raise ValueError(
- "It does not make sense to pass in both allow_tags and remove_unknown_tags")
- allow_tags = set(defs.tags)
- if allow_tags:
- # make sure we do not remove comments/PIs if users want them (which is rare enough)
- if not self.comments:
- allow_tags.add(etree.Comment)
- if not self.processing_instructions:
- allow_tags.add(etree.ProcessingInstruction)
-
- bad = []
- for el in doc.iter():
- if el.tag not in allow_tags:
- bad.append(el)
- if bad:
- if bad[0] is doc:
- el = bad.pop(0)
- el.tag = 'div'
- el.attrib.clear()
- for el in bad:
- el.drop_tag()
- if self.add_nofollow:
- for el in _find_external_links(doc):
- if not self.allow_follow(el):
- rel = el.get('rel')
- if rel:
- if ('nofollow' in rel
- and ' nofollow ' in (' %s ' % rel)):
- continue
- rel = '%s nofollow' % rel
- else:
- rel = 'nofollow'
- el.set('rel', rel)
-
- def allow_follow(self, anchor):
- """
- Override to suppress rel="nofollow" on some anchors.
- """
- return False
-
- def allow_element(self, el):
- """
- Decide whether an element is configured to be accepted or rejected.
-
- :param el: an element.
- :return: true to accept the element or false to reject/discard it.
- """
- if el.tag not in self._tag_link_attrs:
- return False
- attr = self._tag_link_attrs[el.tag]
- if isinstance(attr, (list, tuple)):
- for one_attr in attr:
- url = el.get(one_attr)
- if not url:
- return False
- if not self.allow_embedded_url(el, url):
- return False
- return True
- else:
- url = el.get(attr)
- if not url:
- return False
- return self.allow_embedded_url(el, url)
-
- def allow_embedded_url(self, el, url):
- """
- Decide whether a URL that was found in an element's attributes or text
- if configured to be accepted or rejected.
-
- :param el: an element.
- :param url: a URL found on the element.
- :return: true to accept the URL and false to reject it.
- """
- if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
- return False
- scheme, netloc, path, query, fragment = urlsplit(url)
- netloc = netloc.lower().split(':', 1)[0]
- if scheme not in ('http', 'https'):
- return False
- if netloc in self.host_whitelist:
- return True
- return False
-
- def kill_conditional_comments(self, doc):
- """
- IE conditional comments basically embed HTML that the parser
- doesn't normally see. We can't allow anything like that, so
- we'll kill any comments that could be conditional.
- """
- has_conditional_comment = _conditional_comment_re.search
- self._kill_elements(
- doc, lambda el: has_conditional_comment(el.text),
- etree.Comment)
-
- def _kill_elements(self, doc, condition, iterate=None):
- bad = []
- for el in doc.iter(iterate):
- if condition(el):
- bad.append(el)
- for el in bad:
- el.drop_tree()
-
- def _remove_javascript_link(self, link):
- # links like "j a v a s c r i p t:" might be interpreted in IE
- new = _substitute_whitespace('', unquote_plus(link))
- if _has_javascript_scheme(new):
- # FIXME: should this be None to delete?
- return ''
- return link
-
- _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
-
- def _has_sneaky_javascript(self, style):
- """
- Depending on the browser, stuff like ``e x p r e s s i o n(...)``
- can get interpreted, or ``expre/* stuff */ssion(...)``. This
- checks for attempt to do stuff like this.
-
- Typically the response will be to kill the entire style; if you
- have just a bit of Javascript in the style another rule will catch
- that and remove only the Javascript from the style; this catches
- more sneaky attempts.
- """
- style = self._substitute_comments('', style)
- style = style.replace('\\', '')
- style = _substitute_whitespace('', style)
- style = style.lower()
- if _has_javascript_scheme(style):
- return True
- if 'expression(' in style:
- return True
- if '@import' in style:
- return True
- if '</noscript' in style:
- # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
- return True
- if _looks_like_tag_content(style):
- # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
- return True
- return False
-
- def clean_html(self, html):
- result_type = type(html)
- if isinstance(html, basestring):
- doc = fromstring(html)
- else:
- doc = copy.deepcopy(html)
- self(doc)
- return _transform_result(result_type, doc)
-
-clean = Cleaner()
-clean_html = clean.clean_html
-
-############################################################
-## Autolinking
-############################################################
-
-_link_regexes = [
- re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
- # This is conservative, but autolinking can be a bit conservative:
- re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
- ]
-
-_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
-
-_avoid_hosts = [
- re.compile(r'^localhost', re.I),
- re.compile(r'\bexample\.(?:com|org|net)$', re.I),
- re.compile(r'^127\.0\.0\.1$'),
- ]
-
-_avoid_classes = ['nolink']
-
-def autolink(el, link_regexes=_link_regexes,
- avoid_elements=_avoid_elements,
- avoid_hosts=_avoid_hosts,
- avoid_classes=_avoid_classes):
- """
- Turn any URLs into links.
-
- It will search for links identified by the given regular
- expressions (by default mailto and http(s) links).
-
- It won't link text in an element in avoid_elements, or an element
- with a class in avoid_classes. It won't link to anything with a
- host that matches one of the regular expressions in avoid_hosts
- (default localhost and 127.0.0.1).
-
- If you pass in an element, the element's tail will not be
- substituted, only the contents of the element.
- """
- if el.tag in avoid_elements:
- return
- class_name = el.get('class')
- if class_name:
- class_name = class_name.split()
- for match_class in avoid_classes:
- if match_class in class_name:
- return
- for child in list(el):
- autolink(child, link_regexes=link_regexes,
- avoid_elements=avoid_elements,
- avoid_hosts=avoid_hosts,
- avoid_classes=avoid_classes)
- if child.tail:
- text, tail_children = _link_text(
- child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
- if tail_children:
- child.tail = text
- index = el.index(child)
- el[index+1:index+1] = tail_children
- if el.text:
- text, pre_children = _link_text(
- el.text, link_regexes, avoid_hosts, factory=el.makeelement)
- if pre_children:
- el.text = text
- el[:0] = pre_children
-
-def _link_text(text, link_regexes, avoid_hosts, factory):
- leading_text = ''
- links = []
- last_pos = 0
- while 1:
- best_match, best_pos = None, None
- for regex in link_regexes:
- regex_pos = last_pos
- while 1:
- match = regex.search(text, pos=regex_pos)
- if match is None:
- break
- host = match.group('host')
- for host_regex in avoid_hosts:
- if host_regex.search(host):
- regex_pos = match.end()
- break
- else:
- break
- if match is None:
- continue
- if best_pos is None or match.start() < best_pos:
- best_match = match
- best_pos = match.start()
- if best_match is None:
- # No more matches
- if links:
- assert not links[-1].tail
- links[-1].tail = text
- else:
- assert not leading_text
- leading_text = text
- break
- link = best_match.group(0)
- end = best_match.end()
- if link.endswith('.') or link.endswith(','):
- # These punctuation marks shouldn't end a link
- end -= 1
- link = link[:-1]
- prev_text = text[:best_match.start()]
- if links:
- assert not links[-1].tail
- links[-1].tail = prev_text
- else:
- assert not leading_text
- leading_text = prev_text
- anchor = factory('a')
- anchor.set('href', link)
- body = best_match.group('body')
- if not body:
- body = link
- if body.endswith('.') or body.endswith(','):
- body = body[:-1]
- anchor.text = body
- links.append(anchor)
- text = text[end:]
- return leading_text, links
-
-def autolink_html(html, *args, **kw):
- result_type = type(html)
- if isinstance(html, basestring):
- doc = fromstring(html)
- else:
- doc = copy.deepcopy(html)
- autolink(doc, *args, **kw)
- return _transform_result(result_type, doc)
-
-autolink_html.__doc__ = autolink.__doc__
-
-############################################################
-## Word wrapping
-############################################################
-
-_avoid_word_break_elements = ['pre', 'textarea', 'code']
-_avoid_word_break_classes = ['nobreak']
-
-def word_break(el, max_width=40,
- avoid_elements=_avoid_word_break_elements,
- avoid_classes=_avoid_word_break_classes,
- break_character=unichr(0x200b)):
- """
- Breaks any long words found in the body of the text (not attributes).
-
- Doesn't effect any of the tags in avoid_elements, by default
- ``<textarea>`` and ``<pre>``
-
- Breaks words by inserting &#8203;, which is a unicode character
- for Zero Width Space character. This generally takes up no space
- in rendering, but does copy as a space, and in monospace contexts
- usually takes up space.
-
- See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
- """
- # Character suggestion of &#8203 comes from:
- # http://www.cs.tut.fi/~jkorpela/html/nobr.html
- if el.tag in _avoid_word_break_elements:
- return
- class_name = el.get('class')
- if class_name:
- dont_break = False
- class_name = class_name.split()
- for avoid in avoid_classes:
- if avoid in class_name:
- dont_break = True
- break
- if dont_break:
- return
- if el.text:
- el.text = _break_text(el.text, max_width, break_character)
- for child in el:
- word_break(child, max_width=max_width,
- avoid_elements=avoid_elements,
- avoid_classes=avoid_classes,
- break_character=break_character)
- if child.tail:
- child.tail = _break_text(child.tail, max_width, break_character)
-
-def word_break_html(html, *args, **kw):
- result_type = type(html)
- doc = fromstring(html)
- word_break(doc, *args, **kw)
- return _transform_result(result_type, doc)
-
-def _break_text(text, max_width, break_character):
- words = text.split()
- for word in words:
- if len(word) > max_width:
- replacement = _insert_break(word, max_width, break_character)
- text = text.replace(word, replacement)
- return text
-
-_break_prefer_re = re.compile(r'[^a-z]', re.I)
-
-def _insert_break(word, width, break_character):
- orig_word = word
- result = ''
- while len(word) > width:
- start = word[:width]
- breaks = list(_break_prefer_re.finditer(start))
- if breaks:
- last_break = breaks[-1]
- # Only walk back up to 10 characters to find a nice break:
- if last_break.end() > width-10:
- # FIXME: should the break character be at the end of the
- # chunk, or the beginning of the next chunk?
- start = word[:last_break.end()]
- result += start + break_character
- word = word[len(start):]
- result += word
- return result
-
diff --git a/env/lib/python3.10/site-packages/lxml/html/defs.py b/env/lib/python3.10/site-packages/lxml/html/defs.py
deleted file mode 100644
index 2058ea3..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/defs.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# FIXME: this should all be confirmed against what a DTD says
-# (probably in a test; this may not match the DTD exactly, but we
-# should document just how it differs).
-
-"""
-Data taken from https://www.w3.org/TR/html401/index/elements.html
-and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
-for html5_tags.
-"""
-
-empty_tags = frozenset([
- 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
- 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
-
-deprecated_tags = frozenset([
- 'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
- 'menu', 's', 'strike', 'u'])
-
-# archive actually takes a space-separated list of URIs
-link_attrs = frozenset([
- 'action', 'archive', 'background', 'cite', 'classid',
- 'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
- 'usemap',
- # Not standard:
- 'dynsrc', 'lowsrc',
- # HTML5 formaction
- 'formaction'
- ])
-
-# Not in the HTML 4 spec:
-# onerror, onresize
-event_attrs = frozenset([
- 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
- 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
- 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
- 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
- 'onunload',
- ])
-
-safe_attrs = frozenset([
- 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
- 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
- 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
- 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
- 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
- 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
- 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
- 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
- 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
-
-# From http://htmlhelp.com/reference/html40/olist.html
-top_level_tags = frozenset([
- 'html', 'head', 'body', 'frameset',
- ])
-
-head_tags = frozenset([
- 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
- ])
-
-general_block_tags = frozenset([
- 'address',
- 'blockquote',
- 'center',
- 'del',
- 'div',
- 'h1',
- 'h2',
- 'h3',
- 'h4',
- 'h5',
- 'h6',
- 'hr',
- 'ins',
- 'isindex',
- 'noscript',
- 'p',
- 'pre',
- ])
-
-list_tags = frozenset([
- 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
- ])
-
-table_tags = frozenset([
- 'table', 'caption', 'colgroup', 'col',
- 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
- ])
-
-# just this one from
-# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
-block_tags = general_block_tags | list_tags | table_tags | frozenset([
- # Partial form tags
- 'fieldset', 'form', 'legend', 'optgroup', 'option',
- ])
-
-form_tags = frozenset([
- 'form', 'button', 'fieldset', 'legend', 'input', 'label',
- 'select', 'optgroup', 'option', 'textarea',
- ])
-
-special_inline_tags = frozenset([
- 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
- 'img', 'map', 'area', 'object', 'param', 'q', 'script',
- 'span', 'sub', 'sup',
- ])
-
-phrase_tags = frozenset([
- 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
- 'ins', 'kbd', 'samp', 'strong', 'var',
- ])
-
-font_style_tags = frozenset([
- 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
- ])
-
-frame_tags = frozenset([
- 'frameset', 'frame', 'noframes',
- ])
-
-html5_tags = frozenset([
- 'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
- 'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
- 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
- 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
- 'svg', 'time', 'track', 'video', 'wbr'
- ])
-
-# These tags aren't standard
-nonstandard_tags = frozenset(['blink', 'marquee'])
-
-
-tags = (top_level_tags | head_tags | general_block_tags | list_tags
- | table_tags | form_tags | special_inline_tags | phrase_tags
- | font_style_tags | nonstandard_tags | html5_tags)
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so b/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so
deleted file mode 100755
index 0c11b90..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/diff.cpython-310-x86_64-linux-gnu.so
+++ /dev/null
Binary files differ
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py
deleted file mode 100644
index 39bec78..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/diff.py
+++ /dev/null
@@ -1,884 +0,0 @@
-# cython: language_level=3
-
-from __future__ import absolute_import
-
-import difflib
-from lxml import etree
-from lxml.html import fragment_fromstring
-import re
-
-__all__ = ['html_annotate', 'htmldiff']
-
-try:
- from html import escape as html_escape
-except ImportError:
- from cgi import escape as html_escape
-try:
- _unicode = unicode
-except NameError:
- # Python 3
- _unicode = str
-try:
- basestring
-except NameError:
- # Python 3
- basestring = str
-
-############################################################
-## Annotation
-############################################################
-
-def default_markup(text, version):
- return '<span title="%s">%s</span>' % (
- html_escape(_unicode(version), 1), text)
-
-def html_annotate(doclist, markup=default_markup):
- """
- doclist should be ordered from oldest to newest, like::
-
- >>> version1 = 'Hello World'
- >>> version2 = 'Goodbye World'
- >>> print(html_annotate([(version1, 'version 1'),
- ... (version2, 'version 2')]))
- <span title="version 2">Goodbye</span> <span title="version 1">World</span>
-
- The documents must be *fragments* (str/UTF8 or unicode), not
- complete documents
-
- The markup argument is a function to markup the spans of words.
- This function is called like markup('Hello', 'version 2'), and
- returns HTML. The first argument is text and never includes any
- markup. The default uses a span with a title:
-
- >>> print(default_markup('Some Text', 'by Joe'))
- <span title="by Joe">Some Text</span>
- """
- # The basic strategy we have is to split the documents up into
- # logical tokens (which are words with attached markup). We then
- # do diffs of each of the versions to track when a token first
- # appeared in the document; the annotation attached to the token
- # is the version where it first appeared.
- tokenlist = [tokenize_annotated(doc, version)
- for doc, version in doclist]
- cur_tokens = tokenlist[0]
- for tokens in tokenlist[1:]:
- html_annotate_merge_annotations(cur_tokens, tokens)
- cur_tokens = tokens
-
- # After we've tracked all the tokens, we can combine spans of text
- # that are adjacent and have the same annotation
- cur_tokens = compress_tokens(cur_tokens)
- # And finally add markup
- result = markup_serialize_tokens(cur_tokens, markup)
- return ''.join(result).strip()
-
-def tokenize_annotated(doc, annotation):
- """Tokenize a document and add an annotation attribute to each token
- """
- tokens = tokenize(doc, include_hrefs=False)
- for tok in tokens:
- tok.annotation = annotation
- return tokens
-
-def html_annotate_merge_annotations(tokens_old, tokens_new):
- """Merge the annotations from tokens_old into tokens_new, when the
- tokens in the new document already existed in the old document.
- """
- s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
- commands = s.get_opcodes()
-
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- eq_old = tokens_old[i1:i2]
- eq_new = tokens_new[j1:j2]
- copy_annotations(eq_old, eq_new)
-
-def copy_annotations(src, dest):
- """
- Copy annotations from the tokens listed in src to the tokens in dest
- """
- assert len(src) == len(dest)
- for src_tok, dest_tok in zip(src, dest):
- dest_tok.annotation = src_tok.annotation
-
-def compress_tokens(tokens):
- """
- Combine adjacent tokens when there is no HTML between the tokens,
- and they share an annotation
- """
- result = [tokens[0]]
- for tok in tokens[1:]:
- if (not result[-1].post_tags and
- not tok.pre_tags and
- result[-1].annotation == tok.annotation):
- compress_merge_back(result, tok)
- else:
- result.append(tok)
- return result
-
-def compress_merge_back(tokens, tok):
- """ Merge tok into the last element of tokens (modifying the list of
- tokens in-place). """
- last = tokens[-1]
- if type(last) is not token or type(tok) is not token:
- tokens.append(tok)
- else:
- text = _unicode(last)
- if last.trailing_whitespace:
- text += last.trailing_whitespace
- text += tok
- merged = token(text,
- pre_tags=last.pre_tags,
- post_tags=tok.post_tags,
- trailing_whitespace=tok.trailing_whitespace)
- merged.annotation = last.annotation
- tokens[-1] = merged
-
-def markup_serialize_tokens(tokens, markup_func):
- """
- Serialize the list of tokens into a list of text chunks, calling
- markup_func around text to add annotations.
- """
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
- html = token.html()
- html = markup_func(html, token.annotation)
- if token.trailing_whitespace:
- html += token.trailing_whitespace
- yield html
- for post in token.post_tags:
- yield post
-
-
-############################################################
-## HTML Diffs
-############################################################
-
-def htmldiff(old_html, new_html):
- ## FIXME: this should take parsed documents too, and use their body
- ## or other content.
- """ Do a diff of the old and new document. The documents are HTML
- *fragments* (str/UTF8 or unicode), they are not complete documents
- (i.e., no <html> tag).
-
- Returns HTML with <ins> and <del> tags added around the
- appropriate text.
-
- Markup is generally ignored, with the markup from new_html
- preserved, and possibly some markup from old_html (though it is
- considered acceptable to lose some of the old markup). Only the
- words in the HTML are diffed. The exception is <img> tags, which
- are treated like words, and the href attribute of <a> tags, which
- are noted inside the tag itself when there are changes.
- """
- old_html_tokens = tokenize(old_html)
- new_html_tokens = tokenize(new_html)
- result = htmldiff_tokens(old_html_tokens, new_html_tokens)
- result = ''.join(result).strip()
- return fixup_ins_del_tags(result)
-
-def htmldiff_tokens(html1_tokens, html2_tokens):
- """ Does a diff on the tokens themselves, returning a list of text
- chunks (not tokens).
- """
- # There are several passes as we do the differences. The tokens
- # isolate the portion of the content we care to diff; difflib does
- # all the actual hard work at that point.
- #
- # Then we must create a valid document from pieces of both the old
- # document and the new document. We generally prefer to take
- # markup from the new document, and only do a best effort attempt
- # to keep markup from the old document; anything that we can't
- # resolve we throw away. Also we try to put the deletes as close
- # to the location where we think they would have been -- because
- # we are only keeping the markup from the new document, it can be
- # fuzzy where in the new document the old text would have gone.
- # Again we just do a best effort attempt.
- s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
- commands = s.get_opcodes()
- result = []
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
- continue
- if command == 'insert' or command == 'replace':
- ins_tokens = expand_tokens(html2_tokens[j1:j2])
- merge_insert(ins_tokens, result)
- if command == 'delete' or command == 'replace':
- del_tokens = expand_tokens(html1_tokens[i1:i2])
- merge_delete(del_tokens, result)
- # If deletes were inserted directly as <del> then we'd have an
- # invalid document at this point. Instead we put in special
- # markers, and when the complete diffed document has been created
- # we try to move the deletes around and resolve any problems.
- result = cleanup_delete(result)
-
- return result
-
-def expand_tokens(tokens, equal=False):
- """Given a list of tokens, return a generator of the chunks of
- text for the data in the tokens.
- """
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
- if not equal or not token.hide_when_equal:
- if token.trailing_whitespace:
- yield token.html() + token.trailing_whitespace
- else:
- yield token.html()
- for post in token.post_tags:
- yield post
-
-def merge_insert(ins_chunks, doc):
- """ doc is the already-handled document (as a list of text chunks);
- here we add <ins>ins_chunks</ins> to the end of that. """
- # Though we don't throw away unbalanced_start or unbalanced_end
- # (we assume there is accompanying markup later or earlier in the
- # document), we only put <ins> around the balanced portion.
- unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
- doc.extend(unbalanced_start)
- if doc and not doc[-1].endswith(' '):
- # Fix up the case where the word before the insert didn't end with
- # a space
- doc[-1] += ' '
- doc.append('<ins>')
- if balanced and balanced[-1].endswith(' '):
- # We move space outside of </ins>
- balanced[-1] = balanced[-1][:-1]
- doc.extend(balanced)
- doc.append('</ins> ')
- doc.extend(unbalanced_end)
-
-# These are sentinels to represent the start and end of a <del>
-# segment, until we do the cleanup phase to turn them into proper
-# markup:
-class DEL_START:
- pass
-class DEL_END:
- pass
-
-class NoDeletes(Exception):
- """ Raised when the document no longer contains any pending deletes
- (DEL_START/DEL_END) """
-
-def merge_delete(del_chunks, doc):
- """ Adds the text chunks in del_chunks to the document doc (another
- list of text chunks) with marker to show it is a delete.
- cleanup_delete later resolves these markers into <del> tags."""
- doc.append(DEL_START)
- doc.extend(del_chunks)
- doc.append(DEL_END)
-
-def cleanup_delete(chunks):
- """ Cleans up any DEL_START/DEL_END markers in the document, replacing
- them with <del></del>. To do this while keeping the document
- valid, it may need to drop some tags (either start or end tags).
-
- It may also move the del into adjacent tags to try to move it to a
- similar location where it was originally located (e.g., moving a
- delete into preceding <div> tag, if the del looks like (DEL_START,
- 'Text</div>', DEL_END)"""
- while 1:
- # Find a pending DEL_START/DEL_END, splitting the document
- # into stuff-preceding-DEL_START, stuff-inside, and
- # stuff-following-DEL_END
- try:
- pre_delete, delete, post_delete = split_delete(chunks)
- except NoDeletes:
- # Nothing found, we've cleaned up the entire doc
- break
- # The stuff-inside-DEL_START/END may not be well balanced
- # markup. First we figure out what unbalanced portions there are:
- unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
- # Then we move the span forward and/or backward based on these
- # unbalanced portions:
- locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
- locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
- doc = pre_delete
- if doc and not doc[-1].endswith(' '):
- # Fix up case where the word before us didn't have a trailing space
- doc[-1] += ' '
- doc.append('<del>')
- if balanced and balanced[-1].endswith(' '):
- # We move space outside of </del>
- balanced[-1] = balanced[-1][:-1]
- doc.extend(balanced)
- doc.append('</del> ')
- doc.extend(post_delete)
- chunks = doc
- return chunks
-
-def split_unbalanced(chunks):
- """Return (unbalanced_start, balanced, unbalanced_end), where each is
- a list of text and tag chunks.
-
- unbalanced_start is a list of all the tags that are opened, but
- not closed in this span. Similarly, unbalanced_end is a list of
- tags that are closed but were not opened. Extracting these might
- mean some reordering of the chunks."""
- start = []
- end = []
- tag_stack = []
- balanced = []
- for chunk in chunks:
- if not chunk.startswith('<'):
- balanced.append(chunk)
- continue
- endtag = chunk[1] == '/'
- name = chunk.split()[0].strip('<>/')
- if name in empty_tags:
- balanced.append(chunk)
- continue
- if endtag:
- if tag_stack and tag_stack[-1][0] == name:
- balanced.append(chunk)
- name, pos, tag = tag_stack.pop()
- balanced[pos] = tag
- elif tag_stack:
- start.extend([tag for name, pos, tag in tag_stack])
- tag_stack = []
- end.append(chunk)
- else:
- end.append(chunk)
- else:
- tag_stack.append((name, len(balanced), chunk))
- balanced.append(None)
- start.extend(
- [chunk for name, pos, chunk in tag_stack])
- balanced = [chunk for chunk in balanced if chunk is not None]
- return start, balanced, end
-
-def split_delete(chunks):
- """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
- stuff_after_DEL_END). Returns the first case found (there may be
- more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
- there's no DEL_START found. """
- try:
- pos = chunks.index(DEL_START)
- except ValueError:
- raise NoDeletes
- pos2 = chunks.index(DEL_END)
- return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
-
-def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
- """ pre_delete and post_delete implicitly point to a place in the
- document (where the two were split). This moves that point (by
- popping items from one and pushing them onto the other). It moves
- the point to try to find a place where unbalanced_start applies.
-
- As an example::
-
- >>> unbalanced_start = ['<div>']
- >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
- >>> pre, post = doc[:3], doc[3:]
- >>> pre, post
- (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
- >>> locate_unbalanced_start(unbalanced_start, pre, post)
- >>> pre, post
- (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
-
- As you can see, we moved the point so that the dangling <div> that
- we found will be effectively replaced by the div in the original
- document. If this doesn't work out, we just throw away
- unbalanced_start without doing anything.
- """
- while 1:
- if not unbalanced_start:
- # We have totally succeeded in finding the position
- break
- finding = unbalanced_start[0]
- finding_name = finding.split()[0].strip('<>')
- if not post_delete:
- break
- next = post_delete[0]
- if next is DEL_START or not next.startswith('<'):
- # Reached a word, we can't move the delete text forward
- break
- if next[1] == '/':
- # Reached a closing tag, can we go further? Maybe not...
- break
- name = next.split()[0].strip('<>')
- if name == 'ins':
- # Can't move into an insert
- break
- assert name != 'del', (
- "Unexpected delete tag: %r" % next)
- if name == finding_name:
- unbalanced_start.pop(0)
- pre_delete.append(post_delete.pop(0))
- else:
- # Found a tag that doesn't match
- break
-
-def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
- """ like locate_unbalanced_start, except handling end tags and
- possibly moving the point earlier in the document. """
- while 1:
- if not unbalanced_end:
- # Success
- break
- finding = unbalanced_end[-1]
- finding_name = finding.split()[0].strip('<>/')
- if not pre_delete:
- break
- next = pre_delete[-1]
- if next is DEL_END or not next.startswith('</'):
- # A word or a start tag
- break
- name = next.split()[0].strip('<>/')
- if name == 'ins' or name == 'del':
- # Can't move into an insert or delete
- break
- if name == finding_name:
- unbalanced_end.pop()
- post_delete.insert(0, pre_delete.pop())
- else:
- # Found a tag that doesn't match
- break
-
-class token(_unicode):
- """ Represents a diffable token, generally a word that is displayed to
- the user. Opening tags are attached to this token when they are
- adjacent (pre_tags) and closing tags that follow the word
- (post_tags). Some exceptions occur when there are empty tags
- adjacent to a word, so there may be close tags in pre_tags, or
- open tags in post_tags.
-
- We also keep track of whether the word was originally followed by
- whitespace, even though we do not want to treat the word as
- equivalent to a similar word that does not have a trailing
- space."""
-
- # When this is true, the token will be eliminated from the
- # displayed diff if no change has occurred:
- hide_when_equal = False
-
- def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
- obj = _unicode.__new__(cls, text)
-
- if pre_tags is not None:
- obj.pre_tags = pre_tags
- else:
- obj.pre_tags = []
-
- if post_tags is not None:
- obj.post_tags = post_tags
- else:
- obj.post_tags = []
-
- obj.trailing_whitespace = trailing_whitespace
-
- return obj
-
- def __repr__(self):
- return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
- self.post_tags, self.trailing_whitespace)
-
- def html(self):
- return _unicode(self)
-
-class tag_token(token):
-
- """ Represents a token that is actually a tag. Currently this is just
- the <img> tag, which takes up visible space just like a word but
- is only represented in a document by a tag. """
-
- def __new__(cls, tag, data, html_repr, pre_tags=None,
- post_tags=None, trailing_whitespace=""):
- obj = token.__new__(cls, "%s: %s" % (type, data),
- pre_tags=pre_tags,
- post_tags=post_tags,
- trailing_whitespace=trailing_whitespace)
- obj.tag = tag
- obj.data = data
- obj.html_repr = html_repr
- return obj
-
- def __repr__(self):
- return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
- self.tag,
- self.data,
- self.html_repr,
- self.pre_tags,
- self.post_tags,
- self.trailing_whitespace)
- def html(self):
- return self.html_repr
-
-class href_token(token):
-
- """ Represents the href in an anchor tag. Unlike other words, we only
- show the href when it changes. """
-
- hide_when_equal = True
-
- def html(self):
- return ' Link: %s' % self
-
-def tokenize(html, include_hrefs=True):
- """
- Parse the given HTML and returns token objects (words with attached tags).
-
- This parses only the content of a page; anything in the head is
- ignored, and the <head> and <body> elements are themselves
- optional. The content is then parsed by lxml, which ensures the
- validity of the resulting parsed document (though lxml may make
- incorrect guesses when the markup is particular bad).
-
- <ins> and <del> tags are also eliminated from the document, as
- that gets confusing.
-
- If include_hrefs is true, then the href attribute of <a> tags is
- included as a special kind of diffable token."""
- if etree.iselement(html):
- body_el = html
- else:
- body_el = parse_html(html, cleanup=True)
- # Then we split the document into text chunks for each tag, word, and end tag:
- chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
- # Finally re-joining them into token objects:
- return fixup_chunks(chunks)
-
-def parse_html(html, cleanup=True):
- """
- Parses an HTML fragment, returning an lxml element. Note that the HTML will be
- wrapped in a <div> tag that was not in the original document.
-
- If cleanup is true, make sure there's no <head> or <body>, and get
- rid of any <ins> and <del> tags.
- """
- if cleanup:
- # This removes any extra markup or structure like <head>:
- html = cleanup_html(html)
- return fragment_fromstring(html, create_parent=True)
-
-_body_re = re.compile(r'<body.*?>', re.I|re.S)
-_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
-_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
-
-def cleanup_html(html):
- """ This 'cleans' the HTML, meaning that any page structure is removed
- (only the contents of <body> are used, if there is any <body).
- Also <ins> and <del> tags are removed. """
- match = _body_re.search(html)
- if match:
- html = html[match.end():]
- match = _end_body_re.search(html)
- if match:
- html = html[:match.start()]
- html = _ins_del_re.sub('', html)
- return html
-
-
-end_whitespace_re = re.compile(r'[ \t\n\r]$')
-
-def split_trailing_whitespace(word):
- """
- This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
- """
- stripped_length = len(word.rstrip())
- return word[0:stripped_length], word[stripped_length:]
-
-
-def fixup_chunks(chunks):
- """
- This function takes a list of chunks and produces a list of tokens.
- """
- tag_accum = []
- cur_word = None
- result = []
- for chunk in chunks:
- if isinstance(chunk, tuple):
- if chunk[0] == 'img':
- src = chunk[1]
- tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
- cur_word = tag_token('img', src, html_repr=tag,
- pre_tags=tag_accum,
- trailing_whitespace=trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
-
- elif chunk[0] == 'href':
- href = chunk[1]
- cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
- tag_accum = []
- result.append(cur_word)
- continue
-
- if is_word(chunk):
- chunk, trailing_whitespace = split_trailing_whitespace(chunk)
- cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
-
- elif is_start_tag(chunk):
- tag_accum.append(chunk)
-
- elif is_end_tag(chunk):
- if tag_accum:
- tag_accum.append(chunk)
- else:
- assert cur_word, (
- "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
- % (cur_word, result, chunk, chunks))
- cur_word.post_tags.append(chunk)
- else:
- assert False
-
- if not result:
- return [token('', pre_tags=tag_accum)]
- else:
- result[-1].post_tags.extend(tag_accum)
-
- return result
-
-
-# All the tags in HTML that don't require end tags:
-empty_tags = (
- 'param', 'img', 'area', 'br', 'basefont', 'input',
- 'base', 'meta', 'link', 'col')
-
-block_level_tags = (
- 'address',
- 'blockquote',
- 'center',
- 'dir',
- 'div',
- 'dl',
- 'fieldset',
- 'form',
- 'h1',
- 'h2',
- 'h3',
- 'h4',
- 'h5',
- 'h6',
- 'hr',
- 'isindex',
- 'menu',
- 'noframes',
- 'noscript',
- 'ol',
- 'p',
- 'pre',
- 'table',
- 'ul',
- )
-
-block_level_container_tags = (
- 'dd',
- 'dt',
- 'frameset',
- 'li',
- 'tbody',
- 'td',
- 'tfoot',
- 'th',
- 'thead',
- 'tr',
- )
-
-
-def flatten_el(el, include_hrefs, skip_tag=False):
- """ Takes an lxml element el, and generates all the text chunks for
- that tag. Each start tag is a chunk, each word is a chunk, and each
- end tag is a chunk.
-
- If skip_tag is true, then the outermost container tag is
- not returned (just its contents)."""
- if not skip_tag:
- if el.tag == 'img':
- yield ('img', el.get('src'), start_tag(el))
- else:
- yield start_tag(el)
- if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
- return
- start_words = split_words(el.text)
- for word in start_words:
- yield html_escape(word)
- for child in el:
- for item in flatten_el(child, include_hrefs=include_hrefs):
- yield item
- if el.tag == 'a' and el.get('href') and include_hrefs:
- yield ('href', el.get('href'))
- if not skip_tag:
- yield end_tag(el)
- end_words = split_words(el.tail)
- for word in end_words:
- yield html_escape(word)
-
-split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
-
-def split_words(text):
- """ Splits some text into words. Includes trailing whitespace
- on each word when appropriate. """
- if not text or not text.strip():
- return []
-
- words = split_words_re.findall(text)
- return words
-
-start_whitespace_re = re.compile(r'^[ \t\n\r]')
-
-def start_tag(el):
- """
- The text representation of the start tag for a tag.
- """
- return '<%s%s>' % (
- el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
- for name, value in el.attrib.items()]))
-
-def end_tag(el):
- """ The text representation of an end tag for a tag. Includes
- trailing whitespace when appropriate. """
- if el.tail and start_whitespace_re.search(el.tail):
- extra = ' '
- else:
- extra = ''
- return '</%s>%s' % (el.tag, extra)
-
-def is_word(tok):
- return not tok.startswith('<')
-
-def is_end_tag(tok):
- return tok.startswith('</')
-
-def is_start_tag(tok):
- return tok.startswith('<') and not tok.startswith('</')
-
-def fixup_ins_del_tags(html):
- """ Given an html string, move any <ins> or <del> tags inside of any
- block-level elements, e.g. transform <ins><p>word</p></ins> to
- <p><ins>word</ins></p> """
- doc = parse_html(html, cleanup=False)
- _fixup_ins_del_tags(doc)
- html = serialize_html_fragment(doc, skip_outer=True)
- return html
-
-def serialize_html_fragment(el, skip_outer=False):
- """ Serialize a single lxml element as HTML. The serialized form
- includes the elements tail.
-
- If skip_outer is true, then don't serialize the outermost tag
- """
- assert not isinstance(el, basestring), (
- "You should pass in an element, not a string like %r" % el)
- html = etree.tostring(el, method="html", encoding=_unicode)
- if skip_outer:
- # Get rid of the extra starting tag:
- html = html[html.find('>')+1:]
- # Get rid of the extra end tag:
- html = html[:html.rfind('<')]
- return html.strip()
- else:
- return html
-
-def _fixup_ins_del_tags(doc):
- """fixup_ins_del_tags that works on an lxml document in-place
- """
- for tag in ['ins', 'del']:
- for el in doc.xpath('descendant-or-self::%s' % tag):
- if not _contains_block_level_tag(el):
- continue
- _move_el_inside_block(el, tag=tag)
- el.drop_tag()
- #_merge_element_contents(el)
-
-def _contains_block_level_tag(el):
- """True if the element contains any block-level elements, like <p>, <td>, etc.
- """
- if el.tag in block_level_tags or el.tag in block_level_container_tags:
- return True
- for child in el:
- if _contains_block_level_tag(child):
- return True
- return False
-
-def _move_el_inside_block(el, tag):
- """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
- and moves them inside any block-level tags. """
- for child in el:
- if _contains_block_level_tag(child):
- break
- else:
- # No block-level tags in any child
- children_tag = etree.Element(tag)
- children_tag.text = el.text
- el.text = None
- children_tag.extend(list(el))
- el[:] = [children_tag]
- return
- for child in list(el):
- if _contains_block_level_tag(child):
- _move_el_inside_block(child, tag)
- if child.tail:
- tail_tag = etree.Element(tag)
- tail_tag.text = child.tail
- child.tail = None
- el.insert(el.index(child)+1, tail_tag)
- else:
- child_tag = etree.Element(tag)
- el.replace(child, child_tag)
- child_tag.append(child)
- if el.text:
- text_tag = etree.Element(tag)
- text_tag.text = el.text
- el.text = None
- el.insert(0, text_tag)
-
-def _merge_element_contents(el):
- """
- Removes an element, but merges its contents into its place, e.g.,
- given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
- <p>Hi there!</p>
- """
- parent = el.getparent()
- text = el.text or ''
- if el.tail:
- if not len(el):
- text += el.tail
- else:
- if el[-1].tail:
- el[-1].tail += el.tail
- else:
- el[-1].tail = el.tail
- index = parent.index(el)
- if text:
- if index == 0:
- previous = None
- else:
- previous = parent[index-1]
- if previous is None:
- if parent.text:
- parent.text += text
- else:
- parent.text = text
- else:
- if previous.tail:
- previous.tail += text
- else:
- previous.tail = text
- parent[index:index+1] = el.getchildren()
-
-class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
- """
- Acts like SequenceMatcher, but tries not to find very small equal
- blocks amidst large spans of changes
- """
-
- threshold = 2
-
- def get_matching_blocks(self):
- size = min(len(self.b), len(self.b))
- threshold = min(self.threshold, size / 4)
- actual = difflib.SequenceMatcher.get_matching_blocks(self)
- return [item for item in actual
- if item[2] > threshold
- or not item[2]]
-
-if __name__ == '__main__':
- from lxml.html import _diffcommand
- _diffcommand.main()
-
diff --git a/env/lib/python3.10/site-packages/lxml/html/formfill.py b/env/lib/python3.10/site-packages/lxml/html/formfill.py
deleted file mode 100644
index 2499a8e..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/formfill.py
+++ /dev/null
@@ -1,299 +0,0 @@
-from lxml.etree import XPath, ElementBase
-from lxml.html import fromstring, XHTML_NAMESPACE
-from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
-from lxml.html import defs
-import copy
-
-try:
- basestring
-except NameError:
- # Python 3
- basestring = str
-
-__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
- 'insert_errors', 'insert_errors_html',
- 'DefaultErrorCreator']
-
-class FormNotFound(LookupError):
- """
- Raised when no form can be found
- """
-
-_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
-_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
- namespaces={'x':XHTML_NAMESPACE})
-_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
- namespaces={'x':XHTML_NAMESPACE})
-_name_xpath = XPath('descendant-or-self::*[@name=$name]')
-
-def fill_form(
- el,
- values,
- form_id=None,
- form_index=None,
- ):
- el = _find_form(el, form_id=form_id, form_index=form_index)
- _fill_form(el, values)
-
-def fill_form_html(html, values, form_id=None, form_index=None):
- result_type = type(html)
- if isinstance(html, basestring):
- doc = fromstring(html)
- else:
- doc = copy.deepcopy(html)
- fill_form(doc, values, form_id=form_id, form_index=form_index)
- return _transform_result(result_type, doc)
-
-def _fill_form(el, values):
- counts = {}
- if hasattr(values, 'mixed'):
- # For Paste request parameters
- values = values.mixed()
- inputs = _input_xpath(el)
- for input in inputs:
- name = input.get('name')
- if not name:
- continue
- if _takes_multiple(input):
- value = values.get(name, [])
- if not isinstance(value, (list, tuple)):
- value = [value]
- _fill_multiple(input, value)
- elif name not in values:
- continue
- else:
- index = counts.get(name, 0)
- counts[name] = index + 1
- value = values[name]
- if isinstance(value, (list, tuple)):
- try:
- value = value[index]
- except IndexError:
- continue
- elif index > 0:
- continue
- _fill_single(input, value)
-
-def _takes_multiple(input):
- if _nons(input.tag) == 'select' and input.get('multiple'):
- # FIXME: multiple="0"?
- return True
- type = input.get('type', '').lower()
- if type in ('radio', 'checkbox'):
- return True
- return False
-
-def _fill_multiple(input, value):
- type = input.get('type', '').lower()
- if type == 'checkbox':
- v = input.get('value')
- if v is None:
- if not value:
- result = False
- else:
- result = value[0]
- if isinstance(value, basestring):
- # The only valid "on" value for an unnamed checkbox is 'on'
- result = result == 'on'
- _check(input, result)
- else:
- _check(input, v in value)
- elif type == 'radio':
- v = input.get('value')
- _check(input, v in value)
- else:
- assert _nons(input.tag) == 'select'
- for option in _options_xpath(input):
- v = option.get('value')
- if v is None:
- # This seems to be the default, at least on IE
- # FIXME: but I'm not sure
- v = option.text_content()
- _select(option, v in value)
-
-def _check(el, check):
- if check:
- el.set('checked', '')
- else:
- if 'checked' in el.attrib:
- del el.attrib['checked']
-
-def _select(el, select):
- if select:
- el.set('selected', '')
- else:
- if 'selected' in el.attrib:
- del el.attrib['selected']
-
-def _fill_single(input, value):
- if _nons(input.tag) == 'textarea':
- input.text = value
- else:
- input.set('value', value)
-
-def _find_form(el, form_id=None, form_index=None):
- if form_id is None and form_index is None:
- forms = _forms_xpath(el)
- for form in forms:
- return form
- raise FormNotFound(
- "No forms in page")
- if form_id is not None:
- form = el.get_element_by_id(form_id)
- if form is not None:
- return form
- forms = _form_name_xpath(el, name=form_id)
- if forms:
- return forms[0]
- else:
- raise FormNotFound(
- "No form with the name or id of %r (forms: %s)"
- % (id, ', '.join(_find_form_ids(el))))
- if form_index is not None:
- forms = _forms_xpath(el)
- try:
- return forms[form_index]
- except IndexError:
- raise FormNotFound(
- "There is no form with the index %r (%i forms found)"
- % (form_index, len(forms)))
-
-def _find_form_ids(el):
- forms = _forms_xpath(el)
- if not forms:
- yield '(no forms)'
- return
- for index, form in enumerate(forms):
- if form.get('id'):
- if form.get('name'):
- yield '%s or %s' % (form.get('id'),
- form.get('name'))
- else:
- yield form.get('id')
- elif form.get('name'):
- yield form.get('name')
- else:
- yield '(unnamed form %s)' % index
-
-############################################################
-## Error filling
-############################################################
-
-class DefaultErrorCreator(object):
- insert_before = True
- block_inside = True
- error_container_tag = 'div'
- error_message_class = 'error-message'
- error_block_class = 'error-block'
- default_message = "Invalid"
-
- def __init__(self, **kw):
- for name, value in kw.items():
- if not hasattr(self, name):
- raise TypeError(
- "Unexpected keyword argument: %s" % name)
- setattr(self, name, value)
-
- def __call__(self, el, is_block, message):
- error_el = el.makeelement(self.error_container_tag)
- if self.error_message_class:
- error_el.set('class', self.error_message_class)
- if is_block and self.error_block_class:
- error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
- if message is None or message == '':
- message = self.default_message
- if isinstance(message, ElementBase):
- error_el.append(message)
- else:
- assert isinstance(message, basestring), (
- "Bad message; should be a string or element: %r" % message)
- error_el.text = message or self.default_message
- if is_block and self.block_inside:
- if self.insert_before:
- error_el.tail = el.text
- el.text = None
- el.insert(0, error_el)
- else:
- el.append(error_el)
- else:
- parent = el.getparent()
- pos = parent.index(el)
- if self.insert_before:
- parent.insert(pos, error_el)
- else:
- error_el.tail = el.tail
- el.tail = None
- parent.insert(pos+1, error_el)
-
-default_error_creator = DefaultErrorCreator()
-
-
-def insert_errors(
- el,
- errors,
- form_id=None,
- form_index=None,
- error_class="error",
- error_creator=default_error_creator,
- ):
- el = _find_form(el, form_id=form_id, form_index=form_index)
- for name, error in errors.items():
- if error is None:
- continue
- for error_el, message in _find_elements_for_name(el, name, error):
- assert isinstance(message, (basestring, type(None), ElementBase)), (
- "Bad message: %r" % message)
- _insert_error(error_el, message, error_class, error_creator)
-
-def insert_errors_html(html, values, **kw):
- result_type = type(html)
- if isinstance(html, basestring):
- doc = fromstring(html)
- else:
- doc = copy.deepcopy(html)
- insert_errors(doc, values, **kw)
- return _transform_result(result_type, doc)
-
-def _insert_error(el, error, error_class, error_creator):
- if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
- is_block = False
- else:
- is_block = True
- if _nons(el.tag) != 'form' and error_class:
- _add_class(el, error_class)
- if el.get('id'):
- labels = _label_for_xpath(el, for_id=el.get('id'))
- if labels:
- for label in labels:
- _add_class(label, error_class)
- error_creator(el, is_block, error)
-
-def _add_class(el, class_name):
- if el.get('class'):
- el.set('class', el.get('class')+' '+class_name)
- else:
- el.set('class', class_name)
-
-def _find_elements_for_name(form, name, error):
- if name is None:
- # An error for the entire form
- yield form, error
- return
- if name.startswith('#'):
- # By id
- el = form.get_element_by_id(name[1:])
- if el is not None:
- yield el, error
- return
- els = _name_xpath(form, name=name)
- if not els:
- # FIXME: should this raise an exception?
- return
- if not isinstance(error, (list, tuple)):
- yield els[0], error
- return
- # FIXME: if error is longer than els, should it raise an error?
- for el, err in zip(els, error):
- if err is None:
- continue
- yield el, err
diff --git a/env/lib/python3.10/site-packages/lxml/html/html5parser.py b/env/lib/python3.10/site-packages/lxml/html/html5parser.py
deleted file mode 100644
index 2f7be15..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/html5parser.py
+++ /dev/null
@@ -1,260 +0,0 @@
-"""
-An interface to html5lib that mimics the lxml.html interface.
-"""
-import sys
-import string
-
-from html5lib import HTMLParser as _HTMLParser
-from html5lib.treebuilders.etree_lxml import TreeBuilder
-from lxml import etree
-from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
-
-# python3 compatibility
-try:
- _strings = basestring
-except NameError:
- _strings = (bytes, str)
-try:
- from urllib2 import urlopen
-except ImportError:
- from urllib.request import urlopen
-try:
- from urlparse import urlparse
-except ImportError:
- from urllib.parse import urlparse
-
-
-class HTMLParser(_HTMLParser):
- """An html5lib HTML parser with lxml as tree."""
-
- def __init__(self, strict=False, **kwargs):
- _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
-
-
-try:
- from html5lib import XHTMLParser as _XHTMLParser
-except ImportError:
- pass
-else:
- class XHTMLParser(_XHTMLParser):
- """An html5lib XHTML Parser with lxml as tree."""
-
- def __init__(self, strict=False, **kwargs):
- _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
-
- xhtml_parser = XHTMLParser()
-
-
-def _find_tag(tree, tag):
- elem = tree.find(tag)
- if elem is not None:
- return elem
- return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
-
-
-def document_fromstring(html, guess_charset=None, parser=None):
- """
- Parse a whole document into a string.
-
- If `guess_charset` is true, or if the input is not Unicode but a
- byte string, the `chardet` library will perform charset guessing
- on the string.
- """
- if not isinstance(html, _strings):
- raise TypeError('string required')
-
- if parser is None:
- parser = html_parser
-
- options = {}
- if guess_charset is None and isinstance(html, bytes):
- # html5lib does not accept useChardet as an argument, if it
- # detected the html argument would produce unicode objects.
- guess_charset = True
- if guess_charset is not None:
- options['useChardet'] = guess_charset
- return parser.parse(html, **options).getroot()
-
-
-def fragments_fromstring(html, no_leading_text=False,
- guess_charset=None, parser=None):
- """Parses several HTML elements, returning a list of elements.
-
- The first item in the list may be a string. If no_leading_text is true,
- then it will be an error if there is leading text, and it will always be
- a list of only elements.
-
- If `guess_charset` is true, the `chardet` library will perform charset
- guessing on the string.
- """
- if not isinstance(html, _strings):
- raise TypeError('string required')
-
- if parser is None:
- parser = html_parser
-
- options = {}
- if guess_charset is None and isinstance(html, bytes):
- # html5lib does not accept useChardet as an argument, if it
- # detected the html argument would produce unicode objects.
- guess_charset = False
- if guess_charset is not None:
- options['useChardet'] = guess_charset
- children = parser.parseFragment(html, 'div', **options)
- if children and isinstance(children[0], _strings):
- if no_leading_text:
- if children[0].strip():
- raise etree.ParserError('There is leading text: %r' %
- children[0])
- del children[0]
- return children
-
-
-def fragment_fromstring(html, create_parent=False,
- guess_charset=None, parser=None):
- """Parses a single HTML element; it is an error if there is more than
- one element, or if anything but whitespace precedes or follows the
- element.
-
- If 'create_parent' is true (or is a tag name) then a parent node
- will be created to encapsulate the HTML in a single element. In
- this case, leading or trailing text is allowed.
-
- If `guess_charset` is true, the `chardet` library will perform charset
- guessing on the string.
- """
- if not isinstance(html, _strings):
- raise TypeError('string required')
-
- accept_leading_text = bool(create_parent)
-
- elements = fragments_fromstring(
- html, guess_charset=guess_charset, parser=parser,
- no_leading_text=not accept_leading_text)
-
- if create_parent:
- if not isinstance(create_parent, _strings):
- create_parent = 'div'
- new_root = Element(create_parent)
- if elements:
- if isinstance(elements[0], _strings):
- new_root.text = elements[0]
- del elements[0]
- new_root.extend(elements)
- return new_root
-
- if not elements:
- raise etree.ParserError('No elements found')
- if len(elements) > 1:
- raise etree.ParserError('Multiple elements found')
- result = elements[0]
- if result.tail and result.tail.strip():
- raise etree.ParserError('Element followed by text: %r' % result.tail)
- result.tail = None
- return result
-
-
-def fromstring(html, guess_charset=None, parser=None):
- """Parse the html, returning a single element/document.
-
- This tries to minimally parse the chunk of text, without knowing if it
- is a fragment or a document.
-
- 'base_url' will set the document's base_url attribute (and the tree's
- docinfo.URL)
-
- If `guess_charset` is true, or if the input is not Unicode but a
- byte string, the `chardet` library will perform charset guessing
- on the string.
- """
- if not isinstance(html, _strings):
- raise TypeError('string required')
- doc = document_fromstring(html, parser=parser,
- guess_charset=guess_charset)
-
- # document starts with doctype or <html>, full document!
- start = html[:50]
- if isinstance(start, bytes):
- # Allow text comparison in python3.
- # Decode as ascii, that also covers latin-1 and utf-8 for the
- # characters we need.
- start = start.decode('ascii', 'replace')
-
- start = start.lstrip().lower()
- if start.startswith('<html') or start.startswith('<!doctype'):
- return doc
-
- head = _find_tag(doc, 'head')
-
- # if the head is not empty we have a full document
- if len(head):
- return doc
-
- body = _find_tag(doc, 'body')
-
- # The body has just one element, so it was probably a single
- # element passed in
- if (len(body) == 1 and (not body.text or not body.text.strip())
- and (not body[-1].tail or not body[-1].tail.strip())):
- return body[0]
-
- # Now we have a body which represents a bunch of tags which have the
- # content that was passed in. We will create a fake container, which
- # is the body tag, except <body> implies too much structure.
- if _contains_block_level_tag(body):
- body.tag = 'div'
- else:
- body.tag = 'span'
- return body
-
-
-def parse(filename_url_or_file, guess_charset=None, parser=None):
- """Parse a filename, URL, or file-like object into an HTML document
- tree. Note: this returns a tree, not an element. Use
- ``parse(...).getroot()`` to get the document root.
-
- If ``guess_charset`` is true, the ``useChardet`` option is passed into
- html5lib to enable character detection. This option is on by default
- when parsing from URLs, off by default when parsing from file(-like)
- objects (which tend to return Unicode more often than not), and on by
- default when parsing from a file path (which is read in binary mode).
- """
- if parser is None:
- parser = html_parser
- if not isinstance(filename_url_or_file, _strings):
- fp = filename_url_or_file
- if guess_charset is None:
- # assume that file-like objects return Unicode more often than bytes
- guess_charset = False
- elif _looks_like_url(filename_url_or_file):
- fp = urlopen(filename_url_or_file)
- if guess_charset is None:
- # assume that URLs return bytes
- guess_charset = True
- else:
- fp = open(filename_url_or_file, 'rb')
- if guess_charset is None:
- guess_charset = True
-
- options = {}
- # html5lib does not accept useChardet as an argument, if it
- # detected the html argument would produce unicode objects.
- if guess_charset:
- options['useChardet'] = guess_charset
- return parser.parse(fp, **options)
-
-
-def _looks_like_url(str):
- scheme = urlparse(str)[0]
- if not scheme:
- return False
- elif (sys.platform == 'win32' and
- scheme in string.ascii_letters
- and len(scheme) == 1):
- # looks like a 'normal' absolute path
- return False
- else:
- return True
-
-
-html_parser = HTMLParser()
diff --git a/env/lib/python3.10/site-packages/lxml/html/soupparser.py b/env/lib/python3.10/site-packages/lxml/html/soupparser.py
deleted file mode 100644
index e0cf3a0..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/soupparser.py
+++ /dev/null
@@ -1,314 +0,0 @@
-"""External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["fromstring", "parse", "convert_tree"]
-
-import re
-from lxml import etree, html
-
-try:
- from bs4 import (
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
- Declaration, Doctype)
- _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
-except ImportError:
- from BeautifulSoup import (
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
- Declaration)
- _DECLARATION_OR_DOCTYPE = Declaration
-
-
-def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
- """Parse a string of HTML data into an Element tree using the
- BeautifulSoup parser.
-
- Returns the root ``<html>`` Element of the tree.
-
- You can pass a different BeautifulSoup parser through the
- `beautifulsoup` keyword, and a diffent Element factory function
- through the `makeelement` keyword. By default, the standard
- ``BeautifulSoup`` class and the default factory of `lxml.html` are
- used.
- """
- return _parse(data, beautifulsoup, makeelement, **bsargs)
-
-
-def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
- """Parse a file into an ElemenTree using the BeautifulSoup parser.
-
- You can pass a different BeautifulSoup parser through the
- `beautifulsoup` keyword, and a diffent Element factory function
- through the `makeelement` keyword. By default, the standard
- ``BeautifulSoup`` class and the default factory of `lxml.html` are
- used.
- """
- if not hasattr(file, 'read'):
- file = open(file)
- root = _parse(file, beautifulsoup, makeelement, **bsargs)
- return etree.ElementTree(root)
-
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
- """Convert a BeautifulSoup tree to a list of Element trees.
-
- Returns a list instead of a single root Element to support
- HTML-like soup with more than one root element.
-
- You can pass a different Element factory through the `makeelement`
- keyword.
- """
- root = _convert_tree(beautiful_soup_tree, makeelement)
- children = root.getchildren()
- for child in children:
- root.remove(child)
- return children
-
-
-# helpers
-
-def _parse(source, beautifulsoup, makeelement, **bsargs):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
- if 'convertEntities' not in bsargs:
- bsargs['convertEntities'] = 'html'
- if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
- if 'features' not in bsargs:
- bsargs['features'] = 'html.parser' # use Python html parser
- tree = beautifulsoup(source, **bsargs)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
-
-
-_parse_doctype_declaration = re.compile(
- r'(?:\s|[<!])*DOCTYPE\s*HTML'
- r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
- r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
- re.IGNORECASE).match
-
-
-class _PseudoTag:
- # Minimal imitation of BeautifulSoup.Tag
- def __init__(self, contents):
- self.name = 'html'
- self.attrs = []
- self.contents = contents
-
- def __iter__(self):
- return self.contents.__iter__()
-
-
-def _convert_tree(beautiful_soup_tree, makeelement):
- if makeelement is None:
- makeelement = html.html_parser.makeelement
-
- # Split the tree into three parts:
- # i) everything before the root element: document type
- # declaration, comments, processing instructions, whitespace
- # ii) the root(s),
- # iii) everything after the root: comments, processing
- # instructions, whitespace
- first_element_idx = last_element_idx = None
- html_root = declaration = None
- for i, e in enumerate(beautiful_soup_tree):
- if isinstance(e, Tag):
- if first_element_idx is None:
- first_element_idx = i
- last_element_idx = i
- if html_root is None and e.name and e.name.lower() == 'html':
- html_root = e
- elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
- declaration = e
-
- # For a nice, well-formatted document, the variable roots below is
- # a list consisting of a single <html> element. However, the document
- # may be a soup like '<meta><head><title>Hello</head><body>Hi
- # all<\p>'. In this example roots is a list containing meta, head
- # and body elements.
- if first_element_idx is None:
- pre_root = post_root = []
- roots = beautiful_soup_tree.contents
- else:
- pre_root = beautiful_soup_tree.contents[:first_element_idx]
- roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
- post_root = beautiful_soup_tree.contents[last_element_idx+1:]
-
- # Reorganize so that there is one <html> root...
- if html_root is not None:
- # ... use existing one if possible, ...
- i = roots.index(html_root)
- html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
- else:
- # ... otherwise create a new one.
- html_root = _PseudoTag(roots)
-
- convert_node = _init_node_converters(makeelement)
-
- # Process pre_root
- res_root = convert_node(html_root)
- prev = res_root
- for e in reversed(pre_root):
- converted = convert_node(e)
- if converted is not None:
- prev.addprevious(converted)
- prev = converted
-
- # ditto for post_root
- prev = res_root
- for e in post_root:
- converted = convert_node(e)
- if converted is not None:
- prev.addnext(converted)
- prev = converted
-
- if declaration is not None:
- try:
- # bs4 provides full Doctype string
- doctype_string = declaration.output_ready()
- except AttributeError:
- doctype_string = declaration.string
-
- match = _parse_doctype_declaration(doctype_string)
- if not match:
- # Something is wrong if we end up in here. Since soupparser should
- # tolerate errors, do not raise Exception, just let it pass.
- pass
- else:
- external_id, sys_uri = match.groups()
- docinfo = res_root.getroottree().docinfo
- # strip quotes and update DOCTYPE values (any of None, '', '...')
- docinfo.public_id = external_id and external_id[1:-1]
- docinfo.system_url = sys_uri and sys_uri[1:-1]
-
- return res_root
-
-
-def _init_node_converters(makeelement):
- converters = {}
- ordered_node_types = []
-
- def converter(*types):
- def add(handler):
- for t in types:
- converters[t] = handler
- ordered_node_types.append(t)
- return handler
- return add
-
- def find_best_converter(node):
- for t in ordered_node_types:
- if isinstance(node, t):
- return converters[t]
- return None
-
- def convert_node(bs_node, parent=None):
- # duplicated in convert_tag() below
- try:
- handler = converters[type(bs_node)]
- except KeyError:
- handler = converters[type(bs_node)] = find_best_converter(bs_node)
- if handler is None:
- return None
- return handler(bs_node, parent)
-
- def map_attrs(bs_attrs):
- if isinstance(bs_attrs, dict): # bs4
- attribs = {}
- for k, v in bs_attrs.items():
- if isinstance(v, list):
- v = " ".join(v)
- attribs[k] = unescape(v)
- else:
- attribs = dict((k, unescape(v)) for k, v in bs_attrs)
- return attribs
-
- def append_text(parent, text):
- if len(parent) == 0:
- parent.text = (parent.text or '') + text
- else:
- parent[-1].tail = (parent[-1].tail or '') + text
-
- # converters are tried in order of their definition
-
- @converter(Tag, _PseudoTag)
- def convert_tag(bs_node, parent):
- attrs = bs_node.attrs
- if parent is not None:
- attribs = map_attrs(attrs) if attrs else None
- res = etree.SubElement(parent, bs_node.name, attrib=attribs)
- else:
- attribs = map_attrs(attrs) if attrs else {}
- res = makeelement(bs_node.name, attrib=attribs)
-
- for child in bs_node:
- # avoid double recursion by inlining convert_node(), see above
- try:
- handler = converters[type(child)]
- except KeyError:
- pass
- else:
- if handler is not None:
- handler(child, res)
- continue
- convert_node(child, res)
- return res
-
- @converter(Comment)
- def convert_comment(bs_node, parent):
- res = html.HtmlComment(bs_node)
- if parent is not None:
- parent.append(res)
- return res
-
- @converter(ProcessingInstruction)
- def convert_pi(bs_node, parent):
- if bs_node.endswith('?'):
- # The PI is of XML style (<?as df?>) but BeautifulSoup
- # interpreted it as being SGML style (<?as df>). Fix.
- bs_node = bs_node[:-1]
- res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
- if parent is not None:
- parent.append(res)
- return res
-
- @converter(NavigableString)
- def convert_text(bs_node, parent):
- if parent is not None:
- append_text(parent, unescape(bs_node))
- return None
-
- return convert_node
-
-
-# copied from ET's ElementSoup
-
-try:
- from html.entities import name2codepoint # Python 3
-except ImportError:
- from htmlentitydefs import name2codepoint
-
-
-handle_entities = re.compile(r"&(\w+);").sub
-
-
-try:
- unichr
-except NameError:
- # Python 3
- unichr = chr
-
-
-def unescape(string):
- if not string:
- return ''
- # work around oddities in BeautifulSoup's entity handling
- def unescape_entity(m):
- try:
- return unichr(name2codepoint[m.group(1)])
- except KeyError:
- return m.group(0) # use as is
- return handle_entities(unescape_entity, string)
diff --git a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py b/env/lib/python3.10/site-packages/lxml/html/usedoctest.py
deleted file mode 100644
index f352a1c..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/usedoctest.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""Doctest module for HTML comparison.
-
-Usage::
-
- >>> import lxml.html.usedoctest
- >>> # now do your HTML doctests ...
-
-See `lxml.doctestcompare`.
-"""
-
-from lxml import doctestcompare
-
-doctestcompare.temp_install(html=True, del_module=__name__)