diff options
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html/soupparser.py')
-rw-r--r-- | env/lib/python3.10/site-packages/lxml/html/soupparser.py | 314 |
1 files changed, 0 insertions, 314 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/soupparser.py b/env/lib/python3.10/site-packages/lxml/html/soupparser.py deleted file mode 100644 index e0cf3a0..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/soupparser.py +++ /dev/null @@ -1,314 +0,0 @@ -"""External interface to the BeautifulSoup HTML parser. -""" - -__all__ = ["fromstring", "parse", "convert_tree"] - -import re -from lxml import etree, html - -try: - from bs4 import ( - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, - Declaration, Doctype) - _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) -except ImportError: - from BeautifulSoup import ( - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, - Declaration) - _DECLARATION_OR_DOCTYPE = Declaration - - -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a string of HTML data into an Element tree using the - BeautifulSoup parser. - - Returns the root ``<html>`` Element of the tree. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - return _parse(data, beautifulsoup, makeelement, **bsargs) - - -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a file into an ElemenTree using the BeautifulSoup parser. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - if not hasattr(file, 'read'): - file = open(file) - root = _parse(file, beautifulsoup, makeelement, **bsargs) - return etree.ElementTree(root) - - -def convert_tree(beautiful_soup_tree, makeelement=None): - """Convert a BeautifulSoup tree to a list of Element trees. - - Returns a list instead of a single root Element to support - HTML-like soup with more than one root element. - - You can pass a different Element factory through the `makeelement` - keyword. - """ - root = _convert_tree(beautiful_soup_tree, makeelement) - children = root.getchildren() - for child in children: - root.remove(child) - return children - - -# helpers - -def _parse(source, beautifulsoup, makeelement, **bsargs): - if beautifulsoup is None: - beautifulsoup = BeautifulSoup - if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 - if 'convertEntities' not in bsargs: - bsargs['convertEntities'] = 'html' - if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 - if 'features' not in bsargs: - bsargs['features'] = 'html.parser' # use Python html parser - tree = beautifulsoup(source, **bsargs) - root = _convert_tree(tree, makeelement) - # from ET: wrap the document in a html root element, if necessary - if len(root) == 1 and root[0].tag == "html": - return root[0] - root.tag = "html" - return root - - -_parse_doctype_declaration = re.compile( - r'(?:\s|[<!])*DOCTYPE\s*HTML' - r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' - r'(?:\s+(\'[^\']*\'|"[^"]*"))?', - re.IGNORECASE).match - - -class _PseudoTag: - # Minimal imitation of BeautifulSoup.Tag - def __init__(self, contents): - self.name = 'html' - self.attrs = [] - self.contents = contents - - def __iter__(self): - return self.contents.__iter__() - - -def _convert_tree(beautiful_soup_tree, makeelement): - if makeelement is None: - makeelement = html.html_parser.makeelement - - # Split the tree into three parts: - # i) everything before the root element: document type - # declaration, comments, processing instructions, whitespace - # ii) the root(s), - # iii) everything after the root: comments, processing - # instructions, whitespace - first_element_idx = last_element_idx = None - html_root = declaration = None - for i, e in enumerate(beautiful_soup_tree): - if isinstance(e, Tag): - if first_element_idx is None: - first_element_idx = i - last_element_idx = i - if html_root is None and e.name and e.name.lower() == 'html': - html_root = e - elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): - declaration = e - - # For a nice, well-formatted document, the variable roots below is - # a list consisting of a single <html> element. However, the document - # may be a soup like '<meta><head><title>Hello</head><body>Hi - # all<\p>'. In this example roots is a list containing meta, head - # and body elements. - if first_element_idx is None: - pre_root = post_root = [] - roots = beautiful_soup_tree.contents - else: - pre_root = beautiful_soup_tree.contents[:first_element_idx] - roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] - post_root = beautiful_soup_tree.contents[last_element_idx+1:] - - # Reorganize so that there is one <html> root... - if html_root is not None: - # ... use existing one if possible, ... - i = roots.index(html_root) - html_root.contents = roots[:i] + html_root.contents + roots[i+1:] - else: - # ... otherwise create a new one. - html_root = _PseudoTag(roots) - - convert_node = _init_node_converters(makeelement) - - # Process pre_root - res_root = convert_node(html_root) - prev = res_root - for e in reversed(pre_root): - converted = convert_node(e) - if converted is not None: - prev.addprevious(converted) - prev = converted - - # ditto for post_root - prev = res_root - for e in post_root: - converted = convert_node(e) - if converted is not None: - prev.addnext(converted) - prev = converted - - if declaration is not None: - try: - # bs4 provides full Doctype string - doctype_string = declaration.output_ready() - except AttributeError: - doctype_string = declaration.string - - match = _parse_doctype_declaration(doctype_string) - if not match: - # Something is wrong if we end up in here. Since soupparser should - # tolerate errors, do not raise Exception, just let it pass. - pass - else: - external_id, sys_uri = match.groups() - docinfo = res_root.getroottree().docinfo - # strip quotes and update DOCTYPE values (any of None, '', '...') - docinfo.public_id = external_id and external_id[1:-1] - docinfo.system_url = sys_uri and sys_uri[1:-1] - - return res_root - - -def _init_node_converters(makeelement): - converters = {} - ordered_node_types = [] - - def converter(*types): - def add(handler): - for t in types: - converters[t] = handler - ordered_node_types.append(t) - return handler - return add - - def find_best_converter(node): - for t in ordered_node_types: - if isinstance(node, t): - return converters[t] - return None - - def convert_node(bs_node, parent=None): - # duplicated in convert_tag() below - try: - handler = converters[type(bs_node)] - except KeyError: - handler = converters[type(bs_node)] = find_best_converter(bs_node) - if handler is None: - return None - return handler(bs_node, parent) - - def map_attrs(bs_attrs): - if isinstance(bs_attrs, dict): # bs4 - attribs = {} - for k, v in bs_attrs.items(): - if isinstance(v, list): - v = " ".join(v) - attribs[k] = unescape(v) - else: - attribs = dict((k, unescape(v)) for k, v in bs_attrs) - return attribs - - def append_text(parent, text): - if len(parent) == 0: - parent.text = (parent.text or '') + text - else: - parent[-1].tail = (parent[-1].tail or '') + text - - # converters are tried in order of their definition - - @converter(Tag, _PseudoTag) - def convert_tag(bs_node, parent): - attrs = bs_node.attrs - if parent is not None: - attribs = map_attrs(attrs) if attrs else None - res = etree.SubElement(parent, bs_node.name, attrib=attribs) - else: - attribs = map_attrs(attrs) if attrs else {} - res = makeelement(bs_node.name, attrib=attribs) - - for child in bs_node: - # avoid double recursion by inlining convert_node(), see above - try: - handler = converters[type(child)] - except KeyError: - pass - else: - if handler is not None: - handler(child, res) - continue - convert_node(child, res) - return res - - @converter(Comment) - def convert_comment(bs_node, parent): - res = html.HtmlComment(bs_node) - if parent is not None: - parent.append(res) - return res - - @converter(ProcessingInstruction) - def convert_pi(bs_node, parent): - if bs_node.endswith('?'): - # The PI is of XML style (<?as df?>) but BeautifulSoup - # interpreted it as being SGML style (<?as df>). Fix. - bs_node = bs_node[:-1] - res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) - if parent is not None: - parent.append(res) - return res - - @converter(NavigableString) - def convert_text(bs_node, parent): - if parent is not None: - append_text(parent, unescape(bs_node)) - return None - - return convert_node - - -# copied from ET's ElementSoup - -try: - from html.entities import name2codepoint # Python 3 -except ImportError: - from htmlentitydefs import name2codepoint - - -handle_entities = re.compile(r"&(\w+);").sub - - -try: - unichr -except NameError: - # Python 3 - unichr = chr - - -def unescape(string): - if not string: - return '' - # work around oddities in BeautifulSoup's entity handling - def unescape_entity(m): - try: - return unichr(name2codepoint[m.group(1)]) - except KeyError: - return m.group(0) # use as is - return handle_entities(unescape_entity, string) |