Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html/diff.py')
-rw-r--r-- | env/lib/python3.10/site-packages/lxml/html/diff.py | 884 |
1 file changed, 0 insertions, 884 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py
deleted file mode 100644
index 39bec78..0000000
--- a/env/lib/python3.10/site-packages/lxml/html/diff.py
+++ /dev/null
@@ -1,884 +0,0 @@
-# cython: language_level=3
-
-from __future__ import absolute_import
-
-import difflib
-from lxml import etree
-from lxml.html import fragment_fromstring
-import re
-
-__all__ = ['html_annotate', 'htmldiff']
-
-try:
-    from html import escape as html_escape
-except ImportError:
-    from cgi import escape as html_escape
-try:
-    _unicode = unicode
-except NameError:
-    # Python 3
-    _unicode = str
-try:
-    basestring
-except NameError:
-    # Python 3
-    basestring = str
-
-############################################################
-## Annotation
-############################################################
-
-def default_markup(text, version):
-    return '<span title="%s">%s</span>' % (
-        html_escape(_unicode(version), 1), text)
-
-def html_annotate(doclist, markup=default_markup):
-    """
-    doclist should be ordered from oldest to newest, like::
-
-        >>> version1 = 'Hello World'
-        >>> version2 = 'Goodbye World'
-        >>> print(html_annotate([(version1, 'version 1'),
-        ...                      (version2, 'version 2')]))
-        <span title="version 2">Goodbye</span> <span title="version 1">World</span>
-
-    The documents must be *fragments* (str/UTF8 or unicode), not
-    complete documents
-
-    The markup argument is a function to markup the spans of words.
-    This function is called like markup('Hello', 'version 2'), and
-    returns HTML.  The first argument is text and never includes any
-    markup.  The default uses a span with a title:
-
-        >>> print(default_markup('Some Text', 'by Joe'))
-        <span title="by Joe">Some Text</span>
-    """
-    # The basic strategy we have is to split the documents up into
-    # logical tokens (which are words with attached markup).  We then
-    # do diffs of each of the versions to track when a token first
-    # appeared in the document; the annotation attached to the token
-    # is the version where it first appeared.
-    tokenlist = [tokenize_annotated(doc, version)
-                 for doc, version in doclist]
-    cur_tokens = tokenlist[0]
-    for tokens in tokenlist[1:]:
-        html_annotate_merge_annotations(cur_tokens, tokens)
-        cur_tokens = tokens
-
-    # After we've tracked all the tokens, we can combine spans of text
-    # that are adjacent and have the same annotation
-    cur_tokens = compress_tokens(cur_tokens)
-    # And finally add markup
-    result = markup_serialize_tokens(cur_tokens, markup)
-    return ''.join(result).strip()
-
-def tokenize_annotated(doc, annotation):
-    """Tokenize a document and add an annotation attribute to each token
-    """
-    tokens = tokenize(doc, include_hrefs=False)
-    for tok in tokens:
-        tok.annotation = annotation
-    return tokens
-
-def html_annotate_merge_annotations(tokens_old, tokens_new):
-    """Merge the annotations from tokens_old into tokens_new, when the
-    tokens in the new document already existed in the old document.
- """ - s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) - commands = s.get_opcodes() - - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - eq_old = tokens_old[i1:i2] - eq_new = tokens_new[j1:j2] - copy_annotations(eq_old, eq_new) - -def copy_annotations(src, dest): - """ - Copy annotations from the tokens listed in src to the tokens in dest - """ - assert len(src) == len(dest) - for src_tok, dest_tok in zip(src, dest): - dest_tok.annotation = src_tok.annotation - -def compress_tokens(tokens): - """ - Combine adjacent tokens when there is no HTML between the tokens, - and they share an annotation - """ - result = [tokens[0]] - for tok in tokens[1:]: - if (not result[-1].post_tags and - not tok.pre_tags and - result[-1].annotation == tok.annotation): - compress_merge_back(result, tok) - else: - result.append(tok) - return result - -def compress_merge_back(tokens, tok): - """ Merge tok into the last element of tokens (modifying the list of - tokens in-place). """ - last = tokens[-1] - if type(last) is not token or type(tok) is not token: - tokens.append(tok) - else: - text = _unicode(last) - if last.trailing_whitespace: - text += last.trailing_whitespace - text += tok - merged = token(text, - pre_tags=last.pre_tags, - post_tags=tok.post_tags, - trailing_whitespace=tok.trailing_whitespace) - merged.annotation = last.annotation - tokens[-1] = merged - -def markup_serialize_tokens(tokens, markup_func): - """ - Serialize the list of tokens into a list of text chunks, calling - markup_func around text to add annotations. - """ - for token in tokens: - for pre in token.pre_tags: - yield pre - html = token.html() - html = markup_func(html, token.annotation) - if token.trailing_whitespace: - html += token.trailing_whitespace - yield html - for post in token.post_tags: - yield post - - -############################################################ -## HTML Diffs -############################################################ - -def htmldiff(old_html, new_html): - ## FIXME: this should take parsed documents too, and use their body - ## or other content. - """ Do a diff of the old and new document. The documents are HTML - *fragments* (str/UTF8 or unicode), they are not complete documents - (i.e., no <html> tag). - - Returns HTML with <ins> and <del> tags added around the - appropriate text. - - Markup is generally ignored, with the markup from new_html - preserved, and possibly some markup from old_html (though it is - considered acceptable to lose some of the old markup). Only the - words in the HTML are diffed. The exception is <img> tags, which - are treated like words, and the href attribute of <a> tags, which - are noted inside the tag itself when there are changes. - """ - old_html_tokens = tokenize(old_html) - new_html_tokens = tokenize(new_html) - result = htmldiff_tokens(old_html_tokens, new_html_tokens) - result = ''.join(result).strip() - return fixup_ins_del_tags(result) - -def htmldiff_tokens(html1_tokens, html2_tokens): - """ Does a diff on the tokens themselves, returning a list of text - chunks (not tokens). - """ - # There are several passes as we do the differences. The tokens - # isolate the portion of the content we care to diff; difflib does - # all the actual hard work at that point. - # - # Then we must create a valid document from pieces of both the old - # document and the new document. 
-    # markup from the new document, and only do a best effort attempt
-    # to keep markup from the old document; anything that we can't
-    # resolve we throw away.  Also we try to put the deletes as close
-    # to the location where we think they would have been -- because
-    # we are only keeping the markup from the new document, it can be
-    # fuzzy where in the new document the old text would have gone.
-    # Again we just do a best effort attempt.
-    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
-    commands = s.get_opcodes()
-    result = []
-    for command, i1, i2, j1, j2 in commands:
-        if command == 'equal':
-            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
-            continue
-        if command == 'insert' or command == 'replace':
-            ins_tokens = expand_tokens(html2_tokens[j1:j2])
-            merge_insert(ins_tokens, result)
-        if command == 'delete' or command == 'replace':
-            del_tokens = expand_tokens(html1_tokens[i1:i2])
-            merge_delete(del_tokens, result)
-    # If deletes were inserted directly as <del> then we'd have an
-    # invalid document at this point.  Instead we put in special
-    # markers, and when the complete diffed document has been created
-    # we try to move the deletes around and resolve any problems.
-    result = cleanup_delete(result)
-
-    return result
-
-def expand_tokens(tokens, equal=False):
-    """Given a list of tokens, return a generator of the chunks of
-    text for the data in the tokens.
-    """
-    for token in tokens:
-        for pre in token.pre_tags:
-            yield pre
-        if not equal or not token.hide_when_equal:
-            if token.trailing_whitespace:
-                yield token.html() + token.trailing_whitespace
-            else:
-                yield token.html()
-        for post in token.post_tags:
-            yield post
-
-def merge_insert(ins_chunks, doc):
-    """ doc is the already-handled document (as a list of text chunks);
-    here we add <ins>ins_chunks</ins> to the end of that.  """
-    # Though we don't throw away unbalanced_start or unbalanced_end
-    # (we assume there is accompanying markup later or earlier in the
-    # document), we only put <ins> around the balanced portion.
-    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
-    doc.extend(unbalanced_start)
-    if doc and not doc[-1].endswith(' '):
-        # Fix up the case where the word before the insert didn't end with
-        # a space
-        doc[-1] += ' '
-    doc.append('<ins>')
-    if balanced and balanced[-1].endswith(' '):
-        # We move space outside of </ins>
-        balanced[-1] = balanced[-1][:-1]
-    doc.extend(balanced)
-    doc.append('</ins> ')
-    doc.extend(unbalanced_end)
-
-# These are sentinels to represent the start and end of a <del>
-# segment, until we do the cleanup phase to turn them into proper
-# markup:
-class DEL_START:
-    pass
-class DEL_END:
-    pass
-
-class NoDeletes(Exception):
-    """ Raised when the document no longer contains any pending deletes
-    (DEL_START/DEL_END) """
-
-def merge_delete(del_chunks, doc):
-    """ Adds the text chunks in del_chunks to the document doc (another
-    list of text chunks) with marker to show it is a delete.
-    cleanup_delete later resolves these markers into <del> tags."""
-    doc.append(DEL_START)
-    doc.extend(del_chunks)
-    doc.append(DEL_END)
-
-def cleanup_delete(chunks):
-    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
-    them with <del></del>.  To do this while keeping the document
-    valid, it may need to drop some tags (either start or end tags).
-
-    It may also move the del into adjacent tags to try to move it to a
-    similar location where it was originally located (e.g., moving a
-    delete into preceding <div> tag, if the del looks like (DEL_START,
-    'Text</div>', DEL_END)"""
-    while 1:
-        # Find a pending DEL_START/DEL_END, splitting the document
-        # into stuff-preceding-DEL_START, stuff-inside, and
-        # stuff-following-DEL_END
-        try:
-            pre_delete, delete, post_delete = split_delete(chunks)
-        except NoDeletes:
-            # Nothing found, we've cleaned up the entire doc
-            break
-        # The stuff-inside-DEL_START/END may not be well balanced
-        # markup.  First we figure out what unbalanced portions there are:
-        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
-        # Then we move the span forward and/or backward based on these
-        # unbalanced portions:
-        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
-        doc = pre_delete
-        if doc and not doc[-1].endswith(' '):
-            # Fix up case where the word before us didn't have a trailing space
-            doc[-1] += ' '
-        doc.append('<del>')
-        if balanced and balanced[-1].endswith(' '):
-            # We move space outside of </del>
-            balanced[-1] = balanced[-1][:-1]
-        doc.extend(balanced)
-        doc.append('</del> ')
-        doc.extend(post_delete)
-        chunks = doc
-    return chunks
-
-def split_unbalanced(chunks):
-    """Return (unbalanced_start, balanced, unbalanced_end), where each is
-    a list of text and tag chunks.
-
-    unbalanced_start is a list of all the tags that are opened, but
-    not closed in this span.  Similarly, unbalanced_end is a list of
-    tags that are closed but were not opened.  Extracting these might
-    mean some reordering of the chunks."""
-    start = []
-    end = []
-    tag_stack = []
-    balanced = []
-    for chunk in chunks:
-        if not chunk.startswith('<'):
-            balanced.append(chunk)
-            continue
-        endtag = chunk[1] == '/'
-        name = chunk.split()[0].strip('<>/')
-        if name in empty_tags:
-            balanced.append(chunk)
-            continue
-        if endtag:
-            if tag_stack and tag_stack[-1][0] == name:
-                balanced.append(chunk)
-                name, pos, tag = tag_stack.pop()
-                balanced[pos] = tag
-            elif tag_stack:
-                start.extend([tag for name, pos, tag in tag_stack])
-                tag_stack = []
-                end.append(chunk)
-            else:
-                end.append(chunk)
-        else:
-            tag_stack.append((name, len(balanced), chunk))
-            balanced.append(None)
-    start.extend(
-        [chunk for name, pos, chunk in tag_stack])
-    balanced = [chunk for chunk in balanced if chunk is not None]
-    return start, balanced, end
-
-def split_delete(chunks):
-    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
-    stuff_after_DEL_END).  Returns the first case found (there may be
-    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
-    there's no DEL_START found. """
-    try:
-        pos = chunks.index(DEL_START)
-    except ValueError:
-        raise NoDeletes
-    pos2 = chunks.index(DEL_END)
-    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
-
-def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-    """ pre_delete and post_delete implicitly point to a place in the
-    document (where the two were split).  This moves that point (by
-    popping items from one and pushing them onto the other).  It moves
-    the point to try to find a place where unbalanced_start applies.
-
-    As an example::
-
-        >>> unbalanced_start = ['<div>']
-        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
-        >>> pre, post = doc[:3], doc[3:]
-        >>> pre, post
-        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
-        >>> locate_unbalanced_start(unbalanced_start, pre, post)
-        >>> pre, post
-        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
-
-    As you can see, we moved the point so that the dangling <div> that
-    we found will be effectively replaced by the div in the original
-    document.  If this doesn't work out, we just throw away
-    unbalanced_start without doing anything.
-    """
-    while 1:
-        if not unbalanced_start:
-            # We have totally succeeded in finding the position
-            break
-        finding = unbalanced_start[0]
-        finding_name = finding.split()[0].strip('<>')
-        if not post_delete:
-            break
-        next = post_delete[0]
-        if next is DEL_START or not next.startswith('<'):
-            # Reached a word, we can't move the delete text forward
-            break
-        if next[1] == '/':
-            # Reached a closing tag, can we go further?  Maybe not...
-            break
-        name = next.split()[0].strip('<>')
-        if name == 'ins':
-            # Can't move into an insert
-            break
-        assert name != 'del', (
-            "Unexpected delete tag: %r" % next)
-        if name == finding_name:
-            unbalanced_start.pop(0)
-            pre_delete.append(post_delete.pop(0))
-        else:
-            # Found a tag that doesn't match
-            break
-
-def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-    """ like locate_unbalanced_start, except handling end tags and
-    possibly moving the point earlier in the document.  """
-    while 1:
-        if not unbalanced_end:
-            # Success
-            break
-        finding = unbalanced_end[-1]
-        finding_name = finding.split()[0].strip('<>/')
-        if not pre_delete:
-            break
-        next = pre_delete[-1]
-        if next is DEL_END or not next.startswith('</'):
-            # A word or a start tag
-            break
-        name = next.split()[0].strip('<>/')
-        if name == 'ins' or name == 'del':
-            # Can't move into an insert or delete
-            break
-        if name == finding_name:
-            unbalanced_end.pop()
-            post_delete.insert(0, pre_delete.pop())
-        else:
-            # Found a tag that doesn't match
-            break
-
-class token(_unicode):
-    """ Represents a diffable token, generally a word that is displayed to
-    the user.  Opening tags are attached to this token when they are
-    adjacent (pre_tags) and closing tags that follow the word
-    (post_tags).  Some exceptions occur when there are empty tags
-    adjacent to a word, so there may be close tags in pre_tags, or
-    open tags in post_tags.
-
-    We also keep track of whether the word was originally followed by
-    whitespace, even though we do not want to treat the word as
-    equivalent to a similar word that does not have a trailing
-    space."""
-
-    # When this is true, the token will be eliminated from the
-    # displayed diff if no change has occurred:
-    hide_when_equal = False
-
-    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
-        obj = _unicode.__new__(cls, text)
-
-        if pre_tags is not None:
-            obj.pre_tags = pre_tags
-        else:
-            obj.pre_tags = []
-
-        if post_tags is not None:
-            obj.post_tags = post_tags
-        else:
-            obj.post_tags = []
-
-        obj.trailing_whitespace = trailing_whitespace
-
-        return obj
-
-    def __repr__(self):
-        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
-                                          self.post_tags, self.trailing_whitespace)
-
-    def html(self):
-        return _unicode(self)
-
-class tag_token(token):
-
-    """ Represents a token that is actually a tag.  Currently this is just
-    the <img> tag, which takes up visible space just like a word but
-    is only represented in a document by a tag.  """
-
-    def __new__(cls, tag, data, html_repr, pre_tags=None,
-                post_tags=None, trailing_whitespace=""):
-        obj = token.__new__(cls, "%s: %s" % (tag, data),
-                            pre_tags=pre_tags,
-                            post_tags=post_tags,
-                            trailing_whitespace=trailing_whitespace)
-        obj.tag = tag
-        obj.data = data
-        obj.html_repr = html_repr
-        return obj
-
-    def __repr__(self):
-        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
-            self.tag,
-            self.data,
-            self.html_repr,
-            self.pre_tags,
-            self.post_tags,
-            self.trailing_whitespace)
-    def html(self):
-        return self.html_repr
-
-class href_token(token):
-
-    """ Represents the href in an anchor tag.  Unlike other words, we only
-    show the href when it changes.  """
-
-    hide_when_equal = True
-
-    def html(self):
-        return ' Link: %s' % self
-
-def tokenize(html, include_hrefs=True):
-    """
-    Parses the given HTML and returns token objects (words with attached tags).
-
-    This parses only the content of a page; anything in the head is
-    ignored, and the <head> and <body> elements are themselves
-    optional.  The content is then parsed by lxml, which ensures the
-    validity of the resulting parsed document (though lxml may make
-    incorrect guesses when the markup is particularly bad).
-
-    <ins> and <del> tags are also eliminated from the document, as
-    that gets confusing.
-
-    If include_hrefs is true, then the href attribute of <a> tags is
-    included as a special kind of diffable token."""
-    if etree.iselement(html):
-        body_el = html
-    else:
-        body_el = parse_html(html, cleanup=True)
-    # Then we split the document into text chunks for each tag, word, and end tag:
-    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
-    # Finally re-joining them into token objects:
-    return fixup_chunks(chunks)
-
-def parse_html(html, cleanup=True):
-    """
-    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
-    wrapped in a <div> tag that was not in the original document.
-
-    If cleanup is true, make sure there's no <head> or <body>, and get
-    rid of any <ins> and <del> tags.
-    """
-    if cleanup:
-        # This removes any extra markup or structure like <head>:
-        html = cleanup_html(html)
-    return fragment_fromstring(html, create_parent=True)
-
-_body_re = re.compile(r'<body.*?>', re.I|re.S)
-_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
-_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
-
-def cleanup_html(html):
-    """ This 'cleans' the HTML, meaning that any page structure is removed
-    (only the contents of <body> are used, if there is any <body>).
-    Also <ins> and <del> tags are removed.  """
-    match = _body_re.search(html)
-    if match:
-        html = html[match.end():]
-    match = _end_body_re.search(html)
-    if match:
-        html = html[:match.start()]
-    html = _ins_del_re.sub('', html)
-    return html
-
-
-end_whitespace_re = re.compile(r'[ \t\n\r]$')
-
-def split_trailing_whitespace(word):
-    """
-    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
-    """
-    stripped_length = len(word.rstrip())
-    return word[0:stripped_length], word[stripped_length:]
-
-
-def fixup_chunks(chunks):
-    """
-    This function takes a list of chunks and produces a list of tokens.
- """ - tag_accum = [] - cur_word = None - result = [] - for chunk in chunks: - if isinstance(chunk, tuple): - if chunk[0] == 'img': - src = chunk[1] - tag, trailing_whitespace = split_trailing_whitespace(chunk[2]) - cur_word = tag_token('img', src, html_repr=tag, - pre_tags=tag_accum, - trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - - elif chunk[0] == 'href': - href = chunk[1] - cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ") - tag_accum = [] - result.append(cur_word) - continue - - if is_word(chunk): - chunk, trailing_whitespace = split_trailing_whitespace(chunk) - cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - - elif is_start_tag(chunk): - tag_accum.append(chunk) - - elif is_end_tag(chunk): - if tag_accum: - tag_accum.append(chunk) - else: - assert cur_word, ( - "Weird state, cur_word=%r, result=%r, chunks=%r of %r" - % (cur_word, result, chunk, chunks)) - cur_word.post_tags.append(chunk) - else: - assert False - - if not result: - return [token('', pre_tags=tag_accum)] - else: - result[-1].post_tags.extend(tag_accum) - - return result - - -# All the tags in HTML that don't require end tags: -empty_tags = ( - 'param', 'img', 'area', 'br', 'basefont', 'input', - 'base', 'meta', 'link', 'col') - -block_level_tags = ( - 'address', - 'blockquote', - 'center', - 'dir', - 'div', - 'dl', - 'fieldset', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'hr', - 'isindex', - 'menu', - 'noframes', - 'noscript', - 'ol', - 'p', - 'pre', - 'table', - 'ul', - ) - -block_level_container_tags = ( - 'dd', - 'dt', - 'frameset', - 'li', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ) - - -def flatten_el(el, include_hrefs, skip_tag=False): - """ Takes an lxml element el, and generates all the text chunks for - that tag. Each start tag is a chunk, each word is a chunk, and each - end tag is a chunk. - - If skip_tag is true, then the outermost container tag is - not returned (just its contents).""" - if not skip_tag: - if el.tag == 'img': - yield ('img', el.get('src'), start_tag(el)) - else: - yield start_tag(el) - if el.tag in empty_tags and not el.text and not len(el) and not el.tail: - return - start_words = split_words(el.text) - for word in start_words: - yield html_escape(word) - for child in el: - for item in flatten_el(child, include_hrefs=include_hrefs): - yield item - if el.tag == 'a' and el.get('href') and include_hrefs: - yield ('href', el.get('href')) - if not skip_tag: - yield end_tag(el) - end_words = split_words(el.tail) - for word in end_words: - yield html_escape(word) - -split_words_re = re.compile(r'\S+(?:\s+|$)', re.U) - -def split_words(text): - """ Splits some text into words. Includes trailing whitespace - on each word when appropriate. """ - if not text or not text.strip(): - return [] - - words = split_words_re.findall(text) - return words - -start_whitespace_re = re.compile(r'^[ \t\n\r]') - -def start_tag(el): - """ - The text representation of the start tag for a tag. - """ - return '<%s%s>' % ( - el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True)) - for name, value in el.attrib.items()])) - -def end_tag(el): - """ The text representation of an end tag for a tag. Includes - trailing whitespace when appropriate. 
""" - if el.tail and start_whitespace_re.search(el.tail): - extra = ' ' - else: - extra = '' - return '</%s>%s' % (el.tag, extra) - -def is_word(tok): - return not tok.startswith('<') - -def is_end_tag(tok): - return tok.startswith('</') - -def is_start_tag(tok): - return tok.startswith('<') and not tok.startswith('</') - -def fixup_ins_del_tags(html): - """ Given an html string, move any <ins> or <del> tags inside of any - block-level elements, e.g. transform <ins><p>word</p></ins> to - <p><ins>word</ins></p> """ - doc = parse_html(html, cleanup=False) - _fixup_ins_del_tags(doc) - html = serialize_html_fragment(doc, skip_outer=True) - return html - -def serialize_html_fragment(el, skip_outer=False): - """ Serialize a single lxml element as HTML. The serialized form - includes the elements tail. - - If skip_outer is true, then don't serialize the outermost tag - """ - assert not isinstance(el, basestring), ( - "You should pass in an element, not a string like %r" % el) - html = etree.tostring(el, method="html", encoding=_unicode) - if skip_outer: - # Get rid of the extra starting tag: - html = html[html.find('>')+1:] - # Get rid of the extra end tag: - html = html[:html.rfind('<')] - return html.strip() - else: - return html - -def _fixup_ins_del_tags(doc): - """fixup_ins_del_tags that works on an lxml document in-place - """ - for tag in ['ins', 'del']: - for el in doc.xpath('descendant-or-self::%s' % tag): - if not _contains_block_level_tag(el): - continue - _move_el_inside_block(el, tag=tag) - el.drop_tag() - #_merge_element_contents(el) - -def _contains_block_level_tag(el): - """True if the element contains any block-level elements, like <p>, <td>, etc. - """ - if el.tag in block_level_tags or el.tag in block_level_container_tags: - return True - for child in el: - if _contains_block_level_tag(child): - return True - return False - -def _move_el_inside_block(el, tag): - """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags - and moves them inside any block-level tags. 
""" - for child in el: - if _contains_block_level_tag(child): - break - else: - # No block-level tags in any child - children_tag = etree.Element(tag) - children_tag.text = el.text - el.text = None - children_tag.extend(list(el)) - el[:] = [children_tag] - return - for child in list(el): - if _contains_block_level_tag(child): - _move_el_inside_block(child, tag) - if child.tail: - tail_tag = etree.Element(tag) - tail_tag.text = child.tail - child.tail = None - el.insert(el.index(child)+1, tail_tag) - else: - child_tag = etree.Element(tag) - el.replace(child, child_tag) - child_tag.append(child) - if el.text: - text_tag = etree.Element(tag) - text_tag.text = el.text - el.text = None - el.insert(0, text_tag) - -def _merge_element_contents(el): - """ - Removes an element, but merges its contents into its place, e.g., - given <p>Hi <i>there!</i></p>, if you remove the <i> element you get - <p>Hi there!</p> - """ - parent = el.getparent() - text = el.text or '' - if el.tail: - if not len(el): - text += el.tail - else: - if el[-1].tail: - el[-1].tail += el.tail - else: - el[-1].tail = el.tail - index = parent.index(el) - if text: - if index == 0: - previous = None - else: - previous = parent[index-1] - if previous is None: - if parent.text: - parent.text += text - else: - parent.text = text - else: - if previous.tail: - previous.tail += text - else: - previous.tail = text - parent[index:index+1] = el.getchildren() - -class InsensitiveSequenceMatcher(difflib.SequenceMatcher): - """ - Acts like SequenceMatcher, but tries not to find very small equal - blocks amidst large spans of changes - """ - - threshold = 2 - - def get_matching_blocks(self): - size = min(len(self.b), len(self.b)) - threshold = min(self.threshold, size / 4) - actual = difflib.SequenceMatcher.get_matching_blocks(self) - return [item for item in actual - if item[2] > threshold - or not item[2]] - -if __name__ == '__main__': - from lxml.html import _diffcommand - _diffcommand.main() - |