path: root/env/lib/python3.10/site-packages/lxml/html/diff.py
author    Biswakalyan Bhuyan <biswa@surgot.in>    2022-11-13 23:46:45 +0530
committer Biswakalyan Bhuyan <biswa@surgot.in>    2022-11-13 23:46:45 +0530
commit   9468226a9e2e2ab8cdd599f1d8538e860ca86120 (patch)
tree     0a77ada226d6db80639f96b438bf83e4e756edb5 /env/lib/python3.10/site-packages/lxml/html/diff.py
download idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.tar.gz
         idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.tar.bz2
         idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.zip
id card generator
Diffstat (limited to 'env/lib/python3.10/site-packages/lxml/html/diff.py')
-rw-r--r--  env/lib/python3.10/site-packages/lxml/html/diff.py  884
1 file changed, 884 insertions, 0 deletions
diff --git a/env/lib/python3.10/site-packages/lxml/html/diff.py b/env/lib/python3.10/site-packages/lxml/html/diff.py
new file mode 100644
index 0000000..39bec78
--- /dev/null
+++ b/env/lib/python3.10/site-packages/lxml/html/diff.py
@@ -0,0 +1,884 @@
+# cython: language_level=3
+
+from __future__ import absolute_import
+
+import difflib
+from lxml import etree
+from lxml.html import fragment_fromstring
+import re
+
+__all__ = ['html_annotate', 'htmldiff']
+
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+try:
+ _unicode = unicode
+except NameError:
+ # Python 3
+ _unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+############################################################
+## Annotation
+############################################################
+
+def default_markup(text, version):
+ return '<span title="%s">%s</span>' % (
+ html_escape(_unicode(version), 1), text)
+
+def html_annotate(doclist, markup=default_markup):
+ """
+ doclist should be ordered from oldest to newest, like::
+
+ >>> version1 = 'Hello World'
+ >>> version2 = 'Goodbye World'
+ >>> print(html_annotate([(version1, 'version 1'),
+ ... (version2, 'version 2')]))
+ <span title="version 2">Goodbye</span> <span title="version 1">World</span>
+
+ The documents must be *fragments* (str/UTF8 or unicode), not
+ complete documents
+
+ The markup argument is a function to markup the spans of words.
+ This function is called like markup('Hello', 'version 2'), and
+ returns HTML. The first argument is text and never includes any
+ markup. The default uses a span with a title:
+
+ >>> print(default_markup('Some Text', 'by Joe'))
+ <span title="by Joe">Some Text</span>
+ """
+ # The basic strategy we have is to split the documents up into
+ # logical tokens (which are words with attached markup). We then
+ # do diffs of each of the versions to track when a token first
+ # appeared in the document; the annotation attached to the token
+ # is the version where it first appeared.
+ tokenlist = [tokenize_annotated(doc, version)
+ for doc, version in doclist]
+ cur_tokens = tokenlist[0]
+ for tokens in tokenlist[1:]:
+ html_annotate_merge_annotations(cur_tokens, tokens)
+ cur_tokens = tokens
+
+ # After we've tracked all the tokens, we can combine spans of text
+ # that are adjacent and have the same annotation
+ cur_tokens = compress_tokens(cur_tokens)
+ # And finally add markup
+ result = markup_serialize_tokens(cur_tokens, markup)
+ return ''.join(result).strip()
+
+def tokenize_annotated(doc, annotation):
+ """Tokenize a document and add an annotation attribute to each token
+ """
+ tokens = tokenize(doc, include_hrefs=False)
+ for tok in tokens:
+ tok.annotation = annotation
+ return tokens
+
+def html_annotate_merge_annotations(tokens_old, tokens_new):
+ """Merge the annotations from tokens_old into tokens_new, when the
+ tokens in the new document already existed in the old document.
+ """
+ s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
+ commands = s.get_opcodes()
+
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ eq_old = tokens_old[i1:i2]
+ eq_new = tokens_new[j1:j2]
+ copy_annotations(eq_old, eq_new)
+
+def copy_annotations(src, dest):
+ """
+ Copy annotations from the tokens listed in src to the tokens in dest
+ """
+ assert len(src) == len(dest)
+ for src_tok, dest_tok in zip(src, dest):
+ dest_tok.annotation = src_tok.annotation
+
+def compress_tokens(tokens):
+ """
+ Combine adjacent tokens when there is no HTML between the tokens,
+ and they share an annotation
+ """
+ result = [tokens[0]]
+ for tok in tokens[1:]:
+ if (not result[-1].post_tags and
+ not tok.pre_tags and
+ result[-1].annotation == tok.annotation):
+ compress_merge_back(result, tok)
+ else:
+ result.append(tok)
+ return result
+
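+# Illustrative note (editorial, not part of the upstream module): given the
+# annotated tokens for "Hello World", where both words carry the same
+# annotation and no markup sits between them, compress_tokens() is expected
+# to collapse them into a single token whose text is roughly "Hello World":
+#
+#     toks = tokenize_annotated('Hello World', 'v1')
+#     compressed = compress_tokens(toks)   # likely one merged token
+#
+# The merge itself is done by compress_merge_back() below, which keeps the
+# earlier token's pre_tags and annotation and the later token's post_tags.
+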
+def compress_merge_back(tokens, tok):
+ """ Merge tok into the last element of tokens (modifying the list of
+ tokens in-place). """
+ last = tokens[-1]
+ if type(last) is not token or type(tok) is not token:
+ tokens.append(tok)
+ else:
+ text = _unicode(last)
+ if last.trailing_whitespace:
+ text += last.trailing_whitespace
+ text += tok
+ merged = token(text,
+ pre_tags=last.pre_tags,
+ post_tags=tok.post_tags,
+ trailing_whitespace=tok.trailing_whitespace)
+ merged.annotation = last.annotation
+ tokens[-1] = merged
+
+def markup_serialize_tokens(tokens, markup_func):
+ """
+ Serialize the list of tokens into a list of text chunks, calling
+ markup_func around text to add annotations.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ html = token.html()
+ html = markup_func(html, token.annotation)
+ if token.trailing_whitespace:
+ html += token.trailing_whitespace
+ yield html
+ for post in token.post_tags:
+ yield post
+
+
+############################################################
+## HTML Diffs
+############################################################
+
+def htmldiff(old_html, new_html):
+ ## FIXME: this should take parsed documents too, and use their body
+ ## or other content.
+ """ Do a diff of the old and new document. The documents are HTML
+    *fragments* (str/UTF8 or unicode); they are not complete documents
+ (i.e., no <html> tag).
+
+ Returns HTML with <ins> and <del> tags added around the
+ appropriate text.
+
+ Markup is generally ignored, with the markup from new_html
+ preserved, and possibly some markup from old_html (though it is
+ considered acceptable to lose some of the old markup). Only the
+ words in the HTML are diffed. The exception is <img> tags, which
+ are treated like words, and the href attribute of <a> tags, which
+ are noted inside the tag itself when there are changes.
+ """
+ old_html_tokens = tokenize(old_html)
+ new_html_tokens = tokenize(new_html)
+ result = htmldiff_tokens(old_html_tokens, new_html_tokens)
+ result = ''.join(result).strip()
+ return fixup_ins_del_tags(result)
+
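+# Illustrative usage (editorial sketch, not part of the upstream module):
+#
+#     from lxml.html.diff import htmldiff
+#     htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
+#
+# should return something close to
+# '<p><ins>Goodbye</ins> <del>Hello</del> World</p>' -- the changed word is
+# wrapped in <ins>/<del>, the unchanged word is left alone, and the markup
+# of the *new* document is preserved; exact whitespace may differ.
+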
+def htmldiff_tokens(html1_tokens, html2_tokens):
+ """ Does a diff on the tokens themselves, returning a list of text
+ chunks (not tokens).
+ """
+ # There are several passes as we do the differences. The tokens
+ # isolate the portion of the content we care to diff; difflib does
+ # all the actual hard work at that point.
+ #
+ # Then we must create a valid document from pieces of both the old
+ # document and the new document. We generally prefer to take
+ # markup from the new document, and only do a best effort attempt
+ # to keep markup from the old document; anything that we can't
+ # resolve we throw away. Also we try to put the deletes as close
+ # to the location where we think they would have been -- because
+ # we are only keeping the markup from the new document, it can be
+ # fuzzy where in the new document the old text would have gone.
+ # Again we just do a best effort attempt.
+ s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
+ commands = s.get_opcodes()
+ result = []
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
+ continue
+ if command == 'insert' or command == 'replace':
+ ins_tokens = expand_tokens(html2_tokens[j1:j2])
+ merge_insert(ins_tokens, result)
+ if command == 'delete' or command == 'replace':
+ del_tokens = expand_tokens(html1_tokens[i1:i2])
+ merge_delete(del_tokens, result)
+ # If deletes were inserted directly as <del> then we'd have an
+ # invalid document at this point. Instead we put in special
+ # markers, and when the complete diffed document has been created
+ # we try to move the deletes around and resolve any problems.
+ result = cleanup_delete(result)
+
+ return result
+
+def expand_tokens(tokens, equal=False):
+ """Given a list of tokens, return a generator of the chunks of
+ text for the data in the tokens.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ if not equal or not token.hide_when_equal:
+ if token.trailing_whitespace:
+ yield token.html() + token.trailing_whitespace
+ else:
+ yield token.html()
+ for post in token.post_tags:
+ yield post
+
+def merge_insert(ins_chunks, doc):
+ """ doc is the already-handled document (as a list of text chunks);
+ here we add <ins>ins_chunks</ins> to the end of that. """
+ # Though we don't throw away unbalanced_start or unbalanced_end
+ # (we assume there is accompanying markup later or earlier in the
+ # document), we only put <ins> around the balanced portion.
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
+ doc.extend(unbalanced_start)
+ if doc and not doc[-1].endswith(' '):
+ # Fix up the case where the word before the insert didn't end with
+ # a space
+ doc[-1] += ' '
+ doc.append('<ins>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </ins>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</ins> ')
+ doc.extend(unbalanced_end)
+
+# These are sentinels to represent the start and end of a <del>
+# segment, until we do the cleanup phase to turn them into proper
+# markup:
+class DEL_START:
+ pass
+class DEL_END:
+ pass
+
+class NoDeletes(Exception):
+ """ Raised when the document no longer contains any pending deletes
+ (DEL_START/DEL_END) """
+
+def merge_delete(del_chunks, doc):
+ """ Adds the text chunks in del_chunks to the document doc (another
+ list of text chunks) with marker to show it is a delete.
+ cleanup_delete later resolves these markers into <del> tags."""
+ doc.append(DEL_START)
+ doc.extend(del_chunks)
+ doc.append(DEL_END)
+
+def cleanup_delete(chunks):
+ """ Cleans up any DEL_START/DEL_END markers in the document, replacing
+ them with <del></del>. To do this while keeping the document
+ valid, it may need to drop some tags (either start or end tags).
+
+    It may also move the del into adjacent tags to try to place it at a
+    location similar to where it was originally (e.g., moving a delete
+    into the preceding <div> tag, if the del looks like (DEL_START,
+    'Text</div>', DEL_END))."""
+ while 1:
+ # Find a pending DEL_START/DEL_END, splitting the document
+ # into stuff-preceding-DEL_START, stuff-inside, and
+ # stuff-following-DEL_END
+ try:
+ pre_delete, delete, post_delete = split_delete(chunks)
+ except NoDeletes:
+ # Nothing found, we've cleaned up the entire doc
+ break
+ # The stuff-inside-DEL_START/END may not be well balanced
+ # markup. First we figure out what unbalanced portions there are:
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
+ # Then we move the span forward and/or backward based on these
+ # unbalanced portions:
+ locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
+ locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
+ doc = pre_delete
+ if doc and not doc[-1].endswith(' '):
+ # Fix up case where the word before us didn't have a trailing space
+ doc[-1] += ' '
+ doc.append('<del>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </del>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</del> ')
+ doc.extend(post_delete)
+ chunks = doc
+ return chunks
+
+def split_unbalanced(chunks):
+ """Return (unbalanced_start, balanced, unbalanced_end), where each is
+ a list of text and tag chunks.
+
+ unbalanced_start is a list of all the tags that are opened, but
+ not closed in this span. Similarly, unbalanced_end is a list of
+ tags that are closed but were not opened. Extracting these might
+ mean some reordering of the chunks."""
+ start = []
+ end = []
+ tag_stack = []
+ balanced = []
+ for chunk in chunks:
+ if not chunk.startswith('<'):
+ balanced.append(chunk)
+ continue
+ endtag = chunk[1] == '/'
+ name = chunk.split()[0].strip('<>/')
+ if name in empty_tags:
+ balanced.append(chunk)
+ continue
+ if endtag:
+ if tag_stack and tag_stack[-1][0] == name:
+ balanced.append(chunk)
+ name, pos, tag = tag_stack.pop()
+ balanced[pos] = tag
+ elif tag_stack:
+ start.extend([tag for name, pos, tag in tag_stack])
+ tag_stack = []
+ end.append(chunk)
+ else:
+ end.append(chunk)
+ else:
+ tag_stack.append((name, len(balanced), chunk))
+ balanced.append(None)
+ start.extend(
+ [chunk for name, pos, chunk in tag_stack])
+ balanced = [chunk for chunk in balanced if chunk is not None]
+ return start, balanced, end
+
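+# Illustrative example (editorial, not part of the upstream module):
+#
+#     split_unbalanced(['<p>', 'Hi', '</p>', '</div>'])
+#
+# should give ([], ['<p>', 'Hi', '</p>'], ['</div>']) -- the matched
+# <p>...</p> pair stays in the balanced middle, while the stray </div>
+# ends up in unbalanced_end.  An unmatched opening tag would likewise be
+# reported in unbalanced_start.
+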
+def split_delete(chunks):
+ """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
+ stuff_after_DEL_END). Returns the first case found (there may be
+ more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
+ there's no DEL_START found. """
+ try:
+ pos = chunks.index(DEL_START)
+ except ValueError:
+ raise NoDeletes
+ pos2 = chunks.index(DEL_END)
+ return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
+
+def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
+ """ pre_delete and post_delete implicitly point to a place in the
+ document (where the two were split). This moves that point (by
+ popping items from one and pushing them onto the other). It moves
+ the point to try to find a place where unbalanced_start applies.
+
+ As an example::
+
+ >>> unbalanced_start = ['<div>']
+ >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
+ >>> pre, post = doc[:3], doc[3:]
+ >>> pre, post
+ (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
+ >>> locate_unbalanced_start(unbalanced_start, pre, post)
+ >>> pre, post
+ (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
+
+ As you can see, we moved the point so that the dangling <div> that
+ we found will be effectively replaced by the div in the original
+ document. If this doesn't work out, we just throw away
+ unbalanced_start without doing anything.
+ """
+ while 1:
+ if not unbalanced_start:
+ # We have totally succeeded in finding the position
+ break
+ finding = unbalanced_start[0]
+ finding_name = finding.split()[0].strip('<>')
+ if not post_delete:
+ break
+ next = post_delete[0]
+ if next is DEL_START or not next.startswith('<'):
+ # Reached a word, we can't move the delete text forward
+ break
+ if next[1] == '/':
+ # Reached a closing tag, can we go further? Maybe not...
+ break
+ name = next.split()[0].strip('<>')
+ if name == 'ins':
+ # Can't move into an insert
+ break
+ assert name != 'del', (
+ "Unexpected delete tag: %r" % next)
+ if name == finding_name:
+ unbalanced_start.pop(0)
+ pre_delete.append(post_delete.pop(0))
+ else:
+ # Found a tag that doesn't match
+ break
+
+def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
+ """ like locate_unbalanced_start, except handling end tags and
+ possibly moving the point earlier in the document. """
+ while 1:
+ if not unbalanced_end:
+ # Success
+ break
+ finding = unbalanced_end[-1]
+ finding_name = finding.split()[0].strip('<>/')
+ if not pre_delete:
+ break
+ next = pre_delete[-1]
+ if next is DEL_END or not next.startswith('</'):
+ # A word or a start tag
+ break
+ name = next.split()[0].strip('<>/')
+ if name == 'ins' or name == 'del':
+ # Can't move into an insert or delete
+ break
+ if name == finding_name:
+ unbalanced_end.pop()
+ post_delete.insert(0, pre_delete.pop())
+ else:
+ # Found a tag that doesn't match
+ break
+
+class token(_unicode):
+    """ Represents a diffable token, generally a word that is displayed to
+    the user.  Opening tags that come immediately before the word are
+    attached as pre_tags, and closing tags that follow the word are
+    attached as post_tags.  Some exceptions occur when there are empty
+    tags adjacent to a word, so there may be close tags in pre_tags, or
+    open tags in post_tags.
+
+    We also keep track of whether the word was originally followed by
+    whitespace; that whitespace is preserved for output, but it is not
+    part of the token text, so a word with a trailing space still
+    compares equal to the same word without one."""
+
+ # When this is true, the token will be eliminated from the
+ # displayed diff if no change has occurred:
+ hide_when_equal = False
+
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
+ obj = _unicode.__new__(cls, text)
+
+ if pre_tags is not None:
+ obj.pre_tags = pre_tags
+ else:
+ obj.pre_tags = []
+
+ if post_tags is not None:
+ obj.post_tags = post_tags
+ else:
+ obj.post_tags = []
+
+ obj.trailing_whitespace = trailing_whitespace
+
+ return obj
+
+ def __repr__(self):
+ return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+ self.post_tags, self.trailing_whitespace)
+
+ def html(self):
+ return _unicode(self)
+
+class tag_token(token):
+
+ """ Represents a token that is actually a tag. Currently this is just
+ the <img> tag, which takes up visible space just like a word but
+ is only represented in a document by a tag. """
+
+ def __new__(cls, tag, data, html_repr, pre_tags=None,
+ post_tags=None, trailing_whitespace=""):
+        obj = token.__new__(cls, "%s: %s" % (tag, data),
+ pre_tags=pre_tags,
+ post_tags=post_tags,
+ trailing_whitespace=trailing_whitespace)
+ obj.tag = tag
+ obj.data = data
+ obj.html_repr = html_repr
+ return obj
+
+ def __repr__(self):
+ return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
+ self.tag,
+ self.data,
+ self.html_repr,
+ self.pre_tags,
+ self.post_tags,
+ self.trailing_whitespace)
+ def html(self):
+ return self.html_repr
+
+class href_token(token):
+
+ """ Represents the href in an anchor tag. Unlike other words, we only
+ show the href when it changes. """
+
+ hide_when_equal = True
+
+ def html(self):
+ return ' Link: %s' % self
+
+def tokenize(html, include_hrefs=True):
+ """
+    Parses the given HTML and returns token objects (words with attached tags).
+
+ This parses only the content of a page; anything in the head is
+ ignored, and the <head> and <body> elements are themselves
+ optional. The content is then parsed by lxml, which ensures the
+ validity of the resulting parsed document (though lxml may make
+    incorrect guesses when the markup is particularly bad).
+
+ <ins> and <del> tags are also eliminated from the document, as
+ that gets confusing.
+
+ If include_hrefs is true, then the href attribute of <a> tags is
+ included as a special kind of diffable token."""
+ if etree.iselement(html):
+ body_el = html
+ else:
+ body_el = parse_html(html, cleanup=True)
+ # Then we split the document into text chunks for each tag, word, and end tag:
+ chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
+ # Finally re-joining them into token objects:
+ return fixup_chunks(chunks)
+
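+# Rough sketch of the token stream (editorial, not part of the upstream
+# module): tokenize('<p>Hi <a href="x">link</a></p>') should produce
+# roughly three tokens -- 'Hi' (with '<p>' in pre_tags), 'link' (with the
+# <a> start tag in pre_tags), and an href_token for 'x' (with '</a>' and
+# '</p>' in post_tags) -- rather than raw markup strings.
+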
+def parse_html(html, cleanup=True):
+ """
+ Parses an HTML fragment, returning an lxml element. Note that the HTML will be
+ wrapped in a <div> tag that was not in the original document.
+
+ If cleanup is true, make sure there's no <head> or <body>, and get
+ rid of any <ins> and <del> tags.
+ """
+ if cleanup:
+ # This removes any extra markup or structure like <head>:
+ html = cleanup_html(html)
+ return fragment_fromstring(html, create_parent=True)
+
+_body_re = re.compile(r'<body.*?>', re.I|re.S)
+_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
+_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+def cleanup_html(html):
+ """ This 'cleans' the HTML, meaning that any page structure is removed
+    (only the contents of <body> are used, if there is any <body>).
+ Also <ins> and <del> tags are removed. """
+ match = _body_re.search(html)
+ if match:
+ html = html[match.end():]
+ match = _end_body_re.search(html)
+ if match:
+ html = html[:match.start()]
+ html = _ins_del_re.sub('', html)
+ return html
+
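+# Illustrative example (editorial, not part of the upstream module):
+#
+#     cleanup_html('<html><body><p>Hi <ins>new</ins></p></body></html>')
+#
+# should return '<p>Hi new</p>': everything outside <body> is dropped and
+# the <ins>/<del> tags are stripped, leaving their contents in place.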
+
+end_whitespace_re = re.compile(r'[ \t\n\r]$')
+
+def split_trailing_whitespace(word):
+ """
+ This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+ """
+ stripped_length = len(word.rstrip())
+ return word[0:stripped_length], word[stripped_length:]
+
+
+def fixup_chunks(chunks):
+ """
+ This function takes a list of chunks and produces a list of tokens.
+ """
+ tag_accum = []
+ cur_word = None
+ result = []
+ for chunk in chunks:
+ if isinstance(chunk, tuple):
+ if chunk[0] == 'img':
+ src = chunk[1]
+ tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
+ cur_word = tag_token('img', src, html_repr=tag,
+ pre_tags=tag_accum,
+ trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif chunk[0] == 'href':
+ href = chunk[1]
+ cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
+ tag_accum = []
+ result.append(cur_word)
+ continue
+
+ if is_word(chunk):
+ chunk, trailing_whitespace = split_trailing_whitespace(chunk)
+ cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif is_start_tag(chunk):
+ tag_accum.append(chunk)
+
+ elif is_end_tag(chunk):
+ if tag_accum:
+ tag_accum.append(chunk)
+ else:
+ assert cur_word, (
+ "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
+ % (cur_word, result, chunk, chunks))
+ cur_word.post_tags.append(chunk)
+ else:
+ assert False
+
+ if not result:
+ return [token('', pre_tags=tag_accum)]
+ else:
+ result[-1].post_tags.extend(tag_accum)
+
+ return result
+
+
+# All the tags in HTML that don't require end tags:
+empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+block_level_tags = (
+ 'address',
+ 'blockquote',
+ 'center',
+ 'dir',
+ 'div',
+ 'dl',
+ 'fieldset',
+ 'form',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'isindex',
+ 'menu',
+ 'noframes',
+ 'noscript',
+ 'ol',
+ 'p',
+ 'pre',
+ 'table',
+ 'ul',
+ )
+
+block_level_container_tags = (
+ 'dd',
+ 'dt',
+ 'frameset',
+ 'li',
+ 'tbody',
+ 'td',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ )
+
+
+def flatten_el(el, include_hrefs, skip_tag=False):
+ """ Takes an lxml element el, and generates all the text chunks for
+ that tag. Each start tag is a chunk, each word is a chunk, and each
+ end tag is a chunk.
+
+ If skip_tag is true, then the outermost container tag is
+ not returned (just its contents)."""
+ if not skip_tag:
+ if el.tag == 'img':
+ yield ('img', el.get('src'), start_tag(el))
+ else:
+ yield start_tag(el)
+ if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
+ return
+ start_words = split_words(el.text)
+ for word in start_words:
+ yield html_escape(word)
+ for child in el:
+ for item in flatten_el(child, include_hrefs=include_hrefs):
+ yield item
+ if el.tag == 'a' and el.get('href') and include_hrefs:
+ yield ('href', el.get('href'))
+ if not skip_tag:
+ yield end_tag(el)
+ end_words = split_words(el.tail)
+ for word in end_words:
+ yield html_escape(word)
+
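+# Rough sketch of the chunk stream (editorial, not part of the upstream
+# module): for an element parsed from '<p>Hi <b>there</b></p>',
+# flatten_el(el, include_hrefs=False) should yield roughly
+# ['<p>', 'Hi ', '<b>', 'there', '</b>', '</p>'] -- start tags, words with
+# their trailing whitespace attached, and end tags, in document order.
+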
+split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
+
+def split_words(text):
+ """ Splits some text into words. Includes trailing whitespace
+ on each word when appropriate. """
+ if not text or not text.strip():
+ return []
+
+ words = split_words_re.findall(text)
+ return words
+
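+# Illustrative example (editorial, not part of the upstream module):
+#
+#     split_words('some   text\n')
+#
+# should return ['some   ', 'text\n']: each word keeps whatever whitespace
+# follows it, and any leading whitespace is discarded.
+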
+start_whitespace_re = re.compile(r'^[ \t\n\r]')
+
+def start_tag(el):
+ """
+ The text representation of the start tag for a tag.
+ """
+ return '<%s%s>' % (
+ el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
+ for name, value in el.attrib.items()]))
+
+def end_tag(el):
+ """ The text representation of an end tag for a tag. Includes
+ trailing whitespace when appropriate. """
+ if el.tail and start_whitespace_re.search(el.tail):
+ extra = ' '
+ else:
+ extra = ''
+ return '</%s>%s' % (el.tag, extra)
+
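+# Illustrative example (editorial, not part of the upstream module): for an
+# element built from '<b class="x">hi</b>' with no tail text,
+# start_tag(el) should give '<b class="x">' and end_tag(el) should give
+# '</b>'; if the element's tail began with whitespace, end_tag() would
+# append a single trailing space instead.
+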
+def is_word(tok):
+ return not tok.startswith('<')
+
+def is_end_tag(tok):
+ return tok.startswith('</')
+
+def is_start_tag(tok):
+ return tok.startswith('<') and not tok.startswith('</')
+
+def fixup_ins_del_tags(html):
+ """ Given an html string, move any <ins> or <del> tags inside of any
+ block-level elements, e.g. transform <ins><p>word</p></ins> to
+ <p><ins>word</ins></p> """
+ doc = parse_html(html, cleanup=False)
+ _fixup_ins_del_tags(doc)
+ html = serialize_html_fragment(doc, skip_outer=True)
+ return html
+
+def serialize_html_fragment(el, skip_outer=False):
+ """ Serialize a single lxml element as HTML. The serialized form
+    includes the element's tail.
+
+ If skip_outer is true, then don't serialize the outermost tag
+ """
+ assert not isinstance(el, basestring), (
+ "You should pass in an element, not a string like %r" % el)
+ html = etree.tostring(el, method="html", encoding=_unicode)
+ if skip_outer:
+ # Get rid of the extra starting tag:
+ html = html[html.find('>')+1:]
+ # Get rid of the extra end tag:
+ html = html[:html.rfind('<')]
+ return html.strip()
+ else:
+ return html
+
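+# Illustrative example (editorial, not part of the upstream module): given
+# el = parse_html('<b>bold</b> text', cleanup=False), which wraps the
+# fragment in a <div>, serialize_html_fragment(el, skip_outer=True) should
+# return roughly '<b>bold</b> text' -- the wrapper <div> start and end tags
+# are sliced off and the result is stripped of surrounding whitespace.
+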
+def _fixup_ins_del_tags(doc):
+ """fixup_ins_del_tags that works on an lxml document in-place
+ """
+ for tag in ['ins', 'del']:
+ for el in doc.xpath('descendant-or-self::%s' % tag):
+ if not _contains_block_level_tag(el):
+ continue
+ _move_el_inside_block(el, tag=tag)
+ el.drop_tag()
+ #_merge_element_contents(el)
+
+def _contains_block_level_tag(el):
+ """True if the element contains any block-level elements, like <p>, <td>, etc.
+ """
+ if el.tag in block_level_tags or el.tag in block_level_container_tags:
+ return True
+ for child in el:
+ if _contains_block_level_tag(child):
+ return True
+ return False
+
+def _move_el_inside_block(el, tag):
+ """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
+ and moves them inside any block-level tags. """
+ for child in el:
+ if _contains_block_level_tag(child):
+ break
+ else:
+ # No block-level tags in any child
+ children_tag = etree.Element(tag)
+ children_tag.text = el.text
+ el.text = None
+ children_tag.extend(list(el))
+ el[:] = [children_tag]
+ return
+ for child in list(el):
+ if _contains_block_level_tag(child):
+ _move_el_inside_block(child, tag)
+ if child.tail:
+ tail_tag = etree.Element(tag)
+ tail_tag.text = child.tail
+ child.tail = None
+ el.insert(el.index(child)+1, tail_tag)
+ else:
+ child_tag = etree.Element(tag)
+ el.replace(child, child_tag)
+ child_tag.append(child)
+ if el.text:
+ text_tag = etree.Element(tag)
+ text_tag.text = el.text
+ el.text = None
+ el.insert(0, text_tag)
+
+def _merge_element_contents(el):
+ """
+ Removes an element, but merges its contents into its place, e.g.,
+ given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
+ <p>Hi there!</p>
+ """
+ parent = el.getparent()
+ text = el.text or ''
+ if el.tail:
+ if not len(el):
+ text += el.tail
+ else:
+ if el[-1].tail:
+ el[-1].tail += el.tail
+ else:
+ el[-1].tail = el.tail
+ index = parent.index(el)
+ if text:
+ if index == 0:
+ previous = None
+ else:
+ previous = parent[index-1]
+ if previous is None:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+ else:
+ if previous.tail:
+ previous.tail += text
+ else:
+ previous.tail = text
+ parent[index:index+1] = el.getchildren()
+
+class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+ """
+ Acts like SequenceMatcher, but tries not to find very small equal
+ blocks amidst large spans of changes
+ """
+
+ threshold = 2
+
+ def get_matching_blocks(self):
+        size = min(len(self.a), len(self.b))
+ threshold = min(self.threshold, size / 4)
+ actual = difflib.SequenceMatcher.get_matching_blocks(self)
+ return [item for item in actual
+ if item[2] > threshold
+ or not item[2]]
+
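+# Editorial note (not part of the upstream module): for reasonably long
+# inputs the effective threshold is 2, so "equal" runs of only one or two
+# tokens amid larger changed regions are dropped from the matching blocks
+# and reported as changed, instead of producing tiny <ins>/<del> fragments
+# around them.  The zero-length sentinel block that SequenceMatcher always
+# appends is kept so opcode generation still terminates cleanly.
+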
+if __name__ == '__main__':
+ from lxml.html import _diffcommand
+ _diffcommand.main()
+