env/lib/python3.10/site-packages/lxml/doctestcompare.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507

"""
lxml-based doctest output comparison.

Note: normally, you should just import the `lxml.usedoctest` and
`lxml.html.usedoctest` modules from within a doctest, instead of this
one::

    >>> import lxml.usedoctest # for XML output

    >>> import lxml.html.usedoctest # for HTML output

To use this module directly, you must call ``lxmldoctest.install()``,
which will cause doctest to use this in all subsequent calls.

This changes the way output is checked and comparisons are made for
XML or HTML-like content.

XML or HTML content is noticed because the example starts with ``<``
(it's HTML if it starts with ``<html``).  You can also use the
``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.

Some rough wildcard-like things are allowed.  Whitespace is generally
ignored (except in attributes).  In text (attributes and text in the
body) you can use ``...`` as a wildcard.  In an example it also
matches any trailing tags in the element, though it does not match
leading tags.  You may create a tag ``<any>`` or include an ``any``
attribute in the tag.  An ``any`` tag matches any tag, while the
attribute matches any and all attributes.

When a match fails, the reformatted example and gotten text is
displayed (indented), and a rough diff-like output is given.  Anything
marked with ``+`` is in the output but wasn't supposed to be, and
similarly ``-`` means its in the example but wasn't in the output.

You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
"""

from lxml import etree
import sys
import re
import doctest
try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape

__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

try:
    _basestring = basestring
except NameError:
    _basestring = (str, bytes)

_IS_PYTHON_3 = sys.version_info[0] >= 3

PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

OutputChecker = doctest.OutputChecker

def strip(v):
    if v is None:
        return None
    else:
        return v.strip()

def norm_whitespace(v):
    return _norm_whitespace_re.sub(' ', v)

_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)

def html_fromstring(html):
    return etree.fromstring(html, _html_parser)

# We use this to distinguish repr()s from elements:
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')

class LXMLOutputChecker(OutputChecker):

    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        return etree.XML

    def check_output(self, want, got, optionflags):
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        parser = None
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            parser = html_fromstring
        elif PARSE_XML & optionflags:
            parser = etree.XML
        elif (want.strip().lower().startswith('<html')
              and got.strip().startswith('<html')):
            parser = html_fromstring
        elif (self._looks_like_markup(want)
              and self._looks_like_markup(got)):
            parser = self.get_default_parser()
        return parser

    def _looks_like_markup(self, s):
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                if not got_children and want_first.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        want = '^%s$' % re.escape(want)
        want = want.replace(r'\.\.\.', '.*')
        if re.search(want, got):
            return True
        else:
            return False

    def tag_compare(self, want, got):
        if want == 'any':
            return True
        if (not isinstance(want, _basestring)
            or not isinstance(got, _basestring)):
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace
            return want.split('}')[-1] == got.split('}')[-1]
        else:
            return want == got

    def output_difference(self, example, got, optionflags):
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            else:
                return value
        html = parser is html_fromstring
        diff_parts = ['Expected:',
                      self.format_doc(want_doc, html, 2),
                      'Got:',
                      self.format_doc(got_doc, html, 2),
                      'Diff:',
                      self.collect_diff(want_doc, got_doc, html, 2)]
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        parts = []
        if not len(doc):
            # No children...
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        if text is None:
            return ''
        if strip:
            text = text.strip()
        return html_escape(text, 1)

    def format_tag(self, el):
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))
    
    def format_end_tag(self, el):
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not any:
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        if want.tag != got.tag:
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)

class LHTMLOutputChecker(LXMLOutputChecker):
    def get_default_parser(self):
        return html_fromstring
    
def install(html=False):
    """
    Install doctestcompare for all future doctests.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    if html:
        doctest.OutputChecker = LHTMLOutputChecker
    else:
        doctest.OutputChecker = LXMLOutputChecker

def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the func_code
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    if _IS_PYTHON_3:
        check_func = frame.f_locals['check'].__func__
        checker_check_func = checker.check_output.__func__
    else:
        check_func = frame.f_locals['check'].im_func
        checker_check_func = checker.check_output.im_func
    # Because we can't patch up func_globals, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker_check_func,
                    del_module)

class _RestoreChecker(object):
    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()
    def install_clone(self):
        if _IS_PYTHON_3:
            self.func_code = self.check_func.__code__
            self.func_globals = self.check_func.__globals__
            self.check_func.__code__ = self.clone_func.__code__
        else:
            self.func_code = self.check_func.func_code
            self.func_globals = self.check_func.func_globals
            self.check_func.func_code = self.clone_func.func_code
    def uninstall_clone(self):
        if _IS_PYTHON_3:
            self.check_func.__code__ = self.func_code
        else:
            self.check_func.func_code = self.func_code
    def install_dt_self(self):
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self
    def uninstall_dt_self(self):
        self.dt_self._DocTestRunner__record_outcome = self.prev_func
    def uninstall_module(self):
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)
    def __call__(self, *args, **kw):
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result
    def call_super(self, *args, **kw):
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
            
def _find_doctest_frame():
    import sys
    frame = sys._getframe(1)
    while frame:
        l = frame.f_locals
        if 'BOOM' in l:
            # Sign of doctest
            return frame
        frame = frame.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
    
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print """<xml a="1" b="2">stuff</xml>"""
    <xml b="2" a="1">...</xml>
    >>> print """<xml xmlns="http://example.com"><tag   attr="bar"   /></xml>"""
    <xml xmlns="...">
      <tag attr="..." />
    </xml>
    >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()