From 9468226a9e2e2ab8cdd599f1d8538e860ca86120 Mon Sep 17 00:00:00 2001 From: Biswakalyan Bhuyan Date: Sun, 13 Nov 2022 23:46:45 +0530 Subject: id card generator --- .../python3.10/site-packages/lxml/html/clean.py | 786 +++++++++++++++++++++ 1 file changed, 786 insertions(+) create mode 100644 env/lib/python3.10/site-packages/lxml/html/clean.py (limited to 'env/lib/python3.10/site-packages/lxml/html/clean.py') diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py new file mode 100644 index 0000000..e6b0543 --- /dev/null +++ b/env/lib/python3.10/site-packages/lxml/html/clean.py @@ -0,0 +1,786 @@ +# cython: language_level=3str + +"""A cleanup tool for HTML. + +Removes unwanted tags and content. See the `Cleaner` class for +details. +""" + +from __future__ import absolute_import + +import copy +import re +import sys +try: + from urlparse import urlsplit + from urllib import unquote_plus +except ImportError: + # Python 3 + from urllib.parse import urlsplit, unquote_plus +from lxml import etree +from lxml.html import defs +from lxml.html import fromstring, XHTML_NAMESPACE +from lxml.html import xhtml_to_html, _transform_result + +try: + unichr +except NameError: + # Python 3 + unichr = chr +try: + unicode +except NameError: + # Python 3 + unicode = str +try: + basestring +except NameError: + basestring = (str, bytes) + + +__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', + 'word_break', 'word_break_html'] + +# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# Particularly the CSS cleaning; most of the tag cleaning is integrated now +# I have multiple kinds of schemes searched; but should schemes be +# whitelisted instead? +# max height? +# remove images? Also in CSS? background attribute? +# Some way to whitelist object, iframe, etc (e.g., if you want to +# allow *just* embedded YouTube movies) +# Log what was deleted and why? +# style="behavior: ..." might be bad in IE? +# Should we have something for just ? That's the worst of the +# metas. +# UTF-7 detections? Example: +# +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- +# you don't always have to have the charset set, if the page has no charset +# and there's UTF7-like code in it. +# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php + + +# This is an IE-specific construct you can have in a stylesheet to +# run some Javascript: +_replace_css_javascript = re.compile( + r'expression\s*\(.*?\)', re.S|re.I).sub + +# Do I have to worry about @\nimport? +_replace_css_import = re.compile( + r'@\s*import', re.I).sub + +_looks_like_tag_content = re.compile( + r'= 3 else ())).search + +# All kinds of schemes besides just javascript: that can cause +# execution: +_find_image_dataurls = re.compile( + r'data:image/(.+);base64,', re.I).findall +_possibly_malicious_schemes = re.compile( + r'(javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).findall +# SVG images can contain script content +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search + +def _has_javascript_scheme(s): + safe_image_urls = 0 + for image_type in _find_image_dataurls(s): + if _is_unsafe_image_type(image_type): + return True + safe_image_urls += 1 + return len(_possibly_malicious_schemes(s)) > safe_image_urls + +_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub + +# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx +_conditional_comment_re = re.compile( + r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) + +_find_styled_elements = etree.XPath( + "descendant-or-self::*[@style]") + +_find_external_links = etree.XPath( + ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" + "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), + namespaces={'x':XHTML_NAMESPACE}) + + +class Cleaner(object): + """ + Instances cleans the document of each of the possible offending + elements. The cleaning is controlled by attributes; you can + override attributes in a subclass, or set them in the constructor. + + ``scripts``: + Removes any ``