From d47f8b48935d258f4c5c3e2267911753bebd5214 Mon Sep 17 00:00:00 2001 From: Biswakalyan Bhuyan Date: Mon, 14 Nov 2022 16:43:12 +0530 Subject: id card --- .../python3.10/site-packages/lxml/html/clean.py | 786 --------------------- 1 file changed, 786 deletions(-) delete mode 100644 env/lib/python3.10/site-packages/lxml/html/clean.py (limited to 'env/lib/python3.10/site-packages/lxml/html/clean.py') diff --git a/env/lib/python3.10/site-packages/lxml/html/clean.py b/env/lib/python3.10/site-packages/lxml/html/clean.py deleted file mode 100644 index e6b0543..0000000 --- a/env/lib/python3.10/site-packages/lxml/html/clean.py +++ /dev/null @@ -1,786 +0,0 @@ -# cython: language_level=3str - -"""A cleanup tool for HTML. - -Removes unwanted tags and content. See the `Cleaner` class for -details. -""" - -from __future__ import absolute_import - -import copy -import re -import sys -try: - from urlparse import urlsplit - from urllib import unquote_plus -except ImportError: - # Python 3 - from urllib.parse import urlsplit, unquote_plus -from lxml import etree -from lxml.html import defs -from lxml.html import fromstring, XHTML_NAMESPACE -from lxml.html import xhtml_to_html, _transform_result - -try: - unichr -except NameError: - # Python 3 - unichr = chr -try: - unicode -except NameError: - # Python 3 - unicode = str -try: - basestring -except NameError: - basestring = (str, bytes) - - -__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', - 'word_break', 'word_break_html'] - -# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# Particularly the CSS cleaning; most of the tag cleaning is integrated now -# I have multiple kinds of schemes searched; but should schemes be -# whitelisted instead? -# max height? -# remove images? Also in CSS? background attribute? -# Some way to whitelist object, iframe, etc (e.g., if you want to -# allow *just* embedded YouTube movies) -# Log what was deleted and why? -# style="behavior: ..." might be bad in IE? -# Should we have something for just ? That's the worst of the -# metas. -# UTF-7 detections? Example: -# +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- -# you don't always have to have the charset set, if the page has no charset -# and there's UTF7-like code in it. -# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php - - -# This is an IE-specific construct you can have in a stylesheet to -# run some Javascript: -_replace_css_javascript = re.compile( - r'expression\s*\(.*?\)', re.S|re.I).sub - -# Do I have to worry about @\nimport? -_replace_css_import = re.compile( - r'@\s*import', re.I).sub - -_looks_like_tag_content = re.compile( - r'= 3 else ())).search - -# All kinds of schemes besides just javascript: that can cause -# execution: -_find_image_dataurls = re.compile( - r'data:image/(.+);base64,', re.I).findall -_possibly_malicious_schemes = re.compile( - r'(javascript|jscript|livescript|vbscript|data|about|mocha):', - re.I).findall -# SVG images can contain script content -_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search - -def _has_javascript_scheme(s): - safe_image_urls = 0 - for image_type in _find_image_dataurls(s): - if _is_unsafe_image_type(image_type): - return True - safe_image_urls += 1 - return len(_possibly_malicious_schemes(s)) > safe_image_urls - -_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub - -# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx -_conditional_comment_re = re.compile( - r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) - -_find_styled_elements = etree.XPath( - "descendant-or-self::*[@style]") - -_find_external_links = etree.XPath( - ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" - "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), - namespaces={'x':XHTML_NAMESPACE}) - - -class Cleaner(object): - """ - Instances cleans the document of each of the possible offending - elements. The cleaning is controlled by attributes; you can - override attributes in a subclass, or set them in the constructor. - - ``scripts``: - Removes any ``