147 lines
4.1 KiB
Python
147 lines
4.1 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import warnings
|
||
|
from pkg_resources import parse_version
|
||
|
|
||
|
from bleach.linkifier import (
|
||
|
DEFAULT_CALLBACKS,
|
||
|
Linker,
|
||
|
)
|
||
|
from bleach.sanitizer import (
|
||
|
ALLOWED_ATTRIBUTES,
|
||
|
ALLOWED_PROTOCOLS,
|
||
|
ALLOWED_STYLES,
|
||
|
ALLOWED_TAGS,
|
||
|
Cleaner,
|
||
|
)
|
||
|
|
||
|
|
||
|
import html5lib

# Probe the installed html5lib version as a list of string components,
# e.g. '1.0.1' -> ['1', '0', '1'].  A single-component version is padded with
# '0' so that index 1 always exists for the check below.
try:
    _html5lib_version = html5lib.__version__.split('.')
    if len(_html5lib_version) < 2:
        _html5lib_version = _html5lib_version + ['0']
except Exception:
    # Best-effort fallback: bind the SAME name the check below reads, so a
    # failed probe does not turn into a NameError at import time.
    _html5lib_version = ['unknown', 'unknown']

# Bleach 3.0.0 won't support html5lib-python < 1.0.0.
# NOTE: this is a lexicographic comparison of string components, not a numeric
# one; a 'b' in the second component is treated as a pre-1.0.0 release.
if _html5lib_version < ['1', '0'] or 'b' in _html5lib_version[1]:
    warnings.warn('Support for html5lib-python < 1.0.0 is deprecated.', DeprecationWarning)
|
||
|
|
||
|
|
||
|
# Release date, formatted as yyyymmdd
__releasedate__ = '20180305'
# Version string following semver: x.y.z or x.y.z.dev0
__version__ = '2.1.3'
# Parsed form of __version__, produced by pkg_resources.parse_version
VERSION = parse_version(__version__)


# Names exported by ``from bleach import *``
__all__ = ['clean', 'linkify']
|
||
|
|
||
|
|
||
|
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned text

    This is a security-focused function: its only job is to remove malicious
    content from a string so that the string can be displayed as content in
    a web page.

    It is not designed for transforming content that will be used in
    non-web-page contexts.

    Example::

        import bleach

        better_text = bleach.clean(yucky_text)


    .. Note::

       Each call builds a new :py:class:`bleach.sanitizer.Cleaner`. If you
       are cleaning a lot of text with the same argument values, or you want
       more configurability, consider using a
       :py:class:`bleach.sanitizer.Cleaner` instance directly.

    :arg str text: the text to clean

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults
        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    # Forward every option unchanged to a one-shot Cleaner.
    options = dict(
        tags=tags,
        attributes=attributes,
        styles=styles,
        protocols=protocols,
        strip=strip,
        strip_comments=strip_comments,
    )
    return Cleaner(**options).clean(text)
|
||
|
|
||
|
|
||
|
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Turn URL-like strings in an HTML fragment into links

    Strings that look like URLs, domain names and email addresses, in text
    that may be an HTML fragment, are converted to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations due to crazy text.

    .. Note::

       Each call builds a new :py:class:`bleach.linkifier.Linker`. If you
       are linking a lot of text with the same argument values, or you want
       more configurability, consider using a
       :py:class:`bleach.linkifier.Linker` instance directly.

    .. Note::

       If you have text that you want to clean and then linkify, consider
       using the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in
       the clean pass. That way you're not parsing the HTML twice.

    :arg str text: the text to linkify

    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to skip
        linkifying contents of ``pre`` tags

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    # Forward every option unchanged to a one-shot Linker.
    options = dict(
        callbacks=callbacks,
        skip_tags=skip_tags,
        parse_email=parse_email,
    )
    return Linker(**options).linkify(text)
|