[Python-modules-commits] [python-bleach] 01/07: import python-bleach_2.0.orig.tar.gz
Scott Kitterman
kitterman at moszumanska.debian.org
Sat Mar 11 13:52:06 UTC 2017
This is an automated email from the git hooks/post-receive script.
kitterman pushed a commit to branch master
in repository python-bleach.
commit fb79febacc53548017a1729584fc3c257b7f8e0a
Author: Scott Kitterman <scott at kitterman.com>
Date: Fri Mar 10 13:53:08 2017 -0500
import python-bleach_2.0.orig.tar.gz
---
.gitignore | 12 +
.travis.yml | 34 +++
CHANGES | 277 +++++++++++++++++++++
CONTRIBUTING.rst | 15 ++
CONTRIBUTORS | 55 +++++
LICENSE | 13 +
MANIFEST.in | 14 ++
README.rst | 103 ++++++++
bleach/__init__.py | 124 ++++++++++
bleach/callbacks.py | 25 ++
bleach/encoding.py | 62 +++++
bleach/linkifier.py | 526 ++++++++++++++++++++++++++++++++++++++++
bleach/sanitizer.py | 368 ++++++++++++++++++++++++++++
bleach/utils.py | 23 ++
bleach/version.py | 6 +
docs/Makefile | 153 ++++++++++++
docs/changes.rst | 3 +
docs/clean.rst | 367 ++++++++++++++++++++++++++++
docs/conf.py | 255 ++++++++++++++++++++
docs/dev.rst | 75 ++++++
docs/goals.rst | 106 ++++++++
docs/index.rst | 21 ++
docs/linkify.rst | 361 ++++++++++++++++++++++++++++
requirements.txt | 13 +
setup.cfg | 13 +
setup.py | 75 ++++++
tests/data/1.test | 1 +
tests/data/1.test.out | 1 +
tests/data/10.test | 1 +
tests/data/10.test.out | 1 +
tests/data/11.test | 1 +
tests/data/11.test.out | 1 +
tests/data/12.test | 1 +
tests/data/12.test.out | 1 +
tests/data/13.test | 1 +
tests/data/13.test.out | 1 +
tests/data/14.test | 1 +
tests/data/14.test.out | 1 +
tests/data/15.test | 1 +
tests/data/15.test.out | 1 +
tests/data/16.test | 1 +
tests/data/16.test.out | 1 +
tests/data/17.test | 1 +
tests/data/17.test.out | 1 +
tests/data/18.test | 1 +
tests/data/18.test.out | 1 +
tests/data/19.test | 1 +
tests/data/19.test.out | 2 +
tests/data/2.test | 1 +
tests/data/2.test.out | 1 +
tests/data/3.test | 1 +
tests/data/3.test.out | 1 +
tests/data/4.test | 1 +
tests/data/4.test.out | 1 +
tests/data/5.test | 1 +
tests/data/5.test.out | 1 +
tests/data/7.test | 1 +
tests/data/7.test.out | 1 +
tests/data/8.test | 1 +
tests/data/8.test.out | 1 +
tests/data/9.test | 1 +
tests/data/9.test.out | 1 +
tests/test_basics.py | 365 ++++++++++++++++++++++++++++
tests/test_css.py | 153 ++++++++++++
tests/test_links.py | 641 +++++++++++++++++++++++++++++++++++++++++++++++++
tests/test_security.py | 186 ++++++++++++++
tests/test_unicode.py | 49 ++++
tests/test_utils.py | 44 ++++
tox.ini | 21 ++
69 files changed, 4595 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f5adb54
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+*.pyo
+*.pyc
+pip-log.txt
+.coverage
+dist
+*.egg-info
+.noseids
+build
+.tox
+docs/_build/
+.cache/
+.eggs/
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..1401537
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,34 @@
+sudo: false
+language: python
+cache:
+ directories:
+ - "~/.cache/pip"
+python:
+- "2.7"
+- "3.3"
+- "3.4"
+- "3.5"
+- "3.6"
+- "pypy"
+env:
+- HTML5LIB=0.99999999 # 8
+- HTML5LIB=0.999999999 # 9
+install:
+ # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
+ - pip install -U pip setuptools>=18.5
+ - pip install -r requirements.txt
+ # stomp on html5lib install with the specified one
+ - pip install html5lib==$HTML5LIB
+script:
+- py.test
+- flake8 bleach/
+deploy:
+ provider: pypi
+ user: jezdez
+ distributions: sdist bdist_wheel
+ password:
+ secure: TTLpnNBAmRBPe4qITwtM6MRXw3CvGpflnkG6V97oKYL1RJhDXmxIxxImkGyVoT2IR4Oy/jqEikWUCCC3aDoqDnIkkDVriTPmo5PGnS2WgvEmYdcaTIp+RXdKwKhpCVX8ITEuye0iCXYu28vDaySGjnxjlYAP4S0PGPUzh/tn4DY=
+ on:
+ tags: true
+ repo: mozilla/bleach
+ python: "2.7"
diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..7caa99f
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,277 @@
+Bleach Changes
+==============
+
+Version 2.0 (March 8th, 2017)
+-----------------------------
+
+**Backwards incompatible changes**
+
+* Removed support for Python 2.6. #206
+
+* Removed support for Python 3.2. #224
+
+* Bleach no longer supports html5lib < 0.99999999 (8 9s).
+
+ This version is a rewrite to use the new sanitizing API since the old
+ one was dropped in html5lib 0.99999999 (8 9s).
+
+* ``bleach.clean`` and friends were rewritten
+
+ ``clean`` was reimplemented as an html5lib filter and happens at a different
+ step in the HTML parsing -> traversing -> serializing process. Because of
+ that, there are some differences in clean's output as compared with previous
+ versions.
+
+ Amongst other things, this version will add end tags even if the tag in
+ question is to be escaped.
+
+* ``bleach.clean`` and friends attribute callables now take three arguments:
+ tag, attribute name and attribute value. Previously they only took attribute
+ name and attribute value.
+
+ All attribute callables will need to be updated.
+
+* ``bleach.linkify`` was rewritten
+
+ ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer
+ accepts a ``tokenizer`` argument.
+
+ The callback functions for adjusting link attributes now take a namespaced
+ attribute.
+
+ Previously you'd do something like this::
+
+ def check_protocol(attrs, is_new):
+ if not attrs.get('href', '').startswith(('http:', 'https:')):
+ return None
+ return attrs
+
+ Now it's more like this::
+
+ def check_protocol(attrs, is_new):
+ if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')):
+ # ^^^^^^^^^^^^^^^
+ return None
+ return attrs
+
+ Further, you need to make sure you're always using unicode values. If you
+ don't then html5lib will raise an assertion error that the value is not
+ unicode.
+
+ All linkify filters will need to be updated.
+
+* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been
+ replaced with a more general ``skip_tags`` argument.
+
+ Before, you might do::
+
+ bleach.linkify(some_text, skip_pre=True)
+
+ The equivalent with Bleach 2.0 is::
+
+ bleach.linkify(some_text, skip_tags=['pre'])
+
+ You can skip other tags, too, like ``style`` or ``script`` or other places
+ where you don't want linkification happening.
+
+ All uses of linkify that use ``skip_pre`` will need to be updated.
+
+
+**Changes**
+
+* Supports Python 3.6.
+
+* Supports html5lib >= 0.99999999 (8 9s).
+
+* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your
+ favorite clean settings for easy reuse.
+
+* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your
+ favorite linkify settings for easy reuse.
+
+* There's a ``bleach.linkifier.LinkifyFilter`` which is an html5lib filter that
+ you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean
+ and linkify in one pass.
+
+* ``bleach.clean`` and friends can now take a callable as an attributes arg value.
+
+* Tons of bug fixes.
+
+* Cleaned up tests.
+
+* Documentation fixes.
+
+
+Version 1.5 (November 4th, 2016)
+--------------------------------
+
+**Backwards incompatible changes**
+
+- clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and
+ mailto.
+
+ Previously it was a long list of protocols something like ed2k, ftp, http,
+ https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed,
+ urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
+
+**Changes**
+
+- clean: Added ``protocols`` to arguments list to let you override the list of
+ allowed protocols. Thank you, Andreas Malecki! #149
+- linkify: Fix a bug involving periods at the end of an email address. Thank you,
+ Lorenz Schori! #219
+- linkify: Fix linkification of non-ascii ports. Thank you, Alexandre Macabies!
+ #207
+- linkify: Fix linkify inappropriately removing node tails when dropping nodes.
+ #132
+- Fixed a test that failed periodically. #161
+- Switched from nose to py.test. #204
+- Add test matrix for all supported Python and html5lib versions. #230
+- Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999
+ and 0.99999 are busted.
+- Add support for ``python setup.py test``. #97
+
+
+Version 1.4.3 (May 23rd, 2016)
+------------------------------
+
+**Changes**
+
+- Limit to html5lib ``>=0.999,<0.99999999`` because of impending change to
+ sanitizer api. #195
+
+
+Version 1.4.2 (September 11, 2015)
+----------------------------------
+
+**Changes**
+
+- linkify: Fix hang in linkify with ``parse_email=True``. #124
+- linkify: Fix crash in linkify when removing a link that is a first-child. #136
+- Updated TLDs.
+- linkify: Don't remove exterior brackets when linkifying. #146
+
+
+Version 1.4.1 (December 15, 2014)
+---------------------------------
+
+**Changes**
+
+- Consistent order of attributes in output.
+- Python 3.4 support.
+
+
+Version 1.4 (January 12, 2014)
+------------------------------
+
+**Changes**
+
+- linkify: Update linkify to use etree type Treewalker instead of simpletree.
+- Updated html5lib to version ``>=0.999``.
+- Update all code to be compatible with Python 3 and 2 using six.
+- Switch to Apache License.
+
+
+Version 1.3
+-----------
+
+- Used by Python 3-only fork.
+
+
+Version 1.2.2 (May 18, 2013)
+----------------------------
+
+- Pin html5lib to version 0.95 for now due to major API break.
+
+Version 1.2.1 (February 19, 2013)
+---------------------------------
+
+- clean() no longer considers ``feed:`` an acceptable protocol due to
+ inconsistencies in browser behavior.
+
+
+Version 1.2 (January 28, 2013)
+------------------------------
+
+- linkify() has changed considerably. Many keyword arguments have been
+ replaced with a single callbacks list. Please see the documentation
+ for more information.
+- Bleach will no longer consider unacceptable protocols when linkifying.
+- linkify() now takes a tokenizer argument that allows it to skip
+ sanitization.
+- delinkify() is gone.
+- Removed exception handling from _render. clean() and linkify() may now
+ throw.
+- linkify() correctly ignores case for protocols and domain names.
+- linkify() correctly handles markup within an <a> tag.
+
+
+Version 1.1.5
+-------------
+
+
+Version 1.1.4
+-------------
+
+
+Version 1.1.3 (July 10, 2012)
+-----------------------------
+
+- Fix parsing bare URLs when parse_email=True.
+
+
+Version 1.1.2 (June 1, 2012)
+----------------------------
+
+- Fix hang in style attribute sanitizer. (#61)
+- Allow '/' in style attribute values.
+
+
+Version 1.1.1 (February 17, 2012)
+---------------------------------
+
+- Fix tokenizer for html5lib 0.9.5.
+
+
+Version 1.1.0 (October 24, 2011)
+--------------------------------
+
+- linkify() now understands port numbers. (#38)
+- Documented character encoding behavior. (#41)
+- Add an optional target argument to linkify().
+- Add delinkify() method. (#45)
+- Support subdomain whitelist for delinkify(). (#47, #48)
+
+
+Version 1.0.4 (September 2, 2011)
+---------------------------------
+
+- Switch to SemVer git tags.
+- Make linkify() smarter about trailing punctuation. (#30)
+- Pass exc_info to logger during rendering issues.
+- Add wildcard key for attributes. (#19)
+- Make linkify() use the HTMLSanitizer tokenizer. (#36)
+- Fix URLs wrapped in parentheses. (#23)
+- Make linkify() UTF-8 safe. (#33)
+
+
+Version 1.0.3 (June 14, 2011)
+-----------------------------
+
+- linkify() works with 3rd level domains. (#24)
+- clean() supports vendor prefixes in style values. (#31, #32)
+- Fix linkify() email escaping.
+
+
+Version 1.0.2 (June 6, 2011)
+----------------------------
+
+- linkify() supports email addresses.
+- clean() supports callables in attributes filter.
+
+
+Version 1.0.1 (April 12, 2011)
+------------------------------
+
+- linkify() doesn't drop trailing slashes. (#21)
+- linkify() won't linkify 'libgl.so.1'. (#22)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
new file mode 100644
index 0000000..d8ad24c
--- /dev/null
+++ b/CONTRIBUTING.rst
@@ -0,0 +1,15 @@
+Reporting Bugs
+==============
+
+For regular bugs, please report them `in our issue tracker
+<https://github.com/mozilla/bleach/issues>`_.
+
+If you believe that you've found a security vulnerability, please `file a secure
+bug report in our bug tracker
+<https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&product=Webtools&component=Bleach-security&groups=webtools-security>`_
+or send an email to *security AT mozilla DOT org*.
+
+For more information on security-related bug disclosure and the PGP key to use
+for sending encrypted mail or to verify responses received from that address,
+please read our wiki page at
+`<https://www.mozilla.org/en-US/security/#For_Developers>`_.
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..4c90ae5
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,55 @@
+Bleach was originally written and maintained by James Socol and various
+contributors within and without the Mozilla Corporation and Foundation.
+It is currently maintained by Jannis Leidel and Will Kahn-Greene.
+
+Maintainers:
+
+- Will Kahn-Greene <willkg at mozilla.com>
+
+Maintainer emeritus:
+
+- Jannis Leidel <jleidel at mozilla.com>
+- James Socol <me at jamessocol.com>
+
+Contributors:
+
+- Adam Lofts
+- Adrian "ThiefMaster"
+- Alek
+- Alexandre Macabies
+- Alexandr N. Zamaraev
+- Alex Ehlke
+- Alireza Savand
+- Andreas Malecki
+- Andy Freeland
+- Anton Kovalyov
+- Chris Beaven
+- Dan Gayle
+- Erik Rose
+- Gaurav Dadhania
+- Geoffrey Sneddon
+- Istvan Albert
+- Jaime Irurzun
+- James Socol
+- Jannis Leidel
+- Jeff Balogh
+- Lee, Cheon-il
+- Les Orchard
+- Lorenz Schori
+- Luis Nell
+- Marc Abramowitz
+- Marc DM
+- Mark Lee
+- Mark Paschal
+- mdxs
+- nikolas
+- Oh Jinkyun
+- Paul Craciunoiu
+- Ricky Rosario
+- Ryan Niemeyer
+- Sébastien Fievet
+- Tim Dumol
+- Timothy Fitz
+- Vitaly Volkov
+- Will Kahn-Greene
+- zyegfryed
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..467c38e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,13 @@
+Copyright (c) 2014-2017, Mozilla Foundation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..d8329f6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,14 @@
+include CHANGES
+include CONTRIBUTORS
+include CONTRIBUTING.rst
+include requirements.txt
+include tox.ini
+include LICENSE
+include README.rst
+
+include docs/conf.py
+include docs/Makefile
+
+recursive-include docs *.rst
+
+recursive-include tests *.py *.test *.out
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..08dd886
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,103 @@
+======
+Bleach
+======
+
+.. image:: https://travis-ci.org/mozilla/bleach.png?branch=master
+ :target: https://travis-ci.org/mozilla/bleach
+
+.. image:: https://badge.fury.io/py/bleach.svg
+ :target: http://badge.fury.io/py/bleach
+
+Bleach is an allowed-list-based HTML sanitizing library that escapes or strips
+markup and attributes.
+
+Bleach can also linkify text safely, applying filters that Django's ``urlize``
+filter cannot, and optionally setting ``rel`` attributes, even on links already
+in the text.
+
+Bleach is intended for sanitizing text from *untrusted* sources. If you find
+yourself jumping through hoops to allow your site administrators to do lots of
+things, you're probably outside the use cases. Either trust those users, or
+don't.
+
+Because it relies on html5lib_, Bleach is as good as modern browsers at dealing
+with weird, quirky HTML fragments. And *any* of Bleach's methods will fix
+unbalanced or mis-nested tags.
+
+The version on GitHub_ is the most up-to-date and contains the latest bug
+fixes. You can find full documentation on `ReadTheDocs`_.
+
+:Code: https://github.com/mozilla/bleach
+:Documentation: https://bleach.readthedocs.io/
+:Issue tracker: https://github.com/mozilla/bleach/issues
+:IRC: ``#bleach`` on irc.mozilla.org
+:License: Apache License v2; see LICENSE file
+
+
+Reporting Bugs
+==============
+
+For regular bugs, please report them `in our issue tracker
+<https://github.com/mozilla/bleach/issues>`_.
+
+If you believe that you've found a security vulnerability, please `file a secure
+bug report in our bug tracker
+<https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&product=Webtools&component=Bleach-security&groups=webtools-security>`_
+or send an email to *security AT mozilla DOT org*.
+
+For more information on security-related bug disclosure and the PGP key to use
+for sending encrypted mail or to verify responses received from that address,
+please read our wiki page at
+`<https://www.mozilla.org/en-US/security/#For_Developers>`_.
+
+
+Installing Bleach
+=================
+
+Bleach is available on PyPI_, so you can install it with ``pip``::
+
+ $ pip install bleach
+
+Or with ``easy_install``::
+
+ $ easy_install bleach
+
+Or by cloning the repo from GitHub_::
+
+ $ git clone git://github.com/mozilla/bleach.git
+
+Then install it by running::
+
+ $ python setup.py install
+
+
+Upgrading Bleach
+================
+
+.. warning::
+
+ Before doing any upgrades, read through `Bleach Changes
+ <https://bleach.readthedocs.io/en/latest/changes.html>`_ for backwards
+ incompatible changes, newer versions, etc.
+
+
+Basic use
+=========
+
+The simplest way to use Bleach is:
+
+.. code-block:: python
+
+ >>> import bleach
+
+ >>> bleach.clean('an <script>evil()</script> example')
+ u'an &lt;script&gt;evil()&lt;/script&gt; example'
+
+ >>> bleach.linkify('an http://example.com url')
+ u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url'
+
+
+.. _html5lib: https://github.com/html5lib/html5lib-python
+.. _GitHub: https://github.com/mozilla/bleach
+.. _ReadTheDocs: https://bleach.readthedocs.io/
+.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/bleach/__init__.py b/bleach/__init__.py
new file mode 100644
index 0000000..c9a7fe4
--- /dev/null
+++ b/bleach/__init__.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from bleach.linkifier import (
+ DEFAULT_CALLBACKS,
+ Linker,
+ LinkifyFilter,
+)
+from bleach.sanitizer import (
+ ALLOWED_ATTRIBUTES,
+ ALLOWED_PROTOCOLS,
+ ALLOWED_STYLES,
+ ALLOWED_TAGS,
+ BleachSanitizerFilter,
+ Cleaner,
+)
+from bleach.version import __version__, VERSION # flake8: noqa
+
+__all__ = ['clean', 'linkify']
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+ styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+ strip_comments=True):
+ """Clean an HTML fragment of malicious content and return it
+
+ This function is a security-focused function whose sole purpose is to
+ remove malicious content from a string such that it can be displayed as
+ content in a web page.
+
+ This function is not designed to use to transform content to be used in
+ non-web-page contexts.
+
+ Example::
+
+ import bleach
+
+ better_text = bleach.clean(yucky_text)
+
+
+ .. Note::
+
+ If you're cleaning a lot of text and passing the same argument values or
+ you want more configurability, consider using a
+ :py:class:`bleach.sanitizer.Cleaner` instance.
+
+ :arg str text: the text to clean
+
+ :arg list tags: allowed list of tags; defaults to
+ ``bleach.sanitizer.ALLOWED_TAGS``
+
+ :arg dict attributes: allowed attributes; can be a callable, list or dict;
+ defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+ :arg list styles: allowed list of css styles; defaults to
+ ``bleach.sanitizer.ALLOWED_STYLES``
+
+ :arg list protocols: allowed list of protocols for links; defaults
+ to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+ :arg bool strip: whether or not to strip disallowed elements
+
+ :arg bool strip_comments: whether or not to strip HTML comments
+
+ :returns: cleaned text as unicode
+
+ """
+ cleaner = Cleaner(
+ tags=tags,
+ attributes=attributes,
+ styles=styles,
+ protocols=protocols,
+ strip=strip,
+ strip_comments=strip_comments,
+ )
+ return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+ """Convert URL-like strings in an HTML fragment to links
+
+ This function converts strings that look like URLs, domain names and email
+ addresses in text that may be an HTML fragment to links, while preserving:
+
+ 1. links already in the string
+ 2. urls found in attributes
+ 3. email addresses
+
+ linkify does a best-effort approach and tries to recover from bad
+ situations due to crazy text.
+
+ .. Note::
+
+ If you're linking a lot of text and passing the same argument values or
+ you want more configurability, consider using a
+ :py:class:`bleach.linkifier.Linker` instance.
+
+ .. Note::
+
+ If you have text that you want to clean and then linkify, consider using
+ the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+ pass. That way you're not parsing the HTML twice.
+
+ :arg str text: the text to linkify
+
+ :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+ defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+ :arg list skip_tags: list of tags that you don't want to linkify the
+ contents of; for example, you could set this to ``['pre']`` to skip
+ linkifying contents of ``pre`` tags
+
+ :arg bool parse_email: whether or not to linkify email addresses
+
+ :returns: linkified text as unicode
+
+ """
+ linker = Linker(
+ callbacks=callbacks,
+ skip_tags=skip_tags,
+ parse_email=parse_email
+ )
+ return linker.linkify(text)
diff --git a/bleach/callbacks.py b/bleach/callbacks.py
new file mode 100644
index 0000000..d2ba101
--- /dev/null
+++ b/bleach/callbacks.py
@@ -0,0 +1,25 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+ href_key = (None, u'href')
+ if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
+ return attrs
+
+ rel_key = (None, u'rel')
+ rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
+ if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
+ rel_values.append(u'nofollow')
+ attrs[rel_key] = u' '.join(rel_values)
+
+ return attrs
+
+
+def target_blank(attrs, new=False):
+ href_key = (None, u'href')
+ if attrs[href_key].startswith(u'mailto:'):
+ return attrs
+
+ attrs[(None, u'target')] = u'_blank'
+ return attrs
diff --git a/bleach/encoding.py b/bleach/encoding.py
new file mode 100644
index 0000000..707adaa
--- /dev/null
+++ b/bleach/encoding.py
@@ -0,0 +1,62 @@
+import datetime
+from decimal import Decimal
+import types
+import six
+
+
+def is_protected_type(obj):
+ """Determine if the object instance is of a protected type.
+
+ Objects of protected types are preserved as-is when passed to
+ force_unicode(strings_only=True).
+ """
+ return isinstance(obj, (
+ six.integer_types +
+ (types.NoneType,
+ datetime.datetime, datetime.date, datetime.time,
+ float, Decimal))
+ )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_text, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ # Handle the common case first, saves 30-40% when s is an instance of
+ # six.text_type. This function gets called often in that setting.
+ if isinstance(s, six.text_type):
+ return s
+ if strings_only and is_protected_type(s):
+ return s
+ try:
+ if not isinstance(s, six.string_types):
+ if hasattr(s, '__unicode__'):
+ s = s.__unicode__()
+ else:
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ else:
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s,
+ # encoding, errors), so that if s is a SafeBytes, it ends up being
+ # a SafeText at the end.
+ s = s.decode(encoding, errors)
+ except UnicodeDecodeError as e:
+ if not isinstance(s, Exception):
+ raise UnicodeDecodeError(*e.args)
+ else:
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII bytestring data without a
+ # working unicode method. Try to handle this without raising a
+ # further exception by individually forcing the exception args
+ # to unicode.
+ s = ' '.join([force_unicode(arg, encoding, strings_only,
+ errors) for arg in s])
+ return s
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
new file mode 100644
index 0000000..fc346c3
--- /dev/null
+++ b/bleach/linkifier.py
@@ -0,0 +1,526 @@
+from __future__ import unicode_literals
+import re
+
+import html5lib
+from html5lib.filters.base import Filter
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
+
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.utils import alphabetize_attributes
+
+
+#: List of default callbacks
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+ ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+ cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+ dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+ gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+ im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+ kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+ ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+ net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+ pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+ sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+ tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+ xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+
+def build_url_re(tlds=TLDS, protocols=allowed_protocols):
+ """Builds the url regex used by linkifier
+
+ If you want a different set of tlds or allowed protocols, pass those in
+ and stomp on the existing ``url_re``::
+
+ from bleach import linkifier
+
+ my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+ linker = LinkifyFilter(url_re=my_url_re)
+
+ """
+ return re.compile(
+ r"""\(* # Match any opening parentheses.
+ \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
+ ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
+ (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+ # /path/zz (excluding "unsafe" chars from RFC 1738,
+ # except for # and ~, which happen in practice)
+ """.format('|'.join(protocols), '|'.join(tlds)),
+ re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+
+URL_RE = build_url_re()
+
+
+PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
+
+
+EMAIL_RE = re.compile(
+ r"""(?<!//)
+ (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
+ (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom
+ |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+ |\\[\001-\011\013\014\016-\177])*" # quoted-string
+ )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}) # domain
+ """,
+ re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+
+class Linker(object):
+ """Convert URL-like strings in an HTML fragment to links
+
+ This function converts strings that look like URLs, domain names and email
+ addresses in text that may be an HTML fragment to links, while preserving:
+
+ 1. links already in the string
+ 2. urls found in attributes
+ 3. email addresses
+
+ linkify does a best-effort approach and tries to recover from bad
+ situations due to crazy text.
+
+ """
+ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
+ url_re=URL_RE, email_re=EMAIL_RE):
+ """Creates a Linker instance
+
+ :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+ defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+ :arg list skip_tags: list of tags that you don't want to linkify the
+ contents of; for example, you could set this to ``['pre']`` to skip
+ linkifying contents of ``pre`` tags
+
+ :arg bool parse_email: whether or not to linkify email addresses
+
+ :arg re url_re: url matching regex
+
+ :arg re email_re: email matching regex
+
+ :returns: linkified text as unicode
+
+ """
+ self.callbacks = callbacks
+ self.skip_tags = skip_tags
+ self.parse_email = parse_email
+ self.url_re = url_re
+ self.email_re = email_re
+
+ self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
... 4103 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-bleach.git
More information about the Python-modules-commits
mailing list