[Python-modules-commits] [python-bleach] 01/07: import python-bleach_2.0.orig.tar.gz

Scott Kitterman kitterman at moszumanska.debian.org
Sat Mar 11 13:52:06 UTC 2017


This is an automated email from the git hooks/post-receive script.

kitterman pushed a commit to branch master
in repository python-bleach.

commit fb79febacc53548017a1729584fc3c257b7f8e0a
Author: Scott Kitterman <scott at kitterman.com>
Date:   Fri Mar 10 13:53:08 2017 -0500

    import python-bleach_2.0.orig.tar.gz
---
 .gitignore             |  12 +
 .travis.yml            |  34 +++
 CHANGES                | 277 +++++++++++++++++++++
 CONTRIBUTING.rst       |  15 ++
 CONTRIBUTORS           |  55 +++++
 LICENSE                |  13 +
 MANIFEST.in            |  14 ++
 README.rst             | 103 ++++++++
 bleach/__init__.py     | 124 ++++++++++
 bleach/callbacks.py    |  25 ++
 bleach/encoding.py     |  62 +++++
 bleach/linkifier.py    | 526 ++++++++++++++++++++++++++++++++++++++++
 bleach/sanitizer.py    | 368 ++++++++++++++++++++++++++++
 bleach/utils.py        |  23 ++
 bleach/version.py      |   6 +
 docs/Makefile          | 153 ++++++++++++
 docs/changes.rst       |   3 +
 docs/clean.rst         | 367 ++++++++++++++++++++++++++++
 docs/conf.py           | 255 ++++++++++++++++++++
 docs/dev.rst           |  75 ++++++
 docs/goals.rst         | 106 ++++++++
 docs/index.rst         |  21 ++
 docs/linkify.rst       | 361 ++++++++++++++++++++++++++++
 requirements.txt       |  13 +
 setup.cfg              |  13 +
 setup.py               |  75 ++++++
 tests/data/1.test      |   1 +
 tests/data/1.test.out  |   1 +
 tests/data/10.test     |   1 +
 tests/data/10.test.out |   1 +
 tests/data/11.test     |   1 +
 tests/data/11.test.out |   1 +
 tests/data/12.test     |   1 +
 tests/data/12.test.out |   1 +
 tests/data/13.test     |   1 +
 tests/data/13.test.out |   1 +
 tests/data/14.test     |   1 +
 tests/data/14.test.out |   1 +
 tests/data/15.test     |   1 +
 tests/data/15.test.out |   1 +
 tests/data/16.test     |   1 +
 tests/data/16.test.out |   1 +
 tests/data/17.test     |   1 +
 tests/data/17.test.out |   1 +
 tests/data/18.test     |   1 +
 tests/data/18.test.out |   1 +
 tests/data/19.test     |   1 +
 tests/data/19.test.out |   2 +
 tests/data/2.test      |   1 +
 tests/data/2.test.out  |   1 +
 tests/data/3.test      |   1 +
 tests/data/3.test.out  |   1 +
 tests/data/4.test      |   1 +
 tests/data/4.test.out  |   1 +
 tests/data/5.test      |   1 +
 tests/data/5.test.out  |   1 +
 tests/data/7.test      |   1 +
 tests/data/7.test.out  |   1 +
 tests/data/8.test      |   1 +
 tests/data/8.test.out  |   1 +
 tests/data/9.test      |   1 +
 tests/data/9.test.out  |   1 +
 tests/test_basics.py   | 365 ++++++++++++++++++++++++++++
 tests/test_css.py      | 153 ++++++++++++
 tests/test_links.py    | 641 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_security.py | 186 ++++++++++++++
 tests/test_unicode.py  |  49 ++++
 tests/test_utils.py    |  44 ++++
 tox.ini                |  21 ++
 69 files changed, 4595 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f5adb54
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+*.pyo
+*.pyc
+pip-log.txt
+.coverage
+dist
+*.egg-info
+.noseids
+build
+.tox
+docs/_build/
+.cache/
+.eggs/
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..1401537
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,34 @@
+sudo: false
+language: python
+cache:
+  directories:
+  - "~/.cache/pip"
+python:
+- "2.7"
+- "3.3"
+- "3.4"
+- "3.5"
+- "3.6"
+- "pypy"
+env:
+- HTML5LIB=0.99999999   # 8
+- HTML5LIB=0.999999999  # 9
+install:
+  # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
+  - pip install -U pip setuptools>=18.5
+  - pip install -r requirements.txt
+  # stomp on html5lib install with the specified one
+  - pip install html5lib==$HTML5LIB
+script:
+- py.test
+- flake8 bleach/
+deploy:
+  provider: pypi
+  user: jezdez
+  distributions: sdist bdist_wheel
+  password:
+    secure: TTLpnNBAmRBPe4qITwtM6MRXw3CvGpflnkG6V97oKYL1RJhDXmxIxxImkGyVoT2IR4Oy/jqEikWUCCC3aDoqDnIkkDVriTPmo5PGnS2WgvEmYdcaTIp+RXdKwKhpCVX8ITEuye0iCXYu28vDaySGjnxjlYAP4S0PGPUzh/tn4DY=
+  on:
+    tags: true
+    repo: mozilla/bleach
+    python: "2.7"
diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..7caa99f
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,277 @@
+Bleach Changes
+==============
+
+Version 2.0 (March 8th, 2017)
+-----------------------------
+
+**Backwards incompatible changes**
+
+* Removed support for Python 2.6. #206
+
+* Removed support for Python 3.2. #224
+
+* Bleach no longer supports html5lib < 0.99999999 (8 9s).
+
+  This version is a rewrite to use the new sanitizing API since the old
+  one was dropped in html5lib 0.99999999 (8 9s).
+
+* ``bleach.clean`` and friends were rewritten
+
+  ``clean`` was reimplemented as an html5lib filter and happens at a different
+  step in the HTML parsing -> traversing -> serializing process. Because of
+  that, there are some differences in clean's output as compared with previous
+  versions.
+
+  Amongst other things, this version will add end tags even if the tag in
+  question is to be escaped.
+
+* ``bleach.clean`` and friends attribute callables now take three arguments:
+  tag, attribute name and attribute value. Previously they only took attribute
+  name and attribute value.
+
+  All attribute callables will need to be updated.
+
+* ``bleach.linkify`` was rewritten
+
+  ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer
+  accepts a ``tokenizer`` argument.
+
+  The callback functions for adjusting link attributes now take a namespaced
+  attribute.
+
+  Previously you'd do something like this::
+
+      def check_protocol(attrs, is_new):
+          if not attrs.get('href', '').startswith(('http:', 'https:')):
+              return None
+          return attrs
+
+  Now it's more like this::
+
+      def check_protocol(attrs, is_new):
+          if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')):
+              #            ^^^^^^^^^^^^^^^
+              return None
+          return attrs
+
+  Further, you need to make sure you're always using unicode values. If you
+  don't then html5lib will raise an assertion error that the value is not
+  unicode.
+
+  All linkify filters will need to be updated.
+
+* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been
+  replaced with a more general ``skip_tags`` argument.
+
+  Before, you might do::
+
+      bleach.linkify(some_text, skip_pre=True)
+
+  The equivalent with Bleach 2.0 is::
+
+      bleach.linkify(some_text, skip_tags=['pre'])
+
+  You can skip other tags, too, like ``style`` or ``script`` or other places
+  where you don't want linkification happening.
+
+  All uses of linkify that use ``skip_pre`` will need to be updated.
+
+
+**Changes**
+
+* Supports Python 3.6.
+
+* Supports html5lib >= 0.99999999 (8 9s).
+
+* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your
+  favorite clean settings for easy reuse.
+
+* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your
+  favorite linkify settings for easy reuse.
+
+* There's a ``bleach.linkifier.LinkifyFilter`` which is an html5lib filter that
+  you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean
+  and linkify in one pass.
+
+* ``bleach.clean`` and friends can now take a callable as an attributes arg value.
+
+* Tons of bug fixes.
+
+* Cleaned up tests.
+
+* Documentation fixes.
+
+
+Version 1.5 (November 4th, 2016)
+--------------------------------
+
+**Backwards incompatible changes**
+
+- clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and
+  mailto.
+
+  Previously it was a long list of protocols something like ed2k, ftp, http,
+  https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed,
+  urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
+
+**Changes**
+
+- clean: Added ``protocols`` to arguments list to let you override the list of
+  allowed protocols. Thank you, Andreas Malecki! #149
+- linkify: Fix a bug involving periods at the end of an email address. Thank you,
+  Lorenz Schori! #219
+- linkify: Fix linkification of non-ascii ports. Thank you, Alexandre Macabies!
+  #207
+- linkify: Fix linkify inappropriately removing node tails when dropping nodes.
+  #132
+- Fixed a test that failed periodically. #161
+- Switched from nose to py.test. #204
+- Add test matrix for all supported Python and html5lib versions. #230
+- Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999
+  and 0.99999 are busted.
+- Add support for ``python setup.py test``. #97
+
+
+Version 1.4.3 (May 23rd, 2016)
+------------------------------
+
+**Changes**
+
+- Limit to html5lib ``>=0.999,<0.99999999`` because of impending change to
+  sanitizer api. #195
+
+
+Version 1.4.2 (September 11, 2015)
+----------------------------------
+
+**Changes**
+
+- linkify: Fix hang in linkify with ``parse_email=True``. #124
+- linkify: Fix crash in linkify when removing a link that is a first-child. #136
+- Updated TLDs.
+- linkify: Don't remove exterior brackets when linkifying. #146
+
+
+Version 1.4.1 (December 15, 2014)
+---------------------------------
+
+**Changes**
+
+- Consistent order of attributes in output.
+- Python 3.4 support.
+
+
+Version 1.4 (January 12, 2014)
+------------------------------
+
+**Changes**
+
+- linkify: Update linkify to use etree type Treewalker instead of simpletree.
+- Updated html5lib to version ``>=0.999``.
+- Update all code to be compatible with Python 3 and 2 using six.
+- Switch to Apache License.
+
+
+Version 1.3
+-----------
+
+- Used by Python 3-only fork.
+
+
+Version 1.2.2 (May 18, 2013)
+----------------------------
+
+- Pin html5lib to version 0.95 for now due to major API break.
+
+Version 1.2.1 (February 19, 2013)
+---------------------------------
+
+- clean() no longer considers ``feed:`` an acceptable protocol due to
+  inconsistencies in browser behavior.
+
+
+Version 1.2 (January 28, 2013)
+------------------------------
+
+- linkify() has changed considerably. Many keyword arguments have been
+  replaced with a single callbacks list. Please see the documentation
+  for more information.
+- Bleach will no longer consider unacceptable protocols when linkifying.
+- linkify() now takes a tokenizer argument that allows it to skip
+  sanitization.
+- delinkify() is gone.
+- Removed exception handling from _render. clean() and linkify() may now
+  throw.
+- linkify() correctly ignores case for protocols and domain names.
+- linkify() correctly handles markup within an <a> tag.
+
+
+Version 1.1.5
+-------------
+
+
+Version 1.1.4
+-------------
+
+
+Version 1.1.3 (July 10, 2012)
+-----------------------------
+
+- Fix parsing bare URLs when parse_email=True.
+
+
+Version 1.1.2 (June 1, 2012)
+----------------------------
+
+- Fix hang in style attribute sanitizer. (#61)
+- Allow '/' in style attribute values.
+
+
+Version 1.1.1 (February 17, 2012)
+---------------------------------
+
+- Fix tokenizer for html5lib 0.9.5.
+
+
+Version 1.1.0 (October 24, 2011)
+--------------------------------
+
+- linkify() now understands port numbers. (#38)
+- Documented character encoding behavior. (#41)
+- Add an optional target argument to linkify().
+- Add delinkify() method. (#45)
+- Support subdomain whitelist for delinkify(). (#47, #48)
+
+
+Version 1.0.4 (September 2, 2011)
+---------------------------------
+
+- Switch to SemVer git tags.
+- Make linkify() smarter about trailing punctuation. (#30)
+- Pass exc_info to logger during rendering issues.
+- Add wildcard key for attributes. (#19)
+- Make linkify() use the HTMLSanitizer tokenizer. (#36)
+- Fix URLs wrapped in parentheses. (#23)
+- Make linkify() UTF-8 safe. (#33)
+
+
+Version 1.0.3 (June 14, 2011)
+-----------------------------
+
+- linkify() works with 3rd level domains. (#24)
+- clean() supports vendor prefixes in style values. (#31, #32)
+- Fix linkify() email escaping.
+
+
+Version 1.0.2 (June 6, 2011)
+----------------------------
+
+- linkify() supports email addresses.
+- clean() supports callables in attributes filter.
+
+
+Version 1.0.1 (April 12, 2011)
+------------------------------
+
+- linkify() doesn't drop trailing slashes. (#21)
+- linkify() won't linkify 'libgl.so.1'. (#22)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
new file mode 100644
index 0000000..d8ad24c
--- /dev/null
+++ b/CONTRIBUTING.rst
@@ -0,0 +1,15 @@
+Reporting Bugs
+==============
+
+For regular bugs, please report them `in our issue tracker
+<https://github.com/mozilla/bleach/issues>`_.
+
+If you believe that you've found a security vulnerability, please `file a secure
+bug report in our bug tracker
+<https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&product=Webtools&component=Bleach-security&groups=webtools-security>`_
+or send an email to *security AT mozilla DOT org*.
+
+For more information on security-related bug disclosure and the PGP key to use
+for sending encrypted mail or to verify responses received from that address,
+please read our wiki page at
+`<https://www.mozilla.org/en-US/security/#For_Developers>`_.
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..4c90ae5
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,55 @@
+Bleach was originally written and maintained by James Socol and various
+contributors within and without the Mozilla Corporation and Foundation.
+It is currently maintained by Jannis Leidel and Will Kahn-Greene.
+
+Maintainers:
+
+- Will Kahn-Greene <willkg at mozilla.com>
+
+Maintainer emeritus:
+
+- Jannis Leidel <jleidel at mozilla.com>
+- James Socol <me at jamessocol.com>
+
+Contributors:
+
+- Adam Lofts
+- Adrian "ThiefMaster"
+- Alek
+- Alexandre Macabies
+- Alexandr N. Zamaraev
+- Alex Ehlke
+- Alireza Savand
+- Andreas Malecki
+- Andy Freeland
+- Anton Kovalyov
+- Chris Beaven
+- Dan Gayle
+- Erik Rose
+- Gaurav Dadhania
+- Geoffrey Sneddon
+- Istvan Albert
+- Jaime Irurzun
+- James Socol
+- Jannis Leidel
+- Jeff Balogh
+- Lee, Cheon-il
+- Les Orchard
+- Lorenz Schori
+- Luis Nell
+- Marc Abramowitz
+- Marc DM
+- Mark Lee
+- Mark Paschal
+- mdxs
+- nikolas
+- Oh Jinkyun
+- Paul Craciunoiu
+- Ricky Rosario
+- Ryan Niemeyer
+- Sébastien Fievet
+- Tim Dumol
+- Timothy Fitz
+- Vitaly Volkov
+- Will Kahn-Greene
+- zyegfryed
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..467c38e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,13 @@
+Copyright (c) 2014-2017, Mozilla Foundation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..d8329f6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,14 @@
+include CHANGES
+include CONTRIBUTORS
+include CONTRIBUTING.rst
+include requirements.txt
+include tox.ini
+include LICENSE
+include README.rst
+
+include docs/conf.py
+include docs/Makefile
+
+recursive-include docs *.rst
+
+recursive-include tests *.py *.test *.out
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..08dd886
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,103 @@
+======
+Bleach
+======
+
+.. image:: https://travis-ci.org/mozilla/bleach.png?branch=master
+   :target: https://travis-ci.org/mozilla/bleach
+
+.. image:: https://badge.fury.io/py/bleach.svg
+   :target: http://badge.fury.io/py/bleach
+
+Bleach is an allowed-list-based HTML sanitizing library that escapes or strips
+markup and attributes.
+
+Bleach can also linkify text safely, applying filters that Django's ``urlize``
+filter cannot, and optionally setting ``rel`` attributes, even on links already
+in the text.
+
+Bleach is intended for sanitizing text from *untrusted* sources. If you find
+yourself jumping through hoops to allow your site administrators to do lots of
+things, you're probably outside the use cases. Either trust those users, or
+don't.
+
+Because it relies on html5lib_, Bleach is as good as modern browsers at dealing
+with weird, quirky HTML fragments. And *any* of Bleach's methods will fix
+unbalanced or mis-nested tags.
+
+The version on GitHub_ is the most up-to-date and contains the latest bug
+fixes. You can find full documentation on `ReadTheDocs`_.
+
+:Code:           https://github.com/mozilla/bleach
+:Documentation:  https://bleach.readthedocs.io/
+:Issue tracker:  https://github.com/mozilla/bleach/issues
+:IRC:            ``#bleach`` on irc.mozilla.org
+:License:        Apache License v2; see LICENSE file
+
+
+Reporting Bugs
+==============
+
+For regular bugs, please report them `in our issue tracker
+<https://github.com/mozilla/bleach/issues>`_.
+
+If you believe that you've found a security vulnerability, please `file a secure
+bug report in our bug tracker
+<https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&product=Webtools&component=Bleach-security&groups=webtools-security>`_
+or send an email to *security AT mozilla DOT org*.
+
+For more information on security-related bug disclosure and the PGP key to use
+for sending encrypted mail or to verify responses received from that address,
+please read our wiki page at
+`<https://www.mozilla.org/en-US/security/#For_Developers>`_.
+
+
+Installing Bleach
+=================
+
+Bleach is available on PyPI_, so you can install it with ``pip``::
+
+    $ pip install bleach
+
+Or with ``easy_install``::
+
+    $ easy_install bleach
+
+Or by cloning the repo from GitHub_::
+
+    $ git clone git://github.com/mozilla/bleach.git
+
+Then install it by running::
+
+    $ python setup.py install
+
+
+Upgrading Bleach
+================
+
+.. warning::
+
+   Before doing any upgrades, read through `Bleach Changes
+   <https://bleach.readthedocs.io/en/latest/changes.html>`_ for backwards
+   incompatible changes, newer versions, etc.
+
+
+Basic use
+=========
+
+The simplest way to use Bleach is:
+
+.. code-block:: python
+
+    >>> import bleach
+
+    >>> bleach.clean('an <script>evil()</script> example')
+    u'an &lt;script&gt;evil()&lt;/script&gt; example'
+
+    >>> bleach.linkify('an http://example.com url')
+    u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url'
+
+
+.. _html5lib: https://github.com/html5lib/html5lib-python
+.. _GitHub: https://github.com/mozilla/bleach
+.. _ReadTheDocs: https://bleach.readthedocs.io/
+.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/bleach/__init__.py b/bleach/__init__.py
new file mode 100644
index 0000000..c9a7fe4
--- /dev/null
+++ b/bleach/__init__.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+    LinkifyFilter,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    BleachSanitizerFilter,
+    Cleaner,
+)
+from bleach.version import __version__, VERSION # flake8: noqa
+
+__all__ = ['clean', 'linkify']
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+          strip_comments=True):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to use to transform content to be used in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode
+
+    """
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode
+
+    """
+    linker = Linker(
+        callbacks=callbacks,
+        skip_tags=skip_tags,
+        parse_email=parse_email
+    )
+    return linker.linkify(text)
diff --git a/bleach/callbacks.py b/bleach/callbacks.py
new file mode 100644
index 0000000..d2ba101
--- /dev/null
+++ b/bleach/callbacks.py
@@ -0,0 +1,25 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+    href_key = (None, u'href')
+    if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    rel_key = (None, u'rel')
+    rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
+    if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append(u'nofollow')
+    attrs[rel_key] = u' '.join(rel_values)
+
+    return attrs
+
+
+def target_blank(attrs, new=False):
+    href_key = (None, u'href')
+    if attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    attrs[(None, u'target')] = u'_blank'
+    return attrs
diff --git a/bleach/encoding.py b/bleach/encoding.py
new file mode 100644
index 0000000..707adaa
--- /dev/null
+++ b/bleach/encoding.py
@@ -0,0 +1,62 @@
+import datetime
+from decimal import Decimal
+import types
+import six
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_unicode(strings_only=True).
+    """
+    return isinstance(obj, (
+        six.integer_types +
+        (types.NoneType,
+         datetime.datetime, datetime.date, datetime.time,
+         float, Decimal))
+    )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_text, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    # Handle the common case first, saves 30-40% when s is an instance of
+    # six.text_type. This function gets called often in that setting.
+    if isinstance(s, six.text_type):
+        return s
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, six.string_types):
+            if hasattr(s, '__unicode__'):
+                s = s.__unicode__()
+            else:
+                if six.PY3:
+                    if isinstance(s, bytes):
+                        s = six.text_type(s, encoding, errors)
+                    else:
+                        s = six.text_type(s)
+                else:
+                    s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s,
+            # encoding, errors), so that if s is a SafeBytes, it ends up being
+            # a SafeText at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError as e:
+        if not isinstance(s, Exception):
+            raise UnicodeDecodeError(*e.args)
+        else:
+            # If we get to here, the caller has passed in an Exception
+            # subclass populated with non-ASCII bytestring data without a
+            # working unicode method. Try to handle this without raising a
+            # further exception by individually forcing the exception args
+            # to unicode.
+            s = ' '.join([force_unicode(arg, encoding, strings_only,
+                          errors) for arg in s])
+    return s
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
new file mode 100644
index 0000000..fc346c3
--- /dev/null
+++ b/bleach/linkifier.py
@@ -0,0 +1,526 @@
+from __future__ import unicode_literals
+import re
+
+import html5lib
+from html5lib.filters.base import Filter
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
+
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.utils import alphabetize_attributes
+
+
+#: List of default callbacks
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+       xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+
+def build_url_re(tlds=TLDS, protocols=allowed_protocols):
+    """Builds the url regex used by linkifier
+
+   If you want a different set of tlds or allowed protocols, pass those in
+   and stomp on the existing ``url_re``::
+
+       from bleach import linkifier
+
+       my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+       linker = LinkifyFilter(url_re=my_url_re)
+
+    """
+    return re.compile(
+        r"""\(*  # Match any opening parentheses.
+        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+            # /path/zz (excluding "unsafe" chars from RFC 1738,
+            # except for # and ~, which happen in practice)
+        """.format('|'.join(protocols), '|'.join(tlds)),
+        re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+
+URL_RE = build_url_re()
+
+
+PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
+
+
+EMAIL_RE = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
+        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
+    """,
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+
+class Linker(object):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    """
+    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
+                 url_re=URL_RE, email_re=EMAIL_RE):
+        """Creates a Linker instance
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg list skip_tags: list of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``['pre']`` to skip
+            linkifying contents of ``pre`` tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg re url_re: url matching regex
+
+        :arg re email_re: email matching regex
+
+        :returns: linkified text as unicode
+
+        """
+        self.callbacks = callbacks
+        self.skip_tags = skip_tags
+        self.parse_email = parse_email
+        self.url_re = url_re
+        self.email_re = email_re
+
+        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
... 4103 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-bleach.git



More information about the Python-modules-commits mailing list