[Python-modules-commits] [python-webencodings] 01/03: import webencodings_0.5.orig.tar.gz
Barry Warsaw
barry at moszumanska.debian.org
Fri Nov 11 17:23:45 UTC 2016
This is an automated email from the git hooks/post-receive script.
barry pushed a commit to branch master
in repository python-webencodings.
commit 0c408cb8cc56d09db1c4378dc126d5ce63cb2068
Author: Barry Warsaw <barry at python.org>
Date: Fri Nov 11 12:01:20 2016 -0500
import webencodings_0.5.orig.tar.gz
---
PKG-INFO | 41 ++++
README.rst | 25 +++
setup.cfg | 12 +
setup.py | 36 +++
webencodings.egg-info/PKG-INFO | 41 ++++
webencodings.egg-info/SOURCES.txt | 12 +
webencodings.egg-info/dependency_links.txt | 1 +
webencodings.egg-info/top_level.txt | 1 +
webencodings/__init__.py | 342 +++++++++++++++++++++++++++++
webencodings/labels.py | 231 +++++++++++++++++++
webencodings/mklabels.py | 59 +++++
webencodings/tests.py | 153 +++++++++++++
webencodings/x_user_defined.py | 325 +++++++++++++++++++++++++++
13 files changed, 1279 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..f3f4b57
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,41 @@
+Metadata-Version: 1.1
+Name: webencodings
+Version: 0.5
+Summary: Character encoding aliases for legacy web content
+Home-page: https://github.com/SimonSapin/python-webencodings
+Author: Simon Sapin
+Author-email: simon.sapin at exyr.org
+License: BSD
+Description: python-webencodings
+ ===================
+
+ This is a Python implementation of the `WHATWG Encoding standard
+ <http://encoding.spec.whatwg.org/>`_.
+
+ * Latest documentation: http://packages.python.org/webencodings/
+ * Source code and issue tracker:
+ https://github.com/gsnedders/python-webencodings
+ * PyPI releases: http://pypi.python.org/pypi/webencodings
+ * License: BSD
+ * Python 2.6+ and 3.3+
+
+ In order to be compatible with legacy web content
+ when interpreting something like ``Content-Type: text/html; charset=latin1``,
+ tools need to use a particular set of aliases for encoding labels
+ as well as some overriding rules.
+ For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually
+ aliases for ``windows-1252``, and an UTF-8 or UTF-16 BOM takes precedence
+ over any other encoding declaration.
+ The Encoding standard defines all such details so that implementations do
+ not have to reverse-engineer each other.
+
+ This module has encoding labels and BOM detection,
+ but the actual implementation for encoders and decoders is Python’s.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..c7e0f0c
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,25 @@
+python-webencodings
+===================
+
+This is a Python implementation of the `WHATWG Encoding standard
+<http://encoding.spec.whatwg.org/>`_.
+
+* Latest documentation: http://packages.python.org/webencodings/
+* Source code and issue tracker:
+ https://github.com/gsnedders/python-webencodings
+* PyPI releases: http://pypi.python.org/pypi/webencodings
+* License: BSD
+* Python 2.6+ and 3.3+
+
+In order to be compatible with legacy web content
+when interpreting something like ``Content-Type: text/html; charset=latin1``,
+tools need to use a particular set of aliases for encoding labels
+as well as some overriding rules.
+For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually
+aliases for ``windows-1252``, and an UTF-8 or UTF-16 BOM takes precedence
+over any other encoding declaration.
+The Encoding standard defines all such details so that implementations do
+not have to reverse-engineer each other.
+
+This module has encoding labels and BOM detection,
+but the actual implementation for encoders and decoders is Python’s.
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..9d3a045
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,12 @@
+[build_sphinx]
+source-dir = docs
+build-dir = docs/_build
+
+[upload_sphinx]
+upload-dir = docs/_build/html
+
+[egg_info]
+tag_build =
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..541163a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,36 @@
+from setuptools import setup, find_packages
+import io
+from os import path
+import re
+
+
+VERSION = re.search("VERSION = '([^']+)'", io.open(
+ path.join(path.dirname(__file__), 'webencodings', '__init__.py'),
+ encoding='utf-8'
+).read().strip()).group(1)
+
+LONG_DESCRIPTION = io.open(
+ path.join(path.dirname(__file__), 'README.rst'),
+ encoding='utf-8'
+).read()
+
+
+setup(
+ name='webencodings',
+ version=VERSION,
+ url='https://github.com/SimonSapin/python-webencodings',
+ license='BSD',
+ author='Simon Sapin',
+ author_email='simon.sapin at exyr.org',
+ description='Character encoding aliases for legacy web content',
+ long_description=LONG_DESCRIPTION,
+ classifiers=[
+ 'Development Status :: 4 - Beta',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: BSD License',
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
+ 'Topic :: Internet :: WWW/HTTP',
+ ],
+ packages=find_packages(),
+)
diff --git a/webencodings.egg-info/PKG-INFO b/webencodings.egg-info/PKG-INFO
new file mode 100644
index 0000000..f3f4b57
--- /dev/null
+++ b/webencodings.egg-info/PKG-INFO
@@ -0,0 +1,41 @@
+Metadata-Version: 1.1
+Name: webencodings
+Version: 0.5
+Summary: Character encoding aliases for legacy web content
+Home-page: https://github.com/SimonSapin/python-webencodings
+Author: Simon Sapin
+Author-email: simon.sapin at exyr.org
+License: BSD
+Description: python-webencodings
+ ===================
+
+ This is a Python implementation of the `WHATWG Encoding standard
+ <http://encoding.spec.whatwg.org/>`_.
+
+ * Latest documentation: http://packages.python.org/webencodings/
+ * Source code and issue tracker:
+ https://github.com/gsnedders/python-webencodings
+ * PyPI releases: http://pypi.python.org/pypi/webencodings
+ * License: BSD
+ * Python 2.6+ and 3.3+
+
+ In order to be compatible with legacy web content
+ when interpreting something like ``Content-Type: text/html; charset=latin1``,
+ tools need to use a particular set of aliases for encoding labels
+ as well as some overriding rules.
+ For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually
+ aliases for ``windows-1252``, and an UTF-8 or UTF-16 BOM takes precedence
+ over any other encoding declaration.
+ The Encoding standard defines all such details so that implementations do
+ not have to reverse-engineer each other.
+
+ This module has encoding labels and BOM detection,
+ but the actual implementation for encoders and decoders is Python’s.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/webencodings.egg-info/SOURCES.txt b/webencodings.egg-info/SOURCES.txt
new file mode 100644
index 0000000..785d33e
--- /dev/null
+++ b/webencodings.egg-info/SOURCES.txt
@@ -0,0 +1,12 @@
+README.rst
+setup.cfg
+setup.py
+webencodings/__init__.py
+webencodings/labels.py
+webencodings/mklabels.py
+webencodings/tests.py
+webencodings/x_user_defined.py
+webencodings.egg-info/PKG-INFO
+webencodings.egg-info/SOURCES.txt
+webencodings.egg-info/dependency_links.txt
+webencodings.egg-info/top_level.txt
\ No newline at end of file
diff --git a/webencodings.egg-info/dependency_links.txt b/webencodings.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/webencodings.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/webencodings.egg-info/top_level.txt b/webencodings.egg-info/top_level.txt
new file mode 100644
index 0000000..be8fcb7
--- /dev/null
+++ b/webencodings.egg-info/top_level.txt
@@ -0,0 +1 @@
+webencodings
diff --git a/webencodings/__init__.py b/webencodings/__init__.py
new file mode 100644
index 0000000..03d5d35
--- /dev/null
+++ b/webencodings/__init__.py
@@ -0,0 +1,342 @@
+# coding: utf8
+"""
+
+ webencodings
+ ~~~~~~~~~~~~
+
+ This is a Python implementation of the `WHATWG Encoding standard
+ <http://encoding.spec.whatwg.org/>`. See README for details.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+from .labels import LABELS
+
+
+VERSION = '0.5'
+
+
+# Some names in Encoding are not valid Python aliases. Remap these.
+PYTHON_NAMES = {
+ 'iso-8859-8-i': 'iso-8859-8',
+ 'x-mac-cyrillic': 'mac-cyrillic',
+ 'macintosh': 'mac-roman',
+ 'windows-874': 'cp874'}
+
+CACHE = {}
+
+
+def ascii_lower(string):
+ r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
+
+ :param string: An Unicode string.
+ :returns: A new Unicode string.
+
+ This is used for `ASCII case-insensitive
+ <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
+ matching of encoding labels.
+ The same matching is also used, among other things,
+ for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
+
+ This is different from the :meth:`~py:str.lower` method of Unicode strings
+ which also affect non-ASCII characters,
+ sometimes mapping them into the ASCII range:
+
+ >>> keyword = u'Bac\N{KELVIN SIGN}ground'
+ >>> assert keyword.lower() == u'background'
+ >>> assert ascii_lower(keyword) != keyword.lower()
+ >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
+
+ """
+ # This turns out to be faster than unicode.translate()
+ return string.encode('utf8').lower().decode('utf8')
+
+
+def lookup(label):
+ """
+ Look for an encoding by its label.
+ This is the spec’s `get an encoding
+ <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
+ Supported labels are listed there.
+
+ :param label: A string.
+ :returns:
+ An :class:`Encoding` object, or :obj:`None` for an unknown label.
+
+ """
+ # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
+ label = ascii_lower(label.strip('\t\n\f\r '))
+ name = LABELS.get(label)
+ if name is None:
+ return None
+ encoding = CACHE.get(name)
+ if encoding is None:
+ if name == 'x-user-defined':
+ from .x_user_defined import codec_info
+ else:
+ python_name = PYTHON_NAMES.get(name, name)
+ # Any python_name value that gets to here should be valid.
+ codec_info = codecs.lookup(python_name)
+ encoding = Encoding(name, codec_info)
+ CACHE[name] = encoding
+ return encoding
+
+
+def _get_encoding(encoding_or_label):
+ """
+ Accept either an encoding object or label.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :returns: An :class:`Encoding` object.
+ :raises: :exc:`~exceptions.LookupError` for an unknown label.
+
+ """
+ if hasattr(encoding_or_label, 'codec_info'):
+ return encoding_or_label
+
+ encoding = lookup(encoding_or_label)
+ if encoding is None:
+ raise LookupError('Unknown encoding label: %r' % encoding_or_label)
+ return encoding
+
+
+class Encoding(object):
+ """Reresents a character encoding such as UTF-8,
+ that can be used for decoding or encoding.
+
+ .. attribute:: name
+
+ Canonical name of the encoding
+
+ .. attribute:: codec_info
+
+ The actual implementation of the encoding,
+ a stdlib :class:`~codecs.CodecInfo` object.
+ See :func:`codecs.register`.
+
+ """
+ def __init__(self, name, codec_info):
+ self.name = name
+ self.codec_info = codec_info
+
+ def __repr__(self):
+ return '<Encoding %s>' % self.name
+
+
+#: The UTF-8 encoding. Should be used for new content and formats.
+UTF8 = lookup('utf-8')
+
+_UTF16LE = lookup('utf-16le')
+_UTF16BE = lookup('utf-16be')
+
+
+def decode(input, fallback_encoding, errors='replace'):
+ """
+ Decode a single string.
+
+ :param input: A byte string
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return:
+ A ``(output, encoding)`` tuple of an Unicode string
+ and an :obj:`Encoding`.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ fallback_encoding = _get_encoding(fallback_encoding)
+ bom_encoding, input = _detect_bom(input)
+ encoding = bom_encoding or fallback_encoding
+ return encoding.codec_info.decode(input, errors)[0], encoding
+
+
+def _detect_bom(input):
+ """Return (bom_encoding, input), with any BOM removed from the input."""
+ if input.startswith(b'\xFF\xFE'):
+ return _UTF16LE, input[2:]
+ if input.startswith(b'\xFE\xFF'):
+ return _UTF16BE, input[2:]
+ if input.startswith(b'\xEF\xBB\xBF'):
+ return UTF8, input[3:]
+ return None, input
+
+
+def encode(input, encoding=UTF8, errors='strict'):
+ """
+ Encode a single string.
+
+ :param input: An Unicode string.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return: A byte string.
+
+ """
+ return _get_encoding(encoding).codec_info.encode(input, errors)[0]
+
+
+def iter_decode(input, fallback_encoding, errors='replace'):
+ """
+ "Pull"-based decoder.
+
+ :param input:
+ An iterable of byte strings.
+
+ The input is first consumed just enough to determine the encoding
+ based on the precense of a BOM,
+ then consumed on demand when the return value is.
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns:
+ An ``(output, encoding)`` tuple.
+ :obj:`output` is an iterable of Unicode strings,
+ :obj:`encoding` is the :obj:`Encoding` that is being used.
+
+ """
+
+ decoder = IncrementalDecoder(fallback_encoding, errors)
+ generator = _iter_decode_generator(input, decoder)
+ encoding = next(generator)
+ return generator, encoding
+
+
+def _iter_decode_generator(input, decoder):
+ """Return a generator that first yields the :obj:`Encoding`,
+ then yields output chukns as Unicode strings.
+
+ """
+ decode = decoder.decode
+ input = iter(input)
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ yield output
+ break
+ else:
+ # Input exhausted without determining the encoding
+ output = decode(b'', final=True)
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ if output:
+ yield output
+ return
+
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ yield output
+ output = decode(b'', final=True)
+ if output:
+ yield output
+
+
+def iter_encode(input, encoding=UTF8, errors='strict'):
+ """
+ “Pull”-based encoder.
+
+ :param input: An iterable of Unicode strings.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns: An iterable of byte strings.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ encode = IncrementalEncoder(encoding, errors).encode
+ return _iter_encode_generator(input, encode)
+
+
+def _iter_encode_generator(input, encode):
+ for chunck in input:
+ output = encode(chunck)
+ if output:
+ yield output
+ output = encode('', final=True)
+ if output:
+ yield output
+
+
+class IncrementalDecoder(object):
+ """
+ “Push”-based decoder.
+
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ """
+ def __init__(self, fallback_encoding, errors='replace'):
+ # Fail early if `encoding` is an invalid label.
+ self._fallback_encoding = _get_encoding(fallback_encoding)
+ self._errors = errors
+ self._buffer = b''
+ self._decoder = None
+ #: The actual :class:`Encoding` that is being used,
+ #: or :obj:`None` if that is not determined yet.
+ #: (Ie. if there is not enough input yet to determine
+ #: if there is a BOM.)
+ self.encoding = None # Not known yet.
+
+ def decode(self, input, final=False):
+ """Decode one chunk of the input.
+
+ :param input: A byte string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: An Unicode string.
+
+ """
+ decoder = self._decoder
+ if decoder is not None:
+ return decoder(input, final)
+
+ input = self._buffer + input
+ encoding, input = _detect_bom(input)
+ if encoding is None:
+ if len(input) < 3 and not final: # Not enough data yet.
+ self._buffer = input
+ return ''
+ else: # No BOM
+ encoding = self._fallback_encoding
+ decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
+ self._decoder = decoder
+ self.encoding = encoding
+ return decoder(input, final)
+
+
+class IncrementalEncoder(object):
+ """
+ “Push”-based encoder.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ .. method:: encode(input, final=False)
+
+ :param input: An Unicode string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: A byte string.
+
+ """
+ def __init__(self, encoding=UTF8, errors='strict'):
+ encoding = _get_encoding(encoding)
+ self.encode = encoding.codec_info.incrementalencoder(errors).encode
diff --git a/webencodings/labels.py b/webencodings/labels.py
new file mode 100644
index 0000000..29cbf91
--- /dev/null
+++ b/webencodings/labels.py
@@ -0,0 +1,231 @@
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+ 'unicode-1-1-utf-8': 'utf-8',
+ 'utf-8': 'utf-8',
+ 'utf8': 'utf-8',
+ '866': 'ibm866',
+ 'cp866': 'ibm866',
+ 'csibm866': 'ibm866',
+ 'ibm866': 'ibm866',
+ 'csisolatin2': 'iso-8859-2',
+ 'iso-8859-2': 'iso-8859-2',
+ 'iso-ir-101': 'iso-8859-2',
+ 'iso8859-2': 'iso-8859-2',
+ 'iso88592': 'iso-8859-2',
+ 'iso_8859-2': 'iso-8859-2',
+ 'iso_8859-2:1987': 'iso-8859-2',
+ 'l2': 'iso-8859-2',
+ 'latin2': 'iso-8859-2',
+ 'csisolatin3': 'iso-8859-3',
+ 'iso-8859-3': 'iso-8859-3',
+ 'iso-ir-109': 'iso-8859-3',
+ 'iso8859-3': 'iso-8859-3',
+ 'iso88593': 'iso-8859-3',
+ 'iso_8859-3': 'iso-8859-3',
+ 'iso_8859-3:1988': 'iso-8859-3',
+ 'l3': 'iso-8859-3',
+ 'latin3': 'iso-8859-3',
+ 'csisolatin4': 'iso-8859-4',
+ 'iso-8859-4': 'iso-8859-4',
+ 'iso-ir-110': 'iso-8859-4',
+ 'iso8859-4': 'iso-8859-4',
+ 'iso88594': 'iso-8859-4',
+ 'iso_8859-4': 'iso-8859-4',
+ 'iso_8859-4:1988': 'iso-8859-4',
+ 'l4': 'iso-8859-4',
+ 'latin4': 'iso-8859-4',
+ 'csisolatincyrillic': 'iso-8859-5',
+ 'cyrillic': 'iso-8859-5',
+ 'iso-8859-5': 'iso-8859-5',
+ 'iso-ir-144': 'iso-8859-5',
+ 'iso8859-5': 'iso-8859-5',
+ 'iso88595': 'iso-8859-5',
+ 'iso_8859-5': 'iso-8859-5',
+ 'iso_8859-5:1988': 'iso-8859-5',
+ 'arabic': 'iso-8859-6',
+ 'asmo-708': 'iso-8859-6',
+ 'csiso88596e': 'iso-8859-6',
+ 'csiso88596i': 'iso-8859-6',
+ 'csisolatinarabic': 'iso-8859-6',
+ 'ecma-114': 'iso-8859-6',
+ 'iso-8859-6': 'iso-8859-6',
+ 'iso-8859-6-e': 'iso-8859-6',
+ 'iso-8859-6-i': 'iso-8859-6',
+ 'iso-ir-127': 'iso-8859-6',
+ 'iso8859-6': 'iso-8859-6',
+ 'iso88596': 'iso-8859-6',
+ 'iso_8859-6': 'iso-8859-6',
+ 'iso_8859-6:1987': 'iso-8859-6',
+ 'csisolatingreek': 'iso-8859-7',
+ 'ecma-118': 'iso-8859-7',
+ 'elot_928': 'iso-8859-7',
+ 'greek': 'iso-8859-7',
+ 'greek8': 'iso-8859-7',
+ 'iso-8859-7': 'iso-8859-7',
+ 'iso-ir-126': 'iso-8859-7',
+ 'iso8859-7': 'iso-8859-7',
+ 'iso88597': 'iso-8859-7',
+ 'iso_8859-7': 'iso-8859-7',
+ 'iso_8859-7:1987': 'iso-8859-7',
+ 'sun_eu_greek': 'iso-8859-7',
+ 'csiso88598e': 'iso-8859-8',
+ 'csisolatinhebrew': 'iso-8859-8',
+ 'hebrew': 'iso-8859-8',
+ 'iso-8859-8': 'iso-8859-8',
+ 'iso-8859-8-e': 'iso-8859-8',
+ 'iso-ir-138': 'iso-8859-8',
+ 'iso8859-8': 'iso-8859-8',
+ 'iso88598': 'iso-8859-8',
+ 'iso_8859-8': 'iso-8859-8',
+ 'iso_8859-8:1988': 'iso-8859-8',
+ 'visual': 'iso-8859-8',
+ 'csiso88598i': 'iso-8859-8-i',
+ 'iso-8859-8-i': 'iso-8859-8-i',
+ 'logical': 'iso-8859-8-i',
+ 'csisolatin6': 'iso-8859-10',
+ 'iso-8859-10': 'iso-8859-10',
+ 'iso-ir-157': 'iso-8859-10',
+ 'iso8859-10': 'iso-8859-10',
+ 'iso885910': 'iso-8859-10',
+ 'l6': 'iso-8859-10',
+ 'latin6': 'iso-8859-10',
+ 'iso-8859-13': 'iso-8859-13',
+ 'iso8859-13': 'iso-8859-13',
+ 'iso885913': 'iso-8859-13',
+ 'iso-8859-14': 'iso-8859-14',
+ 'iso8859-14': 'iso-8859-14',
+ 'iso885914': 'iso-8859-14',
+ 'csisolatin9': 'iso-8859-15',
+ 'iso-8859-15': 'iso-8859-15',
+ 'iso8859-15': 'iso-8859-15',
+ 'iso885915': 'iso-8859-15',
+ 'iso_8859-15': 'iso-8859-15',
+ 'l9': 'iso-8859-15',
+ 'iso-8859-16': 'iso-8859-16',
+ 'cskoi8r': 'koi8-r',
+ 'koi': 'koi8-r',
+ 'koi8': 'koi8-r',
+ 'koi8-r': 'koi8-r',
+ 'koi8_r': 'koi8-r',
+ 'koi8-u': 'koi8-u',
+ 'csmacintosh': 'macintosh',
+ 'mac': 'macintosh',
+ 'macintosh': 'macintosh',
+ 'x-mac-roman': 'macintosh',
+ 'dos-874': 'windows-874',
+ 'iso-8859-11': 'windows-874',
+ 'iso8859-11': 'windows-874',
+ 'iso885911': 'windows-874',
+ 'tis-620': 'windows-874',
+ 'windows-874': 'windows-874',
+ 'cp1250': 'windows-1250',
+ 'windows-1250': 'windows-1250',
+ 'x-cp1250': 'windows-1250',
+ 'cp1251': 'windows-1251',
+ 'windows-1251': 'windows-1251',
+ 'x-cp1251': 'windows-1251',
+ 'ansi_x3.4-1968': 'windows-1252',
+ 'ascii': 'windows-1252',
+ 'cp1252': 'windows-1252',
+ 'cp819': 'windows-1252',
+ 'csisolatin1': 'windows-1252',
+ 'ibm819': 'windows-1252',
+ 'iso-8859-1': 'windows-1252',
+ 'iso-ir-100': 'windows-1252',
+ 'iso8859-1': 'windows-1252',
+ 'iso88591': 'windows-1252',
+ 'iso_8859-1': 'windows-1252',
+ 'iso_8859-1:1987': 'windows-1252',
+ 'l1': 'windows-1252',
+ 'latin1': 'windows-1252',
+ 'us-ascii': 'windows-1252',
+ 'windows-1252': 'windows-1252',
+ 'x-cp1252': 'windows-1252',
+ 'cp1253': 'windows-1253',
+ 'windows-1253': 'windows-1253',
+ 'x-cp1253': 'windows-1253',
+ 'cp1254': 'windows-1254',
+ 'csisolatin5': 'windows-1254',
+ 'iso-8859-9': 'windows-1254',
+ 'iso-ir-148': 'windows-1254',
+ 'iso8859-9': 'windows-1254',
+ 'iso88599': 'windows-1254',
+ 'iso_8859-9': 'windows-1254',
+ 'iso_8859-9:1989': 'windows-1254',
+ 'l5': 'windows-1254',
+ 'latin5': 'windows-1254',
+ 'windows-1254': 'windows-1254',
+ 'x-cp1254': 'windows-1254',
+ 'cp1255': 'windows-1255',
+ 'windows-1255': 'windows-1255',
+ 'x-cp1255': 'windows-1255',
+ 'cp1256': 'windows-1256',
+ 'windows-1256': 'windows-1256',
+ 'x-cp1256': 'windows-1256',
+ 'cp1257': 'windows-1257',
+ 'windows-1257': 'windows-1257',
+ 'x-cp1257': 'windows-1257',
+ 'cp1258': 'windows-1258',
+ 'windows-1258': 'windows-1258',
+ 'x-cp1258': 'windows-1258',
+ 'x-mac-cyrillic': 'x-mac-cyrillic',
+ 'x-mac-ukrainian': 'x-mac-cyrillic',
+ 'chinese': 'gbk',
+ 'csgb2312': 'gbk',
+ 'csiso58gb231280': 'gbk',
+ 'gb2312': 'gbk',
+ 'gb_2312': 'gbk',
+ 'gb_2312-80': 'gbk',
+ 'gbk': 'gbk',
+ 'iso-ir-58': 'gbk',
+ 'x-gbk': 'gbk',
+ 'gb18030': 'gb18030',
+ 'hz-gb-2312': 'hz-gb-2312',
+ 'big5': 'big5',
+ 'big5-hkscs': 'big5',
+ 'cn-big5': 'big5',
+ 'csbig5': 'big5',
+ 'x-x-big5': 'big5',
+ 'cseucpkdfmtjapanese': 'euc-jp',
+ 'euc-jp': 'euc-jp',
+ 'x-euc-jp': 'euc-jp',
+ 'csiso2022jp': 'iso-2022-jp',
+ 'iso-2022-jp': 'iso-2022-jp',
+ 'csshiftjis': 'shift_jis',
+ 'ms_kanji': 'shift_jis',
+ 'shift-jis': 'shift_jis',
+ 'shift_jis': 'shift_jis',
+ 'sjis': 'shift_jis',
+ 'windows-31j': 'shift_jis',
+ 'x-sjis': 'shift_jis',
+ 'cseuckr': 'euc-kr',
+ 'csksc56011987': 'euc-kr',
+ 'euc-kr': 'euc-kr',
+ 'iso-ir-149': 'euc-kr',
+ 'korean': 'euc-kr',
+ 'ks_c_5601-1987': 'euc-kr',
+ 'ks_c_5601-1989': 'euc-kr',
+ 'ksc5601': 'euc-kr',
+ 'ksc_5601': 'euc-kr',
+ 'windows-949': 'euc-kr',
+ 'csiso2022kr': 'iso-2022-kr',
+ 'iso-2022-kr': 'iso-2022-kr',
+ 'utf-16be': 'utf-16be',
+ 'utf-16': 'utf-16le',
+ 'utf-16le': 'utf-16le',
+ 'x-user-defined': 'x-user-defined',
+}
diff --git a/webencodings/mklabels.py b/webencodings/mklabels.py
new file mode 100644
index 0000000..295dc92
--- /dev/null
+++ b/webencodings/mklabels.py
@@ -0,0 +1,59 @@
+"""
+
+ webencodings.mklabels
+ ~~~~~~~~~~~~~~~~~~~~~
+
+ Regenarate the webencodings.labels module.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+import json
+try:
+ from urllib import urlopen
+except ImportError:
+ from urllib.request import urlopen
+
+
+def assert_lower(string):
+ assert string == string.lower()
+ return string
+
+
+def generate(url):
+ parts = ['''\
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+''']
+ labels = [
+ (repr(assert_lower(label)).lstrip('u'),
+ repr(encoding['name']).lstrip('u'))
+ for category in json.loads(urlopen(url).read().decode('ascii'))
+ for encoding in category['encodings']
+ for label in encoding['labels']]
+ max_len = max(len(label) for label, name in labels)
+ parts.extend(
+ ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
+ for label, name in labels)
+ parts.append('}')
+ return ''.join(parts)
+
+
+if __name__ == '__main__':
+ print(generate('http://encoding.spec.whatwg.org/encodings.json'))
diff --git a/webencodings/tests.py b/webencodings/tests.py
new file mode 100644
index 0000000..b8c5653
--- /dev/null
+++ b/webencodings/tests.py
@@ -0,0 +1,153 @@
+# coding: utf8
+"""
+
+ webencodings.tests
+ ~~~~~~~~~~~~~~~~~~
+
+ A basic test suite for Encoding.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
+ IncrementalDecoder, IncrementalEncoder, UTF8)
+
+
+def assert_raises(exception, function, *args, **kwargs):
+ try:
+ function(*args, **kwargs)
+ except exception:
+ return
+ else: # pragma: no cover
+ raise AssertionError('Did not raise %s.' % exception)
+
+
+def test_labels():
+ assert lookup('utf-8').name == 'utf-8'
+ assert lookup('Utf-8').name == 'utf-8'
+ assert lookup('UTF-8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8 ').name == 'utf-8'
+ assert lookup(' \r\nutf8\t').name == 'utf-8'
+ assert lookup('u8') is None # Python label.
+ assert lookup('utf-8 ') is None # Non-ASCII white space.
+
+ assert lookup('US-ASCII').name == 'windows-1252'
+ assert lookup('iso-8859-1').name == 'windows-1252'
+ assert lookup('latin1').name == 'windows-1252'
+ assert lookup('LATIN1').name == 'windows-1252'
+ assert lookup('latin-1') is None
+ assert lookup('LATİN1') is None # ASCII-only case insensitivity.
+
+
+def test_all_labels():
+ for label in LABELS:
+ assert decode(b'', label) == ('', lookup(label))
+ assert encode('', label) == b''
+ for repeat in [0, 1, 12]:
+ output, _ = iter_decode([b''] * repeat, label)
+ assert list(output) == []
+ assert list(iter_encode([''] * repeat, label)) == []
+ decoder = IncrementalDecoder(label)
+ assert decoder.decode(b'') == ''
+ assert decoder.decode(b'', final=True) == ''
+ encoder = IncrementalEncoder(label)
+ assert encoder.encode('') == b''
+ assert encoder.encode('', final=True) == b''
+ # All encoding names are valid labels too:
+ for name in set(LABELS.values()):
+ assert lookup(name).name == name
+
+
+def test_invalid_label():
+ assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
+ assert_raises(LookupError, encode, 'é', 'invalid')
+ assert_raises(LookupError, iter_decode, [], 'invalid')
+ assert_raises(LookupError, iter_encode, [], 'invalid')
+ assert_raises(LookupError, IncrementalDecoder, 'invalid')
+ assert_raises(LookupError, IncrementalEncoder, 'invalid')
+
+
+def test_decode():
+ assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
+ assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
+ assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
+ assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM
+
+ assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM
+ assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM
+ assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
+
+ assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
+ assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
+ assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
+
+ assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
+ assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
+
+
+def test_encode():
+ assert encode('é', 'latin1') == b'\xe9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf-16') == b'\xe9\x00'
+ assert encode('é', 'utf-16le') == b'\xe9\x00'
+ assert encode('é', 'utf-16be') == b'\x00\xe9'
+
... 379 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-webencodings.git
More information about the Python-modules-commits
mailing list