[Python-modules-commits] [python-w3lib] 01/04: Import python-w3lib_1.17.0.orig.tar.gz

Michael Fladischer fladi at moszumanska.debian.org
Fri Feb 17 08:24:21 UTC 2017


This is an automated email from the git hooks/post-receive script.

fladi pushed a commit to branch master
in repository python-w3lib.

commit 40a898e1e6c99ff7e5fcce1dd1228b4ce0ffdd02
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date:   Fri Feb 17 08:55:09 2017 +0100

    Import python-w3lib_1.17.0.orig.tar.gz
---
 MANIFEST.in                                        |   2 +-
 PKG-INFO                                           |   4 +-
 setup.cfg                                          |   1 -
 setup.py                                           |   4 +-
 tests/__init__.pyc                                 | Bin 136 -> 0 bytes
 .../test_encoding.cpython-27-PYTEST.pyc            | Bin 13289 -> 0 bytes
 tests/__pycache__/test_form.cpython-27-PYTEST.pyc  | Bin 2549 -> 0 bytes
 tests/__pycache__/test_html.cpython-27-PYTEST.pyc  | Bin 40896 -> 0 bytes
 tests/__pycache__/test_http.cpython-27-PYTEST.pyc  | Bin 3230 -> 0 bytes
 tests/__pycache__/test_url.cpython-27-PYTEST.pyc   | Bin 31802 -> 0 bytes
 tests/test_http.py                                 |   6 +-
 tests/test_url.py                                  |  92 ++++++++++++++++++-
 w3lib.egg-info/PKG-INFO                            |   4 +-
 w3lib.egg-info/SOURCES.txt                         |   6 --
 w3lib/__init__.py                                  |   2 +-
 w3lib/html.py                                      |  19 ++++
 w3lib/http.py                                      |  20 +++--
 w3lib/url.py                                       |  99 +++++++++++++++++++++
 18 files changed, 238 insertions(+), 21 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 9b58344..fb4a5a7 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,3 @@
 # Include tests into distribution
-recursive-include tests *
+recursive-include tests *.py *.txt
 
diff --git a/PKG-INFO b/PKG-INFO
index c17a298..98ac7ae 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: w3lib
-Version: 1.16.0
+Version: 1.17.0
 Summary: Library of web-related functions
 Home-page: https://github.com/scrapy/w3lib
 Author: Scrapy project
@@ -17,6 +17,8 @@ Classifier: Programming Language :: Python :: 2.7
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.3
 Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/setup.cfg b/setup.cfg
index 6f08d0e..adf5ed7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,5 +4,4 @@ universal = 1
 [egg_info]
 tag_build = 
 tag_date = 0
-tag_svn_revision = 0
 
diff --git a/setup.py b/setup.py
index e356577..b9937dd 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='w3lib',
-    version='1.16.0',
+    version='1.17.0',
     license='BSD',
     description='Library of web-related functions',
     author='Scrapy project',
@@ -23,6 +23,8 @@ setup(
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
diff --git a/tests/__init__.pyc b/tests/__init__.pyc
deleted file mode 100644
index ee8ba29..0000000
Binary files a/tests/__init__.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
deleted file mode 100644
index 3634a7a..0000000
Binary files a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc
deleted file mode 100644
index 5065298..0000000
Binary files a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc
deleted file mode 100644
index 085bcaa..0000000
Binary files a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc
deleted file mode 100644
index 380552b..0000000
Binary files a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc
deleted file mode 100644
index dfb6551..0000000
Binary files a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/test_http.py b/tests/test_http.py
index 6ce53ca..453624f 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -19,8 +19,10 @@ class HttpTests(unittest.TestCase):
         self.assertIsNone(headers_dict_to_raw(None))
 
     def test_headers_raw_to_dict(self):
-        raw = b"Content-type: text/html\n\rAccept: gzip\n\n"
-        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
+        raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
+                Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
+        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], 
+               b'Cache-Control': [b'no-cache', b'no-store']}
         self.assertEqual(headers_raw_to_dict(raw), dct)
 
     def test_headers_dict_to_raw(self):
diff --git a/tests/test_url.py b/tests/test_url.py
index 99ceb0f..9bb1ea4 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -4,8 +4,8 @@ import os
 import unittest
 from w3lib.url import (is_url, safe_url_string, safe_download_url,
     url_query_parameter, add_or_replace_parameter, url_query_cleaner,
-    file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
-    canonicalize_url, parse_url)
+    file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
+    urljoin_rfc, canonicalize_url, parse_url)
 from six.moves.urllib.parse import urlparse
 
 
@@ -574,6 +574,94 @@ class CanonicalizeUrlTest(unittest.TestCase):
                     label=u"example"*11))
 
 
+class DataURITests(unittest.TestCase):
+
+    def test_default_mediatype_charset(self):
+        result = parse_data_uri("data:,A%20brief%20note")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters, {"charset": "US-ASCII"})
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_text_uri(self):
+        result = parse_data_uri(u"data:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_bytes_uri(self):
+        result = parse_data_uri(b"data:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_unicode_uri(self):
+        result = parse_data_uri(u"data:,é")
+        self.assertEqual(result.data, u"é".encode('utf-8'))
+
+    def test_default_mediatype(self):
+        result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "iso-8859-7"})
+        self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+    def test_text_charset(self):
+        result = parse_data_uri("data:text/plain;charset=iso-8859-7,%be%d3%be")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "iso-8859-7"})
+        self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+    def test_mediatype_parameters(self):
+        result = parse_data_uri('data:text/plain;'
+                                'foo=%22foo;bar%5C%22%22;'
+                                'charset=utf-8;'
+                                'bar=%22foo;%5C%22foo%20;/%20,%22,'
+                                '%CE%8E%CE%A3%CE%8E')
+
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "utf-8",
+                          "foo": 'foo;bar"',
+                          "bar": 'foo;"foo ;/ ,'})
+        self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
+
+    def test_base64(self):
+        result = parse_data_uri("data:text/plain;base64,"
+                                "SGVsbG8sIHdvcmxkLg%3D%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+
+    def test_base64_spaces(self):
+        result = parse_data_uri("data:text/plain;base64,SGVsb%20G8sIH%0A%20%20"
+                                "dvcm%20%20%20xk%20Lg%3D%0A%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+
+        result = parse_data_uri("data:text/plain;base64,SGVsb G8sIH\n  "
+                                "dvcm   xk Lg%3D\n%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+
+    def test_wrong_base64_param(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D")
+
+    def test_missing_comma(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("data:A%20brief%20note")
+
+    def test_missing_scheme(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("text/plain,A%20brief%20note")
+
+    def test_wrong_scheme(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("http://example.com/")
+
+    def test_scheme_case_insensitive(self):
+        result = parse_data_uri("DATA:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+        result = parse_data_uri("DaTa:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+
 if __name__ == "__main__":
     unittest.main()
 
diff --git a/w3lib.egg-info/PKG-INFO b/w3lib.egg-info/PKG-INFO
index c17a298..98ac7ae 100644
--- a/w3lib.egg-info/PKG-INFO
+++ b/w3lib.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: w3lib
-Version: 1.16.0
+Version: 1.17.0
 Summary: Library of web-related functions
 Home-page: https://github.com/scrapy/w3lib
 Author: Scrapy project
@@ -17,6 +17,8 @@ Classifier: Programming Language :: Python :: 2.7
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.3
 Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/w3lib.egg-info/SOURCES.txt b/w3lib.egg-info/SOURCES.txt
index e224c60..afc648b 100644
--- a/w3lib.egg-info/SOURCES.txt
+++ b/w3lib.egg-info/SOURCES.txt
@@ -3,18 +3,12 @@ README.rst
 setup.cfg
 setup.py
 tests/__init__.py
-tests/__init__.pyc
 tests/py3-ignores.txt
 tests/test_encoding.py
 tests/test_form.py
 tests/test_html.py
 tests/test_http.py
 tests/test_url.py
-tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
-tests/__pycache__/test_form.cpython-27-PYTEST.pyc
-tests/__pycache__/test_html.cpython-27-PYTEST.pyc
-tests/__pycache__/test_http.cpython-27-PYTEST.pyc
-tests/__pycache__/test_url.cpython-27-PYTEST.pyc
 w3lib/__init__.py
 w3lib/encoding.py
 w3lib/form.py
diff --git a/w3lib/__init__.py b/w3lib/__init__.py
index e118a92..0a4c374 100644
--- a/w3lib/__init__.py
+++ b/w3lib/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "1.16.0"
+__version__ = "1.17.0"
 version_info = tuple(int(v) if v.isdigit() else v
                      for v in __version__.split('.'))
diff --git a/w3lib/html.py b/w3lib/html.py
index 24d01a5..9990a35 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -17,6 +17,9 @@ _baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*
 _meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
 _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
 
+HTML5_WHITESPACE = ' \t\n\r\x0c'
+
+
 def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
     r"""
 
@@ -317,3 +320,19 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
         return interval, url
     else:
         return None, None
+
+
+def strip_html5_whitespace(text):
+    r"""
+    Strip all leading and trailing space characters (as defined in
+    https://www.w3.org/TR/html5/infrastructure.html#space-character).
+
+    Such stripping is useful e.g. for processing HTML element attributes which
+    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
+    defines them as "valid URL potentially surrounded by spaces"
+    or "valid non-empty URL potentially surrounded by spaces".
+
+    >>> strip_html5_whitespace(' hello\n')
+    'hello'
+    """
+    return text.strip(HTML5_WHITESPACE)
diff --git a/w3lib/http.py b/w3lib/http.py
index 8c5dfed..accfb5d 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -29,11 +29,21 @@ def headers_raw_to_dict(headers_raw):
         return None
     headers = headers_raw.splitlines()
     headers_tuples = [header.split(b':', 1) for header in headers]
-    return dict([
-        (header_item[0].strip(), [header_item[1].strip()])
-        for header_item in headers_tuples
-        if len(header_item) == 2
-    ])
+
+    result_dict = {}
+    for header_item in headers_tuples:
+        if not len(header_item) == 2:
+            continue
+
+        item_key = header_item[0].strip()
+        item_value = header_item[1].strip()
+
+        if item_key in result_dict:
+            result_dict[item_key].append(item_value)
+        else:
+            result_dict[item_key] = [item_value]
+
+    return result_dict
 
 
 def headers_dict_to_raw(headers_dict):
diff --git a/w3lib/url.py b/w3lib/url.py
index 8d58c91..ef3189d 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -2,12 +2,14 @@
 This module contains general purpose URL functions not found in the standard
 library.
 """
+import base64
 import codecs
 import os
 import re
 import posixpath
 import warnings
 import six
+from collections import namedtuple
 from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
                                     urldefrag, urlencode, urlparse,
                                     quote, parse_qs, parse_qsl,
@@ -285,11 +287,108 @@ def any_to_uri(uri_or_path):
     return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
 
 
+# ASCII characters 0-126 (range(127) leaves out DEL, 0x7F).
+_char = set(map(chr, range(127)))
+
+# RFC 2045 token.
+_token = r'[{}]+'.format(re.escape(''.join(_char -
+                                           # Control characters.
+                                           set(map(chr, range(0, 32))) -
+                                           # tspecials and space.
+                                           set('()<>@,;:\\"/[]?= '))))
+
+# RFC 822 quoted-string, without surrounding quotation marks.
+_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
+    re.escape(''.join(_char - {'"', '\\', '\r'})),
+    re.escape(''.join(_char))
+)
+
+# Encode the regular expression strings to make them into bytes, as Python 3
+# bytes have no format() method, but bytes must be passed to re.compile() in
+# order to make a pattern object that can be used to match on bytes.
+
+# RFC 2397 mediatype.
+_mediatype_pattern = re.compile(
+    r'{token}/{token}'.format(token=_token).encode()
+)
+_mediatype_parameter_pattern = re.compile(
+    r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
+                                                    quoted=_quoted_string
+                                                    ).encode()
+)
+
+_ParseDataURIResult = namedtuple("ParseDataURIResult",
+                                 "media_type media_type_parameters data")
+
+def parse_data_uri(uri):
+    """
+
+    Parse a data: URI, returning a named 3-tuple of media type, dictionary of
+    media type parameters, and data (as bytes).
+
+    """
+
+    if not isinstance(uri, bytes):
+        uri = safe_url_string(uri).encode('ascii')
+
+    try:
+        scheme, uri = uri.split(b':', 1)
+    except ValueError:
+        raise ValueError("invalid URI")
+    if scheme.lower() != b'data':
+        raise ValueError("not a data URI")
+
+    # RFC 3986 section 2.1 allows percent encoding to escape characters that
+    # would be interpreted as delimiters, implying that actual delimiters
+    # should not be percent-encoded.
+    # Decoding before parsing will allow malformed URIs with percent-encoded
+    # delimiters, but it makes parsing easier and should not affect
+    # well-formed URIs, as the delimiters used in this URI scheme are not
+    # allowed, percent-encoded or not, in tokens.
+    if six.PY2:
+        uri = unquote(uri)
+    else:
+        uri = unquote_to_bytes(uri)
+
+    media_type = "text/plain"
+    media_type_params = {}
+
+    m = _mediatype_pattern.match(uri)
+    if m:
+        media_type = m.group().decode()
+        uri = uri[m.end():]
+    else:
+        media_type_params['charset'] = "US-ASCII"
+
+    while True:
+        m = _mediatype_parameter_pattern.match(uri)
+        if m:
+            attribute, value, value_quoted = m.groups()
+            if value_quoted:
+                value = re.sub(br'\\(.)', r'\1', value_quoted)
+            media_type_params[attribute.decode()] = value.decode()
+            uri = uri[m.end():]
+        else:
+            break
+
+    try:
+        is_base64, data = uri.split(b',', 1)
+    except ValueError:
+        raise ValueError("invalid data URI")
+    if is_base64:
+        if is_base64 != b";base64":
+            raise ValueError("invalid data URI")
+        data = base64.b64decode(data)
+
+    return _ParseDataURIResult(media_type, media_type_params, data)
+
+
 __all__ = ["add_or_replace_parameter",
            "any_to_uri",
            "canonicalize_url",
            "file_uri_to_path",
            "is_url",
+           "parse_data_uri",
            "path_to_file_uri",
            "safe_download_url",
            "safe_url_string",

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-w3lib.git



More information about the Python-modules-commits mailing list