[Python-modules-commits] [python-w3lib] 01/04: Import python-w3lib_1.17.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Fri Feb 17 08:24:21 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master in repository python-w3lib.
commit 40a898e1e6c99ff7e5fcce1dd1228b4ce0ffdd02
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Fri Feb 17 08:55:09 2017 +0100
Import python-w3lib_1.17.0.orig.tar.gz
---
MANIFEST.in | 2 +-
PKG-INFO | 4 +-
setup.cfg | 1 -
setup.py | 4 +-
tests/__init__.pyc | Bin 136 -> 0 bytes
.../test_encoding.cpython-27-PYTEST.pyc | Bin 13289 -> 0 bytes
tests/__pycache__/test_form.cpython-27-PYTEST.pyc | Bin 2549 -> 0 bytes
tests/__pycache__/test_html.cpython-27-PYTEST.pyc | Bin 40896 -> 0 bytes
tests/__pycache__/test_http.cpython-27-PYTEST.pyc | Bin 3230 -> 0 bytes
tests/__pycache__/test_url.cpython-27-PYTEST.pyc | Bin 31802 -> 0 bytes
tests/test_http.py | 6 +-
tests/test_url.py | 92 ++++++++++++++++++-
w3lib.egg-info/PKG-INFO | 4 +-
w3lib.egg-info/SOURCES.txt | 6 --
w3lib/__init__.py | 2 +-
w3lib/html.py | 19 ++++
w3lib/http.py | 20 +++--
w3lib/url.py | 99 +++++++++++++++++++++
18 files changed, 238 insertions(+), 21 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
index 9b58344..fb4a5a7 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,3 @@
# Include tests into distribution
-recursive-include tests *
+recursive-include tests *.py *.txt
diff --git a/PKG-INFO b/PKG-INFO
index c17a298..98ac7ae 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: w3lib
-Version: 1.16.0
+Version: 1.17.0
Summary: Library of web-related functions
Home-page: https://github.com/scrapy/w3lib
Author: Scrapy project
@@ -17,6 +17,8 @@ Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/setup.cfg b/setup.cfg
index 6f08d0e..adf5ed7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,5 +4,4 @@ universal = 1
[egg_info]
tag_build =
tag_date = 0
-tag_svn_revision = 0
diff --git a/setup.py b/setup.py
index e356577..b9937dd 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
name='w3lib',
- version='1.16.0',
+ version='1.17.0',
license='BSD',
description='Library of web-related functions',
author='Scrapy project',
@@ -23,6 +23,8 @@ setup(
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy',
'Topic :: Internet :: WWW/HTTP',
diff --git a/tests/__init__.pyc b/tests/__init__.pyc
deleted file mode 100644
index ee8ba29..0000000
Binary files a/tests/__init__.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
deleted file mode 100644
index 3634a7a..0000000
Binary files a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc
deleted file mode 100644
index 5065298..0000000
Binary files a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc
deleted file mode 100644
index 085bcaa..0000000
Binary files a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc
deleted file mode 100644
index 380552b..0000000
Binary files a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc
deleted file mode 100644
index dfb6551..0000000
Binary files a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc and /dev/null differ
diff --git a/tests/test_http.py b/tests/test_http.py
index 6ce53ca..453624f 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -19,8 +19,10 @@ class HttpTests(unittest.TestCase):
self.assertIsNone(headers_dict_to_raw(None))
def test_headers_raw_to_dict(self):
- raw = b"Content-type: text/html\n\rAccept: gzip\n\n"
- dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
+ raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
+ Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
+ dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'],
+ b'Cache-Control': [b'no-cache', b'no-store']}
self.assertEqual(headers_raw_to_dict(raw), dct)
def test_headers_dict_to_raw(self):
diff --git a/tests/test_url.py b/tests/test_url.py
index 99ceb0f..9bb1ea4 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -4,8 +4,8 @@ import os
import unittest
from w3lib.url import (is_url, safe_url_string, safe_download_url,
url_query_parameter, add_or_replace_parameter, url_query_cleaner,
- file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
- canonicalize_url, parse_url)
+ file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
+ urljoin_rfc, canonicalize_url, parse_url)
from six.moves.urllib.parse import urlparse
@@ -574,6 +574,94 @@ class CanonicalizeUrlTest(unittest.TestCase):
label=u"example"*11))
+class DataURITests(unittest.TestCase):
+
+ def test_default_mediatype_charset(self):
+ result = parse_data_uri("data:,A%20brief%20note")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.media_type_parameters, {"charset": "US-ASCII"})
+ self.assertEqual(result.data, b"A brief note")
+
+ def test_text_uri(self):
+ result = parse_data_uri(u"data:,A%20brief%20note")
+ self.assertEqual(result.data, b"A brief note")
+
+ def test_bytes_uri(self):
+ result = parse_data_uri(b"data:,A%20brief%20note")
+ self.assertEqual(result.data, b"A brief note")
+
+ def test_unicode_uri(self):
+ result = parse_data_uri(u"data:,é")
+ self.assertEqual(result.data, u"é".encode('utf-8'))
+
+ def test_default_mediatype(self):
+ result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.media_type_parameters,
+ {"charset": "iso-8859-7"})
+ self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+ def test_text_charset(self):
+ result = parse_data_uri("data:text/plain;charset=iso-8859-7,%be%d3%be")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.media_type_parameters,
+ {"charset": "iso-8859-7"})
+ self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+ def test_mediatype_parameters(self):
+ result = parse_data_uri('data:text/plain;'
+ 'foo=%22foo;bar%5C%22%22;'
+ 'charset=utf-8;'
+ 'bar=%22foo;%5C%22foo%20;/%20,%22,'
+ '%CE%8E%CE%A3%CE%8E')
+
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.media_type_parameters,
+ {"charset": "utf-8",
+ "foo": 'foo;bar"',
+ "bar": 'foo;"foo ;/ ,'})
+ self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
+
+ def test_base64(self):
+ result = parse_data_uri("data:text/plain;base64,"
+ "SGVsbG8sIHdvcmxkLg%3D%3D")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.data, b"Hello, world.")
+
+ def test_base64_spaces(self):
+ result = parse_data_uri("data:text/plain;base64,SGVsb%20G8sIH%0A%20%20"
+ "dvcm%20%20%20xk%20Lg%3D%0A%3D")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.data, b"Hello, world.")
+
+ result = parse_data_uri("data:text/plain;base64,SGVsb G8sIH\n "
+ "dvcm xk Lg%3D\n%3D")
+ self.assertEqual(result.media_type, "text/plain")
+ self.assertEqual(result.data, b"Hello, world.")
+
+ def test_wrong_base64_param(self):
+ with self.assertRaises(ValueError):
+ parse_data_uri("data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D")
+
+ def test_missing_comma(self):
+ with self.assertRaises(ValueError):
+ parse_data_uri("data:A%20brief%20note")
+
+ def test_missing_scheme(self):
+ with self.assertRaises(ValueError):
+ parse_data_uri("text/plain,A%20brief%20note")
+
+ def test_wrong_scheme(self):
+ with self.assertRaises(ValueError):
+ parse_data_uri("http://example.com/")
+
+ def test_scheme_case_insensitive(self):
+ result = parse_data_uri("DATA:,A%20brief%20note")
+ self.assertEqual(result.data, b"A brief note")
+ result = parse_data_uri("DaTa:,A%20brief%20note")
+ self.assertEqual(result.data, b"A brief note")
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/w3lib.egg-info/PKG-INFO b/w3lib.egg-info/PKG-INFO
index c17a298..98ac7ae 100644
--- a/w3lib.egg-info/PKG-INFO
+++ b/w3lib.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: w3lib
-Version: 1.16.0
+Version: 1.17.0
Summary: Library of web-related functions
Home-page: https://github.com/scrapy/w3lib
Author: Scrapy project
@@ -17,6 +17,8 @@ Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
diff --git a/w3lib.egg-info/SOURCES.txt b/w3lib.egg-info/SOURCES.txt
index e224c60..afc648b 100644
--- a/w3lib.egg-info/SOURCES.txt
+++ b/w3lib.egg-info/SOURCES.txt
@@ -3,18 +3,12 @@ README.rst
setup.cfg
setup.py
tests/__init__.py
-tests/__init__.pyc
tests/py3-ignores.txt
tests/test_encoding.py
tests/test_form.py
tests/test_html.py
tests/test_http.py
tests/test_url.py
-tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
-tests/__pycache__/test_form.cpython-27-PYTEST.pyc
-tests/__pycache__/test_html.cpython-27-PYTEST.pyc
-tests/__pycache__/test_http.cpython-27-PYTEST.pyc
-tests/__pycache__/test_url.cpython-27-PYTEST.pyc
w3lib/__init__.py
w3lib/encoding.py
w3lib/form.py
diff --git a/w3lib/__init__.py b/w3lib/__init__.py
index e118a92..0a4c374 100644
--- a/w3lib/__init__.py
+++ b/w3lib/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "1.16.0"
+__version__ = "1.17.0"
version_info = tuple(int(v) if v.isdigit() else v
for v in __version__.split('.'))
diff --git a/w3lib/html.py b/w3lib/html.py
index 24d01a5..9990a35 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -17,6 +17,9 @@ _baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
+HTML5_WHITESPACE = ' \t\n\r\x0c'
+
+
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
r"""
@@ -317,3 +320,19 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
return interval, url
else:
return None, None
+
+
+def strip_html5_whitespace(text):
+ r"""
+ Strip all leading and trailing space characters (as defined in
+ https://www.w3.org/TR/html5/infrastructure.html#space-character).
+
+ Such stripping is useful e.g. for processing HTML element attributes which
+ contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
+ defines them as "valid URL potentially surrounded by spaces"
+ or "valid non-empty URL potentially surrounded by spaces".
+
+ >>> strip_html5_whitespace(' hello\n')
+ 'hello'
+ """
+ return text.strip(HTML5_WHITESPACE)
diff --git a/w3lib/http.py b/w3lib/http.py
index 8c5dfed..accfb5d 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -29,11 +29,21 @@ def headers_raw_to_dict(headers_raw):
return None
headers = headers_raw.splitlines()
headers_tuples = [header.split(b':', 1) for header in headers]
- return dict([
- (header_item[0].strip(), [header_item[1].strip()])
- for header_item in headers_tuples
- if len(header_item) == 2
- ])
+
+ result_dict = {}
+ for header_item in headers_tuples:
+ if not len(header_item) == 2:
+ continue
+
+ item_key = header_item[0].strip()
+ item_value = header_item[1].strip()
+
+ if item_key in result_dict:
+ result_dict[item_key].append(item_value)
+ else:
+ result_dict[item_key] = [item_value]
+
+ return result_dict
def headers_dict_to_raw(headers_dict):
diff --git a/w3lib/url.py b/w3lib/url.py
index 8d58c91..ef3189d 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -2,12 +2,14 @@
This module contains general purpose URL functions not found in the standard
library.
"""
+import base64
import codecs
import os
import re
import posixpath
import warnings
import six
+from collections import namedtuple
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
urldefrag, urlencode, urlparse,
quote, parse_qs, parse_qsl,
@@ -285,11 +287,108 @@ def any_to_uri(uri_or_path):
return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
+# ASCII characters.
+_char = set(map(chr, range(127)))
+
+# RFC 2045 token.
+_token = r'[{}]+'.format(re.escape(''.join(_char -
+ # Control characters.
+ set(map(chr, range(0, 32))) -
+ # tspecials and space.
+ set('()<>@,;:\\"/[]?= '))))
+
+# RFC 822 quoted-string, without surrounding quotation marks.
+_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
+ re.escape(''.join(_char - {'"', '\\', '\r'})),
+ re.escape(''.join(_char))
+)
+
+# Encode the regular expression strings to make them into bytes, as Python 3
+# bytes have no format() method, but bytes must be passed to re.compile() in
+# order to make a pattern object that can be used to match on bytes.
+
+# RFC 2397 mediatype.
+_mediatype_pattern = re.compile(
+ r'{token}/{token}'.format(token=_token).encode()
+)
+_mediatype_parameter_pattern = re.compile(
+ r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
+ quoted=_quoted_string
+ ).encode()
+)
+
+_ParseDataURIResult = namedtuple("ParseDataURIResult",
+ "media_type media_type_parameters data")
+
+def parse_data_uri(uri):
+ """
+
+ Parse a data: URI, returning a 3-tuple of media type, dictionary of media
+ type parameters, and data.
+
+ """
+
+ if not isinstance(uri, bytes):
+ uri = safe_url_string(uri).encode('ascii')
+
+ try:
+ scheme, uri = uri.split(b':', 1)
+ except ValueError:
+ raise ValueError("invalid URI")
+ if scheme.lower() != b'data':
+ raise ValueError("not a data URI")
+
+ # RFC 3986 section 2.1 allows percent encoding to escape characters that
+ # would be interpreted as delimiters, implying that actual delimiters
+ # should not be percent-encoded.
+ # Decoding before parsing will allow malformed URIs with percent-encoded
+ # delimiters, but it makes parsing easier and should not affect
+ # well-formed URIs, as the delimiters used in this URI scheme are not
+ # allowed, percent-encoded or not, in tokens.
+ if six.PY2:
+ uri = unquote(uri)
+ else:
+ uri = unquote_to_bytes(uri)
+
+ media_type = "text/plain"
+ media_type_params = {}
+
+ m = _mediatype_pattern.match(uri)
+ if m:
+ media_type = m.group().decode()
+ uri = uri[m.end():]
+ else:
+ media_type_params['charset'] = "US-ASCII"
+
+ while True:
+ m = _mediatype_parameter_pattern.match(uri)
+ if m:
+ attribute, value, value_quoted = m.groups()
+ if value_quoted:
+ value = re.sub(br'\\(.)', r'\1', value_quoted)
+ media_type_params[attribute.decode()] = value.decode()
+ uri = uri[m.end():]
+ else:
+ break
+
+ try:
+ is_base64, data = uri.split(b',', 1)
+ except ValueError:
+ raise ValueError("invalid data URI")
+ if is_base64:
+ if is_base64 != b";base64":
+ raise ValueError("invalid data URI")
+ data = base64.b64decode(data)
+
+ return _ParseDataURIResult(media_type, media_type_params, data)
+
+
__all__ = ["add_or_replace_parameter",
"any_to_uri",
"canonicalize_url",
"file_uri_to_path",
"is_url",
+ "parse_data_uri",
"path_to_file_uri",
"safe_download_url",
"safe_url_string",
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-w3lib.git
More information about the Python-modules-commits mailing list