[Python-modules-commits] [python-w3lib] 01/13: Import python-w3lib_1.16.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Tue Jan 24 20:34:10 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master
in repository python-w3lib.
commit 93569ce6f5c44a1d85da06f72c12e14443b4cc61
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Tue Jan 24 16:34:44 2017 +0100
Import python-w3lib_1.16.0.orig.tar.gz
---
MANIFEST.in | 3 +
PKG-INFO | 2 +-
README.rst | 5 +
setup.py | 4 +-
{w3lib => tests}/__init__.py | 0
tests/__init__.pyc | Bin 0 -> 136 bytes
.../test_encoding.cpython-27-PYTEST.pyc | Bin 0 -> 13289 bytes
tests/__pycache__/test_form.cpython-27-PYTEST.pyc | Bin 0 -> 2549 bytes
tests/__pycache__/test_html.cpython-27-PYTEST.pyc | Bin 0 -> 40896 bytes
tests/__pycache__/test_http.cpython-27-PYTEST.pyc | Bin 0 -> 3230 bytes
tests/__pycache__/test_url.cpython-27-PYTEST.pyc | Bin 0 -> 31802 bytes
tests/py3-ignores.txt | 5 +
tests/test_encoding.py | 253 +++++++++
tests/test_form.py | 67 +++
tests/test_html.py | 444 ++++++++++++++++
tests/test_http.py | 89 ++++
tests/test_url.py | 579 +++++++++++++++++++++
w3lib.egg-info/PKG-INFO | 2 +-
w3lib.egg-info/SOURCES.txt | 15 +
w3lib.egg-info/not-zip-safe | 1 +
w3lib.egg-info/requires.txt | 2 +-
w3lib/__init__.py | 3 +
w3lib/encoding.py | 17 +-
w3lib/html.py | 85 +--
w3lib/url.py | 319 ++++++++++--
w3lib/util.py | 32 ++
26 files changed, 1848 insertions(+), 79 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9b58344
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+# Include tests into distribution
+recursive-include tests *
+
diff --git a/PKG-INFO b/PKG-INFO
index de7f487..c17a298 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: w3lib
-Version: 1.11.0
+Version: 1.16.0
Summary: Library of web-related functions
Home-page: https://github.com/scrapy/w3lib
Author: Scrapy project
diff --git a/README.rst b/README.rst
index 4a51b6d..15db969 100644
--- a/README.rst
+++ b/README.rst
@@ -5,6 +5,11 @@ w3lib
.. image:: https://secure.travis-ci.org/scrapy/w3lib.png?branch=master
:target: http://travis-ci.org/scrapy/w3lib
+.. image:: https://img.shields.io/codecov/c/github/scrapy/w3lib/master.svg
+ :target: http://codecov.io/github/scrapy/w3lib?branch=master
+ :alt: Coverage report
+
+
Overview
========
diff --git a/setup.py b/setup.py
index 52656e0..e356577 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
name='w3lib',
- version='1.11.0',
+ version='1.16.0',
license='BSD',
description='Library of web-related functions',
author='Scrapy project',
@@ -11,7 +11,7 @@ setup(
url='https://github.com/scrapy/w3lib',
packages=find_packages(exclude=('tests', 'tests.*')),
include_package_data=True,
- zip_zafe=False,
+ zip_safe=False,
platforms=['Any'],
classifiers=[
'Development Status :: 5 - Production/Stable',
diff --git a/w3lib/__init__.py b/tests/__init__.py
similarity index 100%
copy from w3lib/__init__.py
copy to tests/__init__.py
diff --git a/tests/__init__.pyc b/tests/__init__.pyc
new file mode 100644
index 0000000..ee8ba29
Binary files /dev/null and b/tests/__init__.pyc differ
diff --git a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..3634a7a
Binary files /dev/null and b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..5065298
Binary files /dev/null and b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..085bcaa
Binary files /dev/null and b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..380552b
Binary files /dev/null and b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..dfb6551
Binary files /dev/null and b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc differ
diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
new file mode 100644
index 0000000..09f34ec
--- /dev/null
+++ b/tests/py3-ignores.txt
@@ -0,0 +1,5 @@
+w3lib/encoding.py
+w3lib/form.py
+w3lib/html.py
+w3lib/http.py
+w3lib/url.py
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
new file mode 100644
index 0000000..df2e5ce
--- /dev/null
+++ b/tests/test_encoding.py
@@ -0,0 +1,253 @@
+import unittest, codecs
+import six
+from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
+ http_content_type_encoding, resolve_encoding, html_to_unicode)
+
+class RequestEncodingTests(unittest.TestCase):
+ utf8_fragments = [
+ # Content-Type as meta http-equiv
+ b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
+ b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
+ b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
+ b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
+ b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
+ b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
+ b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
+ # html5 meta charset
+ b"""<meta charset="utf-8">""",
+ b"""<meta charset =\n"utf-8">""",
+ # xml encoding
+ b"""<?xml version="1.0" encoding="utf-8"?>""",
+ ]
+
+ def test_bom(self):
+ # cjk water character in unicode
+ water_unicode = u'\u6C34'
+ # BOM + water character encoded
+ utf16be = b'\xfe\xff\x6c\x34'
+ utf16le = b'\xff\xfe\x34\x6c'
+ utf32be = b'\x00\x00\xfe\xff\x00\x00\x6c\x34'
+ utf32le = b'\xff\xfe\x00\x00\x34\x6c\x00\x00'
+ for string in (utf16be, utf16le, utf32be, utf32le):
+ bom_encoding, bom = read_bom(string)
+ decoded = string[len(bom):].decode(bom_encoding)
+ self.assertEqual(water_unicode, decoded)
+ # Body without BOM
+ enc, bom = read_bom("foo")
+ self.assertEqual(enc, None)
+ self.assertEqual(bom, None)
+ # Empty body
+ enc, bom = read_bom("")
+ self.assertEqual(enc, None)
+ self.assertEqual(bom, None)
+
+ def test_http_encoding_header(self):
+ header_value = "Content-Type: text/html; charset=ISO-8859-4"
+ extracted = http_content_type_encoding(header_value)
+ self.assertEqual(extracted, "iso8859-4")
+ self.assertEqual(None, http_content_type_encoding("something else"))
+
+ def test_html_body_declared_encoding(self):
+ for fragment in self.utf8_fragments:
+ encoding = html_body_declared_encoding(fragment)
+ self.assertEqual(encoding, 'utf-8', fragment)
+ self.assertEqual(None, html_body_declared_encoding(b"something else"))
+ self.assertEqual(None, html_body_declared_encoding(b"""
+ <head></head><body>
+ this isn't searched
+ <meta charset="utf-8">
+ """))
+ self.assertEqual(None, html_body_declared_encoding(
+ b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
+
+ def test_html_body_declared_encoding_unicode(self):
+ # html_body_declared_encoding should work when unicode body is passed
+ self.assertEqual(None, html_body_declared_encoding(u"something else"))
+
+ for fragment in self.utf8_fragments:
+ encoding = html_body_declared_encoding(fragment.decode('utf8'))
+ self.assertEqual(encoding, 'utf-8', fragment)
+
+ self.assertEqual(None, html_body_declared_encoding(u"""
+ <head></head><body>
+ this isn't searched
+ <meta charset="utf-8">
+ """))
+ self.assertEqual(None, html_body_declared_encoding(
+ u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
+
+
+class CodecsEncodingTestCase(unittest.TestCase):
+ def test_resolve_encoding(self):
+ self.assertEqual(resolve_encoding('latin1'), 'cp1252')
+ self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
+ self.assertEqual(resolve_encoding('gb_2312-80'), 'gb18030')
+ self.assertEqual(resolve_encoding('unknown encoding'), None)
+
+
+class UnicodeDecodingTestCase(unittest.TestCase):
+
+ def test_utf8(self):
+ self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), u'\xa3')
+
+ def test_invalid_utf8(self):
+ self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), u'\ufffd\xa3')
+
+
+def ct(charset):
+ return "Content-Type: text/html; charset=" + charset if charset else None
+
+def norm_encoding(enc):
+ return codecs.lookup(enc).name
+
+class HtmlConversionTests(unittest.TestCase):
+
+ def test_unicode_body(self):
+ unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+ original_string = unicode_string.encode('cp1251')
+ encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
+ # check body_as_unicode
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(body_unicode, unicode_string)
+
+ def _assert_encoding(self, content_type, body, expected_encoding,
+ expected_unicode):
+ assert not isinstance(body, six.text_type)
+ encoding, body_unicode = html_to_unicode(ct(content_type), body)
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(norm_encoding(encoding),
+ norm_encoding(expected_encoding))
+
+ if isinstance(expected_unicode, six.string_types):
+ self.assertEqual(body_unicode, expected_unicode)
+ else:
+ self.assertTrue(
+ body_unicode in expected_unicode,
+ "%s is not in %s" % (body_unicode, expected_unicode)
+ )
+
+ def test_content_type_and_conversion(self):
+ """Test content type header is interpreted and text converted as
+ expected
+ """
+ self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', u"\xa3")
+ # something like this in the scrapy tests - but that's invalid?
+ # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+ # iso-8859-1 is overridden to cp1252
+ self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', u"\xa3")
+ self._assert_encoding('', b"\xc2\xa3", 'utf-8', u"\xa3")
+ self._assert_encoding('none', b"\xc2\xa3", 'utf-8', u"\xa3")
+ self._assert_encoding('gb2312', b"\xa8D", 'gb18030', u"\u2015")
+ self._assert_encoding('gbk', b"\xa8D", 'gb18030', u"\u2015")
+ self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', u"\u6052")
+
+ def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
+ # unlike scrapy, the BOM is stripped
+ self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xabWORD2",
+ 'utf-8', u'WORD\ufffd\ufffdWORD2')
+ self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xabWORD2",
+ 'utf-8', u'WORD\ufffd\ufffdWORD2')
+
+ def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
+ # Python implementations handle unexpected end of UTF8 data
+ # differently (see https://bugs.pypy.org/issue1536).
+ # It is hard to fix this for PyPy in w3lib, so the test
+ # is permissive.
+
+ # unlike scrapy, the BOM is stripped
+ self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xab",
+ 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+ self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xab",
+ 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+
+ def test_replace_wrong_encoding(self):
+ """Test invalid chars are replaced properly"""
+ encoding, body_unicode = html_to_unicode(ct('utf-8'),
+ b'PREFIX\xe3\xabSUFFIX')
+ # XXX: Policy for replacing invalid chars may suffer minor variations
+ # but it should always contain the unicode replacement char (u'\ufffd')
+ assert u'\ufffd' in body_unicode, repr(body_unicode)
+ assert u'PREFIX' in body_unicode, repr(body_unicode)
+ assert u'SUFFIX' in body_unicode, repr(body_unicode)
+
+ # Do not destroy html tags due to encoding bugs
+ encoding, body_unicode = html_to_unicode(ct('utf-8'),
+ b'\xf0<span>value</span>')
+ assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+
+ def _assert_encoding_detected(self, content_type, expected_encoding, body,
+ **kwargs):
+ assert not isinstance(body, six.text_type)
+ encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
+
+ def test_BOM(self):
+ # utf-16 cases already tested, as is the BOM detection function
+
+ # http header takes precedence, irrespective of BOM
+ bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+ expected = u'\ufffd\ufffd\x00h\x00i'
+ self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
+
+ # BOM is stripped when it agrees with the encoding, or used to
+ # determine encoding
+ bom_utf8_str = codecs.BOM_UTF8 + b'hi'
+ self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
+ self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+
+ def test_utf16_32(self):
+ # tools.ietf.org/html/rfc2781 section 4.3
+
+ # USE BOM and strip it
+ bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+ self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
+ self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+
+ bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
+ self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
+ self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+
+ bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
+ self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
+ self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+
+ bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
+ self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
+ self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+
+ # if there is no BOM, big endian should be chosen
+ self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+ self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+
+ def test_html_encoding(self):
+ # extracting the encoding from raw html is tested elsewhere
+ body = b"""blah blah < meta http-equiv="Content-Type"
+ content="text/html; charset=iso-8859-1"> other stuff"""
+ self._assert_encoding_detected(None, 'cp1252', body)
+
+ # header encoding takes precedence
+ self._assert_encoding_detected('utf-8', 'utf-8', body)
+ # BOM encoding takes precedence
+ self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)
+
+ def test_autodetect(self):
+ asciif = lambda x: 'ascii'
+ body = b"""<meta charset="utf-8">"""
+ # body encoding takes precedence
+ self._assert_encoding_detected(None, 'utf-8', body,
+ auto_detect_fun=asciif)
+ # if no other encoding, the auto detect encoding is used.
+ self._assert_encoding_detected(None, 'ascii', b"no encoding info",
+ auto_detect_fun=asciif)
+
+ def test_default_encoding(self):
+ # if no other method available, the default encoding of utf-8 is used
+ self._assert_encoding_detected(None, 'utf-8', b"no encoding info")
+ # this can be overridden
+ self._assert_encoding_detected(None, 'ascii', b"no encoding info",
+ default_encoding='ascii')
+
+ def test_empty_body(self):
+ # if no other method available, the default encoding of utf-8 is used
+ self._assert_encoding_detected(None, 'utf-8', b"")
diff --git a/tests/test_form.py b/tests/test_form.py
new file mode 100644
index 0000000..280d879
--- /dev/null
+++ b/tests/test_form.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import warnings
+import unittest
+from collections import OrderedDict
+from w3lib.form import encode_multipart
+
+
+class EncodeMultipartTest(unittest.TestCase):
+
+ def test_encode_multipart(self):
+ data = {'key': 'value'}
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ expected_body = (
+ '\r\n--{boundary}'
+ '\r\nContent-Disposition: form-data; name="key"\r\n'
+ '\r\nvalue'
+ '\r\n--{boundary}--'
+ '\r\n'.format(boundary=boundary).encode('utf8')
+ )
+ self.assertEqual(body, expected_body)
+
+ def test_encode_multipart_unicode(self):
+ data = OrderedDict([
+ (u'ключ1', u'значение1'.encode('utf8')),
+ (u'ключ2', u'значение2'),
+ ])
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ expected_body = (
+ u'\r\n--{boundary}'
+ u'\r\nContent-Disposition: form-data; name="ключ1"\r\n'
+ u'\r\nзначение1'
+ u'\r\n--{boundary}'
+ u'\r\nContent-Disposition: form-data; name="ключ2"\r\n'
+ u'\r\nзначение2'
+ u'\r\n--{boundary}--'
+ u'\r\n'.format(boundary=boundary).encode('utf8')
+ )
+ self.assertEqual(body, expected_body)
+
+ def test_encode_multipart_file(self):
+ # this data is not decodable using utf8
+ data = {'key': ('file/name', b'\xa1\xa2\xa3\xa4\r\n\r')}
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ body_lines = [
+ b'\r\n--' + boundary.encode('ascii'),
+ b'\r\nContent-Disposition: form-data; name="key"; filename="file/name"\r\n',
+ b'\r\n\xa1\xa2\xa3\xa4\r\n\r',
+ b'\r\n--' + boundary.encode('ascii') + b'--\r\n',
+ ]
+ expected_body = b''.join(body_lines)
+ self.assertEqual(body, expected_body)
+
+ #def test_encode_multipart_int(self):
+ # data = {'key': 123}
+ # body, boundary = encode_multipart2(data)
+ # expected_body = (
+ # '\n--{boundary}'
+ # '\nContent-Disposition: form-data; name="key"\n'
+ # '\n123'
+ # '\n--{boundary}--'
+ # '\n'.format(boundary=boundary)
+ # )
+ # self.assertEqual(body, expected_body)
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..68133cb
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,444 @@
+# -*- coding: utf-8 -*-
+import unittest
+import six
+from w3lib.html import (replace_entities, replace_tags, remove_comments,
+ remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup,
+ get_base_url, get_meta_refresh)
+
+
+class RemoveEntitiesTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return uncode
+ assert isinstance(replace_entities(b'no entities'), six.text_type)
+ assert isinstance(replace_entities(b'Price: £100!'), six.text_type)
+ assert isinstance(replace_entities(u'no entities'), six.text_type)
+ assert isinstance(replace_entities(u'Price: £100!'), six.text_type)
+
+ def test_regular(self):
+ # regular conversions
+ self.assertEqual(replace_entities(u'As low as £100!'),
+ u'As low as \xa3100!')
+ self.assertEqual(replace_entities(b'As low as £100!'),
+ u'As low as \xa3100!')
+ self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'),
+ u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
+
+ def test_keep_entities(self):
+ # keep some entities
+ self.assertEqual(replace_entities(b'<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']),
+ u'<b>Low < High & Medium \xa3 six</b>')
+ self.assertEqual(replace_entities(u'<b>Low < High & Medium £ six</b>', keep=[u'lt', u'amp']),
+ u'<b>Low < High & Medium \xa3 six</b>')
+
+ def test_illegal_entities(self):
+ self.assertEqual(replace_entities('a < b &illegal; c six', remove_illegal=False),
+ u'a < b &illegal; c six')
+ self.assertEqual(replace_entities('a < b &illegal; c six', remove_illegal=True),
+ u'a < b c six')
+ self.assertEqual(replace_entities('x≤y'), u'x\u2264y')
+ self.assertEqual(replace_entities('xy'), u'xy')
+ self.assertEqual(replace_entities('xy', remove_illegal=False), u'xy')
+
+ def test_browser_hack(self):
+ # check browser hack for numeric character references in the 80-9F range
+ self.assertEqual(replace_entities('xy', encoding='cp1252'), u'x\u2122y')
+ self.assertEqual(replace_entities('x™y', encoding='cp1252'), u'x\u2122y')
+
+ def test_missing_semicolon(self):
+ for entity, result in (
+ ('<<!', '<<!',),
+ ('<!', '<!',),
+ ('A ', 'A ',),
+ ('A!', 'A!',),
+ ('Ah', 'Ah',),
+ ('A!', 'A!',),
+ ('Ax', 'Ax',),
+ ('³!', u'\u00B3!',),
+ ('Á!', u'\u00C1!',),
+ ('☃!', u'\u2603!',),
+ ('™', u'\u2122',),
+ ('™', u'\u2122',),
+ ):
+ self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
+ self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), u'x%sy' % result)
+
+
+ def test_encoding(self):
+ self.assertEqual(replace_entities(b'x\x99™y', encoding='cp1252'), \
+ u'x\u2122\u2122\u2122y')
+
+
+class ReplaceTagsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return uncode
+ assert isinstance(replace_tags(b'no entities'), six.text_type)
+ assert isinstance(replace_tags('no entities'), six.text_type)
+
+ def test_replace_tags(self):
+ self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
+ u'This text contains some tag')
+ self.assertEqual(replace_tags(b'This text is very im<b>port</b>ant', ' '),
+ u'This text is very im port ant')
+
+ def test_replace_tags_multiline(self):
+ self.assertEqual(replace_tags(b'Click <a class="one"\r\n href="url">here</a>'),
+ u'Click here')
+
+
+class RemoveCommentsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_comments(b'without comments'), six.text_type)
+ assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
+ assert isinstance(remove_comments(u'without comments'), six.text_type)
+ assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
+
+ def test_no_comments(self):
+ # text without comments
+ self.assertEqual(remove_comments(u'text without comments'), u'text without comments')
+
+ def test_remove_comments(self):
+ # text with comments
+ self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
+ self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
+ self.assertEqual(remove_comments(u'Hello<!--My\nWorld-->'), u'Hello')
+
+ self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), u'test whatever')
+ self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), u'test whatever')
+
+
+class RemoveTagsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_tags(b'no tags'), six.text_type)
+ assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
+ assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
+ assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags(u'no tags'), six.text_type)
+ assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
+ assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
+ assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
+
+ def test_remove_tags_without_tags(self):
+ # text without tags
+ self.assertEqual(remove_tags(u'no tags'), u'no tags')
+ self.assertEqual(remove_tags(u'no tags', which_ones=('p', 'b',)), u'no tags')
+
+ def test_remove_tags(self):
+ # text with tags
+ self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
+ self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')
+
+ self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
+ u'<b>not will removed</b>i will removed')
+
+ def test_remove_tags_with_attributes(self):
+ # text with tags and attributes
+ self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>'), u'texty')
+ self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>', which_ones=('b',)),
+ u'<p align="center" class="one">texty</p>')
+
+ def test_remove_empty_tags(self):
+ # text with empty tags
+ self.assertEqual(remove_tags(u'a<br />b<br/>c'), u'abc')
+ self.assertEqual(remove_tags(u'a<br />b<br/>c', which_ones=('br',)), u'abc')
+
+ def test_keep_argument(self):
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('br',)), u'a<br />b<br/>c')
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p',)), u'<p>abc</p>')
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p', 'br', 'div')), u'<p>a<br />b<br/>c</p>')
+
+ def test_uppercase_tags(self):
+ self.assertEqual(remove_tags(u'<foo></foo><bar></bar><baz/>', which_ones=('Foo', 'BAR', 'baZ')), u'')
+ self.assertEqual(remove_tags(u'<FOO></foO><BaR></bAr><BAZ/>', which_ones=('foo', 'bar', 'baz')), u'')
+
+
+class RemoveTagsWithContentTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
+ assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
+ assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)
+
+ def test_without_tags(self):
+ # text without tags
+ self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
+ self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')
+
+ def test_with_tags(self):
+ # text with tags
+ self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
+ self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')
+
+ self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
+ u'<b>not will removed</b>')
+
+ def test_empty_tags(self):
+ # text with empty tags
+ self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')
+
+
+class ReplaceEscapeCharsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
+
+ def test_without_escape_chars(self):
+ # text without escape chars
+ self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
+ self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')
+
+ def test_with_escape_chars(self):
+ # text with escape chars
+ self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
+ self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=b'\xc2\xa3'), u'escape\xa3chars\xa3')
+
+
+class UnquoteMarkupTest(unittest.TestCase):
+
+ sample_txt1 = u"""<node1>hi, this is sample text with entities: & ©
+<![CDATA[although this is inside a cdata! & "]]></node1>"""
+ sample_txt2 = u'<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>'
+ sample_txt3 = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'
+
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type)
+ assert isinstance(unquote_markup(self.sample_txt2), six.text_type)
+
+ def test_unquote_markup(self):
+ self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
+although this is inside a cdata! & "</node1>""")
+
+ self.assertEqual(unquote_markup(self.sample_txt2), u'<node2>blah&blahblahblahblah!£moreblah<></node2>')
+
+ self.assertEqual(unquote_markup(self.sample_txt1 + self.sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
+although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""")
+
+ self.assertEqual(unquote_markup(self.sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
+
+
+class GetBaseUrlTest(unittest.TestCase):
+
+ def test_get_base_url(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
+ self.assertEqual(get_base_url(text, baseurl.encode('ascii')), 'http://example.org/something')
+
+
+ def test_relative_url_with_absolute_path(self):
+ baseurl = 'https://example.org'
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base href='/absolutepath' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://example.org/absolutepath')
+
+ def test_no_scheme_url(self):
+ baseurl = 'https://example.org'
+ text = b"""\
+ <html>\
+ <head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://noscheme.com/path')
+
+ def test_attributes_before_href(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
+
+ def test_tag_name(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><basefoo href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://example.org')
+
+ def test_get_base_url_utf8(self):
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl),
+ 'http://example.org/snowman%E2%8D%A8')
+
+ def test_get_base_url_latin1(self):
+ # page encoding does not affect URL path encoding before percent-escaping
+ # we should still use UTF-8 by default
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
+ 'http://example.org/sterling%C2%A3')
+
+ def test_get_base_url_latin1_percent(self):
+ # non-UTF-8 percent-encoded characters sequence are left untouched
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl),
+ 'http://example.org/sterling%a3')
+
+
class GetMetaRefreshTest(unittest.TestCase):
    """Tests for get_meta_refresh(): extracting the (interval, url) pair
    from a <meta http-equiv="refresh"> tag."""

    def test_get_meta_refresh(self):
        """A plain refresh tag yields its interval and absolute URL."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        <body>blahablsdfsal&</body>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (5, 'http://example.org/newpage'))

    def test_without_url(self):
        """A refresh tag with no url= part returns (None, None)."""
        base_url = 'http://example.org'
        markup = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

        # But a url spread over two lines within the attribute is still found.
        markup = """<meta http-equiv="refresh" content="5;
        url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (5, 'http://example.org/newpage'))

    def test_multiline(self):
        """A refresh tag split across multiple lines is still parsed."""
        base_url = 'http://example.org'
        markup = """<html><head>
<META
HTTP-EQUIV="Refresh"
CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (1, 'http://example.org/newpage'))

    def test_entities_in_redirect_url(self):
        """HTML entities (&#39; quotes) around the redirect url are stripped."""
        base_url = 'http://example.org'
        markup = """<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://www.example.com/other'))

    def test_relative_redirects(self):
        """A relative redirect url is resolved against the page url."""
        base_url = 'http://example.com/page/this.html'
        markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/page/other.html'))

    def test_nonascii_url_utf8(self):
        """Non-ASCII bytes in the url (UTF-8, the default) are percent-escaped."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/to%C2%A3'))

    def test_nonascii_url_latin1(self):
        """Non-ASCII path bytes in a latin-1 page end up UTF-8 percent-escaped."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(markup, base_url, 'latin1'), (3, 'http://example.com/to%C2%A3'))

    def test_nonascii_url_latin1_query(self):
        """For a latin-1 page, only the query part keeps its latin-1 encoding
        before percent-escaping; the path is re-encoded as UTF-8."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
        self.assertEqual(get_meta_refresh(markup, base_url, 'latin1'), (3, 'http://example.com/to%C2%A3?unit=%B5'))

    def test_commented_meta_refresh(self):
        """A refresh tag inside an HTML comment must not trigger a redirect."""
        base_url = 'http://example.com'
        markup = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

    def test_html_comments_with_uncommented_meta_refresh(self):
        """An HTML comment elsewhere must not mask an uncommented refresh tag."""
        base_url = 'http://example.com'
        markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/'))

    def test_float_refresh_intervals(self):
        """Fractional refresh intervals parse as floats."""
        base_url = 'http://example.com'
        markup = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (0.1, 'http://example.com/index.html'))

        markup = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3.1, 'http://example.com/index.html'))

    def test_tag_name(self):
        """A tag merely starting with "meta" (e.g. <metafoo>) must be ignored."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        <body>blahablsdfsal&</body>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

    def test_leading_newline_in_url(self):
        """Whitespace (including a leading newline) before the url is stripped."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (0.0, 'http://www.example.org/index.php'))

    def test_inside_noscript(self):
        """Refresh tags inside <noscript> are skipped by default but honoured
        when ignore_tags is emptied."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))
        self.assertEqual(get_meta_refresh(markup, base_url, ignore_tags=()), (0.0, "http://example.org/javascript_required"))

    def test_inside_script(self):
        """Refresh markup embedded in <script> text is skipped by default but
        honoured when ignore_tags is emptied."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))
        self.assertEqual(get_meta_refresh(markup, base_url, ignore_tags=()), (0.0, "http://example.org/foobar_required"))
diff --git a/tests/test_http.py b/tests/test_http.py
new file mode 100644
index 0000000..6ce53ca
--- /dev/null
+++ b/tests/test_http.py
@@ -0,0 +1,89 @@
+import unittest
+from collections import OrderedDict
+from w3lib.http import (basic_auth_header,
+ headers_dict_to_raw, headers_raw_to_dict)
+
+__doctests__ = ['w3lib.http'] # for trial support
+
+class HttpTests(unittest.TestCase):
+
+ def test_basic_auth_header(self):
+ self.assertEqual(b'Basic c29tZXVzZXI6c29tZXBhc3M=',
+ basic_auth_header('someuser', 'somepass'))
+ # Check url unsafe encoded header
+ self.assertEqual(b'Basic c29tZXVzZXI6QDx5dTk-Jm8_UQ==',
+ basic_auth_header('someuser', '@<yu9>&o?Q'))
+
+ def test_headers_raw_dict_none(self):
+ self.assertIsNone(headers_raw_to_dict(None))
+ self.assertIsNone(headers_dict_to_raw(None))
+
+ def test_headers_raw_to_dict(self):
+ raw = b"Content-type: text/html\n\rAccept: gzip\n\n"
+ dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
+ self.assertEqual(headers_raw_to_dict(raw), dct)
+
+ def test_headers_dict_to_raw(self):
+ dct = OrderedDict([
+ (b'Content-type', b'text/html'),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ def test_headers_dict_to_raw_listtuple(self):
+ dct = OrderedDict([
+ (b'Content-type', [b'text/html']),
+ (b'Accept', [b'gzip'])
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Content-type', (b'text/html',)),
+ (b'Accept', (b'gzip',))
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Cookie', (b'val001', b'val002')),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Cookie', [b'val001', b'val002']),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
+ )
+
+ def test_headers_dict_to_raw_wrong_values(self):
+ dct = OrderedDict([
+ (b'Content-type', 0),
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b''
+ )
... 1373 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-w3lib.git
More information about the Python-modules-commits
mailing list