[Python-modules-commits] [python-w3lib] 01/13: Import python-w3lib_1.16.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Tue Jan 24 20:34:10 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master
in repository python-w3lib.
commit 93569ce6f5c44a1d85da06f72c12e14443b4cc61
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Tue Jan 24 16:34:44 2017 +0100
Import python-w3lib_1.16.0.orig.tar.gz
---
MANIFEST.in | 3 +
PKG-INFO | 2 +-
README.rst | 5 +
setup.py | 4 +-
{w3lib => tests}/__init__.py | 0
tests/__init__.pyc | Bin 0 -> 136 bytes
.../test_encoding.cpython-27-PYTEST.pyc | Bin 0 -> 13289 bytes
tests/__pycache__/test_form.cpython-27-PYTEST.pyc | Bin 0 -> 2549 bytes
tests/__pycache__/test_html.cpython-27-PYTEST.pyc | Bin 0 -> 40896 bytes
tests/__pycache__/test_http.cpython-27-PYTEST.pyc | Bin 0 -> 3230 bytes
tests/__pycache__/test_url.cpython-27-PYTEST.pyc | Bin 0 -> 31802 bytes
tests/py3-ignores.txt | 5 +
tests/test_encoding.py | 253 +++++++++
tests/test_form.py | 67 +++
tests/test_html.py | 444 ++++++++++++++++
tests/test_http.py | 89 ++++
tests/test_url.py | 579 +++++++++++++++++++++
w3lib.egg-info/PKG-INFO | 2 +-
w3lib.egg-info/SOURCES.txt | 15 +
w3lib.egg-info/not-zip-safe | 1 +
w3lib.egg-info/requires.txt | 2 +-
w3lib/__init__.py | 3 +
w3lib/encoding.py | 17 +-
w3lib/html.py | 85 +--
w3lib/url.py | 319 ++++++++++--
w3lib/util.py | 32 ++
26 files changed, 1848 insertions(+), 79 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9b58344
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+# Include tests into distribution
+recursive-include tests *
+
diff --git a/PKG-INFO b/PKG-INFO
index de7f487..c17a298 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: w3lib
-Version: 1.11.0
+Version: 1.16.0
Summary: Library of web-related functions
Home-page: https://github.com/scrapy/w3lib
Author: Scrapy project
diff --git a/README.rst b/README.rst
index 4a51b6d..15db969 100644
--- a/README.rst
+++ b/README.rst
@@ -5,6 +5,11 @@ w3lib
.. image:: https://secure.travis-ci.org/scrapy/w3lib.png?branch=master
:target: http://travis-ci.org/scrapy/w3lib
+.. image:: https://img.shields.io/codecov/c/github/scrapy/w3lib/master.svg
+ :target: http://codecov.io/github/scrapy/w3lib?branch=master
+ :alt: Coverage report
+
+
Overview
========
diff --git a/setup.py b/setup.py
index 52656e0..e356577 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
name='w3lib',
- version='1.11.0',
+ version='1.16.0',
license='BSD',
description='Library of web-related functions',
author='Scrapy project',
@@ -11,7 +11,7 @@ setup(
url='https://github.com/scrapy/w3lib',
packages=find_packages(exclude=('tests', 'tests.*')),
include_package_data=True,
- zip_zafe=False,
+ zip_safe=False,
platforms=['Any'],
classifiers=[
'Development Status :: 5 - Production/Stable',
diff --git a/w3lib/__init__.py b/tests/__init__.py
similarity index 100%
copy from w3lib/__init__.py
copy to tests/__init__.py
diff --git a/tests/__init__.pyc b/tests/__init__.pyc
new file mode 100644
index 0000000..ee8ba29
Binary files /dev/null and b/tests/__init__.pyc differ
diff --git a/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..3634a7a
Binary files /dev/null and b/tests/__pycache__/test_encoding.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_form.cpython-27-PYTEST.pyc b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..5065298
Binary files /dev/null and b/tests/__pycache__/test_form.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_html.cpython-27-PYTEST.pyc b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..085bcaa
Binary files /dev/null and b/tests/__pycache__/test_html.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_http.cpython-27-PYTEST.pyc b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..380552b
Binary files /dev/null and b/tests/__pycache__/test_http.cpython-27-PYTEST.pyc differ
diff --git a/tests/__pycache__/test_url.cpython-27-PYTEST.pyc b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc
new file mode 100644
index 0000000..dfb6551
Binary files /dev/null and b/tests/__pycache__/test_url.cpython-27-PYTEST.pyc differ
diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
new file mode 100644
index 0000000..09f34ec
--- /dev/null
+++ b/tests/py3-ignores.txt
@@ -0,0 +1,5 @@
+w3lib/encoding.py
+w3lib/form.py
+w3lib/html.py
+w3lib/http.py
+w3lib/url.py
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
new file mode 100644
index 0000000..df2e5ce
--- /dev/null
+++ b/tests/test_encoding.py
@@ -0,0 +1,253 @@
+import unittest, codecs
+import six
+from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
+ http_content_type_encoding, resolve_encoding, html_to_unicode)
+
+class RequestEncodingTests(unittest.TestCase):
+ utf8_fragments = [
+ # Content-Type as meta http-equiv
+ b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
+ b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
+ b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
+ b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
+ b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
+ b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
+ b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
+ # html5 meta charset
+ b"""<meta charset="utf-8">""",
+ b"""<meta charset =\n"utf-8">""",
+ # xml encoding
+ b"""<?xml version="1.0" encoding="utf-8"?>""",
+ ]
+
+ def test_bom(self):
+ # cjk water character in unicode
+ water_unicode = u'\u6C34'
+ # BOM + water character encoded
+ utf16be = b'\xfe\xff\x6c\x34'
+ utf16le = b'\xff\xfe\x34\x6c'
+ utf32be = b'\x00\x00\xfe\xff\x00\x00\x6c\x34'
+ utf32le = b'\xff\xfe\x00\x00\x34\x6c\x00\x00'
+ for string in (utf16be, utf16le, utf32be, utf32le):
+ bom_encoding, bom = read_bom(string)
+ decoded = string[len(bom):].decode(bom_encoding)
+ self.assertEqual(water_unicode, decoded)
+ # Body without BOM
+ enc, bom = read_bom("foo")
+ self.assertEqual(enc, None)
+ self.assertEqual(bom, None)
+ # Empty body
+ enc, bom = read_bom("")
+ self.assertEqual(enc, None)
+ self.assertEqual(bom, None)
+
+ def test_http_encoding_header(self):
+ header_value = "Content-Type: text/html; charset=ISO-8859-4"
+ extracted = http_content_type_encoding(header_value)
+ self.assertEqual(extracted, "iso8859-4")
+ self.assertEqual(None, http_content_type_encoding("something else"))
+
+ def test_html_body_declared_encoding(self):
+ for fragment in self.utf8_fragments:
+ encoding = html_body_declared_encoding(fragment)
+ self.assertEqual(encoding, 'utf-8', fragment)
+ self.assertEqual(None, html_body_declared_encoding(b"something else"))
+ self.assertEqual(None, html_body_declared_encoding(b"""
+ <head></head><body>
+ this isn't searched
+ <meta charset="utf-8">
+ """))
+ self.assertEqual(None, html_body_declared_encoding(
+ b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
+
+ def test_html_body_declared_encoding_unicode(self):
+ # html_body_declared_encoding should work when unicode body is passed
+ self.assertEqual(None, html_body_declared_encoding(u"something else"))
+
+ for fragment in self.utf8_fragments:
+ encoding = html_body_declared_encoding(fragment.decode('utf8'))
+ self.assertEqual(encoding, 'utf-8', fragment)
+
+ self.assertEqual(None, html_body_declared_encoding(u"""
+ <head></head><body>
+ this isn't searched
+ <meta charset="utf-8">
+ """))
+ self.assertEqual(None, html_body_declared_encoding(
+ u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
+
+
+class CodecsEncodingTestCase(unittest.TestCase):
+ def test_resolve_encoding(self):
+ self.assertEqual(resolve_encoding('latin1'), 'cp1252')
+ self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
+ self.assertEqual(resolve_encoding('gb_2312-80'), 'gb18030')
+ self.assertEqual(resolve_encoding('unknown encoding'), None)
+
+
+class UnicodeDecodingTestCase(unittest.TestCase):
+
+ def test_utf8(self):
+ self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), u'\xa3')
+
+ def test_invalid_utf8(self):
+ self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), u'\ufffd\xa3')
+
+
+def ct(charset):
+ return "Content-Type: text/html; charset=" + charset if charset else None
+
+def norm_encoding(enc):
+ return codecs.lookup(enc).name
+
+class HtmlConversionTests(unittest.TestCase):
+
+ def test_unicode_body(self):
+ unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+ original_string = unicode_string.encode('cp1251')
+ encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
+ # check body_as_unicode
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(body_unicode, unicode_string)
+
+ def _assert_encoding(self, content_type, body, expected_encoding,
+ expected_unicode):
+ assert not isinstance(body, six.text_type)
+ encoding, body_unicode = html_to_unicode(ct(content_type), body)
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(norm_encoding(encoding),
+ norm_encoding(expected_encoding))
+
+ if isinstance(expected_unicode, six.string_types):
+ self.assertEqual(body_unicode, expected_unicode)
+ else:
+ self.assertTrue(
+ body_unicode in expected_unicode,
+ "%s is not in %s" % (body_unicode, expected_unicode)
+ )
+
+ def test_content_type_and_conversion(self):
+ """Test content type header is interpreted and text converted as
+ expected
+ """
+ self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', u"\xa3")
+ # something like this in the scrapy tests - but that's invalid?
+ # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+ # iso-8859-1 is overridden to cp1252
+ self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', u"\xa3")
+ self._assert_encoding('', b"\xc2\xa3", 'utf-8', u"\xa3")
+ self._assert_encoding('none', b"\xc2\xa3", 'utf-8', u"\xa3")
+ self._assert_encoding('gb2312', b"\xa8D", 'gb18030', u"\u2015")
+ self._assert_encoding('gbk', b"\xa8D", 'gb18030', u"\u2015")
+ self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', u"\u6052")
+
+ def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
+ # unlike scrapy, the BOM is stripped
+ self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xabWORD2",
+ 'utf-8', u'WORD\ufffd\ufffdWORD2')
+ self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xabWORD2",
+ 'utf-8', u'WORD\ufffd\ufffdWORD2')
+
+ def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
+ # Python implementations handle unexpected end of UTF8 data
+ # differently (see https://bugs.pypy.org/issue1536).
+ # It is hard to fix this for PyPy in w3lib, so the test
+ # is permissive.
+
+ # unlike scrapy, the BOM is stripped
+ self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xab",
+ 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+ self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xab",
+ 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+
+ def test_replace_wrong_encoding(self):
+ """Test invalid chars are replaced properly"""
+ encoding, body_unicode = html_to_unicode(ct('utf-8'),
+ b'PREFIX\xe3\xabSUFFIX')
+ # XXX: Policy for replacing invalid chars may suffer minor variations
+ # but it should always contain the unicode replacement char (u'\ufffd')
+ assert u'\ufffd' in body_unicode, repr(body_unicode)
+ assert u'PREFIX' in body_unicode, repr(body_unicode)
+ assert u'SUFFIX' in body_unicode, repr(body_unicode)
+
+ # Do not destroy html tags due to encoding bugs
+ encoding, body_unicode = html_to_unicode(ct('utf-8'),
+ b'\xf0<span>value</span>')
+ assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+
+ def _assert_encoding_detected(self, content_type, expected_encoding, body,
+ **kwargs):
+ assert not isinstance(body, six.text_type)
+ encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
+ self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
+
+ def test_BOM(self):
+ # utf-16 cases already tested, as is the BOM detection function
+
+ # http header takes precedence, irrespective of BOM
+ bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+ expected = u'\ufffd\ufffd\x00h\x00i'
+ self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
+
+ # BOM is stripped when it agrees with the encoding, or used to
+ # determine encoding
+ bom_utf8_str = codecs.BOM_UTF8 + b'hi'
+ self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
+ self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+
+ def test_utf16_32(self):
+ # tools.ietf.org/html/rfc2781 section 4.3
+
+ # USE BOM and strip it
+ bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+ self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
+ self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+
+ bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
+ self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
+ self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+
+ bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
+ self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
+ self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+
+ bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
+ self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
+ self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+
+ # if there is no BOM, big endian should be chosen
+ self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+ self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+
+ def test_html_encoding(self):
+ # extracting the encoding from raw html is tested elsewhere
+ body = b"""blah blah < meta http-equiv="Content-Type"
+ content="text/html; charset=iso-8859-1"> other stuff"""
+ self._assert_encoding_detected(None, 'cp1252', body)
+
+ # header encoding takes precedence
+ self._assert_encoding_detected('utf-8', 'utf-8', body)
+ # BOM encoding takes precedence
+ self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)
+
+ def test_autodetect(self):
+ asciif = lambda x: 'ascii'
+ body = b"""<meta charset="utf-8">"""
+ # body encoding takes precedence
+ self._assert_encoding_detected(None, 'utf-8', body,
+ auto_detect_fun=asciif)
+ # if no other encoding, the auto detect encoding is used.
+ self._assert_encoding_detected(None, 'ascii', b"no encoding info",
+ auto_detect_fun=asciif)
+
+ def test_default_encoding(self):
+ # if no other method available, the default encoding of utf-8 is used
+ self._assert_encoding_detected(None, 'utf-8', b"no encoding info")
+ # this can be overridden
+ self._assert_encoding_detected(None, 'ascii', b"no encoding info",
+ default_encoding='ascii')
+
+ def test_empty_body(self):
+ # if no other method available, the default encoding of utf-8 is used
+ self._assert_encoding_detected(None, 'utf-8', b"")
diff --git a/tests/test_form.py b/tests/test_form.py
new file mode 100644
index 0000000..280d879
--- /dev/null
+++ b/tests/test_form.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import warnings
+import unittest
+from collections import OrderedDict
+from w3lib.form import encode_multipart
+
+
+class EncodeMultipartTest(unittest.TestCase):
+
+ def test_encode_multipart(self):
+ data = {'key': 'value'}
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ expected_body = (
+ '\r\n--{boundary}'
+ '\r\nContent-Disposition: form-data; name="key"\r\n'
+ '\r\nvalue'
+ '\r\n--{boundary}--'
+ '\r\n'.format(boundary=boundary).encode('utf8')
+ )
+ self.assertEqual(body, expected_body)
+
+ def test_encode_multipart_unicode(self):
+ data = OrderedDict([
+ (u'ключ1', u'значение1'.encode('utf8')),
+ (u'ключ2', u'значение2'),
+ ])
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ expected_body = (
+ u'\r\n--{boundary}'
+ u'\r\nContent-Disposition: form-data; name="ключ1"\r\n'
+ u'\r\nзначение1'
+ u'\r\n--{boundary}'
+ u'\r\nContent-Disposition: form-data; name="ключ2"\r\n'
+ u'\r\nзначение2'
+ u'\r\n--{boundary}--'
+ u'\r\n'.format(boundary=boundary).encode('utf8')
+ )
+ self.assertEqual(body, expected_body)
+
+ def test_encode_multipart_file(self):
+ # this data is not decodable using utf8
+ data = {'key': ('file/name', b'\xa1\xa2\xa3\xa4\r\n\r')}
+ with warnings.catch_warnings(record=True):
+ body, boundary = encode_multipart(data)
+ body_lines = [
+ b'\r\n--' + boundary.encode('ascii'),
+ b'\r\nContent-Disposition: form-data; name="key"; filename="file/name"\r\n',
+ b'\r\n\xa1\xa2\xa3\xa4\r\n\r',
+ b'\r\n--' + boundary.encode('ascii') + b'--\r\n',
+ ]
+ expected_body = b''.join(body_lines)
+ self.assertEqual(body, expected_body)
+
+ #def test_encode_multipart_int(self):
+ # data = {'key': 123}
+ # body, boundary = encode_multipart2(data)
+ # expected_body = (
+ # '\n--{boundary}'
+ # '\nContent-Disposition: form-data; name="key"\n'
+ # '\n123'
+ # '\n--{boundary}--'
+ # '\n'.format(boundary=boundary)
+ # )
+ # self.assertEqual(body, expected_body)
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..68133cb
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,444 @@
+# -*- coding: utf-8 -*-
+import unittest
+import six
+from w3lib.html import (replace_entities, replace_tags, remove_comments,
+ remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup,
+ get_base_url, get_meta_refresh)
+
+
+class RemoveEntitiesTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return uncode
+ assert isinstance(replace_entities(b'no entities'), six.text_type)
+ assert isinstance(replace_entities(b'Price: £100!'), six.text_type)
+ assert isinstance(replace_entities(u'no entities'), six.text_type)
+ assert isinstance(replace_entities(u'Price: £100!'), six.text_type)
+
+ def test_regular(self):
+ # regular conversions
+ self.assertEqual(replace_entities(u'As low as £100!'),
+ u'As low as \xa3100!')
+ self.assertEqual(replace_entities(b'As low as £100!'),
+ u'As low as \xa3100!')
+ self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'),
+ u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
+
+ def test_keep_entities(self):
+ # keep some entities
+ self.assertEqual(replace_entities(b'<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']),
+ u'<b>Low < High & Medium \xa3 six</b>')
+ self.assertEqual(replace_entities(u'<b>Low < High & Medium £ six</b>', keep=[u'lt', u'amp']),
+ u'<b>Low < High & Medium \xa3 six</b>')
+
+ def test_illegal_entities(self):
+ self.assertEqual(replace_entities('a < b &illegal; c six', remove_illegal=False),
+ u'a < b &illegal; c six')
+ self.assertEqual(replace_entities('a < b &illegal; c six', remove_illegal=True),
+ u'a < b c six')
+ self.assertEqual(replace_entities('x≤y'), u'x\u2264y')
+ self.assertEqual(replace_entities('xy'), u'xy')
+ self.assertEqual(replace_entities('xy', remove_illegal=False), u'xy')
+
+ def test_browser_hack(self):
+ # check browser hack for numeric character references in the 80-9F range
+ self.assertEqual(replace_entities('xy', encoding='cp1252'), u'x\u2122y')
+ self.assertEqual(replace_entities('x™y', encoding='cp1252'), u'x\u2122y')
+
+ def test_missing_semicolon(self):
+ for entity, result in (
+ ('<<!', '<<!',),
+ ('<!', '<!',),
+ ('A ', 'A ',),
+ ('A!', 'A!',),
+ ('Ah', 'Ah',),
+ ('A!', 'A!',),
+ ('Ax', 'Ax',),
+ ('³!', u'\u00B3!',),
+ ('Á!', u'\u00C1!',),
+ ('☃!', u'\u2603!',),
+ ('™', u'\u2122',),
+ ('™', u'\u2122',),
+ ):
+ self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
+ self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), u'x%sy' % result)
+
+
+ def test_encoding(self):
+ self.assertEqual(replace_entities(b'x\x99™y', encoding='cp1252'), \
+ u'x\u2122\u2122\u2122y')
+
+
+class ReplaceTagsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return uncode
+ assert isinstance(replace_tags(b'no entities'), six.text_type)
+ assert isinstance(replace_tags('no entities'), six.text_type)
+
+ def test_replace_tags(self):
+ self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
+ u'This text contains some tag')
+ self.assertEqual(replace_tags(b'This text is very im<b>port</b>ant', ' '),
+ u'This text is very im port ant')
+
+ def test_replace_tags_multiline(self):
+ self.assertEqual(replace_tags(b'Click <a class="one"\r\n href="url">here</a>'),
+ u'Click here')
+
+
+class RemoveCommentsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_comments(b'without comments'), six.text_type)
+ assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
+ assert isinstance(remove_comments(u'without comments'), six.text_type)
+ assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
+
+ def test_no_comments(self):
+ # text without comments
+ self.assertEqual(remove_comments(u'text without comments'), u'text without comments')
+
+ def test_remove_comments(self):
+ # text with comments
+ self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
+ self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
+ self.assertEqual(remove_comments(u'Hello<!--My\nWorld-->'), u'Hello')
+
+ self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), u'test whatever')
+ self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), u'test whatever')
+
+
+class RemoveTagsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_tags(b'no tags'), six.text_type)
+ assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
+ assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
+ assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags(u'no tags'), six.text_type)
+ assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
+ assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
+ assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
+
+ def test_remove_tags_without_tags(self):
+ # text without tags
+ self.assertEqual(remove_tags(u'no tags'), u'no tags')
+ self.assertEqual(remove_tags(u'no tags', which_ones=('p', 'b',)), u'no tags')
+
+ def test_remove_tags(self):
+ # text with tags
+ self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
+ self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')
+
+ self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
+ u'<b>not will removed</b>i will removed')
+
+ def test_remove_tags_with_attributes(self):
+ # text with tags and attributes
+ self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>'), u'texty')
+ self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>', which_ones=('b',)),
+ u'<p align="center" class="one">texty</p>')
+
+ def test_remove_empty_tags(self):
+ # text with empty tags
+ self.assertEqual(remove_tags(u'a<br />b<br/>c'), u'abc')
+ self.assertEqual(remove_tags(u'a<br />b<br/>c', which_ones=('br',)), u'abc')
+
+ def test_keep_argument(self):
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('br',)), u'a<br />b<br/>c')
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p',)), u'<p>abc</p>')
+ self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p', 'br', 'div')), u'<p>a<br />b<br/>c</p>')
+
+ def test_uppercase_tags(self):
+ self.assertEqual(remove_tags(u'<foo></foo><bar></bar><baz/>', which_ones=('Foo', 'BAR', 'baZ')), u'')
+ self.assertEqual(remove_tags(u'<FOO></foO><BaR></bAr><BAZ/>', which_ones=('foo', 'bar', 'baz')), u'')
+
+
+class RemoveTagsWithContentTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
+ assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
+ assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
+ assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)
+
+ def test_without_tags(self):
+ # text without tags
+ self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
+ self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')
+
+ def test_with_tags(self):
+ # text with tags
+ self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
+ self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')
+
+ self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
+ u'<b>not will removed</b>')
+
+ def test_empty_tags(self):
+ # text with empty tags
+ self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')
+
+
+class ReplaceEscapeCharsTest(unittest.TestCase):
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
+ assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
+
+ def test_without_escape_chars(self):
+ # text without escape chars
+ self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
+ self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')
+
+ def test_with_escape_chars(self):
+ # text with escape chars
+ self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
+ self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
+ self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=b'\xc2\xa3'), u'escape\xa3chars\xa3')
+
+
+class UnquoteMarkupTest(unittest.TestCase):
+
+ sample_txt1 = u"""<node1>hi, this is sample text with entities: & ©
+<![CDATA[although this is inside a cdata! & "]]></node1>"""
+ sample_txt2 = u'<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>'
+ sample_txt3 = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'
+
+ def test_returns_unicode(self):
+ # make sure it always return unicode
+ assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type)
+ assert isinstance(unquote_markup(self.sample_txt2), six.text_type)
+
+ def test_unquote_markup(self):
+ self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
+although this is inside a cdata! & "</node1>""")
+
+ self.assertEqual(unquote_markup(self.sample_txt2), u'<node2>blah&blahblahblahblah!£moreblah<></node2>')
+
+ self.assertEqual(unquote_markup(self.sample_txt1 + self.sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
+although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""")
+
+ self.assertEqual(unquote_markup(self.sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
+
+
+class GetBaseUrlTest(unittest.TestCase):
+
+ def test_get_base_url(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
+ self.assertEqual(get_base_url(text, baseurl.encode('ascii')), 'http://example.org/something')
+
+
+ def test_relative_url_with_absolute_path(self):
+ baseurl = 'https://example.org'
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base href='/absolutepath' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://example.org/absolutepath')
+
+ def test_no_scheme_url(self):
+ baseurl = 'https://example.org'
+ text = b"""\
+ <html>\
+ <head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://noscheme.com/path')
+
+ def test_attributes_before_href(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
+
+ def test_tag_name(self):
+ baseurl = u'https://example.org'
+
+ text = u"""\
+ <html>\
+ <head><title>Dummy</title><basefoo href='http://example.org/something' /></head>\
+ <body>blahablsdfsal&</body>\
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl), 'https://example.org')
+
+ def test_get_base_url_utf8(self):
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl),
+ 'http://example.org/snowman%E2%8D%A8')
+
+ def test_get_base_url_latin1(self):
+ # page encoding does not affect URL path encoding before percent-escaping
+ # we should still use UTF-8 by default
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
+ 'http://example.org/sterling%C2%A3')
+
+ def test_get_base_url_latin1_percent(self):
+ # non-UTF-8 percent-encoded characters sequence are left untouched
+ baseurl = u'https://example.org'
+
+ text = u"""
+ <html>
+ <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head>
+ <body>blahablsdfsal&</body>
+ </html>"""
+ self.assertEqual(get_base_url(text, baseurl),
+ 'http://example.org/sterling%a3')
+
+
class GetMetaRefreshTest(unittest.TestCase):
    """Tests for get_meta_refresh(): extracting the (interval, url) pair
    from a <meta http-equiv="refresh"> tag."""

    def test_get_meta_refresh(self):
        """A plain refresh tag yields its interval and absolute URL."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        <body>blahablsdfsal&</body>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (5, 'http://example.org/newpage'))

    def test_without_url(self):
        """A refresh tag with no url= part returns (None, None)."""
        base_url = 'http://example.org'
        markup = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

        # But a url spread over two lines within the attribute is still found.
        markup = """<meta http-equiv="refresh" content="5;
        url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (5, 'http://example.org/newpage'))

    def test_multiline(self):
        """A refresh tag split across multiple lines is still parsed."""
        base_url = 'http://example.org'
        markup = """<html><head>
<META
HTTP-EQUIV="Refresh"
CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (1, 'http://example.org/newpage'))

    def test_entities_in_redirect_url(self):
        """HTML entities (&#39; quotes) around the redirect url are stripped."""
        base_url = 'http://example.org'
        markup = """<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://www.example.com/other'))

    def test_relative_redirects(self):
        """A relative redirect url is resolved against the page url."""
        base_url = 'http://example.com/page/this.html'
        markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/page/other.html'))

    def test_nonascii_url_utf8(self):
        """Non-ASCII bytes in the url (UTF-8, the default) are percent-escaped."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/to%C2%A3'))

    def test_nonascii_url_latin1(self):
        """Non-ASCII path bytes in a latin-1 page end up UTF-8 percent-escaped."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(markup, base_url, 'latin1'), (3, 'http://example.com/to%C2%A3'))

    def test_nonascii_url_latin1_query(self):
        """For a latin-1 page, only the query part keeps its latin-1 encoding
        before percent-escaping; the path is re-encoded as UTF-8."""
        base_url = 'http://example.com'
        markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
        self.assertEqual(get_meta_refresh(markup, base_url, 'latin1'), (3, 'http://example.com/to%C2%A3?unit=%B5'))

    def test_commented_meta_refresh(self):
        """A refresh tag inside an HTML comment must not trigger a redirect."""
        base_url = 'http://example.com'
        markup = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

    def test_html_comments_with_uncommented_meta_refresh(self):
        """An HTML comment elsewhere must not mask an uncommented refresh tag."""
        base_url = 'http://example.com'
        markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3, 'http://example.com/'))

    def test_float_refresh_intervals(self):
        """Fractional refresh intervals parse as floats."""
        base_url = 'http://example.com'
        markup = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (0.1, 'http://example.com/index.html'))

        markup = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base_url), (3.1, 'http://example.com/index.html'))

    def test_tag_name(self):
        """A tag merely starting with "meta" (e.g. <metafoo>) must be ignored."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        <body>blahablsdfsal&</body>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))

    def test_leading_newline_in_url(self):
        """Whitespace (including a leading newline) before the url is stripped."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (0.0, 'http://www.example.org/index.php'))

    def test_inside_noscript(self):
        """Refresh tags inside <noscript> are skipped by default but honoured
        when ignore_tags is emptied."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))
        self.assertEqual(get_meta_refresh(markup, base_url, ignore_tags=()), (0.0, "http://example.org/javascript_required"))

    def test_inside_script(self):
        """Refresh markup embedded in <script> text is skipped by default but
        honoured when ignore_tags is emptied."""
        base_url = 'http://example.org'
        markup = """
        <html>
        <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
        </html>"""
        self.assertEqual(get_meta_refresh(markup, base_url), (None, None))
        self.assertEqual(get_meta_refresh(markup, base_url, ignore_tags=()), (0.0, "http://example.org/foobar_required"))
diff --git a/tests/test_http.py b/tests/test_http.py
new file mode 100644
index 0000000..6ce53ca
--- /dev/null
+++ b/tests/test_http.py
@@ -0,0 +1,89 @@
+import unittest
+from collections import OrderedDict
+from w3lib.http import (basic_auth_header,
+ headers_dict_to_raw, headers_raw_to_dict)
+
+__doctests__ = ['w3lib.http'] # for trial support
+
+class HttpTests(unittest.TestCase):
+
+ def test_basic_auth_header(self):
+ self.assertEqual(b'Basic c29tZXVzZXI6c29tZXBhc3M=',
+ basic_auth_header('someuser', 'somepass'))
+ # Check url unsafe encoded header
+ self.assertEqual(b'Basic c29tZXVzZXI6QDx5dTk-Jm8_UQ==',
+ basic_auth_header('someuser', '@<yu9>&o?Q'))
+
+ def test_headers_raw_dict_none(self):
+ self.assertIsNone(headers_raw_to_dict(None))
+ self.assertIsNone(headers_dict_to_raw(None))
+
+ def test_headers_raw_to_dict(self):
+ raw = b"Content-type: text/html\n\rAccept: gzip\n\n"
+ dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
+ self.assertEqual(headers_raw_to_dict(raw), dct)
+
+ def test_headers_dict_to_raw(self):
+ dct = OrderedDict([
+ (b'Content-type', b'text/html'),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ def test_headers_dict_to_raw_listtuple(self):
+ dct = OrderedDict([
+ (b'Content-type', [b'text/html']),
+ (b'Accept', [b'gzip'])
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Content-type', (b'text/html',)),
+ (b'Accept', (b'gzip',))
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Content-type: text/html\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Cookie', (b'val001', b'val002')),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
+ )
+
+ dct = OrderedDict([
+ (b'Cookie', [b'val001', b'val002']),
+ (b'Accept', b'gzip')
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
+ )
+
+ def test_headers_dict_to_raw_wrong_values(self):
+ dct = OrderedDict([
+ (b'Content-type', 0),
+ ])
+ self.assertEqual(
+ headers_dict_to_raw(dct),
+ b''
+ )
... 1373 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-w3lib.git
More information about the Python-modules-commits
mailing list