[Python-modules-commits] [python-parsel] 01/04: Import python-parsel_1.2.0.orig.tar.gz

Michael Fladischer fladi at moszumanska.debian.org
Mon May 29 08:06:27 UTC 2017


This is an automated email from the git hooks/post-receive script.

fladi pushed a commit to branch master
in repository python-parsel.

commit f3348b883412fc419928334f8f1f29767a8feada
Author: Michael Fladischer <FladischerMichael@fladi.at>
Date:   Mon May 29 09:34:23 2017 +0200

    Import python-parsel_1.2.0.orig.tar.gz
---
 .bumpversion.cfg       |   2 +-
 .travis.yml            |  27 ++++++++-----
 NEWS                   |  18 +++++++++
 docs/usage.rst         |  11 ++++--
 parsel/__init__.py     |   2 +-
 parsel/selector.py     |  66 ++++++++++++++++++++++++++------
 parsel/utils.py        |  25 +++++++++----
 release.rst            |   2 +
 requirements.txt       |   4 --
 setup.py               |   5 ++-
 tests/test_selector.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
 tox.ini                |   4 +-
 12 files changed, 225 insertions(+), 41 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index a3c78c7..f8747a9 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.1.0
+current_version = 1.2.0
 commit = True
 tag = True
 tag_name = v{new_version}
diff --git a/.travis.yml b/.travis.yml
index 180fe4b..4beb4ec 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,18 @@
 language: python
-python: 3.5
-env:
-  - TOXENV=py27
-  - TOXENV=pypy
-  - TOXENV=py33
-  - TOXENV=py34
-  - TOXENV=py35
+matrix:
+  include:
+    - python: 2.7
+      env: TOXENV=py27
+    - python: 2.7
+      env: TOXENV=pypy
+    - python: 3.3
+      env: TOXENV=py33
+    - python: 3.4
+      env: TOXENV=py34
+    - python: 3.5
+      env: TOXENV=py35
+    - python: 3.6
+      env: TOXENV=py36
 
 install:
   - |
@@ -16,13 +23,13 @@ install:
         else
           rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
         fi
-        # get latest PyPy from pyenv directly (thanks to natural version sort option -V)
-        export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-[0-9][\.0-9]*$' |sort -V |tail -1`
+        # get latest (portable) PyPy from pyenv directly (thanks to natural version sort option -V)
+        export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-portable-[0-9][\.0-9]*$' |sort -V |tail -1`
         "$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION"
         virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION"
         source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
       fi
-  - pip install -U tox twine wheel codecov
+  - pip install -U pip tox twine wheel codecov
 script: tox
 after_success:
   - codecov
diff --git a/NEWS b/NEWS
index 71740db..28e2b68 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,22 @@
 History
 -------
 
+1.2.0 (2017-05-XX)
+~~~~~~~~~~~~~~~~~~
+
+* Add :meth:`~parsel.selector.SelectorList.get` and :meth:`~parsel.selector.SelectorList.getall`
+  methods as aliases for :meth:`~parsel.selector.SelectorList.extract_first`
+  and :meth:`~parsel.selector.SelectorList.extract` respectively
+* Add default value parameter to :meth:`~parsel.selector.SelectorList.re_first` method
+* Add :meth:`~parsel.selector.Selector.re_first` method to :class:`parsel.selector.Selector` class
+* Bug fix: detect ``None`` result from lxml parsing and fall back to an empty document
+* Rearrange XML/HTML examples in the selectors usage docs
+* Travis CI:
+
+  * Test against Python 3.6
+  * Test against PyPy using "Portable PyPy for Linux" distribution
+
+
 1.1.0 (2016-11-22)
 ~~~~~~~~~~~~~~~~~~
 
@@ -22,12 +38,14 @@ History
 * Integrate py.test runs with setuptools (needed for Debian packaging)
 * Changelog is now called ``NEWS``
 
+
 1.0.2 (2016-04-26)
 ~~~~~~~~~~~~~~~~~~
 
 * Fix bug in exception handling causing original traceback to be lost
 * Added docstrings and other doc fixes
 
+
 1.0.1 (2015-08-24)
 ~~~~~~~~~~~~~~~~~~
 
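
For reference, a minimal usage sketch of the 1.2.0 additions listed in the NEWS
entries above (a hedged illustration only: the HTML snippet and values are made
up, and it assumes parsel 1.2.0 is installed)::

    from parsel import Selector

    sel = Selector(text=u'<ul><li id="1">one</li><li id="2">two</li></ul>')

    # get()/getall() are aliases for extract_first()/extract()
    sel.css('li::text').get()       # u'one'
    sel.css('li::text').getall()    # [u'one', u'two']

    # re_first() now accepts a default value returned when nothing matches
    sel.css('li::text').re_first(r'\d+', default=u'n/a')   # u'n/a'
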
diff --git a/docs/usage.rst b/docs/usage.rst
index 9afda59..fa05f58 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -578,6 +578,8 @@ to use the ``.`` in the XPath expressions that will follow.
 API reference
 =============
 
+Selector objects
+----------------
 
 .. autoclass:: parsel.selector.Selector
     :members:
@@ -590,9 +592,10 @@ SelectorList objects
     :members:
 
 
+.. _selector-examples-html:
 
-Selector examples on HTML text
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Working on HTML
+---------------
 
 Here are some :class:`Selector` examples to illustrate several concepts.
 In all cases, we assume there is already a :class:`Selector` instantiated with
@@ -619,8 +622,8 @@ an HTML text like this::
 
 .. _selector-examples-xml:
 
-Selector examples on XML text
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Working on XML (and namespaces)
+-------------------------------
 
 Here are some examples to illustrate concepts for :class:`Selector` objects
 instantiated with an XML text like this::
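
As a quick companion to the renamed "Working on XML (and namespaces)" section, a
small hedged example of namespace handling (the feed URI and element names below
are invented for illustration)::

    from parsel import Selector

    doc = u'<f:root xmlns:f="http://example.com/feed"><f:item>hi</f:item></f:root>'
    sel = Selector(text=doc, type='xml')
    sel.register_namespace('f', 'http://example.com/feed')
    sel.xpath('//f:item/text()').get()   # u'hi'
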
diff --git a/parsel/__init__.py b/parsel/__init__.py
index 113644e..735e62d 100644
--- a/parsel/__init__.py
+++ b/parsel/__init__.py
@@ -5,7 +5,7 @@ or CSS selectors
 
 __author__ = 'Scrapy project'
 __email__ = 'info@scrapy.org'
-__version__ = '1.1.0'
+__version__ = '1.2.0'
 
 from parsel.selector import Selector, SelectorList  # NOQA
 from parsel.csstranslator import css2xpath  # NOQA
diff --git a/parsel/selector.py b/parsel/selector.py
index ae3c633..33eaede 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -40,7 +40,10 @@ def create_root_node(text, parser_cls, base_url=None):
     """
     body = text.strip().encode('utf8') or b'<html/>'
     parser = parser_cls(recover=True, encoding='utf8')
-    return etree.fromstring(body, parser=parser, base_url=base_url)
+    root = etree.fromstring(body, parser=parser, base_url=base_url)
+    if root is None:
+        root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
+    return root
 
 
 class SelectorList(list):
@@ -71,7 +74,7 @@ class SelectorList(list):
         saved for future calls.
 
         Any additional named arguments can be used to pass values for XPath
-        variables in the XPath expression, e.g.:
+        variables in the XPath expression, e.g.::
 
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
@@ -86,20 +89,34 @@ class SelectorList(list):
         """
         return self.__class__(flatten([x.css(query) for x in self]))
 
-    def re(self, regex):
+    def re(self, regex, replace_entities=True):
         """
         Call the ``.re()`` method for each element in this list and return
         their results flattened, as a list of unicode strings.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        return flatten([x.re(regex) for x in self])
+        return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
 
-    def re_first(self, regex):
+    def re_first(self, regex, default=None, replace_entities=True):
         """
         Call the ``.re()`` method for the first element in this list and
-        return the result in an unicode string.
+        return the result in a unicode string. If the list is empty or the
+        regex doesn't match anything, return the default value (``None`` if
+        the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        for el in iflatten(x.re(regex) for x in self):
+        for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self):
             return el
+        else:
+            return default
 
     def extract(self):
         """
@@ -107,6 +124,7 @@ class SelectorList(list):
         their results flattened, as a list of unicode strings.
         """
         return [x.extract() for x in self]
+    getall = extract
 
     def extract_first(self, default=None):
         """
@@ -117,6 +135,7 @@ class SelectorList(list):
             return x.extract()
         else:
             return default
+    get = extract_first
 
 
 class Selector(object):
@@ -185,7 +204,7 @@ class Selector(object):
         saved for future calls.
 
         Any additional named arguments can be used to pass values for XPath
-        variables in the XPath expression, e.g.:
+        variables in the XPath expression, e.g.::
 
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
@@ -229,15 +248,33 @@ class Selector(object):
     def _css2xpath(self, query):
         return self._csstranslator.css_to_xpath(query)
 
-    def re(self, regex):
+    def re(self, regex, replace_entities=True):
         """
         Apply the given regex and return a list of unicode strings with the
         matches.
 
         ``regex`` can be either a compiled regular expression or a string which
-        will be compiled to a regular expression using ``re.compile(regex)``
+        will be compiled to a regular expression using ``re.compile(regex)``.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
+        """
+        return extract_regex(regex, self.extract(), replace_entities=replace_entities)
+
+    def re_first(self, regex, default=None, replace_entities=True):
+        """
+        Apply the given regex and return the first unicode string which
+        matches. If there is no match, return the default value (``None`` if
+        the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        return extract_regex(regex, self.extract())
+        return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)
 
     def extract(self):
         """
@@ -256,6 +293,13 @@ class Selector(object):
                 return u'0'
             else:
                 return six.text_type(self.root)
+    get = extract
+
+    def getall(self):
+        """
+        Serialize and return the matched node in a 1-element list of unicode strings.
+        """
+        return [self.get()]
 
     def register_namespace(self, prefix, uri):
         """
diff --git a/parsel/utils.py b/parsel/utils.py
index 5c2bdef..56bb105 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,6 +1,6 @@
 import re
 import six
-from w3lib.html import replace_entities
+from w3lib.html import replace_entities as w3lib_replace_entities
 
 
 def flatten(x):
@@ -56,7 +56,7 @@ def _is_listlike(x):
     return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
 
 
-def extract_regex(regex, text):
+def extract_regex(regex, text, replace_entities=True):
     """Extract a list of unicode strings from the given text/encoding using the following policies:
     * if the regex contains a named group called "extract" that will be returned
     * if the regex contains multiple numbered groups, all those will be returned (flattened)
@@ -65,8 +65,19 @@ def extract_regex(regex, text):
     if isinstance(regex, six.string_types):
         regex = re.compile(regex, re.UNICODE)
 
-    try:
-        strings = [regex.search(text).group('extract')]   # named group
-    except:
-        strings = regex.findall(text)    # full regex or numbered groups
-    return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(strings)]
+    if 'extract' in regex.groupindex:
+        # named group
+        try:
+            extracted = regex.search(text).group('extract')
+        except AttributeError:
+            strings = []
+        else:
+            strings = [extracted] if extracted is not None else []
+    else:
+        # full regex or numbered groups
+        strings = regex.findall(text)
+
+    strings = flatten(strings)
+    if not replace_entities:
+        return strings
+    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
\ No newline at end of file
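
A brief sketch of the regex policies handled by the rewritten helper above
(``extract_regex`` is an internal helper; the inputs below are invented and the
expected results assume the code in this hunk)::

    from parsel.utils import extract_regex

    extract_regex(r'Age: (?P<extract>\d+)', u'Age: 27')     # [u'27']  named group
    extract_regex(r'Age: (?P<extract>\d+)', u'no match')    # []
    extract_regex(r'(\d+)-(\d+)', u'10-20')                 # [u'10', u'20']  numbered groups
    extract_regex(r'\d+-\d+', u'10-20')                     # [u'10-20']  full match
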
diff --git a/release.rst b/release.rst
index 469093d..72b4341 100644
--- a/release.rst
+++ b/release.rst
@@ -7,3 +7,5 @@ Release procedures
 * Copy release notes to https://github.com/scrapy/parsel/releases
 * Verify in a temporary virtualenv that ``pip install parsel`` installs the
   latest version
+* Update version builds at: https://readthedocs.org/projects/parsel/versions/
+  You should ensure that the previous stable version is active and point stable to the new tag
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index a02cc48..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-lxml==3.4.4
-six==1.9.0
-cssselect==0.9.1
-w3lib==1.11.0
diff --git a/setup.py b/setup.py
index 2f42c5b..44b2954 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ test_requirements = [
 
 setup(
     name='parsel',
-    version='1.1.0',
+    version='1.2.0',
     description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
     long_description=readme + '\n\n' + history,
     author="Scrapy project",
@@ -51,6 +51,9 @@ setup(
         'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
     ],
     setup_requires=['pytest-runner',],
     tests_require=['pytest',],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 09ecd51..d36aa27 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -139,6 +139,30 @@ class SelectorTestCase(unittest.TestCase):
 
         self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
 
+    def test_selector_get_alias(self):
+        """Test if get() returns extracted value on a Selector"""
+        body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'<li id="2">2</li>')
+        self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2')
+
+    def test_selector_getall_alias(self):
+        """Test if get() returns extracted value on a Selector"""
+        body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+        sel = self.sscls(text=body)
+
+        self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'<li id="2">2</li>'])
+        self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2'])
+
+    def test_selectorlist_get_alias(self):
+        """Test if get() returns first element for a selection call"""
+        body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath('//ul/li').get(), u'<li id="1">1</li>')
+        self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1')
+
     def test_re_first(self):
         """Test if re_first() returns first matched element"""
         body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
@@ -156,6 +180,18 @@ class SelectorTestCase(unittest.TestCase):
         self.assertEqual(sel.xpath('/ul/li/text()').re_first('\w+'), None)
         self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first('\d'), None)
 
+        self.assertEqual(sel.re_first(r'id="(\d+)'), '1')
+        self.assertEqual(sel.re_first(r'foo'), None)
+        self.assertEqual(sel.re_first(r'foo', default='bar'), 'bar')
+
+    def test_re_first_default(self):
+        """Test if re_first() returns default value when no results found"""
+        body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath('//div/text()').re_first('\w+', default='missing'), 'missing')
+        self.assertEqual(sel.xpath('/ul/li/text()').re_first('\w+', default='missing'), 'missing')
+
     def test_select_unicode_query(self):
         body = u"<p><input name='\xa9' value='1'/></p>"
         sel = self.sscls(text=body)
@@ -237,6 +273,31 @@ class SelectorTestCase(unittest.TestCase):
                          ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
         self.assertEqual(divtwo.xpath("./li").extract(), [])
 
+    def test_selectorlist_getall_alias(self):
+        """Nested selector tests using getall()"""
+        body = u"""<body>
+                    <div class='one'>
+                      <ul>
+                        <li>one</li><li>two</li>
+                      </ul>
+                    </div>
+                    <div class='two'>
+                      <ul>
+                        <li>four</li><li>five</li><li>six</li>
+                      </ul>
+                    </div>
+                  </body>"""
+
+        x = self.sscls(text=body)
+        divtwo = x.xpath('//div[@class="two"]')
+        self.assertEqual(divtwo.xpath("//li").getall(),
+                         ["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"])
+        self.assertEqual(divtwo.xpath("./ul/li").getall(),
+                         ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
+        self.assertEqual(divtwo.xpath(".//li").getall(),
+                         ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
+        self.assertEqual(divtwo.xpath("./li").getall(), [])
+
     def test_mixed_nested_selectors(self):
         body = u'''<body>
                     <div id=1>not<span>me</span></div>
@@ -394,6 +455,41 @@ class SelectorTestCase(unittest.TestCase):
                          ["John", "Paul"])
         self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
                          ["10", "20"])
+
+        # Test named group, hit and miss
+        x = self.sscls(text=u'foobar')
+        self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
+        self.assertEqual(x.re('(?P<extract>baz)'), [])
+
+        # A purposely constructed test for an edge case
+        x = self.sscls(text=u'baz')
+        self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
+
+    def test_re_replace_entities(self):
+        body = u"""<script>{"foo":"bar & "baz""}</script>"""
+        x = self.sscls(text=body)
+        
+        name_re = re.compile('{"foo":(.*)}')
+
+        # by default, only & and < are preserved ;
+        # other entities are converted
+        expected = u'"bar & "baz""'
+        self.assertEqual(x.xpath("//script/text()").re(name_re), [expected])
+        self.assertEqual(x.xpath("//script").re(name_re), [expected])
+        self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected])
+        self.assertEqual(x.xpath("//script")[0].re(name_re), [expected])
+
+        # check that re_first() works the same way for single value output
+        self.assertEqual(x.xpath("//script").re_first(name_re), expected)
+        self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected)
+
+        # switching off replace_entities will preserve &quot; also
+        expected = u'"bar &amp; &quot;baz&quot;"'
+        self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected])
+        self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected])
+
+        self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected)
+        self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
 
     def test_re_intl(self):
         body = u'<div>Evento: cumplea\xf1os</div>'
@@ -435,6 +531,10 @@ class SelectorTestCase(unittest.TestCase):
     def test_empty_bodies_shouldnt_raise_errors(self):
         self.sscls(text=u'').xpath('//text()').extract()
 
+    def test_bodies_with_comments_only(self):
+        sel = self.sscls(text=u'<!-- hello world -->', base_url='http://example.com')
+        self.assertEquals(u'http://example.com', sel.root.base)
+
     def test_null_bytes_shouldnt_raise_errors(self):
         text = u'<root>pre\x00post</root>'
         self.sscls(text).xpath('//text()').extract()
diff --git a/tox.ini b/tox.ini
index 5faf92b..ae9eee0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,8 @@
 [tox]
-envlist = py27, py33, py34, py35, pypy
+envlist = py27, py33, py34, py35, py36, pypy
 
 [testenv]
 deps =
     -r{toxinidir}/tests/requirements.txt
-    .
+
 commands = py.test --cov=parsel --cov-report= {posargs:parsel tests}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-parsel.git


