[Python-modules-commits] [python-parsel] 01/04: Import python-parsel_1.2.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Mon May 29 08:06:27 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master
in repository python-parsel.
commit f3348b883412fc419928334f8f1f29767a8feada
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Mon May 29 09:34:23 2017 +0200
Import python-parsel_1.2.0.orig.tar.gz
---
.bumpversion.cfg | 2 +-
.travis.yml | 27 ++++++++-----
NEWS | 18 +++++++++
docs/usage.rst | 11 ++++--
parsel/__init__.py | 2 +-
parsel/selector.py | 66 ++++++++++++++++++++++++++------
parsel/utils.py | 25 +++++++++----
release.rst | 2 +
requirements.txt | 4 --
setup.py | 5 ++-
tests/test_selector.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
tox.ini | 4 +-
12 files changed, 225 insertions(+), 41 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index a3c78c7..f8747a9 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.1.0
+current_version = 1.2.0
commit = True
tag = True
tag_name = v{new_version}
diff --git a/.travis.yml b/.travis.yml
index 180fe4b..4beb4ec 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,18 @@
language: python
-python: 3.5
-env:
- - TOXENV=py27
- - TOXENV=pypy
- - TOXENV=py33
- - TOXENV=py34
- - TOXENV=py35
+matrix:
+ include:
+ - python: 2.7
+ env: TOXENV=py27
+ - python: 2.7
+ env: TOXENV=pypy
+ - python: 3.3
+ env: TOXENV=py33
+ - python: 3.4
+ env: TOXENV=py34
+ - python: 3.5
+ env: TOXENV=py35
+ - python: 3.6
+ env: TOXENV=py36
install:
- |
@@ -16,13 +23,13 @@ install:
else
rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
fi
- # get latest PyPy from pyenv directly (thanks to natural version sort option -V)
- export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-[0-9][\.0-9]*$' |sort -V |tail -1`
+ # get latest (portable) PyPy from pyenv directly (thanks to natural version sort option -V)
+ export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-portable-[0-9][\.0-9]*$' |sort -V |tail -1`
"$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION"
virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION"
source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
fi
- - pip install -U tox twine wheel codecov
+ - pip install -U pip tox twine wheel codecov
script: tox
after_success:
- codecov
diff --git a/NEWS b/NEWS
index 71740db..28e2b68 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,22 @@
History
-------
+1.2.0 (2017-05-XX)
+~~~~~~~~~~~~~~~~~~
+
+* Add :meth:`~parsel.selector.SelectorList.get` and :meth:`~parsel.selector.SelectorList.getall`
+ methods as aliases for :meth:`~parsel.selector.SelectorList.extract_first`
+ and :meth:`~parsel.selector.SelectorList.extract` respectively
+* Add default value parameter to :meth:`~parsel.selector.SelectorList.re_first` method
+* Add :meth:`~parsel.selector.Selector.re_first` method to :class:`parsel.selector.Selector` class
+* Bug fix: detect ``None`` result from lxml parsing and fallback with an empty document
+* Rearrange XML/HTML examples in the selectors usage docs
+* Travis CI:
+
+ * Test against Python 3.6
+ * Test against PyPy using "Portable PyPy for Linux" distribution
+
+
1.1.0 (2016-11-22)
~~~~~~~~~~~~~~~~~~
@@ -22,12 +38,14 @@ History
* Integrate py.test runs with setuptools (needed for Debian packaging)
* Changelog is now called ``NEWS``
+
1.0.2 (2016-04-26)
~~~~~~~~~~~~~~~~~~
* Fix bug in exception handling causing original traceback to be lost
* Added docstrings and other doc fixes
+
1.0.1 (2015-08-24)
~~~~~~~~~~~~~~~~~~
diff --git a/docs/usage.rst b/docs/usage.rst
index 9afda59..fa05f58 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -578,6 +578,8 @@ to use the ``.`` in the XPath expressions that will follow.
API reference
=============
+Selector objects
+----------------
.. autoclass:: parsel.selector.Selector
:members:
@@ -590,9 +592,10 @@ SelectorList objects
:members:
+.. _selector-examples-html:
-Selector examples on HTML text
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Working on HTML
+---------------
Here are some :class:`Selector` examples to illustrate several concepts.
In all cases, we assume there is already a :class:`Selector` instantiated with
@@ -619,8 +622,8 @@ an HTML text like this::
.. _selector-examples-xml:
-Selector examples on XML text
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Working on XML (and namespaces)
+-------------------------------
Here are some examples to illustrate concepts for :class:`Selector` objects
instantiated with an XML text like this::
diff --git a/parsel/__init__.py b/parsel/__init__.py
index 113644e..735e62d 100644
--- a/parsel/__init__.py
+++ b/parsel/__init__.py
@@ -5,7 +5,7 @@ or CSS selectors
__author__ = 'Scrapy project'
__email__ = 'info at scrapy.org'
-__version__ = '1.1.0'
+__version__ = '1.2.0'
from parsel.selector import Selector, SelectorList # NOQA
from parsel.csstranslator import css2xpath # NOQA
diff --git a/parsel/selector.py b/parsel/selector.py
index ae3c633..33eaede 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -40,7 +40,10 @@ def create_root_node(text, parser_cls, base_url=None):
"""
body = text.strip().encode('utf8') or b'<html/>'
parser = parser_cls(recover=True, encoding='utf8')
- return etree.fromstring(body, parser=parser, base_url=base_url)
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
+ if root is None:
+ root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
+ return root
class SelectorList(list):
@@ -71,7 +74,7 @@ class SelectorList(list):
saved for future calls.
Any additional named arguments can be used to pass values for XPath
- variables in the XPath expression, e.g.:
+ variables in the XPath expression, e.g.::
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
@@ -86,20 +89,34 @@ class SelectorList(list):
"""
return self.__class__(flatten([x.css(query) for x in self]))
- def re(self, regex):
+ def re(self, regex, replace_entities=True):
"""
Call the ``.re()`` method for each element in this list and return
their results flattened, as a list of unicode strings.
+
+ By default, character entity references are replaced by their
+ corresponding character (except for ``&amp;`` and ``&lt;``.
+ Passing ``replace_entities`` as ``False`` switches off these
+ replacements.
"""
- return flatten([x.re(regex) for x in self])
+ return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
- def re_first(self, regex):
+ def re_first(self, regex, default=None, replace_entities=True):
"""
Call the ``.re()`` method for the first element in this list and
- return the result in an unicode string.
+ return the result in an unicode string. If the list is empty or the
+ regex doesn't match anything, return the default value (``None`` if
+ the argument is not provided).
+
+ By default, character entity references are replaced by their
+ corresponding character (except for ``&amp;`` and ``&lt;``.
+ Passing ``replace_entities`` as ``False`` switches off these
+ replacements.
"""
- for el in iflatten(x.re(regex) for x in self):
+ for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self):
return el
+ else:
+ return default
def extract(self):
"""
@@ -107,6 +124,7 @@ class SelectorList(list):
their results flattened, as a list of unicode strings.
"""
return [x.extract() for x in self]
+ getall = extract
def extract_first(self, default=None):
"""
@@ -117,6 +135,7 @@ class SelectorList(list):
return x.extract()
else:
return default
+ get = extract_first
class Selector(object):
@@ -185,7 +204,7 @@ class Selector(object):
saved for future calls.
Any additional named arguments can be used to pass values for XPath
- variables in the XPath expression, e.g.:
+ variables in the XPath expression, e.g.::
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
@@ -229,15 +248,33 @@ class Selector(object):
def _css2xpath(self, query):
return self._csstranslator.css_to_xpath(query)
- def re(self, regex):
+ def re(self, regex, replace_entities=True):
"""
Apply the given regex and return a list of unicode strings with the
matches.
``regex`` can be either a compiled regular expression or a string which
- will be compiled to a regular expression using ``re.compile(regex)``
+ will be compiled to a regular expression using ``re.compile(regex)``.
+
+ By default, character entity references are replaced by their
+ corresponding character (except for ``&amp;`` and ``&lt;``.
+ Passing ``replace_entities`` as ``False`` switches off these
+ replacements.
+ """
+ return extract_regex(regex, self.extract(), replace_entities=replace_entities)
+
+ def re_first(self, regex, default=None, replace_entities=True):
+ """
+ Apply the given regex and return the first unicode string which
+ matches. If there is no match, return the default value (``None`` if
+ the argument is not provided).
+
+ By default, character entity references are replaced by their
+ corresponding character (except for ``&amp;`` and ``&lt;``.
+ Passing ``replace_entities`` as ``False`` switches off these
+ replacements.
"""
- return extract_regex(regex, self.extract())
+ return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)
def extract(self):
"""
@@ -256,6 +293,13 @@ class Selector(object):
return u'0'
else:
return six.text_type(self.root)
+ get = extract
+
+ def getall(self):
+ """
+ Serialize and return the matched node in a 1-element list of unicode strings.
+ """
+ return [self.get()]
def register_namespace(self, prefix, uri):
"""
diff --git a/parsel/utils.py b/parsel/utils.py
index 5c2bdef..56bb105 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,6 +1,6 @@
import re
import six
-from w3lib.html import replace_entities
+from w3lib.html import replace_entities as w3lib_replace_entities
def flatten(x):
@@ -56,7 +56,7 @@ def _is_listlike(x):
return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
-def extract_regex(regex, text):
+def extract_regex(regex, text, replace_entities=True):
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
@@ -65,8 +65,19 @@ def extract_regex(regex, text):
if isinstance(regex, six.string_types):
regex = re.compile(regex, re.UNICODE)
- try:
- strings = [regex.search(text).group('extract')] # named group
- except:
- strings = regex.findall(text) # full regex or numbered groups
- return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(strings)]
+ if 'extract' in regex.groupindex:
+ # named group
+ try:
+ extracted = regex.search(text).group('extract')
+ except AttributeError:
+ strings = []
+ else:
+ strings = [extracted] if extracted is not None else []
+ else:
+ # full regex or numbered groups
+ strings = regex.findall(text)
+
+ strings = flatten(strings)
+ if not replace_entities:
+ return strings
+ return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
\ No newline at end of file
diff --git a/release.rst b/release.rst
index 469093d..72b4341 100644
--- a/release.rst
+++ b/release.rst
@@ -7,3 +7,5 @@ Release procedures
* Copy release notes to https://github.com/scrapy/parsel/releases
* Verify in a temporary virtualenv that ``pip install parsel`` installs the
latest version
+* Update version builds at: https://readthedocs.org/projects/parsel/versions/
+ You should ensure that previous stable version is active and point stable to the new tag
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index a02cc48..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-lxml==3.4.4
-six==1.9.0
-cssselect==0.9.1
-w3lib==1.11.0
diff --git a/setup.py b/setup.py
index 2f42c5b..44b2954 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ test_requirements = [
setup(
name='parsel',
- version='1.1.0',
+ version='1.2.0',
description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
long_description=readme + '\n\n' + history,
author="Scrapy project",
@@ -51,6 +51,9 @@ setup(
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: Implementation :: CPython',
+ 'Programming Language :: Python :: Implementation :: PyPy',
],
setup_requires=['pytest-runner',],
tests_require=['pytest',],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 09ecd51..d36aa27 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -139,6 +139,30 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
+ def test_selector_get_alias(self):
+ """Test if get() returns extracted value on a Selector"""
+ body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ sel = self.sscls(text=body)
+
+ self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'<li id="2">2</li>')
+ self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2')
+
+ def test_selector_getall_alias(self):
+ """Test if get() returns extracted value on a Selector"""
+ body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ sel = self.sscls(text=body)
+
+ self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'<li id="2">2</li>'])
+ self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2'])
+
+ def test_selectorlist_get_alias(self):
+ """Test if get() returns first element for a selection call"""
+ body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ sel = self.sscls(text=body)
+
+ self.assertEqual(sel.xpath('//ul/li').get(), u'<li id="1">1</li>')
+ self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1')
+
def test_re_first(self):
"""Test if re_first() returns first matched element"""
body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
@@ -156,6 +180,18 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(sel.xpath('/ul/li/text()').re_first('\w+'), None)
self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first('\d'), None)
+ self.assertEqual(sel.re_first(r'id="(\d+)'), '1')
+ self.assertEqual(sel.re_first(r'foo'), None)
+ self.assertEqual(sel.re_first(r'foo', default='bar'), 'bar')
+
+ def test_extract_first_default(self):
+ """Test if re_first() returns default value when no results found"""
+ body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+ sel = self.sscls(text=body)
+
+ self.assertEqual(sel.xpath('//div/text()').re_first('\w+', default='missing'), 'missing')
+ self.assertEqual(sel.xpath('/ul/li/text()').re_first('\w+', default='missing'), 'missing')
+
def test_select_unicode_query(self):
body = u"<p><input name='\xa9' value='1'/></p>"
sel = self.sscls(text=body)
@@ -237,6 +273,31 @@ class SelectorTestCase(unittest.TestCase):
["<li>four</li>", "<li>five</li>", "<li>six</li>"])
self.assertEqual(divtwo.xpath("./li").extract(), [])
+ def test_selectorlist_getall_alias(self):
+ """Nested selector tests using getall()"""
+ body = u"""<body>
+ <div class='one'>
+ <ul>
+ <li>one</li><li>two</li>
+ </ul>
+ </div>
+ <div class='two'>
+ <ul>
+ <li>four</li><li>five</li><li>six</li>
+ </ul>
+ </div>
+ </body>"""
+
+ x = self.sscls(text=body)
+ divtwo = x.xpath('//div[@class="two"]')
+ self.assertEqual(divtwo.xpath("//li").getall(),
+ ["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"])
+ self.assertEqual(divtwo.xpath("./ul/li").getall(),
+ ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
+ self.assertEqual(divtwo.xpath(".//li").getall(),
+ ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
+ self.assertEqual(divtwo.xpath("./li").getall(), [])
+
def test_mixed_nested_selectors(self):
body = u'''<body>
<div id=1>not<span>me</span></div>
@@ -394,6 +455,41 @@ class SelectorTestCase(unittest.TestCase):
["John", "Paul"])
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
["10", "20"])
+
+ # Test named group, hit and miss
+ x = self.sscls(text=u'foobar')
+ self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
+ self.assertEqual(x.re('(?P<extract>baz)'), [])
+
+ # A purposely constructed test for an edge case
+ x = self.sscls(text=u'baz')
+ self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
+
+ def test_re_replace_entities(self):
+ body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
+ x = self.sscls(text=body)
+
+ name_re = re.compile('{"foo":(.*)}')
+
+ # by default, only &amp; and &lt; are preserved ;
+ # other entities are converted
+ expected = u'"bar &amp; "baz""'
+ self.assertEqual(x.xpath("//script/text()").re(name_re), [expected])
+ self.assertEqual(x.xpath("//script").re(name_re), [expected])
+ self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected])
+ self.assertEqual(x.xpath("//script")[0].re(name_re), [expected])
+
+ # check that re_first() works the same way for single value output
+ self.assertEqual(x.xpath("//script").re_first(name_re), expected)
+ self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected)
+
+ # switching off replace_entities will preserve &quot; also
+ expected = u'"bar &amp; &quot;baz&quot;"'
+ self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected])
+ self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected])
+
+ self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected)
+ self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
def test_re_intl(self):
body = u'<div>Evento: cumplea\xf1os</div>'
@@ -435,6 +531,10 @@ class SelectorTestCase(unittest.TestCase):
def test_empty_bodies_shouldnt_raise_errors(self):
self.sscls(text=u'').xpath('//text()').extract()
+ def test_bodies_with_comments_only(self):
+ sel = self.sscls(text=u'<!-- hello world -->', base_url='http://example.com')
+ self.assertEquals(u'http://example.com', sel.root.base)
+
def test_null_bytes_shouldnt_raise_errors(self):
text = u'<root>pre\x00post</root>'
self.sscls(text).xpath('//text()').extract()
diff --git a/tox.ini b/tox.ini
index 5faf92b..ae9eee0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,8 @@
[tox]
-envlist = py27, py33, py34, py35, pypy
+envlist = py27, py33, py34, py35, py36, pypy
[testenv]
deps =
-r{toxinidir}/tests/requirements.txt
- .
+
commands = py.test --cov=parsel --cov-report= {posargs:parsel tests}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-parsel.git
More information about the Python-modules-commits
mailing list