[Python-modules-commits] [python-parsel] 01/07: Import python-parsel_1.1.0.orig.tar.gz

Michael Fladischer fladi at moszumanska.debian.org
Wed Jan 25 08:56:20 UTC 2017


This is an automated email from the git hooks/post-receive script.

fladi pushed a commit to branch master
in repository python-parsel.

commit ba3cf6c2a5912cfb7dac35a17d61ccf27434a4d2
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date:   Wed Jan 25 09:38:13 2017 +0100

    Import python-parsel_1.1.0.orig.tar.gz
---
 .bumpversion.cfg                     |   2 +-
 NEWS                                 |  11 +++
 docs/usage.rst                       | 178 ++++++++++++++++++++++++++++++++++-
 parsel/__init__.py                   |   3 +-
 parsel/csstranslator.py              |   8 ++
 parsel/selector.py                   |  42 +++++++--
 setup.py                             |   7 +-
 tests/test_selector.py               | 145 ++++++++++++++++++++++++++++
 tests/test_selector_csstranslator.py |   8 ++
 9 files changed, 387 insertions(+), 17 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 8867812..a3c78c7 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.0.3
+current_version = 1.1.0
 commit = True
 tag = True
 tag_name = v{new_version}
diff --git a/NEWS b/NEWS
index 0826850..71740db 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,17 @@
 History
 -------
 
+1.1.0 (2016-11-22)
+~~~~~~~~~~~~~~~~~~
+
+* Change default HTML parser to `lxml.html.HTMLParser <http://lxml.de/api/lxml.html.HTMLParser-class.html>`_,
+  which makes it easier to use some HTML-specific features
+* Add css2xpath function to translate CSS to XPath
+* Add support for ad-hoc namespace declarations
+* Add support for XPath variables
+* Documentation improvements and updates
+
+
 1.0.3 (2016-07-29)
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/usage.rst b/docs/usage.rst
index 3820412..9afda59 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -159,6 +159,64 @@ Now we're going to get the base URL and some image links::
      u'image4_thumb.jpg',
      u'image5_thumb.jpg']
 
+.. _topics-selectors-css-extensions:
+
+Extensions to CSS Selectors
+---------------------------
+
+Per W3C standards, `CSS selectors`_ do not support selecting text nodes
+or attribute values.
+But selecting these is so essential in a web scraping context
+that Parsel implements a couple of **non-standard pseudo-elements**:
+
+* to select text nodes, use ``::text``
+* to select attribute values, use ``::attr(name)`` where *name* is the
+  name of the attribute that you want the value of
+
+.. warning::
+    These pseudo-elements are Scrapy-/Parsel-specific.
+    They will most probably not work with other libraries like `lxml`_ or `PyQuery`_.
+
+
+Examples:
+
+* ``title::text`` selects child text nodes of a descendant ``<title>`` element::
+
+    >>> selector.css('title::text').extract_first()
+    u'Example website'
+
+* ``*::text`` selects all descendant text nodes of the current selector context::
+
+    >>> selector.css('#images *::text').extract()
+    [u'\n   ',
+     u'Name: My image 1 ',
+     u'\n   ',
+     u'Name: My image 2 ',
+     u'\n   ',
+     u'Name: My image 3 ',
+     u'\n   ',
+     u'Name: My image 4 ',
+     u'\n   ',
+     u'Name: My image 5 ',
+     u'\n  ']
+
+* ``a::attr(href)`` selects the *href* attribute value of descendant links::
+
+    >>> selector.css('a::attr(href)').extract()
+    [u'image1.html',
+     u'image2.html',
+     u'image3.html',
+     u'image4.html',
+     u'image5.html']
+
+.. note::
+    You cannot chain these pseudo-elements. But in practice it would not
+    make much sense: text nodes do not have attributes, and attribute values
+    are already strings and do not have child nodes.
+
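+Here's a minimal, self-contained sketch of both pseudo-elements together
+(the one-line HTML snippet is made up for illustration)::
+
+    >>> from parsel import Selector
+    >>> sel = Selector(text=u'<a href="http://example.com">homepage</a>')
+    >>> sel.css('a::text').extract_first()
+    u'homepage'
+    >>> sel.css('a::attr(href)').extract_first()
+    u'http://example.com'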
+
+.. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors
+
 .. _topics-selectors-nesting-selectors:
 
 Nesting selectors
@@ -246,6 +304,7 @@ XPath specification.
 
 .. _Location Paths: http://www.w3.org/TR/xpath#location-paths
 
+
 Using EXSLT extensions
 ----------------------
 
@@ -447,7 +506,7 @@ But using the ``.`` to mean the node, works::
 .. _`XPath string function`: http://www.w3.org/TR/xpath/#section-String-Functions
 
 Beware of the difference between //node[1] and (//node)[1]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 ``//node[1]`` selects all the nodes occurring first under their respective parents.
 
@@ -598,6 +657,18 @@ Let's download the atom feed using `requests`_ and create a selector::
     >>> text = requests.get('https://github.com/blog.atom').text
     >>> sel = Selector(text=text, type='xml')
 
+This is how the file starts::
+
+    <?xml version="1.0" encoding="UTF-8"?>
+    <feed xml:lang="en-US"
+          xmlns="http://www.w3.org/2005/Atom"
+          xmlns:media="http://search.yahoo.com/mrss/">
+      <id>tag:github.com,2008:/blog</id>
+      ...
+
+You can see two namespace declarations: a default one for "http://www.w3.org/2005/Atom"
+and another using the "media:" prefix for "http://search.yahoo.com/mrss/".
+
 We can try selecting all ``<link>`` objects and then see that it doesn't work
 (because the Atom XML namespace is obfuscating those nodes)::
 
@@ -629,6 +700,108 @@ of relevance, are:
 .. _requests: http://www.python-requests.org/
 
 
+Ad-hoc namespace references
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:class:`~parsel.selector.Selector` objects also allow passing namespace
+references along with the query, through a ``namespaces`` argument;
+the prefixes you declare there can be used in your XPath or CSS query.
+
+Let's use the same Atom feed from GitHub::
+
+    >>> import requests
+    >>> from parsel import Selector
+    >>> text = requests.get('https://github.com/blog.atom').text
+    >>> sel = Selector(text=text, type='xml')
+
+And try to select the links again, now using an "atom:" prefix
+for the "link" node test::
+
+    >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
+    [<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
+     <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
+     ...
+
+You can pass several namespaces (here we're using shorter 1-letter prefixes)::
+
+    >>> sel.xpath("//a:entry/m:thumbnail/@url",
+    ...               namespaces={"a": "http://www.w3.org/2005/Atom",
+    ...                           "m": "http://search.yahoo.com/mrss/"}).extract()
+    ['https://avatars1.githubusercontent.com/u/11529908?v=3&s=60',
+     'https://avatars0.githubusercontent.com/u/15114852?v=3&s=60',
+     ...
+
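+These ad-hoc prefixes are not saved for future calls, so repeating the first
+query without the ``namespaces`` argument fails (a sketch; the "Undefined
+namespace prefix" part of the message comes from lxml)::
+
+    >>> sel.xpath("//atom:link")
+    Traceback (most recent call last):
+        ...
+    ValueError: XPath error: Undefined namespace prefix in //atom:link
+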
+
+Variables in XPath expressions
+------------------------------
+
+XPath allows you to reference variables in your XPath expressions, using
+the ``$somevariable`` syntax. This is somewhat similar to parameterized
+queries or prepared statements in the SQL world where you replace
+some arguments in your queries with placeholders like ``?``,
+which are then substituted with values passed with the query.
+
+Here's an example to match an element based on its normalized string-value::
+
+    >>> str_to_match = "Name: My image 3"
+    >>> selector.xpath('//a[normalize-space(.)=$match]',
+    ...                match=str_to_match).extract_first()
+    u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>'
+
+All variable references must have a binding value when calling ``.xpath()``
+(otherwise you'll get a ``ValueError: XPath error:`` exception).
+This is done by passing as many named arguments as necessary.
+
+Here's another example using a position range passed as two integers::
+
+    >>> start, stop = 2, 4
+    >>> selector.xpath('//a[position()>=$_from and position()<=$_to]',
+    ...                _from=start, _to=stop).extract()
+    [u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
+     u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
+     u'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>']
+
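+Numbers and booleans also work as variable values; here we count the five
+links in the example page (a sketch against the same sample document)::
+
+    >>> selector.xpath('count(//a)').extract()
+    [u'5.0']
+    >>> selector.xpath('boolean(count(//a) = $expected)', expected=5).extract()
+    [u'1']
+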
+Named variables can be useful when strings need to be escaped for single
+or double quote characters. The example below would be a bit tricky to
+get right (or legible) without a variable reference::
+
+    >>> html = u'''<html>
+    ... <body>
+    ...   <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
+    ... </body>
+    ... </html>'''
+    >>> selector = Selector(text=html)
+    >>>
+    >>> selector.xpath('//p[contains(., $mystring)]',
+    ...                mystring='''He said: "I don't know''').extract_first()
+    u'<p>He said: "I don\'t know why, but I like mixing single and double quotes!"</p>'
+
+
+Converting CSS to XPath
+-----------------------
+
+.. autofunction:: parsel.css2xpath
+
+When you're using an API that only accepts XPath expressions, it's sometimes
+useful to convert CSS to XPath. This lets you combine the conciseness of
+CSS for querying elements by class with the ease of manipulating XPath
+expressions.
+
+On those occasions, use the function :func:`~parsel.css2xpath`:
+
+::
+
+    >>> from parsel import css2xpath
+    >>> css2xpath('h1.title')
+    u"descendant-or-self::h1[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]"
+    >>> css2xpath('.profile-data') + '//h2'
+    u"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' profile-data ')]//h2"
+
+As you can see from the examples above, this translates the given CSS query
+into an XPath expression and returns it as a string, which you can use as-is
+or combine into a more complex expression before feeding it to a function
+expecting XPath.
+
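+For instance, you could feed the result to an XPath-only API such as
+lxml's own ``xpath()`` method (a hypothetical snippet, for illustration)::
+
+    >>> import lxml.html
+    >>> tree = lxml.html.fromstring(u'<p class="title">hello</p>')
+    >>> tree.xpath(css2xpath('p.title') + '/text()')
+    ['hello']
+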
+
 Similar libraries
 =================
 
@@ -649,8 +822,7 @@ Parsel is built on top of the `lxml`_ library, which means they're very similar
 in speed and parsing accuracy. The advantage of using Parsel over `lxml`_ is
 that Parsel is simpler to use and extend, unlike the `lxml`_ API which is much
 bigger because the `lxml`_ library can be used for many other tasks, besides
-selecting markup documents. Also, Parsel allows you to use CSS, by translating
-CSS to XPath using the `cssselect`_ library.
+selecting markup documents.
 
 
 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
diff --git a/parsel/__init__.py b/parsel/__init__.py
index 0dab114..113644e 100644
--- a/parsel/__init__.py
+++ b/parsel/__init__.py
@@ -5,6 +5,7 @@ or CSS selectors
 
 __author__ = 'Scrapy project'
 __email__ = 'info at scrapy.org'
-__version__ = '1.0.3'
+__version__ = '1.1.0'
 
 from parsel.selector import Selector, SelectorList  # NOQA
+from parsel.csstranslator import css2xpath  # NOQA
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index e49a2e8..f752f2b 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -96,3 +96,11 @@ class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
     pass
+
+
+_translator = HTMLTranslator()
+
+
+def css2xpath(query):
+    "Return the translated XPath version of a given CSS query"
+    return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
index 8d8c693..ae3c633 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -5,7 +5,7 @@ XPath selectors based on lxml
 import sys
 
 import six
-from lxml import etree
+from lxml import etree, html
 
 from .utils import flatten, iflatten, extract_regex
 from .csstranslator import HTMLTranslator, GenericTranslator
@@ -17,7 +17,7 @@ class SafeXMLParser(etree.XMLParser):
         super(SafeXMLParser, self).__init__(*args, **kwargs)
 
 _ctgroup = {
-    'html': {'_parser': etree.HTMLParser,
+    'html': {'_parser': html.HTMLParser,
              '_csstranslator': HTMLTranslator(),
              '_tostring_method': 'html'},
     'xml': {'_parser': SafeXMLParser,
@@ -58,23 +58,33 @@ class SelectorList(list):
         o = super(SelectorList, self).__getitem__(pos)
         return self.__class__(o) if isinstance(pos, slice) else o
 
-    def xpath(self, xpath):
+    def xpath(self, xpath, namespaces=None, **kwargs):
         """
         Call the ``.xpath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
 
         ``query`` is the same argument as the one in :meth:`Selector.xpath`
+
+        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
+        for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
+        Unlike ``register_namespace()``, these prefixes are not
+        saved for future calls.
+
+        Any additional named arguments can be used to pass values for XPath
+        variables in the XPath expression, e.g.:
+
+            selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
-        return self.__class__(flatten([x.xpath(xpath) for x in self]))
+        return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]))
 
-    def css(self, xpath):
+    def css(self, query):
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
 
         ``query`` is the same argument as the one in :meth:`Selector.css`
         """
-        return self.__class__(flatten([x.css(xpath) for x in self]))
+        return self.__class__(flatten([x.css(query) for x in self]))
 
     def re(self, regex):
         """
@@ -161,22 +171,36 @@ class Selector(object):
     def _get_root(self, text, base_url=None):
         return create_root_node(text, self._parser, base_url=base_url)
 
-    def xpath(self, query):
+    def xpath(self, query, namespaces=None, **kwargs):
         """
         Find nodes matching the xpath ``query`` and return the result as a
         :class:`SelectorList` instance with all elements flattened. List
         elements implement :class:`Selector` interface too.
 
         ``query`` is a string containing the XPATH query to apply.
+
+        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
+        for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
+        Unlike ``register_namespace()``, these prefixes are not
+        saved for future calls.
+
+        Any additional named arguments can be used to pass values for XPath
+        variables in the XPath expression, e.g.:
+
+            selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
         try:
             xpathev = self.root.xpath
         except AttributeError:
             return self.selectorlist_cls([])
 
+        nsp = dict(self.namespaces)
+        if namespaces is not None:
+            nsp.update(namespaces)
         try:
-            result = xpathev(query, namespaces=self.namespaces,
-                             smart_strings=self._lxml_smart_strings)
+            result = xpathev(query, namespaces=nsp,
+                             smart_strings=self._lxml_smart_strings,
+                             **kwargs)
         except etree.XPathError as exc:
             msg = u"XPath error: %s in %s" % (exc, query)
             msg = msg if six.PY3 else msg.encode('unicode_escape')
diff --git a/setup.py b/setup.py
index 0658743..2f42c5b 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ test_requirements = [
 
 setup(
     name='parsel',
-    version='1.0.3',
+    version='1.1.0',
     description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
     long_description=readme + '\n\n' + history,
     author="Scrapy project",
@@ -30,7 +30,7 @@ setup(
     include_package_data=True,
     install_requires=[
         'w3lib>=1.8.0',
-        'lxml',
+        'lxml>=2.3',
         'six>=1.5.2',
         'cssselect>=0.9',
     ],
@@ -45,11 +45,12 @@ setup(
         'Topic :: Text Processing :: Markup',
         'Topic :: Text Processing :: Markup :: HTML',
         'Topic :: Text Processing :: Markup :: XML',
-        "Programming Language :: Python :: 2",
+        'Programming Language :: Python :: 2',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
     ],
     setup_requires=['pytest-runner',],
     tests_require=['pytest',],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 29446d4..09ecd51 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -33,6 +33,59 @@ class SelectorTestCase(unittest.TestCase):
         self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
                          [u'12'])
 
+    def test_simple_selection_with_variables(self):
+        """Using XPath variables"""
+        body = u"<p><input name='a' value='1'/><input name='b' value='2'/></p>"
+        sel = self.sscls(text=body)
+
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
+                         [u'a'])
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter='b')],
+                         [u'2'])
+
+        self.assertEqual(sel.xpath("count(//input[@value=$number or @name=$letter])",
+                                   number=2, letter='a').extract(),
+                         [u'2.0'])
+
+        # you can also pass booleans
+        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+                                   cnt=2, test=True).extract(),
+                         [u'1'])
+        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+                                   cnt=4, test=True).extract(),
+                         [u'0'])
+        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+                                   cnt=4, test=False).extract(),
+                         [u'1'])
+
+        # for named nodes, you need to use "name()=node_name"
+        self.assertEqual(sel.xpath("boolean(count(//*[name()=$tag])=$cnt)=$test",
+                                   tag="input", cnt=2, test=True).extract(),
+                         [u'1'])
+
+    def test_simple_selection_with_variables_escape_friendly(self):
+        """Using XPath variables with quotes that would need escaping with string formatting"""
+        body = u"""<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/>
+        "double quotes" and I don't care :)</p>"""
+        sel = self.sscls(text=body)
+
+        t = 'I say "Yeah!"'
+        # naive string formatting will give something like:
+        # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name
+        self.assertRaises(ValueError, sel.xpath, '//input[@value="{}"]/@name'.format(t))
+
+        # with XPath variables, escaping is done for you
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)],
+                         [u'a'])
+        lt = """I'm mixing single and "double quotes" and I don't care :)"""
+        # the following gives you something like
+        # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name
+        self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt))
+
+        self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name",
+                                                          lng=lt)],
+                         [u'a'])
+
     def test_representation_slice(self):
         body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
         sel = self.sscls(text=body)
@@ -211,6 +264,35 @@ class SelectorTestCase(unittest.TestCase):
         self.assertEqual(x.xpath("//somens:a/text()").extract(),
                          [u'take this'])
 
+    def test_namespaces_adhoc(self):
+        body = u"""
+        <test xmlns:somens="http://scrapy.org">
+           <somens:a id="foo">take this</a>
+           <a id="bar">found</a>
+        </test>
+        """
+
+        x = self.sscls(text=body, type='xml')
+
+        self.assertEqual(x.xpath("//somens:a/text()",
+                                 namespaces={"somens": "http://scrapy.org"}).extract(),
+                         [u'take this'])
+
+    def test_namespaces_adhoc_variables(self):
+        body = u"""
+        <test xmlns:somens="http://scrapy.org">
+           <somens:a id="foo">take this</a>
+           <a id="bar">found</a>
+        </test>
+        """
+
+        x = self.sscls(text=body, type='xml')
+
+        self.assertEqual(x.xpath("//somens:a/following-sibling::a[@id=$identifier]/text()",
+                                 namespaces={"somens": "http://scrapy.org"},
+                                 identifier="bar").extract(),
+                         [u'found'])
+
     def test_namespaces_multiple(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?>
 <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
@@ -232,6 +314,69 @@ class SelectorTestCase(unittest.TestCase):
         self.assertEqual(x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), '90')
         self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
 
+    def test_namespaces_multiple_adhoc(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?>
+<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
+            xmlns:b="http://somens.com"
+            xmlns:p="http://www.scrapy.org/product" >
+    <b:Operation>hello</b:Operation>
+    <TestTag b:att="value"><Other>value</Other></TestTag>
+    <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
+</BrowseNode>
+        """
+        x = self.sscls(text=body, type='xml')
+        x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
+        self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1)
+
+        # "b" namespace is not declared yet
+        self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att")
+
+        # "b" namespace being passed ad-hoc
+        self.assertEqual(x.xpath("//b:Operation/text()",
+            namespaces={"b": "http://somens.com"}).extract()[0], 'hello')
+
+        # "b" namespace declaration is not cached
+        self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att")
+
+        # "xmlns" is still defined
+        self.assertEqual(x.xpath("//xmlns:TestTag/@b:att",
+            namespaces={"b": "http://somens.com"}).extract()[0], 'value')
+
+        # chained selectors still have knowledge of register_namespace() operations
+        self.assertEqual(x.xpath("//p:SecondTestTag",
+            namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[0].extract(), '90')
+
+        # but chained selectors don't know about parent ad-hoc declarations
+        self.assertRaises(ValueError, x.xpath("//p:SecondTestTag",
+            namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()")
+
+        # ad-hoc declarations need to be repeated when chaining
+        self.assertEqual(x.xpath("//p:SecondTestTag",
+                            namespaces={"p": "http://www.scrapy.org/product"}
+                        ).xpath("p:name/text()",
+                            namespaces={"p": "http://www.scrapy.org/product"}
+                        ).extract_first(), 'Dried Rose')
+
+        # declaring several ad-hoc namespaces
+        self.assertEqual(x.xpath("""string(
+                //b:Operation
+                 /following-sibling::xmlns:TestTag
+                 /following-sibling::*//p:name)""",
+            namespaces={"b": "http://somens.com",
+                        "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose')
+
+        # "p" prefix is not cached from previous calls
+        self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()")
+
+        x.register_namespace("p", "http://www.scrapy.org/product")
+        self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
+
+    def test_make_links_absolute(self):
+        text = u'<a href="file.html">link to file</a>'
+        sel = Selector(text=text, base_url='http://example.com')
+        sel.root.make_links_absolute()
+        self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first())
+
     def test_re(self):
         body = u"""<div>Name: Mary
                     <ul>
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
index 1dcae7d..83d81b6 100644
--- a/tests/test_selector_csstranslator.py
+++ b/tests/test_selector_csstranslator.py
@@ -113,6 +113,14 @@ class TranslatorMixinTest(unittest.TestCase):
             self.assertRaises(exc, self.c2x, css)
 
 
+class UtilCss2XPathTest(unittest.TestCase):
+    def test_css2xpath(self):
+        from parsel import css2xpath
+        expected_xpath = (u"descendant-or-self::*[@class and contains("
+                          "concat(' ', normalize-space(@class), ' '), ' some-class ')]")
+        self.assertEqual(css2xpath('.some-class'), expected_xpath)
+
+
 class CSSSelectorTest(unittest.TestCase):
 
     sscls = Selector

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-parsel.git


