[Python-modules-commits] [python-parsel] 01/07: Import python-parsel_1.1.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Wed Jan 25 08:56:20 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master
in repository python-parsel.
commit ba3cf6c2a5912cfb7dac35a17d61ccf27434a4d2
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Wed Jan 25 09:38:13 2017 +0100
Import python-parsel_1.1.0.orig.tar.gz
---
.bumpversion.cfg | 2 +-
NEWS | 11 +++
docs/usage.rst | 178 ++++++++++++++++++++++++++++++++++-
parsel/__init__.py | 3 +-
parsel/csstranslator.py | 8 ++
parsel/selector.py | 42 +++++++--
setup.py | 7 +-
tests/test_selector.py | 145 ++++++++++++++++++++++++++++
tests/test_selector_csstranslator.py | 8 ++
9 files changed, 387 insertions(+), 17 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 8867812..a3c78c7 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.0.3
+current_version = 1.1.0
commit = True
tag = True
tag_name = v{new_version}
diff --git a/NEWS b/NEWS
index 0826850..71740db 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,17 @@
History
-------
+1.1.0 (2016-11-22)
+~~~~~~~~~~~~~~~~~~
+
+* Change default HTML parser to `lxml.html.HTMLParser <http://lxml.de/api/lxml.html.HTMLParser-class.html>`_,
+ which makes it easier to use some HTML-specific features
+* Add css2xpath function to translate CSS to XPath
+* Add support for ad-hoc namespaces declarations
+* Add support for XPath variables
+* Documentation improvements and updates
+
+
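The parser change is the most user-visible of these: ``Selector.root`` for HTML input is now an ``lxml.html.HtmlElement``, so lxml's HTML-specific helpers become available on it. A minimal sketch, mirroring the ``test_make_links_absolute`` test added further down in this commit::

    >>> from parsel import Selector
    >>> sel = Selector(text=u'<a href="file.html">link to file</a>',
    ...                base_url='http://example.com')
    >>> sel.root.make_links_absolute()
    >>> sel.xpath('//a/@href').extract_first()
    u'http://example.com/file.html'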
1.0.3 (2016-07-29)
~~~~~~~~~~~~~~~~~~
diff --git a/docs/usage.rst b/docs/usage.rst
index 3820412..9afda59 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -159,6 +159,64 @@ Now we're going to get the base URL and some image links::
u'image4_thumb.jpg',
u'image5_thumb.jpg']
+.. _topics-selectors-css-extensions:
+
+Extensions to CSS Selectors
+---------------------------
+
+Per W3C standards, `CSS selectors`_ do not support selecting text nodes
+or attribute values.
+But selecting these is so essential in a web scraping context
+that Parsel implements a couple of **non-standard pseudo-elements**:
+
+* to select text nodes, use ``::text``
+* to select attribute values, use ``::attr(name)`` where *name* is the
+ name of the attribute that you want the value of
+
+.. warning::
+ These pseudo-elements are Scrapy-/Parsel-specific.
+ They will most probably not work with other libraries like `lxml`_ or `PyQuery`_.
+
+
+Examples:
+
+* ``title::text`` selects child text nodes of a descendant ``<title>`` element::
+
+ >>> selector.css('title::text').extract_first()
+ u'Example website'
+
+* ``*::text`` selects all descendant text nodes of the current selector context::
+
+ >>> selector.css('#images *::text').extract()
+ [u'\n ',
+ u'Name: My image 1 ',
+ u'\n ',
+ u'Name: My image 2 ',
+ u'\n ',
+ u'Name: My image 3 ',
+ u'\n ',
+ u'Name: My image 4 ',
+ u'\n ',
+ u'Name: My image 5 ',
+ u'\n ']
+
+* ``a::attr(href)`` selects the *href* attribute value of descendant links::
+
+ >>> selector.css('a::attr(href)').extract()
+ [u'image1.html',
+ u'image2.html',
+ u'image3.html',
+ u'image4.html',
+ u'image5.html']
+
+.. note::
+ You cannot chain these pseudo-elements. But in practice it would not
+ make much sense: text nodes do not have attributes, and attribute values
+ are already strings and do not have child nodes.
+
+
+.. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors
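The doctests above assume a ``selector`` built from the example page used earlier in docs/usage.rst. A self-contained sketch, with a hypothetical cut-down HTML body standing in for that page::

    >>> from parsel import Selector
    >>> html = u'''<html><head><title>Example website</title></head>
    ... <body><div id="images">
    ... <a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>
    ... </div></body></html>'''
    >>> selector = Selector(text=html)
    >>> selector.css('title::text').extract_first()
    u'Example website'
    >>> selector.css('a::attr(href)').extract()
    [u'image1.html']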
+
.. _topics-selectors-nesting-selectors:
Nesting selectors
@@ -246,6 +304,7 @@ XPath specification.
.. _Location Paths: http://www.w3.org/TR/xpath#location-paths
+
Using EXSLT extensions
----------------------
@@ -447,7 +506,7 @@ But using the ``.`` to mean the node, works::
.. _`XPath string function`: http://www.w3.org/TR/xpath/#section-String-Functions
Beware of the difference between //node[1] and (//node)[1]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``//node[1]`` selects all the nodes occurring first under their respective parents.
@@ -598,6 +657,18 @@ Let's download the atom feed using `requests`_ and create a selector::
>>> text = requests.get('https://github.com/blog.atom').text
>>> sel = Selector(text=text, type='xml')
+This is how the file starts::
+
+ <?xml version="1.0" encoding="UTF-8"?>
+ <feed xml:lang="en-US"
+ xmlns="http://www.w3.org/2005/Atom"
+ xmlns:media="http://search.yahoo.com/mrss/">
+ <id>tag:github.com,2008:/blog</id>
+ ...
+
+You can see two namespace declarations: a default "http://www.w3.org/2005/Atom"
+and another one using the "media:" prefix for "http://search.yahoo.com/mrss/".
+
We can try selecting all ``<link>`` objects and then see that it doesn't work
(because the Atom XML namespace is obfuscating those nodes)::
@@ -629,6 +700,108 @@ of relevance, are:
.. _requests: http://www.python-requests.org/
+Ad-hoc namespace references
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:class:`~parsel.selector.Selector` objects also allow passing namespace
+references along with the query, through a ``namespaces`` argument;
+the prefixes you declare can then be used in your XPath or CSS query.
+
+Let's use the same Atom feed from Github::
+
+ >>> import requests
+ >>> from parsel import Selector
+ >>> text = requests.get('https://github.com/blog.atom').text
+ >>> sel = Selector(text=text, type='xml')
+
+And try to select the links again, now using an "atom:" prefix
+for the "link" node test::
+
+ >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
+ [<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
+ <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
+ ...
+
+You can pass several namespaces (here we're using shorter 1-letter prefixes)::
+
+ >>> sel.xpath("//a:entry/m:thumbnail/@url",
+ ... namespaces={"a": "http://www.w3.org/2005/Atom",
+ ... "m": "http://search.yahoo.com/mrss/"}).extract()
+ ['https://avatars1.githubusercontent.com/u/11529908?v=3&s=60',
+ 'https://avatars0.githubusercontent.com/u/15114852?v=3&s=60',
+ ...
+
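As the updated ``Selector.xpath()`` docstring below points out, ad-hoc prefixes are not saved for future calls, unlike ``register_namespace()``. A small sketch of the difference, using a hypothetical XML body (the exact error text is indicative)::

    >>> from parsel import Selector
    >>> sel = Selector(text=u'<r xmlns:n="http://example.com/ns"><n:i>x</n:i></r>', type='xml')
    >>> sel.xpath('//n:i/text()', namespaces={'n': 'http://example.com/ns'}).extract()
    [u'x']
    >>> sel.xpath('//n:i/text()')  # the ad-hoc mapping is gone by now
    Traceback (most recent call last):
        ...
    ValueError: XPath error: Undefined namespace prefix in //n:i/text()
    >>> sel.register_namespace('n', 'http://example.com/ns')
    >>> sel.xpath('//n:i/text()').extract()  # registered prefixes persist
    [u'x']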
+
+Variables in XPath expressions
+------------------------------
+
+XPath allows you to reference variables in your XPath expressions, using
+the ``$somevariable`` syntax. This is somewhat similar to parameterized
+queries or prepared statements in the SQL world where you replace
+some arguments in your queries with placeholders like ``?``,
+which are then substituted with values passed with the query.
+
+Here's an example to match an element based on its normalized string-value::
+
+ >>> str_to_match = "Name: My image 3"
+ >>> selector.xpath('//a[normalize-space(.)=$match]',
+ ... match=str_to_match).extract_first()
+ u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>'
+
+All variable references must have a binding value when calling ``.xpath()``
+(otherwise you'll get a ``ValueError: XPath error:`` exception).
+This is done by passing as many named arguments as necessary.
+
+Here's another example using a position range passed as two integers::
+
+ >>> start, stop = 2, 4
+ >>> selector.xpath('//a[position()>=$_from and position()<=$_to]',
+ ... _from=start, _to=stop).extract()
+ [u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
+ u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
+ u'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>']
+
+Named variables can be useful when strings need to be escaped for single
+or double quote characters. The example below would be a bit tricky to
+get right (or legible) without a variable reference::
+
+ >>> html = u'''<html>
+ ... <body>
+ ... <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
+ ... </body>
+ ... </html>'''
+ >>> selector = Selector(text=html)
+ >>>
+ >>> selector.xpath('//p[contains(., $mystring)]',
+ ... mystring='''He said: "I don't know''').extract_first()
+ u'<p>He said: "I don\'t know why, but I like mixing single and double quotes!"</p>'
+
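Variable values are not limited to strings: the tests added in this commit (see ``tests/test_selector.py`` below) also bind numbers and booleans. A condensed sketch::

    >>> sel = Selector(text=u"<p><input name='a' value='1'/><input name='b' value='2'/></p>")
    >>> sel.xpath("count(//input[@value=$number or @name=$letter])",
    ...           number=2, letter='a').extract()
    [u'2.0']
    >>> sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract()
    [u'1']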
+
+Converting CSS to XPath
+-----------------------
+
+.. autofunction:: parsel.css2xpath
+
+When you're using an API that only accepts XPath expressions, it's sometimes
+useful to convert CSS to XPath. This lets you combine the conciseness of
+CSS for querying elements by class with the ease of manipulating XPath
+expressions.
+
+On those occasions, use the function :func:`~parsel.css2xpath`:
+
+::
+
+ >>> from parsel import css2xpath
+ >>> css2xpath('h1.title')
+ u"descendant-or-self::h1[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]"
+ >>> css2xpath('.profile-data') + '//h2'
+ u"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' profile-data ')]//h2"
+
+As the examples above show, it returns the CSS query translated into an
+XPath expression, as a string that you can use as-is or combine into a more
+complex expression before feeding it to a function expecting XPath.
+
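For instance, the translated expression can be fed straight back into ``Selector.xpath()``; a sketch with a hypothetical HTML body::

    >>> from parsel import Selector, css2xpath
    >>> sel = Selector(text=u'<div class="profile-data"><h2>Jane</h2></div>')
    >>> sel.xpath(css2xpath('.profile-data') + '//h2/text()').extract_first()
    u'Jane'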
+
Similar libraries
=================
@@ -649,8 +822,7 @@ Parsel is built on top of the `lxml`_ library, which means they're very similar
in speed and parsing accuracy. The advantage of using Parsel over `lxml`_ is
that Parsel is simpler to use and extend, unlike the `lxml`_ API which is much
bigger because the `lxml`_ library can be used for many other tasks, besides
-selecting markup documents. Also, Parsel allows you to use CSS, by translating
-CSS to XPath using the `cssselect`_ library.
+selecting markup documents.
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
diff --git a/parsel/__init__.py b/parsel/__init__.py
index 0dab114..113644e 100644
--- a/parsel/__init__.py
+++ b/parsel/__init__.py
@@ -5,6 +5,7 @@ or CSS selectors
__author__ = 'Scrapy project'
__email__ = 'info at scrapy.org'
-__version__ = '1.0.3'
+__version__ = '1.1.0'
from parsel.selector import Selector, SelectorList # NOQA
+from parsel.csstranslator import css2xpath # NOQA
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index e49a2e8..f752f2b 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -96,3 +96,11 @@ class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
pass
+
+
+_translator = HTMLTranslator()
+
+
+def css2xpath(query):
+ "Return translated XPath version of a given CSS query"
+ return _translator.css_to_xpath(query)
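Since the module-level ``_translator`` is an ``HTMLTranslator``, ``css2xpath`` should also handle Parsel's non-standard pseudo-elements; a quick sketch (outputs shown as expected under that assumption)::

    >>> from parsel import css2xpath
    >>> css2xpath('a::attr(href)')
    u'descendant-or-self::a/@href'
    >>> css2xpath('a::text')
    u'descendant-or-self::a/text()'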
diff --git a/parsel/selector.py b/parsel/selector.py
index 8d8c693..ae3c633 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -5,7 +5,7 @@ XPath selectors based on lxml
import sys
import six
-from lxml import etree
+from lxml import etree, html
from .utils import flatten, iflatten, extract_regex
from .csstranslator import HTMLTranslator, GenericTranslator
@@ -17,7 +17,7 @@ class SafeXMLParser(etree.XMLParser):
super(SafeXMLParser, self).__init__(*args, **kwargs)
_ctgroup = {
- 'html': {'_parser': etree.HTMLParser,
+ 'html': {'_parser': html.HTMLParser,
'_csstranslator': HTMLTranslator(),
'_tostring_method': 'html'},
'xml': {'_parser': SafeXMLParser,
@@ -58,23 +58,33 @@ class SelectorList(list):
o = super(SelectorList, self).__getitem__(pos)
return self.__class__(o) if isinstance(pos, slice) else o
- def xpath(self, xpath):
+ def xpath(self, xpath, namespaces=None, **kwargs):
"""
Call the ``.xpath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
``query`` is the same argument as the one in :meth:`Selector.xpath`
+
+ ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
+ for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
+ Contrary to ``register_namespace()``, these prefixes are not
+ saved for future calls.
+
+ Any additional named arguments can be used to pass values for XPath
+ variables in the XPath expression, e.g.:
+
+ selector.xpath('//a[@href=$url]', url="http://www.example.com")
"""
- return self.__class__(flatten([x.xpath(xpath) for x in self]))
+ return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]))
- def css(self, xpath):
+ def css(self, query):
"""
Call the ``.css()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
``query`` is the same argument as the one in :meth:`Selector.css`
"""
- return self.__class__(flatten([x.css(xpath) for x in self]))
+ return self.__class__(flatten([x.css(query) for x in self]))
def re(self, regex):
"""
@@ -161,22 +171,36 @@ class Selector(object):
def _get_root(self, text, base_url=None):
return create_root_node(text, self._parser, base_url=base_url)
- def xpath(self, query):
+ def xpath(self, query, namespaces=None, **kwargs):
"""
Find nodes matching the xpath ``query`` and return the result as a
:class:`SelectorList` instance with all elements flattened. List
elements implement :class:`Selector` interface too.
``query`` is a string containing the XPath query to apply.
+
+ ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
+ for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
+ Contrary to ``register_namespace()``, these prefixes are not
+ saved for future calls.
+
+ Any additional named arguments can be used to pass values for XPath
+ variables in the XPath expression, e.g.:
+
+ selector.xpath('//a[@href=$url]', url="http://www.example.com")
"""
try:
xpathev = self.root.xpath
except AttributeError:
return self.selectorlist_cls([])
+ nsp = dict(self.namespaces)
+ if namespaces is not None:
+ nsp.update(namespaces)
try:
- result = xpathev(query, namespaces=self.namespaces,
- smart_strings=self._lxml_smart_strings)
+ result = xpathev(query, namespaces=nsp,
+ smart_strings=self._lxml_smart_strings,
+ **kwargs)
except etree.XPathError as exc:
msg = u"XPath error: %s in %s" % (exc, query)
msg = msg if six.PY3 else msg.encode('unicode_escape')
diff --git a/setup.py b/setup.py
index 0658743..2f42c5b 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ test_requirements = [
setup(
name='parsel',
- version='1.0.3',
+ version='1.1.0',
description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
long_description=readme + '\n\n' + history,
author="Scrapy project",
@@ -30,7 +30,7 @@ setup(
include_package_data=True,
install_requires=[
'w3lib>=1.8.0',
- 'lxml',
+ 'lxml>=2.3',
'six>=1.5.2',
'cssselect>=0.9',
],
@@ -45,11 +45,12 @@ setup(
'Topic :: Text Processing :: Markup',
'Topic :: Text Processing :: Markup :: HTML',
'Topic :: Text Processing :: Markup :: XML',
- "Programming Language :: Python :: 2",
+ 'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5',
],
setup_requires=['pytest-runner',],
tests_require=['pytest',],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 29446d4..09ecd51 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -33,6 +33,59 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
[u'12'])
+ def test_simple_selection_with_variables(self):
+ """Using XPath variables"""
+ body = u"<p><input name='a' value='1'/><input name='b' value='2'/></p>"
+ sel = self.sscls(text=body)
+
+ self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
+ [u'a'])
+ self.assertEqual([x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter='b')],
+ [u'2'])
+
+ self.assertEqual(sel.xpath("count(//input[@value=$number or @name=$letter])",
+ number=2, letter='a').extract(),
+ [u'2.0'])
+
+ # you can also pass booleans
+ self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+ cnt=2, test=True).extract(),
+ [u'1'])
+ self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+ cnt=4, test=True).extract(),
+ [u'0'])
+ self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
+ cnt=4, test=False).extract(),
+ [u'1'])
+
+ # for named nodes, you need to use "name()=node_name"
+ self.assertEqual(sel.xpath("boolean(count(//*[name()=$tag])=$cnt)=$test",
+ tag="input", cnt=2, test=True).extract(),
+ [u'1'])
+
+ def test_simple_selection_with_variables_escape_friendly(self):
+ """Using XPath variables with quotes that would need escaping with string formatting"""
+ body = u"""<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/>
+ "double quotes" and I don't care :)</p>"""
+ sel = self.sscls(text=body)
+
+ t = 'I say "Yeah!"'
+ # naive string formatting will give something like:
+ # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name
+ self.assertRaises(ValueError, sel.xpath, '//input[@value="{}"]/@name'.format(t))
+
+ # with XPath variables, escaping is done for you
+ self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)],
+ [u'a'])
+ lt = """I'm mixing single and "double quotes" and I don't care :)"""
+ # the following gives you something like
+ # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name
+ self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt))
+
+ self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name",
+ lng=lt)],
+ [u'a'])
+
def test_representation_slice(self):
body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
sel = self.sscls(text=body)
@@ -211,6 +264,35 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(x.xpath("//somens:a/text()").extract(),
[u'take this'])
+ def test_namespaces_adhoc(self):
+ body = u"""
+ <test xmlns:somens="http://scrapy.org">
+ <somens:a id="foo">take this</a>
+ <a id="bar">found</a>
+ </test>
+ """
+
+ x = self.sscls(text=body, type='xml')
+
+ self.assertEqual(x.xpath("//somens:a/text()",
+ namespaces={"somens": "http://scrapy.org"}).extract(),
+ [u'take this'])
+
+ def test_namespaces_adhoc_variables(self):
+ body = u"""
+ <test xmlns:somens="http://scrapy.org">
+ <somens:a id="foo">take this</a>
+ <a id="bar">found</a>
+ </test>
+ """
+
+ x = self.sscls(text=body, type='xml')
+
+ self.assertEqual(x.xpath("//somens:a/following-sibling::a[@id=$identifier]/text()",
+ namespaces={"somens": "http://scrapy.org"},
+ identifier="bar").extract(),
+ [u'found'])
+
def test_namespaces_multiple(self):
body = u"""<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
@@ -232,6 +314,69 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), '90')
self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
+ def test_namespaces_multiple_adhoc(self):
+ body = u"""<?xml version="1.0" encoding="UTF-8"?>
+<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
+ xmlns:b="http://somens.com"
+ xmlns:p="http://www.scrapy.org/product" >
+ <b:Operation>hello</b:Operation>
+ <TestTag b:att="value"><Other>value</Other></TestTag>
+ <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
+</BrowseNode>
+ """
+ x = self.sscls(text=body, type='xml')
+ x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
+ self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1)
+
+ # "b" namespace is not declared yet
+ self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att")
+
+ # "b" namespace being passed ad-hoc
+ self.assertEqual(x.xpath("//b:Operation/text()",
+ namespaces={"b": "http://somens.com"}).extract()[0], 'hello')
+
+ # "b" namespace declaration is not cached
+ self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att")
+
+ # "xmlns" is still defined
+ self.assertEqual(x.xpath("//xmlns:TestTag/@b:att",
+ namespaces={"b": "http://somens.com"}).extract()[0], 'value')
+
+ # chained selectors still have knowledge of register_namespace() operations
+ self.assertEqual(x.xpath("//p:SecondTestTag",
+ namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[0].extract(), '90')
+
+ # but chained selectors don't know about parent ad-hoc declarations
+ self.assertRaises(ValueError, x.xpath("//p:SecondTestTag",
+ namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()")
+
+ # ad-hoc declarations need to be repeated when chaining
+ self.assertEqual(x.xpath("//p:SecondTestTag",
+ namespaces={"p": "http://www.scrapy.org/product"}
+ ).xpath("p:name/text()",
+ namespaces={"p": "http://www.scrapy.org/product"}
+ ).extract_first(), 'Dried Rose')
+
+ # declaring several ad-hoc namespaces
+ self.assertEqual(x.xpath("""string(
+ //b:Operation
+ /following-sibling::xmlns:TestTag
+ /following-sibling::*//p:name)""",
+ namespaces={"b": "http://somens.com",
+ "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose')
+
+ # "p" prefix is not cached from previous calls
+ self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()")
+
+ x.register_namespace("p", "http://www.scrapy.org/product")
+ self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
+
+ def test_make_links_absolute(self):
+ text = u'<a href="file.html">link to file</a>'
+ sel = Selector(text=text, base_url='http://example.com')
+ sel.root.make_links_absolute()
+ self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first())
+
def test_re(self):
body = u"""<div>Name: Mary
<ul>
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
index 1dcae7d..83d81b6 100644
--- a/tests/test_selector_csstranslator.py
+++ b/tests/test_selector_csstranslator.py
@@ -113,6 +113,14 @@ class TranslatorMixinTest(unittest.TestCase):
self.assertRaises(exc, self.c2x, css)
+class UtilCss2XPathTest(unittest.TestCase):
+ def test_css2xpath(self):
+ from parsel import css2xpath
+ expected_xpath = (u"descendant-or-self::*[@class and contains("
+ "concat(' ', normalize-space(@class), ' '), ' some-class ')]")
+ self.assertEqual(css2xpath('.some-class'), expected_xpath)
+
+
class CSSSelectorTest(unittest.TestCase):
sscls = Selector
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-parsel.git