[Python-modules-commits] [python-scrapy] 01/06: Import python-scrapy_1.3.2.orig.tar.gz

Michael Fladischer fladi at moszumanska.debian.org
Fri Feb 17 09:18:05 UTC 2017


This is an automated email from the git hooks/post-receive script.

fladi pushed a commit to branch master
in repository python-scrapy.

commit a24b933ee858b766e10e0d7761ac6327d43c757d
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date:   Fri Feb 17 09:28:55 2017 +0100

    Import python-scrapy_1.3.2.orig.tar.gz
---
 .bumpversion.cfg                                 |  2 +-
 .travis.yml                                      | 47 +++++++++++---
 CODE_OF_CONDUCT.md                               | 80 +++++++++++++++---------
 README.rst                                       |  7 ++-
 artwork/{README => README.rst}                   |  2 +
 docs/{README => README.rst}                      |  2 +
 docs/intro/tutorial.rst                          | 12 ++--
 docs/news.rst                                    | 58 +++++++++++++++++
 docs/topics/architecture.rst                     |  2 +-
 docs/topics/commands.rst                         |  6 ++
 docs/topics/downloader-middleware.rst            |  5 +-
 docs/topics/request-response.rst                 | 12 +++-
 docs/topics/selectors.rst                        | 38 +++++++++++
 docs/topics/spider-middleware.rst                |  2 +-
 docs/topics/spiders.rst                          | 31 +++++++++
 extras/scrapy.1                                  |  8 +--
 requirements.txt                                 |  2 +-
 scrapy/VERSION                                   |  2 +-
 scrapy/commands/view.py                          |  5 +-
 scrapy/core/downloader/handlers/http.py          |  8 +--
 scrapy/core/downloader/handlers/http11.py        | 11 ++--
 scrapy/core/downloader/tls.py                    | 28 ++++++---
 scrapy/downloadermiddlewares/redirect.py         | 15 ++---
 scrapy/http/response/text.py                     |  4 +-
 scrapy/item.py                                   |  3 +
 scrapy/linkextractors/__init__.py                |  6 +-
 scrapy/resolver.py                               |  7 ++-
 scrapy/settings/__init__.py                      | 16 ++++-
 scrapy/templates/project/module/settings.py.tmpl |  2 +-
 scrapy/utils/conf.py                             |  9 +++
 scrapy/utils/reqser.py                           |  6 +-
 sep/{README => README.rst}                       |  2 +
 setup.py                                         |  3 +-
 tests/mockserver.py                              | 39 +++---------
 tests/test_crawl.py                              | 13 ++--
 tests/test_downloader_handlers.py                | 49 ++++++---------
 tests/test_downloadermiddleware_retry.py         |  5 +-
 tests/test_http_response.py                      | 32 ++++++++++
 tests/test_item.py                               | 52 ++++++++++++++-
 tests/test_pipeline_files.py                     |  2 +-
 tests/test_settings/__init__.py                  | 12 ++++
 tests/test_utils_conf.py                         | 21 +++++++
 tests/test_utils_reqser.py                       | 13 +++-
 tox.ini                                          | 21 +++++--
 44 files changed, 516 insertions(+), 186 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 57ff603..b95e0ba 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.0
+current_version = 1.3.2
 commit = True
 tag = True
 tag_name = {new_version}
diff --git a/.travis.yml b/.travis.yml
index 506f377..2df02ea 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,19 +1,46 @@
 language: python
-python: 3.5
 sudo: false
 branches:
   only:
     - master
     - /^\d\.\d+$/
-    - /^\d\.\d+\.\d+(rc\d+|dev\d+)?$/
-env:
- - TOXENV=py27
- - TOXENV=jessie
- - TOXENV=py33
- - TOXENV=py35
- - TOXENV=docs
+    - /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
+matrix:
+  include:
+    - python: 2.7
+      env: TOXENV=py27
+    - python: 2.7
+      env: TOXENV=jessie
+    - python: 3.3
+      env: TOXENV=py33
+    - python: 3.5
+      env: TOXENV=py35
+    - python: 3.6
+      env: TOXENV=py36
+    - python: 2.7
+      env: TOXENV=pypy
+    - python: 3.6
+      env: TOXENV=docs
+  allow_failures:
+    - python: 2.7
+      env: TOXENV=pypy
 install:
- - pip install -U tox twine wheel codecov
+  - |
+      if [ "$TOXENV" = "pypy" ]; then
+        export PYENV_ROOT="$HOME/.pyenv"
+        if [ -f "$PYENV_ROOT/bin/pyenv" ]; then
+          pushd "$PYENV_ROOT" && git pull && popd
+        else
+          rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
+        fi
+        # get latest PyPy from pyenv directly (thanks to natural version sort option -V)
+        export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-[0-9][\.0-9]*$' |sort -V |tail -1`
+        "$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION"
+        virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION"
+        source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
+      fi
+  - pip install -U tox twine wheel codecov
+
 script: tox
 after_success:
   - codecov
@@ -35,4 +62,4 @@ deploy:
   on:
     tags: true
     repo: scrapy/scrapy
-    condition: "$TOXENV == py27 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|dev[0-9]+)?$"
+    condition: "$TOXENV == py27 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 95b4a7e..1626022 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -1,24 +1,41 @@
-# Contributor Code of Conduct
+# Contributor Covenant Code of Conduct
 
-As contributors and maintainers of this project, and in the interest of
-fostering an open and welcoming community, we pledge to respect all people who
-contribute through reporting issues, posting feature requests, updating
-documentation, submitting pull requests or patches, and other activities.
+## Our Pledge
 
-We are committed to making participation in this project a harassment-free
-experience for everyone, regardless of level of experience, gender, gender
-identity and expression, sexual orientation, disability, personal appearance,
-body size, race, ethnicity, age, religion, or nationality.
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
 
 Examples of unacceptable behavior by participants include:
 
-* The use of sexualized language or imagery
-* Personal attacks
-* Trolling or insulting/derogatory comments
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
 * Public or private harassment
-* Publishing other's private information, such as physical or electronic
-  addresses, without explicit permission
-* Other unethical or unprofessional conduct
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
 
 Project maintainers have the right and responsibility to remove, edit, or
 reject comments, commits, code, wiki edits, issues, and other contributions
@@ -26,25 +43,32 @@ that are not aligned to this Code of Conduct, or to ban temporarily or
 permanently any contributor for other behaviors that they deem inappropriate,
 threatening, offensive, or harmful.
 
-By adopting this Code of Conduct, project maintainers commit themselves to
-fairly and consistently applying these principles to every aspect of managing
-this project. Project maintainers who do not follow or enforce the Code of
-Conduct may be permanently removed from the project team.
+## Scope
 
 This Code of Conduct applies both within project spaces and in public spaces
-when an individual is representing the project or its community.
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
 
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported by contacting a project maintainer at opensource at scrapinghub.com. All
+reported by contacting the project team at opensource at scrapinghub.com. All
 complaints will be reviewed and investigated and will result in a response that
-is deemed necessary and appropriate to the circumstances. Maintainers are
-obligated to maintain confidentiality with regard to the reporter of an
-incident.
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
 
+## Attribution
 
-This Code of Conduct is adapted from the [Contributor Covenant][homepage],
-version 1.3.0, available at
-[http://contributor-covenant.org/version/1/3/0/][version]
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at [http://contributor-covenant.org/version/1/4][version]
 
 [homepage]: http://contributor-covenant.org
-[version]: http://contributor-covenant.org/version/1/3/0/
+[version]: http://contributor-covenant.org/version/1/4/
diff --git a/README.rst b/README.rst
index b72ebf5..38dda62 100644
--- a/README.rst
+++ b/README.rst
@@ -73,14 +73,17 @@ See http://scrapy.org/community/
 Contributing
 ============
 
+See http://doc.scrapy.org/en/master/contributing.html
+
+Code of Conduct
+---------------
+
 Please note that this project is released with a Contributor Code of Conduct
 (see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md).
 
 By participating in this project you agree to abide by its terms.
 Please report unacceptable behavior to opensource at scrapinghub.com.
 
-See http://doc.scrapy.org/en/master/contributing.html
-
 Companies using Scrapy
 ======================
 
diff --git a/artwork/README b/artwork/README.rst
similarity index 97%
rename from artwork/README
rename to artwork/README.rst
index c185d57..016462f 100644
--- a/artwork/README
+++ b/artwork/README.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 Scrapy artwork
 ==============
 
diff --git a/docs/README b/docs/README.rst
similarity index 99%
rename from docs/README
rename to docs/README.rst
index cf04965..733af2a 100644
--- a/docs/README
+++ b/docs/README.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 ======================================
 Scrapy documentation quick start guide
 ======================================
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 8e14d1b..3dc5ad2 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -225,7 +225,7 @@ You will see something like::
     [s]   shelp()           Shell help (print this help)
     [s]   fetch(req_or_url) Fetch request (or URL) and update local objects
     [s]   view(response)    View response in a browser
-    >>> 
+    >>>
 
 Using the shell, you can try selecting elements using `CSS`_ with the response
 object::
@@ -423,7 +423,7 @@ in the callback, as you can see below::
             for quote in response.css('div.quote'):
                 yield {
                     'text': quote.css('span.text::text').extract_first(),
-                    'author': quote.css('span small::text').extract_first(),
+                    'author': quote.css('small.author::text').extract_first(),
                     'tags': quote.css('div.tags a.tag::text').extract(),
                 }
 
@@ -522,7 +522,7 @@ page, extracting data from it::
             for quote in response.css('div.quote'):
                 yield {
                     'text': quote.css('span.text::text').extract_first(),
-                    'author': quote.css('span small::text').extract_first(),
+                    'author': quote.css('small.author::text').extract_first(),
                     'tags': quote.css('div.tags a.tag::text').extract(),
                 }
 
@@ -568,7 +568,7 @@ this time for scraping author information::
 
         def parse(self, response):
             # follow links to author pages
-            for href in response.css('.author+a::attr(href)').extract():
+            for href in response.css('.author + a::attr(href)').extract():
                 yield scrapy.Request(response.urljoin(href),
                                      callback=self.parse_author)
 
@@ -624,7 +624,7 @@ option when running them::
     scrapy crawl quotes -o quotes-humor.json -a tag=humor
 
 These arguments are passed to the Spider's ``__init__`` method and become
-spider attributes by default.  
+spider attributes by default.
 
 In this example, the value provided for the ``tag`` argument will be available
 via ``self.tag``. You can use this to make your spider fetch only quotes
@@ -647,7 +647,7 @@ with a specific tag, building the URL based on the argument::
             for quote in response.css('div.quote'):
                 yield {
                     'text': quote.css('span.text::text').extract_first(),
-                    'author': quote.css('span small a::text').extract_first(),
+                    'author': quote.css('small.author::text').extract_first(),
                 }
 
             next_page = response.css('li.next a::attr(href)').extract_first()
diff --git a/docs/news.rst b/docs/news.rst
index cce4659..ff1e4ce 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -3,6 +3,64 @@
 Release notes
 =============
 
+Scrapy 1.3.2 (2017-02-13)
+-------------------------
+
+Bug fixes
+~~~~~~~~~
+
+- Preserve request class when converting to/from dicts (utils.reqser) (:issue:`2510`).
+- Use consistent selectors for author field in tutorial (:issue:`2551`).
+- Fix TLS compatibility in Twisted 17+ (:issue:`2558`).
+
+Scrapy 1.3.1 (2017-02-08)
+-------------------------
+
+New features
+~~~~~~~~~~~~
+
+- Support ``'True'`` and ``'False'`` string values for boolean settings (:issue:`2519`);
+  you can now do something like ``scrapy crawl myspider -s REDIRECT_ENABLED=False``.
+- Support kwargs with ``response.xpath()`` to use :ref:`XPath variables <topics-selectors-xpath-variables>`
+  and ad-hoc namespace declarations;
+  this requires at least Parsel v1.1 (:issue:`2457`).
+- Add support for Python 3.6 (:issue:`2485`).
+- Run tests on PyPy (warning: some tests still fail, so PyPy is not supported yet).
+
+Bug fixes
+~~~~~~~~~
+
+- Enforce ``DNS_TIMEOUT`` setting (:issue:`2496`).
+- Fix :command:`view` command; it was a regression in v1.3.0 (:issue:`2503`).
+- Fix tests regarding ``*_EXPIRES`` settings with Files/Images pipelines (:issue:`2460`).
+- Fix name of generated pipeline class when using basic project template (:issue:`2466`).
+- Fix compatibility with Twisted 17+ (:issue:`2496`, :issue:`2528`).
+- Fix ``scrapy.Item`` inheritance on Python 3.6 (:issue:`2511`).
+- Enforce numeric values for component order in ``SPIDER_MIDDLEWARES``,
+  ``DOWNLOADER_MIDDLEWARES``, ``EXTENSIONS`` and ``SPIDER_CONTRACTS`` (:issue:`2420`).
+
+Documentation
+~~~~~~~~~~~~~
+
+- Reword Code of Conduct section and upgrade to Contributor Covenant v1.4
+  (:issue:`2469`).
+- Clarify that passing spider arguments converts them to spider attributes
+  (:issue:`2483`).
+- Document ``formid`` argument on ``FormRequest.from_response()`` (:issue:`2497`).
+- Add .rst extension to README files (:issue:`2507`).
+- Mention LevelDB cache storage backend (:issue:`2525`).
+- Use ``yield`` in sample callback code (:issue:`2533`).
+- Add note about HTML entities decoding with ``.re()/.re_first()`` (:issue:`1704`).
+- Typos (:issue:`2512`, :issue:`2534`, :issue:`2531`).
+
+Cleanups
+~~~~~~~~
+
+- Remove redundant check in ``MetaRefreshMiddleware`` (:issue:`2542`).
+- Faster checks in ``LinkExtractor`` for allow/deny patterns (:issue:`2538`).
+- Remove dead code supporting old Twisted versions (:issue:`2544`).
+
+
 Scrapy 1.3.0 (2016-12-21)
 -------------------------
 
diff --git a/docs/topics/architecture.rst b/docs/topics/architecture.rst
index ea0cb0e..4ac39ad 100644
--- a/docs/topics/architecture.rst
+++ b/docs/topics/architecture.rst
@@ -12,7 +12,7 @@ Overview
 
 The following diagram shows an overview of the Scrapy architecture with its
 components and an outline of the data flow that takes place inside the system
-(shown by the green arrows). A brief description of the components is included
+(shown by the red arrows). A brief description of the components is included
 below with links for more detailed information about them. The data flow is
 also described below.
 
diff --git a/docs/topics/commands.rst b/docs/topics/commands.rst
index 6636c30..eaeeee1 100644
--- a/docs/topics/commands.rst
+++ b/docs/topics/commands.rst
@@ -358,6 +358,12 @@ Opens the given URL in a browser, as your Scrapy spider would "see" it.
 Sometimes spiders see pages differently from regular users, so this can be used
 to check what the spider "sees" and confirm it's what you expect.
 
+Supported options:
+
+* ``--spider=SPIDER``: bypass spider autodetection and force use of specific spider
+
+* ``--no-redirect``: do not follow HTTP 3xx redirects (default is to follow them)
+
 Usage example::
 
     $ scrapy view http://www.example.com/some/page.html
diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 3b9a533..1ca78cc 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -318,10 +318,11 @@ HttpCacheMiddleware
     This middleware provides low-level cache to all HTTP requests and responses.
     It has to be combined with a cache storage backend as well as a cache policy.
 
-    Scrapy ships with two HTTP cache storage backends:
+    Scrapy ships with three HTTP cache storage backends:
 
         * :ref:`httpcache-storage-fs`
         * :ref:`httpcache-storage-dbm`
+        * :ref:`httpcache-storage-leveldb`
 
     You can change the HTTP cache storage backend with the :setting:`HTTPCACHE_STORAGE`
     setting. Or you can also implement your own storage backend.
@@ -748,7 +749,7 @@ REDIRECT_MAX_TIMES
 
 Default: ``20``
 
-The maximum number of redirections that will be follow for a single request.
+The maximum number of redirections that will be followed for a single request.
 
 MetaRefreshMiddleware
 ---------------------
diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst
index 664a723..1fdd260 100644
--- a/docs/topics/request-response.rst
+++ b/docs/topics/request-response.rst
@@ -207,12 +207,12 @@ different fields from different pages::
         request = scrapy.Request("http://www.example.com/some_page.html",
                                  callback=self.parse_page2)
         request.meta['item'] = item
-        return request
+        yield request
 
     def parse_page2(self, response):
         item = response.meta['item']
         item['other_url'] = response.url
-        return item
+        yield item
 
 
 .. _topics-request-response-ref-errbacks:
@@ -358,7 +358,7 @@ fields with form data from :class:`Response` objects.
     The :class:`FormRequest` objects support the following class method in
     addition to the standard :class:`Request` methods:
 
-    .. classmethod:: FormRequest.from_response(response, [formname=None, formnumber=0, formdata=None, formxpath=None, formcss=None, clickdata=None, dont_click=False, ...])
+    .. classmethod:: FormRequest.from_response(response, [formname=None, formid=None, formnumber=0, formdata=None, formxpath=None, formcss=None, clickdata=None, dont_click=False, ...])
 
        Returns a new :class:`FormRequest` object with its form field values
        pre-populated with those found in the HTML ``<form>`` element contained
@@ -383,6 +383,9 @@ fields with form data from :class:`Response` objects.
        :param formname: if given, the form with name attribute set to this value will be used.
        :type formname: string
 
+       :param formid: if given, the form with id attribute set to this value will be used.
+       :type formid: string
+
        :param formxpath: if given, the first form that matches the xpath will be used.
        :type formxpath: string
 
@@ -421,6 +424,9 @@ fields with form data from :class:`Response` objects.
        .. versionadded:: 1.1.0
           The ``formcss`` parameter.
 
+       .. versionadded:: 1.1.0
+          The ``formid`` parameter.
+
 Request usage examples
 ----------------------
 
diff --git a/docs/topics/selectors.rst b/docs/topics/selectors.rst
index 39ec9b7..8a5d44a 100644
--- a/docs/topics/selectors.rst
+++ b/docs/topics/selectors.rst
@@ -283,6 +283,40 @@ XPath specification.
 
 .. _Location Paths: https://www.w3.org/TR/xpath#location-paths
 
+.. _topics-selectors-xpath-variables:
+
+Variables in XPath expressions
+------------------------------
+
+XPath allows you to reference variables in your XPath expressions, using
+the ``$somevariable`` syntax. This is somewhat similar to parameterized
+queries or prepared statements in the SQL world where you replace
+some arguments in your queries with placeholders like ``?``,
+which are then substituted with values passed with the query.
+
+Here's an example to match an element based on its "id" attribute value,
+without hard-coding it (the hard-coded version was shown previously)::
+
+    >>> # `$val` used in the expression, a `val` argument needs to be passed
+    >>> response.xpath('//div[@id=$val]/a/text()', val='images').extract_first()
+    u'Name: My image 1 '
+
+Here's another example, to find the "id" attribute of a ``<div>`` tag containing
+five ``<a>`` children (here we pass the value ``5`` as an integer)::
+
+    >>> response.xpath('//div[count(a)=$cnt]/@id', cnt=5).extract_first()
+    u'images'
+
+All variable references must have a binding value when calling ``.xpath()``
+(otherwise you'll get a ``ValueError: XPath error:`` exception).
+This is done by passing as many named arguments as necessary.
+
+`parsel`_, the library powering Scrapy selectors, has more details and examples
+on `XPath variables`_.
+
+.. _parsel: https://parsel.readthedocs.io/
+.. _XPath variables: https://parsel.readthedocs.io/en/latest/usage.html#variables-in-xpath-expressions
+
 Using EXSLT extensions
 ----------------------
 
@@ -626,6 +660,10 @@ Built-in Selectors reference
      ``regex`` can be either a compiled regular expression or a string which
      will be compiled to a regular expression using ``re.compile(regex)``
 
+    .. note::
+
+        Note that ``re()`` and ``re_first()`` both decode HTML entities (except ``<`` and ``&``).
+
   .. method:: register_namespace(prefix, uri)
 
      Register the given namespace to be used in this :class:`Selector`.
diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst
index 604f186..8360827 100644
--- a/docs/topics/spider-middleware.rst
+++ b/docs/topics/spider-middleware.rst
@@ -112,7 +112,7 @@ following methods:
 
     .. method:: process_spider_exception(response, exception, spider)
 
-        This method is called when when a spider or :meth:`process_spider_input`
+        This method is called when a spider or :meth:`process_spider_input`
         method (from other spider middleware) raises an exception.
 
         :meth:`process_spider_exception` should return either ``None`` or an
diff --git a/docs/topics/spiders.rst b/docs/topics/spiders.rst
index 0179e92..c123c26 100644
--- a/docs/topics/spiders.rst
+++ b/docs/topics/spiders.rst
@@ -297,6 +297,37 @@ Spiders can access arguments in their `__init__` methods::
             self.start_urls = ['http://www.example.com/categories/%s' % category]
             # ...
 
+The default `__init__` method will take any spider arguments
+and copy them to the spider as attributes.
+The above example can also be written as follows::
+
+    import scrapy
+
+    class MySpider(scrapy.Spider):
+        name = 'myspider'
+
+        def start_requests(self):
+            yield scrapy.Request('http://www.example.com/categories/%s' % self.category)
+
+Keep in mind that spider arguments are only strings.
+The spider will not do any parsing on its own.
+If you were to set the `start_urls` attribute from the command line,
+you would have to parse it on your own into a list
+using something like
+`ast.literal_eval <https://docs.python.org/library/ast.html#ast.literal_eval>`_
+or `json.loads <https://docs.python.org/library/json.html#json.loads>`_
+and then set it as an attribute.
+Otherwise, you would cause iteration over a `start_urls` string
+(a very common Python pitfall)
+resulting in each character being seen as a separate URL.
+
+A valid use case is to set the HTTP auth credentials
+used by :class:`~scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware`
+or the user agent
+used by :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`::
+
+    scrapy crawl myspider -a http_user=myuser -a http_pass=mypassword -a user_agent=mybot
+
 Spider arguments can also be passed through the Scrapyd ``schedule.json`` API.
 See `Scrapyd documentation`_.
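
To make the advice above concrete, here is a short sketch (not part of the
patch): list-valued arguments such as ``start_urls`` arrive as plain strings,
so they have to be parsed explicitly, for example with ``json.loads``:

    import json
    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def __init__(self, start_urls='[]', *args, **kwargs):
            super(MySpider, self).__init__(*args, **kwargs)
            # invoked e.g. with:
            #   scrapy crawl myspider -a start_urls='["http://example.com/a"]'
            self.start_urls = json.loads(start_urls)
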
 
diff --git a/extras/scrapy.1 b/extras/scrapy.1
index a4f2956..2fa8d82 100644
--- a/extras/scrapy.1
+++ b/extras/scrapy.1
@@ -28,16 +28,16 @@ Query Scrapy settings
 Print raw setting value
 .TP
 .I --getbool=SETTING
-Print setting value, intepreted as a boolean
+Print setting value, interpreted as a boolean
 .TP
 .I --getint=SETTING
-Print setting value, intepreted as an integer
+Print setting value, interpreted as an integer
 .TP
 .I --getfloat=SETTING
-Print setting value, intepreted as an float
+Print setting value, interpreted as a float
 .TP
 .I --getlist=SETTING
-Print setting value, intepreted as an float
+Print setting value, interpreted as a list
 .TP
 .I --init
 Print initial setting value (before loading extensions and spiders)
diff --git a/requirements.txt b/requirements.txt
index 64b6e77..f92603d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,4 @@ queuelib
 six>=1.5.2
 PyDispatcher>=2.0.5
 service_identity
-parsel>=0.9.5
+parsel>=1.1
diff --git a/scrapy/VERSION b/scrapy/VERSION
index f0bb29e..1892b92 100644
--- a/scrapy/VERSION
+++ b/scrapy/VERSION
@@ -1 +1 @@
-1.3.0
+1.3.2
diff --git a/scrapy/commands/view.py b/scrapy/commands/view.py
index 4eb44f7..59e6650 100644
--- a/scrapy/commands/view.py
+++ b/scrapy/commands/view.py
@@ -11,9 +11,8 @@ class Command(fetch.Command):
             "contents in a browser"
 
     def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
-        parser.add_option("--spider", dest="spider",
-            help="use this spider")
+        super(Command, self).add_options(parser)
+        parser.remove_option("--headers")
 
     def _print_response(self, response, opts):
         open_in_browser(response)
diff --git a/scrapy/core/downloader/handlers/http.py b/scrapy/core/downloader/handlers/http.py
index 81da261..e4a7d85 100644
--- a/scrapy/core/downloader/handlers/http.py
+++ b/scrapy/core/downloader/handlers/http.py
@@ -1,10 +1,6 @@
-from scrapy import twisted_version
+from __future__ import absolute_import
 from .http10 import HTTP10DownloadHandler
-
-if twisted_version >= (11, 1, 0):
-    from .http11 import HTTP11DownloadHandler as HTTPDownloadHandler
-else:
-    HTTPDownloadHandler = HTTP10DownloadHandler
+from .http11 import HTTP11DownloadHandler as HTTPDownloadHandler
 
 
 # backwards compatibility
diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py
index 54aa359..ecd7f90 100644
--- a/scrapy/core/downloader/handlers/http11.py
+++ b/scrapy/core/downloader/handlers/http11.py
@@ -319,14 +319,13 @@ class ScrapyAgent(object):
         expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
 
         if maxsize and expected_size > maxsize:
-            error_message = ("Cancelling download of {url}: expected response "
-                             "size ({size}) larger than "
-                             "download max size ({maxsize})."
-            ).format(url=request.url, size=expected_size, maxsize=maxsize)
+            error_msg = ("Cancelling download of %(url)s: expected response "
+                         "size (%(size)s) larger than download max size (%(maxsize)s).")
+            error_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
 
-            logger.error(error_message)
+            logger.error(error_msg, error_args)
             txresponse._transport._producer.loseConnection()
-            raise defer.CancelledError(error_message)
+            raise defer.CancelledError(error_msg % error_args)
 
         if warnsize and expected_size > warnsize:
             logger.warning("Expected response size (%(size)s) larger than "
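
As an aside, the change above swaps eager ``str.format`` interpolation for
logging's lazy %-style arguments; a rough standalone sketch of the difference
(the values are made up):

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger(__name__)

    error_msg = ("Cancelling download of %(url)s: expected response "
                 "size (%(size)s) larger than download max size (%(maxsize)s).")
    error_args = {'url': 'http://example.com/big', 'size': 2048, 'maxsize': 1024}

    # logging interpolates the mapping only if the record is actually emitted
    logger.error(error_msg, error_args)
    # eager interpolation is still needed for the exception message itself
    print(error_msg % error_args)
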
diff --git a/scrapy/core/downloader/tls.py b/scrapy/core/downloader/tls.py
index 955b763..498e3d6 100644
--- a/scrapy/core/downloader/tls.py
+++ b/scrapy/core/downloader/tls.py
@@ -1,6 +1,8 @@
 import logging
 from OpenSSL import SSL
 
+from scrapy import twisted_version
+
 
 logger = logging.getLogger(__name__)
 
@@ -18,11 +20,17 @@ openssl_methods = {
     METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6),   # TLS 1.2 only
 }
 
-# ClientTLSOptions requires a recent-enough version of Twisted
-try:
+if twisted_version >= (14, 0, 0):
+    # ClientTLSOptions requires a recent-enough version of Twisted.
+    # Not having ScrapyClientTLSOptions should not matter for older
+    # Twisted versions because it is not used in the fallback
+    # ScrapyClientContextFactory.
 
     # taken from twisted/twisted/internet/_sslverify.py
+
     try:
+        # XXX: this try-except is not needed in Twisted 17.0.0+ because
+        # it requires pyOpenSSL 0.16+.
         from OpenSSL.SSL import SSL_CB_HANDSHAKE_DONE, SSL_CB_HANDSHAKE_START
     except ImportError:
         SSL_CB_HANDSHAKE_START = 0x10
@@ -30,10 +38,17 @@ try:
 
     from twisted.internet.ssl import AcceptableCiphers
     from twisted.internet._sslverify import (ClientTLSOptions,
-                                             _maybeSetHostNameIndication,
                                              verifyHostname,
                                              VerificationError)
 
+    if twisted_version < (17, 0, 0):
+        from twisted.internet._sslverify import _maybeSetHostNameIndication
+        set_tlsext_host_name = _maybeSetHostNameIndication
+    else:
+        def set_tlsext_host_name(connection, hostNameBytes):
+            connection.set_tlsext_host_name(hostNameBytes)
+
+
     class ScrapyClientTLSOptions(ClientTLSOptions):
         """
         SSL Client connection creator ignoring certificate verification errors
@@ -46,7 +61,7 @@ try:
 
         def _identityVerifyingInfoCallback(self, connection, where, ret):
             if where & SSL_CB_HANDSHAKE_START:
-                _maybeSetHostNameIndication(connection, self._hostnameBytes)
+                set_tlsext_host_name(connection, self._hostnameBytes)
             elif where & SSL_CB_HANDSHAKE_DONE:
                 try:
                     verifyHostname(connection, self._hostnameASCII)
@@ -62,8 +77,3 @@ try:
                             self._hostnameASCII, repr(e)))
 
     DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')
-
-except ImportError:
-    # ImportError should not matter for older Twisted versions
-    # as the above is not used in the fallback ScrapyClientContextFactory
-    pass
diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index db276ee..26677e5 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -53,8 +53,10 @@ class BaseRedirectMiddleware(object):
 
 
 class RedirectMiddleware(BaseRedirectMiddleware):
-    """Handle redirection of requests based on response status and meta-refresh html tag"""
-
+    """
+    Handle redirection of requests based on response status
+    and meta-refresh html tag.
+    """
     def process_response(self, request, response, spider):
         if (request.meta.get('dont_redirect', False) or
                 response.status in getattr(spider, 'handle_httpstatus_list', []) or
@@ -92,10 +94,9 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
                 not isinstance(response, HtmlResponse):
             return response
 
-        if isinstance(response, HtmlResponse):
-            interval, url = get_meta_refresh(response)
-            if url and interval < self._maxdelay:
-                redirected = self._redirect_request_using_get(request, url)
-                return self._redirect(redirected, request, spider, 'meta refresh')
+        interval, url = get_meta_refresh(response)
+        if url and interval < self._maxdelay:
+            redirected = self._redirect_request_using_get(request, url)
+            return self._redirect(redirected, request, spider, 'meta refresh')
 
         return response
diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py
index afa4303..5a6507a 100644
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -111,8 +111,8 @@ class TextResponse(Response):
             self._cached_selector = Selector(self)
         return self._cached_selector
 
-    def xpath(self, query):
-        return self.selector.xpath(query)
+    def xpath(self, query, **kwargs):
+        return self.selector.xpath(query, **kwargs)
 
     def css(self, query):
         return self.selector.css(query)
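
A quick sketch (not from the patch) of what this kwargs pass-through enables
on response objects, mirroring the selector examples added to the docs; it
assumes parsel >= 1.1 as noted in the release notes:

    from scrapy.http import HtmlResponse

    body = b'<html><body><div id="images"><a>Name: My image 1 </a></div></body></html>'
    response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')

    # `$val` in the expression is bound via the keyword argument
    print(response.xpath('//div[@id=$val]/a/text()', val='images').extract_first())
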
diff --git a/scrapy/item.py b/scrapy/item.py
index 138728a..aa05e9c 100644
--- a/scrapy/item.py
+++ b/scrapy/item.py
@@ -25,6 +25,7 @@ class Field(dict):
 class ItemMeta(ABCMeta):
 
     def __new__(mcs, class_name, bases, attrs):
+        classcell = attrs.pop('__classcell__', None)
         new_bases = tuple(base._class for base in bases if hasattr(base, '_class'))
         _class = super(ItemMeta, mcs).__new__(mcs, 'x_' + class_name, new_bases, attrs)
 
@@ -39,6 +40,8 @@ class ItemMeta(ABCMeta):
 
         new_attrs['fields'] = fields
         new_attrs['_class'] = _class
+        if classcell is not None:
+            new_attrs['__classcell__'] = classcell
         return super(ItemMeta, mcs).__new__(mcs, class_name, bases, new_attrs)
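
For context, a minimal sketch (assuming Python 3.6+ and scrapy installed) of
the situation this ``__classcell__`` handling addresses: a class body that
uses zero-argument ``super()`` makes the compiler add an implicit
``__classcell__`` entry, which the metaclass must forward when creating the
final class:

    import scrapy

    class BaseItem(scrapy.Item):
        name = scrapy.Field()

    class DetailedItem(BaseItem):
        price = scrapy.Field()

        def describe(self):
            # zero-argument super() is what triggers __classcell__ on Python 3.6+
            return 'detailed %s' % super().__repr__()

    item = DetailedItem(name='example', price=10)
    print(item.describe())
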
 
 
diff --git a/scrapy/linkextractors/__init__.py b/scrapy/linkextractors/__init__.py
index f51934b..e5d21e1 100644
--- a/scrapy/linkextractors/__init__.py
+++ b/scrapy/linkextractors/__init__.py
@@ -40,7 +40,7 @@ IGNORED_EXTENSIONS = [
 
 
 _re_type = type(re.compile("", 0))
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_matches = lambda url, regexs: any(r.search(url) for r in regexs)
 _is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
 
 
@@ -93,8 +93,8 @@ class FilteringLinkExtractor(object):
         if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
             return False
 
-        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+        allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
+        denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
         return any(allowed) and not any(denied)
 
     def _process_links(self, links):
diff --git a/scrapy/resolver.py b/scrapy/resolver.py
index 3954fd9..4f4f0b0 100644
--- a/scrapy/resolver.py
+++ b/scrapy/resolver.py
@@ -16,8 +16,11 @@ class CachingThreadedResolver(ThreadedResolver):
     def getHostByName(self, name, timeout=None):
         if name in dnscache:
             return defer.succeed(dnscache[name])
-        if not timeout:
-            timeout = self.timeout
+        # in Twisted<=16.6, getHostByName() is always called with
+        # a default timeout of 60s (actually passed as (1, 3, 11, 45) tuple),
+        # so the input argument above is simply overridden
+        # to enforce Scrapy's DNS_TIMEOUT setting's value
+        timeout = (self.timeout,)
         d = super(CachingThreadedResolver, self).getHostByName(name, timeout)
         d.addCallback(self._cache_result, name)
         return d
diff --git a/scrapy/settings/__init__.py b/scrapy/settings/__init__.py
index 7b78089..28446a3 100644
--- a/scrapy/settings/__init__.py
+++ b/scrapy/settings/__init__.py
@@ -114,8 +114,8 @@ class BaseSettings(MutableMapping):
         """
         Get a setting value as a boolean.
 
-        ``1``, ``'1'``, and ``True`` return ``True``, while ``0``, ``'0'``,
-        ``False`` and ``None`` return ``False``.
+        ``1``, ``'1'``, ``True`` and ``'True'`` return ``True``,
+        while ``0``, ``'0'``, ``False``, ``'False'`` and ``None`` return ``False``.
 
         For example, settings populated through environment variables set to
         ``'0'`` will return ``False`` when using this method.
@@ -126,7 +126,17 @@ class BaseSettings(MutableMapping):
         :param default: the value to return if no setting is found
         :type default: any
         """
-        return bool(int(self.get(name, default)))
+        got = self.get(name, default)
+        try:
+            return bool(int(got))
+        except ValueError:
+            if got in ("True", "true"):
+                return True
+            if got in ("False", "false"):
+                return False
+            raise ValueError("Supported values for boolean settings "
+                             "are 0/1, True/False, '0'/'1', "
+                             "'True'/'False' and 'true'/'false'")
 
     def getint(self, name, default=0):
         """
diff --git a/scrapy/templates/project/module/settings.py.tmpl b/scrapy/templates/project/module/settings.py.tmpl
index 72f25eb..486df6b 100644
--- a/scrapy/templates/project/module/settings.py.tmpl
+++ b/scrapy/templates/project/module/settings.py.tmpl
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 #ITEM_PIPELINES = {
-#    '$project_name.pipelines.SomePipeline': 300,
+#    '$project_name.pipelines.${ProjectName}Pipeline': 300,
 #}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/scrapy/utils/conf.py b/scrapy/utils/conf.py
index e8af90f..435e9a6 100644
--- a/scrapy/utils/conf.py
+++ b/scrapy/utils/conf.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import numbers
 from operator import itemgetter
 
 import six
@@ -34,6 +35,13 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
             _check_components(compdict)
             return {convert(k): v for k, v in six.iteritems(compdict)}
 
+    def _validate_values(compdict):
+        """Fail if a value in the components dict is not a real number or None."""
+        for name, value in six.iteritems(compdict):
+            if value is not None and not isinstance(value, numbers.Real):
+                raise ValueError('Invalid value {} for component {}, please provide ' \
+                                 'a real number or None instead'.format(value, name))
+
     # BEGIN Backwards compatibility for old (base, custom) call signature
     if isinstance(custom, (list, tuple)):
         _check_components(custom)
@@ -43,6 +51,7 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
         compdict.update(custom)
     # END Backwards compatibility
 
+    _validate_values(compdict)
     compdict = without_none_values(_map_keys(compdict))
     return [k for k, v in sorted(six.iteritems(compdict), key=itemgetter(1))]
 
diff --git a/scrapy/utils/reqser.py b/scrapy/utils/reqser.py
index 7e1e99e..2fceb0d 100644
--- a/scrapy/utils/reqser.py
+++ b/scrapy/utils/reqser.py
@@ -5,6 +5,7 @@ import six
 
 from scrapy.http import Request
 from scrapy.utils.python import to_unicode, to_native_str
+from scrapy.utils.misc import load_object
 
 
 def request_to_dict(request, spider=None):
@@ -32,6 +33,8 @@ def request_to_dict(request, spider=None):
... 568 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-scrapy.git


