[Python-modules-commits] [python-scrapy] 01/09: Import python-scrapy_1.4.0.orig.tar.gz
Michael Fladischer
fladi at moszumanska.debian.org
Tue Jun 20 10:10:55 UTC 2017
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch master
in repository python-scrapy.
commit f2dd115e9923fae44ed807f9a5435fafa6d26fdd
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Mon May 29 10:09:12 2017 +0200
Import python-scrapy_1.4.0.orig.tar.gz
---
.bumpversion.cfg | 23 +-
.gitignore | 2 +
.travis.yml | 4 +-
LICENSE | 6 +-
README.rst | 4 -
codecov.yml | 6 +
docs/Makefile | 6 +-
docs/README.rst | 4 +-
docs/contributing.rst | 9 -
docs/faq.rst | 3 +-
docs/intro/install.rst | 78 +-
docs/intro/overview.rst | 3 +-
docs/intro/tutorial.rst | 73 +-
docs/news.rst | 204 +++++
docs/requirements.txt | 2 +
docs/topics/commands.rst | 6 +-
docs/topics/downloader-middleware.rst | 23 +-
docs/topics/exporters.rst | 31 +-
docs/topics/extensions.rst | 1 -
docs/topics/feed-exports.rst | 17 +
docs/topics/item-pipeline.rst | 5 +-
docs/topics/link-extractors.rst | 20 +-
docs/topics/logging.rst | 14 +-
docs/topics/media-pipeline.rst | 12 +
docs/topics/practices.rst | 2 +-
docs/topics/request-response.rst | 41 +-
docs/topics/selectors.rst | 13 +-
docs/topics/settings.rst | 100 ++-
docs/topics/spider-middleware.rst | 86 +-
docs/topics/spiders.rst | 29 +-
requirements-py3.txt | 2 +-
requirements.txt | 2 +-
scrapy/VERSION | 2 +-
scrapy/__main__.py | 4 +
scrapy/cmdline.py | 11 +-
scrapy/commands/edit.py | 4 +-
scrapy/core/downloader/contextfactory.py | 8 +-
scrapy/core/downloader/handlers/datauri.py | 23 +
scrapy/core/downloader/handlers/ftp.py | 28 +-
scrapy/core/downloader/handlers/http11.py | 77 +-
scrapy/core/engine.py | 4 +-
scrapy/crawler.py | 9 +-
scrapy/downloadermiddlewares/httpcompression.py | 22 +-
scrapy/downloadermiddlewares/httpproxy.py | 25 +-
scrapy/downloadermiddlewares/retry.py | 16 +-
scrapy/downloadermiddlewares/stats.py | 4 +-
scrapy/exporters.py | 46 +-
scrapy/extensions/feedexport.py | 5 +-
scrapy/extensions/httpcache.py | 11 +-
scrapy/extensions/memusage.py | 1 -
scrapy/http/request/__init__.py | 3 +-
scrapy/http/request/form.py | 12 +-
scrapy/http/response/__init__.py | 30 +
scrapy/http/response/text.py | 56 ++
scrapy/linkextractors/__init__.py | 2 +-
scrapy/linkextractors/htmlparser.py | 8 +-
scrapy/linkextractors/lxmlhtml.py | 36 +-
scrapy/linkextractors/regex.py | 3 +-
scrapy/linkextractors/sgml.py | 28 +-
scrapy/logformatter.py | 10 +-
scrapy/pipelines/files.py | 4 +-
scrapy/pipelines/images.py | 5 +
scrapy/pipelines/media.py | 27 +-
scrapy/settings/default_settings.py | 24 +-
scrapy/spiderloader.py | 16 +
scrapy/spidermiddlewares/httperror.py | 4 +
scrapy/spidermiddlewares/referer.py | 343 +++++++-
scrapy/spiders/__init__.py | 18 +-
scrapy/spiders/crawl.py | 8 +-
scrapy/spiders/sitemap.py | 18 +-
.../templates/project/module/middlewares.py.tmpl | 8 +-
scrapy/templates/spiders/basic.tmpl | 4 +-
scrapy/utils/datatypes.py | 4 +-
scrapy/utils/deprecate.py | 32 +
scrapy/utils/gz.py | 4 +
scrapy/utils/log.py | 22 +-
scrapy/utils/misc.py | 2 +-
scrapy/utils/python.py | 11 +
scrapy/utils/reqser.py | 4 +-
scrapy/utils/response.py | 3 +-
scrapy/utils/url.py | 33 +-
setup.py | 2 +-
tests/__init__.py | 7 +-
tests/keys/cert.pem | 36 -
tests/keys/localhost.crt | 20 +
tests/keys/localhost.gen.README | 21 +
tests/keys/localhost.key | 28 +
tests/mockserver.py | 22 +-
tests/py3-ignores.txt | 7 -
tests/requirements-py3.txt | 1 +
tests/requirements.txt | 1 +
tests/sample_data/compressed/html-br.bin | Bin 0 -> 4027 bytes
.../link_extractor/sgml_linkextractor.html | 2 +
.../images/python-logo-master-v3-TM-flattened.png | Bin 0 -> 11155 bytes
.../files/images/python-powered-h-50x65.png | Bin 0 -> 3243 bytes
.../sample_data/test_site/files/images/scrapy.png | Bin 0 -> 2710 bytes
tests/spiders.py | 5 +-
tests/test_crawler.py | 46 ++
tests/test_downloader_handlers.py | 315 +++++++-
tests/test_downloadermiddleware_httpcompression.py | 101 ++-
tests/test_downloadermiddleware_httpproxy.py | 49 +-
tests/test_downloadermiddleware_retry.py | 93 ++-
tests/test_engine.py | 4 +-
tests/test_feedexport.py | 194 ++++-
tests/test_http_request.py | 21 +-
tests/test_http_response.py | 119 +++
tests/test_linkextractors.py | 41 +
tests/test_linkextractors_deprecated.py | 23 +-
tests/test_logformatter.py | 33 +
tests/test_pipeline_crawl.py | 182 +++++
tests/test_pipeline_images.py | 8 +
tests/test_pipeline_media.py | 68 +-
tests/test_proxy_connect.py | 4 +-
tests/test_spider.py | 33 +-
tests/test_spiderloader/__init__.py | 51 ++
tests/test_spidermiddleware_httperror.py | 13 +-
tests/test_spidermiddleware_referer.py | 869 ++++++++++++++++++++-
tests/test_spiderstate.py | 32 +-
tests/test_utils_datatypes.py | 48 +-
tests/test_utils_project.py | 5 +-
tests/test_utils_reqser.py | 4 +-
tests/test_utils_url.py | 169 +++-
tests/test_webclient.py | 21 +-
tox.ini | 3 +-
124 files changed, 4108 insertions(+), 506 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index ed7aa0d..21800f6 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,29 +1,8 @@
[bumpversion]
-current_version = 1.3.3
+current_version = 1.4.0
commit = True
tag = True
tag_name = {new_version}
-parse = ^
- (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
- (?:(?P<prerel>[abc]|rc|dev)(?P<prerelversion>\d+))?
-serialize =
- {major}.{minor}.{patch}{prerel}{prerelversion}
- {major}.{minor}.{patch}
[bumpversion:file:scrapy/VERSION]
-[bumpversion:part:prerel]
-optional_value = gamma
-values =
- dev
- rc
- gamma
-
-[bumpversion:part:prerelversion]
-values =
- 1
- 2
- 3
- 4
- 5
-
diff --git a/.gitignore b/.gitignore
index b116640..406146e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,8 @@ dist
.idea
htmlcov/
.coverage
+.coverage.*
+.cache/
# Windows
Thumbs.db
diff --git a/.travis.yml b/.travis.yml
index 2df02ea..9061150 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,8 +33,8 @@ install:
else
rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
fi
- # get latest PyPy from pyenv directly (thanks to natural version sort option -V)
- export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-[0-9][\.0-9]*$' |sort -V |tail -1`
+ # get latest portable PyPy from pyenv directly (thanks to natural version sort option -V)
+ export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-portable-[0-9][\.0-9]*$' |sort -V |tail -1`
"$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION"
virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION"
source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
diff --git a/LICENSE b/LICENSE
index 68ccf97..6ead05e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -4,10 +4,10 @@ All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
- 1. Redistributions of source code must retain the above copyright notice,
+ 1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
+
+ 2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
diff --git a/README.rst b/README.rst
index 38dda62..4eb36b4 100644
--- a/README.rst
+++ b/README.rst
@@ -13,10 +13,6 @@ Scrapy
.. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg
:target: https://pypi.python.org/pypi/Scrapy
:alt: Wheel Status
-
-.. image:: http://static.scrapy.org/py3progress/badge.svg
- :target: https://github.com/scrapy/scrapy/wiki/Python-3-Porting
- :alt: Python 3 Porting Status
.. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg
:target: http://codecov.io/github/scrapy/scrapy?branch=master
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..d8aa6b9
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,6 @@
+comment:
+ layout: "header, diff, tree"
+
+coverage:
+ status:
+ project: false
diff --git a/docs/Makefile b/docs/Makefile
index eaba3ba..187f03c 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -10,7 +10,8 @@ PAPER =
SOURCES =
SHELL = /bin/bash
-ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees -D latex_paper_size=$(PAPER) \
+ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \
+ -D latex_elements.papersize=$(PAPER) \
$(SPHINXOPTS) . build/$(BUILDER) $(SOURCES)
.PHONY: help update build html htmlhelp clean
@@ -82,7 +83,8 @@ pydoc-topics: build
"into the Lib/ directory"
htmlview: html
- $(PYTHON) -c "import webbrowser; webbrowser.open('build/html/index.html')"
+ $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
+ os.path.realpath('build/html/index.html'))"
clean:
-rm -rf build/*
diff --git a/docs/README.rst b/docs/README.rst
index 733af2a..0a343cd 100644
--- a/docs/README.rst
+++ b/docs/README.rst
@@ -11,11 +11,11 @@ Setup the environment
---------------------
To compile the documentation you need Sphinx Python library. To install it
-and all its dependencies run
+and all its dependencies run the following command from this dir
::
- pip install 'Sphinx >= 1.3'
+ pip install -r requirements.txt
Compile the documentation
diff --git a/docs/contributing.rst b/docs/contributing.rst
index b0a435a..ab37793 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -124,15 +124,6 @@ Scrapy:
* Don't put your name in the code you contribute. Our policy is to keep
the contributor's name in the `AUTHORS`_ file distributed with Scrapy.
-Scrapy Contrib
-==============
-
-Scrapy contrib shares a similar rationale as Django contrib, which is explained
-in `this post <https://jacobian.org/writing/what-is-django-contrib/>`_. If you
-are working on a new functionality, please follow that rationale to decide
-whether it should be a Scrapy contrib. If unsure, you can ask in
-`scrapy-users`_.
-
Documentation policies
======================
diff --git a/docs/faq.rst b/docs/faq.rst
index ad11b07..f0ee20b 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -74,7 +74,8 @@ Python 2.6 support was dropped starting at Scrapy 0.20.
Python 3 support was added in Scrapy 1.1.
.. note::
- Python 3 is not yet supported on Windows.
+ For Python 3 support on Windows, it is recommended to use
+ Anaconda/Miniconda as :ref:`outlined in the installation guide <intro-install-windows>`.
Did Scrapy "steal" X from Django?
---------------------------------
diff --git a/docs/intro/install.rst b/docs/intro/install.rst
index 767749e..9cec2ea 100644
--- a/docs/intro/install.rst
+++ b/docs/intro/install.rst
@@ -7,14 +7,25 @@ Installation guide
Installing Scrapy
=================
-Scrapy runs on Python 2.7 and Python 3.3 or above
-(except on Windows where Python 3 is not supported yet).
+Scrapy runs on Python 2.7 and Python 3.3 or above.
-If you’re already familiar with installation of Python packages,
+If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
+the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows
+and OS X.
+
+To install Scrapy using ``conda``, run::
+
+ conda install -c conda-forge scrapy
+
+Alternatively, if you’re already familiar with installation of Python packages,
you can install Scrapy and its dependencies from PyPI with::
pip install Scrapy
+Note that sometimes this may require solving compilation issues for some Scrapy
+dependencies depending on your operating system, so be sure to check the
+:ref:`intro-install-platform-notes`.
+
We strongly recommend that you install Scrapy in :ref:`a dedicated virtualenv <intro-using-virtualenv>`,
to avoid conflicting with your system packages.
@@ -105,45 +116,21 @@ Python virtualenvs can be created to use Python 2 by default, or Python 3 by def
Platform specific installation notes
====================================
+.. _intro-install-windows:
+
Windows
-------
-* Install Python 2.7 from https://www.python.org/downloads/
-
- You need to adjust ``PATH`` environment variable to include paths to
- the Python executable and additional scripts. The following paths need to be
- added to ``PATH``::
-
- C:\Python27\;C:\Python27\Scripts\;
-
- To update the ``PATH`` open a Command prompt and run::
-
- c:\python27\python.exe c:\python27\tools\scripts\win_add2path.py
-
- Close the command prompt window and reopen it so changes take effect, run the
- following command and check it shows the expected Python version::
-
- python --version
-
-* Install `pywin32` from http://sourceforge.net/projects/pywin32/
-
- Be sure you download the architecture (win32 or amd64) that matches your system
-
-* *(Only required for Python<2.7.9)* Install `pip`_ from
- https://pip.pypa.io/en/latest/installing/
+Though it's possible to install Scrapy on Windows using pip, we recommend you
+to install `Anaconda`_ or `Miniconda`_ and use the package from the
+`conda-forge`_ channel, which will avoid most installation issues.
- Now open a Command prompt to check ``pip`` is installed correctly::
+Once you've installed `Anaconda`_ or `Miniconda`_, install Scrapy with::
- pip --version
-
-* At this point Python 2.7 and ``pip`` package manager must be working, let's
- install Scrapy::
+ conda install -c conda-forge scrapy
- pip install Scrapy
-.. note::
- Python 3 is not supported on Windows. This is because Scrapy core requirement Twisted does not support
- Python 3 on Windows.
+.. _intro-install-ubuntu:
Ubuntu 12.04 or above
---------------------
@@ -180,6 +167,8 @@ you can install Scrapy with ``pip`` after that::
Wheezy (7.0) and above.
+.. _intro-install-macos:
+
Mac OS X
--------
@@ -234,27 +223,8 @@ After any of these workarounds you should be able to install Scrapy::
pip install Scrapy
-Anaconda
---------
-
-
-Using Anaconda is an alternative to using a virtualenv and installing with ``pip``.
-
-.. note::
-
- For Windows users, or if you have issues installing through ``pip``, this is
- the recommended way to install Scrapy.
-
-If you already have `Anaconda`_ or `Miniconda`_ installed, the `conda-forge`_
-community have up-to-date packages for Linux, Windows and OS X.
-
-To install Scrapy using ``conda``, run::
-
- conda install -c conda-forge scrapy
-
.. _Python: https://www.python.org/
.. _pip: https://pip.pypa.io/en/latest/installing/
-.. _Control Panel: https://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx
.. _lxml: http://lxml.de/
.. _parsel: https://pypi.python.org/pypi/parsel
.. _w3lib: https://pypi.python.org/pypi/w3lib
diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst
index 7195017..1da1a40 100644
--- a/docs/intro/overview.rst
+++ b/docs/intro/overview.rst
@@ -40,8 +40,7 @@ http://quotes.toscrape.com, following the pagination::
next_page = response.css('li.next a::attr("href")').extract_first()
if next_page is not None:
- next_page = response.urljoin(next_page)
- yield scrapy.Request(next_page, callback=self.parse)
+ yield response.follow(next_page, self.parse)
Put this in a text file, name it to something like ``quotes_spider.py``
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 3dc5ad2..3b3bd8d 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -399,7 +399,7 @@ quotes elements and put them together into a Python dictionary::
>>>
Extracting data in our spider
-------------------------------
+-----------------------------
Let's get back to our spider. Until now, it doesn't extract any data in
particular, just saves the whole HTML page to a local file. Let's integrate the
@@ -551,13 +551,65 @@ In our example, it creates a sort of loop, following all the links to the next p
until it doesn't find one -- handy for crawling blogs, forums and other sites with
pagination.
+
+.. _response-follow-example:
+
+A shortcut for creating Requests
+--------------------------------
+
+As a shortcut for creating Request objects you can use
+:meth:`response.follow <scrapy.http.TextResponse.follow>`::
+
+ import scrapy
+
+
+ class QuotesSpider(scrapy.Spider):
+ name = "quotes"
+ start_urls = [
+ 'http://quotes.toscrape.com/page/1/',
+ ]
+
+ def parse(self, response):
+ for quote in response.css('div.quote'):
+ yield {
+ 'text': quote.css('span.text::text').extract_first(),
+ 'author': quote.css('span small::text').extract_first(),
+ 'tags': quote.css('div.tags a.tag::text').extract(),
+ }
+
+ next_page = response.css('li.next a::attr(href)').extract_first()
+ if next_page is not None:
+ yield response.follow(next_page, callback=self.parse)
+
+Unlike scrapy.Request, ``response.follow`` supports relative URLs directly - no
+need to call urljoin. Note that ``response.follow`` just returns a Request
+instance; you still have to yield this Request.
+
+You can also pass a selector to ``response.follow`` instead of a string;
+this selector should extract necessary attributes::
+
+ for href in response.css('li.next a::attr(href)'):
+ yield response.follow(href, callback=self.parse)
+
+For ``<a>`` elements there is a shortcut: ``response.follow`` uses their href
+attribute automatically. So the code can be shortened further::
+
+ for a in response.css('li.next a'):
+ yield response.follow(a, callback=self.parse)
+
+.. note::
+
+ ``response.follow(response.css('li.next a'))`` is not valid because
+ ``response.css`` returns a list-like object with selectors for all results,
+ not a single selector. A ``for`` loop like in the example above, or
+ ``response.follow(response.css('li.next a')[0])`` is fine.
+
More examples and patterns
--------------------------
Here is another spider that illustrates callbacks and following links,
this time for scraping author information::
-
import scrapy
@@ -568,15 +620,12 @@ this time for scraping author information::
def parse(self, response):
# follow links to author pages
- for href in response.css('.author + a::attr(href)').extract():
- yield scrapy.Request(response.urljoin(href),
- callback=self.parse_author)
+ for href in response.css('.author + a::attr(href)'):
+ yield response.follow(href, self.parse_author)
# follow pagination links
- next_page = response.css('li.next a::attr(href)').extract_first()
- if next_page is not None:
- next_page = response.urljoin(next_page)
- yield scrapy.Request(next_page, callback=self.parse)
+ for href in response.css('li.next a::attr(href)'):
+ yield response.follow(href, self.parse)
def parse_author(self, response):
def extract_with_css(query):
@@ -592,6 +641,9 @@ This spider will start from the main page, it will follow all the links to the
authors pages calling the ``parse_author`` callback for each of them, and also
the pagination links with the ``parse`` callback as we saw before.
+Here we're passing callbacks to ``response.follow`` as positional arguments
+to make the code shorter; it also works for ``scrapy.Request``.
+
The ``parse_author`` callback defines a helper function to extract and cleanup the
data from a CSS query and yields the Python dict with the author data.
@@ -652,8 +704,7 @@ with a specific tag, building the URL based on the argument::
next_page = response.css('li.next a::attr(href)').extract_first()
if next_page is not None:
- next_page = response.urljoin(next_page)
- yield scrapy.Request(next_page, self.parse)
+ yield response.follow(next_page, self.parse)
If you pass the ``tag=humor`` argument to this spider, you'll notice that it
diff --git a/docs/news.rst b/docs/news.rst
index 305e431..e0f8eee 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -3,6 +3,191 @@
Release notes
=============
+Scrapy 1.4.0 (2017-05-18)
+-------------------------
+
+Scrapy 1.4 does not bring that many breathtaking new features
+but quite a few handy improvements nonetheless.
+
+Scrapy now supports anonymous FTP sessions with customizable user and
+password via the new :setting:`FTP_USER` and :setting:`FTP_PASSWORD` settings.
+And if you're using Twisted version 17.1.0 or above, FTP is now available
+with Python 3.
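For illustration, the new settings are plain values in a project's ``settings.py`` (the credentials below are placeholders for whatever the target server expects)::

    # settings.py
    FTP_USER = 'anonymous'
    FTP_PASSWORD = 'guest@example.com'

They can also be overridden for a single request through the ``ftp_user`` and ``ftp_password`` meta keys documented in this release.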
+
+There's a new :meth:`response.follow <scrapy.http.TextResponse.follow>` method
+for creating requests; **it is now a recommended way to create Requests
+in Scrapy spiders**. This method makes it easier to write correct
+spiders; ``response.follow`` has several advantages over creating
+``scrapy.Request`` objects directly:
+
+* it handles relative URLs;
+* it works properly with non-ascii URLs on non-UTF8 pages;
+* in addition to absolute and relative URLs it supports Selectors;
+ for ``<a>`` elements it can also extract their href values.
+
+For example, instead of this::
+
+ for href in response.css('li.page a::attr(href)').extract():
+ url = response.urljoin(href)
+ yield scrapy.Request(url, self.parse, encoding=response.encoding)
+
+One can now write this::
+
+ for a in response.css('li.page a'):
+ yield response.follow(a, self.parse)
+
+Link extractors are also improved. They work similarly to what a regular
+modern browser would do: leading and trailing whitespace are removed
+from attributes (think ``href=" http://example.com"``) when building
+``Link`` objects. This whitespace-stripping also happens for ``action``
+attributes with ``FormRequest``.
+
+**Please also note that link extractors do not canonicalize URLs by default
+anymore.** This was puzzling users every now and then, and it's not what
+browsers do in fact, so we removed that extra transformation on extracted
+links.
+
+For those of you wanting more control on the ``Referer:`` header that Scrapy
+sends when following links, you can set your own ``Referrer Policy``.
+Prior to Scrapy 1.4, the default ``RefererMiddleware`` would simply and
+blindly set it to the URL of the response that generated the HTTP request
+(which could leak information on your URL seeds).
+By default, Scrapy now behaves much like your regular browser does.
+And this policy is fully customizable with W3C standard values
+(or with something really custom of your own if you wish).
+See :setting:`REFERRER_POLICY` for details.
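For example, a minimal sketch of picking a standard policy in ``settings.py`` (assuming the W3C ``same-origin`` policy suits the crawl)::

    # settings.py
    # send a Referer header only for same-origin requests,
    # and omit it for cross-origin ones
    REFERRER_POLICY = 'same-origin'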
+
+To make Scrapy spiders easier to debug, Scrapy logs more stats by default
+in 1.4: memory usage stats, detailed retry stats, detailed HTTP error code
+stats. A similar change is that HTTP cache path is also visible in logs now.
+
+Last but not least, Scrapy now has the option to make JSON and XML items
+more human-readable, with newlines between items and even custom indenting
+offset, using the new :setting:`FEED_EXPORT_INDENT` setting.
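As an illustrative sketch (feed location and format are arbitrary)::

    # settings.py
    FEED_FORMAT = 'json'
    FEED_URI = 'items.json'
    FEED_EXPORT_INDENT = 4   # pretty-print exported items with 4-space indentation

The same value can also be passed on the command line, e.g. ``scrapy crawl myspider -o items.json -s FEED_EXPORT_INDENT=4``.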
+
+Enjoy! (Or read on for the rest of changes in this release.)
+
+Deprecations and Backwards Incompatible Changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Default to ``canonicalize=False`` in :class:`scrapy.linkextractors.LinkExtractor`
+ (:issue:`2537`, fixes :issue:`1941` and :issue:`1982`):
+ **warning, this is technically backwards-incompatible**
+- Enable memusage extension by default (:issue:`2539`, fixes :issue:`2187`);
+ **this is technically backwards-incompatible** so please check if you have
+ any non-default ``MEMUSAGE_***`` options set.
+- ``EDITOR`` environment variable now takes precedence over ``EDITOR``
+ option defined in settings.py (:issue:`1829`); Scrapy default settings
+ no longer depend on environment variables. **This is technically a backwards
+ incompatible change**.
+- ``Spider.make_requests_from_url`` is deprecated
+ (:issue:`1728`, fixes :issue:`1495`).
+
+New Features
+~~~~~~~~~~~~
+
+- Accept proxy credentials in :reqmeta:`proxy` request meta key (:issue:`2526`)
+- Support `brotli`_-compressed content; requires optional `brotlipy`_
+ (:issue:`2535`)
+- New :ref:`response.follow <response-follow-example>` shortcut
+ for creating requests (:issue:`1940`)
+- Added ``flags`` argument and attribute to :class:`Request <scrapy.http.Request>`
+ objects (:issue:`2047`)
+- Support Anonymous FTP (:issue:`2342`)
+- Added ``retry/count``, ``retry/max_reached`` and ``retry/reason_count/<reason>``
+ stats to :class:`RetryMiddleware <scrapy.downloadermiddlewares.retry.RetryMiddleware>`
+ (:issue:`2543`)
+- Added ``httperror/response_ignored_count`` and ``httperror/response_ignored_status_count/<status>``
+ stats to :class:`HttpErrorMiddleware <scrapy.spidermiddlewares.httperror.HttpErrorMiddleware>`
+ (:issue:`2566`)
+- Customizable :setting:`Referrer policy <REFERRER_POLICY>` in
+ :class:`RefererMiddleware <scrapy.spidermiddlewares.referer.RefererMiddleware>`
+ (:issue:`2306`)
+- New ``data:`` URI download handler (:issue:`2334`, fixes :issue:`2156`)
+- Log cache directory when HTTP Cache is used (:issue:`2611`, fixes :issue:`2604`)
+- Warn users when project contains duplicate spider names (fixes :issue:`2181`)
+- :class:`CaselessDict` now accepts ``Mapping`` instances and not only dicts (:issue:`2646`)
+- :ref:`Media downloads <topics-media-pipeline>`, with :class:`FilesPipelines`
+ or :class:`ImagesPipelines`, can now optionally handle HTTP redirects
+ using the new :setting:`MEDIA_ALLOW_REDIRECTS` setting (:issue:`2616`, fixes :issue:`2004`)
+- Accept non-complete responses from websites using a new
+ :setting:`DOWNLOAD_FAIL_ON_DATALOSS` setting (:issue:`2590`, fixes :issue:`2586`)
+- Optional pretty-printing of JSON and XML items via
+ :setting:`FEED_EXPORT_INDENT` setting (:issue:`2456`, fixes :issue:`1327`)
+- Allow dropping fields in ``FormRequest.from_response`` formdata when
+ ``None`` value is passed (:issue:`667`)
+- Per-request retry times with the new :reqmeta:`max_retry_times` meta key
+ (:issue:`2642`)
+- ``python -m scrapy`` as a more explicit alternative to ``scrapy`` command
+ (:issue:`2740`)
+
+.. _brotli: https://github.com/google/brotli
+.. _brotlipy: https://github.com/python-hyper/brotlipy/
+
+Bug fixes
+~~~~~~~~~
+
+- LinkExtractor now strips leading and trailing whitespaces from attributes
+ (:issue:`2547`, fixes :issue:`1614`)
+- Properly handle whitespaces in action attribute in :class:`FormRequest`
+ (:issue:`2548`)
+- Buffer CONNECT response bytes from proxy until all HTTP headers are received
+ (:issue:`2495`, fixes :issue:`2491`)
+- FTP downloader now works on Python 3, provided you use Twisted>=17.1
+ (:issue:`2599`)
+- Use body to choose response type after decompressing content (:issue:`2393`,
+ fixes :issue:`2145`)
+- Always decompress ``Content-Encoding: gzip`` at :class:`HttpCompressionMiddleware
+ <scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware>` stage (:issue:`2391`)
+- Respect custom log level in ``Spider.custom_settings`` (:issue:`2581`,
+ fixes :issue:`1612`)
+- 'make htmlview' fix for macOS (:issue:`2661`)
+- Remove "commands" from the command list (:issue:`2695`)
+- Fix duplicate Content-Length header for POST requests with empty body (:issue:`2677`)
+- Properly cancel large downloads, i.e. above :setting:`DOWNLOAD_MAXSIZE` (:issue:`1616`)
+- ImagesPipeline: fixed processing of transparent PNG images with palette
+ (:issue:`2675`)
+
+Cleanups & Refactoring
+~~~~~~~~~~~~~~~~~~~~~~
+
+- Tests: remove temp files and folders (:issue:`2570`),
+ fixed ProjectUtilsTest on OS X (:issue:`2569`),
+ use portable pypy for Linux on Travis CI (:issue:`2710`)
+- Separate building request from ``_requests_to_follow`` in CrawlSpider (:issue:`2562`)
+- Remove “Python 3 progress” badge (:issue:`2567`)
+- Add a couple more lines to ``.gitignore`` (:issue:`2557`)
+- Remove bumpversion prerelease configuration (:issue:`2159`)
+- Add codecov.yml file (:issue:`2750`)
+- Set context factory implementation based on Twisted version (:issue:`2577`,
+ fixes :issue:`2560`)
+- Add omitted ``self`` arguments in default project middleware template (:issue:`2595`)
+- Remove redundant ``slot.add_request()`` call in ExecutionEngine (:issue:`2617`)
+- Catch more specific ``os.error`` exception in :class:`FSFilesStore` (:issue:`2644`)
+- Change "localhost" test server certificate (:issue:`2720`)
+- Remove unused ``MEMUSAGE_REPORT`` setting (:issue:`2576`)
+
+Documentation
+~~~~~~~~~~~~~
+
+- Binary mode is required for exporters (:issue:`2564`, fixes :issue:`2553`)
+- Mention issue with :meth:`FormRequest.from_response
+ <scrapy.http.FormRequest.from_response>` due to bug in lxml (:issue:`2572`)
+- Use single quotes uniformly in templates (:issue:`2596`)
+- Document :reqmeta:`ftp_user` and :reqmeta:`ftp_password` meta keys (:issue:`2587`)
+- Removed section on deprecated ``contrib/`` (:issue:`2636`)
+- Recommend Anaconda when installing Scrapy on Windows
+ (:issue:`2477`, fixes :issue:`2475`)
+- FAQ: rewrite note on Python 3 support on Windows (:issue:`2690`)
+- Rearrange selector sections (:issue:`2705`)
+- Remove ``__nonzero__`` from :class:`SelectorList` docs (:issue:`2683`)
+- Mention how to disable request filtering in documentation of
+ :setting:`DUPEFILTER_CLASS` setting (:issue:`2714`)
+- Add sphinx_rtd_theme to docs setup readme (:issue:`2668`)
+- Open file in text mode in JSON item writer example (:issue:`2729`)
+- Clarify ``allowed_domains`` example (:issue:`2670`)
+
+
Scrapy 1.3.3 (2017-03-10)
-------------------------
@@ -15,6 +200,7 @@ Bug fixes
A new setting is introduced to toggle between warning or exception if needed ;
see :setting:`SPIDER_LOADER_WARN_ONLY` for details.
+
Scrapy 1.3.2 (2017-02-13)
-------------------------
@@ -113,6 +299,12 @@ Dependencies & Cleanups
downloader middlewares.
+Scrapy 1.2.3 (2017-03-03)
+-------------------------
+
+- Packaging fix: disallow unsupported Twisted versions in setup.py
+
+
Scrapy 1.2.2 (2016-12-06)
-------------------------
@@ -241,6 +433,12 @@ Documentation
- Add StackOverflow as a support channel (:issue:`2257`).
+Scrapy 1.1.4 (2017-03-03)
+-------------------------
+
+- Packaging fix: disallow unsupported Twisted versions in setup.py
+
+
Scrapy 1.1.3 (2016-09-22)
-------------------------
@@ -513,6 +711,12 @@ Bugfixes
to same remote host (:issue:`1912`).
+Scrapy 1.0.7 (2017-03-03)
+-------------------------
+
+- Packaging fix: disallow unsupported Twisted versions in setup.py
+
+
Scrapy 1.0.6 (2016-05-04)
-------------------------
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..d3dcb97
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+Sphinx>=1.3
+sphinx_rtd_theme
\ No newline at end of file
diff --git a/docs/topics/commands.rst b/docs/topics/commands.rst
index eaeeee1..3e69c4e 100644
--- a/docs/topics/commands.rst
+++ b/docs/topics/commands.rst
@@ -291,12 +291,12 @@ edit
* Syntax: ``scrapy edit <spider>``
* Requires project: *yes*
-Edit the given spider using the editor defined in the :setting:`EDITOR`
-setting.
+Edit the given spider using the editor defined in the ``EDITOR`` environment
+variable or (if unset) the :setting:`EDITOR` setting.
This command is provided only as a convenience shortcut for the most common
case, the developer is of course free to choose any tool or IDE to write and
-debug his spiders.
+debug spiders.
Usage example::
diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 1ca78cc..0d16801 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -645,6 +645,12 @@ HttpCompressionMiddleware
This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites.
+ This middleware also supports decoding `brotli-compressed`_ responses,
+ provided `brotlipy`_ is installed.
+
+.. _brotli-compressed: https://www.ietf.org/rfc/rfc7932.txt
+.. _brotlipy: https://pypi.python.org/pypi/brotlipy
+
HttpCompressionMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -681,7 +687,9 @@ HttpProxyMiddleware
* ``no_proxy``
You can also set the meta key ``proxy`` per-request, to a value like
- ``http://some_proxy_server:port``.
+ ``http://some_proxy_server:port`` or ``http://username:password@some_proxy_server:port``.
+ Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
+ environment variables, and it will also ignore ``no_proxy`` environment variable.
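A minimal sketch of setting it from a spider (proxy host and credentials are placeholders)::

    import scrapy

    class ProxiedSpider(scrapy.Spider):
        name = 'proxied'

        def start_requests(self):
            # this proxy (with credentials) applies to this request only
            yield scrapy.Request(
                'http://example.com/',
                meta={'proxy': 'http://user:secret@proxy.example.com:3128'})

        def parse(self, response):
            self.logger.info('fetched %s through the proxy', response.url)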
.. _urllib: https://docs.python.org/2/library/urllib.html
.. _urllib2: https://docs.python.org/2/library/urllib2.html
@@ -844,6 +852,11 @@ Default: ``2``
Maximum number of times to retry, in addition to the first download.
+Maximum number of retries can also be specified per-request using
+:reqmeta:`max_retry_times` attribute of :attr:`Request.meta <scrapy.http.Request.meta>`.
+When initialized, the :reqmeta:`max_retry_times` meta key takes higher
+precedence over the :setting:`RETRY_TIMES` setting.
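For illustration, a request-level override would look like this inside a spider callback (URL and retry limit are placeholders)::

    # allow up to 10 retries for this request only, regardless of RETRY_TIMES
    yield scrapy.Request('http://example.com/flaky-endpoint',
                         meta={'max_retry_times': 10})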
+
.. setting:: RETRY_HTTP_CODES
RETRY_HTTP_CODES
@@ -949,8 +962,16 @@ enable it for :ref:`broad crawls <topics-broad-crawls>`.
HttpProxyMiddleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. setting:: HTTPPROXY_ENABLED
.. setting:: HTTPPROXY_AUTH_ENCODING
+HTTPPROXY_ENABLED
+^^^^^^^^^^^^^^^^^
+
+Default: ``True``
+
+Whether or not to enable the :class:`HttpProxyMiddleware`.
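For example, to switch the middleware off for a whole project::

    # settings.py
    HTTPPROXY_ENABLED = False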
+
HTTPPROXY_AUTH_ENCODING
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/topics/exporters.rst b/docs/topics/exporters.rst
index af469eb..b6139af 100644
--- a/docs/topics/exporters.rst
+++ b/docs/topics/exporters.rst
@@ -140,7 +140,7 @@ output examples, which assume you're exporting these two items::
BaseItemExporter
----------------
-.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8')
+.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0)
This is the (abstract) base class for all Item Exporters. It provides
support for common features used by all (concrete) Item Exporters, such as
@@ -149,7 +149,7 @@ BaseItemExporter
These features can be configured through the constructor arguments which
populate their respective instance attributes: :attr:`fields_to_export`,
- :attr:`export_empty_fields`, :attr:`encoding`.
+ :attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
.. method:: export_item(item)
@@ -216,6 +216,15 @@ BaseItemExporter
encoding). Other value types are passed unchanged to the specific
serialization library.
+ .. attribute:: indent
+
+ Amount of spaces used to indent the output on each level. Defaults to ``0``.
+
+ * ``indent=None`` selects the most compact representation,
+ all items in the same line with no indentation
+ * ``indent<=0`` each item on its own line, no indentation
+ * ``indent>0`` each item on its own line, indented with the provided numeric value
+
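For illustration, a minimal sketch passing ``indent`` directly to an exporter (the file name is arbitrary; exporters expect a binary-mode file, as noted for the ``file`` parameters below)::

    from scrapy.exporters import JsonItemExporter

    with open('items.json', 'wb') as f:
        exporter = JsonItemExporter(f, indent=4)
        exporter.start_exporting()
        exporter.export_item({'name': 'Color TV', 'price': '1200'})
        exporter.finish_exporting()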
.. highlight:: none
XmlItemExporter
@@ -225,7 +234,8 @@ XmlItemExporter
Exports Items in XML format to the specified file object.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
:param root_element: The name of root element in the exported XML.
:type root_element: str
@@ -281,7 +291,8 @@ CsvItemExporter
CSV columns and their order. The :attr:`export_empty_fields` attribute has
no effect on this exporter.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
:param include_headers_line: If enabled, makes the exporter output a header
line with the field names taken from
@@ -312,7 +323,8 @@ PickleItemExporter
Exports Items in pickle format to the given file-like object.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
:param protocol: The pickle protocol to use.
:type protocol: int
@@ -333,7 +345,8 @@ PprintItemExporter
Exports Items in pretty print format to the specified file object.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
The additional keyword arguments of this constructor are passed to the
:class:`BaseItemExporter` constructor.
@@ -356,7 +369,8 @@ JsonItemExporter
arguments to the `JSONEncoder`_ constructor, so you can use any
`JSONEncoder`_ constructor argument to customize this exporter.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
A typical output of this exporter would be::
@@ -386,7 +400,8 @@ JsonLinesItemExporter
the `JSONEncoder`_ constructor, so you can use any `JSONEncoder`_
constructor argument to customize this exporter.
- :param file: the file-like object to use for exporting the data.
+ :param file: the file-like object to use for exporting the data. Its ``write`` method should
+ accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
A typical output of this exporter would be::
... 6730 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-scrapy.git