[Python-modules-commits] [python-scrapy] 02/09: New upstream version 1.5.0
Michael Fladischer
fladi at moszumanska.debian.org
Tue Jan 9 14:16:21 UTC 2018
This is an automated email from the git hooks/post-receive script.
fladi pushed a commit to branch debian/master
in repository python-scrapy.
commit f128f1481bd5e321fbdf390b92247f53725618c6
Author: Michael Fladischer <FladischerMichael at fladi.at>
Date: Tue Jan 9 08:54:56 2018 +0100
New upstream version 1.5.0
---
.bumpversion.cfg | 2 +-
.travis.yml | 34 +++---
CONTRIBUTING.md | 4 +-
INSTALL | 2 +-
README.rst | 35 ++++---
artwork/README.rst | 4 +-
debian/control | 6 +-
debian/copyright | 8 +-
docs/conf.py | 4 +-
docs/contributing.rst | 80 +++++++++++---
docs/faq.rst | 6 +-
docs/index.rst | 10 +-
docs/intro/install.rst | 50 ++++++---
docs/intro/overview.rst | 4 +-
docs/intro/tutorial.rst | 6 +-
docs/news.rst | 116 ++++++++++++++++++++-
docs/topics/autothrottle.rst | 1 +
docs/topics/broad-crawls.rst | 6 +-
docs/topics/commands.rst | 8 +-
docs/topics/debug.rst | 2 +-
docs/topics/deploy.rst | 12 +--
docs/topics/downloader-middleware.rst | 22 ++++
docs/topics/email.rst | 4 +-
docs/topics/exceptions.rst | 8 ++
docs/topics/exporters.rst | 62 ++++++-----
docs/topics/extensions.rst | 10 +-
docs/topics/firebug.rst | 2 +-
docs/topics/firefox.rst | 8 +-
docs/topics/item-pipeline.rst | 4 +-
docs/topics/items.rst | 2 +-
docs/topics/jobs.rst | 2 +-
docs/topics/loaders.rst | 4 +-
docs/topics/logging.rst | 4 +-
docs/topics/media-pipeline.rst | 29 +++++-
docs/topics/practices.rst | 8 +-
docs/topics/request-response.rst | 12 +--
docs/topics/scrapyd.rst | 2 +-
docs/topics/selectors.rst | 6 +-
docs/topics/settings.rst | 8 +-
docs/topics/shell.rst | 14 +--
docs/topics/signals.rst | 10 +-
docs/topics/spider-middleware.rst | 21 +++-
docs/topics/spiders.rst | 11 +-
docs/topics/ubuntu.rst | 2 +-
docs/utils/linkfix.py | 2 +-
extras/coverage-report.sh | 2 +-
extras/qpsclient.py | 2 +-
requirements-py3.txt | 2 +-
scrapy/VERSION | 2 +-
scrapy/_monkeypatches.py | 4 +-
scrapy/cmdline.py | 8 +-
scrapy/commands/edit.py | 3 +-
scrapy/commands/parse.py | 30 +++++-
scrapy/commands/version.py | 43 ++------
scrapy/core/downloader/contextfactory.py | 4 +-
scrapy/core/downloader/handlers/http11.py | 48 +++++++--
scrapy/crawler.py | 16 ++-
scrapy/downloadermiddlewares/chunked.py | 2 +-
scrapy/downloadermiddlewares/httpcache.py | 2 +-
scrapy/downloadermiddlewares/redirect.py | 4 +-
scrapy/exporters.py | 2 +-
scrapy/extensions/httpcache.py | 13 +--
scrapy/extensions/telnet.py | 2 +-
scrapy/http/request/__init__.py | 4 +
scrapy/http/response/text.py | 9 +-
scrapy/linkextractors/__init__.py | 2 +-
scrapy/mail.py | 22 ++--
scrapy/pipelines/files.py | 49 ++++++++-
scrapy/pipelines/images.py | 3 +
scrapy/resolver.py | 3 +-
scrapy/settings/default_settings.py | 4 +-
scrapy/shell.py | 2 +-
scrapy/signalmanager.py | 2 +-
scrapy/spidermiddlewares/offsite.py | 10 ++
scrapy/spiders/__init__.py | 2 +-
scrapy/spiders/sitemap.py | 2 +-
scrapy/templates/project/module/items.py.tmpl | 2 +-
.../templates/project/module/middlewares.py.tmpl | 49 ++++++++-
scrapy/templates/project/module/pipelines.py.tmpl | 2 +-
scrapy/templates/project/module/settings.py.tmpl | 22 ++--
scrapy/templates/project/scrapy.cfg | 2 +-
scrapy/utils/console.py | 12 ++-
scrapy/utils/defer.py | 2 +-
scrapy/utils/deprecate.py | 12 +--
scrapy/utils/http.py | 2 +-
scrapy/utils/log.py | 13 ++-
scrapy/utils/python.py | 38 ++++++-
scrapy/utils/test.py | 15 +++
scrapy/utils/url.py | 2 +-
scrapy/utils/versions.py | 50 +++++++++
sep/sep-001.rst | 2 +-
sep/sep-006.rst | 8 +-
sep/sep-013.rst | 2 +-
sep/sep-017.rst | 2 +-
sep/sep-020.rst | 2 +-
setup.py | 29 +++++-
tests/__init__.py | 2 +-
tests/keys/example-com.conf | 4 +-
tests/test_cmdline/__init__.py | 2 +-
tests/test_command_parse.py | 39 +++++++
tests/test_command_version.py | 2 +-
tests/test_commands.py | 21 ++++
tests/test_downloader_handlers.py | 88 ++++++++--------
tests/test_downloadermiddleware_cookies.py | 20 ++--
tests/test_downloadermiddleware_defaultheaders.py | 6 +-
tests/test_downloadermiddleware_downloadtimeout.py | 8 +-
tests/test_downloadermiddleware_httpauth.py | 4 +-
tests/test_downloadermiddleware_httpcache.py | 1 +
tests/test_downloadermiddleware_httpcompression.py | 2 +-
tests/test_downloadermiddleware_httpproxy.py | 48 ++++-----
tests/test_downloadermiddleware_redirect.py | 20 ++--
tests/test_downloadermiddleware_useragent.py | 6 +-
tests/test_http_cookies.py | 2 +-
tests/test_http_request.py | 24 ++++-
tests/test_http_response.py | 8 +-
tests/test_item.py | 2 +-
tests/test_loader.py | 20 ++--
tests/test_pipeline_files.py | 28 ++++-
tests/test_pipeline_images.py | 16 +--
tests/test_selector.py | 2 +-
tests/test_spider.py | 47 ++++++++-
tests/test_spidermiddleware_depth.py | 8 +-
tests/test_spidermiddleware_httperror.py | 24 ++---
tests/test_spidermiddleware_offsite.py | 18 +++-
tests/test_spidermiddleware_referer.py | 10 +-
tests/test_spidermiddleware_urllength.py | 2 +-
tests/test_squeues.py | 30 ++++--
tests/test_urlparse_monkeypatches.py | 8 +-
tests/test_utils_datatypes.py | 8 +-
tests/test_utils_defer.py | 2 +-
tests/test_utils_iterators.py | 4 +-
tests/test_utils_misc/__init__.py | 8 +-
tests/test_utils_project.py | 8 +-
tests/test_utils_python.py | 48 ++++++---
tests/test_utils_reqser.py | 6 +-
tests/test_utils_signal.py | 2 +-
tests/test_webclient.py | 28 ++---
tox.ini | 24 +++--
138 files changed, 1326 insertions(+), 562 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 21800f6..6e7be14 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.4.0
+current_version = 1.5.0
commit = True
tag = True
tag_name = {new_version}
diff --git a/.travis.yml b/.travis.yml
index 9061150..6635f5d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,32 +11,32 @@ matrix:
env: TOXENV=py27
- python: 2.7
env: TOXENV=jessie
- - python: 3.3
- env: TOXENV=py33
+ - python: 2.7
+ env: TOXENV=pypy
+ - python: 2.7
+ env: TOXENV=pypy3
+ - python: 3.4
+ env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.6
env: TOXENV=py36
- - python: 2.7
- env: TOXENV=pypy
- python: 3.6
env: TOXENV=docs
- allow_failures:
- - python: 2.7
- env: TOXENV=pypy
install:
- |
if [ "$TOXENV" = "pypy" ]; then
- export PYENV_ROOT="$HOME/.pyenv"
- if [ -f "$PYENV_ROOT/bin/pyenv" ]; then
- pushd "$PYENV_ROOT" && git pull && popd
- else
- rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
- fi
- # get latest portable PyPy from pyenv directly (thanks to natural version sort option -V)
- export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-portable-[0-9][\.0-9]*$' |sort -V |tail -1`
- "$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION"
- virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION"
+ export PYPY_VERSION="pypy-5.9-linux_x86_64-portable"
+ wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2"
+ tar -jxf ${PYPY_VERSION}.tar.bz2
+ virtualenv --python="$PYPY_VERSION/bin/pypy" "$HOME/virtualenvs/$PYPY_VERSION"
+ source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
+ fi
+ if [ "$TOXENV" = "pypy3" ]; then
+ export PYPY_VERSION="pypy3.5-5.9-beta-linux_x86_64-portable"
+ wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2"
+ tar -jxf ${PYPY_VERSION}.tar.bz2
+ virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION"
source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
fi
- pip install -U tox twine wheel codecov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 88c472f..0a11b05 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
The guidelines for contributing are available here:
-http://doc.scrapy.org/en/master/contributing.html
+https://doc.scrapy.org/en/master/contributing.html
Please do not abuse the issue tracker for support questions.
If your issue topic can be rephrased to "How to ...?", please use the
-support channels to get it answered: http://scrapy.org/community/
+support channels to get it answered: https://scrapy.org/community/
diff --git a/INSTALL b/INSTALL
index 84803a9..a3c7899 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,4 +1,4 @@
For information about installing Scrapy see:
* docs/intro/install.rst (local file)
-* http://doc.scrapy.org/en/latest/intro/install.html (online version)
+* https://doc.scrapy.org/en/latest/intro/install.html (online version)
diff --git a/README.rst b/README.rst
index 4eb36b4..1361eac 100644
--- a/README.rst
+++ b/README.rst
@@ -6,8 +6,12 @@ Scrapy
:target: https://pypi.python.org/pypi/Scrapy
:alt: PyPI Version
+.. image:: https://img.shields.io/pypi/pyversions/Scrapy.svg
+ :target: https://pypi.python.org/pypi/Scrapy
+ :alt: Supported Python Versions
+
.. image:: https://img.shields.io/travis/scrapy/scrapy/master.svg
- :target: http://travis-ci.org/scrapy/scrapy
+ :target: https://travis-ci.org/scrapy/scrapy
:alt: Build Status
.. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg
@@ -15,7 +19,7 @@ Scrapy
:alt: Wheel Status
.. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg
- :target: http://codecov.io/github/scrapy/scrapy?branch=master
+ :target: https://codecov.io/github/scrapy/scrapy?branch=master
:alt: Coverage report
.. image:: https://anaconda.org/conda-forge/scrapy/badges/version.svg
@@ -31,12 +35,12 @@ crawl websites and extract structured data from their pages. It can be used for
a wide range of purposes, from data mining to monitoring and automated testing.
For more information including a list of features check the Scrapy homepage at:
-http://scrapy.org
+https://scrapy.org
Requirements
============
-* Python 2.7 or Python 3.3+
+* Python 2.7 or Python 3.4+
* Works on Linux, Windows, Mac OSX, BSD
Install
@@ -47,29 +51,28 @@ The quick way::
pip install scrapy
For more details see the install section in the documentation:
-http://doc.scrapy.org/en/latest/intro/install.html
-
-Releases
-========
-
-You can download the latest stable and development releases from:
-http://scrapy.org/download/
+https://doc.scrapy.org/en/latest/intro/install.html
Documentation
=============
-Documentation is available online at http://doc.scrapy.org/ and in the ``docs``
+Documentation is available online at https://doc.scrapy.org/ and in the ``docs``
directory.
+Releases
+========
+
+You can find release notes at https://doc.scrapy.org/en/latest/news.html
+
Community (blog, twitter, mail list, IRC)
=========================================
-See http://scrapy.org/community/
+See https://scrapy.org/community/
Contributing
============
-See http://doc.scrapy.org/en/master/contributing.html
+See https://doc.scrapy.org/en/master/contributing.html
Code of Conduct
---------------
@@ -83,9 +86,9 @@ Please report unacceptable behavior to opensource at scrapinghub.com.
Companies using Scrapy
======================
-See http://scrapy.org/companies/
+See https://scrapy.org/companies/
Commercial Support
==================
-See http://scrapy.org/support/
+See https://scrapy.org/support/
diff --git a/artwork/README.rst b/artwork/README.rst
index 016462f..92f6ecb 100644
--- a/artwork/README.rst
+++ b/artwork/README.rst
@@ -10,10 +10,10 @@ scrapy-logo.jpg
Main Scrapy logo, in JPEG format.
-qlassik.zip
+qlassik.zip
-----------
-Font used for Scrapy logo. Homepage: http://www.dafont.com/qlassik.font
+Font used for Scrapy logo. Homepage: https://www.dafont.com/qlassik.font
scrapy-blog.logo.xcf
--------------------
diff --git a/debian/control b/debian/control
index f3a3175..2cc8eed 100644
--- a/debian/control
+++ b/debian/control
@@ -4,7 +4,7 @@ Priority: optional
Maintainer: Scrapinghub Team <info at scrapinghub.com>
Build-Depends: debhelper (>= 7.0.50), python (>=2.7), python-twisted, python-w3lib, python-lxml, python-six (>=1.5.2)
Standards-Version: 3.8.4
-Homepage: http://scrapy.org/
+Homepage: https://scrapy.org/
Package: scrapy
Architecture: all
@@ -15,6 +15,6 @@ Conflicts: python-scrapy, scrapy-0.25
Provides: python-scrapy, scrapy-0.25
Description: Python web crawling and web scraping framework
Scrapy is a fast high-level web crawling and web scraping framework,
- used to crawl websites and extract structured data from their pages.
- It can be used for a wide range of purposes, from data mining to
+ used to crawl websites and extract structured data from their pages.
+ It can be used for a wide range of purposes, from data mining to
monitoring and automated testing.
diff --git a/debian/copyright b/debian/copyright
index 4cc2390..c1bf475 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,6 +1,6 @@
This package was debianized by the Scrapinghub team <info at scrapinghub.com>.
-It was downloaded from http://scrapy.org
+It was downloaded from https://scrapy.org
Upstream Author: Scrapy Developers
@@ -14,10 +14,10 @@ All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
- 1. Redistributions of source code must retain the above copyright notice,
+ 1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
+
+ 2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
diff --git a/docs/conf.py b/docs/conf.py
index 640dcd7..007dc27 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -191,8 +191,8 @@ htmlhelp_basename = 'Scrapydoc'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, document class [howto/manual]).
latex_documents = [
- ('index', 'Scrapy.tex', ur'Scrapy Documentation',
- ur'Scrapy developers', 'manual'),
+ ('index', 'Scrapy.tex', u'Scrapy Documentation',
+ u'Scrapy developers', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
diff --git a/docs/contributing.rst b/docs/contributing.rst
index ab37793..9a02634 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -7,7 +7,7 @@ Contributing to Scrapy
.. important::
Double check you are reading the most recent version of this document at
- http://doc.scrapy.org/en/master/contributing.html
+ https://doc.scrapy.org/en/master/contributing.html
There are many ways to contribute to Scrapy. Here are some of them:
@@ -19,12 +19,16 @@ There are many ways to contribute to Scrapy. Here are some of them:
the guidelines detailed in `Reporting bugs`_ below.
* Submit patches for new functionality and/or bug fixes. Please read
- `Writing patches`_ and `Submitting patches`_ below for details on how to
+ :ref:`writing-patches` and `Submitting patches`_ below for details on how to
write and submit a patch.
-* Join the `scrapy-users`_ mailing list and share your ideas on how to
+* Join the `Scrapy subreddit`_ and share your ideas on how to
improve Scrapy. We're always open to suggestions.
+* Answer Scrapy questions at
+ `Stack Overflow <https://stackoverflow.com/questions/tagged/scrapy>`__.
+
+
Reporting bugs
==============
@@ -40,13 +44,18 @@ guidelines when reporting a new bug.
* check the :ref:`FAQ <faq>` first to see if your issue is addressed in a
well-known question
+* if you have a general question about scrapy usage, please ask it at
+ `Stack Overflow <https://stackoverflow.com/questions/tagged/scrapy>`__
+ (use "scrapy" tag).
+
* check the `open issues`_ to see if it has already been reported. If it has,
- don't dismiss the report but check the ticket history and comments, you may
- find additional useful information to contribute.
+ don't dismiss the report, but check the ticket history and comments. If you
+ have additional useful information, please leave a comment, or consider
+ :ref:`sending a pull request <writing-patches>` with a fix.
-* search the `scrapy-users`_ list to see if it has been discussed there, or
- if you're not sure if what you're seeing is a bug. You can also ask in the
- `#scrapy` IRC channel.
+* search the `scrapy-users`_ list and `Scrapy subreddit`_ to see if it has
+ been discussed there, or if you're not sure if what you're seeing is a bug.
+ You can also ask in the `#scrapy` IRC channel.
* write **complete, reproducible, specific bug reports**. The smaller the test
case, the better. Remember that other developers won't have your project to
@@ -54,12 +63,20 @@ guidelines when reporting a new bug.
it. See for example StackOverflow's guide on creating a
`Minimal, Complete, and Verifiable example`_ exhibiting the issue.
+* the most awesome way to provide a complete, reproducible example is to
+ send a pull request which adds a failing test case to the
+ Scrapy testing suite (see :ref:`submitting-patches`).
+ This is helpful even if you don't intend
+ to fix the issue yourself.
+
* include the output of ``scrapy version -v`` so developers working on your bug
know exactly which version and platform it occurred on, which is often very
helpful for reproducing it, or knowing if it was already fixed.
.. _Minimal, Complete, and Verifiable example: https://stackoverflow.com/help/mcve
+.. _writing-patches:
+
Writing patches
===============
@@ -83,6 +100,8 @@ Well-written patches should:
the documentation changes in the same patch. See `Documentation policies`_
below.
+.. _submitting-patches:
+
Submitting patches
==================
@@ -98,13 +117,31 @@ patch, but it's always good to have a patch ready to illustrate your arguments
and show that you have put some additional thought into the subject. A good
starting point is to send a pull request on GitHub. It can be simple enough to
illustrate your idea, and leave documentation/tests for later, after the idea
-has been validated and proven useful. Alternatively, you can send an email to
-`scrapy-users`_ to discuss your idea first.
+has been validated and proven useful. Alternatively, you can start a
+conversation in the `Scrapy subreddit`_ to discuss your idea first.
+
+Sometimes there is an existing pull request for the problem you'd like to
+solve, which is stalled for some reason. Often the pull request is headed in
+the right direction, but changes have been requested by Scrapy maintainers, and
+the original pull request author hasn't had time to address them.
+In this case, consider picking up this pull request: open
+a new pull request with all commits from the original pull request, as well as
+additional changes to address the raised issues. Doing so helps a lot; it is
+not considered rude as long as the original author is acknowledged by keeping
+his/her commits.
+
+You can pull an existing pull request to a local branch
+by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE``
+(replace 'upstream' with a remote name for the Scrapy repository,
+``$PR_NUMBER`` with the ID of the pull request, and ``$BRANCH_NAME_TO_CREATE``
+with the name of the branch you want to create locally).
+See also: https://help.github.com/articles/checking-out-pull-requests-locally/#modifying-an-inactive-pull-request-locally.
+
When writing GitHub pull requests, try to keep titles short but descriptive.
E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests"
prefer "Fix hanging when exception occurs in start_requests (#411)"
-instead of "Fix for #411".
-Complete titles make it easy to skim through the issue tracker.
+instead of "Fix for #411". Complete titles make it easy to skim through
+the issue tracker.
Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
removal, etc) in separate commits than functional changes. This will make pull
@@ -121,21 +158,29 @@ Scrapy:
* It's OK to use lines longer than 80 chars if it improves the code
readability.
-* Don't put your name in the code you contribute. Our policy is to keep
- the contributor's name in the `AUTHORS`_ file distributed with Scrapy.
+* Don't put your name in the code you contribute; git provides enough
+ metadata to identify the author of the code.
+ See https://help.github.com/articles/setting-your-username-in-git/ for
+ setup instructions.
Documentation policies
======================
* **Don't** use docstrings for documenting classes, or methods which are
- already documented in the official (sphinx) documentation. For example, the
- :meth:`ItemLoader.add_value` method should be documented in the sphinx
- documentation, not its docstring.
+ already documented in the official (sphinx) documentation. Alternatively,
+ **do** provide a docstring, but make sure the sphinx documentation uses
+ the autodoc_ extension to pull the docstring. For example, the
+ :meth:`ItemLoader.add_value` method should either be
+ documented only in the sphinx documentation (not in a docstring), or
+ it should have a docstring which is pulled into the sphinx documentation
+ using the autodoc_ extension.
* **Do** use docstrings for documenting functions not present in the official
(sphinx) documentation, such as functions from ``scrapy.utils`` package and
its sub-modules.
+.. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html
+
Tests
=====
@@ -188,6 +233,7 @@ And their unit-tests are in::
.. _issue tracker: https://github.com/scrapy/scrapy/issues
.. _scrapy-users: https://groups.google.com/forum/#!forum/scrapy-users
+.. _Scrapy subreddit: https://reddit.com/r/scrapy
.. _Twisted unit-testing framework: https://twistedmatrix.com/documents/current/core/development/policy/test-standard.html
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
diff --git a/docs/faq.rst b/docs/faq.rst
index f0ee20b..7a0628f 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -21,7 +21,7 @@ Python code.
In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like
comparing `jinja2`_ to `Django`_.
-.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://lxml.de/
.. _jinja2: http://jinja.pocoo.org/
.. _Django: https://www.djangoproject.com/
@@ -69,9 +69,11 @@ Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML pars
What Python versions does Scrapy support?
-----------------------------------------
-Scrapy is supported under Python 2.7 and Python 3.3+.
+Scrapy is supported under Python 2.7 and Python 3.4+
+under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
Python 2.6 support was dropped starting at Scrapy 0.20.
Python 3 support was added in Scrapy 1.1.
+PyPy support was added in Scrapy 1.4, PyPy3 support was added in Scrapy 1.5.
.. note::
For Python 3 support on Windows, it is recommended to use
diff --git a/docs/index.rst b/docs/index.rst
index 289fb2b..7e8c979 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,14 +13,14 @@ Having trouble? We'd like to help!
* Try the :doc:`FAQ <faq>` -- it's got answers to some common questions.
* Looking for specific information? Try the :ref:`genindex` or :ref:`modindex`.
-* Ask or search questions in `StackOverflow using the scrapy tag`_,
-* Search for information in the `archives of the scrapy-users mailing list`_, or
- `post a question`_.
+* Ask or search questions in `StackOverflow using the scrapy tag`_.
+* Ask or search questions in the `Scrapy subreddit`_.
+* Search for questions on the archives of the `scrapy-users mailing list`_.
* Ask a question in the `#scrapy IRC channel`_,
* Report bugs with Scrapy in our `issue tracker`_.
-.. _archives of the scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users
-.. _post a question: https://groups.google.com/forum/#!forum/scrapy-users
+.. _scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users
+.. _Scrapy subreddit: https://www.reddit.com/r/scrapy/
.. _StackOverflow using the scrapy tag: https://stackoverflow.com/tags/scrapy
.. _#scrapy IRC channel: irc://irc.freenode.net/scrapy
.. _issue tracker: https://github.com/scrapy/scrapy/issues
diff --git a/docs/intro/install.rst b/docs/intro/install.rst
index 9cec2ea..4a9aa3c 100644
--- a/docs/intro/install.rst
+++ b/docs/intro/install.rst
@@ -7,7 +7,8 @@ Installation guide
Installing Scrapy
=================
-Scrapy runs on Python 2.7 and Python 3.3 or above.
+Scrapy runs on Python 2.7 and Python 3.4 or above
+under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows
@@ -107,7 +108,7 @@ Python virtualenvs can be created to use Python 2 by default, or Python 3 by def
.. _virtualenv: https://virtualenv.pypa.io
.. _virtualenv installation instructions: https://virtualenv.pypa.io/en/stable/installation/
-.. _virtualenvwrapper: http://virtualenvwrapper.readthedocs.io/en/latest/install.html
+.. _virtualenvwrapper: https://virtualenvwrapper.readthedocs.io/en/latest/install.html
.. _user guide: https://virtualenv.pypa.io/en/stable/userguide/
@@ -132,12 +133,12 @@ Once you've installed `Anaconda`_ or `Miniconda`_, install Scrapy with::
.. _intro-install-ubuntu:
-Ubuntu 12.04 or above
+Ubuntu 14.04 or above
---------------------
Scrapy is currently tested with recent-enough versions of lxml,
twisted and pyOpenSSL, and is compatible with recent Ubuntu distributions.
-But it should support older versions of Ubuntu too, like Ubuntu 12.04,
+But it should support older versions of Ubuntu too, like Ubuntu 14.04,
albeit with potential issues with TLS connections.
**Don't** use the ``python-scrapy`` package provided by Ubuntu, they are
@@ -163,8 +164,8 @@ you can install Scrapy with ``pip`` after that::
pip install scrapy
.. note::
- The same non-python dependencies can be used to install Scrapy in Debian
- Wheezy (7.0) and above.
+ The same non-Python dependencies can be used to install Scrapy in Debian
+ Jessie (8.0) and above.
.. _intro-install-macos:
@@ -188,7 +189,7 @@ solutions:
that doesn't conflict with the rest of your system. Here's how to do it using
the `homebrew`_ package manager:
- * Install `homebrew`_ following the instructions in http://brew.sh/
+ * Install `homebrew`_ following the instructions in https://brew.sh/
* Update your ``PATH`` variable to state that homebrew packages should be
used before system packages (Change ``.bashrc`` to ``.zshrc`` accordingly
@@ -223,6 +224,29 @@ After any of these workarounds you should be able to install Scrapy::
pip install Scrapy
+PyPy
+----
+
+We recommend using the latest PyPy version. The version tested is 5.9.0.
+For PyPy3, only Linux installation was tested.
+
+Most Scrapy dependencies now have binary wheels for CPython, but not for PyPy.
+This means that these dependencies will be built during installation.
+On OS X, you are likely to face an issue with building the cryptography
+dependency; the solution to this problem is described
+`here <https://github.com/pyca/cryptography/issues/2692#issuecomment-272773481>`_,
+that is, to ``brew install openssl`` and then export the flags that this command
+recommends (only needed when installing Scrapy). Installing on Linux has no
+special issues besides installing build dependencies.
+Installing Scrapy with PyPy on Windows is not tested.
+
+You can check that scrapy is installed correctly by running ``scrapy bench``.
+If this command gives errors such as
+``TypeError: ... got 2 unexpected keyword arguments``, this means
+that setuptools was unable to pick up one PyPy-specific dependency.
+To fix this issue, run ``pip install 'PyPyDispatcher>=2.1.0'``.
+
+
.. _Python: https://www.python.org/
.. _pip: https://pip.pypa.io/en/latest/installing/
.. _lxml: http://lxml.de/
@@ -233,9 +257,9 @@ After any of these workarounds you should be able to install Scrapy::
.. _pyOpenSSL: https://pypi.python.org/pypi/pyOpenSSL
.. _setuptools: https://pypi.python.org/pypi/setuptools
.. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/
-.. _homebrew: http://brew.sh/
-.. _zsh: http://www.zsh.org/
-.. _Scrapinghub: http://scrapinghub.com
-.. _Anaconda: http://docs.continuum.io/anaconda/index
-.. _Miniconda: http://conda.pydata.org/docs/install/quick.html
-.. _conda-forge: https://conda-forge.github.io/
+.. _homebrew: https://brew.sh/
+.. _zsh: https://www.zsh.org/
+.. _Scrapinghub: https://scrapinghub.com
+.. _Anaconda: https://docs.anaconda.com/anaconda/
+.. _Miniconda: https://conda.io/docs/user-guide/install/index.html
+.. _conda-forge: https://conda-forge.org/
diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst
index 1da1a40..6f1c2c4 100644
--- a/docs/intro/overview.rst
+++ b/docs/intro/overview.rst
@@ -160,8 +160,8 @@ The next steps for you are to :ref:`install Scrapy <intro-install>`,
a full-blown Scrapy project and `join the community`_. Thanks for your
interest!
-.. _join the community: http://scrapy.org/community/
+.. _join the community: https://scrapy.org/community/
.. _web scraping: https://en.wikipedia.org/wiki/Web_scraping
.. _Amazon Associates Web Services: https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html
.. _Amazon S3: https://aws.amazon.com/s3/
-.. _Sitemaps: http://www.sitemaps.org
+.. _Sitemaps: https://www.sitemaps.org/index.html
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 3b3bd8d..20538e9 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -34,7 +34,7 @@ list of Python resources for non-programmers`_.
.. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers
.. _Dive Into Python 3: http://www.diveintopython3.net
.. _Python Tutorial: https://docs.python.org/3/tutorial
-.. _Learn Python The Hard Way: http://learnpythonthehardway.org/book/
+.. _Learn Python The Hard Way: https://learnpythonthehardway.org/book/
Creating a project
@@ -54,6 +54,8 @@ This will create a ``tutorial`` directory with the following contents::
__init__.py
items.py # project items definition file
+
+ middlewares.py # project middlewares file
pipelines.py # project pipelines file
@@ -452,7 +454,7 @@ For historic reasons, Scrapy appends to a given file instead of overwriting
its contents. If you run this command twice without removing the file
before the second time, you'll end up with a broken JSON file.
-You can also used other formats, like `JSON Lines`_::
+You can also use other formats, like `JSON Lines`_::
scrapy crawl quotes -o quotes.jl
diff --git a/docs/news.rst b/docs/news.rst
index e0f8eee..36ead3a 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -3,6 +3,118 @@
Release notes
=============
+Scrapy 1.5.0 (2017-12-29)
+-------------------------
+
+This release brings small new features and improvements across the codebase.
+Some highlights:
+
+* Google Cloud Storage is supported in FilesPipeline and ImagesPipeline.
+* Crawling with proxy servers becomes more efficient, as connections
+ to proxies can be reused now.
+* Warning, exception and logging messages are improved to make debugging
+ easier.
+* The ``scrapy parse`` command now allows setting custom request meta via
+ the ``--meta`` argument.
+* Compatibility with Python 3.6, PyPy and PyPy3 is improved;
+ PyPy and PyPy3 are now supported officially, by running tests on CI.
+* Better default handling of HTTP 308, 522 and 524 status codes.
+* Documentation is improved, as usual.
+
+Backwards Incompatible Changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Scrapy 1.5 drops support for Python 3.3.
+* Default Scrapy User-Agent now uses an https link to scrapy.org (:issue:`2983`).
+ **This is technically backwards-incompatible**; override
+ :setting:`USER_AGENT` if you relied on the old value.
+* Logging of settings overridden by ``custom_settings`` is fixed;
+ **this is technically backwards-incompatible** because the logger
+ changes from ``[scrapy.utils.log]`` to ``[scrapy.crawler]``. If you're
+ parsing Scrapy logs, please update your log parsers (:issue:`1343`).
+* LinkExtractor now ignores the ``m4v`` extension by default; this is a change
+ in behavior.
+* 522 and 524 status codes are added to ``RETRY_HTTP_CODES`` (:issue:`2851`)
+
+New features
+~~~~~~~~~~~~
+
+- Support ``<link>`` tags in ``Response.follow`` (:issue:`2785`)
+- Support for ``ptpython`` REPL (:issue:`2654`)
+- Google Cloud Storage support for FilesPipeline and ImagesPipeline
+ (:issue:`2923`).
+- New ``--meta`` option of the "scrapy parse" command allows passing additional
+ request.meta (:issue:`2883`)
+- Populate spider variable when using ``shell.inspect_response`` (:issue:`2812`)
+- Handle HTTP 308 Permanent Redirect (:issue:`2844`)
+- Add 522 and 524 to ``RETRY_HTTP_CODES`` (:issue:`2851`)
+- Log versions information at startup (:issue:`2857`)
+- ``scrapy.mail.MailSender`` now works in Python 3 (it requires Twisted 17.9.0)
+- Connections to proxy servers are reused (:issue:`2743`)
+- Add template for a downloader middleware (:issue:`2755`)
+- Explicit message for NotImplementedError when parse callback not defined
+ (:issue:`2831`)
+- CrawlerProcess got an option to disable installation of root log handler
+ (:issue:`2921`)
+- LinkExtractor now ignores ``m4v`` extension by default
+- Better log messages for responses over :setting:`DOWNLOAD_WARNSIZE` and
+ :setting:`DOWNLOAD_MAXSIZE` limits (:issue:`2927`)
+- Show warning when a URL is put to ``Spider.allowed_domains`` instead of
+ a domain (:issue:`2250`).
+
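A minimal sketch of the last point above (an illustration added by the editor, not part of the upstream commit; spider name and URLs are hypothetical placeholders): ``allowed_domains`` should list bare domains, and putting a full URL there now triggers a warning.

    import scrapy


    class DomainsExampleSpider(scrapy.Spider):
        name = 'domains-example'
        # Correct usage: bare domain names only.
        allowed_domains = ['example.com']
        # A full URL such as 'http://www.example.com/page' placed in
        # allowed_domains would now trigger the warning from issue 2250.
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            yield {'url': response.url}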
+Bug fixes
+~~~~~~~~~
+
+- Fix logging of settings overridden by ``custom_settings``;
+ **this is technically backwards-incompatible** because the logger
+ changes from ``[scrapy.utils.log]`` to ``[scrapy.crawler]``, so please
+ update your log parsers if needed (:issue:`1343`)
+- Default Scrapy User-Agent now uses an https link to scrapy.org (:issue:`2983`).
+ **This is technically backwards-incompatible**; override
+ :setting:`USER_AGENT` if you relied on the old value.
+- Fix PyPy and PyPy3 test failures, support them officially
+ (:issue:`2793`, :issue:`2935`, :issue:`2990`, :issue:`3050`, :issue:`2213`,
+ :issue:`3048`)
+- Fix DNS resolver when ``DNSCACHE_ENABLED=False`` (:issue:`2811`)
+- Add ``cryptography`` for Debian Jessie tox test env (:issue:`2848`)
+- Add verification to check if Request callback is callable (:issue:`2766`)
+- Port ``extras/qpsclient.py`` to Python 3 (:issue:`2849`)
+- Use getfullargspec under the scenes for Python 3 to stop DeprecationWarning
+ (:issue:`2862`)
+- Update deprecated test aliases (:issue:`2876`)
+- Fix ``SitemapSpider`` support for alternate links (:issue:`2853`)
+
+Docs
+~~~~
+
+- Added missing bullet point for the ``AUTOTHROTTLE_TARGET_CONCURRENCY``
+ setting. (:issue:`2756`)
+- Update Contributing docs, document new support channels
+ (:issue:`2762`, :issue:`3038`)
+- Include references to Scrapy subreddit in the docs
+- Fix broken links; use https:// for external links
+ (:issue:`2978`, :issue:`2982`, :issue:`2958`)
+- Document CloseSpider extension better (:issue:`2759`)
+- Use ``pymongo.collection.Collection.insert_one()`` in MongoDB example
+ (:issue:`2781`)
+- Spelling mistakes and typos
+ (:issue:`2828`, :issue:`2837`, :issue:`2884`, :issue:`2924`)
+- Clarify ``CSVFeedSpider.headers`` documentation (:issue:`2826`)
+- Document ``DontCloseSpider`` exception and clarify ``spider_idle``
+ (:issue:`2791`)
+- Update "Releases" section in README (:issue:`2764`)
+- Fix rst syntax in ``DOWNLOAD_FAIL_ON_DATALOSS`` docs (:issue:`2763`)
+- Small fix in description of startproject arguments (:issue:`2866`)
+- Clarify data types in Response.body docs (:issue:`2922`)
+- Add a note about ``request.meta['depth']`` to DepthMiddleware docs (:issue:`2374`)
+- Add a note about ``request.meta['dont_merge_cookies']`` to CookiesMiddleware
+ docs (:issue:`2999`)
+- Up-to-date example of project structure (:issue:`2964`, :issue:`2976`)
+- A better example of ItemExporters usage (:issue:`2989`)
+- Document ``from_crawler`` methods for spider and downloader middlewares
+ (:issue:`3019`)
+
+
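For illustration only (an editor's sketch, not part of the upstream commit), here is a minimal script combining two of the 1.5.0 features noted above: ``Response.follow`` accepting ``<link>`` elements, and the ``install_root_handler`` flag of ``CrawlerProcess``. The start URL and CSS selector are hypothetical placeholders.

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class NextLinkSpider(scrapy.Spider):
        name = 'nextlink'
        start_urls = ['http://example.com/']

        def parse(self, response):
            yield {'url': response.url,
                   'title': response.css('title::text').extract_first()}
            # New in 1.5: Response.follow also accepts <link> elements,
            # e.g. <link rel="next" href="...">, not only <a> anchors.
            for link in response.css('link[rel="next"]'):
                yield response.follow(link, callback=self.parse)


    if __name__ == '__main__':
        # New in 1.5: installation of the root log handler can be disabled.
        process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'},
                                 install_root_handler=False)
        process.crawl(NextLinkSpider)
        process.start()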
Scrapy 1.4.0 (2017-05-18)
-------------------------
@@ -12,7 +124,7 @@ but quite a few handy improvements nonetheless.
Scrapy now supports anonymous FTP sessions with customizable user and
password via the new :setting:`FTP_USER` and :setting:`FTP_PASSWORD` settings.
And if you're using Twisted version 17.1.0 or above, FTP is now available
-with Python 3.
+with Python 3.
There's a new :meth:`response.follow <scrapy.http.TextResponse.follow>` method
for creating requests; **it is now a recommended way to create Requests
@@ -407,7 +519,7 @@ Refactoring
- ``canonicalize_url`` has been moved to `w3lib.url`_ (:issue:`2168`).
-.. _w3lib.url: http://w3lib.readthedocs.io/en/latest/w3lib.html#w3lib.url.canonicalize_url
+.. _w3lib.url: https://w3lib.readthedocs.io/en/latest/w3lib.html#w3lib.url.canonicalize_url
Tests & Requirements
~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/topics/autothrottle.rst b/docs/topics/autothrottle.rst
index b83946a..c9bece7 100644
--- a/docs/topics/autothrottle.rst
+++ b/docs/topics/autothrottle.rst
@@ -88,6 +88,7 @@ The settings used to control the AutoThrottle extension are:
* :setting:`AUTOTHROTTLE_ENABLED`
* :setting:`AUTOTHROTTLE_START_DELAY`
* :setting:`AUTOTHROTTLE_MAX_DELAY`
+* :setting:`AUTOTHROTTLE_TARGET_CONCURRENCY`
* :setting:`AUTOTHROTTLE_DEBUG`
* :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`
* :setting:`CONCURRENT_REQUESTS_PER_IP`
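For reference, an illustrative sketch by the editor (not part of the diff above) of how these AutoThrottle settings might look in a project's ``settings.py``; the values are placeholders, not recommendations.

    # AutoThrottle configuration in a project's settings.py.
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_START_DELAY = 5.0
    AUTOTHROTTLE_MAX_DELAY = 60.0
    # Average number of requests Scrapy should send in parallel to each site.
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    AUTOTHROTTLE_DEBUG = False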
diff --git a/docs/topics/broad-crawls.rst b/docs/topics/broad-crawls.rst
index 28ed7c0..eb02086 100644
--- a/docs/topics/broad-crawls.rst
+++ b/docs/topics/broad-crawls.rst
@@ -20,7 +20,7 @@ These are some common properties often found in broad crawls:
* they crawl many domains (often, unbounded) instead of a specific set of sites
-* they don't necessarily crawl domains to completion, because it would
+* they don't necessarily crawl domains to completion, because it would be
impractical (or impossible) to do so, and instead limit the crawl by time or
number of pages crawled
@@ -85,8 +85,8 @@ When doing broad crawls you are often only interested in the crawl rates you
get and any errors found. These stats are reported by Scrapy when using the
``INFO`` log level. In order to save CPU (and log storage requirements) you
should not use ``DEBUG`` log level when performing large broad crawls in
-production. Using ``DEBUG`` level when developing your (broad) crawler may fine
-though.
+production. Using ``DEBUG`` level when developing your (broad) crawler may be
+fine though.
To set the log level use::
diff --git a/docs/topics/commands.rst b/docs/topics/commands.rst
index 3e69c4e..3088017 100644
--- a/docs/topics/commands.rst
+++ b/docs/topics/commands.rst
@@ -55,6 +55,7 @@ structure by default, similar to this::
myproject/
__init__.py
items.py
+ middlewares.py
pipelines.py
settings.py
spiders/
@@ -187,7 +188,7 @@ startproject
Creates a new Scrapy project named ``project_name``, under the ``project_dir``
directory.
-If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``myproject``.
+If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``project_name``.
Usage example::
@@ -430,6 +431,9 @@ Supported options:
* ``--callback`` or ``-c``: spider method to use as callback for parsing the
response
+* ``--meta`` or ``-m``: additional request meta that will be passed to the callback
+ request. This must be a valid JSON string. Example: --meta='{"foo" : "bar"}'
+
* ``--pipelines``: process items through pipelines
* ``--rules`` or ``-r``: use :class:`~scrapy.spiders.CrawlSpider`
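To illustrate the new ``--meta`` option documented in the hunk above (an editor's sketch, not part of the commit; spider name, callback and URL are hypothetical): meta passed on the command line, e.g. ``scrapy parse --spider=example -c parse_item --meta='{"foo": "bar"}' http://example.com/page``, becomes visible as ``response.meta`` in the callback.

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def parse_item(self, response):
            # The JSON passed via --meta is merged into the request meta,
            # so it can be read here through response.meta.
            yield {'url': response.url, 'foo': response.meta.get('foo')}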
@@ -543,7 +547,7 @@ Example::
COMMANDS_MODULE = 'mybot.commands'
-.. _Deploying your project: http://scrapyd.readthedocs.org/en/latest/deploy.html
+.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html
Register commands via setup.py entry points
-------------------------------------------
diff --git a/docs/topics/debug.rst b/docs/topics/debug.rst
index a3e7209..d1991c0 100644
--- a/docs/topics/debug.rst
+++ b/docs/topics/debug.rst
@@ -142,4 +142,4 @@ available in all future runs should they be necessary again::
For more information, check the :ref:`topics-logging` section.
-.. _base tag: http://www.w3schools.com/tags/tag_base.asp
+.. _base tag: https://www.w3schools.com/tags/tag_base.asp
diff --git a/docs/topics/deploy.rst b/docs/topics/deploy.rst
... 4098 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-scrapy.git