[med-svn] [Git][med-team/mypy][master] 6 commits: routine-update: Standards-Version: 4.5.1
Andreas Tille
gitlab at salsa.debian.org
Tue Dec 8 21:16:25 GMT 2020
Andreas Tille pushed to branch master at Debian Med / mypy
Commits:
78ee7973 by Andreas Tille at 2020-12-08T11:08:09+01:00
routine-update: Standards-Version: 4.5.1
- - - - -
05913f0c by Andreas Tille at 2020-12-08T11:08:17+01:00
routine-update: Remove trailing whitespace in debian/rules
- - - - -
c3c6a0e4 by Andreas Tille at 2020-12-08T11:08:26+01:00
routine-update: watch file standard 4
- - - - -
a017471d by Andreas Tille at 2020-12-08T12:47:43+01:00
Do not hardcode Python3.8
- - - - -
443144d6 by Andreas Tille at 2020-12-08T21:39:51+01:00
Remove code from the test suite that is no longer valid on Python 3.9 and is no longer maintained upstream
- - - - -
96cfb5a4 by Andreas Tille at 2020-12-08T21:57:59+01:00
Upload to unstable
- - - - -
6 changed files:
- debian/changelog
- debian/control
- + debian/patches/015190f1fb2ce024c1ae022a7665c382b27167bf.patch
- debian/patches/series
- debian/rules
- debian/watch
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,14 @@
+mypy (0.790-3) unstable; urgency=medium
+
+ * Team upload.
+ * Standards-Version: 4.5.1 (routine-update)
+ * Remove trailing whitespace in debian/rules (routine-update)
+ * watch file standard 4 (routine-update)
+ * Do not hardcode Python3.8
+ Closes: #976779
+
+ -- Andreas Tille <tille at debian.org> Tue, 08 Dec 2020 21:40:01 +0100
+
mypy (0.790-2) unstable; urgency=medium
* debian/patches/py39: from upstream to support Python 3.9
=====================================
debian/control
=====================================
@@ -22,7 +22,7 @@ Build-Depends: debhelper-compat (= 13),
python3-typing-extensions,
python3-virtualenv <!nocheck>,
python3-all-dev
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
Vcs-Browser: https://salsa.debian.org/med-team/mypy
Vcs-Git: https://salsa.debian.org/med-team/mypy.git
Homepage: http://www.mypy-lang.org/
=====================================
debian/patches/015190f1fb2ce024c1ae022a7665c382b27167bf.patch
=====================================
@@ -0,0 +1,882 @@
+From 015190f1fb2ce024c1ae022a7665c382b27167bf Mon Sep 17 00:00:00 2001
+From: hauntsaninja <>
+Date: Sat, 29 Aug 2020 15:24:32 -0700
+Subject: [PATCH] samples: remove crawl.py
+
+This is no longer valid code on Python 3.9, so py39 CI fails.
+We've talked about not investing in keeping these samples up to date:
+https://github.com/python/mypy/pull/8838#issuecomment-630070909
+---
+ test-data/samples/crawl.py | 863 -------------------------------------
+ 1 file changed, 863 deletions(-)
+ delete mode 100644 test-data/samples/crawl.py
+
+diff --git a/test-data/samples/crawl.py b/test-data/samples/crawl.py
+deleted file mode 100644
+index 2caf631e0c..0000000000
+--- a/test-data/samples/crawl.py
++++ /dev/null
+@@ -1,863 +0,0 @@
+-#!/usr/bin/env python3.4
+-
+-"""A simple web crawler."""
+-
+-# This is cloned from <asyncio>/examples/crawl.py,
+-# with type annotations added (PEP 484).
+-#
+-# TODO: convert to `async def` + `await` (PEP 492).
+-
+-import argparse
+-import asyncio
+-import cgi
+-from http.client import BadStatusLine
+-import logging
+-import re
+-import sys
+-import time
+-import urllib.parse
+-from typing import Any, Generator, IO, Optional, Sequence, Set, Tuple, List, Dict
+-
+-
+-ARGS = argparse.ArgumentParser(description="Web crawler")
+-ARGS.add_argument(
+- '--iocp', action='store_true', dest='iocp',
+- default=False, help='Use IOCP event loop (Windows only)')
+-ARGS.add_argument(
+- '--select', action='store_true', dest='select',
+- default=False, help='Use Select event loop instead of default')
+-ARGS.add_argument(
+- 'roots', nargs='*',
+- default=[], help='Root URL (may be repeated)')
+-ARGS.add_argument(
+- '--max_redirect', action='store', type=int, metavar='N',
+- default=10, help='Limit redirection chains (for 301, 302 etc.)')
+-ARGS.add_argument(
+- '--max_tries', action='store', type=int, metavar='N',
+- default=4, help='Limit retries on network errors')
+-ARGS.add_argument(
+- '--max_tasks', action='store', type=int, metavar='N',
+- default=100, help='Limit concurrent connections')
+-ARGS.add_argument(
+- '--max_pool', action='store', type=int, metavar='N',
+- default=100, help='Limit connection pool size')
+-ARGS.add_argument(
+- '--exclude', action='store', metavar='REGEX',
+- help='Exclude matching URLs')
+-ARGS.add_argument(
+- '--strict', action='store_true',
+- default=True, help='Strict host matching (default)')
+-ARGS.add_argument(
+- '--lenient', action='store_false', dest='strict',
+- default=False, help='Lenient host matching')
+-ARGS.add_argument(
+- '-v', '--verbose', action='count', dest='level',
+- default=1, help='Verbose logging (repeat for more verbose)')
+-ARGS.add_argument(
+- '-q', '--quiet', action='store_const', const=0, dest='level',
+- default=1, help='Quiet logging (opposite of --verbose)')
+-
+-
+-ESCAPES = [('quot', '"'),
+- ('gt', '>'),
+- ('lt', '<'),
+- ('amp', '&') # Must be last.
+- ]
+-
+-
+-def unescape(url: str) -> str:
+- """Turn & into &, and so on.
+-
+- This is the inverse of cgi.escape().
+- """
+- for name, char in ESCAPES:
+- url = url.replace('&' + name + ';', char)
+- return url
+-
+-
+-def fix_url(url: str) -> str:
+- """Prefix a schema-less URL with http://."""
+- if '://' not in url:
+- url = 'http://' + url
+- return url
+-
+-
+-class Logger:
+-
+- def __init__(self, level: int) -> None:
+- self.level = level
+-
+- def _log(self, n: int, args: Sequence[Any]) -> None:
+- if self.level >= n:
+- print(*args, file=sys.stderr, flush=True)
+-
+- def log(self, n: int, *args: Any) -> None:
+- self._log(n, args)
+-
+- def __call__(self, n: int, *args: Any) -> None:
+- self._log(n, args)
+-
+-
+-KeyTuple = Tuple[str, int, bool]
+-
+-
+-class ConnectionPool:
+- """A connection pool.
+-
+- To open a connection, use reserve(). To recycle it, use unreserve().
+-
+- The pool is mostly just a mapping from (host, port, ssl) tuples to
+- lists of Connections. The currently active connections are *not*
+- in the data structure; get_connection() takes the connection out,
+- and recycle_connection() puts it back in. To recycle a
+- connection, call conn.close(recycle=True).
+-
+- There are limits to both the overall pool and the per-key pool.
+- """
+-
+- def __init__(self, log: Logger, max_pool: int = 10, max_tasks: int = 5) -> None:
+- self.log = log
+- self.max_pool = max_pool # Overall limit.
+- self.max_tasks = max_tasks # Per-key limit.
+- self.loop = asyncio.get_event_loop()
+- self.connections = {} # type: Dict[KeyTuple, List[Connection]]
+- self.queue = [] # type: List[Connection]
+-
+- def close(self) -> None:
+- """Close all connections available for reuse."""
+- for conns in self.connections.values():
+- for conn in conns:
+- conn.close()
+- self.connections.clear()
+- self.queue.clear()
+-
+- @asyncio.coroutine
+- def get_connection(self, host: str, port: int, ssl: bool) -> Generator[Any, None, 'Connection']:
+- """Create or reuse a connection."""
+- port = port or (443 if ssl else 80)
+- try:
+- ipaddrs = yield from self.loop.getaddrinfo(host, port)
+- except Exception as exc:
+- self.log(0, 'Exception %r for (%r, %r)' % (exc, host, port))
+- raise
+- self.log(1, '* %s resolves to %s' %
+- (host, ', '.join(ip[4][0] for ip in ipaddrs)))
+-
+- # Look for a reusable connection.
+- for _1, _2, _3, _4, (h, p, *_5) in ipaddrs:
+- key = h, p, ssl
+- conn = None
+- conns = self.connections.get(key)
+- while conns:
+- conn = conns.pop(0)
+- self.queue.remove(conn)
+- if not conns:
+- del self.connections[key]
+- if conn.stale():
+- self.log(1, 'closing stale connection for', key)
+- conn.close() # Just in case.
+- else:
+- self.log(1, '* Reusing pooled connection', key,
+- 'FD =', conn.fileno())
+- return conn
+-
+- # Create a new connection.
+- conn = Connection(self.log, self, host, port, ssl)
+- yield from conn.connect()
+- self.log(1, '* New connection', conn.key, 'FD =', conn.fileno())
+- return conn
+-
+- def recycle_connection(self, conn: 'Connection') -> None:
+- """Make a connection available for reuse.
+-
+- This also prunes the pool if it exceeds the size limits.
+- """
+- if conn.stale():
+- conn.close()
+- return
+-
+- key = conn.key
+- conns = self.connections.setdefault(key, [])
+- conns.append(conn)
+- self.queue.append(conn)
+-
+- if len(conns) <= self.max_tasks and len(self.queue) <= self.max_pool:
+- return
+-
+- # Prune the queue.
+-
+- # Close stale connections for this key first.
+- stale = [conn for conn in conns if conn.stale()]
+- if stale:
+- for conn in stale:
+- conns.remove(conn)
+- self.queue.remove(conn)
+- self.log(1, 'closing stale connection for', key)
+- conn.close()
+- if not conns:
+- del self.connections[key]
+-
+- # Close oldest connection(s) for this key if limit reached.
+- while len(conns) > self.max_tasks:
+- conn = conns.pop(0)
+- self.queue.remove(conn)
+- self.log(1, 'closing oldest connection for', key)
+- conn.close()
+-
+- if len(self.queue) <= self.max_pool:
+- return
+-
+- # Close overall stale connections.
+- stale = [conn for conn in self.queue if conn.stale()]
+- if stale:
+- for conn in stale:
+- conns = self.connections.get(conn.key)
+- conns.remove(conn)
+- self.queue.remove(conn)
+- self.log(1, 'closing stale connection for', key)
+- conn.close()
+-
+- # Close oldest overall connection(s) if limit reached.
+- while len(self.queue) > self.max_pool:
+- conn = self.queue.pop(0)
+- conns = self.connections.get(conn.key)
+- c = conns.pop(0)
+- assert conn == c, (conn.key, conn, c, conns)
+- self.log(1, 'closing overall oldest connection for', conn.key)
+- conn.close()
+-
+-
+-class Connection:
+-
+- def __init__(self, log: Logger, pool: ConnectionPool, host: str, port: int, ssl: bool) -> None:
+- self.log = log
+- self.pool = pool
+- self.host = host
+- self.port = port
+- self.ssl = ssl
+- self.reader = None # type: asyncio.StreamReader
+- self.writer = None # type: asyncio.StreamWriter
+- self.key = None # type: KeyTuple
+-
+- def stale(self) -> bool:
+- return self.reader is None or self.reader.at_eof()
+-
+- def fileno(self) -> Optional[int]:
+- writer = self.writer
+- if writer is not None:
+- transport = writer.transport
+- if transport is not None:
+- sock = transport.get_extra_info('socket')
+- if sock is not None:
+- return sock.fileno()
+- return None
+-
+- @asyncio.coroutine
+- def connect(self) -> Generator[Any, None, None]:
+- self.reader, self.writer = yield from asyncio.open_connection(
+- self.host, self.port, ssl=self.ssl)
+- peername = self.writer.get_extra_info('peername')
+- if peername:
+- self.host, self.port = peername[:2]
+- else:
+- self.log(1, 'NO PEERNAME???', self.host, self.port, self.ssl)
+- self.key = self.host, self.port, self.ssl
+-
+- def close(self, recycle: bool = False) -> None:
+- if recycle and not self.stale():
+- self.pool.recycle_connection(self)
+- else:
+- self.writer.close()
+- self.pool = self.reader = self.writer = None
+-
+-
+-class Request:
+- """HTTP request.
+-
+- Use connect() to open a connection; send_request() to send the
+- request; get_response() to receive the response headers.
+- """
+-
+- def __init__(self, log: Logger, url: str, pool: ConnectionPool) -> None:
+- self.log = log
+- self.url = url
+- self.pool = pool
+- self.parts = urllib.parse.urlparse(self.url)
+- self.scheme = self.parts.scheme
+- assert self.scheme in ('http', 'https'), repr(url)
+- self.ssl = self.parts.scheme == 'https'
+- self.netloc = self.parts.netloc
+- self.hostname = self.parts.hostname
+- self.port = self.parts.port or (443 if self.ssl else 80)
+- self.path = (self.parts.path or '/')
+- self.query = self.parts.query
+- if self.query:
+- self.full_path = '%s?%s' % (self.path, self.query)
+- else:
+- self.full_path = self.path
+- self.http_version = 'HTTP/1.1'
+- self.method = 'GET'
+- self.headers = [] # type: List[Tuple[str, str]]
+- self.conn = None # type: Connection
+-
+- @asyncio.coroutine
+- def connect(self) -> Generator[Any, None, None]:
+- """Open a connection to the server."""
+- self.log(1, '* Connecting to %s:%s using %s for %s' %
+- (self.hostname, self.port,
+- 'ssl' if self.ssl else 'tcp',
+- self.url))
+- self.conn = yield from self.pool.get_connection(self.hostname,
+- self.port, self.ssl)
+-
+- def close(self, recycle: bool = False) -> None:
+- """Close the connection, recycle if requested."""
+- if self.conn is not None:
+- if not recycle:
+- self.log(1, 'closing connection for', self.conn.key)
+- self.conn.close(recycle)
+- self.conn = None
+-
+- @asyncio.coroutine
+- def putline(self, line: str) -> None:
+- """Write a line to the connection.
+-
+- Used for the request line and headers.
+- """
+- self.log(2, '>', line)
+- self.conn.writer.write(line.encode('latin-1') + b'\r\n')
+-
+- @asyncio.coroutine
+- def send_request(self) -> Generator[Any, None, None]:
+- """Send the request."""
+- request_line = '%s %s %s' % (self.method, self.full_path,
+- self.http_version)
+- yield from self.putline(request_line)
+- # TODO: What if a header is already set?
+- self.headers.append(('User-Agent', 'asyncio-example-crawl/0.0'))
+- self.headers.append(('Host', self.netloc))
+- self.headers.append(('Accept', '*/*'))
+- # self.headers.append(('Accept-Encoding', 'gzip'))
+- for key, value in self.headers:
+- line = '%s: %s' % (key, value)
+- yield from self.putline(line)
+- yield from self.putline('')
+-
+- @asyncio.coroutine
+- def get_response(self) -> Generator[Any, None, 'Response']:
+- """Receive the response."""
+- response = Response(self.log, self.conn.reader)
+- yield from response.read_headers()
+- return response
+-
+-
+-class Response:
+- """HTTP response.
+-
+- Call read_headers() to receive the request headers. Then check
+- the status attribute and call get_header() to inspect the headers.
+- Finally call read() to receive the body.
+- """
+-
+- def __init__(self, log: Logger, reader: asyncio.StreamReader) -> None:
+- self.log = log
+- self.reader = reader
+- self.http_version = None # type: str # 'HTTP/1.1'
+- self.status = None # type: int # 200
+- self.reason = None # type: str # 'Ok'
+- self.headers = [] # type: List[Tuple[str, str]] # [('Content-Type', 'text/html')]
+-
+- @asyncio.coroutine
+- def getline(self) -> Generator[Any, None, str]:
+- """Read one line from the connection."""
+- line = (yield from self.reader.readline()).decode('latin-1').rstrip()
+- self.log(2, '<', line)
+- return line
+-
+- @asyncio.coroutine
+- def read_headers(self) -> Generator[Any, None, None]:
+- """Read the response status and the request headers."""
+- status_line = yield from self.getline()
+- status_parts = status_line.split(None, 2)
+- if len(status_parts) != 3:
+- self.log(0, 'bad status_line', repr(status_line))
+- raise BadStatusLine(status_line)
+- self.http_version, status, self.reason = status_parts
+- self.status = int(status)
+- while True:
+- header_line = yield from self.getline()
+- if not header_line:
+- break
+- # TODO: Continuation lines.
+- key, value = header_line.split(':', 1)
+- self.headers.append((key, value.strip()))
+-
+- def get_redirect_url(self, default: str = '') -> str:
+- """Inspect the status and return the redirect url if appropriate."""
+- if self.status not in (300, 301, 302, 303, 307):
+- return default
+- return self.get_header('Location', default)
+-
+- def get_header(self, key: str, default: str = '') -> str:
+- """Get one header value, using a case insensitive header name."""
+- key = key.lower()
+- for k, v in self.headers:
+- if k.lower() == key:
+- return v
+- return default
+-
+- @asyncio.coroutine
+- def read(self) -> Generator[Any, None, bytes]:
+- """Read the response body.
+-
+- This honors Content-Length and Transfer-Encoding: chunked.
+- """
+- nbytes = None
+- for key, value in self.headers:
+- if key.lower() == 'content-length':
+- nbytes = int(value)
+- break
+- if nbytes is None:
+- if self.get_header('transfer-encoding').lower() == 'chunked':
+- self.log(2, 'parsing chunked response')
+- blocks = []
+- while True:
+- size_header = yield from self.reader.readline()
+- if not size_header:
+- self.log(0, 'premature end of chunked response')
+- break
+- self.log(3, 'size_header =', repr(size_header))
+- parts = size_header.split(b';')
+- size = int(parts[0], 16)
+- if size:
+- self.log(3, 'reading chunk of', size, 'bytes')
+- block = yield from self.reader.readexactly(size)
+- assert len(block) == size, (len(block), size)
+- blocks.append(block)
+- crlf = yield from self.reader.readline()
+- assert crlf == b'\r\n', repr(crlf)
+- if not size:
+- break
+- body = b''.join(blocks)
+- self.log(1, 'chunked response had', len(body),
+- 'bytes in', len(blocks), 'blocks')
+- else:
+- self.log(3, 'reading until EOF')
+- body = yield from self.reader.read()
+- # TODO: Should make sure not to recycle the connection
+- # in this case.
+- else:
+- body = yield from self.reader.readexactly(nbytes)
+- return body
+-
+-
+-class Fetcher:
+- """Logic and state for one URL.
+-
+- When found in crawler.busy, this represents a URL to be fetched or
+- in the process of being fetched; when found in crawler.done, this
+- holds the results from fetching it.
+-
+- This is usually associated with a task. This references the
+- crawler for the connection pool and to add more URLs to its todo
+- list.
+-
+- Call fetch() to do the fetching, then report() to print the results.
+- """
+-
+- def __init__(self, log: Logger, url: str, crawler: 'Crawler',
+- max_redirect: int = 10, max_tries: int = 4) -> None:
+- self.log = log
+- self.url = url
+- self.crawler = crawler
+- # We don't loop resolving redirects here -- we just use this
+- # to decide whether to add the redirect URL to crawler.todo.
+- self.max_redirect = max_redirect
+- # But we do loop to retry on errors a few times.
+- self.max_tries = max_tries
+- # Everything we collect from the response goes here.
+- self.task = None # type: asyncio.Task
+- self.exceptions = [] # type: List[Exception]
+- self.tries = 0
+- self.request = None # type: Request
+- self.response = None # type: Response
+- self.body = None # type: bytes
+- self.next_url = None # type: str
+- self.ctype = None # type: str
+- self.pdict = None # type: Dict[str, str]
+- self.encoding = None # type: str
+- self.urls = None # type: Set[str]
+- self.new_urls = None # type: Set[str]
+-
+- @asyncio.coroutine
+- def fetch(self) -> Generator[Any, None, None]:
+- """Attempt to fetch the contents of the URL.
+-
+- If successful, and the data is HTML, extract further links and
+- add them to the crawler. Redirects are also added back there.
+- """
+- while self.tries < self.max_tries:
+- self.tries += 1
+- self.request = None
+- try:
+- self.request = Request(self.log, self.url, self.crawler.pool)
+- yield from self.request.connect()
+- yield from self.request.send_request()
+- self.response = yield from self.request.get_response()
+- self.body = yield from self.response.read()
+- h_conn = self.response.get_header('connection').lower()
+- if h_conn != 'close':
+- self.request.close(recycle=True)
+- self.request = None
+- if self.tries > 1:
+- self.log(1, 'try', self.tries, 'for', self.url, 'success')
+- break
+- except (BadStatusLine, OSError) as exc:
+- self.exceptions.append(exc)
+- self.log(1, 'try', self.tries, 'for', self.url,
+- 'raised', repr(exc))
+- # import pdb; pdb.set_trace()
+- # Don't reuse the connection in this case.
+- finally:
+- if self.request is not None:
+- self.request.close()
+- else:
+- # We never broke out of the while loop, i.e. all tries failed.
+- self.log(0, 'no success for', self.url,
+- 'in', self.max_tries, 'tries')
+- return
+- next_url = self.response.get_redirect_url()
+- if next_url:
+- self.next_url = urllib.parse.urljoin(self.url, next_url)
+- if self.max_redirect > 0:
+- self.log(1, 'redirect to', self.next_url, 'from', self.url)
+- self.crawler.add_url(self.next_url, self.max_redirect - 1)
+- else:
+- self.log(0, 'redirect limit reached for', self.next_url,
+- 'from', self.url)
+- else:
+- if self.response.status == 200:
+- self.ctype = self.response.get_header('content-type')
+- self.pdict = {}
+- if self.ctype:
+- self.ctype, self.pdict = cgi.parse_header(self.ctype)
+- self.encoding = self.pdict.get('charset', 'utf-8')
+- if self.ctype == 'text/html':
+- body = self.body.decode(self.encoding, 'replace')
+- # Replace href with (?:href|src) to follow image links.
+- self.urls = set(re.findall(r'(?i)href=["\']?([^\s"\'<>]+)',
+- body))
+- if self.urls:
+- self.log(1, 'got', len(self.urls),
+- 'distinct urls from', self.url)
+- self.new_urls = set()
+- for url in self.urls:
+- url = unescape(url)
+- url = urllib.parse.urljoin(self.url, url)
+- url, frag = urllib.parse.urldefrag(url)
+- if self.crawler.add_url(url):
+- self.new_urls.add(url)
+-
+- def report(self, stats: 'Stats', file: IO[str] = None) -> None:
+- """Print a report on the state for this URL.
+-
+- Also update the Stats instance.
+- """
+- if self.task is not None:
+- if not self.task.done():
+- stats.add('pending')
+- print(self.url, 'pending', file=file)
+- return
+- elif self.task.cancelled():
+- stats.add('cancelled')
+- print(self.url, 'cancelled', file=file)
+- return
+- elif self.task.exception():
+- stats.add('exception')
+- exc = self.task.exception()
+- stats.add('exception_' + exc.__class__.__name__)
+- print(self.url, exc, file=file)
+- return
+- if len(self.exceptions) == self.tries:
+- stats.add('fail')
+- exc = self.exceptions[-1]
+- stats.add('fail_' + str(exc.__class__.__name__))
+- print(self.url, 'error', exc, file=file)
+- elif self.next_url:
+- stats.add('redirect')
+- print(self.url, self.response.status, 'redirect', self.next_url,
+- file=file)
+- elif self.ctype == 'text/html':
+- stats.add('html')
+- size = len(self.body or b'')
+- stats.add('html_bytes', size)
+- if self.log.level:
+- print(self.url, self.response.status,
+- self.ctype, self.encoding,
+- size,
+- '%d/%d' % (len(self.new_urls or ()), len(self.urls or ())),
+- file=file)
+- elif self.response is None:
+- print(self.url, 'no response object')
+- else:
+- size = len(self.body or b'')
+- if self.response.status == 200:
+- stats.add('other')
+- stats.add('other_bytes', size)
+- else:
+- stats.add('error')
+- stats.add('error_bytes', size)
+- stats.add('status_%s' % self.response.status)
+- print(self.url, self.response.status,
+- self.ctype, self.encoding,
+- size,
+- file=file)
+-
+-
+-class Stats:
+- """Record stats of various sorts."""
+-
+- def __init__(self) -> None:
+- self.stats = {} # type: Dict[str, int]
+-
+- def add(self, key: str, count: int = 1) -> None:
+- self.stats[key] = self.stats.get(key, 0) + count
+-
+- def report(self, file: IO[str] = None) -> None:
+- for key, count in sorted(self.stats.items()):
+- print('%10d' % count, key, file=file)
+-
+-
+-class Crawler:
+- """Crawl a set of URLs.
+-
+- This manages three disjoint sets of URLs (todo, busy, done). The
+- data structures actually store dicts -- the values in todo give
+- the redirect limit, while the values in busy and done are Fetcher
+- instances.
+- """
+- def __init__(self, log: Logger,
+- roots: Set[str], exclude: str = None, strict: bool = True, # What to crawl.
+- max_redirect: int = 10, max_tries: int = 4, # Per-url limits.
+- max_tasks: int = 10, max_pool: int = 10, # Global limits.
+- ) -> None:
+- self.log = log
+- self.roots = roots
+- self.exclude = exclude
+- self.strict = strict
+- self.max_redirect = max_redirect
+- self.max_tries = max_tries
+- self.max_tasks = max_tasks
+- self.max_pool = max_pool
+- self.todo = {} # type: Dict[str, int]
+- self.busy = {} # type: Dict[str, Fetcher]
+- self.done = {} # type: Dict[str, Fetcher]
+- self.pool = ConnectionPool(self.log, max_pool, max_tasks)
+- self.root_domains = set() # type: Set[str]
+- for root in roots:
+- host = urllib.parse.urlparse(root).hostname
+- if not host:
+- continue
+- if re.match(r'\A[\d\.]*\Z', host):
+- self.root_domains.add(host)
+- else:
+- host = host.lower()
+- if self.strict:
+- self.root_domains.add(host)
+- if host.startswith('www.'):
+- self.root_domains.add(host[4:])
+- else:
+- self.root_domains.add('www.' + host)
+- else:
+- parts = host.split('.')
+- if len(parts) > 2:
+- host = '.'.join(parts[-2:])
+- self.root_domains.add(host)
+- for root in roots:
+- self.add_url(root)
+- self.governor = asyncio.Semaphore(max_tasks)
+- self.termination = asyncio.Condition()
+- self.t0 = time.time()
+- self.t1 = None # type: Optional[float]
+-
+- def close(self) -> None:
+- """Close resources (currently only the pool)."""
+- self.pool.close()
+-
+- def host_okay(self, host: str) -> bool:
+- """Check if a host should be crawled.
+-
+- A literal match (after lowercasing) is always good. For hosts
+- that don't look like IP addresses, some approximate matches
+- are okay depending on the strict flag.
+- """
+- host = host.lower()
+- if host in self.root_domains:
+- return True
+- if re.match(r'\A[\d\.]*\Z', host):
+- return False
+- if self.strict:
+- return self._host_okay_strictish(host)
+- else:
+- return self._host_okay_lenient(host)
+-
+- def _host_okay_strictish(self, host: str) -> bool:
+- """Check if a host should be crawled, strict-ish version.
+-
+- This checks for equality modulo an initial 'www.' component.
+- """
+- if host.startswith('www.'):
+- if host[4:] in self.root_domains:
+- return True
+- else:
+- if 'www.' + host in self.root_domains:
+- return True
+- return False
+-
+- def _host_okay_lenient(self, host: str) -> bool:
+- """Check if a host should be crawled, lenient version.
+-
+- This compares the last two components of the host.
+- """
+- parts = host.split('.')
+- if len(parts) > 2:
+- host = '.'.join(parts[-2:])
+- return host in self.root_domains
+-
+- def add_url(self, url: str, max_redirect: int = None) -> bool:
+- """Add a URL to the todo list if not seen before."""
+- if self.exclude and re.search(self.exclude, url):
+- return False
+- parsed = urllib.parse.urlparse(url)
+- if parsed.scheme not in ('http', 'https'):
+- self.log(2, 'skipping non-http scheme in', url)
+- return False
+- host = parsed.hostname
+- if not self.host_okay(host):
+- self.log(2, 'skipping non-root host in', url)
+- return False
+- if max_redirect is None:
+- max_redirect = self.max_redirect
+- if url in self.todo or url in self.busy or url in self.done:
+- return False
+- self.log(1, 'adding', url, max_redirect)
+- self.todo[url] = max_redirect
+- return True
+-
+- @asyncio.coroutine
+- def crawl(self) -> Generator[Any, None, None]:
+- """Run the crawler until all finished."""
+- with (yield from self.termination):
+- while self.todo or self.busy:
+- if self.todo:
+- url, max_redirect = self.todo.popitem()
+- fetcher = Fetcher(self.log, url,
+- crawler=self,
+- max_redirect=max_redirect,
+- max_tries=self.max_tries,
+- )
+- self.busy[url] = fetcher
+- fetcher.task = asyncio.Task(self.fetch(fetcher))
+- else:
+- yield from self.termination.wait()
+- self.t1 = time.time()
+-
+- @asyncio.coroutine
+- def fetch(self, fetcher: Fetcher) -> Generator[Any, None, None]:
+- """Call the Fetcher's fetch(), with a limit on concurrency.
+-
+- Once this returns, move the fetcher from busy to done.
+- """
+- url = fetcher.url
+- with (yield from self.governor):
+- try:
+- yield from fetcher.fetch() # Fetcher gonna fetch.
+- finally:
+- # Force GC of the task, so the error is logged.
+- fetcher.task = None
+- with (yield from self.termination):
+- self.done[url] = fetcher
+- del self.busy[url]
+- self.termination.notify()
+-
+- def report(self, file: IO[str] = None) -> None:
+- """Print a report on all completed URLs."""
+- if self.t1 is None:
+- self.t1 = time.time()
+- dt = self.t1 - self.t0
+- if dt and self.max_tasks:
+- speed = len(self.done) / dt / self.max_tasks
+- else:
+- speed = 0
+- stats = Stats()
+- print('*** Report ***', file=file)
+- try:
+- show = [] # type: List[Tuple[str, Fetcher]]
+- show.extend(self.done.items())
+- show.extend(self.busy.items())
+- show.sort()
+- for url, fetcher in show:
+- fetcher.report(stats, file=file)
+- except KeyboardInterrupt:
+- print('\nInterrupted', file=file)
+- print('Finished', len(self.done),
+- 'urls in %.3f secs' % dt,
+- '(max_tasks=%d)' % self.max_tasks,
+- '(%.3f urls/sec/task)' % speed,
+- file=file)
+- stats.report(file=file)
+- print('Todo:', len(self.todo), file=file)
+- print('Busy:', len(self.busy), file=file)
+- print('Done:', len(self.done), file=file)
+- print('Date:', time.ctime(), 'local time', file=file)
+-
+-
+-def main() -> None:
+- """Main program.
+-
+- Parse arguments, set up event loop, run crawler, print report.
+- """
+- args = ARGS.parse_args()
+- if not args.roots:
+- print('Use --help for command line help')
+- return
+-
+- log = Logger(args.level)
+-
+- if args.iocp:
+- if sys.platform == 'win32':
+- from asyncio import ProactorEventLoop
+- loop = ProactorEventLoop() # type: ignore
+- asyncio.set_event_loop(loop)
+- else:
+- assert False
+- elif args.select:
+- loop = asyncio.SelectorEventLoop() # type: ignore
+- asyncio.set_event_loop(loop)
+- else:
+- loop = asyncio.get_event_loop() # type: ignore
+-
+- roots = {fix_url(root) for root in args.roots}
+-
+- crawler = Crawler(log,
+- roots, exclude=args.exclude,
+- strict=args.strict,
+- max_redirect=args.max_redirect,
+- max_tries=args.max_tries,
+- max_tasks=args.max_tasks,
+- max_pool=args.max_pool,
+- )
+- try:
+- loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl.
+- except KeyboardInterrupt:
+- sys.stderr.flush()
+- print('\nInterrupted\n')
+- finally:
+- crawler.report()
+- crawler.close()
+- loop.close()
+-
+-
+-if __name__ == '__main__':
+- logging.basicConfig(level=logging.INFO) # type: ignore
+- main()
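The patch above only says crawl.py "is no longer valid code on Python 3.9". One likely culprit (not stated in the patch, so treat it as an assumption) is that Python 3.9 removed support for acquiring asyncio locks, semaphores and conditions via "with (yield from lock)" or "with await lock" (deprecated since 3.7), a pattern crawl.py uses in Crawler.crawl() and Crawler.fetch(). A minimal sketch of the Python 3.9 spelling, assuming that is the breaking construct:

import asyncio

async def fetch_with_limit(governor: asyncio.Semaphore) -> None:
    # "async with" on the primitive itself replaces the removed
    # "with (yield from governor):" pattern used in crawl.py.
    async with governor:
        await asyncio.sleep(0)  # stand-in for the real fetch work

async def main() -> None:
    governor = asyncio.Semaphore(10)
    await asyncio.gather(*(fetch_with_limit(governor) for _ in range(3)))

if __name__ == "__main__":
    asyncio.run(main())

Rather than convert the sample (its own TODO already notes the pending "async def"/"await" migration), upstream chose to delete it, as the linked pull-request discussion explains.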
=====================================
debian/patches/series
=====================================
@@ -3,3 +3,4 @@ older_pytest
mypyc_analysis
ignore_mypyc
verbose
+015190f1fb2ce024c1ae022a7665c382b27167bf.patch
=====================================
debian/rules
=====================================
@@ -75,8 +75,7 @@ ifeq (,$(filter nocheck,$(DEB_BUILD_PROFILES)))
# --config-file {dir}/mypy_self_check.ini -p mypy" dh_auto_test
# ^^^ requires pytest > 6.0.0
dh_auto_install
- #set -e; for v in $(PY3VERS); do
- set -e; for v in 3.8; do \
+ set -e; for v in $(PY3VERS); do \
PATH=$$PATH:$(CURDIR)/debian/mypy/usr/bin/ python$$v -m pytest -n auto \
-o testpaths=mypy/test -o python_files=test*.py \
-k "not StubtestMiscUnit" \
=====================================
debian/watch
=====================================
@@ -1,3 +1,3 @@
-version=3
+version=4
opts=uversionmangle=s/(rc|a|b|c)/~$1/ \
https://pypi.debian.net/mypy/mypy-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))
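One note on the retained uversionmangle line in debian/watch: it prefixes upstream pre-release markers with "~" so that, under dpkg version ordering, a release candidate sorts before the corresponding final release. A small stand-alone illustration of the same substitution (my own sketch; the 0.800rc1 version string is hypothetical):

import re

# Same first-match substitution as the watch file's uversionmangle rule
# s/(rc|a|b|c)/~$1/: the pre-release marker gains a "~" prefix, and dpkg
# sorts "~" before anything, so 0.800~rc1 orders before the final 0.800.
def mangle(upstream_version: str) -> str:
    return re.sub(r"(rc|a|b|c)", r"~\1", upstream_version, count=1)

print(mangle("0.800rc1"))  # hypothetical pre-release -> 0.800~rc1
print(mangle("0.790"))     # released version stays unchanged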
View it on GitLab: https://salsa.debian.org/med-team/mypy/-/compare/af00356f4b834627dc9e8604f9df61fb6b2ae5c1...96cfb5a427333ff3c4d70113f063916ac2af829d
--
You're receiving this email because of your account on salsa.debian.org.