[med-svn] [Git][med-team/biomaj3-download][upstream] New upstream version 3.1.0
Olivier Sallou
gitlab at salsa.debian.org
Tue Nov 12 10:52:58 GMT 2019
Olivier Sallou pushed to branch upstream at Debian Med / biomaj3-download
Commits:
d6208921 by Olivier Sallou at 2019-11-12T10:18:07Z
New upstream version 3.1.0
- - - - -
13 changed files:
- .travis.yml
- CHANGES.txt
- README.md
- biomaj_download/download/ftp.py → biomaj_download/download/curl.py
- biomaj_download/download/direct.py
- − biomaj_download/download/http.py
- biomaj_download/download/interface.py
- biomaj_download/download/localcopy.py
- biomaj_download/download/protocolirods.py
- biomaj_download/download/rsync.py
- biomaj_download/downloadservice.py
- setup.py
- tests/biomaj_tests.py
Changes:
=====================================
.travis.yml
=====================================
@@ -20,7 +20,7 @@ install:
- pip install python-coveralls
- python setup.py -q install
script:
-- nosetests -a '!network'
+- nosetests -a '!network,!local_irods'
- flake8 --ignore E501 biomaj_download/*.py biomaj_download/download
deploy:
provider: pypi
=====================================
CHANGES.txt
=====================================
@@ -1,3 +1,6 @@
+3.1.0:
+ #16 Don't change name after download in DirectHTTPDownloader
+ PR #7 Refactor downloaders (*WARNING* breaks API)
3.0.27:
Fix previous release broken with a bug in direct protocols
3.0.26:
=====================================
README.md
=====================================
@@ -17,6 +17,19 @@ To compile protobuf, in biomaj_download/message:
flake8 biomaj_download/\*.py biomaj_download/download
+# Test
+
+To run the test suite, use:
+
+ nosetests -a '!local_irods' tests/biomaj_tests.py
+
+This command skips the tests that need a local iRODS server.
+
+Some tests might fail due to network issues. You can skip them with:
+
+ nosetests -a '!network' tests/biomaj_tests.py
+
+(To skip both the local iRODS tests and the network tests, use `-a '!network,!local_irods'`.)
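+
+Tests are tagged with nose's attrib plugin so that the `-a` flags above can
+select them. A minimal sketch (the test name is hypothetical):
+
+    from nose.plugins.attrib import attr
+
+    @attr('network')
+    def test_remote_listing():
+        pass  # skipped when nosetests is invoked with -a '!network'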
# Run
=====================================
biomaj_download/download/ftp.py → biomaj_download/download/curl.py
=====================================
@@ -1,15 +1,24 @@
-import pycurl
-import re
+import sys
import os
-import time
+import re
from datetime import datetime
-import stat
import hashlib
+import time
+import stat
+
+import pycurl
import ftputil
+import humanfriendly
+
from biomaj_core.utils import Utils
from biomaj_download.download.interface import DownloadInterface
+if sys.version_info[0] < 3:
+ from urllib import urlencode
+else:
+ from urllib.parse import urlencode
+
try:
from io import BytesIO
except ImportError:
@@ -58,9 +67,31 @@ if 'filemode' not in stat.__dict__:
stat.filemode = _filemode
-class FTPDownload(DownloadInterface):
+class HTTPParse(object):
+
+ def __init__(self, dir_line, file_line, dir_name=1, dir_date=2, file_name=1, file_date=2, file_date_format=None, file_size=3):
+ r'''
+ http.parse.dir.line: <img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+ http.parse.file.line: <img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
+ http.group.dir.name: 1
+ http.group.dir.date: 2
+ http.group.file.name: 1
+ http.group.file.date: 2
+ http.group.file.size: 3
+ '''
+ self.dir_line = dir_line
+ self.file_line = file_line
+ self.dir_name = dir_name
+ self.dir_date = dir_date
+ self.file_name = file_name
+ self.file_date = file_date
+ self.file_size = file_size
+ self.file_date_format = file_date_format
+
+
+class CurlDownload(DownloadInterface):
'''
- Base class to download files from FTP
+ Base class to download files from FTP(S), HTTP(S) and SFTP.
protocol=ftp
server=ftp.ncbi.nih.gov
@@ -69,6 +100,12 @@ class FTPDownload(DownloadInterface):
remote.files=^alu.*\\.gz$
'''
+
+ FTP_PROTOCOL_FAMILY = ["ftp", "ftps"]
+ HTTP_PROTOCOL_FAMILY = ["http", "https"]
+ SFTP_PROTOCOL_FAMILY = ["sftp"]
+ ALL_PROTOCOLS = FTP_PROTOCOL_FAMILY + HTTP_PROTOCOL_FAMILY + SFTP_PROTOCOL_FAMILY
+
# Utilities to parse ftp listings: UnixParser is the most common, hence we
# put it first
ftp_listing_parsers = [
@@ -76,14 +113,55 @@ class FTPDownload(DownloadInterface):
ftputil.stat.MSParser(),
]
- def __init__(self, protocol, host, rootdir):
+ def __init__(self, curl_protocol, host, rootdir, http_parse=None):
+ """
+ Initialize a CurlDownloader.
+
+ :param curl_protocol: (real) protocol to use
+ :type curl_protocol: str (see :py:var:~CurlDownload.ALL_PROTOCOLS)
+
+ :param host: server name
+ :type host: str
+
+ :param rootdir: base directory
+ :type rootdir: str
+
+ :param http_parse: object used to extract file information from HTML pages
+ :type http_parse: py:class:HTTPParse.
+ """
DownloadInterface.__init__(self)
self.logger.debug('Download')
- self.crl = pycurl.Curl()
- url = protocol + '://' + host
+ # Initialize curl_protocol.
+ # Note that we don't change that field in set_protocol since this
+ # method uses the protocol from the configuration file. It's not clear
+ # what to do in this case.
+ curl_protocol = curl_protocol.lower()
+ if curl_protocol not in self.ALL_PROTOCOLS:
+ raise ValueError("curl_protocol must be one of %s (case insensitive). Got %s." % (self.ALL_PROTOCOLS, curl_protocol))
+ self.curl_protocol = curl_protocol
+ # Initialize protocol specific constants
+ if self.curl_protocol in self.FTP_PROTOCOL_FAMILY:
+ self.protocol_family = "ftp"
+ self._parse_result = self._ftp_parse_result
+ self.ERRCODE_OK = 226
+ elif self.curl_protocol in self.HTTP_PROTOCOL_FAMILY:
+ self.protocol_family = "http"
+ self._parse_result = self._http_parse_result
+ self.ERRCODE_OK = 200
+ elif self.curl_protocol in self.SFTP_PROTOCOL_FAMILY:
+ self.protocol_family = "sftp"
+ self._parse_result = self._ftp_parse_result
+ self.ERRCODE_OK = 0
+ else: # Should not happen since we check before
+ raise ValueError("Unknown protocol")
self.rootdir = rootdir
- self.url = url
+ self.set_server(host)
self.headers = {}
+ self.http_parse = http_parse
+ # Create the cURL object
+ # This object is shared by all operations to use the cache.
+ # Before using it, call method:`_basic_curl_configuration`.
+ self.crl = pycurl.Curl()
# Initialize options
# Should we skip SSL verification (cURL -k/--insecure option)
self.ssl_verifyhost = True
@@ -93,8 +171,85 @@ class FTPDownload(DownloadInterface):
# Keep alive
self.tcp_keepalive = 0
+ def _basic_curl_configuration(self):
+ """
+ Perform basic configuration (i.e. that doesn't depend on the
+ operation: _download or list). This method should be called before any
+ operation.
+ """
+ # Reset cURL options before setting them
+ self.crl.reset()
+
+ if self.proxy is not None:
+ self.crl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ if self.credentials is not None:
+ self.crl.setopt(pycurl.USERPWD, self.credentials)
+
+ # Configure TCP keepalive
+ if self.tcp_keepalive:
+ self.crl.setopt(pycurl.TCP_KEEPALIVE, True)
+ self.crl.setopt(pycurl.TCP_KEEPIDLE, self.tcp_keepalive * 2)
+ self.crl.setopt(pycurl.TCP_KEEPINTVL, self.tcp_keepalive)
+
+ # Configure SSL verification (on some platforms, disabling
+ # SSL_VERIFYPEER implies disabling SSL_VERIFYHOST so we set
+ # SSL_VERIFYPEER after)
+ self.crl.setopt(pycurl.SSL_VERIFYHOST, 2 if self.ssl_verifyhost else 0)
+ self.crl.setopt(pycurl.SSL_VERIFYPEER, 1 if self.ssl_verifypeer else 0)
+ if self.ssl_server_cert:
+ # cacert is the name of the option for the curl command. The
+ # corresponding cURL option is CURLOPT_CAINFO.
+ # See https://curl.haxx.se/libcurl/c/CURLOPT_CAINFO.html
+ # This is inspired by that https://curl.haxx.se/docs/sslcerts.html
+ # (section "Certificate Verification", option 2) but the option
+ # CURLOPT_CAPATH is for a directory of certificates.
+ self.crl.setopt(pycurl.CAINFO, self.ssl_server_cert)
+
+ # Configure timeouts
+ self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
+ self.crl.setopt(pycurl.TIMEOUT, self.timeout)
+ self.crl.setopt(pycurl.NOSIGNAL, 1)
+
+ # Header function
+ self.crl.setopt(pycurl.HEADERFUNCTION, self._header_function)
+
+ def _header_function(self, header_line):
+ # HTTP standard specifies that headers are encoded in iso-8859-1.
+ # On Python 2, decoding step can be skipped.
+ # On Python 3, decoding step is required.
+ header_line = header_line.decode('iso-8859-1')
+
+ # Header lines include the first status line (HTTP/1.x ...).
+ # We are going to ignore all lines that don't have a colon in them.
+ # This will botch headers that are split on multiple lines...
+ if ':' not in header_line:
+ return
+
+ # Break the header line into header name and value.
+ name, value = header_line.split(':', 1)
+
+ # Remove whitespace that may be present.
+ # Header lines include the trailing newline, and there may be whitespace
+ # around the colon.
+ name = name.strip()
+ value = value.strip()
+
+ # Header names are case insensitive.
+ # Lowercase name here.
+ name = name.lower()
+
+ # Now we can actually record the header name and value.
+ self.headers[name] = value
+
+ def set_server(self, server):
+ super(CurlDownload, self).set_server(server)
+ self.url = self.curl_protocol + '://' + self.server
+
def set_options(self, protocol_options):
- super(FTPDownload, self).set_options(protocol_options)
+ super(CurlDownload, self).set_options(protocol_options)
if "ssl_verifyhost" in protocol_options:
self.ssl_verifyhost = Utils.to_bool(protocol_options["ssl_verifyhost"])
if "ssl_verifypeer" in protocol_options:
@@ -104,127 +259,65 @@ class FTPDownload(DownloadInterface):
if "tcp_keepalive" in protocol_options:
self.tcp_keepalive = Utils.to_int(protocol_options["tcp_keepalive"])
- def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
- '''
- Find files matching patterns. Sets instance variable files_to_download.
-
- :param patterns: regexps to match
- :type patterns: list
- :param file_list: list of files to match
- :type file_list: list
- :param dir_list: sub directories in current dir
- :type dir_list: list
- :param prefix: directory prefix
- :type prefix: str
- :param submatch: first call to match, or called from match
- :type submatch: bool
- '''
- self.logger.debug('Download:File:RegExp:' + str(patterns))
- if dir_list is None:
- dir_list = []
- if not submatch:
- self.files_to_download = []
- for pattern in patterns:
- subdirs_pattern = pattern.split('/')
- if len(subdirs_pattern) > 1:
- # Pattern contains sub directories
- subdir = subdirs_pattern[0]
- if subdir == '^':
- subdirs_pattern = subdirs_pattern[1:]
- subdir = subdirs_pattern[0]
- # If getting all, get all files
- if pattern == '**/*':
- for rfile in file_list:
- rfile['root'] = self.rootdir
- if prefix != '':
- rfile['name'] = prefix + '/' + rfile['name']
- self.files_to_download.append(rfile)
- self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
- for direlt in dir_list:
- subdir = direlt['name']
- self.logger.debug('Download:File:Subdir:Check:' + subdir)
- if pattern == '**/*':
- (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
- self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
-
- else:
- if re.match(subdirs_pattern[0], subdir):
- self.logger.debug('Download:File:Subdir:Match:' + subdir)
- # subdir match the beginning of the pattern
- # check match in subdir
- (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
- self.match(['/'.join(subdirs_pattern[1:])], subfile_list, subdirs_list, prefix + '/' + subdir, True)
-
- else:
- for rfile in file_list:
- if re.match(pattern, rfile['name']):
- rfile['root'] = self.rootdir
- if prefix != '':
- rfile['name'] = prefix + '/' + rfile['name']
- self.files_to_download.append(rfile)
- self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
- if not submatch and len(self.files_to_download) == 0:
- raise Exception('no file found matching expressions')
-
- def curl_download(self, file_path, file_to_download):
+ def _append_file_to_download(self, rfile):
+ # Add url and root to the file if needed (for safety)
+ if 'url' not in rfile or not rfile['url']:
+ rfile['url'] = self.url
+ if 'root' not in rfile or not rfile['root']:
+ rfile['root'] = self.rootdir
+ super(CurlDownload, self)._append_file_to_download(rfile)
+
+ def _file_url(self, rfile):
+ # rfile['root'] is set to self.rootdir if needed but may be different.
+ # We don't use os.path.join because rfile['name'] may start with /
+ return self.url + '/' + rfile['root'] + rfile['name']
+
+ def _download(self, file_path, rfile):
+ """
+ This method is designed to work for FTP(S), HTTP(S) and SFTP.
+ """
error = True
nbtry = 1
+ # Forge URL of remote file
+ file_url = self._file_url(rfile)
while(error is True and nbtry < 3):
- fp = open(file_path, "wb")
- curl = pycurl.Curl()
-
- # Configure TCP keepalive
- if self.tcp_keepalive:
- curl.setopt(pycurl.TCP_KEEPALIVE, True)
- curl.setopt(pycurl.TCP_KEEPIDLE, self.tcp_keepalive * 2)
- curl.setopt(pycurl.TCP_KEEPINTVL, self.tcp_keepalive)
-
- # Configure SSL verification (on some platforms, disabling
- # SSL_VERIFYPEER implies disabling SSL_VERIFYHOST so we set
- # SSL_VERIFYPEER after)
- curl.setopt(pycurl.SSL_VERIFYHOST, 2 if self.ssl_verifyhost else 0)
- curl.setopt(pycurl.SSL_VERIFYPEER, 1 if self.ssl_verifypeer else 0)
- if self.ssl_server_cert:
- # cacert is the name of the option for the curl command. The
- # corresponding cURL option is CURLOPT_CAINFO.
- # See https://curl.haxx.se/libcurl/c/CURLOPT_CAINFO.html
- # This is inspired by that https://curl.haxx.se/docs/sslcerts.html
- # (section "Certificate Verification", option 2) but the option
- # CURLOPT_CAPATH is for a directory of certificates.
- curl.setopt(pycurl.CAINFO, self.ssl_server_cert)
+
+ self._basic_curl_configuration()
try:
- curl.setopt(pycurl.URL, file_to_download)
+ self.crl.setopt(pycurl.URL, file_url)
except Exception:
- curl.setopt(pycurl.URL, file_to_download.encode('ascii', 'ignore'))
- if self.proxy is not None:
- curl.setopt(pycurl.PROXY, self.proxy)
- if self.proxy_auth is not None:
- curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
-
- if self.credentials is not None:
- curl.setopt(pycurl.USERPWD, self.credentials)
-
- curl.setopt(pycurl.CONNECTTIMEOUT, 300)
- # Download should not take more than 5minutes
- curl.setopt(pycurl.TIMEOUT, self.timeout)
- curl.setopt(pycurl.NOSIGNAL, 1)
- curl.setopt(pycurl.WRITEDATA, fp)
+ self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
+ # Create file and assign it to the pycurl object
+ fp = open(file_path, "wb")
+ self.crl.setopt(pycurl.WRITEFUNCTION, fp.write)
+
+ # This is specific to HTTP
+ if self.method == 'POST':
+ # Form data must be provided already urlencoded.
+ postfields = urlencode(self.param)
+ # Sets request method to POST,
+ # Content-Type header to application/x-www-form-urlencoded
+ # and data to send in request body.
+ self.crl.setopt(pycurl.POSTFIELDS, postfields)
+
+ # Try download
try:
- curl.perform()
- errcode = curl.getinfo(pycurl.HTTP_CODE)
- if int(errcode) != 226 and int(errcode) != 200:
+ self.crl.perform()
+ errcode = self.crl.getinfo(pycurl.RESPONSE_CODE)
+ if int(errcode) != self.ERRCODE_OK:
error = True
- self.logger.error('Error while downloading ' + file_to_download + ' - ' + str(errcode))
+ self.logger.error('Error while downloading ' + file_url + ' - ' + str(errcode))
else:
error = False
except Exception as e:
self.logger.error('Could not get errcode:' + str(e))
- nbtry += 1
- curl.close()
+ # Close file
fp.close()
+
+ # Check that the archive is correct
if not error and not self.skip_check_uncompress:
archive_status = Utils.archive_check(file_path)
if not archive_status:
@@ -232,130 +325,36 @@ class FTPDownload(DownloadInterface):
error = True
if os.path.exists(file_path):
os.remove(file_path)
- return error
-
- def download(self, local_dir, keep_dirs=True):
- '''
- Download remote files to local_dir
-
- :param local_dir: Directory where files should be downloaded
- :type local_dir: str
- :param keep_dirs: keep file name directory structure or copy file in local_dir directly
- :param keep_dirs: bool
- :return: list of downloaded files
- '''
- self.logger.debug('FTP:Download')
-
- nb_files = len(self.files_to_download)
- cur_files = 1
-
- for rfile in self.files_to_download:
- if self.kill_received:
- raise Exception('Kill request received, exiting')
- file_dir = local_dir
- if 'save_as' not in rfile or not rfile['save_as']:
- rfile['save_as'] = rfile['name']
- if keep_dirs:
- file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
- file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
-
- # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
- if not os.path.exists(file_dir):
- os.makedirs(file_dir)
-
- self.logger.debug('FTP:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
- self.logger.debug('FTP:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
- cur_files += 1
- if 'url' not in rfile or not rfile['url']:
- rfile['url'] = self.url
- if 'root' not in rfile or not rfile['root']:
- rfile['root'] = self.rootdir
- start_time = datetime.now()
- start_time = time.mktime(start_time.timetuple())
- error = self.curl_download(file_path, rfile['url'] + rfile['root'] + '/' + rfile['name'])
- if error:
- rfile['download_time'] = 0
- rfile['error'] = True
- raise Exception("FTP:Download:Error:" + rfile['url'] + rfile['root'] + '/' + rfile['name'])
- else:
- end_time = datetime.now()
- end_time = time.mktime(end_time.timetuple())
- rfile['download_time'] = end_time - start_time
-
- self.set_permissions(file_path, rfile)
-
- return self.files_to_download
-
- def header_function(self, header_line):
- # HTTP standard specifies that headers are encoded in iso-8859-1.
- # On Python 2, decoding step can be skipped.
- # On Python 3, decoding step is required.
- header_line = header_line.decode('iso-8859-1')
-
- # Header lines include the first status line (HTTP/1.x ...).
- # We are going to ignore all lines that don't have a colon in them.
- # This will botch headers that are split on multiple lines...
- if ':' not in header_line:
- return
-
- # Break the header line into header name and value.
- name, value = header_line.split(':', 1)
-
- # Remove whitespace that may be present.
- # Header lines include the trailing newline, and there may be whitespace
- # around the colon.
- name = name.strip()
- value = value.strip()
- # Header names are case insensitive.
- # Lowercase name here.
- name = name.lower()
+ # Increment retry counter
+ nbtry += 1
- # Now we can actually record the header name and value.
- self.headers[name] = value
+ return error
def list(self, directory=''):
'''
- List FTP directory
+ List remote directory
:return: tuple of file and dirs in current directory with details
- '''
- self.logger.debug('Download:List:' + self.url + self.rootdir + directory)
- # Configure TCP keepalive
- if self.tcp_keepalive:
- self.crl.setopt(pycurl.TCP_KEEPALIVE, True)
- self.crl.setopt(pycurl.TCP_KEEPIDLE, self.tcp_keepalive * 2)
- self.crl.setopt(pycurl.TCP_KEEPINTVL, self.tcp_keepalive)
+ This is a generic method for HTTP and FTP. The protocol-specific parts
+ are done in _<protocol>_parse_result.
+ '''
+ dir_url = self.url + self.rootdir + directory
+ self.logger.debug('Download:List:' + dir_url)
- # See the corresponding lines in method:`curl_download`
- self.crl.setopt(pycurl.SSL_VERIFYHOST, 2 if self.ssl_verifyhost else 0)
- self.crl.setopt(pycurl.SSL_VERIFYPEER, 1 if self.ssl_verifypeer else 0)
- if self.ssl_server_cert:
- self.crl.setopt(pycurl.CAINFO, self.ssl_server_cert)
+ self._basic_curl_configuration()
try:
- self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
+ self.crl.setopt(pycurl.URL, dir_url)
except Exception:
- self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
+ self.crl.setopt(pycurl.URL, dir_url.encode('ascii', 'ignore'))
- if self.proxy is not None:
- self.crl.setopt(pycurl.PROXY, self.proxy)
- if self.proxy_auth is not None:
- self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
-
- if self.credentials is not None:
- self.crl.setopt(pycurl.USERPWD, self.credentials)
+ # Create buffer and assign it to the pycurl object
output = BytesIO()
- # lets assign this buffer to pycurl object
self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
- self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
-
- self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
- # Download should not take more than 5minutes
- self.crl.setopt(pycurl.TIMEOUT, self.timeout)
- self.crl.setopt(pycurl.NOSIGNAL, 1)
+ # Try to list
try:
self.crl.perform()
except Exception as e:
@@ -378,6 +377,9 @@ class FTPDownload(DownloadInterface):
# lets get the output in a string
result = output.getvalue().decode(encoding)
+ return self._parse_result(result)
+
+ def _ftp_parse_result(self, result):
# FTP LIST output is separated by \r\n
# lets split the output in lines
lines = re.split(r'[\n\r]+', result)
@@ -428,8 +430,65 @@ class FTPDownload(DownloadInterface):
rdirs.append(rfile)
return (rfiles, rdirs)
- def chroot(self, cwd):
- self.logger.debug('Download: change dir ' + cwd)
+ def _http_parse_result(self, result):
+ rfiles = []
+ rdirs = []
+
+ dirs = re.findall(self.http_parse.dir_line, result)
+ if dirs is not None and len(dirs) > 0:
+ for founddir in dirs:
+ rfile = {}
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ rfile['size'] = 0
+ date = founddir[self.http_parse.dir_date - 1]
+ dirdate = date.split()
+ parts = dirdate[0].split('-')
+ # 19-Jul-2014 13:02
+ rfile['month'] = Utils.month_to_num(parts[1])
+ rfile['day'] = int(parts[0])
+ rfile['year'] = int(parts[2])
+ rfile['name'] = founddir[self.http_parse.dir_name - 1]
+ rdirs.append(rfile)
+
+ files = re.findall(self.http_parse.file_line, result)
+
+ if files is not None and len(files) > 0:
+ for foundfile in files:
+ rfile = {}
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ if self.http_parse.file_size != -1:
+ rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
+ else:
+ rfile['size'] = 0
+ if self.http_parse.file_date != -1:
+ date = foundfile[self.http_parse.file_date - 1]
+ if self.http_parse.file_date_format:
+ date_object = datetime.strptime(date, self.http_parse.file_date_format.replace('%%', '%'))
+ rfile['month'] = date_object.month
+ rfile['day'] = date_object.day
+ rfile['year'] = date_object.year
+ else:
+ dirdate = date.split()
+ parts = dirdate[0].split('-')
+ # 19-Jul-2014 13:02
+ rfile['month'] = Utils.month_to_num(parts[1])
+ rfile['day'] = int(parts[0])
+ rfile['year'] = int(parts[2])
+ else:
+ today = datetime.now()
+ date = '%s-%s-%s' % (today.year, today.month, today.day)
+ rfile['month'] = today.month
+ rfile['day'] = today.day
+ rfile['year'] = today.year
+ rfile['name'] = foundfile[self.http_parse.file_name - 1]
+ filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
+ rfile['hash'] = hashlib.md5(filehash).hexdigest()
+ rfiles.append(rfile)
+ return (rfiles, rdirs)
def close(self):
if self.crl is not None:
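
A minimal usage sketch for the refactored CurlDownload above; the server,
pattern and local directory are illustrative (taken from the class
docstring):

    from biomaj_download.download.curl import CurlDownload

    downloader = CurlDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
    # Protocol options are passed as strings and converted in set_options
    downloader.set_options({'tcp_keepalive': '30', 'ssl_verifyhost': 'true'})
    (file_list, dir_list) = downloader.list()
    downloader.match([r'^alu.*\.gz$'], file_list, dir_list)
    downloader.download('/tmp/offline')
    downloader.close()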
=====================================
biomaj_download/download/direct.py
=====================================
@@ -1,12 +1,29 @@
+"""
+Subclasses for direct download (i.e. downloading without regexp). The usage is
+a bit different: instead of calling method:`list` and method:`match`, client
+code explicitly calls method:`set_files_to_download` (passing a list
+containing only the file name). method:`list` is used to get more information
+about the file (if possible). method:`match` matches everything.
+Also client code can use method:`set_save_as` to indicate the name of the file
+to save.
+
+The trick for the implementation is to override
+method:`_append_file_to_download` to initialize the rfile with the file name
+and dummy values. Note that we use a list of rfile even if it contains only one
+file.
+method:`list` modifies the files_to_download directly.
+method:`match` doesn't call method:`_append_file_to_download` (since the list of
+files to download is already set up).
+We also override method:`set_files_to_download` to check that we pass only one
+file.
+"""
import datetime
-import time
import pycurl
-import os
import re
import hashlib
import sys
-from biomaj_download.download.ftp import FTPDownload
+from biomaj_download.download.curl import CurlDownload
from biomaj_core.utils import Utils
if sys.version_info[0] < 3:
@@ -20,217 +37,98 @@ except ImportError:
from StringIO import StringIO as BytesIO
-class DirectFTPDownload(FTPDownload):
+class DirectFTPDownload(CurlDownload):
'''
download a list of files from FTP, no regexp
'''
- def __init__(self, protocol, host, rootdir=''):
- '''
+ ALL_PROTOCOLS = ["ftp", "ftps"]
+ def _append_file_to_download(self, filename):
+ '''
Initialize the files in list with today as last-modification date.
- Size is also preset to zero, size will be set after download
-
+ Size is also preset to zero.
'''
- FTPDownload.__init__(self, protocol, host, rootdir)
- self.save_as = None
- self.headers = {}
-
- def set_files_to_download(self, files):
today = datetime.date.today()
- self.files_to_download = []
- for file_to_download in files:
- rfile = {}
- rfile['root'] = ''
- rfile['permissions'] = ''
- rfile['group'] = ''
- rfile['user'] = ''
- rfile['size'] = 0
- rfile['month'] = today.month
- rfile['day'] = today.day
- rfile['year'] = today.year
- if file_to_download.endswith('/'):
- rfile['name'] = file_to_download[:-1]
- else:
- rfile['name'] = file_to_download
- rfile['hash'] = None
- if self.param:
- if 'param' not in file_to_download or not file_to_download['param']:
- rfile['param'] = self.param
- self.files_to_download.append(rfile)
+ rfile = {}
+ rfile['root'] = self.rootdir
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ rfile['size'] = 0
+ rfile['month'] = today.month
+ rfile['day'] = today.day
+ rfile['year'] = today.year
+ if filename.endswith('/'):
+ rfile['name'] = filename[:-1]
+ else:
+ rfile['name'] = filename
+ rfile['hash'] = None
+ # Use self.save_as even if we use it in list(). This is important.
+ rfile['save_as'] = self.save_as
+ super(DirectFTPDownload, self)._append_file_to_download(rfile)
+
+ def set_files_to_download(self, files_to_download):
+ if len(files_to_download) > 1:
+ self.files_to_download = []
+ msg = self.__class__.__name__ + ' accepts only 1 file'
+ self.logger.error(msg)
+ raise ValueError(msg)
+ return super(DirectFTPDownload, self).set_files_to_download(files_to_download)
def list(self, directory=''):
'''
FTP protocol does not give us the possibility to get file date from remote url
'''
- for rfile in self.files_to_download:
- if self.save_as is None:
- self.save_as = rfile['name']
- rfile['save_as'] = self.save_as
+ # TODO: are we sure about this implementation?
return (self.files_to_download, [])
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
All files to download match, no pattern
'''
- if dir_list is None:
- dir_list = []
- self.files_to_download = file_list
+ pass
-class DirectHttpDownload(DirectFTPDownload):
+class DirectHTTPDownload(DirectFTPDownload):
- def __init__(self, protocol, host, rootdir=''):
- '''
- :param file_list: list of files to download on server
- :type file_list: list
- '''
- DirectFTPDownload.__init__(self, protocol, host, rootdir)
- self.save_as = None
+ ALL_PROTOCOLS = ["http", "https"]
+
+ def __init__(self, curl_protocol, host, rootdir=''):
+ DirectFTPDownload.__init__(self, curl_protocol, host, rootdir)
self.method = 'GET'
self.param = {}
- def download(self, local_dir, keep_dirs=True):
- '''
- Download remote files to local_dir
-
- :param local_dir: Directory where files should be downloaded
- :type local_dir: str
- :param keep_dirs: keep file name directory structure or copy file in local_dir directly
- :param keep_dirs: bool
- :return: list of downloaded files
- '''
- self.logger.debug('DirectHTTP:Download')
- nb_files = len(self.files_to_download)
-
- if nb_files > 1:
- self.files_to_download = []
- self.logger.error('DirectHTTP accepts only 1 file')
-
- cur_files = 1
-
- for rfile in self.files_to_download:
- if self.kill_received:
- raise Exception('Kill request received, exiting')
-
- if not self.save_as:
- self.save_as = rfile['name']
- else:
- rfile['save_as'] = self.save_as
- file_dir = local_dir
- if keep_dirs:
- file_dir = local_dir + os.path.dirname(self.save_as)
- file_path = file_dir + '/' + os.path.basename(self.save_as)
-
- # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
- if not os.path.exists(file_dir):
- os.makedirs(file_dir)
- self.logger.debug('DirectHTTP:Download:Progress' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'] + ', save as ' + self.save_as)
- cur_files += 1
- if 'url' not in rfile:
- rfile['url'] = self.url
- fp = open(file_path, "wb")
- curl = pycurl.Curl()
-
- if self.proxy is not None:
- curl.setopt(pycurl.PROXY, self.proxy)
- if self.proxy_auth is not None:
- curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
-
- if self.method == 'POST':
- # Form data must be provided already urlencoded.
- postfields = urlencode(self.param)
- # Sets request method to POST,
- # Content-Type header to application/x-www-form-urlencoded
- # and data to send in request body.
- if self.credentials is not None:
- curl.setopt(pycurl.USERPWD, self.credentials)
-
- curl.setopt(pycurl.POSTFIELDS, postfields)
- try:
- curl.setopt(pycurl.URL, rfile['url'] + rfile['root'] + '/' + rfile['name'])
- except Exception:
- curl.setopt(pycurl.URL, (rfile['url'] + rfile['root'] + '/' + rfile['name']).encode('ascii', 'ignore'))
-
- else:
- url = rfile['url'] + rfile['root'] + '/' + rfile['name'] + '?' + urlencode(self.param)
- try:
- curl.setopt(pycurl.URL, url)
- except Exception:
- curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
-
- curl.setopt(pycurl.WRITEDATA, fp)
- start_time = datetime.datetime.now()
- start_time = time.mktime(start_time.timetuple())
- curl.perform()
- end_time = datetime.datetime.now()
- end_time = time.mktime(end_time.timetuple())
- rfile['download_time'] = end_time - start_time
-
- curl.close()
- fp.close()
- self.logger.debug('downloaded!')
- rfile['name'] = self.save_as
- self.set_permissions(file_path, rfile)
- return self.files_to_download
-
- def header_function(self, header_line):
- # HTTP standard specifies that headers are encoded in iso-8859-1.
- # On Python 2, decoding step can be skipped.
- # On Python 3, decoding step is required.
- header_line = header_line.decode('iso-8859-1')
-
- # Header lines include the first status line (HTTP/1.x ...).
- # We are going to ignore all lines that don't have a colon in them.
- # This will botch headers that are split on multiple lines...
- if ':' not in header_line:
- return
-
- # Break the header line into header name and value.
- name, value = header_line.split(':', 1)
-
- # Remove whitespace that may be present.
- # Header lines include the trailing newline, and there may be whitespace
- # around the colon.
- name = name.strip()
- value = value.strip()
-
- # Header names are case insensitive.
- # Lowercase name here.
- name = name.lower()
-
- # Now we can actually record the header name and value.
- self.headers[name] = value
+ def _file_url(self, file_to_download):
+ url = super(DirectHTTPDownload, self)._file_url(file_to_download)
+ if self.method == "GET":
+ url += '?' + urlencode(self.param)
+ return url
def list(self, directory=''):
'''
Try to get file headers to get last_modification and size
'''
+ self._basic_curl_configuration()
+ # Specific configuration
+ self.crl.setopt(pycurl.HEADER, True)
+ self.crl.setopt(pycurl.NOBODY, True)
for rfile in self.files_to_download:
if self.save_as is None:
self.save_as = rfile['name']
rfile['save_as'] = self.save_as
- self.crl.setopt(pycurl.HEADER, True)
- if self.credentials is not None:
- self.crl.setopt(pycurl.USERPWD, self.credentials)
-
- if self.proxy is not None:
- self.crl.setopt(pycurl.PROXY, self.proxy)
- if self.proxy_auth is not None:
- self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
-
- self.crl.setopt(pycurl.NOBODY, True)
+ file_url = self._file_url(rfile)
try:
- self.crl.setopt(pycurl.URL, self.url + self.rootdir + rfile['name'])
+ self.crl.setopt(pycurl.URL, file_url)
except Exception:
- self.crl.setopt(pycurl.URL, (self.url + self.rootdir + rfile['name']).encode('ascii', 'ignore'))
+ self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
+ # Create a buffer and assign it to the pycurl object
output = BytesIO()
- # lets assign this buffer to pycurl object
self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
- self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
+
self.crl.perform()
# Figure out what encoding was sent with the response, if any.
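
A minimal usage sketch for the direct downloaders above (host and file name
are illustrative). Only one file may be passed to set_files_to_download, and
set_save_as controls the local name:

    from biomaj_download.download.direct import DirectHTTPDownload

    downloader = DirectHTTPDownload('https', 'ftp.ncbi.nih.gov', '')
    downloader.set_files_to_download(['/blast/db/FASTA/alu.n.gz'])
    downloader.set_save_as('alu.n.gz')
    downloader.download('/tmp/offline')
    downloader.close()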
=====================================
biomaj_download/download/http.py deleted
=====================================
@@ -1,166 +0,0 @@
-import pycurl
-import re
-import hashlib
-import datetime
-
-import humanfriendly
-
-from biomaj_core.utils import Utils
-from biomaj_download.download.ftp import FTPDownload
-
-try:
- from io import BytesIO
-except ImportError:
- from StringIO import StringIO as BytesIO
-
-
-class HTTPParse(object):
-
- def __init__(self, dir_line, file_line, dir_name=1, dir_date=2, file_name=1, file_date=2, file_date_format=None, file_size=3):
- r'''
- http.parse.dir.line: <img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
- http.parse.file.line: <img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
- http.group.dir.name: 1
- http.group.dir.date: 2
- http.group.file.name: 1
- http.group.file.date: 2
- http.group.file.size: 3
- '''
- self.dir_line = dir_line
- self.file_line = file_line
- self.dir_name = dir_name
- self.dir_date = dir_date
- self.file_name = file_name
- self.file_date = file_date
- self.file_size = file_size
- self.file_date_format = file_date_format
-
-
-class HTTPDownload(FTPDownload):
- '''
- Base class to download files from HTTP
-
- Makes use of http.parse.dir.line etc.. regexps to extract page information
-
- protocol=http
- server=ftp.ncbi.nih.gov
- remote.dir=/blast/db/FASTA/
-
- remote.files=^alu.*\\.gz$
-
- '''
-
- def __init__(self, protocol, host, rootdir, http_parse=None):
- FTPDownload.__init__(self, protocol, host, rootdir)
- self.http_parse = http_parse
-
- def list(self, directory=''):
- '''
- List FTP directory
-
- :return: tuple of file and dirs in current directory with details
- '''
- self.logger.debug('Download:List:' + self.url + self.rootdir + directory)
-
- try:
- self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
- except Exception:
- self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
-
- if self.proxy is not None:
- self.crl.setopt(pycurl.PROXY, self.proxy)
- if self.proxy_auth is not None:
- self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
-
- if self.credentials is not None:
- self.crl.setopt(pycurl.USERPWD, self.credentials)
-
- output = BytesIO()
- # lets assign this buffer to pycurl object
- self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
- self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
- self.crl.perform()
- # Figure out what encoding was sent with the response, if any.
- # Check against lowercased header name.
- encoding = None
- if 'content-type' in self.headers:
- content_type = self.headers['content-type'].lower()
- match = re.search(r'charset=(\S+)', content_type)
- if match:
- encoding = match.group(1)
- if encoding is None:
- # Default encoding for HTML is iso-8859-1.
- # Other content types may have different default encoding,
- # or in case of binary data, may have no encoding at all.
- encoding = 'iso-8859-1'
-
- # lets get the output in a string
- result = output.getvalue().decode(encoding)
- r'''
- http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
- http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
- http.group.dir.name': 1,
- http.group.dir.date': 2,
- http.group.file.name': 1,
- http.group.file.date': 2,
- http.group.file.size': 3,
- '''
-
- rfiles = []
- rdirs = []
-
- dirs = re.findall(self.http_parse.dir_line, result)
- if dirs is not None and len(dirs) > 0:
- for founddir in dirs:
- rfile = {}
- rfile['permissions'] = ''
- rfile['group'] = ''
- rfile['user'] = ''
- rfile['size'] = 0
- date = founddir[self.http_parse.dir_date - 1]
- dirdate = date.split()
- parts = dirdate[0].split('-')
- # 19-Jul-2014 13:02
- rfile['month'] = Utils.month_to_num(parts[1])
- rfile['day'] = int(parts[0])
- rfile['year'] = int(parts[2])
- rfile['name'] = founddir[self.http_parse.dir_name - 1]
- rdirs.append(rfile)
-
- files = re.findall(self.http_parse.file_line, result)
- if files is not None and len(files) > 0:
- for foundfile in files:
- rfile = {}
- rfile['permissions'] = ''
- rfile['group'] = ''
- rfile['user'] = ''
- if self.http_parse.file_size != -1:
- rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
- else:
- rfile['size'] = 0
- if self.http_parse.file_date != -1:
- date = foundfile[self.http_parse.file_date - 1]
- if self.http_parse.file_date_format:
- date_object = datetime.datetime.strptime(date, self.http_parse.file_date_format.replace('%%', '%'))
- rfile['month'] = date_object.month
- rfile['day'] = date_object.day
- rfile['year'] = date_object.year
- else:
- dirdate = date.split()
- parts = dirdate[0].split('-')
- # 19-Jul-2014 13:02
- rfile['month'] = Utils.month_to_num(parts[1])
- rfile['day'] = int(parts[0])
- rfile['year'] = int(parts[2])
- else:
- today = datetime.datetime.now()
- date = '%s-%s-%s' % (today.year, today.month, today.day)
- rfile['month'] = today.month
- rfile['day'] = today.day
- rfile['year'] = today.year
- rfile['name'] = foundfile[self.http_parse.file_name - 1]
- filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
- rfile['hash'] = hashlib.md5(filehash).hexdigest()
- rfiles.append(rfile)
-
- return (rfiles, rdirs)
=====================================
biomaj_download/download/interface.py
=====================================
@@ -24,12 +24,30 @@ class _FakeLock(object):
class DownloadInterface(object):
'''
- Main interface that all downloaders must extend
+ Main interface that all downloaders must extend.
+
+ The methods are divided into 2 broad categories:
+ - setters which act on properties of the downloader; those methods are
+ important in microservice mode
+ - file operations which are used to list and match remote files, download
+ them, etc.
+
+ Usually, it is enough to overload list, _append_file_to_download and
+ _download.
+
+ TODO:
+ - the purpose of some setters (set_server, set_protocol) is not clear
+ since a subclass cannot always change those parameters arbitrarily
+ - chroot is not used in BioMaJ
'''
files_num_threads = 4
def __init__(self):
+ # This variable defines the protocol as passed by the config file (i.e.
+ # this is directftp for DirectFTPDownload). It is used by the workflow
+ # to send the download message so it must be set.
+ self.protocol = None
self.config = None
self.files_to_download = []
self.files_to_copy = []
@@ -47,13 +65,16 @@ class DownloadInterface(object):
self.logger = logging.getLogger('biomaj')
self.param = None
self.method = None
- self.protocol = None
self.server = None
self.offline_dir = None
# Options
self.protocol_options = {}
self.skip_check_uncompress = False
+ #
+ # Setters for downloader
+ #
+
def set_offline_dir(self, offline_dir):
self.offline_dir = offline_dir
@@ -61,15 +82,13 @@ class DownloadInterface(object):
self.server = server
def set_protocol(self, protocol):
+ """
+ Method used by DownloadService to set the protocol. This value is
+ passed from the config file so is not always a real protocol (for
+ instance it can be "directhttp" for a direct downloader).
+ """
self.protocol = protocol
- def set_files_to_download(self, files):
- self.files_to_download = files
- for file_to_download in self.files_to_download:
- if self.param:
- if 'param' not in file_to_download or not file_to_download['param']:
- file_to_download['param'] = self.param
-
def set_param(self, param):
self.param = param
@@ -100,6 +119,54 @@ class DownloadInterface(object):
def set_method(self, method):
self.method = method
+ def set_credentials(self, userpwd):
+ '''
+ Set credentials in format user:pwd
+
+ :param userpwd: credentials
+ :type userpwd: str
+ '''
+ self.credentials = userpwd
+
+ def set_options(self, protocol_options):
+ """
+ Set protocol specific options.
+
+ Subclasses that override this method must call the
+ parent implementation.
+ """
+ self.protocol_options = protocol_options
+ if "skip_check_uncompress" in protocol_options:
+ self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
+
+ #
+ # File operations (match, list, download) and associated hook methods
+ #
+
+ def _append_file_to_download(self, rfile):
+ """
+ Add a file to the download list and check its properties (this method
+ is called in `match` and `set_files_to_download`).
+
+ Downloaders can override this to add some properties to the file (for
+ instance, most of them will add "root").
+ """
+ # Add properties to the file if needed (for safety)
+ if 'save_as' not in rfile or rfile['save_as'] is None:
+ rfile['save_as'] = rfile['name']
+ if self.param:
+ if 'param' not in rfile or not rfile['param']:
+ rfile['param'] = self.param
+ self.files_to_download.append(rfile)
+
+ def set_files_to_download(self, files):
+ """
+ Convenience method to set the list of files to download.
+ """
+ self.files_to_download = []
+ for file_to_download in files:
+ self._append_file_to_download(file_to_download)
+
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
Find files matching patterns. Sets instance variable files_to_download.
@@ -130,13 +197,12 @@ class DownloadInterface(object):
if subdir == '^':
subdirs_pattern = subdirs_pattern[1:]
subdir = subdirs_pattern[0]
- if not dir_list and pattern == '**/*':
- # Take all and no more dirs, take all files
+ # If getting all, get all files
+ if pattern == '**/*':
for rfile in file_list:
- rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
- self.files_to_download.append(rfile)
+ self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
return
for direlt in dir_list:
@@ -147,10 +213,9 @@ class DownloadInterface(object):
self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
for rfile in file_list:
if pattern == '**/*' or re.match(pattern, rfile['name']):
- rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
- self.files_to_download.append(rfile)
+ self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
else:
if re.match(subdirs_pattern[0], subdir):
@@ -163,10 +228,9 @@ class DownloadInterface(object):
else:
for rfile in file_list:
if re.match(pattern, rfile['name']):
- rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
- self.files_to_download.append(rfile)
+ self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
if not submatch and len(self.files_to_download) == 0:
raise Exception('no file found matching expressions')
@@ -226,7 +290,6 @@ class DownloadInterface(object):
self.files_to_copy.append(dfile)
else:
new_files_to_download.append(dfile)
-
else:
# Copy everything
for dfile in self.files_to_download:
@@ -236,17 +299,66 @@ class DownloadInterface(object):
else:
new_files_to_download.append(dfile)
- self.files_to_download = new_files_to_download
+ self.set_files_to_download(new_files_to_download)
- def download(self, local_dir):
+ def _download(self, file_path, rfile):
+ '''
+ Download one file and return False in case of success and True
+ otherwise. This must be implemented in subclasses.
+ '''
+ raise NotImplementedError()
+
+ def download(self, local_dir, keep_dirs=True):
'''
Download remote files to local_dir
:param local_dir: Directory where files should be downloaded
:type local_dir: str
+ :param keep_dirs: keep file name directory structure or copy file in local_dir directly
+ :param keep_dirs: bool
:return: list of downloaded files
'''
- pass
+ self.logger.debug(self.__class__.__name__ + ':Download')
+ nb_files = len(self.files_to_download)
+ cur_files = 1
+ self.offline_dir = local_dir
+ for rfile in self.files_to_download:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+ # Determine where to store file (directory and name)
+ file_dir = local_dir
+ if keep_dirs:
+ file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
+ if file_dir[-1] == "/":
+ file_path = file_dir + os.path.basename(rfile['save_as'])
+ else:
+ file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
+
+ # For unit tests only; the workflow takes care of directory
+ # creation beforehand to avoid concurrent access from threads
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+
+ msg = self.__class__.__name__ + ':Download:Progress:'
+ msg += str(cur_files) + '/' + str(nb_files)
+ msg += ' downloading file ' + rfile['name'] + ' save as ' + rfile['save_as']
+ self.logger.debug(msg)
+ cur_files += 1
+ start_time = datetime.datetime.now()
+ start_time = time.mktime(start_time.timetuple())
+ error = self._download(file_path, rfile)
+ if error:
+ rfile['download_time'] = 0
+ rfile['error'] = True
+ raise Exception(self.__class__.__name__ + ":Download:Error:" + rfile["name"])
+ else:
+ end_time = datetime.datetime.now()
+ end_time = time.mktime(end_time.timetuple())
+ rfile['download_time'] = end_time - start_time
+ # Set permissions
+ self.set_permissions(file_path, rfile)
+
+ return self.files_to_download
def list(self):
'''
@@ -262,26 +374,6 @@ class DownloadInterface(object):
'''
pass
- def set_credentials(self, userpwd):
- '''
- Set credentials in format user:pwd
-
- :param userpwd: credentials
- :type userpwd: str
- '''
- self.credentials = userpwd
-
- def set_options(self, protocol_options):
- """
- Set protocol specific options.
-
- Subclasses that override this method must call the
- parent implementation.
- """
- self.protocol_options = protocol_options
- if "skip_check_uncompress" in protocol_options:
- self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
-
def close(self):
'''
Close connection
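
As the interface docstring above says, a new downloader usually only
overloads list, _append_file_to_download and _download. A minimal sketch of a
hypothetical subclass:

    from biomaj_download.download.interface import DownloadInterface

    class MyDownload(DownloadInterface):
        '''Hypothetical downloader overloading the three usual hooks.'''

        def __init__(self, rootdir):
            DownloadInterface.__init__(self)
            self.rootdir = rootdir

        def _append_file_to_download(self, rfile):
            # Fill downloader-specific defaults before queuing the file
            if 'root' not in rfile or not rfile['root']:
                rfile['root'] = self.rootdir
            super(MyDownload, self)._append_file_to_download(rfile)

        def list(self, directory=''):
            # Return a (files, dirs) tuple of rfile dicts
            return ([], [])

        def _download(self, file_path, rfile):
            # Fetch rfile into file_path; return False on success, True on error
            return True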
=====================================
biomaj_download/download/localcopy.py
=====================================
@@ -15,7 +15,6 @@ class LocalDownload(DownloadInterface):
remote.dir=/blast/db/FASTA/
remote.files=^alu.*\\.gz$
-
'''
def __init__(self, rootdir, use_hardlinks=False):
@@ -24,6 +23,11 @@ class LocalDownload(DownloadInterface):
self.rootdir = rootdir
self.use_hardlinks = use_hardlinks
+ def _append_file_to_download(self, rfile):
+ if 'root' not in rfile or not rfile['root']:
+ rfile['root'] = self.rootdir
+ super(LocalDownload, self)._append_file_to_download(rfile)
+
def download(self, local_dir):
'''
Copy local files to local_dir
=====================================
biomaj_download/download/protocolirods.py
=====================================
@@ -1,34 +1,37 @@
-import logging
-import os
-from datetime import datetime
-import time
-
-from biomaj_core.utils import Utils
from biomaj_download.download.interface import DownloadInterface
from irods.session import iRODSSession
-from irods.models import Collection, DataObject, User
+from irods.models import DataObject, User
class IRODSDownload(DownloadInterface):
- # To connect to irods session : sess = iRODSSession(host='localhost', port=1247, user='rods', password='rods', zone='tempZone')
- # password : self.credentials
- def __init__(self, protocol, server, remote_dir):
+
+ # This is used only for messages
+ real_protocol = "irods"
+
+ def __init__(self, server, remote_dir):
DownloadInterface.__init__(self)
- self.port = None
- self.remote_dir = remote_dir # directory on the remote server : zone
+ self.port = 1247
+ self.remote_dir = remote_dir # directory on the remote server including zone
self.rootdir = remote_dir
self.user = None
self.password = None
self.server = server
- self.zone = None
+ self.zone = remote_dir.split("/")[0]
+
+ def _append_file_to_download(self, rfile):
+ if 'root' not in rfile or not rfile['root']:
+ rfile['root'] = self.rootdir
+ super(IRODSDownload, self)._append_file_to_download(rfile)
def set_param(self, param):
- # self.param is a dictionnary which has the following form :{'password': u'biomaj', 'protocol': u'iget', 'user': u'biomaj', 'port': u'port'}
+ # param is a dictionary which has the following form:
+ # {'password': u'biomaj', 'user': u'biomaj', 'port': u'port'}
+ # port is optional
self.param = param
- self.port = int(param['port'])
self.user = str(param['user'])
self.password = str(param['password'])
- self.zone = str(param['zone'])
+ if 'port' in param:
+ self.port = int(param['port'])
def list(self, directory=''):
session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
@@ -36,10 +39,13 @@ class IRODSDownload(DownloadInterface):
rdirs = []
rfile = {}
date = None
- for result in session.query(Collection.name, DataObject.name, DataObject.size, DataObject.owner_name, DataObject.modify_time).filter(User.name == self.user).get_results():
- # if the user is biomaj : he will have access to all the irods data (biomaj ressource) : drwxr-xr-x
+ query = session.query(DataObject.name, DataObject.size,
+ DataObject.owner_name, DataObject.modify_time)
+ results = query.filter(User.name == self.user).get_results()
+ for result in results:
# Avoid duplication
- if rfile != {} and rfile['name'] == str(result[DataObject.name]) and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
+ if rfile != {} and rfile['name'] == str(result[DataObject.name]) \
+ and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
continue
rfile = {}
date = str(result[DataObject.modify_time]).split(" ")[0].split('-')
@@ -49,81 +55,28 @@ class IRODSDownload(DownloadInterface):
rfile['day'] = int(date[2])
rfile['year'] = int(date[0])
rfile['name'] = str(result[DataObject.name])
- rfile['download_path'] = str(result[Collection.name])
rfiles.append(rfile)
session.cleanup()
return (rfiles, rdirs)
- def download(self, local_dir, keep_dirs=True):
- '''
- Download remote files to local_dir
-
- :param local_dir: Directory where files should be downloaded
- :type local_dir: str
- :param keep_dirs: keep file name directory structure or copy file in local_dir directly
- :param keep_dirs: bool
- :return: list of downloaded files
- '''
- logging.debug('IRODS:Download')
- try:
- os.chdir(local_dir)
- except TypeError:
- logging.error("IRODS:list:Could not find offline_dir")
- nb_files = len(self.files_to_download)
- cur_files = 1
- # give a working directory to copy the file from irods
- remote_dir = self.remote_dir
- for rfile in self.files_to_download:
- if self.kill_received:
- raise Exception('Kill request received, exiting')
- file_dir = local_dir
- if 'save_as' not in rfile or rfile['save_as'] is None:
- rfile['save_as'] = rfile['name']
- if keep_dirs:
- file_dir = local_dir + os.path.dirname(rfile['save_as'])
- file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
- # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
- if not os.path.exists(file_dir):
- os.makedirs(file_dir)
-
- logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
- logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
- cur_files += 1
- start_time = datetime.now()
- start_time = time.mktime(start_time.timetuple())
- self.remote_dir = rfile['root']
- error = self.irods_download(file_dir, str(self.remote_dir), str(rfile['name']))
- if error:
- rfile['download_time'] = 0
- rfile['error'] = True
- raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
- else:
- archive_status = Utils.archive_check(file_path)
- if not archive_status:
- self.logger.error('Archive is invalid or corrupted, deleting file')
- rfile['error'] = True
- if os.path.exists(file_path):
- os.remove(file_path)
- raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
-
- end_time = datetime.now()
- end_time = time.mktime(end_time.timetuple())
- rfile['download_time'] = end_time - start_time
- self.set_permissions(file_path, rfile)
- self.remote_dir = remote_dir
- return(self.files_to_download)
-
- def irods_download(self, file_dir, file_path, file_to_download):
+ def _download(self, file_dir, rfile):
error = False
- logging.debug('IRODS:IRODS DOWNLOAD')
- session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
+ self.logger.debug('IRODS:IRODS DOWNLOAD')
+ session = iRODSSession(host=self.server, port=self.port,
+ user=self.user, password=self.password,
+ zone=self.zone)
try:
- file_to_get = str(file_path) + str(file_to_download)
- # Write the file to download in the wanted file_dir : with the python-irods iget
+ # iRODS doesn't like multiple "/"
+ if rfile['root'][-1] == "/":
+ file_to_get = rfile['root'] + rfile['name']
+ else:
+ file_to_get = rfile['root'] + "/" + rfile['name']
+ # Write the file to download in the wanted file_dir with the
+ # python-irods iget
obj = session.data_objects.get(file_to_get, file_dir)
except ExceptionIRODS as e:
- logging.error("RsyncError:" + str(e))
- logging.error("RsyncError: irods object" + str(obj))
+ self.logger.error(self.__class__.__name__ + ":Download:Error:Can't get irods object " + str(obj))
+ self.logger.error(self.__class__.__name__ + ":Download:Error:" + str(e))
session.cleanup()
return(error)
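
A minimal usage sketch for the reworked IRODSDownload above (host, zone and
credentials are illustrative). The zone is now derived from remote_dir and
the port defaults to 1247:

    from biomaj_download.download.protocolirods import IRODSDownload

    downloader = IRODSDownload('irods.example.org', 'tempZone/home/biomaj')
    downloader.set_param({'user': 'biomaj', 'password': 'biomaj'})  # port is optional
    (file_list, dir_list) = downloader.list()
    downloader.download('/tmp/offline')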
=====================================
biomaj_download/download/rsync.py
=====================================
@@ -1,19 +1,16 @@
# from future import standard_library
# standard_library.install_aliases()
# from builtins import str
-import logging
import re
import os
import subprocess
-from datetime import datetime
-import time
from biomaj_download.download.interface import DownloadInterface
class RSYNCDownload(DownloadInterface):
'''
- Base class to download files from rsyncc
+ Base class to download files from rsync
protocol = rsync
server =
remote.dir =
@@ -21,18 +18,76 @@ class RSYNCDownload(DownloadInterface):
remote.files =
'''
- def __init__(self, protocol, server, remote_dir):
+ # This is used to forge the command
+ real_protocol = "rsync"
+
+ def __init__(self, server, rootdir):
DownloadInterface.__init__(self)
- logging.debug('Download')
- self.rootdir = remote_dir
- self.protocol = protocol
- if server and remote_dir:
+ self.logger.debug('Download')
+ # If rootdir is not given, we are in local mode. In this case, server
+ # is interpreted as rootdir
+ self.local_mode = not rootdir
+ if not self.local_mode:
self.server = server # name of the remote server
- self.remote_dir = remote_dir # directory on the remote server
+ self.rootdir = rootdir # directory on the remote server
+ else:
+ self.server = None
+ self.rootdir = server
+ # give a working directory to run rsync
+ if self.local_mode:
+ try:
+ os.chdir(self.rootdir)
+ except TypeError:
+ self.logger.error("RSYNC:Could not find local dir " + self.rootdir)
+
+ def _append_file_to_download(self, rfile):
+ if 'root' not in rfile or not rfile['root']:
+ rfile['root'] = self.rootdir
+ super(RSYNCDownload, self)._append_file_to_download(rfile)
+
+ def _remote_file_name(self, rfile):
+ # rfile['root'] is set to self.rootdir. We don't use os.path.join
+ # because rfile['name'] may start with /
+ url = rfile['root'] + "/" + rfile['name']
+ if not self.local_mode:
+ url = self.server + ":" + url
+ return url
+
+ def _download(self, file_path, rfile):
+ error = False
+ err_code = ''
+ url = self._remote_file_name(rfile)
+ # Create the rsync command
+ if self.credentials:
+ cmd = str(self.real_protocol) + " " + str(self.credentials) + "@" + url + " " + str(file_path)
else:
- if server:
- self.server = server
- self.remote_dir = ""
+ cmd = str(self.real_protocol) + " " + url + " " + str(file_path)
+ self.logger.debug('RSYNC:RSYNC DOWNLOAD:' + cmd)
+ # Launch the command (we are in offline_dir)
+ try:
+ p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+ stdout, stderr = p.communicate()
+ err_code = p.returncode
+ self.test_stderr_rsync_message(stderr)
+ self.test_stderr_rsync_error(stderr)
+ except ExceptionRsync as e:
+ self.logger.error(str(self.real_protocol) + " error:" + str(e))
+ if err_code != 0:
+ self.logger.error('Error while downloading ' + rfile["name"] + ' - ' + str(err_code))
+ error = True
+ return(error)
+
+ def test_stderr_rsync_error(self, stderr):
+ stderr = str(stderr.decode('utf-8'))
+ if "rsync error" in str(stderr):
+ reason = stderr.split(str(self.real_protocol) + " error:")[1].split("\n")[0]
+ raise ExceptionRsync(reason)
+
+ def test_stderr_rsync_message(self, stderr):
+ stderr = str(stderr.decode('utf-8'))
+ if "rsync:" in str(stderr):
+ reason = stderr.split(str(self.real_protocol) + ":")[1].split("\n")[0]
+ raise ExceptionRsync(reason)
def list(self, directory=''):
'''
@@ -43,18 +98,14 @@ class RSYNCDownload(DownloadInterface):
err_code = None
rfiles = []
rdirs = []
- logging.debug('RSYNC:List')
- # give a working directory to run rsync
- try:
- os.chdir(self.offline_dir)
- except TypeError:
- logging.error("RSYNC:list:Could not find offline_dir")
- if self.remote_dir and self.credentials:
- cmd = str(self.protocol) + " --list-only " + str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(directory)
- elif (self.remote_dir and not self.credentials):
- cmd = str(self.protocol) + " --list-only " + str(self.server) + ":" + str(self.remote_dir) + str(directory)
- else: # Local rsync for unitest
- cmd = str(self.protocol) + " --list-only " + str(self.server) + str(directory)
+ self.logger.debug('RSYNC:List')
+ if self.local_mode:
+ remote = str(self.rootdir) + str(directory)
+ else:
+ remote = str(self.server) + ":" + str(self.rootdir) + str(directory)
+ if self.credentials:
+ remote = str(self.credentials) + "@" + remote
+ cmd = str(self.real_protocol) + " --list-only " + remote
try:
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
list_rsync, err = p.communicate()
@@ -62,9 +113,9 @@ class RSYNCDownload(DownloadInterface):
self.test_stderr_rsync_error(err)
err_code = p.returncode
except ExceptionRsync as e:
- logging.error("RsyncError:" + str(e))
+ self.logger.error("RsyncError:" + str(e))
if err_code != 0:
- logging.error('Error while listing ' + str(err_code))
+ self.logger.error('Error while listing ' + str(err_code))
return(rfiles, rdirs)
list_rsync = str(list_rsync.decode('utf-8'))
lines = list_rsync.rstrip().split("\n")
@@ -92,97 +143,6 @@ class RSYNCDownload(DownloadInterface):
return (rfiles, rdirs)
- def download(self, local_dir, keep_dirs=True):
- '''
- Download remote files to local_dir
-
- :param local_dir: Directory where files should be downloaded
- :type local_dir: str
- :param keep_dirs: keep file name directory structure or copy file in local_dir directly
- :param keep_dirs: bool
- :return: list of downloaded files
- '''
-
- logging.debug('RSYNC:Download')
- nb_files = len(self.files_to_download)
- cur_files = 1
- # give a working directory to run rsync
- try:
- os.chdir(self.offline_dir)
- except TypeError:
- logging.error("RSYNC:list:Could not find offline_dir")
- for rfile in self.files_to_download:
- if self.kill_received:
- raise Exception('Kill request received, exiting')
- file_dir = local_dir
- if 'save_as' not in rfile or rfile['save_as'] is None:
- rfile['save_as'] = rfile['name']
- if keep_dirs:
- file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
- if re.match(r'\S*\/$', file_dir):
- file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
- else:
- file_path = file_dir + os.path.basename(rfile['save_as'])
- # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
- if not os.path.exists(file_dir):
- os.makedirs(file_dir)
-
- logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
- logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
- cur_files += 1
- start_time = datetime.now()
- start_time = time.mktime(start_time.timetuple())
- error = self.rsync_download(file_path, rfile['name'])
- if error:
- rfile['download_time'] = 0
- rfile['error'] = True
- raise Exception("RSYNC:Download:Error:" + rfile['root'] + '/' + rfile['name'])
- end_time = datetime.now()
- end_time = time.mktime(end_time.timetuple())
- rfile['download_time'] = end_time - start_time
- self.set_permissions(file_path, rfile)
- return(self.files_to_download)
-
- def rsync_download(self, file_path, file_to_download):
- error = False
- err_code = ''
- logging.debug('RSYNC:RSYNC DOwNLOAD')
- # give a working directory to run rsync
- try:
- os.chdir(self.offline_dir)
- except TypeError:
- logging.error("RSYNC:list:Could not find offline_dir")
- try:
- if self.remote_dir and self.credentials: # download on server
- cmd = str(self.protocol) + " " + str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(file_to_download) + " " + str(file_path)
- elif self.remote_dir and not self.credentials:
- cmd = str(self.protocol) + " " + str(self.server) + ":" + str(self.remote_dir) + str(file_to_download) + " " + str(file_path)
- else: # Local rsync for unitest
- cmd = str(self.protocol) + " " + str(self.server) + str(file_to_download) + " " + str(file_path)
- p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
- stdout, stderr = p.communicate()
- err_code = p.returncode
- self.test_stderr_rsync_message(stderr)
- self.test_stderr_rsync_error(stderr)
- except ExceptionRsync as e:
- logging.error("RsyncError:" + str(e))
- if err_code != 0:
- logging.error('Error while downloading ' + file_to_download + ' - ' + str(err_code))
- error = True
- return(error)
-
- def test_stderr_rsync_error(self, stderr):
- stderr = str(stderr.decode('utf-8'))
- if "rsync error" in str(stderr):
- reason = stderr.split(str(self.protocol) + " error:")[1].split("\n")[0]
- raise ExceptionRsync(reason)
-
- def test_stderr_rsync_message(self, stderr):
- stderr = str(stderr.decode('utf-8'))
- if "rsync:" in str(stderr):
- reason = stderr.split(str(self.protocol) + ":")[1].split("\n")[0]
- raise ExceptionRsync(reason)
-
class ExceptionRsync(Exception):
def __init__(self, exception_reason):
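
The refactor above splits source construction (_remote_file_name) from command execution (_download): the source is root/name, prefixed with "server:" unless in local mode, and with "credentials@" when credentials are set. A condensed sketch of the resulting command shape, under those same assumptions (a standalone illustration, not the actual class):

    # Condensed illustration of the rsync command built by
    # _remote_file_name and _download above; server=None models
    # local mode.
    def rsync_command(server, root, name, dest, credentials=None):
        # os.path.join is avoided because name may start with "/"
        url = root + "/" + name
        if server:
            url = server + ":" + url
        if credentials:
            url = credentials + "@" + url
        return "rsync " + url + " " + dest

    print(rsync_command("example.org", "/pub/db", "test2.fasta",
                        "/tmp/offline", credentials="user"))
    # -> rsync user@example.org:/pub/db/test2.fasta /tmp/offline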
=====================================
biomaj_download/downloadservice.py
=====================================
@@ -13,10 +13,9 @@ import pika
from flask import Flask
from flask import jsonify
-from biomaj_download.download.ftp import FTPDownload
-from biomaj_download.download.http import HTTPDownload
+from biomaj_download.download.curl import CurlDownload
from biomaj_download.download.direct import DirectFTPDownload
-from biomaj_download.download.direct import DirectHttpDownload
+from biomaj_download.download.direct import DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
from biomaj_download.message import downmessage_pb2
from biomaj_download.download.rsync import RSYNCDownload
@@ -134,24 +133,24 @@ class DownloadService(object):
protocol_options={}):
protocol = downmessage_pb2.DownloadFile.Protocol.Value(protocol_name.upper())
downloader = None
- if protocol in [0, 1]:
- downloader = FTPDownload(protocol_name, server, remote_dir)
- if protocol in [2, 3]:
- downloader = HTTPDownload(protocol_name, server, remote_dir, http_parse)
- if protocol == 7:
- downloader = LocalDownload(remote_dir)
- if protocol == 4:
- downloader = DirectFTPDownload('ftp', server, '/')
- if protocol == 10:
+ if protocol in [0, 1]: # FTP, SFTP
+ downloader = CurlDownload(protocol_name, server, remote_dir)
+ if protocol in [2, 3]: # HTTP, HTTPS (could be factored with previous case)
+ downloader = CurlDownload(protocol_name, server, remote_dir, http_parse)
+ if protocol == 4: # DirectFTP
+ downloader = DirectFTPDownload("ftp", server, '/')
+ if protocol == 5: # DirectHTTP
+ downloader = DirectHTTPDownload("http", server, '/')
+ if protocol == 6: # DirectHTTPS
+ downloader = DirectHTTPDownload("https", server, '/')
+ if protocol == 10: # DirectFTPS
downloader = DirectFTPDownload('ftps', server, '/')
- if protocol == 5:
- downloader = DirectHttpDownload('http', server, '/')
- if protocol == 6:
- downloader = DirectHttpDownload('https', server, '/')
- if protocol == 8:
- downloader = RSYNCDownload('rsync', server, remote_dir)
- if protocol == 9:
- downloader = IRODSDownload('irods', server, remote_dir)
+ if protocol == 7: # Local
+ downloader = LocalDownload(remote_dir)
+ if protocol == 8: # RSYNC
+ downloader = RSYNCDownload(server, remote_dir)
+ if protocol == 9: # iRODS
+ downloader = IRODSDownload(server, remote_dir)
if downloader is None:
return None
@@ -182,11 +181,13 @@ class DownloadService(object):
if save_as:
downloader.set_save_as(save_as)
+
if param:
downloader.set_param(param)
downloader.set_server(server)
+ # Set the name of the BioMAJ protocol to which we respond.
downloader.set_protocol(protocol_name)
if protocol_options is not None:
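
For reference, the reordered dispatch above maps the downmessage_pb2 protocol enum values to downloaders as follows (values taken from the comments in the diff; the dict form is purely illustrative, the service keeps the if-chain):

    PROTOCOLS = {
        0: "FTP -> CurlDownload",
        1: "SFTP -> CurlDownload",
        2: "HTTP -> CurlDownload (with http_parse)",
        3: "HTTPS -> CurlDownload (with http_parse)",
        4: "DirectFTP -> DirectFTPDownload('ftp', ...)",
        5: "DirectHTTP -> DirectHTTPDownload('http', ...)",
        6: "DirectHTTPS -> DirectHTTPDownload('https', ...)",
        7: "Local -> LocalDownload",
        8: "RSYNC -> RSYNCDownload",
        9: "iRODS -> IRODSDownload",
        10: "DirectFTPS -> DirectFTPDownload('ftps', ...)",
    }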
=====================================
setup.py
=====================================
@@ -22,7 +22,7 @@ config = {
'url': 'http://biomaj.genouest.org',
'download_url': 'http://biomaj.genouest.org',
'author_email': 'olivier.sallou@irisa.fr',
- 'version': '3.0.27',
+ 'version': '3.1.0',
'classifiers': [
# How mature is this project? Common values are
# 3 - Alpha
=====================================
tests/biomaj_tests.py
=====================================
@@ -1,33 +1,25 @@
-from nose.tools import *
+"""
+Note that tests with attributes 'network' and 'local_irods' are skipped in CI.
+"""
from nose.plugins.attrib import attr
import json
import shutil
import os
-import sys
import tempfile
import logging
-import copy
import stat
-import time
from mock import patch
-from optparse import OptionParser
-
-
from biomaj_core.config import BiomajConfig
from biomaj_core.utils import Utils
-from biomaj_download.download.ftp import FTPDownload
-from biomaj_download.download.direct import DirectFTPDownload, DirectHttpDownload
-from biomaj_download.download.http import HTTPDownload, HTTPParse
+from biomaj_download.download.curl import CurlDownload, HTTPParse
+from biomaj_download.download.direct import DirectFTPDownload, DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
-from biomaj_download.download.downloadthreads import DownloadThread
from biomaj_download.download.rsync import RSYNCDownload
from biomaj_download.download.protocolirods import IRODSDownload
-import pprint
-
import unittest
class UtilsForTest():
@@ -263,7 +255,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.utils.clean()
def test_http_list(self):
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
@@ -271,7 +263,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_list_dateregexp(self):
#self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
@@ -287,7 +279,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
-1
)
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
@@ -304,7 +296,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.config.get('http.group.file.date_format', None),
int(self.config.get('http.group.file.size'))
)
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
@@ -313,7 +305,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
print(str(file_list))
httpd.match([r'^README$'], file_list, dir_list)
@@ -323,7 +315,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download_in_subdir(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
- httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
+ httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^dists/README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
@@ -331,6 +323,65 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.assertTrue(len(httpd.files_to_download) == 1)
+@attr('network')
+@attr('https')
+class TestBiomajHTTPSDownload(unittest.TestCase):
+ """
+ Test HTTPS downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_download(self):
+ self.utils = UtilsForTest()
+ self.http_parse = HTTPParse(
+ "<a[\s]+href=\"([\w\-\.]+\">[\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
+ "<a[\s]+href=\"[\w\-\.]+\">([\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
+ 1,
+ 2,
+ 1,
+ 2,
+ None,
+ 3
+ )
+ self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
+ httpd = CurlDownload('https', 'mirrors.edge.kernel.org', '/pub/software/scm/git/debian/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^git-core-0.99.6.tar.gz$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+
+@attr('network')
+@attr('sftp')
+class TestBiomajSFTPDownload(unittest.TestCase):
+ """
+ Test SFTP downloader
+ """
+
+ PROTOCOL = "ftps"
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_download(self):
+ sftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
+ sftpd.set_credentials("demo:password")
+ (file_list, dir_list) = sftpd.list()
+ sftpd.match([r'^readme.txt$'], file_list, dir_list)
+ sftpd.download(self.utils.data_dir)
+ sftpd.close()
+ self.assertTrue(len(sftpd.files_to_download) == 1)
+
+
@attr('directftp')
@attr('network')
class TestBiomajDirectFTPDownload(unittest.TestCase):
@@ -411,7 +462,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_http_list(self):
file_list = ['/debian/README.html']
- ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
fday = ftpd.files_to_download[0]['day']
fmonth = ftpd.files_to_download[0]['month']
@@ -424,7 +475,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download(self):
file_list = ['/debian/README.html']
- ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
(file_list, dir_list) = ftpd.list()
ftpd.download(self.utils.data_dir, False)
@@ -433,7 +484,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_get_params_save_as(self):
file_list = ['/get']
- ftpd = DirectHttpDownload('http', 'httpbin.org', '')
+ ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
@@ -449,7 +500,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
@attr('test')
def test_download_save_as(self):
file_list = ['/debian/README.html']
- ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
ftpd.save_as = 'test.html'
(file_list, dir_list) = ftpd.list()
@@ -460,7 +511,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_post_params(self):
#file_list = ['/debian/README.html']
file_list = ['/post']
- ftpd = DirectHttpDownload('http', 'httpbin.org', '')
+ ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
@@ -489,19 +540,19 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.utils.clean()
def test_ftp_list(self):
- ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
+ ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) > 1)
@attr('test')
def test_download(self):
- ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
+ ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
# This test fails because the zip file is fake. We intercept the failure
# and continue.
- # See test_download_skip_uncompress_checks
+ # See test_download_skip_check_uncompress
try:
ftpd.download(self.utils.data_dir)
except Exception:
@@ -511,9 +562,9 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
ftpd.close()
- def test_download_skip_checks_uncompress(self):
+ def test_download_skip_check_uncompress(self):
# This test is similar to test_download but we skip test of zip file.
- ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
+ ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
ftpd.set_options(dict(skip_check_uncompress=True))
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
@@ -522,7 +573,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
def test_download_in_subdir(self):
- ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
+ ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
(file_list, dir_list) = ftpd.list()
try:
ftpd.match([r'^doc/mailing-lists.txt$'], file_list, dir_list)
@@ -534,20 +585,20 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 1)
def test_download_or_copy(self):
- ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
+ ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd.files_to_download = [
{'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test2', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
- ]
+ ]
available_files = [
{'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test12', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test3', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 20},
{'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
- ]
+ ]
ftpd.download_or_copy(available_files, '/biomaj', False)
ftpd.close()
self.assertTrue(len(ftpd.files_to_download)==2)
@@ -566,7 +617,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(release['day']=='12')
def test_ms_server(self):
- ftpd = FTPDownload("ftp", "test.rebex.net", "/")
+ ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match(["^readme.txt$"], file_list, dir_list)
@@ -579,7 +630,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
Test setting tcp_keepalive (it probably doesn't change anything here but
we test that there is no obvious mistake in the code).
"""
- ftpd = FTPDownload("ftp", "test.rebex.net", "/")
+ ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_options(dict(tcp_keepalive=10))
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
@@ -604,14 +655,14 @@ class TestBiomajFTPSDownload(unittest.TestCase):
self.utils.clean()
def test_ftps_list(self):
- ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
+ ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) == 1)
def test_download(self):
- ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
+ ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^readme.txt$'], file_list, dir_list)
@@ -624,7 +675,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
- ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
+ ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
@@ -636,7 +687,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
- ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
+ ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
@@ -651,7 +702,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
- ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
+ ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
curdir = os.path.dirname(os.path.realpath(__file__))
cert_file = os.path.join(curdir, "caert.demo.wftpserver.com.pem")
ftpd.set_options(dict(ssl_verifyhost="False", ssl_server_cert=cert_file))
@@ -672,7 +723,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
def setUp(self):
self.utils = UtilsForTest()
- self.curdir = os.path.dirname(os.path.realpath(__file__))
+ self.curdir = os.path.dirname(os.path.realpath(__file__)) + '/'
self.examples = os.path.join(self.curdir,'bank') + '/'
BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
@@ -680,40 +731,34 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.utils.clean()
def test_rsync_list(self):
- rsyncd = RSYNCDownload('rsync', self.examples, "")
- rsyncd.set_credentials(None)
- rsyncd.set_offline_dir(self.utils.data_dir)
+ rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
self.assertTrue(len(files_list) != 0)
def test_rsync_match(self):
- rsyncd = RSYNCDownload('rsync', self.examples, "")
- rsyncd.set_credentials(None)
- rsyncd.set_offline_dir(self.utils.data_dir)
+ rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], files_list, dir_list, prefix='', submatch=False)
self.assertTrue(len(rsyncd.files_to_download) != 0)
def test_rsync_download(self):
- rsyncd = RSYNCDownload('rsync', self.examples, "")
- rsyncd.set_credentials(None)
- rsyncd.set_offline_dir(self.utils.data_dir)
- error = rsyncd.rsync_download(self.utils.data_dir, "test2.fasta")
- self.assertTrue(error == 0)
-
+ rsyncd = RSYNCDownload(self.examples, "")
+ rfile = {
+ "name": "test2.fasta",
+ "root": self.examples
+ }
+ error = rsyncd._download(self.utils.data_dir, rfile)
+ self.assertFalse(error)
def test_rsync_general_download(self):
- rsyncd = RSYNCDownload('rsync', self.examples, "")
- rsyncd.set_credentials(None)
- rsyncd.set_offline_dir(self.utils.data_dir)
+ rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'],files_list,dir_list, prefix='')
download_files=rsyncd.download(self.curdir)
self.assertTrue(len(download_files)==1)
def test_rsync_download_or_copy(self):
- rsyncd = RSYNCDownload('rsync', self.examples, "")
- rsyncd.set_offline_dir(self.utils.data_dir)
+ rsyncd = RSYNCDownload(self.examples, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
files_to_download_prev = rsyncd.files_to_download
@@ -721,8 +766,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.assertTrue(files_to_download_prev != rsyncd.files_to_download)
def test_rsync_download_in_subdir(self):
- rsyncd = RSYNCDownload('rsync', self.curdir+'/', "")
- rsyncd.set_offline_dir(self.curdir+'/')
+ rsyncd = RSYNCDownload(self.curdir, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^/bank/test*'], file_list, dir_list, prefix='')
rsyncd.download(self.utils.data_dir)
@@ -732,7 +776,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
class iRodsResult(object):
def __init__(self, collname, dataname, datasize, owner, modify):
- self.Collname = 'tests/'
+ self.Collname = 'tests/'
self.Dataname = 'test.fasta.gz'
self.Datasize = 45
self.Dataowner_name = 'biomaj'
@@ -824,8 +868,22 @@ class TestBiomajIRODSDownload(unittest.TestCase):
initialize_mock.return_value=mock_session.configure()
query_mock.return_value = mock_session.query(None,None,None,None,None)
cleanup_mock.return_value = mock_session.cleanup()
- irodsd = IRODSDownload('irods', self.examples, "")
- irodsd.set_credentials(None)
- irodsd.set_offline_dir(self.utils.data_dir)
+ irodsd = IRODSDownload(self.examples, "")
(files_list, dir_list) = irodsd.list()
self.assertTrue(len(files_list) != 0)
+
+ @attr('local_irods')
+ def test_irods_download(self):
+ # To run this test, you need an iRODS server on localhost (default
+ # port, user 'rods', password 'rods'), and populate a zone
+ # /tempZone/home/rods with a file that matches r'^test.*\.gz$' (for
+ # instance, by copying tests/bank/test/test.fasta.gz).
+ irodsd = IRODSDownload("localhost", "/tempZone/home/rods")
+ irodsd.set_param(dict(
+ user='rods',
+ password='rods',
+ ))
+ (file_list, dir_list) = irodsd.list()
+ irodsd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
+ irodsd.download(self.utils.data_dir)
+ self.assertTrue(len(irodsd.files_to_download) == 1)
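
Since the new test is tagged 'local_irods', the nose attrib plugin can also select it on its own once such a server is set up as described in the comment above:

    nosetests -a 'local_irods' tests/biomaj_tests.py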
View it on GitLab: https://salsa.debian.org/med-team/biomaj3-download/commit/d6208921c34a815cbaa971585355885da3b9661d