[med-svn] [biomaj3-download] 01/02: New upstream version 3.0.13
Olivier Sallou
osallou at debian.org
Thu Aug 17 14:44:56 UTC 2017
This is an automated email from the git hooks/post-receive script.
osallou pushed a commit to branch master
in repository biomaj3-download.
commit 26a35a29dd8561a9c1ea9ce8f0b14dd8ce1b6c00
Author: Olivier Sallou <osallou at debian.org>
Date: Thu Aug 17 12:32:20 2017 +0000
New upstream version 3.0.13
---
.gitignore | 72 +++
.travis.yml | 29 +
CHANGES.txt | 31 +
LICENSE | 662 ++++++++++++++++++++++
MANIFEST.in | 2 +
README.md | 43 ++
bin/biomaj_download_consumer.py | 49 ++
biomaj_download/__init__.py | 0
biomaj_download/biomaj_download_web.py | 161 ++++++
biomaj_download/download/__init__.py | 0
biomaj_download/download/direct.py | 283 ++++++++++
biomaj_download/download/downloadthreads.py | 53 ++
biomaj_download/download/ftp.py | 322 +++++++++++
biomaj_download/download/http.py | 166 ++++++
biomaj_download/download/interface.py | 273 +++++++++
biomaj_download/download/localcopy.py | 82 +++
biomaj_download/download/rsync.py | 192 +++++++
biomaj_download/downloadclient.py | 257 +++++++++
biomaj_download/downloadservice.py | 483 ++++++++++++++++
biomaj_download/message/__init__.py | 0
biomaj_download/message/message.proto | 122 ++++
biomaj_download/message/message_pb2.py | 844 ++++++++++++++++++++++++++++
biomaj_download/mimes-bio.txt | 18 +
biomaj_download/wsgi.py | 4 +
config.yml | 51 ++
gunicorn_conf.py | 3 +
requirements.txt | 16 +
setup.cfg | 2 +
setup.py | 67 +++
tests/alu.properties | 43 ++
tests/bank/process/test.sh | 11 +
tests/bank/test.fasta.gz | Bin 0 -> 45 bytes
tests/bank/test2.fasta | 2 +
tests/bank/test_100.txt | 1 +
tests/biomaj_tests.py | 554 ++++++++++++++++++
tests/computed.properties | 44 ++
tests/computed2.properties | 45 ++
tests/computederror.properties | 43 ++
tests/directhttp.properties | 41 ++
tests/error.properties | 43 ++
tests/global.properties | 123 ++++
tests/local.properties | 41 ++
tests/locallist.properties | 44 ++
tests/localprocess.properties | 100 ++++
tests/multi.properties | 60 ++
tests/sub1.properties | 43 ++
tests/sub2.properties | 41 ++
tests/test.fasta.gz | Bin 0 -> 45 bytes
tests/testhttp.properties | 44 ++
49 files changed, 5610 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..754c2a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,72 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Coveralls
+.coveralls.yml
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# PyCharm
+.idea
+
+# Vim
+.viminfo
+# Less history
+.lesshst
+
+.dbshell
+.emacs*
+.ipython
+.mongo*
+#*.properties
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..bb2b8a2
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,29 @@
+language: python
+sudo: false
+python:
+ - "2.7"
+ - "3.4"
+ - "3.5"
+services:
+ - redis
+# Apply only on main branches
+branches:
+ except:
+ - /^feature.*$/
+# command to install dependencies
+#before_install:
+# - "sudo apt-get update -qq"
+# - "sudo apt-get install -qq libldap2-dev libsasl2-dev"
+install:
+ - "pip install -r requirements.txt"
+ - "pip install coverage"
+ - "pip install python-coveralls"
+ - "python setup.py -q install"
+# - "echo data_file=$TRAVIS_BUILD_DIR/.coverage >> .coveragerc"
+# command to run tests
+script:
+ - python setup.py test
+ - flake8 --ignore E501 biomaj_download/*.py biomaj_download/download
+#after_success:
+# - coveralls
+
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 0000000..326ea02
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,31 @@
+3.0.13:
+ In rate limiting, add progress vs total of download
+ Fix rate limiting submission
+3.0.12:
+ Add retry in case of session creation failure
+ disable web thread logging
+3.0.11:
+ Display progress of download by percent of downloads
+ In case of contact error in downloadclient, retry connection
+3.0.10:
+ Feature #3: Add rate limiting option to limit number of parallel downloads for a client
+3.0.9:
+ Add host in prometheus stats
+ Fix #2: allow setting http.group.file.size or http.group.file.date to -1 if not available in http(s) page for regexp
+3.0.8:
+ Fix prometheus stats
+ Add consul supervision
+3.0.7:
+ Change size type to int64
+3.0.6:
+ Fix download_or_copy to avoid downloading a file existing in a previous production directory
+3.0.4:
+ Fixes on messages
+3.0.3:
+ Fix management of timeout leading to a crash when using biomaj.download parameter.
+3.0.2:
+ set rabbitmq parameter optional
+3.0.1:
+ add missing README etc. in package
+3.0.0:
+ move download management out of biomaj main package
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..cebe035
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,662 @@
+GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5dcc15a
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.txt *.md
+recursive-include biomaj_core *.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c9e1fac
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# About
+
+Microservice to manage the downloads of biomaj.
+
+A protobuf interface is available in biomaj_download/message/message_pb2.py to exchange messages between BioMAJ and the download service.
+Messages go through RabbitMQ (to be installed).
+
+# Protobuf
+
+To compile protobuf, in biomaj_download/message:
+
+ protoc --python_out=. message.proto
+
+# Development
+
+ flake8 biomaj_download/\*.py biomaj_download/download
+
+
+# Run
+
+## Message consumer:
+
+ export BIOMAJ_CONFIG=path_to_config.yml
+ python bin/biomaj_download_consumer.py
+
+## Web server
+
+If the package is installed via pip, you need a file named *gunicorn_conf.py* somewhere on the local server, containing:
+
+ def worker_exit(server, worker):
+ from prometheus_client import multiprocess
+ multiprocess.mark_process_dead(worker.pid)
+
+If you cloned the repository and installed it via python setup.py install, just refer to the *gunicorn_conf.py* in the cloned repository.
+
+
+ export BIOMAJ_CONFIG=path_to_config.yml
+ rm -rf ..path_to/godocker-prometheus-multiproc
+ mkdir -p ..path_to/godocker-prometheus-multiproc
+ export prometheus_multiproc_dir=..path_to/godocker-prometheus-multiproc
+ gunicorn biomaj_download.biomaj_download_web:app
+
+Web processes should be behind a proxy/load balancer, API base url /api/download
diff --git a/bin/biomaj_download_consumer.py b/bin/biomaj_download_consumer.py
new file mode 100644
index 0000000..758a862
--- /dev/null
+++ b/bin/biomaj_download_consumer.py
@@ -0,0 +1,49 @@
+'''
+Message consumer for download requests
+'''
+
+import os
+import logging
+
+import requests
+import yaml
+import consul
+
+from biomaj_download.downloadservice import DownloadService
+from biomaj_core.utils import Utils
+
+# Path of the YAML configuration file: defaults to 'config.yml' and is
+# overridden by the BIOMAJ_CONFIG environment variable when it is set.
+config_file = 'config.yml'
+if 'BIOMAJ_CONFIG' in os.environ:
+ config_file = os.environ['BIOMAJ_CONFIG']
+
+# Module-level configuration, read by on_download below.
+# NOTE(review): yaml.load is called without an explicit Loader; if the config
+# file can come from an untrusted source, yaml.safe_load would be safer - confirm.
+config = None
+with open(config_file, 'r') as ymlfile:
+ config = yaml.load(ymlfile)
+ # Presumably applies deployment overrides to the loaded config - verify in biomaj_core.
+ Utils.service_config_override(config)
+
+
+def on_download(bank, downloaded_files):
+    '''
+    Download callback: send one metric per downloaded file to the web
+    service metrics endpoint (config['web']['local_endpoint']).
+
+    Nothing is sent when the 'prometheus' key is present in the config and
+    is falsy. When no file was downloaded, a single error metric is sent
+    for the bank.
+
+    :param bank: name of the bank the files belong to
+    :type bank: str
+    :param downloaded_files: downloaded files info, each with either a truthy
+                             'error' key or 'size' and 'download_time' keys
+    :type downloaded_files: list
+    '''
+    if 'prometheus' in config and not config['prometheus']:
+        return
+    metrics = []
+    if not downloaded_files:
+        # Append the metric itself (the list used to be appended to itself,
+        # so the error was never reported).
+        metrics.append({'bank': bank, 'error': 1})
+    else:
+        for downloaded_file in downloaded_files:
+            metric = {'bank': bank}
+            if 'error' in downloaded_file and downloaded_file['error']:
+                metric['error'] = 1
+            else:
+                metric['size'] = downloaded_file['size']
+                metric['download_time'] = downloaded_file['download_time']
+            if 'hostname' in config['web']:
+                metric['host'] = config['web']['hostname']
+            metrics.append(metric)
+    # Posted in both cases so that the "nothing downloaded" error is recorded too.
+    requests.post(config['web']['local_endpoint'] + '/api/download/metrics', json=metrics)
+
+
+# Script entry point: build the download service from the same config file,
+# register on_download as its download callback, then start it.
+# NOTE(review): wait_for_messages() presumably blocks while consuming
+# download requests - confirm in DownloadService.
+download = DownloadService(config_file)
+download.on_download_callback(on_download)
+download.supervise()
+download.wait_for_messages()
diff --git a/biomaj_download/__init__.py b/biomaj_download/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj_download/biomaj_download_web.py b/biomaj_download/biomaj_download_web.py
new file mode 100644
index 0000000..3f036b7
--- /dev/null
+++ b/biomaj_download/biomaj_download_web.py
@@ -0,0 +1,161 @@
+'''
+Web interface to query list/download status
+Manage sessions and metrics
+'''
+
+import ssl
+import os
+
+import yaml
+from flask import Flask
+from flask import jsonify
+from flask import request
+from prometheus_client import Counter
+from prometheus_client.exposition import generate_latest
+from prometheus_client import multiprocess
+from prometheus_client import CollectorRegistry
+import consul
+
+from biomaj_download.message import message_pb2
+from biomaj_download.downloadservice import DownloadService
+
+from biomaj_core.utils import Utils
+
+# Flask application exposing the routes defined below.
+app = Flask(__name__)
+
+# Prometheus counters updated by the add_metrics route. Download and error
+# counts are labelled per bank; size and time are labelled per bank and host.
+download_metric = Counter("biomaj_download_total", "Bank total download.", ['bank'])
+download_error_metric = Counter("biomaj_download_errors", "Bank total download errors.", ['bank'])
+download_size_metric = Counter("biomaj_download_file_size", "Bank download file size in bytes.", ['bank', 'host'])
+download_time_metric = Counter("biomaj_download_file_time", "Bank download file time in seconds.", ['bank', 'host'])
+
+# Path of the YAML configuration file: defaults to 'config.yml' and is
+# overridden by the BIOMAJ_CONFIG environment variable when it is set.
+config_file = 'config.yml'
+if 'BIOMAJ_CONFIG' in os.environ:
+ config_file = os.environ['BIOMAJ_CONFIG']
+
+# NOTE(review): yaml.load is called without an explicit Loader; if the config
+# file can come from an untrusted source, yaml.safe_load would be safer - confirm.
+config = None
+with open(config_file, 'r') as ymlfile:
+ config = yaml.load(ymlfile)
+ # Presumably applies deployment overrides to the loaded config - verify in biomaj_core.
+ Utils.service_config_override(config)
+
+
+def consul_declare(config):
+    '''
+    Register this web service and its HTTP health check in Consul.
+
+    Does nothing when config['consul']['host'] is empty.
+
+    :param config: service configuration, with 'consul' and 'web' sections
+    :type config: dict
+    '''
+    consul_conf = config['consul']
+    if not consul_conf['host']:
+        return
+    web_conf = config['web']
+    agent = consul.Consul(host=consul_conf['host']).agent
+    agent.service.register(
+        'biomaj-download',
+        service_id=consul_conf['id'],
+        address=web_conf['hostname'],
+        port=web_conf['port'],
+        tags=['biomaj']
+    )
+    # The health check polls the ping route every 20 (interval value passed to Consul).
+    ping_url = 'http://' + web_conf['hostname'] + ':' + str(web_conf['port']) + '/api/download'
+    health_check = consul.Check.http(url=ping_url, interval=20)
+    agent.check.register(consul_conf['id'] + '_check', check=health_check, service_id=consul_conf['id'])
+
+
+consul_declare(config)
+
+
+@app.route('/api/download', methods=['GET'])
+def ping():
+    '''
+    Liveness route, answers a constant JSON message
+    '''
+    answer = {'msg': 'pong'}
+    return jsonify(answer)
+
+
+@app.route('/metrics', methods=['GET'])
+def metrics():
+    '''
+    Expose the metrics gathered by a multiprocess collector in Prometheus text format
+    '''
+    collector_registry = CollectorRegistry()
+    multiprocess.MultiProcessCollector(collector_registry)
+    exposition = generate_latest(collector_registry)
+    return exposition
+
+
+@app.route('/api/download/metrics', methods=['POST'])
+def add_metrics():
+    '''
+    Update the download counters from the JSON body of the request.
+
+    Expects a JSON request with an array of {'bank': 'bank_name', 'host': 'hostname', 'error': 'error_message', 'size': size_of_download, 'download_time': seconds_to_download}
+    Entries with a truthy 'error' only increment the error counter; the
+    host label defaults to 'na' when 'host' is missing.
+    '''
+    for entry in request.get_json():
+        host = entry['host'] if 'host' in entry else 'na'
+        if 'error' in entry and entry['error']:
+            download_error_metric.labels(entry['bank']).inc()
+            continue
+        download_metric.labels(entry['bank']).inc()
+        download_size_metric.labels(entry['bank'], host).inc(int(entry['size']))
+        download_time_metric.labels(entry['bank'], host).inc(int(entry['download_time']))
+    return jsonify({'msg': 'OK'})
+
+
+@app.route('/api/download/status/list/<bank>/<session>')
+def list_status(bank, session):
+    '''
+    Tell whether the listing request of a bank session is over
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    # Minimal message identifying the bank session to query
+    file_info = message_pb2.DownloadFile()
+    file_info.bank = bank
+    file_info.session = session
+    file_info.local_dir = '/tmp'
+    return jsonify({'status': service.list_status(file_info)})
+
+
+@app.route('/api/download/status/download/<bank>/<session>')
+def download_status(bank, session):
+    '''
+    Report download progress and error counts of a bank session. Progress counts both successful and errored downloads.
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    # Minimal message identifying the bank session to query
+    file_info = message_pb2.DownloadFile()
+    file_info.bank = bank
+    file_info.session = session
+    file_info.local_dir = '/tmp'
+    counters = service.download_status(file_info)
+    (progress, errors) = counters
+    return jsonify({'progress': progress, 'errors': errors})
+
+
+@app.route('/api/download/error/download/<bank>/<session>')
+def download_error(bank, session):
+    '''
+    Return the download errors recorded for a bank session
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    # Minimal message identifying the bank session to query
+    file_info = message_pb2.DownloadFile()
+    file_info.bank = bank
+    file_info.session = session
+    file_info.local_dir = '/tmp'
+    return jsonify({'error': service.download_errors(file_info)})
+
+
+@app.route('/api/download/list/<bank>/<session>')
+def list_result(bank, session):
+    '''
+    Return the file listing of a bank session, as a FileList protobuf serialized string (not decoded)
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    # Minimal message identifying the bank session to query
+    file_info = message_pb2.DownloadFile()
+    file_info.bank = bank
+    file_info.session = session
+    file_info.local_dir = '/tmp'
+    listing = service.list_result(file_info, protobuf_decode=False)
+    return jsonify({'files': listing})
+
+
+@app.route('/api/download/session/<bank>', methods=['POST'])
+def create_session(bank):
+    '''
+    Create a session for the bank and return its identifier
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    return jsonify({'session': service._create_session(bank)})
+
+
+@app.route('/api/download/session/<bank>/<session>', methods=['DELETE'])
+def clean_session(bank, session):
+    '''
+    Ask the download service to clean the given bank session
+    '''
+    service = DownloadService(config_file, rabbitmq=False)
+    file_info = message_pb2.DownloadFile()
+    file_info.bank = bank
+    file_info.session = session
+    service.clean(file_info)
+    return jsonify({'msg': 'session cleared'})
+
+
+if __name__ == "__main__":
+    # Standalone run: serve over TLS 1.2 when a certificate is configured,
+    # without an SSL context otherwise.
+    tls_conf = config['tls']
+    web_conf = config['web']
+    context = None
+    if tls_conf['cert']:
+        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
+        context.load_cert_chain(tls_conf['cert'], tls_conf['key'])
+    app.run(host='0.0.0.0', port=web_conf['port'], ssl_context=context, threaded=True, debug=web_conf['debug'])
diff --git a/biomaj_download/download/__init__.py b/biomaj_download/download/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj_download/download/direct.py b/biomaj_download/download/direct.py
new file mode 100644
index 0000000..25855f2
--- /dev/null
+++ b/biomaj_download/download/direct.py
@@ -0,0 +1,283 @@
+import datetime
+import time
+import pycurl
+import os
+import re
+import hashlib
+import sys
+
+from biomaj_download.download.ftp import FTPDownload
+from biomaj_core.utils import Utils
+
+if sys.version_info[0] < 3:
+ from urllib import urlencode
+else:
+ from urllib.parse import urlencode
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+
+class DirectFTPDownload(FTPDownload):
+    '''
+    download a list of files from FTP, no regexp
+    '''
+
+    def __init__(self, protocol, host, rootdir=''):
+        '''
+        :param protocol: protocol used to build the remote url
+        :type protocol: str
+        :param host: remote server name
+        :type host: str
+        :param rootdir: remote root directory
+        :type rootdir: str
+        '''
+        FTPDownload.__init__(self, protocol, host, rootdir)
+        # Local name of the downloaded file, defaults to the remote name (see list)
+        self.save_as = None
+        self.headers = {}
+
+    def set_files_to_download(self, files):
+        '''
+        Initialize the files in list with today as last-modification date.
+        Size is also preset to zero, size will be set after download
+
+        :param files: names of the remote files, a trailing '/' is removed
+        :type files: list of str
+        '''
+        today = datetime.date.today()
+        self.files_to_download = []
+        for file_to_download in files:
+            rfile = {}
+            rfile['root'] = ''
+            rfile['permissions'] = ''
+            rfile['group'] = ''
+            rfile['user'] = ''
+            rfile['size'] = 0
+            rfile['month'] = today.month
+            rfile['day'] = today.day
+            rfile['year'] = today.year
+            if file_to_download.endswith('/'):
+                rfile['name'] = file_to_download[:-1]
+            else:
+                rfile['name'] = file_to_download
+            rfile['hash'] = None
+            # file_to_download is a plain name (str), so it cannot carry its
+            # own 'param' entry: the downloader parameters always apply.
+            # (Indexing the name with ['param'] raised a TypeError for any
+            # file whose name contains "param".)
+            if self.param:
+                rfile['param'] = self.param
+            self.files_to_download.append(rfile)
+
+    def list(self, directory=''):
+        '''
+        FTP protocol does not give us the possibility to get file date from remote url
+
+        :param directory: unused
+        :type directory: str
+        :return: tuple (files to download, empty list of directories)
+        '''
+        for rfile in self.files_to_download:
+            if self.save_as is None:
+                self.save_as = rfile['name']
+            rfile['save_as'] = self.save_as
+        return (self.files_to_download, [])
+
+    def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
+        '''
+        All files to download match, no pattern
+
+        Only file_list is used: it becomes the list of files to download.
+        '''
+        self.files_to_download = file_list
+
+
+class DirectHttpDownload(DirectFTPDownload):
+    '''
+    Download a single file from an HTTP url, no regexp.
+
+    The request is sent with method self.method ('GET' by default, or
+    'POST') and the self.param parameters.
+    '''
+
+    def __init__(self, protocol, host, rootdir=''):
+        '''
+        :param protocol: protocol used to build the remote url
+        :type protocol: str
+        :param host: remote server name
+        :type host: str
+        :param rootdir: remote root directory
+        :type rootdir: str
+        '''
+        DirectFTPDownload.__init__(self, protocol, host, rootdir)
+        self.save_as = None
+        self.method = 'GET'
+        self.param = {}
+
+    def download(self, local_dir, keep_dirs=True):
+        '''
+        Download remote files to local_dir
+
+        :param local_dir: Directory where files should be downloaded
+        :type local_dir: str
+        :param keep_dirs: keep file name directory structure or copy file in local_dir directly
+        :type keep_dirs: bool
+        :return: list of downloaded files
+        :raises Exception: if a kill request was received
+        '''
+        self.logger.debug('DirectHTTP:Download')
+        nb_files = len(self.files_to_download)
+
+        if nb_files > 1:
+            self.files_to_download = []
+            self.logger.error('DirectHTTP accepts only 1 file')
+
+        cur_files = 1
+
+        for rfile in self.files_to_download:
+            if self.kill_received:
+                raise Exception('Kill request received, exiting')
+
+            if not self.save_as:
+                self.save_as = rfile['name']
+            else:
+                rfile['save_as'] = self.save_as
+            file_dir = local_dir
+            if keep_dirs:
+                file_dir = local_dir + os.path.dirname(self.save_as)
+            file_path = file_dir + '/' + os.path.basename(self.save_as)
+
+            # For unit tests only: the workflow creates the directories
+            # beforehand to avoid concurrent creation by several threads.
+            if not os.path.exists(file_dir):
+                os.makedirs(file_dir)
+            self.logger.debug('DirectHTTP:Download:Progress' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'] + ', save as ' + self.save_as)
+            cur_files += 1
+            if 'url' not in rfile:
+                rfile['url'] = self.url
+
+            url = rfile['url'] + rfile['root'] + '/' + rfile['name']
+            if self.method != 'POST':
+                # Parameters of non-POST requests go in the query string.
+                url = url + '?' + urlencode(self.param)
+
+            fp = open(file_path, "wb")
+            curl = pycurl.Curl()
+            # Release the file and the curl handle even when the transfer fails.
+            try:
+                if self.proxy is not None:
+                    curl.setopt(pycurl.PROXY, self.proxy)
+                    if self.proxy_auth is not None:
+                        curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+                # Credentials apply to every method, as in list()
+                # (they used to be sent for POST requests only).
+                if self.credentials is not None:
+                    curl.setopt(pycurl.USERPWD, self.credentials)
+
+                if self.method == 'POST':
+                    # Form data must be provided already urlencoded.
+                    # Sets request method to POST,
+                    # Content-Type header to application/x-www-form-urlencoded
+                    # and data to send in request body.
+                    curl.setopt(pycurl.POSTFIELDS, urlencode(self.param))
+
+                try:
+                    curl.setopt(pycurl.URL, url)
+                except Exception:
+                    curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
+
+                curl.setopt(pycurl.WRITEDATA, fp)
+                start_time = time.mktime(datetime.datetime.now().timetuple())
+                curl.perform()
+                end_time = time.mktime(datetime.datetime.now().timetuple())
+                rfile['download_time'] = end_time - start_time
+            finally:
+                curl.close()
+                fp.close()
+
+            self.logger.debug('downloaded!')
+            rfile['name'] = self.save_as
+            self.set_permissions(file_path, rfile)
+        return self.files_to_download
+
+    def header_function(self, header_line):
+        '''
+        pycurl header callback: record the header in self.headers, under its lowercased name
+
+        :param header_line: one raw header line
+        :type header_line: bytes
+        '''
+        # HTTP standard specifies that headers are encoded in iso-8859-1.
+        # On Python 2, decoding step can be skipped.
+        # On Python 3, decoding step is required.
+        header_line = header_line.decode('iso-8859-1')
+
+        # Header lines include the first status line (HTTP/1.x ...).
+        # We are going to ignore all lines that don't have a colon in them.
+        # This will botch headers that are split on multiple lines...
+        if ':' not in header_line:
+            return
+
+        # Break the header line into header name and value.
+        name, value = header_line.split(':', 1)
+
+        # Remove whitespace that may be present.
+        # Header lines include the trailing newline, and there may be whitespace
+        # around the colon.
+        name = name.strip()
+        value = value.strip()
+
+        # Header names are case insensitive.
+        # Lowercase name here.
+        name = name.lower()
+
+        # Now we can actually record the header name and value.
+        self.headers[name] = value
+
+    def _set_last_modified(self, rfile, value):
+        '''
+        Set hash, day, month and year of rfile from a Last-Modified header value.
+        rfile is left untouched when the value matches none of the known date formats.
+        '''
+        # Sun, 06 Nov 1994
+        res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', value)
+        if res:
+            day, month, year = res.group(2), res.group(3), int(res.group(4))
+        else:
+            # Sunday, 06-Nov-94
+            res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)', value)
+            if res:
+                day, month, year = res.group(2), res.group(3), int(res.group(4))
+                if year < 100:
+                    # Two-digit year: 69-99 means 19xx, 00-68 means 20xx
+                    # (same convention as %y in time.strptime).
+                    year += 1900 if year >= 69 else 2000
+            else:
+                # Sun Nov 6 08:49:37 1994
+                res = re.match(r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', value)
+                if not res:
+                    return
+                day, month, year = res.group(3), res.group(2), int(res.group(4))
+        rfile['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
+        rfile['day'] = int(day)
+        rfile['month'] = Utils.month_to_num(month)
+        rfile['year'] = year
+
+    def list(self, directory=''):
+        '''
+        Try to get file headers to get last_modification and size
+
+        :param directory: unused
+        :type directory: str
+        :return: tuple (files to download, empty list of directories)
+        '''
+        for rfile in self.files_to_download:
+            if self.save_as is None:
+                self.save_as = rfile['name']
+
+            rfile['save_as'] = self.save_as
+
+            self.crl.setopt(pycurl.HEADER, True)
+            if self.credentials is not None:
+                self.crl.setopt(pycurl.USERPWD, self.credentials)
+
+            if self.proxy is not None:
+                self.crl.setopt(pycurl.PROXY, self.proxy)
+                if self.proxy_auth is not None:
+                    self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+            # Headers only, the body is not requested
+            self.crl.setopt(pycurl.NOBODY, True)
+            try:
+                self.crl.setopt(pycurl.URL, self.url + self.rootdir + rfile['name'])
+            except Exception:
+                self.crl.setopt(pycurl.URL, (self.url + self.rootdir + rfile['name']).encode('ascii', 'ignore'))
+
+            output = BytesIO()
+            # lets assign this buffer to pycurl object
+            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
+            self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
+            self.crl.perform()
+
+            # Figure out what encoding was sent with the response, if any.
+            # Check against lowercased header name.
+            encoding = None
+            if 'content-type' in self.headers:
+                content_type = self.headers['content-type'].lower()
+                match = re.search(r'charset=(\S+)', content_type)
+                if match:
+                    encoding = match.group(1)
+            if encoding is None:
+                # Default encoding for HTML is iso-8859-1.
+                # Other content types may have different default encoding,
+                # or in case of binary data, may have no encoding at all.
+                encoding = 'iso-8859-1'
+
+            # lets get the output in a string
+            result = output.getvalue().decode(encoding)
+
+            for line in re.split(r'[\n\r]+', result):
+                # Split on the first colon only: date values contain
+                # colons themselves (hh:mm:ss).
+                parts = line.split(':', 1)
+                if len(parts) < 2:
+                    continue
+                # Header names are case insensitive.
+                name = parts[0].strip().lower()
+                value = parts[1].strip()
+                if name == 'content-length':
+                    rfile['size'] = int(value)
+                elif name == 'last-modified':
+                    self._set_last_modified(rfile, value)
+        return (self.files_to_download, [])
diff --git a/biomaj_download/download/downloadthreads.py b/biomaj_download/download/downloadthreads.py
new file mode 100644
index 0000000..88d6fc3
--- /dev/null
+++ b/biomaj_download/download/downloadthreads.py
@@ -0,0 +1,53 @@
+from builtins import str
+
+import logging
+import threading
+import traceback
+import sys
+
+
+class DownloadThread(threading.Thread):
+ # Worker thread that takes messages from a queue and hands each of them
+ # to ds.local_download, counting the failures in self.error.
+
+ def __init__(self, ds, queue):
+ '''
+ Download thread to download a list of files
+
+ :param ds: object whose local_download(message) method is called for each
+ queued message (presumably the download service - confirm with callers)
+ :param queue: queue of messages to process; get(False) and task_done()
+ are called on it
+ '''
+ threading.Thread.__init__(self)
+ self.queue = queue
+ # Set by stop(); run() as written here never checks it
+ self._stopevent = threading.Event()
+ # Number of failed downloads (None result or exception)
+ self.error = 0
+ self.files_to_download = 0
+ self.ds = ds
+
+ def run(self):
+ # Process messages until the queue is empty, then return.
+ logging.info('Start download thread')
+ try:
+ # Non-blocking get: an empty queue raises and ends the thread
+ message = self.queue.get(False)
+ except Exception:
+ return
+ while message:
+ try:
+ files = self.ds.local_download(message)
+ # A None result is counted as an error
+ if files is None:
+ self.error += 1
+ self.files_to_download += 1
+ except Exception as e:
+ # A failing download is logged and counted, the thread goes on
+ logging.error("Download error: " + str(e))
+ traceback.print_exc(file=sys.stdout)
+ self.error += 1
+ self.queue.task_done()
+ try:
+ message = self.queue.get(False)
+ except Exception:
+ break
+
+ def stop(self):
+ # Only sets the stop event
+ self._stopevent.set()
+
+
+# Lock shared by all instances (class attribute); going by its name it is
+# meant to serialize directory creation - not used in this file, verify callers.
+DownloadThread.MKDIR_LOCK = threading.Lock()
diff --git a/biomaj_download/download/ftp.py b/biomaj_download/download/ftp.py
new file mode 100644
index 0000000..fed6bd0
--- /dev/null
+++ b/biomaj_download/download/ftp.py
@@ -0,0 +1,322 @@
+import pycurl
+import re
+import os
+from datetime import datetime
+import time
+import hashlib
+
+from biomaj_core.utils import Utils
+from biomaj_download.download.interface import DownloadInterface
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+
class FTPDownload(DownloadInterface):
    '''
    Base class to download files from FTP

    protocol=ftp
    server=ftp.ncbi.nih.gov
    remote.dir=/blast/db/FASTA/

    remote.files=^alu.*\\.gz$

    '''

    def __init__(self, protocol, host, rootdir):
        '''
        :param protocol: URL scheme, joined to the host as ``protocol://host``
        :type protocol: str
        :param host: remote server name
        :type host: str
        :param rootdir: remote directory appended to the base url when listing and downloading
        :type rootdir: str
        '''
        DownloadInterface.__init__(self)
        self.logger.debug('Download')
        # Curl handle reused by every list() call, released by close()
        self.crl = pycurl.Curl()
        url = protocol + '://' + host
        self.rootdir = rootdir
        self.url = url
        # Headers recorded by header_function, keyed by lowercased name
        self.headers = {}

    def _append_file_to_download(self, rfile, prefix):
        '''
        Select rfile: set its root, prepend prefix to its name and add it to files_to_download
        '''
        rfile['root'] = self.rootdir
        if prefix != '':
            rfile['name'] = prefix + '/' + rfile['name']
        self.files_to_download.append(rfile)
        self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])

    def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
        '''
        Find files matching patterns. Sets instance variable files_to_download.

        A pattern containing '/' is matched directory by directory: each
        sub directory whose name matches the first pattern element is listed
        and matched against the rest of the pattern. The special pattern
        '**/*' selects every file of the current directory and of all its
        sub directories.

        :param patterns: regexps to match
        :type patterns: list
        :param file_list: list of files to match
        :type file_list: list
        :param dir_list: sub directories in current dir
        :type dir_list: list
        :param prefix: directory prefix
        :type prefix: str
        :param submatch: first call to match, or called from match
        :type submatch: bool
        :raises Exception: on the first call, when no file was selected
        '''
        self.logger.debug('Download:File:RegExp:' + str(patterns))
        if dir_list is None:
            dir_list = []
        if not submatch:
            self.files_to_download = []
        for pattern in patterns:
            subdirs_pattern = pattern.split('/')
            if len(subdirs_pattern) > 1:
                # Pattern contains sub directories
                if subdirs_pattern[0] == '^':
                    subdirs_pattern = subdirs_pattern[1:]
                if pattern == '**/*':
                    # Take the files of the current directory exactly once.
                    # This used to be done inside the sub directory loop,
                    # which added them once per sub directory and never for
                    # a directory without sub directories.
                    for rfile in file_list:
                        self._append_file_to_download(rfile, prefix)
                for direlt in dir_list:
                    subdir = direlt['name']
                    self.logger.debug('Download:File:Subdir:Check:' + subdir)
                    if pattern == '**/*':
                        (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
                        self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
                    elif re.match(subdirs_pattern[0], subdir):
                        self.logger.debug('Download:File:Subdir:Match:' + subdir)
                        # subdir match the beginning of the pattern
                        # check match in subdir
                        (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
                        self.match(['/'.join(subdirs_pattern[1:])], subfile_list, subdirs_list, prefix + '/' + subdir, True)
            else:
                for rfile in file_list:
                    if re.match(pattern, rfile['name']):
                        self._append_file_to_download(rfile, prefix)
        if not submatch and len(self.files_to_download) == 0:
            raise Exception('no file found matching expressions')

    def curl_download(self, file_path, file_to_download):
        '''
        Download one url to a local file

        :param file_path: local file to write (truncated at every attempt)
        :type file_path: str
        :param file_to_download: complete url of the remote file
        :type file_to_download: str
        :return: True if the download failed, False otherwise
        '''
        error = True
        nbtry = 1
        # nbtry starts at 1 and the loop stops at 3: at most two attempts
        while error is True and nbtry < 3:
            # "with" and "finally" release the file and the curl handle even
            # when setting an option raises
            with open(file_path, "wb") as fp:
                curl = pycurl.Curl()
                try:
                    try:
                        curl.setopt(pycurl.URL, file_to_download)
                    except Exception:
                        curl.setopt(pycurl.URL, file_to_download.encode('ascii', 'ignore'))
                    if self.proxy is not None:
                        curl.setopt(pycurl.PROXY, self.proxy)
                        if self.proxy_auth is not None:
                            curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

                    if self.credentials is not None:
                        curl.setopt(pycurl.USERPWD, self.credentials)

                    curl.setopt(pycurl.CONNECTTIMEOUT, 300)
                    # Whole transfer limit, taken from self.timeout
                    curl.setopt(pycurl.TIMEOUT, self.timeout)
                    curl.setopt(pycurl.NOSIGNAL, 1)
                    curl.setopt(pycurl.WRITEDATA, fp)

                    try:
                        curl.perform()
                        errcode = curl.getinfo(pycurl.HTTP_CODE)
                        # 226 (FTP transfer complete) and 200 (HTTP OK) are the only accepted codes
                        if int(errcode) != 226 and int(errcode) != 200:
                            error = True
                            self.logger.error('Error while downloading ' + file_to_download + ' - ' + str(errcode))
                        else:
                            error = False
                    except Exception as e:
                        self.logger.error('Could not get errcode:' + str(e))
                finally:
                    curl.close()
            nbtry += 1
        return error

    def download(self, local_dir, keep_dirs=True):
        '''
        Download remote files to local_dir

        :param local_dir: Directory where files should be downloaded
        :type local_dir: str
        :param keep_dirs: keep file name directory structure or copy file in local_dir directly
        :type keep_dirs: bool
        :return: list of downloaded files
        :raises Exception: when a kill request was received or a file could not be downloaded
        '''
        self.logger.debug('FTP:Download')

        nb_files = len(self.files_to_download)
        cur_files = 1

        for rfile in self.files_to_download:
            if self.kill_received:
                raise Exception('Kill request received, exiting')
            file_dir = local_dir
            if 'save_as' not in rfile or not rfile['save_as']:
                rfile['save_as'] = rfile['name']
            if keep_dirs:
                file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
            file_path = file_dir + '/' + os.path.basename(rfile['save_as'])

            # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
            if not os.path.exists(file_dir):
                os.makedirs(file_dir)

            self.logger.debug('FTP:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
            self.logger.debug('FTP:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
            cur_files += 1
            if 'url' not in rfile or not rfile['url']:
                rfile['url'] = self.url
            if 'root' not in rfile or not rfile['root']:
                rfile['root'] = self.rootdir
            # Timing is measured with a one second resolution
            start_time = datetime.now()
            start_time = time.mktime(start_time.timetuple())
            error = self.curl_download(file_path, rfile['url'] + rfile['root'] + '/' + rfile['name'])
            if error:
                rfile['download_time'] = 0
                rfile['error'] = True
                raise Exception("FTP:Download:Error:" + rfile['url'] + rfile['root'] + '/' + rfile['name'])
            else:
                end_time = datetime.now()
                end_time = time.mktime(end_time.timetuple())
                rfile['download_time'] = end_time - start_time

            self.set_permissions(file_path, rfile)

        return self.files_to_download

    def header_function(self, header_line):
        '''
        Curl header callback: record one response header in self.headers

        :param header_line: raw header line
        :type header_line: bytes
        '''
        # HTTP standard specifies that headers are encoded in iso-8859-1.
        # On Python 2, decoding step can be skipped.
        # On Python 3, decoding step is required.
        header_line = header_line.decode('iso-8859-1')

        # Header lines include the first status line (HTTP/1.x ...).
        # We are going to ignore all lines that don't have a colon in them.
        # This will botch headers that are split on multiple lines...
        if ':' not in header_line:
            return

        # Break the header line into header name and value.
        name, value = header_line.split(':', 1)

        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()

        # Header names are case insensitive.
        # Lowercase name here.
        name = name.lower()

        # Now we can actually record the header name and value.
        self.headers[name] = value

    def list(self, directory=''):
        '''
        List FTP directory

        Each entry is a dict with keys permissions, group, user, size,
        month, day, year, name and hash (md5 of the listing line).
        Only the 9th whitespace separated field is used as name, so a name
        containing spaces is cut at its first space.

        :param directory: sub directory to list, appended to url and rootdir
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        self.logger.debug('Download:List:' + self.url + self.rootdir + directory)

        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)

        self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
        # Whole transfer limit, taken from self.timeout
        self.crl.setopt(pycurl.TIMEOUT, self.timeout)
        self.crl.setopt(pycurl.NOSIGNAL, 1)
        try:
            self.crl.perform()
        except Exception as e:
            # The failure is only logged: whatever was received is parsed below
            self.logger.error('Could not get errcode:' + str(e))

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)

        # FTP LIST output is separated by \r\n
        # lets split the output in lines
        lines = re.split(r'[\n\r]+', result)
        # lets walk through each line
        rfiles = []
        rdirs = []

        for line in lines:
            rfile = {}
            # lets print each part separately
            parts = line.split()
            # the individual fields in this list of parts
            if not parts:
                continue
            if len(parts) < 9:
                # Not a complete entry (for example a "total N" line):
                # reading the 9 fields below would raise IndexError
                self.logger.debug('Download:List:Skip:' + line)
                continue
            rfile['permissions'] = parts[0]
            # NOTE(review): "ls -l" style listings usually print the owner
            # before the group; confirm the group/user order is intended
            rfile['group'] = parts[2]
            rfile['user'] = parts[3]
            rfile['size'] = int(parts[4])
            rfile['month'] = Utils.month_to_num(parts[5])
            rfile['day'] = int(parts[6])
            rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
            try:
                rfile['year'] = int(parts[7])
            except Exception:
                # specific ftp case issues at getting date info
                curdate = datetime.now()
                rfile['year'] = curdate.year
                # Year not specified, month greater than current means previous year
                if rfile['month'] > curdate.month:
                    rfile['year'] = curdate.year - 1
                # Same month but later day => previous year
                if rfile['month'] == curdate.month and rfile['day'] > curdate.day:
                    rfile['year'] = curdate.year - 1
            rfile['name'] = parts[8]
            if len(parts) >= 10 and parts[9] == '->':
                # Symlink, add to files AND dirs as we don't know the type of the link
                rdirs.append(rfile)

            is_dir = False
            if re.match('^d', rfile['permissions']):
                is_dir = True

            if not is_dir:
                rfiles.append(rfile)
            else:
                rdirs.append(rfile)
        return (rfiles, rdirs)

    def chroot(self, cwd):
        '''
        Only logs the requested directory, nothing is changed
        '''
        self.logger.debug('Download: change dir ' + cwd)

    def close(self):
        '''
        Release the curl handle used for listings
        '''
        if self.crl is not None:
            self.crl.close()
            self.crl = None
diff --git a/biomaj_download/download/http.py b/biomaj_download/download/http.py
new file mode 100644
index 0000000..db2614d
--- /dev/null
+++ b/biomaj_download/download/http.py
@@ -0,0 +1,166 @@
+import pycurl
+import re
+import hashlib
+import datetime
+
+import humanfriendly
+
+from biomaj_core.utils import Utils
+from biomaj_download.download.ftp import FTPDownload
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+
class HTTPParse(object):
    r"""
    Regular expressions and group numbers used to read an HTTP directory
    listing page.

    Group numbers are 1-based. HTTPDownload.list treats a file_size or
    file_date of -1 as "not available in the page".
    """

    def __init__(self, dir_line, file_line, dir_name=1, dir_date=2, file_name=1, file_date=2, file_date_format=None, file_size=3):
        r"""
        Example of matching bank properties:

        http.parse.dir.line: <img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
        http.parse.file.line: <img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
        http.group.dir.name: 1
        http.group.dir.date: 2
        http.group.file.name: 1
        http.group.file.date: 2
        http.group.file.size: 3

        :param dir_line: regexp matching a directory entry
        :param file_line: regexp matching a file entry
        :param dir_name: group of dir_line holding the directory name
        :param dir_date: group of dir_line holding the directory date
        :param file_name: group of file_line holding the file name
        :param file_date: group of file_line holding the file date
        :param file_date_format: optional strptime format of the file date
        :param file_size: group of file_line holding the file size
        """
        # This docstring and the class one are raw strings: the regexps
        # contain backslash sequences (\s, \[, \d) that are invalid escapes
        # in a normal string literal.
        self.dir_line = dir_line
        self.file_line = file_line
        self.dir_name = dir_name
        self.dir_date = dir_date
        self.file_name = file_name
        self.file_date = file_date
        self.file_size = file_size
        self.file_date_format = file_date_format
+
+
class HTTPDownload(FTPDownload):
    '''
    Base class to download files from HTTP

    Makes use of http.parse.dir.line etc.. regexps to extract page information

    protocol=http
    server=ftp.ncbi.nih.gov
    remote.dir=/blast/db/FASTA/

    remote.files=^alu.*\\.gz$

    '''

    def __init__(self, protocol, host, rootdir, http_parse=None):
        '''
        :param protocol: URL scheme, joined to the host as ``protocol://host``
        :type protocol: str
        :param host: remote server name
        :type host: str
        :param rootdir: remote directory appended to the base url
        :type rootdir: str
        :param http_parse: regexps and group numbers used by list(); must be set before list() is called
        :type http_parse: HTTPParse
        '''
        FTPDownload.__init__(self, protocol, host, rootdir)
        self.http_parse = http_parse

    def list(self, directory=''):
        '''
        List an HTTP directory page

        The page is fetched and scanned with the http_parse regexps.
        Directory entries get empty permissions/group/user and a size of 0,
        file entries also get a hash built from name, date and size.

        :param directory: sub directory to list, appended to url and rootdir
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        self.logger.debug('Download:List:' + self.url + self.rootdir + directory)

        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        # Same limits as the FTP listing, so an unresponsive server cannot
        # block the caller; NOSIGNAL keeps curl timeouts usable in threads
        self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
        self.crl.setopt(pycurl.TIMEOUT, self.timeout)
        self.crl.setopt(pycurl.NOSIGNAL, 1)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        # Example of parsing configuration:
        # 'http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
        # 'http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
        # 'http.group.dir.name': 1,
        # 'http.group.dir.date': 2,
        # 'http.group.file.name': 1,
        # 'http.group.file.date': 2,
        # 'http.group.file.size': 3,

        rfiles = []
        rdirs = []

        # Group numbers are 1-based, findall results are 0-based
        for founddir in re.findall(self.http_parse.dir_line, result):
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = 0
            date = founddir[self.http_parse.dir_date - 1]
            dirdate = date.split()
            parts = dirdate[0].split('-')
            # 19-Jul-2014 13:02
            rfile['month'] = Utils.month_to_num(parts[1])
            rfile['day'] = int(parts[0])
            rfile['year'] = int(parts[2])
            rfile['name'] = founddir[self.http_parse.dir_name - 1]
            rdirs.append(rfile)

        for foundfile in re.findall(self.http_parse.file_line, result):
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            if self.http_parse.file_size != -1:
                rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
            else:
                rfile['size'] = 0
            if self.http_parse.file_date != -1:
                date = foundfile[self.http_parse.file_date - 1]
                if self.http_parse.file_date_format:
                    # The format is stored with doubled percent signs
                    date_object = datetime.datetime.strptime(date, self.http_parse.file_date_format.replace('%%', '%'))
                    rfile['month'] = date_object.month
                    rfile['day'] = date_object.day
                    rfile['year'] = date_object.year
                else:
                    dirdate = date.split()
                    parts = dirdate[0].split('-')
                    # 19-Jul-2014 13:02
                    rfile['month'] = Utils.month_to_num(parts[1])
                    rfile['day'] = int(parts[0])
                    rfile['year'] = int(parts[2])
            else:
                # No date in the page: use today's date
                today = datetime.datetime.now()
                date = '%s-%s-%s' % (today.year, today.month, today.day)
                rfile['month'] = today.month
                rfile['day'] = today.day
                rfile['year'] = today.year
            rfile['name'] = foundfile[self.http_parse.file_name - 1]
            filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
            rfile['hash'] = hashlib.md5(filehash).hexdigest()
            rfiles.append(rfile)

        return (rfiles, rdirs)
diff --git a/biomaj_download/download/interface.py b/biomaj_download/download/interface.py
new file mode 100644
index 0000000..17c1325
--- /dev/null
+++ b/biomaj_download/download/interface.py
@@ -0,0 +1,273 @@
+import os
+import logging
+import datetime
+import time
+import re
+
+
+class _FakeLock(object):
+ '''
+ Fake lock for downloaders not called by a Downloadthread
+ '''
+
+ def __init__(self):
+ pass
+
+ def acquire(self):
+ pass
+
+ def release(self):
+ pass
+
+
class DownloadInterface(object):
    '''
    Main interface that all downloaders must extend

    match() reads self.rootdir, which is not set here: the downloaders
    extending this class set it in their constructor.
    '''

    files_num_threads = 4

    def __init__(self):
        self.config = None
        self.files_to_download = []
        self.files_to_copy = []
        self.error = False
        self.credentials = None
        # bank name
        self.bank = None
        self.mkdir_lock = _FakeLock()
        self.kill_received = False
        self.proxy = None
        # Set together with proxy by set_proxy(); defined here so that it
        # can be read even when set_proxy() was never called
        self.proxy_auth = None
        # 24h timeout
        self.timeout = 3600 * 24
        # Optional save target for single file downloaders
        self.save_as = None
        self.logger = logging.getLogger('biomaj')
        self.param = None
        self.method = None
        self.protocol = None
        self.server = None
        self.offline_dir = None

    def set_offline_dir(self, offline_dir):
        self.offline_dir = offline_dir

    def set_server(self, server):
        self.server = server

    def set_protocol(self, protocol):
        self.protocol = protocol

    def set_files_to_download(self, files):
        '''
        Set the files to download, giving the downloader param to the files that have none

        :param files: files to download
        :type files: list
        '''
        self.files_to_download = files
        for file_to_download in self.files_to_download:
            if self.param:
                if 'param' not in file_to_download or not file_to_download['param']:
                    file_to_download['param'] = self.param

    def set_param(self, param):
        self.param = param

    def set_timeout(self, timeout):
        '''
        Set the timeout; a value that cannot be converted to an integer is
        logged and ignored

        :param timeout: timeout value
        :type timeout: int or str
        '''
        if isinstance(timeout, int):
            self.timeout = timeout
        else:
            try:
                self.timeout = int(timeout)
            except Exception:
                logging.error('Timeout is not a valid integer, skipping')

    def set_save_as(self, save_as):
        self.save_as = save_as

    def set_proxy(self, proxy, proxy_auth=None):
        '''
        Use a proxy to connect to remote servers

        :param proxy: proxy to use (see http://curl.haxx.se/libcurl/c/CURLOPT_PROXY.html for format)
        :type proxy: str
        :param proxy_auth: proxy authentication if any (user:password)
        :type proxy_auth: str
        '''
        self.proxy = proxy
        self.proxy_auth = proxy_auth

    def set_method(self, method):
        self.method = method

    def _append_file_to_download(self, rfile, prefix):
        '''
        Select rfile: set its root, prepend prefix to its name and add it to files_to_download
        '''
        rfile['root'] = self.rootdir
        if prefix != '':
            rfile['name'] = prefix + '/' + rfile['name']
        self.files_to_download.append(rfile)
        self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])

    def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
        '''
        Find files matching patterns. Sets instance variable files_to_download.

        A pattern containing '/' is matched directory by directory: each
        sub directory whose name matches the first pattern element is listed
        and matched against the rest of the pattern. The special pattern
        '**/*' selects every file of the current directory and of all its
        sub directories.

        :param patterns: regexps to match
        :type patterns: list
        :param file_list: list of files to match
        :type file_list: list
        :param dir_list: sub directories in current dir
        :type dir_list: list
        :param prefix: directory prefix
        :type prefix: str
        :param submatch: first call to match, or called from match
        :type submatch: bool
        :raises Exception: on the first call, when no file was selected
        '''
        self.logger.debug('Download:File:RegExp:' + str(patterns))

        if dir_list is None:
            dir_list = []

        if not submatch:
            self.files_to_download = []
        for pattern in patterns:
            subdirs_pattern = pattern.split('/')
            if len(subdirs_pattern) > 1:
                # Pattern contains sub directories
                if subdirs_pattern[0] == '^':
                    subdirs_pattern = subdirs_pattern[1:]
                if pattern == '**/*':
                    # Take the files of the current directory exactly once.
                    # With sub directories this used to be done inside the
                    # loop below, adding the same files once per sub directory.
                    for rfile in file_list:
                        self._append_file_to_download(rfile, prefix)
                    if not dir_list:
                        # No more dirs: as before, the call ends here, later
                        # patterns are not evaluated and nothing is raised
                        return
                for direlt in dir_list:
                    subdir = direlt['name']
                    self.logger.debug('Download:File:Subdir:Check:' + subdir)
                    if pattern == '**/*':
                        (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
                        self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
                    elif re.match(subdirs_pattern[0], subdir):
                        self.logger.debug('Download:File:Subdir:Match:' + subdir)
                        # subdir match the beginning of the pattern
                        # check match in subdir
                        (subfile_list, subdirs_list) = self.list(prefix + '/' + subdir + '/')
                        self.match(['/'.join(subdirs_pattern[1:])], subfile_list, subdirs_list, prefix + '/' + subdir, True)

            else:
                for rfile in file_list:
                    if re.match(pattern, rfile['name']):
                        self._append_file_to_download(rfile, prefix)
        if not submatch and len(self.files_to_download) == 0:
            raise Exception('no file found matching expressions')

    def set_permissions(self, file_path, file_info):
        '''
        Sets the access and modification times of file_path to the remote
        date (year, month, day of file_info); nothing is done when one of
        them is empty or zero

        :param file_path: local file to update
        :type file_path: str
        :param file_info: remote file details with keys year, month and day
        :type file_info: dict
        '''
        if file_info['year'] and file_info['month'] and file_info['day']:
            ftime = datetime.date(file_info['year'], file_info['month'], file_info['day'])
            settime = time.mktime(ftime.timetuple())
            os.utime(file_path, (settime, settime))

    def download_or_copy(self, available_files, root_dir, check_exists=True):
        '''
        If a file to download is available in available_files, copy it instead of downloading it.

        Update the instance variables files_to_download and files_to_copy

        Files are compared on name, year, month, day and size. Both lists
        are sorted by name in place.

        :param available_files: list of files available in root_dir
        :type available files: list
        :param root_dir: directory where files are available
        :type root_dir: str
        :param check_exists: checks if file exists locally
        :type check_exists: bool
        '''

        self.files_to_copy = []
        # In such case, it forces the download again
        if not available_files:
            return
        available_files.sort(key=lambda x: x['name'])
        self.files_to_download.sort(key=lambda x: x['name'])

        new_files_to_download = []

        test1_tuples = set((d['name'], d['year'], d['month'], d['day'], d['size']) for d in self.files_to_download)
        test2_tuples = set((d['name'], d['year'], d['month'], d['day'], d['size']) for d in available_files)
        new_or_modified_files = [t for t in test1_tuples if t not in test2_tuples]
        new_or_modified_files.sort(key=lambda x: x[0])
        # Both lists are sorted by name: index walks new_or_modified_files
        # while files_to_download is read
        index = 0

        if len(new_or_modified_files) > 0:
            self.logger.debug('Number of remote files: %d' % (len(self.files_to_download)))
            self.logger.debug('Number of local files: %d' % (len(available_files)))
            self.logger.debug('Number of files new or modified: %d' % (len(new_or_modified_files)))
            for dfile in self.files_to_download:
                if index < len(new_or_modified_files) and \
                        dfile['name'] == new_or_modified_files[index][0]:

                    new_files_to_download.append(dfile)
                    index += 1
                else:
                    if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
                        dfile['root'] = root_dir
                        self.logger.debug('Copy file instead of downloading it: %s' % (os.path.join(root_dir, dfile['name'])))
                        self.files_to_copy.append(dfile)
                    else:
                        new_files_to_download.append(dfile)

        else:
            # Copy everything
            for dfile in self.files_to_download:
                if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
                    dfile['root'] = root_dir
                    self.files_to_copy.append(dfile)
                else:
                    new_files_to_download.append(dfile)

        self.files_to_download = new_files_to_download

    def download(self, local_dir):
        '''
        Download remote files to local_dir

        :param local_dir: Directory where files should be downloaded
        :type local_dir: str
        :return: list of downloaded files
        '''
        pass

    def list(self, directory=''):
        '''
        List directory

        :param directory: sub directory to list, match() calls list() with one
        :type directory: str
        :return: tuple of file list and dir list
        '''
        pass

    def chroot(self, cwd):
        '''
        Change directory
        '''
        pass

    def set_credentials(self, userpwd):
        '''
        Set credentials in format user:pwd

        :param userpwd: credentials
        :type userpwd: str
        '''
        self.credentials = userpwd

    def close(self):
        '''
        Close connection
        '''
        pass
diff --git a/biomaj_download/download/localcopy.py b/biomaj_download/download/localcopy.py
new file mode 100644
index 0000000..ae5ba0f
--- /dev/null
+++ b/biomaj_download/download/localcopy.py
@@ -0,0 +1,82 @@
+import os
+import datetime
+import hashlib
+
+from biomaj_core.utils import Utils
+from biomaj_download.download.interface import DownloadInterface
+
+
class LocalDownload(DownloadInterface):
    '''
    Base class to copy file from local system

    protocol=cp
    server=localhost
    remote.dir=/blast/db/FASTA/

    remote.files=^alu.*\\.gz$

    '''

    def __init__(self, rootdir):
        '''
        :param rootdir: local directory holding the files
        :type rootdir: str
        '''
        DownloadInterface.__init__(self)
        self.logger.debug('Download')
        self.rootdir = rootdir

    def download(self, local_dir):
        '''
        Copy the selected files to local_dir and give each of them a
        download_time of 0

        :param local_dir: Directory where files should be copied
        :type local_dir: str
        :return: list of copied files
        '''
        self.logger.debug('Local:Download')
        selected = self.files_to_download
        Utils.copy_files(selected, local_dir, lock=self.mkdir_lock)
        for copied in selected:
            copied['download_time'] = 0
        return self.files_to_download

    def list(self, directory=''):
        '''
        List a local directory

        The date of each entry comes from its modification time and its hash
        is the md5 of name, modification time and size.

        :param directory: sub directory, appended to rootdir
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        self.logger.debug('Download:List:' + self.rootdir + directory)

        base_dir = self.rootdir + directory
        found_files, found_dirs = [], []

        for entry in os.listdir(base_dir):
            entry_path = os.path.join(base_dir, entry)
            info = os.stat(entry_path)
            modified = datetime.datetime.fromtimestamp(info.st_mtime)
            digest_source = (entry + str(info.st_mtime) + str(info.st_size)).encode('utf-8')
            details = {
                'permissions': str(info.st_mode),
                'group': str(info.st_gid),
                'user': str(info.st_uid),
                'size': info.st_size,
                'month': modified.month,
                'day': modified.day,
                'year': modified.year,
                'name': entry,
                'hash': hashlib.md5(digest_source).hexdigest(),
            }
            if os.path.isdir(entry_path):
                found_dirs.append(details)
            else:
                found_files.append(details)
        return (found_files, found_dirs)

    def chroot(self, cwd):
        '''
        Change the working directory of the process to cwd
        '''
        self.logger.debug('Download: change dir ' + cwd)
        os.chdir(cwd)
diff --git a/biomaj_download/download/rsync.py b/biomaj_download/download/rsync.py
new file mode 100644
index 0000000..90c14b0
--- /dev/null
+++ b/biomaj_download/download/rsync.py
@@ -0,0 +1,192 @@
+# from future import standard_library
+# standard_library.install_aliases()
+# from builtins import str
+import logging
+import re
+import os
+import subprocess
+from datetime import datetime
+import time
+
+from biomaj_download.download.interface import DownloadInterface
+
+
class RSYNCDownload(DownloadInterface):
    '''
    Base class to download files from rsync

    protocol = rsync
    server =
    remote.dir =

    remote.files =
    '''

    def __init__(self, protocol, server, remote_dir):
        '''
        :param protocol: command to run; it is split on whitespace, so it may carry options
        :type protocol: str
        :param server: name of the remote server; with an empty remote_dir it is used as a local path prefix
        :type server: str
        :param remote_dir: directory on the remote server
        :type remote_dir: str
        '''
        DownloadInterface.__init__(self)
        logging.debug('Download')
        self.rootdir = remote_dir
        self.protocol = protocol
        if server:
            self.server = server  # name of the remote server
        # directory on the remote server, only kept when a server is given too
        self.remote_dir = remote_dir if (server and remote_dir) else ""

    def _enter_offline_dir(self):
        '''
        Give a working directory to run rsync
        '''
        try:
            os.chdir(self.offline_dir)
        except TypeError:
            # offline_dir was never set
            logging.error("RSYNC:list:Could not find offline_dir")

    def _base_command(self):
        '''
        Return the command and its options as a list of arguments
        '''
        return str(self.protocol).split()

    def _source(self, path):
        '''
        Build the rsync source argument for path
        '''
        if self.remote_dir and self.credentials:
            return str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(path)
        if self.remote_dir:
            return str(self.server) + ":" + str(self.remote_dir) + str(path)
        # Local rsync for unit tests
        return str(self.server) + str(path)

    @staticmethod
    def _reason_after(text, marker):
        '''
        Return the end of the line following marker in text, or the whole
        stripped text when marker is not found
        '''
        head, found, tail = text.partition(marker)
        if not found:
            return text.strip()
        return tail.split("\n")[0]

    def list(self, directory=''):
        '''
        List server directory

        Each entry is a dict with keys permissions, size, month, day, year
        and name. An empty result is returned when rsync fails.

        :param directory: sub directory to list
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        err_code = None
        rfiles = []
        rdirs = []
        logging.debug('RSYNC:List')
        self._enter_offline_dir()
        # The command is given as a list of arguments and run without a
        # shell, so server, directory and file names are never interpreted
        # by a shell
        cmd = self._base_command() + ['--list-only', self._source(directory)]
        try:
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            list_rsync, err = p.communicate()
            self.test_stderr_rsync_message(err)
            self.test_stderr_rsync_error(err)
            err_code = p.returncode
        except (ExceptionRsync, OSError) as e:
            # OSError: the command could not be started. A shell used to
            # report this through its exit code.
            logging.error("RsyncError:" + str(e))
        if err_code != 0:
            logging.error('Error while listing ' + str(err_code))
            return (rfiles, rdirs)
        list_rsync = str(list_rsync.decode('utf-8'))
        lines = list_rsync.rstrip().split("\n")
        for line in lines:
            rfile = {}
            # rsync LIST output is separated by \n
            parts = line.split()
            if len(parts) < 5:
                # Blank or incomplete line: the 5 fields read below are
                # permissions, size, date, time and name
                continue
            date = parts[2].split('/')
            rfile['permissions'] = parts[0]
            rfile['size'] = int(parts[1].replace(',', ''))
            rfile['month'] = int(date[1])
            rfile['day'] = int(date[2])
            rfile['year'] = int(date[0])
            rfile['name'] = parts[4]
            is_dir = False
            if re.match('^d', rfile['permissions']):
                is_dir = True

            if not is_dir:
                rfiles.append(rfile)
            else:
                rdirs.append(rfile)

        return (rfiles, rdirs)

    def download(self, local_dir, keep_dirs=True):
        '''
        Download remote files to local_dir

        :param local_dir: Directory where files should be downloaded
        :type local_dir: str
        :param keep_dirs: keep file name directory structure or copy file in local_dir directly
        :type keep_dirs: bool
        :return: list of downloaded files
        :raises Exception: when a kill request was received or a file could not be downloaded
        '''

        logging.debug('RSYNC:Download')
        nb_files = len(self.files_to_download)
        cur_files = 1
        self._enter_offline_dir()
        for rfile in self.files_to_download:
            if self.kill_received:
                raise Exception('Kill request received, exiting')
            file_dir = local_dir
            if 'save_as' not in rfile or rfile['save_as'] is None:
                rfile['save_as'] = rfile['name']
            if keep_dirs:
                file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
            # The separator used to be added only when file_dir already ended
            # with '/', so a file in a sub directory was written to
            # "<dir><name>" instead of "<dir>/<name>"
            file_path = os.path.join(file_dir, os.path.basename(rfile['save_as']))
            # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
            if not os.path.exists(file_dir):
                os.makedirs(file_dir)

            logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
            logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
            cur_files += 1
            start_time = datetime.now()
            start_time = time.mktime(start_time.timetuple())
            error = self.rsync_download(file_path, rfile['name'])
            if error:
                rfile['download_time'] = 0
                rfile['error'] = True
                # 'root' is only set by match(): fall back on rootdir
                raise Exception("RSYNC:Download:Error:" + str(rfile.get('root', self.rootdir)) + '/' + rfile['name'])
            end_time = datetime.now()
            end_time = time.mktime(end_time.timetuple())
            rfile['download_time'] = end_time - start_time
            self.set_permissions(file_path, rfile)
        return self.files_to_download

    def rsync_download(self, file_path, file_to_download):
        '''
        Download one file with rsync

        :param file_path: local destination
        :type file_path: str
        :param file_to_download: remote file name, appended to the remote directory
        :type file_to_download: str
        :return: True if the download failed, False otherwise
        '''
        error = False
        err_code = ''
        logging.debug('RSYNC:RSYNC DOwNLOAD')
        self._enter_offline_dir()
        # Arguments are passed as a list, without a shell (see list())
        cmd = self._base_command() + [self._source(file_to_download), str(file_path)]
        try:
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = p.communicate()
            err_code = p.returncode
            self.test_stderr_rsync_message(stderr)
            self.test_stderr_rsync_error(stderr)
        except (ExceptionRsync, OSError) as e:
            logging.error("RsyncError:" + str(e))
        if err_code != 0:
            logging.error('Error while downloading ' + file_to_download + ' - ' + str(err_code))
            error = True
        return error

    def test_stderr_rsync_error(self, stderr):
        '''
        Raise ExceptionRsync when stderr contains "rsync error"

        :param stderr: error output of the command
        :type stderr: bytes
        '''
        stderr = str(stderr.decode('utf-8', 'replace'))
        if "rsync error" in stderr:
            raise ExceptionRsync(self._reason_after(stderr, str(self.protocol) + " error:"))

    def test_stderr_rsync_message(self, stderr):
        '''
        Raise ExceptionRsync when stderr contains "rsync:"

        :param stderr: error output of the command
        :type stderr: bytes
        '''
        stderr = str(stderr.decode('utf-8', 'replace'))
        if "rsync:" in stderr:
            raise ExceptionRsync(self._reason_after(stderr, str(self.protocol) + ":"))
+
+
class ExceptionRsync(Exception):
    '''
    Error reported by rsync on its error output
    '''

    def __init__(self, exception_reason):
        '''
        :param exception_reason: message extracted from the rsync output
        :type exception_reason: str
        '''
        # Initialise the base class so that args holds the reason
        Exception.__init__(self, exception_reason)
        self.exception_reason = exception_reason

    def __str__(self):
        return str(self.exception_reason)
diff --git a/biomaj_download/downloadclient.py b/biomaj_download/downloadclient.py
new file mode 100644
index 0000000..4c2435e
--- /dev/null
+++ b/biomaj_download/downloadclient.py
@@ -0,0 +1,257 @@
+from biomaj_download.downloadservice import DownloadService
+import requests
+import logging
+import uuid
+import time
+import sys
+import pika
+
+from biomaj_download.download.downloadthreads import DownloadThread
+from biomaj_download.message import message_pb2
+
+if sys.version_info[0] < 3:
+ from Queue import Queue
+else:
+ from queue import Queue
+
+
+class DownloadClient(DownloadService):
+
+    def __init__(self, rabbitmq_host=None, rabbitmq_port=5672, rabbitmq_vhost='/', rabbitmq_user=None, rabbitmq_password=None, pool_size=5, redis_client=None, redis_prefix=None):
+        '''
+        Create a download client.
+
+        With a rabbitmq_host the client runs in remote mode and opens a
+        RabbitMQ channel (with plain credentials when rabbitmq_user is set).
+        Without it, operations are pooled in download_pool and no connection
+        is opened. DownloadService.__init__ is not called here.
+        '''
+        self.logger = logging
+        self.channel = None
+        self.pool_size = pool_size
+        self.proxy = None
+        self.bank = None
+        self.rate_limiting = 0
+        self.redis_client = redis_client
+        self.redis_prefix = redis_prefix
+        self.remote = bool(rabbitmq_host)
+        if self.remote:
+            connection_args = [rabbitmq_host, rabbitmq_port, rabbitmq_vhost]
+            if rabbitmq_user:
+                connection_args.append(pika.PlainCredentials(rabbitmq_user, rabbitmq_password))
+            parameters = pika.ConnectionParameters(*connection_args, heartbeat_interval=0)
+            self.channel = pika.BlockingConnection(parameters).channel()
+        self.logger.info("Use remote: %s" % (str(self.remote)))
+        self.download_pool = []
+        self.files_to_download = 0
+
+    def set_queue_size(self, size):
+        '''
+        Set the number of download threads started by _download_pool_files.
+        '''
+        self.pool_size = size
+
+    def set_rate_limiting(self, rate):
+        '''
+        Set the rate limit; when greater than 0, remote operations are pooled
+        and submitted in batches by wait_for_download.
+        '''
+        self.rate_limiting = rate
+
+    def create_session(self, bank, proxy=None):
+        '''
+        Open a download session for a bank.
+
+        Local mode: a random UUID is generated.
+        Remote mode: the download proxy is asked to create the session, with
+        up to 3 attempts.
+
+        :param bank: bank name
+        :param proxy: base URL of the download proxy (needed in remote mode)
+        :return: session identifier
+        :raises Exception: if the proxy is missing or never answered with 200
+        '''
+        self.bank = bank
+        if not self.remote:
+            self.session = str(uuid.uuid4())
+            return self.session
+
+        if not proxy:
+            # Previously the URL concatenation failed inside the try block and
+            # the handler then crashed on the still unbound 'url' name.
+            raise Exception('Failed to connect to the download proxy')
+
+        # Built once, outside the try block, so the handler can always log it
+        url = proxy + '/api/download/session/' + bank
+        for i in range(3):
+            try:
+                r = requests.post(url)
+                if r.status_code == 200:
+                    result = r.json()
+                    self.session = result['session']
+                    self.proxy = proxy
+                    return result['session']
+                # Do not retry silently on an unexpected status
+                logging.error('Failed to send create operation: %s (status %d)' % (url, r.status_code))
+            except Exception:
+                logging.exception('Failed to send create operation: %s' % (url))
+        raise Exception('Failed to connect to the download proxy')
+
+    def download_status(self):
+        '''
+        Get progress of downloads, try to contact up to 3 times
+
+        :return: tuple (progress, errors) taken from the proxy JSON answer
+        :raises Exception: if no proxy is set or it never answered with 200
+        '''
+        if not self.proxy:
+            raise Exception('Failed to connect to the download proxy')
+        # Built once, outside the try block, so the handler can always log it
+        url = self.proxy + '/api/download/status/download/' + self.bank + '/' + self.session
+        # 3 attempts, as documented and as done by create_session and clean
+        for i in range(3):
+            try:
+                r = requests.get(url)
+                if r.status_code == 200:
+                    result = r.json()
+                    return (result['progress'], result['errors'])
+                logging.error('Failed to connect to the download proxy: %d' % (r.status_code))
+            except Exception:
+                logging.exception('Failed to connect to the download proxy: %s' % (url))
+        raise Exception('Failed to connect to the download proxy')
+
+    def download_remote_files(self, cf, downloaders, offline_dir):
+        '''
+        cf = Config
+        downloaders = list of downloader
+        offline_dir = base dir to download files
+
+        Build one DOWNLOAD operation per file of each downloader and hand it
+        to download_remote_file.
+        '''
+        for downloader in downloaders:
+            for file_to_download in downloader.files_to_download:
+                operation = message_pb2.Operation()
+                operation.type = 1  # Operation.DOWNLOAD in message.proto
+                message = message_pb2.DownloadFile()
+                message.bank = self.bank
+                message.session = self.session
+                message.local_dir = offline_dir
+
+                remote_file = message_pb2.DownloadFile.RemoteFile()
+                protocol = downloader.protocol
+                remote_file.protocol = message_pb2.DownloadFile.Protocol.Value(protocol.upper())
+                remote_file.server = downloader.server
+                remote_file.remote_dir = cf.get('remote.dir') or ''
+                # protobuf string fields reject None: only set credentials when
+                # the downloader really has some (the field is optional).
+                if downloader.credentials:
+                    remote_file.credentials = downloader.credentials
+
+                biomaj_file = remote_file.files.add()
+                biomaj_file.name = file_to_download['name']
+                # Optional attributes are copied only when present and non-empty
+                for key in ('root', 'save_as', 'url'):
+                    if file_to_download.get(key):
+                        setattr(biomaj_file, key, file_to_download[key])
+                if file_to_download.get('param'):
+                    for key in list(file_to_download['param'].keys()):
+                        param = remote_file.param.add()
+                        param.name = key
+                        param.value = file_to_download['param'][key]
+                for key in ('permissions', 'size', 'year', 'month', 'day', 'hash', 'md5'):
+                    if file_to_download.get(key):
+                        setattr(biomaj_file.metadata, key, file_to_download[key])
+
+                message.http_method = message_pb2.DownloadFile.HTTP_METHOD.Value(downloader.method.upper())
+
+                timeout_download = cf.get('timeout.download', None)
+                if timeout_download:
+                    try:
+                        message.timeout_download = int(timeout_download)
+                    except Exception:
+                        logging.error('Invalid timeout value, not an integer, skipping')
+
+                message.remote_file.MergeFrom(remote_file)
+                operation.download.MergeFrom(message)
+                self.download_remote_file(operation)
+
+    def download_remote_file(self, operation):
+        '''
+        Register one download operation and count it in files_to_download.
+
+        Local mode: the DownloadFile part of the operation is pooled.
+        Remote mode: the operation is pooled when rate limiting is active,
+        otherwise it is published at once with ask_download.
+        '''
+        self.files_to_download += 1
+        if not self.remote:
+            self.download_pool.append(operation.download)
+            return
+        if self.rate_limiting > 0:
+            self.download_pool.append(operation)
+            return
+        self.ask_download(operation)
+
+    def _download_pool_files(self):
+        '''
+        Run the pooled download messages with DownloadThread workers.
+
+        :return: number of worker threads that reported at least one error
+        '''
+        logging.info("Workflow:wf_download:Download:Threads:FillQueue")
+
+        message_queue = Queue()
+        for message in self.download_pool:
+            message_queue.put(message)
+
+        logging.info("Workflow:wf_download:Download:Threads:Start")
+
+        # At least one worker is required: with no thread, join() below would
+        # block forever on a non-empty queue.
+        # NOTE(review): assumes each DownloadThread consumes the queue and
+        # calls task_done(); its code is not part of this file.
+        nb_threads = max(1, self.pool_size)
+        thlist = []
+        for i in range(nb_threads):
+            th = DownloadThread(self, message_queue)
+            thlist.append(th)
+            th.start()
+
+        message_queue.join()
+
+        logging.info("Workflow:wf_download:Download:Threads:Over")
+        return sum(1 for th in thlist if th.error > 0)
+
+    def wait_for_download(self):
+        '''
+        Wait for the end of all requested downloads.
+
+        Remote mode: poll the download proxy every 10 seconds, submitting
+        pooled operations when rate limiting is active, until the reported
+        progress equals the number of requested files.
+        Local mode: run the pooled downloads with _download_pool_files.
+
+        :return: True if at least one download failed, else False
+        :raises Exception: on cancel request or if the proxy cannot be reached
+        '''
+        over = False
+        nb_files_to_download = self.files_to_download
+        nb_submitted = 0
+        logging.info("Workflow:wf_download:RemoteDownload:Waiting")
+        if self.remote:
+            download_error = False
+            last_progress = 0
+            if nb_files_to_download == 0:
+                # Nothing was requested: nothing to wait for, and the progress
+                # percentage below would divide by zero.
+                logging.info("Workflow:wf_download:RemoteDownload:Completed:0")
+                return download_error
+            while not over:
+                # Check for cancel request
+                if self.redis_client and self.redis_client.get(self.redis_prefix + ':' + self.bank + ':action:cancel'):
+                    logging.warning('Cancel requested, stopping update')
+                    self.redis_client.delete(self.redis_prefix + ':' + self.bank + ':action:cancel')
+                    raise Exception('Cancel requested, stopping download')
+                (progress, error) = self.download_status()
+                logging.debug('Rate limiting: ' + str(self.rate_limiting))
+                if self.rate_limiting > 0:
+                    logging.debug('Workflow:wf_download:RemoteDownload:submitted: %d, current progress: %d, total: %d' % (nb_submitted, progress, nb_files_to_download))
+                    if self.download_pool:
+                        max_submit = self.rate_limiting
+                        if nb_submitted != 0:
+                            # Free slots = limit minus operations still running
+                            max_submit = self.rate_limiting - (nb_submitted - progress)
+                        logging.debug('Workflow:wf_download:RemoteDownload:RequestAvailable:%d' % (max_submit))
+                        for i in range(max_submit):
+                            if self.download_pool:
+                                logging.debug('Workflow:wf_download:RemoteDownload:RequestNewFile')
+                                operation = self.download_pool.pop()
+                                self.ask_download(operation)
+                                nb_submitted += 1
+
+                if progress == nb_files_to_download:
+                    over = True
+                    logging.info("Workflow:wf_download:RemoteDownload:Completed:" + str(progress))
+                    logging.info("Workflow:wf_download:RemoteDownload:Errors:" + str(error))
+                else:
+                    # Multiply before the integer division, otherwise the
+                    # percentage stays at 0 until everything is downloaded.
+                    progress_percent = (progress * 100) // nb_files_to_download
+                    if progress_percent > last_progress:
+                        last_progress = progress_percent
+                        logging.info("Workflow:wf_download:RemoteDownload:InProgress:" + str(progress) + '/' + str(nb_files_to_download) + "(" + str(progress_percent) + "%)")
+                    time.sleep(10)
+            if error > 0:
+                download_error = True
+                r = requests.get(self.proxy + '/api/download/error/download/' + self.bank + '/' + self.session)
+                if not r.status_code == 200:
+                    raise Exception('Failed to connect to the download proxy')
+                result = r.json()
+                for err in result['error']:
+                    logging.info("Workflow:wf_download:RemoteDownload:Errors:Info:" + str(err))
+            return download_error
+        else:
+            error = self._download_pool_files()
+            logging.info('Workflow:wf_download:RemoteDownload:Completed')
+            if error > 0:
+                logging.info("Workflow:wf_download:RemoteDownload:Errors:" + str(error))
+                return True
+            else:
+                return False
+
+    def clean(self):
+        '''
+        Ask the download proxy to delete the current session, with up to 3
+        attempts. Does nothing in local mode.
+
+        :raises Exception: if the proxy never answered with status 200
+        '''
+        if not self.remote:
+            return
+        # Built once, outside the try block, so the handler can always log it
+        # (a missing proxy/bank/session now fails here with a TypeError instead
+        # of an unbound 'url' inside the handler).
+        url = self.proxy + '/api/download/session/' + self.bank + '/' + self.session
+        for i in range(3):
+            try:
+                r = requests.delete(url)
+                if r.status_code == 200:
+                    return
+                # Do not retry silently on an unexpected status
+                logging.error('Failed to send clean operation: %s (status %d)' % (url, r.status_code))
+            except Exception:
+                logging.exception('Failed to send clean operation: %s' % (url))
+        raise Exception('Failed to connect to the download proxy')
diff --git a/biomaj_download/downloadservice.py b/biomaj_download/downloadservice.py
new file mode 100644
index 0000000..eabc3b5
--- /dev/null
+++ b/biomaj_download/downloadservice.py
@@ -0,0 +1,483 @@
+import os
+import datetime
+import logging
+import logging.config
+import yaml
+import redis
+import uuid
+import traceback
+import threading
+
+import consul
+import pika
+from flask import Flask
+from flask import jsonify
+
+from biomaj_download.download.ftp import FTPDownload
+from biomaj_download.download.http import HTTPDownload
+from biomaj_download.download.direct import DirectFTPDownload
+from biomaj_download.download.direct import DirectHttpDownload
+from biomaj_download.download.localcopy import LocalDownload
+from biomaj_download.message import message_pb2
+from biomaj_download.download.rsync import RSYNCDownload
+from biomaj_core.utils import Utils
+from biomaj_zipkin.zipkin import Zipkin
+
+
+app = Flask(__name__)
+app_log = logging.getLogger('werkzeug')
+app_log.setLevel(logging.ERROR)
+
+
+@app.route('/api/download-message')
+def ping():
+    '''
+    Health-check endpoint, also used as the consul HTTP check URL.
+    '''
+    return jsonify({'msg': 'pong'})
+
+
+def start_web(config):
+    '''
+    Run the Flask application on all interfaces, on the configured web port.
+    '''
+    web_port = config['web']['port']
+    app.run(host='0.0.0.0', port=web_port)
+
+
+def consul_declare(config):
+    '''
+    Register this service and its HTTP health check in consul.
+
+    :param config: service configuration (uses the 'consul' and 'web' sections)
+    :return: True if a consul host is configured and registration was sent,
+             False when no consul host is configured
+    '''
+    if not config['consul']['host']:
+        return False
+
+    consul_agent = consul.Consul(host=config['consul']['host'])
+    consul_agent.agent.service.register(
+        'biomaj-download-message',
+        service_id=config['consul']['id'],
+        address=config['web']['hostname'],
+        port=config['web']['port'],
+        tags=['biomaj']
+    )
+    check_url = 'http://' + config['web']['hostname'] + ':' + str(config['web']['port']) + '/api/download-message'
+    check = consul.Check.http(url=check_url, interval=20)
+    consul_agent.agent.check.register(
+        config['consul']['id'] + '_check',
+        check=check,
+        service_id=config['consul']['id']
+    )
+    return True
+
+
+class DownloadService(object):
+
+ channel = None
+ redis_client = None
+
+    def supervise(self):
+        '''
+        Declare the service in consul and, if declared, start the web
+        endpoint in a separate thread.
+        '''
+        if not consul_declare(self.config):
+            return
+        web_thread = threading.Thread(target=start_web, args=(self.config,))
+        web_thread.start()
+
+    def __init__(self, config_file=None, rabbitmq=True):
+        '''
+        Load the service configuration, then connect to Redis and, when
+        requested, to RabbitMQ.
+
+        :param config_file: path to the YAML configuration file; despite the
+                            None default it is required, open() fails without it
+        :param rabbitmq: open a RabbitMQ channel unless one is already set
+        '''
+        self.logger = logging
+        self.session = None
+        self.bank = None
+        self.download_callback = None
+        with open(config_file, 'r') as ymlfile:
+            # safe_load: yaml.load() without an explicit Loader can build
+            # arbitrary Python objects and is refused by recent PyYAML.
+            self.config = yaml.safe_load(ymlfile)
+            Utils.service_config_override(self.config)
+
+        Zipkin.set_config(self.config)
+
+        if 'log_config' in self.config:
+            for handler in list(self.config['log_config']['handlers'].keys()):
+                self.config['log_config']['handlers'][handler] = dict(self.config['log_config']['handlers'][handler])
+            logging.config.dictConfig(self.config['log_config'])
+            self.logger = logging.getLogger('biomaj')
+
+        if not self.redis_client:
+            self.redis_client = redis.StrictRedis(host=self.config['redis']['host'],
+                                                  port=self.config['redis']['port'],
+                                                  db=self.config['redis']['db'],
+                                                  decode_responses=True)
+
+        if rabbitmq and not self.channel:
+            connection = None
+            rabbitmq_port = self.config['rabbitmq']['port']
+            rabbitmq_user = self.config['rabbitmq']['user']
+            rabbitmq_password = self.config['rabbitmq']['password']
+            rabbitmq_vhost = self.config['rabbitmq']['virtual_host']
+            if rabbitmq_user:
+                credentials = pika.PlainCredentials(rabbitmq_user, rabbitmq_password)
+                connection = pika.BlockingConnection(pika.ConnectionParameters(self.config['rabbitmq']['host'], rabbitmq_port, rabbitmq_vhost, credentials, heartbeat_interval=0))
+            else:
+                # The configured port and virtual host apply to anonymous
+                # connections too (they were silently ignored here before).
+                connection = pika.BlockingConnection(pika.ConnectionParameters(self.config['rabbitmq']['host'], rabbitmq_port, rabbitmq_vhost, heartbeat_interval=0))
+            self.channel = connection.channel()
+            self.logger.info('Download service started')
+
+    def close(self):
+        '''
+        Close the RabbitMQ channel if there is one; a failure while closing
+        is logged as a warning and not raised.
+        '''
+        if self.channel:
+            try:
+                self.channel.close()
+            except Exception as e:
+                # logging.warn is a deprecated alias of logging.warning
+                logging.warning('Download:Service:Exception:' + str(e))
+
+    def on_download_callback(self, func):
+        '''
+        Register the function called by callback_messages after a download
+        operation, as func(bank, downloaded_files).
+        '''
+        self.download_callback = func
+
+    def get_handler(self, protocol_name, server, remote_dir, remote_files=None,
+                    credentials=None, http_parse=None, http_method=None, param=None,
+                    proxy=None, proxy_auth='',
+                    save_as=None, timeout_download=None, offline_dir=None):
+        '''
+        Build and configure the downloader matching a protocol.
+
+        :param protocol_name: name of a DownloadFile.Protocol value (any case)
+        :param remote_files: list of dicts with at least a 'name' key and
+                             optionally 'save_as'; defaults to no file
+        :return: configured downloader, or None if the protocol has no handler
+        '''
+        # None instead of a shared mutable [] default
+        if remote_files is None:
+            remote_files = []
+
+        protocol = message_pb2.DownloadFile.Protocol.Value(protocol_name.upper())
+        # Numbers are the DownloadFile.Protocol enum values of message.proto
+        downloader = None
+        if protocol in [0, 1]:  # FTP, SFTP
+            downloader = FTPDownload(protocol_name, server, remote_dir)
+        elif protocol in [2, 3]:  # HTTP, HTTPS
+            downloader = HTTPDownload(protocol_name, server, remote_dir, http_parse)
+        elif protocol == 7:  # LOCAL
+            downloader = LocalDownload(remote_dir)
+        elif protocol == 4:  # DIRECTFTP
+            downloader = DirectFTPDownload('ftp', server, '/')
+        elif protocol == 5:  # DIRECTHTTP
+            downloader = DirectHttpDownload('http', server, '/')
+        elif protocol == 6:  # DIRECTHTTPS
+            downloader = DirectHttpDownload('https', server, '/')
+        elif protocol == 8:  # RSYNC
+            downloader = RSYNCDownload('rsync', server, remote_dir)
+        if downloader is None:
+            return None
+
+        # A per-file save_as overrides the global one (the last one wins)
+        for remote_file in remote_files:
+            # .get(): dicts given by external callers may have no 'save_as'
+            if remote_file.get('save_as'):
+                save_as = remote_file['save_as']
+
+        # For direct protocol, we only keep base name
+        if protocol in [4, 5, 6]:
+            remote_files = [remote_file['name'] for remote_file in remote_files]
+
+        if http_method is not None:
+            downloader.set_method(http_method)
+
+        if offline_dir:
+            downloader.set_offline_dir(offline_dir)
+
+        if proxy is not None and proxy:
+            downloader.set_proxy(proxy, proxy_auth)
+
+        if timeout_download is not None and timeout_download:
+            downloader.set_timeout(timeout_download)
+
+        if credentials:
+            downloader.set_credentials(credentials)
+
+        if save_as:
+            downloader.set_save_as(save_as)
+
+        if param:
+            downloader.set_param(param)
+
+        downloader.set_server(server)
+
+        downloader.set_protocol(protocol_name)
+
+        downloader.logger = self.logger
+        downloader.set_files_to_download(remote_files)
+        return downloader
+
+    def _get_handler(self, biomaj_file_info):
+        """
+        Get a protocol download handler
+
+        Unpacks a DownloadFile message into the keyword arguments of
+        get_handler and returns its result.
+        """
+        remote = biomaj_file_info.remote_file
+        protocol = remote.protocol
+        server = remote.server
+        remote_dir = remote.remote_dir
+
+        protocol_name = message_pb2.DownloadFile.Protocol.Name(protocol).lower()
+        self.logger.debug('%s request to download from %s://%s' % (biomaj_file_info.bank, protocol_name, server))
+
+        remote_files = [
+            {
+                'name': pb_file.name,
+                'save_as': pb_file.save_as,
+                'year': pb_file.metadata.year,
+                'month': pb_file.metadata.month,
+                'day': pb_file.metadata.day,
+                'root': pb_file.root
+            }
+            for pb_file in biomaj_file_info.remote_file.files
+        ]
+
+        if biomaj_file_info.proxy is not None:
+            proxy = biomaj_file_info.proxy.proxy
+            proxy_auth = biomaj_file_info.proxy.proxy_auth
+        else:
+            proxy = None
+            proxy_auth = ''
+
+        # None when the message carries no parameter, else a name -> value dict
+        params = None
+        if biomaj_file_info.remote_file.param:
+            params = dict((pb_param.name, pb_param.value) for pb_param in biomaj_file_info.remote_file.param)
+
+        handler_options = {
+            'remote_files': remote_files,
+            'credentials': biomaj_file_info.remote_file.credentials,
+            'http_parse': biomaj_file_info.remote_file.http_parse,
+            'http_method': message_pb2.DownloadFile.HTTP_METHOD.Name(biomaj_file_info.http_method),
+            'param': params,
+            'proxy': proxy,
+            'proxy_auth': proxy_auth,
+            'save_as': biomaj_file_info.remote_file.save_as,
+            'timeout_download': biomaj_file_info.timeout_download,
+            'offline_dir': biomaj_file_info.local_dir
+        }
+        return self.get_handler(protocol_name, server, remote_dir, **handler_options)
+
+    def clean(self, biomaj_file_info=None):
+        '''
+        Clean session and download info
+
+        Deletes the redis keys of the session: the session key itself and its
+        error, progress, files and error:info entries. Bank and session come
+        from biomaj_file_info when given, else from the service itself.
+        '''
+        (bank, session) = (self.bank, self.session)
+        if biomaj_file_info:
+            (bank, session) = (biomaj_file_info.bank, biomaj_file_info.session)
+
+        self.logger.debug('Clean %s session %s' % (bank, session))
+        for suffix in ('', ':error', ':progress', ':files', ':error:info'):
+            self.redis_client.delete(self.config['redis']['prefix'] + ':' + bank + ':session:' + session + suffix)
+
+    def _create_session(self, bank):
+        '''
+        Creates a unique session
+
+        The session id is a random UUID, flagged with value 1 in redis under
+        prefix:bank:session:id and remembered on the service with its bank.
+        '''
+        new_session = str(uuid.uuid4())
+        self.session = new_session
+        session_key = self.config['redis']['prefix'] + ':' + bank + ':session:' + new_session
+        self.redis_client.set(session_key, 1)
+        self.logger.debug('Create %s new session %s' % (bank, new_session))
+        self.bank = bank
+        return new_session
+
+    def download_errors(self, biomaj_file_info):
+        '''
+        Get errors
+
+        Pops the error messages of the session from its redis error:info
+        list until an empty answer is returned, and returns them as a list.
+        '''
+        errors_key = self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':error:info'
+        errors = []
+        while True:
+            error = self.redis_client.rpop(errors_key)
+            if not error:
+                break
+            errors.append(error)
+        return errors
+
+    def download_status(self, biomaj_file_info):
+        '''
+        Get current status
+
+        :return: tuple (progress, error) of integers read from redis, each
+                 one being -1 when its key does not exist
+        '''
+        session_key = self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session
+        error = self.redis_client.get(session_key + ':error')
+        progress = self.redis_client.get(session_key + ':progress')
+        error = -1 if error is None else error
+        progress = -1 if progress is None else progress
+        return (int(progress), int(error))
+
+    def list_status(self, biomaj_file_info):
+        '''
+        Tell whether the progress key of the session holds a non-empty value
+        in redis (list() increments it once the listing is stored).
+        '''
+        progress_key = self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':progress'
+        return bool(self.redis_client.get(progress_key))
+
+    def list_result(self, biomaj_file_info, protobuf_decode=True):
+        '''
+        Get file list result
+
+        :param protobuf_decode: parse the stored value into a FileList message
+        :return: a message_pb2.FileList (empty if nothing is stored) when
+                 protobuf_decode is True, else the raw redis value
+        '''
+        file_list = self.redis_client.get(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':files')
+        if protobuf_decode:
+            file_list_pb2 = message_pb2.FileList()
+            if file_list is not None:
+                # Parse the stored payload; the message used to be given
+                # itself as input, so the redis content was never decoded.
+                # NOTE(review): the redis client is created with
+                # decode_responses=True, so under Python 3 this value is text
+                # and may need to be turned back into bytes - confirm.
+                file_list_pb2.ParseFromString(file_list)
+            return file_list_pb2
+
+        return file_list
+
+    def _list(self, download_handler, biomaj_file_info):
+        '''
+        List remote content, no session management
+
+        On failure the session error flag is set to 1 in redis, the message is
+        pushed to its error:info list and an empty FileList is returned.
+
+        :return: message_pb2.FileList describing download_handler.files_to_download
+        '''
+        file_list = []
+        dir_list = []
+        file_list_pb2 = message_pb2.FileList()
+
+        try:
+            (file_list, dir_list) = download_handler.list()
+            download_handler.match(biomaj_file_info.remote_file.matches, file_list, dir_list)
+        except Exception as e:
+            self.logger.error('List exception for bank %s: %s' % (biomaj_file_info.bank, str(e)))
+            self.redis_client.set(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':error', 1)
+            self.redis_client.lpush(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':error:info', str(e))
+        else:
+            self.logger.debug('End of download for %s session %s' % (biomaj_file_info.bank, biomaj_file_info.session))
+            for file_elt in download_handler.files_to_download:
+                file_pb2 = file_list_pb2.files.add()
+                file_pb2.name = file_elt['name']
+                file_pb2.root = file_elt['root']
+                if 'save_as' in file_elt:
+                    file_pb2.save_as = file_elt['save_as']
+                if 'url' in file_elt:
+                    file_pb2.url = file_elt['url']
+                if 'param' in file_elt and file_elt['param']:
+                    # Neither FileList nor File declares a 'param' field in
+                    # message.proto: file_list_pb2.param.add() raised an
+                    # AttributeError here. Parameters cannot be carried by
+                    # the list result, so they are reported and skipped.
+                    self.logger.debug('Parameters of %s not stored in file list' % (file_elt['name']))
+                metadata = message_pb2.File.MetaData()
+                metadata.permissions = file_elt['permissions']
+                metadata.group = file_elt['group']
+                metadata.size = int(file_elt['size'])
+                metadata.hash = file_elt['hash']
+                metadata.year = int(file_elt['year'])
+                metadata.month = int(file_elt['month'])
+                metadata.day = int(file_elt['day'])
+                if 'format' in file_elt:
+                    metadata.format = file_elt['format']
+                file_pb2.metadata.MergeFrom(metadata)
+        return file_list_pb2
+
+    def list(self, biomaj_file_info):
+        '''
+        List remote content
+
+        Stores the serialized FileList under the session 'files' key and
+        increments the session 'progress' key. Does nothing if the session
+        key is not in redis anymore.
+        '''
+        self.logger.debug('New list request %s session %s' % (biomaj_file_info.bank, biomaj_file_info.session))
+        session = self.redis_client.get(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session)
+        if not session:
+            self.logger.debug('Session %s for bank %s has expired, skipping download of %s' % (biomaj_file_info.session, biomaj_file_info.bank, biomaj_file_info.remote_file.files))
+            return
+        download_handler = self._get_handler(biomaj_file_info)
+        if download_handler is None:
+            # No early return: _list records the failure in redis and the
+            # progress counter below must still be incremented.
+            self.logger.error('Could not get a handler for %s with session %s' % (biomaj_file_info.bank, biomaj_file_info.session))
+
+        file_list_pb2 = self._list(download_handler, biomaj_file_info)
+
+        # Store the serialized bytes as they are: wrapping them in str() saves
+        # the "b'...'" representation under Python 3, which cannot be parsed.
+        self.redis_client.set(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':files', file_list_pb2.SerializeToString())
+        self.redis_client.incr(self.config['redis']['prefix'] + ':' + biomaj_file_info.bank + ':session:' + biomaj_file_info.session + ':progress')
+
+    def local_download(self, biomaj_file_info):
+        '''
+        Download files, no session
+
+        :return: list of downloaded files, completed by get_file_info
+        :raises Exception: if no handler exists for the requested protocol
+        '''
+        download_handler = self._get_handler(biomaj_file_info)
+        if download_handler is None:
+            msg = 'Could not get a handler for %s with session %s' % (biomaj_file_info.bank, biomaj_file_info.session)
+            self.logger.error(msg)
+            # Fail with a clear message instead of an AttributeError on None
+            raise Exception(msg)
+        downloaded_files = download_handler.download(biomaj_file_info.local_dir)
+        self.logger.debug("Downloaded " + str(len(downloaded_files)) + " file in " + biomaj_file_info.local_dir)
+        self.get_file_info(biomaj_file_info.local_dir, downloaded_files)
+        return downloaded_files
+
+    def download(self, biomaj_file_info):
+        '''
+        Download files
+
+        Store in redis the progress and count of errors under:
+        - prefix:bank_name:session:session_id:error
+        - prefix:bank_name:session:session_id:progress
+
+        Counters are only updated while the session key still exists.
+        Returns the downloaded files (empty list on error), or None when the
+        session does not exist at call time.
+        '''
+        bank = biomaj_file_info.bank
+        session_id = biomaj_file_info.session
+        session_key = self.config['redis']['prefix'] + ':' + bank + ':session:' + session_id
+
+        self.logger.debug('New download request %s session %s' % (bank, session_id))
+        if not self.redis_client.get(session_key):
+            self.logger.debug('Session %s for bank %s has expired, skipping download of %s' % (session_id, bank, biomaj_file_info.remote_file.files))
+            return
+
+        downloaded_files = []
+        try:
+            downloaded_files = self.local_download(biomaj_file_info)
+        except Exception as e:
+            self.logger.exception("Download error:%s:%s:%s" % (bank, session_id, str(e)))
+            # If session deleted, do not track
+            if self.redis_client.get(session_key):
+                self.redis_client.incr(session_key + ':error')
+                self.redis_client.lpush(session_key + ':error:info', str(e))
+        else:
+            self.logger.debug('End of download for %s session %s' % (bank, session_id))
+
+        # If session deleted, do not track
+        if self.redis_client.get(session_key):
+            self.redis_client.incr(session_key + ':progress')
+        return downloaded_files
+
+    def get_file_info(self, local_dir, downloaded_files):
+        '''
+        Complete, in place, each downloaded file dict with data from os.stat
+        on local_dir + '/' + save_as: permissions, group, user and size as
+        strings, then month, day and year of the modification time.
+        Does nothing when downloaded_files is None.
+        '''
+        if downloaded_files is None:
+            return
+        for entry in downloaded_files:
+            file_stat = os.stat(local_dir + '/' + entry['save_as'])
+            modified = datetime.datetime.fromtimestamp(file_stat.st_mtime)
+            entry['permissions'] = str(file_stat.st_mode)
+            entry['group'] = str(file_stat.st_gid)
+            entry['user'] = str(file_stat.st_uid)
+            entry['size'] = str(file_stat.st_size)
+            entry['month'] = modified.month
+            entry['day'] = modified.day
+            entry['year'] = modified.year
+
+    def ask_download(self, biomaj_info_file):
+        '''
+        Publish a serialized message on the 'biomajdownload' queue through
+        the default exchange.
+        '''
+        # make message persistent
+        persistent = pika.BasicProperties(delivery_mode=2)
+        payload = biomaj_info_file.SerializeToString()
+        self.channel.basic_publish(
+            exchange='',
+            routing_key='biomajdownload',
+            body=payload,
+            properties=persistent)
+
+    def callback_messages(self, ch, method, properties, body):
+        '''
+        Manage download and send ACK message
+
+        The body is parsed as an Operation: type 0 runs list(), type 1 runs
+        download() followed by the registered download callback, any other
+        type is skipped with a warning. Errors are logged and the message is
+        acknowledged in every case.
+        '''
+        try:
+            operation = message_pb2.Operation()
+            operation.ParseFromString(body)
+            message = operation.download
+            span = None
+            if operation.trace and operation.trace.trace_id:
+                url = str(message.remote_file.protocol) + ':' + str(message.remote_file.server) + ':' + str(message.remote_file.remote_dir)
+                span = Zipkin('biomaj-download-executor', str(message.remote_file.server), trace_id=operation.trace.trace_id, parent_id=operation.trace.span_id)
+                span.add_binary_annotation('url', url)
+                span.add_binary_annotation('local_dir', str(message.local_dir))
+
+            self.logger.debug('Received message: %s' % (message))
+            if operation.type == 0:
+                self.logger.debug('List operation %s, %s' % (message.bank, message.session))
+                if len(message.remote_file.matches) == 0:
+                    self.logger.error('No pattern match for a list operation')
+                else:
+                    self.list(message)
+            elif operation.type == 1:
+                self.logger.debug('Download operation %s, %s' % (message.bank, message.session))
+                downloaded_files = self.download(message)
+                if self.download_callback is not None:
+                    self.download_callback(message.bank, downloaded_files)
+            else:
+                # Logger.warn is a deprecated alias of Logger.warning
+                self.logger.warning('Wrong message type, skipping')
+            if span:
+                span.trace()
+        except Exception as e:
+            self.logger.error('Error with message: %s' % (str(e)))
+            traceback.print_exc()
+        # Acknowledge even on failure so the message is not delivered again
+        ch.basic_ack(delivery_tag=method.delivery_tag)
+
+    def wait_for_messages(self):
+        '''
+        Loop queue waiting for messages
+
+        Declares the durable 'biomajdownload' queue, sets the prefetch count
+        to 1 and consumes with callback_messages.
+        '''
+        queue_name = 'biomajdownload'
+        channel = self.channel
+        channel.queue_declare(queue=queue_name, durable=True)
+        channel.basic_qos(prefetch_count=1)
+        channel.basic_consume(self.callback_messages, queue=queue_name)
+        channel.start_consuming()
diff --git a/biomaj_download/message/__init__.py b/biomaj_download/message/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj_download/message/message.proto b/biomaj_download/message/message.proto
new file mode 100644
index 0000000..29486e4
--- /dev/null
+++ b/biomaj_download/message/message.proto
@@ -0,0 +1,122 @@
+package biomaj;
+
+// Description of one remote file, with optional metadata.
+message File {
+ // Name of the file
+ required string name = 1;
+ // Location on remote server, defaults to root directory
+ optional string root = 2;
+ // Save file under different name
+ optional string save_as = 3;
+ // Specific file url
+ optional string url = 4;
+
+ // Optional attributes of the file; every field may be left unset.
+ message MetaData {
+ optional string permissions = 1;
+ optional string group = 2;
+ optional int64 size = 3;
+ optional string hash = 4;
+ // Date of the file, split in year / month / day
+ optional int32 year = 5;
+ optional int32 month = 6;
+ optional int32 day = 7;
+ optional string format = 8;
+ optional string md5 = 9;
+ optional int64 download_time = 10;
+ }
+
+ optional MetaData metadata = 5;
+
+}
+
+// List of files; it only carries File entries (no parameter field).
+message FileList {
+ repeated File files = 1;
+}
+
+// Envelope of a request: the kind of operation plus its payload.
+message Operation {
+
+ // Kind of request carried by the message
+ enum OPERATION {
+ LIST = 0;
+ DOWNLOAD = 1;
+ PROCESS = 2;
+ }
+
+ required OPERATION type = 1;
+ // Payload of a download request
+ optional DownloadFile download = 2;
+ // Payload of a process request
+ optional Process process = 3;
+ // Optional tracing identifiers (trace and span) attached to the request
+ message Trace {
+ required string trace_id = 1;
+ required string span_id = 2;
+ }
+
+ optional Trace trace = 4;
+}
+
+// Payload of a PROCESS operation.
+// NOTE(review): exec presumably names the command to run - confirm with its consumer.
+message Process {
+ required string exec = 1;
+}
+
+// Download request: files of one remote location to fetch for a bank session.
+message DownloadFile {
+ // Bank name and session identifier the request belongs to
+ required string bank = 1;
+ required string session = 2;
+
+ // Local directory where files are downloaded
+ required string local_dir = 3;
+
+ optional int32 timeout_download = 4;
+
+ // Supported protocols
+ enum Protocol {
+ FTP = 0;
+ SFTP = 1;
+ HTTP = 2;
+ HTTPS = 3;
+ DIRECTFTP = 4;
+ DIRECTHTTP = 5;
+ DIRECTHTTPS = 6;
+ LOCAL = 7;
+ RSYNC = 8;
+ }
+
+ // Name / value parameter
+ message Param {
+ required string name = 1;
+ required string value = 2;
+ }
+
+ // Regular expressions and group indexes used to parse an HTTP directory listing
+ message HttpParse {
+ required string dir_line = 1 [default = '<img[\\s]+src="[\\S]+"[\\s]+alt="\\[DIR\\]"[\\s]*/?>[\\s]*<a[\\s]+href="([\\S]+)/"[\\s]*>.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})'];
+ required string file_line = 2 [default = '<img[\\s]+src="[\\S]+"[\\s]+alt="\\[[\\s]+\\]"[\\s]*/?>[\\s]<a[\\s]+href="([\\S]+)".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})'];
+ required int32 dir_name = 3 [default =1];
+ required int32 dir_date = 4 [default = 2];
+ required int32 file_name = 5 [default = 1];
+ required int32 file_date = 6 [default = 2];
+ optional string file_date_format = 7;
+ required int32 file_size = 8 [default = 3];
+ }
+
+ // HTTP method of the request (see http_method below)
+ enum HTTP_METHOD {
+ GET = 0;
+ POST = 1;
+ }
+
+ // Remote location and the files to get from it
+ message RemoteFile {
+ repeated File files = 1;
+ required Protocol protocol = 2;
+ required string server = 3;
+ required string remote_dir = 4;
+ optional string save_as = 5;
+ repeated Param param = 6;
+ optional HttpParse http_parse = 7;
+ optional string credentials = 8;
+ // Patterns of a list request
+ repeated string matches = 9;
+ }
+
+ required RemoteFile remote_file = 5;
+
+
+ // Optional proxy used for the download
+ message Proxy {
+ required string proxy = 1;
+ optional string proxy_auth = 2;
+ }
+
+ optional Proxy proxy = 6;
+
+ // Field number 7 is not used in this message
+ optional HTTP_METHOD http_method = 8 [ default = GET];
+
+}
diff --git a/biomaj_download/message/message_pb2.py b/biomaj_download/message/message_pb2.py
new file mode 100644
index 0000000..401f01b
--- /dev/null
+++ b/biomaj_download/message/message_pb2.py
@@ -0,0 +1,844 @@
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: message.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf import descriptor_pb2
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+ name='message.proto',
+ package='biomaj',
+ serialized_pb=_b('\n\rmessage.proto\x12\x06\x62iomaj\"\x94\x02\n\x04\x46ile\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04root\x18\x02 \x01(\t\x12\x0f\n\x07save_as\x18\x03 \x01(\t\x12\x0b\n\x03url\x18\x04 \x01(\t\x12\'\n\x08metadata\x18\x05 \x01(\x0b\x32\x15.biomaj.File.MetaData\x1a\xa8\x01\n\x08MetaData\x12\x13\n\x0bpermissions\x18\x01 \x01(\t\x12\r\n\x05group\x18\x02 \x01(\t\x12\x0c\n\x04size\x18\x03 \x01(\x03\x12\x0c\n\x04hash\x18\x04 \x01(\t\x12\x0c\n\x04year\x18\x05 \x01(\x05\x12 [...]
+)
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+
+
+_OPERATION_OPERATION = _descriptor.EnumDescriptor(
+ name='OPERATION',
+ full_name='biomaj.Operation.OPERATION',
+ filename=None,
+ file=DESCRIPTOR,
+ values=[
+ _descriptor.EnumValueDescriptor(
+ name='LIST', index=0, number=0,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='DOWNLOAD', index=1, number=1,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='PROCESS', index=2, number=2,
+ options=None,
+ type=None),
+ ],
+ containing_type=None,
+ options=None,
+ serialized_start=560,
+ serialized_end=608,
+)
+_sym_db.RegisterEnumDescriptor(_OPERATION_OPERATION)
+
+_DOWNLOADFILE_PROTOCOL = _descriptor.EnumDescriptor(
+ name='Protocol',
+ full_name='biomaj.DownloadFile.Protocol',
+ filename=None,
+ file=DESCRIPTOR,
+ values=[
+ _descriptor.EnumValueDescriptor(
+ name='FTP', index=0, number=0,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='SFTP', index=1, number=1,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='HTTP', index=2, number=2,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='HTTPS', index=3, number=3,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='DIRECTFTP', index=4, number=4,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='DIRECTHTTP', index=5, number=5,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='DIRECTHTTPS', index=6, number=6,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='LOCAL', index=7, number=7,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='RSYNC', index=8, number=8,
+ options=None,
+ type=None),
+ ],
+ containing_type=None,
+ options=None,
+ serialized_start=1710,
+ serialized_end=1830,
+)
+_sym_db.RegisterEnumDescriptor(_DOWNLOADFILE_PROTOCOL)
+
+_DOWNLOADFILE_HTTP_METHOD = _descriptor.EnumDescriptor(
+ name='HTTP_METHOD',
+ full_name='biomaj.DownloadFile.HTTP_METHOD',
+ filename=None,
+ file=DESCRIPTOR,
+ values=[
+ _descriptor.EnumValueDescriptor(
+ name='GET', index=0, number=0,
+ options=None,
+ type=None),
+ _descriptor.EnumValueDescriptor(
+ name='POST', index=1, number=1,
+ options=None,
+ type=None),
+ ],
+ containing_type=None,
+ options=None,
+ serialized_start=1832,
+ serialized_end=1864,
+)
+_sym_db.RegisterEnumDescriptor(_DOWNLOADFILE_HTTP_METHOD)
+
+
+_FILE_METADATA = _descriptor.Descriptor(
+ name='MetaData',
+ full_name='biomaj.File.MetaData',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='permissions', full_name='biomaj.File.MetaData.permissions', index=0,
+ number=1, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='group', full_name='biomaj.File.MetaData.group', index=1,
+ number=2, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='size', full_name='biomaj.File.MetaData.size', index=2,
+ number=3, type=3, cpp_type=2, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='hash', full_name='biomaj.File.MetaData.hash', index=3,
+ number=4, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='year', full_name='biomaj.File.MetaData.year', index=4,
+ number=5, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='month', full_name='biomaj.File.MetaData.month', index=5,
+ number=6, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='day', full_name='biomaj.File.MetaData.day', index=6,
+ number=7, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='format', full_name='biomaj.File.MetaData.format', index=7,
+ number=8, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='md5', full_name='biomaj.File.MetaData.md5', index=8,
+ number=9, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='download_time', full_name='biomaj.File.MetaData.download_time', index=9,
+ number=10, type=3, cpp_type=2, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=134,
+ serialized_end=302,
+)
+
+_FILE = _descriptor.Descriptor(
+ name='File',
+ full_name='biomaj.File',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='name', full_name='biomaj.File.name', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='root', full_name='biomaj.File.root', index=1,
+ number=2, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='save_as', full_name='biomaj.File.save_as', index=2,
+ number=3, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='url', full_name='biomaj.File.url', index=3,
+ number=4, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='metadata', full_name='biomaj.File.metadata', index=4,
+ number=5, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[_FILE_METADATA, ],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=26,
+ serialized_end=302,
+)
+
+
+_FILELIST = _descriptor.Descriptor(
+ name='FileList',
+ full_name='biomaj.FileList',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='files', full_name='biomaj.FileList.files', index=0,
+ number=1, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=304,
+ serialized_end=343,
+)
+
+
+_OPERATION_TRACE = _descriptor.Descriptor(
+ name='Trace',
+ full_name='biomaj.Operation.Trace',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='trace_id', full_name='biomaj.Operation.Trace.trace_id', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='span_id', full_name='biomaj.Operation.Trace.span_id', index=1,
+ number=2, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=516,
+ serialized_end=558,
+)
+
+_OPERATION = _descriptor.Descriptor(
+ name='Operation',
+ full_name='biomaj.Operation',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='type', full_name='biomaj.Operation.type', index=0,
+ number=1, type=14, cpp_type=8, label=2,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='download', full_name='biomaj.Operation.download', index=1,
+ number=2, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='process', full_name='biomaj.Operation.process', index=2,
+ number=3, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='trace', full_name='biomaj.Operation.trace', index=3,
+ number=4, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[_OPERATION_TRACE, ],
+ enum_types=[
+ _OPERATION_OPERATION,
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=346,
+ serialized_end=608,
+)
+
+
+_PROCESS = _descriptor.Descriptor(
+ name='Process',
+ full_name='biomaj.Process',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='exec', full_name='biomaj.Process.exec', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=610,
+ serialized_end=633,
+)
+
+
+_DOWNLOADFILE_PARAM = _descriptor.Descriptor(
+ name='Param',
+ full_name='biomaj.DownloadFile.Param',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='name', full_name='biomaj.DownloadFile.Param.name', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='value', full_name='biomaj.DownloadFile.Param.value', index=1,
+ number=2, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=885,
+ serialized_end=921,
+)
+
+_DOWNLOADFILE_HTTPPARSE = _descriptor.Descriptor(
+ name='HttpParse',
+ full_name='biomaj.DownloadFile.HttpParse',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='dir_line', full_name='biomaj.DownloadFile.HttpParse.dir_line', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=True, default_value=_b("<img[\\s]+src=\"[\\S]+\"[\\s]+alt=\"\\[DIR\\]\"[\\s]*/?>[\\s]*<a[\\s]+href=\"([\\S]+)/\"[\\s]*>.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='file_line', full_name='biomaj.DownloadFile.HttpParse.file_line', index=1,
+ number=2, type=9, cpp_type=9, label=2,
+ has_default_value=True, default_value=_b("<img[\\s]+src=\"[\\S]+\"[\\s]+alt=\"\\[[\\s]+\\]\"[\\s]*/?>[\\s]<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='dir_name', full_name='biomaj.DownloadFile.HttpParse.dir_name', index=2,
+ number=3, type=5, cpp_type=1, label=2,
+ has_default_value=True, default_value=1,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='dir_date', full_name='biomaj.DownloadFile.HttpParse.dir_date', index=3,
+ number=4, type=5, cpp_type=1, label=2,
+ has_default_value=True, default_value=2,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='file_name', full_name='biomaj.DownloadFile.HttpParse.file_name', index=4,
+ number=5, type=5, cpp_type=1, label=2,
+ has_default_value=True, default_value=1,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='file_date', full_name='biomaj.DownloadFile.HttpParse.file_date', index=5,
+ number=6, type=5, cpp_type=1, label=2,
+ has_default_value=True, default_value=2,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='file_date_format', full_name='biomaj.DownloadFile.HttpParse.file_date_format', index=6,
+ number=7, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='file_size', full_name='biomaj.DownloadFile.HttpParse.file_size', index=7,
+ number=8, type=5, cpp_type=1, label=2,
+ has_default_value=True, default_value=3,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=924,
+ serialized_end=1385,
+)
+
+_DOWNLOADFILE_REMOTEFILE = _descriptor.Descriptor(
+ name='RemoteFile',
+ full_name='biomaj.DownloadFile.RemoteFile',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='files', full_name='biomaj.DownloadFile.RemoteFile.files', index=0,
+ number=1, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='protocol', full_name='biomaj.DownloadFile.RemoteFile.protocol', index=1,
+ number=2, type=14, cpp_type=8, label=2,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='server', full_name='biomaj.DownloadFile.RemoteFile.server', index=2,
+ number=3, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='remote_dir', full_name='biomaj.DownloadFile.RemoteFile.remote_dir', index=3,
+ number=4, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='save_as', full_name='biomaj.DownloadFile.RemoteFile.save_as', index=4,
+ number=5, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='param', full_name='biomaj.DownloadFile.RemoteFile.param', index=5,
+ number=6, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='http_parse', full_name='biomaj.DownloadFile.RemoteFile.http_parse', index=6,
+ number=7, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='credentials', full_name='biomaj.DownloadFile.RemoteFile.credentials', index=7,
+ number=8, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='matches', full_name='biomaj.DownloadFile.RemoteFile.matches', index=8,
+ number=9, type=9, cpp_type=9, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=1388,
+ serialized_end=1664,
+)
+
+_DOWNLOADFILE_PROXY = _descriptor.Descriptor(
+ name='Proxy',
+ full_name='biomaj.DownloadFile.Proxy',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='proxy', full_name='biomaj.DownloadFile.Proxy.proxy', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='proxy_auth', full_name='biomaj.DownloadFile.Proxy.proxy_auth', index=1,
+ number=2, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=1666,
+ serialized_end=1708,
+)
+
+_DOWNLOADFILE = _descriptor.Descriptor(
+ name='DownloadFile',
+ full_name='biomaj.DownloadFile',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='bank', full_name='biomaj.DownloadFile.bank', index=0,
+ number=1, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='session', full_name='biomaj.DownloadFile.session', index=1,
+ number=2, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='local_dir', full_name='biomaj.DownloadFile.local_dir', index=2,
+ number=3, type=9, cpp_type=9, label=2,
+ has_default_value=False, default_value=_b("").decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='timeout_download', full_name='biomaj.DownloadFile.timeout_download', index=3,
+ number=4, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='remote_file', full_name='biomaj.DownloadFile.remote_file', index=4,
+ number=5, type=11, cpp_type=10, label=2,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='proxy', full_name='biomaj.DownloadFile.proxy', index=5,
+ number=6, type=11, cpp_type=10, label=1,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ _descriptor.FieldDescriptor(
+ name='http_method', full_name='biomaj.DownloadFile.http_method', index=6,
+ number=8, type=14, cpp_type=8, label=1,
+ has_default_value=True, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ options=None),
+ ],
+ extensions=[
+ ],
+ nested_types=[_DOWNLOADFILE_PARAM, _DOWNLOADFILE_HTTPPARSE, _DOWNLOADFILE_REMOTEFILE, _DOWNLOADFILE_PROXY, ],
+ enum_types=[
+ _DOWNLOADFILE_PROTOCOL,
+ _DOWNLOADFILE_HTTP_METHOD,
+ ],
+ options=None,
+ is_extendable=False,
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=636,
+ serialized_end=1864,
+)
+
+_FILE_METADATA.containing_type = _FILE
+_FILE.fields_by_name['metadata'].message_type = _FILE_METADATA
+_FILELIST.fields_by_name['files'].message_type = _FILE
+_OPERATION_TRACE.containing_type = _OPERATION
+_OPERATION.fields_by_name['type'].enum_type = _OPERATION_OPERATION
+_OPERATION.fields_by_name['download'].message_type = _DOWNLOADFILE
+_OPERATION.fields_by_name['process'].message_type = _PROCESS
+_OPERATION.fields_by_name['trace'].message_type = _OPERATION_TRACE
+_OPERATION_OPERATION.containing_type = _OPERATION
+_DOWNLOADFILE_PARAM.containing_type = _DOWNLOADFILE
+_DOWNLOADFILE_HTTPPARSE.containing_type = _DOWNLOADFILE
+_DOWNLOADFILE_REMOTEFILE.fields_by_name['files'].message_type = _FILE
+_DOWNLOADFILE_REMOTEFILE.fields_by_name['protocol'].enum_type = _DOWNLOADFILE_PROTOCOL
+_DOWNLOADFILE_REMOTEFILE.fields_by_name['param'].message_type = _DOWNLOADFILE_PARAM
+_DOWNLOADFILE_REMOTEFILE.fields_by_name['http_parse'].message_type = _DOWNLOADFILE_HTTPPARSE
+_DOWNLOADFILE_REMOTEFILE.containing_type = _DOWNLOADFILE
+_DOWNLOADFILE_PROXY.containing_type = _DOWNLOADFILE
+_DOWNLOADFILE.fields_by_name['remote_file'].message_type = _DOWNLOADFILE_REMOTEFILE
+_DOWNLOADFILE.fields_by_name['proxy'].message_type = _DOWNLOADFILE_PROXY
+_DOWNLOADFILE.fields_by_name['http_method'].enum_type = _DOWNLOADFILE_HTTP_METHOD
+_DOWNLOADFILE_PROTOCOL.containing_type = _DOWNLOADFILE
+_DOWNLOADFILE_HTTP_METHOD.containing_type = _DOWNLOADFILE
+DESCRIPTOR.message_types_by_name['File'] = _FILE
+DESCRIPTOR.message_types_by_name['FileList'] = _FILELIST
+DESCRIPTOR.message_types_by_name['Operation'] = _OPERATION
+DESCRIPTOR.message_types_by_name['Process'] = _PROCESS
+DESCRIPTOR.message_types_by_name['DownloadFile'] = _DOWNLOADFILE
+
+File = _reflection.GeneratedProtocolMessageType('File', (_message.Message,), dict(
+
+ MetaData = _reflection.GeneratedProtocolMessageType('MetaData', (_message.Message,), dict(
+ DESCRIPTOR = _FILE_METADATA,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.File.MetaData)
+ ))
+ ,
+ DESCRIPTOR = _FILE,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.File)
+ ))
+_sym_db.RegisterMessage(File)
+_sym_db.RegisterMessage(File.MetaData)
+
+FileList = _reflection.GeneratedProtocolMessageType('FileList', (_message.Message,), dict(
+ DESCRIPTOR = _FILELIST,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.FileList)
+ ))
+_sym_db.RegisterMessage(FileList)
+
+Operation = _reflection.GeneratedProtocolMessageType('Operation', (_message.Message,), dict(
+
+ Trace = _reflection.GeneratedProtocolMessageType('Trace', (_message.Message,), dict(
+ DESCRIPTOR = _OPERATION_TRACE,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.Operation.Trace)
+ ))
+ ,
+ DESCRIPTOR = _OPERATION,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.Operation)
+ ))
+_sym_db.RegisterMessage(Operation)
+_sym_db.RegisterMessage(Operation.Trace)
+
+Process = _reflection.GeneratedProtocolMessageType('Process', (_message.Message,), dict(
+ DESCRIPTOR = _PROCESS,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.Process)
+ ))
+_sym_db.RegisterMessage(Process)
+
+DownloadFile = _reflection.GeneratedProtocolMessageType('DownloadFile', (_message.Message,), dict(
+
+ Param = _reflection.GeneratedProtocolMessageType('Param', (_message.Message,), dict(
+ DESCRIPTOR = _DOWNLOADFILE_PARAM,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.DownloadFile.Param)
+ ))
+ ,
+
+ HttpParse = _reflection.GeneratedProtocolMessageType('HttpParse', (_message.Message,), dict(
+ DESCRIPTOR = _DOWNLOADFILE_HTTPPARSE,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.DownloadFile.HttpParse)
+ ))
+ ,
+
+ RemoteFile = _reflection.GeneratedProtocolMessageType('RemoteFile', (_message.Message,), dict(
+ DESCRIPTOR = _DOWNLOADFILE_REMOTEFILE,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.DownloadFile.RemoteFile)
+ ))
+ ,
+
+ Proxy = _reflection.GeneratedProtocolMessageType('Proxy', (_message.Message,), dict(
+ DESCRIPTOR = _DOWNLOADFILE_PROXY,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.DownloadFile.Proxy)
+ ))
+ ,
+ DESCRIPTOR = _DOWNLOADFILE,
+ __module__ = 'message_pb2'
+ # @@protoc_insertion_point(class_scope:biomaj.DownloadFile)
+ ))
+_sym_db.RegisterMessage(DownloadFile)
+_sym_db.RegisterMessage(DownloadFile.Param)
+_sym_db.RegisterMessage(DownloadFile.HttpParse)
+_sym_db.RegisterMessage(DownloadFile.RemoteFile)
+_sym_db.RegisterMessage(DownloadFile.Proxy)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/biomaj_download/mimes-bio.txt b/biomaj_download/mimes-bio.txt
new file mode 100644
index 0000000..c794bf6
--- /dev/null
+++ b/biomaj_download/mimes-bio.txt
@@ -0,0 +1,18 @@
+# Biological file mime types
+application/fasta fasta fa fsa
+application/bam bam bai
+application/gff gff gff3
+application/bed bed
+application/fastq fastq
+application/gtf gtf
+application/octet-stream ab1 scf
+application/axt axt
+application/csFasta csfasta
+application/FasttqSolexa fastqsolexa
+application/Interval interval
+application/Laj laj
+application/Lav lav
+application/Maf maf
+application/QualityScore qual
+application/BlastXml blastxml
+application/Wiggle wig
diff --git a/biomaj_download/wsgi.py b/biomaj_download/wsgi.py
new file mode 100644
index 0000000..3fc868a
--- /dev/null
+++ b/biomaj_download/wsgi.py
@@ -0,0 +1,4 @@
+# WSGI entry module: re-exports the `app` object defined in
+# biomaj_download.biomaj_download_web so a WSGI server can load it from here.
+# NOTE(review): `app` is presumably the Flask application - confirm.
+from biomaj_download.biomaj_download_web import app
+
+# When executed directly as a script, start the application with app.run().
+if __name__ == "__main__":
+ app.run()
diff --git a/config.yml b/config.yml
new file mode 100644
index 0000000..3d3e73f
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,51 @@
+
+
+redis:
+ host: '127.0.0.1'
+ #host: '131.254.17.40'
+ port: 6379
+ db: 0
+ prefix: 'biomaj'
+
+rabbitmq:
+ host: '127.0.0.1'
+ port: 5672
+ user: null
+ password: null
+ virtual_host: '/'
+
+
+consul:
+ host: null
+ # Unique agent identifier name among biomaj downloaders
+ id: 'biomaj_download_agent'
+
+web:
+ debug: true
+ port: 5003
+ local_endpoint: 'http://131.254.17.40:5003'
+
+tls:
+ key: null
+ cert: null
+
+log_config:
+ 'version': 1
+ 'formatters':
+ 'generic':
+ 'format': '%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s'
+ 'handlers':
+ 'console':
+ 'class': 'logging.StreamHandler'
+ 'formatter': 'generic'
+ 'level': 'DEBUG'
+ 'loggers':
+ 'root':
+ 'level': 'INFO'
+ 'handlers':
+ - 'console'
+ 'biomaj':
+ 'level': 'DEBUG'
+ 'handlers':
+ - 'console'
+ 'disable_existing_loggers': False
diff --git a/gunicorn_conf.py b/gunicorn_conf.py
new file mode 100644
index 0000000..b5a65b0
--- /dev/null
+++ b/gunicorn_conf.py
@@ -0,0 +1,3 @@
+# Gunicorn configuration hook: reports the exiting worker's pid to
+# prometheus_client's multiprocess support.
+# NOTE(review): current prometheus_client documentation places this call in
+# the `child_exit(server, worker)` hook - confirm which hook is intended for
+# the gunicorn / prometheus_client versions in use.
+def worker_exit(server, worker):
+ # Imported inside the hook, so prometheus_client is only loaded when it runs.
+ from prometheus_client import multiprocess
+ # Pass the pid of the worker that is exiting.
+ multiprocess.mark_process_dead(worker.pid)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ee627fa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+mock
+nose
+pycurl
+py-bcrypt
+pika
+redis
+PyYAML
+protobuf
+flask
+python-consul
+prometheus_client>=0.0.18
+requests
+biomaj_core
+biomaj_zipkin
+flake8
+humanfriendly
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d697ff1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,67 @@
+try:
+ from setuptools import setup, find_packages
+except ImportError:
+ from distutils.core import setup
+
+from distutils.command.install import install
+import os
+
+
+here = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(here, 'README.md')) as f:
+ README = f.read()
+with open(os.path.join(here, 'CHANGES.txt')) as f:
+ CHANGES = f.read()
+
+
+config = {
+ 'description': 'BioMAJ download service',
+ 'long_description': README + '\n\n' + CHANGES,
+ 'author': 'Olivier Sallou',
+ 'url': 'http://biomaj.genouest.org',
+ 'download_url': 'http://biomaj.genouest.org',
+ 'author_email': 'olivier.sallou@irisa.fr',
+ 'version': '3.0.13',
+ 'classifiers': [
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Natural Language :: English',
+ 'Operating System :: POSIX :: Linux',
+ # Indicate who your project is intended for
+ 'Intended Audience :: Science/Research',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ # Pick your license as you wish (should match "license" above)
+ 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
+ # Specify the Python versions you support here. In particular, ensure
+ # that you indicate whether you support Python 2, Python 3 or both.
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.4'
+ ],
+ 'install_requires': [
+ 'biomaj_core',
+ 'biomaj_zipkin',
+ 'pycurl',
+ 'py-bcrypt',
+ 'pika',
+ 'redis',
+ 'PyYAML',
+ 'flask',
+ 'python-consul',
+ 'prometheus_client>=0.0.18',
+ 'protobuf',
+ 'requests',
+ 'humanfriendly'
+ ],
+ 'tests_require': ['nose', 'mock'],
+ 'test_suite': 'nose.collector',
+ 'packages': find_packages(),
+ 'include_package_data': True,
+ 'scripts': ['bin/biomaj_download_consumer.py'],
+ 'name': 'biomaj_download'
+}
+
+setup(**config)
diff --git a/tests/alu.properties b/tests/alu.properties
new file mode 100644
index 0000000..0e729e0
--- /dev/null
+++ b/tests/alu.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+db.name=alu
+db.type=nucleic_protein
+
+offline.dir.name=offline/ncbi/blast/alu_tmp
+dir.version=ncbi/blast/alu
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=ftp
+server=ftp.ncbi.nih.gov
+remote.dir=/blast/db/FASTA/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^alu.*\.gz$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/bank/process/test.sh b/tests/bank/process/test.sh
new file mode 100644
index 0000000..2d510e2
--- /dev/null
+++ b/tests/bank/process/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo "Testing a process"
+
+echo "test meta data"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr1#blast/chr1/chr1db"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr2#blast/chr2/chr2db"
+
+echo "test meta data 2"
+
+echo "##BIOMAJ#fasta#nucleic#organism:hg19#fasta/chr1.fa,fasta/chr2.fa"
diff --git a/tests/bank/test.fasta.gz b/tests/bank/test.fasta.gz
new file mode 100644
index 0000000..666d6f2
Binary files /dev/null and b/tests/bank/test.fasta.gz differ
diff --git a/tests/bank/test2.fasta b/tests/bank/test2.fasta
new file mode 100644
index 0000000..410ca0f
--- /dev/null
+++ b/tests/bank/test2.fasta
@@ -0,0 +1,2 @@
+>test2
+gcgcgcgcgcgcgcgccgcgcgcgcgcgcgcggc
diff --git a/tests/bank/test_100.txt b/tests/bank/test_100.txt
new file mode 100644
index 0000000..c7f7c3b
--- /dev/null
+++ b/tests/bank/test_100.txt
@@ -0,0 +1 @@
+This is a sample file to extract Release 103 from a text file
diff --git a/tests/biomaj_tests.py b/tests/biomaj_tests.py
new file mode 100644
index 0000000..9295b50
--- /dev/null
+++ b/tests/biomaj_tests.py
@@ -0,0 +1,554 @@
+from nose.tools import *
+from nose.plugins.attrib import attr
+
+import json
+import shutil
+import os
+import tempfile
+import logging
+import copy
+import stat
+import time
+
+from mock import patch
+
+from optparse import OptionParser
+
+
+from biomaj_core.config import BiomajConfig
+from biomaj_core.utils import Utils
+from biomaj_download.download.ftp import FTPDownload
+from biomaj_download.download.direct import DirectFTPDownload, DirectHttpDownload
+from biomaj_download.download.http import HTTPDownload, HTTPParse
+from biomaj_download.download.localcopy import LocalDownload
+from biomaj_download.download.downloadthreads import DownloadThread
+from biomaj_download.download.rsync import RSYNCDownload
+
+import unittest
+
+class UtilsForTest():
+ """
+ Copy properties files to a temp directory and update properties to
+ use a temp directory
+ """
+
+ def __init__(self):
+ """
+ Setup the temp dirs and files.
+ """
+ self.global_properties = None
+ self.bank_properties = None
+
+ self.test_dir = tempfile.mkdtemp('biomaj')
+
+ self.conf_dir =os.path.join(self.test_dir,'conf')
+ if not os.path.exists(self.conf_dir):
+ os.makedirs(self.conf_dir)
+ self.data_dir =os.path.join(self.test_dir,'data')
+ if not os.path.exists(self.data_dir):
+ os.makedirs(self.data_dir)
+ self.log_dir =os.path.join(self.test_dir,'log')
+ if not os.path.exists(self.log_dir):
+ os.makedirs(self.log_dir)
+ self.process_dir =os.path.join(self.test_dir,'process')
+ if not os.path.exists(self.process_dir):
+ os.makedirs(self.process_dir)
+ self.lock_dir =os.path.join(self.test_dir,'lock')
+ if not os.path.exists(self.lock_dir):
+ os.makedirs(self.lock_dir)
+ self.cache_dir =os.path.join(self.test_dir,'cache')
+ if not os.path.exists(self.cache_dir):
+ os.makedirs(self.cache_dir)
+
+
+ if self.global_properties is None:
+ self.__copy_global_properties()
+
+ if self.bank_properties is None:
+ self.__copy_test_bank_properties()
+
+ def clean(self):
+ """
+ Deletes temp directory
+ """
+ shutil.rmtree(self.test_dir)
+
+ def __copy_test_bank_properties(self):
+ if self.bank_properties is not None:
+ return
+ self.bank_properties = ['alu', 'local', 'testhttp','directhttp']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ for b in self.bank_properties:
+ from_file = os.path.join(curdir, b+'.properties')
+ to_file = os.path.join(self.conf_dir, b+'.properties')
+ shutil.copyfile(from_file, to_file)
+
+ self.bank_process = ['test.sh']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ procdir = os.path.join(curdir, 'bank/process')
+ for proc in self.bank_process:
+ from_file = os.path.join(procdir, proc)
+ to_file = os.path.join(self.process_dir, proc)
+ shutil.copyfile(from_file, to_file)
+ os.chmod(to_file, stat.S_IRWXU)
+
+ # Manage local bank test, use bank test subdir as remote
+ properties = ['multi.properties', 'computederror.properties', 'error.properties', 'local.properties', 'localprocess.properties', 'testhttp.properties', 'computed.properties', 'computed2.properties', 'sub1.properties', 'sub2.properties']
+ for prop in properties:
+ from_file = os.path.join(curdir, prop)
+ to_file = os.path.join(self.conf_dir, prop)
+ fout = open(to_file,'w')
+ with open(from_file,'r') as fin:
+ for line in fin:
+ if line.startswith('remote.dir'):
+ fout.write("remote.dir="+os.path.join(curdir,'bank')+"\n")
+ elif line.startswith('remote.files'):
+ fout.write(line.replace('/tmp', os.path.join(curdir,'bank')))
+ else:
+ fout.write(line)
+ fout.close()
+
+ def __copy_global_properties(self):
+ if self.global_properties is not None:
+ return
+ self.global_properties = os.path.join(self.conf_dir,'global.properties')
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ global_template = os.path.join(curdir,'global.properties')
+ fout = open(self.global_properties,'w')
+ with open(global_template,'r') as fin:
+ for line in fin:
+ if line.startswith('conf.dir'):
+ fout.write("conf.dir="+self.conf_dir+"\n")
+ elif line.startswith('log.dir'):
+ fout.write("log.dir="+self.log_dir+"\n")
+ elif line.startswith('data.dir'):
+ fout.write("data.dir="+self.data_dir+"\n")
+ elif line.startswith('process.dir'):
+ fout.write("process.dir="+self.process_dir+"\n")
+ elif line.startswith('lock.dir'):
+ fout.write("lock.dir="+self.lock_dir+"\n")
+ else:
+ fout.write(line)
+ fout.close()
+
+
+class TestBiomajUtils(unittest.TestCase):
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+
+ def test_mimes(self):
+ fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),'bank/test2.fasta')
+ (mime, encoding) = Utils.detect_format(fasta_file)
+ self.assertTrue('application/fasta' == mime)
+
+ @attr('compress')
+ def test_uncompress(self):
+ from_file = { 'root': os.path.dirname(os.path.realpath(__file__)),
+ 'name': 'bank/test.fasta.gz'
+ }
+
+ to_dir = self.utils.data_dir
+ Utils.copy_files([from_file], to_dir)
+ Utils.uncompress(os.path.join(to_dir, from_file['name']))
+ self.assertTrue(os.path.exists(to_dir+'/bank/test.fasta'))
+
+ def test_copy_with_regexp(self):
+ from_dir = os.path.dirname(os.path.realpath(__file__))
+ to_dir = self.utils.data_dir
+ Utils.copy_files_with_regexp(from_dir, to_dir, ['.*\.py'])
+ self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
+
+ def test_copy(self):
+ from_dir = os.path.dirname(os.path.realpath(__file__))
+ local_file = 'biomaj_tests.py'
+ files_to_copy = [ {'root': from_dir, 'name': local_file}]
+ to_dir = self.utils.data_dir
+ Utils.copy_files(files_to_copy, to_dir)
+ self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
+
+class TestBiomajLocalDownload(unittest.TestCase):
+ """
+ Test Local downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ self.curdir = os.path.dirname(os.path.realpath(__file__))
+ self.examples = os.path.join(self.curdir,'bank') + '/'
+
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_local_list(self):
+ locald = LocalDownload(self.examples)
+ (file_list, dir_list) = locald.list()
+ locald.close()
+ self.assertTrue(len(file_list) > 1)
+
+ def test_local_download(self):
+ locald = LocalDownload(self.examples)
+ (file_list, dir_list) = locald.list()
+ locald.match([r'^test.*\.gz$'], file_list, dir_list)
+ locald.download(self.utils.data_dir)
+ locald.close()
+ self.assertTrue(len(locald.files_to_download) == 1)
+
+ def test_local_download_in_subdir(self):
+ locald = LocalDownload(self.curdir+'/')
+ (file_list, dir_list) = locald.list()
+ locald.match([r'^/bank/test.*\.gz$'], file_list, dir_list)
+ locald.download(self.utils.data_dir)
+ locald.close()
+ self.assertTrue(len(locald.files_to_download) == 1)
+
+
+@attr('network')
+@attr('http')
+class TestBiomajHTTPDownload(unittest.TestCase):
+ """
+ Test HTTP downloader
+ """
+ def setUp(self):
+ self.utils = UtilsForTest()
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+ self.config = BiomajConfig('testhttp')
+ self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'),
+ self.config.get('http.parse.file.line'),
+ int(self.config.get('http.group.dir.name')),
+ int(self.config.get('http.group.dir.date')),
+ int(self.config.get('http.group.file.name')),
+ int(self.config.get('http.group.file.date')),
+ self.config.get('http.group.file.date_format', None),
+ int(self.config.get('http.group.file.size'))
+ )
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_http_list(self):
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_http_list_dateregexp(self):
+ #self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
+ self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_http_download_no_size(self):
+ self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'),
+ self.config.get('http.parse.file.line'),
+ int(self.config.get('http.group.dir.name')),
+ int(self.config.get('http.group.dir.date')),
+ int(self.config.get('http.group.file.name')),
+ int(self.config.get('http.group.file.date')),
+ self.config.get('http.group.file.date_format', None),
+ -1
+ )
+ self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+ def test_http_download_no_date(self):
+ self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'),
+ self.config.get('http.parse.file.line'),
+ int(self.config.get('http.group.dir.name')),
+ int(self.config.get('http.group.dir.date')),
+ int(self.config.get('http.group.file.name')),
+ -1,
+ self.config.get('http.group.file.date_format', None),
+ int(self.config.get('http.group.file.size'))
+ )
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+ def test_http_download(self):
+ self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ print(str(file_list))
+ httpd.match([r'^README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+ def test_http_download_in_subdir(self):
+ self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^dists/README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+
+@attr('directftp')
+@attr('network')
+class TestBiomajDirectFTPDownload(unittest.TestCase):
+ """
+ Test DirectFTP downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_ftp_list(self):
+ file_list = ['/blast/db/FASTA/alu.n.gz.md5']
+ ftpd = DirectFTPDownload('ftp', 'ftp.ncbi.nih.gov', '')
+ ftpd.set_files_to_download(file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_download(self):
+ file_list = ['/blast/db/FASTA/alu.n.gz.md5']
+ ftpd = DirectFTPDownload('ftp', 'ftp.ncbi.nih.gov', '')
+ ftpd.set_files_to_download(file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'alu.n.gz.md5')))
+
+
+@attr('directhttp')
+@attr('network')
+class TestBiomajDirectHTTPDownload(unittest.TestCase):
+ """
+ Test DirectHTTP downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_http_list(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd.set_files_to_download(file_list)
+ fday = ftpd.files_to_download[0]['day']
+ fmonth = ftpd.files_to_download[0]['month']
+ fyear = ftpd.files_to_download[0]['year']
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) == 1)
+ self.assertTrue(file_list[0]['size']!=0)
+ self.assertFalse(fyear == ftpd.files_to_download[0]['year'] and fmonth == ftpd.files_to_download[0]['month'] and fday == ftpd.files_to_download[0]['day'])
+
+ def test_download(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd.set_files_to_download(file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'README.html')))
+
+ def test_download_get_params_save_as(self):
+ file_list = ['/get']
+ ftpd = DirectHttpDownload('http', 'httpbin.org', '')
+ ftpd.set_files_to_download(file_list)
+ ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
+ ftpd.save_as = 'test.json'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.json')))
+ with open(os.path.join(self.utils.data_dir,'test.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['args']['key1'] == 'value1')
+
+ @attr('test')
+ def test_download_save_as(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
+ ftpd.set_files_to_download(file_list)
+ ftpd.save_as = 'test.html'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.html')))
+
+ def test_download_post_params(self):
+ #file_list = ['/debian/README.html']
+ file_list = ['/post']
+ ftpd = DirectHttpDownload('http', 'httpbin.org', '')
+ ftpd.set_files_to_download(file_list)
+ ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
+ ftpd.save_as = 'test.json'
+ ftpd.method = 'POST'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.json')))
+ with open(os.path.join(self.utils.data_dir,'test.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['form']['key1'] == 'value1')
+
+
+@attr('ftp')
+@attr('network')
+class TestBiomajFTPDownload(unittest.TestCase):
+ """
+ Test FTP downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_ftp_list(self):
+ ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
+ # ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) > 1)
+
+ @attr('test')
+ def test_download(self):
+ # ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
+ ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
+ (file_list, dir_list) = ftpd.list()
+ # ftpd.match([r'^alu.*\.gz$'], file_list, dir_list)
+ ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
+ ftpd.download(self.utils.data_dir)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download) == 2)
+
+ def test_download_in_subdir(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/')
+ (file_list, dir_list) = ftpd.list()
+ try:
+ ftpd.match([r'^db/FASTA/alu.*\.gz$'], file_list, dir_list)
+ except Exception as e:
+ print("Error: " + str(e))
+ self.skipTest("Skipping test due to remote server error")
+ ftpd.download(self.utils.data_dir)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download) == 2)
+
+ def test_download_or_copy(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/')
+ ftpd.files_to_download = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test2', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
+ ]
+ available_files = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test12', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test3', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 20},
+ {'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
+ ]
+ ftpd.download_or_copy(available_files, '/biomaj', False)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download)==2)
+ self.assertTrue(len(ftpd.files_to_copy)==2)
+
+ def test_get_more_recent_file(self):
+ files = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test2', 'year': '2013', 'month': '11', 'day': '12', 'size': 10},
+ {'name':'/test/test1', 'year': '1988', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test11', 'year': '2013', 'month': '9', 'day': '23', 'size': 10}
+ ]
+ release = Utils.get_more_recent_file(files)
+ self.assertTrue(release['year']=='2013')
+ self.assertTrue(release['month']=='11')
+ self.assertTrue(release['day']=='12')
+
+@attr('rsync')
+@attr('local')
+class TestBiomajRSYNCDownload(unittest.TestCase):
+ '''
+ Test RSYNC downloader
+ '''
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ self.curdir = os.path.dirname(os.path.realpath(__file__))
+ self.examples = os.path.join(self.curdir,'bank') + '/'
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_rsync_list(self):
+ rsyncd = RSYNCDownload('rsync', self.examples, "")
+ rsyncd.set_credentials(None)
+ rsyncd.set_offline_dir(self.utils.data_dir)
+ (files_list, dir_list) = rsyncd.list()
+ self.assertTrue(len(files_list) != 0)
+
+ def test_rsync_match(self):
+ rsyncd = RSYNCDownload('rsync', self.examples, "")
+ rsyncd.set_credentials(None)
+ rsyncd.set_offline_dir(self.utils.data_dir)
+ (files_list, dir_list) = rsyncd.list()
+ rsyncd.match([r'^test.*\.gz$'], files_list, dir_list, prefix='', submatch=False)
+ self.assertTrue(len(rsyncd.files_to_download) != 0)
+
+ def test_rsync_download(self):
+ rsyncd = RSYNCDownload('rsync', self.examples, "")
+ rsyncd.set_credentials(None)
+ rsyncd.set_offline_dir(self.utils.data_dir)
+ error = rsyncd.rsync_download(self.utils.data_dir, "test2.fasta")
+ self.assertTrue(error == 0)
+
+
+ def test_rsync_general_download(self):
+ rsyncd = RSYNCDownload('rsync', self.examples, "")
+ rsyncd.set_credentials(None)
+ rsyncd.set_offline_dir(self.utils.data_dir)
+ (files_list, dir_list) = rsyncd.list()
+ rsyncd.match([r'^test.*\.gz$'],files_list,dir_list, prefix='')
+ download_files=rsyncd.download(self.curdir)
+ self.assertTrue(len(download_files)==1)
+
+ def test_rsync_download_or_copy(self):
+ rsyncd = RSYNCDownload('rsync', self.examples, "")
+ rsyncd.set_offline_dir(self.utils.data_dir)
+ (file_list, dir_list) = rsyncd.list()
+ rsyncd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
+ files_to_download_prev = rsyncd.files_to_download
+ rsyncd.download_or_copy(rsyncd.files_to_download, self.examples, check_exists=True)
+ self.assertTrue(files_to_download_prev != rsyncd.files_to_download)
+
+ def test_rsync_download_in_subdir(self):
+ rsyncd = RSYNCDownload('rsync', self.curdir+'/', "")
+ rsyncd.set_offline_dir(self.curdir+'/')
+ (file_list, dir_list) = rsyncd.list()
+ rsyncd.match([r'^/bank/test*'], file_list, dir_list, prefix='')
+ rsyncd.download(self.utils.data_dir)
+ self.assertTrue(len(rsyncd.files_to_download) == 3)
diff --git a/tests/computed.properties b/tests/computed.properties
new file mode 100644
index 0000000..214baf4
--- /dev/null
+++ b/tests/computed.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+sub1.files.move=flat/test_.*
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local filesystem source (no remote server)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/computed2.properties b/tests/computed2.properties
new file mode 100644
index 0000000..2768000
--- /dev/null
+++ b/tests/computed2.properties
@@ -0,0 +1,45 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+
+ref.release=sub1
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local filesystem source (no remote server)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/computederror.properties b/tests/computederror.properties
new file mode 100644
index 0000000..ce4bae1
--- /dev/null
+++ b/tests/computederror.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed error local system bank test"
+db.name=computederror
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/computederror_tmp
+dir.version=test/computederror
+
+depends=sub2,error
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local filesystem source (no remote server)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/directhttp.properties b/tests/directhttp.properties
new file mode 100644
index 0000000..30f673d
--- /dev/null
+++ b/tests/directhttp.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="directhttp system bank test"
+db.name=directhttp
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/directhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Debian mirror (direct HTTP download)
+protocol=directhttp
+server=ftp2.fr.debian.org
+
+release.protocol=directhttp
+release.server=ftp2.fr.debian.org
+release.remote.dir=/debian/README
+release.file=README
+release.regexp=([0-9.]+),
+release.file.compressed=
+
+#remote.dir=common/downloads/Current_Release/Pfalciparum3D7/fasta/data/PlasmoDB-25_Pfalciparum3D7_Genome.fasta
+#plasmo/communityDownload.do?fname=Atg3_alignment.txt
+remote.dir=/debian/README.html
+remote.files=
+
+local.files=debian/README.html
+
+## Post Process ## The files should be located in the projectfiles/process directory
+BLOCKS=
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/error.properties b/tests/error.properties
new file mode 100644
index 0000000..2e50f00
--- /dev/null
+++ b/tests/error.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="error local system bank test"
+db.name=error
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/error_tmp
+dir.version=test/error
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local filesystem source (no remote server)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/error/
+remote.files=^error.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^error.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/global.properties b/tests/global.properties
new file mode 100644
index 0000000..75cb02c
--- /dev/null
+++ b/tests/global.properties
@@ -0,0 +1,123 @@
+[GENERAL]
+test=1
+conf.dir=/tmp/biomaj/config
+log.dir=/tmp/biomaj/log
+process.dir=/tmp/biomaj/process
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=/tmp/biomaj/
+lock.dir=/tmp/biomaj/lock
+cache.dir=/tmp/biomaj/cache
+
+db.url=mongodb://localhost:27017
+db.name=biomaj_test
+
+use_ldap=1
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+# Use ElasticSearch for index/search capabilities
+use_elastic=0
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj_test
+
+celery.queue=biomaj
+celery.broker=mongodb://localhost:27017/biomaj_celery
+
+# Get directory stats (can be time consuming depending on number of files etc...)
+data.stats=1
+
+# List of user admin (linux user id, comma separated)
+admin=
+
+# Auto publish on updates (no publish flag needed; can be overridden in the bank property file)
+auto_publish=0
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+mail.admin=
+mail.from=
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#Number of threads for processes
+bank.num.threads=2
+
+#Number of threads to use for downloading
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#----------------------
+# Release configuration
+#----------------------
+release.separator=_
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+#http.parse.dir.line=<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.dir.line=<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.file.line=<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+
+# Bank default access
+visibility.default=public
+
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = DEBUG
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/tests/local.properties b/tests/local.properties
new file mode 100644
index 0000000..7f6f5fd
--- /dev/null
+++ b/tests/local.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local filesystem source (no remote server)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/locallist.properties b/tests/locallist.properties
new file mode 100644
index 0000000..a901b2c
--- /dev/null
+++ b/tests/locallist.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=locallist
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/locallist_tmp
+dir.version=test/locallist
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local copy (explicit file list, see remote.files.list)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=
+remote.files=
+remote.files.list=true
+remote.files.1.path=/tmp/test.fasta.gz
+remote.files.2.path=/tmp/test2.fasta
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/localprocess.properties b/tests/localprocess.properties
new file mode 100644
index 0000000..7166186
--- /dev/null
+++ b/tests/localprocess.properties
@@ -0,0 +1,100 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local copy (files matched under remote.dir)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Pre process
+db.pre.process=META0
+
+## Remove process
+db.remove.process=META0
+
+## Post Process ## The files should be located in the projectfiles/process directory
+BLOCKS=BLOCK1,BLOCK2
+BLOCK1.db.post.process=META0
+BLOCK2.db.post.process=META1,META2,META3
+META0=PROC0
+META1=PROC1,PROC2
+META2=PROC3
+META3=PROC4,PROC5
+
+
+PROC0.name=test0
+PROC0.desc=sample test
+PROC0.cluster=false
+PROC0.type=test
+PROC0.exe=echo
+PROC0.args=test $datadir
+
+PROC1.name=test1
+PROC1.desc=sample test
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=touch
+PROC1.args=$datadir/$dirversion/$localrelease/proc1.txt
+
+PROC2.name=test2
+PROC2.desc=sample test
+PROC2.cluster=false
+PROC2.type=test
+PROC2.exe=touch
+PROC2.args=$datadir/$dirversion/$localrelease/proc2.txt
+
+PROC3.name=test3
+PROC3.desc=sample test
+PROC3.cluster=false
+PROC3.type=test
+PROC3.exe=echo
+PROC3.args=test 3
+
+PROC4.name=test4
+PROC4.desc=sample test
+PROC4.cluster=false
+PROC4.type=test
+PROC4.exe=echo
+PROC4.args=test 4
+
+PROC5.name=test5
+PROC5.desc=sample test
+PROC5.cluster=false
+PROC5.type=testmetadata
+PROC5.exe=test.sh
+PROC5.args=
+PROC5.format=test
+PROC5.types=any
+PROC5.tags=chr:chr1,organism:hg19
+# If files is set, the post-process does not have to print its generated files on STDOUT (though it may);
+# in that case, the list of files is taken from this property, using the format/types/tags set above.
+PROC5.files=dir1/file1,dir1/file2,dir1/file3
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/multi.properties b/tests/multi.properties
new file mode 100644
index 0000000..82e08f9
--- /dev/null
+++ b/tests/multi.properties
@@ -0,0 +1,60 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for multi protocol
+db.name=multi
+db.type=test
+
+offline.dir.name=offline/multi_tmp
+dir.version=multi
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Multi protocol (each remote.file.N entry below defines its own request)
+protocol=multi
+server=
+remote.dir=
+
+remote.file.0.protocol = directhttp
+remote.file.0.server = httpbin.org
+remote.file.0.path = /get
+remote.file.0.params.keys = key1,key2
+remote.file.0.params.key1 = value1
+remote.file.0.params.key2 = value2
+remote.file.0.name = test1.json
+
+remote.file.1.protocol = directhttp
+remote.file.1.method = POST
+remote.file.1.server = httpbin.org
+remote.file.1.path = /post
+remote.file.1.params.keys = key1,key2
+remote.file.1.params.key1 = value1
+remote.file.1.params.key2 = value2
+remote.file.1.name = test2.json
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/sub1.properties b/tests/sub1.properties
new file mode 100644
index 0000000..8e0c69b
--- /dev/null
+++ b/tests/sub1.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local1
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local1_tmp
+dir.version=test/local1
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local copy (files matched under remote.dir)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/sub2.properties b/tests/sub2.properties
new file mode 100644
index 0000000..b9d3142
--- /dev/null
+++ b/tests/sub2.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local2
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local2_tmp
+dir.version=test/local2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local copy (files matched under remote.dir)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/test.fasta.gz b/tests/test.fasta.gz
new file mode 100644
index 0000000..666d6f2
Binary files /dev/null and b/tests/test.fasta.gz differ
diff --git a/tests/testhttp.properties b/tests/testhttp.properties
new file mode 100644
index 0000000..ff109d9
--- /dev/null
+++ b/tests/testhttp.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for http protocol
+db.name=testhttp
+db.type=package
+
+offline.dir.name=offline/testhttp_tmp
+dir.version=testhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# HTTP download (Debian mirror)
+protocol=http
+server=ftp2.fr.debian.org
+remote.dir=/debian/dists/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+http.parse.dir.line=<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"?.*<a[\s]+href="([\S]+)\/"[\s]*>.*([\d]{4}-[\w\d]{2,5}-[\d]{2}\s[\d]{2}:[\d]{2})
+http.parse.file.line=<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*\/?><\/td><td><a[\s]+href="([\S]+)".*([\d]{4}-[\d]{2}-[\d]{2}\s[\d]{2}:[\d]{2}).*>([\d\.]+[MKG]{0,1})
+http.group.file.date_format="%%Y-%%m-%%d %%H:%%M"
+### Deployment ###
+
+keep.old.version=1
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/biomaj3-download.git
More information about the debian-med-commit
mailing list