[med-svn] [python-biomaj3] 01/07: Import Upstream version 3.0.20
Olivier Sallou
osallou at debian.org
Sat Jan 14 13:41:47 UTC 2017
This is an automated email from the git hooks/post-receive script.
osallou pushed a commit to branch master
in repository python-biomaj3.
commit 0271fd05dc8ea2805753555ef31e4ab831c85939
Author: Olivier Sallou <osallou at debian.org>
Date: Sat Jan 14 12:05:59 2017 +0000
Import Upstream version 3.0.20
---
.coveragerc | 3 +
.travis.yml | 32 +
CHANGES.txt | 103 +++
LICENSE | 662 ++++++++++++++++++
MANIFEST.in | 2 +
README.md | 173 +++++
bin/biomaj-cli.py | 531 +++++++++++++++
biomaj/__init__.py | 1 +
biomaj/bank.py | 1073 +++++++++++++++++++++++++++++
biomaj/bmajindex.py | 231 +++++++
biomaj/config.py | 409 +++++++++++
biomaj/download/__init__.py | 0
biomaj/download/direct.py | 340 ++++++++++
biomaj/download/downloadthreads.py | 105 +++
biomaj/download/ftp.py | 348 ++++++++++
biomaj/download/http.py | 138 ++++
biomaj/download/interface.py | 256 +++++++
biomaj/download/localcopy.py | 89 +++
biomaj/mimes-bio.txt | 18 +
biomaj/mongo_connector.py | 20 +
biomaj/notify.py | 55 ++
biomaj/options.py | 36 +
biomaj/process/__init__.py | 0
biomaj/process/metaprocess.py | 277 ++++++++
biomaj/process/process.py | 217 ++++++
biomaj/process/processfactory.py | 230 +++++++
biomaj/schema_version.py | 64 ++
biomaj/session.py | 224 +++++++
biomaj/user.py | 183 +++++
biomaj/utils.py | 288 ++++++++
biomaj/workflow.py | 1305 ++++++++++++++++++++++++++++++++++++
docs/Makefile | 177 +++++
docs/admin.rst | 37 +
docs/alu.properties | 42 ++
docs/bank.rst | 15 +
docs/bmajindex.rst | 14 +
docs/conf.py | 277 ++++++++
docs/config.rst | 15 +
docs/docker-compose-advanced.yml | 16 +
docs/docker-compose.yml | 11 +
docs/examples.rst | 104 +++
docs/ftp.rst | 15 +
docs/global.advanced.properties | 143 ++++
docs/http.rst | 15 +
docs/index.rst | 44 ++
docs/interface.rst | 17 +
docs/localcopy.rst | 15 +
docs/make.bat | 242 +++++++
docs/metaprocess.rst | 15 +
docs/notify.rst | 15 +
docs/options.rst | 15 +
docs/process.rst | 15 +
docs/processfactory.rst | 15 +
docs/requirements.txt | 9 +
docs/session.rst | 15 +
docs/user.rst | 15 +
docs/utils.rst | 15 +
docs/workflow.rst | 15 +
requirements.txt | 10 +
setup.py | 69 ++
tests/alu.properties | 43 ++
tests/bank/process/test.sh | 11 +
tests/bank/test.fasta.gz | Bin 0 -> 45 bytes
tests/bank/test2.fasta | 2 +
tests/bank/test_100.txt | 1 +
tests/biomaj_tests.py | 1305 ++++++++++++++++++++++++++++++++++++
tests/computed.properties | 44 ++
tests/computed2.properties | 45 ++
tests/computederror.properties | 43 ++
tests/directhttp.properties | 41 ++
tests/error.properties | 43 ++
tests/global.properties | 123 ++++
tests/local.properties | 41 ++
tests/locallist.properties | 44 ++
tests/localprocess.properties | 100 +++
tests/multi.properties | 60 ++
tests/sub1.properties | 43 ++
tests/sub2.properties | 41 ++
tests/testhttp.properties | 43 ++
tools/examples/alu.properties | 51 ++
tools/examples/global.properties | 115 ++++
tools/examples/local.properties | 55 ++
tools/process/concat.sh | 114 ++++
tools/process/formatdb.sh | 244 +++++++
tools/process/makeblastdb.sh | 212 ++++++
tools/process/scan.py | 48 ++
86 files changed, 11747 insertions(+)
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..0a4ae8f
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,3 @@
+[run]
+source = biomaj
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..4526f17
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,32 @@
+language: python
+sudo: false
+python:
+ - "2.7"
+ - "3.4"
+ - "3.5"
+services:
+ - mongodb
+ - elasticsearch
+# Apply only on main branches
+branches:
+ except:
+ - /^feature.*$/
+# command to install dependencies
+#before_install:
+# - "sudo apt-get update -qq"
+# - "sudo apt-get install -qq libldap2-dev libsasl2-dev"
+install:
+ - "pip install -r requirements.txt"
+ - "pip install coverage"
+ - "pip install python-coveralls"
+ - "python setup.py -q install"
+# - "echo data_file=$TRAVIS_BUILD_DIR/.coverage >> .coveragerc"
+# command to run tests
+before_script:
+ - sleep 10
+#script: nosetests --with-coverage --cover-package=biomaj -a '!network'
+#script: nosetests --with-coverage --cover-package=biomaj
+script: nosetests
+#after_success:
+# - coveralls
+
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 0000000..c574948
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,103 @@
+3.0.20:
+ Fix #55: Added support for https and directhttps
+ Add possibility to define files to download from a local file with remote.list parameter
+ Fix visibility modification (bug deleted the bank properties field)
+3.0.19:
+ Fix missing README.md in package
+ Fix #53 avoid duplicates in pending databases
+3.0.18:
+ Add migration method to update schema when needed
+ Manage HTTP month format to support text format (Jan, Feb, ...) and int format (01, 02, ...)
+ New optional bank property http.parse.file.date.format to extract the date in the HTTP protocol, following the Python strptime date format (http://www.tutorialspoint.com/python/time_strptime.htm)
+ Example: %d-%b-%Y %H:%M
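(As an aside, a minimal sketch of how such a format string is interpreted by Python's time.strptime; the date value below is purely illustrative:)

    import time

    # Illustrative only: parse a date string taken from an HTTP listing
    # using the example format given above.
    parsed = time.strptime('14-Jan-2017 12:05', '%d-%b-%Y %H:%M')
    print(parsed.tm_year, parsed.tm_mon, parsed.tm_mday)  # 2017 1 14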
+3.0.17:
+ Fix #47: save_as error with directhttp protocol
+ Fix #45: error with pending releases when release has dots in value
+ typo/pylint fixes
+3.0.16:
+ Do not use config values, trust database values #39
+ Fix #42: Add optional release.separator to name the bank directory bankname_release (underscore as default)
+3.0.15:
+ Fix #37: remove local files history from db and put it in cache.dir
+ Feature #38: add optional keep.old.sessions parameter to keep all sessions in database, even for removed releases
+ Feature #28: add optional release.format parameter to specify the date format of a release
+3.0.14:
+ Fix in method set_owner
+ Force release to be a str
+ Fix #32: fix --from-task issue when calling a meta process
+ Fix #34: remove release from pending when doing cleanup of old sessions
+ Remove logs on some operations
+ Add --status-ko option to list banks in error state
+ Fix #36: manage workflows ended by error or left unfinished
+3.0.13:
+ Fix #27: Thread lock issue during download
+ New optional attribute in bank properties: timeout.download
+ HTTP protocol fix (deepcopy error)
+3.0.12:
+ Fix index deletion on bank removal
+ Fix lock errors on dir creation for multi-threads,
+ pre-create directory structure in offline directory
+ Fix #26: save error when too many files in bank
+3.0.11:
+ Fix in session management with pre and rm processes
+ Fix #23: Check workflow step name passed to
+ --stop-after/--start-after/--from-task
+ Fix #24: deprecated delete_by_query method in elasticsearch
+ Add some controls on base directories
+
+3.0.10:
+ Change dir to process.dir to find processes in subdirs
+ If all files found in offline dir, continue workflow with no download
+ Remove extra log files for bank dependencies (computed banks)
+ Fix computed bank update when sub banks are not updated
+ Fix #15 when remote reverts to a previous release
+ Feature #16: get possibility not to download files (for computed banks for
+ example). Set protocol='none' in bank properties.
+ Fix on --check with some protocols
+ Fix #21 release.file not supported for directhttp protocol
+ Feature #22: add localrelease and remoterelease bank properties to use the
+ remote release as an expression in other properties
+ => remote.dir = xx/yy/%(remoterelease)s/zz
+ Feature #17,#20: detect remote modifications even if release is the same
+ new parameter release.control (true, false) to force a check
+ even if remote release (file controlled or date) is the same.
+ Fix on 'multi' protocol
+ Fix on "save_as" regexp when remote.files starts with a ^ character.
+
+3.0.9:
+ Fix thread synchro issue:
+ during download some download threads could be alive while the main thread continues the workflow
+ the fix prevents using Ctrl-C during download
+ Workflow fix:
+ if subtask of workflow fails, fail main task
+3.0.8:
+ do not test index if elasticsearch is not up
+ minor fixes
+ add http proxy support
+ pylint fixes
+ retry uncompress once in case of failure (#13)
+3.0.7:
+ Reindent code, pep8 fixes
+ Various fixes on var names and OrderedDict support for Python < 2.7
+ Merge config files to be able to reference global.properties variables in bank
+ property file in format %(xx)s
+ Use ConfigParser instead of SafeConfigParser, which will be deprecated
+3.0.6:
+ Add option --remove-pending to remove all pending sessions and directories
+ Add process env variables logdir and logfile
+ Fix Unicode issue with old versions of PyCurl.
+3.0.5:
+ Fix removal workflow during an update workflow, removedrelease was current
+ release.
+ Fix shebang of biomaj-cli, and python 2/3 compat issue
+3.0.4:
+ Update code to make it Python 3 compatible
+ Use ldap3 library (pure Python and p2,3 compatible) instead of python-ldap
+ get possibility to save downloaded files for ftp and http without keeping full
+ directory structure:
+ remote.files can include groups to save file without directory structure,
+ or partial directories only, examples:
+ remote.files = genomes/fasta/.*\.gz => save files in offline directory, keeping remote structure offlinedir/genomes/fasta/
+ remote.files = genomes/fasta/(.*\.gz) => save files in offline directory offlinedir/
+ remote.files = genomes/(fasta)/(.*\.gz) => save files in offline directory offlinedir/fasta
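(A minimal sketch, not BioMAJ's actual download code, of how the capture groups in remote.files determine the saved path as described above; the patterns and path are only examples:)

    import re

    def saved_path(pattern, remote_path):
        # Keep only the captured groups when the pattern defines any,
        # otherwise keep the full remote directory structure.
        match = re.search(pattern, remote_path)
        if match is None:
            return None
        return '/'.join(match.groups()) if match.groups() else remote_path

    print(saved_path(r'genomes/fasta/.*\.gz', 'genomes/fasta/chr1.fa.gz'))      # genomes/fasta/chr1.fa.gz
    print(saved_path(r'genomes/fasta/(.*\.gz)', 'genomes/fasta/chr1.fa.gz'))    # chr1.fa.gz
    print(saved_path(r'genomes/(fasta)/(.*\.gz)', 'genomes/fasta/chr1.fa.gz'))  # fasta/chr1.fa.gz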
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..cebe035
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,662 @@
+GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..ce94e14
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.txt *.md
+recursive-include biomaj *.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..565d596
--- /dev/null
+++ b/README.md
@@ -0,0 +1,173 @@
+BioMAJ3
+=====
+
+This project is a complete rewrite of BioMAJ (http://biomaj.genouest.org).
+
+BioMAJ (BIOlogie Mise A Jour) is a workflow engine dedicated to data
+synchronization and processing. The Software automates the update cycle and the
+supervision of the locally mirrored databank repository.
+
+Common usages are to download remote databanks (Genbank for example) and apply
+some transformations (blast indexing, emboss indexing,...). Any script can be
+applied on downloaded data. When all treatments are successfully applied, the
+bank is put in "production" in a dedicated release directory.
+With cron tasks, updates can be executed at regular intervals; data are
+downloaded again only if a change is detected.
+
+More documentation is available in the wiki pages.
+
+Getting started
+===============
+
+Edit the global.properties file to match your settings. The minimal configuration covers the database connection and directories.
+
+ biomaj-cli.py -h
+
+ biomaj-cli.py --config global.properties --status
+
+ biomaj-cli.py --config global.properties --bank alu --update
+
+Migration
+=========
+
+To migrate from a previous BioMAJ version, a script is available at:
+https://github.com/genouest/biomaj-migrate. The script imports the old database into
+the new one and updates configuration files to the new format. The data directory stays the same.
+
+Application Features
+====================
+
+* Synchronisation:
+ * Multiple remote protocols (ftp, sftp, http, local copy, ....)
+ * Data transfers integrity check
+ * Release versioning using an incremental approach
+ * Multi-threading
+ * Data extraction (gzip, tar, bzip)
+ * Data tree directory normalisation
+
+
+* Pre & post processing:
+ * Advanced workflow description (D.A.G)
+ * Post-process indexation for various bioinformatics software (blast, srs,
+ fastacmd, readseq, etc…)
+ * Easy integration of personal scripts for bank post-processing automation
+
+
+* Supervision:
+ * Optional Administration web interface (biomaj-watcher)
+ * CLI management
+ * Mail alerts for the update cycle supervision
+
+
+
+Dependencies
+============
+
+Packages:
+ * Debian: libcurl-dev, gcc
+ * CentOS: libcurl-devel, openldap-devel, gcc
+
+ Linux tools: tar, unzip, gunzip, bunzip
+
+Database:
+ * mongodb (local or remote)
+
+Indexing (optional):
+ * elasticsearch (global property, use_elastic=1)
+
+ElasticSearch indexing adds advanced search features to BioMAJ, for example to find
+banks having files with a specific format.
+Configuration of ElasticSearch is outside the scope of the BioMAJ documentation.
+For a basic installation, one instance of ElasticSearch is enough (low volume of
+data); in such a case, the ElasticSearch configuration file should be modified
+accordingly:
+
+ node.name: "biomaj" (or any other name)
+ index.number_of_shards: 1
+ index.number_of_replicas: 0
+
+Installation
+============
+
+After installing the dependencies, go to the BioMAJ source directory:
+
+ python setup.py install
+
+
+You should consider using a Python virtual environment (virtualenv) to install BioMAJ.
+
+In tools/examples, copy the global.properties and update it to match your local
+installation.
+
+The tools/process directory contains example process files (Python and shell).
+
+Docker
+======
+
+You can use BioMAJ with Docker (genouest/biomaj)
+
+
+ docker pull genouest/biomaj
+ docker pull mongo
+ docker run --name biomaj-mongodb -d mongo
+ # Wait ~10 seconds for mongo to initialize
+ # Create a local directory where databases will be permanently stored
+ # *local_path*
+ docker run --rm -v local_path:/var/lib/biomaj --link biomaj-mongodb:biomaj-mongodb osallou/biomaj-docker --help
+
+
+Copy your bank properties into the directory *local_path*/conf and post-processes (if any) into *local_path*/process
+
+You can override global.properties in /etc/biomaj/global.properties (-v xx/global.properties:/etc/biomaj/global.properties)
+
+No default bank property file or process is available in the container.
+
+Examples are available at https://github.com/genouest/biomaj-data
+
+API documentation
+=================
+
+https://readthedocs.org/projects/biomaj/
+
+Status
+======
+
+[![Build Status](https://travis-ci.org/genouest/biomaj.svg?branch=master)](https://travis-ci.org/genouest/biomaj)
+
+[![Documentation Status](https://readthedocs.org/projects/biomaj/badge/?version=latest)](https://readthedocs.org/projects/biomaj/?badge=latest)
+
+[![Code Health](https://landscape.io/github/genouest/biomaj/master/landscape.svg?style=flat)](https://landscape.io/github/genouest/biomaj/master)
+
+Testing
+=======
+
+Execute unit tests
+
+ nosetests
+
+Execute unit tests but disable ones needing network access
+
+ nosetests -a '!network'
+
+License
+=======
+
+A-GPL v3+
+
+Remarks
+=======
+
+BioMAJ uses libcurl; for sftp, libcurl must be compiled with sftp support
+
+To delete elasticsearch index:
+
+ curl -XDELETE 'http://localhost:9200/biomaj_test/'
+
+Credits
+======
+
+Special thanks to tuco at the Pasteur Institute for the intensive testing and new
+ideas.
+Thanks to the old BioMAJ team for the work they have done.
+
+BioMAJ is developed at the IRISA research institute.
diff --git a/bin/biomaj-cli.py b/bin/biomaj-cli.py
new file mode 100755
index 0000000..e159838
--- /dev/null
+++ b/bin/biomaj-cli.py
@@ -0,0 +1,531 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+from tabulate import tabulate
+
+import os
+import sys
+#from optparse import OptionParser
+import argparse
+import pkg_resources
+import configparser
+import shutil
+
+from biomaj.bank import Bank
+from biomaj.config import BiomajConfig
+from biomaj.notify import Notify
+from biomaj.options import Options
+from biomaj.workflow import Workflow
+from biomaj.workflow import UpdateWorkflow, RemoveWorkflow, Workflow
+
+
+def main():
+
+ parser = argparse.ArgumentParser(add_help=False)
+ parser.add_argument('-c', '--config', dest="config",help="Configuration file")
+ parser.add_argument('--check', dest="check", help="Check bank property file", action="store_true", default=False)
+ parser.add_argument('-u', '--update', dest="update", help="Update action", action="store_true", default=False)
+ parser.add_argument('--fromscratch', dest="fromscratch", help="Force a new cycle update", action="store_true", default=False)
+ parser.add_argument('-z', '--from-scratch', dest="fromscratch", help="Force a new cycle update", action="store_true", default=False)
+ parser.add_argument('-p', '--publish', dest="publish", help="Publish", action="store_true", default=False)
+ parser.add_argument('--unpublish', dest="unpublish", help="Unpublish", action="store_true", default=False)
+
+ parser.add_argument('--release', dest="release", help="release of the bank")
+ parser.add_argument('--from-task', dest="from_task", help="Start cycle at a specific task (init always executed)")
+ parser.add_argument('--process', dest="process", help="Linked to from-task, optionally specify a block, meta or process name to start from")
+ parser.add_argument('-l', '--log', dest="log", help="log level")
+ parser.add_argument('-r', '--remove', dest="remove", help="Remove a bank release", action="store_true", default=False)
+ parser.add_argument('--remove-all', dest="removeall", help="Remove all bank releases and database records", action="store_true", default=False)
+ parser.add_argument('--remove-pending', dest="removepending", help="Remove pending release", action="store_true", default=False)
+ parser.add_argument('-s', '--status', dest="status", help="Get status", action="store_true", default=False)
+ parser.add_argument('-b', '--bank', dest="bank", help="bank name")
+ parser.add_argument('--owner', dest="owner", help="change owner of the bank")
+ parser.add_argument('--stop-before', dest="stop_before", help="Stop workflow before task")
+ parser.add_argument('--stop-after', dest="stop_after", help="Stop workflow after task")
+ parser.add_argument('--freeze', dest="freeze", help="Freeze a bank release", action="store_true", default=False)
+ parser.add_argument('--unfreeze', dest="unfreeze", help="Unfreeze a bank release", action="store_true", default=False)
+ parser.add_argument('-f', '--force', dest="force", help="Force action", action="store_true", default=False)
+ parser.add_argument('-h', '--help', dest="help", help="Show usage", action="store_true", default=False)
+
+ parser.add_argument('--search', dest="search", help="Search by format and types", action="store_true", default=False)
+ parser.add_argument('--formats', dest="formats", help="List of formats to search, comma separated")
+ parser.add_argument('--types', dest="types", help="List of types to search, comma separated")
+ parser.add_argument('--query', dest="query", help="Lucene query syntax to search in index")
+
+ parser.add_argument('--show', dest="show", help="Show format files for selected bank", action="store_true", default=False)
+
+ parser.add_argument('-n', '--change-dbname', dest="newbank", help="Change old bank name to this new bank name")
+ parser.add_argument('-e', '--move-production-directories', dest="newdir", help="Change bank production directories location to this new path, path must exist")
+ parser.add_argument('--visibility', dest="visibility", help="visibility status of the bank")
+
+ parser.add_argument('--maintenance', dest="maintenance", help="Maintenance mode (on/off/status)")
+
+ parser.add_argument('--version', dest="version", help="Show version", action="store_true", default=False)
+ parser.add_argument('--status-ko', dest="statusko", help="Get bank in KO status", action="store_true", default=False)
+
+
+ options = Options()
+ parser.parse_args(namespace=options)
+
+ options.no_log = False
+
+ if options.help:
+ print('''
+ --config: global.properties file path
+
+ --status: list of banks with published release
+ [OPTIONAL]
+ --bank xx / bank: Get status details of bank
+
+ --status-ko: list of banks in error status (last run)
+
+ --log DEBUG|INFO|WARN|ERR [OPTIONAL]: set log level in logs for this run, default is set in global.properties file
+
+ --check: Check bank property file
+ [MANDATORY]
+ --bank xx: name of the bank to check (will check xx.properties)
+
+ --owner yy: Change owner of the bank (user id)
+ [MANDATORY]
+ --bank xx: name of the bank
+
+ --visibility public|private: change visibility public/private of a bank
+ [MANDATORY]
+ --bank xx: name of the bank
+
+ --change-dbname yy: Change name of the bank to this new name
+ [MANDATORY]
+ --bank xx: current name of the bank
+
+ --move-production-directories yy: Change bank production directories location to this new path, path must exist
+ [MANDATORY]
+ --bank xx: current name of the bank
+
+ --update: Update bank
+ [MANDATORY]
+ --bank xx: name of the bank(s) to update, comma separated
+ [OPTIONAL]
+ --publish: after update set as *current* version
+ --from-scratch: force a new update cycle even if the release is identical; the release will be incremented (e.g. myrel_1)
+ --stop-before xx: stop update cycle before the start of step xx
+ --stop-after xx: stop update cycle after step xx has completed
+ --from-task xx --release yy: Force a re-update cycle for bank release *yy* or from current cycle (in production directories), skipping steps up to *xx*
+ --process xx: linked to from-task, optionally specify a block, meta or process name to start from
+ --release xx: release to update
+
+ --publish: Publish bank as current release to use
+ [MANDATORY]
+ --bank xx: name of the bank to update
+ --release xx: release of the bank to publish
+ --unpublish: Unpublish bank (remove current)
+ [MANDATORY]
+ --bank xx: name of the bank to update
+
+ --remove-all: Remove all bank releases and database records
+ [MANDATORY]
+ --bank xx: name of the bank to update
+ [OPTIONAL]
+ --force: remove frozen releases
+
+ --remove-pending: Remove pending releases
+ [MANDATORY]
+ --bank xx: name of the bank to update
+
+ --remove: Remove bank release (files and database release)
+ [MANDATORY]
+ --bank xx: name of the bank to update
+ --release xx: release of the bank to remove
+
+ Release must not be the *current* version. If it is, publish a new release first.
+
+ --freeze: Freeze bank release (cannot be removed)
+ [MANDATORY]
+ --bank xx: name of the bank to update
+ --release xx: release of the bank to freeze
+
+ --unfreeze: Unfreeze bank release (can be removed)
+ [MANDATORY]
+ --bank xx: name of the bank to update
+ --release xx: release of the bank to remove
+
+    --search: basic search in bank production releases, returns a list of banks
+      --formats xx,yy: list of comma-separated formats
+      AND/OR
+      --types xx,yy: list of comma-separated types
+
+ --query "LUCENE query syntax": search in index (if activated)
+
+ --show: Show bank files per format
+ [MANDATORY]
+ --bank xx: name of the bank to show
+ [OPTIONAL]
+ --release xx: release of the bank to show
+
+ --maintenance on/off/status: (un)set biomaj in maintenance mode to prevent updates/removal
+
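+    Examples (bank name 'alu' is illustrative):
+      biomaj-cli.py --config global.properties --status
+      biomaj-cli.py --config global.properties --bank alu --update --publish
+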
+ ''')
+ return
+
+ if options.version:
+ version = pkg_resources.require('biomaj')[0].version
+ print('Version: '+str(version))
+ return
+
+ if options.stop_after or options.stop_before or options.from_task:
+ available_steps = []
+ for flow in UpdateWorkflow.FLOW:
+ available_steps.append(flow['name'])
+ for flow in RemoveWorkflow.FLOW:
+ available_steps.append(flow['name'])
+ if options.stop_after:
+ if options.stop_after not in available_steps:
+ print('Invalid step: '+options.stop_after)
+ sys.exit(1)
+ if options.stop_before:
+ if options.stop_before not in available_steps:
+ print('Invalid step: '+options.stop_before)
+ sys.exit(1)
+ if options.from_task:
+ if options.from_task not in available_steps:
+ print('Invalid step: '+options.from_task)
+ sys.exit(1)
+
+ bmaj = None
+ try:
+ if options.config is not None:
+ BiomajConfig.load_config(options.config)
+ else:
+ BiomajConfig.load_config()
+ except Exception as e:
+ print(str(e))
+ sys.exit(1)
+
+ try:
+
+ if options.maintenance:
+ if options.maintenance not in ['on', 'off', 'status']:
+ print("Wrong maintenance value [on,off,status]")
+ sys.exit(1)
+ data_dir = BiomajConfig.global_config.get('GENERAL', 'data.dir')
+ if BiomajConfig.global_config.has_option('GENERAL', 'lock.dir'):
+ lock_dir = BiomajConfig.global_config.get('GENERAL', 'lock.dir')
+ else:
+ lock_dir = data_dir
+ maintenance_lock_file = os.path.join(lock_dir,'biomaj.lock')
+ if options.maintenance == 'status':
+ if os.path.exists(maintenance_lock_file):
+ print("Maintenance: On")
+ else:
+ print("Maintenance: Off")
+ sys.exit(0)
+ if options.maintenance == 'on':
+ f = open(maintenance_lock_file, 'w')
+ f.write('1')
+ f.close()
+ print("Maintenance set to On")
+ sys.exit(0)
+ if options.maintenance == 'off':
+ if os.path.exists(maintenance_lock_file):
+ os.remove(maintenance_lock_file)
+ print("Maintenance set to Off")
+ sys.exit(0)
+
+ if options.owner:
+ if not options.bank:
+ print("Bank option is missing")
+ sys.exit(1)
+ bank = Bank(options.bank, no_log=True)
+ bank.set_owner(options.owner)
+ sys.exit(0)
+
+ if options.visibility:
+ if not options.bank:
+ print("Bank option is missing")
+ sys.exit(1)
+ if options.visibility not in ['public', 'private']:
+ print("Valid values are public|private")
+ sys.exit(1)
+ bank = Bank(options.bank, no_log=True)
+ bank.set_visibility(options.visibility)
+ print("Do not forget to update accordingly the visibility.default parameter in the configuration file")
+ sys.exit(0)
+
+ if options.newdir:
+ if not options.bank:
+ print("Bank option is missing")
+ sys.exit(1)
+      if not os.path.exists(options.newdir):
+        print("Destination directory does not exist")
+        sys.exit(1)
+ bank = Bank(options.bank, options=options, no_log=True)
+ if not bank.bank['production']:
+ print("Nothing to move, no production directory")
+ sys.exit(0)
+ bank.load_session(Workflow.FLOW, None)
+ w = Workflow(bank)
+ res = w.wf_init()
+ if not res:
+ sys.exit(1)
+ for prod in bank.bank['production']:
+ session = bank.get_session_from_release(prod['release'])
+ bank.load_session(Workflow.FLOW, session)
+ prod_path = bank.session.get_full_release_directory()
+ if os.path.exists(prod_path):
+ shutil.move(prod_path, options.newdir)
+ prod['data_dir'] = options.newdir
+ bank.banks.update({'name': options.bank}, {'$set' : { 'production': bank.bank['production'] }})
+ print("Bank production directories moved to " + options.newdir)
+ print("WARNING: do not forget to update accordingly the data.dir and dir.version properties")
+ w.wf_over()
+ sys.exit(0)
+
+ if options.newbank:
+ if not options.bank:
+ print("Bank option is missing")
+ sys.exit(1)
+ bank = Bank(options.bank, no_log=True)
+ conf_dir = BiomajConfig.global_config.get('GENERAL', 'conf.dir')
+ bank_prop_file = os.path.join(conf_dir,options.bank+'.properties')
+ config_bank = configparser.SafeConfigParser()
+ config_bank.read([os.path.join(conf_dir,options.bank+'.properties')])
+ config_bank.set('GENERAL', 'db.name', options.newbank)
+ newbank_prop_file = open(os.path.join(conf_dir,options.newbank+'.properties'),'w')
+ config_bank.write(newbank_prop_file)
+ newbank_prop_file.close()
+ bank.banks.update({'name': options.bank}, {'$set' : { 'name': options.newbank }})
+ os.remove(bank_prop_file)
+ print("Bank "+options.bank+" renamed to "+options.newbank)
+ sys.exit(0)
+
+ if options.search:
+ if options.query:
+ res = Bank.searchindex(options.query)
+        print("Query matches for: "+options.query)
+ results = [["Release", "Format(s)", "Type(s)", "Files"]]
+ for match in res:
+ results.append([match['_source']['release'],
+ str(match['_source']['format']),
+ str(match['_source']['types']),
+ ','.join(match['_source']['files'])])
+ print(tabulate(results, headers="firstrow", tablefmt="grid"))
+ else:
+ formats = []
+ if options.formats:
+ formats = options.formats.split(',')
+ types = []
+ if options.types:
+ types = options.types.split(',')
+ print("Search by formats="+str(formats)+", types="+str(types))
+ res = Bank.search(formats, types, False)
+ results = [["Name", "Release", "Format(s)", "Type(s)", 'Published']]
+ for bank in sorted(res, key=lambda bank: (bank['name'])):
+ b = bank['name']
+ bank['production'].sort(key=lambda n: n['release'], reverse=True)
+ for prod in bank['production']:
+ iscurrent = ""
+ if prod['session'] == bank['current']:
+ iscurrent = "yes"
+ results.append([b if b else '', prod['release'], ','.join(prod['formats']),
+ ','.join(prod['types']), iscurrent])
+ print(tabulate(results, headers="firstrow", tablefmt="grid"))
+ sys.exit(0)
+
+ if options.show:
+ if not options.bank:
+ print("Bank option is required")
+ sys.exit(1)
+
+ bank = Bank(options.bank, no_log=True)
+ results = [["Name", "Release", "Format(s)", "Type(s)", "Tag(s)", "File(s)"]]
+ current = None
+ fformat = None
+ if 'current' in bank.bank and bank.bank['current']:
+ current = bank.bank['current']
+ for prod in bank.bank['production']:
+ include = True
+ release = prod['release']
+ if current == prod['session']:
+ release += ' (current)'
+ if options.release and (prod['release'] != options.release and prod['prod_dir'] != options.release):
+          include = False
+ if include:
+ session = bank.get_session_from_release(prod['release'])
+ formats = session['formats']
+ afiles = []
+ atags = []
+ atypes = []
+ for fformat in list(formats.keys()):
+ for elt in formats[fformat]:
+ atypes.append(','.join(elt['types']))
+ for tag in list(elt['tags'].keys()):
+ atags.append(elt['tags'][tag])
+ for eltfile in elt['files']:
+ afiles.append(eltfile)
+ results.append([bank.bank['name'], release, fformat, ','.join(atypes),
+ ','.join(atags), ','.join(afiles)])
+ print(tabulate(results, headers="firstrow", tablefmt="grid"))
+ sys.exit(0)
+
+ if options.check:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ bank = Bank(options.bank, no_log=True)
+ print(options.bank+" check: "+str(bank.check())+"\n")
+ sys.exit(0)
+
+ if options.status:
+ if options.bank:
+ bank = Bank(options.bank, no_log=True)
+ info = bank.get_bank_release_info(full=True)
+ print(tabulate(info['info'], headers='firstrow', tablefmt='psql'))
+ print(tabulate(info['prod'], headers='firstrow', tablefmt='psql'))
+ # do we have some pending release(s)
+ if 'pend' in info and len(info['pend']) > 1:
+ print(tabulate(info['pend'], headers='firstrow', tablefmt='psql'))
+ else:
+ banks = Bank.list()
+ # Headers of output table
+ banks_list = [["Name", "Type(s)", "Release", "Visibility", "Last update"]]
+ for bank in sorted(banks, key=lambda k: k['name']):
+ bank = Bank(bank['name'], no_log=True)
+ banks_list.append(bank.get_bank_release_info()['info'])
+ print(tabulate(banks_list, headers="firstrow", tablefmt="psql"))
+ sys.exit(0)
+
+ if options.statusko:
+ banks = Bank.list()
+ banks_list = [["Name", "Type(s)", "Release", "Visibility", "Last run"]]
+ for bank in sorted(banks, key=lambda k: k['name']):
+ try:
+ bank = Bank(bank['name'], no_log=True)
+ bank.load_session(UpdateWorkflow.FLOW)
+ if bank.session is not None:
+ if bank.use_last_session and not bank.session.get_status(Workflow.FLOW_OVER):
+ wf_status = bank.session.get('workflow_status')
+ if wf_status is None or not wf_status:
+ banks_list.append(bank.get_bank_release_info()['info'])
+ except Exception as e:
+ print(str(e))
+ print(tabulate(banks_list, headers="firstrow", tablefmt="psql"))
+
+ if options.update:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ banks = options.bank.split(',')
+ gres = True
+ for bank in banks:
+ options.bank = bank
+ bmaj = Bank(bank, options)
+ print('Log file: '+bmaj.config.log_file)
+ check_status = bmaj.check()
+ if not check_status:
+ print('Skip bank ' + options.bank + ': wrong config')
+ gres = False
+ continue
+ res = bmaj.update(depends=True)
+ if not res:
+ gres = False
+ Notify.notifyBankAction(bmaj)
+ if not gres:
+ sys.exit(1)
+
+ if options.freeze:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ if not options.release:
+ print("Bank release is missing")
+ sys.exit(1)
+ bmaj = Bank(options.bank, options)
+ res = bmaj.freeze(options.release)
+ if not res:
+ sys.exit(1)
+
+ if options.unfreeze:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ if not options.release:
+ print("Bank release is missing")
+ sys.exit(1)
+ bmaj = Bank(options.bank, options)
+ res = bmaj.unfreeze(options.release)
+ if not res:
+ sys.exit(1)
+
+ if options.remove or options.removeall:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ if options.remove and not options.release:
+ print("Bank release is missing")
+ sys.exit(1)
+ if options.removeall:
+ bmaj = Bank(options.bank, options, no_log=True)
+ print('Log file: '+bmaj.config.log_file)
+ res = bmaj.removeAll(options.force)
+ else:
+ bmaj = Bank(options.bank, options)
+ print('Log file: '+bmaj.config.log_file)
+ res = bmaj.remove(options.release)
+ Notify.notifyBankAction(bmaj)
+ if not res:
+ sys.exit(1)
+
+ if options.removepending:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ bmaj = Bank(options.bank, options, no_log=True)
+ res = bmaj.remove_pending(options.release)
+ if not res:
+ sys.exit(1)
+
+ if options.unpublish:
+ if not options.bank:
+ print("Bank name is missing")
+ sys.exit(1)
+ bmaj = Bank(options.bank, options, no_log=True)
+ bmaj.load_session()
+ bmaj.unpublish()
+ sys.exit(0)
+
+ if options.publish:
+ if not options.bank:
+ print("Bank name or release is missing")
+ sys.exit(1)
+ bmaj = Bank(options.bank, options, no_log=True)
+ bmaj.load_session()
+ bank = bmaj.bank
+ session = None
+ if options.get_option('release') is None:
+ # Get latest prod release
+ if len(bank['production'])>0:
+ prod = bank['production'][len(bank['production'])-1]
+ for s in bank['sessions']:
+ if s['id'] == prod['session']:
+ session = s
+ break
+ else:
+ # Search production release matching release
+ for prod in bank['production']:
+ if prod['release'] == options.release or prod['prod_dir'] == options.release:
+ # Search session related to this production release
+ for s in bank['sessions']:
+ if s['id'] == prod['session']:
+ session = s
+ break
+ break
+ if session is None:
+ print("No production session could be found for this release")
+ sys.exit(1)
+ bmaj.session._session = session
+ bmaj.publish()
+ except Exception as e:
+ print(str(e))
+
+if __name__ == '__main__':
+ main()
diff --git a/biomaj/__init__.py b/biomaj/__init__.py
new file mode 100644
index 0000000..de40ea7
--- /dev/null
+++ b/biomaj/__init__.py
@@ -0,0 +1 @@
+__import__('pkg_resources').declare_namespace(__name__)
diff --git a/biomaj/bank.py b/biomaj/bank.py
new file mode 100644
index 0000000..b880ceb
--- /dev/null
+++ b/biomaj/bank.py
@@ -0,0 +1,1073 @@
+from builtins import str
+from builtins import object
+import os
+import logging
+import time
+import shutil
+import json
+import pkg_resources
+
+
+from datetime import datetime
+from biomaj.mongo_connector import MongoConnector
+
+from biomaj.session import Session
+from biomaj.workflow import UpdateWorkflow, RemoveWorkflow, Workflow
+from biomaj.config import BiomajConfig
+from biomaj.options import Options
+from biomaj.process.processfactory import ProcessFactory
+from biomaj.bmajindex import BmajIndex
+
+import getpass
+
+# from bson.objectid import ObjectId
+
+
+class Bank(object):
+ """
+ BioMAJ bank
+ """
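+  # Illustrative usage sketch (assumes a loaded global configuration and an existing 'alu' bank config):
+  #   BiomajConfig.load_config('global.properties')
+  #   bank = Bank('alu', no_log=True)
+  #   bank.update(depends=True)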
+
+ def __init__(self, name, options=None, no_log=False):
+ """
+    Get a bank from db or create a new one
+
+ :param name: name of the bank, must match its config file
+ :type name: str
+ :param options: bank options
+ :type options: argparse
+    :param no_log: if True, do not create a log file for the bank
+ :type no_log: bool
+ """
+ logging.debug('Initialize ' + name)
+ if BiomajConfig.global_config is None:
+ raise Exception('Configuration must be loaded first')
+
+ self.name = name
+ self.depends = []
+ self.no_log = no_log
+
+ if no_log:
+ if options is None:
+ # options = {'no_log': True}
+ options = Options()
+ options.no_log = True
+ else:
+ options.no_log = no_log
+
+ self.config = BiomajConfig(self.name, options)
+
+ if self.config.get('bank.num.threads') is not None:
+ ProcessFactory.NB_THREAD = int(self.config.get('bank.num.threads'))
+
+ if self.config.log_file is not None and self.config.log_file != 'none':
+ logging.info("Log file: " + self.config.log_file)
+
+ # self.options = Options(options)
+ if options is None:
+ self.options = Options()
+ else:
+ self.options = options
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ self.banks = MongoConnector.banks
+ self.bank = self.banks.find_one({'name': self.name})
+
+ if self.bank is None:
+ self.bank = {
+ 'name': self.name,
+ 'current': None,
+ 'sessions': [],
+ 'production': [],
+ 'properties': self.get_properties()
+ }
+ self.bank['_id'] = self.banks.insert(self.bank)
+
+ self.session = None
+ self.use_last_session = False
+
+ def check(self):
+ """
+ Checks bank configuration
+ """
+ return self.config.check()
+
+ def is_locked(self):
+ """
+    Checks if the bank is locked, i.e. an action is in progress
+ """
+ data_dir = self.config.get('data.dir')
+ lock_dir = self.config.get('lock.dir', default=data_dir)
+ lock_file = os.path.join(lock_dir, self.name + '.lock')
+ if os.path.exists(lock_file):
+ return True
+ else:
+ return False
+
+ def get_bank(self):
+ """
+ Get bank stored in db
+
+ :return: bank json object
+ """
+ return self.bank
+
+ @staticmethod
+ def get_banks_disk_usage():
+ """
+ Get disk usage per bank and release
+ """
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ bank_list = []
+ banks = MongoConnector.banks.find({}, {'name': 1, 'production': 1})
+ for b in banks:
+ bank_elt = {'name': b['name'], 'size': 0, 'releases': []}
+ for p in b['production']:
+ if p['size'] is None:
+ p['size'] = 0
+ bank_elt['size'] += p['size']
+ bank_elt['releases'].append({'name': p['release'], 'size': p['size']})
+ bank_list.append(bank_elt)
+ return bank_list
+
+ def get_bank_release_info(self, full=False):
+ """
+    Get release info for the bank. Used with --status option from biomaj-cli.py
+    :param full: display full information for the bank
+ :type full: Boolean
+ :return: Dict with keys
+ if full=True
+ - info, prod, pend
+ else
+ - info
+ """
+
+ _bank = self.bank
+ info = {}
+ release = 'N/A'
+ last_update = 'N/A'
+ if 'last_update_session' in _bank:
+ last_update = datetime.fromtimestamp(_bank['last_update_session']).strftime("%Y-%m-%d %H:%M:%S")
+
+ if full:
+ bank_info = []
+ prod_info = []
+ pend_info = []
+
+ if 'current' in _bank and _bank['current']:
+ for prod in _bank['production']:
+ if _bank['current'] == prod['session']:
+ release = prod['release']
+ # Bank info header
+ bank_info.append(["Name", "Type(s)", "Last update status", "Published release"])
+ bank_info.append([_bank['name'],
+ str(','.join(_bank['properties']['type'])),
+ str(last_update),
+ str(release)])
+ # Bank production info header
+ prod_info.append(["Session", "Remote release", "Release", "Directory", "Freeze"])
+ for prod in _bank['production']:
+ data_dir = self.config.get('data.dir')
+ dir_version = self.config.get('dir.version')
+ if 'data.dir' in prod:
+ data_dir = prod['data.dir']
+ if 'dir.version' in prod:
+ dir_version = prod['dir.version']
+ release_dir = os.path.join(data_dir,
+ dir_version,
+ prod['prod_dir'])
+ date = datetime.fromtimestamp(prod['session']).strftime('%Y-%m-%d %H:%M:%S')
+ prod_info.append([date,
+ prod['remoterelease'],
+ prod['release'],
+ release_dir,
+ 'yes' if 'freeze' in prod and prod['freeze'] else 'no'])
+ # Bank pending info header
+ if 'pending' in _bank and len(_bank['pending']) > 0:
+ pend_info.append(["Pending release", "Last run"])
+ for pending in _bank['pending']:
+ run = datetime.fromtimestamp(pending['id']).strftime('%Y-%m-%d %H:%M:%S')
+ pend_info.append([pending['release'], run])
+
+ info['info'] = bank_info
+ info['prod'] = prod_info
+ info['pend'] = pend_info
+ return info
+
+ else:
+ if 'current' in _bank and _bank['current']:
+ for prod in _bank['production']:
+ if _bank['current'] == prod['session']:
+ release = prod['remoterelease']
+ info['info'] = [_bank['name'], ','.join(_bank['properties']['type']),
+ str(release), _bank['properties']['visibility'], last_update]
+ return info
+
+ def update_dependencies(self):
+ """
+ Update bank dependencies
+
+ :return: status of updates
+ """
+ self.depends = []
+ if self.run_depends:
+ depends = self.get_dependencies()
+ else:
+ depends = []
+
+ self.session.set('depends', {})
+ res = True
+ for dep in depends:
+ self.session._session['depends'][dep] = False
+ for dep in depends:
+ if self.session._session['depends'][dep]:
+ logging.debug('Update:Depends:' + dep + ':SKIP')
+ # Bank has been marked as depends multiple times, run only once
+ continue
+ logging.info('Update:Depends:' + dep)
+ b = Bank(dep)
+ res = b.update()
+ self.depends.append(b)
+ self.session._session['depends'][dep] = res
+ logging.info('Update:Depends:' + dep + ':' + str(res))
+ if not res:
+ break
+ return res
+
+ def get_bank(self, bank, no_log=False):
+ """
+    Gets another bank
+ """
+ return Bank(bank, no_log=no_log)
+
+ def get_dependencies(self, bank=None):
+ """
+ Search all bank dependencies
+
+ :return: list of bank names to update
+ """
+ if bank is None:
+ deps = self.config.get('depends')
+ else:
+ deps = bank.config.get('depends')
+ if deps is None:
+ return []
+    # Main deps
+ deps = deps.split(',')
+ # Now search in deps if they themselves depend on other banks
+ for dep in deps:
+      b = Bank(dep, no_log=True)
+ deps = b.get_dependencies() + deps
+ return deps
+
+ def is_owner(self):
+ """
+ Checks if current user is owner or admin
+ """
+ owner = getpass.getuser()
+ admin_config = self.config.get('admin')
+ admin = []
+ if admin_config is not None:
+ admin = [x.strip() for x in admin_config.split(',')]
+ if admin and owner in admin:
+ return True
+ if owner == self.bank['properties']['owner']:
+ return True
+ return False
+
+ def set_owner(self, owner):
+ """
+ Update bank owner, only if current owner
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.banks.update({'name': self.name}, {'$set': {'properties.owner': owner}})
+
+ def set_visibility(self, visibility):
+ """
+ Update bank visibility, only if current owner
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.banks.update({'name': self.name}, {'$set': {'properties.visibility': visibility}})
+
+ def get_properties(self):
+ """
+ Read bank properties from config file
+
+ :return: properties dict
+ """
+ owner = getpass.getuser()
+ # owner = os.environ['LOGNAME']
+ # If owner not set, use current user, else keep current
+ if self.bank and 'properties' in self.bank and 'owner' in self.bank['properties']:
+ owner = self.bank['properties']['owner']
+
+ props = {
+ 'visibility': self.config.get('visibility.default'),
+ 'type': self.config.get('db.type').split(','),
+ 'tags': [],
+ 'owner': owner
+ }
+
+ return props
+
+ @staticmethod
+ def searchindex(query):
+ return BmajIndex.searchq(query)
+
+ @staticmethod
+ def search(formats=None, types=None, with_sessions=True):
+ """
+ Search all bank releases matching some formats and types
+
+ Matches production release with at least one of formats and one of types
+ """
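+    # Illustrative call (format/type values are examples only):
+    #   Bank.search(formats=['fasta'], types=['nucleic'], with_sessions=False)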
+ if formats is None:
+ formats = []
+
+ if types is None:
+ types = []
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+ searchfilter = {}
+ if formats:
+ searchfilter['production.formats'] = {'$in': formats}
+ if with_sessions:
+ res = MongoConnector.banks.find(searchfilter)
+ else:
+ res = MongoConnector.banks.find(searchfilter, {'sessions': 0})
+ # Now search in which production release formats and types apply
+ search_list = []
+ for r in res:
+ prod_to_delete = []
+ for p in r['production']:
+ is_format = False
+ if not formats:
+ is_format = True
+ # Are formats present in this production release?
+ for f in formats:
+ if f in p['formats']:
+ is_format = True
+ break
+ # Are types present in this production release?
+ is_type = False
+ if not types:
+ is_type = True
+ if is_format:
+ for t in types:
+ if t in p['types'] or t in r['properties']['type']:
+ is_type = True
+ break
+ if not is_type or not is_format:
+ prod_to_delete.append(p)
+ for prod_del in prod_to_delete:
+ r['production'].remove(prod_del)
+ if len(r['production']) > 0:
+ search_list.append(r)
+ return search_list
+
+ @staticmethod
+ def list(with_sessions=False):
+ """
+ Return a list of banks
+
+ :param with_sessions: should sessions be returned or not (can be quite big)
+ :type with_sessions: bool
+ :return: list of :class:`biomaj.bank.Bank`
+ """
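+    # Illustrative call: Bank.list() returns the bank documents without their (potentially large) session lists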
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ bank_list = []
+ if with_sessions:
+ res = MongoConnector.banks.find({})
+ else:
+ res = MongoConnector.banks.find({}, {'sessions': 0})
+ for r in res:
+ bank_list.append(r)
+ return bank_list
+
+ def controls(self):
+ """
+    Initial controls (create directories, etc.)
+ """
+ data_dir = self.config.get('data.dir')
+ bank_dir = self.config.get('dir.version')
+ bank_dir = os.path.join(data_dir, bank_dir)
+ if not os.path.exists(bank_dir):
+ os.makedirs(bank_dir)
+
+ offline_dir = self.config.get('offline.dir.name')
+ offline_dir = os.path.join(data_dir, offline_dir)
+ if not os.path.exists(offline_dir):
+ os.makedirs(offline_dir)
+
+ log_dir = self.config.get('log.dir')
+ log_dir = os.path.join(log_dir, self.name)
+ if not os.path.exists(log_dir):
+ os.makedirs(log_dir)
+
+ def _delete(self):
+ """
+ Delete bank from database, not files
+ """
+ self.banks.remove({'_id': self.bank['_id']})
+
+ def save_session(self):
+ """
+ Save session in database
+ """
+ self.session._session['last_update_time'] = time.time()
+ self.session._session['log_file'] = self.config.log_file
+ if self.use_last_session:
+ # Remove last session
+ self.banks.update({'name': self.name}, {'$pull': {'sessions': {'id': self.session._session['id']}}})
+ # Insert session
+ if self.session.get('action') == 'update':
+ action = 'last_update_session'
+ if self.session.get('action') == 'remove':
+ action = 'last_remove_session'
+
+
+ cache_dir = self.config.get('cache.dir')
+ download_files = self.session.get('download_files')
+ if download_files is not None:
+ f_downloaded_files = open(os.path.join(cache_dir, 'files_'+str(self.session.get('id'))), 'w')
+ f_downloaded_files.write(json.dumps(download_files))
+ f_downloaded_files.close()
+ self.session.set('download_files',[])
+
+ local_files = self.session.get('files')
+ if local_files is not None:
+ f_local_files = open(os.path.join(cache_dir, 'local_files_'+str(self.session.get('id'))), 'w')
+      f_local_files.write(json.dumps(local_files))
+ f_local_files.close()
+ self.session.set('files',[])
+
+
+ self.banks.update({'name': self.name}, {
+ '$set': {
+ action: self.session._session['id'],
+ 'properties': self.get_properties()
+ },
+ '$push': {'sessions': self.session._session}
+ })
+ BmajIndex.add(self.name, self.session._session)
+ if self.session.get('action') == 'update' and not self.session.get_status(Workflow.FLOW_OVER)\
+ and self.session.get('release'):
+ release = self.session.get('release')
+ found = self.banks.find_one({'name': self.name, 'pending.release': release})
+ if found is None:
+ self.banks.update({'name': self.name},
+ {'$push': {'pending': {'release': self.session.get('release'),
+ 'id': self.session._session['id']}}})
+
+ if self.session.get('action') == 'update' and self.session.get_status(Workflow.FLOW_OVER) and self.session.get(
+ 'update'):
+ # We expect that a production release has reached the FLOW_OVER status.
+      # If no update is needed (same release, etc.), the *update* flag of the session is set to False
+ logging.debug('Bank:Save:' + self.name)
+ if len(self.bank['production']) > 0:
+ # Remove from database
+ self.banks.update({'name': self.name},
+ {'$pull': {'production': {'release': self.session._session['release']}}})
+ # Update local object
+ # index = 0
+ # for prod in self.bank['production']:
+ # if prod['release'] == self.session._session['release']:
+ # break;
+ # index += 1
+ # if index < len(self.bank['production']):
+ # self.bank['production'].pop(index)
+ release_types = []
+ if self.config.get('db.type'):
+ release_types = self.config.get('db.type').split(',')
+ release_formats = list(self.session._session['formats'].keys())
+ if self.config.get('db.formats'):
+ config_formats = self.config.get('db.formats').split(',')
+ for config_format in config_formats:
+ if config_format not in release_formats:
+ release_formats.append(config_format)
+
+ for release_format in self.session._session['formats']:
+ for release_files in self.session._session['formats'][release_format]:
+ if release_files['types']:
+ for rtype in release_files['types']:
+ if rtype not in release_types:
+ release_types.append(rtype)
+ prod_dir = self.session.get_release_directory()
+ if self.session.get('prod_dir'):
+ prod_dir = self.session.get('prod_dir')
+ production = {'release': self.session.get('release'),
+ 'remoterelease': self.session.get('remoterelease'),
+ 'session': self.session._session['id'],
+ 'formats': release_formats,
+ 'types': release_types,
+ 'size': self.session.get('fullsize'),
+ 'data_dir': self.session._session['data_dir'],
+ 'dir_version': self.session._session['dir_version'],
+ 'prod_dir': prod_dir,
+ 'freeze': False}
+ self.bank['production'].append(production)
+ self.banks.update({'name': self.name},
+ {'$push': {'production': production},
+ '$pull': {'pending': {'release': self.session.get('release'),
+ 'id': self.session._session['id']}}
+ })
+
+
+ self.bank = self.banks.find_one({'name': self.name})
+
+ def clean_old_sessions(self):
+ """
+ Delete old sessions, not latest ones nor related to production sessions
+ """
+ if self.session is None:
+ return
+ # No previous session
+ if 'sessions' not in self.bank:
+ return
+ if self.config.get_bool('keep.old.sessions'):
+ logging.debug('keep old sessions, skipping...')
+ return
+ # 'last_update_session' in self.bank and self.bank['last_update_session']
+ old_sessions = []
+ prod_releases = []
+ for session in self.bank['sessions']:
+ if session['id'] == self.session.get('id'):
+ # Current session
+ prod_releases.append(session['release'])
+ continue
+ if session['id'] == self.session.get('last_update_session'):
+ prod_releases.append(session['release'])
+ continue
+ if session['id'] == self.session.get('last_remove_session'):
+ continue
+ is_prod_session = False
+ for prod in self.bank['production']:
+ if session['id'] == prod['session']:
+ is_prod_session = True
+ break
+ if is_prod_session:
+ prod_releases.append(session['release'])
+ continue
+ old_sessions.append(session)
+ if len(old_sessions) > 0:
+ for session in old_sessions:
+ session_id = session['id']
+ self.banks.update({'name': self.name}, {'$pull': {'sessions': {'id': session_id}}})
+ # Check if in pending sessions
+ for rel in self.bank['pending']:
+ rel_session = rel['id']
+ if rel_session == session_id:
+ self.banks.update({'name': self.name},
+ {'$pull': {'pending': {'release': session['release'], 'id': session_id}}})
+ if session['release'] not in prod_releases and session['release'] != self.session.get('release'):
+ # There might be unfinished releases linked to session, delete them
+ # if they are not related to a production directory or latest run
+ session_dir = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ self.name + self.config.get('release.separator', default='_') + str(session['release']))
+ if os.path.exists(session_dir):
+ logging.info('Bank:DeleteOldSessionDir:' + self.name + self.config.get('release.separator', default='_') + str(session['release']))
+ shutil.rmtree(session_dir)
+ self.bank = self.banks.find_one({'name': self.name})
+
+ def publish(self):
+ """
+ Set session release to *current*
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ current_link = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'current')
+ prod_dir = self.session.get_full_release_directory()
+
+ to_dir = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'))
+
+ if os.path.lexists(current_link):
+ os.remove(current_link)
+ os.chdir(to_dir)
+ os.symlink(self.session.get_release_directory(), 'current')
+ self.bank['current'] = self.session._session['id']
+ self.banks.update({'name': self.name},
+ {
+ '$set': {'current': self.session._session['id']}
+ })
+
+ def unpublish(self):
+ """
+ Unset *current*
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ current_link = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'current')
+
+ if os.path.lexists(current_link):
+ os.remove(current_link)
+ self.banks.update({'name': self.name},
+ {
+ '$set': {'current': None}
+ })
+
+ def get_production(self, release):
+ """
+ Get production field for release
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: production field
+ """
+ release = str(release)
+ production = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ production = prod
+ return production
+
+ def freeze(self, release):
+ """
+ Freeze a production release
+
+    When frozen, a production release cannot be removed (manually or automatically)
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ rel = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ rel = prod['release']
+    if rel is None:
+      logging.error('Release not found: ' + release)
+      return False
+ self.banks.update({'name': self.name, 'production.release': rel}, {'$set': {'production.$.freeze': True}})
+ self.bank = self.banks.find_one({'name': self.name})
+ return True
+
+ def unfreeze(self, release):
+ """
+ Unfreeze a production release to allow removal
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ rel = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ rel = prod['release']
+    if rel is None:
+      logging.error('Release not found: ' + release)
+      return False
+ self.banks.update({'name': self.name, 'production.release': rel}, {'$set': {'production.$.freeze': False}})
+ self.bank = self.banks.find_one({'name': self.name})
+ return True
+
+ def get_new_session(self, flow=None):
+ """
+ Returns an empty session
+
+ :param flow: kind of workflow
+ :type flow: :func:`biomaj.workflow.Workflow.FLOW`
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+ return Session(self.name, self.config, flow)
+
+ def get_session_from_release(self, release):
+ """
+ Loads the session matching a specific release
+
+    :param release: release name or production dir
+ :type release: str
+ :return: :class:`biomaj.session.Session`
+ """
+ release = str(release)
+ oldsession = None
+ # Search production release matching release
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ for s in self.bank['sessions']:
+ if s['id'] == prod['session']:
+ oldsession = s
+ break
+ break
+ if oldsession is None:
+ # No prod session, try to find a session for this release, session may have failed or be stopped
+ for s in self.bank['sessions']:
+ if s['release'] and release.endswith(s['release']):
+ oldsession = s
+ if oldsession is None:
+ logging.error('No production session could be found for this release')
+ return oldsession
+
+ def load_session(self, flow=None, session=None):
+ """
+ Loads last session or, if over or forced, a new session
+
+ Creates a new session or load last session if not over
+
+ :param flow: kind of workflow
+ :type flow: :func:`biomaj.workflow.Workflow.FLOW`
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+
+ if session is not None:
+ logging.debug('Load specified session ' + str(session['id']))
+ self.session = Session(self.name, self.config, flow)
+ self.session.load(session)
+ self.use_last_session = True
+ return
+ if len(self.bank['sessions']) == 0 or self.options.get_option(Options.FROMSCRATCH):
+ self.session = Session(self.name, self.config, flow)
+ logging.debug('Start new session')
+ else:
+ # Take last session
+ self.session = Session(self.name, self.config, flow)
+ session_id = None
+ # Load previous session for updates only
+ if self.session.get('action') == 'update' and 'last_update_session' in self.bank and self.bank[
+ 'last_update_session']:
+ session_id = self.bank['last_update_session']
+ load_session = None
+ for session in self.bank['sessions']:
+ if session['id'] == session_id:
+ load_session = session
+ break
+ if load_session is not None:
+ # self.session.load(self.bank['sessions'][len(self.bank['sessions'])-1])
+ self.session.load(session)
+ # if self.config.last_modified > self.session.get('last_modified'):
+ # # Config has changed, need to restart
+ # self.session = Session(self.name, self.config, flow)
+ # logging.info('Configuration file has been modified since last session, restart in any case a new session')
+ if self.session.get_status(Workflow.FLOW_OVER) and self.options.get_option(
+ Options.FROM_TASK) is None:
+ previous_release = self.session.get('remoterelease')
+ self.session = Session(self.name, self.config, flow)
+ self.session.set('previous_release', previous_release)
+ logging.debug('Start new session')
+ else:
+ logging.debug('Load previous session ' + str(self.session.get('id')))
+ self.use_last_session = True
+
+ def remove_session(self, sid):
+ """
+ Delete a session from db
+
+ :param sid: id of the session
+ :type sid: long
+ :return: bool
+ """
+ session_release = None
+ _tmpbank = self.banks.find_one({'name': self.name})
+ for s in _tmpbank['sessions']:
+ if s['id'] == sid:
+ session_release = s['release']
+
+ cache_dir = self.config.get('cache.dir')
+ download_files = os.path.join(cache_dir, 'files_'+str(sid))
+ if os.path.exists(download_files):
+ os.remove(download_files)
+
+ local_files = os.path.join(cache_dir, 'local_files_'+str(sid))
+ if os.path.exists(local_files):
+ os.remove(local_files)
+
+ if self.config.get_bool('keep.old.sessions'):
+ logging.debug('keep old sessions')
+ if session_release is not None:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'production': {'session': sid}
+ },
+ '$pull': {
+ 'pending': {'release': session_release,
+ 'id': sid}
+ }
+ })
+ else:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'production': {'session': sid}
+ }
+ })
+ self.banks.update({'name': self.name, 'sessions.id': sid},
+ {'$set': {'sessions.$.deleted': time.time()}})
+ else:
+ if session_release is not None:
+ result = self.banks.update({'name': self.name}, {'$pull': {
+ 'sessions': {'id': sid},
+ 'production': {'session': sid},
+ 'pending': {'release': session_release,
+ 'id': sid}
+ }
+ })
+ else:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'sessions': {'id': sid},
+ 'production': {'session': sid}
+ }
+ })
+ # Update object
+ self.bank = self.banks.find_one({'name': self.name})
+ if session_release is not None:
+ BmajIndex.remove(self.name, session_release)
+ return True
+
+ def get_data_dir(self):
+ """
+ Returns bank data directory
+
+ :return: str
+ """
+ return os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'))
+
+ def removeAll(self, force=False):
+ """
+ Remove all bank releases and database records
+
+    :param force: force removal even if some production dirs are frozen
+ :type force: bool
+ :return: bool
+ """
+ if not force:
+ has_freeze = False
+ for prod in self.bank['production']:
+ if 'freeze' in prod and prod['freeze']:
+ has_freeze = True
+ break
+ if has_freeze:
+        logging.error('Cannot remove bank, some production directories are frozen, use force if needed')
+ return False
+
+ self.banks.remove({'name': self.name})
+ BmajIndex.delete_all_bank(self.name)
+ bank_data_dir = self.get_data_dir()
+ logging.warn('DELETE ' + bank_data_dir)
+ if os.path.exists(bank_data_dir):
+ shutil.rmtree(bank_data_dir)
+ bank_offline_dir = os.path.join(self.config.get('data.dir'), self.config.get('offline.dir.name'))
+ if os.path.exists(bank_offline_dir):
+ shutil.rmtree(bank_offline_dir)
+ bank_log_dir = os.path.join(self.config.get('log.dir'), self.name)
+ if os.path.exists(bank_log_dir) and self.no_log:
+ shutil.rmtree(bank_log_dir)
+ return True
+
+ def get_status(self):
+ """
+ Get status of current workflow
+
+ :return: dict of current workflow status
+ """
+ if self.bank['status'] is None:
+ return {}
+ return self.bank['status']
+
+ def remove_pending(self, release=None):
+ """
+    Remove a pending release, or all pending releases if release is None
+
+ :param release: release or release directory, default None
+ :type release: str
+ :return: bool
+ """
+ if release:
+ release = str(release)
+ logging.warning('Bank:' + self.name + ':RemovePending')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ if 'pending' not in self.bank:
+ return True
+ pendings = self.bank['pending']
+
+ for pending in pendings:
+ # Only work with pending for argument release
+ if release and release != pending['release']:
+ continue
+ pending_session_id = pending['id']
+ pending_session = None
+ for s in self.bank['sessions']:
+ if s['id'] == pending_session_id:
+ pending_session = s
+ break
+ session = Session(self.name, self.config, RemoveWorkflow.FLOW)
+ if pending_session is None:
+ session._session['release'] = pending['release']
+ else:
+ session.load(pending_session)
+ if os.path.exists(session.get_full_release_directory()):
+ logging.debug("Remove:Pending:Dir:" + session.get_full_release_directory())
+ shutil.rmtree(session.get_full_release_directory())
+ self.remove_session(pending['id'])
+    # If no specific release was requested for deletion, clear all pending entries
+ if not release:
+ self.banks.update({'name': self.name}, {'$set': {'pending': []}})
+ return True
+
+ def remove(self, release):
+ """
+ Remove a release (db and files)
+
+ :param release: release or release directory
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ logging.warning('Bank:' + self.name + ':Remove')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.session = self.get_new_session(RemoveWorkflow.FLOW)
+ oldsession = None
+ # Search production release matching release
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ if 'freeze' in prod and prod['freeze']:
+ logging.error('Cannot remove release, release is freezed, unfreeze it first')
+ return False
+ # Search session related to this production release
+ for s in self.bank['sessions']:
+ if s['id'] == prod['session']:
+ oldsession = s
+ break
+ break
+ if oldsession is None:
+ logging.error('No production session could be found for this release')
+ return False
+ if 'current' in self.bank and self.bank['current'] == oldsession['id']:
+      logging.error('This release is the currently published release, you should first unpublish it')
+ return False
+
+ # New empty session for removal
+ session = Session(self.name, self.config, RemoveWorkflow.FLOW)
+ session.set('action', 'remove')
+ session.set('release', oldsession['release'])
+ session.set('update_session_id', oldsession['id'])
+ self.session = session
+ # Reset status, we take an update session
+ res = self.start_remove(session)
+ self.session.set('workflow_status', res)
+
+ self.save_session()
+
+ return res
+
+ def update(self, depends=False):
+ """
+ Launch a bank update
+
+ :param depends: run update of bank dependencies first
+ :type depends: bool
+ :return: bool
+ """
+ logging.warning('Bank:' + self.name + ':Update')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.run_depends = depends
+
+ self.controls()
+ if self.options.get_option('release'):
+ logging.info('Bank:' + self.name + ':Release:' + self.options.get_option('release'))
+ s = self.get_session_from_release(self.options.get_option('release'))
+ # No session in prod
+ if s is None:
+        logging.error('Release does not exist: ' + self.options.get_option('release'))
+ return False
+ self.load_session(UpdateWorkflow.FLOW, s)
+ else:
+ logging.info('Bank:' + self.name + ':Release:latest')
+ self.load_session(UpdateWorkflow.FLOW)
+ # if from task, reset workflow status in session.
+ if self.options.get_option('from_task'):
+ set_to_false = False
+ for task in self.session.flow:
+ # If task was in False status (KO) and we ask to start after this task, exit
+ if not set_to_false and not self.session.get_status(task['name']) and task[
+ 'name'] != self.options.get_option('from_task'):
+ logging.error(
+ 'Previous task ' + task['name'] + ' was not successful, cannot restart after this task')
+ return False
+ if task['name'] == self.options.get_option('from_task'):
+ set_to_false = True
+ if set_to_false:
+ # After from_task task, tasks must be set to False to be run
+ self.session.set_status(task['name'], False)
+ proc = None
+ if task['name'] in [Workflow.FLOW_POSTPROCESS, Workflow.FLOW_PREPROCESS,
+ Workflow.FLOW_REMOVEPROCESS]:
+ proc = self.options.get_option('process')
+ self.session.reset_proc(task['name'], proc)
+ # if task['name'] == Workflow.FLOW_POSTPROCESS:
+ # self.session.reset_proc(Workflow.FLOW_POSTPROCESS, proc)
+ # elif task['name'] == Workflow.FLOW_PREPROCESS:
+ # self.session.reset_proc(Workflow.FLOW_PREPROCESS, proc)
+ # elif task['name'] == Workflow.FLOW_REMOVEPROCESS:
+ # self.session.reset_proc(Workflow.FLOW_REMOVEPROCESS, proc)
+ self.session.set('action', 'update')
+ res = self.start_update()
+ self.session.set('workflow_status', res)
+ self.save_session()
+ return res
+
+ def start_remove(self, session):
+ """
+ Start a removal workflow
+
+ :param session: Session to remove
+ :type session: :class:`biomaj.session.Session`
+ :return: bool
+ """
+ workflow = RemoveWorkflow(self, session)
+ return workflow.start()
+
+ def start_update(self):
+ """
+ Start an update workflow
+ """
+ workflow = UpdateWorkflow(self)
+ return workflow.start()
diff --git a/biomaj/bmajindex.py b/biomaj/bmajindex.py
new file mode 100644
index 0000000..1f658ad
--- /dev/null
+++ b/biomaj/bmajindex.py
@@ -0,0 +1,231 @@
+from builtins import str
+from builtins import object
+import logging
+import copy
+from elasticsearch import Elasticsearch
+
+class BmajIndex(object):
+ """
+ ElasticSearch indexation and search
+ """
+
+
+ """
+ ElasticSearch server
+ """
+ es = None
+
+ """
+ Index name
+ """
+ index = 'biomaj'
+
+ """
+ Do indexing
+ """
+ do_index = False
+
+ """
+ Skip if failure (tests)
+ """
+ skip_if_failure = False
+
+ @staticmethod
+ def load(hosts=None, index='biomaj', do_index=True):
+ """
+ Initialize index
+
+ :param hosts: List of elastic search nodes to connect to
+ :type hosts: list
+ :param do_index: index data or not
+ :type do_index: bool
+ """
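+    # Illustrative call (host list mirrors the default value):
+    #   BmajIndex.load(hosts=['localhost'], index='biomaj', do_index=True)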
+ if hosts is None:
+ hosts = ['localhost']
+ if not do_index:
+ return
+ BmajIndex.index = index
+ BmajIndex.do_index = do_index
+ if BmajIndex.es is None:
+ BmajIndex.es = Elasticsearch(hosts)
+
+ mapping = {
+ "mappings": {
+ "production": {
+ "date_detection": False
+ },
+ "releasestats": {
+ "date_detection": False,
+ "_timestamp" : {
+ "enabled" : True,
+ "store" : True
+ }
+ }
+ }
+ }
+ try:
+ if not BmajIndex.es.indices.exists(index=BmajIndex.index):
+ BmajIndex.es.indices.create(index=BmajIndex.index, body=mapping)
+ except Exception as e:
+ logging.error('ElasticSearch connection error, check server is running and configuration')
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
+ else:
+ raise e
+
+ @staticmethod
+ def _bulk_delete(query, flush=True):
+ try:
+ page = BmajIndex.es.search(index=BmajIndex.index,
+ doc_type='production',
+ search_type = "query_then_fetch",
+ size=1000,
+ body= {'query': {'match': {'bank': query['bank']}}})
+
+ if page is None:
+ return
+ bulk_delete = ''
+ for del_hit in page['hits']['hits']:
+ if ('release' in query and query['release'] == del_hit['_source']['release']) or 'release' not in query:
+ bulk_delete += "{ \"delete\" : {\"_index\":\""+BmajIndex.index+"\",\"_type\":\"production\", \"_id\" : \""+del_hit['_id']+"\" } }\n"
+ if bulk_delete:
+ BmajIndex.es.bulk(body=bulk_delete)
+ if flush:
+ BmajIndex.es.indices.flush(index=BmajIndex.index, force=True)
+ except Exception as e:
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
+ else:
+ raise e
+
+
+ @staticmethod
+ def delete_all_bank(bank_name):
+ """
+ Delete complete index for a bank
+ """
+ if not BmajIndex.do_index:
+ return
+ BmajIndex._bulk_delete({"bank" : bank_name}, True)
+ """
+ query = {
+ "query" : {
+ "term" : {"bank" : bank_name}
+ }
+ }
+ try:
+ BmajIndex.es.delete_by_query(index=BmajIndex.index, body=query)
+ except Exception as e:
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
+ else:
+ raise e
+ """
+
+ @staticmethod
+ def remove(bank_name, release):
+ """
+ Remove a production release
+
+ :param bank_name: Name of the bank
+ :type bank_name: str
+ :param release: production release
+ :type release: str
+ """
+ if not BmajIndex.do_index:
+ return
+ BmajIndex._bulk_delete({"release" : release, "bank": bank_name})
+ """
+ try:
+ query = {
+ "query" : {
+ "term" : {"release" : release, "bank": bank_name}
+ }
+ }
+ BmajIndex.es.delete_by_query(index=BmajIndex.index, body=query)
+ except Exception as e:
+ logging.error('Index:Remove:'+bank_name+'_'+str(release)+':Exception:'+str(e))
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
+ """
+
+ @staticmethod
+ def search(query):
+ if not BmajIndex.do_index:
+ return None
+ res = BmajIndex.es.search(index=BmajIndex.index,
+ doc_type='production',
+ search_type = "query_then_fetch",
+ body=query)
+ return res['hits']['hits']
+
+ @staticmethod
+ def searchq(query, size=1000):
+ """
+ Lucene syntax search
+
+ :param query: Lucene search string
+ :type query: str
+ :param size: number of results
+ :type size: int
+ :return: list of matches
+ """
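+    # Illustrative call (the query string is an example of Lucene syntax):
+    #   BmajIndex.searchq('bank:alu AND release:1.0', size=10)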
+ if not BmajIndex.do_index:
+ return None
+ res = BmajIndex.es.search(index=BmajIndex.index, doc_type='production', q=query, size=size)
+ return res['hits']['hits']
+
+ @staticmethod
+ def add_stat(stat_id, stat):
+ """
+ Add some statistics, must contain release and bank properties.
+ """
+ if not BmajIndex.do_index:
+ return
+ if stat['release'] is None or stat['bank'] is None:
+ return False
+ #stat['bank'] = bank_name
+ try:
+ BmajIndex.es.index(index=BmajIndex.index, doc_type='releasestats', id=stat_id, body=stat)
+ except Exception:
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
+ else:
+ return False
+ return True
+
+
+ @staticmethod
+ def add(bank_name, prod, flush=False):
+ """
+ Index a production release
+
+ :param bank_name: Name of the bank
+ :type bank_name: str
+ :param prod: session release object
+ :type prod: dict
+ :param flush: Force flushing
+ :type flush: bool
+ """
+ if not BmajIndex.do_index:
+ return
+ obj = copy.deepcopy(prod)
+ if obj['release'] is None:
+ return
+ obj['bank'] = bank_name
+ formats = obj['formats']
+ try:
+ for fkey, fvalue in formats.items():
+ for elt in fvalue:
+ elt['format'] = fkey
+ elt['bank'] = bank_name
+ elt['release'] = obj['release']
+ if 'status' in obj:
+ elt['status'] = obj['status']
+ res = BmajIndex.es.index(index=BmajIndex.index, doc_type='production', body=elt)
+ if flush:
+ BmajIndex.es.indices.flush(index=BmajIndex.index, force=True)
+ except Exception as e:
+ logging.error('Index:Add:'+bank_name+'_'+str(obj['release'])+':Exception:'+str(e))
+ if BmajIndex.skip_if_failure:
+ BmajIndex.do_index = False
diff --git a/biomaj/config.py b/biomaj/config.py
new file mode 100644
index 0000000..3fd1ba2
--- /dev/null
+++ b/biomaj/config.py
@@ -0,0 +1,409 @@
+from __future__ import print_function
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+from builtins import object
+import logging
+import logging.config
+import os
+
+import time
+import sys
+
+from biomaj.bmajindex import BmajIndex
+
+if sys.version < '3':
+ import ConfigParser as configparser
+else:
+ import configparser
+
+class BiomajConfig(object):
+ """
+ Manage Biomaj configuration
+ """
+
+ DEFAULTS = {
+ 'http.parse.dir.line': r'<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
+ 'http.parse.file.line': r'<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
+ 'http.group.dir.name': 1,
+ 'http.group.dir.date': 2,
+ 'http.group.file.name': 1,
+ 'http.group.file.date': 2,
+ 'http.group.file.size': 3,
+ 'visibility.default': 'public',
+ 'historic.logfile.level': 'INFO',
+ 'bank.num.threads': 2,
+ 'files.num.threads': 4,
+ 'use_elastic': 0,
+ 'use_drmaa': 0,
+ 'db.type': '',
+ 'db.formats': '',
+ 'keep.old.version': 1,
+ 'docker.sudo': '1',
+ 'auto_publish': 0
+ }
+
+ # Old biomaj level compatibility
+ LOGLEVEL = {
+ 'DEBUG': logging.DEBUG,
+ 'VERBOSE': logging.INFO,
+ 'INFO': logging.INFO,
+ 'WARN': logging.WARNING,
+ 'ERR': logging.ERROR
+ }
+
+ """
+ Global configuration file
+ """
+ global_config = None
+
+ """
+  Per-user configuration file, overriding global_config
+ """
+ user_config = None
+
+ @staticmethod
+ def load_config(config_file=None, allow_user_config=True):
+ """
+ Loads general config
+
+ :param config_file: global.properties file path
+ :type config_file: str
+ :param allow_user_config: use ~/.biomaj.cfg if present
+ :type allow_user_config: bool
+ """
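+    # Illustrative calls (the file path is an example):
+    #   BiomajConfig.load_config('/etc/biomaj/global.properties')
+    #   BiomajConfig.load_config()  # falls back to $BIOMAJ_CONF, then ./global.properties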
+ if config_file is None:
+ env_file = os.environ.get('BIOMAJ_CONF')
+ if env_file is not None and os.path.exists(env_file):
+ config_file = env_file
+ else:
+ env_file = 'global.properties'
+ if os.path.exists(env_file):
+ config_file = env_file
+
+ if config_file is None or not os.path.exists(config_file):
+ raise Exception('Missing global configuration file')
+
+ BiomajConfig.config_file = os.path.abspath(config_file)
+
+ BiomajConfig.global_config = configparser.ConfigParser()
+
+ if allow_user_config and os.path.exists(os.path.expanduser('~/.biomaj.cfg')):
+ BiomajConfig.user_config_file = os.path.expanduser('~/.biomaj.cfg')
+ BiomajConfig.user_config = configparser.ConfigParser()
+ BiomajConfig.user_config.read([os.path.expanduser('~/.biomaj.cfg')])
+ else:
+ BiomajConfig.user_config_file = None
+
+ BiomajConfig.global_config.read([config_file])
+
+ # ElasticSearch indexation support
+ do_index = False
+ if BiomajConfig.global_config.get('GENERAL', 'use_elastic') and \
+ BiomajConfig.global_config.get('GENERAL', 'use_elastic') == "1":
+ do_index = True
+ if do_index:
+ if BiomajConfig.global_config.get('GENERAL', 'elastic_nodes'):
+ elastic_hosts = BiomajConfig.global_config.get('GENERAL', 'elastic_nodes').split(',')
+ else:
+ elastic_hosts = ['localhost']
+ elastic_index = BiomajConfig.global_config.get('GENERAL', 'elastic_index')
+ if elastic_index is None:
+ elastic_index = 'biomaj'
+
+ if BiomajConfig.global_config.has_option('GENERAL', 'test') and \
+ BiomajConfig.global_config.get('GENERAL', 'test') == "1":
+ # Test connection to elasticsearch, if not available skip indexing for tests
+ BmajIndex.skip_if_failure = True
+
+
+ BmajIndex.load(index=elastic_index, hosts=elastic_hosts,
+ do_index=do_index)
+
+
+
+
+ def __init__(self, bank, options=None):
+ """
+ Loads bank configuration
+
+ :param bank: bank name
+ :type bank: str
+ :param options: bank options
+ :type options: argparse
+ """
+ self.name = bank
+ if BiomajConfig.global_config is None:
+ BiomajConfig.load_config()
+ self.config_bank = configparser.ConfigParser()
+ conf_dir = BiomajConfig.global_config.get('GENERAL', 'conf.dir')
+ if not os.path.exists(os.path.join(conf_dir, bank+'.properties')):
+      logging.error('Bank configuration file does not exist')
+      raise Exception('Configuration file '+bank+'.properties does not exist')
+ try:
+ config_files = [BiomajConfig.config_file]
+ if BiomajConfig.user_config_file is not None:
+ config_files.append(BiomajConfig.user_config_file)
+ config_files.append(os.path.join(conf_dir, bank+'.properties'))
+ self.config_bank.read(config_files)
+ except Exception as e:
+ print("Configuration file error: "+str(e))
+ logging.error("Configuration file error "+str(e))
+ sys.exit(1)
+
+ self.last_modified = int(os.stat(os.path.join(conf_dir, bank+'.properties')).st_mtime)
+
+ if os.path.exists(os.path.expanduser('~/.biomaj.cfg')):
+ logging.config.fileConfig(os.path.expanduser('~/.biomaj.cfg'))
+ else:
+ logging.config.fileConfig(BiomajConfig.config_file)
+
+ do_log = False
+ if options is None:
+ do_log = True
+ elif hasattr(options, 'no_log') and not options.no_log:
+ do_log = True
+ elif type(options) is dict and 'no_log' in options and not options['no_log']:
+ do_log = True
+
+ #if options is None or (( hasattr(options,'no_log') and not options.no_log) or ('no_log' in options and not options['no_log'])):
+ if do_log:
+ logger = logging.getLogger()
+ bank_log_dir = os.path.join(self.get('log.dir'), bank, str(time.time()))
+ if not os.path.exists(bank_log_dir):
+ os.makedirs(bank_log_dir)
+ hdlr = logging.FileHandler(os.path.join(bank_log_dir, bank+'.log'))
+ self.log_file = os.path.join(bank_log_dir, bank+'.log')
+ if options is not None and options.get_option('log') is not None:
+ hdlr.setLevel(BiomajConfig.LOGLEVEL[options.get_option('log')])
+ else:
+ hdlr.setLevel(BiomajConfig.LOGLEVEL[self.get('historic.logfile.level')])
+ formatter = logging.Formatter('%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s')
+ hdlr.setFormatter(formatter)
+ logger.addHandler(hdlr)
+ else:
+ self.log_file = 'none'
+
+ cache_dir = self.get('cache.dir')
+ if cache_dir is None:
+ print("Configuration file error: cache.dir empty")
+ logging.error("cache.dir is not defined")
+ sys.exit(1)
+
+ if not os.path.exists(cache_dir):
+ os.makedirs(cache_dir)
+
+ process_dir = self.get('process.dir')
+ if process_dir is None:
+ print("Configuration file error: process.dir empty")
+ logging.error("process.dir is not defined")
+ sys.exit(1)
+
+ if not os.path.exists(process_dir):
+ os.makedirs(process_dir)
+
+ data_dir = self.get('data.dir')
+ if data_dir is None:
+ print("Configuration file error: data.dir empty")
+ logging.error("data.dir is not defined")
+ sys.exit(1)
+
+ if not os.path.exists(data_dir):
+ os.makedirs(data_dir)
+
+ lock_dir = self.get('lock.dir')
+ if lock_dir is None:
+ print("Configuration file error: lock.dir empty")
+ logging.error("lock.dir is not defined")
+ sys.exit(1)
+
+ if not os.path.exists(lock_dir):
+ os.makedirs(lock_dir)
+
+
+ def set(self, prop, value, section='GENERAL'):
+ self.config_bank.set(section, prop, value)
+
+ def get_bool(self, prop, section='GENERAL', escape=True, default=None):
+ """
+ Get a boolean property from the bank or general configuration, optionally from a specific section.
+ """
+ value = self.get(prop, section, escape, default)
+ if value is None:
+ return False
+ if value is True or value == 'true' or value == '1':
+ return True
+ else:
+ return False
+
+ def get(self, prop, section='GENERAL', escape=True, default=None):
+ """
+ Get a property from the bank or general configuration, optionally from a specific section.
+ """
+ # Compatibility fields
+ if prop == 'depends':
+ depend = self.get('db.source', section, escape, None)
+ if depend:
+ return depend
+
+ if self.config_bank.has_option(section, prop):
+ val = self.config_bank.get(section, prop)
+ if prop == 'remote.dir' and not val.endswith('/'):
+ val = val + '/'
+ # For regexp properties, unescape doubled backslashes
+ if escape and (prop == 'local.files' or prop == 'remote.files' or prop == 'http.parse.dir.line' or prop == 'http.parse.file.line'):
+ val = val.replace('\\\\', '\\')
+ return val
+
+ if BiomajConfig.user_config is not None:
+ if BiomajConfig.user_config.has_option(section, prop):
+ return BiomajConfig.user_config.get(section, prop)
+
+ if BiomajConfig.global_config.has_option(section, prop):
+ return BiomajConfig.global_config.get(section, prop)
+
+ if prop in BiomajConfig.DEFAULTS:
+ return BiomajConfig.DEFAULTS[prop]
+
+ return default
+
+
+ def get_time(self):
+ """
+ Return last modification time of config files
+ """
+ return self.last_modified
+
+
+
+ def check(self):
+ """
+ Check configuration
+ """
+ self.set('localrelease', '')
+ self.set('remoterelease', '')
+
+ status = True
+ if not self.get('data.dir'):
+ logging.error('data.dir is not set')
+ status = False
+ if not self.get('conf.dir'):
+ logging.error('conf.dir is not set')
+ status = False
+ if not self.get('log.dir'):
+ logging.error('log.dir is not set')
+ status = False
+ if not self.get('process.dir'):
+ logging.error('process.dir is not set')
+ status = False
+ if not self.get('lock.dir'):
+ logging.error('lock.dir is not set')
+ status = False
+ if not self.get('cache.dir'):
+ logging.error('cache.dir is not set')
+ status = False
+
+
+ if not self.get('db.fullname'):
+ logging.warn('db.fullname is not set')
+ if not self.get('db.formats'):
+ logging.warn('db.formats is not set')
+ if self.get('use_ldap'):
+ if not self.get('ldap.host') or not self.get('ldap.port') or not self.get('ldap.dn'):
+ logging.error('use_ldap set to 1 but missing configuration')
+ status = False
+ if self.get('use_elastic'):
+ if not self.get('elastic_nodes') or not self.get('elastic_index'):
+ logging.error('use_elastic set to 1 but missing configuration')
+ status = False
+
+ if not self.get('celery.queue') or not self.get('celery.broker'):
+ logging.warn('celery config is not set, that\'s fine if you do not use Celery for background tasks')
+
+ if not self.get('mail.smtp.host'):
+ logging.error('SMTP mail config not set, you will not be able to send emails')
+ status = False
+ if self.get('mail.smtp.host') and not self.get('mail.from'):
+ logging.error('Mail origin mail.from not set')
+ status = False
+
+ if not self.get('offline.dir.name'):
+ logging.error('offline.dir.name is not set')
+ status = False
+ elif self.get('offline.dir.name').startswith('/'):
+ logging.error('offline dir must be relative to data.dir and should not start with a /')
+ status = False
+ if not self.get('dir.version'):
+ logging.error('dir.version is not set')
+ status = False
+ elif self.get('dir.version').startswith('/'):
+ logging.error('dir.version must be relative to data.dir and should not start with a /')
+ status = False
+ if not self.get('protocol'):
+ logging.error('protocol is not set')
+ status = False
+ else:
+ protocol = self.get('protocol')
+ allowed_protocols = ['none', 'multi', 'local', 'ftp', 'sftp', 'http', 'https', 'directftp', 'directhttp', 'directhttps']
+ if protocol not in allowed_protocols:
+ logging.error('Protocol not supported: '+protocol)
+ status = False
+ if protocol not in ['multi','none']:
+ if protocol != 'local' and not self.get('server'):
+ logging.error('server not set')
+ status = False
+ if not self.get('remote.dir'):
+ logging.error('remote.dir not set')
+ status = False
+ elif not self.get('remote.dir').endswith('/'):
+ logging.error('remote.dir must end with a /')
+ return False
+ if protocol not in ['directftp', 'directhttp', 'directhttps'] and not self.get('remote.files') and not self.get('remote.list'):
+ logging.error('remote.files not set')
+ status = False
+ if not self.get('local.files'):
+ logging.error('local.files is not set')
+ status = False
+ # Check remove and pre processes
+ processes = ['db.remove.process', 'db.pre.process']
+ for process in processes:
+ if self.get(process):
+ metas = self.get(process).split(',')
+ for meta in metas:
+ if not self.get(meta):
+ logging.error('Metaprocess ' + meta + ' not defined')
+ status = False
+ else:
+ procs = self.get(meta).split(',')
+ for proc in procs:
+ if not self.get(proc+'.name'):
+ logging.error('Process '+proc+' not defined')
+ status = False
+ else:
+ if not self.get(proc+'.exe'):
+ logging.error('Process exe for '+proc+' not defined')
+ status = False
+ # Check blocks
+ if self.get('BLOCKS'):
+ blocks = self.get('BLOCKS').split(',')
+ for block in blocks:
+ if not self.get(block+'.db.post.process'):
+ logging.error('Block '+block+' not defined')
+ status = False
+ else:
+ metas = self.get(block+'.db.post.process').split(',')
+ for meta in metas:
+ if not self.get(meta):
+ logging.error('Metaprocess ' + meta + ' not defined')
+ status = False
+ else:
+ procs = self.get(meta).split(',')
+ for proc in procs:
+ if not self.get(proc+'.name'):
+ logging.error('Process '+proc+' not defined')
+ status = False
+ else:
+ if not self.get(proc+'.exe'):
+ logging.error('Process exe for '+proc+' not defined')
+ status = False
+ return status
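
A minimal usage sketch of the configuration cascade implemented above (illustrative only; the file paths and the 'alu' bank name are assumptions, and load_config is assumed to take the path to global.properties): a property is looked up in <bank>.properties first, then in the optional ~/.biomaj.cfg, then in global.properties, and finally in the built-in DEFAULTS.

from biomaj.config import BiomajConfig

# Load the global configuration once per process (path is an assumption)
BiomajConfig.load_config('/etc/biomaj/global.properties')

# Per-bank configuration, read from conf.dir/alu.properties
config = BiomajConfig('alu')
print(config.get('remote.files'))       # bank-specific property
print(config.get('data.dir'))           # falls back to the global configuration
print(config.get_bool('use_elastic'))   # '1' or 'true' -> True
if not config.check():
    print('configuration is incomplete')
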
diff --git a/biomaj/download/__init__.py b/biomaj/download/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj/download/direct.py b/biomaj/download/direct.py
new file mode 100644
index 0000000..dfe1cba
--- /dev/null
+++ b/biomaj/download/direct.py
@@ -0,0 +1,340 @@
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+import datetime
+import logging
+import pycurl
+import io
+import os
+import re
+import urllib.request, urllib.parse, urllib.error
+import hashlib
+
+from biomaj.download.interface import DownloadInterface
+from biomaj.download.ftp import FTPDownload
+from biomaj.utils import Utils
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+class MultiDownload(DownloadInterface):
+ '''
+ Base interface for a downloader using multiple downloaders
+ '''
+ def __init__(self):
+ DownloadInterface.__init__(self)
+ self.downloaders = []
+ self.files_to_download = []
+
+ def add_downloaders(self, downloaders):
+ '''
+ Adds a list of downloaders
+ '''
+ self.downloaders += downloaders
+ for d in downloaders:
+ self.files_to_download += d.files_to_download
+
+
+ def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
+ if dir_list is None:
+ dir_list = []
+ self.files_to_download = []
+ for d in self.downloaders:
+ d.match(patterns, d.files_to_download, [], prefix, submatch)
+ self.files_to_download = []
+ for d in self.downloaders:
+ self.files_to_download += d.files_to_download
+
+ def download(self, local_dir):
+ self.files_to_download = []
+ for d in self.downloaders:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+ d.download(local_dir)
+ self.files_to_download = []
+ for d in self.downloaders:
+ self.files_to_download += d.files_to_download
+ return (self.files_to_download, [])
+
+ def list(self):
+ self.files_to_download = []
+ for d in self.downloaders:
+ d.list()
+ self.files_to_download = []
+ for d in self.downloaders:
+ self.files_to_download += d.files_to_download
+ return (self.files_to_download, [])
+
+ def close(self):
+ for d in self.downloaders:
+ d.close()
+
+
+class DirectFTPDownload(FTPDownload):
+ '''
+ Download a list of files from FTP, without regexp matching
+ '''
+
+ def __init__(self, protocol, host, rootdir='', file_list=None):
+ '''
+
+ Initialize the files in the list with today as the last-modification date.
+ Size is preset to zero and will be set after download.
+
+ :param file_list: list of files to download on server
+ :type file_list: list
+ '''
+ FTPDownload.__init__(self, protocol, host, rootdir)
+ self.save_as = None
+ if file_list is None:
+ file_list = []
+ today = datetime.date.today()
+ self.files_to_download = []
+ self.headers = {}
+ for file in file_list:
+ rfile = {}
+ rfile['root'] = self.rootdir
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ rfile['size'] = 0
+ rfile['month'] = today.month
+ rfile['day'] = today.day
+ rfile['year'] = today.year
+ rfile['name'] = file
+ rfile['hash'] = None
+ self.files_to_download.append(rfile)
+
+ def list(self, directory=''):
+ '''
+ The FTP protocol does not let us retrieve the file date from the remote URL
+ '''
+ for rfile in self.files_to_download:
+ if self.save_as is None:
+ self.save_as = rfile['name']
+ rfile['save_as'] = self.save_as
+ return (self.files_to_download, [])
+
+ def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
+ '''
+ All files in the list are downloaded; no pattern matching is applied
+ '''
+ if dir_list is None:
+ dir_list = []
+ self.files_to_download = file_list
+
+
+
+class DirectHttpDownload(DirectFTPDownload):
+
+ def __init__(self, protocol, host, rootdir='', file_list=None):
+ '''
+ :param file_list: list of files to download on server
+ :type file_list: list
+ '''
+ if file_list is None:
+ file_list = []
+ DirectFTPDownload.__init__(self, protocol, host, rootdir, file_list)
+ self.save_as = None
+ self.method = 'GET'
+ self.param = {}
+
+ def download(self, local_dir, keep_dirs=True):
+ '''
+ Download remote files to local_dir
+
+ :param local_dir: Directory where files should be downloaded
+ :type local_dir: str
+ :param keep_dirs: keep the file's directory structure or copy the file directly into local_dir
+ :type keep_dirs: bool
+ :return: list of downloaded files
+ '''
+ logging.debug('DirectHTTP:Download')
+ nb_files = len(self.files_to_download)
+
+ if nb_files > 1:
+ self.files_to_download = []
+ logging.error('DirectHTTP accepts only 1 file')
+
+ cur_files = 1
+
+ for rfile in self.files_to_download:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+
+ if self.save_as is None:
+ self.save_as = rfile['name']
+
+ file_dir = local_dir
+ if keep_dirs:
+ file_dir = local_dir + os.path.dirname(self.save_as)
+ file_path = file_dir + '/' + os.path.basename(self.save_as)
+
+ # For unit tests only; in normal runs the workflow creates directories beforehand to avoid concurrent access from multiple threads
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ '''
+ self.mkdir_lock.acquire()
+ try:
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ except Exception as e:
+ logging.error(e)
+ finally:
+ self.mkdir_lock.release() # release lock, no matter what
+ '''
+ logging.debug('DirectHTTP:Download:Progress'+str(cur_files)+'/'+str(nb_files)+' downloading file '+rfile['name']+', save as '+self.save_as)
+ cur_files += 1
+ if not 'url' in rfile:
+ rfile['url'] = self.url
+ fp = open(file_path, "wb")
+ curl = pycurl.Curl()
+
+ if self.proxy is not None:
+ curl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ if self.method == 'POST':
+ # Form data must be provided already urlencoded.
+ postfields = urllib.parse.urlencode(self.param)
+ # Sets request method to POST,
+ # Content-Type header to application/x-www-form-urlencoded
+ # and data to send in request body.
+ if self.credentials is not None:
+ curl.setopt(pycurl.USERPWD, self.credentials)
+
+ curl.setopt(pycurl.POSTFIELDS, postfields)
+ try:
+ curl.setopt(pycurl.URL, rfile['url']+rfile['root']+'/'+rfile['name'])
+ except Exception as a:
+ curl.setopt(pycurl.URL, (rfile['url']+rfile['root']+'/'+rfile['name']).encode('ascii', 'ignore'))
+ #curl.setopt(pycurl.URL, rfile['url']+rfile['root']+'/'+rfile['name'])
+ else:
+ url = rfile['url']+rfile['root']+'/'+rfile['name']+'?'+urllib.parse.urlencode(self.param)
+ #curl.setopt(pycurl.URL, url)
+ try:
+ curl.setopt(pycurl.URL, url)
+ except Exception as a:
+ curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
+
+ curl.setopt(pycurl.WRITEDATA, fp)
+ curl.perform()
+
+ curl.close()
+ fp.close()
+ logging.debug('downloaded!')
+ rfile['name'] = self.save_as
+ self.set_permissions(file_path, rfile)
+ self.set_progress(1, nb_files)
+ return self.files_to_download
+
+ def header_function(self, header_line):
+ # HTTP standard specifies that headers are encoded in iso-8859-1.
+ # On Python 2, decoding step can be skipped.
+ # On Python 3, decoding step is required.
+ header_line = header_line.decode('iso-8859-1')
+
+ # Header lines include the first status line (HTTP/1.x ...).
+ # We are going to ignore all lines that don't have a colon in them.
+ # This will botch headers that are split on multiple lines...
+ if ':' not in header_line:
+ return
+
+ # Break the header line into header name and value.
+ name, value = header_line.split(':', 1)
+
+ # Remove whitespace that may be present.
+ # Header lines include the trailing newline, and there may be whitespace
+ # around the colon.
+ name = name.strip()
+ value = value.strip()
+
+ # Header names are case insensitive.
+ # Lowercase name here.
+ name = name.lower()
+
+ # Now we can actually record the header name and value.
+ self.headers[name] = value
+
+ def list(self, directory=''):
+ '''
+ Try to get file headers to get last_modification and size
+ '''
+ for rfile in self.files_to_download:
+ if self.save_as is None:
+ self.save_as = rfile['name']
+
+ rfile['save_as'] = self.save_as
+
+ self.crl.setopt(pycurl.HEADER, True)
+ if self.credentials is not None:
+ self.crl.setopt(pycurl.USERPWD, self.credentials)
+
+ if self.proxy is not None:
+ self.crl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ self.crl.setopt(pycurl.NOBODY, True)
+ try:
+ self.crl.setopt(pycurl.URL, self.url+self.rootdir+rfile['name'])
+ except Exception as a:
+ self.crl.setopt(pycurl.URL, (self.url+self.rootdir+rfile['name']).encode('ascii', 'ignore'))
+ #self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])
+ output = BytesIO()
+ # lets assign this buffer to pycurl object
+ self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
+ self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
+ self.crl.perform()
+
+ # Figure out what encoding was sent with the response, if any.
+ # Check against lowercased header name.
+ encoding = None
+ if 'content-type' in self.headers:
+ content_type = self.headers['content-type'].lower()
+ match = re.search('charset=(\S+)', content_type)
+ if match:
+ encoding = match.group(1)
+ if encoding is None:
+ # Default encoding for HTML is iso-8859-1.
+ # Other content types may have different default encoding,
+ # or in case of binary data, may have no encoding at all.
+ encoding = 'iso-8859-1'
+
+ # lets get the output in a string
+ result = output.getvalue().decode(encoding)
+
+ lines = re.split(r'[\n\r]+', result)
+ for line in lines:
+ parts = line.split(':')
+ if parts[0].strip() == 'Content-Length':
+ rfile['size'] = parts[1].strip()
+ if parts[0].strip() == 'Last-Modified':
+ # Sun, 06 Nov 1994
+ res = re.match('(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', parts[1].strip())
+ if res:
+ rfile['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
+ rfile['day'] = res.group(2)
+ rfile['month'] = Utils.month_to_num(res.group(3))
+ rfile['year'] = res.group(4)
+ continue
+ #Sunday, 06-Nov-94
+ res = re.match('(\w+),\s+(\d+)-(\w+)-(\d+)', parts[1].strip())
+ if res:
+ rfile['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
+ rfile['day'] = res.group(2)
+ rfile['month'] = Utils.month_to_num(res.group(3))
+ rfile['year'] = str(2000 + int(res.group(4)))
+ continue
+ #Sun Nov 6 08:49:37 1994
+ res = re.match('(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', parts[1].strip())
+ if res:
+ rfile['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
+ rfile['day'] = res.group(3)
+ rfile['month'] = Utils.month_to_num(res.group(2))
+ rfile['year'] = res.group(4)
+ continue
+ return (self.files_to_download, [])
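
An illustrative use of the direct downloaders defined above, which take an explicit file list instead of matching remote listings against patterns (server, remote path and file name are assumptions):

from biomaj.download.direct import DirectHttpDownload

dl = DirectHttpDownload('http', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/',
                        file_list=['alu.n.gz'])
dl.list()                     # fetches the headers to fill size and date
dl.download('/tmp/offline')   # a direct downloader handles a single file
dl.close()
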
diff --git a/biomaj/download/downloadthreads.py b/biomaj/download/downloadthreads.py
new file mode 100644
index 0000000..0d7dc73
--- /dev/null
+++ b/biomaj/download/downloadthreads.py
@@ -0,0 +1,105 @@
+from builtins import str
+from builtins import range
+#import os
+import logging
+#import datetime
+#import time
+#import re
+import threading
+import copy
+#import tarfile
+#import zipfile
+import traceback
+
+class DownloadThread(threading.Thread):
+
+ NB_THREAD = 2
+
+
+ @staticmethod
+ def get_threads(downloader, local_dir):
+ '''
+ Creates a list of thread for download
+
+ :param downloader: downloader to use
+ :type downloader: :class:`biomaj.download.interface.DownloadInterface`
+ :param local_dir: directory where files should be downloaded
+ :type local_dir: str
+ :return: list of threads
+ '''
+ threads = []
+ # Creates threads with copies of the downloader
+ download_config = downloader.config
+ for i in range(0, DownloadThread.NB_THREAD):
+ downloader.config = None
+ new_download = copy.deepcopy(downloader)
+ new_download.config = download_config
+ new_download.files_to_download = []
+ th = DownloadThread(new_download, local_dir)
+ threads.append(th)
+ # Now dispatch the files to download to the threads
+ thread_id = 0
+ for dfile in downloader.files_to_download:
+ if thread_id == DownloadThread.NB_THREAD:
+ thread_id = 0
+ threads[thread_id].downloader.files_to_download.append(dfile)
+ thread_id += 1
+ return threads
+
+ @staticmethod
+ def get_threads_multi(downloaders, local_dir):
+ '''
+ Dispatch multiple downloaders on threads
+
+ :param downloaders: downloaders to dispatch across threads
+ :type downloaders: list of :class:`biomaj.download.interface.DownloadInterface`
+ :param local_dir: directory where files should be downloaded
+ :type local_dir: str
+ :return: list of threads
+ '''
+ threads = []
+ # Creates threads with copies of the downloader
+ thread_id = 0
+ for downloader in downloaders:
+ if thread_id == DownloadThread.NB_THREAD:
+ thread_id = 0
+ th = DownloadThread(downloader, local_dir)
+ threads.append(th)
+ thread_id += 1
+ return threads
+
+ def __init__(self, downloader, local_dir):
+ '''
+ Download thread to download a list of files
+
+ :param downloader: downloader to use
+ :type downloader: :class:`biomaj.download.interface.DownloadInterface`
+ :param local_dir: directory to download files
+ :type local_dir: str
+ '''
+ threading.Thread.__init__(self)
+ self.downloader = downloader
+ self.downloader.mkdir_lock = DownloadThread.MKDIR_LOCK
+ self.downloader.kill_received = False
+ self.local_dir = local_dir
+ self.error = False
+ self._stopevent = threading.Event()
+
+ def run(self):
+ logging.info('Start download thread')
+ if self.downloader is None:
+ return True
+ self.error = False
+ try:
+ self.downloader.download(self.local_dir)
+ self.downloader.close()
+ except Exception as e:
+ logging.error('Error in download execution of thread: '+str(e))
+ logging.debug(traceback.format_exc())
+ self.error = True
+
+ def stop(self):
+ self._stopevent.set()
+
+
+DownloadThread.MKDIR_LOCK = threading.Lock()
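
A sketch of running several downloaders in parallel with the helpers above: get_threads_multi wraps each downloader in its own thread, while get_threads splits a single downloader's file list across DownloadThread.NB_THREAD copies (host, remote path and file names below are assumptions):

from biomaj.download.direct import DirectFTPDownload
from biomaj.download.downloadthreads import DownloadThread

# One direct downloader per file, each handled by its own thread
downloaders = []
for name in ['alu.n.gz', 'alu.a.gz']:
    d = DirectFTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA', [name])
    d.list()   # sets save_as for the file
    downloaders.append(d)

threads = DownloadThread.get_threads_multi(downloaders, '/tmp/offline')
for th in threads:
    th.start()
for th in threads:
    th.join()
if any(th.error for th in threads):
    print('at least one download thread failed')
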
diff --git a/biomaj/download/ftp.py b/biomaj/download/ftp.py
new file mode 100644
index 0000000..c92a584
--- /dev/null
+++ b/biomaj/download/ftp.py
@@ -0,0 +1,348 @@
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+import logging
+import pycurl
+import io
+import re
+import os
+from datetime import datetime
+import hashlib
+
+from biomaj.utils import Utils
+from biomaj.download.interface import DownloadInterface
+
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+
+class FTPDownload(DownloadInterface):
+ '''
+ Base class to download files from FTP
+
+ protocol=ftp
+ server=ftp.ncbi.nih.gov
+ remote.dir=/blast/db/FASTA/
+
+ remote.files=^alu.*\\.gz$
+
+ '''
+
+
+ def __init__(self, protocol, host, rootdir):
+ DownloadInterface.__init__(self)
+ logging.debug('Download')
+ self.crl = pycurl.Curl()
+ url = protocol+'://'+host
+ self.rootdir = rootdir
+ self.url = url
+ self.headers = {}
+
+ def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
+ '''
+ Find files matching patterns. Sets instance variable files_to_download.
+
+ :param patterns: regexps to match
+ :type patterns: list
+ :param file_list: list of files to match
+ :type file_list: list
+ :param dir_list: sub directories in current dir
+ :type dir_list: list
+ :param prefix: directory prefix
+ :type prefix: str
+ :param submatch: True when called recursively from match(), False on the initial call
+ :type submatch: bool
+ '''
+ logging.debug('Download:File:RegExp:'+str(patterns))
+ if dir_list is None:
+ dir_list = []
+ if not submatch:
+ self.files_to_download = []
+ for pattern in patterns:
+ subdirs_pattern = pattern.split('/')
+ if len(subdirs_pattern) > 1:
+ # Pattern contains sub directories
+ subdir = subdirs_pattern[0]
+ if subdir == '^':
+ subdirs_pattern = subdirs_pattern[1:]
+ subdir = subdirs_pattern[0]
+ for direlt in dir_list:
+ subdir = direlt['name']
+ logging.debug('Download:File:Subdir:Check:'+subdir)
+ if pattern == '**/*':
+ (subfile_list, subdirs_list) = self.list(prefix+'/'+subdir+'/')
+ self.match([pattern], subfile_list, subdirs_list, prefix+'/'+subdir, True)
+ for rfile in file_list:
+ if pattern == '**/*' or re.match(pattern, rfile['name']):
+ rfile['root'] = self.rootdir
+ if prefix != '':
+ rfile['name'] = prefix + '/' +rfile['name']
+ self.files_to_download.append(rfile)
+ logging.debug('Download:File:MatchRegExp:'+rfile['name'])
+ else:
+ if re.match(subdirs_pattern[0], subdir):
+ logging.debug('Download:File:Subdir:Match:'+subdir)
+ # subdir matches the beginning of the pattern,
+ # check the match inside subdir
+ (subfile_list, subdirs_list) = self.list(prefix+'/'+subdir+'/')
+ self.match(['/'.join(subdirs_pattern[1:])], subfile_list, subdirs_list, prefix+'/'+subdir, True)
+
+ else:
+ for rfile in file_list:
+ if re.match(pattern, rfile['name']):
+ rfile['root'] = self.rootdir
+ if prefix != '':
+ rfile['name'] = prefix + '/' +rfile['name']
+ self.files_to_download.append(rfile)
+ logging.debug('Download:File:MatchRegExp:'+rfile['name'])
+ if not submatch and len(self.files_to_download) == 0:
+ raise Exception('no file found matching expressions')
+
+ def curl_download(self, file_path, file_to_download):
+ error = True
+ nbtry = 1
+ while error and nbtry < 3:
+ fp = open(file_path, "wb")
+ curl = pycurl.Curl()
+ try:
+ curl.setopt(pycurl.URL, file_to_download)
+ except Exception as a:
+ curl.setopt(pycurl.URL, file_to_download.encode('ascii', 'ignore'))
+ if self.proxy is not None:
+ curl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ if self.credentials is not None:
+ curl.setopt(pycurl.USERPWD, self.credentials)
+
+ curl.setopt(pycurl.CONNECTTIMEOUT, 300)
+ # The download should not exceed the configured timeout
+ curl.setopt(pycurl.TIMEOUT, self.timeout)
+ curl.setopt(pycurl.NOSIGNAL, 1)
+
+ curl.setopt(pycurl.WRITEDATA, fp)
+
+ try:
+ curl.perform()
+ errcode = curl.getinfo(pycurl.HTTP_CODE)
+ if int(errcode) != 226 and int(errcode) != 200:
+ error = True
+ logging.error('Error while downloading '+file_to_download+' - '+str(errcode))
+ else:
+ error = False
+ except Exception as e:
+ logging.error('Could not get errcode:' + str(e))
+ nbtry += 1
+ curl.close()
+ fp.close()
+ return error
+
+ def download(self, local_dir, keep_dirs=True):
+ '''
+ Download remote files to local_dir
+
+ :param local_dir: Directory where files should be downloaded
+ :type local_dir: str
+ :param keep_dirs: keep the file's directory structure or copy the file directly into local_dir
+ :type keep_dirs: bool
+ :return: list of downloaded files
+ '''
+ logging.debug('FTP:Download')
+
+ nb_files = len(self.files_to_download)
+ cur_files = 1
+
+ for rfile in self.files_to_download:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+ file_dir = local_dir
+ if 'save_as' not in rfile or rfile['save_as'] is None:
+ rfile['save_as'] = rfile['name']
+ if keep_dirs:
+ file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
+ file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
+
+ # For unit tests only; in normal runs the workflow creates directories beforehand to avoid concurrent access from multiple threads
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ '''
+ self.mkdir_lock.acquire()
+ try:
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ except Exception as e:
+ logging.error(e)
+ finally:
+ self.mkdir_lock.release() # release lock, no matter what
+ '''
+ logging.debug('FTP:Download:Progress:'+str(cur_files)+'/'+str(nb_files)+' downloading file '+rfile['name'])
+ logging.debug('FTP:Download:Progress:'+str(cur_files)+'/'+str(nb_files)+' save as '+rfile['save_as'])
+ cur_files += 1
+ if not 'url' in rfile:
+ rfile['url'] = self.url
+
+ error = self.curl_download(file_path, rfile['url']+rfile['root']+'/'+rfile['name'])
+ if error:
+ raise Exception("FTP:Download:Error:"+rfile['url']+rfile['root']+'/'+rfile['name'])
+
+ #logging.debug('downloaded!')
+ self.set_permissions(file_path, rfile)
+ # Report progress only every 10 files to limit db requests
+ if nb_files < 10:
+ nb = 1
+ do_progress = True
+ else:
+ if cur_files == nb_files:
+ do_progress = True
+ nb = cur_files % 10
+ elif cur_files > 0 and cur_files % 10 == 0:
+ nb = 10
+ do_progress = True
+ else:
+ do_progress = False
+ if do_progress:
+ self.set_progress(nb, nb_files)
+ return self.files_to_download
+
+
+ def header_function(self, header_line):
+ # HTTP standard specifies that headers are encoded in iso-8859-1.
+ # On Python 2, decoding step can be skipped.
+ # On Python 3, decoding step is required.
+ header_line = header_line.decode('iso-8859-1')
+
+ # Header lines include the first status line (HTTP/1.x ...).
+ # We are going to ignore all lines that don't have a colon in them.
+ # This will botch headers that are split on multiple lines...
+ if ':' not in header_line:
+ return
+
+ # Break the header line into header name and value.
+ name, value = header_line.split(':', 1)
+
+ # Remove whitespace that may be present.
+ # Header lines include the trailing newline, and there may be whitespace
+ # around the colon.
+ name = name.strip()
+ value = value.strip()
+
+ # Header names are case insensitive.
+ # Lowercase name here.
+ name = name.lower()
+
+ # Now we can actually record the header name and value.
+ self.headers[name] = value
+
+
+ def list(self, directory=''):
+ '''
+ List FTP directory
+
+ :return: tuple of file and dirs in current directory with details
+ '''
+ logging.debug('Download:List:'+self.url+self.rootdir+directory)
+ #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
+ try:
+ self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
+ except Exception as a:
+ self.crl.setopt(pycurl.URL, (self.url+self.rootdir+directory).encode('ascii', 'ignore'))
+
+ if self.proxy is not None:
+ self.crl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ if self.credentials is not None:
+ self.crl.setopt(pycurl.USERPWD, self.credentials)
+ output = BytesIO()
+ # lets assign this buffer to pycurl object
+ self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
+ self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
+
+
+ self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
+ # The download should not exceed the configured timeout
+ self.crl.setopt(pycurl.TIMEOUT, self.timeout)
+ self.crl.setopt(pycurl.NOSIGNAL, 1)
+ try:
+ self.crl.perform()
+ except Exception as e:
+ logging.error('Could not get errcode:' + str(e))
+
+ # Figure out what encoding was sent with the response, if any.
+ # Check against lowercased header name.
+ encoding = None
+ if 'content-type' in self.headers:
+ content_type = self.headers['content-type'].lower()
+ match = re.search('charset=(\S+)', content_type)
+ if match:
+ encoding = match.group(1)
+ if encoding is None:
+ # Default encoding for HTML is iso-8859-1.
+ # Other content types may have different default encoding,
+ # or in case of binary data, may have no encoding at all.
+ encoding = 'iso-8859-1'
+
+ # lets get the output in a string
+ result = output.getvalue().decode(encoding)
+
+ # FTP LIST output is separated by \r\n
+ # lets split the output in lines
+ #lines = result.split(r'[\r\n]+')
+ lines = re.split(r'[\n\r]+', result)
+ # lets walk through each line
+ rfiles = []
+ rdirs = []
+
+ for line in lines:
+ rfile = {}
+ # split the line into its whitespace-separated fields
+ parts = line.split()
+ if not parts: continue
+ rfile['permissions'] = parts[0]
+ rfile['group'] = parts[2]
+ rfile['user'] = parts[3]
+ rfile['size'] = parts[4]
+ rfile['month'] = Utils.month_to_num(parts[5])
+ rfile['day'] = parts[6]
+ rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
+ try:
+ rfile['year'] = int(parts[7])
+ except Exception as e:
+ # Some FTP listings omit the year and give a time instead
+ curdate = datetime.now()
+ rfile['year'] = curdate.year
+ # Year not given: a month later than the current one means the previous year
+ if rfile['month'] > curdate.month:
+ rfile['year'] = curdate.year - 1
+ # Same month but later day => previous year
+ if rfile['month'] == curdate.month and int(rfile['day']) > curdate.day:
+ rfile['year'] = curdate.year - 1
+ rfile['name'] = parts[8]
+ if len(parts) >= 10 and parts[9] == '->':
+ # Symlink, add to files AND dirs as we don't know the type of the link
+ rdirs.append(rfile)
+
+ is_dir = False
+ if re.match('^d', rfile['permissions']):
+ is_dir = True
+
+ if not is_dir:
+ rfiles.append(rfile)
+ else:
+ rdirs.append(rfile)
+ return (rfiles, rdirs)
+
+
+ def chroot(self, cwd):
+ logging.debug('Download: change dir '+cwd)
+
+ def close(self):
+ if self.crl is not None:
+ self.crl.close()
+ self.crl = None
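
The typical list/match/download sequence for the FTPDownload class above, using the example properties from its docstring (the local offline directory is an assumption):

from biomaj.download.ftp import FTPDownload

ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
(files, dirs) = ftpd.list()
ftpd.match([r'^alu.*\.gz$'], files, dirs)   # fills ftpd.files_to_download
ftpd.download('/tmp/offline')               # each file is attempted at most twice
ftpd.close()
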
diff --git a/biomaj/download/http.py b/biomaj/download/http.py
new file mode 100644
index 0000000..1c5fa36
--- /dev/null
+++ b/biomaj/download/http.py
@@ -0,0 +1,138 @@
+from future import standard_library
+standard_library.install_aliases()
+import logging
+import pycurl
+import io
+import re
+import os
+import hashlib
+import datetime
+
+from biomaj.utils import Utils
+from biomaj.download.ftp import FTPDownload
+
+try:
+ from io import BytesIO
+except ImportError:
+ from StringIO import StringIO as BytesIO
+
+class HTTPDownload(FTPDownload):
+ '''
+ Base class to download files from HTTP
+
+ Uses the http.parse.dir.line (and related) regexps to extract page information
+
+ protocol=http
+ server=ftp.ncbi.nih.gov
+ remote.dir=/blast/db/FASTA/
+
+ remote.files=^alu.*\\.gz$
+
+ '''
+
+ def __init__(self, protocol, host, rootdir, config):
+ FTPDownload.__init__(self, protocol, host, rootdir)
+ self.config = config
+
+
+ def list(self, directory=''):
+ '''
+ List remote directory over HTTP
+
+ :return: tuple of file and dirs in current directory with details
+ '''
+ logging.debug('Download:List:'+self.url+self.rootdir+directory)
+ #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
+ try:
+ self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
+ except Exception as a:
+ self.crl.setopt(pycurl.URL, (self.url+self.rootdir+directory).encode('ascii', 'ignore'))
+
+ if self.proxy is not None:
+ self.crl.setopt(pycurl.PROXY, self.proxy)
+ if self.proxy_auth is not None:
+ self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
+
+ if self.credentials is not None:
+ self.crl.setopt(pycurl.USERPWD, self.credentials)
+
+ output = BytesIO()
+ # lets assign this buffer to pycurl object
+ self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
+ self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
+ self.crl.perform()
+ # Figure out what encoding was sent with the response, if any.
+ # Check against lowercased header name.
+ encoding = None
+ if 'content-type' in self.headers:
+ content_type = self.headers['content-type'].lower()
+ match = re.search('charset=(\S+)', content_type)
+ if match:
+ encoding = match.group(1)
+ if encoding is None:
+ # Default encoding for HTML is iso-8859-1.
+ # Other content types may have different default encoding,
+ # or in case of binary data, may have no encoding at all.
+ encoding = 'iso-8859-1'
+
+ # lets get the output in a string
+ result = output.getvalue().decode(encoding)
+ '''
+ 'http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
+ 'http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
+ 'http.group.dir.name': 1,
+ 'http.group.dir.date': 2,
+ 'http.group.file.name': 1,
+ 'http.group.file.date': 2,
+ 'http.group.file.size': 3,
+ '''
+
+ rfiles = []
+ rdirs = []
+
+ dirs = re.findall(self.config.get('http.parse.dir.line'), result)
+ if dirs is not None and len(dirs) > 0:
+ for founddir in dirs:
+ rfile = {}
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ rfile['size'] = '0'
+ date = founddir[int(self.config.get('http.group.dir.date'))-1]
+ dirdate = date.split()
+ parts = dirdate[0].split('-')
+ #19-Jul-2014 13:02
+ rfile['month'] = Utils.month_to_num(parts[1])
+ rfile['day'] = parts[0]
+ rfile['year'] = parts[2]
+ rfile['name'] = founddir[int(self.config.get('http.group.dir.name'))-1]
+ rdirs.append(rfile)
+
+ files = re.findall(self.config.get('http.parse.file.line'), result)
+ if files is not None and len(files)>0:
+ for foundfile in files:
+ rfile = {}
+ rfile['permissions'] = ''
+ rfile['group'] = ''
+ rfile['user'] = ''
+ rfile['size'] = foundfile[int(self.config.get('http.group.file.size'))-1]
+ date = foundfile[int(self.config.get('http.group.file.date'))-1]
+ if self.config.get('http.parse.file.date.format'):
+ date_object = datetime.datetime.strptime(date, self.config.get('http.parse.file.date.format').replace('%%', '%'))
+ rfile['month'] = date_object.month
+ rfile['day'] = date_object.day
+ rfile['year'] = date_object.year
+ else:
+ dirdate = date.split()
+ parts = dirdate[0].split('-')
+ #19-Jul-2014 13:02
+ rfile['month'] = Utils.month_to_num(parts[1])
+ rfile['day'] = parts[0]
+ rfile['year'] = parts[2]
+ rfile['name'] = foundfile[int(self.config.get('http.group.file.name'))-1]
+ filehash = (rfile['name']+str(date)+str(rfile['size'])).encode('utf-8')
+ rfile['hash'] = hashlib.md5(filehash).hexdigest()
+ rfiles.append(rfile)
+ logging.debug('HTTP:List:File:'+str(rfile))
+
+ return (rfiles, rdirs)
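
The same sequence for HTTPDownload, which additionally needs a configuration object providing the http.parse.*.line regexps and the http.group.* indices; the 'testhttp' bank name comes from the test properties shipped in this package, the paths are assumptions:

from biomaj.config import BiomajConfig
from biomaj.download.http import HTTPDownload

BiomajConfig.load_config('/etc/biomaj/global.properties')
config = BiomajConfig('testhttp')
httpd = HTTPDownload('http', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/', config)
(files, dirs) = httpd.list()
httpd.match([r'^alu.*\.gz$'], files, dirs)
httpd.download('/tmp/offline')
httpd.close()
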
diff --git a/biomaj/download/interface.py b/biomaj/download/interface.py
new file mode 100644
index 0000000..f82fda9
--- /dev/null
+++ b/biomaj/download/interface.py
@@ -0,0 +1,256 @@
+from builtins import str
+from builtins import object
+import os
+import logging
+import datetime
+import time
+import re
+import tarfile
+import zipfile
+
+from biomaj.utils import Utils
+
+from biomaj.mongo_connector import MongoConnector
+
+
+class _FakeLock(object):
+ '''
+ Fake lock for downloaders not run through a DownloadThread
+ '''
+
+ def __init__(self):
+ pass
+
+ def acquire(self):
+ pass
+
+ def release(self):
+ pass
+
+class DownloadInterface(object):
+ '''
+ Main interface that all downloaders must extend
+ '''
+
+ files_num_threads = 4
+
+ def __init__(self):
+ self.config = None
+ self.files_to_download = []
+ self.files_to_copy = []
+ self.error = False
+ self.credentials = None
+ #bank name
+ self.bank = None
+ self.mkdir_lock = _FakeLock()
+ self.kill_received = False
+ self.proxy = None
+ # 24h timeout
+ self.timeout = 3600 * 24
+ # Optional save target for single file downloaders
+ self.save_as = None
+
+
+ def set_proxy(self, proxy, proxy_auth=None):
+ '''
+ Use a proxy to connect to remote servers
+
+ :param proxy: proxy to use (see http://curl.haxx.se/libcurl/c/CURLOPT_PROXY.html for format)
+ :type proxy: str
+ :param proxy_auth: proxy authentication if any (user:password)
+ :type proxy_auth: str
+ '''
+ self.proxy = proxy
+ self.proxy_auth = proxy_auth
+
+
+ def set_progress(self, val, max):
+ '''
+ Update progress on download
+
+ :param val: number of downloaded files since last progress
+ :type val: int
+ :param max: number of files to download
+ :type max: int
+ '''
+ logging.debug('Download:progress:'+str(val)+'/'+str(max))
+ if not self.bank:
+ logging.debug('bank not specified, skipping record of download progress')
+ return
+
+ MongoConnector.banks.update({'name': self.bank},
+ {'$inc': {'status.download.progress': val},
+ '$set': {'status.download.total': max}})
+
+ def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
+ '''
+ Find files matching patterns. Sets instance variable files_to_download.
+
+ :param patterns: regexps to match
+ :type patterns: list
+ :param file_list: list of files to match
+ :type file_list: list
+ :param dir_list: sub directories in current dir
+ :type dir_list: list
+ :param prefix: directory prefix
+ :type prefix: str
+ :param submatch: True when called recursively from match(), False on the initial call
+ :type submatch: bool
+ '''
+ logging.debug('Download:File:RegExp:'+str(patterns))
+
+ if dir_list is None:
+ dir_list = []
+
+ if not submatch:
+ self.files_to_download = []
+ for pattern in patterns:
+ subdirs_pattern = pattern.split('/')
+ if len(subdirs_pattern) > 1:
+ # Pattern contains sub directories
+ subdir = subdirs_pattern[0]
+ if subdir == '^':
+ subdirs_pattern = subdirs_pattern[1:]
+ subdir = subdirs_pattern[0]
+ if not dir_list and pattern == '**/*':
+ # No more subdirectories to descend into, take all files
+ for rfile in file_list:
+ rfile['root'] = self.rootdir
+ if prefix != '':
+ rfile['name'] = prefix + '/' +rfile['name']
+ self.files_to_download.append(rfile)
+ logging.debug('Download:File:MatchRegExp:'+rfile['name'])
+ return
+ for direlt in dir_list:
+ subdir = direlt['name']
+ logging.debug('Download:File:Subdir:Check:'+subdir)
+ if pattern == '**/*':
+ (subfile_list, subdirs_list) = self.list(prefix+'/'+subdir+'/')
+ self.match([pattern], subfile_list, subdirs_list, prefix+'/'+subdir, True)
+ for rfile in file_list:
+ if pattern == '**/*' or re.match(pattern, rfile['name']):
+ rfile['root'] = self.rootdir
+ if prefix != '':
+ rfile['name'] = prefix + '/' +rfile['name']
+ self.files_to_download.append(rfile)
+ logging.debug('Download:File:MatchRegExp:'+rfile['name'])
+ else:
+ if re.match(subdirs_pattern[0], subdir):
+ logging.debug('Download:File:Subdir:Match:'+subdir)
+ # subdir matches the beginning of the pattern,
+ # check the match inside subdir
+ (subfile_list, subdirs_list) = self.list(prefix+'/'+subdir+'/')
+ self.match(['/'.join(subdirs_pattern[1:])], subfile_list, subdirs_list, prefix+'/'+subdir, True)
+
+ else:
+ for rfile in file_list:
+ if re.match(pattern, rfile['name']):
+ rfile['root'] = self.rootdir
+ if prefix != '':
+ rfile['name'] = prefix + '/' +rfile['name']
+ self.files_to_download.append(rfile)
+ logging.debug('Download:File:MatchRegExp:'+rfile['name'])
+ if not submatch and len(self.files_to_download) == 0:
+ raise Exception('no file found matching expressions')
+
+
+
+ def set_permissions(self, file_path, file_info):
+ '''
+ Sets file attributes to remote ones
+ '''
+ ftime = datetime.date(int(file_info['year']), int(file_info['month']), int(file_info['day']))
+ settime = time.mktime(ftime.timetuple())
+ os.utime(file_path, (settime, settime))
+
+ def download_or_copy(self, available_files, root_dir, check_exists=True):
+ '''
+ If a file to download is available in available_files, copy it instead of downloading it.
+
+ Update the instance variables files_to_download and files_to_copy
+
+ :param available_files: list of files available in root_dir
+ :type available_files: list
+ :param root_dir: directory where files are available
+ :type root_dir: str
+ :param check_exists: checks if file exists locally
+ :type check_exists: bool
+ '''
+
+ self.files_to_copy = []
+ # An empty list of available files forces everything to be downloaded again
+ if not available_files:
+ return
+ available_files.sort(key=lambda x: x['name'])
+ self.files_to_download.sort(key=lambda x: x['name'])
+
+ new_files_to_download = []
+
+ test1_tuples = ((d['name'], d['year'], d['month'], d['day'], d['size']) for d in self.files_to_download)
+ test2_tuples = set((d['name'], d['year'], d['month'], d['day'], d['size']) for d in available_files)
+ new_or_modified_files = [t for t in test1_tuples if t not in test2_tuples]
+ index = 0
+
+ if len(new_or_modified_files) > 0:
+ for dfile in self.files_to_download:
+ if index < len(new_or_modified_files) and \
+ dfile['name'] == new_or_modified_files[index][0]:
+ new_files_to_download.append(dfile)
+ index += 1
+ else:
+ if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
+ dfile['root'] = root_dir
+ self.files_to_copy.append(dfile)
+ else:
+ new_files_to_download.append(dfile)
+
+ else:
+ # Copy everything
+ for dfile in self.files_to_download:
+ if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
+ dfile['root'] = root_dir
+ self.files_to_copy.append(dfile)
+ else:
+ new_files_to_download.append(dfile)
+
+ self.files_to_download = new_files_to_download
+
+
+ def download(self, local_dir):
+ '''
+ Download remote files to local_dir
+
+ :param local_dir: Directory where files should be downloaded
+ :type local_dir: str
+ :return: list of downloaded files
+ '''
+ pass
+
+ def list(self):
+ '''
+ List directory
+
+ :return: tuple of file list and dir list
+ '''
+ pass
+
+ def chroot(self, cwd):
+ '''
+ Change directory
+ '''
+ pass
+
+ def set_credentials(self, userpwd):
+ '''
+ Set credentials in format user:pwd
+
+ :param userpwd: credentials
+ :type userpwd: str
+ '''
+ self.credentials = userpwd
+
+ def close(self):
+ '''
+ Close connection
+ '''
+ pass
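
Reusing the FTPDownload sketch given earlier (after list() and match(), before download()), this illustrates how download_or_copy splits the matched files between files_to_copy (unchanged since a previous release) and files_to_download (new or modified); the previous release directory and its file metadata are assumptions:

previous_release = '/db/alu/alu-2016-01-01/flat'
available = [{'name': 'alu.n.gz', 'year': 2016, 'month': 1, 'day': 1,
              'size': '12345'}]
ftpd.download_or_copy(available, previous_release, check_exists=True)
print(ftpd.files_to_copy)        # identical files, copied from the old release
print(ftpd.files_to_download)    # only new or modified files remain to fetch
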
diff --git a/biomaj/download/localcopy.py b/biomaj/download/localcopy.py
new file mode 100644
index 0000000..71e3f51
--- /dev/null
+++ b/biomaj/download/localcopy.py
@@ -0,0 +1,89 @@
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+import logging
+import pycurl
+import io
+import re
+import os
+import datetime
+import hashlib
+
+from biomaj.utils import Utils
+from biomaj.download.interface import DownloadInterface
+
+class LocalDownload(DownloadInterface):
+ '''
+ Base class to copy files from the local file system
+
+ protocol=cp
+ server=localhost
+ remote.dir=/blast/db/FASTA/
+
+ remote.files=^alu.*\\.gz$
+
+ '''
+
+
+ def __init__(self, rootdir):
+ DownloadInterface.__init__(self)
+ logging.debug('Download')
+ self.rootdir = rootdir
+
+
+ def download(self, local_dir):
+ '''
+ Copy local files to local_dir
+
+ :param local_dir: Directory where files should be copied
+ :type local_dir: str
+ :return: list of downloaded files
+ '''
+ logging.debug('Local:Download')
+ Utils.copy_files(self.files_to_download, local_dir, lock=self.mkdir_lock)
+
+ return self.files_to_download
+
+ def list(self, directory=''):
+ '''
+ List local directory
+
+ :return: tuple of file and dirs in current directory with details
+ '''
+ logging.debug('Download:List:'+self.rootdir+directory)
+ # lets walk through each line
+
+ rfiles = []
+ rdirs = []
+
+ files = [f for f in os.listdir(self.rootdir + directory)]
+ for file_in_files in files:
+ rfile = {}
+ fstat = os.stat(os.path.join(self.rootdir + directory,file_in_files))
+
+ rfile['permissions'] = str(fstat.st_mode)
+ rfile['group'] = str(fstat.st_gid)
+ rfile['user'] = str(fstat.st_uid)
+ rfile['size'] = str(fstat.st_size)
+ fstat_mtime = datetime.datetime.fromtimestamp(fstat.st_mtime)
+ rfile['month'] = fstat_mtime.month
+ rfile['day'] = fstat_mtime.day
+ rfile['year'] = fstat_mtime.year
+ rfile['name'] = file_in_files
+ filehash = (rfile['name']+str(fstat.st_mtime)+rfile['size']).encode('utf-8')
+ rfile['hash'] = hashlib.md5(filehash).hexdigest()
+
+ is_dir = False
+ if os.path.isdir(os.path.join(self.rootdir + directory, file_in_files)):
+ is_dir = True
+
+ if not is_dir:
+ rfiles.append(rfile)
+ else:
+ rdirs.append(rfile)
+ return (rfiles, rdirs)
+
+
+ def chroot(self, cwd):
+ logging.debug('Download: change dir '+cwd)
+ os.chdir(cwd)
diff --git a/biomaj/mimes-bio.txt b/biomaj/mimes-bio.txt
new file mode 100644
index 0000000..c794bf6
--- /dev/null
+++ b/biomaj/mimes-bio.txt
@@ -0,0 +1,18 @@
+# Biological file mime types
+application/fasta fasta fa fsa
+application/bam bam bai
+application/gff gff gff3
+application/bed bed
+application/fastq fastq
+application/gtf gtf
+application/octet-stream ab1 scf
+application/axt axt
+application/csFasta csfasta
+application/FasttqSolexa fastqsolexa
+application/Interval interval
+application/Laj laj
+application/Lav lav
+application/Maf maf
+application/QualityScore qual
+application/BlastXml blastxml
+application/Wiggle wig
diff --git a/biomaj/mongo_connector.py b/biomaj/mongo_connector.py
new file mode 100644
index 0000000..76e3435
--- /dev/null
+++ b/biomaj/mongo_connector.py
@@ -0,0 +1,20 @@
+from builtins import object
+from pymongo import MongoClient
+
+
+class MongoConnector(object):
+ """
+ Connector to mongodb
+ """
+
+ client = None
+ db = None
+ banks = None
+ users = None
+
+ def __init__(self, url, db):
+ MongoConnector.client = MongoClient(url)
+ MongoConnector.db = MongoConnector.client[db]
+ MongoConnector.banks = MongoConnector.db.banks
+ MongoConnector.users = MongoConnector.db.users
+ MongoConnector.db_schema = MongoConnector.db.db_schema
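
A short sketch of the connector: constructing it once populates the class-level collections used throughout the code (the MongoDB URL, database name and bank name are assumptions):

from biomaj.mongo_connector import MongoConnector

MongoConnector('mongodb://localhost:27017', 'biomaj')
bank_doc = MongoConnector.banks.find_one({'name': 'alu'})
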
diff --git a/biomaj/notify.py b/biomaj/notify.py
new file mode 100644
index 0000000..57be488
--- /dev/null
+++ b/biomaj/notify.py
@@ -0,0 +1,55 @@
+from builtins import str
+from builtins import object
+import smtplib
+import email.utils
+import sys
+if sys.version < '3':
+ from email.MIMEText import MIMEText
+else:
+ from email.mime.text import MIMEText
+
+from biomaj.workflow import Workflow
+import logging
+
+class Notify(object):
+ """
+ Send notifications
+ """
+
+ @staticmethod
+ def notifyBankAction(bank):
+ if not bank.config.get('mail.smtp.host') or bank.session is None:
+ logging.info('Notify:none')
+ return
+ logging.info('Notify:'+bank.config.get('mail.admin'))
+ mfrom = bank.config.get('mail.from')
+ mto = bank.config.get('mail.admin')
+ log_file = bank.config.log_file
+ msg = MIMEText('')
+ if log_file:
+ fp = open(log_file, 'rb')
+ msg = MIMEText(fp.read())
+ fp.close()
+ msg['To'] = email.utils.formataddr(('Recipient', mto))
+ msg['From'] = email.utils.formataddr(('Author', mfrom))
+ #msg['Subject'] = 'BANK['+bank.name+'] - STATUS['+str(bank.session.get_status(Workflow.FLOW_OVER))+'] - UPDATE['+str(bank.session.get('update'))+'] - REMOVE['+str(bank.session.get('remove'))+']'
+ msg['Subject'] = 'BANK['+bank.name+'] - STATUS['+str(bank.session.get_status(Workflow.FLOW_OVER))+'] - UPDATE['+str(bank.session.get('update'))+'] - REMOVE['+str(bank.session.get('remove'))+']' + ' - RELEASE['+str(bank.session.get('release'))+']'
+ #if bank.session.get('action') == 'update':
+ # msg['Subject'] = 'BANK['+bank.name+'] - STATUS['+str(bank.session.get_status(Workflow.FLOW_OVER))+'] - UPDATE['+str(bank.session.get('update'))+'] - REMOVE['+str(bank.session.get('remove'))+']' + ' - RELEASE['+str(bank.session.get('release'))+']'
+ #else:
+ # msg['Subject'] = 'BANK['+bank.name+'] - STATUS['+str(bank.session.get_status(Workflow.FLOW_OVER))+'] - UPDATE['+str(bank.session.get('update'))+'] - REMOVE['+str(bank.session.get('remove'))+']'
+ logging.info(msg['subject'])
+ server = None
+ try:
+ server = smtplib.SMTP(bank.config.get('mail.smtp.host'))
+ #server.set_debuglevel(1)
+ if bank.config.get('mail.tls') is not None and str(bank.config.get('mail.tls')) == 'true':
+ server.starttls()
+ if bank.config.get('mail.user') is not None and str(bank.config.get('mail.user')) != '':
+ server.login(bank.config.get('mail.user'), bank.config.get('mail.password'))
+ server.sendmail(mfrom, [mto], msg.as_string())
+ except Exception as e:
+ logging.error('Could not send email: '+str(e))
+ finally:
+ if server is not None:
+ server.quit()
diff --git a/biomaj/options.py b/biomaj/options.py
new file mode 100644
index 0000000..5003e7a
--- /dev/null
+++ b/biomaj/options.py
@@ -0,0 +1,36 @@
+from builtins import object
+
+
+class Options(object):
+ """
+ Available options
+ """
+
+ def __init__(self, options=None):
+ self.options = options
+
+ def get_option(self, option):
+ """
+ Gets an option if present, else return None
+ """
+ #if self.options is None:
+ # return None
+
+ #if hasattr(self.options, option):
+ # return getattr(self.options, option)
+
+ if hasattr(self, option):
+ return getattr(self, option)
+ #if option in self.options:
+ # return self.options[option]
+
+ return None
+
+ UPDATE = 'update'
+ REMOVE = 'remove'
+ PUBLISH = 'publish'
+ FROM_TASK = 'from_task'
+ PROCESS = 'process'
+ STOP_BEFORE = 'stop_before'
+ STOP_AFTER = 'stop_after'
+ FROMSCRATCH = 'fromscratch'
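
A sketch of how options are consumed elsewhere in the code: values are plain attributes on the Options instance and get_option returns None for anything unset:

from biomaj.options import Options

options = Options()
options.publish = True
options.log = 'DEBUG'
print(options.get_option(Options.PUBLISH))   # True
print(options.get_option('fromscratch'))     # None, option not set
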
diff --git a/biomaj/process/__init__.py b/biomaj/process/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj/process/metaprocess.py b/biomaj/process/metaprocess.py
new file mode 100644
index 0000000..08c659f
--- /dev/null
+++ b/biomaj/process/metaprocess.py
@@ -0,0 +1,277 @@
+from builtins import str
+import threading
+import logging
+import os
+
+from biomaj.process.process import Process, DrmaaProcess, DockerProcess
+from biomaj.mongo_connector import MongoConnector
+
+class MetaProcess(threading.Thread):
+ '''
+ Meta process in biomaj process workflow. Meta processes are executed in parallel.
+
+ Each meta process defines a list of Process objects to execute sequentially
+ '''
+
+ def __init__(self, bank, metas, meta_status=None, meta_data=None, simulate=False):
+ '''
+ Creates a meta process thread
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: list of meta processes to execute in the thread
+ :type metas: list of str
+ :param meta_status: initial status of the meta processes
+ :type meta_status: bool
+ :param simulate: does not execute process
+ :type simulate: bool
+ '''
+ if meta_data is None:
+ meta_data = {}
+ threading.Thread.__init__(self)
+ self._lock = None
+ self.kill_received = False
+ self.workflow = None
+ self.simulate = simulate
+ self.bank = bank
+ self.metas = metas
+ self.meta_data = meta_data
+ self.meta_status = {}
+ for meta in self.metas:
+ self.meta_status[meta] = {}
+
+ if meta_status is not None:
+ self.meta_status = meta_status
+
+ self._stopevent = threading.Event()
+
+ self.bmaj_env = os.environ.copy()
+ #self.bmaj_env = {}
+ # Copy all config from bank
+
+
+ self.bmaj_only_env = {}
+ #The root directory where all databases are stored.
+ #If your data is not stored under one directory hierarchy
+ #you can override this value in the database properties file.
+ for conf in dict(self.bank.config.config_bank.items('GENERAL')):
+ self.bmaj_env[conf] = self.bank.config.config_bank.get('GENERAL', conf)
+ if self.bmaj_env[conf] is None:
+ self.bmaj_env[conf] = ''
+ self.bmaj_only_env[conf] = self.bmaj_env[conf]
+
+ self.bmaj_env['dbname'] = self.bank.name
+ self.bmaj_only_env['dbname'] = self.bmaj_env['dbname']
+
+ self.bmaj_env['datadir'] = self.bank.config.get('data.dir')
+ self.bmaj_only_env['datadir'] = self.bmaj_env['datadir']
+
+ self.bmaj_env['data.dir'] = self.bmaj_env['datadir']
+ self.bmaj_only_env['data.dir'] = self.bmaj_env['data.dir']
+
+ if self.bank.config.get('mail.admin'):
+ self.bmaj_env['mailadmin'] = self.bank.config.get('mail.admin')
+ self.bmaj_only_env['mailadmin'] = self.bmaj_env['mailadmin']
+
+ if self.bank.config.get('mail.smtp.host'):
+ self.bmaj_env['mailsmtp'] = self.bank.config.get('mail.smtp.host')
+ self.bmaj_only_env['mailsmtp'] = self.bmaj_env['mailsmtp']
+
+ self.bmaj_env['processdir'] = self.bank.config.get('process.dir', default='')
+ self.bmaj_only_env['processdir'] = self.bmaj_env['processdir']
+
+ if 'PATH' in self.bmaj_env:
+ self.bmaj_env['PATH'] += ':' + self.bmaj_env['processdir']
+ self.bmaj_only_env['PATH'] = self.bmaj_env['PATH']
+ else:
+ self.bmaj_env['PATH'] = self.bmaj_env['processdir']+':/usr/local/bin:/usr/sbin:/usr/bin'
+ self.bmaj_only_env['PATH'] = self.bmaj_env['PATH']
+
+ self.bmaj_env['PP_DEPENDENCE'] = '#'
+ self.bmaj_only_env['PP_DEPENDENCE'] = '#'
+ self.bmaj_env['PP_DEPENDENCE_VOLATILE'] = '#'
+ self.bmaj_only_env['PP_DEPENDENCE_VOLATILE'] = '#'
+ self.bmaj_env['PP_WARNING'] = '#'
+ self.bmaj_only_env['PP_WARNING'] = '#'
+
+ self.bmaj_env['PATH_PROCESS_BIOMAJ'] = self.bank.config.get('process.dir')
+ self.bmaj_only_env['PATH_PROCESS_BIOMAJ'] = self.bank.config.get('process.dir')
+
+ # Set some session specific env
+ if self.bank.session is not None:
+
+ if self.bank.session.get('log_file') is not None:
+ log_file = self.bank.session.get('log_file')
+ log_dir = os.path.dirname(log_file)
+ self.bmaj_env['logdir'] = log_dir
+ self.bmaj_only_env['logdir'] = log_dir
+ self.bmaj_env['logfile'] = log_file
+ self.bmaj_only_env['logfile'] = log_file
+
+
+ self.bmaj_env['offlinedir'] = self.bank.session.get_offline_directory()
+ self.bmaj_only_env['offlinedir'] = self.bmaj_env['offlinedir']
+
+ self.bmaj_env['dirversion'] = self.bank.config.get('dir.version')
+ self.bmaj_only_env['dirversion'] = self.bmaj_env['dirversion']
+
+ self.bmaj_env['noextract'] = self.bank.config.get('no.extract')
+ if self.bmaj_env['noextract'] is None:
+ self.bmaj_env['noextract'] = ''
+ self.bmaj_only_env['noextract'] = self.bmaj_env['noextract']
+
+ self.bmaj_env['localrelease'] = self.bank.session.get_release_directory()
+ self.bmaj_only_env['localrelease'] = self.bmaj_env['localrelease']
+ if self.bank.session.get('release') is not None:
+ self.bmaj_env['remoterelease'] = self.bank.session.get('remoterelease')
+ self.bmaj_only_env['remoterelease'] = self.bmaj_env['remoterelease']
+ self.bmaj_env['removedrelease'] = self.bank.session.get('release')
+ self.bmaj_only_env['removedrelease'] = self.bmaj_env['removedrelease']
+
+ for bdep in self.bank.depends:
+ self.bmaj_env[bdep.name+'source'] = bdep.session.get_full_release_directory()
+ self.bmaj_only_env[bdep.name+'source'] = self.bmaj_env[bdep.name+'source']
+
+ # Fix case where a var = None
+ for key in list(self.bmaj_only_env.keys()):
+ if self.bmaj_only_env[key] is None:
+ self.bmaj_env[key] = ''
+ self.bmaj_only_env[key] = ''
+
+
+ def set_progress(self, name, status=None):
+ '''
+ Update execution progress of a process
+
+ :param name: name of process
+ :type name: str
+ :param status: status of process
+ :type status: bool or None
+ '''
+ logging.debug('Process:progress:'+name+"="+str(status))
+ if self.workflow is not None:
+ MongoConnector.banks.update({'name': self.bank.name},
+ {'$set': {'status.'+self.workflow+'.progress.'+name: status}})
+
+ def run(self):
+ # Run meta processes
+ self.global_status = True
+ for meta in self.metas:
+ if not self._stopevent.isSet():
+ logging.info("PROC:META:RUN:"+meta)
+ processes = []
+ if self.bank.config.get(meta) is not None:
+ processes = self.bank.config.get(meta).split(',')
+ processes_status = {}
+ for bprocess in processes:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+ # Process status already ok, do not replay
+ if meta in self.meta_status and bprocess in self.meta_status[meta] and self.meta_status[meta][bprocess]:
+ logging.info("PROC:META:SKIP:PROCESS:"+bprocess)
+ processes_status[bprocess] = True
+ continue
+ logging.info("PROC:META:RUN:PROCESS:"+bprocess)
+ # bprocess.name may not be unique
+ #name = self.bank.config.get(bprocess+'.name')
+ name = bprocess
+ desc = self.bank.config.get(bprocess+'.desc')
+ cluster = self.bank.config.get_bool(bprocess+'.cluster', default=False)
+ docker = self.bank.config.get(bprocess+'.docker')
+ proc_type = self.bank.config.get(bprocess+'.type')
+ exe = self.bank.config.get(bprocess+'.exe')
+ args = self.bank.config.get(bprocess+'.args')
+ expand = self.bank.config.get_bool(bprocess+'.expand', default=True)
+ if cluster:
+ native = self.bank.config.get(bprocess+'.native')
+ bmaj_process = DrmaaProcess(meta+'_'+name, exe, args, desc, proc_type, native,
+ expand, self.bmaj_env,
+ os.path.dirname(self.bank.config.log_file))
+ elif docker:
+ use_sudo = self.bank.config.get_bool('docker.sudo', default=True)
+ bmaj_process = DockerProcess(meta+'_'+name, exe, args, desc, proc_type, docker,
+ expand, self.bmaj_only_env,
+ os.path.dirname(self.bank.config.log_file), use_sudo)
+ else:
+ bmaj_process = Process(meta+'_'+name, exe, args, desc, proc_type,
+ expand, self.bmaj_env, os.path.dirname(self.bank.config.log_file))
+ self.set_progress(bmaj_process.name, None)
+ if self.bank.config.get(bprocess+'.format'):
+ bmaj_process.format = self.bank.config.get(bprocess+'.format')
+ if self.bank.config.get(bprocess+'.types'):
+ bmaj_process.types = self.bank.config.get(bprocess+'.types')
+ if self.bank.config.get(bprocess+'.tags'):
+ bmaj_process.tags = self.bank.config.get(bprocess+'.tags')
+ if self.bank.config.get(bprocess+'.files'):
+ bmaj_process.files = self.bank.config.get(bprocess+'.files')
+ res = bmaj_process.run(self.simulate)
+ processes_status[bprocess] = res
+ self.set_progress(bmaj_process.name, res)
+ if not res:
+ self.global_status = False
+ break
+ if not self.simulate:
+ if self._lock:
+ self._lock.acquire()
+ try:
+ self._get_metata_from_outputfile(bmaj_process)
+ except Exception as e:
+ logging.error(e)
+ finally:
+ self._lock.release() # release lock, no matter what
+ else:
+ self._get_metata_from_outputfile(bmaj_process)
+ self.meta_status[meta] = processes_status
+
+ def _get_metata_from_outputfile(self, proc):
+ '''
+ Extract metadata reported by the process on stdout and store it in self.meta_data
+
+ :param proc: process
+ :type proc: :class:`biomaj.process.Process`
+ '''
+ proc_name = proc.name
+ output_file = proc.output_file
+
+ self.meta_data[proc_name] = {}
+ with open(output_file) as f:
+ for line in f:
+ if line.startswith('##BIOMAJ#'):
+ line = line.replace('##BIOMAJ#', '')
+ line = line.strip('\n\r')
+ metas = line.split('#')
+ meta_format = metas[0]
+ if meta_format == '':
+ meta_format = proc.format
+ meta_type = metas[1]
+ if meta_type == '':
+ meta_type = proc.types
+ meta_tags = metas[2]
+ if meta_tags == '':
+ meta_tags = proc.tags
+ meta_files = metas[3]
+ if not meta_format in self.meta_data[proc_name]:
+ self.meta_data[proc_name][meta_format] = []
+ tags = meta_tags.split(',')
+ tag_list = {}
+ if meta_tags != '':
+ for tag in tags:
+ t = tag.split(':')
+ tag_list[t[0]] = t[1]
+ self.meta_data[proc_name][meta_format].append({'tags': tag_list,
+ 'types': meta_type.split(','),
+ 'files': meta_files.split(',')})
+ if proc.files and proc.format:
+ tag_list = {}
+ if proc.tags != '':
+ for tag in proc.tags.split(','):
+ t = tag.split(':')
+ tag_list[t[0]] = t[1]
+ self.meta_data[proc_name][proc.format] = []
+ self.meta_data[proc_name][proc.format].append({'tags': tag_list,
+ 'types': proc.types.split(','),
+ 'files': proc.files.split(',')})
+
+
+ def stop(self):
+ self._stopevent.set()
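
For reference, processes report metadata back to BioMAJ by printing '##BIOMAJ#' lines on their stdout, which _get_metata_from_outputfile() above parses into self.meta_data. A minimal standalone sketch of that line format (illustrative only, not part of the imported sources; the example values are hypothetical):

    # Parse one '##BIOMAJ#format#types#tags#files' line, mirroring the logic above.
    def parse_biomaj_meta_line(line):
        if not line.startswith('##BIOMAJ#'):
            return None
        fields = line.replace('##BIOMAJ#', '').strip('\n\r').split('#')
        meta_format, meta_type, meta_tags, meta_files = fields[0], fields[1], fields[2], fields[3]
        tags = {}
        if meta_tags != '':
            for tag in meta_tags.split(','):
                key, value = tag.split(':')
                tags[key] = value
        return {'format': meta_format, 'types': meta_type.split(','),
                'tags': tags, 'files': meta_files.split(',')}

    # A process script would typically emit such a line with, e.g.:
    #   echo "##BIOMAJ#blast#nucleic#organism:hg19#blast/hg19.nal"
    print(parse_biomaj_meta_line('##BIOMAJ#blast#nucleic#organism:hg19#blast/hg19.nal'))
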
diff --git a/biomaj/process/process.py b/biomaj/process/process.py
new file mode 100644
index 0000000..47da31c
--- /dev/null
+++ b/biomaj/process/process.py
@@ -0,0 +1,217 @@
+from builtins import str
+from builtins import object
+import logging
+import os
+import subprocess
+import tempfile
+
+class Process(object):
+ '''
+ Define a process to execute
+ '''
+
+ def __init__(self, name, exe, args, desc=None, proc_type=None, expand=True, bank_env=None, log_dir=None):
+ '''
+ Define one process
+
+ :param name: name of the process (descriptive)
+ :type name: str
+ :param exe: path to the executable (relative to process.dir or full path)
+ :type exe: str
+ :param args: arguments
+ :type args: str
+ :param desc: process description
+ :type desc: str
+ :param proc_type: types of data generated by process
+ :type proc_type: str
+ :param expand: allow shell expansion on command line
+ :type expand: bool
+ :param bank_env: environment variables to set
+ :type bank_env: dict
+ :param log_dir: directory where process stdout and stderr are written
+ :type log_dir: str
+ '''
+ # Replace env vars in args
+ if args:
+ for key, value in bank_env.items():
+ if value is not None:
+ args = args.replace('${'+key+'}', value)
+
+ self.name = name
+ self.exe = exe
+ self.desc = desc
+ if args is not None:
+ self.args = args.split()
+ else:
+ self.args = []
+ self.bank_env = bank_env
+ self.type = proc_type
+ self.expand = expand
+ if log_dir is not None:
+ self.output_file = os.path.join(log_dir, name+'.out')
+ self.error_file = os.path.join(log_dir, name+'.err')
+ else:
+ self.output_file = name+'.out'
+ self.error_file = name+'.err'
+
+ self.types = ''
+ self.format = ''
+ self.tags = ''
+ self.files = ''
+
+ def run(self, simulate=False):
+ '''
+ Execute process
+
+ :param simulate: if True, do not actually execute the process
+ :type simulate: bool
+ :return: True on success, False otherwise
+ '''
+ args = [self.exe] + self.args
+ logging.debug('PROCESS:EXEC:'+str(self.args))
+ err = False
+ if not simulate:
+ logging.info('PROCESS:RUN:'+self.name)
+ with open(self.output_file, 'w') as fout:
+ with open(self.error_file, 'w') as ferr:
+ if self.expand:
+ args = " ".join(args)
+ proc = subprocess.Popen(args, stdout=fout, stderr=ferr, env=self.bank_env, shell=True)
+ else:
+ proc = subprocess.Popen(args, stdout=fout, stderr=ferr, env=self.bank_env, shell=False)
+ proc.wait()
+ if proc.returncode == 0:
+ err = True
+ else:
+ logging.error('PROCESS:ERROR:'+self.name)
+ fout.flush()
+ ferr.flush()
+ else:
+ err = True
+ logging.info('PROCESS:EXEC:' + self.name + ':' + str(err))
+
+ return err
+
+class DockerProcess(Process):
+ def __init__(self, name, exe, args, desc=None, proc_type=None, docker=None, expand=True, bank_env=None, log_dir=None, use_sudo=True):
+ Process.__init__(self, name, exe, args, desc, proc_type, expand, bank_env, log_dir)
+ self.docker = docker
+ self.use_sudo = use_sudo
+
+ def run(self, simulate=False):
+ '''
+ Execute process in docker container
+
+ :param simulate: if True, do not actually execute the process
+ :type simulate: bool
+ :return: True on success, False otherwise
+ '''
+ use_sudo = ''
+ if self.use_sudo:
+ use_sudo = 'sudo'
+ release_dir = self.bank_env['datadir']+'/'+self.bank_env['dirversion']+'/'+self.bank_env['localrelease']
+ env = ''
+ if self.bank_env:
+ for key, value in self.bank_env.items():
+ env += ' -e "{0}={1}"'.format(key, value)
+ # docker run with data.dir env as shared volume
+ # forwarded env variables
+ cmd = '''uid={uid}
+ gid={gid}
+ {sudo} docker pull {container_id}
+ {sudo} docker run --rm -w {bank_dir} -v {data_dir}:{data_dir} {env} {container_id} \
+ bash -c "groupadd --gid {gid} {group_biomaj} && useradd --uid {uid} --gid {gid} {user_biomaj}; \
+ {exe} {args}; \
+ chown -R {uid}:{gid} {bank_dir}"'''.format(uid=os.getuid(),
+ gid=os.getgid(),
+ data_dir=self.bank_env['datadir'],
+ env=env,
+ container_id=self.docker,
+ group_biomaj='biomaj',
+ user_biomaj='biomaj',
+ exe=self.exe,
+ args=' '.join(self.args),
+ bank_dir=release_dir,
+ sudo=use_sudo
+ )
+
+ (handler, tmpfile) = tempfile.mkstemp('biomaj')
+ os.write(handler, cmd)
+ os.close(handler)
+ os.chmod(tmpfile, 0o755)
+ args = [tmpfile]
+ logging.debug('PROCESS:EXEC:Docker:'+str(self.args))
+ logging.debug('PROCESS:EXEC:Docker:Tmpfile:'+tmpfile)
+ err = False
+ if not simulate:
+ logging.info('PROCESS:RUN:Docker:'+self.docker+':'+self.name)
+ with open(self.output_file, 'w') as fout:
+ with open(self.error_file, 'w') as ferr:
+ if self.expand:
+ args = " ".join(args)
+ proc = subprocess.Popen(args, stdout=fout, stderr=ferr, env=self.bank_env, shell=True)
+ else:
+ proc = subprocess.Popen(args, stdout=fout, stderr=ferr, env=self.bank_env, shell=False)
+ proc.wait()
+ if proc.returncode == 0:
+ err = True
+ else:
+ logging.error('PROCESS:ERROR:'+self.name)
+ fout.flush()
+ ferr.flush()
+ else:
+ err = True
+ logging.info('PROCESS:EXEC:' + self.name + ':' + str(err))
+ os.remove(tmpfile)
+ return err
+
+
+class DrmaaProcess(Process):
+ def __init__(self, name, exe, args, desc=None, proc_type=None, native=None, expand=True, bank_env=None, log_dir=None):
+ Process.__init__(self, name, exe, args, desc, proc_type, expand, bank_env, log_dir)
+ self.native = native
+
+
+ def run(self, simulate=False):
+ '''
+ Execute process
+
+ :param simulate: if True, do not actually execute the process
+ :type simulate: bool
+ :return: True on success, False otherwise
+ '''
+ args = [self.exe] + self.args
+ logging.debug('PROCESS:EXEC:'+str(self.args))
+ err = False
+ if not simulate:
+ logging.info('Run process '+self.name)
+ # Execute on DRMAA
+ try:
+ import drmaa
+ with drmaa.Session() as s:
+ jt = s.createJobTemplate()
+ jt.remoteCommand = self.exe
+ jt.args = self.args
+ jt.joinFiles = False
+ jt.workingDirectory = os.path.dirname(os.path.realpath(self.output_file))
+ jt.jobEnvironment = self.bank_env
+ if self.native:
+ jt.nativeSpecification = " "+self.native+" "
+ jt.outputPath = self.output_file
+ jt.errorPath = self.error_file
+ jobid = s.runJob(jt)
+ retval = s.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
+ if retval.hasExited > 0:
+ err = True
+ else:
+ logging.error('PROCESS:ERROR:'+self.name)
+ s.deleteJobTemplate(jt)
+
+ except Exception as e:
+ logging.error('Drmaa process error: '+str(e))
+ return False
+ else:
+ err = True
+ logging.info('PROCESS:EXEC:' + self.name + ':' + str(err))
+
+ return err
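
As a usage sketch (not part of the imported sources; paths and values are hypothetical), a Process substitutes ${var} references in its arguments from the bank environment and, in simulate mode, returns True without executing anything:

    from biomaj.process.process import Process

    bank_env = {'datadir': '/tmp/data', 'localrelease': 'alu_2024'}
    proc = Process('list_release', 'ls', '-l ${datadir}/${localrelease}',
                   desc='list the release directory', proc_type='test',
                   expand=True, bank_env=bank_env, log_dir='/tmp')
    print(proc.args)                 # ['-l', '/tmp/data/alu_2024']
    print(proc.run(simulate=True))   # True: nothing is executed in simulate mode
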
diff --git a/biomaj/process/processfactory.py b/biomaj/process/processfactory.py
new file mode 100644
index 0000000..5787f4c
--- /dev/null
+++ b/biomaj/process/processfactory.py
@@ -0,0 +1,230 @@
+from builtins import range
+from builtins import object
+import threading
+import logging
+import os
+from biomaj.process.metaprocess import MetaProcess
+
+class ProcessFactory(object):
+ '''
+ Manage process execution
+ '''
+
+ NB_THREAD = 2
+
+ def __init__(self, bank):
+ self.bank = bank
+ self.threads_tasks = []
+ if self.bank.session:
+ self.meta_data = self.bank.session.get('per_process_metadata')
+ else:
+ self.meta_data = {}
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: if True, do not actually execute the processes
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ pass
+
+ def run_threads(self, simulate=False):
+ '''
+ Start meta threads
+
+ :param simulate: do not execute processes
+ :type simulate: bool
+ :return: tuple global execution status and status per meta process
+ '''
+ logging.debug('Start meta threads')
+ os.chdir(self.bank.config.get('process.dir'))
+ threads = []
+ running_th = []
+ for thread_tasks in self.threads_tasks:
+ meta_thread = MetaProcess(self.bank, thread_tasks, self.meta_status, self.meta_data, simulate)
+ meta_thread._lock = ProcessFactory._LOCK
+ meta_thread.workflow = self.workflow
+ meta_thread.start()
+ threads.append(meta_thread)
+ running_th.append(meta_thread)
+ # Wait for the end of the threads
+ kill_received = False
+ while len(running_th) > 0:
+ try:
+ # Join all threads using a timeout so it doesn't block
+ # Filter out threads which have been joined or are None
+ running_th = [t.join(1000) for t in running_th if t is not None and t.isAlive()]
+ except KeyboardInterrupt:
+ logging.warn("Ctrl-c received! Sending kill to threads...")
+ logging.warn("Running tasks will continue and process will stop.")
+ kill_received = True
+ for t in running_th:
+ t.kill_received = True
+
+ for meta_thread in threads:
+ meta_thread.join()
+ global_meta_status = {}
+ global_status = True
+
+ for meta_thread in threads:
+ for meta in meta_thread.meta_status:
+ global_meta_status[meta] = meta_thread.meta_status[meta]
+ if not meta_thread.global_status:
+ global_status = False
+
+ if kill_received:
+ global_status = False
+
+ logging.debug('Meta threads are over')
+ return (global_status, global_meta_status)
+
+ def fill_tasks_in_threads(self, metas):
+ '''
+ Dispatch meta processes among the available threads
+ '''
+ self.threads_tasks = []
+ for i in range(0, ProcessFactory.NB_THREAD):
+ # Fill array of meta process in future threads
+ self.threads_tasks.append([])
+ thread_id = 0
+ for meta in metas:
+ meta_process = meta.strip()
+ if thread_id == ProcessFactory.NB_THREAD:
+ thread_id = 0
+ self.threads_tasks[thread_id].append(meta_process)
+ thread_id += 1
+
+
+class PreProcessFactory(ProcessFactory):
+ '''
+ Manage preprocesses
+ '''
+
+ def __init__(self, bank, metas=None):
+ '''
+ Creates a preprocess factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: initial status of meta processes
+ :type metas: dict
+ '''
+ ProcessFactory.__init__(self, bank)
+ self.meta_status = None
+ if metas is not None:
+ self.meta_status = metas
+ self.workflow = 'preprocess'
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: if True, do not actually execute the processes
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ logging.info('PROC:PRE')
+ if self.bank.config.get('db.pre.process') is None:
+ metas = []
+ else:
+ metas = self.bank.config.get('db.pre.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.meta_status) = self.run_threads(simulate)
+ return status
+
+class RemoveProcessFactory(ProcessFactory):
+ '''
+ Manage remove processes
+ '''
+
+ def __init__(self, bank, metas=None):
+ '''
+ Creates a remove process factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: initial status of meta processes
+ :type metas: dict
+ '''
+ ProcessFactory.__init__(self, bank)
+ self.meta_status = None
+ if metas is not None:
+ self.meta_status = metas
+ self.workflow = 'removeprocess'
+
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: if True, do not actually execute the processes
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ logging.info('PROC:REMOVE')
+ if self.bank.config.get('db.remove.process') is None:
+ metas = []
+ else:
+ metas = self.bank.config.get('db.remove.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.meta_status) = self.run_threads(simulate)
+ return status
+
+class PostProcessFactory(ProcessFactory):
+ '''
+ Manage postprocesses
+
+ self.blocks: dict of meta processes status
+ Each meta process status is a dict of process status
+ '''
+
+ def __init__(self, bank, blocks=None):
+ '''
+ Creates a postprocess factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param blocks: initial status of block processes
+ :type blocks: dict
+ '''
+ ProcessFactory.__init__(self, bank)
+ self.blocks = {}
+ if blocks is not None:
+ self.blocks = blocks
+ self.workflow = 'postprocess'
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: if True, do not actually execute the processes
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ logging.info('PROC:POST:BLOCK')
+ blocks = self.bank.config.get('BLOCKS')
+ if blocks is None or blocks == '':
+ process_blocks = []
+ else:
+ process_blocks = blocks.split(',')
+ metas = []
+ self.meta_status = None
+ global_status = True
+ for process_block in process_blocks:
+ if not global_status:
+ continue
+ logging.info('PROC:POST:BLOCK:'+process_block)
+ if process_block in self.blocks:
+ self.meta_status = self.blocks[process_block]
+ # run each block
+ metas = self.bank.config.get(process_block.strip()+'.db.post.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.blocks[process_block]) = self.run_threads(simulate)
+ if not status:
+ global_status = False
+ return global_status
+
+
+ProcessFactory._LOCK = threading.Lock()
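
The meta processes of a workflow step are spread over ProcessFactory.NB_THREAD threads in a round-robin fashion by fill_tasks_in_threads(). A standalone sketch of that dispatch (illustrative only, with hypothetical meta names):

    NB_THREAD = 2
    metas = ['META_A', 'META_B', 'META_C', 'META_D', 'META_E']
    threads_tasks = [[] for _ in range(NB_THREAD)]
    thread_id = 0
    for meta in metas:
        if thread_id == NB_THREAD:
            thread_id = 0
        threads_tasks[thread_id].append(meta.strip())
        thread_id += 1
    print(threads_tasks)   # [['META_A', 'META_C', 'META_E'], ['META_B', 'META_D']]
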
diff --git a/biomaj/schema_version.py b/biomaj/schema_version.py
new file mode 100644
index 0000000..5f21093
--- /dev/null
+++ b/biomaj/schema_version.py
@@ -0,0 +1,64 @@
+import pkg_resources
+from biomaj.mongo_connector import MongoConnector
+from biomaj.config import BiomajConfig
+
+
+class SchemaVersion(object):
+
+ """
+ BioMAJ database schema version. This class can be used to apply schema modifications, if needed,
+ when upgrading from one software version to the next.
+ """
+
+ @staticmethod
+ def migrate_pendings():
+ """
+ Migrate database
+
+ 3.0.18: Check the current BioMAJ version and, if it is older than 3.0.17, migrate the 'pending' key
+ """
+ if BiomajConfig.global_config is None:
+ try:
+ BiomajConfig.load_config()
+ except Exception as err:
+ print("* SchemaVersion: Can't find config file")
+ return None
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ schema = MongoConnector.db_schema
+ banks = MongoConnector.banks
+
+ schema_version = schema.find_one({'id': 1})
+ installed_version = pkg_resources.get_distribution("biomaj").version
+ if schema_version is None:
+ schema_version = {'id': 1, 'version': '3.0.0'}
+ schema.insert(schema_version)
+
+ moderate = int(schema_version['version'].split('.')[1])
+ minor = int(schema_version['version'].split('.')[2])
+
+ if moderate == 0 and minor <= 17:
+ print("Migrate from release: %s" % schema_version['version'])
+ # Update pending releases
+ bank_list = banks.find()
+ updated = 0
+ for bank in bank_list:
+ if 'pending' in bank:
+ # Check we have an old pending type
+ if type(bank['pending']) == dict:
+ updated += 1
+ pendings = []
+ for release in sorted(bank['pending'], key=lambda r: bank['pending'][r]):
+ pendings.append({'release': str(release), 'id': bank['pending'][str(release)]})
+ if len(pendings) > 0:
+ banks.update({'name': bank['name']},
+ {'$set': {'pending': pendings}})
+ else:
+ # We remove old type for 'pending'
+ banks.update({'name': bank['name']},
+ {'$unset': {'pending': ""}})
+
+ print("Migration: %d bank(s) updated" % updated)
+ schema.update_one({'id': 1}, {'$set': {'version': installed_version}})
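
The migration above turns the pre-3.0.18 'pending' field, a dict mapping release to session id, into a list of documents ordered by session id. A standalone sketch with hypothetical values:

    old_pending = {'54': 1484300000.0, '55': 1484400000.0}   # {release: session id}
    pendings = []
    for release in sorted(old_pending, key=lambda r: old_pending[r]):
        pendings.append({'release': str(release), 'id': old_pending[str(release)]})
    print(pendings)
    # [{'release': '54', 'id': 1484300000.0}, {'release': '55', 'id': 1484400000.0}]
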
diff --git a/biomaj/session.py b/biomaj/session.py
new file mode 100644
index 0000000..9eca25d
--- /dev/null
+++ b/biomaj/session.py
@@ -0,0 +1,224 @@
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+from builtins import object
+import os
+#import configparser
+#import logging
+import time
+import copy
+import sys
+
+from biomaj.workflow import Workflow
+
+
+class Session(object):
+ """
+ BioMAJ bank session
+ """
+
+ @staticmethod
+ def get_ordered_dict():
+ if sys.version_info < (2, 7):
+ return {}
+ else:
+ import collections
+ return collections.OrderedDict()
+ #return {}
+
+
+ OVER = 0
+
+ def __init__(self, name, config, flow=None, action='update'):
+ """
+ Creates a new session
+
+ :param name: Name of the bank
+ :type name: str
+ :param config: bank and global config
+ :type config: BiomajConfig
+ :param flow: workflow tasks
+ :type flow: list
+ :param action: type of flow, update|remove
+ :type action: str
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+ self.name = name
+ self.config = config
+ self.flow = copy.deepcopy(flow)
+
+ formats = {}
+ if self.config.get('db.formats') is not None:
+ flist = self.config.get('db.formats').split(',')
+ for f_in_list in flist:
+ formats[f_in_list.strip()] = []
+
+ self._session = {'id': time.time(),
+ 'log_file': self.config.log_file,
+ 'status': {},
+ 'files': [],
+ 'release': None,
+ 'remoterelease': None,
+ 'formats': formats,
+ 'process': {
+ 'postprocess': {},
+ 'preprocess': {},
+ 'removeprocess': {}
+ },
+ 'per_process_metadata': {},
+ 'data_dir': self.config.get('data.dir'),
+ 'dir_version': self.config.get('dir.version')
+ }
+ for flow in self.flow:
+ self._session['status'][flow['name']] = False
+
+ self.set('last_modified', self.config.last_modified)
+
+ # Default is update
+ self._session['action'] = action
+
+ def reload_postprocess_in_order(self, postprocess):
+ """
+ Reloads processes in config order
+ """
+ if self.config.get('BLOCKS') is None:
+ return postprocess
+ copy_postprocess = Session.get_ordered_dict()
+ blocks = self.config.get('BLOCKS').split(',')
+ for block in blocks:
+ copy_postprocess[block] = Session.get_ordered_dict()
+ metas = self.config.get(block.strip()+'.db.post.process').split(',')
+ for meta in metas:
+ copy_postprocess[block][meta] = Session.get_ordered_dict()
+ processes = self.config.get(meta.strip()).split(',')
+ for process in processes:
+ if block not in postprocess or meta not in postprocess[block] or process not in postprocess[block][meta]:
+ copy_postprocess[block][meta][process] = False
+ else:
+ copy_postprocess[block][meta][process] = postprocess[block][meta][process]
+ return copy_postprocess
+
+ def reload_in_order(self, cfg_type, otherprocess):
+ """
+ Reloads processes in config order
+ """
+ if self.config.get(cfg_type) is None or not self.config.get(cfg_type):
+ return otherprocess
+ copy_postprocess = Session.get_ordered_dict()
+ metas = self.config.get(cfg_type).split(',')
+ for meta in metas:
+ copy_postprocess[meta] = Session.get_ordered_dict()
+ processes = self.config.get(meta.strip()).split(',')
+ for process in processes:
+ copy_postprocess[meta][process] = otherprocess[meta][process]
+ return copy_postprocess
+
+ def reset_proc(self, type_proc, proc=None):
+ """
+ Reset status of processes for type in session
+
+ :param type_proc: postprocess, preprocess or removeprocess
+ :type type_proc: Workflow.FLOW_POSTPROCESS, Workflow.FLOW_PREPROCESS, Workflow.FLOW_REMOVEPROCESS
+ :param proc: block/meta/process name to reset from; if None, reset all
+ :type proc: str
+ """
+ if type_proc == Workflow.FLOW_POSTPROCESS:
+ if proc in self._session['process']['postprocess']:
+ self._session['process']['postprocess'] = self.reload_postprocess_in_order(self._session['process']['postprocess'])
+ self.reset_meta(self._session['process']['postprocess'][proc])
+ else:
+ for elt in list(self._session['process']['postprocess'].keys()):
+ self.reset_meta(self._session['process']['postprocess'][elt], proc)
+ elif type_proc == Workflow.FLOW_PREPROCESS:
+ self._session['process']['preprocess'] = self.reload_in_order('db.pre.process', self._session['process']['preprocess'])
+ self.reset_meta(self._session['process']['preprocess'])
+ elif type_proc == Workflow.FLOW_REMOVEPROCESS:
+ self._session['process']['removeprocess'] = self.reload_in_order('db.remove.process', self._session['process']['removeprocess'])
+ self.reset_meta(self._session['process']['removeprocess'], proc)
+
+ def reset_meta(self, metas, proc=None):
+ """
+ Reset status of meta processes
+ """
+ if proc in metas:
+ for metaproc in list(metas[proc].keys()):
+ self.reset_process(metas[proc], metaproc)
+ else:
+ for meta in list(metas.keys()):
+ self.reset_process(metas[meta], proc)
+
+ def reset_process(self, processes, proc=None):
+ """
+ Reset status of processes
+ """
+ set_to_false = False
+ for process in list(processes.keys()):
+ if process == proc or proc is None:
+ set_to_false = True
+ if set_to_false:
+ processes[process] = False
+
+
+ def load(self, session):
+ """
+ Load an existing session
+ """
+ self._session = session
+
+ def get_release_directory(self):
+ """
+ Get release directory name
+ """
+ return self.name+self.config.get('release.separator', default='_')+str(self._session['release'])
+
+ def get_full_release_directory(self):
+ """
+ Get bank directory for this release
+ """
+ #release_dir = os.path.join(self.config.get('data.dir'),
+ # self.config.get('dir.version'),
+ # self.get_release_directory())
+ release_dir = os.path.join(self._session['data_dir'],
+ self._session['dir_version'],
+ self.get_release_directory())
+ return release_dir
+
+ def get_offline_directory(self):
+ """
+ Get bank offline directory
+ """
+ return os.path.join(self.config.get('data.dir'), self.config.get('offline.dir.name'))
+
+
+ def get(self, attr=None):
+ """
+ Return an attribute of session
+ """
+ if attr is None:
+ return self._session
+
+ if attr in self._session:
+ return self._session[attr]
+ else:
+ return None
+
+ def set(self, attr, value):
+ """
+ Sets an attribute of session
+ """
+ self._session[attr] = value
+
+ def get_status(self, status):
+ """
+ Return status for a flow event
+ """
+ if status not in self._session['status']:
+ return False
+ return self._session['status'][status]
+
+ def set_status(self, status, value):
+ """
+ Set status for a flow event
+ """
+ self._session['status'][status] = value
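
Session.get_release_directory() and get_full_release_directory() combine the bank name, the release separator and the release under data.dir/dir.version. A standalone sketch of the resulting layout (hypothetical values):

    import os

    data_dir = '/db'                 # data.dir
    dir_version = 'ncbi/blast/alu'   # dir.version
    bank_name = 'alu'
    release = '2024-01'
    separator = '_'                  # release.separator (default '_')

    release_directory = bank_name + separator + release
    print(os.path.join(data_dir, dir_version, release_directory))
    # /db/ncbi/blast/alu/alu_2024-01
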
diff --git a/biomaj/user.py b/biomaj/user.py
new file mode 100644
index 0000000..bf15f6b
--- /dev/null
+++ b/biomaj/user.py
@@ -0,0 +1,183 @@
+from builtins import str
+from builtins import object
+import bcrypt
+import logging
+
+from biomaj.mongo_connector import MongoConnector
+from biomaj.config import BiomajConfig
+
+class BmajUser(object):
+ """
+ Biomaj User
+ """
+
+ def __init__(self, user):
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ self.users = MongoConnector.users
+ self.id = user
+ self.user = self.users.find_one({'id': user})
+ ldap_server = None
+ con = None
+ if not self.user and BiomajConfig.global_config.get('GENERAL', 'use_ldap') == '1':
+ # Check if in ldap
+ #import ldap
+ from ldap3 import Server, Connection, AUTH_SIMPLE, STRATEGY_SYNC, STRATEGY_ASYNC_THREADED, SEARCH_SCOPE_WHOLE_SUBTREE, GET_ALL_INFO
+ try:
+ ldap_host = BiomajConfig.global_config.get('GENERAL', 'ldap.host')
+ ldap_port = BiomajConfig.global_config.get('GENERAL', 'ldap.port')
+ #con = ldap.initialize('ldap://' + ldap_host + ':' + str(ldap_port))
+ ldap_server = Server(ldap_host, port=int(ldap_port), get_info=GET_ALL_INFO)
+ con = Connection(ldap_server, auto_bind=True, client_strategy=STRATEGY_SYNC, check_names=True)
+ except Exception as err:
+ logging.error(str(err))
+ self.user = None
+ ldap_dn = BiomajConfig.global_config.get('GENERAL', 'ldap.dn')
+ base_dn = 'ou=People,' + ldap_dn
+ ldapfilter = "(&(|(uid=" + user + ")(mail=" + user + ")))"
+ try:
+ #con.simple_bind_s()
+ attrs = ['mail']
+ #results = con.search_s(base_dn, ldap.SCOPE_SUBTREE, filter, attrs)
+ con.search(base_dn, ldapfilter, SEARCH_SCOPE_WHOLE_SUBTREE, attributes=attrs)
+ if con.response:
+ ldapMail = None
+ #for dn, entry in results:
+ for r in con.response:
+ user_dn = str(r['dn'])
+ #if 'mail' not in entry:
+ if 'mail' not in r['attributes']:
+ logging.error('Mail not set for user '+user)
+ else:
+ #ldapMail = entry['mail'][0]
+ ldapMail = r['attributes']['mail'][0]
+ self.user = {
+ 'id' : user,
+ 'email': ldapMail,
+ 'is_ldap': True
+ }
+ self.user['_id'] = self.users.insert(self.user)
+
+ else:
+ self.user = None
+ except Exception as err:
+ logging.error(str(err))
+ if con:
+ con.unbind()
+
+ @staticmethod
+ def user_remove(user_name):
+ """
+ Remove a user from db
+
+ :param user_name: user name
+ :type user_name: str
+ """
+ MongoConnector.users.remove({'id': user_name})
+
+ @staticmethod
+ def user_banks(user_name):
+ """
+ Get user banks name
+
+ :param user_name: user identifier
+ :type user_name: str
+ :return: list of bank names
+ """
+ banks = MongoConnector.banks.find({'properties.owner': user_name}, {'name':1})
+ return banks
+
+ @staticmethod
+ def list():
+ """
+ Get users
+ """
+ return MongoConnector.users.find()
+
+ def check_password(self, password):
+ if self.user is None:
+ return False
+
+ if self.user['is_ldap']:
+ #import ldap
+ con = None
+ ldap_server = None
+ #try:
+ # ldap_host = BiomajConfig.global_config.get('GENERAL','ldap.host')
+ # ldap_port = BiomajConfig.global_config.get('GENERAL','ldap.port')
+ # con = ldap.initialize('ldap://' + ldap_host + ':' + str(ldap_port))
+ from ldap3 import Server, Connection, AUTH_SIMPLE, STRATEGY_SYNC, STRATEGY_ASYNC_THREADED, SEARCH_SCOPE_WHOLE_SUBTREE, GET_ALL_INFO
+ from ldap3.core.exceptions import LDAPBindError
+ try:
+ ldap_host = BiomajConfig.global_config.get('GENERAL', 'ldap.host')
+ ldap_port = BiomajConfig.global_config.get('GENERAL', 'ldap.port')
+ #con = ldap.initialize('ldap://' + ldap_host + ':' + str(ldap_port))
+ ldap_server = Server(ldap_host, port=int(ldap_port), get_info=GET_ALL_INFO)
+ con = Connection(ldap_server, auto_bind=True, client_strategy=STRATEGY_SYNC, check_names=True)
+ except Exception as err:
+ logging.error(str(err))
+ return False
+ ldap_dn = BiomajConfig.global_config.get('GENERAL','ldap.dn')
+ base_dn = 'ou=People,' + ldap_dn
+ ldapfilter = "(&(|(uid=" + self.user['id'] + ")(mail=" + self.user['id'] + ")))"
+ #try:
+ # con.simple_bind_s()
+ #except Exception as err:
+ # logging.error(str(err))
+ # return False
+ try:
+ attrs = ['mail']
+ con.search(base_dn, ldapfilter, SEARCH_SCOPE_WHOLE_SUBTREE, attributes=attrs)
+ #results = con.search_s(base_dn, ldap.SCOPE_SUBTREE, filter, attrs)
+ user_dn = None
+ ldapMail = None
+ ldapHomeDirectory = None
+ for r in con.response:
+ user_dn = str(r['dn'])
+ ldapMail = r['attributes']['mail'][0]
+ #for dn, entry in results:
+ # user_dn = str(dn)
+ # ldapMail = entry['mail'][0]
+ con.unbind()
+ con = Connection(ldap_server, auto_bind=True, read_only=True, client_strategy=STRATEGY_SYNC, user=user_dn, password=password, authentication=AUTH_SIMPLE, check_names=True)
+ con.unbind()
+ #con.simple_bind_s(user_dn, password)
+ #con.unbind_s()
+ if user_dn:
+ return True
+ except LDAPBindError as err:
+ logging.error('Bind error: '+str(err))
+ return False
+ except Exception as err:
+ logging.error('Bind error: '+str(err))
+ return False
+
+ else:
+ hashed = bcrypt.hashpw(password, self.user['hashed_password'])
+ if hashed == self.user['hashed_password']:
+ return True
+ else:
+ return False
+
+ def remove(self):
+ if self.user is None:
+ return False
+ self.users.remove({'_id': self.user['_id']})
+ return True
+
+ def create(self, password, email=''):
+ """
+ Create a new user
+ """
+ hashed = bcrypt.hashpw(password, bcrypt.gensalt())
+ if self.user is None:
+ self.user = {
+ 'id' : self.id,
+ 'hashed_password': hashed,
+ 'email': email,
+ 'is_ldap': False
+ }
+ self.user['_id'] = self.users.insert(self.user)
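
For non-LDAP accounts, create() stores a bcrypt hash and check_password() verifies a password by re-hashing it with the stored hash as salt. A minimal sketch (illustrative only; byte strings are used here for current bcrypt releases, whereas the code above passes plain strings in its Python 2 setting):

    import bcrypt

    password = b'secret'
    hashed = bcrypt.hashpw(password, bcrypt.gensalt())    # stored at user creation
    print(bcrypt.hashpw(password, hashed) == hashed)      # True when the password matches
    print(bcrypt.hashpw(b'wrong', hashed) == hashed)      # False otherwise
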
diff --git a/biomaj/utils.py b/biomaj/utils.py
new file mode 100644
index 0000000..dd7b1c2
--- /dev/null
+++ b/biomaj/utils.py
@@ -0,0 +1,288 @@
+from builtins import str
+from builtins import object
+import tarfile
+import zipfile
+import gzip
+import bz2
+import re
+import glob
+import os
+import logging
+import shutil
+import datetime
+import subprocess
+from subprocess import CalledProcessError
+
+from mimetypes import MimeTypes
+
+class Utils(object):
+ """
+ Utility class
+ """
+
+ mime = None
+
+ @staticmethod
+ def get_folder_size(folder):
+ """
+ Get the total size of a directory
+
+ :param folder: directory path
+ :type folder: str
+ """
+ if not os.path.exists(folder):
+ return -1
+ folder_size = 0
+ for (path, dirs, files) in os.walk(folder):
+ for ffile in files:
+ filename = os.path.join(path, ffile)
+ folder_size += os.path.getsize(filename)
+ return folder_size
+
+
+ @staticmethod
+ def detect_format(filename):
+ """
+ Try to detect the file format from its extension
+ """
+ if Utils.mime is None:
+ Utils.mime = MimeTypes()
+ mimesfile = os.path.join(os.path.dirname(__file__), 'mimes-bio.txt')
+ Utils.mime.read(mimesfile, True)
+ return Utils.mime.guess_type(filename, True)
+
+ @staticmethod
+ def get_more_recent_file(files):
+ """
+ Return the date of the most recent file in list.
+
+ Each file is a dict with (at least) the keys: year, month, day
+ """
+ release = None
+ for rfile in files:
+ if release is None:
+ release = {'year': rfile['year'], 'month': rfile['month'], 'day': rfile['day']}
+ else:
+ rel_date = datetime.date(int(release['year']), int(release['month']), int(release['day']))
+ file_date = datetime.date(int(rfile['year']), int(rfile['month']), int(rfile['day']))
+ if file_date > rel_date:
+ release['year'] = rfile['year']
+ release['month'] = rfile['month']
+ release['day'] = rfile['day']
+ return release
+
+ @staticmethod
+ def month_to_num(date):
+ return{
+ 'Jan' : 1,
+ 'Feb' : 2,
+ 'Mar' : 3,
+ 'Apr' : 4,
+ 'May' : 5,
+ 'Jun' : 6,
+ 'Jul' : 7,
+ 'Aug' : 8,
+ 'Sep' : 9,
+ 'Oct' : 10,
+ 'Nov' : 11,
+ 'Dec' : 12,
+ '01': 1,
+ '02': 2,
+ '03': 3,
+ '04': 4,
+ '05': 5,
+ '06': 6,
+ '07': 7,
+ '08': 8,
+ '09': 9,
+ '10': 10,
+ '11': 11,
+ '12': 12
+ }[date]
+
+ @staticmethod
+ def copy_files(files_to_copy, to_dir, move=False, lock=None):
+ """
+ Copy or move files to to_dir, keeping directory structure.
+
+ Copy keeps the original file stats.
+ Files should have attributes name and root:
+ - root: root directory
+ - name: relative path of file in root directory
+
+ /root/file/file1 will be copied in to_dir/file/file1
+
+ :param files_to_copy: list of files to copy
+ :type files_to_copy: list
+ :param to_dir: destination directory
+ :type to_dir: str
+ :param move: move instead of copy
+ :type move: bool
+ :param lock: thread lock object for multi-threads
+ :type lock: Lock
+ """
+ nb_files = len(files_to_copy)
+ cur_files = 1
+ for file_to_copy in files_to_copy:
+ logging.debug(str(cur_files)+'/'+str(nb_files)+' copy file '+file_to_copy['name'])
+ cur_files += 1
+ from_file = file_to_copy['root'] + '/' + file_to_copy['name']
+ to_file = to_dir + '/' + file_to_copy['name']
+ if lock is not None:
+ lock.acquire()
+ try:
+ if not os.path.exists(os.path.dirname(to_file)):
+ os.makedirs(os.path.dirname(to_file))
+ except Exception as e:
+ logging.error(e)
+ finally:
+ lock.release()
+
+ else:
+ if not os.path.exists(os.path.dirname(to_file)):
+ try:
+ os.makedirs(os.path.dirname(to_file))
+ except Exception as e:
+ logging.error(e)
+ if move:
+ shutil.move(from_file, to_file)
+ else:
+ shutil.copyfile(from_file, to_file)
+ shutil.copystat(from_file, to_file)
+
+ @staticmethod
+ def copy_files_with_regexp(from_dir, to_dir, regexps, move=False, lock=None):
+ """
+ Copy or move files from from_dir to to_dir matching regexps.
+ Copy keeps the original file stats.
+
+ :param from_dir: origin directory
+ :type from_dir: str
+ :param to_dir: destination directory
+ :type to_dir: str
+ :param regexps: list of regular expressions that files in from_dir should match to be copied
+ :type regexps: list
+ :param move: move instead of copy
+ :type move: bool
+ :param lock: thread lock object for multi-threads
+ :type lock: Lock
+ :return: list of copied files with their size
+ """
+ #os.chdir(from_dir)
+ files_to_copy = []
+ for root, dirs, files in os.walk(from_dir, topdown=True):
+ for name in files:
+ for reg in regexps:
+ file_relative_path = os.path.join(root, name).replace(from_dir, '')
+ if file_relative_path.startswith('/'):
+ file_relative_path = file_relative_path.replace('/', '', 1)
+ if reg == "**/*":
+ files_to_copy.append({'name': file_relative_path})
+ continue
+ if re.match(reg, file_relative_path):
+ files_to_copy.append({'name': file_relative_path})
+ continue
+
+ for file_to_copy in files_to_copy:
+ from_file = from_dir +'/' + file_to_copy['name']
+ to_file = to_dir + '/' + file_to_copy['name']
+
+ if lock is not None:
+ lock.acquire()
+ try:
+ if not os.path.exists(os.path.dirname(to_file)):
+ os.makedirs(os.path.dirname(to_file))
+ except Exception as e:
+ logging.error(e)
+ finally:
+ lock.release()
+ else:
+ if not os.path.exists(os.path.dirname(to_file)):
+ os.makedirs(os.path.dirname(to_file))
+ if move:
+ shutil.move(from_file, to_file)
+ else:
+ shutil.copyfile(from_file, to_file)
+ shutil.copystat(from_file, to_file)
+ file_to_copy['size'] = os.path.getsize(to_file)
+ f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(to_file))
+ file_to_copy['year'] = str(f_stat.year)
+ file_to_copy['month'] = str(f_stat.month)
+ file_to_copy['day'] = str(f_stat.day)
+ (file_format, encoding) = Utils.detect_format(to_file)
+ file_to_copy['format'] = file_format
+ return files_to_copy
+
+ @staticmethod
+ def uncompress(archivefile, remove=True):
+ """
+ Test if file is an archive, and uncompress it
+ Remove archive file if specified
+
+ :param archivefile: full path of the file to check and uncompress
+ :type archivefile: str
+ :param remove: remove the archive file after extraction
+ :type remove: bool
+ :return: True if ok, False if an error occurred
+ """
+ is_archive = False
+ #if tarfile.is_tarfile(file):
+ # logging.debug('Uncompress:Tar:'+file)
+ # tfile = tarfile.TarFile(file)
+ # tfile.extractall(os.path.basename(file))
+ # tfile.close()
+ # is_archive = True
+ try:
+ if archivefile.endswith('.tar.gz'):
+ proc = subprocess.check_call("tar xfz "+archivefile+" --overwrite -C "+os.path.dirname(archivefile), shell=True)
+ #proc.wait()
+ is_archive = True
+ elif archivefile.endswith('.tar'):
+ proc = subprocess.check_call("tar xf "+archivefile+" --overwrite -C "+os.path.dirname(archivefile), shell=True)
+ #proc.wait()
+ is_archive = True
+ elif archivefile.endswith('.bz2'):
+ proc = subprocess.check_call("tar xjf "+archivefile+" --overwrite -C "+os.path.dirname(archivefile), shell=True)
+ #proc.wait()
+ is_archive = True
+ elif archivefile.endswith('.gz'):
+ proc = subprocess.check_call("gunzip -f "+archivefile, shell=True)
+ #proc.wait()
+ is_archive = True
+ elif archivefile.endswith('.zip'):
+ proc = subprocess.check_call("unzip -o "+archivefile+" -d "+os.path.dirname(archivefile), shell=True)
+ #proc.wait()
+ is_archive = True
+ except CalledProcessError as uncompresserror:
+ return False
+ #elif zipfile.is_zipfile(file):
+ # logging.debug('Uncompress:Zip:'+file)
+ # zfile = zipfile.ZipFile(file)
+ # zfile.extractall(os.path.basename(file))
+ # zfile.close()
+ # is_archive = True
+ #elif file.endswith('.gz'):
+ # logging.debug('Uncompress:Gz:'+file)
+ # f_in = open(file.replace('.gz',''), 'wb')
+ # gz_file = gzip.GzipFile(file)
+ # f_in.writelines(gz_file.readlines())
+ # f_in.close()
+ # gz_file.close()
+ # is_archive = True
+ #elif file.endswith('.bz2'):
+ # logging.debug('Uncompress:Bz2:'+file)
+ # f_in = open(file.replace('.bz2',''), 'wb')
+ # bz_file = bz2.BZ2File(file)
+ # f_in.writelines(bz_file.readlines())
+ # f_in.close()
+ # bz_file.close()
+ # is_archive = True
+
+ if is_archive:
+ logging.debug('Uncompress:uncompress:'+archivefile)
+
+
+ if is_archive and remove and os.path.exists(archivefile):
+ os.remove(archivefile)
+
+ return True
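
As a usage sketch of the helpers above (not part of the imported sources; file entries are hypothetical), Utils.get_more_recent_file() returns the newest year/month/day among downloaded file descriptions:

    from biomaj.utils import Utils

    files = [
        {'name': 'alu.n', 'year': '2016', 'month': '3', 'day': '12'},
        {'name': 'alu.a', 'year': '2016', 'month': '11', 'day': '2'},
    ]
    print(Utils.get_more_recent_file(files))
    # {'year': '2016', 'month': '11', 'day': '2'}
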
diff --git a/biomaj/workflow.py b/biomaj/workflow.py
new file mode 100644
index 0000000..7bea072
--- /dev/null
+++ b/biomaj/workflow.py
@@ -0,0 +1,1305 @@
+from builtins import str
+from builtins import range
+from builtins import object
+import logging
+import datetime
+import os
+import shutil
+import tempfile
+import re
+import traceback
+import json
+
+from biomaj.utils import Utils
+from biomaj.download.ftp import FTPDownload
+from biomaj.download.http import HTTPDownload
+from biomaj.download.direct import MultiDownload, DirectFTPDownload, DirectHttpDownload
+from biomaj.download.localcopy import LocalDownload
+from biomaj.download.downloadthreads import DownloadThread
+
+from biomaj.mongo_connector import MongoConnector
+from biomaj.options import Options
+
+from biomaj.process.processfactory import RemoveProcessFactory, PreProcessFactory, PostProcessFactory
+
+class Workflow(object):
+ """
+ Bank update workflow
+ """
+
+ FLOW_INIT = 'init'
+ FLOW_CHECK = 'check'
+ FLOW_DEPENDS = 'depends'
+ FLOW_PREPROCESS = 'preprocess'
+ FLOW_RELEASE = 'release'
+ FLOW_DOWNLOAD = 'download'
+ FLOW_POSTPROCESS = 'postprocess'
+ FLOW_REMOVEPROCESS = 'removeprocess'
+ FLOW_PUBLISH = 'publish'
+ FLOW_OVER = 'over'
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'check', 'steps': []},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank, session=None):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: :class:`biomaj.bank.Bank`
+ """
+ self.bank = bank
+ if session is None:
+ self.session = bank.session
+ else:
+ self.session = session
+ self.bank.session = session
+ self.options = bank.options
+ self.name = bank.name
+ # Skip all remaining tasks, no need to update
+ self.skip_all = False
+
+ self.session._session['update'] = False
+ self.session._session['remove'] = False
+ self.session.config.set('localrelease', '')
+ self.session.config.set('remoterelease', '')
+
+ def get_handler(self, protocol, server, remote_dir, list_file=None):
+ """
+ Get a protocol download handler
+ """
+ if list_file is None:
+ list_file = []
+ downloader = None
+ if protocol == 'ftp' or protocol == 'sftp':
+ downloader = FTPDownload(protocol, server, remote_dir)
+ if protocol == 'http' or protocol == 'https':
+ downloader = HTTPDownload(protocol, server, remote_dir, self.bank.config)
+ if protocol == 'local':
+ downloader = LocalDownload(remote_dir)
+ if protocol == 'directftp':
+ downloader = DirectFTPDownload('ftp', server, remote_dir, list_file)
+ if protocol == 'directhttp':
+ downloader = DirectHttpDownload('http', server, remote_dir, list_file)
+ if protocol == 'directhttps':
+ downloader = DirectHttpDownload('https', server, remote_dir, list_file)
+ if downloader is not None:
+ downloader.bank = self.bank.name
+
+ proxy = self.bank.config.get('proxy')
+ proxy_auth = self.bank.config.get('proxy_auth')
+ if proxy is not None and proxy:
+ downloader.set_proxy(proxy, proxy_auth)
+
+ timeout_download = self.bank.config.get('timeout.download')
+ if timeout_download is not None and timeout_download:
+ downloader.timeout = int(timeout_download)
+
+ return downloader
+
+
+ def get_flow(self, task):
+ for flow in Workflow.FLOW:
+ if flow['name'] == task:
+ return flow
+
+ def start(self):
+ """
+ Start the workflow
+ """
+ logging.info('Workflow:Start')
+ #print str(self.session._session['status'])
+ for flow in self.session.flow:
+ if self.skip_all:
+ logging.info('Workflow:Skip:'+flow['name'])
+ self.session._session['status'][flow['name']] = None
+ self.session._session['status'][Workflow.FLOW_OVER] = True
+ continue
+
+ if self.options.get_option(Options.STOP_BEFORE) == flow['name']:
+ self.wf_over()
+ break
+ # Always run INIT
+ if flow['name'] != Workflow.FLOW_INIT and self.session.get_status(flow['name']):
+ logging.info('Workflow:Skip:'+flow['name'])
+ if flow['name'] == Workflow.FLOW_INIT or not self.session.get_status(flow['name']):
+ logging.info('Workflow:Start:'+flow['name'])
+ try:
+ self.session._session['status'][flow['name']] = getattr(self, 'wf_'+flow['name'])()
+ except Exception as e:
+ self.session._session['status'][flow['name']] = False
+ logging.error('Workflow:'+flow['name']+'Exception:'+str(e))
+ logging.debug(traceback.format_exc())
+ #print str(traceback.format_exc())
+ finally:
+ self.wf_progress(flow['name'], self.session._session['status'][flow['name']])
+ if flow['name'] != Workflow.FLOW_OVER and not self.session.get_status(flow['name']):
+ logging.error('Error during task '+flow['name'])
+ if flow['name'] != Workflow.FLOW_INIT:
+ self.wf_over()
+ return False
+ # Main task is over, execute sub tasks of main
+ if not self.skip_all:
+ for step in flow['steps']:
+ try:
+ res = getattr(self, 'wf_'+step)()
+ if not res:
+ logging.error('Error during '+flow['name']+' subtask: wf_' + step)
+ logging.error('Revert main task status '+flow['name']+' to error status')
+ self.session._session['status'][flow['name']] = False
+ self.wf_over()
+ return False
+ except Exception as e:
+ logging.error('Workflow:'+flow['name']+' subtask: wf_' + step+ ':Exception:'+str(e))
+ self.session._session['status'][flow['name']] = False
+ logging.debug(traceback.format_exc())
+ self.wf_over()
+ return False
+ if self.options.get_option(Options.STOP_AFTER) == flow['name']:
+ self.wf_over()
+ #if self.options and 'stop_after' in self.options and self.options['stop_after'] == flow['name']:
+ break
+ self.wf_progress_end()
+ return True
+
+ def wf_progress_init(self):
+ """
+ Set up new progress status
+ """
+ status = {}
+ status['log_file'] = {'status': self.session.config.log_file, 'progress': 0}
+ status['session'] = self.session._session['id']
+ for flow in self.session.flow:
+ if flow['name'] == 'download':
+ status[flow['name']] = {'status': None, 'progress': 0, 'total': 0}
+ elif flow['name'].endswith('process'):
+ status[flow['name']] = {'status': None, 'progress': {}}
+ elif flow['name'] == 'release':
+ status[flow['name']] = {'status': None, 'progress': ''}
+ else:
+ status[flow['name']] = {'status': None, 'progress': 0}
+ MongoConnector.banks.update({'name': self.name}, {'$set': {'status': status}})
+
+ def wf_progress_end(self):
+ """
+ Reset progress status when workflow is over
+ """
+ #MongoConnector.banks.update({'name': self.name},{'$set': {'status': None}})
+
+ def wf_progress(self, task, status):
+ """
+ Update bank status
+ """
+ subtask = 'status.'+task+'.status'
+ MongoConnector.banks.update({'name': self.name}, {'$set': {subtask: status}})
+
+ def wf_init(self):
+ """
+ Initialize workflow
+ """
+ logging.info('Workflow:wf_init')
+ data_dir = self.session.config.get('data.dir')
+ lock_dir = self.session.config.get('lock.dir', default=data_dir)
+ if not os.path.exists(lock_dir):
+ os.mkdir(lock_dir)
+ lock_file = os.path.join(lock_dir, self.name+'.lock')
+ maintenance_lock_file = os.path.join(lock_dir, 'biomaj.lock')
+ if os.path.exists(maintenance_lock_file):
+ logging.error('Biomaj is in maintenance')
+ return False
+ if os.path.exists(lock_file):
+ logging.error('Bank '+self.name+' is locked, a process may be in progress, else remove the lock file '+lock_file)
+ #print 'Bank '+self.name+' is locked, a process may be in progress, else remove the lock file'
+ return False
+ f = open(lock_file, 'w')
+ f.write('1')
+ f.close()
+ self.wf_progress_init()
+ return True
+
+ def wf_over(self):
+ """
+ Workflow is over
+ """
+ logging.info('Workflow:wf_over')
+ data_dir = self.session.config.get('data.dir')
+ lock_dir = self.session.config.get('lock.dir', default=data_dir)
+ lock_file = os.path.join(lock_dir, self.name+'.lock')
+ os.remove(lock_file)
+ return True
+
+class RemoveWorkflow(Workflow):
+ """
+ Workflow to remove a bank instance
+ """
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'removeprocess', 'steps': []},
+ {'name': 'remove_release', 'steps': []},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank, session):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: Bank
+ :param session: session to remove
+ :type session: :class:`biomaj.session.Session`
+ """
+ Workflow.__init__(self, bank, session)
+ logging.debug('New workflow')
+ self.session._session['remove'] = True
+
+
+ def wf_remove_release(self):
+ logging.info('Workflow:wf_remove_release')
+ if not self.session.get('update_session_id'):
+ logging.error('Bug: update_session_id not set in session')
+ return False
+
+ if os.path.exists(self.session.get_full_release_directory()):
+ shutil.rmtree(self.session.get_full_release_directory())
+ return self.bank.remove_session(self.session.get('update_session_id'))
+
+ def wf_removeprocess(self):
+ logging.info('Workflow:wf_removeprocess')
+ metas = self.session._session['process']['removeprocess']
+ pfactory = RemoveProcessFactory(self.bank, metas)
+ res = pfactory.run()
+ self.session._session['process']['removeprocess'] = pfactory.meta_status
+ return res
+
+
+class UpdateWorkflow(Workflow):
+ """
+ Workflow for a bank update
+ """
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'check', 'steps': []},
+ {'name': 'depends', 'steps': []},
+ {'name': 'preprocess', 'steps': []},
+ {'name': 'release', 'steps': []},
+ {'name': 'download', 'steps': ['uncompress','copy', 'copydepends']},
+ {'name': 'postprocess', 'steps': ['metadata', 'stats']},
+ {'name': 'publish', 'steps': ['old_biomaj_api', 'clean_offline', 'delete_old', 'clean_old_sessions']},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: Bank
+ """
+ Workflow.__init__(self, bank)
+ logging.debug('New workflow')
+ self.session._session['update'] = True
+
+ def wf_init(self):
+ err = super(UpdateWorkflow, self).wf_init()
+ if not err:
+ return False
+ if self.options.get_option(Options.FROMSCRATCH):
+ return self.wf_clean_offline()
+
+ return True
+
+ def wf_check(self):
+ """
+ Basic checks
+ """
+ logging.info('Workflow:wf_check')
+ return True
+
+ def wf_depends(self):
+ """
+ Check dependencies on other banks. If the bank has dependencies, run their update workflows first
+ """
+ logging.info('Workflow:wf_depends')
+ # Always rescan depends, there might be a new release
+ self.session.set('depends', {})
+ res = self.bank.update_dependencies()
+ logging.info('Workflow:wf_depends:'+str(res))
+ if res and len(self.bank.depends)>0:
+ depend_updated = False
+ for bdep in self.bank.depends:
+ logging.info('Workflow:wf_depends:'+bdep.name+':'+str(bdep.session.get('update')))
+ if bdep.session.get('update'):
+ depend_updated = True
+ break
+ #if not depend_updated:
+ # return self.no_need_to_update()
+
+ return res
+
+ def wf_copydepends(self):
+ """
+ Copy files from dependent banks if needed
+ """
+ logging.info('Workflow:wf_copydepends')
+ deps = self.bank.get_dependencies()
+ for dep in deps:
+ if self.bank.config.get(dep+'.files.move'):
+ logging.info('Workflow:wf_copydepends:Files:Move:'+self.bank.config.get(dep+'.files.move'))
+ bdir = None
+ for bdep in self.bank.depends:
+ if bdep.name == dep:
+ bdir = bdep.session.get_full_release_directory()
+ break
+ if bdir is None:
+ logging.error('Could not find a session update for bank '+dep)
+ return False
+ b = self.bank.get_bank(dep, no_log=True)
+ locald = LocalDownload(bdir)
+ (file_list, dir_list) = locald.list()
+ locald.match(self.bank.config.get(dep+'.files.move').split(), file_list, dir_list)
+ bankdepdir = self.bank.session.get_full_release_directory()+"/"+dep
+ if not os.path.exists(bankdepdir):
+ os.mkdir(bankdepdir)
+ downloadedfiles = locald.download(bankdepdir)
+ locald.close()
+ if not downloadedfiles:
+ logging.info('Workflow:wf_copydepends:no files to copy')
+ return False
+ return True
+
+ def wf_preprocess(self):
+ """
+ Execute pre-processes
+ """
+ logging.info('Workflow:wf_preprocess')
+ metas = self.session._session['process']['preprocess']
+ pfactory = PreProcessFactory(self.bank, metas)
+ res = pfactory.run()
+ self.session._session['process']['preprocess'] = pfactory.meta_status
+ return res
+
+ def wf_release(self):
+ """
+ Find current release on remote
+ """
+ logging.info('Workflow:wf_release')
+ cf = self.session.config
+ if cf.get('ref.release') and self.bank.depends:
+ # Bank is a computed bank and we ask to set its release to the same
+ # as another bank it depends on
+ depbank = self.bank.get_bank(cf.get('ref.release'), no_log=True)
+ got_match = False
+ got_update = False
+ for dep in self.bank.depends:
+ if dep.session.get('update'):
+ got_update = True
+ if dep.name == depbank.name:
+ self.session.set('release', dep.session.get('release'))
+ self.session.set('remoterelease', dep.session.get('remoterelease'))
+ got_match = True
+
+ if not got_match:
+ logging.error('Workflow:wf_release: no release found for bank '+depbank.name)
+ return False
+
+ release = self.session.get('release')
+ MongoConnector.banks.update({'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}})
+
+ logging.info('Workflow:wf_release:FromDepends:'+depbank.name+':'+self.session.get('release'))
+ if got_update:
+ index = 0
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory()+'__'+str(x)):
+ index = x + 1
+ if index > 0:
+ self.session.set('release', release+'__'+str(index))
+ release = release+'__'+str(index)
+
+ self.session.previous_release = self.session.get('previous_release')
+
+ logging.info('Workflow:wf_release:previous_session:'+str(self.session.previous_release))
+ if self.session.get('release'):
+ # Release already set from a previous run or another bank
+ logging.info('Workflow:wf_release:session:'+str(self.session.get('release')))
+ if self.session.previous_release == self.session.get('release') and not self.session.config.get_bool('release.control', default=False):
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+ else:
+ return True
+ if self.session.config.get('release.file') == '' or self.session.config.get('release.file') is None:
+ logging.debug('Workflow:wf_release:norelease')
+ self.session.set('release', None)
+ return True
+ else:
+ protocol = cf.get('protocol')
+ if cf.get('release.protocol') is not None:
+ protocol = cf.get('release.protocol')
+ server = cf.get('server')
+ if cf.get('release.server') is not None:
+ server = cf.get('release.server')
+ remote_dir = cf.get('remote.dir')
+ if cf.get('release.remote.dir') is not None:
+ remote_dir = cf.get('release.remote.dir')
+
+
+ #protocol = cf.get('protocol')
+ release_downloader = None
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ release_downloader = self.get_handler(protocol, server, '/', [remote_dir])
+ release_downloader.method = cf.get('release.url.method')
+ if release_downloader.method is None:
+ release_downloader.method = 'GET'
+ release_downloader.save_as = cf.get('release.file')
+ keys = cf.get('release.url.params')
+ if keys is not None:
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get(key.strip()+'.value')
+ release_downloader.param[key.strip()] = param.strip()
+ else:
+ release_downloader = self.get_handler(protocol, server, remote_dir)
+
+ if cf.get('server.credentials') is not None:
+ release_downloader.set_credentials(cf.get('server.credentials'))
+
+ if release_downloader is None:
+ logging.error('Protocol '+protocol+' not supported')
+ return False
+
+ (file_list, dir_list) = release_downloader.list()
+
+ release_downloader.match([cf.get('release.file')], file_list, dir_list)
+ if len(release_downloader.files_to_download) == 0:
+ logging.error('release.file defined but does not match any file')
+ return False
+ if len(release_downloader.files_to_download) > 1:
+ logging.error('release.file defined but matches multiple files')
+ return False
+ if cf.get('release.regexp') is None or not cf.get('release.regexp'):
+ # Try to get from regexp in file name
+ rel = re.search(cf.get('release.file'), release_downloader.files_to_download[0]['name'])
+ if rel is None:
+ logging.error('release.file defined but does not match any file')
+ return False
+ release = rel.group(1)
+ else:
+ # Download and extract
+ tmp_dir = tempfile.mkdtemp('biomaj')
+ release_downloader.mkdir_lock = DownloadThread.MKDIR_LOCK
+ rel_files = release_downloader.download(tmp_dir)
+ rel_file = open(tmp_dir + '/' + rel_files[0]['name'])
+ rel_content = rel_file.read()
+ rel_file.close()
+ shutil.rmtree(tmp_dir)
+ rel = re.search(cf.get('release.regexp'), rel_content)
+ if rel is None:
+ logging.error('release.regexp defined but does not match any file content')
+ return False
+ # If regexp contains matching group, else take whole match
+ if len(rel.groups()) > 0:
+ release = rel.group(1)
+ else:
+ release = rel.group(0)
+
+ release_downloader.close()
+ if release_downloader.error:
+ logging.error('An error occurred during download')
+ return False
+
+ self.session.set('release', release)
+ self.session.set('remoterelease', release)
+
+ MongoConnector.banks.update({'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}})
+
+ # We restart from scratch, a directory with this release already exists
+ # Check directory existence if from scratch to change local release
+ if self.options.get_option(Options.FROMSCRATCH):
+ index = 0
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory()+'__'+str(x)):
+ index = x + 1
+ if index > 0:
+ self.session.set('release', release+'__'+str(index))
+ release = release+'__'+str(index)
+
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+ # We want to download again into the same release; it does not matter that the release is unchanged
+ self.download_go_ahead = True
+ if not self.download_go_ahead and self.session.previous_release == self.session.get('remoterelease'):
+ if not self.session.config.get_bool('release.control', default=False):
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+
+ logging.info('Session:RemoteRelease:'+self.session.get('remoterelease'))
+ logging.info('Session:Release:'+self.session.get('release'))
+ return True
+
+
+ def no_need_to_update(self):
+ """
+ Set status to over and update to False because there is no need to update the bank
+ """
+ self.skip_all = True
+ self.session._session['status'][Workflow.FLOW_OVER] = True
+ self.session._session['update'] = False
+ self.session.set('download_files', [])
+ self.session.set('files', [])
+ last_session = self.get_last_prod_session_for_release(self.session.get('remoterelease'))
+ self.session.set('release', last_session['release'])
+ self.wf_over()
+ return True
+
+
+
+ def get_last_prod_session_for_release(self, release):
+ """
+ find last session matching a release in production
+ """
+ last_session = None
+ for prod in self.bank.bank['production']:
+ if prod['remoterelease'] == release:
+ # Search session related to this production release
+ for s in self.bank.bank['sessions']:
+ if s['id'] == prod['session']:
+ last_session = s
+ break
+ return last_session
+
+
+ def _load_local_files_from_session(self, session_id):
+ """
+ Load local files for a session from the cache directory
+ """
+
+ cache_dir = self.bank.config.get('cache.dir')
+ f_local_files = None
+ file_path = os.path.join(cache_dir, 'local_files_'+str(session_id))
+ if not os.path.exists(file_path):
+ return f_local_files
+
+ with open(file_path) as data_file:
+ f_local_files = json.load(data_file)
+
+ return f_local_files
+
+ def _load_download_files_from_session(self, session_id):
+ """
+ Load downloaded files for a session from the cache directory
+ """
+
+ cache_dir = self.bank.config.get('cache.dir')
+ f_downloaded_files = None
+ file_path = os.path.join(cache_dir, 'files_'+str(session_id))
+ if not os.path.exists(file_path):
+ return f_downloaded_files
+
+ with open(file_path) as data_file:
+ f_downloaded_files = json.load(data_file)
+
+ return f_downloaded_files
+
+ def is_previous_release_content_identical(self):
+ """
+ Checks if releases (previous_release and remoterelease) are identical in release id and content.
+ Expects the release.control parameter to be set to true or 1, otherwise the check is skipped.
+ """
+ if not self.session.config.get_bool('release.control', default=False):
+ return True
+ # Different releases, so different
+ if self.session.get('remoterelease') != self.session.previous_release:
+ logging.info('Workflow:wf_download:DifferentRelease')
+ return False
+ # Same release number, check further
+ previous_release_session = self.get_last_prod_session_for_release(self.session.previous_release)
+
+ if previous_release_session is None:
+ return False
+
+ previous_downloaded_files = self._load_download_files_from_session(previous_release_session.get('id'))
+ previous_release_session['download_files'] = previous_downloaded_files
+
+ #previous_downloaded_files = previous_release_session.get('download_files', None)
+
+ if previous_downloaded_files is None:
+ # No info on previous download, consider that base release is enough
+ logging.warn('Workflow:wf_download:SameRelease:download_files not available, cannot compare to previous release')
+ return True
+
+ nb_elts = len(previous_downloaded_files)
+
+ if self.session.get('download_files') is not None and nb_elts != len(self.session.get('download_files')):
+ # Number of files to download vs previously downloaded files differ
+ logging.info('Workflow:wf_download:SameRelease:Number of files differ')
+ return False
+ # Same number of files, check hash of files
+ list1 = sorted(previous_downloaded_files, key=lambda k: k['hash'])
+ list2 = sorted(self.session.get('download_files'), key=lambda k: k['hash'])
+ for index in range(0, nb_elts):
+ if list1[index]['hash'] != list2[index]['hash']:
+ return False
+ return True
+
+ def check_and_incr_release(self):
+ """
+ Checks if local release already exists on disk. If it exists, create a new
+ local release, appending __X to the release.
+
+ :returns: str local release
+ """
+ index = 0
+ release = self.session.get('release')
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory()+'__'+str(x)):
+ index = x + 1
+
+ #while os.path.exists(self.session.get_full_release_directory()+'__'+str(index)):
+ # index += 1
+ # If we found a directory for this release: XX or XX__Y
+ if index > 0:
+ self.session.set('release', release+'__'+str(index))
+ release = release+'__'+str(index)
+ logging.info('Workflow:wf_download:release:incr_release:'+release)
+ return release
+
+
+ def _create_dir_structure(self, downloader, offline_dir):
+ """
+ Create expected directory structure in offline directory before download
+ """
+ logging.debug('Workflow:wf_download:create_dir_structure:start')
+ for rfile in downloader.files_to_download:
+ save_as = None
+ if 'save_as' not in rfile or rfile['save_as'] is None:
+ save_as = rfile['name']
+ else:
+ save_as = rfile['save_as']
+
+ file_dir = offline_dir + '/' + os.path.dirname(save_as)
+
+ try:
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ except Exception as e:
+ logging.error(e)
+ logging.debug('Workflow:wf_download:create_dir_structure:done')
+
+ def _get_list_from_file(self, remote_list):
+ """
+ Load files to download from a file
+ """
+ if not os.path.exists(remote_list):
+ logging.info("remote.list " + remote_list + " does not exists, we suppose there is no new release available")
+ return None
+
+ data = []
+ with open(remote_list) as data_file:
+ data = json.load(data_file)
+
+ for rfile in data:
+ if 'year' not in rfile or 'month' not in rfile or 'day' not in rfile:
+ today = datetime.date.today()
+ rfile['month'] = today.month
+ rfile['day'] = today.day
+ rfile['year'] = today.year
+ if 'permissions' not in rfile:
+ rfile['permissions'] = ''
+ if 'group' not in rfile:
+ rfile['group'] = ''
+ if 'size' not in rfile:
+ rfile['size'] = 0
+ if 'hash' not in rfile:
+ rfile['hash'] = None
+ if 'root' not in rfile and self.session.config.get('remote.dir'):
+ rfile['root'] = self.session.config.get('remote.dir')
+ return data
+
+ def wf_download(self):
+ """
+ Download remote files or use an available local copy from last production directory if possible.
+ """
+ logging.info('Workflow:wf_download')
+ flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
+ downloader = None
+ cf = self.session.config
+ self.session.previous_release = self.session.get('previous_release')
+
+ if self.session.get('release') is not None:
+ self.session.config.set('localrelease', self.session.get('release'))
+ self.session.config.set('remoterelease', self.session.get('remoterelease'))
+
+ if cf.get('protocol') == 'none':
+ if self.session.get('release') is None:
+ logging.error('Workflow:wf_download:no download file but no release found')
+ return False
+ else:
+ logging.info('Workflow:wf_download:no download file expected')
+ self.downloaded_files = []
+ if not os.path.exists(self.session.get_full_release_directory()):
+ os.makedirs(self.session.get_full_release_directory())
+ return True
+
+ if cf.get('protocol') == 'multi':
+ """
+ Search for:
+ protocol = multi
+ remote.file.0.protocol = directftp
+ remote.file.0.server = ftp.ncbi.org
+ remote.file.0.path = /musmusculus/chr1/chr1.fa
+
+ => http://ftp2.fr.debian.org/debian/README.html?key1=value1&key2=value2
+ remote.file.1.protocol = directhttp
+ remote.file.1.server = ftp2.fr.debian.org
+ remote.file.1.path = debian/README.html
+ remote.file.1.method = GET
+ remote.file.1.params.keys = key1,key2
+ remote.file.1.params.key1 = value1
+ remote.file.1.params.key2 = value2
+
+ => http://ftp2.fr.debian.org/debian/README.html
+ #POST PARAMS:
+ key1=value1
+ key2=value2
+ remote.file.1.protocol = directhttp
+ remote.file.1.server = ftp2.fr.debian.org
+ remote.file.1.path = debian/README.html
+ remote.file.1.method = POST
+ remote.file.1.params.keys = key1,key2
+ remote.file.1.params.key1 = value1
+ remote.file.1.params.key2 = value2
+
+ ......
+ """
+ downloader = MultiDownload()
+ downloaders = []
+ # Creates multiple downloaders
+ i = 0
+ rfile = cf.get('remote.file.'+str(i)+'.path')
+ while rfile is not None:
+ if cf.get('remote.file.'+str(i)+'.protocol') is not None:
+ protocol = cf.get('remote.file.'+str(i)+'.protocol')
+ else:
+ protocol = cf.get('protocol')
+ if cf.get('remote.file.'+str(i)+'.server') is not None:
+ server = cf.get('remote.file.'+str(i)+'.server')
+ else:
+ server = cf.get('server')
+ subdownloader = self.get_handler(protocol, server, '', [cf.get('remote.file.'+str(i)+'.path')])
+ if cf.get('remote.file.'+str(i)+'.credentials') is not None:
+ credentials = cf.get('remote.file.'+str(i)+'.credentials')
+ else:
+ credentials = cf.get('server.credentials')
+ if credentials is not None:
+ subdownloader.set_credentials(credentials)
+ if protocol == 'directftp':
+ if cf.get('remote.file.'+str(i)+'.name'):
+ subdownloader.save_as = cf.get('remote.file.'+str(i)+'.name')
+ else:
+ subdownloader.save_as = cf.get('remote.file.'+str(i)+'.path')
+ if protocol == 'directhttp' or protocol == 'directhttps':
+ subdownloader.method = cf.get('remote.file.'+str(i)+'.method')
+ if subdownloader.method is None:
+ subdownloader.method = 'GET'
+ if cf.get('remote.file.'+str(i)+'.name'):
+ subdownloader.save_as = cf.get('remote.file.'+str(i)+'.name')
+ else:
+ subdownloader.save_as = cf.get('remote.file.'+str(i)+'.path')
+ if cf.get('remote.file.'+str(i)+'.method'):
+ subdownloader.method = cf.get('remote.file.'+str(i)+'.method').strip().upper()
+ subdownloader.param = {}
+ keys = cf.get('remote.file.'+str(i)+'.params.keys')
+ if keys is not None:
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get('remote.file.'+str(i)+'.params.'+key.strip())
+ subdownloader.param[key.strip()] = param.strip()
+ downloaders.append(subdownloader)
+ i += 1
+ rfile = cf.get('remote.file.'+str(i)+'.path')
+ downloader.add_downloaders(downloaders)
+
+ else:
+ """
+ Simple case, one downloader with regexp
+ """
+ protocol = cf.get('protocol')
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ downloader = self.get_handler(cf.get('protocol'), cf.get('server'), '/', [cf.get('remote.dir')[:-1]])
+ downloader.method = cf.get('url.method')
+ if downloader.method is None:
+ downloader.method = 'GET'
+ downloader.save_as = cf.get('target.name')
+ keys = cf.get('url.params')
+ if keys is not None:
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get(key.strip()+'.value')
+ downloader.param[key.strip()] = param.strip()
+ else:
+ downloader = self.get_handler(cf.get('protocol'), cf.get('server'), cf.get('remote.dir'))
+
+ if downloader is None:
+ logging.error('Protocol '+cf.get('protocol')+' not supported')
+ return False
+
+ remote_list = cf.get('remote.list',default=None)
+ if remote_list is not None:
+ logging.info("Use list from " + remote_list)
+ downloader.files_to_download = self._get_list_from_file(remote_list)
+ if downloader.files_to_download is None:
+ self.session.set('remoterelease', self.session.previous_release )
+ return self.no_need_to_update()
+ else:
+ (file_list, dir_list) = downloader.list()
+
+ downloader.match(cf.get('remote.files',default='.*').split(), file_list, dir_list)
+
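+ # If a remote.files pattern contains capture groups, the matched file is
+ # saved under the concatenation of those groups, e.g. a pattern like
+ # ^(dir1)/(.*\.gz)$ stores the file as dir1/xxx.gz instead of its full remote path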
+ for f in downloader.files_to_download:
+ if 'save_as' not in f or not f['save_as']:
+ f['save_as'] = f['name']
+ for p in cf.get('remote.files', default='.*').split():
+ if p.startswith('^'):
+ p = p.replace('^','^/')
+ else:
+ p = '/' + p
+ res = re.match(p, f['name'])
+ if res is not None and res.groups() is not None and len(res.groups()) >= 1:
+ f['save_as'] = '/'.join(res.groups())
+ break
+
+
+ self.session.set('download_files', downloader.files_to_download)
+
+ if self.session.get('release') and self.session.config.get_bool('release.control', default=False):
+ if self.session.previous_release == self.session.get('remoterelease'):
+ if self.is_previous_release_content_identical():
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+ else:
+ release = self.check_and_incr_release()
+
+ if self.session.get('release') is None:
+ # Not defined, or could not get it earlier
+ # Set release to most recent file to download
+ release_dict = Utils.get_more_recent_file(downloader.files_to_download)
+ if release_dict is None:
+ today = datetime.datetime.now()
+ release_dict = {'year': today.year, 'month': today.month, 'day': today.day}
+
+ release = str(release_dict['year']) + '-' + str(release_dict['month']) + '-' + str(release_dict['day'])
+ if cf.get('release.format'):
+ release_date = datetime.datetime.now()
+ release_date = release_date.replace(year=int(release_dict['year']), month=int(release_dict['month']), day=int(release_dict['day']))
+ # Fix configparser problem between py2 and py3
+ release = release_date.strftime(cf.get('release.format').replace('%%', '%'))
+ self.session.set('release', release)
+ self.session.set('remoterelease', release)
+
+ logging.info('Workflow:wf_download:release:remoterelease:'+self.session.get('remoterelease'))
+ logging.info('Workflow:wf_download:release:release:'+release)
+ MongoConnector.banks.update({'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}})
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+ # We want to download again within the same release; that is fine, we do not care that it is the same release
+ self.download_go_ahead = True
+ if not self.download_go_ahead and self.session.previous_release == self.session.get('remoterelease') and self.is_previous_release_content_identical():
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+
+ # We restart from scratch, check if directory with this release already exists
+ if self.options.get_option(Options.FROMSCRATCH) or self.options.get_option('release') is None:
+ release = self.check_and_incr_release()
+
+
+ self.session.config.set('localrelease', self.session.get('release'))
+ self.session.config.set('remoterelease', self.session.get('remoterelease'))
+
+ self.banks = MongoConnector.banks
+ self.bank.bank = self.banks.find_one({'name': self.name})
+
+ nb_prod_dir = len(self.bank.bank['production'])
+ offline_dir = self.session.get_offline_directory()
+
+ copied_files = []
+
+ # Check if already in offlinedir
+ keep_files = []
+ if os.path.exists(offline_dir):
+ for file_to_download in downloader.files_to_download:
+ # If file is in offline dir and has same date and size, do not download again
+ if os.path.exists(offline_dir + '/' + file_to_download['name']):
+ try:
+ file_stat = os.stat(offline_dir + '/' + file_to_download['name'])
+ f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(offline_dir + '/' + file_to_download['name']))
+ year = str(f_stat.year)
+ month = str(f_stat.month)
+ day = str(f_stat.day)
+ if str(file_stat.st_size) != str(file_to_download['size']) or \
+ str(year) != str(file_to_download['year']) or \
+ str(month) != str(file_to_download['month']) or \
+ str(day) != str(file_to_download['day']):
+ logging.debug('Workflow:wf_download:different_from_offline:'+file_to_download['name'])
+ keep_files.append(file_to_download)
+ else:
+ logging.debug('Workflow:wf_download:offline:'+file_to_download['name'])
+ except Exception as e:
+ # Could not get stats on file
+ os.remove(offline_dir + '/' + file_to_download['name'])
+ keep_files.append(file_to_download)
+ else:
+ keep_files.append(file_to_download)
+ downloader.files_to_download = keep_files
+ # If everything was already in offline dir
+ if len(downloader.files_to_download) == 0:
+ self.downloaded_files = []
+ return True
+
+ self._create_dir_structure(downloader, offline_dir)
+
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+ # We want to download again within the same release; that is fine, we do not care that it is the same release
+ self.download_go_ahead = True
+
+
+ if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
+ #for prod in self.bank.bank['production']:
+ # if self.session.get('release') == prod['release']:
+ # logging.info('Workflow:wf_release:same_as_previous_production_dir')
+ # return self.no_need_to_update()
+
+
+ # Get last production
+ last_production = self.bank.bank['production'][nb_prod_dir-1]
+ # Get session corresponding to production directory
+ last_production_session = self.banks.find_one({'name': self.name, 'sessions.id': last_production['session']}, {'sessions.$': 1})
+ last_production_dir = os.path.join(last_production['data_dir'], cf.get('dir.version'), last_production['release'])
+ # Checks if some files can be copied instead of downloaded
+ last_production_files = None
+ if len(last_production_session['sessions']) > 0:
+ last_production_files = self._load_local_files_from_session(last_production_session['sessions'][0]['id'])
+ downloader.download_or_copy(last_production_files, last_production_dir)
+ if len(downloader.files_to_download) == 0:
+ return self.no_need_to_update()
+
+ #release_dir = os.path.join(self.session.config.get('data.dir'),
+ # self.session.config.get('dir.version'),
+ # self.session.get_release_directory())
+ logging.debug('Workflow:wf_download:Copy files from '+last_production_dir)
+ copied_files = downloader.files_to_copy
+ Utils.copy_files(downloader.files_to_copy, offline_dir)
+
+
+ downloader.close()
+
+ DownloadThread.NB_THREAD = int(self.session.config.get('files.num.threads'))
+
+ if cf.get('protocol') == 'multi':
+ thlist = DownloadThread.get_threads_multi(downloader.downloaders, offline_dir)
+ else:
+ thlist = DownloadThread.get_threads(downloader, offline_dir)
+
+ running_th = []
+ for th in thlist:
+ running_th.append(th)
+ th.start()
+ """
+ while len(running_th) > 0:
+ try:
+ # Join all threads using a timeout so it doesn't block
+ # Filter out threads which have been joined or are None
+ running_th = [t.join(1000) for t in running_th if t is not None and t.isAlive()]
+ logging.debug("Workflow:wf_download:Download:Threads:"+str(running_th))
+ except KeyboardInterrupt:
+ logging.warn("Ctrl-c received! Sending kill to threads...")
+ logging.warn("Running tasks will continue and process will stop.")
+ for t in running_th:
+ t.downloader.kill_received = True
+ logging.info("Workflow:wf_download:Download:Threads:Over")
+ """
+ for th in thlist:
+ th.join()
+ logging.info("Workflow:wf_download:Download:Threads:Over")
+ is_error = False
+ for th in thlist:
+ if th.error:
+ is_error = True
+ downloader.error = True
+ break
+ self.downloaded_files = downloader.files_to_download + copied_files
+ #self.downloaded_files = downloader.download(offline_dir) + copied_files
+
+ #downloader.close()
+
+ if downloader.error:
+ logging.error('An error occurred during download')
+ return False
+
+ return True
+
+ def wf_uncompress(self):
+ """
+ Uncompress downloaded files if they are archives and no.extract is false
+ """
+ logging.info('Workflow:wf_uncompress')
+ if len(self.downloaded_files) == 0:
+ logging.info("Workflow:wf_uncompress:NoFileDownload:NoExtract")
+ return True
+ no_extract = self.session.config.get('no.extract')
+ if no_extract is None or no_extract == 'false':
+ for file in self.downloaded_files:
+ if 'save_as' not in file:
+ file['save_as'] = file['name']
+ nb_try = 1
+ not_ok = True
+ while nb_try < 3 and not_ok:
+ status = Utils.uncompress(self.session.get_offline_directory() + '/' + file['save_as'])
+ if status:
+ not_ok = False
+ else:
+ logging.warn('Workflow:wf_uncompress:Failure:'+file['name']+':'+str(nb_try))
+ nb_try += 1
+ if not_ok:
+ logging.error('Workflow:wf_uncompress:Failure:'+file['name'])
+ return False
+ return True
+
+ def wf_copy(self):
+ """
+ Copy files from offline directory to release directory
+ """
+ logging.info('Workflow:wf_copy')
+ if len(self.downloaded_files) == 0:
+ logging.info("Workflow:wf_copy:NoFileDownload:NoCopy")
+ return True
+ from_dir = os.path.join(self.session.config.get('data.dir'),
+ self.session.config.get('offline.dir.name'))
+ regexp = self.session.config.get('local.files', default='**/*').split()
+ to_dir = os.path.join(self.session.config.get('data.dir'),
+ self.session.config.get('dir.version'),
+ self.session.get_release_directory(), 'flat')
+
+ local_files = Utils.copy_files_with_regexp(from_dir, to_dir, regexp, True)
+ self.session._session['files'] = local_files
+ if len(self.session._session['files']) == 0:
+ logging.error('Workflow:wf_copy:No file match in offline dir')
+ return False
+ return True
+
+ def wf_metadata(self):
+ """
+ Update metadata with info gathered from processes
+ """
+ logging.info('Workflow:wf_metadata')
+ self.bank.session.set('formats', {})
+ per_process_meta_data = self.session.get('per_process_metadata')
+ for proc in list(per_process_meta_data.keys()):
+ for meta_data in list(per_process_meta_data[proc].keys()):
+ session_formats = self.bank.session.get('formats')
+ if meta_data not in session_formats:
+ #session_formats[meta_data] = [meta_thread.meta_data[meta_data]]
+ session_formats[meta_data] = per_process_meta_data[proc][meta_data]
+ else:
+ #session_formats[meta_data].append(meta_thread.meta_data[meta_data])
+ session_formats[meta_data] += per_process_meta_data[proc][meta_data]
+ return True
+
+ def wf_stats(self):
+ """
+ Get some stats from current release data dir
+ """
+ logging.info('Workflow:wf_stats')
+ do_stats = self.bank.config.get('data.stats')
+ if do_stats is None or do_stats == '0':
+ self.session.set('fullsize', 0)
+ return True
+ prod_dir = self.session.get_full_release_directory()
+ dir_size = Utils.get_folder_size(prod_dir)
+ self.session.set('fullsize', dir_size)
+ return True
+
+ def wf_postprocess(self):
+ """
+ Execute post processes
+ """
+
+ # Creates a temporary symlink future_release to keep compatibility if process
+ # tries to access dir with this name
+ future_link = os.path.join(self.bank.config.get('data.dir'),
+ self.bank.config.get('dir.version'),
+ 'future_release')
+ prod_dir = self.session.get_full_release_directory()
+ to_dir = os.path.join(self.bank.config.get('data.dir'),
+ self.bank.config.get('dir.version'))
+
+ if os.path.lexists(future_link):
+ os.remove(future_link)
+ os.chdir(to_dir)
+ os.symlink(self.session.get_release_directory(), 'future_release')
+
+ logging.info('Workflow:wf_postprocess')
+ blocks = self.session._session['process']['postprocess']
+ pfactory = PostProcessFactory(self.bank, blocks)
+ res = pfactory.run()
+ self.session._session['process']['postprocess'] = pfactory.blocks
+
+ # In any way, delete symlink
+ if os.path.lexists(future_link):
+ os.remove(future_link)
+
+ return res
+
+ def wf_publish(self):
+ """
+ Add *current* symlink to this release
+ """
+ if self.bank.config.get_bool('auto_publish', default=False):
+ logging.info('Workflow:wf_publish')
+ self.bank.publish()
+ return True
+
+ if not self.options.get_option(Options.PUBLISH):
+ logging.info('Workflow:wf_publish:no')
+ return True
+ logging.info('Workflow:wf_publish')
+ self.bank.publish()
+ return True
+
+ def wf_old_biomaj_api(self):
+ """
+ Generates a listingv1.<format> file containing the list of files in the directories declared in formats
+ """
+ release_dir = self.session.get_full_release_directory()
+ for release_format in self.bank.session.get('formats'):
+ format_file = os.path.join(release_dir, 'listingv1.'+release_format.replace('/','_'))
+ section = self.list_section(release_dir, release_format, release_format)
+ logging.debug("Worfklow:OldAPI:WriteListing: "+format_file)
+ fd = os.open(format_file, os.O_RDWR|os.O_CREAT)
+ os.write(fd, json.dumps(section).encode('utf-8'))
+ os.close(fd)
+ return True
+
+ def list_section(self, base_dir, release_format, base_format):
+ """
+ Get section files and sub-section from base_dir for directory release_format
+
+ :param base_dir: root directory
+ :type base_dir: str
+ :param release_format: sub directory to scan
+ :type release_format: str
+ :param base_format: first directory indicating the format
+ :type base_format: str
+ :return: dict section details
+ """
+ section = {"name": release_format, "sections": [], "files": []}
+ format_dir = os.path.join(base_dir, release_format)
+ if not os.path.exists(format_dir):
+ logging.info("Worfklow:OldAPI:Format directory "+release_format+" does not exists, skipping")
+ return section
+ format_dir_list = os.listdir(format_dir)
+ for format_dir_file in format_dir_list:
+ if os.path.isfile(os.path.join(format_dir, format_dir_file)):
+ if base_format.lower() == 'blast':
+ if format_dir_file.endswith('.nal'):
+ fileName, fileExtension = os.path.splitext(format_dir_file)
+ section['files'].append(os.path.join(format_dir, fileName))
+ else:
+ section['files'].append(os.path.join(format_dir, format_dir_file))
+ else:
+ # This is a sub directory
+ new_section = self.list_section(format_dir, format_dir_file, base_format)
+ section['sections'].append(new_section)
+ return section
+
+
+ def wf_clean_offline(self):
+ """
+ Clean offline directory
+ """
+ logging.info('Workflow:wf_clean_offline')
+ if os.path.exists(self.session.get_offline_directory()):
+ shutil.rmtree(self.session.get_offline_directory())
+ return True
+
+ def wf_clean_old_sessions(self):
+ """
+ Delete old sessions not related to a production directory or last run
+ """
+ logging.info('Workflow:wf_clean_old_sessions')
+ self.bank.clean_old_sessions()
+ return True
+
+ def wf_delete_old(self):
+ """
+ Delete old production dirs
+ """
+ logging.info('Workflow:wf_delete_old')
+ if self.options.get_option(Options.FROM_TASK) is not None:
+ # This is a run on an already present release, skip delete
+ logging.info('Workflow:wf_delete_old:Skip')
+ return True
+ if not self.session.config.get('keep.old.version'):
+ keep = 1
+ else:
+ keep = int(self.session.config.get('keep.old.version'))
+ # Current production dir is not yet in list
+ nb_prod = len(self.bank.bank['production'])
+ # save session during delete workflow
+ keep_session = self.bank.session
+
+ if nb_prod > keep:
+ for prod in self.bank.bank['production']:
+ if prod['release'] == keep_session.get('release'):
+ continue
+ if 'freeze' in prod and prod['freeze']:
+ continue
+ if self.bank.bank['current'] == prod['session']:
+ continue
+ if nb_prod - keep > 0:
+ nb_prod -= 1
+ session = self.bank.get_new_session(RemoveWorkflow.FLOW)
+ # Delete init and over because we are already in a run
+ i_init = -1
+ i_over = -1
+ for i in range(0, len(session.flow)):
+ if session.flow[i]['name'] == 'init':
+ i_init = i
+ if i_init >= 0:
+ del session.flow[i_init]
+ for i in range(0, len(session.flow)):
+ if session.flow[i]['name'] == 'over':
+ i_over = i
+ if i_over >= 0:
+ del session.flow[i_over]
+
+ session.set('action', 'remove')
+ session.set('release', prod['release'])
+ session.set('remoterelease', prod['remoterelease'])
+ session.set('update_session_id', prod['session'])
+ logging.info('Workflow:wf_delete_old:Delete:'+prod['release'])
+ res = self.bank.start_remove(session)
+ if not res:
+ logging.error('Workflow:wf_delete_old:ErrorDelete:'+prod['release'])
+ else:
+ break
+ # Set session back
+ self.bank.session = keep_session
+
+ return True
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..356ef0a
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BioMAJ.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BioMAJ.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/BioMAJ"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BioMAJ"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/admin.rst b/docs/admin.rst
new file mode 100644
index 0000000..b15363d
--- /dev/null
+++ b/docs/admin.rst
@@ -0,0 +1,37 @@
+***************
+Advanced Topics
+***************
+
+LDAP
+====
+
+The `BioMAJ watcher <https://github.com/genouest/biomaj-watcher>`__
+provides an optional web interface to manage banks. Users can create
+"private" banks and manage them via the web.
+
+ElasticSearch
+=============
+
+In order to use the ``--search`` flag, you need to connect BioMAJ to an
+ElasticSearch cluster.
+
+You will need to edit your ``global.properties`` to indicate where the ES servers are:
+
+.. code:: ini
+
+ use_elastic=0
+ #Comma separated list of elasticsearch nodes host1,host2:port2
+ elastic_nodes=localhost
+ elastic_index=biomaj
+ # Calculate data.dir size stats
+ data.stats=1
+
+An example ``docker-compose.yml`` would use this:
+
+.. literalinclude:: docker-compose-advanced.yml
+ :language: yaml
+
+And a modified ``global.properties`` referenced in that file would enable elasticsearch:
+
+.. literalinclude:: global.advanced.properties
+ :language: ini
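+
+Once indexing is enabled, updated banks are indexed automatically and can be
+queried with the ``--search`` flag. A minimal sketch, assuming the bundled
+``biomaj-cli.py`` supports a ``--formats`` filter (check ``--help`` for the
+exact options available in your version):
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --search --formats fasta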
diff --git a/docs/alu.properties b/docs/alu.properties
new file mode 100644
index 0000000..56ea01c
--- /dev/null
+++ b/docs/alu.properties
@@ -0,0 +1,42 @@
+[GENERAL]
+# Database name/description
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+# The short name for the database
+db.name=alu
+# Database type. Some common values include genome, nucleic, nucleic_protein, protein, other
+db.type=nucleic_protein
+# Base directory to download temp files to
+offline.dir.name=offline/ncbi/blast/alu_tmp
+# Base directory where bank versions are stored
+dir.version=ncbi/blast/alu
+# Update frequency
+frequency.update=0
+# Number of threads used during downloading
+files.num.threads=1
+
+# Protocol, common values include ftp, http
+protocol=ftp
+# The FQDN of the server you wish to connect to
+server=ftp.ncbi.nih.gov
+# And the directory on that server
+remote.dir=/blast/db/FASTA/
+# The files to find in that directory of the remote server.
+remote.files=^alu.*\.gz$
+
+# BioMAJ can automatically extract the version number from a release
+# document. This will be covered in another section.
+release.file=
+release.regexp=
+release.file.compressed=
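+# Illustration only (not needed for this bank): if the server published a
+# "VERSION" file containing e.g. "Release 2003-11-26", one could set
+# release.file=^VERSION$ and release.regexp=Release\s+([0-9-]+)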
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+# Files (regular expressions) to copy from the offline directory into the release
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+db.post.process=
+
+### Deployment ###
+keep.old.version=1
diff --git a/docs/bank.rst b/docs/bank.rst
new file mode 100644
index 0000000..08925d9
--- /dev/null
+++ b/docs/bank.rst
@@ -0,0 +1,15 @@
+.. _bank:
+
+
+*****
+bank
+*****
+
+
+bank API reference
+==================
+ .. automodule:: biomaj.bank
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/bmajindex.rst b/docs/bmajindex.rst
new file mode 100644
index 0000000..1aa4f36
--- /dev/null
+++ b/docs/bmajindex.rst
@@ -0,0 +1,14 @@
+.. _bmajindex:
+
+
+*********
+bmajindex
+*********
+
+
+BmajIndex API reference
+=======================
+ .. automodule:: biomaj.bmajindex
+ :members:
+ :private-members:
+ :special-members:
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..d7835ee
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+#
+# BioMAJ documentation build configuration file, created by
+# sphinx-quickstart on Mon Oct 27 08:26:18 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+from mock import Mock as MagicMock
+
+class Mock(MagicMock):
+ @classmethod
+ def __getattr__(cls, name):
+ return Mock()
+
+MOCK_MODULES = ['pycurl', 'pymongo', 'elasticsearch', 'drmaa']
+sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.ifconfig',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.graphviz'
+ ]
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'BioMAJ'
+copyright = u'2014, Olivier Sallou'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '3.0'
+# The full version, including alpha/beta/rc tags.
+release = '3.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'BioMAJdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ ('index', 'BioMAJ.tex', u'BioMAJ Documentation',
+ u'Olivier Sallou', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'biomaj', u'BioMAJ Documentation',
+ [u'Olivier Sallou'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'BioMAJ', u'BioMAJ Documentation',
+ u'Olivier Sallou', 'BioMAJ', 'Biological databanks update.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/config.rst b/docs/config.rst
new file mode 100644
index 0000000..c19aa3e
--- /dev/null
+++ b/docs/config.rst
@@ -0,0 +1,15 @@
+.. _config:
+
+
+******
+config
+******
+
+
+Config API reference
+====================
+ .. automodule:: biomaj.config
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/docker-compose-advanced.yml b/docs/docker-compose-advanced.yml
new file mode 100644
index 0000000..8e56b8f
--- /dev/null
+++ b/docs/docker-compose-advanced.yml
@@ -0,0 +1,16 @@
+version: '2'
+services:
+ biomaj:
+ image: osallou/biomaj-docker
+ links:
+ - mongodb:biomaj-mongodb
+ - elasticsearch
+ volumes:
+ - ./data:/var/lib/biomaj
+ - ./global.advanced.properties:/etc/biomaj/global.properties
+
+ mongodb:
+ image: mongo
+
+ elasticsearch:
+ image: elasticsearch:1.7
diff --git a/docs/docker-compose.yml b/docs/docker-compose.yml
new file mode 100644
index 0000000..37210b0
--- /dev/null
+++ b/docs/docker-compose.yml
@@ -0,0 +1,11 @@
+version: '2'
+services:
+ biomaj:
+ image: osallou/biomaj-docker
+ links:
+ - mongodb:biomaj-mongodb
+ volumes:
+ - ./data:/var/lib/biomaj
+
+ mongodb:
+ image: mongo
diff --git a/docs/examples.rst b/docs/examples.rst
new file mode 100644
index 0000000..a308ae5
--- /dev/null
+++ b/docs/examples.rst
@@ -0,0 +1,104 @@
+***************
+Getting Started
+***************
+
+For a very basic setup, you can configure a ``docker-compose.yml`` file to use
+with `docker <https://www.docker.com/products/overview#install_the_platform>`__,
+which is especially helpful when you are testing out BioMAJ.
+
+Docker
+======
+
+.. literalinclude:: docker-compose.yml
+ :language: yaml
+ :linenos:
+
+This configuration file defines a simple MongoDB instance which is used for
+backend storage by BioMAJ, as well as the BioMAJ instance itself. Line 8
+denotes that a folder named ``data`` in the current directory will be mounted
+into the container as storage. Any files downloaded by BioMAJ will appear in this
+directory.
+
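+Before the first run, you can create the mounted folder (and the ``conf``
+sub-folder used for bank configuration files later on) yourself, for example:
+
+.. code:: console
+
+ $ mkdir -p data/conf
+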
+Running the ``--help`` command can be done easily:
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --help
+
+
+Simple Configuration
+====================
+
+Once you've reached this point, you're ready to start configuring BioMAJ to
+download datasets for you. Configuration files should go inside a folder
+``conf`` inside the ``data`` folder in your current directory. As an example,
+we will use this simple ALU configuration file:
+
+.. literalinclude:: alu.properties
+ :language: text
+ :linenos:
+
+The file can be broken down into a couple of sections:
+
+- Metadata (lines 1-15)
+- Remote Source (17-24)
+- Release Information (26-30)
+- Other
+
+The metadata consists of things like where data should be stored and how to
+name it. The remote source describes where data is to be fetched from.
+Release information will be covered in another example, and the remaining
+lines show a few extra, miscellaneous options used in this example config.
+
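+A minimal sketch of those first two sections, with purely illustrative values
+(a real bank will use its own name, server and file patterns), looks like:
+
+.. code:: ini
+
+ # Metadata: naming and storage layout
+ db.name=mybank
+ offline.dir.name=offline/mybank_tmp
+ dir.version=mybank
+
+ # Remote source: where the files are fetched from
+ protocol=ftp
+ server=ftp.example.org
+ remote.dir=/pub/mybank/
+ remote.files=^.*\.gz$
+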
+If you have copied the ``alu.properties`` file into ``./data/conf/alu.properties``, you are ready to download this database:
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --bank alu --update
+ 2016-08-24 21:43:15,276 INFO [root][MainThread] Log file: /var/lib/biomaj/log/alu/1472074995.28/alu.log
+ Log file: /var/lib/biomaj/log/alu/1472074995.28/alu.log
+ ...
+
+This command should complete successfully, and you will have some more files in ``./data/``:
+
+.. code:: console
+
+ $ find data
+ data/conf/alu.properties
+ data/data/ncbi/blast/alu/alu-2003-11-26/flat/alu.a
+ data/data/ncbi/blast/alu/alu-2003-11-26/flat/alu.n
+ data/cache/files_1472074995.29
+ data/log/alu/1472074995.28/alu.log
+
+The ``data/data`` directories contain your downloaded files. Additionally
+a cache file exists and a job run log contains data about what occurred
+during the download and processing. Note that the files that appear are
+``alu.a`` and ``alu.n``, instead of ``alu.a.gz`` and ``alu.n.gz``. By
+having the option ``no.extract=true`` commented out on line 33, BioMAJ
+automatically extracted the data for us.
+
+The ``--status`` command will allow you to see the status of various databases you have downloaded.
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --bank alu --status
+ +--------+-----------------+----------------------+---------------------+
+ | Name | Type(s) | Last update status | Published release |
+ |--------+-----------------+----------------------+---------------------|
+ | alu | nucleic_protein | 2016-08-24 21:58:14 | 2003-11-26 |
+ +--------+-----------------+----------------------+---------------------+
+ +---------------------+------------------+------------+----------------------------------------------------+----------+
+ | Session | Remote release | Release | Directory | Freeze |
+ |---------------------+------------------+------------+----------------------------------------------------+----------|
+ | 2016-08-24 21:58:14 | 2003-11-26 | 2003-11-26 | /var/lib/biomaj/data/ncbi/blast/alu/alu-2003-11-26 | no |
+ +---------------------+------------------+------------+----------------------------------------------------+----------+
+
+
+Advanced Configuration
+======================
+
+Once you have this sort of simple configuration working, you may wish to
+explore more advanced configurations. There is a `public repository
+<https://github.com/genouest/biomaj-data/>`__ of BioMAJ configurations which
+will be interesting to the advanced user wishing to learn more about what can
+be done with BioMAJ.
diff --git a/docs/ftp.rst b/docs/ftp.rst
new file mode 100644
index 0000000..69931d2
--- /dev/null
+++ b/docs/ftp.rst
@@ -0,0 +1,15 @@
+.. _ftp:
+
+
+*****
+ftp
+*****
+
+
+FTPDownloader API reference
+===========================
+ .. automodule:: biomaj.download.ftp
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/global.advanced.properties b/docs/global.advanced.properties
new file mode 100644
index 0000000..9d87c96
--- /dev/null
+++ b/docs/global.advanced.properties
@@ -0,0 +1,143 @@
+[GENERAL]
+root.dir=/var/lib/biomaj
+conf.dir=%(root.dir)s/conf
+log.dir=%(root.dir)s/log
+process.dir=%(root.dir)s/process
+cache.dir=%(root.dir)s/cache
+lock.dir=%(root.dir)s/lock
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=%(root.dir)s/data
+
+db.url=mongodb://biomaj-mongodb:27017
+db.name=biomaj
+
+use_ldap=0
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+use_elastic=1
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=elasticsearch
+elastic_index=biomaj
+# Calculate data.dir size stats
+data.stats=1
+
+celery.queue=biomaj
+celery.broker=mongodb://biomaj-mongodb:27017/biomaj_celery
+
+
+auto_publish=1
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+#mail.stmp.host=
+mail.admin=
+mail.from=biomaj@localhost
+mail.user=
+mail.password=
+mail.tls=
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#---------------------
+# PROTOCOL
+#-------------------
+#possible values : ftp, http, rsync, local
+port=21
+username=anonymous
+password=anonymous@nowhere.com
+
+
+
+#Access rights (chmod) applied to production directories
+production.directory.chmod=775
+
+#Number of threads during the download
+bank.num.threads=4
+
+#Number of threads to use for downloading and processing
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#Link copy property
+do.link.copy=true
+
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=INFO
+
+http.parse.dir.line=<a[\\s]+href=\"([\\S]+)/\".*alt=\"\\[DIR\\]\">.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})
+http.parse.file.line=<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+#Needed if data sources are contained in an archive
+log.files=true
+
+local.files.excluded=\\.panfs.*
+
+#~40mn
+ftp.timeout=2000000
+ftp.automatic.reconnect=5
+ftp.active.mode=false
+
+# Bank default access
+visibility.default=public
+
+#proxy=http://localhost:3128
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = INFO
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/docs/http.rst b/docs/http.rst
new file mode 100644
index 0000000..4d81ea2
--- /dev/null
+++ b/docs/http.rst
@@ -0,0 +1,15 @@
+.. _http:
+
+
+*****
+http
+*****
+
+
+HTTPDownloader API reference
+============================
+ .. automodule:: biomaj.download.http
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..91c6eb1
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,44 @@
+.. BioMAJ documentation master file, created by
+ sphinx-quickstart on Mon Oct 27 08:26:18 2014.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to BioMAJ's documentation!
+==================================
+
+Getting Started Documentation:
+
+.. toctree::
+ :maxdepth: 2
+
+ examples
+ admin
+
+API Documentation:
+
+.. toctree::
+ :maxdepth: 2
+
+ bank
+ config
+ options
+ session
+ utils
+ workflow
+ interface
+ ftp
+ http
+ localcopy
+ notify
+ metaprocess
+ process
+ processfactory
+ user
+ bmajindex
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/interface.rst b/docs/interface.rst
new file mode 100644
index 0000000..97786eb
--- /dev/null
+++ b/docs/interface.rst
@@ -0,0 +1,17 @@
+.. _interface:
+
+
+*********
+interface
+*********
+
+Base interface that downloaders must extend.
+
+
+DownloadInterface API reference
+===============================
+ .. automodule:: biomaj.download.interface
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/localcopy.rst b/docs/localcopy.rst
new file mode 100644
index 0000000..7085731
--- /dev/null
+++ b/docs/localcopy.rst
@@ -0,0 +1,15 @@
+.. _localcopy:
+
+
+*********
+localcopy
+*********
+
+
+LocalDownloader API reference
+=============================
+ .. automodule:: biomaj.download.localcopy
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..e05270a
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,242 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. xml to make Docutils-native XML files
+ echo. pseudoxml to make pseudoxml-XML files for display purposes
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\BioMAJ.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\BioMAJ.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdf" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf
+ cd %BUILDDIR%/..
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdfja" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf-ja
+ cd %BUILDDIR%/..
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+if "%1" == "xml" (
+ %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The XML files are in %BUILDDIR%/xml.
+ goto end
+)
+
+if "%1" == "pseudoxml" (
+ %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+ goto end
+)
+
+:end
diff --git a/docs/metaprocess.rst b/docs/metaprocess.rst
new file mode 100644
index 0000000..2c5be97
--- /dev/null
+++ b/docs/metaprocess.rst
@@ -0,0 +1,15 @@
+.. _metaprocess:
+
+
+***********
+metaprocess
+***********
+
+
+MetaProcess API reference
+==========================
+ .. automodule:: biomaj.process.metaprocess
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/notify.rst b/docs/notify.rst
new file mode 100644
index 0000000..926f2b2
--- /dev/null
+++ b/docs/notify.rst
@@ -0,0 +1,15 @@
+.. _notify:
+
+
+******
+notify
+******
+
+
+Notify API reference
+======================
+ .. automodule:: biomaj.notify
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/options.rst b/docs/options.rst
new file mode 100644
index 0000000..9e91975
--- /dev/null
+++ b/docs/options.rst
@@ -0,0 +1,15 @@
+.. _options:
+
+
+*******
+options
+*******
+
+
+Options API reference
+======================
+ .. automodule:: biomaj.options
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/process.rst b/docs/process.rst
new file mode 100644
index 0000000..b949eda
--- /dev/null
+++ b/docs/process.rst
@@ -0,0 +1,15 @@
+.. _process:
+
+
+*******
+process
+*******
+
+
+Process API reference
+======================
+ .. automodule:: biomaj.process.process
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/processfactory.rst b/docs/processfactory.rst
new file mode 100644
index 0000000..28cd4a8
--- /dev/null
+++ b/docs/processfactory.rst
@@ -0,0 +1,15 @@
+.. _processfactory:
+
+
+**************
+processfactory
+**************
+
+
+ProcessFactory API reference
+==============================
+ .. automodule:: biomaj.process.processfactory
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..397278b
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,9 @@
+mock
+nose
+pymongo==3.2
+tabulate
+ldap3
+py-bcrypt
+drmaa
+future
+elasticsearch
diff --git a/docs/session.rst b/docs/session.rst
new file mode 100644
index 0000000..85c4a1e
--- /dev/null
+++ b/docs/session.rst
@@ -0,0 +1,15 @@
+.. _session:
+
+
+*******
+Session
+*******
+
+
+Session API reference
+======================
+ .. automodule:: biomaj.session
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/user.rst b/docs/user.rst
new file mode 100644
index 0000000..cfdb379
--- /dev/null
+++ b/docs/user.rst
@@ -0,0 +1,15 @@
+.. _user:
+
+
+***********
+Biomaj User
+***********
+
+
+BmajUser API reference
+========================
+ .. automodule:: biomaj.user
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/utils.rst b/docs/utils.rst
new file mode 100644
index 0000000..5ba47bd
--- /dev/null
+++ b/docs/utils.rst
@@ -0,0 +1,15 @@
+.. _utils:
+
+
+*****
+Utils
+*****
+
+
+Utils API reference
+====================
+ .. automodule:: biomaj.utils
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/workflow.rst b/docs/workflow.rst
new file mode 100644
index 0000000..75d8fe5
--- /dev/null
+++ b/docs/workflow.rst
@@ -0,0 +1,15 @@
+.. _workflow:
+
+
+********
+workflow
+********
+
+
+Workflows API reference
+========================
+ .. automodule:: biomaj.workflow
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fb4bdf7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+mock
+nose
+pymongo==3.2
+pycurl
+tabulate
+ldap3==1.4.0
+py-bcrypt
+drmaa
+future
+elasticsearch
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5db5cf6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,69 @@
+try:
+ from setuptools import setup, find_packages
+except ImportError:
+ from distutils.core import setup
+
+from distutils.command.install import install
+import os
+
+
+class post_install(install):
+ def run(self):
+ install.run(self)
+ from biomaj.schema_version import SchemaVersion
+ SchemaVersion.migrate_pendings()
+
+here = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(here, 'README.md')) as f:
+ README = f.read()
+with open(os.path.join(here, 'CHANGES.txt')) as f:
+ CHANGES = f.read()
+
+
+config = {
+ 'description': 'BioMAJ',
+ 'long_description': README + '\n\n' + CHANGES,
+ 'author': 'Olivier Sallou',
+ 'url': 'http://biomaj.genouest.org',
+ 'download_url': 'http://biomaj.genouest.org',
+ 'author_email': 'olivier.sallou at irisa.fr',
+ 'version': '3.0.20',
+ 'classifiers': [
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Natural Language :: English',
+ 'Operating System :: POSIX :: Linux',
+ # Indicate who your project is intended for
+ 'Intended Audience :: Science/Research',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ # Pick your license as you wish (should match "license" above)
+ 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
+ # Specify the Python versions you support here. In particular, ensure
+ # that you indicate whether you support Python 2, Python 3 or both.
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.4'
+ ],
+ 'install_requires': ['nose',
+ 'pymongo==3.2',
+ 'pycurl',
+ 'ldap3==1.4.0',
+ 'mock',
+ 'py-bcrypt',
+ 'drmaa',
+ 'future',
+ 'tabulate',
+ 'elasticsearch'],
+ 'packages': find_packages(),
+ 'include_package_data': True,
+ 'scripts': ['bin/biomaj-cli.py'],
+ 'name': 'biomaj',
+ 'cmdclass': {'install': post_install},
+}
+
+setup(**config)
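The custom cmdclass above chains a MongoDB schema migration onto the standard
install step. If that hook ever needs to be replayed by hand, the same call can
presumably be issued directly (a sketch reusing only names that appear in
setup.py above):

    # Hypothetical manual replay of the post-install hook
    from biomaj.schema_version import SchemaVersion

    SchemaVersion.migrate_pendings()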
diff --git a/tests/alu.properties b/tests/alu.properties
new file mode 100644
index 0000000..0e729e0
--- /dev/null
+++ b/tests/alu.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+db.name=alu
+db.type=nucleic_protein
+
+offline.dir.name=offline/ncbi/blast/alu_tmp
+dir.version=ncbi/blast/alu
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=ftp
+server=ftp.ncbi.nih.gov
+remote.dir=/blast/db/FASTA/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^alu.*\.gz$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
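A minimal sketch of driving this bank from Python, mirroring the calls
exercised in tests/biomaj_tests.py further down in this commit (the
global.properties path is an assumption):

    from biomaj.config import BiomajConfig
    from biomaj.bank import Bank

    BiomajConfig.load_config('/etc/biomaj/global.properties')  # path assumed
    bank = Bank('alu')
    if bank.update():
        bank.publish()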
diff --git a/tests/bank/process/test.sh b/tests/bank/process/test.sh
new file mode 100755
index 0000000..2d510e2
--- /dev/null
+++ b/tests/bank/process/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo "Testing a process"
+
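+# Each ##BIOMAJ# line below declares metadata for the bank session, roughly:
+#   ##BIOMAJ#<format>#<type>#<tag:value pairs, comma separated>#<produced files, comma separated>
+# (field layout inferred from the echo lines in this script; treat it as an assumption)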
+echo "test meta data"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr1#blast/chr1/chr1db"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr2#blast/chr2/chr2db"
+
+echo "test meta data 2"
+
+echo "##BIOMAJ#fasta#nucleic#organism:hg19#fasta/chr1.fa,fasta/chr2.fa"
diff --git a/tests/bank/test.fasta.gz b/tests/bank/test.fasta.gz
new file mode 100644
index 0000000..666d6f2
Binary files /dev/null and b/tests/bank/test.fasta.gz differ
diff --git a/tests/bank/test2.fasta b/tests/bank/test2.fasta
new file mode 100644
index 0000000..410ca0f
--- /dev/null
+++ b/tests/bank/test2.fasta
@@ -0,0 +1,2 @@
+>test2
+gcgcgcgcgcgcgcgccgcgcgcgcgcgcgcggc
diff --git a/tests/bank/test_100.txt b/tests/bank/test_100.txt
new file mode 100644
index 0000000..c7f7c3b
--- /dev/null
+++ b/tests/bank/test_100.txt
@@ -0,0 +1 @@
+This is a sample file to extract Release 103 from a text file
diff --git a/tests/biomaj_tests.py b/tests/biomaj_tests.py
new file mode 100644
index 0000000..6c1c74a
--- /dev/null
+++ b/tests/biomaj_tests.py
@@ -0,0 +1,1305 @@
+from nose.tools import *
+from nose.plugins.attrib import attr
+
+import json
+import shutil
+import os
+import tempfile
+import logging
+import copy
+import stat
+import time
+
+from mock import patch
+
+from optparse import OptionParser
+
+
+from biomaj.bank import Bank
+from biomaj.session import Session
+from biomaj.workflow import Workflow, UpdateWorkflow
+from biomaj.utils import Utils
+from biomaj.download.ftp import FTPDownload
+from biomaj.download.direct import DirectFTPDownload, DirectHttpDownload
+from biomaj.download.http import HTTPDownload
+from biomaj.download.localcopy import LocalDownload
+from biomaj.download.downloadthreads import DownloadThread
+from biomaj.config import BiomajConfig
+from biomaj.process.processfactory import PostProcessFactory,PreProcessFactory,RemoveProcessFactory
+from biomaj.user import BmajUser
+from biomaj.bmajindex import BmajIndex
+
+from ldap3.core.exceptions import LDAPBindError
+
+
+import unittest
+
+class UtilsForTest():
+ """
+ Copy properties files to a temp directory and update properties to
+ use a temp directory
+ """
+
+ def __init__(self):
+ """
+ Setup the temp dirs and files.
+ """
+ self.global_properties = None
+ self.bank_properties = None
+
+ self.test_dir = tempfile.mkdtemp('biomaj')
+
+ self.conf_dir =os.path.join(self.test_dir,'conf')
+ if not os.path.exists(self.conf_dir):
+ os.makedirs(self.conf_dir)
+ self.data_dir =os.path.join(self.test_dir,'data')
+ if not os.path.exists(self.data_dir):
+ os.makedirs(self.data_dir)
+ self.log_dir =os.path.join(self.test_dir,'log')
+ if not os.path.exists(self.log_dir):
+ os.makedirs(self.log_dir)
+ self.process_dir =os.path.join(self.test_dir,'process')
+ if not os.path.exists(self.process_dir):
+ os.makedirs(self.process_dir)
+ self.lock_dir =os.path.join(self.test_dir,'lock')
+ if not os.path.exists(self.lock_dir):
+ os.makedirs(self.lock_dir)
+ self.cache_dir =os.path.join(self.test_dir,'cache')
+ if not os.path.exists(self.cache_dir):
+ os.makedirs(self.cache_dir)
+
+
+ if self.global_properties is None:
+ self.__copy_global_properties()
+
+ if self.bank_properties is None:
+ self.__copy_test_bank_properties()
+
+ def clean(self):
+ """
+ Deletes temp directory
+ """
+ shutil.rmtree(self.test_dir)
+
+ def __copy_test_bank_properties(self):
+ if self.bank_properties is not None:
+ return
+ self.bank_properties = ['alu', 'local', 'testhttp','directhttp']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ for b in self.bank_properties:
+ from_file = os.path.join(curdir, b+'.properties')
+ to_file = os.path.join(self.conf_dir, b+'.properties')
+ shutil.copyfile(from_file, to_file)
+
+ self.bank_process = ['test.sh']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ procdir = os.path.join(curdir, 'bank/process')
+ for proc in self.bank_process:
+ from_file = os.path.join(procdir, proc)
+ to_file = os.path.join(self.process_dir, proc)
+ shutil.copyfile(from_file, to_file)
+ os.chmod(to_file, stat.S_IRWXU)
+
+ # Manage local bank test, use bank test subdir as remote
+ properties = ['multi.properties', 'computederror.properties', 'error.properties', 'local.properties', 'localprocess.properties', 'testhttp.properties', 'computed.properties', 'computed2.properties', 'sub1.properties', 'sub2.properties']
+ for prop in properties:
+ from_file = os.path.join(curdir, prop)
+ to_file = os.path.join(self.conf_dir, prop)
+ fout = open(to_file,'w')
+ with open(from_file,'r') as fin:
+ for line in fin:
+ if line.startswith('remote.dir'):
+ fout.write("remote.dir="+os.path.join(curdir,'bank')+"\n")
+ elif line.startswith('remote.files'):
+ fout.write(line.replace('/tmp', os.path.join(curdir,'bank')))
+ else:
+ fout.write(line)
+ fout.close()
+
+ def __copy_global_properties(self):
+ if self.global_properties is not None:
+ return
+ self.global_properties = os.path.join(self.conf_dir,'global.properties')
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ global_template = os.path.join(curdir,'global.properties')
+ fout = open(self.global_properties,'w')
+ with open(global_template,'r') as fin:
+ for line in fin:
+ if line.startswith('conf.dir'):
+ fout.write("conf.dir="+self.conf_dir+"\n")
+ elif line.startswith('log.dir'):
+ fout.write("log.dir="+self.log_dir+"\n")
+ elif line.startswith('data.dir'):
+ fout.write("data.dir="+self.data_dir+"\n")
+ elif line.startswith('process.dir'):
+ fout.write("process.dir="+self.process_dir+"\n")
+ elif line.startswith('lock.dir'):
+ fout.write("lock.dir="+self.lock_dir+"\n")
+ else:
+ fout.write(line)
+ fout.close()
+
+
+class TestBiomajUtils(unittest.TestCase):
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+
+ def test_mimes(self):
+ fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),'bank/test2.fasta')
+ (mime, encoding) = Utils.detect_format(fasta_file)
+ self.assertTrue('application/fasta' == mime)
+
+ @attr('compress')
+ def test_uncompress(self):
+ from_file = { 'root': os.path.dirname(os.path.realpath(__file__)),
+ 'name': 'bank/test.fasta.gz'
+ }
+
+ to_dir = self.utils.data_dir
+ Utils.copy_files([from_file], to_dir)
+ Utils.uncompress(os.path.join(to_dir, from_file['name']))
+ self.assertTrue(os.path.exists(to_dir+'/bank/test.fasta'))
+
+ def test_copy_with_regexp(self):
+ from_dir = os.path.dirname(os.path.realpath(__file__))
+ to_dir = self.utils.data_dir
+ Utils.copy_files_with_regexp(from_dir, to_dir, ['.*\.py'])
+ self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
+
+ def test_copy(self):
+ from_dir = os.path.dirname(os.path.realpath(__file__))
+ local_file = 'biomaj_tests.py'
+ files_to_copy = [ {'root': from_dir, 'name': local_file}]
+ to_dir = self.utils.data_dir
+ Utils.copy_files(files_to_copy, to_dir)
+ self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
+
+class TestBiomajLocalDownload(unittest.TestCase):
+ """
+ Test Local downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ self.curdir = os.path.dirname(os.path.realpath(__file__))
+ self.examples = os.path.join(self.curdir,'bank') + '/'
+
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ """
+ if not os.path.exists('/tmp/biomaj/config'):
+ os.makedirs('/tmp/biomaj/config')
+ if not os.path.exists(os.path.join('/tmp/biomaj/config','local.properties')):
+ shutil.copyfile(os.path.join(self.curdir,'local.properties'),
+ os.path.join('/tmp/biomaj/config','local.properties'))
+ flocal = open(os.path.join('/tmp/biomaj/config','local.properties'),'a')
+ flocal.write('\nremote.dir='+self.examples+"\n")
+ flocal.close()
+ """
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_local_list(self):
+ locald = LocalDownload(self.examples)
+ (file_list, dir_list) = locald.list()
+ locald.close()
+ self.assertTrue(len(file_list) > 1)
+
+ def test_local_download(self):
+ locald = LocalDownload(self.examples)
+ (file_list, dir_list) = locald.list()
+ locald.match([r'^test.*\.gz$'], file_list, dir_list)
+ locald.download(self.utils.data_dir)
+ locald.close()
+ self.assertTrue(len(locald.files_to_download) == 1)
+
+ def test_local_download_in_subdir(self):
+ locald = LocalDownload(self.curdir+'/')
+ (file_list, dir_list) = locald.list()
+ locald.match([r'^/bank/test.*\.gz$'], file_list, dir_list)
+ locald.download(self.utils.data_dir)
+ locald.close()
+ self.assertTrue(len(locald.files_to_download) == 1)
+
+ def test_parallel_local_download(self):
+ locald = LocalDownload(self.examples)
+ (file_list, dir_list) = locald.list()
+ locald.match([r'^test'], file_list, dir_list)
+ list1 = [locald.files_to_download[0]]
+ list2 = locald.files_to_download[1:]
+ locald.close()
+
+ locald1 = LocalDownload(self.examples)
+ locald1.files_to_download = list1
+ locald2 = LocalDownload(self.examples)
+ locald2.files_to_download = list2
+ t1 = DownloadThread(locald1, self.utils.data_dir)
+ t2 = DownloadThread(locald2, self.utils.data_dir)
+ t1.start()
+ t2.start()
+ t1.join()
+ t2.join()
+ self.assertTrue(len(t1.downloader.files_to_download) == 1)
+ self.assertTrue(os.path.exists(self.utils.data_dir + '/' +list1[0]['name']))
+ self.assertTrue(len(t2.downloader.files_to_download) == 2)
+ self.assertTrue(os.path.exists(self.utils.data_dir + '/' +list2[0]['name']))
+ self.assertTrue(os.path.exists(self.utils.data_dir + '/' +list2[1]['name']))
+
+@attr('network')
+@attr('http')
+class TestBiomajHTTPDownload(unittest.TestCase):
+ """
+ Test HTTP downloader
+ """
+ def setUp(self):
+ self.utils = UtilsForTest()
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+ self.config = BiomajConfig('testhttp')
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_http_list(self):
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.config)
+ (file_list, dir_list) = httpd.list()
+ httpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_http_list_dateregexp(self):
+ self.config.set('http.parse.file.date.format',"%%d-%%b-%%Y %%H:%%M")
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.config)
+ (file_list, dir_list) = httpd.list()
+ httpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_http_download(self):
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.config)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+ def test_http_download_in_subdir(self):
+ httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/', self.config)
+ (file_list, dir_list) = httpd.list()
+ httpd.match([r'^dists/README$'], file_list, dir_list)
+ httpd.download(self.utils.data_dir)
+ httpd.close()
+ self.assertTrue(len(httpd.files_to_download) == 1)
+
+
+@attr('directftp')
+@attr('network')
+class TestBiomajDirectFTPDownload(unittest.TestCase):
+ """
+ Test DirectFTP downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_ftp_list(self):
+ file_list = ['/blast/db/FASTA/alu.n.gz.md5']
+ ftpd = DirectFTPDownload('ftp', 'ftp.ncbi.nih.gov', '', file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) == 1)
+
+ def test_download(self):
+ file_list = ['/blast/db/FASTA/alu.n.gz.md5']
+ ftpd = DirectFTPDownload('ftp', 'ftp.ncbi.nih.gov', '', file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'alu.n.gz.md5')))
+
+
+@attr('directhttp')
+@attr('network')
+class TestBiomajDirectHTTPDownload(unittest.TestCase):
+ """
+  Test DirectHttp downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_http_list(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '', file_list)
+ fday = ftpd.files_to_download[0]['day']
+ fmonth = ftpd.files_to_download[0]['month']
+ fyear = ftpd.files_to_download[0]['year']
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) == 1)
+ self.assertTrue(file_list[0]['size']!=0)
+ self.assertFalse(fyear == ftpd.files_to_download[0]['year'] and fmonth == ftpd.files_to_download[0]['month'] and fday == ftpd.files_to_download[0]['day'])
+
+ def test_download(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '', file_list)
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'README.html')))
+
+ def test_download_get_params_save_as(self):
+ file_list = ['/get']
+ ftpd = DirectHttpDownload('http', 'httpbin.org', '', file_list)
+ ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
+ ftpd.save_as = 'test.json'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.json')))
+ with open(os.path.join(self.utils.data_dir,'test.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['args']['key1'] == 'value1')
+
+ def test_download_save_as(self):
+ file_list = ['/debian/README.html']
+ ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '', file_list)
+ ftpd.save_as = 'test.html'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.html')))
+
+ def test_download_post_params(self):
+ #file_list = ['/debian/README.html']
+ file_list = ['/post']
+ ftpd = DirectHttpDownload('http', 'httpbin.org', '', file_list)
+ #ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '', file_list)
+ ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
+ ftpd.save_as = 'test.json'
+ ftpd.method = 'POST'
+ (file_list, dir_list) = ftpd.list()
+ ftpd.download(self.utils.data_dir, False)
+ ftpd.close()
+ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'test.json')))
+ with open(os.path.join(self.utils.data_dir,'test.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['form']['key1'] == 'value1')
+
+
+@attr('ftp')
+@attr('network')
+class TestBiomajFTPDownload(unittest.TestCase):
+ """
+ Test FTP downloader
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+
+ def tearDown(self):
+ self.utils.clean()
+
+ def test_ftp_list(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
+ (file_list, dir_list) = ftpd.list()
+ ftpd.close()
+ self.assertTrue(len(file_list) > 1)
+
+ def test_download(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/db/FASTA/')
+ (file_list, dir_list) = ftpd.list()
+ ftpd.match([r'^alu.*\.gz$'], file_list, dir_list)
+ ftpd.download(self.utils.data_dir)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download) == 2)
+
+
+ def test_download_in_subdir(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/')
+ (file_list, dir_list) = ftpd.list()
+ ftpd.match([r'^db/FASTA/alu.*\.gz$'], file_list, dir_list)
+ ftpd.download(self.utils.data_dir)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download) == 2)
+
+ def test_download_or_copy(self):
+ ftpd = FTPDownload('ftp', 'ftp.ncbi.nih.gov', '/blast/')
+ ftpd.files_to_download = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test2', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
+ ]
+ available_files = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test12', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test3', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 20},
+ {'name':'/test/test11', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}
+ ]
+ ftpd.download_or_copy(available_files, '/biomaj', False)
+ ftpd.close()
+ self.assertTrue(len(ftpd.files_to_download)==2)
+ self.assertTrue(len(ftpd.files_to_copy)==2)
+
+ def test_get_more_recent_file(self):
+ files = [
+ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test2', 'year': '2013', 'month': '11', 'day': '12', 'size': 10},
+ {'name':'/test/test1', 'year': '1988', 'month': '11', 'day': '10', 'size': 10},
+ {'name':'/test/test11', 'year': '2013', 'month': '9', 'day': '23', 'size': 10}
+ ]
+ release = Utils.get_more_recent_file(files)
+ self.assertTrue(release['year']=='2013')
+ self.assertTrue(release['month']=='11')
+ self.assertTrue(release['day']=='12')
+
+class TestBiomajSetup(unittest.TestCase):
+
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ # Delete all banks
+ b = Bank('alu')
+ b.banks.remove({})
+
+ self.config = BiomajConfig('alu')
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'alu.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+
+ def tearDown(self):
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'alu.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+ self.utils.clean()
+
+ def test_new_bank(self):
+ """
+ Checks bank init
+ """
+ b = Bank('alu')
+
+ def test_new_session(self):
+ """
+ Checks an empty session is created
+ """
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ for key in b.session._session['status'].keys():
+ self.assertFalse(b.session.get_status(key))
+
+ def test_session_reload_notover(self):
+ """
+ Checks a session is used if present
+ """
+ b = Bank('alu')
+ for i in range(1, 5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ self.assertTrue(b.session.get_status(Workflow.FLOW_INIT))
+
+ def test_clean_old_sessions(self):
+ """
+    Checks old sessions are cleaned up after an update
+ """
+ b = Bank('local')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+ b2 = Bank('local')
+ b2.update()
+ b2.clean_old_sessions()
+ self.assertTrue(len(b2.bank['sessions']) == 1)
+
+ def test_session_reload_over(self):
+ """
+    Checks that a finished (over) session is not reused
+ """
+ b = Bank('alu')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ s._session['status'][Workflow.FLOW_OVER] = True
+ b.session = s
+ b.save_session()
+
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ self.assertFalse(b.session.get_status(Workflow.FLOW_INIT))
+
+ def test_bank_list(self):
+ b1 = Bank('alu')
+ b2 = Bank('local')
+ banks = Bank.list()
+ self.assertTrue(len(banks) == 2)
+
+ @attr('network')
+ def test_get_release(self):
+ """
+ Get release
+ """
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ res = b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(res)
+ self.assertTrue(b.session._session['release'] is not None)
+
+ def test_remove_session(self):
+ b = Bank('alu')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+ self.assertTrue(len(b.bank['sessions'])==4)
+ b.remove_session(b.session.get('id'))
+ self.assertTrue(len(b.bank['sessions'])==3)
+
+ @attr('process')
+ def test_postprocesses_setup(self):
+ b = Bank('localprocess')
+ pfactory = PostProcessFactory(b)
+ pfactory.run(True)
+ self.assertTrue(len(pfactory.threads_tasks[0])==2)
+ self.assertTrue(len(pfactory.threads_tasks[1])==1)
+
+ @attr('process')
+ def test_postprocesses_exec_again(self):
+ """
+ Execute once, set a status to false, check that False processes are executed
+ """
+ b = Bank('localprocess')
+ pfactory = PostProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.blocks['BLOCK1']['META0']['PROC0'])
+ self.assertTrue(pfactory.blocks['BLOCK2']['META1']['PROC1'])
+ self.assertTrue(pfactory.blocks['BLOCK2']['META1']['PROC2'])
+ blocks = copy.deepcopy(pfactory.blocks)
+ blocks['BLOCK2']['META1']['PROC2'] = False
+ pfactory2 = PostProcessFactory(b, blocks)
+ pfactory2.run()
+ self.assertTrue(pfactory2.blocks['BLOCK2']['META1']['PROC2'])
+
+ @attr('process')
+ def test_preprocesses(self):
+ b = Bank('localprocess')
+ pfactory = PreProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.meta_status['META0']['PROC0'])
+
+ @attr('process')
+ def test_removeprocesses(self):
+ b = Bank('localprocess')
+ pfactory = RemoveProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.meta_status['META0']['PROC0'])
+
+ def test_dependencies_list(self):
+ b = Bank('computed')
+ deps = b.get_dependencies()
+ self.assertTrue(len(deps)==2)
+
+class TestBiomajFunctional(unittest.TestCase):
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ #Delete all banks
+ b = Bank('local')
+ b.banks.remove({})
+
+ self.config = BiomajConfig('local')
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+
+ def tearDown(self):
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+ self.utils.clean()
+
+ def test_extract_release_from_file_name(self):
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_(\d+)\.txt')
+ b.session.config.set('release.regexp', '')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100')
+
+ def test_extract_release_from_file_content(self):
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_100\.txt')
+ b.session.config.set('release.regexp', 'Release\s*(\d+)')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '103')
+
+ def test_publish(self):
+ """
+ Update a bank, then publish it
+ """
+ b = Bank('local')
+ b.update()
+ current_link = os.path.join(b.config.get('data.dir'),
+ b.config.get('dir.version'),
+ 'current')
+ self.assertFalse(os.path.exists(current_link))
+ self.assertTrue(b.bank['current'] is None)
+ b.publish()
+ self.assertTrue(os.path.exists(current_link))
+ self.assertTrue(b.bank['current'] == b.session._session['id'])
+
+ # Should test this on local downloader, changing 1 file to force update,
+ # else we would get same bank and there would be no update
+ def test_no_update(self):
+ """
+    Try updating twice; the second time, the bank should not be updated
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.update()
+ self.assertFalse(b.session.get('update'))
+ self.assertFalse(b.session.get_status(Workflow.FLOW_POSTPROCESS))
+
+ @attr('remotelist')
+ def test_download_from_list(self):
+ """
+ Use remote.list to define a list of files to download
+ """
+ b = Bank('local')
+ fd, file_path = tempfile.mkstemp()
+ try:
+ b.config.set('remote.list', file_path)
+ with os.fdopen(fd, 'w') as tmp:
+ tmp.write('[{"name": "test_100.txt", "root": "' + b.config.get('remote.dir') + '"}]')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ finally:
+ #os.remove(file_path)
+ print(file_path)
+
+
+ @attr('release')
+ def test_release_control(self):
+ """
+    Try updating twice; the second time, modify one file (same date):
+    the bank should update
+ """
+ b = Bank('local')
+ b.update()
+ b.session.config.set('keep.old.version', '3')
+ self.assertTrue(b.session.get('update'))
+ remote_file = b.session.config.get('remote.dir') + 'test2.fasta'
+ os.utime(remote_file, None)
+ # Update test2.fasta and set release.control
+ b.session.config.set('release.control', 'true')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.update()
+ self.assertFalse(b.session.get('update'))
+ b.session.config.set('remote.files', '^test2.fasta')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+
+ def test_fromscratch_update(self):
+ """
+    Try updating twice; the second time, the bank should be updated (forced with fromscratch)
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ sess = b.session.get('release')
+ b.options.fromscratch = True
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertEqual(b.session.get('release'), sess+'__1')
+
+
+ def test_fromscratch_update_with_release(self):
+ """
+    Try updating twice; the second time, the bank should be updated (forced with fromscratch)
+
+ Use case with release defined in release file
+ """
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_(\d+)\.txt')
+ b.session.config.set('release.regexp', '')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100')
+ os.makedirs(b.session.get_full_release_directory())
+ w = UpdateWorkflow(b)
+ # Reset release
+ b.session.set('release', None)
+ w.options.fromscratch = True
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100__1')
+
+
+ def test_mix_stop_from_task(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+ def test_mix_stop_from_task2(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ b2.options.from_task = 'download'
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+ def test_mix_stop_from_task3(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ b2.options.from_task = 'postprocess'
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+
+ def test_mix_stop_from_task4(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_before = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ b3 = Bank('local')
+ b3.options.from_task = 'postprocess'
+ res = b3.update()
+ self.assertFalse(res)
+
+
+ def test_delete_old_dirs(self):
+ """
+ Try updating 3 times, oldest dir should be removed
+ """
+ b = Bank('local')
+ b.removeAll(True)
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.options.fromscratch = True
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(len(b.bank['production']) == 2)
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ # one new dir, but olders must be deleted
+ self.assertTrue(len(b.bank['production']) == 2)
+
+ def test_delete_old_dirs_with_freeze(self):
+ """
+    Try updating 3 times; the oldest dir should be removed, but not frozen releases
+ """
+ b = Bank('local')
+ b.removeAll(True)
+ b = Bank('local')
+ b.update()
+ b.freeze(b.session.get('release'))
+ self.assertTrue(b.session.get('update'))
+ b.options.fromscratch = True
+ b.update()
+ b.freeze(b.session.get('release'))
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(len(b.bank['production']) == 2)
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ # one new dir, but olders must be deleted
+ self.assertTrue(len(b.bank['production']) == 3)
+
+
+ def test_removeAll(self):
+ b = Bank('local')
+ b.update()
+ b.removeAll()
+ self.assertFalse(os.path.exists(b.get_data_dir()))
+ bdb = b.banks.find_one({'name': b.name})
+ self.assertTrue(bdb is None)
+
+ def test_remove(self):
+ """
+ test removal of a production dir
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()))
+ self.assertTrue(len(b.bank['production'])==1)
+ b.remove(b.session.get('release'))
+ self.assertFalse(os.path.exists(b.session.get_full_release_directory()))
+ b = Bank('local')
+ self.assertTrue(len(b.bank['production'])==0)
+
+ def test_update_stop_after(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertTrue(b.session.get_status('download'))
+ self.assertFalse(b.session.get_status('postprocess'))
+
+ def test_update_stop_before(self):
+ b = Bank('local')
+ b.options.stop_before = 'postprocess'
+ b.update()
+ self.assertTrue(b.session.get_status('download'))
+ self.assertFalse(b.session.get_status('postprocess'))
+
+ def test_reupdate_from_task(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ b2.update()
+ self.assertTrue(b2.session.get_status('postprocess'))
+ self.assertEqual(b.session.get_full_release_directory(), b2.session.get_full_release_directory())
+
+ def test_reupdate_from_task_error(self):
+ b = Bank('local')
+ b.options.stop_after = 'check'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ res = b2.update()
+ self.assertFalse(res)
+
+ def test_reupdate_from_task_wrong_release(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = 'wrongrelease'
+ res = b2.update()
+ self.assertFalse(res)
+
+ @attr('process')
+ def test_postprocesses_restart_from_proc(self):
+ b = Bank('localprocess')
+ b.update()
+ proc1file = os.path.join(b.session.get_full_release_directory(),'proc1.txt')
+ proc2file = os.path.join(b.session.get_full_release_directory(),'proc2.txt')
+ self.assertTrue(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+ os.remove(proc1file)
+ os.remove(proc2file)
+ # Restart from postprocess, reexecute all processes
+ b2 = Bank('localprocess')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ b2.update()
+ self.assertTrue(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+ os.remove(proc1file)
+ os.remove(proc2file)
+ # Restart from postprocess, but at process PROC2 and following
+ b3 = Bank('localprocess')
+ b3.options.from_task = 'postprocess'
+ b3.options.process = 'PROC2'
+ b3.options.release = b.session.get('release')
+ b3.update()
+ #self.assertFalse(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+
+ def test_computed(self):
+ b = Bank('computed')
+ res = b.update(True)
+ self.assertTrue(res)
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/sub1/flat/test_100.txt'))
+ self.assertTrue(b.session.get('update'))
+ # Check that, with depends non updated, bank is not updated itself
+ nextb = Bank('computed')
+ res = nextb.update(True)
+ self.assertFalse(nextb.session.get('update'))
+
+
+ @attr('nofile')
+ def test_computed_nofile(self):
+ b = Bank('computed2')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('protocol', 'none')
+ b.session.config.set('sub1.files.move', 'flat/test_.*')
+ res = b.update(True)
+ self.assertTrue(res)
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/sub1/flat/test_100.txt'))
+
+
+ def test_computed_ref_release(self):
+ b = Bank('computed2')
+ res = b.update(True)
+ b2 = Bank('sub1')
+ b2release = b2.bank['production'][len(b2.bank['production'])-1]['release']
+ brelease = b.bank['production'][len(b.bank['production'])-1]['release']
+ self.assertTrue(res)
+ self.assertTrue(brelease == b2release)
+
+ @attr('computed')
+  def test_computed_no_update(self):
+ b = Bank('computed2')
+ res = b.update(True)
+ self.assertTrue(b.session.get('update'))
+ b2 = Bank('computed2')
+ res = b2.update(True)
+ self.assertFalse(b2.session.get('update'))
+
+ def test_computederror(self):
+ b = Bank('computederror')
+ res = b.update(True)
+ self.assertFalse(res)
+ self.assertTrue(b.session._session['depends']['sub2'])
+ self.assertFalse(b.session._session['depends']['error'])
+
+
+ @attr('directrelease')
+ def test_directhttp_release(self):
+ b = Bank('directhttp')
+ res = b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/flat/debian/README.html'))
+ #print str(b.session.get('release'))
+ #print str(b.session.get('remoterelease'))
+
+ @attr('network')
+ def test_multi(self):
+ b = Bank('multi')
+ res = b.update()
+ with open(os.path.join(b.session.get_full_release_directory(),'flat/test1.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['args']['key1'] == 'value1')
+ with open(os.path.join(b.session.get_full_release_directory(),'flat/test2.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['form']['key1'] == 'value1')
+
+ def test_freeze(self):
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b.freeze(rel)
+ prod = b.get_production(rel)
+ self.assertTrue(prod['freeze'] == True)
+ res = b.remove(rel)
+ self.assertTrue(res == False)
+ b.unfreeze(rel)
+ prod = b.get_production(rel)
+ self.assertTrue(prod['freeze'] == False)
+ res = b.remove(rel)
+ self.assertTrue(res == True)
+
+
+ def test_stats(self):
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ stats = Bank.get_banks_disk_usage()
+ self.assertTrue(stats[0]['size']>0)
+ for release in stats[0]['releases']:
+ if release['name'] == rel:
+ self.assertTrue(release['size']>0)
+
+
+ @attr('process')
+ def test_processes_meta_data(self):
+ b = Bank('localprocess')
+ b.update()
+ formats = b.session.get('formats')
+ self.assertTrue(len(formats['blast'])==2)
+ self.assertTrue(len(formats['test'][0]['files'])==3)
+
+ @attr('process')
+ def test_search(self):
+ b = Bank('localprocess')
+ b.update()
+ search_res = Bank.search(['blast'],[])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search([],['nucleic'])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search(['blast'],['nucleic'])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search(['blast'],['proteic'])
+ self.assertTrue(len(search_res)==0)
+
+
+ def test_owner(self):
+ """
+ test ACL with owner
+ """
+ b = Bank('local')
+ res = b.update()
+ self.assertTrue(res)
+ b.set_owner('sample')
+ b2 = Bank('local')
+    try:
+      res = b2.update()
+    except Exception:
+      return
+    self.fail('not owner, should not be allowed')
+
+@attr('elastic')
+class TestElastic(unittest.TestCase):
+ """
+ test indexing and search
+ """
+
+ def setUp(self):
+ BmajIndex.es = None
+ self.utils = UtilsForTest()
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+ if BmajIndex.do_index == False:
+ self.skipTest("Skipping indexing tests due to elasticsearch not available")
+ # Delete all banks
+ b = Bank('local')
+ b.banks.remove({})
+ BmajIndex.delete_all_bank('local')
+
+ self.config = BiomajConfig('local')
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+
+ def tearDown(self):
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+ self.utils.clean()
+ BmajIndex.delete_all_bank('test')
+
+ def test_index(self):
+ BmajIndex.do_index = True
+ prod = {
+ "data_dir" : "/tmp/test/data",
+ "formats" : {
+ "fasta" : [
+ {
+ "files" : [
+ "fasta/chr1.fa",
+ "fasta/chr2.fa"
+ ],
+ "types" : [
+ "nucleic"
+ ],
+ "tags" : {
+ "organism" : "hg19"
+ }
+ }
+ ],
+ "blast": [
+ {
+ "files" : [
+ "blast/chr1/chr1db"
+ ],
+ "types" : [
+ "nucleic"
+ ],
+ "tags" : {
+ "chr" : "chr1",
+ "organism" : "hg19"
+ }
+ }
+ ]
+
+ },
+ "freeze" : False,
+ "session" : 1416229253.930908,
+ "prod_dir" : "alu-2003-11-26",
+ "release" : "2003-11-26",
+ "types" : [
+ "nucleic"
+ ]
+ }
+
+ BmajIndex.add('test',prod, True)
+
+ query = {
+ 'query' : {
+ 'match' : {'bank': 'test'}
+ }
+ }
+ res = BmajIndex.search(query)
+ self.assertTrue(len(res)==2)
+
+
+ def test_remove_all(self):
+ self.test_index()
+ query = {
+ 'query' : {
+ 'match' : {'bank': 'test'}
+ }
+ }
+ BmajIndex.delete_all_bank('test')
+ res = BmajIndex.search(query)
+ self.assertTrue(len(res)==0)
+
+
+class MockLdapConn(object):
+
+ ldap_user = 'biomajldap'
+ ldap_user_email = 'bldap at no-reply.org'
+
+ STRATEGY_SYNC = 0
+ AUTH_SIMPLE = 0
+ STRATEGY_SYNC = 0
+ STRATEGY_ASYNC_THREADED = 0
+ SEARCH_SCOPE_WHOLE_SUBTREE = 0
+ GET_ALL_INFO = 0
+
+ @staticmethod
+ def Server(ldap_host, port, get_info):
+ return None
+
+ @staticmethod
+ def Connection(ldap_server, auto_bind=True, read_only=True, client_strategy=0, user=None, password=None, authentication=0,check_names=True):
+ if user is not None and password is not None:
+ if password == 'notest':
+ #raise ldap3.core.exceptions.LDAPBindError('no bind')
+ return None
+ return MockLdapConn(ldap_server)
+
+ def __init__(self, url=None):
+ #self.ldap_user = 'biomajldap'
+ #self.ldap_user_email = 'bldap at no-reply.org'
+ pass
+
+ def search(self, base_dn, filter, scope, attributes=[]):
+ if MockLdapConn.ldap_user in filter:
+ self.response = [{'dn': MockLdapConn.ldap_user, 'attributes': {'mail': [MockLdapConn.ldap_user_email]}}]
+ return [(MockLdapConn.ldap_user, {'mail': [MockLdapConn.ldap_user_email]})]
+ else:
+ raise Exception('no match')
+
+ def unbind(self):
+ pass
+
+
+@attr('user')
+class TestUser(unittest.TestCase):
+ """
+ Test user management
+ """
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+ self.curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ def tearDown(self):
+ self.utils.clean()
+
+ @patch('ldap3.Connection')
+ def test_get_user(self, initialize_mock):
+ mockldap = MockLdapConn()
+ initialize_mock.return_value = MockLdapConn.Connection(None, None, None, None)
+ user = BmajUser('biomaj')
+ self.assertTrue(user.user is None)
+ user.remove()
+
+ @patch('ldap3.Connection')
+ def test_create_user(self, initialize_mock):
+ mockldap = MockLdapConn()
+ initialize_mock.return_value = MockLdapConn.Connection(None, None, None, None)
+ user = BmajUser('biomaj')
+ user.create('test', 'test at no-reply.org')
+ self.assertTrue(user.user['email'] == 'test at no-reply.org')
+ user.remove()
+
+ @patch('ldap3.Connection')
+ def test_check_password(self, initialize_mock):
+ mockldap = MockLdapConn()
+ initialize_mock.return_value = MockLdapConn.Connection(None, None, None, None)
+ user = BmajUser('biomaj')
+ user.create('test', 'test at no-reply.org')
+ self.assertTrue(user.check_password('test'))
+ user.remove()
+
+
+ @patch('ldap3.Connection')
+ def test_ldap_user(self, initialize_mock):
+ mockldap = MockLdapConn()
+ initialize_mock.return_value = MockLdapConn.Connection(None, None, None, None)
+ user = BmajUser('biomajldap')
+ self.assertTrue(user.user['is_ldap'] == True)
+ self.assertTrue(user.user['_id'] is not None)
+ self.assertTrue(user.check_password('test'))
+ user.remove()
diff --git a/tests/computed.properties b/tests/computed.properties
new file mode 100644
index 0000000..214baf4
--- /dev/null
+++ b/tests/computed.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+sub1.files.move=flat/test_.*
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
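This bank declares a dependency on 'sub1'; sub1.files.move selects which files
from that dependency end up inside the computed release directory. A sketch of
the corresponding update, matching test_computed in tests/biomaj_tests.py:

    from biomaj.bank import Bank

    b = Bank('computed')
    # update(True) also updates the 'sub1' dependency; moved files are then
    # expected under <release_dir>/sub1/flat/ per the test assertions.
    b.update(True)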
diff --git a/tests/computed2.properties b/tests/computed2.properties
new file mode 100644
index 0000000..2768000
--- /dev/null
+++ b/tests/computed2.properties
@@ -0,0 +1,45 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+
+ref.release=sub1
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/computederror.properties b/tests/computederror.properties
new file mode 100644
index 0000000..ce4bae1
--- /dev/null
+++ b/tests/computederror.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed error local system bank test"
+db.name=computederror
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/computederror_tmp
+dir.version=test/computederror
+
+depends=sub2,error
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/directhttp.properties b/tests/directhttp.properties
new file mode 100644
index 0000000..30f673d
--- /dev/null
+++ b/tests/directhttp.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="directhttp system bank test"
+db.name=directhttp
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/directhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=directhttp
+server=ftp2.fr.debian.org
+
+release.protocol=directhttp
+release.server=ftp2.fr.debian.org
+release.remote.dir=/debian/README
+release.file=README
+release.regexp=([0-9.]+),
+release.file.compressed=
+
+#remote.dir=common/downloads/Current_Release/Pfalciparum3D7/fasta/data/PlasmoDB-25_Pfalciparum3D7_Genome.fasta
+#plasmo/communityDownload.do?fname=Atg3_alignment.txt
+remote.dir=/debian/README.html
+remote.files=
+
+local.files=debian/README.html
+
+## Post Process ## The files should be located in the projectfiles/process
+BLOCKS=
+
+### Deployment ###
+
+keep.old.version=1
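
The release.* keys above make release detection fetch README from ftp2.fr.debian.org over HTTP and apply release.regexp to its content. A rough sketch of that extraction with the standard library, as an assumption about the mechanism rather than the BioMAJ direct-http downloader:

    import re
    try:
        from urllib.request import urlopen   # Python 3
    except ImportError:
        from urllib2 import urlopen          # Python 2

    # Fetch release.server + release.remote.dir and take group 1 of release.regexp
    # as the remote release string.
    content = urlopen('http://ftp2.fr.debian.org/debian/README').read().decode('utf-8', 'replace')
    match = re.search(r'([0-9.]+),', content)
    if match:
        print('remote release: ' + match.group(1))
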
diff --git a/tests/error.properties b/tests/error.properties
new file mode 100644
index 0000000..2e50f00
--- /dev/null
+++ b/tests/error.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="error local system bank test"
+db.name=error
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/error_tmp
+dir.version=test/error
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/error/
+remote.files=^error.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^error.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/global.properties b/tests/global.properties
new file mode 100644
index 0000000..d09829a
--- /dev/null
+++ b/tests/global.properties
@@ -0,0 +1,123 @@
+[GENERAL]
+test=1
+conf.dir=/tmp/biomaj/config
+log.dir=/tmp/biomaj/log
+process.dir=/tmp/biomaj/process
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=/tmp/biomaj/
+lock.dir=/tmp/biomaj/lock
+cache.dir=/tmp/biomaj/cache
+
+db.url=mongodb://localhost:27017
+db.name=biomaj_test
+
+use_ldap=1
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+# Use ElasticSearch for index/search capabilities
+use_elastic=1
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj_test
+
+celery.queue=biomaj
+celery.broker=mongodb://localhost:27017/biomaj_celery
+
+# Get directory stats (can be time consuming depending on number of files etc...)
+data.stats=1
+
+# List of admin users (linux user ids, comma separated)
+admin=
+
+# Auto publish on updates (no need for the publish flag, can be overridden in the bank property file)
+auto_publish=0
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+mail.admin=
+mail.from=
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#Number of threads for processes
+bank.num.threads=2
+
+#Number of threads to use for downloading
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#----------------------
+# Release configuration
+#----------------------
+release.separator=_
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+#http.parse.dir.line=<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.dir.line=<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.file.line=<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+
+# Bank default access
+visibility.default=public
+
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = DEBUG
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
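
The http.parse.dir.line and http.parse.file.line values are regular expressions applied to each line of an HTML index page, and the http.group.* keys tell BioMAJ which capture group holds the name, the date and the size. A small illustration of the file pattern (the sample listing line is invented for the example; this is not the BioMAJ HTTP downloader):

    import re

    # Pattern copied from http.parse.file.line above; groups 1/2/3 correspond to
    # http.group.file.name, http.group.file.date and http.group.file.size.
    file_line = re.compile(
        r'<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]'
        r'<a[\s]+href="([\S]+)".*'
        r'([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})')

    # Invented Apache-style listing line, just to show the groups.
    sample = ('<img src="/icons/compressed.gif" alt="[   ]"> '
              '<a href="test.fasta.gz">test.fasta.gz</a>   12-Mar-2016 10:24  45')

    m = file_line.search(sample)
    if m:
        print('name=%s date=%s size=%s' % (m.group(1), m.group(2), m.group(3)))
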
diff --git a/tests/local.properties b/tests/local.properties
new file mode 100644
index 0000000..7f6f5fd
--- /dev/null
+++ b/tests/local.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/locallist.properties b/tests/locallist.properties
new file mode 100644
index 0000000..a901b2c
--- /dev/null
+++ b/tests/locallist.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=locallist
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/locallist_tmp
+dir.version=test/locallist
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=
+remote.files=
+remote.files.list=true
+remote.files.1.path=/tmp/test.fasta.gz
+remote.files.2.path=/tmp/test2.fasta
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
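
Instead of a remote.files pattern, this bank enumerates its inputs explicitly: remote.files.list=true switches to list mode and the numbered remote.files.N.path keys name the files. A tiny sketch of walking that numbered list with ConfigParser (illustrative only, not the BioMAJ implementation):

    try:
        from configparser import ConfigParser
    except ImportError:
        from ConfigParser import SafeConfigParser as ConfigParser

    config = ConfigParser()
    config.read('tests/locallist.properties')

    paths = []
    if config.get('GENERAL', 'remote.files.list') == 'true':
        index = 1
        # Collect remote.files.1.path, remote.files.2.path, ... until the numbering stops.
        while config.has_option('GENERAL', 'remote.files.%d.path' % index):
            paths.append(config.get('GENERAL', 'remote.files.%d.path' % index))
            index += 1

    print(paths)   # expected: ['/tmp/test.fasta.gz', '/tmp/test2.fasta']
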
diff --git a/tests/localprocess.properties b/tests/localprocess.properties
new file mode 100644
index 0000000..7166186
--- /dev/null
+++ b/tests/localprocess.properties
@@ -0,0 +1,100 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Pre process
+db.pre.process=META0
+
+## Remove process
+db.remove.process=META0
+
+## Post Process ## The files should be located in the projectfiles/process directory
+BLOCKS=BLOCK1,BLOCK2
+BLOCK1.db.post.process=META0
+BLOCK2.db.post.process=META1,META2,META3
+META0=PROC0
+META1=PROC1,PROC2
+META2=PROC3
+META3=PROC4,PROC5
+
+
+PROC0.name=test0
+PROC0.desc=sample test
+PROC0.cluster=false
+PROC0.type=test
+PROC0.exe=echo
+PROC0.args=test $datadir
+
+PROC1.name=test1
+PROC1.desc=sample test
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=touch
+PROC1.args=$datadir/$dirversion/$localrelease/proc1.txt
+
+PROC2.name=test2
+PROC2.desc=sample test
+PROC2.cluster=false
+PROC2.type=test
+PROC2.exe=touch
+PROC2.args=$datadir/$dirversion/$localrelease/proc2.txt
+
+PROC3.name=test3
+PROC3.desc=sample test
+PROC3.cluster=false
+PROC3.type=test
+PROC3.exe=echo
+PROC3.args=test 3
+
+PROC4.name=test4
+PROC4.desc=sample test
+PROC4.cluster=false
+PROC4.type=test
+PROC4.exe=echo
+PROC4.args=test 4
+
+PROC5.name=test5
+PROC5.desc=sample test
+PROC5.cluster=false
+PROC5.type=testmetadata
+PROC5.exe=test.sh
+PROC5.args=
+PROC5.format=test
+PROC5.types=any
+PROC5.tags=chr:chr1,organism:hg19
+# If files is set, then the post-process does not have to print generated files on STDOUT (but can)
+# in this case, the list of files will be extracted from this list with above format/types/tags
+PROC5.files=dir1/file1,dir1/file2,dir1/file3
+
+### Deployment ###
+
+keep.old.version=1
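
The process configuration above is a three-level hierarchy: BLOCKS names the blocks, each BLOCKx.db.post.process lists metaprocesses, and each META key maps to the PROCn definitions (name, exe, args, and optionally format/types/tags/files). A sketch that only flattens that hierarchy for inspection; the real execution logic lives in biomaj.process.processfactory:

    try:
        from configparser import ConfigParser
    except ImportError:
        from ConfigParser import SafeConfigParser as ConfigParser

    config = ConfigParser()
    config.read('tests/localprocess.properties')

    def get_list(option):
        # Comma-separated values, e.g. BLOCKS=BLOCK1,BLOCK2
        return [v.strip() for v in config.get('GENERAL', option).split(',') if v.strip()]

    for block in get_list('BLOCKS'):
        for meta in get_list(block + '.db.post.process'):
            for proc in get_list(meta):
                exe = config.get('GENERAL', proc + '.exe')
                args = config.get('GENERAL', proc + '.args')
                print('%s / %s / %s: %s %s' % (block, meta, proc, exe, args))
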
diff --git a/tests/multi.properties b/tests/multi.properties
new file mode 100644
index 0000000..82e08f9
--- /dev/null
+++ b/tests/multi.properties
@@ -0,0 +1,60 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for multi protocol
+db.name=multi
+db.type=test
+
+offline.dir.name=offline/multi_tmp
+dir.version=multi
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=multi
+server=
+remote.dir=
+
+remote.file.0.protocol = directhttp
+remote.file.0.server = httpbin.org
+remote.file.0.path = /get
+remote.file.0.params.keys = key1,key2
+remote.file.0.params.key1 = value1
+remote.file.0.params.key2 = value2
+remote.file.0.name = test1.json
+
+remote.file.1.protocol = directhttp
+remote.file.1.method = POST
+remote.file.1.server = httpbin.org
+remote.file.1.path = /post
+remote.file.1.params.keys = key1,key2
+remote.file.1.params.key1 = value1
+remote.file.1.params.key2 = value2
+remote.file.1.name = test2.json
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
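
With protocol=multi, every download is described by numbered remote.file.N.* keys: protocol, server and path, an optional method, and a params.keys list whose entries name the matching params.<key> options. A hedged sketch of turning one entry into a request description (the GET default and the dictionary layout are assumptions for the example; the actual handling is in the BioMAJ multi/direct download code):

    try:
        from configparser import ConfigParser
    except ImportError:
        from ConfigParser import SafeConfigParser as ConfigParser

    config = ConfigParser()
    config.read('tests/multi.properties')

    def file_entry(index):
        prefix = 'remote.file.%d.' % index
        def get(key, default=''):
            option = prefix + key
            if config.has_option('GENERAL', option):
                return config.get('GENERAL', option)
            return default
        params = {}
        for key in [k.strip() for k in get('params.keys').split(',') if k.strip()]:
            params[key] = get('params.' + key)
        return {'method': get('method', 'GET'), 'server': get('server'),
                'path': get('path'), 'name': get('name'), 'params': params}

    print(file_entry(0))   # GET on httpbin.org/get with key1/key2, saved as test1.json
    print(file_entry(1))   # same parameters, but sent as a POST to /post
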
diff --git a/tests/sub1.properties b/tests/sub1.properties
new file mode 100644
index 0000000..8e0c69b
--- /dev/null
+++ b/tests/sub1.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local1
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local1_tmp
+dir.version=test/local1
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/sub2.properties b/tests/sub2.properties
new file mode 100644
index 0000000..b9d3142
--- /dev/null
+++ b/tests/sub2.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local2
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local2_tmp
+dir.version=test/local2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/testhttp.properties b/tests/testhttp.properties
new file mode 100644
index 0000000..454753e
--- /dev/null
+++ b/tests/testhttp.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for http protocol
+db.name=testhttp
+db.type=package
+
+offline.dir.name=offline/testhttp_tmp
+dir.version=testhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=http
+server=ftp2.fr.debian.org
+remote.dir=/debian/dists/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tools/examples/alu.properties b/tools/examples/alu.properties
new file mode 100644
index 0000000..b4ce85a
--- /dev/null
+++ b/tools/examples/alu.properties
@@ -0,0 +1,51 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+db.name=alu
+db.type=nucleic_protein
+
+offline.dir.name=offline/ncbi/blast/alu_tmp
+dir.version=ncbi/blast/alu
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=ftp
+server=ftp.ncbi.nih.gov
+remote.dir=/blast/db/FASTA/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^alu.*\.gz$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+BLOCKS=BLOCK1
+BLOCK1.db.post.process=META0
+META0=PROC1
+
+PROC1.name=scanflatdir
+PROC1.desc=scan the bank flat directory to detect available files
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=scan.py
+PROC1.args=--scan $datadir/$dirversion/$localrelease --type=nucleic --tags="organism:human"
+
+
+
+### Deployment ###
+# Always keep previous version
+keep.old.version=1
diff --git a/tools/examples/global.properties b/tools/examples/global.properties
new file mode 100644
index 0000000..677c9f5
--- /dev/null
+++ b/tools/examples/global.properties
@@ -0,0 +1,115 @@
+[GENERAL]
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=/var/lib/biomaj3/banks
+
+conf.dir=/etc/biomaj3
+log.dir=/var/log/biomaj3
+process.dir=/usr/share/biomaj3/process
+cache.dir=/var/cache/biomaj3
+lock.dir=/var/lib/biomaj/lock
+
+db.url=mongodb://localhost:27017
+db.name=biomaj
+
+use_ldap=0
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+use_elastic=0
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj
+
+# Calculate data.dir size stats
+data.stats=1
+
+# Auto publish on updates (no need for the publish flag, can be overridden in the bank property file)
+auto_publish=0
+
+celery.queue=biomaj
+celery.broker=mongodb://localhost:27017/biomaj_celery
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=localhost
+mail.admin=
+mail.from=biomaj@localhost
+mail.user=
+mail.password=
+mail.tls=
+
+
+#Number of threads during the download
+bank.num.threads=4
+
+#Number of threads to use for downloading and processing
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+http.parse.dir.line=<a[\\s]+href=\"([\\S]+)/\".*alt=\"\\[DIR\\]\">.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})
+http.parse.file.line=<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+
+# Bank default access
+visibility.default=public
+
+# Proxy, optional proxy (see format at
+# http://curl.haxx.se/libcurl/c/CURLOPT_PROXY.html)
+# biomaj >= 3.0.7
+#proxy=myproxyhost:1080
+#proxy=myproxyhost
+# Set proxy authentication if any, else keep commented
+#proxy_auth=user:password
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = INFO
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = INFO
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/tools/examples/local.properties b/tools/examples/local.properties
new file mode 100644
index 0000000..edb5211
--- /dev/null
+++ b/tools/examples/local.properties
@@ -0,0 +1,55 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local copy bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local system (copy some files)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/DIR_PATH_OF_FILES_TO_COPY
+# Reg exp of files to copy
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+# Reg exp of files to keep
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+#db.post.process=
+## Post Process ## The files should be located in the projectfiles/process
+BLOCKS=BLOCK1
+BLOCK1.db.post.process=META0
+META0=PROC1
+
+PROC1.name=test1
+PROC1.desc=scan copied files and try to auto-detect their format
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=scan.py
+PROC1.args=--scan $datadir/$dirversion/$localrelease
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tools/process/concat.sh b/tools/process/concat.sh
new file mode 100755
index 0000000..e1482e4
--- /dev/null
+++ b/tools/process/concat.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Script for Biomaj PostProcess
+#
+# concat files
+#
+# ARGS :
+# 1) regular expression selecting the files to concatenate
+# 2) regular expression excluding files from the result
+# 3) relative path name (result of the concat)
+# 4) format (fasta) [OPTIONAL]
+# 5) types (type1,type2,...) [OPTIONAL]
+# 6) tags (key:value,key:value,...) [OPTIONAL]
+#
+#
+#
+# Default input from STDIN unless files are specified. To explicitly specify STDIN
+# as input, use '-' as filename
+
+
+if (test $# -lt 3) then
+ echo "arguments:" 1>&2;
+ echo "1: regular expression selecting the files to concatenate" 1>&2;
+ echo "2: regular expression excluding files from the result" 1>&2;
+ echo "3: result file name (relative path from future_release and name)" 1>&2;
+ exit -1;
+
+fi
+
+workdir=$datadir/$dirversion/$localrelease/
+echo "apply concat with set $workdir/$1 to $workdir/$3";
+
+# Create the target directories
+
+dirtocreate=`dirname $workdir/$3`;
+
+if (! test -e $dirtocreate ) then
+ echo "mkdir :"$dirtocreate;
+ mkdir -p $dirtocreate
+fi
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $dirtocreate." 1>&2 ;
+ exit 1;
+fi
+
+
+cd $workdir;
+
+echo ;
+
+files='';
+
+echo "Building the file list...";
+
+for expr in $1
+do
+ # echo "$expr";
+ # dir=`dirname $expr`;
+ # fileExp=`basename $expr`;
+ if [ "$2" != "" ]
+ then
+ files="$files ""`echo $expr | egrep -v $2`";
+ else
+ files="$files $expr";
+ fi
+done
+
+echo "";
+echo "--------------------------";
+echo "Computing [$workdir/$3]....";
+echo "change directory:$workdir";
+echo "$files > $workdir/$3";
+rm -f $workdir/$3 2> /dev/null ;
+
+if ( test -z "$files" )
+then
+ echo "Cannot create $workdir/$3 : no files !" 1>&2 ;
+ exit 1;
+fi
+
+echo "cat $files > $workdir/$3";
+
+for fileToConcat in $files
+do
+ cat $fileToConcat >> $workdir/$3 ;
+
+ if ( test $? -ne 0 ) then
+ echo "Cannot create $3.[error:$?]" 1>&2 ;
+ exit 1;
+ fi
+done
+
+format=""
+types=""
+tags=""
+if [ "$4" != "" ]
+then
+ format=$4
+fi
+if [ "$5" != "" ]
+then
+ types=$5
+fi
+if [ "$6" != "" ]
+then
+ tags=$6
+fi
+
+
+
+echo "##BIOMAJ#$format#$types#$tags#$3"
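
The final echo uses the ##BIOMAJ#format#types#tags#files convention through which post-processes report produced files and their metadata back to BioMAJ on STDOUT (formatdb.sh, makeblastdb.sh and scan.py below emit the same kind of line). A short parsing sketch, written as an assumption about the format rather than a copy of the BioMAJ parser:

    def parse_biomaj_line(line):
        # e.g. '##BIOMAJ#fasta#nucleic#organism:human#flat/file1,flat/file2'
        if not line.startswith('##BIOMAJ#'):
            return None
        file_format, types, tags, files = line.strip()[len('##BIOMAJ#'):].split('#', 3)
        return {'format': file_format,
                'types': types.split(',') if types else [],
                'tags': dict(t.split(':', 1) for t in tags.split(',') if t),
                'files': files.split(',') if files else []}

    print(parse_biomaj_line('##BIOMAJ#fasta#nucleic#organism:human#flat/alu.n'))
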
diff --git a/tools/process/formatdb.sh b/tools/process/formatdb.sh
new file mode 100755
index 0000000..a36abea
--- /dev/null
+++ b/tools/process/formatdb.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+# Script for Biomaj PostProcess
+# author : ofilangi
+# date : 19/06/2007
+# update : 22/10/2010 fix bug in generated alias file + a few cleanups
+#
+# -t Title for database file [String] Optional
+# -i Input file(s) for formatting [File In] Optional
+# -l Logfile name: [File Out] Optional
+# default = formatdb.log
+# -p Type of file
+# T - protein
+# F - nucleotide [T/F] Optional
+# default = T
+# -o Parse options
+# T - True: Parse SeqId and create indexes.
+# F - False: Do not parse SeqId. Do not create indexes.
+# [T/F] Optional
+# default = F
+# -a Input file is database in ASN.1 format (otherwise FASTA is expected)
+# T - True,
+# F - False.
+# [T/F] Optional
+# default = F
+# -b ASN.1 database in binary mode
+# T - binary,
+# F - text mode.
+# [T/F] Optional
+# default = F
+# -e Input is a Seq-entry [T/F] Optional
+# default = F
+# -n Base name for BLAST files [String] Optional
+# -v Database volume size in millions of letters [Integer] Optional
+# default = 4000
+# -s Create indexes limited only to accessions - sparse [T/F] Optional
+# default = F
+# -V Verbose: check for non-unique string ids in the database [T/F] Optional
+# default = F
+# -L Create an alias file with this name
+# use the gifile arg (below) if set to calculate db size
+# use the BLAST db specified with -i (above) [File Out] Optional
+# -F Gifile (file containing list of gi's) [File In] Optional
+# -B Binary Gifile produced from the Gifile specified above [File Out] Optional
+# -T Taxid file to set the taxonomy ids in ASN.1 deflines [File In] Optional
+#
+#
+
+#----------
+#GLOBAL DEF
+#----------
+
+BLASTDB_DIR="/db/index-blast"; # Path where alias files should be generated
+FORMATDB=/local/ncbi/current/bin/formatdb; # Path to formatdb executable
+
+
+#----------
+# FUNCTIONS
+#----------
+# createAlias: builds an alias file
+# arg1: file to write to
+# arg2: bank name
+# arg3: db file list
+createAlias() {
+ local file=$1;
+ local nomBanque=$2;
+ local lFiles=$3;
+
+ rm -f $file;
+ echo "#" > $file
+ echo "# Alias file created "`date` >>$file
+ echo "#" >>$file ;
+ echo "#">> $file ;
+ echo "TITLE "$nomBanque >> $file;
+ echo "#" >> $file;
+ echo "DBLIST "$lFiles >>$file;
+ echo "#" >> $file;
+ echo "#GILIST" >> $file;
+ echo "#" >> $file;
+ echo "#OIDLIST" >> $file;
+ echo "#" >> $file;
+}
+
+#-----
+# MAIN
+#-----
+
+if (test $# -ne 4) then
+ echo "arguments:" 1>&2
+ echo "1: input files" 1>&2
+ echo "2: working directory" 1>&2
+ echo "3: formatdb options (without -i for input file)" 1>&2
+ echo "4: bank name" 1>&2
+ echo `formatdb --help`;
+ exit -1
+fi
+
+relWorkDir=`echo "$2" | sed "s/\/*$//"` # remove useless trailing slash
+
+workdir=$datadir/$dirversion/future_release
+workdir=$workdir/$relWorkDir;
+
+rm -rf $workdir;
+mkdir -p $workdir ;
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $workdir." 1>&2 ;
+ exit 1;
+fi
+
+cd $workdir
+
+# Some vars for links creation
+back="";
+dir=$relWorkDir;
+OLDIFS=$IFS;
+IFS="/";
+for i in $dir
+do
+ back="../"$back;
+done
+IFS=$OLDIFS;
+
+# Create links to input files into the working dir
+listFile="";
+
+for expression in $1
+do
+ # the basename can be a regex
+ lsFile=`ls $datadir/$dirversion/future_release/$expression`;
+ if ( test $? -ne 0 ) then
+ echo "No input file found in dir `pwd`." 1>&2 ;
+ exit 1
+ fi
+ baseFile=`dirname $expression`;
+ for f in $lsFile
+ do
+ name=`basename $f`;
+ rm -f $4.p*;
+ rm -f $4.n*;
+ nameLink=`echo $name | cut -d"." -f1`;
+ ln -s $back/$baseFile/$name $nameLink;
+ if ( test $? -ne 0 ) then
+ echo "Cannot create link. [ln -s $back$f $name]" 1>&2 ;
+ exit 1
+ fi
+ if (test -z "$listFile") then
+ listFile=$nameLink;
+ else
+ listFile=$nameLink" "$listFile;
+ fi
+ done
+done
+
+echo "Input sequence file list: $listFile";
+
+if (test -z "$listFile") then
+ echo "No input file found." 1>&2 ;
+ exit 1
+fi
+
+nameB=$4;
+echo "Database name: $nameB";
+
+echo "Working in "`pwd`;
+echo "Launching formatdb [formatdb -i $listFile $3 -n $nameB]";
+
+# Execute formatdb
+$FORMATDB -i "$listFile" $3 -n $nameB;
+
+formatdbResult=$?
+if ( test $formatdbResult -ne 0 ) then
+ echo "Formatdb failed with status $formatdbResult" 1>&2 ;
+ exit 1
+fi
+
+echo "##BIOMAJ#blast###$2$nameB"
+
+# Delete temp files and links
+#-------------------------------------------------------------
+rm -f $listFile;
+rm -f formatdb.log
+
+# Add generated files to the biomaj postprocess dependencies
+echo "Generated files:";
+for ff in `ls *`
+do
+ echo $PP_DEPENDENCE$PWD/$ff;
+done
+
+goodPath=`readlink $datadir/$dirversion/future_release -s -n`;
+if ( test $? -ne 0 ) then
+ echo "Failed to get version path: readlink returned with an error [$goodPath]" 1>&2 ;
+ exit 1
+fi
+
+# Search for nal files which are sometimes generated by formatdb.
+lsAl=`ls *.?al 2> /dev/null`;
+
+if ( test $? -ne 0 ) then
+ echo "No alias file found.";
+ lsAl="";
+else
+ echo "Generated alias files:"
+ echo "$lsAl";
+fi
+
+# If nal files were generated, use them to generate nal files in $BLASTDB_DIR
+for fileIndexVirtuel in $lsAl
+do
+ echo "Found alias file: [$fileIndexVirtuel]";
+ listIndex=`more $fileIndexVirtuel | grep DBLIST`;
+ listFile2="";
+ for f in $listIndex
+ do
+ if (test $f != "DBLIST") then
+ listFile2=$goodPath/$relWorkDir/$f" "$listFile2;
+ fi
+ done
+ echo "Creating alias in [$BLASTDB_DIR/$fileIndexVirtuel]";
+ createAlias $BLASTDB_DIR/$fileIndexVirtuel $nameB "$listFile2"
+done
+
+# Else, if no nal file was generated by formatdb, create them
+if (test -z "$lsAl") then
+ ext=`ls | grep .*hr$ | tail -c5 | head -c2`al;
+ echo "Creating alias file [$PWD/$4$ext]";
+
+ listNhr=`ls *.*hr | sed 's/\..hr$//g'`;
+ listFileNalRel=""; # List of blast db files, relative path
+ listFileNalAbs=""; # List of blast db files, absolute path
+ for f in $listNhr
+ do
+ listFileNalRel=$f" "$listFileNalRel;
+ listFileNalAbs=$goodPath/$relWorkDir/$f" "$listFileNalAbs;
+ done
+
+ createAlias $4$ext $nameB "$listFileNalRel";
+ echo $PP_DEPENDENCE$PWD/$4$ext;
+
+ echo "Creating alias in [$BLASTDB_DIR/$4$ext]";
+ createAlias $BLASTDB_DIR/$4$ext $nameB "$listFileNalAbs" ;
+fi
+
diff --git a/tools/process/makeblastdb.sh b/tools/process/makeblastdb.sh
new file mode 100755
index 0000000..49aa952
--- /dev/null
+++ b/tools/process/makeblastdb.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+
+# Script for Biomaj PostProcess
+# author : ofilangi, osallou
+# date : 19/06/2007
+# update : 22/10/2010 fix bug in generated alias file + a few cleanups
+# 23/12/2015 use makeblastdb for ncbi blast+
+#
+# -title Title for database file [String] Optional
+# -in Input file(s) for formatting [File In] Optional
+# -logfile Logfile name: [File Out] Optional
+# default = formatdb.log
+# -dbtype nucl
+# -parse_seqids
+#
+
+#----------
+#GLOBAL DEF
+#----------
+BLASTDB_DIR="$datadir/index-blast"; # Path where alias files should be generated
+mkdir -p $BLASTDB_DIR
+FORMATDB="makeblastdb"; # Path to formatdb executable
+
+
+#----------
+# FUNCTIONS
+#----------
+# createAlias: builds an alias file
+# arg1: file to write to
+# arg2: bank name
+# arg3: db file list
+createAlias() {
+ local file=$1;
+ local nomBanque=$2;
+ local lFiles=$3;
+
+ rm -f $file;
+ echo "#" > $file
+ echo "# Alias file created "`date` >>$file
+ echo "#" >>$file ;
+ echo "#">> $file ;
+ echo "TITLE "$nomBanque >> $file;
+ echo "#" >> $file;
+ echo "DBLIST "$lFiles >>$file;
+ echo "#" >> $file;
+ echo "#GILIST" >> $file;
+ echo "#" >> $file;
+ echo "#OIDLIST" >> $file;
+ echo "#" >> $file;
+}
+
+#-----
+# MAIN
+#-----
+
+if (test $# -ne 4) then
+ echo "arguments:" 1>&2
+ echo "1: input files" 1>&2
+ echo "2: working directory" 1>&2
+ echo "3: formatdb options (without -in for input file)" 1>&2
+ echo "4: bank name" 1>&2
+ echo `$FORMATDB -help`;
+ exit -1
+fi
+
+relWorkDir=`echo "$2" | sed "s/\/*$//"` # remove useless trailing slash
+
+workdir=$datadir/$dirversion/future_release
+workdir=$workdir/$relWorkDir;
+
+rm -rf $workdir;
+mkdir -p $workdir ;
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $workdir." 1>&2 ;
+ exit 1;
+fi
+
+cd $workdir
+
+# Some vars for links creation
+back="";
+dir=$relWorkDir;
+OLDIFS=$IFS;
+IFS="/";
+for i in $dir
+do
+ back="../"$back;
+done
+IFS=$OLDIFS;
+
+# Create links to input files into the working dir
+listFile="";
+
+for expression in $1
+do
+ # the basename can be a regex
+ lsFile=`ls $datadir/$dirversion/future_release/$expression`;
+ if ( test $? -ne 0 ) then
+ echo "No input file found in dir `pwd`." 1>&2 ;
+ exit 1
+ fi
+ baseFile=`dirname $expression`;
+ for f in $lsFile
+ do
+ name=`basename $f`;
+ rm -f $4.p*;
+ rm -f $4.n*;
+ nameLink=`echo $name | cut -d"." -f1`;
+ ln -s $back/$baseFile/$name $nameLink;
+ if ( test $? -ne 0 ) then
+ echo "Cannot create link. [ln -s $back$f $name]" 1>&2 ;
+ exit 1
+ fi
+ if (test -z "$listFile") then
+ listFile=$nameLink;
+ else
+ listFile=$nameLink" "$listFile;
+ fi
+ done
+done
+
+echo "Input sequence file list: $listFile";
+
+if (test -z "$listFile") then
+ echo "No input file found." 1>&2 ;
+ exit 1
+fi
+
+nameB=$4;
+echo "Database name: $nameB";
+
+echo "Working in "`pwd`;
+echo "Launching $FORMATDB [$FORMATDB -in $listFile $3 -out $nameB]";
+
+# Execute formatdb
+$FORMATDB -in "$listFile" $3 -out $nameB;
+
+formatdbResult=$?
+if ( test $formatdbResult -ne 0 ) then
+ echo "Formatdb failed with status $formatdbResult" 1>&2 ;
+ exit 1
+fi
+
+echo "##BIOMAJ#blast###$2$nameB"
+
+# Delete temp files and links
+#-------------------------------------------------------------
+rm -f $listFile;
+rm -f formatdb.log
+
+# Add generated files to the biomaj postprocess dependencies
+echo "Generated files:";
+for ff in `ls *`
+do
+ echo $PP_DEPENDENCE$PWD/$ff;
+done
+
+goodPath=`readlink $datadir/$dirversion/future_release -s -n`;
+if ( test $? -ne 0 ) then
+ echo "Failed to get version path: readlink returned with an error [$goodPath]" 1>&2 ;
+ exit 1
+fi
+
+# Search for nal files which are sometimes generated by formatdb.
+lsAl=`ls *.?al 2> /dev/null`;
+
+if ( test $? -ne 0 ) then
+ echo "No alias file found.";
+ lsAl="";
+else
+ echo "Generated alias files:"
+ echo "$lsAl";
+fi
+
+# If nal files were generated, use them to generate nal files in $BLASTDB_DIR
+for fileIndexVirtuel in $lsAl
+do
+ echo "Found alias file: [$fileIndexVirtuel]";
+ listIndex=`more $fileIndexVirtuel | grep DBLIST`;
+ listFile2="";
+ for f in $listIndex
+ do
+ if (test $f != "DBLIST") then
+ listFile2=$goodPath/$relWorkDir/$f" "$listFile2;
+ fi
+ done
+ echo "Creating alias in [$BLASTDB_DIR/$fileIndexVirtuel]";
+ createAlias $BLASTDB_DIR/$fileIndexVirtuel $nameB "$listFile2"
+done
+
+# Else, if no nal file was generated by formatdb, create them
+if (test -z "$lsAl") then
+ ext=`ls | grep .*hr$ | tail -c5 | head -c2`al;
+ echo "Creating alias file [$PWD/$4$ext]";
+
+ listNhr=`ls *.*hr | sed 's/\..hr$//g'`;
+ listFileNalRel=""; # List of blast db files, relative path
+ listFileNalAbs=""; # List of blast db files, absolute path
+ for f in $listNhr
+ do
+ listFileNalRel=$f" "$listFileNalRel;
+ listFileNalAbs=$goodPath/$relWorkDir/$f" "$listFileNalAbs;
+ done
+
+ createAlias $4$ext $nameB "$listFileNalRel";
+ echo $PP_DEPENDENCE$PWD/$4$ext;
+
+ echo "Creating alias in [$BLASTDB_DIR/$4$ext]";
+ createAlias $BLASTDB_DIR/$4$ext $nameB "$listFileNalAbs" ;
+fi
+
diff --git a/tools/process/scan.py b/tools/process/scan.py
new file mode 100755
index 0000000..0ab69c6
--- /dev/null
+++ b/tools/process/scan.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import os,sys
+import argparse
+import logging.config
+
+from biomaj.utils import Utils
+
+def main():
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('-s', '--scan', dest="directory",help="Directory to scan")
+ parser.add_argument('--type', dest="ftype",help="Files type")
+ parser.add_argument('--tags', dest="tags", action="append", default=[],
+ help="tags, format key:value, can be repeated multiple times")
+
+ args = parser.parse_args()
+
+ if not os.path.exists(args.directory):
+ sys.exit(1)
+
+ res = {}
+ for (path, dirs, files) in os.walk(args.directory):
+ for file in files:
+ filename = os.path.join(path, file)
+ (file_format, mime) = Utils.detect_format(filename)
+ if file_format is not None:
+ file_format = file_format.replace('application/','')
+ filename = filename.replace(args.directory+'/','')
+ if file_format is not None:
+ if file_format not in res:
+ res[file_format] = [filename]
+ else:
+ res[file_format].append(filename)
+
+ f_type = ''
+ if args.ftype:
+ f_type = args.ftype
+ tags = ''
+ if args.tags:
+ tags = ','.join(args.tags)
+ for fformat in res.keys():
+ print('##BIOMAJ#'+fformat+'#'+f_type+'#'+tags+'#'+','.join(res[fformat]))
+
+
+if __name__ == '__main__':
+ main()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-biomaj3.git