[med-svn] [biomaj3] 01/02: New upstream version 3.1.3
Olivier Sallou
osallou at debian.org
Thu Aug 17 14:46:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
osallou pushed a commit to branch master
in repository biomaj3.
commit 2308952fcde2e025e89010eddc0596be1674e6eb
Author: Olivier Sallou <osallou at debian.org>
Date: Thu Aug 17 12:49:15 2017 +0000
New upstream version 3.1.3
---
.coveragerc | 3 +
.gitignore | 72 ++
.travis.yml | 36 +
CHANGES.txt | 144 +++
LICENSE | 662 ++++++++++++++
MANIFEST.in | 2 +
README.md | 208 +++++
biomaj/__init__.py | 1 +
biomaj/bank.py | 1271 ++++++++++++++++++++++++++
biomaj/mongo_connector.py | 20 +
biomaj/notify.py | 60 ++
biomaj/options.py | 27 +
biomaj/process/__init__.py | 0
biomaj/process/metaprocess.py | 330 +++++++
biomaj/process/processfactory.py | 242 +++++
biomaj/schema_version.py | 210 +++++
biomaj/session.py | 235 +++++
biomaj/workflow.py | 1731 ++++++++++++++++++++++++++++++++++++
config.yml | 48 +
docs/Makefile | 177 ++++
docs/admin.rst | 37 +
docs/alu.properties | 42 +
docs/bank.rst | 15 +
docs/conf.py | 284 ++++++
docs/docker-compose-advanced.yml | 16 +
docs/docker-compose.yml | 11 +
docs/examples.rst | 104 +++
docs/global.advanced.properties | 143 +++
docs/index.rst | 36 +
docs/make.bat | 242 +++++
docs/metaprocess.rst | 15 +
docs/notify.rst | 15 +
docs/options.rst | 15 +
docs/processfactory.rst | 15 +
docs/requirements.txt | 14 +
docs/session.rst | 15 +
docs/workflow.rst | 15 +
global.properties.example | 162 ++++
requirements.txt | 17 +
scripts/biomaj_add_property.py | 30 +
scripts/biomaj_migrate_database.py | 7 +
scripts/influxdb_import.py | 90 ++
setup.cfg | 2 +
setup.py | 86 ++
tests/alu.properties | 43 +
tests/bank/process/test.sh | 11 +
tests/bank/test.fasta.gz | Bin 0 -> 45 bytes
tests/bank/test2.fasta | 2 +
tests/bank/test_100.txt | 1 +
tests/biomaj_tests.py | 807 +++++++++++++++++
tests/computed.properties | 44 +
tests/computed2.properties | 45 +
tests/computederror.properties | 43 +
tests/directhttp.properties | 41 +
tests/error.properties | 43 +
tests/global.properties | 123 +++
tests/local.properties | 41 +
tests/locallist.properties | 44 +
tests/localprocess.properties | 100 +++
tests/multi.properties | 60 ++
tests/sub1.properties | 43 +
tests/sub2.properties | 41 +
tests/testhttp.properties | 43 +
tools/examples/alu.properties | 51 ++
tools/examples/global.properties | 146 +++
tools/examples/local.properties | 55 ++
tools/process/concat.sh | 114 +++
tools/process/formatdb.sh | 244 +++++
tools/process/makeblastdb.sh | 212 +++++
tools/process/scan.py | 48 +
70 files changed, 9352 insertions(+)
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..0a4ae8f
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,3 @@
+[run]
+source = biomaj
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..754c2a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,72 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Coveralls
+.coveralls.yml
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# PyCharm
+.idea
+
+# Vim
+.viminfo
+# Less history
+.lesshst
+
+.dbshell
+.emacs*
+.ipython
+.mongo*
+#*.properties
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..723b5b8
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,36 @@
+language: python
+sudo: false
+python:
+ - "2.7"
+ - "3.4"
+ - "3.5"
+ - "3.6"
+services:
+ - mongodb
+ - elasticsearch
+# Apply only on main branches
+branches:
+ except:
+ - /^feature.*$/
+# command to install dependencies
+#before_install:
+# - "sudo apt-get update -qq"
+# - "sudo apt-get install -qq libldap2-dev libsasl2-dev"
+install:
+ - "pip install flake8"
+ - "pip install -r requirements.txt"
+ - "pip install coverage"
+ - "pip install python-coveralls"
+ - "python setup.py -q install"
+# - "echo data_file=$TRAVIS_BUILD_DIR/.coverage >> .coveragerc"
+# command to run tests
+before_script:
+ - sleep 10
+#script: nosetests --with-coverage --cover-package=biomaj -a '!network'
+#script: nosetests --with-coverage --cover-package=biomaj
+script:
+ - python setup.py test
+ - flake8 --ignore E501,E123 biomaj
+#after_success:
+# - coveralls
+
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 0000000..81324f0
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,144 @@
+3.1.3:
+ Remove post-install step for automatic upgrades, not supported by wheel package
+
+3.1.2:
+ Fix #86 remove special character from README.md
+ Feature #85 SchemaVersion automatically add new property
+
+3.1.1:
+ Fix #80 Check process exists with `--from-task` and `--process`
+ Manage old banks with no status
+
+3.1.0:
+ ## Needs database upgrade
+ If using biomaj-watcher, must use version >= 3.1.0
+ Feature #67,#66,#61 switch to micro service architecture. Still works in local monolithic install
+ Fix some configuration parameter loading when not defined in config
+ Fix HTTP parsing parameters loading
+ Fix download_or_copy to copy files in last production release if available instead of downloading files again
+ Manage user migration for micro services
+ Feature #74 add influxdb statistics
+ Feature #65 add a release info file at the root of the bank which can be used by other services to know the latest release available
+ Feature #25 experimental support of rsync protocol
+ Add rate limiting for download with micro services
+ Limit email size to 2Mb, log file may be truncated
+
+3.0.20:
+ Fix #55: Added support for https and directhttps
+ Add possibility to define files to download from a local file with remote.list parameter
+ Fix visibility modification (bug deleted the bank properties field)
+ Fix #65 Add release file in bank dir after update
+ Add md5 or sha256 checksum checks if files are downloaded and available
+
+3.0.19:
+ Fix missing README.md in package
+ Fix #53 avoid duplicates in pending databases
+
+3.0.18:
+ Add migration method to update schema when needed
+ Manage HTTP month format to support text format (Jan, Feb, ...) and int format (01, 02, ...)
+ New optional bank property http.parse.file.date.format to extract date in HTTP protocol following python date regexp format (http://www.tutorialspoint.com/python/time_strptime.htm)
+ Example: %d-%b-%Y %H:%M
+
+3.0.17:
+ Fix #47: save_as error with directhttp protocol
+ Fix #45: error with pending releases when release has dots in value
+ typo/pylint fixes
+
+3.0.16:
+ Do not use config values, trust database values #39
+ Fix #42: Add optional release.separator to name the bank directory bankname_release (underscore as default)
+
+3.0.15:
+ Fix #37: remove local files history from db and put it in cache.dir
+ Feature #38: add optional keep.old.sessions parameter to keep all sessions in database, even for removed releases
+ Feature #28: add optional release.format parameter to specify the date format of a release
+
+3.0.14:
+ Fix in method set_owner
+ Force release to be a str
+ Fix #32: fix --from-task issue when calling a meta process
+ Fix #34: remove release from pending when doing cleanup of old sessions
+ Remove logs on some operations
+ Add --status-ko option to list bank in error state
+ Fix #36 manage workflows over by error or unfinished
+
+3.0.13:
+ Fix #27: Thread lock issue during download
+ New optional attribute in bank properties: timeout.download
+ HTTP protocol fix (deepcopy error)
+
+3.0.12:
+ Fix index deletion on bank removal
+ Fix lock errors on dir creation for multi-threads,
+ pre-create directory structure in offline directory
+ Fix #26: save error when too many files in bank
+
+3.0.11:
+ Fix in session management with pre and rm processes
+ Fix #23: Check workflow step name passed to
+ --stop-after/--start-after/--from-task
+ Fix #24: deprecated delete_by_query method in elasticsearch
+ Add some controls on base directories
+
+
+3.0.10:
+ Change dir to process.dir to find processes in subdirs
+ If all files found in offline dir, continue workflow with no download
+ Remove extra log files for bank dependencies (computed banks)
+ Fix computed bank update when sub banks are not updated
+ Fix #15 when remote reverts to a previous release
+ Feature #16: get possibility not to download files (for computed banks for
+ example). Set protocol='none' in bank properties.
+ Fix on --check with some protocols
+ Fix #21 release.file not supported for directhttp protocol
+ Feature #22: add localrelease and remoterelease bank properties to use the
+ remote release as an expression in other properties
+ => remote.dir = xx/yy/%(remoterelease)s/zz
+ Feature #17,#20: detect remote modifications even if release is the same
+ new parameter release.control (true, false) to force a check
+ even if remote release (file controlled or date) is the same.
+ Fix on 'multi' protocol
+ Fix on "save_as" regexp when remote.files starts with a ^ character.
+
+3.0.9:
+ Fix thread synchro issue:
+ during download some download threads could be alive while the main thread continues the workflow
+ the fix prevents using Ctrl-C during download
+ Workflow fix:
+ if subtask of workflow fails, fail main task
+
+3.0.8:
+ do not test index if elasticsearch is not up
+ minor fixes
+ add http proxy support
+ pylint fixes
+ retry uncompress once in case of failure (#13)
+
+3.0.7:
+ Reindent code, pep8 fixes
+ Various fixes on var names and OrderedDict support for Python < 2.7
+ Merge config files to be able to reference global.properties variables in bank
+ property file in format %(xx)s
+ Use ConfigParser instead of SafeConfigParser that will be deprecated
+
+3.0.6:
+ Add option --remove-pending to remove all pending sessions and directories
+ Add process env variables logdir and logfile
+ Fix Unicode issue with old versions of PyCurl.
+
+3.0.5:
+ Fix removal workflow during an update workflow, removedrelease was current
+ release.
+ Fix shebang of biomaj-cli, and python 2/3 compat issue
+
+3.0.4:
+ Update code to make it Python 3 compatible
+ Use ldap3 library (pure Python and p2,3 compatible) instead of python-ldap
+ get possibility to save downloaded files for ftp and http without keeping full
+ directory structure:
+ remote.files can include groups to save file without directory structure,
+ or partial directories only, examples:
+ remote.files = genomes/fasta/.*\.gz => save files in offline directory, keeping remote structure offlinedir/genomes/fasta/
+ remote.files = genomes/fasta/(.*\.gz) => save files in offline directory offlinedir/
+ remote.files = genomes/(fasta)/(.*\.gz) => save files in offline directory offlinedir/fasta
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..cebe035
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,662 @@
+GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..ce94e14
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.txt *.md
+recursive-include biomaj *.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7b6df8a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,208 @@
+BioMAJ3
+=====
+
+This project is a complete rewrite of BioMAJ (http://biomaj.genouest.org).
+
+BioMAJ (BIOlogie Mise A Jour) is a workflow engine dedicated to data
+synchronization and processing. The software automates the update cycle and the
+supervision of the locally mirrored databank repository.
+
+Common usages are to download remote databanks (GenBank for example) and apply
+some transformations (blast indexing, emboss indexing, etc.). Any script can be
+applied to downloaded data. When all treatments are successfully applied, the bank
+is put in "production" in a dedicated release directory.
+With cron tasks, updates can be executed at regular intervals; data are
+downloaded again only if a change is detected.
+
+More documentation is available in the wiki.
+
+BioMAJ is Python 2 and 3 compatible.
+
+Getting started
+===============
+
+Edit the global.properties file to match your settings. The minimal configuration covers the database connection and directories.
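+
+As an illustration only, a minimal file could look like the sketch below. The property names (db.url, db.name, data.dir, lock.dir) appear in the code added by this commit; the section layout and all values are placeholders to adapt to your installation:
+
+    [GENERAL]
+    db.url=mongodb://localhost:27017
+    db.name=biomaj
+    data.dir=/var/lib/biomaj/banks
+    lock.dir=/var/lib/biomaj/lock
+
+The CLI can then be pointed at this file: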
+
+    biomaj-cli.py -h
+
+    biomaj-cli.py --config global.properties --status
+
+    biomaj-cli.py --config global.properties --bank alu --update
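+
+The same update can also be scripted against the Python API shipped in this package. The sketch below is illustrative only: BiomajConfig.load_config is assumed to be the biomaj_core helper that populates BiomajConfig.global_config, and 'alu' stands for any configured bank.
+
+    from biomaj_core.config import BiomajConfig
+    from biomaj.bank import Bank
+
+    # The global configuration must be loaded before creating a Bank,
+    # otherwise Bank() raises 'Configuration must be loaded first'.
+    BiomajConfig.load_config('global.properties')  # assumed biomaj_core helper
+
+    bank = Bank('alu')       # name must match the bank property file
+    updated = bank.update()  # runs the update workflow (as update_dependencies does)
+    print('Update OK' if updated else 'Update failed')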
+
+Migration
+=========
+
+To migrate from a previous BioMAJ 1.x installation, a script is available at:
+https://github.com/genouest/biomaj-migrate. The script will import the old database into
+the new database and update configuration files to the new format. The data directory stays the same.
+
+Migration for 3.0 to 3.1:
+
+BioMAJ 3.1 provides an optional microservice architecture, allowing BioMAJ components to be separated and distributed/scaled over one or many hosts. This implementation is optional but recommended for server installations. A monolithic installation can be kept for a local computer installation.
+To upgrade an existing 3.0 installation, as the BioMAJ code has been split into multiple components, it is necessary to install/update not only the biomaj Python package but also the biomaj-cli and biomaj-daemon packages. The database must then be upgraded manually (see Upgrading in the documentation).
+
+To execute database migration:
+
+    python biomaj_migrate_database.py
+
+Application Features
+====================
+
+* Synchronisation:
+ * Multiple remote protocols (ftp, sftp, http, local copy, etc.)
+ * Data transfer integrity checks
+ * Release versioning using an incremental approach
+ * Multi-threading
+ * Data extraction (gzip, tar, bzip)
+ * Data tree directory normalisation
+
+
+* Pre & post-processing:
+ * Advanced workflow description (D.A.G)
+ * Post-process indexation for various bioinformatics software (blast, srs, fastacmd, readseq, etc.)
+ * Easy integration of personal scripts for bank post-processing automation
+
+
+* Supervision:
+ * Optional Administration web interface (biomaj-watcher)
+ * CLI management
+ * Mail alerts for the update cycle supervision
+ * Optional Prometheus and InfluxDB integration
+ * Optional Consul supervision of processes
+
+
+* Scalability:
+ * Monolithic (local install) or microservice architecture (remote access to a BioMAJ server)
+ * Microservice installation allows per-process scalability and supervision (number of processes in charge of download, execution, etc.)
+
+
+* Remote access:
+ * Optional FTP server providing authenticated or anonymous data access
+
+Dependencies
+============
+
+Packages:
+ * Debian: libcurl-dev, gcc
+ * CentOS: libcurl-devel, openldap-devel, gcc
+
+ Linux tools: tar, unzip, gunzip, bunzip
+
+Database:
+ * mongodb (local or remote)
+
+Indexing (optional):
+ * elasticsearch (global property, use_elastic=1)
+
+ElasticSearch indexing adds advanced search features to BioMAJ to find banks having files with a specific format or type.
+Configuration of ElasticSearch is not in the scope of the BioMAJ documentation.
+For a basic installation (low volume of data), one instance of ElasticSearch is enough; in such a case, the ElasticSearch configuration file should be modified accordingly:
+
+    node.name: "biomaj" (or any other name)
+    index.number_of_shards: 1
+    index.number_of_replicas: 0
+
+Installation
+============
+
+From source:
+
+After installing the dependencies, go to the BioMAJ source directory:
+
+    python setup.py install
+
+From packages:
+
+    pip install biomaj biomaj-cli biomaj-daemon
+
+
+You should consider using a Python virtual environment (virtualenv) to install BioMAJ.
+
+In tools/examples, copy the global.properties file and update it to match your local
+installation.
+
+The tools/process directory contains example process files (Python and shell).
+
+
+Docker
+======
+
+You can use BioMAJ with Docker (genouest/biomaj).
+
+
+    docker pull genouest/biomaj
+    docker pull mongo
+    docker run --name biomaj-mongodb -d mongo
+    # Wait ~10 seconds for mongo to initialize
+    # Create a local directory where databases will be permanently stored
+    # *local_path*
+    docker run --rm -v local_path:/var/lib/biomaj --link biomaj-mongodb:biomaj-mongodb osallou/biomaj-docker --help
+
+
+Copy your bank properties into the *local_path*/conf directory and your post-processes (if any) into *local_path*/process.
+
+You can override global.properties by mounting your own file at /etc/biomaj/global.properties (-v xx/global.properties:/etc/biomaj/global.properties).
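+
+For example, combining the data volume from the run command above with such an override might look like the following sketch (the mounted host paths are placeholders, and --status is one of the CLI options shown in Getting started):
+
+    docker run --rm -v /data/biomaj:/var/lib/biomaj \
+        -v /data/biomaj/global.properties:/etc/biomaj/global.properties \
+        --link biomaj-mongodb:biomaj-mongodb osallou/biomaj-docker --status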
+
+No default bank property files or processes are available in the container.
+
+Examples are available at https://github.com/genouest/biomaj-data
+
+API documentation
+=================
+
+https://readthedocs.org/projects/biomaj/
+
+Status
+======
+
+[![Build Status](https://travis-ci.org/genouest/biomaj.svg?branch=master)](https://travis-ci.org/genouest/biomaj)
+
+[![Documentation Status](https://readthedocs.org/projects/biomaj/badge/?version=latest)](https://readthedocs.org/projects/biomaj/?badge=latest)
+
+[![Code Health](https://landscape.io/github/genouest/biomaj/master/landscape.svg?style=flat)](https://landscape.io/github/genouest/biomaj/master)
+
+Testing
+=======
+
+Execute the unit tests:
+
+    nosetests
+
+Execute the unit tests, disabling the ones that need network access:
+
+    nosetests -a '!network'
+
+
+Monitoring
+==========
+
+InfluxDB can be used to monitor BioMAJ. The following series are available (a query sketch follows the list):
+
+* biomaj.banks.quantity (number of banks)
+* biomaj.production.size.total (size of all production directories)
+* biomaj.workflow.duration (workflow duration)
+* biomaj.production.size.latest (size of latest update)
+* biomaj.bank.update.downloaded_files (number of downloaded files)
+* biomaj.bank.update.new (track updates)
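+
+As a sketch only, one of these series can be read back with the same Python client that biomaj/bank.py imports. The host, port and database name below are assumptions; adapt them to how your InfluxDB reporting is configured:
+
+    from influxdb import InfluxDBClient
+
+    # Placeholders: host, port and database depend on your InfluxDB setup
+    client = InfluxDBClient(host='localhost', port=8086, database='biomaj')
+    result = client.query('SELECT * FROM "biomaj.banks.quantity" ORDER BY time DESC LIMIT 1')
+    for point in result.get_points():
+        print(point)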
+
+License
+=======
+
+AGPL v3+
+
+Remarks
+=======
+
+BioMAJ uses libcurl; for sftp, libcurl must be compiled with sftp support.
+
+To delete the elasticsearch index:
+
+    curl -XDELETE 'http://localhost:9200/biomaj_test/'
+
+Credits
+======
+
+Special thanks to tuco at the Pasteur Institute for the intensive testing and new ideas.
+Thanks to the old BioMAJ team for the work they have done.
+
+BioMAJ is developed at the IRISA research institute.
diff --git a/biomaj/__init__.py b/biomaj/__init__.py
new file mode 100644
index 0000000..de40ea7
--- /dev/null
+++ b/biomaj/__init__.py
@@ -0,0 +1 @@
+__import__('pkg_resources').declare_namespace(__name__)
diff --git a/biomaj/bank.py b/biomaj/bank.py
new file mode 100644
index 0000000..d71cf9f
--- /dev/null
+++ b/biomaj/bank.py
@@ -0,0 +1,1271 @@
+from builtins import str
+from builtins import object
+import os
+import logging
+import time
+import shutil
+import json
+from datetime import datetime
+
+import redis
+from influxdb import InfluxDBClient
+
+from biomaj.mongo_connector import MongoConnector
+from biomaj.session import Session
+from biomaj.workflow import UpdateWorkflow
+from biomaj.workflow import RemoveWorkflow
+from biomaj.workflow import Workflow
+from biomaj.workflow import ReleaseCheckWorkflow
+from biomaj_core.config import BiomajConfig
+from biomaj.options import Options
+from biomaj.process.processfactory import ProcessFactory
+from biomaj_core.bmajindex import BmajIndex
+
+import getpass
+
+
+class Bank(object):
+ """
+ BioMAJ bank
+ """
+
+ def __init__(self, name, options=None, no_log=False):
+ """
+ Get a bank from the db or create a new one
+
+ :param name: name of the bank, must match its config file
+ :type name: str
+ :param options: bank options
+ :type options: argparse
+ :param no_log: if True, do not create a log file for the bank
+ :type no_log: bool
+ """
+ logging.debug('Initialize ' + name)
+ if BiomajConfig.global_config is None:
+ raise Exception('Configuration must be loaded first')
+
+ self.name = name
+ self.depends = []
+ self.no_log = no_log
+
+ if no_log:
+ if options is None:
+ # options = {'no_log': True}
+ options = Options()
+ options.no_log = True
+ else:
+ options.no_log = no_log
+
+ self.config = BiomajConfig(self.name, options)
+
+ if self.config.get('bank.num.threads') is not None:
+ ProcessFactory.NB_THREAD = int(self.config.get('bank.num.threads'))
+
+ if self.config.log_file is not None and self.config.log_file != 'none':
+ logging.info("Log file: " + self.config.log_file)
+
+ # self.options = Options(options)
+ if options is None:
+ self.options = Options()
+ else:
+ self.options = options
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ self.banks = MongoConnector.banks
+ self.bank = self.banks.find_one({'name': self.name})
+
+ if self.bank is None:
+ self.bank = {
+ 'name': self.name,
+ 'current': None,
+ 'sessions': [],
+ 'production': [],
+ 'properties': self.get_properties()
+ }
+ self.bank['_id'] = self.banks.insert(self.bank)
+
+ self.session = None
+ self.use_last_session = False
+
+ def check(self):
+ """
+ Checks bank configuration
+ """
+ return self.config.check()
+
+ def is_locked(self):
+ """
+ Checks if the bank is locked, i.e. an action is in progress
+ """
+ data_dir = self.config.get('data.dir')
+ lock_dir = self.config.get('lock.dir', default=data_dir)
+ lock_file = os.path.join(lock_dir, self.name + '.lock')
+ if os.path.exists(lock_file):
+ return True
+ else:
+ return False
+
+ @staticmethod
+ def get_banks_disk_usage():
+ """
+ Get disk usage per bank and release
+ """
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ bank_list = []
+ banks = MongoConnector.banks.find({}, {'name': 1, 'production': 1})
+ for b in banks:
+ bank_elt = {'name': b['name'], 'size': 0, 'releases': []}
+ for p in b['production']:
+ if p['size'] is None:
+ p['size'] = 0
+ bank_elt['size'] += p['size']
+ bank_elt['releases'].append({'name': p['release'], 'size': p['size']})
+ bank_list.append(bank_elt)
+ return bank_list
+
+ def get_bank_release_info(self, full=False):
+ """
+ Get release info for the bank. Used with the --status option of biomaj-cli.py
+ :param full: display full information for the bank
+ :type full: Boolean
+ :return: Dict with keys
+ if full=True
+ - info, prod, pend
+ else
+ - info
+ """
+
+ _bank = self.bank
+ info = {}
+ release = 'N/A'
+ last_update = 'N/A'
+ if 'last_update_session' in _bank:
+ last_update = datetime.fromtimestamp(_bank['last_update_session']).strftime("%Y-%m-%d %H:%M:%S")
+
+ if full:
+ bank_info = []
+ prod_info = []
+ pend_info = []
+
+ if 'current' in _bank and _bank['current']:
+ for prod in _bank['production']:
+ if _bank['current'] == prod['session']:
+ release = prod['release']
+ # Bank info header
+ bank_info.append(["Name", "Type(s)", "Last update status", "Published release"])
+ bank_info.append([_bank['name'],
+ str(','.join(_bank['properties']['type'])),
+ str(last_update),
+ str(release)])
+ # Bank production info header
+ prod_info.append(["Session", "Remote release", "Release", "Directory", "Freeze"])
+ for prod in _bank['production']:
+ data_dir = self.config.get('data.dir')
+ dir_version = self.config.get('dir.version')
+ if 'data.dir' in prod:
+ data_dir = prod['data.dir']
+ if 'dir.version' in prod:
+ dir_version = prod['dir.version']
+ release_dir = os.path.join(data_dir,
+ dir_version,
+ prod['prod_dir'])
+ date = datetime.fromtimestamp(prod['session']).strftime('%Y-%m-%d %H:%M:%S')
+ prod_info.append([date,
+ prod['remoterelease'],
+ prod['release'],
+ release_dir,
+ 'yes' if 'freeze' in prod and prod['freeze'] else 'no'])
+ # Bank pending info header
+ if 'pending' in _bank and len(_bank['pending']) > 0:
+ pend_info.append(["Pending release", "Last run"])
+ for pending in _bank['pending']:
+ run = datetime.fromtimestamp(pending['id']).strftime('%Y-%m-%d %H:%M:%S')
+ pend_info.append([pending['release'], run])
+
+ info['info'] = bank_info
+ info['prod'] = prod_info
+ info['pend'] = pend_info
+ return info
+
+ else:
+ if 'current' in _bank and _bank['current']:
+ for prod in _bank['production']:
+ if _bank['current'] == prod['session']:
+ release = prod['remoterelease']
+ info['info'] = [_bank['name'], ','.join(_bank['properties']['type']),
+ str(release), _bank['properties']['visibility'], last_update]
+ return info
+
+ def update_dependencies(self):
+ """
+ Update bank dependencies
+
+ :return: status of updates
+ """
+ self.depends = []
+ if self.run_depends:
+ depends = self.get_dependencies()
+ else:
+ depends = []
+
+ self.session.set('depends', {})
+ res = True
+ for dep in depends:
+ self.session._session['depends'][dep] = False
+ for dep in depends:
+ if self.session._session['depends'][dep]:
+ logging.debug('Update:Depends:' + dep + ':SKIP')
+ # Bank has been marked as depends multiple times, run only once
+ continue
+ logging.info('Update:Depends:' + dep)
+ b = Bank(dep)
+ if self.options and hasattr(self.options, 'user') and self.options.user:
+ b.options.user = self.options.user
+ res = b.update()
+ self.depends.append(b)
+ self.session._session['depends'][dep] = res
+ logging.info('Update:Depends:' + dep + ':' + str(res))
+ if not res:
+ break
+ if depends:
+ # Revert logging config
+ self.config.reset_logger()
+ return res
+
+ def get_bank(self, bank=None, no_log=False):
+ """
+ Gets another bank
+ """
+ if bank is None:
+ return self.bank
+ return Bank(bank, no_log=no_log)
+
+ def get_dependencies(self, bank=None):
+ """
+ Search all bank dependencies
+
+ :return: list of bank names to update
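+
+ Dependencies come from the ``depends`` property of the bank
+ configuration, a comma-separated list of bank names, e.g.
+ (hypothetical names)::
+
+     depends=bank1,bank2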
+ """
+ if bank is None:
+ deps = self.config.get('depends')
+ else:
+ deps = bank.config.get('depends')
+ if deps is None:
+ return []
+ # Main dependencies
+ deps = deps.split(',')
+ # Now search in deps if they themselves depend on other banks
+ for dep in deps:
+ sub_options = None
+ if self.options and hasattr(self.options, 'user') and self.options.user:
+ sub_options = Options()
+ sub_options.user = self.options.user
+ b = Bank(dep, options=sub_options, no_log=True)
+ deps = b.get_dependencies() + deps
+ return deps
+
+ def is_owner(self):
+ """
+ Checks if current user is owner or admin
+ """
+ owner = getpass.getuser()
+ admin_config = self.config.get('admin')
+ admin = []
+ if admin_config is not None:
+ admin = [x.strip() for x in admin_config.split(',')]
+
+ current_user = None
+ if self.config.get('micro.biomaj.service.daemon', default=None) == '1':
+ if self.options and hasattr(self.options, 'user') and self.options.user:
+ current_user = self.options.user
+ else:
+ logging.debug('Micro services activated but user not authenticated')
+ return False
+ else:
+ current_user = owner
+
+ if admin and current_user in admin:
+ return True
+ if current_user == self.bank['properties']['owner']:
+ return True
+ return False
+
+ def set_owner(self, owner):
+ """
+ Update bank owner, only if current owner
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.banks.update({'name': self.name}, {'$set': {'properties.owner': owner}})
+
+ def set_visibility(self, visibility):
+ """
+ Update bank visibility, only if current owner
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.banks.update({'name': self.name}, {'$set': {'properties.visibility': visibility}})
+
+ def get_properties(self):
+ """
+ Read bank properties from config file
+
+ :return: properties dict
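+
+ Sketch of the returned structure (values are examples and depend on
+ the bank configuration)::
+
+     {'visibility': 'public', 'type': ['nucleic'], 'tags': [],
+      'owner': 'biomaj', 'desc': 'My bank'}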
+ """
+ owner = None
+ if self.config.get('micro.biomaj.service.daemon', default=None) == '1':
+ if self.options and hasattr(self.options, 'user') and self.options.user:
+ owner = self.options.user
+ else:
+ logging.debug('Micro services activated but user not authenticated')
+ raise Exception('Micro services activated but user not authenticated')
+ else:
+ owner = getpass.getuser()
+
+ # If owner not set, use current user, else keep current
+ if self.bank and 'properties' in self.bank and 'owner' in self.bank['properties']:
+ owner = self.bank['properties']['owner']
+
+ props = {
+ 'visibility': self.config.get('visibility.default'),
+ 'type': self.config.get('db.type').split(','),
+ 'tags': [],
+ 'owner': owner,
+ 'desc': self.config.get('db.fullname')
+ }
+
+ return props
+
+ @staticmethod
+ def user_banks(user_name):
+ """
+ Get the names of the banks owned by a user
+ :param user_name: user identifier
+ :type user_name: str
+ :return: list of bank names
+ """
+ banks = MongoConnector.banks.find({'properties.owner': user_name}, {'name': 1})
+ return banks
+
+ @staticmethod
+ def searchindex(query):
+ return BmajIndex.searchq(query)
+
+ @staticmethod
+ def search(formats=None, types=None, with_sessions=True):
+ """
+ Search all bank releases matching some formats and types
+
+ Matches production release with at least one of formats and one of types
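+
+ Example (a sketch; format and type values are hypothetical and
+ depend on bank configurations)::
+
+     hits = Bank.search(formats=['fasta'], types=['nucleic'], with_sessions=False)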
+ """
+ if formats is None:
+ formats = []
+
+ if types is None:
+ types = []
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+ searchfilter = {}
+ if formats:
+ searchfilter['production.formats'] = {'$in': formats}
+ if with_sessions:
+ res = MongoConnector.banks.find(searchfilter)
+ else:
+ res = MongoConnector.banks.find(searchfilter, {'sessions': 0})
+ # Now search in which production release formats and types apply
+ search_list = []
+ for r in res:
+ prod_to_delete = []
+ for p in r['production']:
+ is_format = False
+ if not formats:
+ is_format = True
+ # Are formats present in this production release?
+ for f in formats:
+ if f in p['formats']:
+ is_format = True
+ break
+ # Are types present in this production release?
+ is_type = False
+ if not types:
+ is_type = True
+ if is_format:
+ for t in types:
+ if t in p['types'] or t in r['properties']['type']:
+ is_type = True
+ break
+ if not is_type or not is_format:
+ prod_to_delete.append(p)
+ for prod_del in prod_to_delete:
+ r['production'].remove(prod_del)
+ if len(r['production']) > 0:
+ search_list.append(r)
+ return search_list
+
+ @staticmethod
+ def list(with_sessions=False):
+ """
+ Return a list of banks
+
+ :param with_sessions: should sessions be returned or not (can be quite big)
+ :type with_sessions: bool
+ :return: list of bank records (dict)
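+
+ Example (returned items are raw bank records from the database)::
+
+     for bank in Bank.list():
+         print(bank['name'])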
+ """
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ bank_list = []
+ if with_sessions:
+ res = MongoConnector.banks.find({})
+ else:
+ res = MongoConnector.banks.find({}, {'sessions': 0})
+ for r in res:
+ bank_list.append(r)
+ return bank_list
+
+ def controls(self):
+ """
+ Initial controls (create directories etc...)
+ """
+ data_dir = self.config.get('data.dir')
+ bank_dir = self.config.get('dir.version')
+ bank_dir = os.path.join(data_dir, bank_dir)
+ if not os.path.exists(bank_dir):
+ os.makedirs(bank_dir)
+
+ offline_dir = self.config.get('offline.dir.name')
+ offline_dir = os.path.join(data_dir, offline_dir)
+ if not os.path.exists(offline_dir):
+ os.makedirs(offline_dir)
+
+ log_dir = self.config.get('log.dir')
+ log_dir = os.path.join(log_dir, self.name)
+ if not os.path.exists(log_dir):
+ os.makedirs(log_dir)
+
+ def _delete(self):
+ """
+ Delete bank from database, not files
+ """
+ self.banks.remove({'_id': self.bank['_id']})
+
+ def save_session(self):
+ """
+ Save session in database
+ """
+ self.session._session['last_update_time'] = time.time()
+ self.session._session['log_file'] = self.config.log_file
+ if self.use_last_session:
+ # Remove last session
+ self.banks.update({'name': self.name}, {'$pull': {'sessions': {'id': self.session._session['id']}}})
+ # Insert session
+ if self.session.get('action') == 'update':
+ action = 'last_update_session'
+ if self.session.get('action') == 'remove':
+ action = 'last_remove_session'
+
+ cache_dir = self.config.get('cache.dir')
+ download_files = self.session.get('download_files')
+ if download_files is not None:
+ f_downloaded_files = open(os.path.join(cache_dir, 'files_' + str(self.session.get('id'))), 'w')
+ f_downloaded_files.write(json.dumps(download_files))
+ f_downloaded_files.close()
+ self.session.set('download_files', [])
+
+ local_files = self.session.get('files')
+ if local_files is not None:
+ f_local_files = open(os.path.join(cache_dir, 'local_files_' + str(self.session.get('id'))), 'w')
+ f_local_files.write(json.dumps(local_files))
+ f_local_files.close()
+ self.session.set('files', [])
+
+ self.banks.update({'name': self.name}, {
+ '$set': {
+ action: self.session._session['id'],
+ 'properties': self.get_properties()
+ },
+ '$push': {'sessions': self.session._session}
+ })
+ BmajIndex.add(self.name, self.session._session)
+ if self.session.get('action') == 'update' and not self.session.get_status(Workflow.FLOW_OVER)\
+ and self.session.get('release'):
+ release = self.session.get('release')
+ found = self.banks.find_one({'name': self.name, 'pending.release': release})
+ if found is None:
+ self.banks.update({'name': self.name},
+ {'$push': {'pending': {'release': self.session.get('release'),
+ 'id': self.session._session['id']}}})
+
+ if self.session.get('action') == 'update' and self.session.get_status(Workflow.FLOW_OVER) and self.session.get(
+ 'update'):
+ # We expect that a production release has reached the FLOW_OVER status.
+ # If no update is needed (same release etc...), the *update* flag of the session is set to False
+ logging.debug('Bank:Save:' + self.name)
+ if len(self.bank['production']) > 0:
+ # Remove from database
+ self.banks.update({'name': self.name},
+ {'$pull': {'production': {'release': self.session._session['release']}}})
+
+ release_types = []
+ if self.config.get('db.type'):
+ release_types = self.config.get('db.type').split(',')
+ release_formats = list(self.session._session['formats'].keys())
+ if self.config.get('db.formats'):
+ config_formats = self.config.get('db.formats').split(',')
+ for config_format in config_formats:
+ if config_format not in release_formats:
+ release_formats.append(config_format)
+
+ for release_format in self.session._session['formats']:
+ for release_files in self.session._session['formats'][release_format]:
+ if release_files['types']:
+ for rtype in release_files['types']:
+ if rtype not in release_types:
+ release_types.append(rtype)
+ prod_dir = self.session.get_release_directory()
+ if self.session.get('prod_dir'):
+ prod_dir = self.session.get('prod_dir')
+ production = {'release': self.session.get('release'),
+ 'remoterelease': self.session.get('remoterelease'),
+ 'session': self.session._session['id'],
+ 'formats': release_formats,
+ 'types': release_types,
+ 'size': self.session.get('fullsize'),
+ 'data_dir': self.session._session['data_dir'],
+ 'dir_version': self.session._session['dir_version'],
+ 'prod_dir': prod_dir,
+ 'freeze': False}
+ self.bank['production'].append(production)
+ self.banks.update({'name': self.name},
+ {'$push': {'production': production},
+ '$pull': {'pending': {'release': self.session.get('release'),
+ 'id': self.session._session['id']}}
+ })
+
+ self.bank = self.banks.find_one({'name': self.name})
+
+ def clean_old_sessions(self):
+ """
+ Delete old sessions, not latest ones nor related to production sessions
+ """
+ if self.session is None:
+ return
+ # No previous session
+ if 'sessions' not in self.bank:
+ return
+ if self.config.get_bool('keep.old.sessions'):
+ logging.debug('keep old sessions, skipping...')
+ return
+ # 'last_update_session' in self.bank and self.bank['last_update_session']
+ old_sessions = []
+ prod_releases = []
+ for session in self.bank['sessions']:
+ if session['id'] == self.session.get('id'):
+ # Current session
+ prod_releases.append(session['release'])
+ continue
+ if session['id'] == self.session.get('last_update_session'):
+ prod_releases.append(session['release'])
+ continue
+ if session['id'] == self.session.get('last_remove_session'):
+ continue
+ is_prod_session = False
+ for prod in self.bank['production']:
+ if session['id'] == prod['session']:
+ is_prod_session = True
+ break
+ if is_prod_session:
+ prod_releases.append(session['release'])
+ continue
+ old_sessions.append(session)
+ if len(old_sessions) > 0:
+ for session in old_sessions:
+ session_id = session['id']
+ self.banks.update({'name': self.name}, {'$pull': {'sessions': {'id': session_id}}})
+ # Check if in pending sessions
+ if 'pending' in self.bank:
+ for rel in self.bank['pending']:
+ rel_session = rel['id']
+ if rel_session == session_id:
+ self.banks.update({'name': self.name},
+ {'$pull': {'pending': {'release': session['release'], 'id': session_id}}})
+ if session['release'] not in prod_releases and session['release'] != self.session.get('release'):
+ # There might be unfinished releases linked to session, delete them
+ # if they are not related to a production directory or latest run
+ session_dir = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ self.name + self.config.get('release.separator', default='_') + str(session['release']))
+ if os.path.exists(session_dir):
+ logging.info('Bank:DeleteOldSessionDir:' + self.name + self.config.get('release.separator', default='_') + str(session['release']))
+ shutil.rmtree(session_dir)
+ self.bank = self.banks.find_one({'name': self.name})
+
+ def publish(self):
+ """
+ Set session release to *current*
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ current_link = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'current')
+
+ to_dir = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'))
+
+ if os.path.lexists(current_link):
+ os.remove(current_link)
+ os.chdir(to_dir)
+ os.symlink(self.session.get_release_directory(), 'current')
+ self.bank['current'] = self.session._session['id']
+ self.banks.update(
+ {'name': self.name},
+ {'$set': {'current': self.session._session['id']}}
+ )
+
+ release_file = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'RELEASE.txt')
+
+ with open(release_file, 'w') as outfile:
+ outfile.write('Bank: %s\nRelease: %s\nRemote release:%s\n' % (self.name, self.session.get('release'), self.session.get('remoterelease')))
+
+ def unpublish(self):
+ """
+ Unset *current*
+ """
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ current_link = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'current')
+
+ if os.path.lexists(current_link):
+ os.remove(current_link)
+
+ release_file = os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'),
+ 'RELEASE.txt')
+ if os.path.exists(release_file):
+ os.remove(release_file)
+
+ self.banks.update(
+ {'name': self.name},
+ {'$set': {'current': None}}
+ )
+
+ def get_production(self, release):
+ """
+ Get production field for release
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: production field
+ """
+ release = str(release)
+ production = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ production = prod
+ return production
+
+ def freeze(self, release):
+ """
+ Freeze a production release
+
+ When frozen, a production release cannot be removed (manually or automatically)
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ rel = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ rel = prod['release']
+ if rel is None:
+ logging.error('Release not found: ' + release)
+ return False
+ self.banks.update({'name': self.name, 'production.release': rel}, {'$set': {'production.$.freeze': True}})
+ self.bank = self.banks.find_one({'name': self.name})
+ return True
+
+ def unfreeze(self, release):
+ """
+ Unfreeze a production release to allow removal
+
+ :param release: release name or production dir name
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ rel = None
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ rel = prod['release']
+ if rel is None:
+ logging.error('Release not found: ' + release)
+ return False
+ self.banks.update({'name': self.name, 'production.release': rel}, {'$set': {'production.$.freeze': False}})
+ self.bank = self.banks.find_one({'name': self.name})
+ return True
+
+ def get_new_session(self, flow=None):
+ """
+ Returns an empty session
+
+ :param flow: kind of workflow
+ :type flow: :func:`biomaj.workflow.Workflow.FLOW`
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+ return Session(self.name, self.config, flow)
+
+ def get_session_from_release(self, release):
+ """
+ Loads the session matching a specific release
+
+ :param release: release name or production dir
+ :type release: str
+ :return: :class:`biomaj.session.Session`
+ """
+ release = str(release)
+ oldsession = None
+ # Search production release matching release
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ # Search session related to this production release
+ for s in self.bank['sessions']:
+ if s['id'] == prod['session']:
+ oldsession = s
+ break
+ break
+ if oldsession is None:
+ # No prod session, try to find a session for this release, session may have failed or be stopped
+ for s in self.bank['sessions']:
+ if s['release'] and release.endswith(s['release']):
+ oldsession = s
+ if oldsession is None:
+ logging.error('No production session could be found for this release')
+ return oldsession
+
+ def load_session(self, flow=None, session=None):
+ """
+ Loads the last session or, if it is over or a restart is forced, creates a new one
+
+ :param flow: kind of workflow
+ :type flow: :func:`biomaj.workflow.Workflow.FLOW`
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+
+ if session is not None:
+ logging.debug('Load specified session ' + str(session['id']))
+ self.session = Session(self.name, self.config, flow)
+ self.session.load(session)
+ self.use_last_session = True
+ return
+ if len(self.bank['sessions']) == 0 or self.options.get_option(Options.FROMSCRATCH):
+ self.session = Session(self.name, self.config, flow)
+ logging.debug('Start new session')
+ else:
+ # Take last session
+ self.session = Session(self.name, self.config, flow)
+ session_id = None
+ # Load previous session for updates only
+ if self.session.get('action') == 'update' and 'last_update_session' in self.bank and self.bank[
+ 'last_update_session']:
+ session_id = self.bank['last_update_session']
+ load_session = None
+ for session in self.bank['sessions']:
+ if session['id'] == session_id:
+ load_session = session
+ break
+ if load_session is not None:
+ # self.session.load(self.bank['sessions'][len(self.bank['sessions'])-1])
+ self.session.load(session)
+ # if self.config.last_modified > self.session.get('last_modified'):
+ # # Config has changed, need to restart
+ # self.session = Session(self.name, self.config, flow)
+ # logging.info('Configuration file has been modified since last session, restart in any case a new session')
+ if self.session.get_status(Workflow.FLOW_OVER) and self.options.get_option(
+ Options.FROM_TASK) is None:
+ previous_release = self.session.get('remoterelease')
+ self.session = Session(self.name, self.config, flow)
+ self.session.set('previous_release', previous_release)
+ logging.debug('Start new session')
+ else:
+ logging.debug('Load previous session ' + str(self.session.get('id')))
+ self.use_last_session = True
+
+ def remove_session(self, sid):
+ """
+ Delete a session from db
+
+ :param sid: id of the session
+ :type sid: long
+ :return: bool
+ """
+ session_release = None
+ _tmpbank = self.banks.find_one({'name': self.name})
+ for s in _tmpbank['sessions']:
+ if s['id'] == sid:
+ session_release = s['release']
+
+ cache_dir = self.config.get('cache.dir')
+ download_files = os.path.join(cache_dir, 'files_' + str(sid))
+ if os.path.exists(download_files):
+ os.remove(download_files)
+
+ local_files = os.path.join(cache_dir, 'local_files_' + str(sid))
+ if os.path.exists(local_files):
+ os.remove(local_files)
+
+ if self.config.get_bool('keep.old.sessions'):
+ logging.debug('keep old sessions')
+ if session_release is not None:
+ self.banks.update({'name': self.name}, {
+ '$pull': {
+ 'production': {'session': sid},
+ 'pending': {
+ 'release': session_release,
+ 'id': sid
+ }
+ }
+ })
+ else:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'production': {'session': sid}
+ }
+ })
+ self.banks.update({'name': self.name, 'sessions.id': sid},
+ {'$set': {'sessions.$.deleted': time.time()}})
+ else:
+ if session_release is not None:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'sessions': {'id': sid},
+ 'production': {'session': sid},
+ 'pending': {'release': session_release,
+ 'id': sid}
+ }
+ })
+ else:
+ self.banks.update({'name': self.name}, {'$pull': {
+ 'sessions': {'id': sid},
+ 'production': {'session': sid}
+ }
+ })
+ # Update object
+ self.bank = self.banks.find_one({'name': self.name})
+ if session_release is not None:
+ BmajIndex.remove(self.name, session_release)
+ return True
+
+ def get_data_dir(self):
+ """
+ Returns bank data directory
+
+ :return: str
+ """
+ return os.path.join(self.config.get('data.dir'),
+ self.config.get('dir.version'))
+
+ def removeAll(self, force=False):
+ """
+ Remove all bank releases and database records
+
+ :param force: force removal even if some production dirs are frozen
+ :type force: bool
+ :return: bool
+ """
+ if not force:
+ has_freeze = False
+ for prod in self.bank['production']:
+ if 'freeze' in prod and prod['freeze']:
+ has_freeze = True
+ break
+ if has_freeze:
+ logging.error('Cannot remove bank, some production directories are frozen, use force if needed')
+ return False
+
+ self.banks.remove({'name': self.name})
+ BmajIndex.delete_all_bank(self.name)
+ bank_data_dir = self.get_data_dir()
+ logging.warn('DELETE ' + bank_data_dir)
+ if os.path.exists(bank_data_dir):
+ shutil.rmtree(bank_data_dir)
+ bank_offline_dir = os.path.join(self.config.get('data.dir'), self.config.get('offline.dir.name'))
+ if os.path.exists(bank_offline_dir):
+ shutil.rmtree(bank_offline_dir)
+ bank_log_dir = os.path.join(self.config.get('log.dir'), self.name)
+ if os.path.exists(bank_log_dir) and self.no_log:
+ shutil.rmtree(bank_log_dir)
+ return True
+
+ def get_status(self):
+ """
+ Get status of current workflow
+
+ :return: dict of current workflow status
+ """
+ if 'status' not in self.bank or self.bank['status'] is None:
+ return {}
+ return self.bank['status']
+
+ def remove_pending(self, release=None):
+ """
+ Remove pending releases. If release is None, all pending releases are removed.
+
+ :param release: release or release directory, default None
+ :type release: str
+ :return: bool
+ """
+ if release:
+ release = str(release)
+ logging.warning('Bank:' + self.name + ':RemovePending')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ if 'pending' not in self.bank:
+ return True
+ pendings = self.bank['pending']
+
+ for pending in pendings:
+ # Only work with pending for argument release
+ if release and release != pending['release']:
+ continue
+ pending_session_id = pending['id']
+ pending_session = None
+ for s in self.bank['sessions']:
+ if s['id'] == pending_session_id:
+ pending_session = s
+ break
+ session = Session(self.name, self.config, RemoveWorkflow.FLOW)
+ if pending_session is None:
+ session._session['release'] = pending['release']
+ else:
+ session.load(pending_session)
+ if os.path.exists(session.get_full_release_directory()):
+ logging.debug("Remove:Pending:Dir:" + session.get_full_release_directory())
+ shutil.rmtree(session.get_full_release_directory())
+ self.remove_session(pending['id'])
+ # If no release ask for deletion, remove all pending
+ if not release:
+ self.banks.update({'name': self.name}, {'$set': {'pending': []}})
+ return True
+
+ def remove(self, release):
+ """
+ Remove a release (db and files)
+
+ :param release: release or release directory
+ :type release: str
+ :return: bool
+ """
+ release = str(release)
+ logging.warning('Bank:' + self.name + ':Remove')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.session = self.get_new_session(RemoveWorkflow.FLOW)
+ oldsession = None
+ # Search production release matching release
+ for prod in self.bank['production']:
+ if prod['release'] == release or prod['prod_dir'] == release:
+ if 'freeze' in prod and prod['freeze']:
+ logging.error('Cannot remove release, release is frozen, unfreeze it first')
+ return False
+ # Search session related to this production release
+ for s in self.bank['sessions']:
+ if s['id'] == prod['session']:
+ oldsession = s
+ break
+ break
+ if oldsession is None:
+ logging.error('No production session could be found for this release')
+ return False
+ if 'current' in self.bank and self.bank['current'] == oldsession['id']:
+ logging.error('This release is the currently published release, you should first unpublish it')
+ return False
+
+ # New empty session for removal
+ session = Session(self.name, self.config, RemoveWorkflow.FLOW)
+ session.set('action', 'remove')
+ session.set('release', oldsession['release'])
+ session.set('update_session_id', oldsession['id'])
+ self.session = session
+ # Reset status, we take an update session
+ res = self.start_remove(session)
+ self.session.set('workflow_status', res)
+
+ self.save_session()
+
+ return res
+
+ def update(self, depends=False):
+ """
+ Launch a bank update
+
+ :param depends: run update of bank dependencies first
+ :type depends: bool
+ :return: bool
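+
+ Example (a sketch, assuming the configuration is loaded and the
+ current user is allowed to update the hypothetical bank 'mybank')::
+
+     bank = Bank('mybank')
+     updated = bank.update(depends=True)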
+ """
+ logging.warning('Bank:' + self.name + ':Update')
+
+ if not self.is_owner():
+ logging.error('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+ raise Exception('Not authorized, bank owned by ' + self.bank['properties']['owner'])
+
+ self.run_depends = depends
+
+ self.controls()
+ if self.options.get_option('release'):
+ logging.info('Bank:' + self.name + ':Release:' + self.options.get_option('release'))
+ s = self.get_session_from_release(self.options.get_option('release'))
+ # No session in prod
+ if s is None:
+ logging.error('Release does not exist: ' + self.options.get_option('release'))
+ return False
+ self.load_session(UpdateWorkflow.FLOW, s)
+ else:
+ logging.info('Bank:' + self.name + ':Release:latest')
+ self.load_session(UpdateWorkflow.FLOW)
+ # if from task, reset workflow status in session.
+ if self.options.get_option('from_task'):
+ set_to_false = False
+ for task in self.session.flow:
+ # If task was in False status (KO) and we ask to start after this task, exit
+ if not set_to_false and not self.session.get_status(task['name']) and \
+ task['name'] != self.options.get_option('from_task'):
+ logging.error(
+ 'Previous task ' + task['name'] + ' was not successful, cannot restart after this task')
+ return False
+ if task['name'] == self.options.get_option('from_task'):
+ set_to_false = True
+ if set_to_false:
+ # After from_task task, tasks must be set to False to be run
+ self.session.set_status(task['name'], False)
+ proc = None
+ if task['name'] in [Workflow.FLOW_POSTPROCESS, Workflow.FLOW_PREPROCESS,
+ Workflow.FLOW_REMOVEPROCESS]:
+ proc = self.options.get_option('process')
+ reset = self.session.reset_proc(task['name'], proc)
+ if not reset:
+ logging.info("Process %s not found in %s" % (str(proc), task['name']))
+ return False
+
+ self.session.set('action', 'update')
+ res = self.start_update()
+ self.session.set('workflow_status', res)
+ self.save_session()
+ self.__stats()
+ return res
+
+ def __stats(self):
+ '''
+ Send stats to InfluxDB if enabled
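+
+ Stats are sent only when an ``influxdb.host`` property is configured,
+ e.g. (example values)::
+
+     influxdb.host=localhost
+     influxdb.port=8086
+     influxdb.db=biomaj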
+ '''
+ db_host = self.config.get('influxdb.host', default=None)
+ if not db_host:
+ return
+ if not self.session.get_status(Workflow.FLOW_OVER):
+ return
+ if 'stats' not in self.session._session:
+ return
+
+ db_port = int(self.config.get('influxdb.port', default='8086'))
+ db_user = self.config.get('influxdb.user', default=None)
+ db_password = self.config.get('influxdb.password', default=None)
+ db_name = self.config.get('influxdb.db', default='biomaj')
+ influxdb = None
+ try:
+ if db_user and db_password:
+ influxdb = InfluxDBClient(host=db_host, port=db_port, username=db_user, password=db_password, database=db_name)
+ else:
+ influxdb = InfluxDBClient(host=db_host, port=db_port, database=db_name)
+ except Exception as e:
+ logging.error('InfluxDB connection error: ' + str(e))
+ return
+ metrics = []
+
+ if 'production' not in self.bank or not self.bank['production']:
+ return
+
+ productions = self.bank['production']
+ total_size = 0
+ latest_size = 0
+ if not productions:
+ return
+ for production in productions:
+ if 'size' in production:
+ total_size += production['size']
+
+ influx_metric = {
+ "measurement": 'biomaj.production.size.total',
+ "fields": {
+ "value": float(total_size)
+ },
+ "tags": {
+ "bank": self.name
+ }
+ }
+ metrics.append(influx_metric)
+
+ all_banks = Bank.list()
+ nb_banks_with_prod = 0
+ if all_banks:
+ for bank_info in all_banks:
+ if 'production' in bank_info and len(bank_info['production']) > 0:
+ nb_banks_with_prod += 1
+ influx_metric = {
+ "measurement": 'biomaj.banks.quantity',
+ "fields": {
+ "value": nb_banks_with_prod
+ }
+ }
+ metrics.append(influx_metric)
+
+ workflow_duration = 0
+ for flow in list(self.session._session['stats']['workflow'].keys()):
+ workflow_duration += self.session._session['stats']['workflow'][flow]
+
+ influx_metric = {
+ "measurement": 'biomaj.workflow.duration',
+ "fields": {
+ "value": workflow_duration
+ },
+ "tags": {
+ "bank": self.name
+ }
+ }
+ metrics.append(influx_metric)
+
+ if self.session.get('update'):
+ latest_size = self.session.get('fullsize')
+ influx_metric = {
+ "measurement": 'biomaj.production.size.latest',
+ "fields": {
+ "value": float(latest_size)
+ },
+ "tags": {
+ "bank": self.name
+ }
+ }
+ metrics.append(influx_metric)
+
+ influx_metric = {
+ "measurement": 'biomaj.bank.update.downloaded_files',
+ "fields": {
+ "value": self.session._session['stats']['nb_downloaded_files']
+ },
+ "tags": {
+ "bank": self.name
+ }
+ }
+ metrics.append(influx_metric)
+
+ influx_metric = {
+ "measurement": 'biomaj.bank.update.new',
+ "fields": {
+ "value": 1
+ },
+ "tags": {
+ "bank": self.name
+ }
+ }
+ metrics.append(influx_metric)
+
+ res = influxdb.write_points(metrics, time_precision="s")
+ if not res:
+ logging.error('Failed to send metrics to database')
+
+ def check_remote_release(self):
+ '''
+ Check remote release of the bank
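+
+ Example (a sketch)::
+
+     (ok, remoterelease) = bank.check_remote_release()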
+ '''
+ logging.warning('Bank:' + self.name + ':Check remote release')
+ self.controls()
+ logging.info('Bank:' + self.name + ':Release:latest')
+ self.load_session(ReleaseCheckWorkflow.FLOW)
+ workflow = ReleaseCheckWorkflow(self)
+ res = workflow.start()
+ remoterelease = None
+ if res:
+ remoterelease = workflow.session.get('remoterelease')
+ return (res, remoterelease)
+
+ def start_remove(self, session):
+ """
+ Start a removal workflow
+
+ :param session: Session to remove
+ :type session: :class:`biomaj.session.Session`
+ :return: bool
+ """
+ workflow = RemoveWorkflow(self, session)
+ if self.options and self.options.get_option('redis_host'):
+ redis_client = redis.StrictRedis(
+ host=self.options.get_option('redis_host'),
+ port=self.options.get_option('redis_port'),
+ db=self.options.get_option('redis_db'),
+ decode_responses=True
+ )
+ workflow.redis_client = redis_client
+ workflow.redis_prefix = self.options.get_option('redis_prefix')
+ if redis_client.get(self.options.get_option('redis_prefix') + ':' + self.name + ':action:cancel'):
+ logging.warn('Cancel requested, stopping removal')
+ redis_client.delete(self.options.get_option('redis_prefix') + ':' + self.name + ':action:cancel')
+ return False
+ return workflow.start()
+
+ def start_update(self):
+ """
+ Start an update workflow
+ """
+ workflow = UpdateWorkflow(self)
+ if self.options and self.options.get_option('redis_host'):
+ redis_client = redis.StrictRedis(
+ host=self.options.get_option('redis_host'),
+ port=self.options.get_option('redis_port'),
+ db=self.options.get_option('redis_db'),
+ decode_responses=True
+ )
+ workflow.redis_client = redis_client
+ workflow.redis_prefix = self.options.get_option('redis_prefix')
+ if redis_client.get(self.options.get_option('redis_prefix') + ':' + self.name + ':action:cancel'):
+ logging.warn('Cancel requested, stopping update')
+ redis_client.delete(self.options.get_option('redis_prefix') + ':' + self.name + ':action:cancel')
+ return False
+ return workflow.start()
diff --git a/biomaj/mongo_connector.py b/biomaj/mongo_connector.py
new file mode 100644
index 0000000..76e3435
--- /dev/null
+++ b/biomaj/mongo_connector.py
@@ -0,0 +1,20 @@
+from builtins import object
+from pymongo import MongoClient
+
+
+class MongoConnector(object):
+ """
+ Connector to mongodb
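+
+ The connection is opened once and shared through class attributes,
+ e.g. (example URL and bank name)::
+
+     MongoConnector('mongodb://localhost:27017', 'biomaj')
+     bank = MongoConnector.banks.find_one({'name': 'mybank'})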
+ """
+
+ client = None
+ db = None
+ banks = None
+ users = None
+
+ def __init__(self, url, db):
+ MongoConnector.client = MongoClient(url)
+ MongoConnector.db = MongoConnector.client[db]
+ MongoConnector.banks = MongoConnector.db.banks
+ MongoConnector.users = MongoConnector.db.users
+ MongoConnector.db_schema = MongoConnector.db.db_schema
diff --git a/biomaj/notify.py b/biomaj/notify.py
new file mode 100644
index 0000000..ee76417
--- /dev/null
+++ b/biomaj/notify.py
@@ -0,0 +1,60 @@
+from builtins import str
+from builtins import object
+import smtplib
+import email.utils
+from biomaj.workflow import Workflow
+import logging
+import sys
+if sys.version < '3':
+ from email.MIMEText import MIMEText
+else:
+ from email.mime.text import MIMEText
+
+
+class Notify(object):
+ """
+ Send notifications
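+
+ Mail notification relies on mail properties from the configuration,
+ e.g. (example values)::
+
+     mail.smtp.host=localhost
+     mail.admin=admin@example.org
+     mail.from=biomaj@example.org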
+ """
+
+ @staticmethod
+ def notifyBankAction(bank):
+ if not bank.config.get('mail.smtp.host') or bank.session is None:
+ logging.info('Notify:none')
+ return
+ admins = bank.config.get('mail.admin')
+ if not admins:
+ logging.info('Notify: no mail.admin defined')
+ return
+ admin_list = admins.split(',')
+ logging.info('Notify:' + bank.config.get('mail.admin'))
+ mfrom = bank.config.get('mail.from')
+ log_file = bank.config.log_file
+ msg = MIMEText('')
+ if log_file:
+ fp = None
+ if sys.version < '3':
+ fp = open(log_file, 'rb')
+ else:
+ fp = open(log_file, 'r')
+ msg = MIMEText(fp.read(2000000))
+ fp.close()
+
+ msg['From'] = email.utils.formataddr(('Author', mfrom))
+ msg['Subject'] = 'BANK[' + bank.name + '] - STATUS[' + str(bank.session.get_status(Workflow.FLOW_OVER)) + '] - UPDATE[' + str(bank.session.get('update')) + '] - REMOVE[' + str(bank.session.get('remove')) + ']' + ' - RELEASE[' + str(bank.session.get('release')) + ']'
+
+ logging.info(msg['subject'])
+ server = None
+ for mto in admin_list:
+ msg['To'] = email.utils.formataddr(('Recipient', mto))
+ try:
+ server = smtplib.SMTP(bank.config.get('mail.smtp.host'))
+ if bank.config.get('mail.tls') is not None and str(bank.config.get('mail.tls')) == 'true':
+ server.starttls()
+ if bank.config.get('mail.user') is not None and str(bank.config.get('mail.user')) != '':
+ server.login(bank.config.get('mail.user'), bank.config.get('mail.password'))
+ server.sendmail(mfrom, [mto], msg.as_string())
+ except Exception as e:
+ logging.error('Could not send email: ' + str(e))
+ finally:
+ if server is not None:
+ server.quit()
diff --git a/biomaj/options.py b/biomaj/options.py
new file mode 100644
index 0000000..f51b305
--- /dev/null
+++ b/biomaj/options.py
@@ -0,0 +1,27 @@
+from builtins import object
+
+
+class Options(object):
+ """
+ Available options
+ """
+
+ def __init__(self, options=None):
+ self.options = options
+
+ def get_option(self, option):
+ """
+ Gets an option if present, else return None
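+
+ Example (a sketch)::
+
+     options = Options()
+     options.publish = True
+     options.get_option(Options.PUBLISH)  # -> True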
+ """
+ if hasattr(self, option):
+ return getattr(self, option)
+ return None
+
+ UPDATE = 'update'
+ REMOVE = 'remove'
+ PUBLISH = 'publish'
+ FROM_TASK = 'from_task'
+ PROCESS = 'process'
+ STOP_BEFORE = 'stop_before'
+ STOP_AFTER = 'stop_after'
+ FROMSCRATCH = 'fromscratch'
diff --git a/biomaj/process/__init__.py b/biomaj/process/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/biomaj/process/metaprocess.py b/biomaj/process/metaprocess.py
new file mode 100644
index 0000000..9838596
--- /dev/null
+++ b/biomaj/process/metaprocess.py
@@ -0,0 +1,330 @@
+from builtins import str
+import threading
+import logging
+import os
+
+from biomaj_process.process import Process, DrmaaProcess, DockerProcess
+from biomaj_process.process import RemoteProcess
+from biomaj.mongo_connector import MongoConnector
+from biomaj_zipkin.zipkin import Zipkin
+
+
+class MetaProcess(threading.Thread):
+ '''
+ Meta process in biomaj process workflow. Meta processes are executed in parallel.
+
+ Each meta process defines a list of Processes to execute sequentially
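+
+ A meta process and its processes are declared in the bank properties,
+ e.g. (a sketch with hypothetical names and values)::
+
+     db.pre.process=META1
+     META1=PROC1,PROC2
+     PROC1.desc=First process
+     PROC1.type=example
+     PROC1.exe=my_script.sh
+     PROC1.args=arg1 arg2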
+ '''
+
+ def __init__(self, bank, metas, meta_status=None, meta_data=None, simulate=False):
+ '''
+ Creates a meta process thread
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: list of meta processes to execute in thread
+ :type metas: list of str
+ :param meta_status: initial status of the meta processes
+ :type meta_status: bool
+ :param simulate: does not execute process
+ :type simulate: bool
+ '''
+ if meta_data is None:
+ meta_data = {}
+ threading.Thread.__init__(self)
+ self._lock = None
+ self.kill_received = False
+ self.workflow = None
+ self.simulate = simulate
+ self.bank = bank
+ self.metas = metas
+ self.meta_data = meta_data
+ self.meta_status = {}
+ for meta in self.metas:
+ self.meta_status[meta] = {}
+
+ if meta_status is not None:
+ self.meta_status = meta_status
+
+ self._stopevent = threading.Event()
+
+ self.bmaj_env = os.environ.copy()
+
+ self.bmaj_only_env = {}
+ # The root directory where all databases are stored.
+ # If your data is not stored under one directory hierarchy
+ # you can override this value in the database properties file.
+ for conf in dict(self.bank.config.config_bank.items('GENERAL')):
+ self.bmaj_env[conf] = self.bank.config.config_bank.get('GENERAL', conf)
+ if self.bmaj_env[conf] is None:
+ self.bmaj_env[conf] = ''
+ self.bmaj_only_env[conf] = self.bmaj_env[conf]
+
+ self.bmaj_env['dbname'] = self.bank.name
+ self.bmaj_only_env['dbname'] = self.bmaj_env['dbname']
+
+ self.bmaj_env['datadir'] = self.bank.config.get('data.dir')
+ self.bmaj_only_env['datadir'] = self.bmaj_env['datadir']
+
+ self.bmaj_env['data.dir'] = self.bmaj_env['datadir']
+ self.bmaj_only_env['data.dir'] = self.bmaj_env['data.dir']
+
+ if self.bank.config.get('mail.admin'):
+ self.bmaj_env['mailadmin'] = self.bank.config.get('mail.admin')
+ self.bmaj_only_env['mailadmin'] = self.bmaj_env['mailadmin']
+
+ if self.bank.config.get('mail.smtp.host'):
+ self.bmaj_env['mailsmtp'] = self.bank.config.get('mail.smtp.host')
+ self.bmaj_only_env['mailsmtp'] = self.bmaj_env['mailsmtp']
+
+ self.bmaj_env['processdir'] = self.bank.config.get('process.dir', default='')
+ self.bmaj_only_env['processdir'] = self.bmaj_env['processdir']
+
+ if 'PATH' in self.bmaj_env:
+ self.bmaj_env['PATH'] += ':' + self.bmaj_env['processdir']
+ self.bmaj_only_env['PATH'] = self.bmaj_env['PATH']
+ else:
+ self.bmaj_env['PATH'] = self.bmaj_env['processdir'] + ':/usr/local/bin:/usr/sbin:/usr/bin'
+ self.bmaj_only_env['PATH'] = self.bmaj_env['PATH']
+
+ self.bmaj_env['PP_DEPENDENCE'] = '#'
+ self.bmaj_only_env['PP_DEPENDENCE'] = '#'
+ self.bmaj_env['PP_DEPENDENCE_VOLATILE'] = '#'
+ self.bmaj_only_env['PP_DEPENDENCE_VOLATILE'] = '#'
+ self.bmaj_env['PP_WARNING'] = '#'
+ self.bmaj_only_env['PP_WARNING'] = '#'
+
+ self.bmaj_env['PATH_PROCESS_BIOMAJ'] = self.bank.config.get('process.dir')
+ self.bmaj_only_env['PATH_PROCESS_BIOMAJ'] = self.bank.config.get('process.dir')
+
+ # Set some session specific env
+ if self.bank.session is not None:
+
+ if self.bank.session.get('log_file') is not None:
+ log_file = self.bank.session.get('log_file')
+ log_dir = os.path.dirname(log_file)
+ self.bmaj_env['logdir'] = log_dir
+ self.bmaj_only_env['logdir'] = log_dir
+ self.bmaj_env['logfile'] = log_file
+ self.bmaj_only_env['logfile'] = log_file
+
+ self.bmaj_env['offlinedir'] = self.bank.session.get_offline_directory()
+ self.bmaj_only_env['offlinedir'] = self.bmaj_env['offlinedir']
+
+ self.bmaj_env['dirversion'] = self.bank.config.get('dir.version')
+ self.bmaj_only_env['dirversion'] = self.bmaj_env['dirversion']
+
+ self.bmaj_env['noextract'] = self.bank.config.get('no.extract')
+ if self.bmaj_env['noextract'] is None:
+ self.bmaj_env['noextract'] = ''
+ self.bmaj_only_env['noextract'] = self.bmaj_env['noextract']
+
+ self.bmaj_env['localrelease'] = self.bank.session.get_release_directory()
+ self.bmaj_only_env['localrelease'] = self.bmaj_env['localrelease']
+ if self.bank.session.get('release') is not None:
+ self.bmaj_env['remoterelease'] = self.bank.session.get('remoterelease')
+ self.bmaj_only_env['remoterelease'] = self.bmaj_env['remoterelease']
+ self.bmaj_env['removedrelease'] = self.bank.session.get('release')
+ self.bmaj_only_env['removedrelease'] = self.bmaj_env['removedrelease']
+
+ for bdep in self.bank.depends:
+ self.bmaj_env[bdep.name + 'source'] = bdep.session.get_full_release_directory()
+ self.bmaj_only_env[bdep.name + 'source'] = self.bmaj_env[bdep.name + 'source']
+
+ # Fix case where a var = None
+ for key in list(self.bmaj_only_env.keys()):
+ if self.bmaj_only_env[key] is None:
+ self.bmaj_env[key] = ''
+ self.bmaj_only_env[key] = ''
+
+ def set_progress(self, name, status=None):
+ '''
+ Update process progress status in the database
+
+ :param name: name of process
+ :type name: str
+ :param status: status of process
+ :type status: bool or None
+ '''
+ logging.debug('Process:progress:' + name + "=" + str(status))
+ if self.workflow is not None:
+ MongoConnector.banks.update(
+ {'name': self.bank.name},
+ {'$set': {'status.' + self.workflow + '.progress.' + name: status}}
+ )
+
+ def run(self):
+ # Run meta processes
+ self.global_status = True
+ for meta in self.metas:
+ if not self._stopevent.isSet():
+ logging.info("PROC:META:RUN:" + meta)
+ processes = []
+ if self.bank.config.get(meta) is not None:
+ processes = self.bank.config.get(meta).split(',')
+ processes_status = {}
+ for bprocess in processes:
+ if self.kill_received:
+ raise Exception('Kill request received, exiting')
+ # Process status already ok, do not replay
+ if meta in self.meta_status and bprocess in self.meta_status[meta] and self.meta_status[meta][bprocess]:
+ logging.info("PROC:META:SKIP:PROCESS:" + bprocess)
+ processes_status[bprocess] = True
+ continue
+ logging.info("PROC:META:RUN:PROCESS:" + bprocess)
+ # bprocess.name may not be unique
+ name = bprocess
+ desc = self.bank.config.get(bprocess + '.desc')
+ cluster = self.bank.config.get_bool(bprocess + '.cluster', default=False)
+ docker = self.bank.config.get(bprocess + '.docker')
+ proc_type = self.bank.config.get(bprocess + '.type')
+ exe = self.bank.config.get(bprocess + '.exe')
+ args = self.bank.config.get(bprocess + '.args')
+ expand = self.bank.config.get_bool(bprocess + '.expand', default=True)
+ if cluster:
+ native = self.bank.config.get(bprocess + '.native')
+ bmaj_process = DrmaaProcess(meta + '_' + name, exe, args, desc, proc_type, native,
+ expand, self.bmaj_env,
+ os.path.dirname(self.bank.config.log_file))
+ else:
+ if self.bank.config.get('micro.biomaj.service.process', default=None) == '1':
+ logging.info("PROC:META:RUN:REMOTEPROCESS: " + bprocess)
+ # (self, name, exe, args, desc=None, proc_type=None, expand=True,
+ # bank_env=None, log_dir=None,
+ # rabbit_mq=None, rabbit_mq_port=5672, rabbit_mq_user=None, rabbit_mq_password=None, rabbit_mq_virtualhost=None,
+ # proxy=None, bank=None):
+ use_sudo = self.bank.config.get_bool('docker.sudo', default=True)
+ bmaj_process = RemoteProcess(
+ meta + '_' + name,
+ exe,
+ args,
+ desc=desc,
+ proc_type=proc_type,
+ expand=expand,
+ docker=docker,
+ docker_sudo=use_sudo,
+ bank_env=self.bmaj_only_env,
+ log_dir=os.path.dirname(self.bank.config.log_file),
+ rabbit_mq=self.bank.config.get('micro.biomaj.rabbit_mq'),
+ rabbit_mq_port=int(self.bank.config.get('micro.biomaj.rabbit_mq_port', default='5672')),
+ rabbit_mq_user=self.bank.config.get('micro.biomaj.rabbit_mq_user'),
+ rabbit_mq_password=self.bank.config.get('micro.biomaj.rabbit_mq_password'),
+ rabbit_mq_virtualhost=self.bank.config.get('micro.biomaj.rabbit_mq_virtualhost', default='/'),
+ proxy=self.bank.config.get('micro.biomaj.proxy'),
+ bank=self.bank.name
+ )
+ else:
+ if docker:
+ use_sudo = self.bank.config.get_bool('docker.sudo', default=True)
+ bmaj_process = DockerProcess(
+ meta + '_' + name, exe, args,
+ desc=desc,
+ proc_type=proc_type,
+ docker=docker,
+ expand=expand,
+ bank_env=self.bmaj_only_env,
+ log_dir=os.path.dirname(self.bank.config.log_file),
+ use_sudo=use_sudo)
+ else:
+ bmaj_process = Process(
+ meta + '_' + name, exe, args,
+ desc=desc,
+ proc_type=proc_type,
+ expand=expand,
+ bank_env=self.bmaj_env,
+ log_dir=os.path.dirname(self.bank.config.log_file)
+ )
+ self.set_progress(bmaj_process.name, None)
+ if self.bank.config.get(bprocess + '.format'):
+ bmaj_process.format = self.bank.config.get(bprocess + '.format')
+ if self.bank.config.get(bprocess + '.types'):
+ bmaj_process.types = self.bank.config.get(bprocess + '.types')
+ if self.bank.config.get(bprocess + '.tags'):
+ bmaj_process.tags = self.bank.config.get(bprocess + '.tags')
+ if self.bank.config.get(bprocess + '.files'):
+ bmaj_process.files = self.bank.config.get(bprocess + '.files')
+
+ span = None
+ if self.bank.config.get('zipkin_trace_id'):
+ span = Zipkin('biomaj-process', bmaj_process.name, trace_id=self.bank.config.get('zipkin_trace_id'), parent_id=self.bank.config.get('zipkin_span_id'))
+ bmaj_process.set_trace(span.get_trace_id(), span.get_span_id())
+
+ res = bmaj_process.run(self.simulate)
+
+ if span:
+ span.add_binary_annotation('status', str(res))
+ span.trace()
+
+ processes_status[bprocess] = res
+ self.set_progress(bmaj_process.name, res)
+ if not res:
+ self.global_status = False
+ break
+ if not self.simulate:
+ if self._lock:
+ self._lock.acquire()
+ try:
+ self._get_metata_from_outputfile(bmaj_process)
+ except Exception as e:
+ logging.error(e)
+ finally:
+ self._lock.release() # release lock, no matter what
+ else:
+ self._get_metata_from_outputfile(bmaj_process)
+ self.meta_status[meta] = processes_status
+
+ def _get_metata_from_outputfile(self, proc):
+ '''
+ Extract metadata given by the process on stdout. Store metadata in self.meta_data
+
+ :param proc: process
+ :type proc: :class:`biomaj_process.process.Process`
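+
+ Expected format of a metadata line on stdout, derived from the parsing
+ below (format, types, tags and files separated by '#'; values here are
+ hypothetical)::
+
+     ##BIOMAJ#fasta#nucleic#organism:hsapiens#alu/alu.fasta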
+ '''
+ proc_name = proc.name
+ output_file = proc.output_file
+
+ self.meta_data[proc_name] = {}
+ with open(output_file) as f:
+ for line in f:
+ if line.startswith('##BIOMAJ#'):
+ line = line.replace('##BIOMAJ#', '')
+ line = line.strip('\n\r')
+ metas = line.split('#')
+ meta_format = metas[0]
+ if meta_format == '':
+ meta_format = proc.format
+ meta_type = metas[1]
+ if meta_type == '':
+ meta_type = proc.types
+ meta_tags = metas[2]
+ if meta_tags == '':
+ meta_tags = proc.tags
+ meta_files = metas[3]
+ if meta_format not in self.meta_data[proc_name]:
+ self.meta_data[proc_name][meta_format] = []
+ tags = meta_tags.split(',')
+ tag_list = {}
+ if meta_tags != '':
+ for tag in tags:
+ t = tag.split(':')
+ tag_list[t[0]] = t[1]
+ self.meta_data[proc_name][meta_format].append({
+ 'tags': tag_list,
+ 'types': meta_type.split(','),
+ 'files': meta_files.split(',')}
+ )
+ if proc.files and proc.format:
+ tag_list = {}
+ if proc.tags != '':
+ for tag in proc.tags.split(','):
+ t = tag.split(':')
+ tag_list[t[0]] = t[1]
+ self.meta_data[proc_name][proc.format] = []
+ self.meta_data[proc_name][proc.format].append({
+ 'tags': tag_list,
+ 'types': proc.types.split(','),
+ 'files': proc.files.split(',')}
+ )
+
+ def stop(self):
+ self._stopevent.set()
diff --git a/biomaj/process/processfactory.py b/biomaj/process/processfactory.py
new file mode 100644
index 0000000..7c63191
--- /dev/null
+++ b/biomaj/process/processfactory.py
@@ -0,0 +1,242 @@
+from builtins import range
+from builtins import object
+import threading
+import logging
+import os
+from biomaj.process.metaprocess import MetaProcess
+
+
+class ProcessFactory(object):
+ '''
+ Manage process execution
+ '''
+
+ NB_THREAD = 2
+
+ def __init__(self, bank, redis_client=None, redis_prefix=None):
+ self.bank = bank
+ self.threads_tasks = []
+ if self.bank.session:
+ self.meta_data = self.bank.session.get('per_process_metadata')
+ else:
+ self.meta_data = {}
+ self.redis_client = redis_client
+ self.redis_prefix = redis_prefix
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: does not execute process
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ pass
+
+ def run_threads(self, simulate=False):
+ '''
+ Start meta threads
+
+ :param simulate: do not execute processes
+ :type simulate: bool
+ :return: tuple global execution status and status per meta process
+ '''
+ logging.debug('Start meta threads')
+ os.chdir(self.bank.config.get('process.dir'))
+ threads = []
+ running_th = []
+ for thread_tasks in self.threads_tasks:
+ meta_thread = MetaProcess(self.bank, thread_tasks, self.meta_status, self.meta_data, simulate)
+ meta_thread._lock = ProcessFactory._LOCK
+ meta_thread.workflow = self.workflow
+ meta_thread.start()
+ threads.append(meta_thread)
+ running_th.append(meta_thread)
+ # Wait for the end of the threads
+ kill_received = False
+ while len(running_th) > 0:
+ try:
+ # Join all threads using a timeout so it doesn't block
+ # Filter out threads which have been joined or are None
+
+ # Check for cancel request
+ if self.redis_client and self.redis_client.get(self.redis_prefix + ':' + self.bank.name + ':action:cancel'):
+ logging.warn('Cancel requested, stopping process update')
+ self.redis_client.delete(self.redis_prefix + ':' + self.bank.name + ':session:' + self.session)
+ kill_received = True
+ for t in running_th:
+ t.kill_received = True
+ running_th = [t.join(1000) for t in running_th if t is not None and t.isAlive()]
+ except KeyboardInterrupt:
+ logging.warn("Ctrl-c received! Sending kill to threads...")
+ logging.warn("Running tasks will continue and process will stop.")
+ kill_received = True
+ for t in running_th:
+ t.kill_received = True
+
+ for meta_thread in threads:
+ meta_thread.join()
+ global_meta_status = {}
+ global_status = True
+
+ for meta_thread in threads:
+ for meta in meta_thread.meta_status:
+ global_meta_status[meta] = meta_thread.meta_status[meta]
+ if not meta_thread.global_status:
+ global_status = False
+
+ if kill_received:
+ global_status = False
+
+ logging.debug('Meta threads are over')
+ return (global_status, global_meta_status)
+
+ def fill_tasks_in_threads(self, metas):
+ '''
+ Dispatch meta processes in available threads
+ '''
+ self.threads_tasks = []
+ for i in range(0, ProcessFactory.NB_THREAD):
+ # Fill array of meta process in future threads
+ self.threads_tasks.append([])
+ thread_id = 0
+ for meta in metas:
+ meta_process = meta.strip()
+ if thread_id == ProcessFactory.NB_THREAD:
+ thread_id = 0
+ self.threads_tasks[thread_id].append(meta_process)
+ thread_id += 1
+
+
+class PreProcessFactory(ProcessFactory):
+ '''
+ Manage preprocesses
+ '''
+
+ def __init__(self, bank, metas=None, redis_client=None, redis_prefix=None):
+ '''
+ Creates a preprocess factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: initial status of meta processes
+ :type metas: dict
+ '''
+ ProcessFactory.__init__(self, bank, redis_client, redis_prefix)
+ self.meta_status = None
+ if metas is not None:
+ self.meta_status = metas
+ self.workflow = 'preprocess'
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: does not execute process
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ logging.info('PROC:PRE')
+ if self.bank.config.get('db.pre.process') is None:
+ metas = []
+ else:
+ metas = self.bank.config.get('db.pre.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.meta_status) = self.run_threads(simulate)
+ return status
+
+
+class RemoveProcessFactory(ProcessFactory):
+ '''
+ Manage remove processes
+ '''
+
+ def __init__(self, bank, metas=None, redis_client=None, redis_prefix=None):
+ '''
+ Creates a remove process factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param metas: initial status of meta processes
+ :type metas: dict
+ '''
+ ProcessFactory.__init__(self, bank, redis_client, redis_prefix)
+ self.meta_status = None
+ if metas is not None:
+ self.meta_status = metas
+ self.workflow = 'removeprocess'
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: does not execute process
+ :type simulate: bool
+ :return: status of execution - bool
+ '''
+ logging.info('PROC:REMOVE')
+ if self.bank.config.get('db.remove.process') is None:
+ metas = []
+ else:
+ metas = self.bank.config.get('db.remove.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.meta_status) = self.run_threads(simulate)
+ return status
+
+
+class PostProcessFactory(ProcessFactory):
+ '''
+ Manage postprocesses
+
+ self.blocks: dict of meta processes status
+ Each meta process status is a dict of process status
+ '''
+
+ def __init__(self, bank, blocks=None, redis_client=None, redis_prefix=None):
+ '''
+ Creates a postprocess factory
+
+ :param bank: Bank
+ :type bank: :class:`biomaj.bank.Bank`
+ :param blocks: initial status of block processes
+ :type blocks: dict
+ '''
+ ProcessFactory.__init__(self, bank, redis_client, redis_prefix)
+ self.blocks = {}
+ if blocks is not None:
+ self.blocks = blocks
+ self.workflow = 'postprocess'
+
+ def run(self, simulate=False):
+ '''
+ Run processes
+
+ :param simulate: does not execute process
+ :type simulate: bool
+ :return: status of execution - bool
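+
+ Post processes are grouped in blocks declared in the bank properties,
+ e.g. (hypothetical block and meta names)::
+
+     BLOCKS=BLOCK1,BLOCK2
+     BLOCK1.db.post.process=META1
+     BLOCK2.db.post.process=META2,META3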
+ '''
+ logging.info('PROC:POST:BLOCK')
+ blocks = self.bank.config.get('BLOCKS')
+ if blocks is None or blocks == '':
+ process_blocks = []
+ else:
+ process_blocks = blocks.split(',')
+ metas = []
+ self.meta_status = None
+ global_status = True
+ for process_block in process_blocks:
+ if not global_status:
+ continue
+ logging.info('PROC:POST:BLOCK:' + process_block)
+ if process_block in self.blocks:
+ self.meta_status = self.blocks[process_block]
+ # run each block
+ metas = self.bank.config.get(process_block.strip() + '.db.post.process').split(',')
+ self.fill_tasks_in_threads(metas)
+ (status, self.blocks[process_block]) = self.run_threads(simulate)
+ if not status:
+ global_status = False
+ return global_status
+
+
+ProcessFactory._LOCK = threading.Lock()
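+
+# Minimal usage sketch (illustrative, not part of the upstream module): assuming `bank`
+# is a configured biomaj.bank.Bank instance, the factories run the processes declared
+# in its properties file, e.g.:
+#
+#   pre = PreProcessFactory(bank)
+#   ok = pre.run(simulate=True)        # simulate=True does not execute the processes
+#   post = PostProcessFactory(bank, blocks={})
+#   ok = ok and post.run()             # per-block statuses end up in post.blocks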
diff --git a/biomaj/schema_version.py b/biomaj/schema_version.py
new file mode 100644
index 0000000..7caf2a2
--- /dev/null
+++ b/biomaj/schema_version.py
@@ -0,0 +1,210 @@
+from __future__ import print_function
+import pkg_resources
+import string
+import random
+import os
+import sys
+from biomaj.bank import Bank
+from biomaj.mongo_connector import MongoConnector
+from biomaj_core.config import BiomajConfig
+from biomaj_core.utils import Utils
+
+import logging
+
+
+class SchemaVersion(object):
+
+ """
+ BioMAJ database schema version. This class can be used to apply schema modifications
+ needed when upgrading between software versions.
+ """
+
+ VERSION = None
+
+ @staticmethod
+ def migrate_pendings():
+ """
+ Migrate database
+
+ 3.0.18: check the current schema version and, if 3.0.17 or older, migrate the 'pending' key
+ """
+ if BiomajConfig.global_config is None:
+ try:
+ BiomajConfig.load_config()
+ except Exception as err:
+ print("* SchemaVersion: Can't find config file: " + str(err))
+ return None
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ schema = MongoConnector.db_schema
+ banks = MongoConnector.banks
+ users = MongoConnector.users
+ schema_version = SchemaVersion.get_dbschema_version(schema)
+ moderate = int(schema_version.split('.')[1])
+ minor = int(schema_version.split('.')[2])
+
+ if moderate == 0 and minor <= 17:
+ print("Migrate from release: %s" % schema_version)
+ # Update pending releases
+ bank_list = banks.find()
+ updated = 0
+ for bank in bank_list:
+ if 'pending' in bank:
+ # Check we have an old pending type
+ if type(bank['pending']) == dict:
+ updated += 1
+ pendings = []
+ for release in sorted(bank['pending'], key=lambda r: bank['pending'][r]):
+ pendings.append({'release': str(release), 'id': bank['pending'][str(release)]})
+ if len(pendings) > 0:
+ banks.update({'name': bank['name']},
+ {'$set': {'pending': pendings}})
+ else:
+ # We remove old type for 'pending'
+ banks.update({'name': bank['name']},
+ {'$unset': {'pending': ""}})
+
+ print("Migration: %d bank(s) updated" % updated)
+ if moderate < 1:
+ updated = 0
+ user_list = users.find()
+ for user in user_list:
+ if 'apikey' not in user:
+ updated += 1
+ api_key = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10))
+ users.update({'_id': user['_id']}, {'$set': {'apikey': api_key}})
+ print("Migration: %d user(s) updated" % updated)
+ # production size
+ bank_list = banks.find()
+ updated = 0
+ for bank in bank_list:
+ for prod in bank['production']:
+ '''
+ { "_id" : ObjectId("54edb10856e8bb11340b5f51"), "production" : [
+ { "freeze" : false, "remoterelease" : "2003-11-26", "session" : 1427809848.560108,
+ "data_dir" : "/db", "formats" : [ ], "release" : "2003-11-26",
+ "dir_version" : "ncbi/blast/alu",
+ "prod_dir" : "alu-2003-11-26", "types" : [ ], "size" : 319432 } ] }
+ '''
+ if 'size' not in prod or prod['size'] == 0:
+ logging.info('Calculate size for bank %s' % (bank['name']))
+ if 'data_dir' not in prod or not prod['data_dir'] or 'prod_dir' not in prod or not prod['prod_dir'] or 'dir_version' not in prod or not prod['dir_version']:
+ logging.warn('no production directory information for %s, skipping...' % (bank['name']))
+ continue
+ prod_dir = os.path.join(prod['data_dir'], prod['dir_version'], prod['prod_dir'])
+ if not os.path.exists(prod_dir):
+ logging.warn('production directory %s does not exist for %s, skipping...' % (prod_dir, bank['name']))
+ continue
+ dir_size = Utils.get_folder_size(prod_dir)
+ banks.update({'name': bank['name'], 'production.release': prod['release']}, {'$set': {'production.$.size': dir_size}})
+ updated += 1
+ print("Migration: %d bank production info updated" % updated)
+
+ @staticmethod
+ def add_property(bank=None, prop=None, value=None, cfg=None):
+ """
+ Update properties field for banks.
+
+ :param bank: Bank name to update, default all
+ :type bank: str
+ :param prop: New property to add
+ :type prop: str
+ :param value: Property value; if cfg is set, the value is taken
+ from the bank configuration key given by cfg
+ :type value: str
+ :param cfg: Bank configuration key the value is taken from
+ :type cfg: str
+
+ :raise Exception: If no configuration file is found
+ :returns: True/False
+ :rtype: bool
+ """
+ if BiomajConfig.global_config is None:
+ try:
+ BiomajConfig.load_config()
+ except Exception as err:
+ print("* SchemaVersion: Can't find config file: " + str(err))
+ return False
+ if prop is None:
+ print("Property key is required", file=sys.stderr)
+ return False
+
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+
+ schema = MongoConnector.db_schema
+ banks = MongoConnector.banks
+ schema_version = SchemaVersion.get_dbschema_version(schema)
+ moderate = int(schema_version.split('.')[1])
+ minor = int(schema_version.split('.')[2])
+
+ if moderate <= 1 and minor <= 0:
+ bank_list = []
+ if bank is None:
+ bank_list = banks.find()
+ else:
+ bank_list = [banks.find_one({'name': bank})]
+ updated = 0
+ for bank in bank_list:
+ if 'properties' in bank:
+ b = Bank(bank['name'], no_log=True)
+ new_prop = 'properties.' + prop
+ new_value = value
+ if new_value is None:
+ if cfg is not None:
+ new_value = b.config.get(cfg)
+ else:
+ print("[%s] With value set to None, you must set cfg to get "
+ "corresponding value" % str(bank['name']), file=sys.stderr)
+ continue
+ banks.update({'name': bank['name']},
+ {'$set': {new_prop: new_value}})
+ updated += 1
+ else:
+ logging.warn("Bank %s does not have 'properties' field!" % str(bank['name']))
+
+ print("Add property: %d bank(s) updated" % updated)
+
+ @staticmethod
+ def get_dbschema_version(schema):
+ """
+ Get the schema version currently stored in the database
+
+ :param schema: Mongo schema info
+ :type schema: MongoDB collection
+ :returns: schema version stored in database (e.g. '3.0.0')
+ :rtype: str
+
+ """
+ schema_version = schema.find_one({'id': 1})
+ if schema_version is None:
+ schema_version = {'id': 1, 'version': '3.0.0'}
+ schema.insert(schema_version)
+ return schema_version['version']
+
+ @staticmethod
+ def set_version(version=None):
+ """
+ Set the schema version in the db_schema collection; if version is None, use the currently installed BioMAJ version
+
+ :param version: db_schema collection version to set
+ :type version: str
+ """
+ installed_version = version
+ if installed_version is None:
+ installed_version = pkg_resources.get_distribution("biomaj").version
+ if BiomajConfig.global_config is None:
+ try:
+ BiomajConfig.load_config()
+ except Exception as err:
+ print("* SchemaVersion: Can't find config file: " + str(err))
+ return None
+ if MongoConnector.db is None:
+ MongoConnector(BiomajConfig.global_config.get('GENERAL', 'db.url'),
+ BiomajConfig.global_config.get('GENERAL', 'db.name'))
+ schema = MongoConnector.db_schema
+ schema.update_one({'id': 1}, {'$set': {'version': installed_version}})
+ print("Schema version set to %s" % str(installed_version))
diff --git a/biomaj/session.py b/biomaj/session.py
new file mode 100644
index 0000000..a2fe295
--- /dev/null
+++ b/biomaj/session.py
@@ -0,0 +1,235 @@
+from future import standard_library
+from builtins import str
+from builtins import object
+import os
+import time
+import copy
+import sys
+
+from biomaj.workflow import Workflow
+
+standard_library.install_aliases()
+
+
+class Session(object):
+ """
+ BioMAJ bank session
+ """
+ OVER = 0
+
+ @staticmethod
+ def get_ordered_dict():
+ if sys.version_info < (2, 7):
+ return {}
+ else:
+ import collections
+ return collections.OrderedDict()
+
+ def __init__(self, name, config, flow=None, action='update'):
+ """
+ Creates a new session
+
+ :param name: Name of the bank
+ :type name: str
+ :param config: bank and global config
+ :type config: BiomajConfig
+ :param flow: Workflow tasks
+ :type flow: dict
+ :param action: type of flow (update|remove)
+ :type action: str
+ """
+ if flow is None:
+ flow = Workflow.FLOW
+ self.name = name
+ self.config = config
+ self.flow = copy.deepcopy(flow)
+ self._reset_done = False
+
+ formats = {}
+ if self.config.get('db.formats') is not None:
+ flist = self.config.get('db.formats').split(',')
+ for f_in_list in flist:
+ formats[f_in_list.strip()] = []
+
+ self._session = {
+ 'id': time.time(),
+ 'log_file': self.config.log_file,
+ 'status': {},
+ 'files': [],
+ 'release': None,
+ 'remoterelease': None,
+ 'formats': formats,
+ 'process': {
+ 'postprocess': {},
+ 'preprocess': {},
+ 'removeprocess': {}
+ },
+ 'per_process_metadata': {},
+ 'data_dir': self.config.get('data.dir'),
+ 'dir_version': self.config.get('dir.version')
+ }
+
+ for flow in self.flow:
+ self._session['status'][flow['name']] = False
+
+ self.set('last_modified', self.config.last_modified)
+
+ # Default is update
+ self._session['action'] = action
+
+ def reload_postprocess_in_order(self, postprocess):
+ """
+ Reloads processes in config order
+ """
+ if self.config.get('BLOCKS') is None:
+ return postprocess
+ copy_postprocess = Session.get_ordered_dict()
+ blocks = self.config.get('BLOCKS').split(',')
+ for block in blocks:
+ copy_postprocess[block] = Session.get_ordered_dict()
+ metas = self.config.get(block.strip() + '.db.post.process').split(',')
+ for meta in metas:
+ copy_postprocess[block][meta] = Session.get_ordered_dict()
+ processes = self.config.get(meta.strip()).split(',')
+ for process in processes:
+ if block not in postprocess or meta not in postprocess[block] or process not in postprocess[block][meta]:
+ copy_postprocess[block][meta][process] = False
+ else:
+ copy_postprocess[block][meta][process] = postprocess[block][meta][process]
+ return copy_postprocess
+
+ def reload_in_order(self, cfg_type, otherprocess):
+ """
+ Reloads processes in config order
+ """
+ if self.config.get(cfg_type) is None or not self.config.get(cfg_type):
+ return otherprocess
+ copy_postprocess = Session.get_ordered_dict()
+ metas = self.config.get(cfg_type).split(',')
+ for meta in metas:
+ copy_postprocess[meta] = Session.get_ordered_dict()
+ processes = self.config.get(meta.strip()).split(',')
+ for process in processes:
+ copy_postprocess[meta][process] = otherprocess[meta][process]
+ return copy_postprocess
+
+ def reset_proc(self, type_proc, proc=None):
+ """
+ Reset status of processes for type in session
+
+ :param type_proc: postprocess, preprocess or removeprocess
+ :type type_proc: Workflow.POSTPROCESS, Workflow.PREPROCESS, Workflow.REMOVEPROCESS
+ :param proc: block/meta/process name to reset; None resets all
+ :type proc: str
+ """
+ # If --process option not given on command line, we won't find it in following loop(s)
+ if proc is None:
+ self._reset_done = True
+ if type_proc == Workflow.FLOW_POSTPROCESS:
+ if proc in self._session['process']['postprocess']:
+ self._session['process']['postprocess'] = self.reload_postprocess_in_order(self._session['process']['postprocess'])
+ self.reset_meta(self._session['process']['postprocess'][proc])
+ else:
+ for elt in list(self._session['process']['postprocess'].keys()):
+ self.reset_meta(self._session['process']['postprocess'][elt], proc)
+ elif type_proc == Workflow.FLOW_PREPROCESS:
+ self._session['process']['preprocess'] = self.reload_in_order('db.pre.process', self._session['process']['preprocess'])
+ self.reset_meta(self._session['process']['preprocess'])
+ elif type_proc == Workflow.FLOW_REMOVEPROCESS:
+ self._session['process']['removeprocess'] = self.reload_in_order('db.remove.process', self._session['process']['removeprocess'])
+ self.reset_meta(self._session['process']['removeprocess'], proc)
+ return self._reset_done
+
+ def reset_meta(self, metas, proc=None):
+ """
+ Reset status of meta processes
+ """
+ if proc in metas:
+ for metaproc in list(metas[proc].keys()):
+ self.reset_process(metas[proc], metaproc)
+ else:
+ for meta in list(metas.keys()):
+ self.reset_process(metas[meta], proc)
+
+ def reset_process(self, processes, proc=None):
+ """
+ Reset status of processes
+ """
+ set_to_false = False
+ for process in list(processes.keys()):
+ if process == proc or proc is None:
+ set_to_false = True
+ self._reset_done = True
+ if set_to_false:
+ processes[process] = False
+
+ def load(self, session):
+ """
+ Load an existing session
+ """
+ self._session = session
+
+ def get_release_directory(self, release=None):
+ """
+ Get release directory name
+
+ :param release: optional release, if not set, use current session release
+ :type release: str
+
+ """
+ if release is not None:
+ return self.name + self.config.get('release.separator', default='_') + str(release)
+ else:
+ return self.name + self.config.get('release.separator', default='_') + str(self._session['release'])
+
+ def get_full_release_directory(self, release=None):
+ """
+ Get bank directory for this release
+
+ :param release: optional release, if not set, use current session release
+ :type release: str
+ """
+ release_dir = os.path.join(
+ self._session['data_dir'],
+ self._session['dir_version'],
+ self.get_release_directory(release)
+ )
+ return release_dir
+
+ def get_offline_directory(self):
+ """
+ Get bank offline directory
+ """
+ return os.path.join(self.config.get('data.dir'), self.config.get('offline.dir.name'))
+
+ def get(self, attr=None):
+ """
+ Return an attribute of session
+ """
+ if attr is None:
+ return self._session
+
+ if attr in self._session:
+ return self._session[attr]
+ else:
+ return None
+
+ def set(self, attr, value):
+ """
+ Sets an attribute of session
+ """
+ self._session[attr] = value
+
+ def get_status(self, status):
+ """
+ Return status for a flow event
+ """
+ if status not in self._session['status']:
+ return False
+ return self._session['status'][status]
+
+ def set_status(self, status, value):
+ """
+ Set status for a flow event
+ """
+ self._session['status'][status] = value
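+
+# Minimal usage sketch (illustrative, not part of the upstream module): assuming a
+# loaded configuration for a bank named 'alu', a Session tracks workflow status and
+# release paths:
+#
+#   from biomaj_core.config import BiomajConfig
+#   BiomajConfig.load_config()                    # reads global.properties
+#   session = Session('alu', BiomajConfig('alu'))
+#   session.set('release', '2003-11-26')
+#   session.get_release_directory()               # 'alu_2003-11-26' with the default separator
+#   session.get_status('over')                    # False until the workflow completes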
diff --git a/biomaj/workflow.py b/biomaj/workflow.py
new file mode 100644
index 0000000..af2ce49
--- /dev/null
+++ b/biomaj/workflow.py
@@ -0,0 +1,1731 @@
+from builtins import str
+from builtins import range
+from builtins import object
+import logging
+import datetime
+import time
+import os
+import shutil
+import tempfile
+import re
+import traceback
+import json
+import hashlib
+
+from biomaj_core.utils import Utils
+from biomaj_download.downloadclient import DownloadClient
+from biomaj_download.message import message_pb2
+from biomaj_download.download.http import HTTPParse
+from biomaj_download.download.localcopy import LocalDownload
+
+from biomaj.mongo_connector import MongoConnector
+from biomaj.options import Options
+from biomaj.process.processfactory import RemoveProcessFactory, PreProcessFactory, PostProcessFactory
+
+from biomaj_zipkin.zipkin import Zipkin
+
+
+class Workflow(object):
+ """
+ Bank update workflow
+ """
+
+ FLOW_INIT = 'init'
+ FLOW_CHECK = 'check'
+ FLOW_DEPENDS = 'depends'
+ FLOW_PREPROCESS = 'preprocess'
+ FLOW_RELEASE = 'release'
+ FLOW_DOWNLOAD = 'download'
+ FLOW_POSTPROCESS = 'postprocess'
+ FLOW_REMOVEPROCESS = 'removeprocess'
+ FLOW_PUBLISH = 'publish'
+ FLOW_OVER = 'over'
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'check', 'steps': []},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank, session=None):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: :class:`biomaj.bank.Bank`
+ """
+ self.bank = bank
+ if session is None:
+ self.session = bank.session
+ else:
+ self.session = session
+ self.bank.session = session
+ self.options = bank.options
+ self.name = bank.name
+ # Skip all remaining tasks, no need to update
+ self.skip_all = False
+
+ self.session._session['update'] = False
+ self.session._session['remove'] = False
+ self.session.config.set('localrelease', '')
+ self.session.config.set('remoterelease', '')
+ # For micro services
+ self.redis_client = None
+ self.redis_prefix = None
+ # Zipkin
+ self.span = None
+
+ def get_flow(self, task):
+ for flow in Workflow.FLOW:
+ if flow['name'] == task:
+ return flow
+
+ def start(self):
+ """
+ Start the workflow
+ """
+ logging.info('Workflow:Start')
+ if 'stats' not in self.session._session:
+ self.session._session['stats'] = {
+ 'workflow': {},
+ 'nb_downloaded_files': 0
+ }
+
+ for flow in self.session.flow:
+ dt = datetime.datetime.now()
+ start_timestamp = time.mktime(dt.timetuple())
+ if self.skip_all:
+ logging.info('Workflow:Skip:' + flow['name'])
+ self.session._session['status'][flow['name']] = None
+ self.session._session['status'][Workflow.FLOW_OVER] = True
+ continue
+
+ if self.options.get_option(Options.STOP_BEFORE) == flow['name']:
+ self.wf_over()
+ break
+
+ # Check for cancel request
+ if self.redis_client and self.redis_client.get(self.redis_prefix + ':' + self.bank.name + ':action:cancel'):
+ logging.warn('Cancel requested, stopping update')
+ self.redis_client.delete(self.redis_prefix + ':' + self.bank.name + ':action:cancel')
+ self.wf_over()
+ return False
+
+ # Always run INIT
+ if flow['name'] != Workflow.FLOW_INIT and self.session.get_status(flow['name']):
+ logging.info('Workflow:Skip:' + flow['name'])
+ if flow['name'] == Workflow.FLOW_INIT or not self.session.get_status(flow['name']):
+ logging.info('Workflow:Start:' + flow['name'])
+ span = None
+ if self.options.get_option('traceId'):
+ trace_id = self.options.get_option('traceId')
+ span_id = self.options.get_option('spanId')
+ span = Zipkin('biomaj-workflow', flow['name'], trace_id=trace_id, parent_id=span_id)
+ self.span = span
+ self.bank.config.set('zipkin_trace_id', span.get_trace_id())
+ self.bank.config.set('zipkin_span_id', span.get_span_id())
+
+ try:
+ self.session._session['status'][flow['name']] = getattr(self, 'wf_' + flow['name'])()
+ except Exception as e:
+ self.session._session['status'][flow['name']] = False
+ logging.exception('Workflow:' + flow['name'] + ':Exception:' + str(e))
+ logging.debug(traceback.format_exc())
+ finally:
+ self.wf_progress(flow['name'], self.session._session['status'][flow['name']])
+
+ if span:
+ span.add_binary_annotation('status', str(self.session._session['status'][flow['name']]))
+ span.trace()
+
+ if flow['name'] != Workflow.FLOW_OVER and not self.session.get_status(flow['name']):
+ logging.error('Error during task ' + flow['name'])
+ if flow['name'] != Workflow.FLOW_INIT:
+ self.wf_over()
+ return False
+ # Main task is over, execute sub tasks of main
+ if not self.skip_all:
+ for step in flow['steps']:
+ span = None
+ try:
+ # Check for cancel request
+ if self.redis_client and self.redis_client.get(self.redis_prefix + ':' + self.bank.name + ':action:cancel'):
+ logging.warn('Cancel requested, stopping update')
+ self.redis_client.delete(self.redis_prefix + ':' + self.bank.name + ':action:cancel')
+ self.wf_over()
+ return False
+
+ if self.options.get_option('traceId'):
+ trace_id = self.options.get_option('traceId')
+ span_id = self.options.get_option('spanId')
+ span = Zipkin('biomaj-workflow', flow['name'] + ":wf_" + step, trace_id=trace_id, parent_id=span_id)
+ self.span = span
+ self.bank.config.set('zipkin_trace_id', span.get_trace_id())
+ self.bank.config.set('zipkin_span_id', span.get_span_id())
+ res = getattr(self, 'wf_' + step)()
+
+ if span:
+ span.add_binary_annotation('status', str(res))
+ span.trace()
+
+ if not res:
+ logging.error('Error during ' + flow['name'] + ' subtask: wf_' + step)
+ logging.error('Revert main task status ' + flow['name'] + ' to error status')
+ self.session._session['status'][flow['name']] = False
+ self.wf_over()
+ return False
+ except Exception as e:
+ logging.error('Workflow:' + flow['name'] + ' subtask: wf_' + step + ':Exception:' + str(e))
+ self.session._session['status'][flow['name']] = False
+ logging.debug(traceback.format_exc())
+ self.wf_over()
+ return False
+ dt = datetime.datetime.now()
+ end_timestamp = time.mktime(dt.timetuple())
+ self.session._session['stats']['workflow'][flow['name']] = end_timestamp - start_timestamp
+ if self.options.get_option(Options.STOP_AFTER) == flow['name']:
+ self.wf_over()
+ break
+ self.wf_progress_end()
+ return True
+
+ def wf_progress_init(self):
+ """
+ Set up new progress status
+ """
+ status = {}
+ status['log_file'] = {'status': self.session.config.log_file, 'progress': 0}
+ status['session'] = self.session._session['id']
+ for flow in self.session.flow:
+ if flow['name'] == 'download':
+ status[flow['name']] = {'status': None, 'progress': 0, 'total': 0}
+ elif flow['name'].endswith('process'):
+ status[flow['name']] = {'status': None, 'progress': {}}
+ elif flow['name'] == 'release':
+ status[flow['name']] = {'status': None, 'progress': ''}
+ else:
+ status[flow['name']] = {'status': None, 'progress': 0}
+ MongoConnector.banks.update({'name': self.name}, {'$set': {'status': status}})
+
+ def wf_progress_end(self):
+ """
+ Reset progress status when workflow is over
+ """
+ return True
+
+ def wf_progress(self, task, status):
+ """
+ Update bank status
+ """
+ subtask = 'status.' + task + '.status'
+ MongoConnector.banks.update({'name': self.name}, {'$set': {subtask: status}})
+
+ def wf_init(self):
+ """
+ Initialize workflow
+ """
+ logging.info('Workflow:wf_init')
+ data_dir = self.session.config.get('data.dir')
+ lock_dir = self.session.config.get('lock.dir', default=data_dir)
+ if not os.path.exists(lock_dir):
+ os.mkdir(lock_dir)
+ lock_file = os.path.join(lock_dir, self.name + '.lock')
+ maintenance_lock_file = os.path.join(lock_dir, 'biomaj.lock')
+ if os.path.exists(maintenance_lock_file):
+ logging.error('Biomaj is in maintenance')
+ return False
+ if os.path.exists(lock_file):
+ logging.error('Bank ' + self.name + ' is locked, a process may be in progress, else remove the lock file ' + lock_file)
+ return False
+ f = open(lock_file, 'w')
+ f.write('1')
+ f.close()
+ self.wf_progress_init()
+ return True
+
+ def wf_over(self):
+ """
+ Workflow is over
+ """
+ logging.info('Workflow:wf_over')
+ data_dir = self.session.config.get('data.dir')
+ lock_dir = self.session.config.get('lock.dir', default=data_dir)
+ lock_file = os.path.join(lock_dir, self.name + '.lock')
+ os.remove(lock_file)
+ return True
+
+
+class RemoveWorkflow(Workflow):
+ """
+ Workflow to remove a bank instance
+ """
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'removeprocess', 'steps': []},
+ {'name': 'remove_release', 'steps': []},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank, session):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: Bank
+ :param session: session to remove
+ :type session: :class:`biomaj.session.Session`
+ """
+ Workflow.__init__(self, bank, session)
+ logging.debug('New workflow')
+ self.session._session['remove'] = True
+
+ def wf_remove_release(self):
+ logging.info('Workflow:wf_remove_release')
+ if not self.session.get('update_session_id'):
+ logging.error('Bug: update_session_id not set in session')
+ return False
+
+ if os.path.exists(self.session.get_full_release_directory()):
+ shutil.rmtree(self.session.get_full_release_directory())
+ return self.bank.remove_session(self.session.get('update_session_id'))
+
+ def wf_removeprocess(self):
+ logging.info('Workflow:wf_removeprocess')
+ metas = self.session._session['process']['removeprocess']
+ pfactory = RemoveProcessFactory(self.bank, metas, redis_client=self.redis_client, redis_prefix=self.redis_prefix)
+ res = pfactory.run()
+ self.session._session['process']['removeprocess'] = pfactory.meta_status
+ return res
+
+
+class UpdateWorkflow(Workflow):
+ """
+ Workflow for a bank update
+ """
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'check', 'steps': []},
+ {'name': 'depends', 'steps': []},
+ {'name': 'preprocess', 'steps': []},
+ {'name': 'release', 'steps': []},
+ {'name': 'download', 'steps': ['checksum', 'uncompress', 'copy', 'copydepends']},
+ {'name': 'postprocess', 'steps': ['metadata', 'stats']},
+ {'name': 'publish', 'steps': ['old_biomaj_api', 'clean_offline', 'delete_old', 'clean_old_sessions']},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: Bank
+ """
+ Workflow.__init__(self, bank)
+ logging.debug('New workflow')
+ self.session._session['update'] = True
+
+ def wf_init(self):
+ err = super(UpdateWorkflow, self).wf_init()
+ if not err:
+ return False
+ offline_dir = self.session.get_offline_directory()
+ if not os.path.exists(offline_dir):
+ logging.debug('Create offline directory: %s' % (str(offline_dir)))
+ os.makedirs(offline_dir)
+ if self.options.get_option(Options.FROMSCRATCH):
+ return self.wf_clean_offline()
+
+ return True
+
+ def _md5(self, fname):
+ hash_md5 = hashlib.md5()
+ with open(fname, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+ def _sha256(self, fname):
+ hash_sha256 = hashlib.sha256()
+ with open(fname, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_sha256.update(chunk)
+ return hash_sha256.hexdigest()
+
+ def wf_checksum(self):
+ logging.info('Workflow:wf_checksum')
+ '''
+ if self.bank.config.get('file.md5.check', 'false') != 'true':
+ logging.info('Workflow:wf_checksum:skipping')
+ return True
+ '''
+ offline_dir = self.session.get_offline_directory()
+ error = False
+ for downloaded_file in self.downloaded_files:
+ downloaded_file_name = downloaded_file['name']
+ if 'save_as' in downloaded_file:
+ downloaded_file_name = downloaded_file['save_as']
+ md5_file = os.path.join(offline_dir, downloaded_file_name + '.md5')
+ if os.path.exists(md5_file):
+ with open(md5_file, 'r') as md5_content:
+ data = md5_content.read().split()
+ md5_cksum = data[0]
+ downloaded_file_md5 = self._md5(os.path.join(offline_dir, downloaded_file_name))
+ logging.debug('Wf_checksum:md5:%s:%s:%s' % (downloaded_file_name, downloaded_file_md5, md5_cksum))
+ if downloaded_file_md5 != md5_cksum:
+ logging.error('Invalid md5 checksum for file %s' % (downloaded_file_name))
+ error = True
+ sha256_file = os.path.join(offline_dir, downloaded_file_name + '.sha256')
+ if os.path.exists(sha256_file):
+ with open(sha256_file, 'r') as sha256_content:
+ data = sha256_content.read().split()
+ sha256_cksum = data[0]
+ downloaded_file_sha256 = self._sha256(os.path.join(offline_dir, downloaded_file_name))
+ logging.debug('Wf_checksum:sha256:%s:%s:%s' % (downloaded_file_name, downloaded_file_sha256, sha256_cksum))
+ if downloaded_file_sha256 != sha256_cksum:
+ logging.error('Invalid sha256 checksum for file %s' % (downloaded_file_name))
+ error = True
+ if error:
+ return False
+ return True
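+
+ # Note (assumption about the companion-file layout): wf_checksum above looks for
+ # optional '<file>.md5' / '<file>.sha256' files next to each downloaded file in the
+ # offline directory and compares their first whitespace-separated token, i.e. the
+ # format written by the coreutils tools:
+ #
+ #   md5sum file.fasta.gz > file.fasta.gz.md5          # "<hexdigest>  file.fasta.gz"
+ #   sha256sum file.fasta.gz > file.fasta.gz.sha256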
+
+ def wf_check(self):
+ """
+ Basic checks
+ """
+ logging.info('Workflow:wf_check')
+ return True
+
+ def wf_depends(self):
+ """
+ Checks bank dependencies with other banks. If the bank has dependencies, execute an update on the other banks first
+ """
+ logging.info('Workflow:wf_depends')
+ # Always rescan depends, there might be a new release
+ self.session.set('depends', {})
+ res = self.bank.update_dependencies()
+ logging.info('Workflow:wf_depends:' + str(res))
+ if res and len(self.bank.depends) > 0:
+ depend_updated = False
+ for bdep in self.bank.depends:
+ logging.info('Workflow:wf_depends:' + bdep.name + ':' + str(bdep.session.get('update')))
+ if bdep.session.get('update'):
+ depend_updated = True
+ break
+ if not depend_updated:
+ logging.info('Workflow:wf_depends:no bank updated')
+ return res
+
+ def wf_copydepends(self):
+ """
+ Copy files from dependent banks if needed
+ """
+ logging.info('Workflow:wf_copydepends')
+ deps = self.bank.get_dependencies()
+ for dep in deps:
+ if self.bank.config.get(dep + '.files.move'):
+ logging.info('Workflow:wf_depends:Files:Move:' + self.bank.config.get(dep + '.files.move'))
+ bdir = None
+ for bdep in self.bank.depends:
+ if bdep.name == dep:
+ bdir = bdep.session.get_full_release_directory()
+ break
+ if bdir is None:
+ logging.error('Could not find a session update for bank ' + dep)
+ return False
+ # b = self.bank.get_bank(dep, no_log=True)
+ locald = LocalDownload(bdir)
+ (file_list, dir_list) = locald.list()
+ locald.match(self.bank.config.get(dep + '.files.move').split(), file_list, dir_list)
+ bankdepdir = self.bank.session.get_full_release_directory() + "/" + dep
+ if not os.path.exists(bankdepdir):
+ os.mkdir(bankdepdir)
+ downloadedfiles = locald.download(bankdepdir)
+ locald.close()
+ if not downloadedfiles:
+ logging.info('Workflow:wf_copydepends:no files to copy')
+ return False
+ return True
+
+ def wf_preprocess(self):
+ """
+ Execute pre-processes
+ """
+ logging.info('Workflow:wf_preprocess')
+ metas = self.session._session['process']['preprocess']
+ pfactory = PreProcessFactory(self.bank, metas, redis_client=self.redis_client, redis_prefix=self.redis_prefix)
+ res = pfactory.run()
+ self.session._session['process']['preprocess'] = pfactory.meta_status
+ return res
+
+ def _close_download_service(self, dserv):
+ '''
+ Cleanup of downloader
+ '''
+ logging.info("Workflow:DownloadService:CleanSession")
+ if dserv:
+ dserv.clean()
+ dserv.close()
+
+ def __update_info(self, info):
+ '''
+ Update some info in db for current bank
+ '''
+ if info is not None:
+ MongoConnector.banks.update({'name': self.bank.name},
+ info)
+
+ def wf_release(self):
+ """
+ Find current release on remote
+ """
+ logging.info('Workflow:wf_release')
+ cf = self.session.config
+ if cf.get('ref.release') and self.bank.depends:
+ # Bank is a computed bank and we ask to set its release to the same
+ # as another dependent bank
+ depbank = self.bank.get_bank(cf.get('ref.release'), no_log=True)
+ got_match = False
+ got_update = False
+ for dep in self.bank.depends:
+ if dep.session.get('update'):
+ got_update = True
+ if dep.name == depbank.name:
+ self.session.set('release', dep.session.get('release'))
+ self.session.set('remoterelease', dep.session.get('remoterelease'))
+ got_match = True
+
+ if not got_match:
+ logging.error('Workflow:wf_release: no release found for bank ' + depbank.name)
+ return False
+
+ release = self.session.get('release')
+ self.__update_info({'$set': {'status.release.progress': str(release)}})
+ '''
+ MongoConnector.banks.update({'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}})
+ '''
+
+ logging.info('Workflow:wf_release:FromDepends:' + depbank.name + ':' + self.session.get('release'))
+ if got_update:
+ index = 0
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory() + '__' + str(x)):
+ index = x + 1
+ if index > 0:
+ self.session.set('release', release + '__' + str(index))
+ release = release + '__' + str(index)
+
+ self.session.previous_release = self.session.get('previous_release')
+
+ logging.info('Workflow:wf_release:previous_session:' + str(self.session.previous_release))
+ if self.session.get('release'):
+ # Release already set from a previous run or an other bank
+ logging.info('Workflow:wf_release:session:' + str(self.session.get('release')))
+ if self.session.previous_release == self.session.get('release') and not self.session.config.get_bool('release.control', default=False):
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+ else:
+ return True
+ if self.session.config.get('release.file') == '' or self.session.config.get('release.file') is None:
+ logging.debug('Workflow:wf_release:norelease')
+ self.session.set('release', None)
+ return True
+ else:
+ # """""""""""""""""""""""
+ dserv = None
+ if self.bank.config.get('micro.biomaj.service.download', default=None) == '1':
+ dserv = DownloadClient(
+ self.bank.config.get('micro.biomaj.rabbit_mq'),
+ int(self.bank.config.get('micro.biomaj.rabbit_mq_port', default='5672')),
+ self.bank.config.get('micro.biomaj.rabbit_mq_virtualhost', default='/'),
+ self.bank.config.get('micro.biomaj.rabbit_mq_user', default=None),
+ self.bank.config.get('micro.biomaj.rabbit_mq_password', default=None)
+ )
+ else:
+ dserv = DownloadClient()
+ proxy = self.bank.config.get('micro.biomaj.proxy')
+ session = dserv.create_session(self.name, proxy)
+ logging.info("Workflow:wf_release:DownloadSession:" + str(session))
+
+ http_parse = HTTPParse(
+ cf.get('http.parse.dir.line'),
+ cf.get('http.parse.file.line'),
+ int(cf.get('http.group.dir.name')),
+ int(cf.get('http.group.dir.date')),
+ int(cf.get('http.group.file.name')),
+ int(cf.get('http.group.file.date')),
+ cf.get('http.group.file.date_format', default=None),
+ int(cf.get('http.group.file.size'))
+ )
+
+ proxy = cf.get('proxy')
+ if cf.get('release.proxy') is not None:
+ proxy = cf.get('release.proxy')
+
+ proxy_auth = cf.get('proxy_auth')
+ if cf.get('release.proxy_auth') is not None:
+ proxy_auth = cf.get('release.proxy_auth')
+
+ protocol = cf.get('protocol')
+ if cf.get('release.protocol') is not None:
+ protocol = cf.get('release.protocol')
+
+ server = cf.get('server')
+ if cf.get('release.server') is not None:
+ server = cf.get('release.server')
+
+ remote_dir = cf.get('remote.dir')
+ if cf.get('release.remote.dir') is not None:
+ remote_dir = cf.get('release.remote.dir')
+
+ params = None
+ keys = cf.get('url.params')
+ if keys is not None:
+ params = {}
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get(key.strip() + '.value')
+ params[key.strip()] = param.strip()
+
+ credentials = cf.get('server.credentials')
+ if cf.get('release.credentials') is not None:
+ credentials = cf.get('release.credentials')
+
+ save_as = None
+ method = 'GET'
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ save_as = cf.get('release.file')
+ remotes = [remote_dir]
+ remote_dir = '/'
+ method = cf.get('url.method')
+ if cf.get('release.url.method') is not None:
+ method = cf.get('release.url.method')
+
+ release_downloader = dserv.get_handler(
+ protocol,
+ server,
+ remote_dir,
+ credentials=credentials,
+ http_parse=http_parse,
+ http_method=method,
+ param=params,
+ proxy=proxy,
+ proxy_auth=proxy_auth,
+ save_as=save_as,
+ timeout_download=cf.get('timeout.download'),
+ offline_dir=self.session.get_offline_directory()
+ )
+
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ release_downloader.set_files_to_download(remotes)
+ # """"""""""""""""""""""""
+
+ if release_downloader is None:
+ logging.error('Protocol ' + protocol + ' not supported')
+ self._close_download_service(dserv)
+ return False
+
+ try:
+ (file_list, dir_list) = release_downloader.list()
+ except Exception as e:
+ self._close_download_service(dserv)
+ logging.exception('Workflow:wf_release:Exception:' + str(e))
+ return False
+
+ release_downloader.match([cf.get('release.file')], file_list, dir_list)
+ if len(release_downloader.files_to_download) == 0:
+ logging.error('release.file defined but does not match any file')
+ self._close_download_service(dserv)
+ return False
+ if len(release_downloader.files_to_download) > 1:
+ logging.error('release.file defined but matches multiple files')
+ self._close_download_service(dserv)
+ return False
+ if cf.get('release.regexp') is None or not cf.get('release.regexp'):
+ # Try to get from regexp in file name
+ rel = re.search(cf.get('release.file'), release_downloader.files_to_download[0]['name'])
+ if rel is None:
+ logging.error('release.file defined but does not match any file')
+ self._close_download_service(dserv)
+ return False
+ release = rel.group(1)
+ else:
+ # Download and extract
+ tmp_dir = tempfile.mkdtemp('biomaj')
+ rel_files = release_downloader.download(tmp_dir)
+ rel_file = open(tmp_dir + '/' + rel_files[0]['name'])
+ rel_content = rel_file.read()
+ rel_file.close()
+ shutil.rmtree(tmp_dir)
+ rel = re.search(cf.get('release.regexp'), rel_content)
+ if rel is None:
+ logging.error('release.regexp defined but does not match any file content')
+ self._close_download_service(dserv)
+ return False
+ # If the regexp contains a matching group take it, else take the whole match
+ if len(rel.groups()) > 0:
+ release = rel.group(1)
+ else:
+ release = rel.group(0)
+
+ release_downloader.close()
+ self._close_download_service(dserv)
+
+ if release_downloader.error:
+ logging.error('An error occurred during download')
+ return False
+
+ self.session.set('release', release)
+ self.session.set('remoterelease', release)
+
+ self.__update_info({'$set': {'status.release.progress': str(release)}})
+ '''
+ MongoConnector.banks.update(
+ {'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}}
+ )
+ '''
+
+ # We restart from scratch; if a directory with this release already exists,
+ # change the local release accordingly
+ if self.options.get_option(Options.FROMSCRATCH):
+ index = 0
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory() + '__' + str(x)):
+ index = x + 1
+ if index > 0:
+ self.session.set('release', release + '__' + str(index))
+ release = release + '__' + str(index)
+
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+ # We want to download again into the same release; that is fine, we do not care that it is the same release
+ self.download_go_ahead = True
+
+ if not self.download_go_ahead and self.session.previous_release == self.session.get('remoterelease'):
+ if not self.session.config.get_bool('release.control', default=False):
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+
+ logging.info('Session:RemoteRelease:' + self.session.get('remoterelease'))
+ logging.info('Session:Release:' + self.session.get('release'))
+ return True
+
+ def no_need_to_update(self):
+ """
+ Set status to over and update to False because there is no need to update the bank
+ """
+ self.skip_all = True
+ self.session._session['status'][Workflow.FLOW_OVER] = True
+ self.wf_progress(Workflow.FLOW_OVER, True)
+ self.session._session['update'] = False
+ self.session.set('download_files', [])
+ self.session.set('files', [])
+ last_session = self.get_last_prod_session_for_release(self.session.get('remoterelease'))
+ self.session.set('release', last_session['release'])
+ self.wf_over()
+ return True
+
+ def get_last_prod_session_for_release(self, release):
+ """
+ Find the last session matching a release in production
+ """
+ last_session = None
+ for prod in self.bank.bank['production']:
+ if prod['remoterelease'] == release:
+ # Search session related to this production release
+ for s in self.bank.bank['sessions']:
+ if s['id'] == prod['session']:
+ last_session = s
+ break
+ return last_session
+
+ def _load_local_files_from_session(self, session_id):
+ """
+ Load local files for the session from the cache directory
+ """
+ cache_dir = self.bank.config.get('cache.dir')
+ f_local_files = None
+ file_path = os.path.join(cache_dir, 'local_files_' + str(session_id))
+ if not os.path.exists(file_path):
+ return f_local_files
+
+ with open(file_path) as data_file:
+ f_local_files = json.load(data_file)
+
+ return f_local_files
+
+ def _load_download_files_from_session(self, session_id):
+ """
+ Load download files for sessions from cache directory
+ """
+ cache_dir = self.bank.config.get('cache.dir')
+ f_downloaded_files = None
+ file_path = os.path.join(cache_dir, 'files_' + str(session_id))
+ if not os.path.exists(file_path):
+ return f_downloaded_files
+
+ with open(file_path) as data_file:
+ f_downloaded_files = json.load(data_file)
+
+ return f_downloaded_files
+
+ def is_previous_release_content_identical(self):
+ """
+ Checks if releases (previous_release and remoterelease) are identical in release id and content.
+ Expects the release.control parameter to be set to true or 1, otherwise the control is skipped.
+ """
+ if not self.session.config.get_bool('release.control', default=False):
+ return True
+ # Different release identifiers, so releases differ
+ if self.session.get('remoterelease') != self.session.previous_release:
+ logging.info('Workflow:wf_download:DifferentRelease')
+ return False
+ # Same release number, check further
+ previous_release_session = self.get_last_prod_session_for_release(self.session.previous_release)
+
+ if previous_release_session is None:
+ return False
+
+ previous_downloaded_files = self._load_download_files_from_session(previous_release_session.get('id'))
+ previous_release_session['download_files'] = previous_downloaded_files
+
+ if previous_downloaded_files is None:
+ # No info on previous download, consider that base release is enough
+ logging.warn('Workflow:wf_download:SameRelease:download_files not available, cannot compare to previous release')
+ return True
+
+ nb_elts = len(previous_downloaded_files)
+
+ if self.session.get('download_files') is not None and nb_elts != len(self.session.get('download_files')):
+ # Number of files to download vs previously downloaded files differ
+ logging.info('Workflow:wf_download:SameRelease:Number of files differ')
+ return False
+ # Same number of files, check hash of files
+ list1 = sorted(previous_downloaded_files, key=lambda k: k['hash'])
+ list2 = sorted(self.session.get('download_files'), key=lambda k: k['hash'])
+ for index in range(0, nb_elts):
+ if list1[index]['hash'] != list2[index]['hash']:
+ return False
+ return True
+
+ def check_and_incr_release(self):
+ """
+ Checks if the local release already exists on disk. If it does, creates a new
+ local release by appending __X to the release.
+
+ :returns: str local release
+ """
+ index = 0
+ release = self.session.get('release')
+ # Release directory exists, set index to 1
+ if os.path.exists(self.session.get_full_release_directory()):
+ index = 1
+ for x in range(1, 100):
+ if os.path.exists(self.session.get_full_release_directory() + '__' + str(x)):
+ index = x + 1
+
+ # If we found a directory for this release: XX or XX__Y
+ if index > 0:
+ self.session.set('release', release + '__' + str(index))
+ release = release + '__' + str(index)
+ logging.info('Workflow:wf_download:release:incr_release:' + release)
+ return release
+
+ def _create_dir_structure(self, downloader, offline_dir):
+ """
+ Create expected directory structure in offline directory before download
+ """
+ logging.debug('Workflow:wf_download:create_dir_structure:start')
+ for rfile in downloader.files_to_download:
+ save_as = None
+ if 'save_as' not in rfile or rfile['save_as'] is None:
+ save_as = rfile['name']
+ else:
+ save_as = rfile['save_as']
+
+ file_dir = offline_dir + '/' + os.path.dirname(save_as)
+
+ try:
+ if not os.path.exists(file_dir):
+ os.makedirs(file_dir)
+ except Exception as e:
+ logging.error(e)
+ logging.debug('Workflow:wf_download:create_dir_structure:done')
+
+ def _get_list_from_file(self, remote_list):
+ """
+ Load files to download from a file
+ """
+ if not os.path.exists(remote_list):
+ logging.info("remote.list " + remote_list + " does not exists, we suppose there is no new release available")
+ return None
+
+ data = []
+ with open(remote_list) as data_file:
+ data = json.load(data_file)
+
+ for rfile in data:
+ if 'year' not in rfile or 'month' not in rfile or 'day' not in rfile:
+ today = datetime.date.today()
+ rfile['month'] = today.month
+ rfile['day'] = today.day
+ rfile['year'] = today.year
+ if 'permissions' not in rfile:
+ rfile['permissions'] = ''
+ if 'group' not in rfile:
+ rfile['group'] = ''
+ if 'size' not in rfile:
+ rfile['size'] = 0
+ if 'hash' not in rfile:
+ rfile['hash'] = None
+ if 'root' not in rfile and self.session.config.get('remote.dir'):
+ rfile['root'] = self.session.config.get('remote.dir')
+ return data
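+
+ # Illustrative remote.list content (assumption on the minimal keys; missing fields are
+ # filled with defaults by the loop above):
+ #
+ #   [
+ #     {"name": "/pub/db/file1.fasta.gz", "size": 12345},
+ #     {"name": "/pub/db/file2.fasta.gz"}
+ #   ]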
+
+ def wf_download(self):
+ """
+ Download remote files or use an available local copy from last production directory if possible.
+ """
+ logging.info('Workflow:wf_download')
+ # flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
+ downloader = None
+ cf = self.session.config
+ self.session.previous_release = self.session.get('previous_release')
+
+ if self.session.get('release') is not None:
+ self.session.config.set('localrelease', self.session.get('release'))
+ self.session.config.set('remoterelease', self.session.get('remoterelease'))
+ if self.session.config.get_bool('releaseonly', default=False):
+ return True
+
+ if cf.get('protocol') == 'none':
+ if self.session.get('release') is None:
+ logging.error('Workflow:wf_download:no download file but no release found')
+ return False
+ else:
+ logging.info('Workflow:wf_download:no download file expected')
+ self.downloaded_files = []
+ if not os.path.exists(self.session.get_full_release_directory()):
+ os.makedirs(self.session.get_full_release_directory())
+ return True
+
+ downloaders = []
+
+ pool_size = self.session.config.get('files.num.threads', default=None)
+
+ dserv = None
+ if self.bank.config.get('micro.biomaj.service.download', default=None) == '1':
+ dserv = DownloadClient(
+ self.bank.config.get('micro.biomaj.rabbit_mq'),
+ int(self.bank.config.get('micro.biomaj.rabbit_mq_port', default='5672')),
+ self.bank.config.get('micro.biomaj.rabbit_mq_virtualhost', default='/'),
+ self.bank.config.get('micro.biomaj.rabbit_mq_user', default=None),
+ self.bank.config.get('micro.biomaj.rabbit_mq_password', default=None),
+ )
+ else:
+ dserv = DownloadClient()
+
+ if pool_size:
+ dserv.set_queue_size(int(pool_size))
+
+ proxy = self.bank.config.get('micro.biomaj.proxy')
+ session = dserv.create_session(self.name, proxy)
+ logging.info("Workflow:wf_download:DownloadSession:" + str(session))
+
+ use_remote_list = False
+
+ http_parse = HTTPParse(
+ cf.get('http.parse.dir.line'),
+ cf.get('http.parse.file.line'),
+ int(cf.get('http.group.dir.name')),
+ int(cf.get('http.group.dir.date')),
+ int(cf.get('http.group.file.name')),
+ int(cf.get('http.group.file.date')),
+ cf.get('http.group.file.date_format', default=None),
+ int(cf.get('http.group.file.size'))
+ )
+ proxy = cf.get('proxy')
+ proxy_auth = cf.get('proxy_auth')
+
+ if cf.get('protocol') == 'multi':
+ """
+ Search for:
+ protocol = multi
+ remote.file.0.protocol = directftp
+ remote.file.0.server = ftp.ncbi.org
+ remote.file.0.path = /musmusculus/chr1/chr1.fa
+
+ => http://ftp2.fr.debian.org/debian/README.html?key1=value&key2=value2
+ remote.file.1.protocol = directhttp
+ remote.file.1.server = ftp2.fr.debian.org
+ remote.file.1.path = debian/README.html
+ remote.file.1.method = GET
+ remote.file.1.params.keys = key1,key2
+ remote.file.1.params.key1 = value1
+ remote.file.1.params.key2 = value2
+
+ => http://ftp2.fr.debian.org/debian/README.html
+ #POST PARAMS:
+ key1=value
+ key2=value2
+ remote.file.1.protocol = directhttp
+ remote.file.1.server = ftp2.fr.debian.org
+ remote.file.1.path = debian/README.html
+ remote.file.1.method = POST
+ remote.file.1.params.keys = key1,key2
+ remote.file.1.params.key1 = value1
+ remote.file.1.params.key2 = value2
+
+ ......
+ """
+ # Creates multiple downloaders
+ i = 0
+ rfile = cf.get('remote.file.' + str(i) + '.path')
+ server = None
+ while rfile is not None:
+ protocol = cf.get('protocol')
+ if cf.get('remote.file.' + str(i) + '.protocol') is not None:
+ protocol = cf.get('remote.file.' + str(i) + '.protocol')
+
+ server = cf.get('server')
+ if cf.get('remote.file.' + str(i) + '.server') is not None:
+ server = cf.get('remote.file.' + str(i) + '.server')
+
+ params = None
+ keys = cf.get('remote.file.' + str(i) + '.params.keys')
+ if keys is not None:
+ params = {}
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get('remote.file.' + str(i) + '.params.' + key.strip())
+ params[key.strip()] = param.strip()
+
+ method = cf.get('remote.file.' + str(i) + '.method')
+ if method is None:
+ if cf.get('url.method') is not None:
+ method = cf.get('url.method')
+ else:
+ method = 'GET'
+
+ credentials = cf.get('remote.file.' + str(i) + '.credentials')
+ if not credentials:
+ credentials = cf.get('server.credentials')
+
+ remotes = [cf.get('remote.file.' + str(i) + '.path')]
+
+ save_as = cf.get('remote.file.' + str(i) + '.path')
+ if cf.get('remote.file.' + str(i) + '.name'):
+ save_as = cf.get('remote.file.' + str(i) + '.name')
+
+ subdownloader = dserv.get_handler(
+ protocol,
+ server,
+ '',
+ credentials=credentials,
+ http_parse=http_parse,
+ http_method=method,
+ param=params,
+ proxy=proxy,
+ proxy_auth=proxy_auth,
+ save_as=save_as,
+ timeout_download=cf.get('timeout.download'),
+ offline_dir=self.session.get_offline_directory()
+ )
+ subdownloader.set_files_to_download(remotes)
+
+ downloaders.append(subdownloader)
+
+ i += 1
+ rfile = cf.get('remote.file.' + str(i) + '.path')
+ else:
+ """
+ Simple case, one downloader with regexp
+ """
+ protocol = cf.get('protocol')
+ server = cf.get('server')
+
+ params = None
+ keys = cf.get('url.params')
+ if keys is not None:
+ params = {}
+ keys = keys.split(',')
+ for key in keys:
+ param = cf.get(key.strip() + '.value')
+ params[key.strip()] = param.strip()
+
+ method = cf.get('url.method')
+ if method is None:
+ method = 'GET'
+
+ credentials = cf.get('server.credentials')
+
+ remote_dir = cf.get('remote.dir')
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ remotes = [cf.get('remote.dir')[:-1]]
+ remote_dir = '/'
+
+ save_as = cf.get('target.name')
+
+ downloader = dserv.get_handler(
+ protocol,
+ server,
+ remote_dir,
+ credentials=credentials,
+ http_parse=http_parse,
+ http_method=method,
+ param=params,
+ proxy=proxy,
+ proxy_auth=proxy_auth,
+ save_as=save_as,
+ timeout_download=cf.get('timeout.download'),
+ offline_dir=self.session.get_offline_directory()
+ )
+
+ if protocol == 'directhttp' or protocol == 'directhttps' or protocol == 'directftp':
+ downloader.set_files_to_download(remotes)
+
+ remote_list = cf.get('remote.list', default=None)
+ if remote_list is not None:
+ logging.info("Use list from " + remote_list)
+ downloader.files_to_download = self._get_list_from_file(remote_list)
+ use_remote_list = True
+
+ downloaders.append(downloader)
+
+ self._close_download_service(dserv)
+
+ for downloader in downloaders:
+ if downloader is None:
+ logging.error('A downloader could not be created, protocol not supported')
+ return False
+
+ files_to_download = []
+
+ for downloader in downloaders:
+ if use_remote_list:
+ if not downloader.files_to_download:
+ self.session.set('remoterelease', self.session.previous_release)
+ return self.no_need_to_update()
+ else:
+ (file_list, dir_list) = downloader.list()
+ downloader.match(cf.get('remote.files', default='.*').split(), file_list, dir_list)
+
+ # Check if save_as defined, else check if regexp contains some save information with groups
+ for f in downloader.files_to_download:
+ if 'save_as' not in f or not f['save_as']:
+ f['save_as'] = f['name']
+ for p in cf.get('remote.files', default='.*').split():
+ if p.startswith('^'):
+ p = p.replace('^', '^/')
+ else:
+ p = '/' + p
+ res = re.match(p, f['name'])
+ if res is not None and res.groups() is not None and len(res.groups()) >= 1:
+ f['save_as'] = '/'.join(res.groups())
+ break
+
+ files_to_download += downloader.files_to_download
+
+ self.session.set('download_files', downloader.files_to_download)
+ self.session._session['stats']['nb_downloaded_files'] = len(files_to_download)
+ logging.info('Workflow:wf_download:nb_files_to_download:%d' % (len(files_to_download)))
+
+ if self.session.get('release') and self.session.config.get_bool('release.control', default=False):
+ if self.session.previous_release == self.session.get('remoterelease'):
+ if self.is_previous_release_content_identical():
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+ else:
+ release = self.check_and_incr_release()
+
+ if self.session.get('release') is None:
+ # Not defined, or could not get it earlier
+ # Set release to most recent file to download
+ release_dict = Utils.get_more_recent_file(downloader.files_to_download)
+ if release_dict is None:
+ today = datetime.datetime.now()
+ release_dict = {'year': today.year, 'month': today.month, 'day': today.day}
+
+ release = str(release_dict['year']) + '-' + str(release_dict['month']) + '-' + str(release_dict['day'])
+ if cf.get('release.format'):
+ release_date = datetime.datetime.now()
+ release_date = release_date.replace(year=int(release_dict['year']), month=int(release_dict['month']), day=int(release_dict['day']))
+ # Fix configparser problem between py2 and py3
+ release = release_date.strftime(cf.get('release.format').replace('%%', '%'))
+ self.session.set('release', release)
+ self.session.set('remoterelease', release)
+
+ logging.info('Workflow:wf_download:release:remoterelease:' + self.session.get('remoterelease'))
+ logging.info('Workflow:wf_download:release:release:' + release)
+
+ self.__update_info({'$set': {'status.release.progress': str(release)}})
+ '''
+ MongoConnector.banks.update(
+ {'name': self.bank.name},
+ {'$set': {'status.release.progress': str(release)}}
+ )
+ '''
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+ # We want to download again into the same release; that is fine, we do not care that it is the same release
+ self.download_go_ahead = True
+ if not self.download_go_ahead and self.session.previous_release == self.session.get('remoterelease') and self.is_previous_release_content_identical():
+ logging.info('Workflow:wf_release:same_as_previous_session')
+ return self.no_need_to_update()
+
+ # We restart from scratch, check if directory with this release already exists
+ if self.options.get_option(Options.FROMSCRATCH) or self.options.get_option('release') is None:
+ release = self.check_and_incr_release()
+
+ self.session.config.set('localrelease', self.session.get('release'))
+ self.session.config.set('remoterelease', self.session.get('remoterelease'))
+
+ if self.session.config.get_bool('releaseonly', default=False):
+ return True
+
+ self.banks = MongoConnector.banks
+ self.bank.bank = self.banks.find_one({'name': self.name})
+
+ nb_prod_dir = len(self.bank.bank['production'])
+ offline_dir = self.session.get_offline_directory()
+
+ copied_files = []
+
+ # Check if already in offlinedir
+ files_in_offline = 0
+ nb_expected_files = 0
+ for downloader in downloaders:
+ keep_files = []
+ nb_expected_files += len(downloader.files_to_download)
+ if os.path.exists(offline_dir):
+ for file_to_download in downloader.files_to_download:
+ # If file is in offline dir and has same date and size, do not download again
+ if os.path.exists(offline_dir + '/' + file_to_download['name']):
+ try:
+ file_stat = os.stat(offline_dir + '/' + file_to_download['name'])
+ f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(offline_dir + '/' + file_to_download['name']))
+ year = str(f_stat.year)
+ month = str(f_stat.month)
+ day = str(f_stat.day)
+ if str(file_stat.st_size) != str(file_to_download['size']) or \
+ str(year) != str(file_to_download['year']) or \
+ str(month) != str(file_to_download['month']) or \
+ str(day) != str(file_to_download['day']):
+ logging.debug('Workflow:wf_download:different_from_offline:' + file_to_download['name'])
+ keep_files.append(file_to_download)
+ else:
+ logging.debug('Workflow:wf_download:offline:' + file_to_download['name'])
+ files_in_offline += 1
+ copied_files.append(file_to_download)
+ except Exception as e:
+ # Could not get stats on file
+ logging.debug('Workflow:wf_download:offline:failed to stat file: ' + str(e))
+ os.remove(offline_dir + '/' + file_to_download['name'])
+ keep_files.append(file_to_download)
+ else:
+ keep_files.append(file_to_download)
+ downloader.files_to_download = keep_files
+ logging.info("Workflow:wf_download:nb_expected_files:" + str(nb_expected_files))
+ logging.info("Workflow:wf_download:nb_files_in_offline_dir:" + str(files_in_offline))
+ # If everything was already in offline dir
+ everything_present = True
+ for downloader in downloaders:
+ if len(downloader.files_to_download) > 0:
+ everything_present = False
+ break
+ if everything_present:
+ self.downloaded_files = []
+ logging.info("Workflow:wf_download:all_files_in_offline:skip download")
+ return True
+
+ for downloader in downloaders:
+ self._create_dir_structure(downloader, offline_dir)
+
+ self.download_go_ahead = False
+ if self.options.get_option(Options.FROM_TASK) == 'download':
+            # Explicit request to rerun the download task within this release; proceed even though the release is unchanged
+ self.download_go_ahead = True
+
+ if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
+ # Get last production
+ last_production = self.bank.bank['production'][nb_prod_dir - 1]
+ # Get session corresponding to production directory
+ last_production_session = self.banks.find_one({'name': self.name, 'sessions.id': last_production['session']}, {'sessions.$': 1})
+ last_production_session_release_directory = self.session.get_full_release_directory(release=last_production['release'])
+ last_production_dir = os.path.join(last_production_session_release_directory, 'flat')
+ # Checks if some files can be copied instead of downloaded
+ last_production_files = None
+ if len(last_production_session['sessions']) > 0:
+ last_production_files = self._load_local_files_from_session(last_production_session['sessions'][0]['id'])
+
+ if not cf.get_bool('copy.skip', default=False):
+ for downloader in downloaders:
+ downloader.download_or_copy(last_production_files, last_production_dir)
+
+ everything_copied = True
+ for downloader in downloaders:
+ if len(downloader.files_to_download) > 0:
+ everything_copied = False
+ break
+ if everything_copied:
+ logging.info('Workflow:wf_download:all files copied from %s' % (str(last_production_dir)))
+ # return self.no_need_to_update()
+
+ logging.debug('Workflow:wf_download:Copy files from ' + last_production_dir)
+ for downloader in downloaders:
+ copied_files += downloader.files_to_copy
+ Utils.copy_files(downloader.files_to_copy, offline_dir)
+
+ downloader.close()
+
+ pool_size = self.session.config.get('files.num.threads', default=None)
+ dserv = None
+
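+        # Use the remote download microservice (over RabbitMQ) when enabled,
+        # otherwise fall back to a DownloadClient without a message queue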
+ if self.bank.config.get('micro.biomaj.service.download', default=None) == '1':
+ dserv = DownloadClient(
+ self.bank.config.get('micro.biomaj.rabbit_mq'),
+ int(self.bank.config.get('micro.biomaj.rabbit_mq_port', default='5672')),
+ self.bank.config.get('micro.biomaj.rabbit_mq_virtualhost', default='/'),
+ self.bank.config.get('micro.biomaj.rabbit_mq_user', default=None),
+ self.bank.config.get('micro.biomaj.rabbit_mq_password', default=None),
+ redis_client=self.redis_client,
+ redis_prefix=self.redis_prefix
+ )
+ if pool_size:
+ logging.debug('Set rate limiting: %s' % (str(pool_size)))
+ dserv.set_rate_limiting(int(pool_size))
+
+ else:
+ dserv = DownloadClient()
+
+ if pool_size:
+ dserv.set_queue_size(int(pool_size))
+
+ proxy = self.bank.config.get('micro.biomaj.proxy')
+ session = dserv.create_session(self.name, proxy)
+ logging.info("Workflow:wf_download:DownloadSession:" + str(session))
+
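+        # Build one protobuf DownloadFile operation per file and send it to the
+        # download service, forwarding file metadata (size, dates, checksums) when known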
+ for downloader in downloaders:
+ for file_to_download in downloader.files_to_download:
+ operation = message_pb2.Operation()
+ operation.type = 1
+ message = message_pb2.DownloadFile()
+ message.bank = self.name
+ message.session = session
+ message.local_dir = offline_dir
+ remote_file = message_pb2.DownloadFile.RemoteFile()
+ protocol = downloader.protocol
+ remote_file.protocol = message_pb2.DownloadFile.Protocol.Value(protocol.upper())
+
+ if downloader.credentials:
+ remote_file.credentials = downloader.credentials
+
+ remote_file.server = downloader.server
+ if cf.get('remote.dir'):
+ remote_file.remote_dir = cf.get('remote.dir')
+ else:
+ remote_file.remote_dir = ''
+
+ if http_parse:
+ msg_http_parse = message_pb2.DownloadFile.HttpParse()
+ msg_http_parse.dir_line = http_parse.dir_line
+ msg_http_parse.file_line = http_parse.file_line
+ msg_http_parse.dir_name = http_parse.dir_name
+ msg_http_parse.dir_date = http_parse.dir_date
+ msg_http_parse.file_name = http_parse.file_name
+ msg_http_parse.file_date = http_parse.file_date
+ msg_http_parse.file_size = http_parse.file_size
+ if http_parse.file_date_format:
+ msg_http_parse.file_date_format = http_parse.file_date_format
+ remote_file.http_parse.MergeFrom(msg_http_parse)
+
+ biomaj_file = remote_file.files.add()
+ biomaj_file.name = file_to_download['name']
+ if 'root' in file_to_download and file_to_download['root']:
+ biomaj_file.root = file_to_download['root']
+ if downloader.param:
+ for key in list(downloader.param.keys()):
+ param = remote_file.param.add()
+ param.name = key
+ param.value = downloader.param[key]
+ if 'save_as' in file_to_download and file_to_download['save_as']:
+ biomaj_file.save_as = file_to_download['save_as']
+ if 'url' in file_to_download and file_to_download['url']:
+ biomaj_file.url = file_to_download['url']
+ if 'permissions' in file_to_download and file_to_download['permissions']:
+ biomaj_file.metadata.permissions = file_to_download['permissions']
+ if 'size' in file_to_download and file_to_download['size']:
+ biomaj_file.metadata.size = file_to_download['size']
+ if 'year' in file_to_download and file_to_download['year']:
+ biomaj_file.metadata.year = file_to_download['year']
+ if 'month' in file_to_download and file_to_download['month']:
+ biomaj_file.metadata.month = file_to_download['month']
+ if 'day' in file_to_download and file_to_download['day']:
+ biomaj_file.metadata.day = file_to_download['day']
+ if 'hash' in file_to_download and file_to_download['hash']:
+ biomaj_file.metadata.hash = file_to_download['hash']
+ if 'md5' in file_to_download and file_to_download['md5']:
+ biomaj_file.metadata.md5 = file_to_download['md5']
+
+ message.http_method = message_pb2.DownloadFile.HTTP_METHOD.Value(downloader.method.upper())
+
+ timeout_download = cf.get('timeout.download', default=None)
+ if timeout_download:
+ try:
+ message.timeout_download = int(timeout_download)
+ except Exception as e:
+ logging.error('Wrong timeout type for timeout.download: ' + str(e))
+
+ if self.span:
+ trace = message_pb2.Operation.Trace()
+ trace.trace_id = self.span.get_trace_id()
+ trace.span_id = self.span.get_span_id()
+ operation.trace.MergeFrom(trace)
+
+ message.remote_file.MergeFrom(remote_file)
+ operation.download.MergeFrom(message)
+ dserv.download_remote_file(operation)
+
+ logging.info("Workflow:wf_download:Download:Waiting")
+ download_error = False
+ try:
+ download_error = dserv.wait_for_download()
+ except Exception as e:
+ self._close_download_service(dserv)
+ logging.exception('Workflow:wf_download:Exception:' + str(e))
+ return False
+ except KeyboardInterrupt:
+ logging.warn("Ctrl-c received! Stop downloads...")
+ logging.warn("Running downloads will continue and process will stop.")
+ self._close_download_service(dserv)
+ return False
+
+ self._close_download_service(dserv)
+
+ self.downloaded_files = copied_files
+ for downloader in downloaders:
+ self.downloaded_files += downloader.files_to_download
+
+ if download_error:
+            logging.error('An error occurred during download')
+ return False
+
+ return True
+
+ def wf_uncompress(self):
+ """
+        Uncompress downloaded archive files unless no.extract is set to true
+ """
+ logging.info('Workflow:wf_uncompress')
+ if len(self.downloaded_files) == 0:
+ logging.info("Workflow:wf_uncompress:NoFileDownload:NoExtract")
+ return True
+ no_extract = self.session.config.get('no.extract')
+ if no_extract is None or no_extract == 'false':
+ for file in self.downloaded_files:
+ if 'save_as' not in file:
+ file['save_as'] = file['name']
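+                # Try the extraction up to two times before reporting a failure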
+ nb_try = 1
+ not_ok = True
+ while nb_try < 3 and not_ok:
+ status = Utils.uncompress(self.session.get_offline_directory() + '/' + file['save_as'])
+ if status:
+ not_ok = False
+ else:
+ logging.warn('Workflow:wf_uncompress:Failure:' + file['name'] + ':' + str(nb_try))
+ nb_try += 1
+ if not_ok:
+ logging.error('Workflow:wf_uncompress:Failure:' + file['name'])
+ return False
+ else:
+ logging.info("Workflow:wf_uncompress:NoExtract")
+ return True
+
+ def wf_copy(self):
+ """
+ Copy files from offline directory to release directory
+ """
+ logging.info('Workflow:wf_copy')
+ if len(self.downloaded_files) == 0:
+ logging.info("Workflow:wf_copy:NoFileDownload:NoCopy")
+ return True
+ from_dir = os.path.join(self.session.config.get('data.dir'),
+ self.session.config.get('offline.dir.name'))
+ regexp = self.session.config.get('local.files', default='**/*').split()
+ to_dir = os.path.join(
+ self.session.config.get('data.dir'),
+ self.session.config.get('dir.version'),
+ self.session.get_release_directory(),
+ 'flat'
+ )
+
+ local_files = Utils.copy_files_with_regexp(from_dir, to_dir, regexp, True)
+ self.session._session['files'] = local_files
+ if len(self.session._session['files']) == 0:
+ logging.error('Workflow:wf_copy:No file match in offline dir')
+ return False
+ return True
+
+ def wf_metadata(self):
+ """
+ Update metadata with info gathered from processes
+ """
+ logging.info('Workflow:wf_metadata')
+ self.bank.session.set('formats', {})
+ per_process_meta_data = self.session.get('per_process_metadata')
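+        # Merge the metadata reported by each post-process into the session-level 'formats' dict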
+ for proc in list(per_process_meta_data.keys()):
+ for meta_data in list(per_process_meta_data[proc].keys()):
+ session_formats = self.bank.session.get('formats')
+ if meta_data not in session_formats:
+ session_formats[meta_data] = per_process_meta_data[proc][meta_data]
+ else:
+ session_formats[meta_data] += per_process_meta_data[proc][meta_data]
+ return True
+
+ def wf_stats(self):
+ """
+ Get some stats from current release data dir
+ """
+ logging.info('Workflow:wf_stats')
+ do_stats = self.bank.config.get('data.stats')
+ if do_stats is None or do_stats == '0':
+ self.session.set('fullsize', 0)
+ return True
+ prod_dir = self.session.get_full_release_directory()
+ dir_size = Utils.get_folder_size(prod_dir)
+ self.session.set('fullsize', dir_size)
+ return True
+
+ def wf_postprocess(self):
+ """
+ Execute post processes
+ """
+        # Create a temporary 'future_release' symlink for compatibility with
+        # processes that access the release directory under this name
+ future_link = os.path.join(
+ self.bank.config.get('data.dir'),
+ self.bank.config.get('dir.version'),
+ 'future_release'
+ )
+ # prod_dir = self.session.get_full_release_directory()
+ to_dir = os.path.join(
+ self.bank.config.get('data.dir'),
+ self.bank.config.get('dir.version')
+ )
+
+ if os.path.lexists(future_link):
+ os.remove(future_link)
+ os.chdir(to_dir)
+ os.symlink(self.session.get_release_directory(), 'future_release')
+
+ logging.info('Workflow:wf_postprocess')
+ blocks = self.session._session['process']['postprocess']
+ pfactory = PostProcessFactory(self.bank, blocks, redis_client=self.redis_client, redis_prefix=self.redis_prefix)
+ res = pfactory.run()
+ self.session._session['process']['postprocess'] = pfactory.blocks
+
+        # In any case, delete the symlink
+ if os.path.lexists(future_link):
+ os.remove(future_link)
+
+ return res
+
+ def wf_publish(self):
+ """
+ Add *current* symlink to this release
+ """
+ if self.bank.config.get_bool('auto_publish', default=False):
+ logging.info('Workflow:wf_publish')
+ self.bank.publish()
+ return True
+
+ if not self.options.get_option(Options.PUBLISH):
+ logging.info('Workflow:wf_publish:no')
+ return True
+ logging.info('Workflow:wf_publish')
+ self.bank.publish()
+ return True
+
+ def wf_old_biomaj_api(self):
+ """
+        Generate a listingv1.<format> file per format, containing the list of files in the directories declared in formats
+ """
+ release_dir = self.session.get_full_release_directory()
+ for release_format in self.bank.session.get('formats'):
+ format_file = os.path.join(release_dir, 'listingv1.' + release_format.replace('/', '_'))
+ section = self.list_section(release_dir, release_format, release_format)
+ logging.debug("Worfklow:OldAPI:WriteListing: " + format_file)
+ fd = os.open(format_file, os.O_RDWR | os.O_CREAT)
+ os.write(fd, json.dumps(section).encode('utf-8'))
+ os.close(fd)
+ return True
+
+ def list_section(self, base_dir, release_format, base_format):
+ """
+        Get section files and sub-sections from base_dir for the release_format directory
+
+        :param base_dir: root directory
+        :type base_dir: str
+        :param release_format: sub directory to scan
+        :type release_format: str
+        :param base_format: first directory indicating the format
+        :type base_format: str
+        :return: dict of section details
+ """
+ section = {"name": release_format, "sections": [], "files": []}
+ format_dir = os.path.join(base_dir, release_format)
+ if not os.path.exists(format_dir):
+ logging.info("Worfklow:OldAPI:Format directory " + release_format + " does not exists, skipping")
+ return section
+ format_dir_list = os.listdir(format_dir)
+ for format_dir_file in format_dir_list:
+ if os.path.isfile(os.path.join(format_dir, format_dir_file)):
+ if base_format.lower() == 'blast':
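+                    # For BLAST banks, only alias (.nal) files are listed, stripped of their extension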
+ if format_dir_file.endswith('.nal'):
+ fileName, fileExtension = os.path.splitext(format_dir_file)
+ section['files'].append(os.path.join(format_dir, fileName))
+ else:
+ section['files'].append(os.path.join(format_dir, format_dir_file))
+ else:
+ # This is a sub directory
+ new_section = self.list_section(format_dir, format_dir_file, base_format)
+ section['sections'].append(new_section)
+ return section
+
+ def wf_clean_offline(self):
+ """
+ Clean offline directory
+ """
+ logging.info('Workflow:wf_clean_offline')
+ if os.path.exists(self.session.get_offline_directory()):
+ shutil.rmtree(self.session.get_offline_directory())
+ return True
+
+ def wf_clean_old_sessions(self):
+ """
+ Delete old sessions not related to a production directory or last run
+ """
+ logging.info('Workflow:wf_clean_old_sessions')
+ self.bank.clean_old_sessions()
+ return True
+
+ def wf_delete_old(self):
+ """
+ Delete old production dirs
+ """
+ logging.info('Workflow:wf_delete_old')
+ if self.options.get_option(Options.FROM_TASK) is not None:
+ # This is a run on an already present release, skip delete
+ logging.info('Workflow:wf_delete_old:Skip')
+ return True
+ if not self.session.config.get('keep.old.version'):
+ keep = 1
+ else:
+ keep = int(self.session.config.get('keep.old.version'))
+ # Current production dir is not yet in list
+ nb_prod = len(self.bank.bank['production'])
+ # save session during delete workflow
+ keep_session = self.bank.session
+
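+        # Remove the oldest production directories until only 'keep' remain,
+        # skipping the release being built, frozen releases and the published one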
+ if nb_prod > keep:
+ for prod in self.bank.bank['production']:
+ if prod['release'] == keep_session.get('release'):
+ continue
+ if 'freeze' in prod and prod['freeze']:
+ continue
+ if self.bank.bank['current'] == prod['session']:
+ continue
+ if nb_prod - keep > 0:
+ nb_prod -= 1
+ session = self.bank.get_new_session(RemoveWorkflow.FLOW)
+                # Remove the 'init' and 'over' steps since we are already inside a running workflow
+ i_init = -1
+ i_over = -1
+ for i in range(0, len(session.flow)):
+ if session.flow[i]['name'] == 'init':
+ i_init = i
+ if i_init >= 0:
+ del session.flow[i_init]
+ for i in range(0, len(session.flow)):
+ if session.flow[i]['name'] == 'over':
+ i_over = i
+ if i_over >= 0:
+ del session.flow[i_over]
+
+ session.set('action', 'remove')
+ session.set('release', prod['release'])
+ session.set('remoterelease', prod['remoterelease'])
+ session.set('update_session_id', prod['session'])
+ logging.info('Workflow:wf_delete_old:Delete:' + prod['release'])
+ res = self.bank.start_remove(session)
+ if not res:
+ logging.error('Workflow:wf_delete_old:ErrorDelete:' + prod['release'])
+ else:
+ break
+ # Set session back
+ self.bank.session = keep_session
+
+ return True
+
+
+class ReleaseCheckWorkflow(UpdateWorkflow):
+ """
+    Workflow to check the remote release of a bank without updating it
+ """
+
+ FLOW = [
+ {'name': 'init', 'steps': []},
+ {'name': 'check', 'steps': []},
+ {'name': 'preprocess', 'steps': []},
+ {'name': 'release', 'steps': []},
+ {'name': 'download', 'steps': []},
+ {'name': 'over', 'steps': []}
+ ]
+
+ def __init__(self, bank):
+ """
+ Instantiate a new workflow
+
+ :param bank: bank on which to apply the workflow
+ :type bank: Bank
+ """
+ UpdateWorkflow.__init__(self, bank)
+ logging.debug('New release check workflow')
+ self.session.config.set('releaseonly', 'true')
+
+ def wf_init(self):
+ """
+ Initialize workflow, do not lock bank as it is not modified
+ If bank is already locked, stop workflow
+ """
+ logging.info('Workflow:wf_init')
+ data_dir = self.session.config.get('data.dir')
+ lock_dir = self.session.config.get('lock.dir', default=data_dir)
+ lock_file = os.path.join(lock_dir, self.name + '.lock')
+ if os.path.exists(lock_file):
+ logging.error('Bank ' + self.name + ' is locked, a process may be in progress, else remove the lock file ' + lock_file)
+ return False
+ return True
+
+ def wf_over(self):
+ """
+ Workflow is over
+ """
+ logging.info('Workflow:wf_over')
+ return True
+
+ def __update_info(self, info):
+ return
+
+ def wf_progress(self, task, status):
+ return
diff --git a/config.yml b/config.yml
new file mode 100644
index 0000000..2c46e64
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,48 @@
+biomaj:
+ global_properties: '/pasteur/services/policy01/banques/biomaj3/global.properties'
+
+rabbitmq:
+ host: '127.0.0.1'
+ port: 5672
+ user: null
+ password: null
+ virtual_host: '/'
+
+mongo:
+ url: 'mongodb://localhost:27017'
+ db: 'biomaj'
+
+consul:
+ host: null
+ # Unique agent identifier name among biomaj downloaders
+ id: 'biomaj_daemon_agent'
+
+web:
+ debug: true
+ port: 5000
+ local_endpoint: 'http://127.0.0.1:5000'
+
+tls:
+ key: null
+ cert: null
+
+log_config:
+ 'version': 1
+ 'formatters':
+ 'generic':
+ 'format': '%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s'
+ 'handlers':
+ 'console':
+ 'class': 'logging.StreamHandler'
+ 'formatter': 'generic'
+ 'level': 'DEBUG'
+ 'loggers':
+ 'root':
+ 'level': 'INFO'
+ 'handlers':
+ - 'console'
+ 'biomaj':
+ 'level': 'DEBUG'
+ 'handlers':
+ - 'console'
+ 'disable_existing_loggers': False
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..356ef0a
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BioMAJ.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BioMAJ.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/BioMAJ"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BioMAJ"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/admin.rst b/docs/admin.rst
new file mode 100644
index 0000000..b15363d
--- /dev/null
+++ b/docs/admin.rst
@@ -0,0 +1,37 @@
+***************
+Advanced Topics
+***************
+
+LDAP
+====
+
+The `BioMAJ watcher <https://github.com/genouest/biomaj-watcher>`__,
+provides an optional web interface to manage banks. Users can create
+"private" banks and manage them via the web.
+
+ElasticSearch
+=============
+
+In order to use the ``--search`` flag, you will need to connect BioMAJ to an
+ElasticSearch cluster.
+
+You will need to edit your ``global.properties`` to indicate where the ES servers are:
+
+.. code:: ini
+
+   use_elastic=1
+ #Comma separated list of elasticsearch nodes host1,host2:port2
+ elastic_nodes=localhost
+ elastic_index=biomaj
+ # Calculate data.dir size stats
+ data.stats=1
+
+An example ``docker-compose.yml`` would use this:
+
+.. literalinclude:: docker-compose-advanced.yml
+ :language: yaml
+
+And a modified ``global.properties`` referenced in that file would enable elasticsearch:
+
+.. literalinclude:: global.advanced.properties
+ :language: ini
diff --git a/docs/alu.properties b/docs/alu.properties
new file mode 100644
index 0000000..56ea01c
--- /dev/null
+++ b/docs/alu.properties
@@ -0,0 +1,42 @@
+[GENERAL]
+# Database name/description
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+# The short name for the database
+db.name=alu
+# Database type. Some common values include genome, nucleic, nucleic_protein, protein, other
+db.type=nucleic_protein
+# Base directory to download temp files to
+offline.dir.name=offline/ncbi/blast/alu_tmp
+# Directory where bank versions are stored (relative to data.dir)
+dir.version=ncbi/blast/alu
+# Update frequency
+frequency.update=0
+# Number of threads used during downloading
+files.num.threads=1
+
+# Protocol, common values include ftp, http
+protocol=ftp
+# The FQDN of the server you wish to connect to
+server=ftp.ncbi.nih.gov
+# And the directory on that server
+remote.dir=/blast/db/FASTA/
+# Regular expression of the files to fetch from that remote directory.
+remote.files=^alu.*\.gz$
+
+# BioMAJ can automatically extract the version number from a release
+# document. This will be covered in another section.
+release.file=
+release.regexp=
+release.file.compressed=
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+# Regular expression of downloaded files to copy into the release directory
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+db.post.process=
+
+### Deployment ###
+keep.old.version=1
diff --git a/docs/bank.rst b/docs/bank.rst
new file mode 100644
index 0000000..08925d9
--- /dev/null
+++ b/docs/bank.rst
@@ -0,0 +1,15 @@
+.. _bank:
+
+
+*****
+bank
+*****
+
+
+bank API reference
+==================
+ .. automodule:: biomaj.bank
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..5bf65da
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+#
+# BioMAJ documentation build configuration file, created by
+# sphinx-quickstart on Mon Oct 27 08:26:18 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+from mock import Mock as MagicMock
+
+class Mock(MagicMock):
+ @classmethod
+ def __getattr__(cls, name):
+ return Mock()
+
+MOCK_MODULES = ['pycurl', 'pymongo', 'elasticsearch', 'drmaa', 'influxdb',
+ 'biomaj_download',
+ 'biomaj_download.downloadclient',
+ 'biomaj_download.download',
+ 'biomaj_download.download.localcopy',
+ 'biomaj_download.download.http',
+ 'biomaj_download.download.http.HTTPParse',
+ 'biomaj_download.message']
+sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.ifconfig',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.graphviz'
+ ]
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'BioMAJ'
+copyright = u'2014, Olivier Sallou'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '3.0'
+# The full version, including alpha/beta/rc tags.
+release = '3.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'BioMAJdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ ('index', 'BioMAJ.tex', u'BioMAJ Documentation',
+ u'Olivier Sallou', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'biomaj', u'BioMAJ Documentation',
+ [u'Olivier Sallou'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'BioMAJ', u'BioMAJ Documentation',
+ u'Olivier Sallou', 'BioMAJ', 'Biological databanks update.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/docker-compose-advanced.yml b/docs/docker-compose-advanced.yml
new file mode 100644
index 0000000..8e56b8f
--- /dev/null
+++ b/docs/docker-compose-advanced.yml
@@ -0,0 +1,16 @@
+version: '2'
+services:
+ biomaj:
+ image: osallou/biomaj-docker
+ links:
+ - mongodb:biomaj-mongodb
+ - elasticsearch
+ volumes:
+ - ./data:/var/lib/biomaj
+ - ./global.advanced.properties:/etc/biomaj/global.properties
+
+ mongodb:
+ image: mongo
+
+ elasticsearch:
+ image: elasticsearch:1.7
diff --git a/docs/docker-compose.yml b/docs/docker-compose.yml
new file mode 100644
index 0000000..37210b0
--- /dev/null
+++ b/docs/docker-compose.yml
@@ -0,0 +1,11 @@
+version: '2'
+services:
+ biomaj:
+ image: osallou/biomaj-docker
+ links:
+ - mongodb:biomaj-mongodb
+ volumes:
+ - ./data:/var/lib/biomaj
+
+ mongodb:
+ image: mongo
diff --git a/docs/examples.rst b/docs/examples.rst
new file mode 100644
index 0000000..a308ae5
--- /dev/null
+++ b/docs/examples.rst
@@ -0,0 +1,104 @@
+***************
+Getting Started
+***************
+
+For a very basic setup, you can configure a ``docker-compose.yml`` file to use
+with `docker <https://www.docker.com/products/overview#install_the_platform>`__,
+which is especially helpful when you are testing out BioMAJ.
+
+Docker
+======
+
+.. literalinclude:: docker-compose.yml
+ :language: yaml
+ :linenos:
+
+This configuration file defines a simple MongoDB instance which is used for
+backend storage by BioMAJ, as well as the BioMAJ instance itself. Line 8
+denotes that a folder named ``data`` in the current directory will be mounted
+into the volume as storage. Any files downloaded by BioMAJ will appear in this
+directory.
+
+Running the ``--help`` command can be done easily:
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --help
+
+
+Simple Configuration
+====================
+
+Once you've reached this point, you're ready to start configuring BioMAJ to
+download datasets for you. Configuration files should go inside a folder
+``conf`` inside the ``data`` folder in your current directory. As an example,
+we will use this simple ALU configuration file:
+
+.. literalinclude:: alu.properties
+ :language: text
+ :linenos:
+
+The file can be broken down into a couple of sections:
+
+- Metadata (lines 1-15)
+- Remote Source (17-24)
+- Release Information (26-30)
+- Other
+
+The metadata consists of things like where data should be stored and how
+to name it. The remote source describes where data is to be fetched from;
+release information will be covered in another example; and then there are
+a few extra, miscellaneous options shown in this example config.
+
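+For instance, to mirror a different dataset you would mostly adjust the
+Remote Source keys. A minimal, hypothetical sketch (the server, directory
+and regular expression below are made-up values, not a real bank definition):
+
+.. code:: ini
+
+   protocol=ftp
+   server=ftp.example.org
+   remote.dir=/pub/some/dataset/
+   # Only files matching this regular expression are downloaded
+   remote.files=^.*\.gz$
+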
+If you have copied the ``alu.properties`` file into ``./data/conf/alu.properties``, you are ready to download this database:
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --bank alu --update
+ 2016-08-24 21:43:15,276 INFO [root][MainThread] Log file: /var/lib/biomaj/log/alu/1472074995.28/alu.log
+ Log file: /var/lib/biomaj/log/alu/1472074995.28/alu.log
+ ...
+
+This command should complete successfully, and you will have some more files in ``./data/``:
+
+.. code:: console
+
+ $ find data
+ data/conf/alu.properties
+ data/data/ncbi/blast/alu/alu-2003-11-26/flat/alu.a
+ data/data/ncbi/blast/alu/alu-2003-11-26/flat/alu.n
+ data/cache/files_1472074995.29
+ data/log/alu/1472074995.28/alu.log
+
+The ``data/data`` directories contain your downloaded files. Additionally,
+a cache file exists, and a job run log contains data about what occurred
+during the download and processing. Note that the files that appear are
+``alu.a`` and ``alu.n``, instead of ``alu.a.gz`` and ``alu.n.gz``. Because
+the option ``no.extract=true`` is commented out on line 33, BioMAJ
+automatically extracted the data for us.
+
+The ``--status`` command will allow you to see the status of various databases you have downloaded.
+
+.. code:: console
+
+ $ docker-compose run --rm biomaj --bank alu --status
+ +--------+-----------------+----------------------+---------------------+
+ | Name | Type(s) | Last update status | Published release |
+ |--------+-----------------+----------------------+---------------------|
+ | alu | nucleic_protein | 2016-08-24 21:58:14 | 2003-11-26 |
+ +--------+-----------------+----------------------+---------------------+
+ +---------------------+------------------+------------+----------------------------------------------------+----------+
+ | Session | Remote release | Release | Directory | Freeze |
+ |---------------------+------------------+------------+----------------------------------------------------+----------|
+ | 2016-08-24 21:58:14 | 2003-11-26 | 2003-11-26 | /var/lib/biomaj/data/ncbi/blast/alu/alu-2003-11-26 | no |
+ +---------------------+------------------+------------+----------------------------------------------------+----------+
+
+
+Advanced Configuration
+======================
+
+Once you have this sort of simple configuration working, you may wish to
+explore more advanced configurations. There is a `public repository
+<https://github.com/genouest/biomaj-data/>`__ of BioMAJ configurations which
+will be interesting to the advanced user wishing to learn more about what can
+be done with BioMAJ.
diff --git a/docs/global.advanced.properties b/docs/global.advanced.properties
new file mode 100644
index 0000000..9d87c96
--- /dev/null
+++ b/docs/global.advanced.properties
@@ -0,0 +1,143 @@
+[GENERAL]
+root.dir=/var/lib/biomaj
+conf.dir=%(root.dir)s/conf
+log.dir=%(root.dir)s/log
+process.dir=%(root.dir)s/process
+cache.dir=%(root.dir)s/cache
+lock.dir=%(root.dir)s/lock
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=%(root.dir)s/data
+
+db.url=mongodb://biomaj-mongodb:27017
+db.name=biomaj
+
+use_ldap=0
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+use_elastic=1
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=elasticsearch
+elastic_index=biomaj
+# Calculate data.dir size stats
+data.stats=1
+
+celery.queue=biomaj
+celery.broker=mongodb://biomaj-mongodb:27017/biomaj_celery
+
+
+auto_publish=1
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+#mail.stmp.host=
+mail.admin=
+mail.from=biomaj@localhost
+mail.user=
+mail.password=
+mail.tls=
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#---------------------
+# PROTOCOL
+#-------------------
+#possible values : ftp, http, rsync, local
+port=21
+username=anonymous
+password=anonymous@nowhere.com
+
+
+
+#Access rights (chmod) applied to production directories
+production.directory.chmod=775
+
+#Number of threads used during the download
+bank.num.threads=4
+
+#Number of threads to use for downloading and processing
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#Link copy property
+do.link.copy=true
+
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=INFO
+
+http.parse.dir.line=<a[\\s]+href=\"([\\S]+)/\".*alt=\"\\[DIR\\]\">.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})
+http.parse.file.line=<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+#Needed if data sources are contained in an archive
+log.files=true
+
+local.files.excluded=\\.panfs.*
+
+#~40mn
+ftp.timeout=2000000
+ftp.automatic.reconnect=5
+ftp.active.mode=false
+
+# Bank default access
+visibility.default=public
+
+#proxy=http://localhost:3128
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = INFO
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..eff50b5
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,36 @@
+.. BioMAJ documentation master file, created by
+ sphinx-quickstart on Mon Oct 27 08:26:18 2014.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to BioMAJ's documentation!
+==================================
+
+Getting Started Documentation:
+
+.. toctree::
+ :maxdepth: 2
+
+ examples
+ admin
+
+API Documentation:
+
+.. toctree::
+ :maxdepth: 2
+
+ bank
+ options
+ session
+ workflow
+ notify
+ metaprocess
+ processfactory
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..e05270a
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,242 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. xml to make Docutils-native XML files
+ echo. pseudoxml to make pseudoxml-XML files for display purposes
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\BioMAJ.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\BioMAJ.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdf" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf
+ cd %BUILDDIR%/..
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdfja" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf-ja
+ cd %BUILDDIR%/..
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+if "%1" == "xml" (
+ %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The XML files are in %BUILDDIR%/xml.
+ goto end
+)
+
+if "%1" == "pseudoxml" (
+ %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+ goto end
+)
+
+:end
diff --git a/docs/metaprocess.rst b/docs/metaprocess.rst
new file mode 100644
index 0000000..2c5be97
--- /dev/null
+++ b/docs/metaprocess.rst
@@ -0,0 +1,15 @@
+.. _metaprocess:
+
+
+***********
+metaprocess
+***********
+
+
+MetaProcess API reference
+=========================
+ .. automodule:: biomaj.process.metaprocess
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/notify.rst b/docs/notify.rst
new file mode 100644
index 0000000..926f2b2
--- /dev/null
+++ b/docs/notify.rst
@@ -0,0 +1,15 @@
+.. _notify:
+
+
+******
+notify
+******
+
+
+Notify API reference
+====================
+ .. automodule:: biomaj.notify
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/options.rst b/docs/options.rst
new file mode 100644
index 0000000..9e91975
--- /dev/null
+++ b/docs/options.rst
@@ -0,0 +1,15 @@
+.. _options:
+
+
+*******
+options
+*******
+
+
+Options API reference
+=====================
+ .. automodule:: biomaj.options
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/processfactory.rst b/docs/processfactory.rst
new file mode 100644
index 0000000..28cd4a8
--- /dev/null
+++ b/docs/processfactory.rst
@@ -0,0 +1,15 @@
+.. _processfactory:
+
+
+**************
+processfactory
+**************
+
+
+ProcessFactory API reference
+============================
+ .. automodule:: biomaj.process.processfactory
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..20d8a86
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,14 @@
+mock
+nose
+pymongo==3.2
+tabulate
+ldap3
+py-bcrypt
+drmaa
+future
+elasticsearch
+biomaj_core
+biomaj_user
+biomaj_process
+biomaj_cli
+
diff --git a/docs/session.rst b/docs/session.rst
new file mode 100644
index 0000000..85c4a1e
--- /dev/null
+++ b/docs/session.rst
@@ -0,0 +1,15 @@
+.. _session:
+
+
+*******
+Session
+*******
+
+
+Session API reference
+=====================
+ .. automodule:: biomaj.session
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/docs/workflow.rst b/docs/workflow.rst
new file mode 100644
index 0000000..75d8fe5
--- /dev/null
+++ b/docs/workflow.rst
@@ -0,0 +1,15 @@
+.. _workflow:
+
+
+********
+workflow
+********
+
+
+Workflows API reference
+=======================
+ .. automodule:: biomaj.workflow
+ :members:
+ :private-members:
+ :special-members:
+
diff --git a/global.properties.example b/global.properties.example
new file mode 100755
index 0000000..04bb1a3
--- /dev/null
+++ b/global.properties.example
@@ -0,0 +1,162 @@
+[GENERAL]
+root.dir=/var/lib/biomaj
+conf.dir=%(root.dir)s/conf
+log.dir=%(root.dir)s/log
+process.dir=%(root.dir)s/process
+cache.dir=%(root.dir)s/cache
+lock.dir=%(root.dir)s/lock
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=%(root.dir)s/data
+
+
+db.url=mongodb://localhost:27017
+db.name=biomaj
+
+use_ldap=0
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+use_elastic=0
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj
+# Calculate data.dir size stats
+data.stats=1
+
+redis.host=localhost
+redis.port=6379
+redis.db=0
+redis.prefix=biomaj
+
+
+# Influxdb configuration (optional)
+# User and db must be manually created in influxdb before use
+influxdb.host=
+influxdb.port=8086
+influxdb.user=root
+influxdb.password=root
+influxdb.db=biomaj
+
+# Needed for the remote download service (should be a load balancer to the services)
+#micro.biomaj.proxy=http://127.0.0.1:5000
+#micro.biomaj.rabbit_mq=127.0.0.1
+#micro.biomaj.rabbit_mq_port=5672
+#micro.biomaj.rabbit_mq_user=
+#micro.biomaj.rabbit_mq_password=
+#micro.biomaj.rabbit_mq_virtualhost=
+#micro.biomaj.service.download=1
+#micro.biomaj.service.process=1
+#micro.biomaj.service.user=1
+#micro.biomaj.service.daemon=1
+
+auto_publish=1
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+mail.admin=
+mail.from=
+mail.user=
+mail.password=
+mail.tls=true
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#---------------------
+# PROTOCOL
+#-------------------
+#possible values : ftp, http, rsync, local
+port=21
+username=anonymous
+password=anonymous at nowhere.com
+
+#Access rights for production directories
+production.directory.chmod=775
+
+#Number of threads during the download
+bank.num.threads=4
+
+#Number of threads to use for downloading and processing
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#Link copy property
+do.link.copy=true
+
+
+#The historic log file is generated in log/
+#define the level of information for output: DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+# Hint: you can use online service https://regex101.com/ to test your regexps
+http.parse.dir.line=<a[\\s]+href=\"([\\S]+)/\".*alt=\"\\[DIR\\]\">.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})
+http.parse.file.line=<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
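+
+# As an illustration, an Apache-style listing line such as
+#   <a href="db2017/" alt="[DIR]">db2017</a> 01-Jan-2017 12:00
+# is matched by http.parse.dir.line above, with group 1 (db2017) taken as the
+# directory name and group 2 (01-Jan-2017 12:00) as the date, following the
+# http.group.* mapping.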
+
+#Needed if data sources are contained in an archive
+log.files=true
+
+local.files.excluded=\\.panfs.*
+
+#~40mn
+ftp.timeout=2000000
+ftp.automatic.reconnect=5
+ftp.active.mode=false
+
+# Bank default access
+visibility.default=public
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = DEBUG
+handlers = console
+
+[logger_biomaj]
+level = DEBUG
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c403a03
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,17 @@
+biomaj_core
+biomaj_user
+biomaj_download>=3.0.10
+biomaj_process
+biomaj_cli
+mock
+nose
+pymongo>=3.2
+pycurl
+tabulate
+py-bcrypt
+drmaa
+future
+elasticsearch
+requests
+redis
+influxdb
diff --git a/scripts/biomaj_add_property.py b/scripts/biomaj_add_property.py
new file mode 100644
index 0000000..5605c7b
--- /dev/null
+++ b/scripts/biomaj_add_property.py
@@ -0,0 +1,30 @@
+from biomaj.schema_version import SchemaVersion
+import argparse
+import logging
+import sys
+
+
+desc = "Add or update a property to bank properties"
+epilog = "Author: Emmanuel Quevillon (tuco at pasteur.fr)"
+parser = argparse.ArgumentParser(description=desc, epilog=epilog)
+parser.add_argument('-b', '--bank', action="store", dest="bank", default=None,
+ help="Bank name to update")
+parser.add_argument('-c', '--cfgkey', action="store", dest="cfg", default=None,
+ help="Bank configuration key to retrieve prop value")
+parser.add_argument('-p', '--property', action="store", dest="prop",
+ required=True, help="Property name")
+parser.add_argument('-v', '--value', action="store", dest="value",
+ help="Property value")
+args = parser.parse_args()
+if len(sys.argv) == 1:
+ parser.print_help()
+ sys.exit(0)
+if args.value and args.cfg:
+ logging.error("-v and -c are not compatible")
+ sys.exit(1)
+logging.warning("Needs global.properties in local directory or env variable BIOMAJ_CONF")
+SchemaVersion.add_property(bank=args.bank, prop=args.prop, value=args.value,
+ cfg=args.cfg)
+logging.info("Insertion done")
+sys.exit(0)
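+# Example invocation (bank and property names are illustrative):
+#   python biomaj_add_property.py -b alu -p desc -c db.fullname
+# copies the value of the bank configuration key db.fullname into the 'desc' property.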
+
diff --git a/scripts/biomaj_migrate_database.py b/scripts/biomaj_migrate_database.py
new file mode 100644
index 0000000..37ca5ed
--- /dev/null
+++ b/scripts/biomaj_migrate_database.py
@@ -0,0 +1,7 @@
+from biomaj.schema_version import SchemaVersion
+import logging
+
+logging.warning('Migrate BioMAJ database...')
+logging.warning('Needs global.properties in local directory or env variable BIOMAJ_CONF')
+SchemaVersion.migrate_pendings()
+logging.warning('Migration done')
diff --git a/scripts/influxdb_import.py b/scripts/influxdb_import.py
new file mode 100644
index 0000000..9792cdb
--- /dev/null
+++ b/scripts/influxdb_import.py
@@ -0,0 +1,90 @@
+'''
+Import BioMAJ bank statistics into InfluxDB if this has not been done before.
+'''
+from influxdb import InfluxDBClient
+from biomaj.bank import Bank
+from biomaj_core.config import BiomajConfig
+import sys
+
+if len(sys.argv) != 2:
+ print('Usage: influxdb_import.py path_to_global.properties')
+ sys.exit(1)
+
+BiomajConfig.load_config(config_file=sys.argv[1])
+
+influxdb = None
+try:
+ host = BiomajConfig.global_config.get('GENERAL', 'influxdb.host')
+ user = BiomajConfig.global_config.get('GENERAL', 'influxdb.user')
+ password = BiomajConfig.global_config.get('GENERAL', 'influxdb.password')
+ port = BiomajConfig.global_config.get('GENERAL', 'influxdb.port')
+ database = BiomajConfig.global_config.get('GENERAL', 'influxdb.db')
+ influxdb = InfluxDBClient(host=host, database=database, port=port, username=user, password=password)
+except Exception as e:
+ print('Failed to connect to influxdb, check configuration in global.properties: ' + str(e))
+ sys.exit(1)
+
+res = influxdb.query('select last("value") from "biomaj.banks.quantity"')
+if res:
+ print('Found existing data in InfluxDB, updating info...')
+
+banks = Bank.list()
+nb_banks = 0
+metrics = []
+for bank in banks:
+ productions = bank['production']
+ total_size = 0
+ latest_size = 0
+ if not productions:
+ continue
+ nb_banks += 1
+ latest_size = productions[len(productions) - 1]['size']
+ if not latest_size:
+ latest_size = 0
+ for production in productions:
+ if 'size' in production and production['size']:
+ total_size += production['size']
+
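+ # Note: 'production' below still refers to the last item of the loop above
+ # (the most recent production); its session id is used as the metric timestamp.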
+ influx_metric = {
+ "measurement": 'biomaj.production.size.total',
+ "fields": {
+ "value": float(total_size)
+ },
+ "tags": {
+ "bank": bank['name']
+ },
+ "time": int(production['session'])
+ }
+ metrics.append(influx_metric)
+ influx_metric = {
+ "measurement": 'biomaj.production.size.latest',
+ "fields": {
+ "value": float(latest_size)
+ },
+ "tags": {
+ "bank": bank['name']
+ },
+ "time": int(production['session'])
+ }
+ metrics.append(influx_metric)
+ influx_metric = {
+ "measurement": 'biomaj.bank.update.new',
+ "fields": {
+ "value": 1
+ },
+ "tags": {
+ "bank": bank['name']
+ },
+ "time": int(production['session'])
+ }
+ metrics.append(influx_metric)
+
+influx_metric = {
+ "measurement": 'biomaj.banks.quantity',
+ "fields": {
+ "value": nb_banks
+ }
+}
+metrics.append(influx_metric)
+
+influxdb.write_points(metrics, time_precision="s")
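+
+# The imported series can then be checked from the influx CLI, for example with:
+#   select * from "biomaj.banks.quantity"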
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..40f6a1d
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,86 @@
+try:
+ from setuptools import setup, find_packages
+except ImportError:
+ from distutils.core import setup
+
+from distutils.command.install import install
+import os
+
+
+class post_install(install):
+ def run(self):
+ install.run(self)
+ from biomaj.schema_version import SchemaVersion
+ SchemaVersion.migrate_pendings()
+ SchemaVersion.add_property(prop='desc', cfg='db.fullname')
+ SchemaVersion.set_version()
+
+here = os.path.abspath(os.path.dirname(__file__))
+try:
+ with open(os.path.join(here, 'README.md')) as f:
+ README = f.read()
+ with open(os.path.join(here, 'CHANGES.txt')) as f:
+ CHANGES = f.read()
+except UnicodeDecodeError:
+ with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+ README = f.read()
+ with open(os.path.join(here, 'CHANGES.txt'), encoding='utf-8') as f:
+ CHANGES = f.read()
+
+
+config = {
+ 'description': 'BioMAJ',
+ 'long_description': README + '\n\n' + CHANGES,
+ 'author': 'Olivier Sallou',
+ 'url': 'http://biomaj.genouest.org',
+ 'download_url': 'http://biomaj.genouest.org',
+ 'author_email': 'olivier.sallou at irisa.fr',
+ 'version': '3.1.3',
+ 'classifiers': [
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Natural Language :: English',
+ 'Operating System :: POSIX :: Linux',
+ # Indicate who your project is intended for
+ 'Intended Audience :: Science/Research',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ # Pick your license as you wish (should match "license" above)
+ 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
+ # Specify the Python versions you support here. In particular, ensure
+ # that you indicate whether you support Python 2, Python 3 or both.
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.4'
+ ],
+ 'install_requires': [
+ 'biomaj_cli',
+ 'biomaj_core',
+ 'biomaj_user',
+ 'biomaj_download',
+ 'biomaj_process',
+ 'pymongo>=3.2',
+ 'pycurl',
+ 'py-bcrypt',
+ 'drmaa',
+ 'future',
+ 'tabulate',
+ 'requests',
+ 'redis',
+ 'elasticsearch',
+ 'influxdb'
+ ],
+ 'tests_require': ['nose', 'mock'],
+ 'test_suite': 'nose.collector',
+ 'packages': find_packages(),
+ 'include_package_data': True,
+ 'scripts': ['scripts/biomaj_migrate_database.py'],
+ 'name': 'biomaj',
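+ # Uncommenting 'cmdclass' below would run post_install at install time
+ # (schema migration, 'desc' property injection and version stamping).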
+ #'cmdclass': {'install': post_install},
+}
+
+setup(**config)
diff --git a/tests/alu.properties b/tests/alu.properties
new file mode 100644
index 0000000..0e729e0
--- /dev/null
+++ b/tests/alu.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+db.name=alu
+db.type=nucleic_protein
+
+offline.dir.name=offline/ncbi/blast/alu_tmp
+dir.version=ncbi/blast/alu
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=ftp
+server=ftp.ncbi.nih.gov
+remote.dir=/blast/db/FASTA/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^alu.*\.gz$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/bank/process/test.sh b/tests/bank/process/test.sh
new file mode 100755
index 0000000..2d510e2
--- /dev/null
+++ b/tests/bank/process/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo "Testing a process"
+
+echo "test meta data"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr1#blast/chr1/chr1db"
+echo "##BIOMAJ#blast#nucleic#organism:hg19,chr:chr2#blast/chr2/chr2db"
+
+echo "test meta data 2"
+
+echo "##BIOMAJ#fasta#nucleic#organism:hg19#fasta/chr1.fa,fasta/chr2.fa"
diff --git a/tests/bank/test.fasta.gz b/tests/bank/test.fasta.gz
new file mode 100644
index 0000000..666d6f2
Binary files /dev/null and b/tests/bank/test.fasta.gz differ
diff --git a/tests/bank/test2.fasta b/tests/bank/test2.fasta
new file mode 100644
index 0000000..410ca0f
--- /dev/null
+++ b/tests/bank/test2.fasta
@@ -0,0 +1,2 @@
+>test2
+gcgcgcgcgcgcgcgccgcgcgcgcgcgcgcggc
diff --git a/tests/bank/test_100.txt b/tests/bank/test_100.txt
new file mode 100644
index 0000000..c7f7c3b
--- /dev/null
+++ b/tests/bank/test_100.txt
@@ -0,0 +1 @@
+This is a sample file to extract Release 103 from a text file
diff --git a/tests/biomaj_tests.py b/tests/biomaj_tests.py
new file mode 100644
index 0000000..f599487
--- /dev/null
+++ b/tests/biomaj_tests.py
@@ -0,0 +1,807 @@
+from nose.tools import *
+from nose.plugins.attrib import attr
+
+import json
+import shutil
+import os
+import tempfile
+import logging
+import copy
+import stat
+import time
+
+from mock import patch
+
+from optparse import OptionParser
+
+
+from biomaj.bank import Bank
+from biomaj.session import Session
+from biomaj.workflow import Workflow
+from biomaj.workflow import UpdateWorkflow
+from biomaj.workflow import ReleaseCheckWorkflow
+from biomaj_core.utils import Utils
+from biomaj_download.download.ftp import FTPDownload
+from biomaj_download.download.direct import DirectFTPDownload
+from biomaj_download.download.direct import DirectHttpDownload
+from biomaj_download.download.http import HTTPDownload
+from biomaj_download.download.localcopy import LocalDownload
+from biomaj_download.download.downloadthreads import DownloadThread
+from biomaj_core.config import BiomajConfig
+from biomaj.process.processfactory import PostProcessFactory
+from biomaj.process.processfactory import PreProcessFactory
+from biomaj.process.processfactory import RemoveProcessFactory
+from biomaj_user.user import BmajUser
+from biomaj_core.bmajindex import BmajIndex
+
+import unittest
+
+class UtilsForTest():
+ """
+ Copy properties files to a temp directory and update properties to
+ use a temp directory
+ """
+
+ def __init__(self):
+ """
+ Setup the temp dirs and files.
+ """
+ self.global_properties = None
+ self.bank_properties = None
+
+ self.test_dir = tempfile.mkdtemp('biomaj')
+
+ self.conf_dir =os.path.join(self.test_dir,'conf')
+ if not os.path.exists(self.conf_dir):
+ os.makedirs(self.conf_dir)
+ self.data_dir =os.path.join(self.test_dir,'data')
+ if not os.path.exists(self.data_dir):
+ os.makedirs(self.data_dir)
+ self.log_dir =os.path.join(self.test_dir,'log')
+ if not os.path.exists(self.log_dir):
+ os.makedirs(self.log_dir)
+ self.process_dir =os.path.join(self.test_dir,'process')
+ if not os.path.exists(self.process_dir):
+ os.makedirs(self.process_dir)
+ self.lock_dir =os.path.join(self.test_dir,'lock')
+ if not os.path.exists(self.lock_dir):
+ os.makedirs(self.lock_dir)
+ self.cache_dir =os.path.join(self.test_dir,'cache')
+ if not os.path.exists(self.cache_dir):
+ os.makedirs(self.cache_dir)
+
+
+ if self.global_properties is None:
+ self.__copy_global_properties()
+
+ if self.bank_properties is None:
+ self.__copy_test_bank_properties()
+
+ def clean(self):
+ """
+ Deletes temp directory
+ """
+ shutil.rmtree(self.test_dir)
+
+ def __copy_test_bank_properties(self):
+ if self.bank_properties is not None:
+ return
+ self.bank_properties = ['alu', 'local', 'testhttp','directhttp']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ for b in self.bank_properties:
+ from_file = os.path.join(curdir, b+'.properties')
+ to_file = os.path.join(self.conf_dir, b+'.properties')
+ shutil.copyfile(from_file, to_file)
+
+ self.bank_process = ['test.sh']
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ procdir = os.path.join(curdir, 'bank/process')
+ for proc in self.bank_process:
+ from_file = os.path.join(procdir, proc)
+ to_file = os.path.join(self.process_dir, proc)
+ shutil.copyfile(from_file, to_file)
+ os.chmod(to_file, stat.S_IRWXU)
+
+ # Manage local bank test, use bank test subdir as remote
+ properties = ['multi.properties', 'computederror.properties', 'error.properties', 'local.properties', 'localprocess.properties', 'testhttp.properties', 'computed.properties', 'computed2.properties', 'sub1.properties', 'sub2.properties']
+ for prop in properties:
+ from_file = os.path.join(curdir, prop)
+ to_file = os.path.join(self.conf_dir, prop)
+ fout = open(to_file,'w')
+ with open(from_file,'r') as fin:
+ for line in fin:
+ if line.startswith('remote.dir'):
+ fout.write("remote.dir="+os.path.join(curdir,'bank')+"\n")
+ elif line.startswith('remote.files'):
+ fout.write(line.replace('/tmp', os.path.join(curdir,'bank')))
+ else:
+ fout.write(line)
+ fout.close()
+
+ def __copy_global_properties(self):
+ if self.global_properties is not None:
+ return
+ self.global_properties = os.path.join(self.conf_dir,'global.properties')
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ global_template = os.path.join(curdir,'global.properties')
+ fout = open(self.global_properties,'w')
+ with open(global_template,'r') as fin:
+ for line in fin:
+ if line.startswith('conf.dir'):
+ fout.write("conf.dir="+self.conf_dir+"\n")
+ elif line.startswith('log.dir'):
+ fout.write("log.dir="+self.log_dir+"\n")
+ elif line.startswith('data.dir'):
+ fout.write("data.dir="+self.data_dir+"\n")
+ elif line.startswith('process.dir'):
+ fout.write("process.dir="+self.process_dir+"\n")
+ elif line.startswith('lock.dir'):
+ fout.write("lock.dir="+self.lock_dir+"\n")
+ else:
+ fout.write(line)
+ fout.close()
+
+
+class TestBiomajSetup(unittest.TestCase):
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ # Delete all banks
+ b = Bank('alu')
+ b.banks.remove({})
+
+ self.config = BiomajConfig('alu')
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'alu.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+
+ def tearDown(self):
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'alu.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+ self.utils.clean()
+
+ def test_new_bank(self):
+ """
+ Checks bank init
+ """
+ b = Bank('alu')
+
+ def test_new_session(self):
+ """
+ Checks an empty session is created
+ """
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ for key in b.session._session['status'].keys():
+ self.assertFalse(b.session.get_status(key))
+
+ def test_session_reload_notover(self):
+ """
+ Checks a session is used if present
+ """
+ b = Bank('alu')
+ for i in range(1, 5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ self.assertTrue(b.session.get_status(Workflow.FLOW_INIT))
+
+ def test_clean_old_sessions(self):
+ """
+ Checks old sessions are cleaned up
+ """
+ b = Bank('local')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+ b2 = Bank('local')
+ b2.update()
+ b2.clean_old_sessions()
+ self.assertTrue(len(b2.bank['sessions']) == 1)
+
+ def test_session_reload_over(self):
+ """
+ Checks a new session is created if the previous one is over
+ """
+ b = Bank('alu')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ s._session['status'][Workflow.FLOW_OVER] = True
+ b.session = s
+ b.save_session()
+
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ self.assertFalse(b.session.get_status(Workflow.FLOW_INIT))
+
+ def test_bank_list(self):
+ b1 = Bank('alu')
+ b2 = Bank('local')
+ banks = Bank.list()
+ self.assertTrue(len(banks) == 2)
+
+ @attr('test')
+ @attr('network')
+ def test_get_release(self):
+ """
+ Get release
+ """
+ b = Bank('alu')
+ b.load_session(UpdateWorkflow.FLOW)
+ res = b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(res)
+ self.assertTrue(b.session._session['release'] is not None)
+
+ def test_remove_session(self):
+ b = Bank('alu')
+ for i in range(1,5):
+ s = Session('alu', self.config, UpdateWorkflow.FLOW)
+ s._session['status'][Workflow.FLOW_INIT] = True
+ b.session = s
+ b.save_session()
+ self.assertTrue(len(b.bank['sessions'])==4)
+ b.remove_session(b.session.get('id'))
+ self.assertTrue(len(b.bank['sessions'])==3)
+
+ @attr('process')
+ def test_postprocesses_setup(self):
+ b = Bank('localprocess')
+ pfactory = PostProcessFactory(b)
+ pfactory.run(True)
+ self.assertTrue(len(pfactory.threads_tasks[0])==2)
+ self.assertTrue(len(pfactory.threads_tasks[1])==1)
+
+ @attr('process')
+ def test_postprocesses_exec_again(self):
+ """
+ Execute once, set a status to false, check that False processes are executed
+ """
+ b = Bank('localprocess')
+ pfactory = PostProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.blocks['BLOCK1']['META0']['PROC0'])
+ self.assertTrue(pfactory.blocks['BLOCK2']['META1']['PROC1'])
+ self.assertTrue(pfactory.blocks['BLOCK2']['META1']['PROC2'])
+ blocks = copy.deepcopy(pfactory.blocks)
+ blocks['BLOCK2']['META1']['PROC2'] = False
+ pfactory2 = PostProcessFactory(b, blocks)
+ pfactory2.run()
+ self.assertTrue(pfactory2.blocks['BLOCK2']['META1']['PROC2'])
+
+ @attr('process')
+ def test_preprocesses(self):
+ b = Bank('localprocess')
+ pfactory = PreProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.meta_status['META0']['PROC0'])
+
+ @attr('process')
+ def test_removeprocesses(self):
+ b = Bank('localprocess')
+ pfactory = RemoveProcessFactory(b)
+ pfactory.run()
+ self.assertTrue(pfactory.meta_status['META0']['PROC0'])
+
+ def test_dependencies_list(self):
+ b = Bank('computed')
+ deps = b.get_dependencies()
+ self.assertTrue(len(deps)==2)
+
+class TestBiomajFunctional(unittest.TestCase):
+
+ def setUp(self):
+ self.utils = UtilsForTest()
+ curdir = os.path.dirname(os.path.realpath(__file__))
+ BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
+
+ #Delete all banks
+ b = Bank('local')
+ b.banks.remove({})
+
+ self.config = BiomajConfig('local')
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+
+ def tearDown(self):
+ data_dir = self.config.get('data.dir')
+ lock_file = os.path.join(data_dir,'local.lock')
+ if os.path.exists(lock_file):
+ os.remove(lock_file)
+ self.utils.clean()
+
+ def test_extract_release_from_file_name(self):
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_(\d+)\.txt')
+ b.session.config.set('release.regexp', '')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100')
+
+ def test_remoterelease_check(self):
+ b = Bank('local')
+ b.load_session(ReleaseCheckWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_(\d+)\.txt')
+ b.session.config.set('release.regexp', '')
+ workflow = ReleaseCheckWorkflow(b)
+ res = workflow.start()
+ remoterelease = b.session.get('remoterelease')
+ self.assertTrue(remoterelease == '100')
+
+ def test_extract_release_from_file_content(self):
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_100\.txt')
+ b.session.config.set('release.regexp', 'Release\s*(\d+)')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '103')
+
+ def test_publish(self):
+ """
+ Update a bank, then publish it
+ """
+ b = Bank('local')
+ b.update()
+ current_link = os.path.join(b.config.get('data.dir'),
+ b.config.get('dir.version'),
+ 'current')
+ self.assertFalse(os.path.exists(current_link))
+ self.assertTrue(b.bank['current'] is None)
+ b.publish()
+ self.assertTrue(os.path.exists(current_link))
+ self.assertTrue(b.bank['current'] == b.session._session['id'])
+
+ # Should test this on local downloader, changing 1 file to force update,
+ # else we would get same bank and there would be no update
+ def test_no_update(self):
+ """
+ Try updating twice, at second time, bank should not be updated
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.update()
+ self.assertFalse(b.session.get('update'))
+ self.assertFalse(b.session.get_status(Workflow.FLOW_POSTPROCESS))
+
+ @attr('remotelist')
+ def test_download_from_list(self):
+ """
+ Use remote.list to define a list of files to download
+ """
+ b = Bank('local')
+ fd, file_path = tempfile.mkstemp()
+ try:
+ b.config.set('remote.list', file_path)
+ with os.fdopen(fd, 'w') as tmp:
+ tmp.write('[{"name": "test_100.txt", "root": "' + b.config.get('remote.dir') + '"}]')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ finally:
+ #os.remove(file_path)
+ print(file_path)
+
+ @attr('release')
+ def test_release_control(self):
+ """
+ Try updating twice, at second time, modify one file (same date),
+ bank should update
+ """
+ b = Bank('local')
+ b.update()
+ b.session.config.set('keep.old.version', '3')
+ self.assertTrue(b.session.get('update'))
+ remote_file = b.session.config.get('remote.dir') + 'test2.fasta'
+ os.utime(remote_file, None)
+ # Update test2.fasta and set release.control
+ b.session.config.set('release.control', 'true')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.update()
+ self.assertFalse(b.session.get('update'))
+ b.session.config.set('copy.skip', '1')
+ b.session.config.set('remote.files', '^test2.fasta')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+
+ def test_fromscratch_update(self):
+ """
+ Try updating twice, at second time, bank should be updated (force with fromscratch)
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ sess = b.session.get('release')
+ b.options.fromscratch = True
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertEqual(b.session.get('release'), sess+'__1')
+
+
+ def test_fromscratch_update_with_release(self):
+ """
+ Try updating twice, at second time, bank should be updated (force with fromscratch)
+
+ Use case with release defined in release file
+ """
+ b = Bank('local')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('release.file', 'test_(\d+)\.txt')
+ b.session.config.set('release.regexp', '')
+ w = UpdateWorkflow(b)
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100')
+ os.makedirs(b.session.get_full_release_directory())
+ w = UpdateWorkflow(b)
+ # Reset release
+ b.session.set('release', None)
+ w.options.fromscratch = True
+ w.wf_release()
+ self.assertTrue(b.session.get('release') == '100__1')
+
+
+ def test_mix_stop_from_task(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+ def test_mix_stop_from_task2(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ b2.options.from_task = 'download'
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+ def test_mix_stop_from_task3(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_after = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ self.assertTrue(b2.session.get('release') == rel+'__1')
+ b3 = Bank('local')
+ res = b3.update()
+ b2.options.from_task = 'postprocess'
+ self.assertTrue(b3.session.get('release') == rel+'__1')
+ self.assertTrue(res)
+
+ def test_mix_stop_from_task4(self):
+ """
+ Get a first release, then fromscratch --stop-after, then restart from-task
+ """
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b2 = Bank('local')
+ b2.options.stop_before = 'download'
+ b2.options.fromscratch = True
+ res = b2.update()
+ b3 = Bank('local')
+ b3.options.from_task = 'postprocess'
+ res = b3.update()
+ self.assertFalse(res)
+
+ def test_delete_old_dirs(self):
+ """
+ Try updating 3 times, oldest dir should be removed
+ """
+ b = Bank('local')
+ b.removeAll(True)
+ b = Bank('local')
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ b.options.fromscratch = True
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(len(b.bank['production']) == 2)
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ # one new dir, but older ones must be deleted
+ self.assertTrue(len(b.bank['production']) == 2)
+
+ def test_delete_old_dirs_with_freeze(self):
+ """
+ Try updating 3 times, oldest dir should be removed but not frozen releases
+ """
+ b = Bank('local')
+ b.removeAll(True)
+ b = Bank('local')
+ b.update()
+ b.freeze(b.session.get('release'))
+ self.assertTrue(b.session.get('update'))
+ b.options.fromscratch = True
+ b.update()
+ b.freeze(b.session.get('release'))
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(len(b.bank['production']) == 2)
+ b.update()
+ self.assertTrue(b.session.get('update'))
+ # one new dir, but older ones must be deleted
+ self.assertTrue(len(b.bank['production']) == 3)
+
+ def test_removeAll(self):
+ b = Bank('local')
+ b.update()
+ b.removeAll()
+ self.assertFalse(os.path.exists(b.get_data_dir()))
+ bdb = b.banks.find_one({'name': b.name})
+ self.assertTrue(bdb is None)
+
+ def test_remove(self):
+ """
+ test removal of a production dir
+ """
+ b = Bank('local')
+ b.update()
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()))
+ self.assertTrue(len(b.bank['production'])==1)
+ b.remove(b.session.get('release'))
+ self.assertFalse(os.path.exists(b.session.get_full_release_directory()))
+ b = Bank('local')
+ self.assertTrue(len(b.bank['production'])==0)
+
+ def test_update_stop_after(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertTrue(b.session.get_status('download'))
+ self.assertFalse(b.session.get_status('postprocess'))
+
+ def test_update_stop_before(self):
+ b = Bank('local')
+ b.options.stop_before = 'postprocess'
+ b.update()
+ self.assertTrue(b.session.get_status('download'))
+ self.assertFalse(b.session.get_status('postprocess'))
+
+ def test_reupdate_from_task(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ b2.update()
+ self.assertTrue(b2.session.get_status('postprocess'))
+ self.assertEqual(b.session.get_full_release_directory(), b2.session.get_full_release_directory())
+
+ def test_reupdate_from_task_error(self):
+ b = Bank('local')
+ b.options.stop_after = 'check'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ res = b2.update()
+ self.assertFalse(res)
+
+ def test_reupdate_from_task_wrong_release(self):
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = 'wrongrelease'
+ res = b2.update()
+ self.assertFalse(res)
+
+ @attr('process')
+ def test_postprocesses_restart_from_proc(self):
+ b = Bank('localprocess')
+ b.update()
+ proc1file = os.path.join(b.session.get_full_release_directory(),'proc1.txt')
+ proc2file = os.path.join(b.session.get_full_release_directory(),'proc2.txt')
+ self.assertTrue(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+ os.remove(proc1file)
+ os.remove(proc2file)
+ # Restart from postprocess, reexecute all processes
+ b2 = Bank('localprocess')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ b2.update()
+ self.assertTrue(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+ os.remove(proc1file)
+ os.remove(proc2file)
+ # Restart from postprocess, but at process PROC2 and following
+ b3 = Bank('localprocess')
+ b3.options.from_task = 'postprocess'
+ b3.options.process = 'PROC2'
+ b3.options.release = b.session.get('release')
+ b3.update()
+ #self.assertFalse(os.path.exists(proc1file))
+ self.assertTrue(os.path.exists(proc2file))
+
+ @attr('process')
+ def test_postprocess_wrong_process_name(self):
+ """If a wrong process name is given, update returns False and prints an error message"""
+ b = Bank('local')
+ b.options.stop_after = 'download'
+ b.update()
+ self.assertFalse(b.session.get_status('postprocess'))
+ b2 = Bank('local')
+ b2.options.from_task = 'postprocess'
+ b2.options.release = b.session.get('release')
+ b2.options.process = 'fake'
+ self.assertFalse(b2.update())
+ self.assertFalse(b2.session.get_status('postprocess'))
+ self.assertEqual(b.session.get_full_release_directory(), b2.session.get_full_release_directory())
+
+ def test_computed(self):
+ b = Bank('computed')
+ res = b.update(True)
+ self.assertTrue(res)
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/sub1/flat/test_100.txt'))
+ self.assertTrue(b.session.get('update'))
+ # Check that, with depends non updated, bank is not updated itself
+ nextb = Bank('computed')
+ res = nextb.update(True)
+ self.assertFalse(nextb.session.get('update'))
+
+ @attr('nofile')
+ def test_computed_nofile(self):
+ b = Bank('computed2')
+ b.load_session(UpdateWorkflow.FLOW)
+ b.session.config.set('protocol', 'none')
+ b.session.config.set('sub1.files.move', 'flat/test_.*')
+ res = b.update(True)
+ self.assertTrue(res)
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/sub1/flat/test_100.txt'))
+
+ def test_computed_ref_release(self):
+ b = Bank('computed2')
+ res = b.update(True)
+ b2 = Bank('sub1')
+ b2release = b2.bank['production'][len(b2.bank['production'])-1]['release']
+ brelease = b.bank['production'][len(b.bank['production'])-1]['release']
+ self.assertTrue(res)
+ self.assertTrue(brelease == b2release)
+
+ @attr('computed')
+ def test_computed_no_update(self):
+ b = Bank('computed2')
+ res = b.update(True)
+ self.assertTrue(b.session.get('update'))
+ b2 = Bank('computed2')
+ res = b2.update(True)
+ self.assertFalse(b2.session.get('update'))
+
+ def test_computederror(self):
+ b = Bank('computederror')
+ res = b.update(True)
+ self.assertFalse(res)
+ self.assertTrue(b.session._session['depends']['sub2'])
+ self.assertFalse(b.session._session['depends']['error'])
+
+
+ @attr('directrelease')
+ def test_directhttp_release(self):
+ b = Bank('directhttp')
+ res = b.update()
+ self.assertTrue(b.session.get('update'))
+ self.assertTrue(os.path.exists(b.session.get_full_release_directory()+'/flat/debian/README.html'))
+ # print str(b.session.get('release'))
+ # print str(b.session.get('remoterelease'))
+
+ @attr('network')
+ def test_multi(self):
+ b = Bank('multi')
+ res = b.update()
+ with open(os.path.join(b.session.get_full_release_directory(),'flat/test1.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['args']['key1'] == 'value1')
+ with open(os.path.join(b.session.get_full_release_directory(),'flat/test2.json'), 'r') as content_file:
+ content = content_file.read()
+ my_json = json.loads(content)
+ self.assertTrue(my_json['form']['key1'] == 'value1')
+
+ def test_freeze(self):
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ b.freeze(rel)
+ prod = b.get_production(rel)
+ self.assertTrue(prod['freeze'] == True)
+ res = b.remove(rel)
+ self.assertTrue(res == False)
+ b.unfreeze(rel)
+ prod = b.get_production(rel)
+ self.assertTrue(prod['freeze'] == False)
+ res = b.remove(rel)
+ self.assertTrue(res == True)
+
+ def test_stats(self):
+ b = Bank('local')
+ b.update()
+ rel = b.session.get('release')
+ stats = Bank.get_banks_disk_usage()
+ self.assertTrue(stats[0]['size']>0)
+ for release in stats[0]['releases']:
+ if release['name'] == rel:
+ self.assertTrue(release['size']>0)
+
+ @attr('process')
+ def test_processes_meta_data(self):
+ b = Bank('localprocess')
+ b.update()
+ formats = b.session.get('formats')
+ self.assertTrue(len(formats['blast'])==2)
+ self.assertTrue(len(formats['test'][0]['files'])==3)
+
+ @attr('process')
+ def test_search(self):
+ b = Bank('localprocess')
+ b.update()
+ search_res = Bank.search(['blast'],[])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search([],['nucleic'])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search(['blast'],['nucleic'])
+ self.assertTrue(len(search_res)==1)
+ search_res = Bank.search(['blast'],['proteic'])
+ self.assertTrue(len(search_res)==0)
+
+ def test_owner(self):
+ """
+ test ACL with owner
+ """
+ b = Bank('local')
+ res = b.update()
+ self.assertTrue(res)
+ b.set_owner('sample')
+ b2 = Bank('local')
+ try:
+ res = b2.update()
+ self.fail('not owner, should not be allowed')
+ except Exception as e:
+ pass
diff --git a/tests/computed.properties b/tests/computed.properties
new file mode 100644
index 0000000..214baf4
--- /dev/null
+++ b/tests/computed.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+sub1.files.move=flat/test_.*
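+# Files of the sub1 dependency matching this pattern are brought under sub1/ in
+# this bank's release directory (see test_computed in tests/biomaj_tests.py).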
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/computed2.properties b/tests/computed2.properties
new file mode 100644
index 0000000..2768000
--- /dev/null
+++ b/tests/computed2.properties
@@ -0,0 +1,45 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed local system bank test"
+db.name=local0
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local0_tmp
+dir.version=test/local0
+
+depends=sub1
+
+ref.release=sub1
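+# With ref.release set, this bank reuses the release number of its sub1
+# dependency (see test_computed_ref_release in tests/biomaj_tests.py).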
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/computederror.properties b/tests/computederror.properties
new file mode 100644
index 0000000..ce4bae1
--- /dev/null
+++ b/tests/computederror.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="computed error local system bank test"
+db.name=computederror
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/computederror_tmp
+dir.version=test/computederror
+
+depends=sub2,error
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/directhttp.properties b/tests/directhttp.properties
new file mode 100644
index 0000000..30f673d
--- /dev/null
+++ b/tests/directhttp.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="directhttp system bank test"
+db.name=directhttp
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/directhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=directhttp
+server=ftp2.fr.debian.org
+
+release.protocol=directhttp
+release.server=ftp2.fr.debian.org
+release.remote.dir=/debian/README
+release.file=README
+release.regexp=([0-9.]+),
+release.file.compressed=
+
+#remote.dir=common/downloads/Current_Release/Pfalciparum3D7/fasta/data/PlasmoDB-25_Pfalciparum3D7_Genome.fasta
+#plasmo/communityDownload.do?fname=Atg3_alignment.txt
+remote.dir=/debian/README.html
+remote.files=
+
+local.files=debian/README.html
+
+## Post Process ## The files should be located in the projectfiles/process
+BLOCKS=
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/error.properties b/tests/error.properties
new file mode 100644
index 0000000..2e50f00
--- /dev/null
+++ b/tests/error.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="error local system bank test"
+db.name=error
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/error_tmp
+dir.version=test/error
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/error/
+remote.files=^error.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^error.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/global.properties b/tests/global.properties
new file mode 100644
index 0000000..75cb02c
--- /dev/null
+++ b/tests/global.properties
@@ -0,0 +1,123 @@
+[GENERAL]
+test=1
+conf.dir=/tmp/biomaj/config
+log.dir=/tmp/biomaj/log
+process.dir=/tmp/biomaj/process
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=/tmp/biomaj/
+lock.dir=/tmp/biomaj/lock
+cache.dir=/tmp/biomaj/cache
+
+db.url=mongodb://localhost:27017
+db.name=biomaj_test
+
+use_ldap=1
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+# Use ElasticSearch for index/search capabilities
+use_elastic=0
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj_test
+
+celery.queue=biomaj
+celery.broker=mongodb://localhost:27017/biomaj_celery
+
+# Get directory stats (can be time consuming depending on number of files etc...)
+data.stats=1
+
+# List of user admin (linux user id, comma separated)
+admin=
+
+# Auto publish on updates (no publish flag needed, can be overridden in the bank property file)
+auto_publish=0
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=
+mail.admin=
+mail.from=
+
+#---------------------
+#Proxy authentication
+#---------------------
+#proxyHost=
+#proxyPort=
+#proxyUser=
+#proxyPassword=
+
+#Number of thread for processes
+bank.num.threads=2
+
+#Number of threads to use for downloading
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#----------------------
+# Release configuration
+#----------------------
+release.separator=_
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+#http.parse.dir.line=<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.dir.line=<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
+http.parse.file.line=<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+
+# Bank default access
+visibility.default=public
+
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = DEBUG
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = DEBUG
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
diff --git a/tests/local.properties b/tests/local.properties
new file mode 100644
index 0000000..7f6f5fd
--- /dev/null
+++ b/tests/local.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/locallist.properties b/tests/locallist.properties
new file mode 100644
index 0000000..a901b2c
--- /dev/null
+++ b/tests/locallist.properties
@@ -0,0 +1,44 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=locallist
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/locallist_tmp
+dir.version=test/locallist
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=
+remote.files=
+remote.files.list=true
+remote.files.1.path=/tmp/test.fasta.gz
+remote.files.2.path=/tmp/test2.fasta
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/localprocess.properties b/tests/localprocess.properties
new file mode 100644
index 0000000..7166186
--- /dev/null
+++ b/tests/localprocess.properties
@@ -0,0 +1,100 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local system bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Pre process
+db.pre.process=META0
+
+## Remove process
+db.remove.process=META0
+
+## Post Process ## The files should be located in the projectfiles/process directory
+BLOCKS=BLOCK1,BLOCK2
+BLOCK1.db.post.process=META0
+BLOCK2.db.post.process=META1,META2,META3
+META0=PROC0
+META1=PROC1,PROC2
+META2=PROC3
+META3=PROC4,PROC5
+
+
+PROC0.name=test0
+PROC0.desc=sample test
+PROC0.cluster=false
+PROC0.type=test
+PROC0.exe=echo
+PROC0.args=test $datadir
+
+PROC1.name=test1
+PROC1.desc=sample test
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=touch
+PROC1.args=$datadir/$dirversion/$localrelease/proc1.txt
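+# $datadir, $dirversion and $localrelease are exported by BioMAJ to the process
+# environment and resolve to the release directory being built.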
+
+PROC2.name=test2
+PROC2.desc=sample test
+PROC2.cluster=false
+PROC2.type=test
+PROC2.exe=touch
+PROC2.args=$datadir/$dirversion/$localrelease/proc2.txt
+
+PROC3.name=test3
+PROC3.desc=sample test
+PROC3.cluster=false
+PROC3.type=test
+PROC3.exe=echo
+PROC3.args=test 3
+
+PROC4.name=test4
+PROC4.desc=sample test
+PROC4.cluster=false
+PROC4.type=test
+PROC4.exe=echo
+PROC4.args=test 4
+
+PROC5.name=test5
+PROC5.desc=sample test
+PROC5.cluster=false
+PROC5.type=testmetadata
+PROC5.exe=test.sh
+PROC5.args=
+PROC5.format=test
+PROC5.types=any
+PROC5.tags=chr:chr1,organism:hg19
+# If files is set, then the post-process does not have to print generated files on STDOUT (but can)
+# in this case, the list of files will be extracted from this property using the above format/types/tags
+PROC5.files=dir1/file1,dir1/file2,dir1/file3
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/multi.properties b/tests/multi.properties
new file mode 100644
index 0000000..82e08f9
--- /dev/null
+++ b/tests/multi.properties
@@ -0,0 +1,60 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for multi protocol
+db.name=multi
+db.type=test
+
+offline.dir.name=offline/multi_tmp
+dir.version=multi
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=multi
+server=
+remote.dir=
+
+remote.file.0.protocol = directhttp
+remote.file.0.server = httpbin.org
+remote.file.0.path = /get
+remote.file.0.params.keys = key1,key2
+remote.file.0.params.key1 = value1
+remote.file.0.params.key2 = value2
+remote.file.0.name = test1.json
+
+remote.file.1.protocol = directhttp
+remote.file.1.method = POST
+remote.file.1.server = httpbin.org
+remote.file.1.path = /post
+remote.file.1.params.keys = key1,key2
+remote.file.1.params.key1 = value1
+remote.file.1.params.key2 = value2
+remote.file.1.name = test2.json
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/sub1.properties b/tests/sub1.properties
new file mode 100644
index 0000000..8e0c69b
--- /dev/null
+++ b/tests/sub1.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local1
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local1_tmp
+dir.version=test/local1
+
+depends=sub2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/sub2.properties b/tests/sub2.properties
new file mode 100644
index 0000000..b9d3142
--- /dev/null
+++ b/tests/sub2.properties
@@ -0,0 +1,41 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="sub local system bank test"
+db.name=local2
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local2_tmp
+dir.version=test/local2
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/tmp/
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^test.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tests/testhttp.properties b/tests/testhttp.properties
new file mode 100644
index 0000000..454753e
--- /dev/null
+++ b/tests/testhttp.properties
@@ -0,0 +1,43 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname=test for http protocol
+db.name=testhttp
+db.type=package
+
+offline.dir.name=offline/testhttp_tmp
+dir.version=testhttp
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=http
+server=ftp2.fr.debian.org
+remote.dir=/debian/dists/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^stable/Release$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^.*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+db.post.process=
+
+
+
+
+### Deployment ###
+
+keep.old.version=1
diff --git a/tools/examples/alu.properties b/tools/examples/alu.properties
new file mode 100644
index 0000000..b4ce85a
--- /dev/null
+++ b/tools/examples/alu.properties
@@ -0,0 +1,51 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="alu.n : alu repeat element. alu.a : translation of alu.n repeats"
+db.name=alu
+db.type=nucleic_protein
+
+offline.dir.name=offline/ncbi/blast/alu_tmp
+dir.version=ncbi/blast/alu
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# NCBI (download fasta)
+protocol=ftp
+server=ftp.ncbi.nih.gov
+remote.dir=/blast/db/FASTA/
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.files=^alu.*\.gz$
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+local.files=^alu\.(a|n).*
+
+## Post Process ## The files should be located in the projectfiles/process directory
+
+BLOCKS=BLOCK1
+BLOCK1.db.post.process=META0
+META0=PROC1
+
+PROC1.name=scanflatdir
+PROC1.desc=scan bank flat to detect available files
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=scan.py
+PROC1.args=--scan $datadir/$dirversion/$localrelease --type=nucleic --tags="organism:human"
+
+
+
+### Deployment ###
+# Always keep previous version
+keep.old.version=1
diff --git a/tools/examples/global.properties b/tools/examples/global.properties
new file mode 100644
index 0000000..e137364
--- /dev/null
+++ b/tools/examples/global.properties
@@ -0,0 +1,146 @@
+[GENERAL]
+#The root directory where all databases are stored.
+#If your data is not stored under one directory hierarchy
+#you can override this value in the database properties file.
+data.dir=/var/lib/biomaj3/banks
+
+conf.dir=/etc/biomaj3
+log.dir=/var/log/biomaj3
+process.dir=/usr/share/biomaj3/process
+cache.dir=/var/cache/biomaj3
+lock.dir= /var/lib/biomaj3/lock
+
+db.url=mongodb://localhost:27017
+db.name=biomaj
+
+use_ldap=0
+ldap.host=localhost
+ldap.port=389
+ldap.dn=nodomain
+
+use_elastic=0
+#Comma separated list of elasticsearch nodes host1,host2:port2
+elastic_nodes=localhost
+elastic_index=biomaj
+
+# Calculate data.dir size stats
+data.stats=1
+
+# Auto publish on updates (no publish flag needed, can be overridden in the bank property file)
+auto_publish=0
+
+# Microservices
+# You can activate all microservices or only a few (download only, for example).
+# Proxy is the API proxy that load-balances API requests between the services. If only one service is used,
+# micro.biomaj.proxy can simply point to the URL of that biomaj web service.
+#
+# If only the download microservice is used, you can for example run 4 biomaj download message processes
+# and 1 biomaj web process; the proxy then points to the URL of that biomaj web process.
+# See the microservices documentation.
+micro.biomaj.proxy=
+#micro.biomaj.service.download=1
+#micro.biomaj.service.process=1
+#micro.biomaj.service.user=1
+#micro.biomaj.service.daemon=1
+
+# Rabbitmq configuration (if using microservices)
+# RabbitMQ must be configured to allow remote access for the defined user (by default, guest access is only allowed from localhost);
+# see the RabbitMQ access-control documentation for help (https://www.rabbitmq.com/access-control.html)
+micro.biomaj.rabbit_mq=
+micro.biomaj.rabbit_mq_port=5672
+micro.biomaj.rabbit_mq_virtualhost=/
+micro.biomaj.rabbit_mq_user=
+micro.biomaj.rabbit_mq_password=
+
+
+# Influxdb (optional)
+# keep host empty or commented if you do not use influxdb
+# the database in influxdb must be created first
+influxdb.host=
+influxdb.port=8086
+influxdb.user=root
+influxdb.password=root
+influxdb.db=biomaj
+
+########################
+# Global properties file
+
+
+#To override these settings for a specific database go to its
+#properties file and uncomment or add the specific line you want
+#to override.
+
+#----------------
+# Mail Configuration
+#---------------
+#Uncomment these lines if you want to receive mail when the workflow is finished
+
+mail.smtp.host=localhost
+mail.admin=
+mail.from=biomaj at localhost
+mail.user=
+mail.password=
+mail.tls=
+
+
+#Number of thread during the download
+bank.num.threads=4
+
+#Number of threads to use for downloading and processing
+files.num.threads=4
+
+#to keep more than one release increase this value
+keep.old.version=0
+
+#The historic log file is generated in log/
+#define level information for output : DEBUG,INFO,WARN,ERR
+historic.logfile.level=DEBUG
+
+http.parse.dir.line=<a[\\s]+href=\"([\\S]+)/\".*alt=\"\\[DIR\\]\">.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})
+http.parse.file.line=<a[\\s]+href=\"([\\S]+)\".*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})[\\s]+([\\d\\.]+[MKG]{0,1})
+
+http.group.dir.name=1
+http.group.dir.date=2
+http.group.file.name=1
+http.group.file.date=2
+http.group.file.size=3
+
+
+# Bank default access
+visibility.default=public
+
+# Proxy, optional proxy (see format at
+# http://curl.haxx.se/libcurl/c/CURLOPT_PROXY.html)
+# biomaj >= 3.0.7
+#proxy=myproxyhost:1080
+#proxy=myproxyhost
+# Set proxy authentication if any, else keep commented
+#proxy_auth=user:password
+
+[loggers]
+keys = root, biomaj
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = INFO
+handlers = console
+
+[logger_biomaj]
+level = INFO
+handlers = console
+qualname = biomaj
+propagate=0
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = INFO
+formatter = generic
+
+[formatter_generic]
+format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
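
A note on the http.parse.* settings above: the two regular expressions describe how BioMAJ extracts directory and file entries from a remote HTTP listing, and the http.group.* indices say which capture group holds the name, the date and the size. A minimal sketch, assuming the pattern is applied as a plain Python regular expression and using a made-up listing line:

    import re

    # Directory pattern with the properties-file escaping resolved (assumption).
    dir_line = re.compile(
        r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*'
        r'([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})')

    # Hypothetical listing line; real server output may differ.
    sample = '<a href="bacteria/" alt="[DIR]">bacteria/</a>  12-Mar-2017 10:15  -'
    m = dir_line.search(sample)
    if m:
        print(m.group(1))  # 'bacteria'           -> http.group.dir.name=1
        print(m.group(2))  # '12-Mar-2017 10:15'  -> http.group.dir.date=2
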
diff --git a/tools/examples/local.properties b/tools/examples/local.properties
new file mode 100644
index 0000000..edb5211
--- /dev/null
+++ b/tools/examples/local.properties
@@ -0,0 +1,55 @@
+[GENERAL]
+######################
+### Initialization ###
+
+db.fullname="local copy bank test"
+db.name=local
+db.type=nucleic_protein
+
+offline.dir.name=offline/test/local_tmp
+dir.version=test/local
+
+frequency.update=0
+
+### Synchronization ###
+
+files.num.threads=1
+
+# Local system (copy some files)
+protocol=local
+server=
+
+release.file=
+release.regexp=
+release.file.compressed=
+
+remote.dir=/DIR_PATH_OF_FILES_TO_COPY
+# Reg exp of files to copy
+remote.files=^test.*
+
+#Uncomment if you don't want to extract the data files.
+#no.extract=true
+
+# Reg exp of files to keep
+local.files=^.*
+
+## Post Process ## The process scripts should be located in the projectfiles/process directory
+
+#db.post.process=
+BLOCKS=BLOCK1
+BLOCK1.db.post.process=META0
+META0=PROC1
+
+PROC1.name=test1
+PROC1.desc=scan copied files and try to auto-detect their format
+PROC1.cluster=false
+PROC1.type=test
+PROC1.exe=scan.py
+PROC1.args=--scan $datadir/$dirversion/$localrelease
+
+
+
+### Deployment ###
+
+keep.old.version=1
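
The PROC1.args value above references $datadir, $dirversion and $localrelease; these are exported to the post-process environment, which is also how the shipped process scripts below (concat.sh, formatdb.sh, makeblastdb.sh) locate the release directory. A minimal sketch of the expansion, with made-up values:

    import os

    # Hypothetical values; in a real run they come from the bank configuration.
    os.environ.update({'datadir': '/var/lib/biomaj3/banks',
                       'dirversion': 'test/local',
                       'localrelease': '1.0'})
    args = os.path.expandvars('--scan $datadir/$dirversion/$localrelease')
    print('scan.py ' + args)
    # scan.py --scan /var/lib/biomaj3/banks/test/local/1.0
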
diff --git a/tools/process/concat.sh b/tools/process/concat.sh
new file mode 100755
index 0000000..e1482e4
--- /dev/null
+++ b/tools/process/concat.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Script for Biomaj PostProcess
+#
+# concat files
+#
+# ARGS:
+# 1) regular expression selecting the files to concatenate
+# 2) regular expression excluding files from the result
+# 3) relative path name (result of the concatenation)
+# 4) format (fasta) [OPTIONAL]
+# 5) types (type1,type2,...) [OPTIONAL]
+# 6) tags (key:value,key:value,...) [OPTIONAL]
+#
+# Default input is read from STDIN unless files are specified. To explicitly
+# use STDIN as input, pass '-' as the filename.
+
+
+if (test $# -lt 3) then
+ echo "arguments:" 1>&2;
+ echo "1: regular expression selecting the set of files to concatenate" 1>&2;
+ echo "2: regular expression excluding files from that set" 1>&2;
+ echo "3: result file name (path relative to future_release)" 1>&2;
+ exit -1;
+fi
+
+workdir=$datadir/$dirversion/$localrelease/
+echo "apply concat with set $workdir/$1 to $workdir/$3";
+
+# Create the destination directory
+
+dirtocreate=`dirname $workdir/$3`;
+
+if (! test -e $dirtocreate ) then
+ echo "mkdir :"$dirtocreate;
+ mkdir -p $dirtocreate
+fi
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $dirtocreate." 1>&2 ;
+ exit 1;
+fi
+
+
+cd $workdir;
+
+echo ;
+
+files='';
+
+echo "set a list of file...";
+
+for expr in $1
+do
+ # echo "$expr";
+ # dir=`dirname $expr`;
+ # fileExp=`basename $expr`;
+ if [ "$2" != "" ]
+ then
+ files="$files ""`echo $expr | egrep -v $2`";
+ else
+ files="$files $expr";
+ fi
+done
+
+echo "";
+echo "--------------------------";
+echo "Comput [$workdir/$3]....";
+echo "change directory:$workdir";
+echo "$files > $workdir/$3";
+rm -f $workdir/$3 2> /dev/null ;
+
+if ( test -z "$files" )
+then
+ echo "Cannot create $workdir/$3 : no files !" 1>&2 ;
+ exit 1;
+fi
+
+echo "cat $files > $workdir/$3";
+
+for fileToConcat in $files
+do
+ cat $fileToConcat >> $workdir/$3 ;
+
+ if ( test $? -ne 0 ) then
+ echo "Cannot create $3.[error:$?]" 1>&2 ;
+ exit 1;
+ fi
+done
+
+format=""
+types=""
+tags=""
+if [ "$4" != "" ]
+then
+ format=$4
+fi
+if [ "$5" != "" ]
+then
+ types=$5
+fi
+if [ "$6" != "" ]
+then
+ tags=$6
+fi
+
+
+
+echo "##BIOMAJ#$format#$types#$tags#$3"
diff --git a/tools/process/formatdb.sh b/tools/process/formatdb.sh
new file mode 100755
index 0000000..a36abea
--- /dev/null
+++ b/tools/process/formatdb.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+# Script for Biomaj PostProcess
+# author : ofilangi
+# date : 19/06/2007
+# update : 22/10/2010 fix bug in generated alias file + a few cleanups
+#
+# -t Title for database file [String] Optional
+# -i Input file(s) for formatting [File In] Optional
+# -l Logfile name: [File Out] Optional
+# default = formatdb.log
+# -p Type of file
+# T - protein
+# F - nucleotide [T/F] Optional
+# default = T
+# -o Parse options
+# T - True: Parse SeqId and create indexes.
+# F - False: Do not parse SeqId. Do not create indexes.
+# [T/F] Optional
+# default = F
+# -a Input file is database in ASN.1 format (otherwise FASTA is expected)
+# T - True,
+# F - False.
+# [T/F] Optional
+# default = F
+# -b ASN.1 database in binary mode
+# T - binary,
+# F - text mode.
+# [T/F] Optional
+# default = F
+# -e Input is a Seq-entry [T/F] Optional
+# default = F
+# -n Base name for BLAST files [String] Optional
+# -v Database volume size in millions of letters [Integer] Optional
+# default = 4000
+# -s Create indexes limited only to accessions - sparse [T/F] Optional
+# default = F
+# -V Verbose: check for non-unique string ids in the database [T/F] Optional
+# default = F
+# -L Create an alias file with this name
+# use the gifile arg (below) if set to calculate db size
+# use the BLAST db specified with -i (above) [File Out] Optional
+# -F Gifile (file containing list of gi's) [File In] Optional
+# -B Binary Gifile produced from the Gifile specified above [File Out] Optional
+# -T Taxid file to set the taxonomy ids in ASN.1 deflines [File In] Optional
+#
+#
+
+#----------
+#GLOBAL DEF
+#----------
+
+BLASTDB_DIR="/db/index-blast"; # Path where aliases files should be generated
+FORMATDB=/local/ncbi/current/bin/formatdb; # Path to formatdb executable
+
+
+#----------
+# FUNCTIONS
+#----------
+# createAlias: builds an alias file
+# arg1: file to write to
+# arg2: bank name
+# arg3: db file list
+createAlias() {
+ local file=$1;
+ local nomBanque=$2;
+ local lFiles=$3;
+
+ rm -f $file;
+ echo "#" > $file
+ echo "# Alias file created "`date` >>$file
+ echo "#" >>$file ;
+ echo "#">> $file ;
+ echo "TITLE "$nomBanque >> $file;
+ echo "#" >> $file;
+ echo "DBLIST "$lFiles >>$file;
+ echo "#" >> $file;
+ echo "#GILIST" >> $file;
+ echo "#" >> $file;
+ echo "#OIDLIST" >> $file;
+ echo "#" >> $file;
+}
+
+#-----
+# MAIN
+#-----
+
+if (test $# -ne 4) then
+ echo "arguments:" 1>&2
+ echo "1: input files"
+ echo "2: working directory" 1>&2
+ echo "3: formatdb options (without -i for input file)" 1>&2
+ echo "4: bank name" 1>&2
+ echo `formatdb --help`;
+ exit -1
+fi
+
+relWorkDir=`echo "$2" | sed "s/\/*$//"` # remove useless trailing slash
+
+workdir=$datadir/$dirversion/future_release
+workdir=$workdir/$relWorkDir;
+
+rm -rf $workdir;
+mkdir -p $workdir ;
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $workdir." 1>&2 ;
+ exit 1;
+fi
+
+cd $workdir
+
+# Some vars for links creation
+back="";
+dir=$relWorkDir;
+OLDIFS=$IFS;
+IFS="/";
+for i in $dir
+do
+ back="../"$back;
+done
+IFS=$OLDIFS;
+
+# Create links to input files into the working dir
+listFile="";
+
+for expression in $1
+do
+ # the basename can be a regex
+ lsFile=`ls $datadir/$dirversion/future_release/$expression`;
+ if ( test $? -ne 0 ) then
+ echo "No input file found in dir `pwd`." 1>&2 ;
+ exit 1
+ fi
+ baseFile=`dirname $expression`;
+ for f in $lsFile
+ do
+ name=`basename $f`;
+ rm -f $4.p*;
+ rm -f $4.n*;
+ nameLink=`echo $name | cut -d"." -f1`;
+ ln -s $back/$baseFile/$name $nameLink;
+ if ( test $? -ne 0 ) then
+ echo "Cannot create link. [ln -s $back$f $name]" 1>&2 ;
+ exit 1
+ fi
+ if (test -z "$listFile") then
+ listFile=$nameLink;
+ else
+ listFile=$nameLink" "$listFile;
+ fi
+ done
+done
+
+echo "Input sequence file list: $listFile";
+
+if (test -z "$listFile") then
+ echo "No input file found." 1>&2 ;
+ exit 1
+fi
+
+nameB=$4;
+echo "Database name: $nameB";
+
+echo "Working in "`pwd`;
+echo "Launching formatdb [formatdb -i $listFile $3 -n $nameB]";
+
+# Execute formatdb
+$FORMATDB -i "$listFile" $3 -n $nameB;
+
+formatdbResult=$?
+if ( test $formatdbResult -ne 0 ) then
+ echo "Formatdb failed with status $formatdbResult" 1>&2 ;
+ exit 1
+fi
+
+echo "##BIOMAJ#blast###$2$nameB"
+
+# Delete temp files and links
+#-------------------------------------------------------------
+rm -f $listFile;
+rm -f formatdb.log
+
+# Add the generated files to the biomaj post-process dependencies
+echo "Generated files:";
+for ff in `ls *`
+do
+ echo $PP_DEPENDENCE$PWD/$ff;
+done
+
+goodPath=`readlink $datadir/$dirversion/future_release -s -n`;
+if ( test $? -ne 0 ) then
+ echo "Failed to get version path: readlink returned with an error [$goodPath]" 1>&2 ;
+ exit 1
+fi
+
+# Search for nal files which are sometimes generated by formatdb.
+lsAl=`ls *.?al 2> /dev/null`;
+
+if ( test $? -ne 0 ) then
+ echo "No alias file found.";
+ lsAl="";
+else
+ echo "Generated alias files:"
+ echo "$lsAl";
+fi
+
+# If nal files were generated, use them to generate nal files in $BLASTDB_DIR
+for fileIndexVirtuel in $lsAl
+do
+ echo "Found alias file: [$fileIndexVirtuel]";
+ listIndex=`more $fileIndexVirtuel | grep DBLIST`;
+ listFile2="";
+ for f in $listIndex
+ do
+ if (test $f != "DBLIST") then
+ listFile2=$goodPath/$relWorkDir/$f" "$listFile2;
+ fi
+ done
+ echo "Creating alias in [$BLASTDB_DIR/$fileIndexVirtuel]";
+ createAlias $BLASTDB_DIR/$fileIndexVirtuel $nameB "$listFile2"
+done
+
+# Else, if no nal file was generated by formatdb, create them
+if (test -z "$lsAl") then
+ ext=`ls | grep .*hr$ | tail -c5 | head -c2`al;
+ echo "Creating alias file [$PWD/$4$ext]";
+
+ listNhr=`ls *.*hr | sed 's/\..hr$//g'`;
+ listFileNalRel=""; # List of blast db files, relative path
+ listFileNalAbs=""; # List of blast db files, absolute path
+ for f in $listNhr
+ do
+ listFileNalRel=$f" "$listFileNalRel;
+ listFileNalAbs=$goodPath/$relWorkDir/$f" "$listFileNalAbs;
+ done
+
+ createAlias $4$ext $nameB "$listFileNalRel";
+ echo $PP_DEPENDENCE$PWD/$4$ext;
+
+ echo "Creating alias in [$BLASTDB_DIR/$4$ext]";
+ createAlias $BLASTDB_DIR/$4$ext $nameB "$listFileNalAbs" ;
+fi
+
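
For readers who do not want to trace the echo redirections in createAlias above: the function writes a small BLAST alias file containing a TITLE line and a DBLIST line. A minimal Python rendering with made-up inputs:

    import time

    def create_alias(path, bank_name, db_files):
        # Writes an alias file equivalent to the one createAlias emits.
        with open(path, 'w') as out:
            out.write('#\n# Alias file created %s\n#\n#\n' % time.ctime())
            out.write('TITLE %s\n#\n' % bank_name)
            out.write('DBLIST %s\n#\n' % db_files)
            out.write('#GILIST\n#\n#OIDLIST\n#\n')

    # Hypothetical call: one alias covering two database volumes.
    create_alias('/tmp/alu.nal', 'alu', 'alu_part1 alu_part2')
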
diff --git a/tools/process/makeblastdb.sh b/tools/process/makeblastdb.sh
new file mode 100755
index 0000000..49aa952
--- /dev/null
+++ b/tools/process/makeblastdb.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+
+# Script for Biomaj PostProcess
+# author : ofilangi, osallou
+# date : 19/06/2007
+# update : 22/10/2010 fix bug in generated alias file + a few cleanups
+# 23/12/2015 use makeblastdb for ncbi blast+
+#
+# -title Title for database file [String] Optional
+# -in Input file(s) for formatting [File In] Optional
+# -logfile Logfile name: [File Out] Optional
+# default = formatdb.log
+# -dbtype nucl
+# -parse_seqids
+#
+
+#----------
+#GLOBAL DEF
+#----------
+BLASTDB_DIR="$datadir/index-blast"; # Path where aliases files should be generated
+mkdir -p $BLASTDB_DIR
+FORMATDB="makeblastdb"; # Path to formatdb executable
+
+
+#----------
+# FUNCTIONS
+#----------
+# createAlias: builds an alias file
+# arg1: file to write to
+# arg2: bank name
+# arg3: db file list
+createAlias() {
+ local file=$1;
+ local nomBanque=$2;
+ local lFiles=$3;
+
+ rm -f $file;
+ echo "#" > $file
+ echo "# Alias file created "`date` >>$file
+ echo "#" >>$file ;
+ echo "#">> $file ;
+ echo "TITLE "$nomBanque >> $file;
+ echo "#" >> $file;
+ echo "DBLIST "$lFiles >>$file;
+ echo "#" >> $file;
+ echo "#GILIST" >> $file;
+ echo "#" >> $file;
+ echo "#OIDLIST" >> $file;
+ echo "#" >> $file;
+}
+
+#-----
+# MAIN
+#-----
+
+if (test $# -ne 4) then
+ echo "arguments:" 1>&2
+ echo "1: input files"
+ echo "2: working directory" 1>&2
+ echo "3: formatdb options (without -in for input file)" 1>&2
+ echo "4: bank name" 1>&2
+ echo `formatdb --help`;
+ exit -1
+fi
+
+relWorkDir=`echo "$2" | sed "s/\/*$//"` # remove useless trailing slash
+
+workdir=$datadir/$dirversion/future_release
+workdir=$workdir/$relWorkDir;
+
+rm -rf $workdir;
+mkdir -p $workdir ;
+
+if ( test $? -ne 0 ) then
+ echo "Cannot create $workdir." 1>&2 ;
+ exit 1;
+fi
+
+cd $workdir
+
+# Some vars for links creation
+back="";
+dir=$relWorkDir;
+OLDIFS=$IFS;
+IFS="/";
+for i in $dir
+do
+ back="../"$back;
+done
+IFS=$OLDIFS;
+
+# Create links to input files into the working dir
+listFile="";
+
+for expression in $1
+do
+ # the basename can be a regex
+ lsFile=`ls $datadir/$dirversion/future_release/$expression`;
+ if ( test $? -ne 0 ) then
+ echo "No input file found in dir `pwd`." 1>&2 ;
+ exit 1
+ fi
+ baseFile=`dirname $expression`;
+ for f in $lsFile
+ do
+ name=`basename $f`;
+ rm -f $4.p*;
+ rm -f $4.n*;
+ nameLink=`echo $name | cut -d"." -f1`;
+ ln -s $back/$baseFile/$name $nameLink;
+ if ( test $? -ne 0 ) then
+ echo "Cannot create link. [ln -s $back$f $name]" 1>&2 ;
+ exit 1
+ fi
+ if (test -z "$listFile") then
+ listFile=$nameLink;
+ else
+ listFile=$nameLink" "$listFile;
+ fi
+ done
+done
+
+echo "Input sequence file list: $listFile";
+
+if (test -z "$listFile") then
+ echo "No input file found." 1>&2 ;
+ exit 1
+fi
+
+nameB=$4;
+echo "Database name: $nameB";
+
+echo "Working in "`pwd`;
+echo "Launching formatdb [formatdb -in $listFile $3 -out $nameB]";
+
+# Execute formatdb
+$FORMATDB -in "$listFile" $3 -out $nameB;
+
+formatdbResult=$?
+if ( test $formatdbResult -ne 0 ) then
+ echo "Formatdb failed with status $formatdbResult" 1>&2 ;
+ exit 1
+fi
+
+echo "##BIOMAJ#blast###$2$nameB"
+
+# Delete temp files and links
+#-------------------------------------------------------------
+rm -f $listFile;
+rm -f formatdb.log
+
+# Add the generated files to the biomaj post-process dependencies
+echo "Generated files:";
+for ff in `ls *`
+do
+ echo $PP_DEPENDENCE$PWD/$ff;
+done
+
+goodPath=`readlink $datadir/$dirversion/future_release -s -n`;
+if ( test $? -ne 0 ) then
+ echo "Failed to get version path: readlink returned with an error [$goodPath]" 1>&2 ;
+ exit 1
+fi
+
+# Search for nal files which are sometimes generated by formatdb.
+lsAl=`ls *.?al 2> /dev/null`;
+
+if ( test $? -ne 0 ) then
+ echo "No alias file found.";
+ lsAl="";
+else
+ echo "Generated alias files:"
+ echo "$lsAl";
+fi
+
+# If nal files were generated, use them to generate nal files in $BLASTDB_DIR
+for fileIndexVirtuel in $lsAl
+do
+ echo "Found alias file: [$fileIndexVirtuel]";
+ listIndex=`more $fileIndexVirtuel | grep DBLIST`;
+ listFile2="";
+ for f in $listIndex
+ do
+ if (test $f != "DBLIST") then
+ listFile2=$goodPath/$relWorkDir/$f" "$listFile2;
+ fi
+ done
+ echo "Creating alias in [$BLASTDB_DIR/$fileIndexVirtuel]";
+ createAlias $BLASTDB_DIR/$fileIndexVirtuel $nameB "$listFile2"
+done
+
+# Else, if no nal file was generated by formatdb, create them
+if (test -z "$lsAl") then
+ ext=`ls | grep .*hr$ | tail -c5 | head -c2`al;
+ echo "Creating alias file [$PWD/$4$ext]";
+
+ listNhr=`ls *.*hr | sed 's/\..hr$//g'`;
+ listFileNalRel=""; # List of blast db files, relative path
+ listFileNalAbs=""; # List of blast db files, absolute path
+ for f in $listNhr
+ do
+ listFileNalRel=$f" "$listFileNalRel;
+ listFileNalAbs=$goodPath/$relWorkDir/$f" "$listFileNalAbs;
+ done
+
+ createAlias $4$ext $nameB "$listFileNalRel";
+ echo $PP_DEPENDENCE$PWD/$4$ext;
+
+ echo "Creating alias in [$BLASTDB_DIR/$4$ext]";
+ createAlias $BLASTDB_DIR/$4$ext $nameB "$listFileNalAbs" ;
+fi
+
diff --git a/tools/process/scan.py b/tools/process/scan.py
new file mode 100755
index 0000000..21fe868
--- /dev/null
+++ b/tools/process/scan.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import logging.config
+
+from biomaj_core.utils import Utils
+
+def main():
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('-s', '--scan', dest="directory",help="Directory to scan")
+ parser.add_argument('--type', dest="ftype",help="Files type")
+ parser.add_argument('--tags', dest="tags", action="append", default=[],
+ help="tags, format key:value, can be repeated multiple times")
+
+ args = parser.parse_args()
+
+ if not os.path.exists(args.directory):
+ sys.exit(1)
+
+ res = {}
+ for (path, dirs, files) in os.walk(args.directory):
+ for file in files:
+ filename = os.path.join(path, file)
+ (file_format, mime) = Utils.detect_format(filename)
+ if file_format is not None:
+ file_format = file_format.replace('application/','')
+ filename = filename.replace(args.directory+'/','')
+ if file_format is not None:
+ if file_format not in res:
+ res[file_format] = [filename]
+ else:
+ res[file_format].append(filename)
+
+ f_type = ''
+ if args.ftype:
+ f_type = args.ftype
+ tags = ''
+ if args.tags:
+ tags = ','.join(args.tags)
+ for fformat in res.keys():
+ print('##BIOMAJ#'+fformat+'#'+f_type+'#'+tags+'#'+','.join(res[fformat]))
+
+
+if __name__ == '__main__':
+ main()
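
For reference, a hypothetical invocation matching the PROC1 block from tools/examples/local.properties, together with the kind of stanza it prints (path, detected format, type, tags and file names are all made up):

    ./scan.py --scan /var/lib/biomaj3/banks/test/local/1.0 --type nucleic --tags origin:local
    ##BIOMAJ#fasta#nucleic#origin:local#test.fasta,test2.fasta
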
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/biomaj3.git