[med-svn] [paleomix] 01/07: New upstream version 1.2.12
Andreas Tille
tille at debian.org
Thu Nov 16 11:13:22 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository paleomix.
commit 10673661df7d8739a8a5973d978ef75917bce1c6
Author: Andreas Tille <tille at debian.org>
Date: Thu Nov 16 09:33:35 2017 +0100
New upstream version 1.2.12
---
.gitignore | 2 +
CHANGES.md | 109 +++++-
INSTALL.rst | 4 +
README.rst | 16 +-
docs/bam_pipeline/requirements.rst | 6 +-
docs/conf.py | 28 +-
docs/index.rst | 14 +-
docs/yaml.rst | 164 ++++++++-
docs/zonkey_pipeline/index.rst | 11 +-
paleomix/__init__.py | 2 +-
paleomix/atomiccmd/builder.py | 81 +++--
paleomix/atomiccmd/command.py | 8 +-
paleomix/atomiccmd/pprint.py | 102 +++---
paleomix/common/formats/fasta.py | 16 +-
paleomix/common/makefile.py | 20 ++
paleomix/common/testing.py | 6 +-
paleomix/common/vcffilter.py | 3 +-
paleomix/common/vcfwrap.py | 15 +-
paleomix/common/versions.py | 77 +++--
paleomix/main.py | 4 +
paleomix/node.py | 100 +++---
paleomix/nodes/bowtie2.py | 31 +-
paleomix/nodes/bwa.py | 41 +--
paleomix/nodes/commands.py | 73 ++--
paleomix/nodes/mapdamage.py | 38 +-
paleomix/nodes/picard.py | 233 ++++++-------
paleomix/nodes/samtools.py | 27 +-
paleomix/nodes/validation.py | 252 ++++++++------
paleomix/pipeline.py | 4 +-
paleomix/tools/bam_pipeline/makefile.py | 105 +++---
paleomix/tools/bam_pipeline/mkfile.py | 6 +-
paleomix/tools/bam_pipeline/nodes.py | 73 ++--
paleomix/tools/bam_pipeline/parts/lane.py | 9 +-
paleomix/tools/bam_pipeline/parts/library.py | 40 ++-
paleomix/tools/bam_pipeline/parts/prefix.py | 102 ++++--
paleomix/tools/bam_pipeline/parts/statistics.py | 20 +-
paleomix/tools/bam_pipeline/pipeline.py | 46 ++-
paleomix/tools/bam_stats/common.py | 2 +-
paleomix/tools/cleanup.py | 42 ++-
paleomix/tools/dupcheck.py | 93 +++++
paleomix/tools/rmdup_collapsed.py | 315 ++++++++++-------
paleomix/tools/zonkey/config.py | 18 +-
paleomix/tools/zonkey/pipeline.py | 17 +-
paleomix/ui.py | 10 +-
setup.py | 2 +-
tests/.coveragerc | 11 +
tests/README.md | 4 +
tests/atomiccmd_test/builder_test.py | 75 +++-
tests/atomiccmd_test/command_test.py | 213 +++++++-----
tests/atomiccmd_test/pprint_test.py | 439 ++++++++++++++----------
tests/atomiccmd_test/sets_test.py | 18 +-
tests/common_tests/fileutils_test.py | 10 +-
tests/common_tests/makefile_test.py | 216 ++++++++++--
tests/common_tests/versions_tests.py | 118 +++++--
tests/run | 27 --
tests/setup.sh | 15 +
tox.ini | 4 +-
57 files changed, 2286 insertions(+), 1251 deletions(-)
diff --git a/.gitignore b/.gitignore
index 2295b43..51aa74b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,10 +14,12 @@ sdist
tests/runs
tests/links/
+tests/all_modules.py
*.egg/
*.egg-info/
.eggs
.tox
+.vscode
docs/_build
diff --git a/CHANGES.md b/CHANGES.md
index fe86ba8..92791d1 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,4 +1,99 @@
-# Change log
+# Changelog
+
+## [1.2.12] - 2017-08-13
+### Fixed
+ - Fixed input / output files not being listed in 'pipe.errors' files.
+ - Use the same max open files limit for Picard (ulimit -n minus headroom)
+ both when deciding whether to change the default and as the final value.
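A sketch of the shared computation (the headroom constant here is illustrative, not the pipeline's actual value):

    import resource

    def max_open_files(headroom=64):
        # Soft limit on open handles for this process ('ulimit -n'),
        # minus headroom reserved for the pipeline itself.
        soft, _hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        return soft - headroom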
+
+### Added
+ - The 'vcf_to_fasta' command now supports VCFs containing haploid genotype
+ calls, courtesy of Graham Gower.
+
+### Changed
+ - Require Pysam version 0.10.0 or later.
+
+
+## [1.2.11] - 2017-06-09
+### Fixed
+ - Fixed unhandled exception if a FASTA file for a prefix is missing in a
+ BAM pipeline makefile.
+ - Fixed the 'RescaleQualities' option not being respected for non-global
+ options in BAM pipeline makefiles.
+
+
+## [1.2.10] - 2017-05-29
+### Added
+ - Preliminary support for CSI indexed BAM files, required for genomes with
+ chromosomes > 2^29 - 1 bp in size. Support is still missing in HTSJDK, so
+ GATK cannot currently be used with such genomes. CSI indexing is enabled
+ automatically when required.
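The size threshold can be sketched as follows (hypothetical helper; the pipeline picks the format automatically):

    # BAI indexes cannot address contigs longer than 2**29 - 1 bp.
    BAI_LIMIT = 2 ** 29 - 1

    def index_format(contig_lengths):
        return "csi" if max(contig_lengths) > BAI_LIMIT else "bai"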
+
+### Fixed
+ - Reference sequences placed in the current directory no longer cause the
+ BAM pipeline to complain about non-writable directories.
+ - The maximum number of temporary files used by picard will no longer be
+ increased above the default value used by the picard tools.
+
+### Changed
+ - The 'Status' of processes terminated by the pipeline will now be reported
+ as 'Automatically terminated by PALEOMIX'. This is to help differentiate
+ between processes that failed or were killed by an external source, and
+ processes that were cleaned up by the pipeline itself.
+ - Pretty-printing of commands shown when commands fail has been revised
+ to make the output more readable, including explicit descriptions when
+ output is piped from one process to another and vice versa.
+ - Commands are now shown in a format more suitable for running on the
+ command-line, instead of as a Python list, when a node fails. Pipes are
+ still specified separately.
+ - Improved error messages during version checks, both for missing programs
+ and for exceptions raised when calling Popen.
+ - Strip MC tags from reads with unmapped mates during cleanup; this is
+ required since Picard (v2.9.0) ValidateSamFile considers such tags invalid.
+
+
+## [1.2.9] - 2017-05-01
+### Fixed
+ - Improved handling of BAM tags to prevent unintended type changes.
+ - Fixed 'rmdup_collapsed' underreporting the number of duplicate reads (in
+ the 'XP' tag), when duplicates with different CIGAR strings were processed.
+
+### Changed
+ - PCR duplicates detected for collapsed reads using 'rmdup\_collapsed' are
+ now identified based on alignments that include clipped bases. This
+ matches the behavior of the Picard 'MarkDuplicates' command.
+ - Depending on work-load, 'rmdup\_collapsed' may now run up to twice as fast.
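The clipped-bases change can be sketched with a hypothetical helper: duplicates are keyed on the alignment start extended by any leading clipped bases, as Picard 'MarkDuplicates' does:

    def unclipped_start(position, cigar):
        # Extend the 0-based alignment start by leading clips; pysam
        # encodes soft-clips as CIGAR op 4 and hard-clips as op 5.
        for operation, length in cigar:
            if operation in (4, 5):
                position -= length
            else:
                break
        return position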
+
+
+## [1.2.8] - 2017-04-28
+### Added
+ - Added FILTER entry for 'F' filter used in vcf\_filter. This corresponds to
+ heterozygous sites where the allele frequency was not determined.
+ - Added 'dupcheck' command. This command roughly corresponds to the
+ DetectInputDuplication step that is part of the BAM pipeline, and attempts
+ to identify duplicate data (not PCR duplicates), by locating reads mapped
+ to the same position, with the same name, sequence, and quality scores.
+ - Added link to sample data used in publication to the Zonkey documentation.
+
+### Changed
+ - Only letters, numbers, and '-', '_', and '.' are allowed in sample-names
+ used in Zonkey, in order to prevent invalid filenames and certain programs
+ breaking on whitespace. Trailing whitespace is stripped.
+ - Show more verbose output when building Zonkey pipelines.
+ - Picard tools version 1.137 or later is now required by the BAM pipeline.
+ This is necessary as newer BAM files (header version 1.5) would fail to
+ validate when using earlier versions of Picard tools.
+
+### Fixed
+ - Fixed validation nodes failing on output paths without a directory.
+ - Fixed possible uncaught exceptions when terminating cat commands used by
+ FASTQ validation nodes resulting in loss of error messages.
+ - Fixed makefile validation failing with an unhandled TypeError if unhashable
+ types were found in unexpected locations. For example, a dict found where a
+ subset of strings were allowed. These now result in a proper MakeFileError.
+ - Fixed user options in the 'BWA' section of the BAM Pipeline makefiles not
+ being correctly applied when using the 'mem' or the 'bwasw' algorithms.
+ - Fixed some unit tests failing when the environment caused getlogin to fail.
## [1.2.7] - 2017-01-03
@@ -7,7 +102,6 @@
detecting equine F1 hybrids from archeological remains. Usage
is described in the documentation.
-
### Changed
- The wrongly named per-sample option 'Gender' in the phylogenetic
pipeline makefile has been replaced with a 'Sex' option. This does not
@@ -17,7 +111,6 @@
in the BAM pipeline makefile. The 'mapDamage' feature now takes the options
'plot', 'model', and 'rescale', allowing more fine-grained control.
-
### Fixed
- Fixed the phylogenetic pipeline complaining about missing sample genders
(now sex) if no regions of interest had been specified. The pipeline will
@@ -436,8 +529,12 @@ the (partially) updated documentation now hosted on ReadTheDocs.
- Switching to more traditional version-number tracking.
-
-[Unreleased]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.7...HEAD
+[Unreleased]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.12...HEAD
+[1.2.12]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.11...v1.2.12
+[1.2.11]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.10...v1.2.11
+[1.2.10]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.9...v1.2.10
+[1.2.9]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.8...v1.2.9
+[1.2.8]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.7...v1.2.8
[1.2.7]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.6...v1.2.7
[1.2.6]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.5...v1.2.6
[1.2.5]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.4...v1.2.5
@@ -450,5 +547,3 @@ the (partially) updated documentation now hosted on ReadTheDocs.
[1.1.0]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.1...v1.1.0
[1.0.1]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.0...v1.0.1
[1.0.0]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.0-RC...v1.0.0
-
-
diff --git a/INSTALL.rst b/INSTALL.rst
new file mode 100644
index 0000000..ea1021f
--- /dev/null
+++ b/INSTALL.rst
@@ -0,0 +1,4 @@
+For detailed installation instructions, please refer to
+http://paleomix.readthedocs.io/
+
+
diff --git a/README.rst b/README.rst
index 20b6444..0cbc908 100644
--- a/README.rst
+++ b/README.rst
@@ -1,11 +1,17 @@
-*********************
-The PALEOMIX pipeline
-*********************
+**********************
+The PALEOMIX pipelines
+**********************
-The PALEOMIX pipeline is a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data, starting from de-multiplexed reads from one or more samples, through sequence processing and alignment, followed by genotyping and phylogenetic inference on the samples. In addition, PALEOMIX aids in metagenomic analysis of the extracts. The pipeline has been designed with ancient DNA (aDNA) in mind, and includes several features especially useful for the a [...]
+The PALEOMIX pipelines are a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data: The BAM pipeline processes de-multiplexed reads from one or more samples, through sequence processing and alignment, to generate BAM alignment files useful in downstream analyses; the Phylogenetic pipeline carries out genotyping and phylogenetic inference on BAM alignment files, either produced using the BAM pipeline or generated elsewhere; and the Zonkey [...]
+
+The pipelines have been designed with ancient DNA (aDNA) in mind, and include several features especially useful for the analysis of ancient samples, but can all be used for the processing of modern samples, in order to ensure consistent data processing.
For a detailed description of the pipelines, please refer to the `PALEOMIX <http://geogenetics.ku.dk/publications/paleomix>`_ website and the `documentation <http://paleomix.readthedocs.io/>`_; for questions, bug reports, and/or suggestions, use the `GitHub tracker <https://github.com/MikkelSchubert/paleomix/issues/>`_, or contact Mikkel Schubert at `MSchubert at snm.ku.dk <mailto:MSchubert at snm.ku.dk>`_.
-The PALEOMIX pipeline has been published in Nature Protocols; if you make use of (parts of) the pipeline in your work, then please cite
+The PALEOMIX pipelines have been published in Nature Protocols; if you make use of PALEOMIX in your work, then please cite
Schubert M, Ermini L, Sarkissian CD, Jónsson H, Ginolhac A, Schaefer R, Martin MD, Fernández R, Kircher M, McCue M, Willerslev E, and Orlando L. "**Characterization of ancient and modern genomes by SNP detection and phylogenomic and metagenomic analysis using PALEOMIX**". Nat Protoc. 2014 May;9(5):1056-82. doi: `10.1038/nprot.2014.063 <http://dx.doi.org/10.1038/nprot.2014.063>`_. Epub 2014 Apr 10. PubMed PMID: `24722405 <http://www.ncbi.nlm.nih.gov/pubmed/24722405>`_.
+
+The Zonkey pipeline has been published in Journal of Archaeological Science; if you make use of this pipeline in your work, then please cite
+
+ Schubert M, Mashkour M, Gaunitz C, Fages A, Seguin-Orlando A, Sheikhi S, Alfarhan AH, Alquraishi SA, Al-Rasheid KAS, Chuang R, Ermini L, Gamba C, Weinstock J, Vedat O, and Orlando L. "**Zonkey: A simple, accurate and sensitive pipeline to genetically identify equine F1-hybrids in archaeological assemblages**". Journal of Archaeological Science. 2017 Feb; 78:147-157. doi: `10.1016/j.jas.2016.12.005 <http://dx.doi.org/10.1016/j.jas.2016.12.005>`_.
diff --git a/docs/bam_pipeline/requirements.rst b/docs/bam_pipeline/requirements.rst
index 62152e5..5c2cfc6 100644
--- a/docs/bam_pipeline/requirements.rst
+++ b/docs/bam_pipeline/requirements.rst
@@ -5,18 +5,18 @@
Software requirements
=====================
-In addition to the requirements listed in the ref:`installation` section, the BAM pipeline requires that a several other pieces of software be installed:
+In addition to the requirements listed in the :ref:`installation` section, the BAM pipeline requires that several other pieces of software be installed. A plus-sign following a version number indicates that newer versions are also supported:
* `AdapterRemoval`_ v2.1+ [Lindgreen2012]_
* `SAMTools`_ v0.1.18+ [Li2009b]_
-* `Picard Tools`_ v1.124+
+* `Picard Tools`_ v1.137+
The Picard Tools JAR-file (picard.jar) is expected to be located in ~/install/jar_root/ by default, but this behavior may be changed using either the --jar-root command-line option, or via the global configuration file (see section :ref:`bam_configuration`).
Furthermore, one or both of the following sequence aligners must be installed:
* `Bowtie2`_ v2.1.0+ [Langmead2012]_
- * `BWA`_ v0.5.9+ or v0.6.2+ or v0.7.9+ [Li2009a]_
+ * `BWA`_ v0.5.9+, v0.6.2, or v0.7.9+ [Li2009a]_
In addition, the following packages are used by default, but can be omitted if disabled during runtime:
diff --git a/docs/conf.py b/docs/conf.py
index 016c7cb..5cb03c9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -57,7 +57,7 @@ author = u'Mikkel Schubert'
# The short X.Y version.
version = u'1.2'
# The full version, including alpha/beta/rc tags.
-release = u'1.2.7'
+release = u'1.2.12'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -207,25 +207,25 @@ htmlhelp_basename = 'PALEOMIXdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
+ # Latex figure (float) alignment
+ #'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- (master_doc, 'PALEOMIX.tex', u'PALEOMIX Documentation',
- u'Mikkel Schubert', 'manual'),
+ (master_doc, 'PALEOMIX.tex', u'PALEOMIX Documentation',
+ u'Mikkel Schubert', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
@@ -268,9 +268,9 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- (master_doc, 'PALEOMIX', u'PALEOMIX Documentation',
- author, 'PALEOMIX', 'TODO',
- 'Miscellaneous'),
+ (master_doc, 'PALEOMIX', u'PALEOMIX Documentation',
+ author, 'PALEOMIX', 'TODO',
+ 'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
diff --git a/docs/index.rst b/docs/index.rst
index 83aa1c7..faabc98 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,9 +2,19 @@
Welcome to PALEOMIX's documentation!
====================================
-The PALEOMIX pipeline is a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data, starting from de-multiplexed reads from one or more samples, through sequence processing and alignment, followed by genotyping and phylogenetic inference on the samples. In addition, PALEOMIX aids in metagenomic analysis of the extracts. The pipeline has been designed with ancient DNA (aDNA) in mind, and includes several features especially useful for the a [...]
+The PALEOMIX pipelines are a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data: The BAM pipeline processes de-multiplexed reads from one or more samples, through sequence processing and alignment, to generate BAM alignment files useful in downstream analyses; the Phylogenetic pipeline carries out genotyping and phylogenetic inference on BAM alignment files, either produced using the BAM pipeline or generated elsewhere; and the Zonkey [...]
-If you make use of any part of the PALEOMIX pipeline and/or assosiated tools, then we ask that you kindly cite [Schubert2014]_.
+The pipelines have been designed with ancient DNA (aDNA) in mind, and include several features especially useful for the analysis of ancient samples, but can all be used for the processing of modern samples, in order to ensure consistent data processing.
+
+For a detailed description of the pipelines, please refer to the `PALEOMIX <http://geogenetics.ku.dk/publications/paleomix>`_ website and the `documentation <http://paleomix.readthedocs.io/>`_; for questions, bug reports, and/or suggestions, use the `GitHub tracker <https://github.com/MikkelSchubert/paleomix/issues/>`_, or contact Mikkel Schubert at `MSchubert at snm.ku.dk <mailto:MSchubert at snm.ku.dk>`_.
+
+The PALEOMIX pipelines have been published in Nature Protocols; if you make use of PALEOMIX in your work, then please cite
+
+ Schubert M, Ermini L, Sarkissian CD, Jónsson H, Ginolhac A, Schaefer R, Martin MD, Fernández R, Kircher M, McCue M, Willerslev E, and Orlando L. "**Characterization of ancient and modern genomes by SNP detection and phylogenomic and metagenomic analysis using PALEOMIX**". Nat Protoc. 2014 May;9(5):1056-82. doi: `10.1038/nprot.2014.063 <http://dx.doi.org/10.1038/nprot.2014.063>`_. Epub 2014 Apr 10. PubMed PMID: `24722405 <http://www.ncbi.nlm.nih.gov/pubmed/24722405>`_.
+
+The Zonkey pipeline has been published in Journal of Archaeological Science; if you make use of this pipeline in your work, then please cite
+
+ Schubert M, Mashkour M, Gaunitz C, Fages A, Seguin-Orlando A, Sheikhi S, Alfarhan AH, Alquraishi SA, Al-Rasheid KAS, Chuang R, Ermini L, Gamba C, Weinstock J, Vedat O, and Orlando L. "**Zonkey: A simple, accurate and sensitive pipeline to genetically identify equine F1-hybrids in archaeological assemblages**". Journal of Archaeological Science. 2017 Feb; 78:147-157. doi: `10.1016/j.jas.2016.12.005 <http://dx.doi.org/10.1016/j.jas.2016.12.005>`_.
**Table of Contents:**
diff --git a/docs/yaml.rst b/docs/yaml.rst
index e9ee157..e6cb3b3 100644
--- a/docs/yaml.rst
+++ b/docs/yaml.rst
@@ -4,19 +4,168 @@
YAML usage in PALEOMIX
======================
-The format, `YAML`_, is a simple human-readable markup language in which the structure of the data is determined by its identation, and will look familiar to anyone who has experience with the `Python`_ programming language.
+`YAML`_ is a simple markup language adopted for use in configuration files by the pipelines included in PALEOMIX. YAML was chosen because it is a plain-text format that is easy to read and write by hand. Since YAML files are plain-text, they may be edited using any standard text editor, with the following caveats:
+
+* YAML exclusively uses spaces (space-bar) for indentation, not tabs; attempting to use tabs in YAML files will cause failures when the file is read by the associated program.
+* YAML is case-sensitive; an option such as 'QualityOffset' is therefore not the same as 'qualityoffset'.
+* It is strongly recommended that all files be named using the '.yaml' file-extension; setting the extension helps ensure proper handling by editors that natively support the YAML format.
+
+Only a subset of YAML features is used by PALEOMIX; these are described below and include **mappings**, by which values are identified by name; **lists** of values; and **numbers**, **text-strings**, and **true** / **false** values, typically representing program options, file-paths, and the like. In addition, comments prefixed by the hash-sign (#) are frequently used to provide documentation.
+
+
+
+Comments
+--------
+
+Comments are specified by prefixing unquoted text with the hash-sign (#); all comments are ignored, and have no effect on the operation of the program. Comments are used solely to document the YAML files used by the pipelines::
+
+ # This is a comment; the next line contains both a value and a comment:
+ 123 # Comments may be placed on the same line as values.
+
+As far as PALEOMIX is concerned, the above is equivalent to the following YAML code::
+
+ 123
+
+As noted above, this only applies to unquoted text, and the following is therefore not a comment, but rather a text-string::
+
+ "# this is not a comment"
+
+Comments are used in the following sections to provide context.
+
+
+Numbers (integers and floats)
+-----------------------------
+
+Numbers in YAML files include whole numbers (integers) as well as real numbers (floating point numbers). Numbers are mostly used for program options, such as a minimum read length, and typically involve whole numbers, but a few options do take real numbers. Numbers may be written as follows::
+
+ # This is an integer:
+ 123
+
+ # This is a float:
+ 123.5
+
+ # This is a float written using scientific notation:
+ 1.235e2
+
+
+Truth-values (booleans)
+-----------------------
+
+Truth values (*true* and *false*) are frequently used to enable or disable options in PALEOMIX configuration files. Several synonyms are available to help improve readability. More specifically, all of the following values are interpreted as *true* by the pipelines::
+
+ true
+ yes
+ on
+
+And similarly, the following values are all interpreted as *false*::
+
+ false
+ no
+ off
+
+Template files included with the pipelines mostly use 'yes' and 'no', but any of the above values may be used. Note however that none of these values are quoted: if single or double quotation marks were used, then these values would be read as text rather than truth-values, as described next.
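The distinction is easy to check with PyYAML (shown purely for illustration; any YAML 1.1 parser behaves the same)::

    import yaml

    assert yaml.safe_load("yes") is True      # unquoted: truth-value
    assert yaml.safe_load("'yes'") == "yes"   # quoted: text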
+
+
+Text (strings)
+--------------
+
+Text, or strings, is the most common type of value in the PALEOMIX YAML files, and is used for both labels and option values, including paths to files to use in an analysis::
+
+ "Example"
+
+ "This is a longer string"
+
+ 'This is also a string'
+
+ "/path/to/my/files/reads.fastq"
+
+
+For the most part it is not necessary to use quotation marks, and the above could instead be written as follows::
+
+ Example
+
+ This is a longer string
+
+ This is also a string
+
+ /path/to/my/files/reads.fastq
+
+However, it is important to make sure that values that are intended to be used as strings are not mis-interpreted as a different type of value. For example, without the quotation marks the following values would be interpreted as numbers or truth-values::
+
+ "true"
+
+ "20090212"
+
+ "17e13"
+
+
+Mappings
+--------
+
+Mappings associate a value with a label (key), and are used for the majority of options. A mapping is simply a label followed by a colon, and then the value associated with that label::
+
+ MinimumQuality: 17
+
+ EnableFoo: no
+
+ NameOfTest: "test 17"
+
+In PALEOMIX configuration files, labels are always strings, and are normally not quoted. However, in some cases, such as when using numerical labels, it may be useful to quote them::
+
+ "A Label": on
+
+ "12032016": "CPT"
+
+
+Sections (mappings in mappings)
+-------------------------------
+
+In addition to mapping to a single value, a mapping may also itself contain one or more mappings::
+
+ Top level:
+ Second level: 'a value'
+ Another value: true
+
+Mappings can be nested any number of times, which is used in this manner to create sections and sub-sections in configuration files, grouping related options together::
+
+ Options:
+ Options for program:
+ Option1: yes
+ Option2: 17
+
+ Another program:
+ Option1: /path/to/file.fastq
+ Option2: no
+
+Note that the two mappings belonging to the 'Options' mapping are both indented the same number of spaces, which is what allows the program to determine which values belong to which label. It is therefore important to keep indentation consistent.
+
+Lists of values
+---------------
+
+In some cases, it is possible to associate zero or more values with a single label. This is accomplished using lists, which consist of values prefixed with a dash::
+
+ Section:
+ - First value
+ - Second value
+ - Third value
+
+Note that the indentation of each item must be the same, similar to how indentation of sub-sections must be the same (see above).
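Putting sections and lists together, the resulting structure can be inspected with PyYAML (for illustration)::

    import yaml

    text = """
    Options:
      Program:
        Option1: yes
    Samples:
      - first
      - second
    """

    data = yaml.safe_load(text)
    assert data["Options"]["Program"]["Option1"] is True
    assert data["Samples"] == ["first", "second"]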
+
+
+Full example
+------------
The following showcases basic structure of a YAML document, as used by the pipelines::
# This is a comment; this line is completely ignored
This is a section:
This is a subsection:
- # This subsection contains 4 key / value pairs:
- First key: "First value"
- Second key: 2
- Third key: 3.4
- # The following key has no value assosiated!
- Fourth key:
+ # This subsection contains 3 label / value pairs:
+ First label: "First value"
+ Second label: 2
+ Third label: 3.
+
+ This is just another label: "Value!"
This is a section containing a list:
- The first item
@@ -24,5 +173,4 @@ The following showcases basic structure of a YAML document, as used by the pipel
-.. _Python: http://www.python.org/
.. _YAML: http://www.yaml.org
diff --git a/docs/zonkey_pipeline/index.rst b/docs/zonkey_pipeline/index.rst
index 0c55fe7..58ab68a 100644
--- a/docs/zonkey_pipeline/index.rst
+++ b/docs/zonkey_pipeline/index.rst
@@ -14,6 +14,13 @@ Zonkey Pipeline
panel.rst
filestructure.rst
-The Zonkey Pipeline is a easy-to-use pipeline designed for the analyses of low-coverage, ancient DNA derived from historical equid samples, with the purpose of determining the species of the sample, as well as determining possible hybridization between horses, zebras, and asses (see :ref:`zonkey_usage`).
+The Zonkey Pipeline is an easy-to-use pipeline designed for the analysis of low-coverage, ancient DNA derived from historical equid samples, with the purpose of determining the species of the sample, as well as determining possible hybridization between horses, zebras, and asses (see :ref:`zonkey_usage`). This is accomplished by comparing one or more samples aligned against the *Equus caballus* 2.0 reference sequence with a reference panel of modern equids, including wild and domesticated [...]
-This is accomplished by comparing one or more samples aligned against the *Equus caballus* 2.0 reference sequence with a reference panel of modern equids, including wild and domesticated equids. The reference panel is further described in the :ref:`zonkey_panel` section.
\ No newline at end of file
+The Zonkey pipeline has been published in Journal of Archaeological Science; if you make use of this pipeline in your work, then please cite
+
+ Schubert M, Mashkour M, Gaunitz C, Fages A, Seguin-Orlando A, Sheikhi S, Alfarhan AH, Alquraishi SA, Al-Rasheid KAS, Chuang R, Ermini L, Gamba C, Weinstock J, Vedat O, and Orlando L. "**Zonkey: A simple, accurate and sensitive pipeline to genetically identify equine F1-hybrids in archaeological assemblages**". Journal of Archaeological Science. 2017 Feb; 78:147-157. doi: `10.1016/j.jas.2016.12.005 <http://dx.doi.org/10.1016/j.jas.2016.12.005>`_.
+
+The sequencing data used in the Zonkey publication is available on `ENA`_ under the accession number `PRJEB15037`_.
+
+.. _ENA: https://www.ebi.ac.uk/ena/
+.. _PRJEB15037: https://www.ebi.ac.uk/ena/data/view/PRJEB15037
diff --git a/paleomix/__init__.py b/paleomix/__init__.py
index 005648e..99e299f 100644
--- a/paleomix/__init__.py
+++ b/paleomix/__init__.py
@@ -21,7 +21,7 @@
# SOFTWARE.
#
-__version_info__ = (1, 2, 7)
+__version_info__ = (1, 2, 12)
__version__ = '%i.%i.%i' % __version_info__
diff --git a/paleomix/atomiccmd/builder.py b/paleomix/atomiccmd/builder.py
index ceceaf3..073ac72 100644
--- a/paleomix/atomiccmd/builder.py
+++ b/paleomix/atomiccmd/builder.py
@@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -77,10 +77,10 @@ import paleomix.common.versions as versions
class AtomicCmdBuilderError(RuntimeError):
- pass
+ """Error raised by AtomicCmdBuilder."""
-class AtomicCmdBuilder:
+class AtomicCmdBuilder(object):
"""AtomicCmdBuilder is a class used to allow step-wise construction of an
AtomicCmd object. This allows the user of a Node to modify the behavior
of the called programs using some CLI parameters, without explicit support
@@ -162,6 +162,7 @@ class AtomicCmdBuilder:
"Singleton": False})
def pop_option(self, key):
+ """Remove option with key; raises error if option does not exist."""
old_option = self._get_option_for_editing(key, singleton=None)
if not old_option:
raise KeyError("Option with key %r does not exist" % key)
@@ -176,6 +177,8 @@ class AtomicCmdBuilder:
self._values.append(value)
def set_kwargs(self, **kwargs):
+ """Sets any number of keyword arguments; raises an exception if any of
+ the arguments have already been set."""
if self._object:
message = "Parameters have already been finalized"
raise AtomicCmdBuilderError(message)
@@ -216,6 +219,17 @@ class AtomicCmdBuilder:
self.set_kwargs(**kwargs)
return kwargs
+ def add_multiple_kwargs(self, values, template="IN_FILE_%02i"):
+ """Add multiple keyword arguments using the given values.
+
+ The template determines the key-names used for the arguments,
+ using numbers starting from 1 to differentiate between multiple
+ values.
+ """
+ kwargs = dict(self._get_new_kwarg_keys(values, template))
+ self.set_kwargs(**kwargs)
+ return kwargs
+
@property
def call(self):
"""Returns the system-call based on the call passed to the constructor,
@@ -279,7 +293,7 @@ class AtomicCmdBuilder:
raise KeyError("Key cannot be an empty string")
for option in reversed(self._options):
- if (option["Key"] == key):
+ if option["Key"] == key:
if (singleton is not None) \
and (option["Singleton"] != singleton):
message = "Mixing singleton and non-singleton options: %r"
@@ -300,11 +314,10 @@ class AtomicCmdBuilder:
class AtomicJavaCmdBuilder(AtomicCmdBuilder):
"""AtomicCmdBuilder for running java JARs.
- The resulting command will run the JAR in head-less mode, in order to ensure
- that the JARs can be run on head-less servers (and to avoid popups on OSX),
- using the process-specific temp-folder, and using at most a single thread
- for garbage collection (to ensure that thread-limits are obeyed).
-
+ The resulting command will run the JAR in head-less mode, in order to
+ ensure that the JARs can be run on head-less servers (and to avoid popups
+ on OSX), using the process-specific temp-folder, and using at most a single
+ thread for garbage collection (to ensure that thread-limits are obeyed).
"""
def __init__(self, jar, jre_options=(), temp_root="%(TEMP_DIR)s",
@@ -387,6 +400,8 @@ class AtomicJavaCmdBuilder(AtomicCmdBuilder):
checks=versions.GE(*version),
priority=10)
return JAVA_VERSIONS[version]
+
+
JAVA_VERSIONS = {}
@@ -400,22 +415,25 @@ class AtomicMPICmdBuilder(AtomicCmdBuilder):
"""
- def __init__(self, call, threads = 1, **kwargs):
+ def __init__(self, call, threads=1, **kwargs):
if not isinstance(threads, (types.IntType, types.LongType)):
- raise TypeError("'threads' must be an integer value, not %r" % threads.__class__.__name__)
+ raise TypeError("'threads' must be an integer value, not %r"
+ % threads.__class__.__name__)
elif threads < 1:
- raise ValueError("'threads' must be 1 or greater, not %i" % threads)
+ raise ValueError("'threads' must be 1 or greater, not %i"
+ % threads)
elif threads == 1:
- AtomicCmdBuilder.__init__(self, call, EXEC_MPI = "mpirun", **kwargs)
+ AtomicCmdBuilder.__init__(self, call, EXEC_MPI="mpirun", **kwargs)
else:
call = safe_coerce_to_tuple(call)
mpi_call = ["mpirun", "-n", threads]
mpi_call.extend(call)
- AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN = call[0], **kwargs)
+ AtomicCmdBuilder.__init__(
+ self, mpi_call, EXEC_MAIN=call[0], **kwargs)
-def use_customizable_cli_parameters(init_func): # pylint: disable=C0103
+def use_customizable_cli_parameters(init_func): # pylint: disable=C0103
"""Decorator for __init__ functions, implementing the customizable Node
interface: Allows a node to be implemented either using default behavior:
>>> node = SomeNode(value1 = ..., value2 = ...)
@@ -434,7 +452,8 @@ def use_customizable_cli_parameters(init_func): # pylint: disable=C0103
raise ValueError("Function name must be '__init__', not %r"
% (init_func.func_name,))
- def do_call(self, parameters = None, **kwargs):
+ def do_call(self, parameters=None, **kwargs):
+ """Call to invoke the decorated __init__ function."""
if not parameters:
parameters = self.customize(**kwargs)
@@ -443,7 +462,7 @@ def use_customizable_cli_parameters(init_func): # pylint: disable=C0103
return do_call
-def create_customizable_cli_parameters(customize_func): # pylint: disable=C0103
+def create_customizable_cli_parameters(customize_func): # pylint: disable=C0103
"""Decorator complementing the 'use_customizable_cli_parameters' decorator
defined above, which should be used on a function named 'customize'; this
function is made a classmethod.
@@ -489,7 +508,7 @@ def create_customizable_cli_parameters(customize_func): # pylint: disable=C0103
return classmethod(do_call)
-def apply_options(builder, options, pred = lambda s: s.startswith("-")):
+def apply_options(builder, options, pred=lambda s: s.startswith("-")):
"""Applies a dictionary of options to a builder. By default, only
options where the key start with "-" are used (determined by 'pred').
The following rules are used when applying options:
@@ -502,15 +521,19 @@ def apply_options(builder, options, pred = lambda s: s.startswith("-")):
"""
for (key, values) in dict(options).iteritems():
if not isinstance(key, types.StringTypes):
- raise TypeError("Keys must be strings, not %r" % (key.__class__.__name__,))
+ raise TypeError("Keys must be strings, not %r" %
+ (key.__class__.__name__,))
elif pred(key):
if isinstance(values, (types.ListType, types.TupleType)):
for value in values:
- if not isinstance(value, _ADDABLE_TYPES) or isinstance(value, _SETABLE_ONLY_TYPES):
- raise TypeError("Unexpected type when adding options: %r" % (value.__class__.__name__,))
+ if not isinstance(value, _ADDABLE_TYPES) \
+ or isinstance(value, _SETABLE_ONLY_TYPES):
+ raise TypeError("Unexpected type when in options: %r"
+ % (value.__class__.__name__,))
builder.add_option(key, value)
elif not isinstance(values, _SETABLE_TYPES):
- raise TypeError("Unexpected type when setting option: %r" % (values.__class__.__name__,))
+ raise TypeError("Unexpected type when setting option: %r" % (
+ values.__class__.__name__,))
elif isinstance(values, (types.BooleanType, types.NoneType)):
if values or values is None:
builder.set_option(key)
@@ -521,21 +544,23 @@ def apply_options(builder, options, pred = lambda s: s.startswith("-")):
_create_cli_parameters_cls_cache = {}
+
+
def _create_cli_parameters_cls(cls, kwargs):
- key = (cls, frozenset(kwargs))
+ key = (cls, frozenset(kwargs))
clsobj = _create_cli_parameters_cls_cache.get(key)
if not clsobj:
_create_cli_parameters_cls_cache[key] = clsobj = \
- collections.namedtuple("CustomCLIParams", " ".join(kwargs))
+ collections.namedtuple("CustomCLIParams", " ".join(kwargs))
- class _ParametersWrapper(clsobj): # pylint: disable=W0232
+ class _ParametersWrapper(clsobj): # pylint: disable=W0232
def build_node(self):
return cls(self)
return _ParametersWrapper(**kwargs)
-
-_ADDABLE_TYPES = (types.FloatType, types.IntType, types.LongType) + types.StringTypes
+_ADDABLE_TYPES = (types.FloatType, types.IntType,
+ types.LongType) + types.StringTypes
_SETABLE_ONLY_TYPES = (types.BooleanType, types.NoneType)
_SETABLE_TYPES = _ADDABLE_TYPES + _SETABLE_ONLY_TYPES
diff --git a/paleomix/atomiccmd/command.py b/paleomix/atomiccmd/command.py
index 8148251..d34c14a 100644
--- a/paleomix/atomiccmd/command.py
+++ b/paleomix/atomiccmd/command.py
@@ -21,7 +21,6 @@
# SOFTWARE.
#
import collections
-import errno
import os
import re
import signal
@@ -50,6 +49,8 @@ _FILE_MAP = {"IN": "input",
class CmdError(RuntimeError):
+ """Exception raised for AtomicCmd specific errors."""
+
def __init__(self, msg):
RuntimeError.__init__(self, msg)
@@ -144,6 +145,8 @@ class AtomicCmd(object):
self._running = False
self._command = map(str, safe_coerce_to_tuple(command))
self._set_cwd = set_cwd
+ self._terminated = False
+
if not self._command or not self._command[0]:
raise ValueError("Empty command in AtomicCmd constructor")
@@ -237,6 +240,7 @@ class AtomicCmd(object):
if self._proc and self._proc.poll() is None:
try:
os.killpg(self._proc.pid, signal.SIGTERM)
+ self._terminated = True
except OSError:
pass # Already dead / finished process
@@ -279,7 +283,7 @@ class AtomicCmd(object):
committed_files.add(self._files[key])
elif key.startswith("TEMP_OUT_"):
fileutils.try_remove(filename)
- except:
+ except StandardError:
# Cleanup after failed commit
for fpath in committed_files:
fileutils.try_remove(fpath)
diff --git a/paleomix/atomiccmd/pprint.py b/paleomix/atomiccmd/pprint.py
index 94e624e..958f834 100644
--- a/paleomix/atomiccmd/pprint.py
+++ b/paleomix/atomiccmd/pprint.py
@@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -20,12 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
-# pylint: disable=W0212
-
+# pylint: disable=protected-access
+#
from __future__ import print_function
import os
-import sys
+import pipes
import types
import subprocess
@@ -33,38 +33,41 @@ import subprocess
def _is_cls(obj, *cls_names):
return obj.__class__.__name__ in cls_names
+
def _get_pipe_name(files, pipe):
if pipe in files:
return pipe.split("_")[-1] + " "
return pipe.split("_")[-1] + "*"
+
def _get_pipe_file(files, pipe):
pipe_filename = files.get(pipe)
if pipe_filename:
return pipe_filename
return files.get("TEMP_%s" % (pipe,))
+
def _describe_cls(atomiccmd):
if _is_cls(atomiccmd, "ParallelCmds"):
- return "Parallel commands"
+ return "Parallel processes"
elif _is_cls(atomiccmd, "SequentialCmds"):
- return "Sequential commands"
- assert False # pragma: no coverage
+ return "Sequential processes"
+ assert False # pragma: no coverage
def _collect_stats(atomiccmd, stats):
assert atomiccmd not in stats["id"]
if _is_cls(atomiccmd, "AtomicCmd"):
- stats["id"][atomiccmd] = len(stats["id"])
- pipe = _get_pipe_file(atomiccmd._files, "IN_STDIN")
+ stats["id"][atomiccmd] = len(stats["id"]) + 1
+ pipe = _get_pipe_file(atomiccmd._files, "IN_STDIN")
if _is_cls(pipe, "AtomicCmd"):
stats["pipe"][pipe] = atomiccmd
elif _is_cls(atomiccmd, "ParallelCmds", "SequentialCmds"):
for subcmd in atomiccmd._commands:
_collect_stats(subcmd, stats)
else:
- assert False # pragma: no coverage
+ assert False # pragma: no coverage
return stats
@@ -74,20 +77,24 @@ def _build_status(atomiccmd, _stats, indent, lines):
if atomiccmd._proc:
if atomiccmd.ready():
return_code = tuple(atomiccmd.join())
- if isinstance(return_code[0], types.StringTypes):
- lines.append(prefix + "Terminated with signal %s" % return_code)
+ if atomiccmd._terminated:
+ lines.append(prefix + "Automatically terminated by PALEOMIX")
+ elif isinstance(return_code[0], types.StringTypes):
+ lines.append(prefix + "Terminated with signal %s"
+ % return_code)
else:
- lines.append(prefix + "Exited with return-code %i" % return_code)
+ lines.append(prefix + "Exited with return-code %i"
+ % return_code)
else:
lines.append(prefix + "Running ...")
def _build_stdin(atomiccmd, files, stats, indent, lines):
pipe_name = _get_pipe_name(files, "IN_STDIN")
- pipe = _get_pipe_file(files, "IN_STDIN")
+ pipe = _get_pipe_file(files, "IN_STDIN")
prefix = "%s%s = " % (" " * indent, pipe_name)
if pipe and pipe in stats["id"]:
- lines.append("%s<%02i>" % (prefix, stats["id"][pipe],))
+ lines.append("%sPiped from process %i" % (prefix, stats["id"][pipe],))
elif isinstance(pipe, types.StringTypes):
if atomiccmd._set_cwd and (pipe_name == "STDIN*"):
pipe = os.path.basename(pipe)
@@ -102,7 +109,7 @@ def _build_out_pipe(atomiccmd, files, stats, indent, lines, pipe):
if (atomiccmd in stats["pipe"]) and (pipe == "OUT_STDOUT"):
pipe = stats["pipe"].get(atomiccmd)
- lines.append("%s<%02i>" % (prefix, stats["id"][pipe],))
+ lines.append("%sPiped to process %i" % (prefix, stats["id"][pipe],))
return
filename = _get_pipe_file(files, pipe)
@@ -123,13 +130,14 @@ def _build_cwd(atomiccmd, indent, lines):
lines.append("%s'%s'" % (prefix, "${TEMP_DIR}"))
-def _pformat(atomiccmd, stats, indent, lines, include_prefix = True):
- s_prefix = ""
+def _pformat(atomiccmd, stats, indent, lines, include_prefix=True):
+ s_prefix = ""
if include_prefix:
- s_prefix = " " * indent + "- "
+ s_prefix = " " * indent
if _is_cls(atomiccmd, "AtomicCmd"):
cmd_id = stats["id"][atomiccmd]
- s_prefix += "<%02i> " % (cmd_id,)
+ lines.append(s_prefix + "Process %i:" % (cmd_id,))
+ s_prefix += " "
s_prefix_len = len(s_prefix)
if _is_cls(atomiccmd, "AtomicCmd"):
@@ -138,61 +146,61 @@ def _pformat(atomiccmd, stats, indent, lines, include_prefix = True):
c_prefix = s_prefix + "Command = "
for line in _pformat_list(atomiccmd._generate_call(temp)).split("\n"):
+ if atomiccmd._temp is not None:
+ line = line.replace('${TEMP_DIR}', atomiccmd._temp)
+
lines.append("%s%s" % (c_prefix, line))
c_prefix = " " * len(c_prefix)
- if not s_prefix_len:
- s_prefix_len += 1
-
- _build_status(atomiccmd, stats, s_prefix_len, lines)
- _build_stdin(atomiccmd, files, stats, s_prefix_len, lines)
- _build_out_pipe(atomiccmd, files, stats, s_prefix_len, lines, "OUT_STDOUT")
- _build_out_pipe(atomiccmd, files, stats, s_prefix_len, lines, "OUT_STDERR")
- _build_cwd(atomiccmd, s_prefix_len, lines)
+ _build_status(atomiccmd, stats, s_prefix_len, lines)
+ _build_stdin(atomiccmd, files, stats, s_prefix_len, lines)
+ _build_out_pipe(atomiccmd, files, stats,
+ s_prefix_len, lines, "OUT_STDOUT")
+ _build_out_pipe(atomiccmd, files, stats,
+ s_prefix_len, lines, "OUT_STDERR")
+ _build_cwd(atomiccmd, s_prefix_len, lines)
elif _is_cls(atomiccmd, "ParallelCmds", "SequentialCmds"):
lines.append("%s%s:" % (s_prefix, _describe_cls(atomiccmd)))
- for subcmd in atomiccmd._commands:
+ for subcmd_idx, subcmd in enumerate(atomiccmd._commands):
+ if subcmd_idx:
+ lines.append("")
+
_pformat(subcmd, stats, s_prefix_len + 2, lines)
else:
- assert False # pragma: no coverage
+ assert False # pragma: no coverage
-def _pformat_list(lst, width = 80):
+def _pformat_list(lst, width=80):
"""Return a printable representation of a list, where line-breaks
are inserted between items to minimize the number of lines with a
width greater than 'width'. Very long items may cause this maximum
to be exceeded."""
result = [[]]
current_width = 0
- for item in map(repr, lst):
- if current_width + len(item) + 2 > width:
+ for item in (pipes.quote(str(value)) for value in lst):
+ if current_width + len(item) + 1 > width:
if not result[-1]:
result[-1] = [item]
- current_width = len(item) + 2
else:
result.append([item])
- current_width = len(item) + 2
+
+ current_width = len(item) + 1
else:
result[-1].append(item)
- current_width += len(item) + 2
+ current_width += len(item) + 1
- return "[%s]" % (",\n ".join(", ".join(line) for line in result))
+ return " \\\n ".join(" ".join(line) for line in result)
def pformat(atomiccmd):
"""Returns a human readable description of an Atomic Cmd or Atomic Set
of commands. This is currently equivalent to str(cmd_obj)."""
if not _is_cls(atomiccmd, "AtomicCmd", "ParallelCmds", "SequentialCmds"):
- raise TypeError("Invalid type in pformat: %r" % atomiccmd.__class__.__name__)
+ raise TypeError("Invalid type in pformat: %r" %
+ atomiccmd.__class__.__name__)
lines = []
- stats = _collect_stats(atomiccmd, {"id" : {}, "pipe" : {}})
+ stats = _collect_stats(atomiccmd, {"id": {}, "pipe": {}})
_pformat(atomiccmd, stats, 0, lines, False)
- return "<%s>" % "\n".join(lines)
-
-
-def pprint(atomiccmd, out = sys.stdout):
- """Prints a human readable description of an Atomic Cmd or Atomic Set
- of commands. This is currently equivalent to print(str(cmd_obj), ...)."""
- print(pformat(atomiccmd), file = out)
+ return "\n".join(lines)
diff --git a/paleomix/common/formats/fasta.py b/paleomix/common/formats/fasta.py
index ba8d91f..ff202d5 100644
--- a/paleomix/common/formats/fasta.py
+++ b/paleomix/common/formats/fasta.py
@@ -27,10 +27,10 @@ import types
import pysam
from paleomix.common.utilities import \
- fragment, \
- split_before, \
- Immutable, \
- TotallyOrdered
+ fragment, \
+ split_before, \
+ Immutable, \
+ TotallyOrdered
from paleomix.common.fileutils import open_ro
from paleomix.common.formats._common import FormatError
@@ -103,12 +103,14 @@ class FASTA(TotallyOrdered, Immutable):
"""
fai_filename = filename + ".fai"
if not os.path.exists(fai_filename):
- if not os.access(os.path.dirname(filename), os.W_OK):
+ dirname = os.path.dirname(filename) or "."
+
+ if not os.access(dirname, os.W_OK):
message = \
- "FASTA index is missing, but folder is\n" \
+ "FASTA index is missing, but folder is " \
"not writable, so it cannot be created:\n" \
" Filename = %s\n\n" \
- "Either change permissions on the folder, or move\n" \
+ "Either change permissions on the folder, or move " \
"the FASTA file to different location." % (filename,)
raise FASTAError(message)
diff --git a/paleomix/common/makefile.py b/paleomix/common/makefile.py
index 5b66dec..ef047da 100644
--- a/paleomix/common/makefile.py
+++ b/paleomix/common/makefile.py
@@ -486,6 +486,8 @@ def _create_set_operator(operator_func, description):
"""Operator function for set based operations."""
if not isinstance(lvalue, (types.ListType,) + types.StringTypes):
return False
+ elif not _are_keys_hashable(lvalue):
+ return False
return bool(operator_func(frozenset(lvalue), rvalue))
@@ -613,6 +615,9 @@ class StringIn(_BinaryOperator):
@classmethod
def _string_in_operator(cls, lvalue, rvalues):
"""Implements case-insensitive 'in' operator."""
+ if not _is_hashable(lvalue):
+ return False
+
return _safe_coerce_to_lowercase(lvalue) in rvalues
@@ -627,6 +632,8 @@ class _StrSetOperator(_BinaryOperator):
def meets_spec(self, value):
if not isinstance(value, (types.ListType,) + types.StringTypes):
return False
+ elif not _are_keys_hashable(value):
+ return False
lvalues = frozenset(map(_safe_coerce_to_lowercase, value))
return _BinaryOperator.meets_spec(self, lvalues)
@@ -778,6 +785,19 @@ class IsDictOf(MakefileSpec):
###############################################################################
# Helper functions
+
+def _are_keys_hashable(value):
+ return all(_is_hashable(key) for key in value)
+
+
+def _is_hashable(value):
+ try:
+ hash(value)
+ return True
+ except TypeError:
+ return False
+
+
def _is_spec(spec):
"""Returns true if 'spec' is a specification instance or class."""
if isinstance(spec, MakefileSpec):
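The hashability guard above exists because frozenset() must hash every element; a quick illustration of the helper:

    def _is_hashable(value):
        try:
            hash(value)
            return True
        except TypeError:
            return False

    assert _is_hashable("yes")
    assert not _is_hashable({"a": "dict"})  # now a MakefileError, not a TypeError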
diff --git a/paleomix/common/testing.py b/paleomix/common/testing.py
index b23791a..5b5ecfc 100644
--- a/paleomix/common/testing.py
+++ b/paleomix/common/testing.py
@@ -21,6 +21,7 @@
# SOFTWARE.
#
import os
+import pwd
import sys
import shutil
import tempfile
@@ -46,8 +47,9 @@ def with_temp_folder(func):
Creates a unique temporary folder before running 'func'. The
function is assumed to take at least one parameter, the first
of which is assumed to represent the temporary folder."""
- temp_root = os.path.join(tempfile.gettempdir(), os.getlogin())
- make_dirs(temp_root) # Ensure that this subdirectory exists
+ name = pwd.getpwuid(os.geteuid()).pw_name
+ temp_root = os.path.join(tempfile.gettempdir(), name)
+ make_dirs(temp_root)
@nose.tools.istest
def _wrapper(*args, **kwargs):
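The switch away from os.getlogin() matters because that call requires a controlling terminal and may raise OSError under cron or CI environments; deriving the name from the effective UID works everywhere:

    import os
    import pwd

    # Login name for the effective UID; unlike os.getlogin(), this does
    # not depend on having a controlling terminal.
    name = pwd.getpwuid(os.geteuid()).pw_name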
diff --git a/paleomix/common/vcffilter.py b/paleomix/common/vcffilter.py
index 51a25d8..57e4dad 100644
--- a/paleomix/common/vcffilter.py
+++ b/paleomix/common/vcffilter.py
@@ -95,7 +95,8 @@ def describe_filters(options):
return {
"HET": "Heterozygous SNPs observed on homozygous chromosome (e.g. chrX)",
"q:%i" % options.min_quality: "Minimum Phred score recorded in the QUAL column",
- "f:%.4f" % options.min_allele_frequency: "Minimum frequency of the alleles at heterozygous sites",
+ "f:%.4f" % options.min_allele_frequency: "Minimum frequency of the alleles at heterozygous sites",
+ "F:%.4f" % options.min_allele_frequency: "Heterozygous sites for which the frequency was not determined.",
"k": "SNPs without a most likely genotype (based on PL)",
"Q:%i" % options.min_mapping_quality: "Minimum RMS mapping quality",
"d:%i" % options.min_read_depth: "Minimum read depth",
diff --git a/paleomix/common/vcfwrap.py b/paleomix/common/vcfwrap.py
index a7fb417..13d71eb 100644
--- a/paleomix/common/vcfwrap.py
+++ b/paleomix/common/vcfwrap.py
@@ -123,17 +123,24 @@ def get_ml_genotype(vcf, sample=0):
PL = map(int, get_format(vcf, sample)["PL"].split(","))
- expected_length = (len(genotypes) * (len(genotypes) + 1)) // 2
- if len(PL) != expected_length:
- raise ValueError("Expected %i PL values, found %i"
+ if len(PL) == len(genotypes):
+ ploidy = 1
+ else:
+ expected_length = (len(genotypes) * (len(genotypes) + 1)) // 2
+ if len(PL) != expected_length:
+ raise ValueError("Expected %i PL values, found %i"
% (expected_length, len(PL)))
+ ploidy = 2
if PL.count(min(PL)) > 1:
# No single most likely genotype
return ("N", "N")
most_likely = min(xrange(len(PL)), key=PL.__getitem__)
- prefix, postfix = _genotype_indices[most_likely]
+ if ploidy == 1:
+ prefix = postfix = most_likely
+ else:
+ prefix, postfix = _genotype_indices[most_likely]
return (genotypes[prefix], genotypes[postfix])
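The branch keys off the number of PL values: for a site with n alleles, haploid calls carry n likelihoods, while diploid calls carry n * (n + 1) / 2. A small illustrative check (hypothetical helper, not part of the module):

    def infer_ploidy(num_alleles, num_pl):
        if num_pl == num_alleles:
            return 1  # haploid: one PL value per allele
        if num_pl == num_alleles * (num_alleles + 1) // 2:
            return 2  # diploid: one PL value per unordered allele pair
        raise ValueError("unexpected number of PL values")

    assert infer_ploidy(2, 2) == 1  # REF plus one ALT, haploid
    assert infer_ploidy(2, 3) == 2  # the same site, diploid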
diff --git a/paleomix/common/versions.py b/paleomix/common/versions.py
index 5ac3948..5d28bf9 100644
--- a/paleomix/common/versions.py
+++ b/paleomix/common/versions.py
@@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
-# pylint: disable=W0223
+# pylint: disable=too-few-public-methods
#
"""Version checks for apps or libraries required by PALEOMIX pipelines.
@@ -45,17 +45,14 @@ For example, to check that the Java version is v1.7 or later:
except VersionRequirementError:
pass # requirements not met, or failure to determine version
"""
-import re
-import operator
import collections
+import operator
+import re
from paleomix.common.utilities import \
- Immutable, \
TotallyOrdered, \
safe_coerce_to_tuple, \
try_cast
-from paleomix.common.fileutils import \
- which_executable
import paleomix.common.procs as procs
@@ -66,7 +63,7 @@ _CALL_CACHE = {}
_REQUIREMENT_CACHE = {}
-class VersionRequirementError(StandardError):
+class VersionRequirementError(Exception):
"""Raised if version requirements are not met, or if a version could not be
determined for a requirement check.
"""
@@ -141,7 +138,11 @@ class RequirementObj(object):
describing the cause of the problem.
"""
if self._version is None:
- output = _do_call(self._call)
+ try:
+ output = _do_call(self._call)
+ except OSError as error:
+ self._raise_failure(error)
+
# Raise an exception if the JRE is outdated, even if the
# version could be determined (likely a false positive match).
self._check_for_outdated_jre(output)
@@ -198,8 +199,11 @@ class RequirementObj(object):
lines.extend(self._describe_call())
lines.append("")
- # Raised if the JRE is too old compared to the JAR
- if "UnsupportedClassVersionError" in output:
+ if isinstance(output, OSError):
+ lines.append("Exception was raised:")
+ lines.append(" %s: %s" % (output.__class__.__name__, output))
+ elif "UnsupportedClassVersionError" in output:
+ # Raised if the JRE is too old compared to the JAR
lines.extend([
"The version of the Java Runtime Environment on this",
"system is too old; please check the the requirement",
@@ -208,29 +212,25 @@ class RequirementObj(object):
"See the documentation for more information.",
])
else:
- lines.append("Program may be broken or a version not supported by the")
- lines.append("pipeline; please refer to the PALEOMIX documentation.\n")
+ lines.append(
+ "Program may be broken or a version not supported by the")
+ lines.append(
+ "pipeline; please refer to the PALEOMIX documentation.\n")
lines.append(" Required: %s" % (self.checks,))
- lines.append(" Search string: %r\n" % (self._rege.pattern))
+ lines.append(" Search string: %s\n" % (self._rege.pattern))
lines.append("%s Command output %s" % ("-" * 22, "-" * 22))
lines.append(output)
raise VersionRequirementError("\n".join(lines))
def _describe_call(self):
- """Yields string describing the current system call, if any.
- """
- if self.executable:
- exec_path = which_executable(self.executable) or self.executable
- yield " Executable: %s" % (exec_path,)
-
+ """Returns lines describing the current system call, if any."""
if not isinstance(self._call[0], collections.Callable):
- yield " Call: %s" % (" ".join(self._call),)
+ yield "Attempted to run command:"
+ yield " $ %s" % (" ".join(self._call),)
-class Check(Immutable, TotallyOrdered):
- # Ignore "missing" members; required due to use of Immutable
- # pylint: disable=E1101
+class Check(TotallyOrdered):
"""Abstract base-class for version checks.
Callable with a tuple of version fields (typically integers), and returns
@@ -244,15 +244,14 @@ class Check(Immutable, TotallyOrdered):
"""
def __init__(self, description, func, *values):
- if not callable(func):
+ if not isinstance(func, collections.Callable):
raise TypeError('func must be callable, not %r' % (func,))
values = tuple(values)
- Immutable.__init__(self,
- _func=func,
- _values=values,
- _description=description,
- _objs=(description, func, values))
+ self._func = func
+ self._values = values
+ self._description = description
+ self._objs = (description, func, values)
def __str__(self):
return self._description
@@ -397,7 +396,8 @@ def _func_or(current, checks):
def _run(call):
"""Carries out a system call and returns STDOUT and STDERR as a combined
string. If an OSError is raised (e.g. due to missing executables), the
- resulting message is returned as a string.
+ resulting exception is returned as a value.
"""
try:
proc = procs.open_proc(call,
@@ -406,23 +406,28 @@ def _run(call):
stderr=procs.STDOUT)
return proc.communicate()[0]
- except (OSError, procs.CalledProcessError), error:
- return str(error)
+ except OSError as error:
+ return error
def _do_call(call):
"""Performs a call; the result is cached, and returned upon subsequent
- calls with the same signature (either a function call or system call).
+ calls with the same signature (either a function call or system call). If
+ the call raised an OSError, then the cached exception is re-raised.
"""
try:
- return _CALL_CACHE[call]
+ result = _CALL_CACHE[call]
except KeyError:
- if callable(call[0]):
+ if isinstance(call[0], collections.Callable):
result = call[0](*call[1:])
else:
result = _run(call)
_CALL_CACHE[call] = result
- return result
+
+ if isinstance(result, OSError):
+ raise result
+
+ return result
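The caching pattern above, storing the failure and re-raising it for every caller, can be sketched in isolation (hypothetical names):

    _CACHE = {}

    def cached_call(key, func):
        # Probe a possibly broken executable only once, but surface the
        # same OSError to every caller that repeats the check.
        try:
            result = _CACHE[key]
        except KeyError:
            try:
                result = func()
            except OSError as error:
                result = error
            _CACHE[key] = result

        if isinstance(result, OSError):
            raise result
        return result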
def _pprint_version(value):
diff --git a/paleomix/main.py b/paleomix/main.py
index 2414f23..1685f2e 100755
--- a/paleomix/main.py
+++ b/paleomix/main.py
@@ -53,6 +53,10 @@ def _commands():
yield ("metabit", "paleomix.tools.metabit.metabit", None)
yield ("BAM/SAM tools", None, None)
+ yield ("dupcheck", "paleomix.tools.dupcheck",
+ "Identifies potential duplicate data in sorted BAM files, defined"
+ "as reads aligned to the same position, with the same name, "
+ "sequence, and qualities.")
yield ("cleanup", "paleomix.tools.cleanup",
"Reads SAM file from STDIN, and outputs sorted, tagged, and filter "
"BAM, for which NM and MD tags have been updated.")
diff --git a/paleomix/node.py b/paleomix/node.py
index ccd6334..f1d6b6c 100644
--- a/paleomix/node.py
+++ b/paleomix/node.py
@@ -28,7 +28,7 @@ import types
import paleomix.common.fileutils as fileutils
from paleomix.common.utilities import \
- safe_coerce_to_frozenset
+ safe_coerce_to_frozenset
from paleomix.atomiccmd.command import \
CmdError
@@ -51,24 +51,24 @@ class NodeUnhandledException(NodeError):
class Node(object):
- def __init__(self, description = None, threads = 1,
- input_files = (), output_files = (),
- executables = (), auxiliary_files = (),
- requirements = (), dependencies = ()):
+ def __init__(self, description=None, threads=1,
+ input_files=(), output_files=(),
+ executables=(), auxiliary_files=(),
+ requirements=(), dependencies=()):
if not isinstance(description, _DESC_TYPES):
- raise TypeError("'description' must be None or a string, not %r" \
+ raise TypeError("'description' must be None or a string, not %r"
% (description.__class__.__name__,))
- self.__description = description
- self.input_files = self._validate_files(input_files)
- self.output_files = self._validate_files(output_files)
- self.executables = self._validate_files(executables)
+ self.__description = description
+ self.input_files = self._validate_files(input_files)
+ self.output_files = self._validate_files(output_files)
+ self.executables = self._validate_files(executables)
self.auxiliary_files = self._validate_files(auxiliary_files)
- self.requirements = self._validate_requirements(requirements)
+ self.requirements = self._validate_requirements(requirements)
- self.threads = self._validate_nthreads(threads)
- self.dependencies = self._collect_nodes(dependencies)
+ self.threads = self._validate_nthreads(threads)
+ self.dependencies = self._collect_nodes(dependencies)
# If there are no input files, the node cannot be re-run based on
# changes to the input, and nodes with output but no input are not
@@ -88,8 +88,10 @@ class Node(object):
NodeUnhandledException, which includes a full backtrace. This is needed
to allow showing these in the main process."""
+ temp = None
+
try:
- temp = None
+ # Generate directory name and create dir at temp_root
temp = self._create_temp_dir(config)
self._setup(config, temp)
@@ -98,11 +100,12 @@ class Node(object):
self._remove_temp_dir(temp)
except NodeError, error:
self._write_error_log(temp, error)
- raise NodeError("Error(s) running Node:\n\tTemporary directory: %s\n\n%s" \
+ raise NodeError("Error(s) running Node:\n\tTemporary directory: %s\n\n%s"
% (repr(temp), error))
+
except Exception, error:
self._write_error_log(temp, error)
- raise NodeUnhandledException("Error(s) running Node:\n\tTemporary directory: %s\n\n%s" \
+ raise NodeUnhandledException("Error(s) running Node:\n\tTemporary directory: %s\n\n%s"
% (repr(temp), traceback.format_exc()))
def _create_temp_dir(self, config):
@@ -122,7 +125,9 @@ class Node(object):
function. Checks that required input files exist, and raises a NodeError if
this is not the case."""
if fileutils.missing_executables(self.executables):
- raise NodeError("Executable(s) does not exist for node: %s" % (self,))
+ raise NodeError("Executable(s) does not exist for node: %s"
+ % (self,))
+
self._check_for_missing_files(self.input_files, "input")
self._check_for_missing_files(self.auxiliary_files, "auxiliary")
@@ -145,24 +150,26 @@ class Node(object):
otherwise greatly inflate the amount of information that needs to be
pickled."""
obj_dict = self.__dict__.copy()
- obj_dict["requirements"] = None
- obj_dict["dependencies"] = None
+ obj_dict["requirements"] = ()
+ obj_dict["dependencies"] = ()
return obj_dict
def _write_error_log(self, temp, error):
if not (temp and os.path.isdir(temp)):
return
- prefix = "\n "
+ def _fmt(values):
+ return "\n ".join(sorted(values))
+
message = ["Command = %r" % (" ".join(sys.argv),),
"CWD = %r" % (os.getcwd(),),
"PATH = %r" % (os.environ.get('PATH', ''),),
"Node = %s" % (str(self),),
"Threads = %i" % (self.threads,),
- "Input files = %s" % (prefix.join(sorted(self.input_files)),),
- "Output files = %s" % (prefix.join(sorted(self.output_files)),),
- "Auxiliary files = %s" % (prefix.join(sorted(self.auxiliary_files)),),
- "Executables = %s" % (prefix.join(sorted(self.executables)),),
+ "Input files = %s" % (_fmt(self.input_files),),
+ "Output files = %s" % (_fmt(self.output_files),),
+ "Auxiliary files = %s" % (_fmt(self.auxiliary_files),),
+ "Executables = %s" % (_fmt(self.executables),),
"",
"Errors =\n%s\n" % (error,)]
message = "\n".join(message)
@@ -171,7 +178,8 @@ class Node(object):
with open(os.path.join(temp, "pipe.errors"), "w") as handle:
handle.write(message)
except OSError, oserror:
- sys.stderr.write("ERROR: Could not write failure log: %s\n" % (oserror,))
+ sys.stderr.write("ERROR: Could not write failure log: %s\n"
+ % (oserror,))
def _collect_nodes(self, nodes):
if nodes is None:
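The __getstate__ hunk above keeps pickled nodes small by dropping the graph edges before serialization; a worker process only needs the node itself, not every upstream requirement and dependency. A toy version of the pattern, assuming the stripped attributes are never used on the receiving side:

    import pickle

    class Task(object):
        def __init__(self, name, dependencies=()):
            self.name = name
            self.dependencies = tuple(dependencies)

        def __getstate__(self):
            # Strip edges so pickling one task does not drag the whole
            # graph along; the parent process keeps the full graph
            state = self.__dict__.copy()
            state["dependencies"] = ()
            return state

    chain = Task("c", [Task("b", [Task("a")])])
    assert pickle.loads(pickle.dumps(chain)).dependencies == ()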
@@ -210,30 +218,33 @@ class Node(object):
files = safe_coerce_to_frozenset(files)
for filename in files:
if not isinstance(filename, types.StringTypes):
- raise TypeError('Files must be strings, not %r' % filename.__class__.__name__)
+ raise TypeError('Files must be strings, not %r'
+ % (filename.__class__.__name__,))
return files
@classmethod
def _validate_nthreads(cls, threads):
if not isinstance(threads, (types.IntType, types.LongType)):
- raise TypeError("'threads' must be a positive integer, not %s" % (type(threads),))
+ raise TypeError("'threads' must be a positive integer, not a %s"
+ % (type(threads),))
elif threads < 1:
- raise ValueError("'threads' must be a positive integer, not %i" % (threads,))
- return int(threads)
+ raise ValueError("'threads' must be a positive integer, not %i"
+ % (threads,))
+ return threads
class CommandNode(Node):
def __init__(self, command, description=None, threads=1,
dependencies=()):
Node.__init__(self,
- description = description,
- input_files = command.input_files,
- output_files = command.output_files,
- auxiliary_files = command.auxiliary_files,
- executables = command.executables,
- requirements = command.requirements,
- threads = threads,
- dependencies = dependencies)
+ description=description,
+ input_files=command.input_files,
+ output_files=command.output_files,
+ auxiliary_files=command.auxiliary_files,
+ executables=command.executables,
+ requirements=command.requirements,
+ threads=threads,
+ dependencies=dependencies)
self._command = command
@@ -244,32 +255,29 @@ class CommandNode(Node):
try:
self._command.run(temp)
except CmdError, error:
- desc = "\n\t".join(str(self._command).split("\n"))
- raise CmdNodeError("%s\n\n%s" % (desc, error))
+ raise CmdNodeError("%s\n\n%s" % (str(self._command), error))
return_codes = self._command.join()
if any(return_codes):
- desc = "\n\t".join(str(self._command).split("\n"))
- raise CmdNodeError(desc)
-
+ raise CmdNodeError(str(self._command))
def _teardown(self, config, temp):
required_files = self._command.expected_temp_files
optional_files = self._command.optional_temp_files
- current_files = set(os.listdir(temp))
+ current_files = set(os.listdir(temp))
missing_files = (required_files - current_files)
if missing_files:
raise CmdNodeError(("Error running Node, required files not created:\n"
- "Temporary directory: %r\n"
- "\tRequired files missing from temporary directory:\n\t - %s") \
+ "Temporary directory: %r\n"
+ "\tRequired files missing from temporary directory:\n\t - %s")
% (temp, "\n\t - ".join(sorted(map(repr, missing_files)))))
extra_files = current_files - (required_files | optional_files)
if extra_files:
raise CmdNodeError("Error running Node, unexpected files created:\n"
"\tTemporary directory: %r\n"
- "\tUnexpected files found in temporary directory:\n\t - %s" \
+ "\tUnexpected files found in temporary directory:\n\t - %s"
% (temp, "\n\t - ".join(sorted(map(repr, extra_files)))))
self._command.commit(temp)
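The _teardown bookkeeping above is plain set arithmetic over the temporary directory: anything required but absent is an error, as is anything present that is neither required nor optional. With made-up file names:

    required = set(["out.bam", "out.bam.bai"])
    optional = set(["runtime.log"])
    current = set(["out.bam", "runtime.log", "core.1234"])

    missing = required - current                   # files not created
    unexpected = current - (required | optional)   # files not declared
    assert missing == set(["out.bam.bai"])
    assert unexpected == set(["core.1234"])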
diff --git a/paleomix/nodes/bowtie2.py b/paleomix/nodes/bowtie2.py
index 0c63414..53c8a67 100644
--- a/paleomix/nodes/bowtie2.py
+++ b/paleomix/nodes/bowtie2.py
@@ -49,32 +49,23 @@ BOWTIE2_VERSION = versions.Requirement(call=("bowtie2", "--version"),
class Bowtie2IndexNode(CommandNode):
- @create_customizable_cli_parameters
- def customize(cls, input_file, prefix=None, dependencies=()):
+ def __init__(self, input_file, prefix=None, dependencies=()):
prefix = prefix if prefix else input_file
- params = _bowtie2_template(("bowtie2-build"), prefix, iotype="OUT",
- IN_FILE=input_file,
- TEMP_OUT_PREFIX=os.path.basename(prefix),
- CHECK_VERSION=BOWTIE2_VERSION)
+ builder = _bowtie2_template(("bowtie2-build"), prefix, iotype="OUT",
+ IN_FILE=input_file,
+ TEMP_OUT_PREFIX=os.path.basename(prefix),
+ CHECK_VERSION=BOWTIE2_VERSION)
- params.add_value("%(IN_FILE)s")
+ builder.add_value("%(IN_FILE)s")
# Destination prefix, in temp folder
- params.add_value("%(TEMP_OUT_PREFIX)s")
+ builder.add_value("%(TEMP_OUT_PREFIX)s")
- return {"prefix": prefix,
- "command": params,
- "dependencies": dependencies}
-
- @use_customizable_cli_parameters
- def __init__(self, parameters):
- command = parameters.command.finalize()
- description = "<Bowtie2 Index '%s' -> '%s.*'>" \
- % (parameters.input_file, parameters.prefix)
+ description = "<Bowtie2 Index '%s' -> '%s.*'>" % (input_file, prefix)
CommandNode.__init__(self,
- command=command,
+ command=builder.finalize(),
description=description,
- dependencies=parameters.dependencies)
+ dependencies=dependencies)
class Bowtie2Node(CommandNode):
@@ -120,7 +111,7 @@ class Bowtie2Node(CommandNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
command = ParallelCmds([parameters.commands[key].finalize()
- for key in parameters.order])
+ for key in parameters.order])
algorithm = "PE" if parameters.input_file_2 else "SE"
description \
diff --git a/paleomix/nodes/bwa.py b/paleomix/nodes/bwa.py
index 4b66d7c..8079739 100644
--- a/paleomix/nodes/bwa.py
+++ b/paleomix/nodes/bwa.py
@@ -52,32 +52,23 @@ BWA_VERSION_07x = versions.Requirement(call=("bwa",),
class BWAIndexNode(CommandNode):
- @create_customizable_cli_parameters
- def customize(cls, input_file, prefix=None, dependencies=()):
+ def __init__(self, input_file, prefix=None, dependencies=()):
prefix = prefix if prefix else input_file
- params = _get_bwa_template(("bwa", "index"), prefix, iotype="OUT",
- IN_FILE=input_file,
- TEMP_OUT_PREFIX=os.path.basename(prefix),
- CHECK_BWA=BWA_VERSION)
+ builder = _get_bwa_template(("bwa", "index"), prefix, iotype="OUT",
+ IN_FILE=input_file,
+ TEMP_OUT_PREFIX=os.path.basename(prefix),
+ CHECK_BWA=BWA_VERSION)
# Input fasta sequence
- params.add_value("%(IN_FILE)s")
+ builder.add_value("%(IN_FILE)s")
# Destination prefix, in temp folder
- params.set_option("-p", "%(TEMP_OUT_PREFIX)s")
-
- return {"prefix": prefix,
- "command": params,
- "dependencies": dependencies}
+ builder.set_option("-p", "%(TEMP_OUT_PREFIX)s")
- @use_customizable_cli_parameters
- def __init__(self, parameters):
- command = parameters.command.finalize()
- description = "<BWA Index '%s' -> '%s.*'>" % (parameters.input_file,
- parameters.prefix)
+ description = "<BWA Index '%s' -> '%s.*'>" % (input_file, prefix)
CommandNode.__init__(self,
- command=command,
+ command=builder.finalize(),
description=description,
- dependencies=parameters.dependencies)
+ dependencies=dependencies)
class BWABacktrack(CommandNode):
@@ -104,7 +95,7 @@ class BWABacktrack(CommandNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
command = ParallelCmds([parameters.commands[key].finalize()
- for key in parameters.order])
+ for key in parameters.order])
description \
= _get_node_description(name="BWA",
@@ -150,7 +141,7 @@ class BWASamse(CommandNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
command = ParallelCmds([parameters.commands[key].finalize()
- for key in parameters.order])
+ for key in parameters.order])
input_file = parameters.input_file_fq
description = _get_node_description(name="BWA Samse",
@@ -205,7 +196,7 @@ class BWASampe(CommandNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
command = ParallelCmds([parameters.commands[key].finalize()
- for key in parameters.order])
+ for key in parameters.order])
input_file_1 = parameters.input_file_fq_1
input_file_2 = parameters.input_file_fq_2
@@ -346,6 +337,8 @@ def _get_max_threads(reference, threads):
if prefix_size is None or prefix_size >= 2 ** 20: # >= 1 MiB
return threads
return 1
+
+
_PREFIX_SIZE_CACHE = {}
@@ -395,8 +388,10 @@ def _check_bwa_prefix(prefix):
" Your copy of BWA may have changed, or you may be using the wrong\n"
" prefix. To resolve this issue, either change your prefix, re-install\n"
" BWA %s, or remove the prefix files at\n"
- " $ ls %s.*" \
+ " $ ls %s.*"
% (".".join(map(str, bwa_version)), expected_version, expected_version, prefix))
+
+
_PREFIXES_CHECKED = set()
diff --git a/paleomix/nodes/commands.py b/paleomix/nodes/commands.py
index 64f5dc9..143f145 100644
--- a/paleomix/nodes/commands.py
+++ b/paleomix/nodes/commands.py
@@ -35,18 +35,19 @@ from paleomix.atomiccmd.command import \
AtomicCmd
from paleomix.atomiccmd.sets import \
ParallelCmds
-from paleomix.common.fileutils import \
- describe_files
from paleomix.nodes.picard import \
- MultiBAMInput, \
MultiBAMInputNode
from paleomix.atomiccmd.builder import \
AtomicCmdBuilder, \
create_customizable_cli_parameters, \
use_customizable_cli_parameters
from paleomix.common.fileutils import \
+ describe_files, \
reroot_path, \
- move_file
+ move_file, \
+ swap_ext
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
from paleomix.nodes.samtools import \
SAMTOOLS_VERSION, \
@@ -66,26 +67,26 @@ class DuplicateHistogramNode(MultiBAMInputNode):
"""
def __init__(self, config, input_files, output_file, dependencies=()):
- bam_input = MultiBAMInput(config, input_files, indexed=False)
- duphist_command = factory.new("duphist")
- duphist_command.add_value('%(TEMP_IN_BAM)s')
- duphist_command.set_kwargs(OUT_STDOUT=output_file)
- bam_input.setup(duphist_command)
- duphist_command = duphist_command.finalize()
+ input_files = safe_coerce_to_tuple(input_files)
- commands = ParallelCmds(bam_input.commands + [duphist_command])
+ builder = factory.new("duphist")
+ builder.add_value('%(TEMP_IN_BAM)s')
+ builder.set_kwargs(OUT_STDOUT=output_file,
+ TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
+ builder.add_multiple_kwargs(input_files)
description = "<DuplicateHistogram: %s -> %r>" \
% (describe_files(input_files), output_file)
MultiBAMInputNode.__init__(self,
- bam_input=bam_input,
- command=commands,
+ config=config,
+ input_bams=input_files,
+ command=builder.finalize(),
description=description,
dependencies=dependencies)
class CoverageNode(CommandNode):
- def __init__(self, config, target_name, input_file, output_file,
+ def __init__(self, target_name, input_file, output_file,
regions_file=None, dependencies=()):
builder = factory.new("coverage")
builder.add_value("%(IN_BAM)s")
@@ -127,31 +128,33 @@ class MergeCoverageNode(Node):
class DepthHistogramNode(MultiBAMInputNode):
def __init__(self, config, target_name, input_files, output_file,
- regions_file=None, dependencies=()):
- bam_input = MultiBAMInput(config, input_files,
- indexed=bool(regions_file))
- if len(bam_input.files) > 1 and regions_file:
- raise ValueError("DepthHistogram for regions require single, "
- "indexed input BAM file.")
+ prefix, regions_file=None, dependencies=()):
+ input_files = safe_coerce_to_tuple(input_files)
+ index_format = regions_file and prefix['IndexFormat']
builder = factory.new("depths")
builder.add_value("%(TEMP_IN_BAM)s")
builder.add_value("%(OUT_FILE)s")
builder.set_option("--target-name", target_name)
- builder.set_kwargs(OUT_FILE=output_file)
- bam_input.setup(builder)
+ builder.set_kwargs(OUT_FILE=output_file,
+ TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
+ builder.add_multiple_kwargs(input_files)
if regions_file:
+ index_file = swap_ext(MultiBAMInputNode.PIPE_FILE, index_format)
+
builder.set_option('--regions-file', '%(IN_REGIONS)s')
- builder.set_kwargs(IN_REGIONS=regions_file)
+ builder.set_kwargs(IN_REGIONS=regions_file,
+ TEMP_IN_INDEX=index_file)
- command = ParallelCmds(bam_input.commands + [builder.finalize()])
description = "<DepthHistogram: %s -> '%s'>" \
- % (describe_files(bam_input.files), output_file)
+ % (describe_files(input_files), output_file)
MultiBAMInputNode.__init__(self,
- bam_input=bam_input,
- command=command,
+ config=config,
+ input_bams=input_files,
+ index_format=index_format,
+ command=builder.finalize(),
description=description,
dependencies=dependencies)
@@ -159,23 +162,23 @@ class DepthHistogramNode(MultiBAMInputNode):
class FilterCollapsedBAMNode(MultiBAMInputNode):
def __init__(self, config, input_bams, output_bam, keep_dupes=True,
dependencies=()):
- bam_input = MultiBAMInput(config, input_bams, indexed=False)
+ input_bams = safe_coerce_to_tuple(input_bams)
builder = factory.new("rmdup_collapsed")
builder.add_value("%(TEMP_IN_BAM)s")
- builder.set_kwargs(OUT_STDOUT=output_bam)
- bam_input.setup(builder)
+ builder.set_kwargs(OUT_STDOUT=output_bam,
+ TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
+ builder.add_multiple_kwargs(input_bams)
if not keep_dupes:
builder.set_option("--remove-duplicates")
- filteruniq = builder.finalize()
- command = ParallelCmds(bam_input.commands + [filteruniq])
description = "<FilterCollapsedBAM: %s>" \
- % (describe_files(bam_input.files),)
+ % (describe_files(input_bams),)
MultiBAMInputNode.__init__(self,
- bam_input=bam_input,
- command=command,
+ config=config,
+ input_bams=input_bams,
+ command=builder.finalize(),
description=description,
dependencies=dependencies)
diff --git a/paleomix/nodes/mapdamage.py b/paleomix/nodes/mapdamage.py
index 54c82e5..c64aa90 100644
--- a/paleomix/nodes/mapdamage.py
+++ b/paleomix/nodes/mapdamage.py
@@ -27,14 +27,13 @@ import paleomix.common.versions as versions
from paleomix.common.fileutils import \
describe_files
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
from paleomix.node import \
NodeError, \
CommandNode
-from paleomix.atomiccmd.sets import \
- ParallelCmds
from paleomix.nodes.picard import \
- MultiBAMInput, \
MultiBAMInputNode
from paleomix.atomiccmd.builder import \
AtomicCmdBuilder, \
@@ -56,6 +55,8 @@ class MapDamagePlotNode(MultiBAMInputNode):
@create_customizable_cli_parameters
def customize(self, config, reference, input_files, output_directory,
title="mapDamage", dependencies=()):
+ input_files = safe_coerce_to_tuple(input_files)
+
command = AtomicCmdBuilder(
["mapDamage", "--no-stats",
# Prevent references with many contigs from using excessive
@@ -65,6 +66,8 @@ class MapDamagePlotNode(MultiBAMInputNode):
"-i", "%(TEMP_IN_BAM)s",
"-d", "%(TEMP_DIR)s",
"-r", "%(IN_REFERENCE)s"],
+
+ TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
IN_REFERENCE=reference,
OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
@@ -82,6 +85,8 @@ class MapDamagePlotNode(MultiBAMInputNode):
CHECK_RSCRIPT=RSCRIPT_VERSION,
CHECK_MAPDAMAGE=MAPDAMAGE_VERSION)
+ command.add_multiple_kwargs(input_files)
+
return {"command": command,
"config": config,
"input_files": input_files,
@@ -89,18 +94,13 @@ class MapDamagePlotNode(MultiBAMInputNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
- bam_input = MultiBAMInput(parameters.config, parameters.input_files,
- indexed=False)
- bam_input.setup(parameters.command)
- cmd_map = parameters.command.finalize()
-
description = "<mapDamage (plots): %s -> '%s'>" \
% (describe_files(parameters.input_files),
parameters.output_directory)
MultiBAMInputNode.__init__(self,
- bam_input=bam_input,
- command=ParallelCmds(bam_input.commands +
- [cmd_map]),
+ config=parameters.config,
+ input_bams=parameters.input_files,
+ command=parameters.command.finalize(),
description=description,
dependencies=parameters.dependencies)
@@ -194,18 +194,24 @@ class MapDamageRescaleNode(MultiBAMInputNode):
@create_customizable_cli_parameters
def customize(self, config, reference, input_files, output_file, directory,
dependencies=()):
+ input_files = safe_coerce_to_tuple(input_files)
+
stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
command = AtomicCmdBuilder(["mapDamage", "--rescale-only",
"-i", "%(TEMP_IN_BAM)s",
"-d", "%(TEMP_DIR)s",
"-r", "%(IN_REFERENCE)s",
"--rescale-out", "%(OUT_BAM)s"],
+
+ TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
IN_REFERENCE=reference,
TEMP_OUT_LOG="Runtime_log.txt",
TEMP_OUT_CSV=stats_out_fname,
OUT_BAM=output_file,
CHECK_VERSION=MAPDAMAGE_VERSION)
+ command.add_multiple_kwargs(input_files)
+
return {"command": command,
"config": config,
"input_files": input_files,
@@ -215,18 +221,14 @@ class MapDamageRescaleNode(MultiBAMInputNode):
@use_customizable_cli_parameters
def __init__(self, parameters):
self._directory = parameters.directory
- bam_input = MultiBAMInput(parameters.config, parameters.input_files,
- indexed=False)
- bam_input.setup(parameters.command)
- command = parameters.command.finalize()
description = "<mapDamage (rescale): %s -> %r>" \
% (describe_files(parameters.input_files),
parameters.output_file)
MultiBAMInputNode.__init__(self,
- bam_input=bam_input,
- command=ParallelCmds(bam_input.commands +
- [command]),
+ config=parameters.config,
+ input_bams=parameters.input_files,
+ command=parameters.command.finalize(),
description=description,
dependencies=parameters.dependencies)
diff --git a/paleomix/nodes/picard.py b/paleomix/nodes/picard.py
index 8641383..14d354d 100644
--- a/paleomix/nodes/picard.py
+++ b/paleomix/nodes/picard.py
@@ -28,10 +28,11 @@ from paleomix.atomiccmd.builder import \
AtomicJavaCmdBuilder, \
create_customizable_cli_parameters, \
use_customizable_cli_parameters
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
from paleomix.common.fileutils import \
swap_ext, \
try_rmtree, \
- try_remove, \
reroot_path, \
describe_files
from paleomix.common.utilities import \
@@ -58,54 +59,49 @@ class PicardNode(CommandNode):
class ValidateBAMNode(PicardNode):
- @create_customizable_cli_parameters
- def customize(cls, config, input_bam, output_log=None, dependencies=()):
- params = picard_command(config, "ValidateSamFile")
- _set_max_open_files(params, "MAX_OPEN_TEMP_FILES")
+ def __init__(self, config, input_bam, input_index=None, output_log=None,
+ ignored_checks=(), dependencies=()):
+ builder = picard_command(config, "ValidateSamFile")
+ _set_max_open_files(builder, "MAX_OPEN_TEMP_FILES")
- params.set_option("I", "%(IN_BAM)s", sep="=")
+ builder.set_option("I", "%(IN_BAM)s", sep="=")
+ for check in ignored_checks:
+ builder.add_option("IGNORE", check, sep="=")
output_log = output_log or swap_ext(input_bam, ".validated")
- params.set_kwargs(IN_BAM=input_bam,
- OUT_STDOUT=output_log)
-
- return {"command": params,
- "dependencies": dependencies}
+ builder.set_kwargs(IN_BAM=input_bam,
+ IN_INDEX=input_index,
+ OUT_STDOUT=output_log)
- @use_customizable_cli_parameters
- def __init__(self, parameters):
- description = "<Validate BAM: '%s'>" % (parameters.input_bam,)
+ description = "<Validate BAM: '%s'>" % (input_bam,)
PicardNode.__init__(self,
- command=parameters.command.finalize(),
+ command=builder.finalize(),
description=description,
- dependencies=parameters.dependencies)
+ dependencies=dependencies)
class BuildSequenceDictNode(PicardNode):
- @create_customizable_cli_parameters
- def customize(cls, config, reference, dependencies=()):
- params = picard_command(config, "CreateSequenceDictionary")
+ def __init__(self, config, reference, dependencies=()):
+ self._in_reference = os.path.abspath(reference)
- params.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
- params.set_option("O", "%(OUT_DICT)s", sep="=")
- params.set_kwargs(IN_REF=reference,
- TEMP_OUT_REF=os.path.basename(reference),
- OUT_DICT=swap_ext(reference, ".dict"))
+ builder = picard_command(config, "CreateSequenceDictionary")
- return {"command": params,
- "dependencies": dependencies}
+ builder.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
+ builder.set_option("O", "%(OUT_DICT)s", sep="=")
+ builder.set_kwargs(IN_REFERENCE=reference,
+ TEMP_OUT_REF=os.path.basename(reference),
+ OUT_DICT=swap_ext(reference, ".dict"))
- @use_customizable_cli_parameters
- def __init__(self, parameters):
- self._in_reference = os.path.abspath(parameters.reference)
- description = "<SequenceDictionary: '%s'>" % (parameters.reference,)
+ description = "<SequenceDictionary: '%s'>" % (reference,)
PicardNode.__init__(self,
- command=parameters.command.finalize(),
+ command=builder.finalize(),
description=description,
- dependencies=parameters.dependencies)
+ dependencies=dependencies)
def _setup(self, _config, temp):
+ # Ensure that Picard CreateSequenceDict cannot reuse any existing
+ # sequence dictionaries, if the underlying files have changed.
os.symlink(self._in_reference, reroot_path(temp, self._in_reference))
@@ -116,11 +112,11 @@ class MarkDuplicatesNode(PicardNode):
params = picard_command(config, "MarkDuplicates")
_set_max_open_files(params, "MAX_FILE_HANDLES")
- # Create .bai index, since it is required by a lot of other programs
- params.set_option("CREATE_INDEX", "True", sep="=")
-
params.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
params.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
+ # Validation is mostly left to manual ValidateSamFile runs; required
+ # because .csi indexed BAM records can have "invalid" bins.
+ params.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
params.add_multiple_options("I", input_bams, sep="=")
if not keep_dupes:
@@ -130,7 +126,6 @@ class MarkDuplicatesNode(PicardNode):
output_metrics = output_metrics or swap_ext(output_bam, ".metrics")
params.set_kwargs(OUT_BAM=output_bam,
- OUT_BAI=swap_ext(output_bam, ".bai"),
OUT_METRICS=output_metrics)
return {"command": params,
@@ -147,106 +142,80 @@ class MarkDuplicatesNode(PicardNode):
class MergeSamFilesNode(PicardNode):
- @create_customizable_cli_parameters
- def customize(cls, config, input_bams, output_bam, dependencies=()):
- params = picard_command(config, "MergeSamFiles")
-
- params.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
- params.set_option("CREATE_INDEX", "True", sep="=")
- params.set_option("SO", "coordinate", sep="=", fixed=False)
- params.add_multiple_options("I", input_bams, sep="=")
-
- params.set_kwargs(OUT_BAM=output_bam,
- OUT_BAI=swap_ext(output_bam, ".bai"))
-
- return {"command": params,
- "dependencies": dependencies}
-
- @use_customizable_cli_parameters
- def __init__(self, parameters):
+ def __init__(self, config, input_bams, output_bam, dependencies=()):
+ builder = picard_command(config, "MergeSamFiles")
+ builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
+ builder.set_option("SO", "coordinate", sep="=")
+ # Validation is mostly left to manual ValidateSamFile runs; required
+ # because .csi indexed BAM records can have "invalid" bins.
+ builder.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
+ builder.add_multiple_options("I", input_bams, sep="=")
+
+ builder.set_kwargs(OUT_BAM=output_bam)
description = "<Merge BAMs: %i file(s) -> '%s'>" \
- % (len(parameters.input_bams), parameters.output_bam)
+ % (len(input_bams), output_bam)
PicardNode.__init__(self,
- command=parameters.command.finalize(),
+ command=builder.finalize(),
description=description,
- dependencies=parameters.dependencies)
-
-
-class MultiBAMInput(object):
- """Container used to ease processing of 1 or more BAM files; used in
- conjunctin with MultiBAMInputNode.
- """
-
- def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
- self.pipe = pipename
- self.indexed = indexed
- self.files = safe_coerce_to_tuple(input_bams)
-
- self.commands = []
- self.kwargs = {"TEMP_IN_BAM": self.pipe}
- if len(self.files) > 1:
- params = picard_command(config, "MergeSamFiles")
-
- params.set_option("SO", "coordinate", sep="=", fixed=False)
- params.set_option("CREATE_INDEX", "False", sep="=")
- params.set_option("COMPRESSION_LEVEL", 0, sep="=")
- params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
- params.add_multiple_options("I", input_bams, sep="=")
-
- params.set_kwargs(TEMP_OUT_BAM=self.pipe)
-
- self.commands = [params.finalize()]
- else:
- # Ensure that the actual command depends on the input
- self.kwargs["IN_FILE_00"] = self.files[0]
-
- if indexed:
- self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
-
- def setup(self, command):
- command.set_kwargs(**self.kwargs)
+ dependencies=dependencies)
class MultiBAMInputNode(CommandNode):
- """Node which provides concatenation of input BAM files. Takes a
- MultiBAMInput object, and creates a pipe in the temporary folder which
- yields the concatenated BAM resulting from the concatenation of all input
- files. To avoid unnessary overhead, a symbolic link is used in the case
- where there is only a single input file.
-
- Usage example:
- class ExampleNode(MultiBAMInputNode):
- def __init__(self, config, input_bams):
- bam_input = MultiBAMInput(config, input_bams)
- command = AtomicCmd(['analyse_bam', '%(TEMP_IN_BAM)s'],
- TEMP_IN_BAM=bam_input.pipe)
- commands = ParallelCmds(bam_input.commands + [command])
- MultiBAMInputNode.__init__(bam_input=bam_input,
- command=commands)
- """
-
- def __init__(self, bam_input, *args, **kwargs):
- self._bam_input = bam_input
- CommandNode.__init__(self, *args, **kwargs)
-
- def _setup(self, config, temp_root):
- CommandNode._setup(self, config, temp_root)
- dst_fname = os.path.join(temp_root, self._bam_input.pipe)
- if len(self._bam_input.files) > 1:
- os.mkfifo(dst_fname)
+ PIPE_FILE = "input.bam"
+
+ def __init__(self, config, input_bams, command, index_format=None,
+ description=None, threads=1, dependencies=()):
+ self._input_bams = safe_coerce_to_tuple(input_bams)
+ self._index_format = index_format
+
+ if not self._input_bams:
+ raise ValueError("No input BAM files specified!")
+ elif len(self._input_bams) > 1 and index_format:
+ raise ValueError("BAM index cannot be required for > 1 file")
+ elif index_format not in (None, ".bai", ".csi"):
+ raise ValueError("Unknown index format %r" % (index_format,))
+
+ if len(self._input_bams) > 1:
+ merge = picard_command(config, "MergeSamFiles")
+ merge.set_option("SO", "coordinate", sep="=")
+ merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
+ merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
+ # Validation is mostly left to manual ValidateSamFile runs; this
+ # is because .csi indexed BAM records can have "invalid" bins.
+ merge.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
+ merge.add_multiple_options("I", input_bams, sep="=")
+
+ merge.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)
+
+ command = ParallelCmds([merge.finalize(), command])
+
+ CommandNode.__init__(self,
+ command=command,
+ description=description,
+ threads=threads,
+ dependencies=dependencies)
+
+ def _setup(self, config, temp):
+ CommandNode._setup(self, config, temp)
+
+ pipe_fname = os.path.join(temp, self.PIPE_FILE)
+ if len(self._input_bams) > 1:
+ os.mkfifo(pipe_fname)
else:
- src_fname, = self._bam_input.files
- os.symlink(os.path.join(os.getcwd(), src_fname), dst_fname)
+ source_fname = os.path.abspath(self._input_bams[0])
+ os.symlink(source_fname, pipe_fname)
+
+ if self._index_format:
+ os.symlink(swap_ext(source_fname, self._index_format),
+ swap_ext(pipe_fname, self._index_format))
- if self._bam_input.indexed:
- src_fname = os.path.join(os.getcwd(), swap_ext(src_fname, ".bai"))
- os.symlink(src_fname, dst_fname + ".bai")
+ def _teardown(self, config, temp):
+ os.remove(os.path.join(temp, self.PIPE_FILE))
+ if self._index_format:
+ os.remove(os.path.join(temp, swap_ext(self.PIPE_FILE,
+ self._index_format)))
- def _teardown(self, config, temp_root):
- pipe_fname = os.path.join(temp_root, self._bam_input.pipe)
- os.remove(pipe_fname)
- try_remove(pipe_fname + ".bai")
- CommandNode._teardown(self, config, temp_root)
+ CommandNode._teardown(self, config, temp)
###############################################################################
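The rewritten MultiBAMInputNode hinges on a named pipe: with multiple inputs, MergeSamFiles streams the merged BAM into the FIFO while the wrapped command reads it like a regular file, so the merged data never touches disk; with a single input, a symlink suffices. The mechanism in miniature, with shell commands standing in for Picard and the analysis tool:

    import os
    import subprocess
    import tempfile

    temp = tempfile.mkdtemp()
    pipe = os.path.join(temp, "input.txt")
    os.mkfifo(pipe)

    # Writer and reader must run in parallel (cf. ParallelCmds): an
    # open() on a FIFO blocks until both ends are connected
    writer = subprocess.Popen(["sh", "-c", "echo merged-stream > %s" % pipe])
    reader = subprocess.Popen(["cat", pipe])
    assert writer.wait() == 0 and reader.wait() == 0

    os.remove(pipe)
    os.rmdir(temp)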
@@ -271,7 +240,7 @@ def picard_command(config, command):
requirement = versions.Requirement(call=params.finalized_call,
name="Picard tools",
search=r"^(\d+)\.(\d+)",
- checks=versions.GE(1, 124))
+ checks=versions.GE(1, 137))
_PICARD_VERSION_CACHE[jar_path] = requirement
version = _PICARD_VERSION_CACHE[jar_path]
@@ -286,6 +255,8 @@ def picard_command(config, command):
# Fraction of per-process max open files to use
_FRAC_MAX_OPEN_FILES = 0.95
+# Default maximum number of open temporary files used by Picard
+_DEFAULT_MAX_OPEN_FILES = 8000
def _set_max_open_files(params, key):
@@ -294,6 +265,8 @@ def _set_max_open_files(params, key):
ulimit.
"""
max_open_files = paleomix.common.system.get_max_open_files()
- if max_open_files is not None:
+ if max_open_files:
max_open_files = int(max_open_files * _FRAC_MAX_OPEN_FILES)
- params.set_option(key, max_open_files, sep="=")
+
+ if max_open_files < _DEFAULT_MAX_OPEN_FILES:
+ params.set_option(key, max_open_files, sep="=")
diff --git a/paleomix/nodes/samtools.py b/paleomix/nodes/samtools.py
index d9fa409..ca6a4f5 100644
--- a/paleomix/nodes/samtools.py
+++ b/paleomix/nodes/samtools.py
@@ -40,6 +40,10 @@ SAMTOOLS_VERSION = versions.Requirement(call=("samtools",),
search=_VERSION_REGEX,
checks=_COMMON_CHECK)
+SAMTOOLS_VERSION_1x = versions.Requirement(call=("samtools",),
+ search=_VERSION_REGEX,
+ checks=versions.GE(1, 0, 0))
+
SAMTOOLS_VERSION_0119 = versions.Requirement(call=("samtools",),
search=_VERSION_REGEX,
checks=versions.EQ(0, 1, 19))
@@ -133,26 +137,37 @@ class FastaIndexNode(CommandNode):
class BAMIndexNode(CommandNode):
"""Indexed a BAM file using 'samtools index'."""
- def __init__(self, infile, dependencies=()):
+ def __init__(self, infile, index_format='.bai', dependencies=()):
basename = os.path.basename(infile)
+ if index_format == '.bai':
+ samtools_version = SAMTOOLS_VERSION
+ samtools_call = ["samtools", "index", "%(TEMP_IN_BAM)s"]
+ elif index_format == '.csi':
+ samtools_version = SAMTOOLS_VERSION_1x
+ samtools_call = ["samtools", "index", "-c", "%(TEMP_IN_BAM)s"]
+ else:
+ raise ValueError("Unknown format type %r; expected .bai or .csi"
+ % (index_format,))
+
cmd_link = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
IN_BAM=infile,
TEMP_OUT_BAM=basename,
set_cwd=True)
- cmd_index = AtomicCmd(["samtools", "index", "%(TEMP_IN_BAM)s"],
+ cmd_index = AtomicCmd(samtools_call,
TEMP_IN_BAM=basename,
- CHECK_SAM=SAMTOOLS_VERSION)
+ CHECK_SAM=samtools_version)
cmd_rename = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
- TEMP_IN_BAM=basename + ".bai",
- OUT_BAM=swap_ext(infile, ".bai"))
+ TEMP_IN_BAM=basename + index_format,
+ OUT_BAM=swap_ext(infile, index_format))
commands = SequentialCmds((cmd_link, cmd_index, cmd_rename))
CommandNode.__init__(self,
- description="<BAMIndex: '%s'>" % (infile,),
+ description="<BAMIndex (%s): '%s'>"
+ % (index_format[1:].upper(), infile),
command=commands,
dependencies=dependencies)
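Assembling the index call above comes down to a single flag: plain 'samtools index' writes a BAI, while '-c' (samtools 1.x and later) writes a CSI index for references whose sequences are too long for BAI. A sketch of the call construction:

    def samtools_index_call(bam, index_format=".bai"):
        call = ["samtools", "index"]
        if index_format == ".csi":
            call.append("-c")  # CSI indexing; requires samtools >= 1.0
        elif index_format != ".bai":
            raise ValueError("expected .bai or .csi, not %r"
                             % (index_format,))
        call.append(bam)
        return call

    assert samtools_index_call("in.bam", ".csi") \
        == ["samtools", "index", "-c", "in.bam"]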
diff --git a/paleomix/nodes/validation.py b/paleomix/nodes/validation.py
index d0989a5..28a443b 100644
--- a/paleomix/nodes/validation.py
+++ b/paleomix/nodes/validation.py
@@ -63,111 +63,46 @@ class DetectInputDuplicationNode(Node):
dependencies=dependencies)
def run(self, _):
- handles = []
- try:
- last_pos = None
- observed_reads = collections.defaultdict(list)
- for (record, filename) in self._open_samfiles(handles, self.input_files):
- curr_pos = (record.pos, record.tid)
- if curr_pos != last_pos:
- self._process_reads(observed_reads, self.output_files)
- observed_reads.clear()
- last_pos = curr_pos
-
- # Stop once the trailing, unmapped reads are reached
- if record.tid == -1:
- break
-
- observed_reads[record.qname].append((record, filename))
- self._process_reads(observed_reads, self.output_files)
-
- # Everything is ok, touch the output files
- for fpath in self.output_files:
- make_dirs(os.path.dirname(fpath))
- with open(fpath, "w"):
- pass
- finally:
- for handle in handles:
- handle.close()
-
- @classmethod
- def _open_samfiles(cls, handles, filenames):
- sequences = []
- for filename in filenames:
- handle = pysam.Samfile(filename)
- handles.append(handle)
-
- sequences.append(cls._read_samfile(handle, filename))
-
- return chain_sorted(*sequences, key=cls._key_by_tid_pos)
-
- @classmethod
- def _read_samfile(cls, handle, filename):
- for record in handle:
- if record.is_unmapped and (not record.pos or record.mate_is_unmapped):
- # Ignore unmapped reads except when these are sorted
- # according to the mate position (if mapped)
- continue
- elif record.flag & 0x900:
- # Ignore supplementary / secondary alignments
- continue
-
- yield (record, filename)
-
- @classmethod
- def _process_reads(cls, observed_reads, output_files):
- for records_and_filenames in observed_reads.itervalues():
- if len(records_and_filenames) == 1:
- # Most read-names should be obseved at most once at a position
- continue
-
- result = collections.defaultdict(list)
- for record, filename in records_and_filenames:
- key = (record.is_reverse, record.qname, record.seq, record.qual)
- result[key].append(filename)
-
- for (is_reverse, name, seq, qual), filenames in result.iteritems():
- if len(filenames) == 1:
- # Two reads had same name, but different characterstics
- continue
-
- filename_counts = collections.defaultdict(int)
- for filename in filenames:
- filename_counts[filename] += 1
+ check_bam_files(self.input_files, self._throw_node_error)
- if is_reverse:
- seq = reverse_complement(seq)
- qual = qual[::-1]
+ # Everything is ok, touch the output files
+ for fpath in self.output_files:
+ if os.path.dirname(fpath):
+ make_dirs(os.path.dirname(fpath))
- message = ["The same read was found multiple times!",
- " Name: %r" % (name,),
- " Sequence: %r" % (seq,),
- " Qualities: %r" % (qual,),
- ""]
+ with open(fpath, "w"):
+ pass
- message.append("Read was found")
- for filename, count in sorted(filename_counts.iteritems()):
- message.append(" % 2ix in %r" % (count, filename))
+ def _throw_node_error(self, chrom, pos, records, name, seq, qual):
+ message = ["The same read was found multiple times at position %i on %r!"
+ % (pos, chrom),
+ " Name: %r" % (name,),
+ " Sequence: %r" % (seq,),
+ " Qualities: %r" % (qual,),
+ ""]
- message.append("")
- message.append("This indicates that the same data files have "
- "been included multiple times in the project. "
- "Please review the input files used in this "
- "project, to ensure that each set of data is "
- "included only once!\n\n"
+ message.append("Read was found in these BAM files:")
+ for filename, file_records in sorted(records.iteritems()):
+ message.append(" %s in %r" % (_summarize_reads(file_records), filename))
- "If this is not the case, then execute the "
- "following command(s) to mark this test as "
- "having succeeded:")
+ message.append("")
+ message.append("This indicates that the same data has been "
+ "included multiple times in the project. This "
+ "can be because multiple copies of the same "
+ "files were used, or because one or more files "
+ "contain multiple copies of the same reads. "
+ "The command 'paleomix dupcheck' may be used "
+ "to review the potentially duplicated data in "
+ "these BAM files.\n\n"
- for fpath in output_files:
- message.append("$ touch '%s'" % (fpath,))
+ "If this error was a false positive, then you "
+ "may execute the following command(s) to mark "
+ "this test as having succeeded:")
- raise NodeError("\n".join(message))
+ for fpath in self.output_files:
+ message.append("$ touch '%s'" % (fpath,))
- @classmethod
- def _key_by_tid_pos(cls, record):
- return (record[0].tid, record[0].pos)
+ raise NodeError("\n".join(message))
class ValidateFASTQFilesNode(Node):
@@ -183,7 +118,8 @@ class ValidateFASTQFilesNode(Node):
def _run(self, _config, _temp):
check_fastq_files(self.input_files, self._offset, True)
output_file = tuple(self.output_files)[0]
- make_dirs(os.path.dirname(output_file))
+ if os.path.dirname(output_file):
+ make_dirs(os.path.dirname(output_file))
with open(output_file, "w"):
pass
@@ -203,11 +139,38 @@ class ValidateFASTAFilesNode(Node):
for filename in self.input_files:
check_fasta_file(filename)
output_file, = self.output_files
- make_dirs(os.path.dirname(output_file))
+ if os.path.dirname(output_file):
+ make_dirs(os.path.dirname(output_file))
with open(output_file, "w"):
pass
+def check_bam_files(input_files, err_func):
+ handles = []
+ try:
+ last_pos = None
+ observed_reads = collections.defaultdict(list)
+ reads_iter = _open_samfiles(handles, input_files)
+ references = handles[0].references
+
+ for (record, filename) in reads_iter:
+ curr_pos = (record.pos, record.tid)
+ if curr_pos != last_pos:
+ _process_bam_reads(observed_reads, references, last_pos, err_func)
+ observed_reads.clear()
+ last_pos = curr_pos
+
+ # Stop once the trailing, unmapped reads are reached
+ if record.tid == -1:
+ break
+
+ observed_reads[record.qname].append((record, filename))
+ _process_bam_reads(observed_reads, references, last_pos, err_func)
+ finally:
+ for handle in handles:
+ handle.close()
+
+
def check_fastq_files(filenames, required_offset, allow_empty=False):
for filename in filenames:
qualities = _read_sequences(filename)
@@ -249,12 +212,15 @@ def _read_sequences(filename):
qualities = _collect_qualities(cat.stdout, filename)
return sampling.reservoir_sampling(qualities, 100000)
- except:
+ except StandardError as error:
if cat:
- cat.kill()
+ try:
+ cat.kill()
+ except OSError:
+ pass
cat.wait()
cat = None
- raise
+ raise error
finally:
rc_cat = cat.wait() if cat else 0
if rc_cat:
@@ -264,6 +230,88 @@ def _read_sequences(filename):
raise NodeError(message)
+def _open_samfiles(handles, filenames):
+ sequences = []
+ for filename in filenames:
+ handle = pysam.Samfile(filename)
+ handles.append(handle)
+
+ sequences.append(_read_samfile(handle, filename))
+
+ return chain_sorted(*sequences, key=_key_by_tid_pos)
+
+
+def _read_samfile(handle, filename):
+ for record in handle:
+ if record.is_unmapped and (not record.pos or record.mate_is_unmapped):
+ # Ignore unmapped reads except when these are sorted
+ # according to the mate position (if mapped)
+ continue
+ elif record.flag & 0x900:
+ # Ignore supplementary / secondary alignments
+ continue
+
+ yield (record, filename)
+
+
+def _process_bam_reads(observed_reads, references, position, err_func):
+ for records_and_filenames in observed_reads.itervalues():
+ if len(records_and_filenames) == 1:
+ # Most read-names should be observed at most once at a position
+ continue
+
+ result = collections.defaultdict(list)
+ for record, filename in records_and_filenames:
+ key = (record.is_reverse, record.qname, record.seq, record.qual)
+ result[key].append((filename, record))
+
+ for (is_reverse, name, seq, qual), filenames in result.iteritems():
+ if len(filenames) == 1:
+ # Two reads had the same name, but different characteristics
+ continue
+
+ records = collections.defaultdict(list)
+ for filename, record in filenames:
+ records[filename].append(record)
+
+ if is_reverse:
+ seq = reverse_complement(seq)
+ qual = qual[::-1]
+
+ chrom = references[position[1]]
+ pos = position[0]
+
+ err_func(chrom, pos, records, name, seq, qual)
+
+
+def _summarize_reads(records):
+ counts = {'mate 1': 0, 'mate 2': 0, 'unpaired': 0}
+
+ for record in records:
+ if record.is_paired:
+ if record.is_read1:
+ counts['mate 1'] += 1
+ elif record.is_read2:
+ counts['mate 2'] += 1
+ else:
+ counts['unpaired'] += 1
+ else:
+ counts['unpaired'] += 1
+
+ result = []
+ for key, value in sorted(counts.items()):
+ if value > 1:
+ result.append('%i %s reads' % (value, key))
+ elif value:
+ result.append('%i %s read' % (value, key))
+
+ return ", ".join(result) or "No reads"
+
+
+def _key_by_tid_pos(record):
+ return (record[0].tid, record[0].pos)
+
+
def _collect_qualities(handle, filename):
header = handle.readline()
while header:
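The duplicate test in check_bam_files, distilled: streams already sorted by (tid, pos) are merged, reads sharing a position are grouped, and a group is only an error when name, sequence, and qualities all agree, i.e. the same underlying read was supplied twice. A sketch with plain tuples of (tid, pos, name, seq, qual, filename) standing in for pysam records:

    import heapq
    from collections import defaultdict

    def _flush(observed):
        for key, files in sorted(observed.items()):
            if len(files) > 1:
                yield key, sorted(files)

    def find_duplicates(*streams):
        # heapq.merge plays the role of chain_sorted above
        last_pos, observed = None, defaultdict(list)
        for read in heapq.merge(*streams):
            if read[:2] != last_pos:
                for duplicate in _flush(observed):
                    yield duplicate
                observed.clear()
                last_pos = read[:2]
            observed[read[:5]].append(read[5])
        for duplicate in _flush(observed):
            yield duplicate

    a = [(0, 10, "r1", "ACGT", "IIII", "a.bam")]
    b = [(0, 10, "r1", "ACGT", "IIII", "b.bam"),
         (0, 99, "r2", "TT", "II", "b.bam")]
    # -> [((0, 10, 'r1', 'ACGT', 'IIII'), ['a.bam', 'b.bam'])]
    print(list(find_duplicates(a, b)))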
diff --git a/paleomix/pipeline.py b/paleomix/pipeline.py
index 373b8d5..8e16194 100644
--- a/paleomix/pipeline.py
+++ b/paleomix/pipeline.py
@@ -79,7 +79,7 @@ class Pypeline(object):
return False
for node in nodegraph.iterflat():
- if (node.threads > max_threads):
+ if node.threads > max_threads:
message = "Node(s) use more threads than the max allowed; " \
"the pipeline may therefore use more than the " \
"expected number of threads.\n"
@@ -87,7 +87,7 @@ class Pypeline(object):
break
if dry_run:
- progress_printer = paleomix.ui.QuietUI()
+ progress_printer = paleomix.ui.RunningUI()
nodegraph.add_state_observer(progress_printer)
progress_printer.flush()
progress_printer.finalize()
diff --git a/paleomix/tools/bam_pipeline/makefile.py b/paleomix/tools/bam_pipeline/makefile.py
index d05f308..ccf2289 100644
--- a/paleomix/tools/bam_pipeline/makefile.py
+++ b/paleomix/tools/bam_pipeline/makefile.py
@@ -68,6 +68,10 @@ _READ_TYPES = set(("Single", "Singleton",
"Collapsed", "CollapsedTruncated",
"Paired"))
+# The maximum reference sequence length supported by the BAI index format:
+# https://samtools.github.io/hts-specs/SAMv1.pdf
+_BAM_MAX_SEQUENCE_LENGTH = 2 ** 29 - 1
+
def read_makefiles(config, filenames, pipeline_variant="bam"):
if pipeline_variant not in ("bam", "trim"):
@@ -365,6 +369,10 @@ def _mangle_options(makefile):
options = fill_dict(destination=data.pop("Options"),
source=options)
+ # Force feature if 'RescaleQualities' is set, see _mangle_features
+ if options.pop('RescaleQualities', None):
+ options['Features']['mapDamage'] = 'rescale'
+
if len(path) < 2:
for key in data:
if key != "Options":
@@ -398,43 +406,34 @@ def _mangle_features(makefile):
def _mangle_prefixes(makefile):
- prefixes = {}
+ records = []
for (name, values) in makefile.get("Prefixes", {}).iteritems():
- filename = values["Path"]
if "*" in name[:-1]:
raise MakefileError("The character '*' is not allowed in Prefix "
- "names; if you use to select .fasta files "
- "using a search-string, then use the prefix "
- "name '%s*' instead and specify the wildcards "
- "in the 'Path' instead."
+ "names; if you wish to select multiple .fasta "
+ "files using a search-string, then use the "
+ "prefix name '%s*' instead and specify the "
+ "wildcards in the 'Path'."
% (name.replace("*", "",)))
elif name.endswith("*"):
- records = []
- for fname in glob.glob(filename):
- name = os.path.basename(fname).split(".")[0]
- _VALID_PREFIX_NAME(("Prefixes", name), name)
- new_prefix = copy.copy(values)
- new_prefix["Path"] = fname
-
- records.append((name, new_prefix))
- if not records:
- raise MakefileError("Did not find any matches for glob %s"
- % repr(filename))
+ records.extend(_glob_prefixes(values, values['Path']))
+
else:
- records = [(name, values)]
+ records.append((name, values))
- for (name, record) in records:
- if name in prefixes:
- raise MakefileError("Multiple prefixes with the same name: %s"
- % name)
+ prefixes = {}
+ for (name, record) in records:
+ if name in prefixes:
+ raise MakefileError("Multiple prefixes with the same name: %s"
+ % name)
- if not record["Path"].endswith(".fasta"):
- raise MakefileError("Path for prefix %r does not end with "
- ".fasta:\n %r" % (name, record["Path"]))
+ if not record["Path"].endswith(".fasta"):
+ raise MakefileError("Path for prefix %r does not end with "
+ ".fasta:\n %r" % (name, record["Path"]))
- record["Name"] = name
- record["Reference"] = record["Path"]
- prefixes[name] = record
+ record["Name"] = name
+ record["Reference"] = record["Path"]
+ prefixes[name] = record
if not prefixes:
raise MakefileError("At least one prefix must be specified")
@@ -442,6 +441,21 @@ def _mangle_prefixes(makefile):
makefile["Prefixes"] = prefixes
+def _glob_prefixes(template, pattern):
+ filename = None
+ for filename in glob.iglob(pattern):
+ name = os.path.basename(filename).split(".")[0]
+ _VALID_PREFIX_NAME(("Prefixes", name), name)
+ new_prefix = copy.copy(template)
+ new_prefix["Path"] = filename
+
+ yield (name, new_prefix)
+
+ if filename is None:
+ raise MakefileError("Did not find any matches for search string %r"
+ % (pattern,))
+
+
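What the 'Name*' expansion above does, in miniature: glob the 'Path' pattern and derive one prefix per matching file, named after the first dot-separated component of its basename. A sketch with a hypothetical refs/ layout; the real code additionally validates names and rejects duplicates:

    import glob
    import os

    def expand_prefixes(pattern):
        matches = glob.glob(pattern)
        if not matches:
            raise ValueError("no matches for search string %r" % (pattern,))

        # refs/equus.fasta -> prefix name 'equus'
        return dict((os.path.basename(path).split(".")[0], path)
                    for path in matches)

    # expand_prefixes("refs/*.fasta")
    # -> {'equus': 'refs/equus.fasta', 'sus': 'refs/sus.fasta'}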
def _mangle_lanes(makefile):
formatter = string.Formatter()
prefixes = makefile["Prefixes"]
@@ -564,24 +578,26 @@ def _split_lanes_by_filenames(makefile):
record["Data"] = files = paths.collect_files(path, template)
split = record["Options"]["SplitLanesByFilenames"]
- if (split is True) or (isinstance(split, list) and (barcode in split)):
+ if (split is True) or (isinstance(split, list) and
+ (barcode in split)):
if any(len(v) > 1 for v in files.itervalues()):
- template = makefile["Targets"][target][sample][library].pop(barcode)
+ library = makefile["Targets"][target][sample][library]
+ template = library.pop(barcode)
keys = ("SE",) if ("SE" in files) else ("PE_1", "PE_2")
input_files = [files[key] for key in keys]
- assert len(input_files[0]) == len(input_files[-1]), input_files
-
input_files_iter = itertools.izip_longest(*input_files)
- for (index, filenames) in enumerate(input_files_iter, start=1):
+ for (index, filenames) in enumerate(input_files_iter,
+ start=1):
assert len(filenames) == len(keys)
new_barcode = "%s_%03i" % (barcode, index)
current = copy.deepcopy(template)
- current["Data"] = dict((key, [filename]) for (key, filename) in zip(keys, filenames))
+ current["Data"] = {k: [v]
+ for (k, v) in zip(keys, filenames)}
current["Tags"]["PU_cur"] = new_barcode
- makefile["Targets"][target][sample][library][new_barcode] = current
+ library[new_barcode] = current
def _validate_makefiles(config, makefiles):
@@ -739,17 +755,21 @@ def _validate_prefixes(makefiles):
genome is ordered 1 .. 23. This is required since GATK will not run with
human genomes in a different order.
"""
- already_validated = set()
+ already_validated = {}
print_info(" - Validating prefixes ...")
for makefile in makefiles:
uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
for prefix in makefile["Prefixes"].itervalues():
path = prefix["Path"]
if path in already_validated:
+ prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
continue
+ # Must be set to a valid value, even if FASTA file does not exist
+ prefix["IndexFormat"] = ".bai"
+
if not os.path.exists(path):
- print_info(" - Reference FASTA file does not exist:\n"
+ print_warn(" - Reference FASTA file does not exist:\n"
" %r" % (path,))
continue
elif not os.path.exists(path + ".fai"):
@@ -776,7 +796,14 @@ def _validate_prefixes(makefiles):
"interest %r for prefix %r:\n%s"
% (name, prefix["Name"], error))
- already_validated.add(path)
+ if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
+ print_warn(" - FASTA file %r contains sequences longer "
+ "than %i! CSI index files will be used instead "
+ "of BAI index files."
+ % (path, _BAM_MAX_SEQUENCE_LENGTH))
+ prefix["IndexFormat"] = ".csi"
+
+ already_validated[path] = prefix
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
@@ -827,7 +854,7 @@ def _is_invalid_hg_prefix(contigs):
# Contig not found; probably not hg18, hg19, or hg38
return False
- return not (hg_contigs["chr1"] < hg_contigs["chr2"] < hg_contigs["chr10"])
+ return not hg_contigs["chr1"] < hg_contigs["chr2"] < hg_contigs["chr10"]
def _iterate_over_records(makefile):
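The BAI ceiling in numbers: the BAI format cannot index positions beyond 2 ** 29 - 1, roughly 537 Mbp, which human chromosomes stay well under (GRCh38 chr1 is ~249 Mbp) but which single contigs in some assemblies exceed. Restating the fallback rule applied during prefix validation:

    BAM_MAX_SEQUENCE_LENGTH = 2 ** 29 - 1
    assert BAM_MAX_SEQUENCE_LENGTH == 536870911  # ~537 Mbp

    def index_format_for(contig_lengths):
        # A single oversized contig forces CSI for the whole reference
        if max(contig_lengths) > BAM_MAX_SEQUENCE_LENGTH:
            return ".csi"
        return ".bai"

    print(index_format_for([248956422]))  # GRCh38 chr1 -> '.bai'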
diff --git a/paleomix/tools/bam_pipeline/mkfile.py b/paleomix/tools/bam_pipeline/mkfile.py
index dc81423..6585ca7 100755
--- a/paleomix/tools/bam_pipeline/mkfile.py
+++ b/paleomix/tools/bam_pipeline/mkfile.py
@@ -87,9 +87,9 @@ _TEMPLATE_BAM_OPTIONS = \
MinQuality: 0
# Filter reads that did not map to the reference sequence
FilterUnmappedReads: yes
- # May be disabled ("no") for aDNA alignments, as post-mortem damage
- # localizes to the seed region, which BWA expects to have few
- # errors (sets "-l"). See http://pmid.us/22574660
+ # May be disabled ("no") for aDNA alignments with the 'aln' algorithm.
+ # Post-mortem damage localizes to the seed region, which BWA expects to
+ # have few errors (sets "-l"). See http://pmid.us/22574660
UseSeed: yes
# Additional command-line options may be specified for the "aln"
# call(s), as described for Bowtie2 below.
diff --git a/paleomix/tools/bam_pipeline/nodes.py b/paleomix/tools/bam_pipeline/nodes.py
index 5ee01c7..f4f1acf 100644
--- a/paleomix/tools/bam_pipeline/nodes.py
+++ b/paleomix/tools/bam_pipeline/nodes.py
@@ -24,8 +24,6 @@ import os
import paleomix.nodes.picard as picard
-from paleomix.common.fileutils import \
- swap_ext
from paleomix.atomiccmd.command import \
AtomicCmd
from paleomix.atomiccmd.builder import \
@@ -42,49 +40,44 @@ from paleomix.nodes.samtools import \
def index_and_validate_bam(config, prefix, node, log_file=None,
create_index=True):
- input_file, has_index = _get_input_file(node)
- if not has_index and create_index:
+ input_file, index_file = _get_input_files(node, prefix['IndexFormat'])
+ if not index_file and create_index:
node = BAMIndexNode(infile=input_file,
+ index_format=prefix['IndexFormat'],
dependencies=node)
-
- validation_params = ValidateBAMNode.customize(config=config,
- input_bam=input_file,
- output_log=log_file,
- dependencies=node)
-
- # Ensure that the validation node is re-run if the index changes
- if has_index or create_index:
- bai_filename = swap_ext(input_file, ".bai")
- validation_params.command.set_kwargs(IN_BAI=bai_filename)
-
- # Check MD tags against reference sequence
- # FIXME: Disabled due to issues with Picard/Samtools disagreeing,
- # backwards compatibility. See the discussion at
- # http://sourceforge.net/mailarchive/message.php?msg_id=31348639
- # validation_params.command.set_kwargs(IN_REF=prefix["Reference"])
- # validation_params.command.add_option("R", "%(IN_REF)s", sep="=")
-
- # Ignored since we may filter out misses and low-quality hits during
- # mapping, which leads to a large proportion of missing PE mates.
- validation_params.command.add_option("IGNORE", "MATE_NOT_FOUND",
- sep="=")
- # Ignored due to high rate of false positives for lanes with few hits,
- # where high-quality reads may cause mis-identification of qualities
- validation_params.command.add_option("IGNORE",
- "INVALID_QUALITY_FORMAT", sep="=")
-
- return validation_params.build_node()
-
-
-def _get_input_file(node):
- input_filename, has_index = None, False
+ index_file, = node.output_files
+
+ ignored_checks = [
+ # Ignored since we may filter out misses and low-quality hits during
+ # mapping, which leads to a large proportion of missing PE mates.
+ "MATE_NOT_FOUND",
+ # Ignored due to high rate of false positives for lanes with few hits,
+ # where high-quality reads may cause mis-identification of qualities
+ "INVALID_QUALITY_FORMAT"]
+
+ if prefix['IndexFormat'] == '.csi':
+ # CSI uses a different method for assigning BINs to records, which
+ # Picard currently does not support.
+ ignored_checks.append("INVALID_INDEXING_BIN")
+
+ return ValidateBAMNode(config=config,
+ input_bam=input_file,
+ input_index=index_file,
+ ignored_checks=ignored_checks,
+ output_log=log_file,
+ dependencies=node)
+
+
+def _get_input_files(node, index_format):
+ index_filename = None
+ input_filename = None
for filename in node.output_files:
- if filename.lower().endswith(".bai"):
- has_index = True
+ if filename.lower().endswith(index_format):
+ index_filename = filename
elif filename.lower().endswith(".bam"):
input_filename = filename
- return input_filename, has_index
+ return input_filename, index_filename
class CleanupBAMNode(PicardNode):
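The rough shape of the ValidateSamFile invocation assembled above, with checks disabled via repeated IGNORE= options and a third check added for CSI-indexed inputs; the picard.jar path is hypothetical:

    def validate_sam_call(bam, index_format=".bai"):
        ignored = ["MATE_NOT_FOUND", "INVALID_QUALITY_FORMAT"]
        if index_format == ".csi":
            # CSI uses a different BIN assignment scheme, which Picard
            # does not support (see index_and_validate_bam above)
            ignored.append("INVALID_INDEXING_BIN")

        call = ["java", "-jar", "picard.jar", "ValidateSamFile",
                "I=" + bam]
        call.extend("IGNORE=" + check for check in ignored)
        return call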
@@ -121,7 +114,7 @@ class CleanupBAMNode(PicardNode):
TEMP_OUT_BAM="bam.pipe")
calmd = AtomicCmdBuilder(["samtools", "calmd", "-b",
- "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
+ "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
IN_REF=reference,
TEMP_IN_BAM="bam.pipe",
OUT_STDOUT=output_bam)
diff --git a/paleomix/tools/bam_pipeline/parts/lane.py b/paleomix/tools/bam_pipeline/parts/lane.py
index 34ce62d..34ee8f5 100644
--- a/paleomix/tools/bam_pipeline/parts/lane.py
+++ b/paleomix/tools/bam_pipeline/parts/lane.py
@@ -263,6 +263,9 @@ class Lane:
self._set_pe_input_files(parameters)
node = BWAAlgorithmNode.customize(**parameters)
+ apply_options(node.commands["aln"],
+ self.options["Aligners"]["BWA"])
+
return self._finalize_nodes(config, prefix, parameters, node)
def _build_bowtie2(self, config, prefix, record, parameters):
@@ -276,9 +279,7 @@ class Lane:
else:
command.set_option("--phred64")
- for (key, value) in self.options["Aligners"]["Bowtie2"].iteritems():
- if key.startswith("-"):
- command.set_option(key, value)
+ apply_options(command, self.options["Aligners"]["Bowtie2"])
return self._finalize_nodes(config, prefix, parameters, node)
@@ -293,7 +294,7 @@ class Lane:
index_required = self._is_indexing_required(prefix)
validated_node = index_and_validate_bam(config=config,
- prefix=parameters['prefix'],
+ prefix=prefix,
node=node.build_node(),
create_index=index_required)
diff --git a/paleomix/tools/bam_pipeline/parts/library.py b/paleomix/tools/bam_pipeline/parts/library.py
index 55d36fd..7d0135b 100644
--- a/paleomix/tools/bam_pipeline/parts/library.py
+++ b/paleomix/tools/bam_pipeline/parts/library.py
@@ -67,7 +67,8 @@ class Library:
self.options = lanes[0].options
self.folder = os.path.dirname(self.lanes[0].folder)
- assert all((self.folder == os.path.dirname(lane.folder)) for lane in self.lanes)
+ assert all((self.folder == os.path.dirname(lane.folder))
+ for lane in self.lanes)
assert all((self.options == lane.options) for lane in self.lanes)
lane_bams = self._collect_bams_by_type(self.lanes)
@@ -75,9 +76,10 @@ class Library:
pcr_duplicates = self.options["PCRDuplicates"]
if pcr_duplicates:
# pcr_duplicates may be "mark" or any trueish value
- lane_bams = self._remove_pcr_duplicates(config, prefix, lane_bams, pcr_duplicates)
+ lane_bams = self._remove_pcr_duplicates(
+ config, prefix, lane_bams, pcr_duplicates)
- # At this point we no longer need to differentiate between types of reads
+ # At this point we no longer need to differentiate between read types
files_and_nodes = self._collect_files_and_nodes(lane_bams)
# Collect output bams, possible following rescaling
@@ -87,7 +89,10 @@ class Library:
nodes = [self._build_dataduplication_node(lane_bams)]
nodes.extend(mapdamage_nodes)
- histogram_node = self._build_duphist_nodes(config, target, prefix, lane_bams)
+ histogram_node = self._build_duphist_nodes(config=config,
+ target=target,
+ prefix=prefix,
+ files_and_nodes=lane_bams)
if histogram_node:
nodes.append(histogram_node)
@@ -111,8 +116,8 @@ class Library:
return files_and_nodes
def _remove_pcr_duplicates(self, config, prefix, bams, strategy):
- rmdup_cls = {"collapsed" : FilterCollapsedBAMNode,
- "normal" : MarkDuplicatesNode}
+ rmdup_cls = {"collapsed": FilterCollapsedBAMNode,
+ "normal": MarkDuplicatesNode}
keep_duplicates = False
if isinstance(strategy, types.StringTypes) and (strategy.lower() == "mark"):
@@ -128,15 +133,17 @@ class Library:
results = {}
for (key, files_and_nodes) in bams.items():
output_filename = self.folder + ".rmdup.%s.bam" % key
- node = rmdup_cls[key](config = config,
- input_bams = files_and_nodes.keys(),
- output_bam = output_filename,
- keep_dupes = keep_duplicates,
- dependencies = files_and_nodes.values())
- validated_node = index_and_validate_bam(config, prefix, node,
+ node = rmdup_cls[key](config=config,
+ input_bams=files_and_nodes.keys(),
+ output_bam=output_filename,
+ keep_dupes=keep_duplicates,
+ dependencies=files_and_nodes.values())
+ validated_node = index_and_validate_bam(config=config,
+ prefix=prefix,
+ node=node,
create_index=index_required)
- results[key] = {output_filename : validated_node}
+ results[key] = {output_filename: validated_node}
return results
def _build_mapdamage_nodes(self, config, target, prefix, files_and_nodes):
@@ -222,7 +229,9 @@ class Library:
# Grab indexing and validation nodes, required by ROIs and GATK
index_required = bool(prefix.get("RegionsOfInterest")) \
or self.options["Features"]["RealignedBAM"]
- validate = index_and_validate_bam(config, prefix, scale,
+ validate = index_and_validate_bam(config=config,
+ prefix=prefix,
+ node=scale,
create_index=index_required)
return {output_filename: validate}, (model,)
@@ -249,7 +258,8 @@ class Library:
def _build_dataduplication_node(self, bams):
files_and_nodes = self._collect_files_and_nodes(bams)
+ output_file = self.folder + ".duplications_checked"
return DetectInputDuplicationNode(input_files=files_and_nodes.keys(),
- output_file=self.folder + ".duplications_checked",
+ output_file=output_file,
dependencies=files_and_nodes.values())
diff --git a/paleomix/tools/bam_pipeline/parts/prefix.py b/paleomix/tools/bam_pipeline/parts/prefix.py
index fe818db..565bd7e 100644
--- a/paleomix/tools/bam_pipeline/parts/prefix.py
+++ b/paleomix/tools/bam_pipeline/parts/prefix.py
@@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -29,31 +29,52 @@ from paleomix.tools.bam_pipeline.nodes import \
from paleomix.nodes.validation import \
DetectInputDuplicationNode
+import paleomix.ui as ui
import paleomix.nodes.gatk as gatk
-class Prefix:
+class Prefix(object):
def __init__(self, config, prefix, samples, features, target):
- self.name = prefix["Name"]
- self.label = prefix.get("Label") or self.name
- self.reference = prefix["Reference"]
- self.roi = prefix.get("RegionsOfInterest", {})
+ self.name = prefix["Name"]
+ self.label = prefix.get("Label") or self.name
+ self.roi = prefix.get("RegionsOfInterest", {})
self.samples = safe_coerce_to_tuple(samples)
- self.folder = config.destination
- self.target = target
+ self.folder = config.destination
+ self.target = target
files_and_nodes = {}
for sample in self.samples:
files_and_nodes.update(sample.bams.iteritems())
- self.datadup_check = self._build_dataduplication_node(prefix, files_and_nodes)
+ self.datadup_check = self._build_dataduplication_node(
+ prefix, files_and_nodes)
+
+ build_raw_bam = features["RawBAM"]
+ build_realigned_bam = features["RealignedBAM"]
+ if build_realigned_bam and prefix['IndexFormat'] == '.csi':
+ if prefix['Path'] not in _CSI_WARNINGS:
+ ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
+ "genome %r, but the file contains sequences too "
+ "large for GATK, which does not support .csi "
+ "index files. Raw BAMs will be built instead of "
+ "realigned BAMs, for this reference sequence."
+ % (prefix['Path']))
+
+ # TODO: Add reference to FAQ when written.
+
+ _CSI_WARNINGS.add(prefix['Path'])
+ build_realigned_bam = False
+ build_raw_bam = True
self.bams = {}
- if features["RawBAM"]:
- self.bams.update(self._build_raw_bam(config, prefix, files_and_nodes))
- if features["RealignedBAM"]:
- self.bams.update(self._build_realigned_bam(config, prefix, files_and_nodes))
+ if build_raw_bam:
+ self.bams.update(self._build_raw_bam(
+ config, prefix, files_and_nodes))
+
+ if build_realigned_bam:
+ self.bams.update(self._build_realigned_bam(
+ config, prefix, files_and_nodes))
if not self.bams:
for sample in self.samples:
@@ -65,21 +86,29 @@ class Prefix:
self.nodes = tuple(nodes)
def _build_raw_bam(self, config, prefix, files_and_bams):
- output_filename = os.path.join(self.folder, "%s.%s.bam" % (self.target, prefix["Name"]))
- validated_filename = os.path.join(self.folder, self.target, prefix["Name"] + ".validated")
-
- node = MergeSamFilesNode(config = config,
- input_bams = files_and_bams.keys(),
- output_bam = output_filename,
- dependencies = self.datadup_check)
- validated_node = index_and_validate_bam(config, prefix, node, validated_filename)
+ output_filename = os.path.join(
+ self.folder, "%s.%s.bam" % (self.target, prefix["Name"]))
+ validated_filename = os.path.join(
+ self.folder, self.target, prefix["Name"] + ".validated")
+
+ node = MergeSamFilesNode(config=config,
+ input_bams=files_and_bams.keys(),
+ output_bam=output_filename,
+ dependencies=self.datadup_check)
+ validated_node = index_and_validate_bam(config=config,
+ prefix=prefix,
+ node=node,
+ log_file=validated_filename)
- return {output_filename : validated_node}
+ return {output_filename: validated_node}
def _build_realigned_bam(self, config, prefix, bams):
- output_filename = os.path.join(self.folder, "%s.%s.realigned.bam" % (self.target, prefix["Name"]))
- intervals_filename = os.path.join(self.folder, self.target, prefix["Name"] + ".intervals")
- validated_filename = os.path.join(self.folder, self.target, prefix["Name"] + ".realigned.validated")
+ output_filename = os.path.join(
+ self.folder, "%s.%s.realigned.bam" % (self.target, prefix["Name"]))
+ intervals_filename = os.path.join(
+ self.folder, self.target, prefix["Name"] + ".intervals")
+ validated_filename = os.path.join(
+ self.folder, self.target, prefix["Name"] + ".realigned.validated")
trainer = gatk.GATKIndelTrainerNode(config=config,
reference=prefix["Reference"],
@@ -95,12 +124,23 @@ class Prefix:
outfile=output_filename,
dependencies=trainer)
- validated_node = index_and_validate_bam(config, prefix, aligner, validated_filename)
+ validated_node = index_and_validate_bam(config=config,
+ prefix=prefix,
+ node=aligner,
+ log_file=validated_filename)
return {output_filename: validated_node}
def _build_dataduplication_node(self, prefix, files_and_nodes):
- destination = os.path.join(self.folder, self.target, prefix["Name"] + ".duplications_checked")
- return DetectInputDuplicationNode(input_files = files_and_nodes.keys(),
- output_file = destination,
- dependencies = files_and_nodes.values())
+ filename = prefix["Name"] + ".duplications_checked"
+ destination = os.path.join(self.folder, self.target, filename)
+ dependencies = files_and_nodes.values()
+
+ return DetectInputDuplicationNode(input_files=files_and_nodes.keys(),
+ output_file=destination,
+ dependencies=dependencies)
+
+
+# Contains the paths of references for which a GATK warning has been given,
+# if the 'RealignedBAM' feature was enabled for files that require a CSI index.
+_CSI_WARNINGS = set()
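
The CSI fallback above exists because the BAI index format can only address
positions on contigs up to 2^29 - 1 bp (roughly 537 Mbp); longer contigs
require a CSI index, which GATK did not support at this time. A minimal
sketch of detecting such references, assuming an indexed FASTA readable by
pysam (requires_csi_index is a hypothetical helper, not part of paleomix):

    import pysam

    _BAI_LIMIT = 2 ** 29 - 1  # largest position representable in a BAI index

    def requires_csi_index(fasta_path):
        # Returns True if any contig is too long for a BAI index.
        handle = pysam.FastaFile(fasta_path)
        try:
            return any(length > _BAI_LIMIT for length in handle.lengths)
        finally:
            handle.close()
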
diff --git a/paleomix/tools/bam_pipeline/parts/statistics.py b/paleomix/tools/bam_pipeline/parts/statistics.py
index 140de45..8ae12ff 100644
--- a/paleomix/tools/bam_pipeline/parts/statistics.py
+++ b/paleomix/tools/bam_pipeline/parts/statistics.py
@@ -39,7 +39,7 @@ def add_statistics_nodes(config, makefile, target):
nodes = []
if features["Depths"]:
- nodes.extend(_build_depth(config, target))
+ nodes.extend(_build_depth(config, target, makefile["Prefixes"]))
if features["Summary"] or features["Coverage"]:
make_summary = features["Summary"]
@@ -55,7 +55,7 @@ def add_statistics_nodes(config, makefile, target):
def _build_summary_node(config, makefile, target, coverage):
- coverage_by_label = _build_coverage_nodes(config, target, use_label=True)
+ coverage_by_label = _build_coverage_nodes(target, use_label=True)
return SummaryTableNode(config=config,
makefile=makefile,
@@ -65,7 +65,7 @@ def _build_summary_node(config, makefile, target, coverage):
dependencies=coverage["Nodes"])
-def _build_depth(config, target):
+def _build_depth(config, target, prefixes):
nodes = []
for prefix in target.prefixes:
for (roi_name, roi_filename) in _get_roi(prefix, name_prefix="."):
@@ -89,6 +89,7 @@ def _build_depth(config, target):
node = DepthHistogramNode(config=config,
target_name=target.name,
input_files=input_files,
+ prefix=prefixes[prefix.name],
regions_file=roi_filename,
output_file=output_fpath,
dependencies=dependencies)
@@ -108,7 +109,7 @@ def _aggregate_for_prefix(cov, prefix, roi_name=None, into=None):
def _build_coverage(config, target, make_summary):
merged_nodes = []
- coverage = _build_coverage_nodes(config, target)
+ coverage = _build_coverage_nodes(target)
for prefix in target.prefixes:
for (roi_name, _) in _get_roi(prefix):
label = _get_prefix_label(prefix.name, roi_name)
@@ -142,7 +143,7 @@ def _build_coverage(config, target, make_summary):
return coverage
-def _build_coverage_nodes(config, target, use_label=False):
+def _build_coverage_nodes(target, use_label=False):
coverage = {"Lanes": collections.defaultdict(dict),
"Libraries": collections.defaultdict(dict)}
@@ -159,7 +160,7 @@ def _build_coverage_nodes(config, target, use_label=False):
for lane in library.lanes:
for bams in lane.bams.values():
- bams = _build_coverage_nodes_cached(config, bams,
+ bams = _build_coverage_nodes_cached(bams,
target.name,
roi_name,
roi_filename,
@@ -167,14 +168,14 @@ def _build_coverage_nodes(config, target, use_label=False):
coverage["Lanes"][key].update(bams)
- bams = _build_coverage_nodes_cached(config, library.bams,
+ bams = _build_coverage_nodes_cached(library.bams,
target.name, roi_name,
roi_filename, cache)
coverage["Libraries"][key].update(bams)
return coverage
-def _build_coverage_nodes_cached(config, files_and_nodes, target_name,
+def _build_coverage_nodes_cached(files_and_nodes, target_name,
roi_name, roi_filename, cache):
output_ext = ".coverage"
if roi_name:
@@ -186,8 +187,7 @@ def _build_coverage_nodes_cached(config, files_and_nodes, target_name,
cache_key = (roi_filename, input_filename)
if cache_key not in cache:
- cache[cache_key] = CoverageNode(config=config,
- input_file=input_filename,
+ cache[cache_key] = CoverageNode(input_file=input_filename,
output_file=output_filename,
target_name=target_name,
regions_file=roi_filename,
diff --git a/paleomix/tools/bam_pipeline/pipeline.py b/paleomix/tools/bam_pipeline/pipeline.py
index 59a9928..ac3cc1e 100755
--- a/paleomix/tools/bam_pipeline/pipeline.py
+++ b/paleomix/tools/bam_pipeline/pipeline.py
@@ -66,9 +66,9 @@ def build_pipeline_trimming(config, makefile):
for (_, samples) in makefile["Targets"].iteritems():
print_info(".", end='')
- for (_, libraries) in samples.iteritems():
- for (_, barcodes) in libraries.iteritems():
- for (barcode, record) in barcodes.iteritems():
+ for libraries in samples.itervalues():
+ for barcodes in libraries.itervalues():
+ for record in barcodes.itervalues():
if record["Type"] in ("Raw", "Trimmed"):
offset = record["Options"]["QualityOffset"]
reads = Reads(config, record, offset)
@@ -99,13 +99,24 @@ def build_pipeline_full(config, makefile, return_nodes=True):
lanes.append(lane)
if lanes:
- libraries.append(parts.Library(config, target_name, prefix, lanes, library_name))
+ libraries.append(parts.Library(config=config,
+ target=target_name,
+ prefix=prefix,
+ lanes=lanes,
+ name=library_name))
if libraries:
- samples.append(parts.Sample(config, prefix, libraries, sample_name))
+ samples.append(parts.Sample(config=config,
+ prefix=prefix,
+ libraries=libraries,
+ name=sample_name))
if samples:
- prefixes.append(parts.Prefix(config, prefix, samples, features, target_name))
+ prefixes.append(parts.Prefix(config=config,
+ prefix=prefix,
+ samples=samples,
+ features=features,
+ target=target_name))
if prefixes:
target = parts.Target(config, prefixes, target_name)
@@ -254,16 +265,19 @@ def run(config, args, pipeline_variant):
def _print_usage(pipeline):
basename = "%s_pipeline" % (pipeline,)
-
- print_info("BAM Pipeline v%s\n" % (paleomix.__version__,))
- print_info("Usage:")
- print_info(" -- %s help -- Display this message" % basename)
- print_info(" -- %s example [...] -- Create example project in folder." % basename)
- print_info(" -- %s makefile [...] -- Print makefile template." % basename)
- print_info(" -- %s dryrun [...] -- Perform dry run of pipeline on provided makefiles." % basename)
- print_info(" %s Equivalent to 'bam_pipeline run --dry-run [...]'." % (" " * len(basename),))
- print_info(" -- %s run [...] -- Run pipeline on provided makefiles." % basename)
- print_info(" -- %s remap [...] -- Re-map hits from previous alignment." % basename)
+ usage = \
+ "BAM Pipeline v{version}\n" \
+ "Usage:\n" \
+ " -- {cmd} help -- Display this message.\n" \
+ " -- {cmd} example [...] -- Create example project.\n" \
+ " -- {cmd} makefile [...] -- Print makefile template.\n" \
+ " -- {cmd} dryrun [...] -- Perform dry run of pipeline.\n" \
+ " -- {cmd} run [...] -- Run pipeline on provided makefiles.\n" \
+ " -- {cmd} remap [...] -- Re-map hits from previous alignment."
+
+ print_info(usage.format(
+ version=paleomix.__version__,
+ cmd=basename))
def main(argv, pipeline="bam"):
diff --git a/paleomix/tools/bam_stats/common.py b/paleomix/tools/bam_stats/common.py
index cc9dc0e..5fa3f10 100644
--- a/paleomix/tools/bam_stats/common.py
+++ b/paleomix/tools/bam_stats/common.py
@@ -174,7 +174,7 @@ def main_wrapper(process_func, argv, ext):
def _get_readgroup(record):
try:
- return record.opt("RG")
+ return record.get_tag("RG")
except KeyError:
return None
diff --git a/paleomix/tools/cleanup.py b/paleomix/tools/cleanup.py
index c654456..a2863e3 100755
--- a/paleomix/tools/cleanup.py
+++ b/paleomix/tools/cleanup.py
@@ -87,8 +87,9 @@ def _pipe_to_bam():
files that do not contain records (i.e. only a header), which are not
properly handled by "samtools view -S -", resulting in a parse failure.
"""
- with pysam.Samfile("-", "r") as input_handle:
- with pysam.Samfile("-", "wbu", template=input_handle) as output_handle:
+ with pysam.AlignmentFile("-", "r") as input_handle:
+ with pysam.AlignmentFile("-", "wbu",
+ template=input_handle) as output_handle:
for record in input_handle:
output_handle.write(record)
@@ -108,6 +109,11 @@ def _cleanup_record(record):
record.rnext = -1
record.pnext = -1
record.tlen = 0
+ elif record.mate_is_unmapped and record.has_tag('MC'):
+ # Picard ValidateSamFile (2.9.1) objects to MC tags for unmapped mates,
+ # which are currently added by SAMTools (v1.4).
+ tags = record.get_tags(with_value_type=True)
+ record.set_tags([tag for tag in tags if tag[0] != 'MC'])
if record.is_unmapped:
record.mapq = 0
@@ -143,7 +149,7 @@ def _filter_record(args, record):
exclude_flags = args.exclude_flags & _SE_FLAGS_MASK
require_flags = args.require_flags & _SE_FLAGS_MASK
- if (record.flag & exclude_flags):
+ if record.flag & exclude_flags:
return True
elif ~(record.flag & require_flags) & require_flags:
return True
@@ -162,14 +168,14 @@ def _cleanup_unmapped(args, cleanup_sam):
filter_by_flag = bool(args.exclude_flags or args.require_flags)
spec = "r" if cleanup_sam else "rb"
- with pysam.Samfile("-", spec) as input_handle:
+ with pysam.AlignmentFile("-", spec) as input_handle:
header = copy.deepcopy(input_handle.header)
_set_sort_order(header)
_set_pg_tags(header, args.update_pg_tag)
if args.rg_id is not None:
_set_rg_tags(header, args.rg_id, args.rg)
- with pysam.Samfile("-", "wbu", header=header) as output_handle:
+ with pysam.AlignmentFile("-", "wbu", header=header) as output_handle:
for record in input_handle:
# Ensure that the properties make sense before filtering
record = _cleanup_record(record)
@@ -181,10 +187,10 @@ def _cleanup_unmapped(args, cleanup_sam):
if args.rg_id is not None:
# Ensure that only one RG tag is set
- tags = [(key, value) for (key, value) in record.tags
- if key != "RG"]
- tags.append(("RG", args.rg_id))
- record.tags = tags
+ tags = record.get_tags(with_value_type=True)
+ tags = [tag for tag in tags if tag[0] != "RG"]
+ tags.append(("RG", args.rg_id, "Z"))
+ record.set_tags(tags)
output_handle.write(record)
@@ -288,14 +294,14 @@ def _run_cleanup_pipeline(args):
def parse_args(argv):
+ """Parses a list of command-line arguments, excluding the program name."""
prog = "paleomix cleanup"
usage = "%s --temp-prefix prefix --fasta reference.fasta < in.sam" \
% (prog,)
parser = argparse.ArgumentParser(prog=prog, usage=usage)
# "Hidden" commands, invoking the various sub-parts of this script
- parser.add_argument('command', choices=('pipe', 'cleanup', 'cleanup-sam'),
- nargs="?", help=argparse.SUPPRESS)
+ parser.add_argument('command', nargs="?", help=argparse.SUPPRESS)
# Specifies if the 'cleanup' step should expect SAM input
parser.add_argument('--cleanup-sam', default=False, action="store_true",
help=argparse.SUPPRESS)
@@ -348,10 +354,15 @@ def parse_args(argv):
parser.add_argument('--samtools1x', choices=('yes', 'no'),
help=argparse.SUPPRESS)
- return parser.parse_args(argv)
+ args = parser.parse_args(argv)
+ if args.command not in (None, 'pipe', 'cleanup', 'cleanup-sam'):
+ parser.error("unrecognized arguments: %s" % (args.command,))
+
+ return args
def main(argv):
+ """Main function; returns 0 on success, non-zero otherwise."""
args = parse_args(argv)
if args.samtools1x is None:
@@ -370,9 +381,8 @@ def main(argv):
"v1.0+ are supported; please upgrade / "
"replace the installed copy of SAMTools!\n")
return 1
- except versions.VersionRequirementError, error:
- sys.stderr.write("ERROR: Could not determine SAMTools version: "
- "%s\n" % (error,))
+ except versions.VersionRequirementError as error:
+ sys.stderr.write("\nERROR: %s\n" % (error,))
return 1
if args.command == "pipe":
@@ -381,6 +391,8 @@ def main(argv):
return _cleanup_unmapped(args, cleanup_sam=False)
elif args.command == "cleanup-sam":
return _cleanup_unmapped(args, cleanup_sam=True)
+ elif args.command:
+ raise NotImplementedError('Unexpected command %r' % (args.command,))
sys.stderr.write("Reading SAM file from STDIN ...\n")
return _run_cleanup_pipeline(args)
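
The require-flags test in _filter_record above, '~(flag & require_flags) &
require_flags', is non-zero exactly when at least one required flag bit is
missing from the record. A small worked example with hypothetical flag values:

    require_flags = 0x1 | 0x2  # must be paired AND properly paired
    flag = 0x1                 # paired, but not properly paired

    missing = ~(flag & require_flags) & require_flags
    assert missing == 0x2      # 'properly paired' bit absent, so filtered
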
diff --git a/paleomix/tools/dupcheck.py b/paleomix/tools/dupcheck.py
new file mode 100644
index 0000000..5f0a40b
--- /dev/null
+++ b/paleomix/tools/dupcheck.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import argparse
+import sys
+
+import paleomix.nodes.validation as validation
+
+
+class ErrHandler(object):
+ def __init__(self, quiet=False):
+ self.quiet = quiet
+ self.duplicate_reads = 0
+
+ def __call__(self, chrom, pos, records, name, seq, qual):
+ self.duplicate_reads += 1
+ if self.quiet:
+ return
+
+ print('%s:%i -- %s %s %s:' % (chrom, pos, name, seq, qual))
+ for filename, records in sorted(records.iteritems()):
+ print(' - %s:' % (filename,))
+
+ for idx, record in enumerate(records, start=1):
+ print('% 8i. ' % (idx,), end='')
+
+ if record.is_paired:
+ if record.is_read1:
+ print('Mate 1 read', end='')
+ elif record.is_read2:
+ print('Mate 2 read', end='')
+ else:
+ print('Unpaired read', end='')
+ else:
+ print('Unpaired read', end='')
+
+ try:
+ print(' with read-group %r' % (record.get_tag('RG'),), end='')
+ except KeyError:
+ pass
+
+ print()
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(prog="paleomix dupcheck")
+ parser.add_argument("files", nargs="+",
+ help="One or more input BAM files.")
+ parser.add_argument("--quiet", default=False, action='store_true',
+ help="Only print the number of BAM records where 1 or "
+ "more potential duplicates duplicates were "
+ "identified.")
+
+ return parser.parse_args(argv)
+
+
+def main(argv):
+ """Main function; takes a list of arguments but excluding sys.argv[0]."""
+ args = parse_args(argv)
+ handler = ErrHandler(quiet=args.quiet)
+ validation.check_bam_files(args.files, handler)
+
+ if args.quiet:
+ print('%i' % (handler.duplicate_reads,))
+ else:
+ print('Found %i record(s) with duplicates.' % (handler.duplicate_reads,))
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
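
The new tool is run directly on one or more BAM files; with --quiet, only the
number of records with potential duplicates is printed (illustrative
filenames):

    $ paleomix dupcheck sample_1.bam sample_2.bam
    $ paleomix dupcheck --quiet sample_1.bam
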
diff --git a/paleomix/tools/rmdup_collapsed.py b/paleomix/tools/rmdup_collapsed.py
index eb5e9ec..b5ea8c2 100755
--- a/paleomix/tools/rmdup_collapsed.py
+++ b/paleomix/tools/rmdup_collapsed.py
@@ -1,121 +1,222 @@
#!/usr/bin/env python
+#
+# Based on 'FilterUniqueBAM' by
+# Martin Kircher
+# Martin.Kircher at eva.mpg.de
+#
+"""paleomix rmdup_collapsed [options] < sorted.bam > out.bam
-"""
-Stripped down version of 'FilterUniqueBAM' by
-:Author: Martin Kircher
-:Contact: Martin.Kircher at eva.mpg.de
-:Date: *08.10.2011
-:Type: tool
-:Input: BAM
-:Output: BAM
-
-Mark/Filter PCR duplicates for merged PE reads Reads BAM
-from STDIN and writes BAM to STDOUT. All non-collapsed reads
-as well as secondary/chinermic alignments, reads that have
-failed QC and unmmaped reads, are written to STDOUT as is.
-
-The input is assumed to be sorted by coordinates, and this
-order is preservered, though individual reads at the same
-position may be re-arranged).
-"""
+The rmdup_collapsed command filters a BAM file for PCR duplicates among
+unpaired reads, under the assumption that any unpaired read has been generated
+by the merging of overlapping paired-end reads, and thereby represents the
+complete template sequence. PCR duplicates are therefore detected based on
+both the 5' and 3' alignment coordinates.
-import sys
-import pysam
+Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100),
+reads that failed QC (0x200), and chimeric alignments (0x800), as identified
+using the BAM record flags, are not filtered, but simply written to the output.
+
+By default, filtered reads are flagged using the "duplicate" flag (0x400), and
+written to the output. Use the --remove-duplicates command-line option to
+instead remove these records from the output.
+"""
+import collections
import random
+import sys
from argparse import ArgumentParser
+import pysam
-def calc_consensus(reads, rng=random.random):
- count = len(reads)
- outread = None
- maxsumqual = -1
- for read in reads:
- # Handle reads without qualities, but favor reads with qualities
- qual = read.qual
- if qual is None:
- # Generate value in (-1; 0]
- nsum = -rng()
- else:
- nsum = sum(map(ord, qual))
- if nsum > maxsumqual:
- outread = read
- maxsumqual = nsum
+_FILTERED_FLAGS = 0x1 # PE reads
+_FILTERED_FLAGS |= 0x4 # Unmapped
+_FILTERED_FLAGS |= 0x100 # Secondary alignment
+_FILTERED_FLAGS |= 0x200 # Failed QC
+_FILTERED_FLAGS |= 0x800 # Chimeric alignment
- # LOOK FOR PREVIOUS PCR DUPLICATE COUNTS
- for key, value in read.tags:
- if key == "XP":
- count += value
+_CIGAR_SOFTCLIP = 4
+_CIGAR_HARDCLIP = 5
- if not outread.tags:
- outread.tags = [("XP", count)]
- else:
- outread.tags = outread.tags + [("XP", count)]
- return outread
+def read_quality(read):
+ qualities = read.query_alignment_qualities
+ if qualities is None:
+ # Generate value in range (-1; 0]
+ return -random.random()
+ return sum(qualities)
-def get_consensus_se(reads):
- # DETERMINE MOST FREQUENT CIGAR LINE
- by_cigar = {}
- cigar_count = {}
- for read in reads:
- tcigar = tuple(read.cigar)
- if tcigar in by_cigar:
- cigar_count[tcigar] += 1
- by_cigar[tcigar].append(read)
- else:
- cigar_count[tcigar] = 1
- by_cigar[tcigar] = [read]
- to_sort = [(y, -len(str(x)), x) for (x, y) in cigar_count.iteritems()]
- to_sort.sort()
- selcigar = to_sort[-1][-1]
- reads = by_cigar[selcigar]
+def copy_number(read):
+ # has_tag is faster than try/except, since most reads lack the tag.
+ if read.has_tag('XP'):
+ return read.get_tag('XP')
- return calc_consensus(reads)
+ return 0
-def write_consensus_se(outfile, reads, remove_duplicates):
- consensus = get_consensus_se(reads)
+def mark_duplicate_reads(reads):
+ """Identifies the best read from a set of PCR duplicates, and marks all
+ other reads as duplicates. The best read is selected by quality, among the
+ reads sharing the most common CIGAR string.
+ """
+ by_cigar = collections.defaultdict(list)
for read in reads:
- read.is_duplicate = (read is not consensus)
- if not (read.is_duplicate and remove_duplicates):
- outfile.write(read)
+ key = tuple(read.cigartuples)
+ by_cigar[key].append(read)
+
+ # Select the most common CIGAR strings, favoring simple CIGARs
+ best_count, best_cigar_len = max((len(values), -len(cigar))
+ for cigar, values in by_cigar.iteritems())
+ best_cigar_len = -best_cigar_len
+
+ best_read = None
+ best_quality = -1
+ copies = len(reads)
+
+ for cigar, candidates in by_cigar.iteritems():
+ if len(cigar) == best_cigar_len and len(candidates) == best_count:
+ for read in candidates:
+ copies += copy_number(read)
+ quality = read_quality(read)
+
+ if quality > best_quality:
+ best_read = read
+ best_quality = quality
+ else:
+ copies += sum(copy_number(read) for read in candidates)
+ best_read.set_tag('XP', copies, 'i')
+ for read in reads:
+ read.is_duplicate = (read is not best_read)
-def _flush_buffer(outfile, curvariants, remove_duplicates):
- for value in curvariants.itervalues():
- write_consensus_se(outfile, value[0], remove_duplicates)
- curvariants.clear()
+def write_read(args, out, read_and_alignment, duplicates_by_alignment):
+ read, alignment = read_and_alignment
+ if alignment is not None:
+ duplicates = duplicates_by_alignment.pop(alignment)
-_FILTERED_FLAGS = 0x1 # PE reads
-_FILTERED_FLAGS |= 0x4 # Unmapped
-_FILTERED_FLAGS |= 0x100 # Secondary alignment
-_FILTERED_FLAGS |= 0x200 # Failed QC
-_FILTERED_FLAGS |= 0x800 # Chimeric alignment
+ if len(duplicates) > 1:
+ # Select the best read and mark the others as duplicates.
+ mark_duplicate_reads(duplicates)
+ else:
+ duplicates[0].is_duplicate = False
+ duplicates[0].set_tag('XP', 1, 'i')
+
+ if not (args.remove_duplicates and read.is_duplicate):
+ out.write(read)
+
+
+def can_write_read(read_and_alignment, current_position):
+ """Returns true if the first read in the cache can safely be written. This
+ will be the case if the read was not the first in a set of reads with the
+ same alignment, or if the current position has gone beyond the last base
+ covered in that alignment.
+ """
+ _, alignment = read_and_alignment
+ if alignment is None:
+ return True
+
+ current_ref_id, current_ref_start = current_position
+ alignment_ref_id, _, _, alignment_ref_end = alignment
+
+ return alignment_ref_id != current_ref_id \
+ or alignment_ref_end < current_ref_start
+
+
+def clipped_bases_at_front(cigartuples):
+ """Returns number of bases soft or hard clipped at start of the CIGAR."""
+ total = 0
+ for (operation, length) in cigartuples:
+ if operation != _CIGAR_SOFTCLIP and operation != _CIGAR_HARDCLIP:
+ break
+
+ total += length
+
+ return total
+
+
+def unclipped_alignment_coordinates(read):
+ """Returns tuple describing the alignment, with external coordinates
+ modified to account for clipped bases, assuming an ungapped alignment to
+ the reference. This is equivalent to the behavior of Picard MarkDuplicates.
+ """
+ cigartuples = read.cigartuples
+ start = read.reference_start - clipped_bases_at_front(cigartuples)
+ end = read.reference_end + clipped_bases_at_front(reversed(cigartuples))
+
+ return (read.reference_id, read.is_reverse, start, end)
+
+
+def process_aligned_read(cache, duplicates_by_alignment, read):
+ """Processes an aligned read, either pairing it with an existing read, or
+ creating a new alignment block to track copies of this read.
+ """
+ alignment = unclipped_alignment_coordinates(read)
+
+ try:
+ duplicates_by_alignment[alignment].append(read)
+ cache.append((read, None))
+ except KeyError:
+ # No previous reads with matching alignment; this read will
+ # serve to track any other reads with the same alignment.
+ duplicates_by_alignment[alignment] = [read]
+ cache.append((read, alignment))
+
+
+def is_trailing_unmapped_read(read):
+ return read.is_unmapped \
+ and read.reference_id == -1 \
+ and read.reference_start == -1
+
+
+def process(args, infile, outfile):
+ cache = collections.deque()
+ duplicates_by_alignment = {}
+ last_position = (0, 0)
+ read_num = 1
+
+ for read_num, read in enumerate(infile, start=read_num):
+ current_position = (read.reference_id, read.reference_start)
+ if last_position > current_position:
+ # This check also catches trailing unmapped reads placed at (-1, -1).
+ if not is_trailing_unmapped_read(read):
+ sys.stderr.write("ERROR: Input file is not sorted by "
+ "coordinates at read %i. Aborting!\n"
+ % (read_num,))
+ return 1
+
+ cache.append((read, None))
+ break
+ elif read.flag & _FILTERED_FLAGS:
+ cache.append((read, None))
+ else:
+ process_aligned_read(cache, duplicates_by_alignment, read)
+ last_position = current_position
+ while cache and can_write_read(cache[0], current_position):
+ write_read(args, outfile, cache.popleft(), duplicates_by_alignment)
-def parse_args(argv):
- usage = """paleomix rmdup_collapsed [options] < sorted.bam > out.bam
+ while cache:
+ write_read(args, outfile, cache.popleft(), duplicates_by_alignment)
-The rmdup_collapsed filters a BAM file for PCR duplicates unpaired reads under
-the assumption that any unpaired read have been generated by the merging of
-overlapping paired-end reads, and thereby represent the complete template
-sequence. PCR duplicates are therefore detected based on both the 5' and 3'
-alignment coordinate.
+ assert not duplicates_by_alignment, duplicates_by_alignment
-Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100),
-reads that failed QC (0x200), and chimeric alignments (0x800), as identified
-using the BAM record flags, are not filtered, but simply written to the output.
+ for read_num, read in enumerate(infile, start=read_num + 1):
+ if not is_trailing_unmapped_read(read):
+ sys.stderr.write("ERROR: Input file is not sorted by "
+ "coordinates at read %i. Aborting!\n"
+ % (read_num,))
+ return 1
-By default, filtered reads are flagged using the "duplicate" flag (0x400), and
-written to the output. Use the --remove-duplicates command-line option to
-instead remove these records from the output.
-"""
- parser = ArgumentParser(usage=usage)
+ outfile.write(read)
+
+ return 0
+
+
+def parse_args(argv):
+ parser = ArgumentParser(usage=__doc__)
parser.add_argument("input", default="-", nargs="?",
help="BAM file; if not set, input is read from STDIN.")
parser.add_argument("--remove-duplicates",
@@ -143,37 +244,9 @@ def main(argv):
sys.stderr.write("STDOUT is a terminal, terminating!\n")
return 1
- with pysam.Samfile(args.input, "rb") as infile:
- with pysam.Samfile("-", "wb", template=infile) as outfile:
- curpos = None
- curvariants = {}
- for (read_num, read) in enumerate(infile):
- if curpos and ((read.tid, read.pos) != curpos):
- # Sort order is defined as ascending 'tid's and positions
- if curpos > (read.tid, read.pos) and not read.is_unmapped:
- sys.stderr.write("ERROR: Input file does not appear "
- "to be sorted by coordinates at "
- "record %i, aborting ...\n"
- % (read_num,))
- return 1
-
- _flush_buffer(outfile, curvariants,
- args.remove_duplicates)
- curpos = None
-
- if read.flag & _FILTERED_FLAGS:
- outfile.write(read)
- continue
-
- curpos = (read.tid, read.pos)
- nkey = (read.is_reverse, read.pos, read.alen)
- if nkey in curvariants:
- curvariants[nkey][0].append(read)
- curvariants[nkey][1] += 1
- else:
- curvariants[nkey] = [[read], 1]
-
- _flush_buffer(outfile, curvariants, args.remove_duplicates)
+ with pysam.AlignmentFile(args.input, "rb") as infile:
+ with pysam.AlignmentFile("-", "wb", template=infile) as outfile:
+ return process(args, infile, outfile)
return 0
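
Duplicate detection in the rewritten tool keys reads on their unclipped
alignment coordinates, mirroring Picard MarkDuplicates. A worked example with
hypothetical values:

    # read: reference_start = 1000, CIGAR = 5S45M3S
    # pysam half-open end: reference_end = 1000 + 45 = 1045
    # unclipped start = 1000 - 5 = 995
    # unclipped end   = 1045 + 3 = 1048
    # key: (reference_id, is_reverse, 995, 1048)

Reads whose keys match are grouped and resolved by mark_duplicate_reads, even
if the aligner clipped them differently.
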
diff --git a/paleomix/tools/zonkey/config.py b/paleomix/tools/zonkey/config.py
index ef5102e..b988ecb 100644
--- a/paleomix/tools/zonkey/config.py
+++ b/paleomix/tools/zonkey/config.py
@@ -21,6 +21,7 @@
# SOFTWARE.
#
import os
+import string
import sys
import optparse
@@ -251,6 +252,7 @@ def _read_sample_table(config, filename):
a single mitochondrial alignment (2 columns), or both (3 columns).
"""
print_info("Reading table of samples from %r" % (filename,))
+ valid_characters = frozenset(string.letters + string.digits + ".-_")
samples = config.samples = {}
with fileutils.open_ro(filename) as handle:
@@ -258,16 +260,24 @@ def _read_sample_table(config, filename):
if not line.strip() or line.lstrip().startswith("#"):
continue
- fields = filter(None, line.rstrip('\r\n').split('\t'))
+ fields = filter(None, map(str.strip, line.split('\t')))
if len(fields) not in (2, 3):
- print_err("Error reading sample table (%r) at line %i; "
- "expected 2 or 3 columns, found %i; please "
+ print_err("Error reading sample table (%r) at line %i: "
+ "Expected 2 or 3 columns, found %i; please "
"correct file before continuing."
% (filename, linenum, len(fields)))
return
name = fields[0]
- if name in samples:
+ invalid_letters = frozenset(name) - valid_characters
+ if invalid_letters:
+ print_err("Error reading sample table (%r) at line %i: "
+ "Sample name contains illegal character(s). Only "
+ "letters, numbers, and '-', '_', and '.' are "
+ "allowed, but found %r in name %r "
+ % (filename, linenum, "".join(invalid_letters), name))
+ return
+ elif name in samples:
print_err("Duplicate sample name found in sample table "
"(%r) at line %i: %r. All sample names must "
"be unique!" % (filename, linenum, name))
diff --git a/paleomix/tools/zonkey/pipeline.py b/paleomix/tools/zonkey/pipeline.py
index 8a8ed55..2b50d46 100755
--- a/paleomix/tools/zonkey/pipeline.py
+++ b/paleomix/tools/zonkey/pipeline.py
@@ -315,26 +315,33 @@ def build_pipeline(config, root, nuc_bam, mito_bam, cache):
def run_admix_pipeline(config):
- print_info("Building Zonkey pipeline ", end='')
+ print_info("\nBuilding %i Zonkey pipeline(s):" % (len(config.samples),))
config.temp_root = os.path.join(config.destination, "temp")
if not config.dry_run:
fileutils.make_dirs(config.temp_root)
cache = {}
nodes = []
- for sample in config.samples.itervalues():
+ items = config.samples.iteritems()
+ for idx, (name, sample) in enumerate(sorted(items), start=1):
root = sample["Root"]
nuc_bam = sample["Files"].get("Nuc")
mito_bam = sample["Files"].get("Mito")
+ genomes = []
+ if mito_bam:
+ genomes.append("MT")
+ if nuc_bam:
+ genomes.append("Nuclear")
+
+ print_info(" %i. %s: %s DNA" % (idx, name, ' and '.join(genomes)))
+
nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))
- print_info(".", end='')
if config.multisample and not config.admixture_only:
nodes = [summary.SummaryNode(config, nodes)]
- print_info(".")
- if not run_pipeline(config, nodes, "\nRunning Zonkey ..."):
+ if not run_pipeline(config, nodes, "\nRunning Zonkey:"):
return 1
diff --git a/paleomix/ui.py b/paleomix/ui.py
index 7710a3b..f33a1cc 100644
--- a/paleomix/ui.py
+++ b/paleomix/ui.py
@@ -374,7 +374,7 @@ class ProgressUI(BaseUI):
def flush(self):
"""See BaseUI.flush."""
- if (self._refresh_count <= 0):
+ if self._refresh_count <= 0:
self._refresh_count = ProgressUI._SUMMARY_EVERY
self._print_summary()
@@ -455,14 +455,10 @@ def _fmt_runtime(runtime):
secs=(runtime % 60))
-# No longer provided
-VerboseUI = RunningUI
-QuietUI = RunningUI
-
# Different types of UIs
UI_TYPES = {
- "Verbose": VerboseUI,
- "Quiet": RunningUI,
+ "Verbose": RunningUI, # Deprecated
+ "Quiet": RunningUI, # Deprecated
"Running": RunningUI,
"Progress": ProgressUI,
"Summary": SummaryUI,
diff --git a/setup.py b/setup.py
index e4b3084..f3cafd5 100644
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ setup(
packages=find_packages(exclude=['misc', 'tests']),
- install_requires=['pysam>=0.8.3',
+ install_requires=['pysam>=0.10.0',
'setproctitle>=1.1.0'],
# Dependencies set in setup_requires to allow use of 'setup.py nosetests'
diff --git a/tests/.coveragerc b/tests/.coveragerc
new file mode 100644
index 0000000..c276359
--- /dev/null
+++ b/tests/.coveragerc
@@ -0,0 +1,11 @@
+[run]
+branch = True
+
+[report]
+skip_covered = True
+
+omit =
+ */paleomix/yaml*.py
+ */paleomix/yaml/lib2/*.py
+
+
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..e308acb
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,4 @@
+To run the test suite, install and run 'tox':
+
+ $ pip install --user tox
+ $ tox
diff --git a/tests/atomiccmd_test/builder_test.py b/tests/atomiccmd_test/builder_test.py
index f066471..e0d86d1 100644
--- a/tests/atomiccmd_test/builder_test.py
+++ b/tests/atomiccmd_test/builder_test.py
@@ -20,6 +20,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
+# pylint: disable=missing-docstring
+#
from flexmock import flexmock
from nose.tools import assert_equal, assert_raises
@@ -117,7 +119,8 @@ def test_builder__add_option__overwrite():
builder.add_option("-name", "*.txt")
builder.add_option("-or")
builder.add_option("-name", "*.bat")
- assert_equal(builder.call, ["find", "-name", "*.txt", "-or", "-name", "*.bat"])
+ assert_equal(builder.call, ["find", "-name",
+ "*.txt", "-or", "-name", "*.bat"])
###############################################################################
@@ -155,7 +158,8 @@ def test_builder__add_or_set_option__add_and_set():
def _do_test_builder__add_or_set_option__add_and_set(setter_1, setter_2):
builder = AtomicCmdBuilder("find")
setter_1(builder, "-name", "*.txt")
- assert_raises(AtomicCmdBuilderError, setter_2, builder, "-name", "*.bat")
+ assert_raises(AtomicCmdBuilderError, setter_2,
+ builder, "-name", "*.bat")
yield _do_test_builder__add_or_set_option__add_and_set, AtomicCmdBuilder.set_option, AtomicCmdBuilder.add_option
yield _do_test_builder__add_or_set_option__add_and_set, AtomicCmdBuilder.add_option, AtomicCmdBuilder.set_option
@@ -297,7 +301,8 @@ def test_builder__set_kwargs__after_finalize():
builder = AtomicCmdBuilder("echo")
builder.set_kwargs(IN_PATH="/a/b/")
builder.finalize()
- assert_raises(AtomicCmdBuilderError, builder.set_kwargs, OUT_PATH="/dst/file")
+ assert_raises(AtomicCmdBuilderError,
+ builder.set_kwargs, OUT_PATH="/dst/file")
assert_equal(builder.kwargs, expected)
@@ -305,7 +310,8 @@ def test_builder__set__kwargs__overwriting():
expected = {"IN_PATH": "/a/b/"}
builder = AtomicCmdBuilder("echo")
builder.set_kwargs(IN_PATH="/a/b/")
- assert_raises(AtomicCmdBuilderError, builder.set_kwargs, IN_PATH="/dst/file")
+ assert_raises(AtomicCmdBuilderError,
+ builder.set_kwargs, IN_PATH="/dst/file")
assert_equal(builder.kwargs, expected)
@@ -349,9 +355,10 @@ def test_builder__finalize__returns_singleton():
def test_builder__finalize__calls_atomiccmd():
was_called = []
- class _AtomicCmdMock:
+ class _AtomicCmdMock(object):
def __init__(self, *args, **kwargs):
- assert_equal(args, (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],))
+ assert_equal(args,
+ (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],))
assert_equal(kwargs, {"IN_FILE": "/in/file",
"OUT_FILE": "/out/file",
"set_cwd": True})
@@ -473,6 +480,47 @@ def test_builder__add_multiple_values_multiple_times():
###############################################################################
###############################################################################
+# AtomicCmdBuilder: add_multiple_kwargs
+
+def test_builder__add_multiple_kwargs():
+ values = ("file_a", "file_b")
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_kwargs(values)
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls"])
+
+
+def test_builder__add_multiple_kwargs_with_template():
+ values = ("file_a", "file_b")
+ expected = {"OUT_BAM_1": "file_a", "OUT_BAM_2": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_kwargs(values, template="OUT_BAM_%i")
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls"])
+
+
+def test_builder__add_multiple_kwargs_multiple_times():
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_kwargs(("file_a",))
+ assert_equal(kwargs, {"IN_FILE_01": "file_a"})
+ kwargs = builder.add_multiple_kwargs(("file_b",))
+ assert_equal(kwargs, {"IN_FILE_02": "file_b"})
+
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls"])
+
+
+###############################################################################
+###############################################################################
# AtomicJavaCmdBuilder
def test_java_builder__default__no_config():
@@ -517,12 +565,15 @@ def test_java_builder__multithreaded_gc():
def test_java_builder__multithreaded_gc__zero_or_negative_threads():
- assert_raises(ValueError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads=0)
- assert_raises(ValueError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads=-1)
+ assert_raises(ValueError, AtomicJavaCmdBuilder,
+ "/path/Foo.jar", gc_threads=0)
+ assert_raises(ValueError, AtomicJavaCmdBuilder,
+ "/path/Foo.jar", gc_threads=-1)
def test_java_builder__multithreaded_gc__non_int_threads():
- assert_raises(TypeError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads="3")
+ assert_raises(TypeError, AtomicJavaCmdBuilder,
+ "/path/Foo.jar", gc_threads="3")
def test_java_builder__kwargs():
@@ -606,13 +657,13 @@ def test_custom_cli__multiple_named_args():
return {}
obj = SingleNamedArg.customize(123, 456)
- assert_equal(obj.first, 123)
+ assert_equal(obj.first, 123)
assert_equal(obj.second, 456)
def test_custom_cli__only_customize_is_valid_function_name():
try:
- class ClassWithMisnamedFunction:
+ class ClassWithMisnamedFunction(object):
@create_customizable_cli_parameters
def not_called_customize(cls, first, second):
return {} # pragma: no coverage
@@ -637,7 +688,7 @@ def test_apply_options__single_option__default_pred__set_when_pred_is_true():
apply_options(mock, {"--foo": 17})
-def test_apply_options__single_option__default_pred__ignore_when_pred_is_false():
+def test_apply_options__single_option__default_pred__ignore_false_pred():
mock = flexmock()
apply_options(mock, {"Other": None})
diff --git a/tests/atomiccmd_test/command_test.py b/tests/atomiccmd_test/command_test.py
index f871851..aad8c2d 100644
--- a/tests/atomiccmd_test/command_test.py
+++ b/tests/atomiccmd_test/command_test.py
@@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -20,6 +20,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
+# pylint: disable=missing-docstring
+#
import os
import signal
import weakref
@@ -28,15 +30,15 @@ from flexmock import flexmock
import nose
from nose.tools import \
- assert_in, \
- assert_equal, \
- assert_raises
+ assert_in, \
+ assert_equal, \
+ assert_raises
from paleomix.common.testing import \
- with_temp_folder, \
- Monkeypatch, \
- get_file_contents, \
- set_file_contents
+ with_temp_folder, \
+ Monkeypatch, \
+ get_file_contents, \
+ set_file_contents
import paleomix.atomiccmd.command
import paleomix.common.fileutils as fileutils
@@ -97,7 +99,8 @@ def test_atomiccmd__set_cwd():
expected = temp_folder if set_cwd else cwd
result = get_file_contents(os.path.join(temp_folder, "result.txt"))
- assert os.path.samefile(expected, result), "%r != %r" % (expected, result)
+ assert os.path.samefile(
+ expected, result), "%r != %r" % (expected, result)
yield _do_test_atomiccmd__set_cwd, False
yield _do_test_atomiccmd__set_cwd, True
@@ -106,7 +109,7 @@ def test_atomiccmd__set_cwd():
# Full path when set_cwd is False, rel. path when True
def test_atomiccmd__set_cwd__temp_in_out():
@with_temp_folder
- def _do_test_atomiccmd__paths_temp_in(temp_folder, set_cwd, kwargs):
+ def _test_atomiccmd__paths_temp_in(temp_folder, set_cwd, kwargs):
cmd = AtomicCmd(("echo", "-n", "%%(%s)s" % tuple(kwargs.keys())),
TEMP_OUT_STDOUT="result.txt",
set_cwd=set_cwd,
@@ -118,10 +121,10 @@ def test_atomiccmd__set_cwd__temp_in_out():
result = get_file_contents(os.path.join(temp_folder, "result.txt"))
assert_equal(os.path.abspath(expected), os.path.abspath(result))
- yield _do_test_atomiccmd__paths_temp_in, True, {"TEMP_IN_FOO": "test_file"}
- yield _do_test_atomiccmd__paths_temp_in, False, {"TEMP_IN_FOO": "test_file"}
- yield _do_test_atomiccmd__paths_temp_in, True, {"TEMP_OUT_FOO": "test_file"}
- yield _do_test_atomiccmd__paths_temp_in, False, {"TEMP_OUT_FOO": "test_file"}
+ yield _test_atomiccmd__paths_temp_in, True, {"TEMP_IN_FOO": "test_file"}
+ yield _test_atomiccmd__paths_temp_in, False, {"TEMP_IN_FOO": "test_file"}
+ yield _test_atomiccmd__paths_temp_in, True, {"TEMP_OUT_FOO": "test_file"}
+ yield _test_atomiccmd__paths_temp_in, False, {"TEMP_OUT_FOO": "test_file"}
###############################################################################
@@ -143,12 +146,14 @@ def test_atomiccmd__paths():
OUT_STDERR="/var/log/pipe.stderr",
TEMP_OUT_STDOUT="pipe.stdout")
- assert_equal(cmd.executables, frozenset(["ls", "true"]))
- assert_equal(cmd.requirements, frozenset([bool]))
- assert_equal(cmd.input_files, frozenset(["/a/b/c", "/x/y/z"]))
- assert_equal(cmd.output_files, frozenset(["/out/foo", "foo/bar", "/var/log/pipe.stderr"]))
+ assert_equal(cmd.executables, frozenset(["ls", "true"]))
+ assert_equal(cmd.requirements, frozenset([bool]))
+ assert_equal(cmd.input_files, frozenset(["/a/b/c", "/x/y/z"]))
+ assert_equal(cmd.output_files, frozenset(
+ ["/out/foo", "foo/bar", "/var/log/pipe.stderr"]))
assert_equal(cmd.auxiliary_files, frozenset(["wat/wat"]))
- assert_equal(cmd.expected_temp_files, frozenset(["foo", "bar", "pipe.stderr"]))
+ assert_equal(cmd.expected_temp_files, frozenset(
+ ["foo", "bar", "pipe.stderr"]))
assert_in("xyb", cmd.optional_temp_files)
assert_in("pipe.stdout", cmd.optional_temp_files)
@@ -171,7 +176,8 @@ def test_atomiccmd__pipes_stdin(temp_folder):
cmd.run(temp_folder)
assert_equal(cmd.join(), [0])
result = get_file_contents(os.path.join(temp_folder, "result.txt"))
- assert_equal(result, ">This_is_FASTA!\nACGTN\n>This_is_ALSO_FASTA!\nCGTNA\n")
+ assert_equal(
+ result, ">This_is_FASTA!\nACGTN\n>This_is_ALSO_FASTA!\nCGTNA\n")
@with_temp_folder
@@ -214,8 +220,11 @@ def test_atomiccmd__pipes_stdin__dev_null_explicit(temp_folder):
# Test possible combinations of explicit / implicit saving of stdout/err
def test_atomiccmd__pipes_out():
@with_temp_folder
- def _do_test_atomiccmd__pipes_out(temp_folder, stdout, stderr, kwargs):
- cmd = AtomicCmd(("bash", "-c", "echo -n 'STDERR!' > /dev/stderr; echo -n 'STDOUT!';"), **kwargs)
+ def _test_atomiccmd__pipes_out(temp_folder, stdout, stderr, kwargs):
+ call = ("bash", "-c",
+ "echo -n 'STDERR!' > /dev/stderr; echo -n 'STDOUT!';")
+
+ cmd = AtomicCmd(call, **kwargs)
cmd.run(temp_folder)
assert_equal(cmd.join(), [0])
@@ -229,99 +238,110 @@ def test_atomiccmd__pipes_out():
assert_equal(set(os.listdir(temp_folder)), set(expected_files))
- yield _do_test_atomiccmd__pipes_out, "pipe_bash_{0}.stdout", "pipe_bash_{0}.stderr", {}
- yield _do_test_atomiccmd__pipes_out, "pipe_bash_{0}.stdout", "stderr.txt", {"OUT_STDERR": "stderr.txt"}
- yield _do_test_atomiccmd__pipes_out, "stdout.txt", "pipe_bash_{0}.stderr", {"OUT_STDOUT": "stdout.txt"}
- yield _do_test_atomiccmd__pipes_out, "stdout.txt", "stderr.txt", {"OUT_STDOUT": "stdout.txt",
- "OUT_STDERR": "stderr.txt"}
-
- yield _do_test_atomiccmd__pipes_out, None, None, {"OUT_STDOUT": AtomicCmd.DEVNULL,
- "OUT_STDERR": AtomicCmd.DEVNULL}
- yield _do_test_atomiccmd__pipes_out, None, "pipe_bash_{0}.stderr", {"OUT_STDOUT": AtomicCmd.DEVNULL}
- yield _do_test_atomiccmd__pipes_out, "pipe_bash_{0}.stdout", None, {"OUT_STDERR": AtomicCmd.DEVNULL}
-
-
-def test_atomiccmd__pipes_out_dev_null():
- @with_temp_folder
- def _do_test_atomiccmd__pipes_out(temp_folder, stdout, stderr, kwargs):
- cmd = AtomicCmd(("bash", "-c", "echo -n 'STDERR!' > /dev/stderr; echo -n 'STDOUT!';"), **kwargs)
- cmd.run(temp_folder)
- assert_equal(cmd.join(), [0])
- result_out = get_file_contents(os.path.join(temp_folder, stdout.format(id(cmd))))
- result_err = get_file_contents(os.path.join(temp_folder, stderr.format(id(cmd))))
- assert_equal(result_out, "STDOUT!")
- assert_equal(result_err, "STDERR!")
+ yield _test_atomiccmd__pipes_out, \
+ "pipe_bash_{0}.stdout", "pipe_bash_{0}.stderr", {}
+ yield _test_atomiccmd__pipes_out, \
+ "pipe_bash_{0}.stdout", "stderr.txt", {"OUT_STDERR": "stderr.txt"}
+ yield _test_atomiccmd__pipes_out, \
+ "stdout.txt", "pipe_bash_{0}.stderr", {"OUT_STDOUT": "stdout.txt"}
+ yield _test_atomiccmd__pipes_out, \
+ "stdout.txt", "stderr.txt", {"OUT_STDOUT": "stdout.txt",
+ "OUT_STDERR": "stderr.txt"}
+
+ yield _test_atomiccmd__pipes_out, \
+ None, None, {"OUT_STDOUT": AtomicCmd.DEVNULL,
+ "OUT_STDERR": AtomicCmd.DEVNULL}
+ yield _test_atomiccmd__pipes_out, \
+ None, "pipe_bash_{0}.stderr", {"OUT_STDOUT": AtomicCmd.DEVNULL}
+ yield _test_atomiccmd__pipes_out, \
+ "pipe_bash_{0}.stdout", None, {"OUT_STDERR": AtomicCmd.DEVNULL}
def test_atomiccmd__paths__malformed_keys():
def _do_test_atomiccmd__paths__malformed(kwargs):
assert_raises(ValueError, AtomicCmd, "true", **kwargs)
- yield _do_test_atomiccmd__paths__malformed, {"IN": "/var/foo"} # Missing key-name #1
- yield _do_test_atomiccmd__paths__malformed, {"IN_": "/var/foo"} # Missing key-name #2
- yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUT": "/var/foo"} # Missing key-name #3
- yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUT_": "/var/foo"} # Missing key-name #4
- yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUX_FOO": "foo"} # Invalid key-type #1
- yield _do_test_atomiccmd__paths__malformed, {"INS_BAR": "foo"} # Invalid key-type #2
+ # Missing key-name #1
+ yield _do_test_atomiccmd__paths__malformed, {"IN": "/var/foo"}
+ # Missing key-name #2
+ yield _do_test_atomiccmd__paths__malformed, {"IN_": "/var/foo"}
+ # Missing key-name #3
+ yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUT": "/var/foo"}
+ # Missing key-name #4
+ yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUT_": "/var/foo"}
+ # Invalid key-type #1
+ yield _do_test_atomiccmd__paths__malformed, {"TEMP_OUX_FOO": "foo"}
+ # Invalid key-type #2
+ yield _do_test_atomiccmd__paths__malformed, {"INS_BAR": "foo"}
def test_atomiccmd__paths__invalid_values():
def _do_test_atomiccmd__paths__invalid_values(kwargs):
assert_raises(TypeError, AtomicCmd, "true", **kwargs)
- yield _do_test_atomiccmd__paths__invalid_values, {"IN_FILE": 1}
- yield _do_test_atomiccmd__paths__invalid_values, {"TEMP_IN_FILE": set()}
- yield _do_test_atomiccmd__paths__invalid_values, {"OUT_FILE": [1, 2, 3]}
- yield _do_test_atomiccmd__paths__invalid_values, {"TEMP_OUT_FILE": 1.0}
+ # Long name is used to create more informative error messages
+ test_function = _do_test_atomiccmd__paths__invalid_values
+
+ yield test_function, {"IN_FILE": 1}
+ yield test_function, {"TEMP_IN_FILE": set()}
+ yield test_function, {"OUT_FILE": [1, 2, 3]}
+ yield test_function, {"TEMP_OUT_FILE": 1.0}
- yield _do_test_atomiccmd__paths__invalid_values, {"IN_STDIN": {}}
- yield _do_test_atomiccmd__paths__invalid_values, {"TEMP_IN_STDIN": frozenset()}
- yield _do_test_atomiccmd__paths__invalid_values, {"OUT_STDOUT": 1.7}
- yield _do_test_atomiccmd__paths__invalid_values, {"TEMP_OUT_STDOUT": ()}
- yield _do_test_atomiccmd__paths__invalid_values, {"OUT_STDERR": xrange(3)}
- yield _do_test_atomiccmd__paths__invalid_values, {"TEMP_OUT_STDERR": -1}
+ yield test_function, {"IN_STDIN": {}}
+ yield test_function, {"TEMP_IN_STDIN": frozenset()}
+ yield test_function, {"OUT_STDOUT": 1.7}
+ yield test_function, {"TEMP_OUT_STDOUT": ()}
+ yield test_function, {"OUT_STDERR": xrange(3)}
+ yield test_function, {"TEMP_OUT_STDERR": -1}
-# Subpaths are not allowed for temp IN/OUT files, neither relative nor asbsolute
+# Subpaths are not allowed for temp IN/OUT files, neither relative nor
+# absolute
def test_atomiccmd__paths__invalid_temp_paths():
def _do_test_atomiccmd__paths__invalid_temp_paths(kwargs):
assert_raises(ValueError, AtomicCmd, "true", **kwargs)
+ # Long name is used to create more informative error messages
+ test_function = _do_test_atomiccmd__paths__invalid_temp_paths
+
# No relative paths
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_IN_FOO": "sub/infile"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_IN_STDIN": "sub/stdin"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_FOO": "sub/outfile"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_STDOUT": "sub/stdout"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_STDERR": "sub/stderr"}
+ yield test_function, {"TEMP_IN_FOO": "sub/infile"}
+ yield test_function, {"TEMP_IN_STDIN": "sub/stdin"}
+ yield test_function, {"TEMP_OUT_FOO": "sub/outfile"}
+ yield test_function, {"TEMP_OUT_STDOUT": "sub/stdout"}
+ yield test_function, {"TEMP_OUT_STDERR": "sub/stderr"}
# No absolute paths
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_IN_FOO": "/tmp/sub/infile"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_IN_STDIN": "/dev/sub/stdin"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_FOO": "/etc/sub/outfile"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_STDOUT": "/var/sub/stdout"}
- yield _do_test_atomiccmd__paths__invalid_temp_paths, {"TEMP_OUT_STDERR": "/home/sub/stderr"}
+ yield test_function, {"TEMP_IN_FOO": "/tmp/sub/infile"}
+ yield test_function, {"TEMP_IN_STDIN": "/dev/sub/stdin"}
+ yield test_function, {"TEMP_OUT_FOO": "/etc/sub/outfile"}
+ yield test_function, {"TEMP_OUT_STDOUT": "/var/sub/stdout"}
+ yield test_function, {"TEMP_OUT_STDERR": "/home/sub/stderr"}
# All OUT_ files must be unique, including all TEMP_OUT_
def test_atomiccmd__paths__overlapping_output():
- def _do_test_atomiccmd__paths__overlapping_output(key_1, file_1, key_2, file_2):
+ def _do_test_atomiccmd__paths__overlapping_output(key_1, file_1,
+ key_2, file_2):
assert_raises(ValueError, AtomicCmd, ("ls",), **{key_1: file_1,
key_2: file_2})
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "OUT_FILE_2", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "OUT_FILE_1", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_FILE_1", "outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_FILE_2", "outfile"
+ test = _do_test_atomiccmd__paths__overlapping_output
+
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "OUT_FILE_2", "/var/outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "OUT_FILE_1", "/var/outfile"
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_FILE_1", "outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_FILE_2", "outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "OUT_STDOUT", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "OUT_STDOUT", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDOUT", "outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDOUT", "outfile"
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "OUT_STDOUT", "/var/outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "OUT_STDOUT", "/var/outfile"
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDOUT", "outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDOUT", "outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "OUT_STDERR", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "OUT_STDERR", "/var/outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDERR", "outfile"
- yield _do_test_atomiccmd__paths__overlapping_output, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDERR", "outfile"
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "OUT_STDERR", "/var/outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "OUT_STDERR", "/var/outfile"
+ yield test, "OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDERR", "outfile"
+ yield test, "TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDERR", "outfile"
# A pipe can be w/wo TEMP_, but not both
@@ -538,17 +558,18 @@ def test_atomiccmd__ready(temp_folder):
def test_atomiccmd__join_wait():
@with_temp_folder
- def _do_test_atomiccmd__join_wait(temp_folder, func, call, before_run, after_run):
+ def _do_test_atomiccmd__join_wait(temp_folder, func, call,
+ before_run, after_run):
cmd = AtomicCmd(call)
assert_equal(func(cmd), before_run)
cmd.run(temp_folder)
assert_equal(func(cmd), after_run)
- yield _do_test_atomiccmd__join_wait, AtomicCmd.join, "true", [None], [0]
+ yield _do_test_atomiccmd__join_wait, AtomicCmd.join, "true", [None], [0]
yield _do_test_atomiccmd__join_wait, AtomicCmd.join, "false", [None], [1]
- yield _do_test_atomiccmd__join_wait, AtomicCmd.wait, "true", None, 0
- yield _do_test_atomiccmd__join_wait, AtomicCmd.wait, "false", None, 1
+ yield _do_test_atomiccmd__join_wait, AtomicCmd.wait, "true", None, 0
+ yield _do_test_atomiccmd__join_wait, AtomicCmd.wait, "false", None, 1
###############################################################################
@@ -830,8 +851,8 @@ def test_atomiccmd__cleanup_sigterm():
if do_kill:
raise OSError("KABOOM!")
- def _wrap_exit(rc):
- exit_called.append(rc)
+ def _wrap_exit(returncode):
+ exit_called.append(returncode)
_procs = [flexmock(pid=7913),
# I've got the same combination on my luggage!
@@ -842,7 +863,8 @@ def test_atomiccmd__cleanup_sigterm():
assert_equal(len(paleomix.atomiccmd.command._PROCS), 2)
with Monkeypatch("os.killpg", _wrap_killpg):
with Monkeypatch("sys.exit", _wrap_exit):
- paleomix.atomiccmd.command._cleanup_children(signal.SIGTERM, None)
+ paleomix.atomiccmd.command._cleanup_children(
+ signal.SIGTERM, None)
assert_equal(exit_called, [-signal.SIGTERM])
assert_equal(sigs_sent, {7913: (signal.SIGTERM, kill_at == 0),
@@ -863,11 +885,12 @@ def test_atomiccmd__cleanup_sigterm__dead_weakrefs():
def _wrap_killpg(_pid, _sig):
assert False # pragma: no coverage
- def _wrap_exit(rc):
- exit_called.append(rc)
+ def _wrap_exit(returncode):
+ exit_called.append(returncode)
with Monkeypatch("paleomix.atomiccmd.command._PROCS", procs_wrapper):
with Monkeypatch("os.killpg", _wrap_killpg):
with Monkeypatch("sys.exit", _wrap_exit):
- paleomix.atomiccmd.command._cleanup_children(signal.SIGTERM, None)
+ paleomix.atomiccmd.command._cleanup_children(
+ signal.SIGTERM, None)
assert_equal(exit_called, [-signal.SIGTERM])
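For context: the path-validation tests above exercise AtomicCmd's keyword
convention, where every path argument is declared as (TEMP_)?(IN|OUT)_NAME
and '%(KEY)s' fields in the call are substituted at run time. A minimal
sketch with illustrative file names (not part of this patch):

    from paleomix.atomiccmd.command import AtomicCmd

    # 'IN_' keys declare input dependencies, 'OUT_' keys declare outputs;
    # '%(IN_TEXT)s' below is replaced with the declared path at run time.
    cmd = AtomicCmd(("gzip", "--stdout", "%(IN_TEXT)s"),
                    IN_TEXT="input.txt",
                    OUT_STDOUT="input.txt.gz")
    # A bare "IN", a misspelled prefix such as "INS_BAR", or a non-string
    # path value is rejected when the command is constructed, as the
    # tests above assert.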
diff --git a/tests/atomiccmd_test/pprint_test.py b/tests/atomiccmd_test/pprint_test.py
index 698989b..5d77200 100644
--- a/tests/atomiccmd_test/pprint_test.py
+++ b/tests/atomiccmd_test/pprint_test.py
@@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -20,8 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
+# pylint: disable=missing-docstring
+#
import os
-import StringIO
+import signal
import nose
from nose.tools import assert_equal
@@ -29,44 +31,34 @@ from paleomix.common.testing import with_temp_folder
from paleomix.atomiccmd.command import AtomicCmd
from paleomix.atomiccmd.sets import ParallelCmds, SequentialCmds
-from paleomix.atomiccmd.pprint import pprint, pformat, _pformat_list
-
-
-###############################################################################
-###############################################################################
-# pprint
-
-def test_pprint__simple():
- obj = StringIO.StringIO()
- cmd = AtomicCmd(("touch", "something"))
- pprint(cmd, out=obj)
- assert_equal(obj.getvalue(), ("<Command = ['touch', 'something']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>\n") % (id(cmd), id(cmd)))
+from paleomix.atomiccmd.pprint import pformat, _pformat_list
###############################################################################
###############################################################################
-# INFILE
+# pformat
def test_pformat__simple():
cmd = AtomicCmd(("touch", "something"))
- assert_equal(pformat(cmd), ("<Command = ['touch', 'something']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = touch something\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'")
+ % (id(cmd), id(cmd)))
@with_temp_folder
def test_pformat__simple__running(temp_folder):
cmd = AtomicCmd(("sleep", "10"))
cmd.run(temp_folder)
- assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
- " Status = Running ...\n"
- " STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
- " STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
- " CWD = '{cwd}'>").format(id=id(cmd),
- cwd=os.getcwd(),
- temp_dir=temp_folder))
+ assert_equal(pformat(cmd),
+ ("Command = sleep 10\n"
+ "Status = Running ...\n"
+ "STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
+ "CWD = '{cwd}'").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
cmd.terminate()
cmd.join()
@@ -75,12 +67,13 @@ def test_pformat__simple__running(temp_folder):
def test_pformat__simple__running__set_cwd(temp_folder):
cmd = AtomicCmd(("sleep", "10"), set_cwd=True)
cmd.run(temp_folder)
- assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
- " Status = Running ...\n"
- " STDOUT* = 'pipe_sleep_{id}.stdout'\n"
- " STDERR* = 'pipe_sleep_{id}.stderr'\n"
- " CWD = '{temp_dir}'>").format(id=id(cmd),
- temp_dir=temp_folder))
+ assert_equal(pformat(cmd),
+ ("Command = sleep 10\n"
+ "Status = Running ...\n"
+ "STDOUT* = 'pipe_sleep_{id}.stdout'\n"
+ "STDERR* = 'pipe_sleep_{id}.stderr'\n"
+ "CWD = '{temp_dir}'").format(id=id(cmd),
+ temp_dir=temp_folder))
cmd.terminate()
cmd.join()
@@ -90,27 +83,30 @@ def test_pformat__simple__done(temp_folder):
cmd = AtomicCmd("true")
cmd.run(temp_folder)
assert_equal(cmd.join(), [0])
- assert_equal(pformat(cmd), ("<Command = ['true']\n"
- " Status = Exited with return-code 0\n"
- " STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
- " STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
- " CWD = '{cwd}'>").format(id=id(cmd),
- cwd=os.getcwd(),
- temp_dir=temp_folder))
+ assert_equal(pformat(cmd),
+ ("Command = true\n"
+ "Status = Exited with return-code 0\n"
+ "STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
+ "CWD = '{cwd}'").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
@with_temp_folder
def test_pformat__simple__done__before_join(temp_folder):
cmd = AtomicCmd("true")
cmd.run(temp_folder)
+ # pylint: disable=protected-access
cmd._proc.wait()
- assert_equal(pformat(cmd), ("<Command = ['true']\n"
- " Status = Exited with return-code 0\n"
- " STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
- " STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
- " CWD = '{cwd}'>").format(id=id(cmd),
- cwd=os.getcwd(),
- temp_dir=temp_folder))
+ assert_equal(pformat(cmd),
+ ("Command = true\n"
+ "Status = Exited with return-code 0\n"
+ "STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
+ "CWD = '{cwd}'").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
assert_equal(cmd.join(), [0])
@@ -119,27 +115,62 @@ def test_pformat__simple__done__set_cwd(temp_folder):
cmd = AtomicCmd("true", set_cwd=True)
cmd.run(temp_folder)
assert_equal(cmd.join(), [0])
- assert_equal(pformat(cmd), ("<Command = ['true']\n"
- " Status = Exited with return-code 0\n"
- " STDOUT* = 'pipe_true_{id}.stdout'\n"
- " STDERR* = 'pipe_true_{id}.stderr'\n"
- " CWD = '{temp_dir}'>").format(id=id(cmd),
- temp_dir=temp_folder))
+ assert_equal(pformat(cmd),
+ ("Command = true\n"
+ "Status = Exited with return-code 0\n"
+ "STDOUT* = 'pipe_true_{id}.stdout'\n"
+ "STDERR* = 'pipe_true_{id}.stderr'\n"
+ "CWD = '{temp_dir}'").format(id=id(cmd),
+ temp_dir=temp_folder))
@with_temp_folder
-def test_pformat__simple__killed(temp_folder):
+def test_pformat__simple__terminated_by_pipeline(temp_folder):
cmd = AtomicCmd(("sleep", "10"))
cmd.run(temp_folder)
cmd.terminate()
assert_equal(cmd.join(), ["SIGTERM"])
- assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
- " Status = Terminated with signal SIGTERM\n"
- " STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
- " STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
- " CWD = '{cwd}'>").format(id=id(cmd),
- temp_dir=temp_folder,
- cwd=os.getcwd()))
+ assert_equal(pformat(cmd),
+ ("Command = sleep 10\n"
+ "Status = Automatically terminated by PALEOMIX\n"
+ "STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
+ "CWD = '{cwd}'").format(id=id(cmd),
+ temp_dir=temp_folder,
+ cwd=os.getcwd()))
+
+
+@with_temp_folder
+def test_pformat__simple__killed_by_signal(temp_folder):
+ cmd = AtomicCmd(("sleep", "10"))
+ cmd.run(temp_folder)
+ # pylint: disable=protected-access
+ os.killpg(cmd._proc.pid, signal.SIGTERM)
+ assert_equal(cmd.join(), ["SIGTERM"])
+ assert_equal(pformat(cmd),
+ ("Command = sleep 10\n"
+ "Status = Terminated with signal SIGTERM\n"
+ "STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
+ "CWD = '{cwd}'").format(id=id(cmd),
+ temp_dir=temp_folder,
+ cwd=os.getcwd()))
+
+
+@with_temp_folder
+def test_pformat__simple__temp_root_in_arguments(temp_folder):
+ cmd = AtomicCmd(("echo", "${TEMP_DIR}"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ assert_equal(pformat(cmd),
+ ("Command = echo '{temp_dir}'\n"
+ "Status = Exited with return-code 0\n"
+ "STDOUT* = '{temp_dir}/pipe_echo_{id}.stdout'\n"
+ "STDERR* = '{temp_dir}/pipe_echo_{id}.stderr'\n"
+ "CWD = '{cwd}'")
+ .format(id=id(cmd),
+ temp_dir=temp_folder,
+ cwd=os.getcwd()))
###############################################################################
@@ -148,37 +179,46 @@ def test_pformat__simple__killed(temp_folder):
def test_pformat__atomiccmd__simple_with_infile():
cmd = AtomicCmd(("cat", "%(IN_SOMETHING)s"), IN_SOMETHING="/etc/fstab")
- assert_equal(pformat(cmd), ("<Command = ['cat', '/etc/fstab']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = cat /etc/fstab\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_infile__set_cwd():
cmd = AtomicCmd(("cat", "%(IN_SOMETHING)s"),
IN_SOMETHING="/etc/fstab",
set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['cat', '/etc/fstab']\n"
- " STDOUT* = 'pipe_cat_%i.stdout'\n"
- " STDERR* = 'pipe_cat_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = cat /etc/fstab\n"
+ "STDOUT* = 'pipe_cat_%i.stdout'\n"
+ "STDERR* = 'pipe_cat_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'") % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_infile():
cmd = AtomicCmd(("cat", "%(TEMP_IN_FILE)s"),
TEMP_IN_FILE="infile.txt")
- assert_equal(pformat(cmd), ("<Command = ['cat', '${TEMP_DIR}/infile.txt']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'>") % (id(cmd), id(cmd)))
+
+ assert_equal(pformat(cmd),
+ ("Command = cat '${TEMP_DIR}/infile.txt'\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_infile__set_cwd():
cmd = AtomicCmd(("zcat", "%(TEMP_IN_FILE)s"),
TEMP_IN_FILE="infile.gz",
set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['zcat', 'infile.gz']\n"
- " STDOUT* = 'pipe_zcat_%i.stdout'\n"
- " STDERR* = 'pipe_zcat_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+ assert_equal(pformat(cmd),
+ ("Command = zcat infile.gz\n"
+ "STDOUT* = 'pipe_zcat_%i.stdout'\n"
+ "STDERR* = 'pipe_zcat_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'")
+ % (id(cmd), id(cmd)))
###############################################################################
@@ -187,32 +227,48 @@ def test_pformat__atomiccmd__simple_with_temp_infile__set_cwd():
def test_pformat__atomiccmd__simple_with_outfile():
cmd = AtomicCmd(("touch", "%(OUT_RC)s"), OUT_RC="/etc/bashrc")
- assert_equal(pformat(cmd), ("<Command = ['touch', '${TEMP_DIR}/bashrc']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = touch '${TEMP_DIR}/bashrc'\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_outfile__set_cwd():
- cmd = AtomicCmd(("touch", "%(OUT_RC)s"), OUT_RC="/etc/bashrc", set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['touch', 'bashrc']\n"
- " STDOUT* = 'pipe_touch_%i.stdout'\n"
- " STDERR* = 'pipe_touch_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+ cmd = AtomicCmd(("touch", "%(OUT_RC)s"),
+ OUT_RC="/etc/bashrc",
+ set_cwd=True)
+
+ assert_equal(pformat(cmd),
+ ("Command = touch bashrc\n"
+ "STDOUT* = 'pipe_touch_%i.stdout'\n"
+ "STDERR* = 'pipe_touch_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_outfile():
- cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"), TEMP_OUT_RC="bashrc")
- assert_equal(pformat(cmd), ("<Command = ['touch', '${TEMP_DIR}/bashrc']\n"
- " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+ cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"),
+ TEMP_OUT_RC="bashrc")
+
+ assert_equal(pformat(cmd),
+ ("Command = touch '${TEMP_DIR}/bashrc'\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_outfile__set_cwd():
- cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"), TEMP_OUT_RC="bashrc", set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['touch', 'bashrc']\n"
- " STDOUT* = 'pipe_touch_%i.stdout'\n"
- " STDERR* = 'pipe_touch_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+ cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"),
+ TEMP_OUT_RC="bashrc",
+ set_cwd=True)
+
+ assert_equal(pformat(cmd),
+ ("Command = touch bashrc\n"
+ "STDOUT* = 'pipe_touch_%i.stdout'\n"
+ "STDERR* = 'pipe_touch_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'")
+ % (id(cmd), id(cmd)))
###############################################################################
@@ -221,45 +277,55 @@ def test_pformat__atomiccmd__simple_with_temp_outfile__set_cwd():
def test_pformat__atomiccmd__simple_with_stdin():
cmd = AtomicCmd("gzip", IN_STDIN="/etc/fstab")
- assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
- " STDIN = '/etc/fstab'\n"
- " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = gzip\n"
+ "STDIN = '/etc/fstab'\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_stdin__set_cwd():
cmd = AtomicCmd("gzip", IN_STDIN="/etc/fstab", set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
- " STDIN = '/etc/fstab'\n"
- " STDOUT* = 'pipe_gzip_%i.stdout'\n"
- " STDERR* = 'pipe_gzip_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = gzip\n"
+ "STDIN = '/etc/fstab'\n"
+ "STDOUT* = 'pipe_gzip_%i.stdout'\n"
+ "STDERR* = 'pipe_gzip_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_stdin():
cmd = AtomicCmd("gzip", TEMP_IN_STDIN="stabstabstab")
- assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
- " STDIN* = '${TEMP_DIR}/stabstabstab'\n"
- " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = gzip\n"
+ "STDIN* = '${TEMP_DIR}/stabstabstab'\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_temp_stdin__set_cwd():
cmd = AtomicCmd("gzip", TEMP_IN_STDIN="stabstabstab", set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
- " STDIN* = 'stabstabstab'\n"
- " STDOUT* = 'pipe_gzip_%i.stdout'\n"
- " STDERR* = 'pipe_gzip_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+ assert_equal(pformat(cmd),
+ ("Command = gzip\n"
+ "STDIN* = 'stabstabstab'\n"
+ "STDOUT* = 'pipe_gzip_%i.stdout'\n"
+ "STDERR* = 'pipe_gzip_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'")
+ % (id(cmd), id(cmd)))
def test_pformat__atomiccmd__simple_with_stdin__cmd():
cmd_1 = AtomicCmd("gzip", OUT_STDOUT=AtomicCmd.PIPE)
cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)
- assert_equal(pformat(cmd_2), ("<Command = ['gzip']\n"
- " STDIN = <PIPE>\n"
- " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
- " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd_2), id(cmd_2)))
+ assert_equal(pformat(cmd_2),
+ ("Command = gzip\n"
+ "STDIN = <PIPE>\n"
+ "STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'")
+ % (id(cmd_2), id(cmd_2)))
###############################################################################
@@ -268,41 +334,50 @@ def test_pformat__atomiccmd__simple_with_stdin__cmd():
def test_pformat__atomiccmd__simple_with_stdout():
cmd = AtomicCmd(("echo", "Water. Water."), OUT_STDOUT="/dev/ls")
- assert_equal(pformat(cmd), ("<Command = ['echo', 'Water. Water.']\n"
- " STDOUT = '${TEMP_DIR}/ls'\n"
- " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+ assert_equal(pformat(cmd),
+ ("Command = echo 'Water. Water.'\n"
+ "STDOUT = '${TEMP_DIR}/ls'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'")
+ % (id(cmd),))
def test_pformat__atomiccmd__simple_with_stdout__set_cwd():
- cmd = AtomicCmd(("echo", "*pant*. *pant*."), OUT_STDOUT="/dev/barf", set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['echo', '*pant*. *pant*.']\n"
- " STDOUT = 'barf'\n"
- " STDERR* = 'pipe_echo_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd),))
+ cmd = AtomicCmd(("echo", "*pant*. *pant*."),
+ OUT_STDOUT="/dev/barf", set_cwd=True)
+ assert_equal(pformat(cmd),
+ ("Command = echo '*pant*. *pant*.'\n"
+ "STDOUT = 'barf'\n"
+ "STDERR* = 'pipe_echo_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'") % (id(cmd),))
def test_pformat__atomiccmd__simple_with_temp_stdout():
cmd = AtomicCmd(("echo", "Oil. Oil."), TEMP_OUT_STDOUT="dm")
- assert_equal(pformat(cmd), ("<Command = ['echo', 'Oil. Oil.']\n"
- " STDOUT* = '${TEMP_DIR}/dm'\n"
- " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+ assert_equal(pformat(cmd),
+ ("Command = echo 'Oil. Oil.'\n"
+ "STDOUT* = '${TEMP_DIR}/dm'\n"
+ "STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'")
+ % (id(cmd),))
def test_pformat__atomiccmd__simple_with_temp_stdout__set_cwd():
cmd = AtomicCmd(("echo", "Room service. Room service."),
TEMP_OUT_STDOUT="pv",
set_cwd=True)
- assert_equal(pformat(cmd), ("<Command = ['echo', 'Room service. Room service.']\n"
- " STDOUT* = 'pv'\n"
- " STDERR* = 'pipe_echo_%i.stderr'\n"
- " CWD = '${TEMP_DIR}'>") % (id(cmd),))
+ assert_equal(pformat(cmd),
+ ("Command = echo 'Room service. Room service.'\n"
+ "STDOUT* = 'pv'\n"
+ "STDERR* = 'pipe_echo_%i.stderr'\n"
+ "CWD = '${TEMP_DIR}'") % (id(cmd),))
def test_pformat__atomiccmd__simple_with_stdout_pipe():
cmd = AtomicCmd(("echo", "!"), OUT_STDOUT=AtomicCmd.PIPE)
- assert_equal(pformat(cmd), ("<Command = ['echo', '!']\n"
- " STDOUT = <PIPE>\n"
- " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+ assert_equal(pformat(cmd),
+ ("Command = echo '!'\n"
+ "STDOUT = <PIPE>\n"
+ "STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'")
+ % (id(cmd),))
###############################################################################
@@ -310,24 +385,29 @@ def test_pformat__atomiccmd__simple_with_stdout_pipe():
# ParallelCmds
def test_pformat__sets__simple():
- def _do_test_pformat__sets__simple(cls, description):
+ template = ("{description}:\n"
+ " Process 1:\n"
+ " Command = echo foo\n"
+ " STDOUT = Piped to process 2\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1_id}.stderr'\n"
+ "\n"
+ " Process 2:\n"
+ " Command = gzip\n"
+ " STDIN = Piped from process 1\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stderr'")
+
+ def _test_pformat__sets__simple(cls, description):
cmd_1 = AtomicCmd(("echo", "foo"), OUT_STDOUT=AtomicCmd.PIPE)
cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)
cmd = cls((cmd_1, cmd_2))
assert_equal(pformat(cmd),
- ("<{description}:\n"
- " - <00> Command = ['echo', 'foo']\n"
- " STDOUT = <01>\n"
- " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1_id}.stderr'\n"
- " - <01> Command = ['gzip']\n"
- " STDIN = <00>\n"
- " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stdout'\n"
- " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stderr'>")
- .format(description=description,
- cmd_1_id=id(cmd_1),
- cmd_2_id=id(cmd_2)))
- yield _do_test_pformat__sets__simple, ParallelCmds, "Parallel commands"
- yield _do_test_pformat__sets__simple, SequentialCmds, "Sequential commands"
+ template.format(description=description,
+ cmd_1_id=id(cmd_1),
+ cmd_2_id=id(cmd_2)))
+
+ yield _test_pformat__sets__simple, ParallelCmds, "Parallel processes"
+ yield _test_pformat__sets__simple, SequentialCmds, "Sequential processes"
def test_pformat__sets__nested():
@@ -337,21 +417,26 @@ def test_pformat__sets__nested():
set_1 = ParallelCmds((cmd_1, cmd_2))
set_2 = SequentialCmds((set_1, cmd_3))
assert_equal(pformat(set_2),
- ("<Sequential commands:\n"
- " - Parallel commands:\n"
- " - <00> Command = ['echo', 'foo']\n"
- " STDOUT = <01>\n"
- " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1_id}.stderr'\n"
- " - <01> Command = ['gzip']\n"
- " STDIN = <00>\n"
- " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stdout'\n"
- " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stderr'\n"
- " - <02> Command = ['sha1sum']\n"
- " STDOUT* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3_id}.stdout'\n"
- " STDERR* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3_id}.stderr'>")
- .format(cmd_1_id=id(cmd_1),
- cmd_2_id=id(cmd_2),
- cmd_3_id=id(cmd_3)))
+ ("Sequential processes:\n"
+ " Parallel processes:\n"
+ " Process 1:\n"
+ " Command = echo foo\n"
+ " STDOUT = Piped to process 2\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1}.stderr'\n"
+ "\n"
+ " Process 2:\n"
+ " Command = gzip\n"
+ " STDIN = Piped from process 1\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2}.stderr'\n"
+ "\n"
+ " Process 3:\n"
+ " Command = sha1sum\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3}.stderr'")
+ .format(cmd_1=id(cmd_1),
+ cmd_2=id(cmd_2),
+ cmd_3=id(cmd_3)))
###############################################################################
@@ -372,25 +457,29 @@ def test_pformat__bad_input():
# _pformat_list
def test_pformat_list__empty():
- assert_equal(_pformat_list([]), "[]")
+ assert_equal(_pformat_list([]), "")
def test_pformat_list__single():
- assert_equal(_pformat_list([3]), "[3]")
+ assert_equal(_pformat_list([3]), "3")
def test_pformat_list__multiple():
- assert_equal(_pformat_list([3, 2, 1]), "[3, 2, 1]")
+ assert_equal(_pformat_list([3, 2, 1]), "3 2 1")
def test_pformat_list__wrapped():
- assert_equal(_pformat_list([3, 2, 1], width=1), "[3,\n 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=2), "[3,\n 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=3), "[3,\n 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=4), "[3,\n 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=5), "[3,\n 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=6), "[3, 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=7), "[3, 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=8), "[3, 2,\n 1]")
- assert_equal(_pformat_list([3, 2, 1], width=9), "[3, 2, 1]")
- assert_equal(_pformat_list([3, 2, 1], width=10), "[3, 2, 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=1), "3 \\\n 2 \\\n 1")
+ assert_equal(_pformat_list([3, 2, 1], width=2), "3 \\\n 2 \\\n 1")
+ assert_equal(_pformat_list([3, 2, 1], width=3), "3 \\\n 2 \\\n 1")
+ assert_equal(_pformat_list([3, 2, 1], width=4), "3 2 \\\n 1")
+ assert_equal(_pformat_list([3, 2, 1], width=5), "3 2 \\\n 1")
+ assert_equal(_pformat_list([3, 2, 1], width=6), "3 2 1")
+ assert_equal(_pformat_list([3, 2, 1], width=7), "3 2 1")
+
+
+def test_pformat_list__escaped():
+ assert_equal(_pformat_list(["a", "b c"], width=100), "a 'b c'")
+ assert_equal(_pformat_list(["a", "$c"], width=100), "a '$c'")
+ assert_equal(_pformat_list(["!a", "c"], width=100), "'!a' c")
+ assert_equal(_pformat_list(["a", "'c"], width=100), """a ''"'"'c'""")
diff --git a/tests/atomiccmd_test/sets_test.py b/tests/atomiccmd_test/sets_test.py
index fecf702..bbdcef2 100644
--- a/tests/atomiccmd_test/sets_test.py
+++ b/tests/atomiccmd_test/sets_test.py
@@ -60,13 +60,19 @@ def test_atomicsets__properties():
OUT_1="out.txt")
obj = cls([cmd_mock_1, cmd_mock_2])
- assert_equal(obj.executables, cmd_mock_1.executables | cmd_mock_2.executables)
- assert_equal(obj.requirements, cmd_mock_1.requirements | cmd_mock_2.requirements)
- assert_equal(obj.input_files, cmd_mock_1.input_files | cmd_mock_2.input_files)
- assert_equal(obj.output_files, cmd_mock_1.output_files | cmd_mock_2.output_files)
- assert_equal(obj.auxiliary_files, cmd_mock_1.auxiliary_files | cmd_mock_2.auxiliary_files)
+ assert_equal(obj.executables, cmd_mock_1.executables |
+ cmd_mock_2.executables)
+ assert_equal(obj.requirements, cmd_mock_1.requirements |
+ cmd_mock_2.requirements)
+ assert_equal(obj.input_files, cmd_mock_1.input_files |
+ cmd_mock_2.input_files)
+ assert_equal(obj.output_files, cmd_mock_1.output_files |
+ cmd_mock_2.output_files)
+ assert_equal(obj.auxiliary_files,
+ cmd_mock_1.auxiliary_files | cmd_mock_2.auxiliary_files)
assert_equal(obj.expected_temp_files, frozenset(["out", "out.txt"]))
- assert_equal(obj.optional_temp_files, cmd_mock_1.optional_temp_files | cmd_mock_2.optional_temp_files)
+ assert_equal(obj.optional_temp_files,
+ cmd_mock_1.optional_temp_files | cmd_mock_2.optional_temp_files)
for cls in (ParallelCmds, SequentialCmds):
yield _do_test, cls
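For context, the properties asserted above are plain unions over the
wrapped commands; a sketch (not part of this patch):

    from paleomix.atomiccmd.command import AtomicCmd
    from paleomix.atomiccmd.sets import ParallelCmds

    cmd_1 = AtomicCmd(("echo", "foo"), OUT_STDOUT=AtomicCmd.PIPE)
    cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)  # reads cmd_1's stdout
    cmds = ParallelCmds((cmd_1, cmd_2))

    # Each property of the set is the union of its members' properties:
    assert cmds.executables == cmd_1.executables | cmd_2.executables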
diff --git a/tests/common_tests/fileutils_test.py b/tests/common_tests/fileutils_test.py
index 02e2bc3..167b579 100644
--- a/tests/common_tests/fileutils_test.py
+++ b/tests/common_tests/fileutils_test.py
@@ -336,7 +336,7 @@ def test_is_executable__full_path__folder_is_non_executable():
def test_is_executable__rel_path__is_executable():
- assert is_executable(os.path.join(test_dir(), "run"))
+ assert is_executable(os.path.join(test_dir(), "setup.sh"))
def test_is_executable__rel_path__is_non_executable():
@@ -379,7 +379,8 @@ def test_which_executable__executable__by_path_order_1():
path_2 = os.path.join(os.getcwd(), path_1)
os.environ['PATH'] = ":".join((path_1, path_2))
- assert_equal(os.path.join(path_1, "run"), which_executable("run"))
+ assert_equal(os.path.join(path_1, "setup.sh"),
+ which_executable("setup.sh"))
finally:
os.environ['PATH'] = path
@@ -391,7 +392,8 @@ def test_which_executable__executable__by_path_order_2():
path_2 = os.path.join(os.getcwd(), path_1)
os.environ['PATH'] = ":".join((path_2, path_1))
- assert_equal(os.path.join(path_2, "run"), which_executable("run"))
+ assert_equal(os.path.join(path_2, "setup.sh"),
+ which_executable("setup.sh"))
finally:
os.environ['PATH'] = path
@@ -417,7 +419,7 @@ def test_executable_exists__full_path__is_non_executable():
def test_executable_exists__rel_path__is_executable():
- assert executable_exists(os.path.join(test_dir(), "run"))
+ assert executable_exists(os.path.join(test_dir(), "setup.sh"))
def test_executable_exists__rel_path__is_non_executable():
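These fixture swaps simply track the replacement of tests/run by
tests/setup.sh; both are executable files, which is all the helpers check.
A sketch, assuming the paleomix.common.fileutils module targeted by this
test file and a working directory at the repository root:

    import os
    from paleomix.common.fileutils import is_executable, which_executable

    assert is_executable(os.path.join("tests", "setup.sh"))
    # which_executable resolves a name against $PATH, honouring the
    # order of its entries, and returns the full path of the match:
    print(which_executable("bash"))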
diff --git a/tests/common_tests/makefile_test.py b/tests/common_tests/makefile_test.py
index 58a099a..6f28bf6 100644
--- a/tests/common_tests/makefile_test.py
+++ b/tests/common_tests/makefile_test.py
@@ -71,6 +71,24 @@ _DUMMY_PATH = ("a", "random", "path")
_DUMMY_PATH_STR = ":".join(_DUMMY_PATH)
+class Unhashable(object):
+ __hash__ = None
+
+
+_COMMON_INVALID_VALUES = [
+ None,
+ False,
+ [],
+ (),
+ {},
+ [Unhashable()],
+ (Unhashable(),),
+ {None: Unhashable()},
+ object,
+ object(),
+]
+
+
###############################################################################
###############################################################################
# Setup timestamps for test files
@@ -129,10 +147,8 @@ def test_is_int__rejects_not_int():
spec = IsInt()
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
- yield _reject_not_str, None
- yield _reject_not_str, False
- yield _reject_not_str, ()
- yield _reject_not_str, {}
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str, value
def test_is_int__default_description():
@@ -183,10 +199,9 @@ def test_is_unsigned_int__rejects_not_unsigned_int():
yield _reject_not_str, -1
yield _reject_not_str, -1L
- yield _reject_not_str, None
- yield _reject_not_str, False
- yield _reject_not_str, ()
- yield _reject_not_str, {}
+
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str, value
def test_is_unsigned_int__default_description():
@@ -229,10 +244,9 @@ def test_is_float__rejects_not_float():
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
yield _reject_not_str, 0
- yield _reject_not_str, None
- yield _reject_not_str, False
- yield _reject_not_str, ()
- yield _reject_not_str, {}
+
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str, value
def test_is_float__default_description():
@@ -274,10 +288,10 @@ def test_is_boolean__rejects_not_boolean():
spec = IsBoolean()
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
- yield _reject_not_str, None
yield _reject_not_str, 0
- yield _reject_not_str, ()
- yield _reject_not_str, {}
+ for value in _COMMON_INVALID_VALUES:
+ if value is not False:
+ yield _reject_not_str, value
def test_is_boolean__default_description():
@@ -329,10 +343,9 @@ def test_is_str__rejects_not_str():
spec = IsStr()
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
- yield _reject_not_str, None
yield _reject_not_str, 1
- yield _reject_not_str, ()
- yield _reject_not_str, {}
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str, value
def test_is_str__default_description():
@@ -376,8 +389,10 @@ def test_is_none__rejects_not_none():
yield _reject_not_none, ""
yield _reject_not_none, 0
- yield _reject_not_none, ()
- yield _reject_not_none, {}
+
+ for value in _COMMON_INVALID_VALUES:
+ if value is not None:
+ yield _reject_not_none, value
def test_is_none__default_description():
@@ -458,6 +473,24 @@ def test_is_value_lt__default_set__must_meet_spec():
assert_raises(ValueError, ValueLT, 10, default=17)
+def test_is_value_lt__handles_not_number():
+ def _rejects_value(value):
+ spec = ValueLT(123)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ def _accepts_value(value):
+ spec = ValueLT(123)
+ spec(_DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ yield _accepts_value, None
+ yield _accepts_value, False
+
+ for value in _COMMON_INVALID_VALUES:
+ if value not in (False, None):
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# ValueLE
@@ -516,6 +549,24 @@ def test_is_value_le__default_set__must_meet_spec():
assert_raises(ValueError, ValueLE, 10, default=17)
+def test_is_value_le__handles_not_number():
+ def _rejects_value(value):
+ spec = ValueLE(123)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ def _accepts_value(value):
+ spec = ValueLE(123)
+ spec(_DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ yield _accepts_value, None
+ yield _accepts_value, False
+
+ for value in _COMMON_INVALID_VALUES:
+ if value not in (False, None):
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# ValueGE
@@ -574,6 +625,24 @@ def test_is_value_ge__default_set__must_meet_spec():
assert_raises(ValueError, ValueGE, 10, default=7)
+def test_is_value_ge__handles_not_number():
+ def _rejects_value(value):
+ spec = ValueGE(123)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ def _accepts_value(value):
+ spec = ValueGE(123)
+ spec(_DUMMY_PATH, value)
+
+ yield _accepts_value, "foo"
+ yield _rejects_value, None
+ yield _rejects_value, False
+
+ for value in _COMMON_INVALID_VALUES:
+ if value not in (None, False):
+ yield _accepts_value, value
+
+
###############################################################################
###############################################################################
# ValueGT
@@ -632,6 +701,23 @@ def test_is_value_gt__default_set__must_meet_spec():
assert_raises(ValueError, ValueGT, 10, default=10)
+def test_is_value_gt__handles_not_number():
+ def _rejects_value(value):
+ spec = ValueGT(123)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ def _accepts_value(value):
+ spec = ValueGT(123)
+ spec(_DUMMY_PATH, value)
+
+ yield _accepts_value, "foo"
+ yield _rejects_value, None
+ yield _rejects_value, False
+ for value in _COMMON_INVALID_VALUES:
+ if value not in (False, None):
+ yield _accepts_value, value
+
+
###############################################################################
###############################################################################
# ValueIn
@@ -690,6 +776,16 @@ def test_is_value_in__default_set__must_meet_spec():
assert_raises(ValueError, ValueGT, range(5), default=5)
+def test_is_value_in__handles_types():
+ def _rejects_value(value):
+ spec = ValueIn((1, 2, 3, 4, 5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ for value in _COMMON_INVALID_VALUES:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# ValuesIntersects
@@ -758,6 +854,16 @@ def test_intersects__default_set__must_meet_spec():
assert_raises(ValueError, ValuesIntersect, range(5), default=[5])
+def test_intersects__handles_types():
+ def _rejects_value(value):
+ spec = ValuesIntersect(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ for value in _COMMON_INVALID_VALUES:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# ValueSubsetOf
@@ -772,6 +878,11 @@ def test_subset_of__multiple_values_in_set():
spec(_DUMMY_PATH, [1, 4])
+def test_subset_of__empty_set_is_subset():
+ spec = ValuesSubsetOf(range(5))
+ spec(_DUMMY_PATH, [])
+
+
def test_subset_of__single_value_not_in_set():
spec = ValuesSubsetOf(range(5))
assert_raises(MakefileError, spec, _DUMMY_PATH, [5])
@@ -826,6 +937,17 @@ def test_subset_of__default_set__must_meet_spec():
assert_raises(ValueError, ValuesSubsetOf, range(5), default=[4, 5])
+def test_subset_of__handles_types():
+ def _rejects_value(value):
+ spec = ValuesSubsetOf(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ for value in _COMMON_INVALID_VALUES:
+ if value != []:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# And
@@ -1040,6 +1162,16 @@ def test_string_in__default_set__must_meet_spec():
assert_raises(ValueError, StringIn, "ABCDEFGH", default="i")
+def test_string_in__handles_types():
+ def _rejects_value(value):
+ spec = StringIn("ABCDEFGH")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ for value in _COMMON_INVALID_VALUES:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# StringsIntersect
@@ -1083,6 +1215,16 @@ def test_strings_intersect__default_set__must_meet_spec():
assert_raises(ValueError, StringsIntersect, "ABCDEFGH", default=[1, 2, 3])
+def test_string_intersects__handles_types():
+ def _rejects_value(value):
+ spec = StringsIntersect("ABCDEFGH")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "xyz"
+ for value in _COMMON_INVALID_VALUES:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# StringsSubsetOf
@@ -1092,6 +1234,11 @@ def test_subset_of__case_insensitive__value_in_set():
spec(_DUMMY_PATH, ["Bce"])
+def test_subset_of__case_insensitive__empty_set_is_subset():
+ spec = StringsSubsetOf(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, [])
+
+
def test_subset_of__chars__case_insensitive__accepts_differences_in_case():
spec = StringsSubsetOf("abcdefghijkl ")
spec(_DUMMY_PATH, "A big DEAL")
@@ -1126,6 +1273,17 @@ def test_string_subset_of__default_set__must_meet_spec():
assert_raises(ValueError, StringsSubsetOf, "ABCDEFGH", default=[1, 2, 3])
+def test_string_subset_of__handles_types():
+ def _rejects_value(value):
+ spec = StringsSubsetOf("ABCDEFGH")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _rejects_value, "foo"
+ for value in _COMMON_INVALID_VALUES:
+ if value != []:
+ yield _rejects_value, value
+
+
###############################################################################
###############################################################################
# StringIsUppercase
@@ -1151,10 +1309,9 @@ def test_string_is_uppercase__rejects_not_uppercase_str():
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
yield _reject_not_uppercase_str, "AcEf"
- yield _reject_not_uppercase_str, None
yield _reject_not_uppercase_str, 1
- yield _reject_not_uppercase_str, ()
- yield _reject_not_uppercase_str, {}
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_uppercase_str, value
def test_string_is_uppercase__default_not_set():
@@ -1196,10 +1353,9 @@ def test_string_starts_with__rejects_not_uppercase_str():
def _reject_not_str_with_prefix(value):
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
- yield _reject_not_str_with_prefix, None
yield _reject_not_str_with_prefix, 1
- yield _reject_not_str_with_prefix, ()
- yield _reject_not_str_with_prefix, {}
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str_with_prefix, value
def test_string_starts_with__default_not_set():
@@ -1236,15 +1392,13 @@ def test_string_ends_with__rejects_string_without_prefix():
def test_string_ends_with__rejects_not_uppercase_str():
- spec = StringEndsWith("Foo")
-
def _reject_not_str_with_postfix(value):
+ spec = StringEndsWith("Foo")
assert_raises(MakefileError, spec, _DUMMY_PATH, value)
- yield _reject_not_str_with_postfix, None
yield _reject_not_str_with_postfix, 1
- yield _reject_not_str_with_postfix, ()
- yield _reject_not_str_with_postfix, {}
+ for value in _COMMON_INVALID_VALUES:
+ yield _reject_not_str_with_postfix, value
def test_string_ends_with__default_not_set():
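The recurring _COMMON_INVALID_VALUES loop, and the Unhashable sentinel in
particular, guard against validators crashing on values that cannot be
hashed: a spec doing a bare set-membership test would raise TypeError
instead of the expected MakefileError. A sketch of the failure mode being
guarded against (not part of this patch):

    class Unhashable(object):
        __hash__ = None  # hash(Unhashable()) raises TypeError

    try:
        # Membership tests against a set hash the operand first ...
        Unhashable() in set([1, 2, 3])
    except TypeError as error:
        # ... so an unguarded validator would crash here.
        print("unhashable value: %s" % (error,))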
diff --git a/tests/common_tests/versions_tests.py b/tests/common_tests/versions_tests.py
index 8275764..f508b13 100644
--- a/tests/common_tests/versions_tests.py
+++ b/tests/common_tests/versions_tests.py
@@ -20,8 +20,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
-# Disable warnings on strange function names
-# pylint: disable=C0103
+# pylint: disable=missing-docstring,too-few-public-methods
+#
import pickle
import operator
@@ -51,7 +51,7 @@ def test_check_str():
###############################################################################
###############################################################################
-## Check class -- hash and comparisons
+# Check class -- hash and comparisons
def test_check__eq_same_func_desc_and_version():
obj_1 = versions.Check("Desc {}", operator.lt, 1, 2, 3)
@@ -83,7 +83,7 @@ def test_check__not_eq_for_same_func_desc_diff_version():
###############################################################################
###############################################################################
-## EQ class
+# EQ class
def test_eq__str__one_value():
obj = versions.EQ(1)
@@ -124,7 +124,7 @@ def test_eq__check_values__not_equal_too_few_values():
###############################################################################
###############################################################################
-## GE class
+# GE class
def test_ge__str__one_value():
obj = versions.GE(1)
@@ -167,7 +167,7 @@ def test_ge__check_values__not_equal_too_few_values():
###############################################################################
###############################################################################
-## LT class
+# LT class
def test_lt__str__one_value():
obj = versions.LT(1)
@@ -209,7 +209,7 @@ def test_lt__check_values__not_less_than_too_few_values():
###############################################################################
###############################################################################
-## Any class
+# Any class
def test_any__str():
obj = versions.Any()
@@ -226,7 +226,7 @@ def test_lt__check_values__always_true():
###############################################################################
###############################################################################
-## And class
+# And class
def test_and__init__non_check_value():
assert_raises(ValueError, versions.And, versions.LT(2), None)
@@ -234,7 +234,7 @@ def test_and__init__non_check_value():
###############################################################################
###############################################################################
-## And class -- str
+# And class -- str
def test_and__str__single_item():
obj = versions.And(versions.GE(1))
@@ -269,7 +269,7 @@ def test_and__str__two_items__second_is_operator():
###############################################################################
###############################################################################
-## And class -- check_version
+# And class -- check_version
def test_and__check_version__both_true():
obj_1 = versions.GE(1, 2)
@@ -321,7 +321,7 @@ def test_and__check_version__insufficient_number_of_values():
###############################################################################
###############################################################################
-## Or class
+# Or class
def test_or__init__non_check_value():
assert_raises(ValueError, versions.Or, versions.LT(2), None)
@@ -329,7 +329,7 @@ def test_or__init__non_check_value():
###############################################################################
###############################################################################
-## Or class -- str
+# Or class -- str
def test_or__str__single_item():
obj = versions.Or(versions.GE(1))
@@ -364,7 +364,7 @@ def test_or__str__two_items__second_is_operator():
###############################################################################
###############################################################################
-## Or class -- check_version
+# Or class -- check_version
def test_or__check_version__both_true():
obj_1 = versions.GE(1, 2)
@@ -422,7 +422,7 @@ def test_or__check_version__insufficient_number_of_values__is_lazy():
###############################################################################
###############################################################################
-## RequirementObj -- constructor
+# RequirementObj -- constructor
def test_requirementobj__init__defaults():
obj = versions.RequirementObj(call=("echo", "foo"),
@@ -446,17 +446,19 @@ def test_requirementobj__init__non_defaults():
###############################################################################
###############################################################################
-## RequirementObj -- version
+# RequirementObj -- version
-def _echo_version(version, to="stdout", returncode=0):
+def _echo_version(version, dst="stdout", returncode=0):
tmpl = "import sys; sys.%s.write(%r); sys.exit(%s);"
- return ("/usr/bin/python", "-c", tmpl % (to, version, returncode))
+ return ("/usr/bin/python", "-c", tmpl % (dst, version, returncode))
+
+
_PIPES = ("stderr", "stdout")
def test_requirementobj__version__call():
def _do_test_version__single_digit(pipe, regexp, equals):
- call = _echo_version("v3.5.2\n", to=pipe)
+ call = _echo_version("v3.5.2\n", dst=pipe)
obj = versions.RequirementObj(call=call,
search=regexp,
checks=versions.Any())
@@ -484,13 +486,26 @@ def test_requirementobj__version__command_not_found():
checks=versions.Any())
try:
- obj.version # pylint: disable=
+ obj.version # pylint: disable=pointless-statement
assert False # pragma: no coverage
- except versions.VersionRequirementError, error:
+ except versions.VersionRequirementError as error:
# Should include OSError message
assert_in("No such file or directory", str(error))
+def test_requirementobj__version__command_not_executable():
+ obj = versions.RequirementObj(call=("./README.md",),
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ try:
+ obj.version # pylint: disable=pointless-statement
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError as error:
+ # Should include OSError message
+ assert_in("Permission denied", str(error))
+
+
def test_requirementobj__version__return_code_is_ignored():
obj = versions.RequirementObj(_echo_version("v1.2.3", returncode=1),
search=r"v(\d+)\.(\d+)",
@@ -528,9 +543,9 @@ def test_requirementobj__version__outdated_jre__with_or_without_version_str():
checks=versions.Any())
try:
- obj.version
+ obj.version # pylint: disable=pointless-statement
assert False # pragma: no coverage
- except versions.VersionRequirementError, error:
+ except versions.VersionRequirementError as error:
assert_in(error_msg, str(error))
messages = [
@@ -543,17 +558,46 @@ def test_requirementobj__version__outdated_jre__with_or_without_version_str():
###############################################################################
###############################################################################
-## RequirementObj -- __call__
+# RequirementObj -- executable
+
+
+def test_requirementobj__executable__no_cli_args():
+ obj = versions.RequirementObj(call=["samtools"],
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ assert_equal(obj.executable, "samtools")
+
+
+def test_requirementobj__executable__with_cli_arguments():
+ obj = versions.RequirementObj(call=["samtools", "--version"],
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ assert_equal(obj.executable, "samtools")
+
+
+def test_requirementobj__executable__function():
+ obj = versions.RequirementObj(call=lambda: "v1.1",
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ assert_equal(obj.executable, None)
+
+
+###############################################################################
+###############################################################################
+# RequirementObj -- __call__
class CheckCounted(versions.Check):
def __init__(self, return_value=True, expected=(1, 1)):
+ self.count = 0
+ self.return_value = return_value
versions.Check.__init__(self, "counted {}", operator.eq, *expected)
- object.__setattr__(self, "count", 0)
- object.__setattr__(self, "return_value", return_value)
- def _do_check_version(self, values, current):
- assert_equal(values, current)
- object.__setattr__(self, "count", self.count + 1)
+ def _do_check_version(self, current, reference):
+ assert_equal(current, reference)
+ self.count += 1
return self.return_value
@@ -596,7 +640,7 @@ def test_requirementobj__call__check_fails__function():
try:
obj()
assert False # pragma: no coverage
- except versions.VersionRequirementError, error:
+ except versions.VersionRequirementError as error:
assert_equal(str(error), expected)
@@ -605,8 +649,8 @@ def test_requirementobj__call__check_fails():
"Version requirements not met for test#1; please refer\n" \
"to the PALEOMIX documentation for more information.\n" \
"\n" \
- " Executable: /usr/bin/python\n" \
- " Call: /usr/bin/python -c import sys; " \
+ "Attempted to run command:\n" \
+ " $ /usr/bin/python -c import sys; " \
"sys.stdout.write('v1.0.2'); sys.exit(0);\n" \
" Version: v1.0.x\n" \
" Required: at least v1.1.x"
@@ -618,7 +662,7 @@ def test_requirementobj__call__check_fails():
try:
obj()
assert False # pragma: no coverage
- except versions.VersionRequirementError, error:
+ except versions.VersionRequirementError as error:
assert_equal(str(error), expected)
@@ -626,8 +670,8 @@ def test_requirementobj__call__check_fails__jre_outdated():
expected = \
"Version could not be determined for test#1:\n" \
"\n" \
- " Executable: /usr/bin/python\n" \
- " Call: /usr/bin/python -c import sys; " \
+ "Attempted to run command:\n" \
+ " $ /usr/bin/python -c import sys; " \
"sys.stdout.write('UnsupportedClassVersionError'); sys.exit(0);\n" \
"\n" \
"The version of the Java Runtime Environment on this\n" \
@@ -644,13 +688,13 @@ def test_requirementobj__call__check_fails__jre_outdated():
try:
obj()
assert False # pragma: no coverage
- except versions.VersionRequirementError, error:
+ except versions.VersionRequirementError as error:
assert_equal(str(error), expected)
###############################################################################
###############################################################################
-## Pickling of checks
+# Pickling of checks
def test_check__can_pickle():
def _do_test_can_pickle(obj):
@@ -666,7 +710,7 @@ def test_check__can_pickle():
###############################################################################
###############################################################################
-## Requirement
+# Requirement
def test_requirement__obj_is_cached_for_same_values():
obj1 = versions.Requirement("echo", "", versions.LT(1))
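For context, a sketch of how a requirement like those tested above is
declared; the samtools call, regexp, and version numbers are illustrative
only:

    from paleomix.common import versions

    requirement = versions.RequirementObj(
        call=["samtools", "--version"],   # command probed for its version
        search=r"samtools (\d+)\.(\d+)",  # regexp capturing the fields
        checks=versions.GE(1, 3))         # require at least version 1.3

    print(requirement.executable)  # -> "samtools"
    # Reading requirement.version runs the call and applies the regexp;
    # calling requirement() additionally enforces the checks, raising
    # VersionRequirementError if they are not met.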
diff --git a/tests/run b/tests/run
deleted file mode 100755
index 2397383..0000000
--- a/tests/run
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-if ! nosetests --version &> /dev/null;
-then
- echo "Could not run 'nosetests'; please ensure that nose is installed:" > /dev/stderr
- echo " $ pip install nose" > /dev/stderr
- exit 1
-fi
-
-py_cmd=$(head -n1 $(which nosetests) | sed -e's/^#!//')
-
-for module in flexmock coverage;
-do
- if ! ${py_cmd} -c "import ${module}" &> /dev/null;
- then
- echo "Could import Python module '${module}'; please ensure that this module is installed:" > /dev/stderr
- echo " $ pip install ${module}" > /dev/stderr
- exit 1
- fi
-done
-
-MODULES=$(find paleomix -mindepth 1 -maxdepth 1 -name '*.py' -or -type d | sed -e 's#\.py##g' -e's#/#.#g' | grep -v "paleomix.yaml" | grep -v __init__)
-nosetests -I ".*_flymake.py" tests/ --with-coverage $@ \
- --cover-tests --cover-branches --cover-inclusive --cover-erase \
- $(for module in unit $MODULES;do echo --cover-package=$module;done) \
- 2>&1 | grep -v "[0-9]\+ \+0 \+[0-9]\+ \+0 \+100%"
-# --cover-html --cover-html-dir=tests/runs/coverage
diff --git a/tests/setup.sh b/tests/setup.sh
new file mode 100755
index 0000000..8455e08
--- /dev/null
+++ b/tests/setup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+if [ ! -e "paleomix" ];
+then
+ cd ..
+fi
+
+find paleomix -name '*.py' -type f \
+ | sed -e's#\.py##' -e's#/#.#g' -e's#^#import #' \
+ | grep -v "__init__\|yaml\|resources" \
+ > tests/all_modules.py
\ No newline at end of file
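The find/sed pipeline above materialises one import statement per module
into tests/all_modules.py, presumably so that the inclusive coverage run
in tox.ini (below) accounts for every module. A rough Python equivalent of
the same transformation (illustrative only, not part of this patch):

    import os

    lines = []
    for root, _dirs, files in os.walk("paleomix"):
        for name in files:
            if name.endswith(".py"):
                # paleomix/nodes/bwa.py -> "import paleomix.nodes.bwa"
                module = os.path.join(root, name)[:-3].replace(os.sep, ".")
                if not any(word in module
                           for word in ("__init__", "yaml", "resources")):
                    lines.append("import %s" % (module,))

    with open(os.path.join("tests", "all_modules.py"), "w") as handle:
        handle.write("\n".join(lines) + "\n")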
diff --git a/tox.ini b/tox.ini
index 0aff906..82ba260 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,7 +10,9 @@ envlist = py27
changedir = tests
commands =
- nosetests .
+ ./setup.sh
+ nosetests --with-coverage --cover-erase --cover-inclusive --cover-tests \
+ --cover-package paleomix .
deps =
nose
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/paleomix.git