[med-svn] [python-pbcore] 01/03: Imported Upstream version 1.2.4+dfsg
Afif Elghraoui
afif-guest at moszumanska.debian.org
Mon Oct 26 08:24:42 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository python-pbcore.
commit a8e05e349e793653d6fcaa1a8d466d4fd4271028
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Mon Oct 26 01:12:01 2015 -0700
Imported Upstream version 1.2.4+dfsg
---
CHANGELOG.org | 4 +
doc/pacbio-theme/static/headerGradient.jpg | Bin 7099 -> 0 bytes
doc/pacbio-theme/static/pacbio.css | 238 ----------------
doc/pacbio-theme/static/pacbioLogo.png | Bin 3128 -> 0 bytes
doc/pacbio-theme/static/pygments.css | 55 ----
doc/pacbio-theme/theme.conf | 4 -
doc/pbcore.io.dataset.rst | 95 ++++++-
pbcore/__init__.py | 2 +-
pbcore/data/datasets/__init__.py | 3 +
...00001823085912221377_s1_X0.subreads.fake_bc.bam | Bin 0 -> 192565 bytes
...1823085912221377_s1_X0.subreads.fake_bc.bam.bai | Bin 0 -> 16 bytes
...1823085912221377_s1_X0.subreads.fake_bc.bam.pbi | Bin 0 -> 1523 bytes
pbcore/data/datasets/pbalchemysim0.pbalign.bam | Bin 303745 -> 303947 bytes
pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi | Bin 2147 -> 2159 bytes
pbcore/data/datasets/pbalchemysim0.subreads.bam | Bin 301012 -> 301048 bytes
.../data/datasets/pbalchemysim0.subreads.bam.pbi | Bin 1003 -> 1030 bytes
pbcore/data/datasets/pbalchemysim1.pbalign.bam | Bin 287718 -> 287902 bytes
pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi | Bin 2028 -> 2082 bytes
pbcore/data/datasets/pbalchemysim1.subreads.bam | Bin 284875 -> 284903 bytes
.../data/datasets/pbalchemysim1.subreads.bam.pbi | Bin 945 -> 965 bytes
pbcore/data/empty.ccs.bam | Bin 540 -> 507 bytes
pbcore/data/empty.ccs.bam.pbi | Bin 67 -> 67 bytes
...100569412550000001823090301191423_s1_p0.ccs.bam | Bin 280793 -> 278396 bytes
...00001823085912221377_s1_X0.aligned_subreads.bam | Bin 201867 -> 201960 bytes
...1823085912221377_s1_X0.aligned_subreads.bam.bai | Bin 112 -> 112 bytes
...1823085912221377_s1_X0.aligned_subreads.bam.pbi | Bin 2521 -> 2609 bytes
...564852550000001823085912221377_s1_X0.scraps.bam | Bin 0 -> 608536 bytes
...4852550000001823085912221377_s1_X0.subreads.bam | Bin 204413 -> 204543 bytes
...550000001823085912221377_s1_X0.subreads.bam.pbi | Bin 1314 -> 1463 bytes
pbcore/io/BasH5IO.py | 9 +
pbcore/io/FastaIO.py | 2 +
pbcore/io/GffIO.py | 97 +++++--
pbcore/io/align/BamAlignment.py | 4 +-
pbcore/io/align/BamIO.py | 12 +-
pbcore/io/align/PacBioBamIndex.py | 43 ++-
pbcore/io/dataset/DataSetIO.py | 306 ++++++++++++++++-----
pbcore/io/dataset/DataSetMembers.py | 73 +++--
pbcore/io/dataset/DataSetReader.py | 1 +
pbcore/io/dataset/DataSetValidator.py | 30 +-
pbcore/io/dataset/DataSetWriter.py | 21 +-
pbcore/io/dataset/EntryPoints.py | 38 ++-
pbcore/io/dataset/_pbds.py | 2 +-
pbcore/io/dataset/utils.py | 79 ++++--
tests/test_pbcore_extract_bam_sequence.py | 4 +-
tests/test_pbcore_io_AlnFileReaders.py | 39 ++-
tests/test_pbcore_io_GffIO.py | 27 +-
tests/test_pbdataset.py | 185 +++++++++++--
tests/test_pbdataset_subtypes.py | 82 +++++-
48 files changed, 921 insertions(+), 534 deletions(-)
diff --git a/CHANGELOG.org b/CHANGELOG.org
index c00d28e..1a8c09f 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,7 @@
+* Version 1.2.4
+- add GFF gather functionality
+- support for 3.0.1 BAM format spec
+
* Version 1.2.3
- Updated for newer dataset schema
- BAM access speed improvements
diff --git a/doc/pacbio-theme/static/headerGradient.jpg b/doc/pacbio-theme/static/headerGradient.jpg
deleted file mode 100644
index 883f147..0000000
Binary files a/doc/pacbio-theme/static/headerGradient.jpg and /dev/null differ
diff --git a/doc/pacbio-theme/static/pacbio.css b/doc/pacbio-theme/static/pacbio.css
deleted file mode 100644
index b4ab87f..0000000
--- a/doc/pacbio-theme/static/pacbio.css
+++ /dev/null
@@ -1,238 +0,0 @@
-/**
- * Sphinx stylesheet -- default theme
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-@import url("basic.css");
-
-/* -- page layout ----------------------------------------------------------- */
-
-body {
- font-family: Arial, sans-serif;
- font-size: 100%;
- background-color: #555;
- color: #555;
- margin: 0;
- padding: 0;
- min-width: 500px;
- max-width: 956px;
- margin: 0 auto;
-}
-
-div.documentwrapper {
- float: left;
- width: 100%;
-}
-
-div.bodywrapper {
- margin: 0 0 0 230px;
-}
-
-hr{
- border: 1px solid #B1B4B6;
-
-}
-
-div.document {
- background-color: #eee;
-}
-
-div.body {
- background-color: #ffffff;
- color: #3E4349;
- padding: 30px 30px 30px 30px;
- font-size: 0.8em;
-}
-
-div.footer {
- color: #555;
- background-color: #fff;
- padding: 13px 0;
- text-align: center;
- font-size: 75%;
-
-}
-div.footer a {
- color: #444;
- text-decoration: underline;
-}
-
-div.related {
- background: #fff url(headerGradient.jpg);
- line-height: 80px;
- color: #fff;
- font-size: 0.80em;
- height: 79px;
- z-index: -1;
-}
-
-div.related ul {
- background: url(pacbioLogo.png) 10px no-repeat;
- padding: 0 0 0 200px;
-}
-
-div.related a {
- color: #E2F3CC;
-}
-
-div.sphinxsidebar {
- font-size: 0.75em;
- line-height: 1.5em;
-}
-
-div.sphinxsidebarwrapper{
- padding: 20px 0;
-}
-
-div.sphinxsidebar h3,
-div.sphinxsidebar h4 {
- font-family: Arial, sans-serif;
- color: #222;
- font-size: 1.2em;
- font-weight: bold;
- margin: 0;
- padding: 5px 10px 0 10px;
-}
-
-div.sphinxsidebar h4{
- font-size: 1.1em;
-}
-
-div.sphinxsidebar h3 a {
- color: #444;
-}
-
-
-div.sphinxsidebar p {
- color: #888;
- padding: 0px 20px;
- margin-top: 5px;
-}
-
-div.sphinxsidebar p.topless {
-}
-
-div.sphinxsidebar ul {
- margin: 5px 20px 10px 20px;
- padding: 0;
- color: #000;
-}
-
-div.sphinxsidebar a {
- color: #444;
-}
-
-div.sphinxsidebar input {
- border: 1px solid #ccc;
- font-family: sans-serif;
- font-size: 1em;
-}
-
-div.sphinxsidebar input[type=text]{
- margin-left: 20px;
-}
-
-/* -- body styles ----------------------------------------------------------- */
-
-a {
- color: #005B81;
- text-decoration: none;
-}
-
-a:hover {
- color: #E32E00;
- text-decoration: underline;
-}
-
-div.body h1,
-div.body h2,
-div.body h3,
-div.body h4,
-div.body h5,
-div.body h6 {
- font-family: Arial, sans-serif;
- font-weight: bold;
- color: #264868;
- margin: 30px 0px 10px 0px;
- padding: 5px 0 5px 0px;
-}
-
-div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
-div.body h2 { font-size: 125%; }
-div.body h3 { font-size: 110%; }
-div.body h4 { font-size: 100%; }
-div.body h5 { font-size: 100%; }
-div.body h6 { font-size: 100%; }
-
-a.headerlink {
- color: #c60f0f;
- font-size: 0.8em;
- padding: 0 4px 0 4px;
- text-decoration: none;
-}
-
-a.headerlink:hover {
- background-color: #c60f0f;
- color: white;
-}
-
-div.body p, div.body dd, div.body li {
- line-height: 1.5em;
- font-size: 1em;
-}
-
-div.admonition p.admonition-title + p {
- display: inline;
-}
-
-div.highlight{
- background-color: white;
-}
-
-div.note {
- background-color: #eee;
- border: 1px solid #ccc;
-}
-
-div.seealso {
- background-color: #ffc;
- border: 1px solid #ff6;
-}
-
-div.topic {
- background-color: #eee;
-}
-
-div.warning {
- background-color: #ffe4e4;
- border: 1px solid #f66;
-}
-
-p.admonition-title {
- display: inline;
-}
-
-p.admonition-title:after {
- content: ":";
-}
-
-pre {
- padding: 10px;
- background-color: White;
- color: #222;
- line-height: 1.2em;
- border: 1px solid #C6C9CB;
- font-size: 1.2em;
- margin: 1.5em 0 1.5em 0;
- -webkit-box-shadow: 1px 1px 1px #d8d8d8;
- -moz-box-shadow: 1px 1px 1px #d8d8d8;
-}
-
-tt {
- background-color: #ecf0f3;
- color: #222;
- padding: 1px 2px;
- font-size: 1.2em;
- font-family: monospace;
-}
-
diff --git a/doc/pacbio-theme/static/pacbioLogo.png b/doc/pacbio-theme/static/pacbioLogo.png
deleted file mode 100644
index b2e4887..0000000
Binary files a/doc/pacbio-theme/static/pacbioLogo.png and /dev/null differ
diff --git a/doc/pacbio-theme/static/pygments.css b/doc/pacbio-theme/static/pygments.css
deleted file mode 100644
index 4588cde..0000000
--- a/doc/pacbio-theme/static/pygments.css
+++ /dev/null
@@ -1,55 +0,0 @@
-.c { color: #999988; font-style: italic } /* Comment */
-.k { font-weight: bold } /* Keyword */
-.o { font-weight: bold } /* Operator */
-.cm { color: #999988; font-style: italic } /* Comment.Multiline */
-.cp { color: #999999; font-weight: bold } /* Comment.preproc */
-.c1 { color: #999988; font-style: italic } /* Comment.Single */
-.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
-.ge { font-style: italic } /* Generic.Emph */
-.gr { color: #aa0000 } /* Generic.Error */
-.gh { color: #999999 } /* Generic.Heading */
-.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
-.go { color: #111 } /* Generic.Output */
-.gp { color: #555555 } /* Generic.Prompt */
-.gs { font-weight: bold } /* Generic.Strong */
-.gu { color: #aaaaaa } /* Generic.Subheading */
-.gt { color: #aa0000 } /* Generic.Traceback */
-.kc { font-weight: bold } /* Keyword.Constant */
-.kd { font-weight: bold } /* Keyword.Declaration */
-.kp { font-weight: bold } /* Keyword.Pseudo */
-.kr { font-weight: bold } /* Keyword.Reserved */
-.kt { color: #445588; font-weight: bold } /* Keyword.Type */
-.m { color: #009999 } /* Literal.Number */
-.s { color: #bb8844 } /* Literal.String */
-.na { color: #008080 } /* Name.Attribute */
-.nb { color: #999999 } /* Name.Builtin */
-.nc { color: #445588; font-weight: bold } /* Name.Class */
-.no { color: #ff99ff } /* Name.Constant */
-.ni { color: #800080 } /* Name.Entity */
-.ne { color: #990000; font-weight: bold } /* Name.Exception */
-.nf { color: #990000; font-weight: bold } /* Name.Function */
-.nn { color: #555555 } /* Name.Namespace */
-.nt { color: #000080 } /* Name.Tag */
-.nv { color: purple } /* Name.Variable */
-.ow { font-weight: bold } /* Operator.Word */
-.mf { color: #009999 } /* Literal.Number.Float */
-.mh { color: #009999 } /* Literal.Number.Hex */
-.mi { color: #009999 } /* Literal.Number.Integer */
-.mo { color: #009999 } /* Literal.Number.Oct */
-.sb { color: #bb8844 } /* Literal.String.Backtick */
-.sc { color: #bb8844 } /* Literal.String.Char */
-.sd { color: #bb8844 } /* Literal.String.Doc */
-.s2 { color: #bb8844 } /* Literal.String.Double */
-.se { color: #bb8844 } /* Literal.String.Escape */
-.sh { color: #bb8844 } /* Literal.String.Heredoc */
-.si { color: #bb8844 } /* Literal.String.Interpol */
-.sx { color: #bb8844 } /* Literal.String.Other */
-.sr { color: #808000 } /* Literal.String.Regex */
-.s1 { color: #bb8844 } /* Literal.String.Single */
-.ss { color: #bb8844 } /* Literal.String.Symbol */
-.bp { color: #999999 } /* Name.Builtin.Pseudo */
-.vc { color: #ff99ff } /* Name.Variable.Class */
-.vg { color: #ff99ff } /* Name.Variable.Global */
-.vi { color: #ff99ff } /* Name.Variable.Instance */
-.il { color: #009999 } /* Literal.Number.Integer.Long */
-
diff --git a/doc/pacbio-theme/theme.conf b/doc/pacbio-theme/theme.conf
deleted file mode 100644
index dd24a1a..0000000
--- a/doc/pacbio-theme/theme.conf
+++ /dev/null
@@ -1,4 +0,0 @@
-[theme]
-inherit = default
-stylesheet = pacbio.css
-pygments_style = tango
diff --git a/doc/pbcore.io.dataset.rst b/doc/pbcore.io.dataset.rst
index acf2269..032aab1 100644
--- a/doc/pbcore.io.dataset.rst
+++ b/doc/pbcore.io.dataset.rst
@@ -55,12 +55,15 @@ Filter::
usage: dataset.py filter [-h] infile outfile filters [filters ...]
- Add filters to an XML file
+ Add filters to an XML file. Suggested fields: ['bcf', 'bcq', 'bcr',
+ 'length', 'pos', 'qend', 'qname', 'qstart', 'readstart', 'rname', 'rq',
+ 'tend', 'tstart', 'zm']. More expensive fields: ['accuracy', 'bc', 'movie',
+ 'qs']
positional arguments:
infile The xml file to filter
outfile The resulting xml file
- filters The values and thresholds to filter (e.g. rq>0.85)
+ filters The values and thresholds to filter (e.g. 'rq>0.85')
optional arguments:
-h, --help show this help message and exit
@@ -123,13 +126,96 @@ Split::
--subdatasets Split on subdatasets
--outdir OUTDIR Specify an output directory
-Consolidation requires in-depth filtering and BAM file manipulation.
-Implementation plans TBD.
+Consolidate::
+ usage: dataset.py consolidate [-h] [--numFiles NUMFILES] [--noTmp]
+ infile datafile xmlfile
+
+ Consolidate the XML files
+
+ positional arguments:
+ infile The XML file to consolidate
+ datafile The resulting data file
+ xmlfile The resulting XML file
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --numFiles NUMFILES The number of data files to produce (1)
+ --noTmp Don't copy to a tmp location to ensure local disk
+ use
Usage Examples
=============================
+Filter Reads (CLI version)
+--------------------------
+
+In this scenario we have one or more BAM files' worth of subreads, aligned or
+otherwise, that we want to filter and write to a single BAM file. This is
+possible using the CLI with the following steps, starting with a DataSet XML
+file::
+
+ # usage: dataset.py filter <in_fn.xml> <out_fn.xml> <filters>
+ dataset.py filter in_fn.subreadset.xml filtered_fn.subreadset.xml 'rq>0.85'
+
+ # usage: dataset.py consolidate <in_fn.xml> <out_data_fn.bam> <out_fn.xml>
+ dataset.py consolidate filtered_fn.subreadset.xml consolidate.subreads.bam out_fn.subreadset.xml
+
+The filtered DataSet and the consolidated DataSet should be read-for-read
+equivalent when used with SMRT Analysis software.
+
+Filter Reads (API version)
+--------------------------
+
+The API version of filtering allows for more advanced filtering criteria::
+
+ ss = SubreadSet('in_fn.subreadset.xml')
+ ss.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+ ('=', 'E.faecalis.2')],
+ tStart=[('<', '99'),
+ ('<', '299')],
+ tEnd=[('>', '0'),
+ ('>', '200')])
+
+Produces the following conditions for a read to be considered passing:
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200)
+
+You can add sets of filters by providing equal length lists of requirements for
+each filter.
+
+Additional requirements added singly will be added to all filters::
+
+ ss.filters.addRequirement(rq=[('>', '0.85')])
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85)
+
+Additional requirements added with a plurality of options will duplicate the
+previous requirements for each option::
+
+ ss.filters.addRequirement(length=[('>', 500), ('>', 1000)])
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 500)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85 AND length > 500)
+OR
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 1000)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85 AND length > 1000)
+
+Of course you can always wipe the filters and start over::
+
+ ss.filters = None
+
+Consolidation through the API is similar to the CLI version::
+
+ ss.consolidate('cons.bam')
+ ss.write('cons.xml')
+
Resequencing Pipeline (CLI version)
-----------------------------------
@@ -295,4 +381,3 @@ DataSetMembers module.
:show-inheritance:
:undoc-members:
-
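As a hedged, end-to-end sketch of the API workflow documented above (the input
file name is a placeholder for an existing SubreadSet XML)::

    from pbcore.io import SubreadSet

    ss = SubreadSet('in_fn.subreadset.xml')
    # keep only reads with read quality above 0.85
    ss.filters.addRequirement(rq=[('>', '0.85')])
    # write the filtered reads to a single BAM and a new DataSet XML
    ss.consolidate('cons.bam')
    ss.write('cons.xml')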
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 466ec29..dd6477f 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-__VERSION__ = "1.2.3"
+__VERSION__ = "1.2.4"
diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py
index 843faae..2d4378a 100755
--- a/pbcore/data/datasets/__init__.py
+++ b/pbcore/data/datasets/__init__.py
@@ -60,3 +60,6 @@ def getSubreadSet():
def getHdfSubreadSet():
return _getAbsPath(XML_FILES[7])
+
+def getBarcodedBam():
+ return _getAbsPath(BAM_FILES[3])
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam
new file mode 100644
index 0000000..ff3e6e8
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam differ
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai
new file mode 100644
index 0000000..79a6a43
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai differ
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi
new file mode 100644
index 0000000..4d2df12
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam b/pbcore/data/datasets/pbalchemysim0.pbalign.bam
index b9d3997..5cd4251 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam and b/pbcore/data/datasets/pbalchemysim0.pbalign.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi
index 1ae468c..144089a 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi and b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam b/pbcore/data/datasets/pbalchemysim0.subreads.bam
index f209211..8ea5174 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam and b/pbcore/data/datasets/pbalchemysim0.subreads.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi
index 0930a2e..532236a 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam b/pbcore/data/datasets/pbalchemysim1.pbalign.bam
index e0ee9d2..e699726 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi
index cf39b47..2477e13 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam b/pbcore/data/datasets/pbalchemysim1.subreads.bam
index 627f5f5..e45f2fa 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam and b/pbcore/data/datasets/pbalchemysim1.subreads.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi
index d637f58..28f0f2f 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi differ
diff --git a/pbcore/data/empty.ccs.bam b/pbcore/data/empty.ccs.bam
index 704abbe..3454722 100644
Binary files a/pbcore/data/empty.ccs.bam and b/pbcore/data/empty.ccs.bam differ
diff --git a/pbcore/data/empty.ccs.bam.pbi b/pbcore/data/empty.ccs.bam.pbi
index 9e7fd4e..efbe492 100644
Binary files a/pbcore/data/empty.ccs.bam.pbi and b/pbcore/data/empty.ccs.bam.pbi differ
diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam
index d927eac..c1e6df2 100644
Binary files a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam
index 789865a..02f6299 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai
index dea076b..9e1f72b 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi
index 4a8ff48..2c82b65 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam
new file mode 100644
index 0000000..cf034d2
Binary files /dev/null and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam
index 3514891..4809be5 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi
index 0cb1c4b..0027fb6 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi differ
diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py
index 8552bf6..b2ef3a5 100644
--- a/pbcore/io/BasH5IO.py
+++ b/pbcore/io/BasH5IO.py
@@ -411,6 +411,15 @@ class ZmwRead(CommonEqualityMixin):
PulseIndex = _makeQvAccessor("PulseIndex")
+ def StartFrame(self):
+ return self.EndFrame() - self.WidthInFrames()
+
+ def EndFrame(self):
+ fullPBF = arrayFromDataset(self._getBasecallsGroup()["PreBaseFrames"],
+ *self._getOffsets()[self.holeNumber])
+ fullWF = arrayFromDataset(self._getBasecallsGroup()["WidthInFrames"],
+ *self._getOffsets()[self.holeNumber])
+ return np.cumsum(fullWF + fullPBF)[self.readStart:self.readEnd]
class CCSZmwRead(ZmwRead):
"""
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
index 5e2d7bf..e750117 100644
--- a/pbcore/io/FastaIO.py
+++ b/pbcore/io/FastaIO.py
@@ -252,6 +252,8 @@ class FastaWriter(WriterBase):
raise ValueError
if len(args) == 1:
record = args[0]
+ if isinstance(record, IndexedFastaRecord):
+ record = FastaRecord(record.header, record.sequence)
assert isinstance(record, FastaRecord)
else:
header, sequence = args
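With this change, records from an IndexedFastaReader can be handed directly to
FastaWriter.writeRecord; a minimal sketch, assuming 'reference.fasta' exists
and has a .fai index::

    from pbcore.io import IndexedFastaReader, FastaWriter

    with IndexedFastaReader('reference.fasta') as reader:
        with FastaWriter('copy.fasta') as writer:
            for record in reader:           # IndexedFastaRecord objects
                writer.writeRecord(record)  # converted to FastaRecord internally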
diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py
index c3313f3..a08f0b7 100644
--- a/pbcore/io/GffIO.py
+++ b/pbcore/io/GffIO.py
@@ -42,7 +42,7 @@ __all__ = [ "Gff3Record",
"GffWriter" ]
from .base import ReaderBase, WriterBase
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
from copy import copy as shallow_copy
import tempfile
import os.path
@@ -256,42 +256,85 @@ def sort_gff(file_name, output_file_name=None):
return output_file_name
-def _merge_gffs(gff1, gff2, out):
- out = GffWriter(out)
+def merge_gffs (gff_files, output_file):
+ """
+ Merge a sequence of GFF files, preserving unique headers (except for the
+ source commandline, which will usually vary).
+ """
+ headers, header_keys = _merge_gff_headers(gff_files)
n_rec = 0
- with GffReader(gff1) as f1:
- map(out.writeHeader, f1.headers)
- with GffReader(gff2) as f2:
- rec1 = [ rec for rec in f1 ]
- rec2 = [ rec for rec in f2 ]
- i = j = 0
- while i < len(rec1) and j < len(rec2):
- if (rec1[i] > rec2[j]):
- out.writeRecord(rec2[j])
- j += 1
- else:
- out.writeRecord(rec1[i])
+ with GffWriter(output_file) as writer:
+ for key in header_keys:
+ for value in headers[key]:
+ writer.writeHeader("##%s %s" % (key, value))
+ for gff_file in gff_files:
+ with GffReader(gff_file) as reader:
+ for rec in reader:
+ writer.writeRecord(rec)
+ n_rec += 1
+ return n_rec
+
+
+def _merge_gff_headers(gff_files):
+ ignore_fields = set(["source-commandline", "gff-version", "date"])
+ def _get_headers(f):
+ with GffReader(f) as reader:
+ for h in reader.headers:
+ fields = h[2:].strip().split(" ")
+ if not fields[0] in ignore_fields:
+ yield fields[0], " ".join(fields[1:])
+ header_keys = []
+ for k, v in _get_headers(gff_files[0]):
+ if not k in header_keys:
+ header_keys.append(k)
+ headers = defaultdict(list)
+ for gff_file in gff_files:
+ for key, value in _get_headers(gff_file):
+ if not value in headers[key]:
+ headers[key].append(value)
+ return headers, header_keys
+
+
+def _merge_gffs_sorted(gff1, gff2, out_file):
+ import logging
+ logging.warn("Merging %s and %s" % (gff1, gff2))
+ with GffWriter(out_file) as out:
+ n_rec = 0
+ headers, header_keys = _merge_gff_headers([gff1, gff2])
+ with GffReader(gff1) as f1:
+ for key in header_keys:
+ for value in headers[key]:
+ out.writeHeader("##%s %s" % (key, value))
+ with GffReader(gff2) as f2:
+ rec1 = [ rec for rec in f1 ]
+ rec2 = [ rec for rec in f2 ]
+ i = j = 0
+ while i < len(rec1) and j < len(rec2):
+ if (rec1[i] > rec2[j]):
+ out.writeRecord(rec2[j])
+ j += 1
+ else:
+ out.writeRecord(rec1[i])
+ i += 1
+ for rec in rec1[i:]:
+ out.writeRecord(rec)
i += 1
- for rec in rec1[i:]:
- out.writeRecord(rec)
- i += 1
- for rec in rec2[j:]:
- out.writeRecord(rec)
- j += 2
- return i + j
+ for rec in rec2[j:]:
+ out.writeRecord(rec)
+ j += 2
+ return i + j
-def merge_gffs(gff_files, output_file_name):
+def merge_gffs_sorted(gff_files, output_file_name):
"""
Utility function to combine a set of N (>= 2) GFF files, with records
ordered by genomic location. (Assuming each input file is already sorted.)
"""
if len(gff_files) == 1: return gff_files[0]
while len(gff_files) > 2:
- tmpout = tempfile.NamedTemporaryFile()
- _merge_gffs(gff_files[0], gff_files[1], tmpout)
- tmpout.seek(0)
+ tmpout = tempfile.NamedTemporaryFile(suffix=".gff").name
+ _merge_gffs_sorted(gff_files[0], gff_files[1], tmpout)
gff_files = [tmpout] + gff_files[2:]
with open(output_file_name, "w") as f:
- _merge_gffs(gff_files[0], gff_files[1], f)
+ _merge_gffs_sorted(gff_files[0], gff_files[1], f)
return output_file_name
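A hedged usage sketch of the gather helpers defined above (file names are
placeholders): merge_gffs concatenates records while de-duplicating headers,
and merge_gffs_sorted keeps records ordered by genomic location, assuming each
input is already sorted::

    from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted

    n_rec = merge_gffs(["variants.1.gff", "variants.2.gff"], "variants.all.gff")
    merge_gffs_sorted(["variants.1.gff", "variants.2.gff"], "variants.sorted.gff")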
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 3ed36d7..14df751 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -253,7 +253,7 @@ class BamAlignment(AlignmentRecordMixin):
use of this property can result in code that won't work on
legacy data.
"""
- return 0.001 * self.peer.opt("rq")
+ return self.peer.opt("rq")
@property
def readType(self):
@@ -592,7 +592,7 @@ class BamAlignment(AlignmentRecordMixin):
if key in self.bam.pbi.columnNames:
return self.bam.pbi[self.rowNumber][key]
else:
- raise AttributeError, "no such column in pbi index"
+ raise AttributeError("no such column '%s' in pbi index" % key)
def __dir__(self):
basicDir = self.__dict__.keys()
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index 824674c..5a51bf8 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -168,16 +168,20 @@ class _BamReaderBase(ReaderBase):
def _checkFileCompatibility(self):
# Verify that this is a "pacbio" BAM file of version at least
- # 3.0b7
+ # 3.0.1
try:
checkedVersion = self.version
if "b" in checkedVersion:
- version, beta = checkedVersion.split("b")
- if int(beta) < 7: raise Exception()
+ raise Exception()
+ else:
+ major, minor, patch = checkedVersion.split('.')
+ assert major >= 3
+ assert minor >= 0
+ assert patch >= 1
except:
raise IncompatibleFile(
"This BAM file is incompatible with this API " +
- "(only PacBio BAM files version >= 3.0b7 are supported)")
+ "(only PacBio BAM files version >= 3.0.1 are supported)")
def __init__(self, fname, referenceFastaFname=None):
self.filename = fname = abspath(expanduser(fname))
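The updated check pins the accepted PacBio BAM spec version at 3.0.1 or newer
and rejects beta-tagged versions outright. A standalone sketch of that intent
(not the pbcore implementation itself)::

    def bam_version_ok(version_string):
        if "b" in version_string:        # e.g. "3.0b7": pre-release, rejected
            return False
        try:
            parts = tuple(int(x) for x in version_string.split("."))
        except ValueError:
            return False
        return parts >= (3, 0, 1)

    assert bam_version_ok("3.0.1")
    assert not bam_version_ok("3.0b7")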
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index 642593e..d14e1fa 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -47,7 +47,7 @@ PBI_HEADER_LEN = 32
PBI_FLAGS_BASIC = 0
PBI_FLAGS_MAPPED = 1
PBI_FLAGS_COORDINATE_SORTED = 2
-PBI_FLAGS_BARCODE_ADAPTER = 4
+PBI_FLAGS_BARCODE = 4
class PacBioBamIndex(object):
@@ -62,23 +62,32 @@ class PacBioBamIndex(object):
header = unpack("< 4s BBBx H I 18x", buf)
(self.magic, self.vPatch, self.vMinor,
self.vMajor, self.pbiFlags, self.nReads) = header
+ try:
+ assert self.vMajor >= 3
+ assert self.vMinor >= 0
+ assert self.vPatch >= 1
+ except:
+ raise IncompatibleFile(
+ "This PBI file is incompatible with this API "
+ "(only PacBio PBI files version >= 3.0.1 are supported)")
@property
def hasMappingInfo(self):
return (self.pbiFlags & PBI_FLAGS_MAPPED)
@property
- def hasAdapterInfo(self):
- return (self.pbiFlags & PBI_FLAGS_BARCODE_ADAPTER)
+ def hasBarcodeInfo(self):
+ return (self.pbiFlags & PBI_FLAGS_BARCODE)
def _loadMainIndex(self, f):
- # Main index holds subread and mapping info
- SUBREADS_INDEX_DTYPE = [
+ # Main index holds basic, mapping, and barcode info
+ BASIC_INDEX_DTYPE = [
("qId" , "i4"),
("qStart" , "i4"),
("qEnd" , "i4"),
("holeNumber" , "i4"),
- ("readQual" , "u2"),
+ ("readQual" , "f4"),
+ ("contextFlag" , "u1"),
("virtualFileOffset" , "i8") ]
MAPPING_INDEX_DTYPE = [
@@ -92,16 +101,24 @@ class PacBioBamIndex(object):
("nMM" , "u4"),
("mapQV" , "u1") ]
+ BARCODE_INDEX_DTYPE = [
+ ("bcForward" , "u2"),
+ ("bcReverse" , "u2"),
+ ("bcQual" , "u1")]
+
COMPUTED_COLUMNS_DTYPE = [
("nIns" , "u4"),
("nDel" , "u4") ]
- JOINT_DTYPE = SUBREADS_INDEX_DTYPE + MAPPING_INDEX_DTYPE + COMPUTED_COLUMNS_DTYPE
+ joint_dtype = BASIC_INDEX_DTYPE[:]
if self.hasMappingInfo:
- tbl = np.zeros(shape=(self.nReads,), dtype=JOINT_DTYPE).view(np.recarray)
- else:
- tbl = np.zeros(shape=(self.nReads,), dtype=SUBREADS_INDEX_DTYPE).view(np.recarray)
+ joint_dtype += MAPPING_INDEX_DTYPE
+ joint_dtype += COMPUTED_COLUMNS_DTYPE
+ if self.hasBarcodeInfo:
+ joint_dtype += BARCODE_INDEX_DTYPE
+ tbl = np.zeros(shape=(self.nReads,),
+ dtype=joint_dtype).view(np.recarray)
def peek(type_, length):
flavor, width = type_
@@ -109,7 +126,7 @@ class PacBioBamIndex(object):
if True:
# BASIC data always present
- for columnName, columnType in SUBREADS_INDEX_DTYPE:
+ for columnName, columnType in BASIC_INDEX_DTYPE:
tbl[columnName] = peek(columnType, self.nReads)
if self.hasMappingInfo:
@@ -120,6 +137,10 @@ class PacBioBamIndex(object):
tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM
tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM
+ if self.hasBarcodeInfo:
+ for columnName, columnType in BARCODE_INDEX_DTYPE:
+ tbl[columnName] = peek(columnType, self.nReads)
+
self._tbl = tbl
self._checkForBrokenColumns()
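A brief sketch of querying the extended index (the .pbi path is a placeholder,
and it assumes the index exposes its columns as attributes, as the dataset code
below does via .index)::

    from pbcore.io.align.PacBioBamIndex import (PacBioBamIndex,
                                                PBI_FLAGS_BARCODE)

    pbi = PacBioBamIndex("movie.subreads.bam.pbi")
    print(bool(pbi.pbiFlags & PBI_FLAGS_BARCODE))  # same test as hasBarcodeInfo
    if pbi.hasBarcodeInfo:
        # bcForward/bcReverse/bcQual are appended to the index table
        print(pbi.bcForward[:5])
        print(pbi.bcQual[:5])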
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index d0fe999..9c63cec 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -10,6 +10,7 @@ import copy
import os
import errno
import logging
+import itertools
import xml.dom.minidom
import tempfile
from functools import wraps
@@ -19,7 +20,9 @@ from pbcore.util.Process import backticks
from pbcore.io.opener import (openAlignmentFile, openIndexedAlignmentFile,
FastaReader, IndexedFastaReader, CmpH5Reader,
IndexedBamReader)
+from pbcore.io.align.PacBioBamIndex import PBI_FLAGS_BARCODE
from pbcore.io.FastaIO import splitFastaHeader, FastaWriter
+from pbcore.io.FastqIO import FastqReader, FastqWriter, qvsFromAscii
from pbcore.io import BaxH5Reader
from pbcore.io.align._BamSupport import UnavailableFeature
from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet,
@@ -33,7 +36,8 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
BarcodeSetMetadata,
ExternalResources,
ExternalResource, Filters)
-from pbcore.io.dataset.utils import consolidateBams, _infixFname
+from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
+ _indexBam, _indexFasta)
log = logging.getLogger(__name__)
@@ -53,41 +57,62 @@ def filtered(generator):
return wrapper
-def _toDsId(x):
- return "PacBio.DataSet.{x}".format(x=x)
+def _toDsId(name):
+ """Translate a class name into a MetaType/ID"""
+ return "PacBio.DataSet.{x}".format(x=name)
-def _dsIdToName(x):
- if DataSetMetaTypes.isValid(x):
- return x.split('.')[-1]
+def _dsIdToName(dsId):
+ """Translate a MetaType/ID into a class name"""
+ if DataSetMetaTypes.isValid(dsId):
+ return dsId.split('.')[-1]
+ else:
+ raise InvalidDataSetIOError("Invalid DataSet MetaType")
-def _dsIdToType(x):
- if DataSetMetaTypes.isValid(x):
+def _dsIdToType(dsId):
+ """Translate a MetaType/ID into a type"""
+ if DataSetMetaTypes.isValid(dsId):
types = DataSet.castableTypes()
- return types[_dsIdToName(x)]
+ return types[_dsIdToName(dsId)]
+ else:
+ raise InvalidDataSetIOError("Invalid DataSet MetaType")
-def _dsIdToSuffix(x):
+def _dsIdToSuffix(dsId):
+ """Translate a MetaType/ID into a file suffix"""
dsIds = DataSetMetaTypes.ALL
suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds}
suffixMap[_toDsId("DataSet")] = 'DataSet'
- if DataSetMetaTypes.isValid(x):
- suffix = suffixMap[x]
+ if DataSetMetaTypes.isValid(dsId):
+ suffix = suffixMap[dsId]
suffix = suffix.lower()
suffix += '.xml'
return suffix
+ else:
+ raise InvalidDataSetIOError("Invalid DataSet MetaType")
def _typeDataSet(dset):
+ """Determine the type of a dataset from the xml file without opening it"""
xml_rt = xmlRootType(dset)
dsId = _toDsId(xml_rt)
tbrType = _dsIdToType(dsId)
return tbrType
+def isDataSet(xmlfile):
+ """Determine if a file is a DataSet before opening it"""
+ try:
+ _typeDataSet(xmlfile)
+ return True
+ except:
+ return False
+
def openDataSet(*files, **kwargs):
- if not files[0].endswith('xml'):
- raise TypeError("openDataSet requires that the first file is an XML")
+ """Factory function for DataSet types as suggested by the first file"""
+ if not isDataSet(files[0]):
+ raise TypeError("openDataSet requires that the first file is a DS")
tbrType = _typeDataSet(files[0])
return tbrType(*files, **kwargs)
def openDataFile(*files, **kwargs):
+ """Factory function for DataSet types determined by the first data file"""
possibleTypes = [AlignmentSet, SubreadSet, ConsensusReadSet,
ConsensusAlignmentSet, ReferenceSet, HdfSubreadSet]
origFiles = files
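A short sketch of the factory helpers above ('mixed.alignmentset.xml' is a
placeholder): isDataSet inspects the XML root without opening the dataset, and
openDataSet dispatches to the concrete DataSet subclass it names::

    from pbcore.io.dataset.DataSetIO import isDataSet, openDataSet

    if isDataSet("mixed.alignmentset.xml"):
        dset = openDataSet("mixed.alignmentset.xml")
        print(type(dset).__name__)   # e.g. AlignmentSet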
@@ -126,10 +151,16 @@ def openDataFile(*files, **kwargs):
return SubreadSet(*origFiles, **kwargs)
def _stackRecArrays(recArrays):
+ """Stack recarrays into a single larger recarray"""
tbr = np.concatenate(recArrays)
tbr = tbr.view(np.recarray)
return tbr
+def _flatten(lol, times=1):
+ """ This wont do well with mixed nesting"""
+ for _ in range(times):
+ lol = np.concatenate(lol)
+ return lol
class DataSetMetaTypes(object):
"""
@@ -231,6 +262,7 @@ class DataSet(object):
"""
self._strict = kwargs.get('strict', False)
self._skipCounts = kwargs.get('skipCounts', False)
+ _induceIndices = kwargs.get('generateIndices', False)
# The metadata concerning the DataSet or subtype itself (e.g.
# name, version, UniqueId)
@@ -326,6 +358,13 @@ class DataSet(object):
elif self.totalLength <= 0 or self.numRecords <= 0:
self.updateCounts()
+ # generate indices if requested and needed
+ if _induceIndices:
+ self._induceIndices()
+
+ def _induceIndices(self):
+ raise NotImplementedError()
+
def __repr__(self):
"""Represent the dataset with an informative string:
@@ -381,9 +420,7 @@ class DataSet(object):
self.__class__.__name__ == 'DataSet'):
# determine whether or not this is the merge that is populating a
# dataset for the first time
- firstIn = False
- if len(self.externalResources) == 0:
- firstIn = True
+ firstIn = True if len(self.externalResources) == 0 else False
# Block on filters?
if (not firstIn and
@@ -407,6 +444,17 @@ class DataSet(object):
else:
result = self
+ # If this dataset has no subsets representing it, add self as a
+ # subdataset to the result
+ # TODO: this is a stopgap to prevent spurious subdatasets when
+ # creating datasets from dataset xml files...
+ if not self.subdatasets and not firstIn:
+ result.addDatasets(self.copy())
+
+ # add subdatasets
+ if other.subdatasets or not firstIn:
+ result.addDatasets(other.copy())
+
# add in the metadata (not to be confused with obj metadata)
if firstIn:
result.metadata = other.metadata
@@ -421,20 +469,6 @@ class DataSet(object):
# DataSets may only be merged if they have identical filters,
# So there is nothing new to add.
- # If this dataset has no subsets representing it, add self as a
- # subdataset to the result
- # TODO: this is a stopgap to prevent spurious subdatasets when
- # creating datasets from dataset xml files...
- if not self.subdatasets and not firstIn:
- result.addDatasets(self.copy())
-
- # add subdatasets
- if other.subdatasets or not firstIn:
- #if copyOnMerge:
- result.addDatasets(other.copy())
- #else:
- #result.addDatasets(other)
-
return result
else:
raise TypeError('DataSets can only be merged with records of the '
@@ -623,7 +657,7 @@ class DataSet(object):
def split(self, chunks=0, ignoreSubDatasets=False, contigs=False,
maxChunks=0, breakContigs=False, targetSize=5000, zmws=False,
- barcodes=False, byRecords=False, updateCounts=False):
+ barcodes=False, byRecords=False, updateCounts=True):
"""Deep copy the DataSet into a number of new DataSets containing
roughly equal chunks of the ExternalResources or subdatasets.
@@ -655,6 +689,13 @@ class DataSet(object):
breakContigs: Whether or not to break contigs when using maxChunks.
If True, something resembling an efficient number of
chunks for the dataset size will be produced.
+ targetSize: The target number of reads per chunk, when using
+ byRecords
+ zmws: Split by zmws
+ barcodes: Split by barcodes
+ byRecords: Split contigs by mapped records, rather than reference
+ length
+ updateCounts: Update the count metadata in each chunk
Returns:
A list of new DataSet objects (all other information deep copied).
@@ -837,9 +878,16 @@ class DataSet(object):
self.makePathsAbsolute()
xml_string = toXml(self)
if pretty:
- xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml()
- if validate:
- validateString(xml_string, relTo=outFile)
+ xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml(
+ encoding="UTF-8")
+ # Disable for contigsets: no support for multiple contigs in XSD
+ # Disable for ccssets: no support for datasetmetadata in XSD
+ # XXX turn this off temporarily to aid in pbsmrtpipe fixing
+ if (validate and
+ not self.datasetType in [ContigSet.datasetType,
+ ConsensusReadSet.datasetType]):
+ #validateString(xml_string, relTo=os.path.dirname(outFile))
+ pass
fileName = urlparse(outFile).path.strip()
if self._strict and not isinstance(self.datasetType, tuple):
if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
@@ -1017,7 +1065,9 @@ class DataSet(object):
resource.timeStampedName = tsName
def _getTimeStampedName(self, mType):
- time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S")
+ mType = mType.lower()
+ mType = '_'.join(mType.split('.'))
+ time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
return "{m}-{t}".format(m=mType, t=time)
@staticmethod
@@ -1297,6 +1347,21 @@ class DataSet(object):
ranges.append((movie, min(values), max(values)))
return ranges
+ # FIXME this is a workaround for the lack of support for barcode chunking
+ # in pbbam, and should probably go away once that is available.
+ @property
+ def barcodes(self):
+ """Return the list of barcodes explicitly set by filters via
+ DataSet.split(barcodes=True).
+
+ """
+ barcodes = []
+ for filt in self._filters:
+ for param in filt:
+ if param.name == "bc":
+ barcodes.append(param.value)
+ return barcodes
+
def toFofn(self, outfn=None, uri=False, relative=False):
"""Return a list of resource filenames (and write to optional outfile)
@@ -1394,7 +1459,10 @@ class DataSet(object):
@filters.setter
def filters(self, value):
"""Limit setting to ensure cache hygiene and filter compatibility"""
- self._filters = value
+ if value is None:
+ self._filters = Filters()
+ else:
+ self._filters = value
def reFilter(self, light=True):
"""
@@ -1476,7 +1544,9 @@ class DataSet(object):
@property
def sequencingChemistry(self):
- return self._checkIdentical('sequencingChemistry')
+ key = 'sequencingChemistry'
+ responses = self._pollResources(lambda x: getattr(x, key))
+ return _flatten(responses)
@property
def isEmpty(self):
@@ -1545,7 +1615,6 @@ class DataSet(object):
raise IOError(errno.EIO, "File not indexed", fname)
return True
-
def __getitem__(self, index):
"""Should respect filters for free, as _indexMap should only be
populated by filtered reads. Only pbi filters considered, however."""
@@ -1580,6 +1649,30 @@ class ReadSet(DataSet):
super(ReadSet, self).__init__(*files, **kwargs)
self._metadata = SubreadSetMetadata(self._metadata)
+ def _induceIndices(self):
+ for res in self.externalResources:
+ fname = res.resourceId
+ if not res.pbi:
+ _pbindexBam(fname)
+ self.close()
+ if not res.bai:
+ _indexBam(fname)
+ self.close()
+
+ @property
+ def isIndexed(self):
+ if self.hasPbi:
+ return True
+ return False
+
+ def assertBarcoded(self):
+ """Test whether all resources are barcoded files"""
+ res = self._pollResources(
+ lambda x: x.index.pbiFlags & PBI_FLAGS_BARCODE)
+ if not self._unifyResponses(res):
+ raise RuntimeError("File not barcoded")
+
+
def _openFiles(self):
"""Open the files (assert they exist, assert they are of the proper
type before accessing any file)
@@ -1607,12 +1700,12 @@ class ReadSet(DataSet):
location, referenceFastaFname=refFile)
else:
raise
- #if resource is None:
- #assert not self._strict
- #resource = openAlignmentFile(
- #location, referenceFastaFname=refFile)
- if not resource.isEmpty:
- self._openReaders.append(resource)
+ try:
+ if not resource.isEmpty:
+ self._openReaders.append(resource)
+ except UnavailableFeature: # isEmpty requires bai
+ if list(itertools.islice(resource, 1)):
+ self._openReaders.append(resource)
if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
raise IOError("No files were openable or reads found")
log.debug("Done opening resources")
@@ -1645,10 +1738,12 @@ class ReadSet(DataSet):
"""
# Find all possible barcodes and counts for each
- # TODO: switch this over to the pbi when bc information is exposed
+ self.assertIndexed()
+ self.assertBarcoded()
barcodes = defaultdict(int)
- for read in self.records:
- barcodes[tuple(read.peer.opt("bc"))] += 1
+ for bcTuple in itertools.izip(self.index.bcForward,
+ self.index.bcReverse):
+ barcodes[bcTuple] += 1
log.debug("{i} barcodes found".format(i=len(barcodes.keys())))
@@ -1677,6 +1772,7 @@ class ReadSet(DataSet):
log.debug("Generating new UUID")
for result in results:
result.newUuid()
+ # TODO: updateCounts
# Update the basic metadata for the new DataSets from external
# resources, or at least mark as dirty
@@ -1840,7 +1936,7 @@ class ReadSet(DataSet):
# Pull generic values, kwargs, general treatment in super
super(ReadSet, self).addMetadata(newMetadata, **kwargs)
- def consolidate(self, dataFile, numFiles=1):
+ def consolidate(self, dataFile, numFiles=1, useTmp=True):
"""Consolidate a larger number of bam files to a smaller number of bam
files (min 1)
@@ -1863,15 +1959,18 @@ class ReadSet(DataSet):
resLists.append(thisResList)
fnames = [_infixFname(dataFile, str(i)) for i in range(numFiles)]
for resList, fname in zip(resLists, fnames):
- consolidateBams(resList, fname, filterDset=self)
+ consolidateBams(resList, fname, filterDset=self, useTmp=useTmp)
log.debug("Replacing resources")
self.externalResources = ExternalResources()
self.addExternalResources(fnames)
else:
- consolidateBams(self.toExternalFiles(), dataFile, filterDset=self)
+ consolidateBams(self.toExternalFiles(), dataFile, filterDset=self,
+ useTmp=useTmp)
+ # TODO: add as subdatasets
log.debug("Replacing resources")
self.externalResources = ExternalResources()
self.addExternalResources([dataFile])
+ self._populateMetaTypes()
def __len__(self):
if self.numRecords == -1:
@@ -1919,6 +2018,13 @@ class HdfSubreadSet(ReadSet):
self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
self.objMetadata["MetaType"])
+ def _induceIndices(self):
+ log.debug("Bax files already indexed")
+
+ @property
+ def isIndexed(self):
+ return True
+
def _openFiles(self):
"""Open the files (assert they exist, assert they are of the proper
type before accessing any file)
@@ -2055,6 +2161,19 @@ class AlignmentSet(ReadSet):
else:
return super(AlignmentSet, self).consolidate(*args, **kwargs)
+ def _induceIndices(self):
+ if self.isCmpH5:
+ log.debug("Cmp.h5 files already indexed")
+ else:
+ return super(AlignmentSet, self)._induceIndices()
+
+ @property
+ def isIndexed(self):
+ if self.isCmpH5:
+ return True
+ else:
+ return super(AlignmentSet, self).isIndexed
+
def addReference(self, fname):
if isinstance(fname, ReferenceSet):
reference = fname.externalResources.resourceIds
@@ -2221,9 +2340,9 @@ class AlignmentSet(ReadSet):
winend = -1
for param in filt:
if param.name == 'tstart':
- winstart = param.value
- if param.name == 'tend':
winend = param.value
+ if param.name == 'tend':
+ winstart = param.value
# If the filter is just for rname, fill the window
# boundaries (pricey but worth the guaranteed behavior)
if winend == -1:
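The swapped assignments above reflect how a reference window is encoded in the
filters: "tstart < winend" and "tend > winstart" is the usual interval-overlap
test, so winend is recovered from the tstart requirement and winstart from the
tend requirement. A small illustration of the overlap predicate::

    def overlaps_window(tStart, tEnd, winstart, winend):
        # a read spanning [tStart, tEnd) overlaps the window [winstart, winend)
        return tStart < winend and tEnd > winstart

    assert overlaps_window(50, 120, 100, 200)    # straddles the window start
    assert not overlaps_window(0, 90, 100, 200)  # ends before the window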
@@ -2365,7 +2484,7 @@ class AlignmentSet(ReadSet):
return shiftedAtoms
def _split_contigs(self, chunks, maxChunks=0, breakContigs=False,
- targetSize=5000, byRecords=True, updateCounts=True):
+ targetSize=5000, byRecords=False, updateCounts=True):
"""Split a dataset into reference windows based on contigs.
Args:
@@ -2396,21 +2515,22 @@ class AlignmentSet(ReadSet):
for rn in refNames
if len(self._indexReadsInReference(rn)) != 0]
else:
- atoms = [(rn, 0, 0) for rn in refNames if
+ atoms = [(rn, 0, refLens[rn]) for rn in refNames if
len(self._indexReadsInReference(rn)) != 0]
else:
log.debug("Skipping records for each reference check")
- atoms = [(rn, 0, 0) for rn in refNames]
+ atoms = [(rn, 0, refLens[rn]) for rn in refNames]
if byRecords:
log.debug("Counting records...")
# This is getting out of hand, but the number of references
# determines the best read counting algorithm:
if len(refNames) < 100:
- atoms = [(rn, 0, 0, self.countRecords(rn))
+ atoms = [(rn, 0, refLens[rn], self.countRecords(rn))
for rn in refNames]
else:
counts = self._countMappedReads()
- atoms = [(rn, 0, 0, counts[rn]) for rn in refNames]
+ atoms = [(rn, 0, refLens[rn], counts[rn])
+ for rn in refNames]
log.debug("{i} contigs found".format(i=len(atoms)))
if byRecords:
@@ -2473,6 +2593,27 @@ class AlignmentSet(ReadSet):
sub_segments.append(tmp)
segments.extend(sub_segments)
atoms = segments
+ elif breakContigs and not byRecords:
+ log.debug("Checking for oversized chunks")
+ # we may have chunks <= len(atoms). We wouldn't usually split up
+ # contigs, but some might be huge, resulting in some tasks running
+ # very long
+ # We are only doing this for refLength splits for now, as those are
+ # cheap (and quiver is linear in length not coverage)
+ dataSize = sum(refLens.values())
+ # target size per chunk:
+ target = dataSize/chunks
+ log.debug("Target chunk length: {t}".format(t=target))
+ newAtoms = []
+ for i, atom in enumerate(atoms):
+ testAtom = atom
+ while testAtom[2] - testAtom[1] > target:
+ newAtom1 = (testAtom[0], testAtom[1], testAtom[1] + target)
+ newAtom2 = (testAtom[0], testAtom[1] + target, testAtom[2])
+ newAtoms.append(newAtom1)
+ testAtom = newAtom2
+ newAtoms.append(testAtom)
+ atoms = newAtoms
log.debug("Done defining {n} chunks".format(n=chunks))
# duplicate
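A standalone sketch of the oversized-chunk splitting added above: any
(refName, start, end) atom longer than the per-chunk target is cut into
target-sized pieces (inputs here are hypothetical)::

    def break_oversized(atoms, target):
        newAtoms = []
        for name, start, end in atoms:
            while end - start > target:
                newAtoms.append((name, start, start + target))
                start += target
            newAtoms.append((name, start, end))
        return newAtoms

    print(break_oversized([("chr1", 0, 2500), ("chr2", 0, 800)], 1000))
    # [('chr1', 0, 1000), ('chr1', 1000, 2000), ('chr1', 2000, 2500),
    #  ('chr2', 0, 800)]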
@@ -2489,7 +2630,8 @@ class AlignmentSet(ReadSet):
log.debug("Distributing chunks")
# if we didn't have to split atoms and are doing it byRecords, the
# original counts are still valid:
- # Now well have to count records again to recombine atoms
+ #
+ # Otherwise we'll now have to count records again to recombine atoms
chunks = self._chunkList(atoms, chunks, balanceKey)
log.debug("Done chunking")
@@ -2498,8 +2640,8 @@ class AlignmentSet(ReadSet):
if atoms[0][2]:
result.filters.addRequirement(
rname=[('=', c[0]) for c in chunk],
- tStart=[('>', c[1]) for c in chunk],
- tEnd=[('<', c[2]) for c in chunk])
+ tStart=[('<', c[2]) for c in chunk],
+ tEnd=[('>', c[1]) for c in chunk])
else:
result.filters.addRequirement(
rname=[('=', c[0]) for c in chunk])
@@ -3030,8 +3172,9 @@ class ContigSet(DataSet):
super(ContigSet, self).__init__(*files, **kwargs)
self._metadata = ContigSetMetadata(self._metadata)
self._updateMetadata()
+ self._fastq = False
- def consolidate(self, outfn=None, numFiles=1):
+ def consolidate(self, outfn=None, numFiles=1, useTmp=False):
"""Consolidation should be implemented for window text in names and
for filters in ContigSets"""
@@ -3066,6 +3209,7 @@ class ContigSet(DataSet):
writeTemp = True
writeMatches = {}
writeComments = {}
+ writeQualities = {}
for name, match_list in matches.items():
if len(match_list) > 1:
log.debug("Multiple matches found for {i}".format(i=name))
@@ -3086,7 +3230,11 @@ class ContigSet(DataSet):
# collapse matches
new_name = self._removeWindow(name)
- new_seq = ''.join([match.sequence for match in match_list])
+ new_seq = ''.join([match.sequence[:] for match in match_list])
+ if self._fastq:
+ new_qual = ''.join([match.qualityString for match in
+ match_list])
+ writeQualities[new_name] = new_qual
# set to write
writeTemp = True
@@ -3094,8 +3242,10 @@ class ContigSet(DataSet):
writeComments[new_name] = match_list[0].comment
else:
log.debug("One match found for {i}".format(i=name))
- writeMatches[name] = match_list[0].sequence
+ writeMatches[name] = match_list[0].sequence[:]
writeComments[name] = match_list[0].comment
+ if self._fastq:
+ writeQualities[name] = match_list[0].qualityString
if writeTemp:
log.debug("Writing a new file is necessary")
if not outfn:
@@ -3103,12 +3253,18 @@ class ContigSet(DataSet):
outdir = tempfile.mkdtemp(suffix="consolidated-contigset")
outfn = os.path.join(outdir,
'consolidated_contigs.fasta')
- with FastaWriter(outfn) as outfile:
+ with self._writer(outfn) as outfile:
log.debug("Writing new resource {o}".format(o=outfn))
for name, seq in writeMatches.items():
if writeComments[name]:
name = ' '.join([name, writeComments[name]])
+ if self._fastq:
+ outfile.writeRecord(name, seq,
+ qvsFromAscii(writeQualities[name]))
+ continue
outfile.writeRecord(name, seq)
+ if not self._fastq:
+ _indexFasta(outfn)
# replace resources
log.debug("Replacing resources")
self.externalResources = ExternalResources()
@@ -3117,12 +3273,19 @@ class ContigSet(DataSet):
log.debug("Replacing metadata")
self._metadata.contigs = []
self._populateContigMetadata()
+ self._populateMetaTypes()
+
+ @property
+ def _writer(self):
+ if self._fastq:
+ return FastqWriter
+ return FastaWriter
def _popSuffix(self, name):
"""Chunking and quivering adds suffixes to contig names, after the
normal ID and window. This complicates our dewindowing and
consolidation, so we'll remove them for now"""
- observedSuffixes = ['|quiver']
+ observedSuffixes = ['|quiver', '|plurality', '|arrow']
for suff in observedSuffixes:
if name.endswith(suff):
log.debug("Suffix found: {s}".format(s=suff))
@@ -3138,6 +3301,12 @@ class ContigSet(DataSet):
return '_'.join(name.split('_')[:-2]) + suff
return name
+ def _induceIndices(self):
+ if not self.isIndexed:
+ for fname in self.toExternalFiles():
+ _indexFasta(fname)
+ self.close()
+
def _parseWindow(self, name):
"""Chunking and quivering appends a window to the contig ID, which
allows us to consolidate the contig chunks."""
@@ -3230,6 +3399,10 @@ class ContigSet(DataSet):
log.debug("Opening resources")
for extRes in self.externalResources:
location = urlparse(extRes.resourceId).path
+ if location.endswith("fastq"):
+ self._fastq = True
+ self._openReaders.append(FastqReader(location))
+ continue
try:
resource = IndexedFastaReader(location)
except IOError:
@@ -3314,6 +3487,7 @@ class ContigSet(DataSet):
@staticmethod
def _metaTypeMapping():
return {'fasta':'PacBio.ContigFile.ContigFastaFile',
+ 'fastq':'PacBio.ContigFile.ContigFastqFile',
'fa':'PacBio.ContigFile.ContigFastaFile',
'fas':'PacBio.ContigFile.ContigFastaFile',
'fai':'PacBio.Index.SamIndex',
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index fb79d29..3f36238 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -54,6 +54,7 @@ import operator as OP
import numpy as np
import logging
from functools import partial as P
+from collections import Counter
from pbcore.io.dataset.DataSetWriter import namespaces
log = logging.getLogger(__name__)
@@ -427,10 +428,6 @@ class Filters(RecordWrapper):
@property
def _bamAccMap(self):
- # tStart/tEnd is a hack for overlapping ranges. you're not testing
- # whether the tStart/tEnd is within a range, you're testing if it
- # overlaps with the tStart/tEnd in the filter (overlaps with a
- # reference window)
return {'rname': (lambda x: x.referenceName),
'length': (lambda x: int(x.readLength)),
'qname': (lambda x: x.qNameA),
@@ -442,38 +439,43 @@ class Filters(RecordWrapper):
'pos': (lambda x: int(x.tStart)),
'accuracy': (lambda x: float(x.identity)),
'readstart': (lambda x: int(x.aStart)),
- 'tstart': (lambda x: int(x.tEnd)), # see above
- 'tend': (lambda x: int(x.tStart)), # see above
+ 'tstart': (lambda x: int(x.tStart)),
+ 'tend': (lambda x: int(x.tEnd)),
}
def _pbiAccMap(self, tIdMap):
- # tStart/tEnd is a hack for overlapping ranges. you're not testing
- # whether the tStart/tEnd is within a range, you're testing if it
- # overlaps with the tStart/tEnd in the filter (overlaps with a
- # reference window)
return {'rname': (lambda x, m=tIdMap: m[x.tId]),
'length': (lambda x: int(x.aEnd)-int(x.aStart)),
'qname': (lambda x: x.qId),
'zm': (lambda x: int(x.holeNumber)),
'pos': (lambda x: int(x.tStart)),
'readstart': (lambda x: int(x.aStart)),
- 'tstart': (lambda x: int(x.tEnd)), # see above
- 'tend': (lambda x: int(x.tStart)), # see above
+ 'tstart': (lambda x: int(x.tStart)),
+ 'tend': (lambda x: int(x.tEnd)),
}
- def _pbiVecAccMap(self, tIdMap):
- # tStart/tEnd is a hack for overlapping ranges. you're not testing
- # whether the tStart/tEnd is within a range, you're testing if it
- # overlaps with the tStart/tEnd in the filter (overlaps with a
- # reference window)
- return {'rname': (lambda x, m=tIdMap: m[x.tId]),
+ def _pbiMappedVecAccMap(self, tIdMap):
+ plus = {'rname': (lambda x, m=tIdMap: m[x.tId]),
'length': (lambda x: x.aEnd - x.aStart),
- 'qname': (lambda x: x.qId),
- 'zm': (lambda x: x.holeNumber),
'pos': (lambda x: x.tStart),
'readstart': (lambda x: x.aStart),
- 'tstart': (lambda x: x.tEnd), # see above
- 'tend': (lambda x: x.tStart), # see above
+ 'tstart': (lambda x: x.tStart),
+ 'tend': (lambda x: x.tEnd),
+ }
+ base = self._pbiVecAccMap(tIdMap)
+ base.update(plus)
+ return base
+
+ def _pbiVecAccMap(self, tIdMap):
+ return {'length': (lambda x: x.qEnd - x.qStart),
+ 'qstart': (lambda x: x.qStart),
+ 'qend': (lambda x: x.qEnd),
+ 'qname': (lambda x: x.qId),
+ 'zm': (lambda x: x.holeNumber),
+ 'rq': (lambda x: x.readQual),
+ 'bcf': (lambda x: x.bcForward),
+ 'bcr': (lambda x: x.bcForward),
+ 'bcq': (lambda x: x.bcQual),
}
@property
@@ -528,7 +530,9 @@ class Filters(RecordWrapper):
def filterIndexRecords(self, indexRecords, nameMap):
typeMap = self._bamTypeMap
accMap = self._pbiVecAccMap({})
- accMap['rname'] = lambda x: x.tId
+ if 'aEnd' in indexRecords.dtype.names:
+ accMap = self._pbiMappedVecAccMap({})
+ accMap['rname'] = lambda x: x.tId
filterLastResult = None
for filt in self:
lastResult = None
@@ -554,6 +558,10 @@ class Filters(RecordWrapper):
del lastResult
return filterLastResult
+ def fromString(self, filterString):
+ filtDict = {}
+ self._runCallbacks()
+
def addRequirement(self, **kwargs):
"""Use this to add requirements. Members of the list will be considered
options for fulfilling this requirement, all other filters will be
@@ -705,6 +713,9 @@ class ExternalResources(RecordWrapper):
super(self.__class__, self).__init__(record)
self.record['tag'] = self.__class__.__name__
+ # state tracking (a cached list of ResourceIds); not ideal, but needed:
+ self._resourceIds = []
+
def __eq__(self, other):
for extRef in self:
found = False
@@ -728,7 +739,14 @@ class ExternalResources(RecordWrapper):
def merge(self, other):
# make sure we don't add dupes
- curIds = [res.resourceId for res in self]
+ curIds = self.resourceIds
+
+ # check to make sure ResourceIds in other are unique
+ otherIds = Counter([res.resourceId for res in other])
+ dupes = [c for c in otherIds if otherIds[c] > 1]
+ if dupes:
+ raise RuntimeError("Duplicate ResourceIds found: "
+ "{f}".format(f=', '.join(dupes)))
for newRes in other:
# merge instead
@@ -752,6 +770,7 @@ class ExternalResources(RecordWrapper):
resourceIds: a list of uris as strings
"""
templist = []
+ self._resourceIds = []
for res in resourceIds:
toAdd = res
if not isinstance(res, ExternalResource):
@@ -774,13 +793,16 @@ class ExternalResources(RecordWrapper):
aren't in record form, but append will fix that for us automatically. A
bit messy, but fairly concise.
"""
+ self._resourceIds = []
self.record['children'] = []
for res in resources:
self.append(res)
@property
def resourceIds(self):
- return [res.resourceId for res in self]
+ if not self._resourceIds:
+ self._resourceIds = [res.resourceId for res in self]
+ return self._resourceIds
class ExternalResource(RecordWrapper):
@@ -1000,7 +1022,6 @@ class DataSetMetadata(RecordWrapper):
else:
self.append(other.summaryStats)
if not self.namespace:
- print "HERE"
self.namespace = other.namespace
self.attrib.update(other.attrib)
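
As an aside on the accessor-map changes above: filterIndexRecords now selects the richer mapped accessor map only when the index actually carries alignment columns. A minimal standalone sketch of that dispatch (illustrative only, not pbcore code; field names follow the .pbi conventions used above):

import numpy as np

def pick_acc_map(index):
    # base map: fields available in any .pbi index
    base = {'zm': (lambda x: x['holeNumber']),
            'length': (lambda x: x['qEnd'] - x['qStart'])}
    if 'aEnd' in index.dtype.names:
        # mapped index: alignment columns are present, so expose them too
        base.update({'length': (lambda x: x['aEnd'] - x['aStart']),
                     'tstart': (lambda x: x['tStart']),
                     'tend': (lambda x: x['tEnd'])})
    return base

unmapped = np.zeros(1, dtype=[('holeNumber', int), ('qStart', int), ('qEnd', int)])
mapped = np.zeros(1, dtype=[('holeNumber', int), ('qStart', int), ('qEnd', int),
                            ('aStart', int), ('aEnd', int),
                            ('tStart', int), ('tEnd', int)])
assert 'tstart' not in pick_acc_map(unmapped)
assert 'tstart' in pick_acc_map(mapped)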
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index 3c932db..eadf00a 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -57,6 +57,7 @@ def _addXmlFile(dset, path):
root = tree.getroot()
tmp = _parseXml(type(dset), root)
tmp.makePathsAbsolute(curStart=os.path.dirname(path))
+ # copyOnMerge must be False: we merge a temporary dataset into dset and keep dset
dset.merge(tmp, copyOnMerge=False)
def openFofnFile(path):
diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py
index d0f9980..d588198 100755
--- a/pbcore/io/dataset/DataSetValidator.py
+++ b/pbcore/io/dataset/DataSetValidator.py
@@ -16,7 +16,7 @@ def validateResources(xmlroot, relTo='.'):
Args:
xmlroot: The ET root of an xml tree
relTo: ('.') The path relative to which resources may reside. This will
- work poorly if relTo is not set to the location of the incoming
+ work poorly if relTo is not set to the dirname of the incoming
XML file.
"""
stack = [xmlroot]
@@ -28,8 +28,10 @@ def validateResources(xmlroot, relTo='.'):
parsedId = urlparse(resId)
rfn = urlparse(resId).path.strip()
if not os.path.isfile(rfn):
- if not os.path.isfile(os.path.join(os.path.dirname(relTo),
- rfn)):
+ if (not os.path.isfile(os.path.join(relTo,
+ rfn)) and
+ not os.path.isfile(os.path.join('.',
+ rfn))):
raise IOError, "{f} not found".format(f=rfn)
def validateLxml(xml_fn, xsd_fn):
@@ -49,13 +51,10 @@ def validateMiniXsv(xml_fn, xsd_fn):
except ImportError:
log.debug('minixsv not found, validation disabled')
-def validateXml(xmlroot, skipResources=False):
- # pyxb precompiles and therefore does not need the original xsd file.
- #if not os.path.isfile(XSD):
- #raise SystemExit, "Validation xsd {s} not found".format(s=XSD)
+def validateXml(xmlroot, skipResources=False, relTo='.'):
- if not skipResources: # XXX generally a bad idea, but useful for pbvalidate
- validateResources(xmlroot)
+ if not skipResources:
+ validateResources(xmlroot, relTo)
# Conceal the first characters of UniqueIds if they are legal numbers that
# would for some odd reason be considered invalid. Let all illegal
@@ -75,14 +74,9 @@ def validateFile(xmlfn, skipResources=False):
if ':' in xmlfn:
xmlfn = urlparse(xmlfn).path.strip()
with open(xmlfn, 'r') as xmlfile:
- #root = etree.parse(xmlfile)
root = ET.parse(xmlfile).getroot()
- return validateXml(root, skipResources=skipResources)
-
-def validateString(xmlstring, relTo='.'):
- #root = etree.parse(xmlfile)
- root = ET.fromstring(xmlstring)
- validateResources(root, relTo=relTo)
- #return validateXml(root)
-
+ return validateXml(root, skipResources=skipResources,
+ relTo=os.path.dirname(xmlfn))
+def validateString(xmlString, skipResources=False, relTo='.'):
+ validateXml(ET.fromstring(xmlString), skipResources, relTo)
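
The validateResources change above tries three locations for each resource. A small sketch of that resolution order (resolve_resource is a hypothetical helper, not part of the module):

import os
from urlparse import urlparse  # Python 2, matching the module above

def resolve_resource(resource_id, rel_to='.'):
    # 1) the path as given, 2) relative to the XML file's directory (relTo),
    # 3) relative to the current working directory
    rfn = urlparse(resource_id).path.strip()
    for candidate in (rfn, os.path.join(rel_to, rfn), os.path.join('.', rfn)):
        if os.path.isfile(candidate):
            return candidate
    raise IOError("{f} not found".format(f=rfn))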
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index 976572d..6275b4f 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -158,9 +158,9 @@ def _toElementTree(dataSet, root=None, core=False):
root = ET.Element(rootType, attribs)
_addExternalResourcesElement(dataSet, root, core)
- _addDataSetMetadataElement(dataSet, root)
_addFiltersElement(dataSet, root)
_addDataSetsElement(dataSet, root)
+ _addDataSetMetadataElement(dataSet, root)
xsi = "{http://www.w3.org/2001/XMLSchema-instance}"
# The ElementTree element dictionary doesn't quite work the same as a
# regular dictionary, it seems, thus the convoluted get/set business
@@ -234,8 +234,15 @@ def _guessNs(tag):
def _eleFromDictList(eleAsDict, core=False):
"""A last ditch capture method for uknown Elements"""
+ if eleAsDict['tag'] == 'DataSets':
+ print eleAsDict['namespace']
if not eleAsDict['namespace']:
eleAsDict['namespace'] = _guessNs(eleAsDict['tag'])
+ elif (eleAsDict['namespace'] ==
+ "http://pacificbiosciences.com/PacBioDataModel.xsd"):
+ newNamespace = _guessNs(eleAsDict['tag'])
+ if newNamespace != '':
+ eleAsDict['namespace'] = newNamespace
if core:
ele = ET.Element("{{{n}}}{t}".format(n=eleAsDict['namespace'],
t=eleAsDict['tag']),
@@ -258,9 +265,7 @@ def _addFiltersElement(dataset, root, core=False):
root: The root ElementTree object. Extended here using SubElement
"""
if dataset.filters:
- filters = ET.SubElement(root, 'Filters')
- for child in dataset.filters.record['children']:
- filters.append(_eleFromDictList(child, core))
+ root.append(_eleFromDictList(dataset.filters.record, core=core))
def _addDataSetsElement(dataset, root):
"""Add DataSet Elements to root, which essentially nests ElementTrees.
@@ -269,9 +274,13 @@ def _addDataSetsElement(dataset, root):
root: The root ElementTree object. Extended here using SubElement
"""
if dataset.subdatasets:
- dse = ET.SubElement(root, 'DataSets')
+ rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'],
+ t='DataSets')
+ dse = ET.SubElement(root, rootType)
for subSet in dataset.subdatasets:
- subSetRoot = ET.SubElement(dse, subSet.__class__.__name__,
+ rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'],
+ t=subSet.__class__.__name__)
+ subSetRoot = ET.SubElement(dse, rootType,
subSet.objMetadata)
_toElementTree(subSet, subSetRoot)
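
The writer changes above build namespaced tags in ElementTree's Clark notation ("{uri}LocalName"). A short sketch of the same construction (the pbds URI below is assumed for illustration):

import xml.etree.ElementTree as ET

PBDS = "http://pacificbiosciences.com/PacBioDatasets.xsd"  # assumed namespace URI

root = ET.Element("{{{n}}}{t}".format(n=PBDS, t="DataSets"))
ET.SubElement(root, "{{{n}}}{t}".format(n=PBDS, t="AlignmentSet"))
print(ET.tostring(root))  # serialized with an auto-generated namespace prefix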
diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py
index 47aede4..cdf02c7 100755
--- a/pbcore/io/dataset/EntryPoints.py
+++ b/pbcore/io/dataset/EntryPoints.py
@@ -2,7 +2,9 @@
import os
import argparse
+from collections import defaultdict
from pbcore.io import DataSet, ContigSet, openDataSet
+from pbcore.io.dataset.DataSetMembers import Filters
from pbcore.io.dataset.DataSetValidator import validateFile
import logging
@@ -24,7 +26,13 @@ def summarize_options(parser):
def createXml(args):
dsTypes = DataSet.castableTypes()
dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
- skipCounts=args.skipCounts)
+ skipCounts=args.skipCounts,
+ generateIndices=args.generateIndices)
+ if args.generateIndices:
+ # we generated the indices with the last open; let's capture them with
+ # this one:
+ dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
+ skipCounts=args.skipCounts)
log.debug("Dataset created")
dset.write(args.outfile, validate=args.novalidate, modPaths=True,
relPaths=args.relative)
@@ -42,6 +50,8 @@ def create_options(parser):
help="The fofn or BAM file(s) to make into an XML")
parser.add_argument("--type", type=str, default='DataSet',
dest='dsType', help="The type of XML to create")
+ parser.add_argument("--generateIndices", action='store_true',
+ default=False, help="The type of XML to create")
parser.add_argument("--novalidate", action='store_false', default=True,
help=("Don't validate the resulting XML, don't touch "
"paths"))
@@ -51,34 +61,38 @@ def create_options(parser):
parser.set_defaults(func=createXml)
def filterXml(args):
- log.error("Adding filters via CLI is temporarily out of order")
- exit(1)
if args.infile.endswith('xml'):
dataSet = openDataSet(args.infile, strict=args.strict)
- filters = []
+ filters = defaultdict(list)
separators = ['<=', '>=', '!=', '==', '>', '<', '=']
for filt in args.filters:
for sep in separators:
if sep in filt:
param, condition = filt.split(sep)
- condition = sep + condition
- filters[param] = condition
+ condition = (sep, condition)
+ filters[param].append(condition)
break
- dataSet.addFilters([filters])
+ dataSet.filters.addRequirement(**filters)
+ dataSet.updateCounts()
log.info("{i} filters added".format(i=len(filters)))
dataSet.write(args.outfile)
else:
raise IOError("No files found/found to be compatible")
def filter_options(parser):
- parser.description = 'Add filters to an XML file'
+ pbiFilterOptions = set(Filters()._pbiMappedVecAccMap({}).keys())
+ bamFilterOptions = set(Filters()._bamAccMap.keys())
+ parser.description = ('Add filters to an XML file. Suggested fields: '
+ '{f}. More expensive fields: {b}'.format(
+ f=sorted(list(pbiFilterOptions)),
+ b=sorted(list(bamFilterOptions - pbiFilterOptions))))
#parser.add_argument("infile", type=validate_file,
parser.add_argument("infile", type=str,
help="The xml file to filter")
parser.add_argument("outfile", type=str, help="The resulting xml file")
parser.add_argument("filters", type=str, nargs='+',
help="The values and thresholds to filter (e.g. "
- "rq>0.85)")
+ "'rq>0.85')")
parser.set_defaults(func=filterXml)
def splitXml(args):
@@ -207,7 +221,8 @@ def consolidateXml(args):
"""Combine BAMs and apply the filters described in the XML file, producing
one consolidated XML"""
dset = openDataSet(args.infile)
- dset.consolidate(args.datafile, numFiles=args.numFiles)
+ dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not
+ args.noTmp))
dset.write(args.xmlfile)
def consolidate_options(parser):
@@ -215,6 +230,9 @@ def consolidate_options(parser):
#parser.add_argument("infile", type=validate_file,
parser.add_argument("--numFiles", type=int, default=1,
help="The number of data files to produce (1)")
+ parser.add_argument("--noTmp", default=False, action='store_true',
+ help="Don't copy to a tmp location to ensure local"
+ " disk use")
parser.add_argument("infile", type=str,
help="The XML file to consolidate")
parser.add_argument("datafile", type=str,
diff --git a/pbcore/io/dataset/_pbds.py b/pbcore/io/dataset/_pbds.py
index a027058..34d0e05 100755
--- a/pbcore/io/dataset/_pbds.py
+++ b/pbcore/io/dataset/_pbds.py
@@ -1632,7 +1632,7 @@ CTD_ANON_._Automaton = _BuildAutomaton_()
-DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3)))
+DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.long, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3)))
DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'NumRecords'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 53, 3)))
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index 05fccb4..acf131e 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -5,13 +5,28 @@ import os
import tempfile
import logging
import json
+import shutil
+import pysam
from pbcore.util.Process import backticks
log = logging.getLogger(__name__)
-def consolidateBams(inFiles, outFile, filterDset=None):
+def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
"""Take a list of infiles, an outFile to produce, and optionally a dataset
(filters) that defines the filtering to apply."""
+ # check space available
+ if useTmp:
+ tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
+ if (disk_free(os.path.split(outFile)[0]) >
+ sum(file_size(infn) for infn in inFiles)):
+ tmpInFiles = _tmpFiles(inFiles, tmpout)
+ origOutFile = outFile
+ origInFiles = inFiles[:]
+ inFiles = tmpInFiles
+ outFile = os.path.join(tmpout, "outfile.bam")
+ else:
+ useTmp = False
+
if filterDset and filterDset.filters:
finalOutfile = outFile
outFile = _infixFname(outFile, "_unfiltered")
@@ -21,6 +36,29 @@ def consolidateBams(inFiles, outFile, filterDset=None):
outFile = finalOutfile
_indexBam(outFile)
_pbindexBam(outFile)
+ if useTmp:
+ shutil.copy(outFile, origOutFile)
+ shutil.copy(outFile + ".bai", origOutFile + ".bai")
+ shutil.copy(outFile + ".pbi", origOutFile + ".pbi")
+
+def _tmpFiles(inFiles, tmpout=None):
+ tmpInFiles = []
+ if tmpout is None:
+ tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
+ for i, fname in enumerate(inFiles):
+ newfn = _infixFname(os.path.join(tmpout, os.path.basename(fname)), i)
+ shutil.copy(fname, newfn)
+ tmpInFiles.append(newfn)
+ return tmpInFiles
+
+def disk_free(path):
+ if path == '':
+ path = os.getcwd()
+ space = os.statvfs(path)
+ return space.f_bavail * space.f_frsize
+
+def file_size(path):
+ return os.stat(path).st_size
def _pbindexBam(fname):
cmd = "pbindex {i}".format(i=fname)
@@ -36,30 +74,13 @@ def _sortBam(fname):
o, r, m = backticks(cmd)
if r != 0:
raise RuntimeError(m)
- _mvFile(tmpname, fname)
-
-def _mvFile(inFname, outFname):
- cmd = "mv {i} {o}".format(i=inFname,
- o=outFname)
- log.info(cmd)
- o, r, m = backticks(cmd)
- if r != 0:
- raise RuntimeError(m)
-
-def _cpFile(inFname, outFname):
- cmd = "cp {i} {o}".format(i=inFname,
- o=outFname)
- log.info(cmd)
- o, r, m = backticks(cmd)
- if r != 0:
- raise RuntimeError(m)
+ shutil.move(tmpname, fname)
def _indexBam(fname):
- cmd = "samtools index {i}".format(i=fname)
- log.info(cmd)
- o, r, m = backticks(cmd)
- if r != 0:
- raise RuntimeError(m)
+ pysam.index(fname)
+
+def _indexFasta(fname):
+ pysam.faidx(fname)
def _mergeBams(inFiles, outFile):
if len(inFiles) > 1:
@@ -70,10 +91,7 @@ def _mergeBams(inFiles, outFile):
if r != 0:
raise RuntimeError(m)
else:
- _cpFile(inFiles[0], outFile)
-
-def _maxDigits(numItems):
- return len(str(numItems))
+ shutil.copy(inFiles[0], outFile)
def _filterBam(inFile, outFile, filterDset):
tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
@@ -88,11 +106,12 @@ def _filterBam(inFile, outFile, filterDset):
def _infixFname(fname, infix):
prefix, extension = os.path.splitext(fname)
- return prefix + infix + extension
+ return prefix + str(infix) + extension
def _emitFilterScript(filterDset, filtScriptName):
- bamtoolsFilters = ['reference', 'insertSize', 'mapQuality', 'name',
- 'position', 'queryBases']
+ """Use the filter script feature of bamtools. Use with specific filters if
+ all that are needed are available, otherwise filter by readname (easy but
+ uselessly expensive)"""
filterMap = {'rname': 'reference',
'pos': 'position',
'length': 'queryBases',
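
The consolidateBams change above stages inputs to a scratch directory only when the available space covers the combined input size, then copies the result and its .bai/.pbi indices back. A rough sketch of that staging decision (illustrative only; Unix-specific via os.statvfs):

import os
import shutil
import tempfile

def _free_bytes(path):
    vfs = os.statvfs(path or os.getcwd())
    return vfs.f_bavail * vfs.f_frsize

def stage_to_tmp(in_files, out_file):
    """Return (in_files, out_file, orig_out); orig_out is None if not staged."""
    tmpdir = tempfile.mkdtemp(suffix="consolidation-filtration")
    needed = sum(os.stat(fn).st_size for fn in in_files)
    if _free_bytes(os.path.split(out_file)[0]) <= needed:
        return in_files, out_file, None  # not enough room, work in place
    staged = []
    for fn in in_files:
        dest = os.path.join(tmpdir, os.path.basename(fn))
        shutil.copy(fn, dest)
        staged.append(dest)
    return staged, os.path.join(tmpdir, "outfile.bam"), out_file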
diff --git a/tests/test_pbcore_extract_bam_sequence.py b/tests/test_pbcore_extract_bam_sequence.py
index cc516fd..ffd0399 100644
--- a/tests/test_pbcore_extract_bam_sequence.py
+++ b/tests/test_pbcore_extract_bam_sequence.py
@@ -14,11 +14,11 @@ import shutil
import os
sam_str_ = """\
-@HD VN:1.5\tSO:coordinate\tpb:3.0b7
+@HD VN:1.5\tSO:coordinate\tpb:3.0.1
@SQ\tSN:genome\tLN:28\tM5:734d5f3b2859595f4bd87a2fe6b7389b
@RG\tID:1\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd=ip;BASECALLERVERSION=2.0.1.0.123678;FRAMERATEHZ=75.000000;BINDINGKIT=foo;SEQUENCINGKIT=bar\tPU:movie1
@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\tDS:bax2bam
-movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:i:854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
+movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
fasta_str = """\
>genome
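
The test data change above tracks the 3.0.1 BAM spec: the pb: header tag is now a release version and rq is stored as a float in [0, 1] rather than an integer. A hedged sketch of checking the rq tag with pysam (the file name is hypothetical):

import pysam

with pysam.AlignmentFile("example.subreads.bam", "rb", check_sq=False) as bam:
    for aln in bam:
        rq = aln.opt("rq")  # read quality, e.g. 0.854
        assert 0.0 <= rq <= 1.0
        break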
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index cac5d36..7d3bb97 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -2,9 +2,12 @@ from numpy.testing import (assert_array_almost_equal as ASIM,
assert_array_equal as AEQ)
from nose.tools import (nottest,
assert_raises,
- assert_equal as EQ)
+ assert_equal as EQ,
+ assert_almost_equal as EQISH)
from nose import SkipTest
+import tempfile
+import pysam
import numpy as np
import bisect
import h5py
@@ -407,10 +410,10 @@ class TestBasicBam(_BasicAlnFileReaderTests):
CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[0], data.getLambdaFasta())
def testSpecVersion(self):
- EQ("3.0b7", self.f.version)
+ EQ("3.0.1", self.f.version)
def testReadScore(self):
- EQ(0.904, self.fwdAln.readScore)
+ EQISH(0.904, self.fwdAln.readScore, 3)
class TestIndexedBam(_IndexedAlnFileReaderTests):
@@ -439,3 +442,33 @@ class TestCCSBam(object):
aln.qStart
with assert_raises(UnavailableFeature):
aln.qEnd
+
+
+class TestMultipleReadGroups(object):
+ """
+ Verify that BAM files with multiple read groups are handled sensibly - see
+ bug 26548.
+ """
+ SAM_IN = """\
@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1
+ at SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b
+ at RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200\tPU:movie1
+ at RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\tPU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0
+movie1/54130/0_10\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t2\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:3f58e5b8\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2
+m140906_231018_42161_c100676332550000001823129611271486_s1_p0/1/10_20\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t12\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:b5482b33\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:20\tqs:i:10\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
+
+ def test_retrieve_read_group_properties(self):
+ f1 = tempfile.NamedTemporaryFile(suffix=".sam").name
+ f2 = tempfile.NamedTemporaryFile(suffix=".bam").name
+ with open(f1, "w") as f:
+ f.write(self.SAM_IN)
+ with pysam.AlignmentFile(f1) as sam_in:
+ with pysam.AlignmentFile(f2, 'wb', template=sam_in) as bam_out:
+ for aln in sam_in:
+ bam_out.write(aln)
+ movie_names = []
+ with BamReader(f2) as bam_in:
+ for aln in bam_in:
+ EQ(aln.sequencingChemistry, "P6-C4")
+ movie_names.append(aln.movieName)
+ EQ(movie_names, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py
index 8144910..d2e86b1 100644
--- a/tests/test_pbcore_io_GffIO.py
+++ b/tests/test_pbcore_io_GffIO.py
@@ -6,7 +6,7 @@ import os.path
from nose.tools import assert_equal, assert_raises
from pbcore.io import GffWriter, Gff3Record, GffReader
-from pbcore.io.GffIO import merge_gffs, sort_gff
+from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted, sort_gff
from pbcore import data
class TestGff3Record:
@@ -109,11 +109,16 @@ class TestGffWriter:
class TestGffSorting(unittest.TestCase):
gff_data = ["""\
##gff-version 3
-##some random comment here
+##source ipdSummary
+##source-commandline ipdSummary etc.
+##sequence-region lambda_NEB3011 1 48502
chr1\tkinModCall\tmodified_base\t32580\t32580\t32\t-\t.\tcoverage=94;context=AATGGCATCGTTCCGGTGGTGGGCGTTGATGGCTGGTCCCG;IPDRatio=1.75
chr1\tkinModCall\tmodified_base\t32766\t32766\t42\t-\t.\tcoverage=170;context=GCTGGGAAGCTGGCTGAACGTGTCGGCATGGATTCTGTCGA;IPDRatio=1.70
chr1\tkinModCall\tmodified_base\t32773\t32773\t54\t-\t.\tcoverage=154;context=AACGCTGGCTGGGAAGCTGGCTGAACGTGTCGGCATGGATT;IPDRatio=2.65""", """\
##gff-version 3
+##source ipdSummary
+##source-commandline ipdSummary etc.
+##sequence-region lambda_NEB3011 1 48502
chr2\tkinModCall\tmodified_base\t1200\t1200\t47\t-\t.\tcoverage=109;context=ACTTTTCACGGTAGTTTTTTGCCGCTTTACCGCCCAGGCAC;IPDRatio=1.89
chr2\tkinModCall\tmodified_base\t1786\t1786\t36\t-\t.\tcoverage=153;context=TCCCACGTCTCACCGAGCGTGGTGTTTACGAAGGTTTTACG;IPDRatio=1.67
chr2\tkinModCall\tmodified_base\t1953\t1953\t39\t+\t.\tcoverage=148;context=AATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGC;IPDRatio=1.86""", """\
@@ -152,8 +157,24 @@ chr1\tkinModCall\tmodified_base\t16348\t16348\t42\t-\t.\tcoverage=115;context=CC
os.remove(cls.combined)
def test_merge_gffs(self):
- gff_out = "tmp_pbcore_merged.gff"
+ gff_out = "tmp_pbcore_merge.gff"
merge_gffs(self.files, gff_out)
+ n_rec = 0
+ for fn in self.files:
+ with GffReader(fn) as f:
+ n_rec += len([ rec for rec in f ])
+ with GffReader(gff_out) as f:
+ self.assertEqual(f.headers, [
+ "##gff-version 3",
+ "##source ipdSummary",
+ "##sequence-region lambda_NEB3011 1 48502",
+ ])
+ n_rec_merged = len([ rec for rec in f ])
+ self.assertEqual(n_rec, n_rec_merged)
+
+ def test_merge_gffs_sorted(self):
+ gff_out = "tmp_pbcore_merged_sorted.gff"
+ merge_gffs_sorted(self.files, gff_out)
with GffReader(gff_out) as f:
start = [ (rec.seqid, rec.start) for rec in f ]
self.assertEqual(start, self.sorted_start)
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index 66ad971..5457906 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -7,8 +7,10 @@ import tempfile
import numpy as np
import unittest
+import shutil
from unittest.case import SkipTest
+from pbcore.io import PacBioBamIndex, IndexedBamReader
from pbcore.io import openIndexedAlignmentFile
from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
openDataSet, DataSetMetaTypes, HdfSubreadSet)
@@ -21,6 +23,34 @@ import pbcore.data.datasets as data
log = logging.getLogger(__name__)
+def _check_constools():
+ cmd = "dataset.py"
+ o, r, m = backticks(cmd)
+ if r != 2:
+ return False
+
+ cmd = "bamtools"
+ o, r, m = backticks(cmd)
+ if r != 0:
+ return False
+
+ cmd = "pbindex"
+ o, r, m = backticks(cmd)
+ if r != 1:
+ return False
+
+ cmd = "samtools"
+ o, r, m = backticks(cmd)
+ if r != 1:
+ return False
+ return True
+
+def _internal_data():
+ if os.path.exists("/mnt/secondary-siv/testdata"):
+ return True
+ return False
+
+
class TestDataSet(unittest.TestCase):
"""Unit and integrationt tests for the DataSet class and \
associated module functions"""
@@ -39,7 +69,9 @@ class TestDataSet(unittest.TestCase):
# e.g. d.write("alignmentset.xml")
outdir = tempfile.mkdtemp(suffix="dataset-unittest")
outXml = os.path.join(outdir, 'tempfile.xml')
- d.write(outXml)
+ log.debug(outXml)
+ # don't validate, type DataSet doesn't validate well
+ d.write(outXml, validate=False)
# And then recover the same XML (or a different one):
# e.g. d = DataSet("alignmentset.xml")
d = DataSet(outXml)
@@ -79,6 +111,8 @@ class TestDataSet(unittest.TestCase):
ds.externalResources[-1].indices[0].resourceId ==
"IdontExist.bam.pbi")
+ @unittest.skipIf(not _check_constools(),
+ "bamtools or pbindex not found, skipping")
def test_split_cli(self):
outdir = tempfile.mkdtemp(suffix="dataset-unittest")
cmd = "dataset.py split --outdir {o} --contigs --chunks 2 {d}".format(
@@ -92,6 +126,71 @@ class TestDataSet(unittest.TestCase):
self.assertTrue(os.path.exists(
os.path.join(outdir, os.path.basename(data.getXml(16)))))
+ @unittest.skipIf(not _check_constools(),
+ "bamtools or pbindex not found, skipping")
+ def test_filter_cli(self):
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, "filtered8.xml")
+ log.debug(outfn)
+ cmd = "dataset.py filter {i} {o} {f}".format(
+ i=data.getXml(8),
+ o=outfn,
+ f="rname=E.faecalis.1")
+ log.debug(cmd)
+ o, r, m = backticks(cmd)
+ if r != 0:
+ log.debug(m)
+ self.assertEqual(r, 0)
+ self.assertTrue(os.path.exists(outfn))
+ aln = AlignmentSet(data.getXml(8))
+ aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+ aln.updateCounts()
+ dset = AlignmentSet(outfn)
+ self.assertEqual(str(aln.filters), str(dset.filters))
+ self.assertEqual(aln.totalLength, dset.totalLength)
+ self.assertEqual(aln.numRecords, dset.numRecords)
+
+ def test_merge_subdatasets(self):
+ # from data file
+ ds1 = AlignmentSet(data.getBam(0))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ ds2 = AlignmentSet(data.getBam(1))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ merged = ds1 + ds2
+ self.assertEqual(len(merged.subdatasets), 2)
+ self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+ AlignmentSet(data.getBam(0)).toExternalFiles())
+ self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+ self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+ AlignmentSet(data.getBam(1)).toExternalFiles())
+ self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+
+ # from data set
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ ds2 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds2.subdatasets), 0)
+ merged = ds1 + ds2
+ self.assertEqual(len(merged.subdatasets), 2)
+ self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+ AlignmentSet(data.getXml(8)).toExternalFiles())
+ self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+ self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+ AlignmentSet(data.getXml(11)).toExternalFiles())
+ self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+
+ # combined data set
+ merged = AlignmentSet(data.getXml(8), data.getXml(11))
+ self.assertEqual(len(merged.subdatasets), 2)
+ self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+ self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+ AlignmentSet(data.getXml(8)).toExternalFiles())
+ self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+ self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+ AlignmentSet(data.getXml(11)).toExternalFiles())
+
+ @unittest.skipIf(not _check_constools(),
+ "bamtools or pbindex not found, skipping")
def test_create_cli(self):
log.debug("Absolute")
outdir = tempfile.mkdtemp(suffix="dataset-unittest")
@@ -214,6 +313,25 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(aln.totalLength, -1)
self.assertEqual(aln.numRecords, -1)
+ # TODO: replace this with a reproducible bam file and move test upstream
+ @unittest.skip("Skip until suitable barcoded files found and updated")
+ def test_barcode_accession(self):
+ testFile = data.getBarcodedBam()
+ # Test the pbi file:
+ bam = IndexedBamReader(testFile)
+ pbi = PacBioBamIndex(testFile + '.pbi')
+ for brec, prec in zip(bam, pbi):
+ brec_bc = brec.peer.opt("bc")
+ prec_bc = [prec.bcLeft, prec.bcRight]
+ self.assertEqual(brec_bc, prec_bc)
+
+ # Test split by barcode:
+ ss = SubreadSet(testFile)
+ sss = ss.split(chunks=2, barcodes=True)
+ self.assertEqual(len(sss), 2)
+ for sset in sss:
+ self.assertTrue(len(sset.barcodes) > 1)
+
def test_attributes(self):
aln = AlignmentSet(data.getBam(0))
self.assertEqual(aln.sequencingChemistry, ['unknown'])
@@ -375,6 +493,15 @@ class TestDataSet(unittest.TestCase):
[('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
55, 1815)])
+ @unittest.skipUnless(os.path.isdir("/mnt/secondary-siv/testdata"),
+ "Missing testadata directory")
+ def test_large_pbi(self):
+ pbiFn = ('/mnt/secondary-siv/testdata/SA3-DS/lambda/simulated'
+ '/100Gb/alnsubreads/pbalchemy_100Gb_Seq_sim1_p0.'
+ 'aligned.bam.pbi')
+ pbi = PacBioBamIndex(pbiFn)
+ self.assertFalse(pbi.aStart is None)
+
def test_copy(self):
ds1 = DataSet(data.getXml(12))
ds2 = ds1.copy()
@@ -597,8 +724,9 @@ class TestDataSet(unittest.TestCase):
num += 1
self.assertTrue(num > 2000)
- @unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"),
- "Missing testadata directory")
+ @unittest.skip("Skip until /mnt/secondary/Share/Quiver/ updated")
+ #@unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"),
+ # "Missing testadata directory")
def test_reads_in_range_order_large(self):
window = ('Staphylococcus_aureus_subsp_aureus_USA300_TCH1516',
558500,
@@ -811,7 +939,7 @@ class TestDataSet(unittest.TestCase):
dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
self.assertEqual(len(dss), 3)
sizes = sorted([dset.numRecords for dset in dss])
- self.assertListEqual(sizes, [22, 31, 39])
+ self.assertListEqual(sizes, [20, 24, 48])
refWindows = sorted(reduce(lambda x, y: x + y,
[ds.refWindows for ds in dss]))
for ref in random_few:
@@ -933,25 +1061,24 @@ class TestDataSet(unittest.TestCase):
log.debug(dss[0].filters)
log.debug(dss[1].filters)
self.assertTrue(
- '( rname = E.faecalis.2 ) '
+ '( rname = E.faecalis.2 '
in str(dss[0].filters)
or
- '( rname = E.faecalis.2 ) '
+ '( rname = E.faecalis.2 '
in str(dss[1].filters))
ds = AlignmentSet(data.getBam())
- ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
- ('=', 'lambda_NEB3011')],
- tStart=[('<', '0'),
- ('<', '100')],
- tEnd=[('>', '99'),
- ('>', '299')])
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+ ('=', 'E.faecalis.2')],
+ tStart=[('<', '99'),
+ ('<', '299')],
+ tEnd=[('>', '0'),
+ ('>', '100')])
self.assertEqual(str(ds.filters),
- '( rname = lambda_NEB3011 AND tstart '
- '< 0 AND tend > 99 ) OR ( rname = lambd'
- 'a_NEB3011 AND tstart < 100 AND tend > 299 )')
- # TODO: look into swapping around refWindows to make this work:
- #self.assertEqual(ds.refWindows, [('lambda_NEB3011', 0, 99),
- #('lambda_NEB3011', 100, 299)])
+ '( rname = E.faecalis.2 AND tstart '
+ '< 99 AND tend > 0 ) OR ( rname = '
+ 'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
+ self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
+ ('E.faecalis.2', 100, 299)])
def test_refLengths(self):
@@ -1064,6 +1191,28 @@ class TestDataSet(unittest.TestCase):
for i, item in enumerate(aln):
self.assertEqual(item, aln[i])
+ @unittest.skipIf(not _check_constools(),
+ "bamtools or pbindex not found, skipping")
+ def test_induce_indices(self):
+ # all of our test files are indexed. Copy just the main files to a temp
+ # location, open as dataset, assert unindexed, open with
+ # generateIndices=True, assert indexed
+ toTest = [8, 9, 10, 11, 12, 15, 16]
+ for fileNo in toTest:
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ orig_dset = openDataSet(data.getXml(fileNo))
+ resfnames = orig_dset.toExternalFiles()
+ new_resfnames = []
+ for fname in resfnames:
+ newfname = os.path.join(outdir, os.path.basename(fname))
+ shutil.copy(fname, newfname)
+ new_resfnames.append(newfname)
+ dset = type(orig_dset)(*new_resfnames)
+ self.assertFalse(dset.isIndexed)
+ dset = type(orig_dset)(*new_resfnames, generateIndices=True)
+ self.assertTrue(dset.isIndexed)
+
+
def test_reads_in_reference(self):
ds = AlignmentSet(data.getBam())
refNames = ds.refNames
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 71d2869..3500d66 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -12,7 +12,7 @@ from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
ReferenceSet, ContigSet, AlignmentSet,
FastaReader, FastaWriter, IndexedFastaReader,
HdfSubreadSet, ConsensusAlignmentSet,
- openDataFile)
+ openDataFile, FastaWriter)
import pbcore.data as upstreamData
import pbcore.data.datasets as data
from pbcore.io.dataset.DataSetValidator import validateXml
@@ -21,6 +21,11 @@ import xml.etree.ElementTree as ET
log = logging.getLogger(__name__)
def _check_constools():
+ cmd = "dataset.py"
+ o, r, m = backticks(cmd)
+ if r != 2:
+ return False
+
cmd = "bamtools"
o, r, m = backticks(cmd)
if r != 0:
@@ -137,6 +142,22 @@ class TestDataSet(unittest.TestCase):
self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
+ def test_ccsset_from_bam(self):
+ # DONE bug 28698
+ ds1 = ConsensusReadSet(upstreamData.getCCSBAM(), strict=False)
+ fn = tempfile.NamedTemporaryFile(suffix=".consensusreadset.xml").name
+ log.debug(fn)
+ ds1.write(fn, validate=False)
+ ds1.write(fn)
+
+ def test_subreadset_from_bam(self):
+ # DONE control experiment for bug 28698
+ bam = upstreamData.getUnalignedBam()
+ ds1 = SubreadSet(bam, strict=False)
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ log.debug(fn)
+ ds1.write(fn)
+
def test_ccsalignment_build(self):
ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False)
self.assertEquals(type(ds1).__name__, 'ConsensusAlignmentSet')
@@ -146,6 +167,22 @@ class TestDataSet(unittest.TestCase):
#self.assertEquals(type(ds2).__name__, 'ConsensusAlignmentSet')
#self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
+ def test_contigset_write(self):
+ fasta = upstreamData.getLambdaFasta()
+ ds = ContigSet(fasta)
+ self.assertTrue(isinstance(ds.resourceReaders()[0],
+ IndexedFastaReader))
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, 'test.fasta')
+ w = FastaWriter(outfn)
+ for rec in ds:
+ w.writeRecord(rec)
+ w.close()
+ fas = FastaReader(outfn)
+ for rec in fas:
+ # make sure a __repr__ didn't slip through:
+ self.assertFalse(rec.sequence.startswith('<'))
+
def test_file_factory(self):
# TODO: add ConsensusReadSet, cmp.h5 alignmentSet
types = [AlignmentSet(data.getXml(8)),
@@ -209,6 +246,13 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(read1, read2)
self.assertEqual(len(aln), len(nonCons))
+ # Test that it is a valid xml:
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ datafile = os.path.join(outdir, "apimerged.bam")
+ xmlfile = os.path.join(outdir, "apimerged.xml")
+ log.debug(xmlfile)
+ aln.write(xmlfile)
+
log.debug("Test with cheap filter")
aln = AlignmentSet(data.getXml(12))
self.assertEqual(len(list(aln)), 177)
@@ -286,7 +330,7 @@ class TestDataSet(unittest.TestCase):
"7920800000001823174110291514_s1_p0."
"all.alignmentset.xml")
aln = AlignmentSet(testFile)
- nonCons= AlignmentSet(testFile)
+ nonCons = AlignmentSet(testFile)
self.assertEqual(len(aln.toExternalFiles()), 3)
outdir = tempfile.mkdtemp(suffix="dataset-unittest")
outfn = os.path.join(outdir, 'merged.bam')
@@ -381,9 +425,39 @@ class TestDataSet(unittest.TestCase):
# open acc and compare to exp
for name, seq in zip(singletons, exp_single_seqs):
- self.assertEqual(acc_file.get_contig(name).sequence, seq)
- self.assertEqual(acc_file.get_contig(double).sequence, exp_double_seq)
+ self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
+ self.assertEqual(acc_file.get_contig(double).sequence[:],
+ exp_double_seq)
+ def test_contigset_consolidate_genomic_consensus(self):
+ """
+ Verify that the contigs output by GenomicConsensus (e.g. quiver) can
+ be consolidated.
+ """
+ FASTA1 = ("lambda_NEB3011_0_60",
+ "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
+ FASTA2 = ("lambda_NEB3011_120_180",
+ "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
+ FASTA3 = ("lambda_NEB3011_60_120",
+ "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
+ files = []
+ for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
+ _files = []
+ for suffix in ["", "|quiver", "|plurality", "|arrow"]:
+ tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
+ with open(tmpfile, "w") as f:
+ f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
+ _files.append(tmpfile)
+ files.append(_files)
+ for i in range(3):
+ ds = ContigSet(*[ f[i] for f in files ])
+ out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
+ fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
+ ds.consolidate(fa1)
+ ds.write(out1)
+ with ContigSet(out1) as ds_new:
+ self.assertEqual(len([ rec for rec in ds_new ]), 1,
+ "failed on %d" % i)
def test_split_hdfsubreadset(self):
hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23())
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git