[med-svn] [Git][med-team/mash][master] 4 commits: New upstream version 2.1.1+dfsg
Sascha Steinbiss
gitlab at salsa.debian.org
Mon Jul 8 18:39:23 BST 2019
Sascha Steinbiss pushed to branch master at Debian Med / mash
Commits:
32193931 by Sascha Steinbiss at 2019-07-08T17:32:56Z
New upstream version 2.1.1+dfsg
- - - - -
583a8da0 by Sascha Steinbiss at 2019-07-08T17:33:22Z
Merge tag 'upstream/2.1.1+dfsg'
Upstream version 2.1.1+dfsg
- - - - -
a9f98d90 by Sascha Steinbiss at 2019-07-08T17:36:54Z
new upstream release
- - - - -
691bbed4 by Sascha Steinbiss at 2019-07-08T17:37:05Z
remove patch applied upstream
- - - - -
12 changed files:
- Makefile.in
- configure.ac
- debian/changelog
- − debian/patches/capnp0_7.patch
- debian/patches/series
- doc/sphinx/data.rst
- src/mash/CommandScreen.cpp
- src/mash/CommandSketch.cpp
- src/mash/HashList.h
- src/mash/Sketch.cpp
- src/mash/sketchParameterSetup.cpp
- src/mash/version.h
Changes:
=====================================
Makefile.in
=====================================
@@ -1,4 +1,4 @@
-CXXFLAGS += -O3 -std=c++11 -Isrc -I at capnp@/include -I at mathinc@
+CXXFLAGS += -O3 -std=c++14 -Isrc -I at capnp@/include -I at mathinc@
CPPFLAGS += @amcppflags@
UNAME_S=$(shell uname -s)
=====================================
configure.ac
=====================================
@@ -33,7 +33,7 @@ then
AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
fi
-CPPFLAGS="-I$with_capnp/include -std=c++11"
+CPPFLAGS="-I$with_capnp/include -std=c++14"
AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
=====================================
debian/changelog
=====================================
@@ -1,9 +1,13 @@
-mash (2.1+dfsg-3) UNRELEASED; urgency=medium
+mash (2.1.1+dfsg-1) unstable; urgency=medium
- * Team upload.
+ [ Michael R. Crusoe ]
* Mark the -doc package as Multi-Arch: foreign
- -- Michael R. Crusoe <michael.crusoe at gmail.com> Sat, 16 Feb 2019 01:04:02 -0800
+ [ Sascha Steinbiss ]
+ * New upstream release.
+ * Remove patch applied upstream.
+
+ -- Sascha Steinbiss <satta at debian.org> Mon, 08 Jul 2019 19:33:37 +0200
mash (2.1+dfsg-2) unstable; urgency=medium
=====================================
debian/patches/capnp0_7.patch deleted
=====================================
@@ -1,23 +0,0 @@
-Description: raise C++ standard to allow building with capnp 0.7
-Author: Sascha Steinbiss <satta at debian.org>
-Bug: https://github.com/marbl/Mash/issues/98
-Last-Update: 2018-12-26
---- a/configure.ac
-+++ b/configure.ac
-@@ -33,7 +33,7 @@
- AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
- fi
-
--CPPFLAGS="-I$with_capnp/include -std=c++11"
-+CPPFLAGS="-I$with_capnp/include -std=c++14"
-
- AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
-
---- a/Makefile.in
-+++ b/Makefile.in
-@@ -1,4 +1,4 @@
--CXXFLAGS += -O3 -std=c++11 -Isrc -I at capnp@/include -I at mathinc@
-+CXXFLAGS += -O3 -std=c++14 -Isrc -I at capnp@/include -I at mathinc@
- CPPFLAGS += @amcppflags@
-
- UNAME_S=$(shell uname -s)
=====================================
debian/patches/series
=====================================
@@ -3,5 +3,4 @@ use_debian_mathjax.patch
drop_memcpy_wrapper.patch
link_dynamically_against_capnp.patch
parallel.patch
-capnp0_7.patch
use_debian_packaged_libmurmurhash.patch
=====================================
doc/sphinx/data.rst
=====================================
@@ -15,3 +15,71 @@ Mash: fast genome and metagenome distance estimation using MinHash
`SRR2671867.BaAmes.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671867.BaAmes.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (157MB)
`SRR2671868.Bc10987.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671868.Bc10987.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (250MB)
+
+Mash Screen: High-throughput sequence containment estimation for genome discovery
+---------------------------------------------------------------------------------
+
+Custom scripts and intermediate data:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`MashScreen_supp.tar.gz <https://obj.umiacs.umd.edu/mash/screen/MashScreen_supp.tar.gz>`_
+
+Data files:
+~~~~~~~~~~~
+
+Mash Sketch databases for RefSeq release 88:
+ * `RefSeq88n.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz>`_: Genomes (k=21, s=1000), 1.2Gb uncompressed
+ * `RefSeq88p.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88p.msh.gz>`_: Proteomes (k=9, s=1000), 1.1Gb uncompressed
+
+`art.fastq.gz <https://obj.umiacs.umd.edu/mash/screen/art.fastq.gz>`_: Simulated reads for Shakya experiment
+
+Figure 5:
+ * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
+ * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
+
+Public data sources
+~~~~~~~~~~~~~~~~~~~
+
+The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
+
+HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
+and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
+``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
+
+SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
+
+RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
+
+Public data products
+~~~~~~~~~~~~~~~~~~~~
+
+Quebec Polyomavirus is submitted to GenBank as BK010702.
+
+Screen of SRA metagenomes vs. RefSeq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ * `sra_meta_nucl_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_95idy.tsv.gz>`_ (2.3Gb uncompressed)
+ * `sra_meta_nucl_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_80idy_3x.tsv.gz>`_ (6.7Gb uncompressed)
+ * `sra_meta_prot_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_95idy.tsv.gz>`_ (2.1Gb uncompressed)
+ * `sra_meta_prot_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_80idy_3x.tsv.gz>`_ (8.3Gb uncompressed)
+
+These files have a line for each RefSeq genome listing all metagenomic SRA runs
+(as of August 2018) with Mash Containment Scores above the specified threshold.
+They are provided for two screen modes:
+
+* ``nucl``: Genomic RefSeq sequences
+* ``prot``: Proteomic RefSeq sequences (combined amino acid sequences per organism). **NOTE:** Protein tables above are not p-value filtered and thus large (> ~50Gb) runs may have spurious hits. They also do not contain plasmids. Updates coming soon!
+
+...and at two thresholds:
+
+* ``95idy``: 95% Mash Containment Score, any coverage. Useful for finding runs containing a specific genome.
+* ``80idy_3x``: 80% Mash Containment Score, at least 3x median k-mer multiplicity.
+ Useful for finding related, but novel, sequences.
+
+The files are tab separated, with each line beginning with a RefSeq assembly accession, followed by SRA accessions, for example:
+
+::
+
+ GCF_000001215.4 SRR3401361 SRR3540373
+ GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081
+ GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081
=====================================
src/mash/CommandScreen.cpp
=====================================
@@ -291,7 +291,7 @@ int CommandScreen::run() const
for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ )
{
- HashList hashList(parameters.kmerSize);
+ HashList hashList(parameters.use64);
(*i)->toHashList(hashList);
@@ -550,38 +550,6 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
break;
}
- //kmersTotal++; TODO
-
- if ( ! noncanonical )
- {
- bool debug = false;
- useRevComp = true;
- bool prefixEqual = true;
-
- if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;}
-
- for ( uint64_t k = 0; k < kmerSize; k++ )
- {
- char base = seq[j + k];
- char baseMinus = seqRev[l - j - kmerSize + k];
-
- if ( debug ) cout << baseMinus;
-
- if ( prefixEqual && baseMinus > base )
- {
- useRevComp = false;
- break;
- }
-
- if ( prefixEqual && baseMinus < base )
- {
- prefixEqual = false;
- }
- }
-
- if ( debug ) cout << endl;
- }
-
const char * kmer;
if ( trans )
@@ -590,14 +558,15 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
}
else
{
- kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j;
+ const char *kmer_fwd = seq + j;
+ const char *kmer_rev = seqRev + length - j - kmerSize;
+ kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
}
//cout << kmer << '\t' << kmerSize << endl;
hash_u hash = getHash(kmer, kmerSize, seed, use64);
//cout << kmer << '\t' << hash.hash64 << endl;
input->minHashHeap->tryInsert(hash);
-
uint64_t key = use64 ? hash.hash64 : hash.hash32;
if ( input->hashCounts.count(key) == 1 )
=====================================
src/mash/CommandSketch.cpp
=====================================
@@ -28,7 +28,7 @@ CommandSketch::CommandSketch()
addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", ""));
- addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", ""));
+ addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment).", ""));
useSketchOptions();
}
=====================================
src/mash/HashList.h
=====================================
@@ -15,7 +15,7 @@ class HashList
public:
HashList() {use64 = true;}
- HashList(int kmerSize) {use64 = kmerSize > 16;}
+ HashList(bool use64new) {use64 = use64new;}
hash_u at(int index) const;
void clear();
=====================================
src/mash/Sketch.cpp
=====================================
@@ -536,13 +536,10 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
for ( uint64_t i = 0; i < length - kmerSize + 1; i++ )
{
- bool useRevComp = false;
- bool debug = false;
-
// repeatedly skip kmers with bad characters
-
+ //
bool bad = false;
-
+ //
for ( uint64_t j = i; j < i + kmerSize && i + kmerSize <= length; j++ )
{
if ( ! parameters.alphabet[seq[j]] )
@@ -552,56 +549,25 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
break;
}
}
-
+ //
if ( bad )
{
continue;
}
-
+ //
if ( i + kmerSize > length )
{
// skipped to end
break;
}
- if ( ! noncanonical )
- {
- useRevComp = true;
- bool prefixEqual = true;
-
- if ( debug ) {for ( uint64_t j = i; j < i + kmerSize; j++ ) { cout << *(seq + j); } cout << endl;}
-
- for ( uint64_t j = 0; j < kmerSize; j++ )
- {
- char base = seq[i + j];
- char baseMinus = seqRev[length - i - kmerSize + j];
-
- if ( debug ) cout << baseMinus;
-
- if ( prefixEqual && baseMinus > base )
- {
- useRevComp = false;
- break;
- }
-
- if ( prefixEqual && baseMinus < base )
- {
- prefixEqual = false;
- }
- }
-
- if ( debug ) cout << endl;
- }
-
const char *kmer_fwd = seq + i;
const char *kmer_rev = seqRev + length - i - kmerSize;
- const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev;
+ const char * kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
bool filter = false;
hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64);
- if ( debug ) cout << endl;
-
minHashHeap.tryInsert(hash);
}
=====================================
src/mash/sketchParameterSetup.cpp
=====================================
@@ -61,8 +61,7 @@ int sketchParameterSetup(Sketch::Parameters & parameters, const Command & comman
if ( parameters.reads && command.getOption("threads").active )
{
- cerr << "ERROR: The option " << command.getOption("threads").identifier << " cannot be used with " << command.getOption("reads").identifier << "." << endl;
- return 1;
+ cerr << "WARNING: The option " << command.getOption("threads").identifier << " will be ignored with " << command.getOption("reads").identifier << "." << endl;
}
if ( parameters.reads && ! parameters.concatenated )
=====================================
src/mash/version.h
=====================================
@@ -4,4 +4,4 @@
//
// See the LICENSE.txt file included with this software for license information.
-static const char * version = "2.1";
+static const char * version = "2.1.1";
View it on GitLab: https://salsa.debian.org/med-team/mash/compare/b71e9b0ad22e09feeb7bae0c33321f7156dc65c3...691bbed4b3751585ac8a5a1ec7e65df2d02b1e9c
--
View it on GitLab: https://salsa.debian.org/med-team/mash/compare/b71e9b0ad22e09feeb7bae0c33321f7156dc65c3...691bbed4b3751585ac8a5a1ec7e65df2d02b1e9c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190708/d167736a/attachment-0001.html>
More information about the debian-med-commit
mailing list