[med-svn] [Git][med-team/mash][upstream] New upstream version 2.1.1+dfsg

Sascha Steinbiss gitlab at salsa.debian.org
Mon Jul 8 18:39:26 BST 2019



Sascha Steinbiss pushed to branch upstream at Debian Med / mash


Commits:
32193931 by Sascha Steinbiss at 2019-07-08T17:32:56Z
New upstream version 2.1.1+dfsg
- - - - -


9 changed files:

- Makefile.in
- configure.ac
- doc/sphinx/data.rst
- src/mash/CommandScreen.cpp
- src/mash/CommandSketch.cpp
- src/mash/HashList.h
- src/mash/Sketch.cpp
- src/mash/sketchParameterSetup.cpp
- src/mash/version.h


Changes:

=====================================
Makefile.in
=====================================
@@ -1,4 +1,4 @@
-CXXFLAGS += -O3 -std=c++11 -Isrc -I at capnp@/include -I at mathinc@
+CXXFLAGS += -O3 -std=c++14 -Isrc -I at capnp@/include -I at mathinc@
 CPPFLAGS += @amcppflags@
 
 UNAME_S=$(shell uname -s)


=====================================
configure.ac
=====================================
@@ -33,7 +33,7 @@ then
 	AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
 fi
 
-CPPFLAGS="-I$with_capnp/include -std=c++11"
+CPPFLAGS="-I$with_capnp/include -std=c++14"
 
 AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
 


=====================================
doc/sphinx/data.rst
=====================================
@@ -15,3 +15,71 @@ Mash: fast genome and metagenome distance estimation using MinHash
 `SRR2671867.BaAmes.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671867.BaAmes.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (157MB)
 
 `SRR2671868.Bc10987.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671868.Bc10987.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (250MB)
+
+Mash Screen: High-throughput sequence containment estimation for genome discovery
+---------------------------------------------------------------------------------
+
+Custom scripts and intermediate data:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`MashScreen_supp.tar.gz <https://obj.umiacs.umd.edu/mash/screen/MashScreen_supp.tar.gz>`_
+
+Data files:
+~~~~~~~~~~~
+
+Mash Sketch databases for RefSeq release 88:
+ * `RefSeq88n.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz>`_: Genomes (k=21, s=1000), 1.2Gb uncompressed
+ * `RefSeq88p.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88p.msh.gz>`_: Proteomes (k=9, s=1000), 1.1Gb uncompressed
+
+`art.fastq.gz <https://obj.umiacs.umd.edu/mash/screen/art.fastq.gz>`_: Simulated reads for Shakya experiment
+
+Figure 5:
+ * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
+ * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
+
+Public data sources
+~~~~~~~~~~~~~~~~~~~
+
+The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
+
+HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
+and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
+``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
+
+SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
+
+RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
+
+Public data products
+~~~~~~~~~~~~~~~~~~~~
+
+Quebec Polyomavirus is submitted to GenBank as BK010702.
+
+Screen of SRA metagenomes vs. RefSeq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ * `sra_meta_nucl_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_95idy.tsv.gz>`_ (2.3Gb uncompressed)
+ * `sra_meta_nucl_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_80idy_3x.tsv.gz>`_ (6.7Gb uncompressed)
+ * `sra_meta_prot_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_95idy.tsv.gz>`_ (2.1Gb uncompressed)
+ * `sra_meta_prot_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_80idy_3x.tsv.gz>`_ (8.3Gb uncompressed)
+
+These files have a line for each RefSeq genome listing all metagenomic SRA runs
+(as of August 2018) with Mash Containment Scores above the specified threshold.
+They are provided for two screen modes:
+
+* ``nucl``: Genomic RefSeq sequences
+* ``prot``: Proteomic RefSeq sequences (combined amino acid sequences per organism). **NOTE:** Protein tables above are not p-value filtered and thus large (> ~50Gb) runs may have spurious hits. They also do not contain plasmids. Updates coming soon!
+
+...and at two thresholds:
+
+* ``95idy``: 95% Mash Containment Score, any coverage. Useful for finding runs containing a specific genome.
+* ``80idy_3x``: 80% Mash Containment Score, at least 3x median k-mer multiplicity.
+  Useful for finding related, but novel, sequences.
+
+The files are tab separated, with each line beginning with a RefSeq assembly accession, followed by SRA accessions, for example:
+
+::
+  
+  GCF_000001215.4	SRR3401361	SRR3540373
+  GCF_000001405.36	SRR5127794	ERR1539652	SRR413753	ERR206081
+  GCF_000001405.38	SRR5127794	ERR1539652	ERR1711677	SRR413753	ERR206081


=====================================
src/mash/CommandScreen.cpp
=====================================
@@ -291,7 +291,7 @@ int CommandScreen::run() const
 	
 	for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ )
 	{
-		HashList hashList(parameters.kmerSize);
+		HashList hashList(parameters.use64);
 		
 		(*i)->toHashList(hashList);
 		
@@ -550,38 +550,6 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
 				break;
 			}
 			
-			//kmersTotal++; TODO
-			
-			if ( ! noncanonical )
-			{
-				bool debug = false;
-				useRevComp = true;
-				bool prefixEqual = true;
-	
-				if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;}
-				
-				for ( uint64_t k = 0; k < kmerSize; k++ )
-				{
-					char base = seq[j + k];
-					char baseMinus = seqRev[l - j - kmerSize + k];
-		
-					if ( debug ) cout << baseMinus;
-		
-					if ( prefixEqual && baseMinus > base )
-					{
-						useRevComp = false;
-						break;
-					}
-		
-					if ( prefixEqual && baseMinus < base )
-					{
-						prefixEqual = false;
-					}
-				}
-	
-				if ( debug ) cout << endl;
-			}
-	
 			const char * kmer;
 			
 			if ( trans )
@@ -590,14 +558,15 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
 			}
 			else
 			{
-				kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j;
+				const char *kmer_fwd = seq + j;
+				const char *kmer_rev = seqRev + length - j - kmerSize;
+				kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
 			}
 			
 			//cout << kmer << '\t' << kmerSize << endl;
 			hash_u hash = getHash(kmer, kmerSize, seed, use64);
 			//cout << kmer << '\t' << hash.hash64 << endl;
 			input->minHashHeap->tryInsert(hash);
-			
 			uint64_t key = use64 ? hash.hash64 : hash.hash32;
 			
 			if ( input->hashCounts.count(key) == 1 )


=====================================
src/mash/CommandSketch.cpp
=====================================
@@ -28,7 +28,7 @@ CommandSketch::CommandSketch()
     addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
     addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
     addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", ""));
-    addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", ""));
+    addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment).", ""));
     useSketchOptions();
 }
 


=====================================
src/mash/HashList.h
=====================================
@@ -15,7 +15,7 @@ class HashList
 public:
     
     HashList() {use64 = true;}
-    HashList(int kmerSize) {use64 = kmerSize > 16;}
+    HashList(bool use64new) {use64 = use64new;}
     
     hash_u at(int index) const;
     void clear();


=====================================
src/mash/Sketch.cpp
=====================================
@@ -536,13 +536,10 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
     
     for ( uint64_t i = 0; i < length - kmerSize + 1; i++ )
     {
-        bool useRevComp = false;
-        bool debug = false;
-        
 		// repeatedly skip kmers with bad characters
-		
+		//
 		bool bad = false;
-		
+		//
 		for ( uint64_t j = i; j < i + kmerSize && i + kmerSize <= length; j++ )
 		{
 			if ( ! parameters.alphabet[seq[j]] )
@@ -552,56 +549,25 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
 				break;
 			}
 		}
-		
+		//
 		if ( bad )
 		{
 			continue;
 		}
-	
+		//	
 		if ( i + kmerSize > length )
 		{
 			// skipped to end
 			break;
 		}
             
-        if ( ! noncanonical )
-        {
-            useRevComp = true;
-            bool prefixEqual = true;
-        
-            if ( debug ) {for ( uint64_t j = i; j < i + kmerSize; j++ ) { cout << *(seq + j); } cout << endl;}
-        
-            for ( uint64_t j = 0; j < kmerSize; j++ )
-            {
-                char base = seq[i + j];
-                char baseMinus = seqRev[length - i - kmerSize + j];
-            
-                if ( debug ) cout << baseMinus;
-            
-                if ( prefixEqual && baseMinus > base )
-                {
-                    useRevComp = false;
-                    break;
-                }
-            
-                if ( prefixEqual && baseMinus < base )
-                {
-                    prefixEqual = false;
-                }
-            }
-        
-            if ( debug ) cout << endl;
-        }
-        
         const char *kmer_fwd = seq + i;
         const char *kmer_rev = seqRev + length - i - kmerSize;
-        const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev;
+        const char * kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
         bool filter = false;
         
         hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64);
         
-        if ( debug ) cout << endl;
-        
 		minHashHeap.tryInsert(hash);
     }
     


=====================================
src/mash/sketchParameterSetup.cpp
=====================================
@@ -61,8 +61,7 @@ int sketchParameterSetup(Sketch::Parameters & parameters, const Command & comman
     
     if ( parameters.reads && command.getOption("threads").active )
     {
-    	cerr << "ERROR: The option " << command.getOption("threads").identifier << " cannot be used with " << command.getOption("reads").identifier << "." << endl;
-    	return 1;
+    	cerr << "WARNING: The option " << command.getOption("threads").identifier << " will be ignored with " << command.getOption("reads").identifier << "." << endl;
     }
     
     if ( parameters.reads && ! parameters.concatenated )


=====================================
src/mash/version.h
=====================================
@@ -4,4 +4,4 @@
 //
 // See the LICENSE.txt file included with this software for license information.
 
-static const char * version = "2.1";
+static const char * version = "2.1.1";



View it on GitLab: https://salsa.debian.org/med-team/mash/commit/32193931d8c36d5baf3e1c5965564b90fc5fd4d9

-- 
View it on GitLab: https://salsa.debian.org/med-team/mash/commit/32193931d8c36d5baf3e1c5965564b90fc5fd4d9
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190708/f774bb0c/attachment-0001.html>


More information about the debian-med-commit mailing list