[med-svn] [Git][med-team/tantan][master] 3 commits: New upstream version 39

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Sat Jul 30 17:27:07 BST 2022



Nilesh Patra pushed to branch master at Debian Med / tantan


Commits:
12c8b554 by Nilesh Patra at 2022-07-30T21:50:00+05:30
New upstream version 39
- - - - -
504a5cf6 by Nilesh Patra at 2022-07-30T21:50:01+05:30
Update upstream source from tag 'upstream/39'

Update to upstream version '39'
with Debian dir bb2c9687829ebdbc69ad414798029b1c9fb1eb7f
- - - - -
62239853 by Nilesh Patra at 2022-07-30T21:52:41+05:30
Upload to unstable

- - - - -


10 changed files:

- README.rst
- debian/changelog
- src/Makefile
- src/mcf_fasta_sequence.cc
- src/mcf_tantan_options.cc
- src/tantan_app.cc
- src/tantan_repeat_finder.cc
- src/tantan_repeat_finder.hh
- test/tantan_test.out
- test/tantan_test.sh


Changes:

=====================================
README.rst
=====================================
@@ -143,9 +143,9 @@ Options
 -w  maximum tandem repeat period to consider
 -d  probability decay per period (period-(i+1) / period-i)
 -i  match score
--j  mismatch cost
+-j  mismatch cost (as a special case, 0 means no mismatches)
 -a  gap existence cost
--b  gap extension cost
+-b  gap extension cost (as a special case, 0 means no gaps)
 -s  minimum repeat probability for masking
 -n  minimum copy number, affects -f4 only
 -f  output type: 0=masked sequence, 1=repeat probabilities,
@@ -195,6 +195,16 @@ this is just a representative).  Column 7 shows the whole repeat:
 lowercase letters are insertions relative to the previous repeat unit,
 and dashes are deletions relative to the previous repeat unit.
 
+You can forbid insertions and deletions (which is faster) with
+``-b0``::
+
+  tantan -f4 -b0 seqs.fa
+
+You can also forbid mismatches with ``-j0``, so this gets exact
+repeats only::
+
+  tantan -f4 -b0 -j0 seqs.fa
+
 Miscellaneous
 -------------
 


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+tantan (39-1) unstable; urgency=medium
+
+  * Team upload.
+  * New upstream version 39
+
+ -- Nilesh Patra <nilesh at debian.org>  Sat, 30 Jul 2022 21:52:32 +0530
+
 tantan (31-1) unstable; urgency=medium
 
   * Team upload.


=====================================
src/Makefile
=====================================
@@ -10,7 +10,7 @@ clean:
 	rm -f ../bin/tantan
 
 VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 31) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 39) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
 
 VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
 


=====================================
src/mcf_fasta_sequence.cc
=====================================
@@ -2,98 +2,84 @@
 
 #include "mcf_fasta_sequence.hh"
 
-#include <cctype>  // isspace
+#include <stddef.h>
+
 //#include <iostream>  // for debugging
 #include <istream>
-#include <iterator>  // istreambuf_iterator, ostreambuf_iterator
 #include <ostream>
+#include <streambuf>
 
 namespace mcf {
 
 static void readSequence(std::istream &s, std::vector<uchar> &sequence,
                          char delimiter) {
   if (!s) return;
-  std::istreambuf_iterator<char> inpos(s);
-  std::istreambuf_iterator<char> endpos;
-  while (inpos != endpos) {
-    char c = *inpos;
-    if (c == delimiter) break;
-    uchar u = static_cast<uchar>(c);
-    if (!std::isspace(u)) sequence.push_back(u);
-    ++inpos;
+  std::streambuf *b = s.rdbuf();
+  int c = b->sgetc();
+  while (c != std::streambuf::traits_type::eof()) {
+    if (c > ' ') {
+      if (c == delimiter) break;
+      sequence.push_back(c);
+    }
+    c = b->snextc();
   }
 }
 
 static void readQualityCodes(std::istream &s, std::vector<uchar> &qualityCodes,
                              std::vector<uchar>::size_type howMany) {
   if (!s) return;
-  std::istreambuf_iterator<char> inpos(s);
-  std::istreambuf_iterator<char> endpos;
-  while (inpos != endpos) {
-    if (qualityCodes.size() == howMany) break;
-    char c = *inpos;
-    uchar u = static_cast<uchar>(c);
-    if (!std::isspace(u)) qualityCodes.push_back(u);
-    ++inpos;
+  std::streambuf *b = s.rdbuf();
+  while (howMany > 0) {
+    int c = b->sbumpc();
+    if (c == std::streambuf::traits_type::eof()) break;  // xxx ???
+    if (c > ' ') {
+      qualityCodes.push_back(c);
+      --howMany;
+    }
   }
 }
 
 std::istream &operator>>(std::istream &s, FastaSequence &f) {
-  std::string title;
-  std::vector<uchar> sequence;
-  std::string secondTitle;
-  std::vector<uchar> qualityCodes;
-
   char firstChar = '>';
   s >> firstChar;
   if (firstChar != '>' && firstChar != '@') s.setstate(std::ios::failbit);
-  getline(s, title);
+  if (!s) return s;
+
+  f.title.clear();
+  f.sequence.clear();
+  f.secondTitle.clear();
+  f.qualityCodes.clear();
+
+  getline(s, f.title);
 
   if (firstChar == '>') {
-    readSequence(s, sequence, '>');
+    readSequence(s, f.sequence, '>');
   } else {
-    readSequence(s, sequence, '+');
-    char secondChar;
-    s >> secondChar;
-    getline(s, secondTitle);
-    readQualityCodes(s, qualityCodes, sequence.size());
+    readSequence(s, f.sequence, '+');
+    s >> firstChar;
+    getline(s, f.secondTitle);
+    readQualityCodes(s, f.qualityCodes, f.sequence.size());
     // perhaps check whether we read enough quality codes
   }
 
-  if (!s) return s;
-
-  f.title.swap(title);
-  f.sequence.swap(sequence);
-  f.secondTitle.swap(secondTitle);
-  f.qualityCodes.swap(qualityCodes);
-
   return s;
 }
 
 static void writeOneLine(std::ostream &s, const std::vector<uchar> &v) {
-  std::ostreambuf_iterator<char> o(s);
-  std::vector<uchar>::const_iterator b = v.begin();
-  std::vector<uchar>::const_iterator e = v.end();
-  while (b != e) {
-    o = static_cast<char>(*b);
-    ++b;
-  }
-  o = '\n';
+  std::streambuf *b = s.rdbuf();
+  size_t size = v.size();
+  if (size) b->sputn(reinterpret_cast<const char *>(&v[0]), size);
+  b->sputc('\n');
 }
 
 static void writeMultiLines(std::ostream &s, const std::vector<uchar> &v) {
-  const int lettersPerLine = 50;  // ?
-  std::ostreambuf_iterator<char> o(s);
-  std::vector<uchar>::const_iterator b = v.begin();
-  std::vector<uchar>::const_iterator e = v.end();
-  std::vector<uchar>::const_iterator i = b;
-  while (i != e) {
-    o = static_cast<char>(*i);
-    ++i;
-    if (i - b == lettersPerLine || i == e) {
-      o = '\n';
-      b = i;
-    }
+  size_t lettersPerLine = 50;  // ?
+  std::streambuf *b = s.rdbuf();
+  size_t size = v.size();
+  for (size_t i = 0; i < size; i += lettersPerLine) {
+    if (size - i < lettersPerLine) lettersPerLine = size - i;
+    b->sputn(reinterpret_cast<const char *>(&v[i]), lettersPerLine);
+    b->sputc('\n');
   }
 }
 


=====================================
src/mcf_tantan_options.cc
=====================================
@@ -6,6 +6,8 @@
 
 #include <unistd.h>
 
+#include <limits.h>
+
 #include <cstdlib>  // EXIT_SUCCESS
 #include <iostream>
 #include <stdexcept>
@@ -53,7 +55,7 @@ TantanOptions::TantanOptions() :
     maxCycleLength(-1),  // depends on isProtein
     repeatOffsetProbDecay(0.9),
     matchScore(0),
-    mismatchCost(0),
+    mismatchCost(-1),
     gapExistenceCost(0),
     gapExtensionCost(-1),  // means: no gaps
     minMaskProb(0.5),
@@ -79,10 +81,10 @@ Options (default settings):\n\
  -d  probability decay per period ("
       + stringify(repeatOffsetProbDecay) + ")\n\
  -i  match score (BLOSUM62 if -p, else 2 if -f4, else 1)\n\
- -j  mismatch cost (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
+ -j  mismatch cost, 0 means infinite (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
  -a  gap existence cost ("
       + stringify(gapExistenceCost) + ")\n\
- -b  gap extension cost (7 if -f4, else infinity)\n\
+ -b  gap extension cost, 0 means no gaps (7 if -f4, else 0)\n\
  -s  minimum repeat probability for masking ("
       + stringify(minMaskProb) + ")\n\
  -n  minimum copy number, affects -f4 only ("
@@ -144,7 +146,7 @@ Options (default settings):\n\
 	break;
       case 'j':
 	unstringify(mismatchCost, optarg);
-	if (mismatchCost <= 0)
+	if (mismatchCost < 0)
 	  badopt(c, optarg);
 	break;
       case 'a':
@@ -152,7 +154,7 @@ Options (default settings):\n\
         break;
       case 'b':
         unstringify(gapExtensionCost, optarg);
-        if (gapExtensionCost <= 0)
+        if (gapExtensionCost < 0)
           badopt(c, optarg);
         break;
       case 's':
@@ -179,9 +181,11 @@ Options (default settings):\n\
 
   if (maxCycleLength < 0) maxCycleLength = (isProtein ? 50 : 100);
 
-  if (!isProtein || matchScore || mismatchCost) {
-    if (matchScore   == 0) matchScore   = (outputType == repOut ? 2 : 1);
-    if (mismatchCost == 0) mismatchCost = (outputType == repOut ? 7 : 1);
+  if (mismatchCost == 0) mismatchCost = INT_MAX;
+
+  if (!isProtein || matchScore > 0 || mismatchCost > 0) {
+    if (matchScore   < 1) matchScore   = (outputType == repOut ? 2 : 1);
+    if (mismatchCost < 1) mismatchCost = (outputType == repOut ? 7 : 1);
   }
 
   indexOfFirstNonOptionArgument = optind;


=====================================
src/tantan_app.cc
=====================================
@@ -78,7 +78,7 @@ void initScoresAndProbabilities() {
   if (options.scoreMatrixFileName) {
     unfilify(scoreMatrix, options.scoreMatrixFileName);
   } else if (options.isProtein) {
-    if (options.matchScore && options.mismatchCost) {
+    if (options.matchScore > 0 && options.mismatchCost > 0) {
       scoreMatrix.initMatchMismatch(options.matchScore, options.mismatchCost,
 				    Alphabet::protein);
     } else {
@@ -342,15 +342,8 @@ void processOneSequence(FastaSequence &f, std::ostream &output) {
 
 void processOneFile(std::istream &input, std::ostream &output) {
   bool isFirstSequence = true;
-
-  // This code strives to minimize memory usage.  The sequence-reading
-  // operation does not overwrite the sequence until it finishes
-  // reading successfully.  So, we don't want to overwrite a large,
-  // old sequence.  Hence, we make a brand-new FastaSequence each time
-  // through the loop.
-  while (true) {
-    FastaSequence f;
-    if (!(input >> f)) break;
+  FastaSequence f;
+  while (input >> f) {
     if (isFirstSequence && !options.isProtein &&
         isDubiousDna(BEG(f.sequence), END(f.sequence)))
       std::cerr << "tantan: that's some funny-lookin DNA\n";


=====================================
src/tantan_repeat_finder.cc
=====================================
@@ -120,25 +120,6 @@ void RepeatFinder::calcBackwardTransitionScoresWithGaps() {
   scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
 }
 
-void RepeatFinder::calcBackwardTransitionScores() {
-  if (endGapScore > -HUGE_VAL) return calcBackwardTransitionScoresWithGaps();
-
-  double toBackground = f2b + scoresPtr[0];
-  double toForeground = -HUGE_VAL;
-  double *foregroundPtr = scoresPtr + 1;
-  double *foregroundEnd = foregroundPtr + maxRepeatOffset;
-
-  while (foregroundPtr < foregroundEnd) {
-    toForeground += b2fGrowth;
-    double f = *foregroundPtr;
-    toForeground = std::max(toForeground, f);
-    *foregroundPtr = std::max(toBackground, f2f0 + f);
-    ++foregroundPtr;
-  }
-
-  scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
-}
-
 void RepeatFinder::calcEmissionScores() {
   const double *matrixRow = substitutionMatrix[*seqPtr];
   double *oldScores = scoresPtr - dpScoresPerLetter;
@@ -158,6 +139,36 @@ void RepeatFinder::calcEmissionScores() {
   std::copy(oldScores + i, scoresPtr, scoresPtr + i);
 }
 
+void RepeatFinder::calcScoresForOneSequencePosition() {
+  if (endGapScore > -HUGE_VAL) {
+    calcEmissionScores();
+    calcBackwardTransitionScoresWithGaps();
+    return;
+  }
+
+  double *oldScores = scoresPtr - dpScoresPerLetter;
+  double toBackground = f2b + oldScores[0];
+  const double *matrixRow = substitutionMatrix[*seqPtr];
+  int maxOffset = maxOffsetInTheSequence();
+  double toForeground = -HUGE_VAL;
+  int i = 1;
+
+  for (; i <= maxOffset; ++i) {
+    double f = oldScores[i] + matrixRow[seqPtr[-i]];
+    toForeground = std::max(toForeground + b2fGrowth, f);
+    scoresPtr[i] = std::max(toBackground, f2f0 + f);
+  }
+
+  for (; i <= maxRepeatOffset; ++i) {
+    toForeground += b2fGrowth;
+    scoresPtr[i] = toBackground;
+  }
+
+  std::copy(oldScores + i, scoresPtr, scoresPtr + i);
+
+  scoresPtr[0] = std::max(b2b + oldScores[0], b2fLast + toForeground);
+}
+
 void RepeatFinder::makeCheckpoint() {
   checkpoint += dpScoresPerLetter;
   std::copy(scoresPtr - dpScoresPerLetter, scoresPtr, checkpoint);


=====================================
src/tantan_repeat_finder.hh
=====================================
@@ -69,8 +69,8 @@ private:
 
   void initializeBackwardScores();
   void calcBackwardTransitionScoresWithGaps();
-  void calcBackwardTransitionScores();
   void calcEmissionScores();
+  void calcScoresForOneSequencePosition();
   void makeCheckpoint();
   void redoCheckpoint();
   int offsetWithMaxScore() const;
@@ -87,11 +87,6 @@ private:
   double scoreWithEmission(const double *matrixRow, int offset) const {
     return scoresPtr[offset] + matrixRow[seqPtr[-offset]];
   }
-
-  void calcScoresForOneSequencePosition() {
-    calcEmissionScores();
-    calcBackwardTransitionScores();
-  }
 };
 
 }


=====================================
test/tantan_test.out
=====================================
@@ -1513,3 +1513,23 @@ chrM	8271	8290	9	2.11111	ACCCCCTCT	ACCCCCTCT,ACCCCCTCT,A
 chrM	13647	13691	30	1.48276	CCCCACCCTTACTAACATTAACGAAAATAA	CCCCACCCTTACTAACATTAACGAAAATAA,CCCCACCCT-ACTAA
 chrM	15829	15844	6	2.5	AATCCT	AATCCT,AATCCT,AAT
 chrM	16183	16195	1	12	C	C,C,C,C,C,C,C,C,C,C,C,C
+
+SRR019778.4	6	35	7	4.14286	TTGTGTG	TGTGTAT,TGTGTGT,TGTGTGT,GGTGTGT,G
+SRR019778.42	3	44	4	10.25	TGTG	TGTG,TGTG,TGTG,TGTG,TGTG,TGTG,TGTT,TGTT,TTTT,TTTT,T
+SRR019778.50	0	21	7	3	GTGTGTT	GTGTGTT,GTGTGTT,GAGTGTT
+SRR019778.64	30	44	3	4.66667	GAG	GAG,GAG,GAG,GAG,GA
+SRR019778.65	2	13	2	5.5	TG	TG,TG,TG,TG,TG,T
+SRR019778.65	4	37	16	2.0625	TGTGTGTGTTTTCTGT	TGTGTGTGTTTTCTGT,TGTGTGTGTTTTTTGT,T
+SRR019778.78	9	31	10	2.2	TGCCTTACTA	TGCCTTACTA,TGCCTTACTA,TG
+SRR019778.95	2	18	4	4	TGAT	TGAT,TGAT,TGAT,TGAT
+SRR019778.95	22	45	4	5.75	ATAG	ATAG,ATAG,ATAG,ATAG,ATAG,ATA
+
+SRR019778.4	28	38	2	5	GT	GT,GT,GT,GT,GT
+SRR019778.42	3	30	2	13.5	TG	TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,T
+SRR019778.42	33	44	1	11	T	T,T,T,T,T,T,T,T,T,T,T
+SRR019778.45	18	37	9	2.11111	TGTGTGTGT	TGTGTGTGT,TGTGTGTGT,T
+SRR019778.64	30	44	3	4.66667	GAG	GAG,GAG,GAG,GAG,GA
+SRR019778.65	2	13	2	5.5	TG	TG,TG,TG,TG,TG,T
+SRR019778.78	9	31	10	2.2	TGCCTTACTA	TGCCTTACTA,TGCCTTACTA,TG
+SRR019778.95	2	18	4	4	TGAT	TGAT,TGAT,TGAT,TGAT
+SRR019778.95	22	45	4	5.75	ATAG	ATAG,ATAG,ATAG,ATAG,ATAG,ATA


=====================================
test/tantan_test.sh
=====================================
@@ -35,5 +35,8 @@ countLowercaseLetters () {
     tantan -f4 -b12 hard.fa
     echo
     tantan -f4 -n1 hg19_chrM.fa
-} 2>&1 |
-diff -u tantan_test.out -
+    echo
+    tantan -f4 -b0 panda.fastq
+    echo
+    tantan -f4 -b0 -j0 panda.fastq
+} 2>&1 | diff -u tantan_test.out -



View it on GitLab: https://salsa.debian.org/med-team/tantan/-/compare/c9b5da3096c0e42186405a47d164108a4c976e72...62239853d177b9ec34380ad61203ad3ffcb6fd77

-- 
View it on GitLab: https://salsa.debian.org/med-team/tantan/-/compare/c9b5da3096c0e42186405a47d164108a4c976e72...62239853d177b9ec34380ad61203ad3ffcb6fd77
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220730/78b2337b/attachment-0001.htm>


More information about the debian-med-commit mailing list