[med-svn] [Git][med-team/tantan][master] 3 commits: New upstream version 39
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Sat Jul 30 17:27:07 BST 2022
Nilesh Patra pushed to branch master at Debian Med / tantan
Commits:
12c8b554 by Nilesh Patra at 2022-07-30T21:50:00+05:30
New upstream version 39
- - - - -
504a5cf6 by Nilesh Patra at 2022-07-30T21:50:01+05:30
Update upstream source from tag 'upstream/39'
Update to upstream version '39'
with Debian dir bb2c9687829ebdbc69ad414798029b1c9fb1eb7f
- - - - -
62239853 by Nilesh Patra at 2022-07-30T21:52:41+05:30
Upload to unstable
- - - - -
10 changed files:
- README.rst
- debian/changelog
- src/Makefile
- src/mcf_fasta_sequence.cc
- src/mcf_tantan_options.cc
- src/tantan_app.cc
- src/tantan_repeat_finder.cc
- src/tantan_repeat_finder.hh
- test/tantan_test.out
- test/tantan_test.sh
Changes:
=====================================
README.rst
=====================================
@@ -143,9 +143,9 @@ Options
-w maximum tandem repeat period to consider
-d probability decay per period (period-(i+1) / period-i)
-i match score
--j mismatch cost
+-j mismatch cost (as a special case, 0 means no mismatches)
-a gap existence cost
--b gap extension cost
+-b gap extension cost (as a special case, 0 means no gaps)
-s minimum repeat probability for masking
-n minimum copy number, affects -f4 only
-f output type: 0=masked sequence, 1=repeat probabilities,
@@ -195,6 +195,16 @@ this is just a representative). Column 7 shows the whole repeat:
lowercase letters are insertions relative to the previous repeat unit,
and dashes are deletions relative to the previous repeat unit.
+You can forbid insertions and deletions (which is faster) with
+``-b0``::
+
+ tantan -f4 -b0 seqs.fa
+
+You can also forbid mismatches with ``-j0``, so this gets exact
+repeats only::
+
+ tantan -f4 -b0 -j0 seqs.fa
+
Miscellaneous
-------------
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+tantan (39-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version 39
+
+ -- Nilesh Patra <nilesh at debian.org> Sat, 30 Jul 2022 21:52:32 +0530
+
tantan (31-1) unstable; urgency=medium
* Team upload.
=====================================
src/Makefile
=====================================
@@ -10,7 +10,7 @@ clean:
rm -f ../bin/tantan
VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 31) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 39) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
=====================================
src/mcf_fasta_sequence.cc
=====================================
@@ -2,98 +2,84 @@
#include "mcf_fasta_sequence.hh"
-#include <cctype> // isspace
+#include <stddef.h>
+
//#include <iostream> // for debugging
#include <istream>
-#include <iterator> // istreambuf_iterator, ostreambuf_iterator
#include <ostream>
+#include <streambuf>
namespace mcf {
static void readSequence(std::istream &s, std::vector<uchar> &sequence,
char delimiter) {
if (!s) return;
- std::istreambuf_iterator<char> inpos(s);
- std::istreambuf_iterator<char> endpos;
- while (inpos != endpos) {
- char c = *inpos;
- if (c == delimiter) break;
- uchar u = static_cast<uchar>(c);
- if (!std::isspace(u)) sequence.push_back(u);
- ++inpos;
+ std::streambuf *b = s.rdbuf();
+ int c = b->sgetc();
+ while (c != std::streambuf::traits_type::eof()) {
+ if (c > ' ') {
+ if (c == delimiter) break;
+ sequence.push_back(c);
+ }
+ c = b->snextc();
}
}
static void readQualityCodes(std::istream &s, std::vector<uchar> &qualityCodes,
std::vector<uchar>::size_type howMany) {
if (!s) return;
- std::istreambuf_iterator<char> inpos(s);
- std::istreambuf_iterator<char> endpos;
- while (inpos != endpos) {
- if (qualityCodes.size() == howMany) break;
- char c = *inpos;
- uchar u = static_cast<uchar>(c);
- if (!std::isspace(u)) qualityCodes.push_back(u);
- ++inpos;
+ std::streambuf *b = s.rdbuf();
+ while (howMany > 0) {
+ int c = b->sbumpc();
+ if (c == std::streambuf::traits_type::eof()) break; // xxx ???
+ if (c > ' ') {
+ qualityCodes.push_back(c);
+ --howMany;
+ }
}
}
std::istream &operator>>(std::istream &s, FastaSequence &f) {
- std::string title;
- std::vector<uchar> sequence;
- std::string secondTitle;
- std::vector<uchar> qualityCodes;
-
char firstChar = '>';
s >> firstChar;
if (firstChar != '>' && firstChar != '@') s.setstate(std::ios::failbit);
- getline(s, title);
+ if (!s) return s;
+
+ f.title.clear();
+ f.sequence.clear();
+ f.secondTitle.clear();
+ f.qualityCodes.clear();
+
+ getline(s, f.title);
if (firstChar == '>') {
- readSequence(s, sequence, '>');
+ readSequence(s, f.sequence, '>');
} else {
- readSequence(s, sequence, '+');
- char secondChar;
- s >> secondChar;
- getline(s, secondTitle);
- readQualityCodes(s, qualityCodes, sequence.size());
+ readSequence(s, f.sequence, '+');
+ s >> firstChar;
+ getline(s, f.secondTitle);
+ readQualityCodes(s, f.qualityCodes, f.sequence.size());
// perhaps check whether we read enough quality codes
}
- if (!s) return s;
-
- f.title.swap(title);
- f.sequence.swap(sequence);
- f.secondTitle.swap(secondTitle);
- f.qualityCodes.swap(qualityCodes);
-
return s;
}
static void writeOneLine(std::ostream &s, const std::vector<uchar> &v) {
- std::ostreambuf_iterator<char> o(s);
- std::vector<uchar>::const_iterator b = v.begin();
- std::vector<uchar>::const_iterator e = v.end();
- while (b != e) {
- o = static_cast<char>(*b);
- ++b;
- }
- o = '\n';
+ std::streambuf *b = s.rdbuf();
+ size_t size = v.size();
+ if (size) b->sputn(reinterpret_cast<const char *>(&v[0]), size);
+ b->sputc('\n');
}
static void writeMultiLines(std::ostream &s, const std::vector<uchar> &v) {
- const int lettersPerLine = 50; // ?
- std::ostreambuf_iterator<char> o(s);
- std::vector<uchar>::const_iterator b = v.begin();
- std::vector<uchar>::const_iterator e = v.end();
- std::vector<uchar>::const_iterator i = b;
- while (i != e) {
- o = static_cast<char>(*i);
- ++i;
- if (i - b == lettersPerLine || i == e) {
- o = '\n';
- b = i;
- }
+ size_t lettersPerLine = 50; // ?
+ std::streambuf *b = s.rdbuf();
+ size_t size = v.size();
+ for (size_t i = 0; i < size; i += lettersPerLine) {
+ if (size - i < lettersPerLine) lettersPerLine = size - i;
+ b->sputn(reinterpret_cast<const char *>(&v[i]), lettersPerLine);
+ b->sputc('\n');
}
}
=====================================
src/mcf_tantan_options.cc
=====================================
@@ -6,6 +6,8 @@
#include <unistd.h>
+#include <limits.h>
+
#include <cstdlib> // EXIT_SUCCESS
#include <iostream>
#include <stdexcept>
@@ -53,7 +55,7 @@ TantanOptions::TantanOptions() :
maxCycleLength(-1), // depends on isProtein
repeatOffsetProbDecay(0.9),
matchScore(0),
- mismatchCost(0),
+ mismatchCost(-1),
gapExistenceCost(0),
gapExtensionCost(-1), // means: no gaps
minMaskProb(0.5),
@@ -79,10 +81,10 @@ Options (default settings):\n\
-d probability decay per period ("
+ stringify(repeatOffsetProbDecay) + ")\n\
-i match score (BLOSUM62 if -p, else 2 if -f4, else 1)\n\
- -j mismatch cost (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
+ -j mismatch cost, 0 means infinite (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
-a gap existence cost ("
+ stringify(gapExistenceCost) + ")\n\
- -b gap extension cost (7 if -f4, else infinity)\n\
+ -b gap extension cost, 0 means no gaps (7 if -f4, else 0)\n\
-s minimum repeat probability for masking ("
+ stringify(minMaskProb) + ")\n\
-n minimum copy number, affects -f4 only ("
@@ -144,7 +146,7 @@ Options (default settings):\n\
break;
case 'j':
unstringify(mismatchCost, optarg);
- if (mismatchCost <= 0)
+ if (mismatchCost < 0)
badopt(c, optarg);
break;
case 'a':
@@ -152,7 +154,7 @@ Options (default settings):\n\
break;
case 'b':
unstringify(gapExtensionCost, optarg);
- if (gapExtensionCost <= 0)
+ if (gapExtensionCost < 0)
badopt(c, optarg);
break;
case 's':
@@ -179,9 +181,11 @@ Options (default settings):\n\
if (maxCycleLength < 0) maxCycleLength = (isProtein ? 50 : 100);
- if (!isProtein || matchScore || mismatchCost) {
- if (matchScore == 0) matchScore = (outputType == repOut ? 2 : 1);
- if (mismatchCost == 0) mismatchCost = (outputType == repOut ? 7 : 1);
+ if (mismatchCost == 0) mismatchCost = INT_MAX;
+
+ if (!isProtein || matchScore > 0 || mismatchCost > 0) {
+ if (matchScore < 1) matchScore = (outputType == repOut ? 2 : 1);
+ if (mismatchCost < 1) mismatchCost = (outputType == repOut ? 7 : 1);
}
indexOfFirstNonOptionArgument = optind;
=====================================
src/tantan_app.cc
=====================================
@@ -78,7 +78,7 @@ void initScoresAndProbabilities() {
if (options.scoreMatrixFileName) {
unfilify(scoreMatrix, options.scoreMatrixFileName);
} else if (options.isProtein) {
- if (options.matchScore && options.mismatchCost) {
+ if (options.matchScore > 0 && options.mismatchCost > 0) {
scoreMatrix.initMatchMismatch(options.matchScore, options.mismatchCost,
Alphabet::protein);
} else {
@@ -342,15 +342,8 @@ void processOneSequence(FastaSequence &f, std::ostream &output) {
void processOneFile(std::istream &input, std::ostream &output) {
bool isFirstSequence = true;
-
- // This code strives to minimize memory usage. The sequence-reading
- // operation does not overwrite the sequence until it finishes
- // reading successfully. So, we don't want to overwrite a large,
- // old sequence. Hence, we make a brand-new FastaSequence each time
- // through the loop.
- while (true) {
- FastaSequence f;
- if (!(input >> f)) break;
+ FastaSequence f;
+ while (input >> f) {
if (isFirstSequence && !options.isProtein &&
isDubiousDna(BEG(f.sequence), END(f.sequence)))
std::cerr << "tantan: that's some funny-lookin DNA\n";
=====================================
src/tantan_repeat_finder.cc
=====================================
@@ -120,25 +120,6 @@ void RepeatFinder::calcBackwardTransitionScoresWithGaps() {
scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
}
-void RepeatFinder::calcBackwardTransitionScores() {
- if (endGapScore > -HUGE_VAL) return calcBackwardTransitionScoresWithGaps();
-
- double toBackground = f2b + scoresPtr[0];
- double toForeground = -HUGE_VAL;
- double *foregroundPtr = scoresPtr + 1;
- double *foregroundEnd = foregroundPtr + maxRepeatOffset;
-
- while (foregroundPtr < foregroundEnd) {
- toForeground += b2fGrowth;
- double f = *foregroundPtr;
- toForeground = std::max(toForeground, f);
- *foregroundPtr = std::max(toBackground, f2f0 + f);
- ++foregroundPtr;
- }
-
- scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
-}
-
void RepeatFinder::calcEmissionScores() {
const double *matrixRow = substitutionMatrix[*seqPtr];
double *oldScores = scoresPtr - dpScoresPerLetter;
@@ -158,6 +139,36 @@ void RepeatFinder::calcEmissionScores() {
std::copy(oldScores + i, scoresPtr, scoresPtr + i);
}
+void RepeatFinder::calcScoresForOneSequencePosition() {
+ if (endGapScore > -HUGE_VAL) {
+ calcEmissionScores();
+ calcBackwardTransitionScoresWithGaps();
+ return;
+ }
+
+ double *oldScores = scoresPtr - dpScoresPerLetter;
+ double toBackground = f2b + oldScores[0];
+ const double *matrixRow = substitutionMatrix[*seqPtr];
+ int maxOffset = maxOffsetInTheSequence();
+ double toForeground = -HUGE_VAL;
+ int i = 1;
+
+ for (; i <= maxOffset; ++i) {
+ double f = oldScores[i] + matrixRow[seqPtr[-i]];
+ toForeground = std::max(toForeground + b2fGrowth, f);
+ scoresPtr[i] = std::max(toBackground, f2f0 + f);
+ }
+
+ for (; i <= maxRepeatOffset; ++i) {
+ toForeground += b2fGrowth;
+ scoresPtr[i] = toBackground;
+ }
+
+ std::copy(oldScores + i, scoresPtr, scoresPtr + i);
+
+ scoresPtr[0] = std::max(b2b + oldScores[0], b2fLast + toForeground);
+}
+
void RepeatFinder::makeCheckpoint() {
checkpoint += dpScoresPerLetter;
std::copy(scoresPtr - dpScoresPerLetter, scoresPtr, checkpoint);
=====================================
src/tantan_repeat_finder.hh
=====================================
@@ -69,8 +69,8 @@ private:
void initializeBackwardScores();
void calcBackwardTransitionScoresWithGaps();
- void calcBackwardTransitionScores();
void calcEmissionScores();
+ void calcScoresForOneSequencePosition();
void makeCheckpoint();
void redoCheckpoint();
int offsetWithMaxScore() const;
@@ -87,11 +87,6 @@ private:
double scoreWithEmission(const double *matrixRow, int offset) const {
return scoresPtr[offset] + matrixRow[seqPtr[-offset]];
}
-
- void calcScoresForOneSequencePosition() {
- calcEmissionScores();
- calcBackwardTransitionScores();
- }
};
}
=====================================
test/tantan_test.out
=====================================
@@ -1513,3 +1513,23 @@ chrM 8271 8290 9 2.11111 ACCCCCTCT ACCCCCTCT,ACCCCCTCT,A
chrM 13647 13691 30 1.48276 CCCCACCCTTACTAACATTAACGAAAATAA CCCCACCCTTACTAACATTAACGAAAATAA,CCCCACCCT-ACTAA
chrM 15829 15844 6 2.5 AATCCT AATCCT,AATCCT,AAT
chrM 16183 16195 1 12 C C,C,C,C,C,C,C,C,C,C,C,C
+
+SRR019778.4 6 35 7 4.14286 TTGTGTG TGTGTAT,TGTGTGT,TGTGTGT,GGTGTGT,G
+SRR019778.42 3 44 4 10.25 TGTG TGTG,TGTG,TGTG,TGTG,TGTG,TGTG,TGTT,TGTT,TTTT,TTTT,T
+SRR019778.50 0 21 7 3 GTGTGTT GTGTGTT,GTGTGTT,GAGTGTT
+SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA
+SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T
+SRR019778.65 4 37 16 2.0625 TGTGTGTGTTTTCTGT TGTGTGTGTTTTCTGT,TGTGTGTGTTTTTTGT,T
+SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG
+SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT
+SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA
+
+SRR019778.4 28 38 2 5 GT GT,GT,GT,GT,GT
+SRR019778.42 3 30 2 13.5 TG TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,T
+SRR019778.42 33 44 1 11 T T,T,T,T,T,T,T,T,T,T,T
+SRR019778.45 18 37 9 2.11111 TGTGTGTGT TGTGTGTGT,TGTGTGTGT,T
+SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA
+SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T
+SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG
+SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT
+SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA
=====================================
test/tantan_test.sh
=====================================
@@ -35,5 +35,8 @@ countLowercaseLetters () {
tantan -f4 -b12 hard.fa
echo
tantan -f4 -n1 hg19_chrM.fa
-} 2>&1 |
-diff -u tantan_test.out -
+ echo
+ tantan -f4 -b0 panda.fastq
+ echo
+ tantan -f4 -b0 -j0 panda.fastq
+} 2>&1 | diff -u tantan_test.out -
View it on GitLab: https://salsa.debian.org/med-team/tantan/-/compare/c9b5da3096c0e42186405a47d164108a4c976e72...62239853d177b9ec34380ad61203ad3ffcb6fd77
--
View it on GitLab: https://salsa.debian.org/med-team/tantan/-/compare/c9b5da3096c0e42186405a47d164108a4c976e72...62239853d177b9ec34380ad61203ad3ffcb6fd77
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220730/78b2337b/attachment-0001.htm>
More information about the debian-med-commit
mailing list