[med-svn] samtools 01/11: stats: Compute checksum for names/seqs/quals
Charles Plessy
plessy at moszumanska.debian.org
Thu Aug 21 22:58:53 UTC 2014
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch develop
in repository samtools.
commit c8e1a2bf94c48787b26a0e5d4f2b4d3c488ebc07
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date: Wed Aug 13 08:48:13 2014 +0100
stats: Compute checksum for names/seqs/quals
---
stats.c | 26 ++++++++++++++++++++++++++
test/stat/1.stats.expected | 3 +++
test/stat/2.stats.expected | 3 +++
test/stat/3.stats.expected | 3 +++
test/stat/4.stats.expected | 3 +++
test/stat/5.stats.expected | 3 +++
test/stat/6.stats.expected | 3 +++
7 files changed, 44 insertions(+)
diff --git a/stats.c b/stats.c
index 8eb24e8..f72165e 100644
--- a/stats.c
+++ b/stats.c
@@ -26,6 +26,7 @@
#include <getopt.h>
#include <errno.h>
#include <assert.h>
+#include <zlib.h> // for crc32
#include <htslib/faidx.h>
#include <htslib/sam.h>
#include <htslib/hts.h>
@@ -122,6 +123,9 @@ typedef struct
uint64_t nbases_trimmed; // bwa trimmed bases
uint64_t nmismatches;
uint64_t nreads_QCfailed, nreads_secondary;
+ struct {
+ uint32_t names, reads, quals;
+ } checksum;
// GC-depth related data
uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
@@ -572,6 +576,23 @@ void realloc_buffers(stats_t *stats, int seq_len)
realloc_rseq_buffer(stats);
}
+void update_checksum(bam1_t *bam_line, stats_t *stats)
+{
+ uint8_t *name = (uint8_t*) bam_get_qname(bam_line);
+ int len = 0;
+ while ( name[len] ) len++;
+ stats->checksum.names += crc32(0L, name, len);
+
+ int seq_len = bam_line->core.l_qseq;
+ if ( !seq_len ) return;
+
+ uint8_t *seq = bam_get_seq(bam_line);
+ stats->checksum.reads += crc32(0L, seq, (seq_len+1)/2);
+
+ uint8_t *qual = bam_get_qual(bam_line);
+ stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2);
+}
+
void collect_stats(bam1_t *bam_line, stats_t *stats)
{
if ( stats->rg_hash )
@@ -599,6 +620,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++;
+ update_checksum(bam_line, stats);
+
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
@@ -909,6 +932,9 @@ void output_stats(stats_t *stats, int sparse)
for (i=1; i<stats->argc; i++)
printf(" %s",stats->argv[i]);
printf("\n");
+ printf("# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
+ printf("# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
+ printf("CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below)
printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
diff --git a/test/stat/1.stats.expected b/test/stat/1.stats.expected
index fa8a3b9..0604069 100644
--- a/test/stat/1.stats.expected
+++ b/test/stat/1.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 29c426ae 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
diff --git a/test/stat/2.stats.expected b/test/stat/2.stats.expected
index a017dad..2024271 100644
--- a/test/stat/2.stats.expected
+++ b/test/stat/2.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 29c426ae 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
diff --git a/test/stat/3.stats.expected b/test/stat/3.stats.expected
index bd2e000..157c321 100644
--- a/test/stat/3.stats.expected
+++ b/test/stat/3.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 ce379e9a 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
diff --git a/test/stat/4.stats.expected b/test/stat/4.stats.expected
index 8aafd23..d6c5622 100644
--- a/test/stat/4.stats.expected
+++ b/test/stat/4.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 e7724556 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
diff --git a/test/stat/5.stats.expected b/test/stat/5.stats.expected
index 4e0ffcd..6588d66 100644
--- a/test/stat/5.stats.expected
+++ b/test/stat/5.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 32507d92 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
diff --git a/test/stat/6.stats.expected b/test/stat/6.stats.expected
index 4e0ffcd..6588d66 100644
--- a/test/stat/6.stats.expected
+++ b/test/stat/6.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK 1a1c1362 32507d92 7bab45da
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN raw total sequences: 2
SN filtered sequences: 0
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git
More information about the debian-med-commit
mailing list