[med-svn] samtools 01/11: stats: Compute checksum for names/seqs/quals

Charles Plessy plessy at moszumanska.debian.org
Thu Aug 21 22:58:53 UTC 2014


This is an automated email from the git hooks/post-receive script.

plessy pushed a commit to branch develop
in repository samtools.

commit c8e1a2bf94c48787b26a0e5d4f2b4d3c488ebc07
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date:   Wed Aug 13 08:48:13 2014 +0100

    stats: Compute checksum for names/seqs/quals
---
 stats.c                    | 26 ++++++++++++++++++++++++++
 test/stat/1.stats.expected |  3 +++
 test/stat/2.stats.expected |  3 +++
 test/stat/3.stats.expected |  3 +++
 test/stat/4.stats.expected |  3 +++
 test/stat/5.stats.expected |  3 +++
 test/stat/6.stats.expected |  3 +++
 7 files changed, 44 insertions(+)

diff --git a/stats.c b/stats.c
index 8eb24e8..f72165e 100644
--- a/stats.c
+++ b/stats.c
@@ -26,6 +26,7 @@
 #include <getopt.h>
 #include <errno.h>
 #include <assert.h>
+#include <zlib.h>   // for crc32
 #include <htslib/faidx.h>
 #include <htslib/sam.h>
 #include <htslib/hts.h>
@@ -122,6 +123,9 @@ typedef struct
     uint64_t nbases_trimmed;  // bwa trimmed bases
     uint64_t nmismatches;
     uint64_t nreads_QCfailed, nreads_secondary;
+    struct {
+        uint32_t names, reads, quals;
+    } checksum;
 
     // GC-depth related data
     uint32_t ngcd, igcd;        // The maximum number of GC depth bins and index of the current bin
@@ -572,6 +576,23 @@ void realloc_buffers(stats_t *stats, int seq_len)
     realloc_rseq_buffer(stats);
 }
 
+void update_checksum(bam1_t *bam_line, stats_t *stats)
+{
+    uint8_t *name = (uint8_t*) bam_get_qname(bam_line);
+    int len = 0;
+    while ( name[len] ) len++;
+    stats->checksum.names +=  crc32(0L, name, len);
+
+    int seq_len = bam_line->core.l_qseq;
+    if ( !seq_len ) return;
+
+    uint8_t *seq = bam_get_seq(bam_line);
+    stats->checksum.reads += crc32(0L, seq, (seq_len+1)/2);
+
+    uint8_t *qual = bam_get_qual(bam_line);
+    stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2);
+}
+
 void collect_stats(bam1_t *bam_line, stats_t *stats)
 {
     if ( stats->rg_hash )
@@ -599,6 +620,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
     if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
     if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++;
 
+    update_checksum(bam_line, stats);
+
     int seq_len = bam_line->core.l_qseq;
     if ( !seq_len ) return;
 
@@ -909,6 +932,9 @@ void output_stats(stats_t *stats, int sparse)
     for (i=1; i<stats->argc; i++)
         printf(" %s",stats->argv[i]);
     printf("\n");
+    printf("# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n");
+    printf("# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
+    printf("CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
     printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
     printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));  // not counting excluded seqs (and none of the below)
     printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
diff --git a/test/stat/1.stats.expected b/test/stat/1.stats.expected
index fa8a3b9..0604069 100644
--- a/test/stat/1.stats.expected
+++ b/test/stat/1.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	29c426ae	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0
diff --git a/test/stat/2.stats.expected b/test/stat/2.stats.expected
index a017dad..2024271 100644
--- a/test/stat/2.stats.expected
+++ b/test/stat/2.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	29c426ae	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0
diff --git a/test/stat/3.stats.expected b/test/stat/3.stats.expected
index bd2e000..157c321 100644
--- a/test/stat/3.stats.expected
+++ b/test/stat/3.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	ce379e9a	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0
diff --git a/test/stat/4.stats.expected b/test/stat/4.stats.expected
index 8aafd23..d6c5622 100644
--- a/test/stat/4.stats.expected
+++ b/test/stat/4.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	e7724556	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0
diff --git a/test/stat/5.stats.expected b/test/stat/5.stats.expected
index 4e0ffcd..6588d66 100644
--- a/test/stat/5.stats.expected
+++ b/test/stat/5.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	32507d92	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0
diff --git a/test/stat/6.stats.expected b/test/stat/6.stats.expected
index 4e0ffcd..6588d66 100644
--- a/test/stat/6.stats.expected
+++ b/test/stat/6.stats.expected
@@ -1,3 +1,6 @@
+# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
+# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
+CHK	1a1c1362	32507d92	7bab45da
 # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
 SN	raw total sequences:	2
 SN	filtered sequences:	0

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git



More information about the debian-med-commit mailing list