[med-svn] samtools 20/26: stats: Fix of mismmatch-per-cycle calculation in presence of hard-clipped reads

Charles Plessy plessy at moszumanska.debian.org
Tue Dec 10 10:06:26 UTC 2013


This is an automated email from the git hooks/post-receive script.

plessy pushed a commit to branch debian/unstable
in repository samtools.

commit 9e782306ac8092a80d53b7e2cf6108c48c8ea11c
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date:   Wed Dec 4 16:52:29 2013 +0000

    stats: Fix of mismmatch-per-cycle calculation in presence of hard-clipped reads
---
 INSTALL |  3 +++
 stats.c | 30 +++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/INSTALL b/INSTALL
index 37d84a9..f60ac70 100644
--- a/INSTALL
+++ b/INSTALL
@@ -12,6 +12,9 @@ the modern Linux/Unix distributions. If you do not have this library installed,
 you can still compile the rest of SAMtools by manually changing:
 `-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and
 comment out the line starting with `LIBCURSES='.
+Note that on some systems the library is available as -lcurses while on others
+as -lnurses. This can be set in Makefile by setting LIBCURSES= -lcurses vs
+-lncurses.
 
 
 Compilation
diff --git a/stats.c b/stats.c
index 7500271..45864d9 100644
--- a/stats.c
+++ b/stats.c
@@ -306,12 +306,24 @@ void count_indels(stats_t *stats,bam1_t *bam_line)
     }
 }
 
+int unclipped_length(bam1_t *bam_line)
+{
+    int icig, read_len = bam_line->core.l_qseq;
+    for (icig=0; icig<bam_line->core.n_cigar; icig++)
+    {
+        int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+        if ( cig==BAM_CHARD_CLIP )
+            read_len += bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+    }
+    return read_len;
+}
+
 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) 
 {
     int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
     int icig,iread=0,icycle=0;
     int iref = bam_line->core.pos - stats->rseq_pos;
-    int read_len   = bam_line->core.l_qseq;
+    int read_len   = unclipped_length(bam_line); 
     uint8_t *read  = bam1_seq(bam_line);
     uint8_t *quals = bam1_qual(bam_line);
     uint64_t *mpc_buf = stats->mpc_buf;
@@ -322,18 +334,18 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
         //  MIDNSHP
         int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
         int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
-        if ( cig==1 )
+        if ( cig==BAM_CINS )
         {
             iread  += ncig;
             icycle += ncig;
             continue;
         }
-        if ( cig==2 )
+        if ( cig==BAM_CDEL )
         {
             iref += ncig;
             continue;
         }
-        if ( cig==4 )
+        if ( cig==BAM_CSOFT_CLIP )
         {
             icycle += ncig;
             // Soft-clips are present in the sequence, but the position of the read marks a start of the sequence after clipping
@@ -341,15 +353,15 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
             iread  += ncig;
             continue;
         }
-        if ( cig==5 )
+        if ( cig==BAM_CHARD_CLIP )
         {
             icycle += ncig;
             continue;
         }
         // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
         //  chunk of refseq in memory. Not very frequent and not noticable in the stats.
-        if ( cig==3 || cig==5 ) continue;
-        if ( cig!=0 )
+        if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP ) continue;
+        if ( cig!=BAM_CMATCH )
             error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
        
         if ( ncig+iref > stats->nrseq_buf )
@@ -599,7 +611,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
 
     stats->read_lengths[seq_len]++;
 
-    // Count GC and ACGT per cycle
+    // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
     uint8_t base, *seq  = bam1_seq(bam_line);
     int gc_count  = 0;
     int i;
@@ -644,7 +656,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
     if ( stats->trim_qual>0 ) 
         stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
 
-    // Quality histogram and average quality
+    // Quality histogram and average quality. Clipping is neglected.
     for (i=0; i<seq_len; i++)
     {
         uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git



More information about the debian-med-commit mailing list