[med-svn] samtools 20/26: stats: Fix of mismmatch-per-cycle calculation in presence of hard-clipped reads
Charles Plessy
plessy at moszumanska.debian.org
Tue Dec 10 10:06:26 UTC 2013
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch debian/unstable
in repository samtools.
commit 9e782306ac8092a80d53b7e2cf6108c48c8ea11c
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date: Wed Dec 4 16:52:29 2013 +0000
stats: Fix of mismmatch-per-cycle calculation in presence of hard-clipped reads
---
INSTALL | 3 +++
stats.c | 30 +++++++++++++++++++++---------
2 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/INSTALL b/INSTALL
index 37d84a9..f60ac70 100644
--- a/INSTALL
+++ b/INSTALL
@@ -12,6 +12,9 @@ the modern Linux/Unix distributions. If you do not have this library installed,
you can still compile the rest of SAMtools by manually changing:
`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and
comment out the line starting with `LIBCURSES='.
+Note that on some systems the library is available as -lcurses while on others
+as -lnurses. This can be set in Makefile by setting LIBCURSES= -lcurses vs
+-lncurses.
Compilation
diff --git a/stats.c b/stats.c
index 7500271..45864d9 100644
--- a/stats.c
+++ b/stats.c
@@ -306,12 +306,24 @@ void count_indels(stats_t *stats,bam1_t *bam_line)
}
}
+int unclipped_length(bam1_t *bam_line)
+{
+ int icig, read_len = bam_line->core.l_qseq;
+ for (icig=0; icig<bam_line->core.n_cigar; icig++)
+ {
+ int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+ if ( cig==BAM_CHARD_CLIP )
+ read_len += bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+ }
+ return read_len;
+}
+
void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
{
int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
int icig,iread=0,icycle=0;
int iref = bam_line->core.pos - stats->rseq_pos;
- int read_len = bam_line->core.l_qseq;
+ int read_len = unclipped_length(bam_line);
uint8_t *read = bam1_seq(bam_line);
uint8_t *quals = bam1_qual(bam_line);
uint64_t *mpc_buf = stats->mpc_buf;
@@ -322,18 +334,18 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
// MIDNSHP
int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
- if ( cig==1 )
+ if ( cig==BAM_CINS )
{
iread += ncig;
icycle += ncig;
continue;
}
- if ( cig==2 )
+ if ( cig==BAM_CDEL )
{
iref += ncig;
continue;
}
- if ( cig==4 )
+ if ( cig==BAM_CSOFT_CLIP )
{
icycle += ncig;
// Soft-clips are present in the sequence, but the position of the read marks a start of the sequence after clipping
@@ -341,15 +353,15 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
iread += ncig;
continue;
}
- if ( cig==5 )
+ if ( cig==BAM_CHARD_CLIP )
{
icycle += ncig;
continue;
}
// Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
// chunk of refseq in memory. Not very frequent and not noticable in the stats.
- if ( cig==3 || cig==5 ) continue;
- if ( cig!=0 )
+ if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP ) continue;
+ if ( cig!=BAM_CMATCH )
error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
if ( ncig+iref > stats->nrseq_buf )
@@ -599,7 +611,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
stats->read_lengths[seq_len]++;
- // Count GC and ACGT per cycle
+ // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored
uint8_t base, *seq = bam1_seq(bam_line);
int gc_count = 0;
int i;
@@ -644,7 +656,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
if ( stats->trim_qual>0 )
stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
- // Quality histogram and average quality
+ // Quality histogram and average quality. Clipping is neglected.
for (i=0; i<seq_len; i++)
{
uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git
More information about the debian-med-commit
mailing list