[med-svn] samtools 11/26: New "flags" command plus moved filtering from htslib/pileup to BAM reading callbacks

Charles Plessy plessy at moszumanska.debian.org
Tue Dec 10 10:06:25 UTC 2013


This is an automated email from the git hooks/post-receive script.

plessy pushed a commit to branch debian/unstable
in repository samtools.

commit 399b28378ebd22623ab4b7a48b519b1f3a1da02b
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date:   Thu Nov 28 11:23:43 2013 +0000

    New "flags" command plus moved filtering from htslib/pileup to BAM reading callbacks
---
 Makefile     |  3 ++-
 bam2depth.c  | 15 +++++++----
 bam_flags.c  | 43 +++++++++++++++++++++++++++++++
 bam_plcmd.c  | 83 ++++++++++++++++++++++++++++++++++--------------------------
 bamtk.c      |  5 +++-
 bedcov.c     | 13 +++++++---
 cut_target.c | 27 ++++++++++++--------
 phase.c      | 23 ++++++++++-------
 8 files changed, 146 insertions(+), 66 deletions(-)

diff --git a/Makefile b/Makefile
index bf5557f..e208ac5 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ AOBJS=		bam_index.o bam_plcmd.o sam_view.o \
 			bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
 			bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
 			cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \
-            faidx.o stats.o 
+            faidx.o stats.o bam_flags.o
             # tview todo: bam_tview.o bam_tview_curses.o bam_tview_html.o bam_lpileup.o
 INCLUDES=	-I. -I$(HTSDIR)
 LIBCURSES=	-lcurses # -lXCurses
@@ -121,6 +121,7 @@ bam_stat.o: bam_stat.c $(bam_h)
 bam_tview.o: bam_tview.c $(bam_tview_h)
 bam_tview_curses.o: bam_tview_curses.c $(bam_tview_h)
 bam_tview_html.o: bam_tview_html.c $(bam_tview_h)
+bam_flags.o: bam_flags.c $(sam_h)
 bamshuf.o: bamshuf.c $(htslib_sam_h) $(HTSDIR)/htslib/ksort.h
 bamtk.o: bamtk.c $(bam_h) version.h samtools.h
 bedcov.o: bedcov.c $(htslib_bgzf_h) $(bam_h) $(HTSDIR)/htslib/kseq.h
diff --git a/bam2depth.c b/bam2depth.c
index 6fd023a..c884741 100644
--- a/bam2depth.c
+++ b/bam2depth.c
@@ -24,11 +24,16 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if c
 static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
 {
 	aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
-	int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
-	if (!(b->core.flag&BAM_FUNMAP)) {
-		if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
-		else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP;
-	}
+    int ret;
+    while (1)
+    {
+        ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+        if ( ret<0 ) break;
+        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( (int)b->core.qual < aux->min_mapQ ) continue;
+        if ( aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len ) continue;
+        break;
+    }
 	return ret;
 }
 
diff --git a/bam_flags.c b/bam_flags.c
new file mode 100644
index 0000000..f2d0be3
--- /dev/null
+++ b/bam_flags.c
@@ -0,0 +1,43 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <htslib/sam.h>
+
+static void usage(void)
+{
+    fprintf(stderr, "\n");
+    fprintf(stderr, "About: Convert between textual and numeric flag representation\n");
+    fprintf(stderr, "Usage: samtools flags [INT|STR]\n");
+    fprintf(stderr, "Flags:\n");
+    fprintf(stderr, "\t0x%x\tPAIRED        .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
+    fprintf(stderr, "\t0x%x\tPROPER_PAIR   .. each segment properly aligned according to the alig\n", BAM_FPROPER_PAIR);
+    fprintf(stderr, "\t0x%x\tUNMAP         .. segment unmapped\n", BAM_FUNMAP);
+    fprintf(stderr, "\t0x%x\tMUNMAP        .. next segment in the template unmapped\n", BAM_FMUNMAP);
+    fprintf(stderr, "\t0x%x\tREVERSE       .. SEQ is reverse complemented\n", BAM_FREVERSE);
+    fprintf(stderr, "\t0x%x\tMREVERSE      .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
+    fprintf(stderr, "\t0x%x\tREAD1         .. the first segment in the template\n", BAM_FREAD1);
+    fprintf(stderr, "\t0x%x\tREAD2         .. the last segment in the template\n", BAM_FREAD2);
+    fprintf(stderr, "\t0x%x\tSECONDARY     .. secondary alignment\n", BAM_FSECONDARY);
+    fprintf(stderr, "\t0x%x\tQCFAIL        .. not passing quality controls\n", BAM_FQCFAIL);
+    fprintf(stderr, "\t0x%x\tDUP           .. PCR or optical duplicate\n", BAM_FDUP);
+    fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
+    fprintf(stderr, "\n");
+}
+
+
+int main_flags(int argc, char *argv[])
+{
+    if ( argc!=2 ) usage();
+    else
+    {
+        int mask = bam_str2flag(argv[1]);
+        if ( mask<0 ) { fprintf(stderr,"Could not parse \"%s\"\n", argv[1]); return 1; }
+        printf("0x%x\t%s\n", mask, bam_flag2str(mask));
+    }
+	return 0;
+}
+
diff --git a/bam_plcmd.c b/bam_plcmd.c
index eb56253..efdf4ab 100644
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -592,6 +592,7 @@ int bam_mpileup(int argc, char *argv[])
 	mplp.min_frac = 0.002; mplp.min_support = 1;
 	mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
     mplp.argc = argc; mplp.argv = argv;
+    mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
     static struct option lopts[] = 
     {
         {"rf",1,0,1},   // require flag
@@ -601,8 +602,14 @@ int bam_mpileup(int argc, char *argv[])
 	while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:vx",lopts,NULL)) >= 0) {
 		switch (c) {
         case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
-        case  1 : mplp.rflag_require = strtol(optarg,0,0); break;
-        case  2 : mplp.rflag_filter  = strtol(optarg,0,0); break;
+        case  1 : 
+            mplp.rflag_require = bam_str2flag(optarg); 
+            if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
+            break;
+        case  2 : 
+            mplp.rflag_filter = bam_str2flag(optarg); 
+            if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
+            break;
 		case 'f':
 			mplp.fai = fai_load(optarg);
 			if (mplp.fai == 0) return 1;
@@ -657,47 +664,51 @@ int bam_mpileup(int argc, char *argv[])
 		}
 	}
 	if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
-	if (argc == 1) {
+	if (argc == 1) 
+    {
+        char *tmp_require = bam_flag2str(mplp.rflag_require);
+        char *tmp_filter  = bam_flag2str(mplp.rflag_filter);
 		fprintf(stderr, "\n");
 		fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
 		fprintf(stderr, "Input options:\n\n");
-		fprintf(stderr, "       -6           assume the quality is in the Illumina-1.3+ encoding\n");
-		fprintf(stderr, "       -A           count anomalous read pairs\n");
-		fprintf(stderr, "       -B           disable BAQ computation\n");
-		fprintf(stderr, "       -b FILE      list of input BAM filenames, one per line [null]\n");
-		fprintf(stderr, "       -C INT       parameter for adjusting mapQ; 0 to disable [0]\n");
-		fprintf(stderr, "       -d INT       max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
-		fprintf(stderr, "       -E           recalculate extended BAQ on the fly thus ignoring existing BQs\n");
-		fprintf(stderr, "       -f FILE      faidx indexed reference sequence file [null]\n");
-		fprintf(stderr, "       -G FILE      exclude read groups listed in FILE [null]\n");
-		fprintf(stderr, "       -l FILE      list of positions (chr pos) or regions (BED) [null]\n");
-		fprintf(stderr, "       -M INT       cap mapping quality at INT [%d]\n", mplp.max_mq);
-		fprintf(stderr, "       -r STR       region in which pileup is generated [null]\n");
-		fprintf(stderr, "       -R           ignore RG tags\n");
-		fprintf(stderr, "       -q INT       skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
-		fprintf(stderr, "       -Q INT       skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
-		fprintf(stderr, "       --rf INT     required flags: skip reads with mask bits unset []\n");
-		fprintf(stderr, "       --ff INT     filter flags: skip reads with mask bits set []\n");
-		fprintf(stderr, "       -x           disable read-pair overlap detection\n");
+		fprintf(stderr, "       -6              assume the quality is in the Illumina-1.3+ encoding\n");
+		fprintf(stderr, "       -A              count anomalous read pairs\n");
+		fprintf(stderr, "       -B              disable BAQ computation\n");
+		fprintf(stderr, "       -b FILE         list of input BAM filenames, one per line [null]\n");
+		fprintf(stderr, "       -C INT          parameter for adjusting mapQ; 0 to disable [0]\n");
+		fprintf(stderr, "       -d INT          max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
+		fprintf(stderr, "       -E              recalculate extended BAQ on the fly thus ignoring existing BQs\n");
+		fprintf(stderr, "       -f FILE         faidx indexed reference sequence file [null]\n");
+		fprintf(stderr, "       -G FILE         exclude read groups listed in FILE [null]\n");
+		fprintf(stderr, "       -l FILE         list of positions (chr pos) or regions (BED) [null]\n");
+		fprintf(stderr, "       -M INT          cap mapping quality at INT [%d]\n", mplp.max_mq);
+		fprintf(stderr, "       -r STR          region in which pileup is generated [null]\n");
+		fprintf(stderr, "       -R              ignore RG tags\n");
+		fprintf(stderr, "       -q INT          skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
+		fprintf(stderr, "       -Q INT          skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+		fprintf(stderr, "       --rf STR|INT    required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+		fprintf(stderr, "       --ff STR|INT    filter flags: skip reads with mask bits set [%s]\n", tmp_filter);
+		fprintf(stderr, "       -x              disable read-pair overlap detection\n");
 		fprintf(stderr, "\nOutput options:\n\n");
-		fprintf(stderr, "       -D/V         output per-sample DP/DV in BCF (requires -g/-v)\n");
-		fprintf(stderr, "       -g/v         generate BCF/VCF output (genotype likelihoods)\n");
-		fprintf(stderr, "       -O           output base positions on reads (disabled by -g/-u)\n");
-		fprintf(stderr, "       -s           output mapping quality (disabled by -g/-u)\n");
-		fprintf(stderr, "       -S           output per-sample strand bias P-value in BCF (require -g/-u)\n");
-		fprintf(stderr, "       -u           generate uncompressed BCF/VCF output\n");
+		fprintf(stderr, "       -D/V            output per-sample DP/DV in BCF (requires -g/-v)\n");
+		fprintf(stderr, "       -g/v            generate BCF/VCF output (genotype likelihoods)\n");
+		fprintf(stderr, "       -O              output base positions on reads (disabled by -g/-u)\n");
+		fprintf(stderr, "       -s              output mapping quality (disabled by -g/-u)\n");
+		fprintf(stderr, "       -S              output per-sample strand bias P-value in BCF (require -g/-u)\n");
+		fprintf(stderr, "       -u              generate uncompressed BCF/VCF output\n");
 		fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n");
-		fprintf(stderr, "       -e INT       Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
-		fprintf(stderr, "       -F FLOAT     minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
-		fprintf(stderr, "       -h INT       coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
-		fprintf(stderr, "       -I           do not perform indel calling\n");
-		fprintf(stderr, "       -L INT       max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
-		fprintf(stderr, "       -m INT       minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
-		fprintf(stderr, "       -o INT       Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
-		fprintf(stderr, "       -p           apply -m and -F per-sample to increase sensitivity\n");
-		fprintf(stderr, "       -P STR       comma separated list of platforms for indels [all]\n");
+		fprintf(stderr, "       -e INT          Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+		fprintf(stderr, "       -F FLOAT        minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
+		fprintf(stderr, "       -h INT          coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
+		fprintf(stderr, "       -I              do not perform indel calling\n");
+		fprintf(stderr, "       -L INT          max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
+		fprintf(stderr, "       -m INT          minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
+		fprintf(stderr, "       -o INT          Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+		fprintf(stderr, "       -p              apply -m and -F per-sample to increase sensitivity\n");
+		fprintf(stderr, "       -P STR          comma separated list of platforms for indels [all]\n");
 		fprintf(stderr, "\n");
 		fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
+        free(tmp_require); free(tmp_filter);
 		return 1;
 	}
     int ret;
diff --git a/bamtk.c b/bamtk.c
index cdf69a0..9c0dbe7 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -29,6 +29,7 @@ int main_pad2unpad(int argc, char *argv[]);
 int main_bedcov(int argc, char *argv[]);
 int main_bamshuf(int argc, char *argv[]);
 int main_stats(int argc, char *argv[]);
+int main_flags(int argc, char *argv[]);
 
 int faidx_main(int argc, char *argv[]);
 
@@ -71,7 +72,8 @@ static void usage(FILE *fp)
 "         phase       phase heterozygotes\n"
 "         stats       generate stats (former bamcheck)\n"
 "  -- viewing\n"
-"         tview       text alignment viewer (todo)\n"
+"         flags       explain BAM flags\n"
+"         tview       text alignment viewer\n"
 "         view        SAM<->BAM conversion\n"
 //"         depad       convert padded BAM to unpadded BAM\n" // not stable
 "\n");
@@ -127,6 +129,7 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "bedcov") == 0)    ret = main_bedcov(argc-1, argv+1);
 	else if (strcmp(argv[1], "bamshuf") == 0)   ret = main_bamshuf(argc-1, argv+1);
 	else if (strcmp(argv[1], "stats") == 0)     ret = main_stats(argc-1, argv+1);
+	else if (strcmp(argv[1], "flags") == 0)     ret = main_flags(argc-1, argv+1);
 	else if (strcmp(argv[1], "pileup") == 0) {
 		fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
 		return 1;
diff --git a/bedcov.c b/bedcov.c
index 5fb9b37..8c143c1 100644
--- a/bedcov.c
+++ b/bedcov.c
@@ -19,9 +19,16 @@ typedef struct {
 
 static int read_bam(void *data, bam1_t *b)
 {
-	aux_t *aux = (aux_t*)data;
-	int ret = bam_iter_read(aux->fp, aux->iter, b);
-	if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+	aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
+    int ret;
+    while (1)
+    {
+        ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+        if ( ret<0 ) break;
+        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( (int)b->core.qual < aux->min_mapQ ) continue;
+        break;
+    }
 	return ret;
 }
 
diff --git a/cut_target.c b/cut_target.c
index 8c56f33..cd266a5 100644
--- a/cut_target.c
+++ b/cut_target.c
@@ -121,21 +121,27 @@ static int read_aln(void *data, bam1_t *b)
 	extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
 	ct_t *g = (ct_t*)data;
 	int ret, len;
-	ret = bam_read1(g->fp, b);
-	if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
-		if (b->core.tid != g->tid) { // then load the sequence
-			free(g->ref);
-			g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
-			g->tid = b->core.tid;
-		}
-		bam_prob_realn_core(b, g->ref, 1<<1|1);
-	}
+    while (1)
+    {
+        ret = bam_read1(g->fp, b);
+        if ( ret<0 ) break;
+        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( g->fai && b->core.tid >= 0 ) {
+            if (b->core.tid != g->tid) { // then load the sequence
+                free(g->ref);
+                g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+                g->tid = b->core.tid;
+            }
+            bam_prob_realn_core(b, g->ref, 1<<1|1);
+        }
+        break;
+    }
 	return ret;
 }
 
 int main_cut_target(int argc, char *argv[])
 {
-	int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
+	int c, tid, pos, n, lasttid = -1, l, max_l;
 	const bam_pileup1_t *p;
 	bam_plp_t plp;
 	uint16_t *cns;
@@ -178,7 +184,6 @@ int main_cut_target(int argc, char *argv[])
 			lasttid = tid;
 		}
 		cns[pos] = gencns(&g, n, p);
-		lastpos = pos;
 	}
 	process_cns(g.h, lasttid, l, cns);
 	free(cns);
diff --git a/phase.c b/phase.c
index bf8d6d3..b086b5f 100644
--- a/phase.c
+++ b/phase.c
@@ -450,15 +450,20 @@ static int readaln(void *data, bam1_t *b)
 {
 	phaseg_t *g = (phaseg_t*)data;
 	int ret;
-	ret = bam_read1(g->fp, b);
-	if (ret < 0) return ret;
-	if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
-		if (g->n == g->m) {
-			g->m = g->m? g->m<<1 : 16;
-			g->b = realloc(g->b, g->m * sizeof(bam1_t*));
-		}
-		g->b[g->n++] = bam_dup1(b);
-	}
+    while (1)
+    {
+        ret = bam_read1(g->fp, b);
+        if (ret < 0) break;
+        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( g->pre ) {
+            if (g->n == g->m) {
+                g->m = g->m? g->m<<1 : 16;
+                g->b = realloc(g->b, g->m * sizeof(bam1_t*));
+            }
+            g->b[g->n++] = bam_dup1(b);
+        }
+        break;
+    }
 	return ret;
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git



More information about the debian-med-commit mailing list