[med-svn] samtools 11/26: New "flags" command plus moved filtering from htslib/pileup to BAM reading callbacks
Charles Plessy
plessy at moszumanska.debian.org
Tue Dec 10 10:06:25 UTC 2013
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch debian/unstable
in repository samtools.
commit 399b28378ebd22623ab4b7a48b519b1f3a1da02b
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date: Thu Nov 28 11:23:43 2013 +0000
New "flags" command plus moved filtering from htslib/pileup to BAM reading callbacks
---
Makefile | 3 ++-
bam2depth.c | 15 +++++++----
bam_flags.c | 43 +++++++++++++++++++++++++++++++
bam_plcmd.c | 83 ++++++++++++++++++++++++++++++++++--------------------------
bamtk.c | 5 +++-
bedcov.c | 13 +++++++---
cut_target.c | 27 ++++++++++++--------
phase.c | 23 ++++++++++-------
8 files changed, 146 insertions(+), 66 deletions(-)
diff --git a/Makefile b/Makefile
index bf5557f..e208ac5 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ AOBJS= bam_index.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \
- faidx.o stats.o
+ faidx.o stats.o bam_flags.o
# tview todo: bam_tview.o bam_tview_curses.o bam_tview_html.o bam_lpileup.o
INCLUDES= -I. -I$(HTSDIR)
LIBCURSES= -lcurses # -lXCurses
@@ -121,6 +121,7 @@ bam_stat.o: bam_stat.c $(bam_h)
bam_tview.o: bam_tview.c $(bam_tview_h)
bam_tview_curses.o: bam_tview_curses.c $(bam_tview_h)
bam_tview_html.o: bam_tview_html.c $(bam_tview_h)
+bam_flags.o: bam_flags.c $(sam_h)
bamshuf.o: bamshuf.c $(htslib_sam_h) $(HTSDIR)/htslib/ksort.h
bamtk.o: bamtk.c $(bam_h) version.h samtools.h
bedcov.o: bedcov.c $(htslib_bgzf_h) $(bam_h) $(HTSDIR)/htslib/kseq.h
diff --git a/bam2depth.c b/bam2depth.c
index 6fd023a..c884741 100644
--- a/bam2depth.c
+++ b/bam2depth.c
@@ -24,11 +24,16 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if c
static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
{
aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
- int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
- if (!(b->core.flag&BAM_FUNMAP)) {
- if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
- else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP;
- }
+ int ret;
+ while (1)
+ {
+ ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+ if ( ret<0 ) break;
+ if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( (int)b->core.qual < aux->min_mapQ ) continue;
+ if ( aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len ) continue;
+ break;
+ }
return ret;
}
diff --git a/bam_flags.c b/bam_flags.c
new file mode 100644
index 0000000..f2d0be3
--- /dev/null
+++ b/bam_flags.c
@@ -0,0 +1,43 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <htslib/sam.h>
+
+static void usage(void)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Convert between textual and numeric flag representation\n");
+ fprintf(stderr, "Usage: samtools flags [INT|STR]\n");
+ fprintf(stderr, "Flags:\n");
+ fprintf(stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
+ fprintf(stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the alig\n", BAM_FPROPER_PAIR);
+ fprintf(stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP);
+ fprintf(stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP);
+ fprintf(stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE);
+ fprintf(stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
+ fprintf(stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1);
+ fprintf(stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2);
+ fprintf(stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY);
+ fprintf(stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL);
+ fprintf(stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP);
+ fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
+ fprintf(stderr, "\n");
+}
+
+
+int main_flags(int argc, char *argv[])
+{
+ if ( argc!=2 ) usage();
+ else
+ {
+ int mask = bam_str2flag(argv[1]);
+ if ( mask<0 ) { fprintf(stderr,"Could not parse \"%s\"\n", argv[1]); return 1; }
+ printf("0x%x\t%s\n", mask, bam_flag2str(mask));
+ }
+ return 0;
+}
+
diff --git a/bam_plcmd.c b/bam_plcmd.c
index eb56253..efdf4ab 100644
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -592,6 +592,7 @@ int bam_mpileup(int argc, char *argv[])
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
mplp.argc = argc; mplp.argv = argv;
+ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
static struct option lopts[] =
{
{"rf",1,0,1}, // require flag
@@ -601,8 +602,14 @@ int bam_mpileup(int argc, char *argv[])
while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:vx",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
- case 1 : mplp.rflag_require = strtol(optarg,0,0); break;
- case 2 : mplp.rflag_filter = strtol(optarg,0,0); break;
+ case 1 :
+ mplp.rflag_require = bam_str2flag(optarg);
+ if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
+ break;
+ case 2 :
+ mplp.rflag_filter = bam_str2flag(optarg);
+ if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
+ break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == 0) return 1;
@@ -657,47 +664,51 @@ int bam_mpileup(int argc, char *argv[])
}
}
if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
- if (argc == 1) {
+ if (argc == 1)
+ {
+ char *tmp_require = bam_flag2str(mplp.rflag_require);
+ char *tmp_filter = bam_flag2str(mplp.rflag_filter);
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
fprintf(stderr, "Input options:\n\n");
- fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
- fprintf(stderr, " -A count anomalous read pairs\n");
- fprintf(stderr, " -B disable BAQ computation\n");
- fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n");
- fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
- fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n");
- fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n");
- fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
- fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n");
- fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
- fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
- fprintf(stderr, " -R ignore RG tags\n");
- fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
- fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
- fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n");
- fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n");
- fprintf(stderr, " -x disable read-pair overlap detection\n");
+ fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
+ fprintf(stderr, " -A count anomalous read pairs\n");
+ fprintf(stderr, " -B disable BAQ computation\n");
+ fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n");
+ fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n");
+ fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n");
+ fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n");
+ fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
+ fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n");
+ fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
+ fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
+ fprintf(stderr, " -R ignore RG tags\n");
+ fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
+ fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+ fprintf(stderr, " --rf STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ fprintf(stderr, " --ff STR|INT filter flags: skip reads with mask bits set [%s]\n", tmp_filter);
+ fprintf(stderr, " -x disable read-pair overlap detection\n");
fprintf(stderr, "\nOutput options:\n\n");
- fprintf(stderr, " -D/V output per-sample DP/DV in BCF (requires -g/-v)\n");
- fprintf(stderr, " -g/v generate BCF/VCF output (genotype likelihoods)\n");
- fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n");
- fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n");
- fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n");
- fprintf(stderr, " -u generate uncompressed BCF/VCF output\n");
+ fprintf(stderr, " -D/V output per-sample DP/DV in BCF (requires -g/-v)\n");
+ fprintf(stderr, " -g/v generate BCF/VCF output (genotype likelihoods)\n");
+ fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n");
+ fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n");
+ fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n");
+ fprintf(stderr, " -u generate uncompressed BCF/VCF output\n");
fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n");
- fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
- fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
- fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
- fprintf(stderr, " -I do not perform indel calling\n");
- fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
- fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
- fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
- fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n");
- fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
+ fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+ fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
+ fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
+ fprintf(stderr, " -I do not perform indel calling\n");
+ fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
+ fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
+ fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+ fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n");
+ fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
+ free(tmp_require); free(tmp_filter);
return 1;
}
int ret;
diff --git a/bamtk.c b/bamtk.c
index cdf69a0..9c0dbe7 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -29,6 +29,7 @@ int main_pad2unpad(int argc, char *argv[]);
int main_bedcov(int argc, char *argv[]);
int main_bamshuf(int argc, char *argv[]);
int main_stats(int argc, char *argv[]);
+int main_flags(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
@@ -71,7 +72,8 @@ static void usage(FILE *fp)
" phase phase heterozygotes\n"
" stats generate stats (former bamcheck)\n"
" -- viewing\n"
-" tview text alignment viewer (todo)\n"
+" flags explain BAM flags\n"
+" tview text alignment viewer\n"
" view SAM<->BAM conversion\n"
//" depad convert padded BAM to unpadded BAM\n" // not stable
"\n");
@@ -127,6 +129,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1);
else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1);
else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1);
+ else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1);
else if (strcmp(argv[1], "pileup") == 0) {
fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
diff --git a/bedcov.c b/bedcov.c
index 5fb9b37..8c143c1 100644
--- a/bedcov.c
+++ b/bedcov.c
@@ -19,9 +19,16 @@ typedef struct {
static int read_bam(void *data, bam1_t *b)
{
- aux_t *aux = (aux_t*)data;
- int ret = bam_iter_read(aux->fp, aux->iter, b);
- if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+ aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
+ int ret;
+ while (1)
+ {
+ ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+ if ( ret<0 ) break;
+ if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( (int)b->core.qual < aux->min_mapQ ) continue;
+ break;
+ }
return ret;
}
diff --git a/cut_target.c b/cut_target.c
index 8c56f33..cd266a5 100644
--- a/cut_target.c
+++ b/cut_target.c
@@ -121,21 +121,27 @@ static int read_aln(void *data, bam1_t *b)
extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
ct_t *g = (ct_t*)data;
int ret, len;
- ret = bam_read1(g->fp, b);
- if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
- if (b->core.tid != g->tid) { // then load the sequence
- free(g->ref);
- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
- g->tid = b->core.tid;
- }
- bam_prob_realn_core(b, g->ref, 1<<1|1);
- }
+ while (1)
+ {
+ ret = bam_read1(g->fp, b);
+ if ( ret<0 ) break;
+ if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( g->fai && b->core.tid >= 0 ) {
+ if (b->core.tid != g->tid) { // then load the sequence
+ free(g->ref);
+ g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+ g->tid = b->core.tid;
+ }
+ bam_prob_realn_core(b, g->ref, 1<<1|1);
+ }
+ break;
+ }
return ret;
}
int main_cut_target(int argc, char *argv[])
{
- int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
+ int c, tid, pos, n, lasttid = -1, l, max_l;
const bam_pileup1_t *p;
bam_plp_t plp;
uint16_t *cns;
@@ -178,7 +184,6 @@ int main_cut_target(int argc, char *argv[])
lasttid = tid;
}
cns[pos] = gencns(&g, n, p);
- lastpos = pos;
}
process_cns(g.h, lasttid, l, cns);
free(cns);
diff --git a/phase.c b/phase.c
index bf8d6d3..b086b5f 100644
--- a/phase.c
+++ b/phase.c
@@ -450,15 +450,20 @@ static int readaln(void *data, bam1_t *b)
{
phaseg_t *g = (phaseg_t*)data;
int ret;
- ret = bam_read1(g->fp, b);
- if (ret < 0) return ret;
- if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
- if (g->n == g->m) {
- g->m = g->m? g->m<<1 : 16;
- g->b = realloc(g->b, g->m * sizeof(bam1_t*));
- }
- g->b[g->n++] = bam_dup1(b);
- }
+ while (1)
+ {
+ ret = bam_read1(g->fp, b);
+ if (ret < 0) break;
+ if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( g->pre ) {
+ if (g->n == g->m) {
+ g->m = g->m? g->m<<1 : 16;
+ g->b = realloc(g->b, g->m * sizeof(bam1_t*));
+ }
+ g->b[g->n++] = bam_dup1(b);
+ }
+ break;
+ }
return ret;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git
More information about the debian-med-commit
mailing list