[med-svn] [pirs] 01/03: Imported Upstream version 2.0.2+dfsg
Andreas Tille
tille at debian.org
Wed Aug 17 08:10:14 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository pirs.
commit c0f993b844932dfbe2bc965f66ed7687209963e3
Author: Andreas Tille <tille at debian.org>
Date: Wed Aug 17 10:08:17 2016 +0200
Imported Upstream version 2.0.2+dfsg
---
.gitignore | 1 +
src/pirs/MaskQvalsByEamss.cpp | 21 ++-----
src/pirs/MaskQvalsByEamss.h | 20 ++-----
.../baseCalling_Matrix_calculator.pl | 43 ++++++++------
.../baseCalling_Matrix_calculator.txt | 69 ++++++++++++++++++++++
5 files changed, 107 insertions(+), 47 deletions(-)
diff --git a/.gitignore b/.gitignore
index 58f5301..dbdcdc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
src/afterclean.lst
+.DS_Store
*.bak
*.md5
._*
diff --git a/src/pirs/MaskQvalsByEamss.cpp b/src/pirs/MaskQvalsByEamss.cpp
index 4d5e276..21283c1 100644
--- a/src/pirs/MaskQvalsByEamss.cpp
+++ b/src/pirs/MaskQvalsByEamss.cpp
@@ -1,27 +1,18 @@
/**
- ** Copyright (c) 2008-2010 Illumina, Inc.
- **
- ** This software is covered by the "Illumina Genome Analyzer Software
- ** License Agreement" and the "Illumina Source Code License Agreement",
- ** and certain third party copyright/licenses, and any user of this
- ** source file is bound by the terms therein (see accompanying files
- ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
- ** Illumina_Source_Code_License_Agreement.pdf and third party
- ** copyright/license notices).
- **
- ** This file is part of the Consensus Assessment of Sequence And VAriation
+ ** This file is based on the Consensus Assessment of Sequence And VAriation
** (CASAVA) software package.
**
- ** \file MaskQvalsByEamss.cpp
+ ** \file ./CASAVA_v1.8.2/src/c++/lib/demultiplex/MaskQvalsByEamss.cpp
**
** \brief Masks quality values of a read, using the EAMSS.
**
** \author Come Raczy
+ **
+ ** \modifed_by Eric Biggers
+ **
+ ** \changed Use vectors instead fo strings; also got rid of the boost header.
**/
-// Modified for pIRS: Use vectors instead fo strings; also got rid of the boost
-// header.
-
#include <string>
#include <vector>
#include <string.h>
diff --git a/src/pirs/MaskQvalsByEamss.h b/src/pirs/MaskQvalsByEamss.h
index 61c912b..413d14b 100644
--- a/src/pirs/MaskQvalsByEamss.h
+++ b/src/pirs/MaskQvalsByEamss.h
@@ -1,26 +1,18 @@
/**
- ** Copyright (c) 2008-2010 Illumina, Inc.
- **
- ** This software is covered by the "Illumina Genome Analyzer Software
- ** License Agreement" and the "Illumina Source Code License Agreement",
- ** and certain third party copyright/licenses, and any user of this
- ** source file is bound by the terms therein (see accompanying files
- ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
- ** Illumina_Source_Code_License_Agreement.pdf and third party
- ** copyright/license notices).
- **
- ** This file is part of the Consensus Assessment of Sequence And VAriation
+ ** This file is based on the Consensus Assessment of Sequence And VAriation
** (CASAVA) software package.
**
- ** \file MaskQvalsByEamss.cpp
+ ** \file ./CASAVA_v1.8.2/src/c++/include/demultiplex/MaskQvalsByEamss.hh
**
** \brief Masks quality values of a read, using the EAMSS.
**
** \author Come Raczy
+ **
+ ** \modifed_by Eric Biggers
+ **
+ ** \changed Pass qValues and baseCalls as C arrays (on the stack)
**/
-// Modified for pIRS: Pass qValues and baseCalls as C arrays (on the stack)
-
#ifndef CASAVA_DEMULTIPLEX_MASK_QVALS_BY_EAMSS_H
#define CASAVA_DEMULTIPLEX_MASK_QVALS_BY_EAMSS_H
diff --git a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
index d91c626..db6c764 100755
--- a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
+++ b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
=pod
Author: Hu Xuesong @ BGI <huxuesong at genomics.org.cn>
-Version: 0.1.2 @ 20140115
+Version: 0.1.5 @ 20140207
=cut
use strict;
use warnings;
@@ -35,11 +35,10 @@ sub main::HELP_MESSAGE() {
$main::help =~ s|\n|\033[32;1m\n|g;
$main::ARG_DESC='[PROGRAM_ARG1 ...]' unless $main::ARG_DESC;
print STDERR <<EOH;
-\nUsage: \033[0;1m$0\033[0;0m [-OPTIONS [-MORE_OPTIONS]]
+\nUsage: \033[0;1m$0\033[0;0m -i xxx.bam -r ref.fa.gz -o xxx [-MORE_OPTIONS]
The following single-character options are accepted:
-\033[32;1m$main::help\033[0;0mOptions may be merged together.
-Space is not required between options and their arguments.
+\033[32;1m$main::help\033[0;0m
EOH
}
sub main::VERSION_MESSAGE() {
@@ -59,24 +58,28 @@ EOH
}
}
-$main::VERSION=1.0.0;
-our $opts='i:r:o:l:s:c:t:qb';
-our($opt_i, $opt_o, $opt_r, $opt_l, $opt_s, $opt_c, $opt_t, $opt_q, $opt_b);
+$main::VERSION=1.0.1;
+our $opts='i:r:o:l:s:c:t:m:q';
+our($opt_i, $opt_o, $opt_r, $opt_l, $opt_s, $opt_c, $opt_t, $opt_q, $opt_m);
#our $desc='';
our $help=<<EOH;
\t-i Input Pair-End SAM/BAM files [used with "samtools view xxx"]
-\t-r ref fasta file (./ref/human.fa) [.{gz,bz2} is OK]
-\t-s trim SNP positions from (<filename>) in format /^ChrID\\tPos/. VCF file with only SNP is OK.
+\t-r Reference FASTA file [.{gz,bz2} is OK]
+\t-s skip SNP positions from (<filename>) in format /^ChrID\\tPos/. VCF file with only SNP is OK.
+\t-m minimal accepted MAPQ (60)
\t-l read length of reads (int) [Optional. Specify to override auto detected value.]
\t-o output prefix (./matrix).{count,ratio}.matrix and .{stat,info}
-\t-c ChrID list file [Useful to analyse only autosomes]
+\t-c ChrID list file [to specify a subset of chromosomes, one per line]
\t-q Use Qascii=64 for sam files instead of 33
\t-t Trim ChrID in ref fasta file to match alignment results (none) [use RegEx for s/\$ARG//;]
-\t-b No 5s Pause for batch runs
EOH
ShowHelp();
+unless ($opt_r) {
+ die "[x]-r Reference FASTA file not specified !\n"
+} else { die "[x]-r $opt_r not exists !\n" unless -f $opt_r; }
+
unless ($opt_i) {
die "[x]-i Input.bam not specified !\n"
} else {
@@ -85,7 +88,7 @@ unless ($opt_i) {
warn "[!]Use user specified Read Length=",$opt_l,"\n";
} else {
my ($READLEN,$lines)=(0,0);
- open IN,"-|","$SAMTOOLSBIN view -f 3 -F 1792 $opt_i" or die "Error opening $opt_i: $!\n";
+ open IN,"-|","$SAMTOOLSBIN view -f 3 -F 3840 $opt_i" or die "Error opening $opt_i: $!\n";
while (<IN>) {
#next if /^@\w\w\t\w\w:/;
#chomp;
@@ -101,19 +104,18 @@ unless ($opt_i) {
$opt_l = $READLEN;
warn "[!]Use detected Read Length=",$opt_l," from beginning $MAXREADStoCHECK proper lines of input file.\n";
}
- open( INSAM,"-|","$SAMTOOLSBIN view -f 3 -F 1792 -h $opt_i") or die "Error opening $opt_i: $!\n";
+ open( INSAM,"-|","$SAMTOOLSBIN view -f 3 -F 3840 -h $opt_i") or die "Error opening $opt_i: $!\n";
}
-$opt_r='./ref/human.fa' if ! $opt_r;
$opt_o='./matrix' if ! $opt_o;
+$opt_m=60 if (! $opt_m) or $opt_m < 0;
die "[x]-r $opt_r not exists !\n" unless -f $opt_r;
if ($opt_s) {die "[x]-s $opt_s not exists !\n" unless -f $opt_s;}
-print STDERR "From [$opt_i] of [$opt_l] with [$opt_r] to [$opt_o]\n";
+print STDERR "From [$opt_i][$opt_m] of [$opt_l] with [$opt_r] to [$opt_o]\n";
print STDERR "ChrID will be trimed by s/$opt_t//;\n" if $opt_t;
print STDERR "ChrID list:[$opt_c]\n" if $opt_c;
print STDERR "SNP skipping list:[$opt_s]\n" if $opt_s;
print STDERR "SAM files with Qascii=64\n" if $opt_q;
-unless ($opt_b) {print STDERR "Wait 3 seconds to continue...\n"; sleep 5;}
#my $start_time = [gettimeofday];
#BEGIN
@@ -129,6 +131,7 @@ if ($opt_c) {
++$Genome{$_};
}
close C;
+ print STDERR "[!]ChrID list:[",join('],[',(sort keys %Genome)),"].\n";
}
warn "[!]Reading Reference Genome:\n";
if ($opt_r =~ /.bz2$/) {
@@ -326,9 +329,11 @@ while (<INSAM>) {
#next unless ($read1[1] & 3) == 3; # paired + mapped in a proper pair; samtools view -f 3
#next if $read1[1] >= 256; # not primary || QC failure || optical or PCR duplicate; samtools view -F 1792
next unless $read1[6] eq '='; # same Reference sequence NAME
+ next if $read1[4] < $opt_m; # 5,MAPQ: MAPping Quality (Phred-scaled)
#next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
my $OPT = join("\t", at read1[11 .. $#read1]);
next if $OPT =~ /\bXT:A:R\b/;
+ next if $OPT =~ /\bXA:Z:/; # bwa mem use XA:Z for Alternative hits, SA:Z for Chimeric reads. We need skip those with Alternative hits.
my $ref1=uc getBases($read1[2],$read1[3],$READLEN) or print join("\t", at read1),"\n";
my $ReadCycle;
if ($read1[1] & 64) {
@@ -406,6 +411,7 @@ my @BaseQ;
for my $base (@BaseOrder) {
push @BaseQ,"$base-$_" for (0..$MaxQ);
}
+=pod
$tmp .= "\n
[Info]
Date = $date
@@ -429,7 +435,8 @@ MismatchBase = $MisBase
QB_Bases = $QBbase
QB_Mismatches = $QBmis
<<END
-
+=cut
+$tmp .= "\n
[DistMatrix]
#".join("\t",'Ref','Cycle', at BaseQ);
print OA $tmp;
@@ -604,5 +611,5 @@ TODO:
add reference masked len.
Example for bam file:
-samtools view -f 3 -F 1792 -h GA0146.sort.bam | ./pirs/baseCalling_Matrix_calculator -p sam -r ref.fa -s GA0146.SNPs.filter.vcf -l 125 -o GA0146.matrix -b >GA0146.matrix.log 2>GA0146.matrix.err
+samtools view -f 3 -F 3840 -h GA0146.sort.bam | ./pirs/baseCalling_Matrix_calculator -p sam -r ref.fa -s GA0146.SNPs.filter.vcf -l 125 -o GA0146.matrix -b >GA0146.matrix.log 2>GA0146.matrix.err
# log should be empty in normal situation.
diff --git a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt
new file mode 100644
index 0000000..d1d53b6
--- /dev/null
+++ b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt
@@ -0,0 +1,69 @@
+Input file: samtools view -f 3 -F 1792 -h xxx.bam
+
+SAM FORMAT:
+ ┌────┬───────┬──────────────────────────────────────────────────────────┐
+ │Col │ Field │ Description │
+ ├────┼───────┼──────────────────────────────────────────────────────────┤
+ │ 1 │ QNAME │ Query (pair) NAME │
+ │ 2 │ FLAG │ bitwise FLAG │
+ │ 3 │ RNAME │ Reference sequence NAME │
+ │ 4 │ POS │ 1-based leftmost POSition/coordinate of clipped sequence │
+ │ 5 │ MAPQ │ MAPping Quality (Phred-scaled) │
+ │ 6 │ CIAGR │ extended CIGAR string │
+ │ 7 │ MRNM │ Mate Reference sequence NaMe (`=' if same as RNAME) │
+ │ 8 │ MPOS │ 1-based Mate POSistion │
+ │ 9 │ ISIZE │ Inferred insert SIZE │
+ │10 │ SEQ │ query SEQuence on the same strand as the reference │
+ │11 │ QUAL │ query QUALity (ASCII-33 gives the Phred base quality) │
+ │12 │ OPT │ variable OPTional fields in the format TAG:VTYPE:VALUE │
+ └────┴───────┴──────────────────────────────────────────────────────────┘
+
+SAM FLAG:
+ ┌────┬────────┬───────────────────────────────────────┐
+ │Chr │ Flag │ Description │
+ ├────┼────────┼───────────────────────────────────────┤
+ │ p +│ 0x0001 │ the read is paired in sequencing │
+ │ P +│ 0x0002 │ the read is mapped in a proper pair │
+ │ u │ 0x0004 │ the query sequence itself is unmapped │
+ │ U │ 0x0008 │ the mate is unmapped │
+ │ r │ 0x0010 │ strand of the query (1 for reverse) │
+ │ R │ 0x0020 │ strand of the mate │
+ │ 1 │ 0x0040 │ the read is the first read in a pair │
+ │ 2 │ 0x0080 │ the read is the second read in a pair │
+ │ s -│ 0x0100 │ the alignment is not primary │
+ │ f -│ 0x0200 │ QC failure │
+ │ d -│ 0x0400 │ optical or PCR duplicate │
+ └────┴────────┴───────────────────────────────────────┘
+
+Options:
+ -i Input Pair-End SAM/BAM files [used with "samtools view xxx"]
+ -r Reference FASTA file [.{gz,bz2} is OK]
+ -s skip SNP positions from (<filename>) in format /^ChrID\tPos/. VCF file with only SNP is OK.
+ -m minimal accepted MAPQ (50)
+ -l read length of reads (int) [Optional. Specify to override auto detected value.]
+ -o output prefix (./matrix).{count,ratio}.matrix and .{stat,info}
+ -c ChrID list file [to specify a subset of chromosomes, one per line]
+ -q Use Qascii=64 for sam files instead of 33
+ -t Trim ChrID in ref fasta file to match alignment results (none) [use RegEx for s/\$ARG//;]
+
+Should be set: i,r,o
+Better to have: s
+With reasonable defaults: m
+Up to you: c
+Histrical Reason: l,q,t
+
+补充:
+-q 没它就按33来,有它就按64来。
+-t 适用于参考序列是用"chr01",但soap文件是用"01"的情况。在华大遇到过人的soap文件,染色体名只有数字和X、Y。此时 -t chr 即可。
+
+All filters on input SAM/BAM file:
+1. samtools view -f 3 -F 1792 含义见上面“SAM FLAG”表格第一列的加号和减号。即必须有加号的项、没有减号的项。
+2. (CIAGR =~ /(\d+)M/) && ($1 != $READLEN);
+3. RNAME not in given Reference file and Chr List
+4. MRNM not '='
+5. MAPQ < "-m minimal accepted MAPQ (50)"
+6. OPT =~ /\bXT:A:R\b/;
+7. OPT =~ /\bXA:Z:/;
+
+Example CLI:
+perl baseCalling_Matrix_calculator.pl -b -r ref.fa -i bam/xxx.sort.bam -s vcf/xxx.SNPs.filter.vcf -o matrix/xxx.matrix 2>matrix/xxx.matrix.err
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pirs.git
More information about the debian-med-commit
mailing list