[med-svn] [picard-tools] 02/07: Imported Upstream version 1.141+dfsg.1
Andreas Tille
tille at debian.org
Thu Nov 26 10:47:45 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository picard-tools.
commit c2bd278d7fa1110705dbda7f84a48ab4501b9d9c
Author: Andreas Tille <tille at debian.org>
Date: Thu Nov 26 11:13:04 2015 +0100
Imported Upstream version 1.141+dfsg.1
---
Dockerfile | 6 +-
README.md | 26 +-
build.sbt | 4 +-
build.xml | 3 +-
.../analysis/CollectAlignmentSummaryMetrics.java | 2 +-
src/java/picard/analysis/CollectGcBiasMetrics.java | 114 ++-----
.../picard/analysis/CollectInsertSizeMetrics.java | 4 +-
.../picard/analysis/CollectMultipleMetrics.java | 111 +++++--
src/java/picard/analysis/CollectOxoGMetrics.java | 6 +
.../analysis/CollectQualityYieldMetrics.java | 16 +-
src/java/picard/analysis/CollectRnaSeqMetrics.java | 2 +-
src/java/picard/analysis/CollectWgsMetrics.java | 29 +-
.../analysis/CollectWgsMetricsFromQuerySorted.java | 298 +++++++++++++++++
.../CollectWgsMetricsFromSampledSites.java | 62 ++++
.../{directed => }/GcBiasMetricsCollector.java | 97 ++++--
src/java/picard/analysis/GcBiasSummaryMetrics.java | 2 +-
src/java/picard/analysis/GcBiasUtils.java | 122 +++++++
.../analysis/directed/TargetMetricsCollector.java | 34 +-
src/java/picard/cmdline/ClassFinder.java | 20 +-
src/java/picard/cmdline/CommandLineProgram.java | 3 +
.../illumina/IlluminaBasecallsConverter.java | 24 +-
.../picard/illumina/IlluminaBasecallsToFastq.java | 6 +-
.../picard/illumina/IlluminaBasecallsToSam.java | 6 +-
src/java/picard/pedigree/PedFile.java | 10 +-
src/java/picard/sam/AbstractAlignmentMerger.java | 67 +++-
src/java/picard/sam/DownsampleSam.java | 71 ++++-
src/java/picard/sam/DuplicationMetrics.java | 2 +-
src/java/picard/sam/HitsForInsert.java | 38 +--
src/java/picard/sam/MergeBamAlignment.java | 10 +-
src/java/picard/sam/RevertSam.java | 9 +-
src/java/picard/sam/SamAlignmentMerger.java | 36 ++-
.../markduplicates/EstimateLibraryComplexity.java | 170 +++++++++-
.../picard/sam/markduplicates/MarkDuplicates.java | 162 ++++++++--
.../DiskBasedReadEndsForMarkDuplicatesMap.java | 10 +-
.../util/ReadEndsForMarkDuplicates.java | 29 +-
.../util/ReadEndsForMarkDuplicatesCodec.java | 4 +-
.../ReadEndsForMarkDuplicatesWithBarcodes.java | 41 +++
...ReadEndsForMarkDuplicatesWithBarcodesCodec.java | 75 +++++
src/java/picard/util/MathUtil.java | 9 +
.../util/QuerySortedReadPairIteratorUtil.java | 65 ++++
src/java/picard/vcf/GenotypeConcordance.java | 2 +-
.../vcf/GenotypeConcordanceContingencyMetrics.java | 12 +-
src/java/picard/vcf/GenotypeConcordanceCounts.java | 28 +-
src/java/picard/vcf/LiftoverVcf.java | 49 ++-
src/java/picard/vcf/SortVcf.java | 3 +-
src/java/picard/vcf/filter/FilterVcf.java | 19 +-
src/scripts/picard/analysis/insertSizeHistogram.R | 113 ++++---
src/scripts/picard/docker_helper.sh | 2 +-
.../picard/analysis/CollectGcBiasMetricsTest.java | 351 ++++++++++++++-------
.../analysis/CollectInsertSizeMetricsTest.java | 18 ++
.../analysis/CollectMultipleMetricsTest.java | 21 +-
.../picard/analysis/CollectRnaSeqMetricsTest.java | 5 +
.../CollectWgsMetricsFromQuerySortedTest.java | 52 +++
.../CollectWgsMetricsFromSampledSitesTest.java | 98 ++++++
.../directed/CollectTargetedMetricsTest.java | 158 ++++++++++
.../java/picard/cmdline/PicardCommandLineTest.java | 18 ++
.../illumina/CheckIlluminaDirectoryTest.java | 1 +
.../illumina/IlluminaBasecallsToFastqTest.java | 15 +-
.../IlluminaBasecallsToSamAdapterClippingTest.java | 1 +
.../illumina/IlluminaBasecallsToSamTest.java | 2 +
.../illumina/IlluminaLaneMetricsCollectorTest.java | 3 +-
.../illumina/parser/IlluminaDataProviderTest.java | 10 +-
.../java/picard/sam/AddCommentsToBamTest.java | 6 +
src/tests/java/picard/sam/GatherBamFilesTest.java | 2 +
.../java/picard/sam/MergeBamAlignmentTest.java | 72 +++--
src/tests/java/picard/sam/RevertSamTest.java | 2 +
.../java/picard/sam/SamFileConverterTest.java | 1 +
.../sam/markduplicates/MarkDuplicatesTest.java | 102 ++++++
src/tests/java/picard/util/FifoBufferTest.java | 2 +-
.../util/QuerySortedReadPairIteratorUtilTest.java | 93 ++++++
src/tests/java/picard/vcf/LiftoverVcfTest.java | 95 ++++++
src/tests/java/picard/vcf/TestFilterVcf.java | 48 ++-
.../multiple_orientation.sam.insert_size_metrics | 312 ++++++++++++++++++
testdata/picard/metrics/chrMNO.reference.fasta | 65 ++++
testdata/picard/quality/chrM.empty.interval_list | 4 +
testdata/picard/quality/chrM.reference.fasta.fai | 1 +
testdata/picard/quality/chrM.single.interval_list | 3 +
.../picard/sam/CollectGcBiasMetrics/MNOheader.dict | 4 +
.../sam/MergeBamAlignment/contam.aligned.sam | 25 ++
.../sam/MergeBamAlignment/contam.expected.sam | 16 +
.../sam/MergeBamAlignment/contam.unmapped.sam | 12 +
testdata/picard/sam/contiguous.interval_list | 11 +
testdata/picard/sam/forMetrics.sam | 23 ++
testdata/picard/sam/namesorted.test.sam | 33 ++
testdata/picard/sam/onePos.interval_list | 10 +
testdata/picard/vcf/dummy.reference.dict | 2 +
testdata/picard/vcf/dummy.reference.fasta | 10 +
.../vcf/filter/testFilteringNoSeqDictionary.vcf | 65 ++++
testdata/picard/vcf/test.over.chain | 3 +
testdata/picard/vcf/testLiftover.vcf | 4 +
90 files changed, 3234 insertions(+), 578 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index aad8a53..e55979f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,4 +28,8 @@ RUN ant clean all && \
rm -rf lib && \
rm build.xml
-ENTRYPOINT ["./docker_helper.sh"]
\ No newline at end of file
+RUN mkdir /usr/working
+WORKDIR /usr/working
+
+ENTRYPOINT ["/usr/picard/docker_helper.sh"]
+CMD [""]
diff --git a/README.md b/README.md
index 21c60d7..d321239 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,32 @@ Picard is implemented using the HTSJDK Java library[HTSJDK][1], supporting
accessing of common file formats, such as [SAM][2] and [VCF][3], used for high-throughput
sequencing data.
+To clone and build:
+Clone the repo:
+
+ git clone git at github.com:broadinstitute/picard.git
+ cd picard/
+
+Clone htsjdk into a subdirectory:
+
+ ant clone-htsjdk
+Build:
+
+ ant
+
+Enjoy!
+
+ java -jar dist/picard.jar
+
+----
+
+
It's also possible to build a version of Picard that supports reading from
GA4GH API, e.g. Google Genomics:
-* Fetch [gatk-tools-java](https://github.com/gatk-tools-java)
+* Fetch [gatk-tools-java](https://github.com/googlegenomics/gatk-tools-java)
-```git clone https://github.com/gatk-tools-java```
+```git clone https://github.com/googlegenomics/gatk-tools-java```
* Build gatk-tools-java:
@@ -53,4 +73,4 @@ Please see the [Picard Documentation](http://broadinstitute.github.io/picard) fo
[1]: http://github.com/samtools/htsjdk
[2]: http://samtools.sourceforge.net
-[3]: http://vcftools.sourceforge.net/specs.html
\ No newline at end of file
+[3]: http://vcftools.sourceforge.net/specs.html
diff --git a/build.sbt b/build.sbt
index 276d2e2..1d93b1a 100644
--- a/build.sbt
+++ b/build.sbt
@@ -4,7 +4,7 @@ import sbt.Package.ManifestAttributes
name := "picard"
-version := "1.138"
+version := "1.141"
organization := "com.github.broadinstitute"
@@ -15,7 +15,7 @@ javaSource in Test := baseDirectory.value / "src/tests"
unmanagedResourceDirectories in Test := Seq(baseDirectory.value / "src/scripts", baseDirectory.value / "testdata", baseDirectory.value / "src/tests/scripts")
libraryDependencies ++= Seq(
- "com.github.samtools" % "htsjdk" % "1.138",
+ "com.github.samtools" % "htsjdk" % "1.141",
("com.google.cloud.genomics" % "gatk-tools-java" % "1.1" % "picardopt").
exclude("org.mortbay.jetty", "servlet-api"),
"org.testng" % "testng" % "6.8.8" % Test
diff --git a/build.xml b/build.xml
index 7ecd771..4e72f5d 100755
--- a/build.xml
+++ b/build.xml
@@ -53,7 +53,7 @@
<arg value="--pretty=format:%H_%at"/>
</exec>
<property name="repository.revision" value=""/>
- <property name="picard-version" value="1.138"/>
+ <property name="picard-version" value="1.141"/>
<property name="command-line-html-dir" value="${dist}/html"/>
<property name="testng.verbosity" value="2"/>
<property name="test.debug.port" value="5005"/>
@@ -414,6 +414,7 @@
<document-command title="FilterVcf" main-class="picard.vcf.filter.FilterVcf"/>
<document-command title="FixMateInformation" main-class="picard.sam.FixMateInformation"/>
<document-command title="GatherBamFiles" main-class="picard.sam.GatherBamFiles"/>
+ <document-command title="GatherVcfs" main-class="picard.vcf.GatherVcfs"/>
<document-command title="GenotypeConcordance" main-class="picard.vcf.GenotypeConcordance"/>
<document-command title="IlluminaBasecallsToFastq" main-class="picard.illumina.IlluminaBasecallsToFastq"/>
<document-command title="IlluminaBasecallsToSam" main-class="picard.illumina.IlluminaBasecallsToSam"/>
diff --git a/src/java/picard/analysis/CollectAlignmentSummaryMetrics.java b/src/java/picard/analysis/CollectAlignmentSummaryMetrics.java
index 489bdc0..5398e00 100644
--- a/src/java/picard/analysis/CollectAlignmentSummaryMetrics.java
+++ b/src/java/picard/analysis/CollectAlignmentSummaryMetrics.java
@@ -98,7 +98,7 @@ public class CollectAlignmentSummaryMetrics extends SinglePassSamProgram {
);
@Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ")
- private Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
+ public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
@Option(shortName="BS", doc="Whether the SAM or BAM file consists of bisulfite sequenced reads. ")
public boolean IS_BISULFITE_SEQUENCED = false;
diff --git a/src/java/picard/analysis/CollectGcBiasMetrics.java b/src/java/picard/analysis/CollectGcBiasMetrics.java
index e748ecf..7909e2e 100644
--- a/src/java/picard/analysis/CollectGcBiasMetrics.java
+++ b/src/java/picard/analysis/CollectGcBiasMetrics.java
@@ -28,13 +28,8 @@ import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
-import htsjdk.samtools.reference.ReferenceSequenceFile;
-import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.StringUtil;
-import picard.PicardException;
-import picard.analysis.directed.GcBiasMetricsCollector;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
@@ -43,14 +38,12 @@ import picard.util.RExecutor;
import java.io.File;
import java.text.NumberFormat;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.Set;
/**
* Tool to collect information about GC bias in the reads in a given BAM file. Computes
- * the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%
+ * the number of windows (of size specified by SCAN_WINDOW_SIZE) in the genome at each GC%
* and counts the number of read starts in each GC bin. What is output and plotted is
* the "normalized coverage" in each bin - i.e. the number of reads per window normalized
* to the average number of reads per window across the whole genome.
@@ -60,7 +53,7 @@ import java.util.Set;
*/
@CommandLineProgramProperties(
usage = "Tool to collect information about GC bias in the reads in a given BAM file. Computes" +
- " the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%" +
+ " the number of windows (of size specified by SCAN_WINDOW_SIZE) in the genome at each GC%" +
" and counts the number of read starts in each GC bin. What is output and plotted is" +
" the \"normalized coverage\" in each bin - i.e. the number of reads per window normalized" +
" to the average number of reads per window across the whole genome..\n",
@@ -76,13 +69,13 @@ public class CollectGcBiasMetrics extends SinglePassSamProgram {
@Option(shortName = "CHART", doc = "The PDF file to render the chart to.")
public File CHART_OUTPUT;
- @Option(shortName = "S", doc = "The text file to write summary metrics to.", optional = true)
+ @Option(shortName = "S", doc = "The text file to write summary metrics to.")
public File SUMMARY_OUTPUT;
- @Option(doc = "The size of windows on the genome that are used to bin reads.")
- public int WINDOW_SIZE = 100;
+ @Option(shortName = "WINDOW_SIZE", doc = "The size of the scanning windows on the reference genome that are used to bin reads.")
+ public int SCAN_WINDOW_SIZE = 100;
- @Option(doc = "For summary metrics, exclude GC windows that include less than this fraction of the genome.")
+ @Option(shortName = "MGF", doc = "For summary metrics, exclude GC windows that include less than this fraction of the genome.")
public double MINIMUM_GENOME_FRACTION = 0.00001;
@Option(shortName = "BS", doc = "Whether the SAM or BAM file consists of bisulfite sequenced reads.")
@@ -94,16 +87,9 @@ public class CollectGcBiasMetrics extends SinglePassSamProgram {
// Calculates GcBiasMetrics for all METRIC_ACCUMULATION_LEVELs provided
private GcBiasMetricsCollector multiCollector;
- //windowSize is the size of the scanning window that goes over the reference
- private final int windowSize = WINDOW_SIZE;
- final int[] windowsByGc = new int[WINDOWS];
-
- // Histograms to track the number of windows at each GC, and the number of read starts
- // at windows of each GC. Need 101 to get from 0-100.
- private static final int WINDOWS = 101;
-
- //Hash map of gc[] with reference name as key
- private final Map<String, byte[]> gcByRef = new HashMap<String, byte[]>();
+ // Bins for the histograms to track the number of windows at each GC, and the number of read starts
+ // at bins of each GC %. Need 101 to get from 0-100.
+ private static final int BINS = 101;
////////////////////////////////////////////////////////////////////////////
// Stock main method
@@ -113,32 +99,21 @@ public class CollectGcBiasMetrics extends SinglePassSamProgram {
}
/////////////////////////////////////////////////////////////////////////////
- // Setup calculates gc[] for the reference. Must be done at startup to avoid
- // missing reference sequences in the case of small files that may
- // not have reads aligning to every reference sequence
+ // Setup calculates windowsByGc for the entire reference. Must be done at
+ // startup to avoid missing reference contigs in the case of small files
+ // that may not have reads aligning to every reference contig.
/////////////////////////////////////////////////////////////////////////////
@Override
protected void setup(final SAMFileHeader header, final File samFile) {
IOUtil.assertFileIsWritable(CHART_OUTPUT);
-
- if (SUMMARY_OUTPUT != null) IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);
-
+ IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
- final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
- ReferenceSequence ref;
-
- while ((ref = refFile.nextSequence()) != null) {
- final byte[] refBases = ref.getBases();
- final String refName = ref.getName();
- StringUtil.toUpperCase(refBases);
- final int refLength = refBases.length;
- final int lastWindowStart = refLength - windowSize;
- final byte[] gc = calculateAllGcs(refBases, windowsByGc, lastWindowStart);
- gcByRef.put(refName, gc);
- }
+ //Calculate windowsByGc for the reference sequence
+ final int[] windowsByGc = GcBiasUtils.calculateRefWindowsByGc(BINS, REFERENCE_SEQUENCE, SCAN_WINDOW_SIZE);
+
//Delegate actual collection to GcBiasMetricCollector
- multiCollector = new GcBiasMetricsCollector(METRIC_ACCUMULATION_LEVEL, gcByRef, windowsByGc, header.getReadGroups(), windowSize, IS_BISULFITE_SEQUENCED);
+ multiCollector = new GcBiasMetricsCollector(METRIC_ACCUMULATION_LEVEL, windowsByGc, header.getReadGroups(), SCAN_WINDOW_SIZE, IS_BISULFITE_SEQUENCED);
}
////////////////////////////////////////////////////////////////////////////
@@ -176,60 +151,7 @@ public class CollectGcBiasMetrics extends SinglePassSamProgram {
OUTPUT.getAbsolutePath(),
SUMMARY_OUTPUT.getAbsolutePath(),
CHART_OUTPUT.getAbsolutePath(),
- String.valueOf(WINDOW_SIZE));
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Calculcate all the GC values for all windows
- /////////////////////////////////////////////////////////////////////////////
- private byte[] calculateAllGcs(final byte[] refBases, final int[] windowsByGc, final int lastWindowStart) {
- final CalculateGcState state = new CalculateGcState();
- final int refLength = refBases.length;
- final byte[] gc = new byte[refLength + 1];
- for (int i = 1; i < lastWindowStart; ++i) {
- final int windowEnd = i + windowSize;
- final int windowGc = calculateGc(refBases, i, windowEnd, state);
- gc[i] = (byte) windowGc;
- if (windowGc != -1) windowsByGc[windowGc]++;
- }
- return gc;
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Calculates GC as a number from 0 to 100 in the specified window.
- // If the window includes more than five no-calls then -1 is returned.
- /////////////////////////////////////////////////////////////////////////////
- private int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
- if (state.init) {
- state.init = false;
- state.gcCount = 0;
- state.nCount = 0;
- for (int i = startIndex; i < endIndex; ++i) {
- final byte base = bases[i];
- if (base == 'G' || base == 'C') ++state.gcCount;
- else if (base == 'N') ++state.nCount;
- }
- } else {
- final byte newBase = bases[endIndex - 1];
- if (newBase == 'G' || newBase == 'C') ++state.gcCount;
- else if (newBase == 'N') ++state.nCount;
-
- if (state.priorBase == 'G' || state.priorBase == 'C') --state.gcCount;
- else if (state.priorBase == 'N') --state.nCount;
- }
- state.priorBase = bases[startIndex];
- if (state.nCount > 4) return -1;
- else return (state.gcCount * 100) / (endIndex - startIndex);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Keeps track of current GC calculation state
- /////////////////////////////////////////////////////////////////////////////
- class CalculateGcState {
- boolean init = true;
- int nCount;
- int gcCount;
- byte priorBase;
+ String.valueOf(SCAN_WINDOW_SIZE));
}
}
diff --git a/src/java/picard/analysis/CollectInsertSizeMetrics.java b/src/java/picard/analysis/CollectInsertSizeMetrics.java
index 141c5fd..420cccd 100644
--- a/src/java/picard/analysis/CollectInsertSizeMetrics.java
+++ b/src/java/picard/analysis/CollectInsertSizeMetrics.java
@@ -56,7 +56,7 @@ import java.util.Set;
)
public class CollectInsertSizeMetrics extends SinglePassSamProgram {
private static final Log log = Log.getInstance(CollectInsertSizeMetrics.class);
- private static final String Histogram_R_SCRIPT = "picard/analysis/insertSizeHistogram.R";
+ protected static final String Histogram_R_SCRIPT = "picard/analysis/insertSizeHistogram.R";
@Option(shortName="H", doc="File to write insert size Histogram chart to.")
public File Histogram_FILE;
@@ -75,7 +75,7 @@ public class CollectInsertSizeMetrics extends SinglePassSamProgram {
public float MINIMUM_PCT = 0.05f;
@Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ")
- private Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
+ public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
// Calculates InsertSizeMetrics for all METRIC_ACCUMULATION_LEVELs provided
private InsertSizeMetricsCollector multiCollector;
diff --git a/src/java/picard/analysis/CollectMultipleMetrics.java b/src/java/picard/analysis/CollectMultipleMetrics.java
index 1603822..79da4b2 100644
--- a/src/java/picard/analysis/CollectMultipleMetrics.java
+++ b/src/java/picard/analysis/CollectMultipleMetrics.java
@@ -1,7 +1,9 @@
package picard.analysis;
import htsjdk.samtools.util.CollectionUtil;
+import htsjdk.samtools.util.Log;
import picard.PicardException;
+import picard.analysis.artifacts.CollectSequencingArtifactMetrics;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -9,9 +11,7 @@ import picard.cmdline.programgroups.Metrics;
import picard.cmdline.StandardOptionDefinitions;
import java.io.File;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
+import java.util.*;
/**
* Class that is designed to instantiate and execute multiple metrics programs that extend
@@ -23,7 +23,7 @@ import java.util.List;
@CommandLineProgramProperties(
usage = "Takes an input BAM and reference sequence and runs one or more Picard " +
"metrics modules at the same time to cut down on I/O. Currently all programs are run with " +
- "default options and fixed output extesions, but this may become more flexible in future.",
+ "default options and fixed output extensions, but this may become more flexible in future.",
usageShort = "A \"meta-metrics\" calculating program that produces multiple metrics for the provided SAM/BAM",
programGroup = Metrics.class
)
@@ -34,25 +34,31 @@ public class CollectMultipleMetrics extends CommandLineProgram {
* Includes a method for determining whether or not a Program explicitly needs a reference sequence (i.e. cannot be null)
*/
public static interface ProgramInterface {
- SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference);
+ SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference,
+ final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals);
public boolean needsReferenceSequence();
+ public boolean supportsMetricAccumulationLevel();
}
-
+
public static enum Program implements ProgramInterface {
CollectAlignmentSummaryMetrics {
@Override
public boolean needsReferenceSequence() {
return false;
}
-
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return true;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectAlignmentSummaryMetrics program = new CollectAlignmentSummaryMetrics();
program.OUTPUT = new File(outbase + ".alignment_summary_metrics");
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway. These are set here to make sure that in case of a the derived class
// overrides
+ program.METRIC_ACCUMULATION_LEVEL = metricAccumulationLevel;
program.INPUT = input;
program.REFERENCE_SEQUENCE = reference;
@@ -64,15 +70,19 @@ public class CollectMultipleMetrics extends CommandLineProgram {
public boolean needsReferenceSequence() {
return false;
}
-
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return true;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectInsertSizeMetrics program = new CollectInsertSizeMetrics();
program.OUTPUT = new File(outbase + ".insert_size_metrics");
program.Histogram_FILE = new File(outbase + ".insert_size_histogram.pdf");
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway. These are set here to make sure that in case of a the derived class
// overrides
+ program.METRIC_ACCUMULATION_LEVEL = metricAccumulationLevel;
program.INPUT = input;
program.REFERENCE_SEQUENCE = reference;
@@ -85,7 +95,11 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return false;
}
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return false;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final QualityScoreDistribution program = new QualityScoreDistribution();
program.OUTPUT = new File(outbase + ".quality_distribution_metrics");
program.CHART_OUTPUT = new File(outbase + ".quality_distribution.pdf");
@@ -104,7 +118,11 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return false;
}
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return false;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final MeanQualityByCycle program = new MeanQualityByCycle();
program.OUTPUT = new File(outbase + ".quality_by_cycle_metrics");
program.CHART_OUTPUT = new File(outbase + ".quality_by_cycle.pdf");
@@ -123,7 +141,11 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return false;
}
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return false;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectBaseDistributionByCycle program = new CollectBaseDistributionByCycle();
program.OUTPUT = new File(outbase + ".base_distribution_by_cycle_metrics");
program.CHART_OUTPUT = new File(outbase + ".base_distribution_by_cycle.pdf");
@@ -142,16 +164,20 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return true;
}
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return true;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectGcBiasMetrics program = new CollectGcBiasMetrics();
program.OUTPUT = new File(outbase + ".gc_bias.detail_metrics");
program.SUMMARY_OUTPUT = new File(outbase + ".gc_bias.summary_metrics");
program.CHART_OUTPUT = new File(outbase + ".gc_bias.pdf");
program.INPUT = input;
- program.METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS,
- MetricAccumulationLevel.LIBRARY);
- program.WINDOW_SIZE = 100;
+ // previously MetricAccumulationLevel.ALL_READS, MetricAccumulationLevel.LIBRARY
+ program.METRIC_ACCUMULATION_LEVEL = metricAccumulationLevel;
+ program.SCAN_WINDOW_SIZE = 100;
program.MINIMUM_GENOME_FRACTION = 1.0E-5;
program.IS_BISULFITE_SEQUENCED = false;
program.ASSUME_SORTED = false;
@@ -168,18 +194,44 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return true;
}
@Override
- public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference) {
+ public boolean supportsMetricAccumulationLevel() {
+ return true;
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectRnaSeqMetrics program = new CollectRnaSeqMetrics();
program.OUTPUT = new File(outbase + ".rna_metrics");
program.CHART_OUTPUT = new File(outbase + ".rna_coverage.pdf");
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway. These are set here to make sure that in case of a the derived class
// overrides
+ program.METRIC_ACCUMULATION_LEVEL = metricAccumulationLevel;
program.INPUT = input;
program.REFERENCE_SEQUENCE = reference;
return program;
}
+ },
+ CollectSequencingArtifactMetrics {
+ @Override
+ public boolean needsReferenceSequence() {
+ return true;
+ }
+ @Override
+ public boolean supportsMetricAccumulationLevel() { return false; }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
+ final CollectSequencingArtifactMetrics program = new CollectSequencingArtifactMetrics();
+ program.OUTPUT = new File(outbase);
+ program.DB_SNP = dbSnp;
+ program.INTERVALS = intervals;
+ // Generally programs should not be accessing these directly but it might make things smoother
+ // to just set them anyway. These are set here to make sure that in case of a the derived class
+ // overrides
+ program.INPUT = input;
+ program.REFERENCE_SEQUENCE = reference;
+ return program;
+ }
}
}
@@ -197,17 +249,31 @@ public class CollectMultipleMetrics extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Base name of output files.")
public String OUTPUT;
+ // create the default accumulation level as a variable. We'll use this to init the command-line arg and for validation later.
+ private final Set<MetricAccumulationLevel> accumLevelDefault = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
+
+ @Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics.")
+ public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = new HashSet<MetricAccumulationLevel>(accumLevelDefault);
+
@Option(doc = "List of metrics programs to apply during the pass through the SAM file.")
public List<Program> PROGRAM = CollectionUtil.makeList(Program.CollectAlignmentSummaryMetrics, Program.CollectBaseDistributionByCycle,
Program.CollectInsertSizeMetrics, Program.MeanQualityByCycle, Program.QualityScoreDistribution);
+ @Option(doc = "An optional list of intervals to restrict analysis to.", optional = true)
+ public File INTERVALS;
+
+ @Option(doc = "VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis.", optional = true)
+ public File DB_SNP;
+
/**
* Contents of PROGRAM list is transferred to this list during command-line validation, so that an outside
* developer can invoke this class programmatically and provide alternative Programs to run by calling
* setProgramsToRun().
*/
private List<ProgramInterface> programsToRun;
-
+
+ private static final Log log = Log.getInstance(CollectMultipleMetrics.class);
+
// Stock main method
public static void main(final String[] args) {
new CollectMultipleMetrics().instanceMainWithExit(args);
@@ -219,6 +285,7 @@ public class CollectMultipleMetrics extends CommandLineProgram {
return new String[]{"No programs specified with PROGRAM"};
}
programsToRun = new ArrayList<ProgramInterface>(PROGRAM);
+
return super.customCommandLineValidation();
}
@@ -241,7 +308,11 @@ public class CollectMultipleMetrics extends CommandLineProgram {
if (program.needsReferenceSequence() && REFERENCE_SEQUENCE==null) {
throw new PicardException("The " + program.toString() + " program needs a Reference Sequence, please set REFERENCE_SEQUENCE in the command line");
}
- final SinglePassSamProgram instance = program.makeInstance(OUTPUT, INPUT, REFERENCE_SEQUENCE);
+ if (!accumLevelDefault.equals(METRIC_ACCUMULATION_LEVEL) && !program.supportsMetricAccumulationLevel()) {
+ log.warn("The " + program.toString() + " program does not support a metric accumulation level, but METRIC_ACCUMULATION_LEVEL" +
+ " was overridden in the command line. " + program.toString() + " will be run against the entire input.");
+ }
+ final SinglePassSamProgram instance = program.makeInstance(OUTPUT, INPUT, REFERENCE_SEQUENCE, METRIC_ACCUMULATION_LEVEL, DB_SNP, INTERVALS);
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway
diff --git a/src/java/picard/analysis/CollectOxoGMetrics.java b/src/java/picard/analysis/CollectOxoGMetrics.java
index 5392ba4..3055ffc 100644
--- a/src/java/picard/analysis/CollectOxoGMetrics.java
+++ b/src/java/picard/analysis/CollectOxoGMetrics.java
@@ -19,6 +19,7 @@ import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SamLocusIterator;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
+import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -202,6 +203,11 @@ public class CollectOxoGMetrics extends CommandLineProgram {
final Set<String> samples = new HashSet<String>();
final Set<String> libraries = new HashSet<String>();
+
+ if (in.getFileHeader().getReadGroups().isEmpty()) {
+ throw new PicardException("This analysis requires a read group entry in the alignment file header");
+ }
+
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
diff --git a/src/java/picard/analysis/CollectQualityYieldMetrics.java b/src/java/picard/analysis/CollectQualityYieldMetrics.java
index 4753e18..7c3cb8c 100644
--- a/src/java/picard/analysis/CollectQualityYieldMetrics.java
+++ b/src/java/picard/analysis/CollectQualityYieldMetrics.java
@@ -47,7 +47,8 @@ import java.io.File;
*/
@CommandLineProgramProperties(
usage = "Collects quality yield metrics, a set of metrics that quantify the quality and yield of sequence data from a " +
- "SAM/BAM input file.",
+ "SAM/BAM input file. Note that the default behaviour of this program changed as of November 6th 2015 to no longer " +
+ "include secondary and supplemental alignments in the computation.",
usageShort = "Collects a set of metrics that quantify the quality and yield of sequence data from the provided SAM/BAM",
programGroup = Metrics.class
)
@@ -66,6 +67,14 @@ public class CollectQualityYieldMetrics extends CommandLineProgram {
"as inputs instead of the quality scores in the QUAL field.")
public boolean USE_ORIGINAL_QUALITIES = true;
+ @Option(doc="If true, include bases from secondary alignments in metrics. Setting to true may cause double-counting " +
+ "of bases if there are secondary alignments in the input file.")
+ public boolean INCLUDE_SECONDARY_ALIGNMENTS = false;
+
+ @Option(doc="If true, include bases from supplemental alignments in metrics. Setting to true may cause double-counting " +
+ "of bases if there are supplemental alignments in the input file.")
+ public boolean INCLUDE_SUPPLEMENTAL_ALIGNMENTS = false;
+
/** Stock main method for a command line program. */
public static void main(final String[] argv) {
new CollectQualityYieldMetrics().instanceMainWithExit(argv);
@@ -92,6 +101,9 @@ public class CollectQualityYieldMetrics extends CommandLineProgram {
final QualityYieldMetrics metrics = new QualityYieldMetrics();
for (final SAMRecord rec : sam) {
+ if (!INCLUDE_SECONDARY_ALIGNMENTS && rec.getNotPrimaryAlignmentFlag()) continue;
+ if (!INCLUDE_SUPPLEMENTAL_ALIGNMENTS && rec.getSupplementaryAlignmentFlag()) continue;
+
metrics.TOTAL_READS++;
final int length = rec.getReadLength();
@@ -173,7 +185,5 @@ public class CollectQualityYieldMetrics extends CommandLineProgram {
/** The sum of quality scores of all bases divided by 20 */
public long PF_Q20_EQUIVALENT_YIELD = 0;
-
}
-
}
diff --git a/src/java/picard/analysis/CollectRnaSeqMetrics.java b/src/java/picard/analysis/CollectRnaSeqMetrics.java
index b205a28..996105a 100644
--- a/src/java/picard/analysis/CollectRnaSeqMetrics.java
+++ b/src/java/picard/analysis/CollectRnaSeqMetrics.java
@@ -83,7 +83,7 @@ public class CollectRnaSeqMetrics extends SinglePassSamProgram {
public double RRNA_FRAGMENT_PERCENTAGE = 0.8;
@Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ")
- private final Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
+ public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
private RnaSeqMetricsCollector collector;
diff --git a/src/java/picard/analysis/CollectWgsMetrics.java b/src/java/picard/analysis/CollectWgsMetrics.java
index bfb4a00..b0bb642 100644
--- a/src/java/picard/analysis/CollectWgsMetrics.java
+++ b/src/java/picard/analysis/CollectWgsMetrics.java
@@ -10,11 +10,7 @@ import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
-import htsjdk.samtools.util.Histogram;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.ProgressLogger;
-import htsjdk.samtools.util.SamLocusIterator;
+import htsjdk.samtools.util.*;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -64,6 +60,9 @@ public class CollectWgsMetrics extends CommandLineProgram {
@Option(doc = "Determines whether to include the base quality histogram in the metrics file.")
public boolean INCLUDE_BQ_HISTOGRAM = false;
+ @Option(doc="If true, count unpaired reads, and paired reads with one end unmapped")
+ public boolean COUNT_UNPAIRED = false;
+
private final Log log = Log.getInstance(CollectWgsMetrics.class);
/** Metrics for evaluating the performance of whole genome sequencing experiments. */
@@ -136,15 +135,17 @@ public class CollectWgsMetrics extends CommandLineProgram {
final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
+ final SamLocusIterator iterator = getLocusIterator(in);
- final SamLocusIterator iterator = new SamLocusIterator(in);
final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
final CountingFilter dupeFilter = new CountingDuplicateFilter();
final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
final CountingPairedFilter pairFilter = new CountingPairedFilter();
filters.add(mapqFilter);
filters.add(dupeFilter);
- filters.add(pairFilter);
+ if (!COUNT_UNPAIRED) {
+ filters.add(pairFilter);
+ }
filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
iterator.setSamFilters(filters);
iterator.setEmitUncoveredLoci(true);
@@ -213,9 +214,9 @@ public class CollectWgsMetrics extends CommandLineProgram {
metrics.MEDIAN_COVERAGE = histo.getMedian();
metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();
- final long basesExcludedByDupes = dupeFilter.getFilteredBases();
- final long basesExcludedByMapq = mapqFilter.getFilteredBases();
- final long basesExcludedByPairing = pairFilter.getFilteredBases();
+ final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
+ final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
+ final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
final double total = histo.getSum();
final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
@@ -254,6 +255,14 @@ public class CollectWgsMetrics extends CommandLineProgram {
protected WgsMetrics generateWgsMetrics() {
return new WgsMetrics();
}
+
+ protected long getBasesExcludedBy(final CountingFilter filter) {
+ return filter.getFilteredBases();
+ }
+
+ protected SamLocusIterator getLocusIterator(final SamReader in) {
+ return new SamLocusIterator(in);
+ }
}
/**
diff --git a/src/java/picard/analysis/CollectWgsMetricsFromQuerySorted.java b/src/java/picard/analysis/CollectWgsMetricsFromQuerySorted.java
new file mode 100644
index 0000000..b28fa50
--- /dev/null
+++ b/src/java/picard/analysis/CollectWgsMetricsFromQuerySorted.java
@@ -0,0 +1,298 @@
+package picard.analysis;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.*;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.CommandLineProgramProperties;
+import picard.cmdline.Option;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.Metrics;
+import picard.util.QuerySortedReadPairIteratorUtil;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Computes a number of metrics that are useful for evaluating coverage and performance of sequencing experiments.
+ *
+ * @author ebanks
+ */
+@CommandLineProgramProperties(
+ usage = "Computes a number of metrics that are useful for evaluating coverage and performance of " +
+ "sequencing experiments.",
+ usageShort = "Writes sequencing-related metrics for a SAM or BAM file",
+ programGroup = Metrics.class
+)
+public class CollectWgsMetricsFromQuerySorted extends CommandLineProgram {
+
+ @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
+ public File INPUT;
+
+ @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output metrics file.")
+ public File OUTPUT;
+
+ @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
+ public int MINIMUM_MAPPING_QUALITY = 20;
+
+ @Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true)
+ public int MINIMUM_BASE_QUALITY = 20;
+
+ private final Log log = Log.getInstance(CollectWgsMetricsFromQuerySorted.class);
+
+ /** Metrics for evaluating the performance of whole genome sequencing experiments. */
+ public static class QuerySortedSeqMetrics extends CollectWgsMetrics.WgsMetrics {
+ /** The total number of bases, before any filters are applied. */
+ public long TOTAL_BASES = 0;
+ /** The number of usable bases, after all filters are applied. */
+ public long TOTAL_USABLE_BASES = 0;
+
+ /** The number of read pairs, before all filters are applied. */
+ public long TOTAL_READ_PAIRS = 0;
+ /** The number of duplicate read pairs, before all filters are applied. */
+ public long TOTAL_DUPE_PAIRS = 0;
+
+ /** The number of read pairs with standard orientations from which to calculate mean insert size, after filters are applied. */
+ public long TOTAL_ORIENTED_PAIRS = 0;
+ /** The mean insert size, after filters are applied. */
+ public double MEAN_INSERT_SIZE = 0.0;
+ }
+
+ public static void main(final String[] args) {
+ new CollectWgsMetricsFromQuerySorted().instanceMainWithExit(args);
+ }
+
+ @Override
+ protected int doWork() {
+ IOUtil.assertFileIsReadable(INPUT);
+ IOUtil.assertFileIsWritable(OUTPUT);
+
+ // progress tracker
+ final ProgressLogger progress = new ProgressLogger(log, 50000000, "Processed", "read pairs");
+
+ // the SAM reader
+ final SamReader reader = SamReaderFactory.makeDefault().open(INPUT);
+ final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(reader.iterator());
+
+ // the metrics to keep track of
+ final QuerySortedSeqMetrics metrics = new QuerySortedSeqMetrics();
+ long basesExcludedByDupes = 0;
+ long basesExcludedByMapq = 0;
+ long basesExcludedByPairing = 0;
+ long basesExcludedByBaseq = 0;
+ long basesExcludedByOverlap = 0;
+ double insertSizeSum = 0.0;
+
+ // Loop through all the loci by read pairs
+ QuerySortedReadPairIteratorUtil.ReadPair pairToAnalyze = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ while (pairToAnalyze != null) {
+
+ final boolean isProperPair = (pairToAnalyze.read2 != null);
+
+ // how many bases do we have?
+ final int read1bases = pairToAnalyze.read1.getReadLength();
+ final int read2bases = isProperPair ? pairToAnalyze.read2.getReadLength() : 0;
+ final int totalReadBases = read1bases + read2bases;
+
+ // now compute metrics...
+ metrics.TOTAL_BASES += totalReadBases;
+ if (isProperPair) metrics.TOTAL_READ_PAIRS++;
+
+ if (!isProperPair || pairToAnalyze.read1.getMateUnmappedFlag() || pairToAnalyze.read2.getMateUnmappedFlag()) {
+ basesExcludedByPairing += totalReadBases;
+ } else if (pairToAnalyze.read1.getDuplicateReadFlag()) {
+ metrics.TOTAL_DUPE_PAIRS++;
+ basesExcludedByDupes += totalReadBases;
+ } else {
+
+ // determine the bad bases from the reads
+ final BaseExclusionHelper read1exclusions = determineBaseExclusions(pairToAnalyze.read1);
+ final BaseExclusionHelper read2exclusions = determineBaseExclusions(pairToAnalyze.read2);
+ basesExcludedByMapq += read1exclusions.basesExcludedByMapq + read2exclusions.basesExcludedByMapq;
+ basesExcludedByBaseq += read1exclusions.lowBQcount + read2exclusions.lowBQcount;
+
+ // keep track of the total usable bases
+ int usableBaseCount = totalReadBases;
+ usableBaseCount -= (read1exclusions.basesExcludedByMapq + read1exclusions.lowBQcount);
+ usableBaseCount -= (read2exclusions.basesExcludedByMapq + read2exclusions.lowBQcount);
+
+ // subtract out bad bases from overlaps between the reads, but only if both reads pass mapping quality thresholds
+ if (read1exclusions.basesExcludedByMapq == 0 && read2exclusions.basesExcludedByMapq == 0) {
+ final int overlapCount = getOverlappingBaseCount(read1exclusions, read2exclusions);
+ basesExcludedByOverlap += overlapCount;
+ usableBaseCount -= overlapCount;
+ }
+
+ metrics.TOTAL_USABLE_BASES += usableBaseCount;
+
+ final int insertSize = Math.abs(pairToAnalyze.read1.getInferredInsertSize());
+ if (insertSize > 0 && pairToAnalyze.read1.getProperPairFlag()) {
+ metrics.TOTAL_ORIENTED_PAIRS++;
+ insertSizeSum += insertSize;
+ }
+ }
+
+ // record progress
+ progress.record(pairToAnalyze.read1);
+
+ // iterate
+ pairToAnalyze = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ }
+
+ // finalize metrics
+ setUnusedMetrics(metrics);
+ metrics.GENOME_TERRITORY = reader.getFileHeader().getSequenceDictionary().getReferenceLength();
+ metrics.MEAN_COVERAGE = metrics.TOTAL_USABLE_BASES / (double)metrics.GENOME_TERRITORY;
+ metrics.PCT_EXC_DUPE = basesExcludedByDupes / (double)metrics.TOTAL_BASES;
+ metrics.PCT_EXC_MAPQ = basesExcludedByMapq / (double)metrics.TOTAL_BASES;
+ metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / (double)metrics.TOTAL_BASES;
+ metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / (double)metrics.TOTAL_BASES;
+ metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / (double)metrics.TOTAL_BASES;
+ final double totalExcludedBases = metrics.TOTAL_BASES - metrics.TOTAL_USABLE_BASES;
+ metrics.PCT_EXC_TOTAL = totalExcludedBases / metrics.TOTAL_BASES;
+ metrics.MEAN_INSERT_SIZE = insertSizeSum / metrics.TOTAL_ORIENTED_PAIRS;
+
+ final MetricsFile<QuerySortedSeqMetrics, Integer> out = getMetricsFile();
+ out.addMetric(metrics);
+ out.write(OUTPUT);
+
+ return 0;
+ }
+
+ /**
+ * Get the count of low quality and/or softclip bases in the given read
+ *
+ * @param exclusions the helper object
+ * @return non-negative int
+ */
+ private int getLowQualityOrSoftclipBaseCount(final BaseExclusionHelper exclusions) {
+ final byte[] quals = exclusions.read.getBaseQualities();
+
+ int badCount = exclusions.firstUnclippedBaseIndex + (quals.length - exclusions.firstTrailingClippedBaseIndex);
+ for (int i = exclusions.firstUnclippedBaseIndex; i < exclusions.firstTrailingClippedBaseIndex; i++) {
+ if (quals[i] < MINIMUM_BASE_QUALITY)
+ badCount++;
+ }
+ return badCount;
+ }
+
+ /**
+ * set the values of the unused metrics to -1
+ *
+ * @param metrics the metrics object
+ */
+ private void setUnusedMetrics(final QuerySortedSeqMetrics metrics) {
+ metrics.SD_COVERAGE = -1;
+ metrics.MEDIAN_COVERAGE = -1;
+ metrics.MAD_COVERAGE = -1;
+ metrics.PCT_5X = -1;
+ metrics.PCT_10X = -1;
+ metrics.PCT_15X = -1;
+ metrics.PCT_20X = -1;
+ metrics.PCT_25X = -1;
+ metrics.PCT_30X = -1;
+ metrics.PCT_40X = -1;
+ metrics.PCT_50X = -1;
+ metrics.PCT_60X = -1;
+ metrics.PCT_70X = -1;
+ metrics.PCT_80X = -1;
+ metrics.PCT_90X = -1;
+ metrics.PCT_100X = -1;
+ metrics.PCT_EXC_CAPPED = -1;
+ }
+
+ /**
+ * Get the count of overlapping bases for the given reads
+ *
+ * @param read1exclusions the 1st read exclusions
+ * @param read2exclusions the 2nd read exclusions
+ * @return non-negative int
+ */
+ private int getOverlappingBaseCount(final BaseExclusionHelper read1exclusions, final BaseExclusionHelper read2exclusions) {
+ // make life easy by ensuring that reads come in coordinate order
+ if ( read2exclusions.read.getAlignmentStart() < read1exclusions.read.getAlignmentStart() ) {
+ return getOverlappingBaseCount(read2exclusions, read1exclusions);
+ }
+
+ // must be overlapping
+ if ( read1exclusions.read.getAlignmentEnd() < read2exclusions.read.getAlignmentStart() ||
+ !read1exclusions.read.getReferenceIndex().equals(read2exclusions.read.getReferenceIndex()) )
+ return 0;
+
+ final byte[] read1quals = read1exclusions.read.getBaseQualities();
+ final byte[] read2quals = read2exclusions.read.getBaseQualities();
+ final int indexOfOverlapInFirstRead = read1exclusions.read.getReadPositionAtReferencePosition(read2exclusions.read.getAlignmentStart(), true) - 1;
+ final int maxPossibleOverlap = read1exclusions.firstTrailingClippedBaseIndex - indexOfOverlapInFirstRead;
+ // the overlap cannot actually be larger than the usable bases in read2
+ final int actualOverlap = Math.min(maxPossibleOverlap, read2exclusions.firstTrailingClippedBaseIndex - read2exclusions.firstUnclippedBaseIndex);
+ int numHighQualityOverlappingBases = 0;
+
+ for (int i = 0; i < actualOverlap; i++) {
+ // we count back from the end of the aligned bases (i.e. not included soft-clips) in read1 and from the front of read2
+ final int posInRead1 = read1exclusions.firstTrailingClippedBaseIndex - actualOverlap + i;
+ final int posInRead2 = read2exclusions.firstUnclippedBaseIndex + i;
+
+ // we only want to count it if they are both high quality (i.e. not already counted among bad bases)
+ if (read1quals[posInRead1] >= MINIMUM_BASE_QUALITY && read2quals[posInRead2] >= MINIMUM_BASE_QUALITY) {
+ numHighQualityOverlappingBases++;
+ }
+ }
+
+ return numHighQualityOverlappingBases;
+ }
+
+ /**
+ * Determine how many bases are excluded because of low mapping or base quality.
+ *
+ * @param read the read
+ * @return non-null object
+ */
+ private BaseExclusionHelper determineBaseExclusions(final SAMRecord read) {
+ final BaseExclusionHelper exclusions = new BaseExclusionHelper(read);
+
+ if (read.getMappingQuality() < MINIMUM_MAPPING_QUALITY) {
+ exclusions.basesExcludedByMapq = read.getReadLength();
+ } else {
+ exclusions.lowBQcount = getLowQualityOrSoftclipBaseCount(exclusions);
+ }
+
+ return exclusions;
+ }
+
+ private static class BaseExclusionHelper {
+ public SAMRecord read;
+ public int firstUnclippedBaseIndex;
+ public int firstTrailingClippedBaseIndex;
+ public int basesExcludedByMapq = 0;
+ public int lowBQcount = 0;
+
+ public BaseExclusionHelper(final SAMRecord read) {
+ this.read = read;
+
+ final List<CigarElement> cigarElements = read.getCigar().getCigarElements();
+ firstUnclippedBaseIndex = 0;
+ for (final CigarElement element : cigarElements) {
+ final CigarOperator op = element.getOperator();
+
+ if (op == CigarOperator.SOFT_CLIP) {
+ firstUnclippedBaseIndex = element.getLength();
+ } else if (op != CigarOperator.HARD_CLIP) {
+ break;
+ }
+ }
+
+ firstTrailingClippedBaseIndex = read.getReadLength();
+ for (int i = cigarElements.size() - 1; i >= 0; --i) {
+ final CigarElement element = cigarElements.get(i);
+ final CigarOperator op = element.getOperator();
+
+ if (op == CigarOperator.SOFT_CLIP) {
+ firstTrailingClippedBaseIndex -= element.getLength();
+ } else if (op != CigarOperator.HARD_CLIP) {
+ break;
+ }
+ }
+ }
+ }
+}
+
diff --git a/src/java/picard/analysis/CollectWgsMetricsFromSampledSites.java b/src/java/picard/analysis/CollectWgsMetricsFromSampledSites.java
new file mode 100644
index 0000000..d2fdec3
--- /dev/null
+++ b/src/java/picard/analysis/CollectWgsMetricsFromSampledSites.java
@@ -0,0 +1,62 @@
+package picard.analysis;
+
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.util.*;
+import picard.cmdline.CommandLineProgramProperties;
+import picard.cmdline.Option;
+import picard.cmdline.programgroups.Metrics;
+
+import java.io.File;
+
+/**
+ * Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments,
+ * but only at a set of sampled positions.
+ * It is important that the sampled positions be chosen so that they are spread out at least further than a read's length apart;
+ * otherwise, you run the risk of double-counting reads in the metrics.
+ *
+ * @author ebanks
+ */
+@CommandLineProgramProperties(
+ usage = "Computes a number of metrics that are useful for evaluating coverage and performance of " +
+ "whole genome sequencing experiments, but only at a set of sampled positions. " +
+ "It is important that the sampled positions be chosen so that they are spread out " +
+ "at least further than a read's length apart; otherwise, you run the risk of double-counting " +
+ "reads in the metrics.",
+ usageShort = "Writes whole genome sequencing-related metrics for a SAM or BAM file",
+ programGroup = Metrics.class
+)
+public class CollectWgsMetricsFromSampledSites extends CollectWgsMetrics {
+
+ @Option(shortName = "INTERVALS", doc = "An interval list file that contains the locations of the positions to assess.", optional = false)
+ public File INTERVALS = null;
+
+ public static void main(final String[] args) {
+ new CollectWgsMetricsFromSampledSites().instanceMainWithExit(args);
+ }
+
+ @Override
+ protected SamLocusIterator getLocusIterator(final SamReader in) {
+ IOUtil.assertFileIsReadable(INTERVALS);
+ return new SamLocusIterator(in, IntervalList.fromFile(INTERVALS));
+ }
+
+ /**
+ * By design we want to count just those bases at the positions we care about, not across the entire read.
+ * Therefore, we call filter.getFilteredRecords() so that only the bases in the pileup at a given position
+ * are included in the calculations (with filter.getFilteredBases() we would be including other bases in
+ * the read too).
+ */
+ @Override
+ protected long getBasesExcludedBy(final CountingFilter filter) {
+ return filter.getFilteredRecords();
+ }
+
+ // rename the class so that in the metric file it is annotated differently.
+ public static class SampledWgsMetrics extends WgsMetrics {}
+
+ @Override
+ protected WgsMetrics generateWgsMetrics() {
+ return new SampledWgsMetrics();
+ }
+}
+
diff --git a/src/java/picard/analysis/directed/GcBiasMetricsCollector.java b/src/java/picard/analysis/GcBiasMetricsCollector.java
similarity index 79%
rename from src/java/picard/analysis/directed/GcBiasMetricsCollector.java
rename to src/java/picard/analysis/GcBiasMetricsCollector.java
index c8cb08e..614a131 100644
--- a/src/java/picard/analysis/directed/GcBiasMetricsCollector.java
+++ b/src/java/picard/analysis/GcBiasMetricsCollector.java
@@ -1,4 +1,28 @@
-package picard.analysis.directed;
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.analysis;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
@@ -6,10 +30,8 @@ import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.QualityUtil;
import htsjdk.samtools.util.SequenceUtil;
-import picard.analysis.GcBiasDetailMetrics;
-import picard.analysis.GcBiasSummaryMetrics;
+import htsjdk.samtools.util.StringUtil;
import picard.metrics.GcBiasMetrics;
-import picard.analysis.MetricAccumulationLevel;
import picard.metrics.MultiLevelCollector;
import picard.metrics.PerUnitMetricCollector;
@@ -20,21 +42,25 @@ import java.util.Set;
import java.util.HashMap;
/** Calculates GC Bias Metrics on multiple levels
- * Created by kbergin on 3/23/15.
+ * Created by kbergin on 3/23/15.
*/
public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, Integer, GcBiasCollectorArgs> {
// Histograms to track the number of windows at each GC, and the number of read starts
// at windows of each GC. Need 101 to get from 0-100.
- private final int windowSize;
+ private final int scanWindowSize;
private final boolean bisulfite;
- private final Map<String, byte[]> gcByRef;
- private int[] windowsByGc = new int[WINDOWS];
- private static final int WINDOWS = 101;
+ private int[] windowsByGc = new int[BINS];
+ private static final int BINS = 101;
- public GcBiasMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final Map<String, byte[]> gcByRef, final int[] windowsByGc, final List<SAMReadGroupRecord> samRgRecords, final int windowSize, final boolean bisulfite) {
- this.windowSize = windowSize;
+ //will hold the relevant gc information per contig
+ private byte [] gc = null;
+ private int referenceIndex = -1;
+ private byte [] refBases = null;
+
+ public GcBiasMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final int[] windowsByGc,
+ final List<SAMReadGroupRecord> samRgRecords, final int scanWindowSize, final boolean bisulfite) {
+ this.scanWindowSize = scanWindowSize;
this.bisulfite = bisulfite;
- this.gcByRef = gcByRef;
this.windowsByGc = windowsByGc;
setup(accumulationLevels, samRgRecords);
}
@@ -100,10 +126,16 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
final SAMRecord rec = args.getRec();
final String type;
if (!rec.getReadUnmappedFlag()) {
- final ReferenceSequence ref = args.getRef();
- final byte[] refBases = ref.getBases();
- final String refName = ref.getName();
- final byte[] gc = gcByRef.get(refName);
+ if(referenceIndex != rec.getReferenceIndex() || gc == null){
+ final ReferenceSequence ref = args.getRef();
+ refBases = ref.getBases();
+ StringUtil.toUpperCase(refBases);
+ final int refLength = refBases.length;
+ final int lastWindowStart = refLength - scanWindowSize;
+ gc = GcBiasUtils.calculateAllGcs(refBases, lastWindowStart, scanWindowSize);
+ referenceIndex=rec.getReferenceIndex();
+ }
+
final String group;
if (this.readGroup != null) {
type = this.readGroup;
@@ -159,7 +191,7 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
final long[] errorsByGc = gcCur.errorsByGc;
final long[] basesByGc = gcCur.basesByGc;
final int totalClusters = gcCur.totalClusters;
- final int totalAlignedReads = gcCur.totalAlignedReads;
+ final long totalAlignedReads = gcCur.totalAlignedReads;
final String group = gcCur.group;
final GcBiasMetrics metrics = new GcBiasMetrics();
@@ -174,22 +206,21 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
detail.GC = i;
detail.WINDOWS = windowsByGc[i];
detail.READ_STARTS = readsByGc[i];
- if (errorsByGc[i] > 0) detail.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
+ if (errorsByGc[i] > 0) {
+ detail.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
+ }
if (windowsByGc[i] != 0) {
detail.NORMALIZED_COVERAGE = (detail.READ_STARTS / (double) detail.WINDOWS) / meanReadsPerWindow;
detail.ERROR_BAR_WIDTH = (Math.sqrt(detail.READ_STARTS) / (double) detail.WINDOWS) / meanReadsPerWindow;
- }
- else{
+ } else {
detail.NORMALIZED_COVERAGE = 0;
detail.ERROR_BAR_WIDTH = 0;
}
detail.ACCUMULATION_LEVEL = group;
- if (group.equals("Read Group")) {
- detail.READ_GROUP = gcType;}
- else if (group.equals("Sample")) {
- detail.SAMPLE = gcType;}
- else if (group.equals("Library")) {
- detail.LIBRARY = gcType;}
+ if (group.equals("Read Group")) {detail.READ_GROUP = gcType;}
+ else if (group.equals("Sample")) {detail.SAMPLE = gcType;}
+ else if (group.equals("Library")) {detail.LIBRARY = gcType;}
+
metrics.DETAILS.addMetric(detail);
}
@@ -200,7 +231,7 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
else if (group.equals("Library")) {summary.LIBRARY = gcType;}
summary.ACCUMULATION_LEVEL = group;
- summary.WINDOW_SIZE = windowSize;
+ summary.WINDOW_SIZE = scanWindowSize;
summary.TOTAL_CLUSTERS = totalClusters;
summary.ALIGNED_READS = totalAlignedReads;
@@ -249,12 +280,12 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
/////////////////////////////////////////////////////////////////////////////
//Keeps track of each level of GcCalculation
/////////////////////////////////////////////////////////////////////////////
- class GcObject{
+ class GcObject {
int totalClusters = 0;
- int totalAlignedReads = 0;
- int[] readsByGc = new int[WINDOWS];
- long[] basesByGc = new long[WINDOWS];
- long[] errorsByGc = new long[WINDOWS];
+ long totalAlignedReads = 0;
+ int[] readsByGc = new int[BINS];
+ long[] basesByGc = new long[BINS];
+ long[] errorsByGc = new long[BINS];
String group = null;
}
@@ -264,7 +295,7 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
/////////////////////////////////////////////////////////////////////////////
private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) {
if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters;
- final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - windowSize : rec.getAlignmentStart();
+ final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart();
++gcObj.totalAlignedReads;
if (pos > 0) {
final int windowGc = gc[pos];
diff --git a/src/java/picard/analysis/GcBiasSummaryMetrics.java b/src/java/picard/analysis/GcBiasSummaryMetrics.java
index 6841a67..f2ba5c7 100644
--- a/src/java/picard/analysis/GcBiasSummaryMetrics.java
+++ b/src/java/picard/analysis/GcBiasSummaryMetrics.java
@@ -41,7 +41,7 @@ public class GcBiasSummaryMetrics extends MultilevelMetrics {
public int TOTAL_CLUSTERS;
/** The total number of aligned reads used to compute the gc bias metrics. */
- public int ALIGNED_READS;
+ public long ALIGNED_READS;
/**
* Illumina-style AT dropout metric. Calculated by taking each GC bin independently and calculating
diff --git a/src/java/picard/analysis/GcBiasUtils.java b/src/java/picard/analysis/GcBiasUtils.java
new file mode 100644
index 0000000..00c3622
--- /dev/null
+++ b/src/java/picard/analysis/GcBiasUtils.java
@@ -0,0 +1,122 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.analysis;
+
+import htsjdk.samtools.reference.ReferenceSequence;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
+import htsjdk.samtools.util.SequenceUtil;
+import htsjdk.samtools.util.StringUtil;
+import java.io.File;
+
+/** Utilities to calculate GC Bias
+ * Created by kbergin on 9/23/15.
+ */
+public class GcBiasUtils {
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Calculates GC as a number from 0 to 100 in the specified window.
+ // If the window includes more than five no-calls then -1 is returned.
+ /////////////////////////////////////////////////////////////////////////////
+ public static int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
+ if (state.init) {
+ state.init = false;
+ state.gcCount = 0;
+ state.nCount = 0;
+ for (int i = startIndex; i < endIndex; ++i) {
+ final byte base = bases[i];
+ if (SequenceUtil.basesEqual(base, (byte)'G') || SequenceUtil.basesEqual(base, (byte)'C')) ++state.gcCount;
+ else if (SequenceUtil.basesEqual(base, (byte)'N')) ++state.nCount;
+ }
+ } else {
+ final byte newBase = bases[endIndex - 1];
+ if (SequenceUtil.basesEqual(newBase, (byte)'G') || SequenceUtil.basesEqual(newBase, (byte)'C')) ++state.gcCount;
+ else if (newBase == 'N') ++state.nCount;
+
+ if (SequenceUtil.basesEqual(state.priorBase, (byte)'G') || SequenceUtil.basesEqual(state.priorBase, (byte)'C')) --state.gcCount;
+ else if (SequenceUtil.basesEqual(state.priorBase, (byte)'N')) --state.nCount;
+ }
+ state.priorBase = bases[startIndex];
+ if (state.nCount > 4) return -1;
+ else return (state.gcCount * 100) / (endIndex - startIndex);
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Calculate number of 100bp windows in the refBases passed in that fall into
+ // each gc content bin (0-100% gc)
+ /////////////////////////////////////////////////////////////////////////////
+ public static int[] calculateRefWindowsByGc(final int windows, final File referenceSequence, final int windowSize) {
+ final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequence);
+ ReferenceSequence ref;
+
+ final int [] windowsByGc = new int [windows];
+
+ while ((ref = refFile.nextSequence()) != null) {
+ final byte[] refBases = ref.getBases();
+ StringUtil.toUpperCase(refBases);
+ final int refLength = refBases.length;
+ final int lastWindowStart = refLength - windowSize;
+
+ final CalculateGcState state = new GcBiasUtils().new CalculateGcState();
+
+ for (int i = 1; i < lastWindowStart; ++i) {
+ final int windowEnd = i + windowSize;
+ final int gcBin = calculateGc(refBases, i, windowEnd, state);
+ if (gcBin != -1) windowsByGc[gcBin]++;
+ }
+ }
+
+ return windowsByGc;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Calculate all the GC values for all windows
+ /////////////////////////////////////////////////////////////////////////////
+ public static byte [] calculateAllGcs(final byte[] refBases, final int lastWindowStart, final int windowSize) {
+
+ final CalculateGcState state = new GcBiasUtils().new CalculateGcState();
+
+ final int refLength = refBases.length;
+ final byte[] gc = new byte[refLength + 1];
+
+ for (int i = 1; i < lastWindowStart; ++i) {
+ final int windowEnd = i + windowSize;
+ final int windowGc = calculateGc(refBases, i, windowEnd, state);
+ gc[i] = (byte) windowGc;
+ }
+
+ return gc;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Keeps track of current GC calculation state
+ /////////////////////////////////////////////////////////////////////////////
+ class CalculateGcState {
+ boolean init = true;
+ int nCount;
+ int gcCount;
+ byte priorBase;
+ }
+}
diff --git a/src/java/picard/analysis/directed/TargetMetricsCollector.java b/src/java/picard/analysis/directed/TargetMetricsCollector.java
index f08b48c..4828191 100644
--- a/src/java/picard/analysis/directed/TargetMetricsCollector.java
+++ b/src/java/picard/analysis/directed/TargetMetricsCollector.java
@@ -421,7 +421,7 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
/** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */
private void calculateTargetCoverageMetrics() {
- final short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array
+ final int[] depths = new int[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array
int zeroCoverageTargets = 0;
int depthIndex = 0;
double totalCoverage = 0;
@@ -433,10 +433,10 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
continue;
}
- final short[] targetDepths = c.getDepths();
+ final int[] targetDepths = c.getDepths();
basesConsidered += targetDepths.length;
- for (final short depth : targetDepths) {
+ for (final int depth : targetDepths) {
depths[depthIndex++] = depth;
totalCoverage += depth;
}
@@ -449,7 +449,13 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
Arrays.sort(depths);
// Note. basesConsidered can be between 0 and depths.length inclusive. indexOf80thPercentile will be -1 in the latter case
final int indexOf80thPercentile = Math.max((depths.length - 1 - basesConsidered) + (int) (basesConsidered * 0.2), 0);
- final int coverageAt80thPercentile = depths[indexOf80thPercentile];
+ final int coverageAt80thPercentile;
+ if(depths.length > 0) {
+ coverageAt80thPercentile = depths[indexOf80thPercentile];
+ }
+ else {
+ throw new PicardException("Interval list only contains one zero-length interval.");
+ }
this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile;
this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) allTargets.getIntervals().size();
@@ -464,7 +470,7 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
int targetBases100x = 0;
for (final Coverage c : this.coverageByTarget.values()) {
- for (final short depth : c.getDepths()) {
+ for (final int depth : c.getDepths()) {
++totalTargetBases;
if (depth >= 2) {
@@ -526,6 +532,11 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
final Interval interval = entry.getKey();
final Coverage cov = entry.getValue();
+ if (interval.length() <= 0) {
+ log.warn("interval of length zero found: " + interval + " skipped.");
+ continue;
+ }
+
final double gcDouble = this.intervalToGc.get(interval);
final int gc = (int) Math.round(gcDouble * 100);
@@ -589,19 +600,18 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
*/
public static class Coverage {
private final Interval interval;
- private final short[] depths;
+ private final int[] depths;
/** Constructs a new coverage object for the provided mapping with the desired padding either side. */
public Coverage(final Interval i, final int padding) {
this.interval = i;
- this.depths = new short[interval.length() + 2*padding];
+ this.depths = new int[interval.length() + 2*padding];
}
/** Adds a single point of depth at the desired offset into the coverage array. */
public void addBase(final int offset) {
if (offset >= 0 && offset < this.depths.length) {
- // Prevent overflow if depth is too great, while avoiding doubling memory requirement.
- if (this.depths[offset] < Short.MAX_VALUE) {
+ if (this.depths[offset] < Integer.MAX_VALUE) {
this.depths[offset] += 1;
}
}
@@ -609,15 +619,15 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
/** Returns true if any base in the range has coverage of > 1 */
public boolean hasCoverage() {
- for (final short s : depths) {
+ for (final int s : depths) {
if (s > 1) return true;
}
return false;
}
- /** Gets the coverage depths as an array of shorts. */
- public short[] getDepths() { return this.depths; }
+ /** Gets the coverage depths as an array of ints. */
+ public int[] getDepths() { return this.depths; }
public int getTotal() {
int total = 0;
diff --git a/src/java/picard/cmdline/ClassFinder.java b/src/java/picard/cmdline/ClassFinder.java
index 81bd2cb..2a7e610 100644
--- a/src/java/picard/cmdline/ClassFinder.java
+++ b/src/java/picard/cmdline/ClassFinder.java
@@ -30,7 +30,8 @@ import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
-import java.net.URLDecoder;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@@ -63,8 +64,8 @@ public class ClassFinder {
// but the jarPath is remembered so that the iteration over the classpath skips anything other than
// the jarPath.
jarPath = jarFile.getCanonicalPath();
- final URL[] urls = {new URL("file", "", jarPath)};
- loader = new URLClassLoader(urls, Thread.currentThread().getContextClassLoader());
+ final URL[] urls = {new File(jarPath).toURI().toURL()};
+ loader = new URLClassLoader(urls, Thread.currentThread().getContextClassLoader());
}
/** Convert a filename to a class name by removing '.class' and converting '/'s to '.'s. */
@@ -95,9 +96,14 @@ public class ClassFinder {
while (urls.hasMoreElements()) {
try {
String urlPath = urls.nextElement().getFile();
- urlPath = URLDecoder.decode(urlPath, "UTF-8");
- if ( urlPath.startsWith("file:") ) {
- urlPath = urlPath.substring(5);
+ // convert URL to URI
+ // http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4466485
+ // using URLDecode does not work if urlPath has a '+' character
+ try {
+ URI uri = new URI(urlPath);
+ urlPath = uri.getPath();
+ } catch (URISyntaxException e) {
+ log.warn("Cannot convert to URI the " + urlPath + " URL");
}
if (urlPath.indexOf('!') > 0) {
urlPath = urlPath.substring(0, urlPath.indexOf('!'));
@@ -180,4 +186,4 @@ public class ClassFinder {
public Set<Class<?>> getClasses() {
return this.classes;
}
-}
\ No newline at end of file
+}
diff --git a/src/java/picard/cmdline/CommandLineProgram.java b/src/java/picard/cmdline/CommandLineProgram.java
index 4dad5a0..231e4d9 100644
--- a/src/java/picard/cmdline/CommandLineProgram.java
+++ b/src/java/picard/cmdline/CommandLineProgram.java
@@ -37,6 +37,8 @@ import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.zip.DeflaterFactory;
+import htsjdk.variant.variantcontext.writer.Options;
+import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import java.io.File;
import java.lang.annotation.Annotation;
@@ -165,6 +167,7 @@ public abstract class CommandLineProgram {
}
SamReaderFactory.setDefaultValidationStringency(VALIDATION_STRINGENCY);
BlockCompressedOutputStream.setDefaultCompressionLevel(COMPRESSION_LEVEL);
+ if (VALIDATION_STRINGENCY != ValidationStringency.STRICT) VariantContextWriterBuilder.setDefaultOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if (MAX_RECORDS_IN_RAM != null) {
SAMFileWriterImpl.setDefaultMaxRecordsInRam(MAX_RECORDS_IN_RAM);
diff --git a/src/java/picard/illumina/IlluminaBasecallsConverter.java b/src/java/picard/illumina/IlluminaBasecallsConverter.java
index 01103ab..d56f0fe 100644
--- a/src/java/picard/illumina/IlluminaBasecallsConverter.java
+++ b/src/java/picard/illumina/IlluminaBasecallsConverter.java
@@ -133,6 +133,8 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
private final TimerTask gcTimerTask;
private List<Integer> tiles;
private final boolean includeNonPfReads;
+ private final boolean ignoreUnexpectedBarcodes;
+
private final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype;
// Annoying that we need this.
private final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass;
@@ -155,6 +157,8 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
* @param codecPrototype For spilling output records to disk.
* @param outputRecordClass Inconveniently needed to create SortingCollections.
* @param includeNonPfReads If true, will include ALL reads (including those which do not have PF set)
+ * @param ignoreUnexpectedBarcodes If true, will ignore reads whose called barcode is not found in barcodeRecordWriterMap,
+ * otherwise will throw an exception
*/
public IlluminaBasecallsConverter(final File basecallsDir, final int lane, final ReadStructure readStructure,
final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
@@ -168,14 +172,15 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
final boolean applyEamssFiltering,
- final boolean includeNonPfReads
+ final boolean includeNonPfReads,
+ final boolean ignoreUnexpectedBarcodes
) {
this(basecallsDir, null, lane, readStructure,
barcodeRecordWriterMap, demultiplex, maxReadsInRamPerTile,
tmpDirs, numProcessors, forceGc, firstTile, tileLimit,
outputRecordComparator, codecPrototype, outputRecordClass,
bclQualityEvaluationStrategy, applyEamssFiltering,
- includeNonPfReads);
+ includeNonPfReads, ignoreUnexpectedBarcodes);
}
/**
@@ -197,6 +202,8 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
* @param codecPrototype For spilling output records to disk.
* @param outputRecordClass Inconveniently needed to create SortingCollections.
* @param includeNonPfReads If true, will include ALL reads (including those which do not have PF set)
+ * @param ignoreUnexpectedBarcodes If true, will ignore reads whose called barcode is not found in barcodeRecordWriterMap,
+ * otherwise will throw an exception
*/
public IlluminaBasecallsConverter(final File basecallsDir, File barcodesDir, final int lane,
final ReadStructure readStructure,
@@ -210,7 +217,8 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
- final boolean applyEamssFiltering, final boolean includeNonPfReads
+ final boolean applyEamssFiltering, final boolean includeNonPfReads,
+ final boolean ignoreUnexpectedBarcodes
) {
this.barcodeRecordWriterMap = barcodeRecordWriterMap;
this.demultiplex = demultiplex;
@@ -221,6 +229,7 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
this.outputRecordClass = outputRecordClass;
this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
this.includeNonPfReads = includeNonPfReads;
+ this.ignoreUnexpectedBarcodes = ignoreUnexpectedBarcodes;
// If we're forcing garbage collection, collect every 5 minutes in a daemon thread.
if (forceGc) {
@@ -426,8 +435,15 @@ public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
// Grab the existing collection, or initialize it if it doesn't yet exist
SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection = this.barcodeToRecordCollection.get(barcode);
if (recordCollection == null) {
- if (!barcodeRecordWriterMap.containsKey(barcode))
+ // TODO: The implementation here for supporting ignoreUnexpectedBarcodes is not efficient,
+ // but the alternative is an extensive rewrite. We are living with the inefficiency for
+ // this special case for the time being.
+ if (!barcodeRecordWriterMap.containsKey(barcode)) {
+ if (ignoreUnexpectedBarcodes) {
+ return;
+ }
throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected. (Is it referenced in the parameters file?)", barcode));
+ }
recordCollection = this.newSortingCollection();
this.barcodeToRecordCollection.put(barcode, recordCollection);
this.barcodeToProcessingState.put(barcode, null);
diff --git a/src/java/picard/illumina/IlluminaBasecallsToFastq.java b/src/java/picard/illumina/IlluminaBasecallsToFastq.java
index c058215..50a641e 100644
--- a/src/java/picard/illumina/IlluminaBasecallsToFastq.java
+++ b/src/java/picard/illumina/IlluminaBasecallsToFastq.java
@@ -153,6 +153,10 @@ public class IlluminaBasecallsToFastq extends CommandLineProgram {
@Option(doc="Whether to include non-PF reads", shortName="NONPF", optional=true)
public boolean INCLUDE_NON_PF_READS = true;
+ @Option(doc="Whether to ignore reads whose barcodes are not found in MULTIPLEX_PARAMS. Useful when outputting " +
+ "fastqs for only a subset of the barcodes in a lane.", shortName="INGORE_UNEXPECTED")
+ public boolean IGNORE_UNEXPECTED_BARCODES = false;
+
@Option(doc="The read name header formatting to emit. Casava1.8 formatting has additional information beyond Illumina, including: " +
"the passing-filter flag value for the read, the flowcell name, and the sequencer name.", optional = false)
public ReadNameFormat READ_NAME_FORMAT = ReadNameFormat.CASAVA_1_8;
@@ -239,7 +243,7 @@ public class IlluminaBasecallsToFastq extends CommandLineProgram {
FORCE_GC, FIRST_TILE, TILE_LIMIT, queryNameComparator,
new FastqRecordsForClusterCodec(readStructure.templates.length(),
readStructure.barcodes.length()), FastqRecordsForCluster.class, bclQualityEvaluationStrategy,
- this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS);
+ this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS, IGNORE_UNEXPECTED_BARCODES);
log.info("READ STRUCTURE IS " + readStructure.toString());
diff --git a/src/java/picard/illumina/IlluminaBasecallsToSam.java b/src/java/picard/illumina/IlluminaBasecallsToSam.java
index 94e56d0..839a9d9 100644
--- a/src/java/picard/illumina/IlluminaBasecallsToSam.java
+++ b/src/java/picard/illumina/IlluminaBasecallsToSam.java
@@ -201,6 +201,10 @@ public class IlluminaBasecallsToSam extends CommandLineProgram {
@Option(doc="Whether to include non-PF reads", shortName="NONPF", optional=true)
public boolean INCLUDE_NON_PF_READS = true;
+ @Option(doc="Whether to ignore reads whose barcodes are not found in LIBRARY_PARAMS. Useful when outputting " +
+ "BAMs for only a subset of the barcodes in a lane.", shortName="INGORE_UNEXPECTED")
+ public boolean IGNORE_UNEXPECTED_BARCODES = false;
+
private final Map<String, SAMFileWriterWrapper> barcodeSamWriterMap = new HashMap<String, SAMFileWriterWrapper>();
private ReadStructure readStructure;
IlluminaBasecallsConverter<SAMRecordsForCluster> basecallsConverter;
@@ -241,7 +245,7 @@ public class IlluminaBasecallsToSam extends CommandLineProgram {
basecallsConverter = new IlluminaBasecallsConverter<SAMRecordsForCluster>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
barcodeSamWriterMap, true, MAX_READS_IN_RAM_PER_TILE/numOutputRecords, TMP_DIR, NUM_PROCESSORS, FORCE_GC,
FIRST_TILE, TILE_LIMIT, new QueryNameComparator(), new Codec(numOutputRecords), SAMRecordsForCluster.class,
- bclQualityEvaluationStrategy, this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS);
+ bclQualityEvaluationStrategy, this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS, IGNORE_UNEXPECTED_BARCODES);
log.info("DONE_READING STRUCTURE IS " + readStructure.toString());
diff --git a/src/java/picard/pedigree/PedFile.java b/src/java/picard/pedigree/PedFile.java
index fae0464..7ebcf28 100644
--- a/src/java/picard/pedigree/PedFile.java
+++ b/src/java/picard/pedigree/PedFile.java
@@ -145,23 +145,19 @@ public class PedFile extends TreeMap<String, PedFile.PedTrio> {
public Number getPhenotype() { return phenotype; }
}
- /** Function that accepts a map from sample-name to its sex and creates a PEDFile
- * documenting the sexes. Note that the parents are created as UNKNOWNS in this implementation
- * as the purpose is only to create a PED file for the sex of the samples, not the whole pedigree
+ /** Function that accepts a map from sample-name to its sex and creates a PEDFile documenting the sexes.
* @param sampleSexes a map from sample-name to its sex
* @return a PedFile object that contains data.
*/
static public PedFile fromSexMap(final Map<String, Sex> sampleSexes) {
final PedFile pedfile = new PedFile(true);
- int parentCounter = 1;
for (final Map.Entry<String, Sex> sampleSex : sampleSexes.entrySet()) {
final PedFile.PedTrio ped = pedfile.new PedTrio(
sampleSex.getKey(), sampleSex.getKey(),
- "UNKNOWN" + (parentCounter),
- "UNKNOWN" + (parentCounter + 1),
+ "." ,
+ "." ,
sampleSex.getValue(), PedFile.NO_PHENO);
- parentCounter += 2;
pedfile.add(ped);
}
diff --git a/src/java/picard/sam/AbstractAlignmentMerger.java b/src/java/picard/sam/AbstractAlignmentMerger.java
index 26614b8..137d463 100644
--- a/src/java/picard/sam/AbstractAlignmentMerger.java
+++ b/src/java/picard/sam/AbstractAlignmentMerger.java
@@ -45,6 +45,7 @@ import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringIterator;
+import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CigarUtil;
@@ -113,6 +114,7 @@ public abstract class AbstractAlignmentMerger {
private final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy;
private boolean keepAlignerProperPairFlags = false;
private boolean addMateCigar = false;
+ private boolean unmapContaminantReads = false;
private final SamRecordFilter alignmentFilter = new SamRecordFilter() {
public boolean filterOut(final SAMRecord record) {
@@ -123,6 +125,7 @@ public abstract class AbstractAlignmentMerger {
throw new UnsupportedOperationException("Paired SamRecordFilter not implemented!");
}
};
+
private boolean includeSecondaryAlignments = true;
/** Class that allows a Sorting Collection and a SAMFileWriter to be treated identically. */
@@ -160,6 +163,8 @@ public abstract class AbstractAlignmentMerger {
protected boolean ignoreAlignment(final SAMRecord sam) { return false; } // default implementation
+ protected boolean isContaminant(final HitsForInsert hits) { return false; } // default implementation
+
/**
* Constructor
*
@@ -189,6 +194,8 @@ public abstract class AbstractAlignmentMerger {
* @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
* alignments but none primary, for a read or read pair.
* @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
+ * @param unmapContaminantReads If true, identify reads having the signature of contamination from a foreign organism (i.e. mostly clipped bases),
+ * and mark them as unmapped.
*/
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
final File referenceFasta, final boolean clipAdapters,
@@ -199,7 +206,8 @@ public abstract class AbstractAlignmentMerger {
final List<SamPairUtil.PairOrientation> expectedOrientations,
final SAMFileHeader.SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
- final boolean addMateCigar) {
+ final boolean addMateCigar,
+ final boolean unmapContaminantReads) {
IOUtil.assertFileIsReadable(unmappedBamFile);
IOUtil.assertFileIsWritable(targetBamFile);
IOUtil.assertFileIsReadable(referenceFasta);
@@ -248,6 +256,7 @@ public abstract class AbstractAlignmentMerger {
this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;
this.addMateCigar = addMateCigar;
+ this.unmapContaminantReads = unmapContaminantReads;
}
/** Allows the caller to override the maximum records in RAM. */
@@ -269,7 +278,6 @@ public abstract class AbstractAlignmentMerger {
}
/**
- * /**
* Merges the alignment data with the non-aligned records from the source BAM file.
*/
public void mergeAlignment(final File referenceFasta) {
@@ -349,6 +357,10 @@ public abstract class AbstractAlignmentMerger {
final boolean clone = nextAligned.numHits() > 1 || nextAligned.hasSupplementalHits();
SAMRecord r1Primary = null, r2Primary = null;
+ // by this point there should be a single chosen primary alignment, which we will use to determine whether the read is contaminant.
+ // this must be done before the main iteration, since secondary / supplementary alignments will be affected by the primary.
+ final boolean unmapDueToContaminant = this.unmapContaminantReads && isContaminant(nextAligned);
+
if (rec.getReadPairedFlag()) {
for (int i = 0; i < nextAligned.numHits(); ++i) {
// firstAligned or secondAligned may be null, if there wasn't an alignment for the end,
@@ -375,9 +387,10 @@ public abstract class AbstractAlignmentMerger {
r2Primary = secondToWrite;
}
- transferAlignmentInfoToPairedRead(firstToWrite, secondToWrite, firstAligned, secondAligned);
+ transferAlignmentInfoToPairedRead(firstToWrite, secondToWrite, firstAligned, secondAligned, unmapDueToContaminant);
// Only write unmapped read when it has the mate info from the primary alignment.
+ // this avoids the scenario of having multiple unmapped reads with the same name & pair flags
if (!firstToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
addIfNotFiltered(sink, firstToWrite);
if (firstToWrite.getReadUnmappedFlag()) ++unmapped;
@@ -398,26 +411,35 @@ public abstract class AbstractAlignmentMerger {
for (final SAMRecord supp : supplementals) {
final SAMRecord out = clone(sourceRec);
- transferAlignmentInfoToFragment(out, supp);
+ transferAlignmentInfoToFragment(out, supp, unmapDueToContaminant);
if (matePrimary != null) SamPairUtil.setMateInformationOnSupplementalAlignment(out, matePrimary, addMateCigar);
- ++aligned;
- addIfNotFiltered(sink, out);
+ // don't write supplementary reads that were unmapped by transferAlignmentInfoToFragment
+ if (!out.getReadUnmappedFlag()) {
+ addIfNotFiltered(sink, out);
+ ++aligned;
+ } else ++unmapped;
}
}
} else {
for (int i = 0; i < nextAligned.numHits(); ++i) {
final SAMRecord recToWrite = clone ? clone(rec) : rec;
- transferAlignmentInfoToFragment(recToWrite, nextAligned.getFragment(i));
- addIfNotFiltered(sink, recToWrite);
+ final boolean isPrimary = !nextAligned.getFragment(i).isSecondaryOrSupplementary();
+ transferAlignmentInfoToFragment(recToWrite, nextAligned.getFragment(i), unmapDueToContaminant);
+ // Only write unmapped read if it was originally the primary.
+ // this avoids the scenario of having multiple unmapped reads with the same name & pair flags
+ if (!recToWrite.getReadUnmappedFlag() || isPrimary) addIfNotFiltered(sink, recToWrite);
if (recToWrite.getReadUnmappedFlag()) ++unmapped;
else ++aligned;
}
// Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
for (final SAMRecord supplementalRec : nextAligned.getSupplementalFirstOfPairOrFragment()) {
final SAMRecord recToWrite = clone(rec);
- transferAlignmentInfoToFragment(recToWrite, supplementalRec);
- addIfNotFiltered(sink, recToWrite);
- ++aligned;
+ transferAlignmentInfoToFragment(recToWrite, supplementalRec, unmapDueToContaminant);
+ // don't write supplementary reads that were unmapped by transferAlignmentInfoToFragment
+ if (!recToWrite.getReadUnmappedFlag()) {
+ addIfNotFiltered(sink, recToWrite);
+ ++aligned;
+ } else ++unmapped;
}
}
nextAligned = nextAligned();
@@ -505,18 +527,29 @@ public abstract class AbstractAlignmentMerger {
/**
* Copies alignment info from aligned to unaligned read, clips as appropriate, and sets PG ID.
+ * May also un-map the resulting read if the alignment is bad (e.g. no unclipped bases).
*
* @param unaligned Original SAMRecord, and object into which values are copied.
* @param aligned Holds alignment info that will be copied into unaligned.
+ * @param isContaminant Should this read be unmapped due to contamination?
*/
- private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SAMRecord aligned) {
+ private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SAMRecord aligned, final boolean isContaminant) {
setValuesFromAlignment(unaligned, aligned);
updateCigarForTrimmedOrClippedBases(unaligned, aligned);
if (SAMUtils.cigarMapsNoBasesToRef(unaligned.getCigar())) {
+ log.warn("Record contains no unclipped bases; making unmapped: " + aligned);
SAMUtils.makeReadUnmapped(unaligned);
} else if (SAMUtils.recordMapsEntirelyBeyondEndOfReference(aligned)) {
log.warn("Record mapped off end of reference; making unmapped: " + aligned);
SAMUtils.makeReadUnmapped(unaligned);
+ } else if (isContaminant) {
+ log.warn("Record looks like foreign contamination; making unmapped: " + aligned);
+ // NB: for reads that look like contamination, just set unmapped flag and zero MQ but keep other flags as-is.
+ // this maintains the sort order so that downstream analyses can use them for calculating evidence
+ // of contamination vs other causes (e.g. structural variants)
+ unaligned.setReadUnmappedFlag(true);
+ unaligned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
+ unaligned.setAttribute(SAMTag.FT.name(), "Cross-species contamination");
}
}
@@ -527,10 +560,12 @@ public abstract class AbstractAlignmentMerger {
* @param secondUnaligned Original second of pair, into which alignment and pair info will be written.
* @param firstAligned Aligned first of pair, or null if no alignment.
* @param secondAligned Aligned second of pair, or null if no alignment.
+ * @param isContaminant Should this pair be unmapped due to contamination?
*/
- private void transferAlignmentInfoToPairedRead(final SAMRecord firstUnaligned, final SAMRecord secondUnaligned, final SAMRecord firstAligned, final SAMRecord secondAligned) {
- if (firstAligned != null) transferAlignmentInfoToFragment(firstUnaligned, firstAligned);
- if (secondAligned != null) transferAlignmentInfoToFragment(secondUnaligned, secondAligned);
+ private void transferAlignmentInfoToPairedRead(final SAMRecord firstUnaligned, final SAMRecord secondUnaligned,
+ final SAMRecord firstAligned, final SAMRecord secondAligned, final boolean isContaminant) {
+ if (firstAligned != null) transferAlignmentInfoToFragment(firstUnaligned, firstAligned, isContaminant);
+ if (secondAligned != null) transferAlignmentInfoToFragment(secondUnaligned, secondAligned, isContaminant);
if (isClipOverlappingReads()) clipForOverlappingReads(firstUnaligned, secondUnaligned);
SamPairUtil.setMateInfo(secondUnaligned, firstUnaligned, header, addMateCigar);
if (!keepAlignerProperPairFlags) {
@@ -538,7 +573,6 @@ public abstract class AbstractAlignmentMerger {
}
}
-
/**
* Checks to see whether the ends of the reads overlap and soft clips reads
* them if necessary.
@@ -694,7 +728,6 @@ public abstract class AbstractAlignmentMerger {
}
}
-
protected SAMSequenceDictionary getSequenceDictionary() { return this.sequenceDictionary; }
protected SAMProgramRecord getProgramRecord() { return this.programRecord; }
diff --git a/src/java/picard/sam/DownsampleSam.java b/src/java/picard/sam/DownsampleSam.java
index 6675c0a..4be3f7b 100644
--- a/src/java/picard/sam/DownsampleSam.java
+++ b/src/java/picard/sam/DownsampleSam.java
@@ -1,5 +1,30 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
package picard.sam;
+import htsjdk.samtools.DownsamplingIteratorFactory;
+import htsjdk.samtools.DownsamplingIteratorFactory.Strategy;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
@@ -17,17 +42,27 @@ import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
import java.util.Random;
/**
- * Class to randomly downsample a BAM file while respecting that we should either get rid
- * of both ends of a pair or neither end of the pair!
+ * Class to randomly downsample a BAM file while respecting that we should either retain or discard
+ * all of the reads for a template - i.e. all reads with the same name, whether first or second of
+ * pair, secondary or supplementary, all travel together.
+ *
+ * @author Tim Fennell
*/
@CommandLineProgramProperties(
- usage = "Randomly down-sample a SAM or BAM file to retain " +
- "a random subset of the reads. Mate-pairs are either both kept or both discarded. Reads marked as not primary " +
- "alignments are all discarded. Each read is given a probability P of being retained - results with the exact " +
- "same input in the same order and with the same value for RANDOM_SEED will produce the same results.",
+ usage = "Randomly down-sample a SAM or BAM file to retain only a subset of the reads in the file. " +
+ "All reads for a templates are kept or discarded as a unit, with the goal of retaining reads" +
+ "from PROBABILITY * input templates. While this will usually result in approximately " +
+ "PROBABILITY * input reads being retained also, for very small PROBABILITIES this may not " +
+ "be the case.\n" +
+ "A number of different downsampling strategies are supported using the STRATEGY option:\n\n" +
+ "ConstantMemory: " + DownsamplingIteratorFactory.CONSTANT_MEMORY_DESCRPTION + "\n\n" +
+ "HighAccuracy: " + DownsamplingIteratorFactory.HIGH_ACCURACY_DESCRIPTION + "\n\n" +
+ "Chained: " + DownsamplingIteratorFactory.CHAINED_DESCRIPTION + "\n",
usageShort = "Down-sample a SAM or BAM file to retain a random subset of the reads",
programGroup = SamOrBam.class
)
@@ -39,13 +74,21 @@ public class DownsampleSam extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.")
public File OUTPUT;
+ @Option(shortName="S", doc="The downsampling strategy to use. See usage for discussion.")
+ public Strategy STRATEGY = Strategy.ConstantMemory;
+
@Option(shortName = "R", doc = "Random seed to use if reproducibilty is desired. " +
"Setting to null will cause multiple invocations to produce different results.")
- public Long RANDOM_SEED = 1L;
+ public Integer RANDOM_SEED = 1;
@Option(shortName = "P", doc = "The probability of keeping any individual read, between 0 and 1.")
public double PROBABILITY = 1;
+ @Option(shortName = "A", doc = "The accuracy that the downsampler should try to achieve if the selected strategy supports it. " +
+ "Note that accuracy is never guaranteed, but some strategies will attempt to provide accuracy within the requested bounds." +
+ "Higher accuracy will generally require more memory.")
+ public double ACCURACY = 0.0001;
+
private final Log log = Log.getInstance(DownsampleSam.class);
public static void main(final String[] args) {
@@ -57,20 +100,28 @@ public class DownsampleSam extends CommandLineProgram {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
+ // Warn the user if they are running with P=1; 0 <= P <= 1 is checked by the DownsamplingIteratorFactory
+ if (PROBABILITY == 1) {
+ log.warn("Running DownsampleSam with PROBABILITY=1! This will likely just recreate the input file.");
+ }
+
final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Wrote");
- final DownsamplingIterator iterator = new DownsamplingIterator(in.iterator(), r, PROBABILITY);
+ final DownsamplingIterator iterator = DownsamplingIteratorFactory.make(INPUT, STRATEGY, PROBABILITY, ACCURACY, RANDOM_SEED);
- for (final SAMRecord rec : iterator) {
+ while (iterator.hasNext()) {
+ final SAMRecord rec = iterator.next();
out.addAlignment(rec);
progress.record(rec);
}
out.close();
CloserUtil.close(in);
- log.info("Finished! Kept " + iterator.getKeptReads() + " out of " + iterator.getTotalReads() + " reads.");
+ final NumberFormat fmt = new DecimalFormat("0.00%");
+ log.info("Finished downsampling.");
+ log.info("Kept ", iterator.getAcceptedCount(), " out of ", iterator.getSeenCount(), " reads (", fmt.format(iterator.getAcceptedFraction()), ").");
return 0;
}
diff --git a/src/java/picard/sam/DuplicationMetrics.java b/src/java/picard/sam/DuplicationMetrics.java
index 2523c12..441d983 100644
--- a/src/java/picard/sam/DuplicationMetrics.java
+++ b/src/java/picard/sam/DuplicationMetrics.java
@@ -78,7 +78,7 @@ public class DuplicationMetrics extends MetricBase {
/**
* Estimates the size of a library based on the number of paired end molecules observed
- * and the number of unique pairs ovserved.
+ * and the number of unique pairs observed.
*
* Based on the Lander-Waterman equation that states:
* C/X = 1 - exp( -N/X )
diff --git a/src/java/picard/sam/HitsForInsert.java b/src/java/picard/sam/HitsForInsert.java
index 5ee312c..1158d8a 100644
--- a/src/java/picard/sam/HitsForInsert.java
+++ b/src/java/picard/sam/HitsForInsert.java
@@ -54,7 +54,6 @@ class HitsForInsert {
NONE, ONE, MORE_THAN_ONE
}
-
// These are package-visible to make life easier for the PrimaryAlignmentSelectionStrategies.
final List<SAMRecord> firstOfPairOrFragment = new ArrayList<SAMRecord>();
final List<SAMRecord> secondOfPair = new ArrayList<SAMRecord>();
@@ -147,8 +146,25 @@ class HitsForInsert {
}
/**
- * Set all alignments to not primary, except for the one specified by the argument. If paired, and set the
- * alignment for both ends if there is an alignment for both ends, otherwise just for the end for which
+ * Get the index of the first primary we see in the list of hits (either read1 or read2).
+ * NOTE: if the PrimaryAlignmentSelectionStrategy has not been run, the returned value may not represent the ONLY primary.
+ *
+ * @return the index, or -1 if no primary was found.
+ */
+ public int getIndexOfEarliestPrimary() {
+ for (int i = 0; i < numHits(); i++) {
+ final SAMRecord firstAligned = getFirstOfPair(i);
+ final SAMRecord secondAligned = getSecondOfPair(i);
+ final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) ||
+ (secondAligned != null && !secondAligned.isSecondaryOrSupplementary());
+ if (isPrimaryAlignment) return i;
+ }
+ return -1;
+ }
+
+ /**
+ * Used by PrimaryAlignmentSelectionStrategy to set all alignments to not primary, except for the one specified by the argument.
+     * If paired, set the alignment for both ends if there is an alignment for both ends; otherwise just for the end for which
* there is an alignment at the given index.
* @param primaryAlignmentIndex
*/
@@ -167,7 +183,6 @@ class HitsForInsert {
this.getSecondOfPair(i).setNotPrimaryAlignmentFlag(notPrimary);
}
}
-
}
/**
@@ -220,8 +235,6 @@ class HitsForInsert {
}
}
-
-
/**
* Determine if there is a single primary alignment in a list of alignments.
* @param records
@@ -244,19 +257,6 @@ class HitsForInsert {
else return tallyPrimaryAlignments(secondOfPair);
}
- int findPrimaryAlignment(final List<SAMRecord> records) {
- int indexOfPrimaryAlignment = -1;
- for (int i = 0; i < records.size(); ++i) {
- if (records.get(i) != null && !records.get(i).isSecondaryOrSupplementary()) {
- if (indexOfPrimaryAlignment != -1) {
- throw new IllegalStateException("Multiple primary alignments found for read " + getReadName());
- }
- indexOfPrimaryAlignment = i;
- }
- }
- return indexOfPrimaryAlignment;
- }
-
// null HI tag sorts after any non-null.
private static class HitIndexComparator implements Comparator<SAMRecord> {
public int compare(final SAMRecord rec1, final SAMRecord rec2) {
diff --git a/src/java/picard/sam/MergeBamAlignment.java b/src/java/picard/sam/MergeBamAlignment.java
index 38e74a8..d72f375 100644
--- a/src/java/picard/sam/MergeBamAlignment.java
+++ b/src/java/picard/sam/MergeBamAlignment.java
@@ -187,6 +187,13 @@ public class MergeBamAlignment extends CommandLineProgram {
@Option(shortName = "MC", optional = true, doc = "Adds the mate CIGAR tag (MC) if true, does not if false.")
public Boolean ADD_MATE_CIGAR = true;
+ @Option(shortName = "UNMAP_CONTAM", optional = true, doc = "Detect reads originating from foreign organisms (e.g. bacterial DNA in a non-bacterial sample)," +
+ "and unmap + label those reads accordingly.")
+ public boolean UNMAP_CONTAMINANT_READS = false;
+
+ @Option(doc = "If UNMAP_CONTAMINANT_READS is set, require this many unclipped bases or else the read will be marked as contaminant.")
+ public int MIN_UNCLIPPED_BASES = 32;
+
private static final Log log = Log.getInstance(MergeBamAlignment.class);
/**
@@ -240,7 +247,8 @@ public class MergeBamAlignment extends CommandLineProgram {
ALIGNED_READS_ONLY, ALIGNED_BAM, MAX_INSERTIONS_OR_DELETIONS,
ATTRIBUTES_TO_RETAIN, ATTRIBUTES_TO_REMOVE, READ1_TRIM, READ2_TRIM,
READ1_ALIGNED_BAM, READ2_ALIGNED_BAM, EXPECTED_ORIENTATIONS, SORT_ORDER,
- PRIMARY_ALIGNMENT_STRATEGY.newInstance(), ADD_MATE_CIGAR);
+ PRIMARY_ALIGNMENT_STRATEGY.newInstance(), ADD_MATE_CIGAR, UNMAP_CONTAMINANT_READS,
+ MIN_UNCLIPPED_BASES);
merger.setClipOverlappingReads(CLIP_OVERLAPPING_READS);
merger.setMaxRecordsInRam(MAX_RECORDS_IN_RAM);
merger.setKeepAlignerProperPairFlags(ALIGNER_PROPER_PAIR_FLAGS);
diff --git a/src/java/picard/sam/RevertSam.java b/src/java/picard/sam/RevertSam.java
index b1fc3d7..ab0d1be 100644
--- a/src/java/picard/sam/RevertSam.java
+++ b/src/java/picard/sam/RevertSam.java
@@ -103,8 +103,9 @@ public class RevertSam extends CommandLineProgram {
add(SAMTag.MQ.name());
add(SAMTag.SA.name()); // Supplementary alignment metadata
add(SAMTag.MC.name()); // Mate Cigar
+ add(SAMTag.AS.name());
}};
-
+
@Option(doc = "WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " +
"a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " +
"mates, duplicated records, records with mismatches in length of bases and qualities. This option can " +
@@ -213,12 +214,14 @@ public class RevertSam extends CommandLineProgram {
// Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary()) continue;
- // Actually to the reverting of the remaining records
+ // log the progress before you revert because otherwise the "last read position" might not be accurate
+ progress.record(rec);
+
+ // Actually do the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing) sorter.add(rec);
else out.addAlignment(rec);
- progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
diff --git a/src/java/picard/sam/SamAlignmentMerger.java b/src/java/picard/sam/SamAlignmentMerger.java
index b2936f9..f781a61 100644
--- a/src/java/picard/sam/SamAlignmentMerger.java
+++ b/src/java/picard/sam/SamAlignmentMerger.java
@@ -14,6 +14,7 @@ import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
+import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.DelegatingIterator;
@@ -43,7 +44,9 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
private final List<File> read1AlignedSamFile;
private final List<File> read2AlignedSamFile;
private final int maxGaps;
+ private final int minUnclippedBases;
private boolean forceSort = false;
+ private final OverclippedReadFilter contaminationFilter;
/**
* Constructor
@@ -81,6 +84,10 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
* @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair,
* in which none are primary, or more than one is marked primary
* @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
+ *
+ * @param unmapContaminantReads If true, identify reads having the signature of contamination from a foreign organism (i.e. mostly clipped bases),
+ * and mark them as unmapped.
+ * @param minUnclippedBases If unmapContaminantReads is set, require this many unclipped bases or else the read will be marked as contaminant.
*/
public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
@@ -92,11 +99,13 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
final List<SamPairUtil.PairOrientation> expectedOrientations,
final SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
- final boolean addMateCigar) {
+ final boolean addMateCigar,
+ final boolean unmapContaminantReads,
+ final int minUnclippedBases) {
super(unmappedBamFile, targetBamFile, referenceFasta, clipAdapters, bisulfiteSequence,
alignedReadsOnly, programRecord, attributesToRetain, attributesToRemove, read1BasesTrimmed,
- read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar);
+ read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar, unmapContaminantReads);
if ((alignedSamFile == null || alignedSamFile.size() == 0) &&
(read1AlignedSamFile == null || read1AlignedSamFile.size() == 0 ||
@@ -122,6 +131,8 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
this.read1AlignedSamFile = read1AlignedSamFile;
this.read2AlignedSamFile = read2AlignedSamFile;
this.maxGaps = maxGaps;
+ this.minUnclippedBases = minUnclippedBases;
+ this.contaminationFilter = new OverclippedReadFilter(minUnclippedBases, false);
log.info("Processing SAM file(s): " + alignedSamFile != null ? alignedSamFile : read1AlignedSamFile + "," + read2AlignedSamFile);
}
@@ -333,6 +344,27 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
return gaps > maxGaps;
}
+ /**
+ * Criteria for contaminant reads:
+ * 1. primary alignment has fewer than minUnclippedBases unclipped bases
+ * 2. primary alignment has both ends clipped
+ * 3. for pairs, at least one end of primary alignment meets above criteria
+ */
+ protected boolean isContaminant(final HitsForInsert hits) {
+ boolean isContaminant = false;
+ if (hits.numHits() > 0) {
+ final int primaryIndex = hits.getIndexOfEarliestPrimary();
+ if (primaryIndex < 0) throw new IllegalStateException("No primary alignment was found, despite having nonzero hits.");
+ final SAMRecord primaryRead1 = hits.getFirstOfPair(primaryIndex);
+ final SAMRecord primaryRead2 = hits.getSecondOfPair(primaryIndex);
+ if (primaryRead1 != null && primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead1, primaryRead2);
+ else if (primaryRead1 != null) isContaminant = contaminationFilter.filterOut(primaryRead1);
+ else if (primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead2);
+ else throw new IllegalStateException("Neither read1 or read2 exist for chosen primary alignment");
+ }
+ return isContaminant;
+ }
+
// Accessor for testing
public boolean getForceSort() {return this.forceSort; }
}
diff --git a/src/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
index ac53b40..077ac67 100644
--- a/src/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
+++ b/src/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
@@ -127,13 +127,21 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
"group size would be approximately 10 reads.")
public int MAX_GROUP_RATIO = 500;
+ @Option(doc = "Barcode SAM tag (ex. BC for 10X Genomics)", optional = true)
+ public String BARCODE_TAG = null;
+
+ @Option(doc = "Read one barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
+ public String READ_ONE_BARCODE_TAG = null;
+
+ @Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
+ public String READ_TWO_BARCODE_TAG = null;
+
private final Log log = Log.getInstance(EstimateLibraryComplexity.class);
/**
* Little class to hold the sequence of a pair of reads and tile location information.
*/
static class PairedReadSequence implements OpticalDuplicateFinder.PhysicalLocation {
- static int size_in_bytes = 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint
short readGroup = -1;
short tile = -1;
short x = -1, y = -1;
@@ -142,6 +150,10 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
byte[] read2;
short libraryId;
+ public static int getSizeInBytes() {
+ return 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint
+ }
+
public short getReadGroup() { return this.readGroup; }
public void setReadGroup(final short readGroup) { this.readGroup = readGroup; }
@@ -161,14 +173,44 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
public short getLibraryId() { return this.libraryId; }
public void setLibraryId(final short libraryId) { this.libraryId = libraryId; }
+
+ public static SortingCollection.Codec<PairedReadSequence> getCodec() {
+ return new PairedReadCodec();
+ }
+ }
+
+ static class PairedReadSequenceWithBarcodes extends PairedReadSequence {
+ int barcode; // primary barcode for this read (and pair)
+ int readOneBarcode; // read one barcode, 0 if not present
+ int readTwoBarcode; // read two barcode, 0 if not present or not paired
+
+ public PairedReadSequenceWithBarcodes() {
+ super();
+ }
+
+ public PairedReadSequenceWithBarcodes(final PairedReadSequence val) {
+ if (null == val) throw new PicardException("val was null");
+ this.readGroup = val.getReadGroup();
+ this.tile = val.getTile();
+ this.x = val.getX();
+ this.y = val.getY();
+ this.qualityOk = val.qualityOk;
+ this.read1 = val.read1.clone();
+ this.read2 = val.read2.clone();
+ this.libraryId = val.getLibraryId();
+ }
+
+ public static int getSizeInBytes() {
+ return PairedReadSequence.getSizeInBytes() + (3 * 4); // rough guess at memory footprint
+ }
}
/**
* Codec class for writing and read PairedReadSequence objects.
*/
static class PairedReadCodec implements SortingCollection.Codec<PairedReadSequence> {
- private DataOutputStream out;
- private DataInputStream in;
+ protected DataOutputStream out;
+ protected DataInputStream in;
public void setOutputStream(final OutputStream out) {
this.out = new DataOutputStream(out);
@@ -188,7 +230,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
this.out.write(val.read1);
this.out.writeInt(val.read2.length);
this.out.write(val.read2);
- } catch (IOException ioe) {
+ } catch (final IOException ioe) {
throw new PicardException("Error write out read pair.", ioe);
}
}
@@ -198,7 +240,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
final PairedReadSequence val = new PairedReadSequence();
try {
val.readGroup = this.in.readShort();
- } catch (EOFException eof) {
+ } catch (final EOFException eof) {
return null;
}
@@ -219,7 +261,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
}
return val;
- } catch (IOException ioe) {
+ } catch (final IOException ioe) {
throw new PicardException("Exception reading read pair.", ioe);
}
}
@@ -228,6 +270,49 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadCodec(); }
}
+
+ /**
+     * Codec class for writing and reading PairedReadSequence objects.
+ */
+ static class PairedReadWithBarcodesCodec extends PairedReadCodec {
+ @Override
+ public void encode(final PairedReadSequence val) {
+ if (!(val instanceof PairedReadSequenceWithBarcodes)) {
+ throw new PicardException("Val was not a PairedReadSequenceWithBarcodes");
+ }
+ final PairedReadSequenceWithBarcodes data = (PairedReadSequenceWithBarcodes) val;
+
+ super.encode(val);
+
+ try {
+ this.out.writeInt(data.barcode);
+ this.out.writeInt(data.readOneBarcode);
+ this.out.writeInt(data.readTwoBarcode);
+ } catch (final IOException ioe) {
+ throw new PicardException("Error write out read pair.", ioe);
+ }
+ }
+
+ @Override
+ public PairedReadSequence decode() {
+ try {
+ final PairedReadSequence parentVal = super.decode();
+ if (null == parentVal) return null; // EOF
+ final PairedReadSequenceWithBarcodes val = new PairedReadSequenceWithBarcodes(parentVal);
+ val.barcode = this.in.readInt();
+ val.readOneBarcode = this.in.readInt();
+ val.readTwoBarcode = this.in.readInt();
+
+ return val;
+ } catch (final IOException ioe) {
+ throw new PicardException("Exception reading read pair.", ioe);
+ }
+ }
+
+ @Override
+ public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadWithBarcodesCodec(); }
+ }
+
/**
* Comparator that orders read pairs on the first N bases of both reads.
*/
@@ -251,13 +336,38 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
}
}
+ public int getBarcodeValue(final SAMRecord record) {
+ return getReadBarcodeValue(record, BARCODE_TAG);
+ }
+
+ public static int getReadBarcodeValue(final SAMRecord record, final String tag) {
+ if (null == tag) return 0;
+ final String attr = record.getStringAttribute(tag);
+ if (null == attr) return 0;
+ else return attr.hashCode();
+ }
+
+ private int getReadOneBarcodeValue(final SAMRecord record) {
+ return getReadBarcodeValue(record, READ_ONE_BARCODE_TAG);
+ }
+
+ private int getReadTwoBarcodeValue(final SAMRecord record) {
+ return getReadBarcodeValue(record, READ_TWO_BARCODE_TAG);
+ }
+
/** Stock main method. */
public static void main(final String[] args) {
new EstimateLibraryComplexity().instanceMainWithExit(args);
}
public EstimateLibraryComplexity() {
- MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / PairedReadSequence.size_in_bytes) / 2;
+ final int sizeInBytes;
+ if (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG) {
+ sizeInBytes = PairedReadSequenceWithBarcodes.getSizeInBytes();
+ } else {
+ sizeInBytes = PairedReadSequence.getSizeInBytes();
+ }
+ MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
}
/**
@@ -273,11 +383,22 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
final int recordsRead = 0;
- final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class,
- new PairedReadCodec(),
- new PairedReadComparator(),
- MAX_RECORDS_IN_RAM,
- TMP_DIR);
+ final SortingCollection<PairedReadSequence> sorter;
+ final boolean useBarcodes = (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG);
+
+ if (!useBarcodes) {
+ sorter = SortingCollection.newInstance(PairedReadSequence.class,
+ new PairedReadCodec(),
+ new PairedReadComparator(),
+ MAX_RECORDS_IN_RAM,
+ TMP_DIR);
+ } else {
+ sorter = SortingCollection.newInstance(PairedReadSequence.class,
+ new PairedReadWithBarcodesCodec(),
+ new PairedReadComparator(),
+ MAX_RECORDS_IN_RAM,
+ TMP_DIR);
+ }
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
@@ -295,7 +416,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
- prs = new PairedReadSequence();
+ prs = useBarcodes ? new PairedReadSequenceWithBarcodes() : new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg));
@@ -315,10 +436,19 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases);
+ final PairedReadSequenceWithBarcodes prsWithBarcodes = (useBarcodes) ? (PairedReadSequenceWithBarcodes) prs : null;
+
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
+ if (useBarcodes) {
+ prsWithBarcodes.barcode = getBarcodeValue(rec);
+ prsWithBarcodes.readOneBarcode = getReadOneBarcodeValue(rec);
+ }
} else {
prs.read2 = bases;
+ if (useBarcodes) {
+ prsWithBarcodes.readTwoBarcode = getReadTwoBarcodeValue(rec);
+ }
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
@@ -380,7 +510,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null) continue;
- if (matches(lhs, rhs, MAX_DIFF_RATE)) {
+ if (matches(lhs, rhs, MAX_DIFF_RATE, useBarcodes)) {
dupes.add(rhs);
seqs.set(j, null);
}
@@ -446,12 +576,22 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
* Checks to see if two read pairs have sequences that are the same, give or take a few
* errors/diffs as dictated by the maxDiffRate.
*/
- private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate) {
+ private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate, final boolean useBarcodes) {
final int read1Length = Math.min(lhs.read1.length, rhs.read1.length);
final int read2Length = Math.min(lhs.read2.length, rhs.read2.length);
final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);
int errors = 0;
+ if (useBarcodes) {
+ final PairedReadSequenceWithBarcodes lhsWithBarcodes = (PairedReadSequenceWithBarcodes) lhs;
+ final PairedReadSequenceWithBarcodes rhsWithBarcodes = (PairedReadSequenceWithBarcodes) rhs;
+ if (lhsWithBarcodes.barcode != rhsWithBarcodes.barcode ||
+ lhsWithBarcodes.readOneBarcode != rhsWithBarcodes.readOneBarcode ||
+ lhsWithBarcodes.readTwoBarcode != rhsWithBarcodes.readTwoBarcode) {
+ return false;
+ }
+ }
+
// The loop can start from MIN_IDENTICAL_BASES because we've already confirmed that
// at least those first few bases are identical when sorting.
for (int i = MIN_IDENTICAL_BASES; i < read1Length; ++i) {
diff --git a/src/java/picard/sam/markduplicates/MarkDuplicates.java b/src/java/picard/sam/markduplicates/MarkDuplicates.java
index 7e3def8..fc982a0 100644
--- a/src/java/picard/sam/markduplicates/MarkDuplicates.java
+++ b/src/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -44,6 +44,8 @@ import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
+import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
+import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec;
import java.io.*;
import java.util.*;
@@ -82,6 +84,15 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
"some of the sorting collections. If you are running out of memory, try reducing this number.")
public double SORTING_COLLECTION_SIZE_RATIO = 0.25;
+ @Option(doc = "Barcode SAM tag (ex. BC for 10X Genomics)", optional = true)
+ public String BARCODE_TAG = null;
+
+ @Option(doc = "Read one barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
+ public String READ_ONE_BARCODE_TAG = null;
+
+ @Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
+ public String READ_TWO_BARCODE_TAG = null;
+
private SortingCollection<ReadEndsForMarkDuplicates> pairSort;
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
private SortingLongCollection duplicateIndexes;
@@ -89,6 +100,18 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
protected LibraryIdGenerator libraryIdGenerator = null; // this is initialized in buildSortedReadEndLists
+ private int getBarcodeValue(final SAMRecord record) {
+ return EstimateLibraryComplexity.getReadBarcodeValue(record, BARCODE_TAG);
+ }
+
+ private int getReadOneBarcodeValue(final SAMRecord record) {
+ return EstimateLibraryComplexity.getReadBarcodeValue(record, READ_ONE_BARCODE_TAG);
+ }
+
+ private int getReadTwoBarcodeValue(final SAMRecord record) {
+ return EstimateLibraryComplexity.getReadBarcodeValue(record, READ_TWO_BARCODE_TAG);
+ }
+
public MarkDuplicates() {
DUPLICATE_SCORING_STRATEGY = ScoringStrategy.SUM_OF_BASE_QUALITIES;
}
@@ -109,11 +132,13 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(METRICS_FILE);
+ final boolean useBarcodes = (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG);
+
reportMemoryStats("Start of doWork");
log.info("Reading input file and constructing read end information.");
- buildSortedReadEndLists();
+ buildSortedReadEndLists(useBarcodes);
reportMemoryStats("After buildSortedReadEndLists");
- generateDuplicateIndexes();
+ generateDuplicateIndexes(useBarcodes);
reportMemoryStats("After generateDuplicateIndexes");
log.info("Marking " + this.numDuplicateIndices + " records as duplicates.");
@@ -146,7 +171,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
if (!rec.isSecondaryOrSupplementary()) {
- final String library = libraryIdGenerator.getLibraryName(header, rec);
+ final String library = LibraryIdGenerator.getLibraryName(header, rec);
DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library);
if (metrics == null) {
metrics = new DuplicationMetrics();
@@ -229,25 +254,43 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
* hold the necessary information (reference sequence, 5' read coordinate) to do
* duplication, caching to disk as necessary to sort them.
*/
- private void buildSortedReadEndLists() {
- final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / ReadEndsForMarkDuplicates.SIZE_OF);
+ private void buildSortedReadEndLists(final boolean useBarcodes) {
+ final int sizeInBytes;
+ if (useBarcodes) {
+ sizeInBytes = ReadEndsForMarkDuplicatesWithBarcodes.getSizeOf();
+ } else {
+ sizeInBytes = ReadEndsForMarkDuplicates.getSizeOf();
+ }
+ MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
+ final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes);
log.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
+ final ReadEndsForMarkDuplicatesCodec fragCodec, pairCodec, diskCodec;
+ if (useBarcodes) {
+ fragCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+ pairCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+ diskCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+ } else {
+ fragCodec = new ReadEndsForMarkDuplicatesCodec();
+ pairCodec = new ReadEndsForMarkDuplicatesCodec();
+ diskCodec = new ReadEndsForMarkDuplicatesCodec();
+ }
+
this.pairSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class,
- new ReadEndsForMarkDuplicatesCodec(),
- new ReadEndsMDComparator(),
+ pairCodec,
+ new ReadEndsMDComparator(useBarcodes),
maxInMemory,
TMP_DIR);
this.fragSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class,
- new ReadEndsForMarkDuplicatesCodec(),
- new ReadEndsMDComparator(),
+ fragCodec,
+ new ReadEndsMDComparator(useBarcodes),
maxInMemory,
TMP_DIR);
final SamHeaderAndIterator headerAndIterator = openInputs();
final SAMFileHeader header = headerAndIterator.header;
- final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP);
+ final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP, diskCodec);
long index = 0;
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
@@ -277,7 +320,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
}
// If this read is unmapped but sorted with the mapped reads, just skip it.
} else if (!rec.isSecondaryOrSupplementary()) {
- final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec);
+ final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec, useBarcodes);
this.fragSort.add(fragmentEnd);
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
@@ -286,7 +329,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
// See if we've already seen the first end or not
if (pairedEnds == null) {
- pairedEnds = buildReadEnds(header, index, rec);
+ pairedEnds = buildReadEnds(header, index, rec, useBarcodes);
tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
} else {
final int sequence = fragmentEnd.read1ReferenceIndex;
@@ -296,8 +339,12 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
// before updating the orientation later.
if (rec.getFirstOfPairFlag()) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
+ if (useBarcodes)
+ ((ReadEndsForMarkDuplicatesWithBarcodes) pairedEnds).readOneBarcode = getReadOneBarcodeValue(rec);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
+ if (useBarcodes)
+ ((ReadEndsForMarkDuplicatesWithBarcodes) pairedEnds).readTwoBarcode = getReadTwoBarcodeValue(rec);
}
// If the second read is actually later, just add the second read data, else flip the reads
@@ -341,8 +388,14 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
}
/** Builds a read ends object that represents a single read. */
- private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec) {
- final ReadEndsForMarkDuplicates ends = new ReadEndsForMarkDuplicates();
+ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec, final boolean useBarcodes) {
+ final ReadEndsForMarkDuplicates ends;
+
+ if (useBarcodes) {
+ ends = new ReadEndsForMarkDuplicatesWithBarcodes();
+ } else {
+ ends = new ReadEndsForMarkDuplicates();
+ }
ends.read1ReferenceIndex = rec.getReferenceIndex();
ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart();
ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F;
@@ -372,6 +425,16 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
}
}
+ if (useBarcodes) {
+ final ReadEndsForMarkDuplicatesWithBarcodes endsWithBarcode = (ReadEndsForMarkDuplicatesWithBarcodes) ends;
+ endsWithBarcode.barcode = getBarcodeValue(rec);
+ if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) {
+ endsWithBarcode.readOneBarcode = getReadOneBarcodeValue(rec);
+ } else {
+ endsWithBarcode.readTwoBarcode = getReadTwoBarcodeValue(rec);
+ }
+ }
+
return ends;
}
@@ -381,7 +444,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
*
* @return an array with an ordered list of indexes into the source file
*/
- private void generateDuplicateIndexes() {
+ private void generateDuplicateIndexes(final boolean useBarcodes) {
// Keep this number from getting too large even if there is a huge heap.
final int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF,
(double) (Integer.MAX_VALUE - 5));
@@ -397,7 +460,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
if (firstOfNextChunk == null) {
firstOfNextChunk = next;
nextChunk.add(firstOfNextChunk);
- } else if (areComparableForDuplicates(firstOfNextChunk, next, true)) {
+ } else if (areComparableForDuplicates(firstOfNextChunk, next, true, useBarcodes)) {
nextChunk.add(next);
} else {
if (nextChunk.size() > 1) {
@@ -419,7 +482,7 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
boolean containsFrags = false;
for (final ReadEndsForMarkDuplicates next : this.fragSort) {
- if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) {
+ if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false, useBarcodes)) {
nextChunk.add(next);
containsPairs = containsPairs || next.isPaired();
containsFrags = containsFrags || !next.isPaired();
@@ -443,18 +506,29 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
this.duplicateIndexes.doneAddingStartIteration();
}
- private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2) {
- boolean retval = lhs.libraryId == rhs.libraryId &&
- lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
- lhs.read1Coordinate == rhs.read1Coordinate &&
- lhs.orientation == rhs.orientation;
+ private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
+ boolean areComparable = lhs.libraryId == rhs.libraryId;
+
+ if (useBarcodes && areComparable) { // areComparable is useful here to avoid the casts below
+ final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
+ final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
+ areComparable = lhsWithBarcodes.barcode == rhsWithBarcodes.barcode &&
+ lhsWithBarcodes.readOneBarcode == rhsWithBarcodes.readOneBarcode &&
+ lhsWithBarcodes.readTwoBarcode == rhsWithBarcodes.readTwoBarcode;
+ }
+
+ if (areComparable) {
+ areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
+ lhs.read1Coordinate == rhs.read1Coordinate &&
+ lhs.orientation == rhs.orientation;
+ }
- if (retval && compareRead2) {
- retval = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
+ if (areComparable && compareRead2) {
+ areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
lhs.read2Coordinate == rhs.read2Coordinate;
}
- return retval;
+ return areComparable;
}
private void addIndexAsDuplicate(final long bamIndex) {
@@ -522,18 +596,38 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
}
}
+ // To avoid overflows or underflows when subtracting two large (positive and negative) numbers
+ static int compareInteger(final int x, final int y) {
+ return (x < y) ? -1 : ((x == y) ? 0 : 1);
+ }
+
/** Comparator for ReadEndsForMarkDuplicates that orders by read1 position then pair orientation then read2 position. */
static class ReadEndsMDComparator implements Comparator<ReadEndsForMarkDuplicates> {
+
+ final boolean useBarcodes;
+
+ public ReadEndsMDComparator(final boolean useBarcodes) {
+ this.useBarcodes = useBarcodes;
+ }
+
public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs) {
- int retval = lhs.libraryId - rhs.libraryId;
- if (retval == 0) retval = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex;
- if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate;
- if (retval == 0) retval = lhs.orientation - rhs.orientation;
- if (retval == 0) retval = lhs.read2ReferenceIndex - rhs.read2ReferenceIndex;
- if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate;
- if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);
- if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);
- return retval;
+ int compareDifference = lhs.libraryId - rhs.libraryId;
+ if (useBarcodes) {
+ final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
+ final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
+ if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.barcode, rhsWithBarcodes.barcode);
+ if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.readOneBarcode, rhsWithBarcodes.readOneBarcode);
+ if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.readTwoBarcode, rhsWithBarcodes.readTwoBarcode);
+ }
+ if (compareDifference == 0) compareDifference = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex;
+ if (compareDifference == 0) compareDifference = lhs.read1Coordinate - rhs.read1Coordinate;
+ if (compareDifference == 0) compareDifference = lhs.orientation - rhs.orientation;
+ if (compareDifference == 0) compareDifference = lhs.read2ReferenceIndex - rhs.read2ReferenceIndex;
+ if (compareDifference == 0) compareDifference = lhs.read2Coordinate - rhs.read2Coordinate;
+ if (compareDifference == 0) compareDifference = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);
+ if (compareDifference == 0) compareDifference = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);
+
+ return compareDifference;
}
}
}
diff --git a/src/java/picard/sam/markduplicates/util/DiskBasedReadEndsForMarkDuplicatesMap.java b/src/java/picard/sam/markduplicates/util/DiskBasedReadEndsForMarkDuplicatesMap.java
index 0e0fee5..e0e62c7 100644
--- a/src/java/picard/sam/markduplicates/util/DiskBasedReadEndsForMarkDuplicatesMap.java
+++ b/src/java/picard/sam/markduplicates/util/DiskBasedReadEndsForMarkDuplicatesMap.java
@@ -53,8 +53,8 @@ import java.util.*;
public class DiskBasedReadEndsForMarkDuplicatesMap implements ReadEndsForMarkDuplicatesMap {
private final CoordinateSortedPairInfoMap<String, ReadEndsForMarkDuplicates> pairInfoMap;
- public DiskBasedReadEndsForMarkDuplicatesMap(int maxOpenFiles) {
- pairInfoMap = new CoordinateSortedPairInfoMap<String, ReadEndsForMarkDuplicates>(maxOpenFiles, new Codec());
+ public DiskBasedReadEndsForMarkDuplicatesMap(int maxOpenFiles, final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec) {
+ pairInfoMap = new CoordinateSortedPairInfoMap<String, ReadEndsForMarkDuplicates>(maxOpenFiles, new Codec(readEndsForMarkDuplicatesCodec));
}
public ReadEndsForMarkDuplicates remove(int mateSequenceIndex, String key) {
@@ -74,7 +74,11 @@ public class DiskBasedReadEndsForMarkDuplicatesMap implements ReadEndsForMarkDup
}
private static class Codec implements CoordinateSortedPairInfoMap.Codec<String, ReadEndsForMarkDuplicates> {
- private final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec = new ReadEndsForMarkDuplicatesCodec();
+ private final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec;
+
+ public Codec(final ReadEndsForMarkDuplicatesCodec readEndsForMarkDuplicatesCodec) {
+ this.readEndsForMarkDuplicatesCodec = readEndsForMarkDuplicatesCodec;
+ }
public void setInputStream(final InputStream is) {
readEndsForMarkDuplicatesCodec.setInputStream(is);
diff --git a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java
index 14f381c..c55fc88 100644
--- a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java
+++ b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java
@@ -37,11 +37,38 @@ public class ReadEndsForMarkDuplicates extends ReadEnds {
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate
- long: read1IndexInFile, read2IndexInFile
*/
- public static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
+ protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
+ 8 + // last 8 == reference overhead
13; // This is determined experimentally with JProfiler
+ public static int getSizeOf() {
+ return SIZE_OF;
+ }
+
public short score = 0;
public long read1IndexInFile = -1;
public long read2IndexInFile = -1;
+
+ public ReadEndsForMarkDuplicates() {}
+
+ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
+ this.libraryId = read.getLibraryId();
+ this.orientation = read.orientation;
+ this.read1ReferenceIndex = read.read1ReferenceIndex;
+ this.read1Coordinate = read.read1Coordinate;
+ this.read2ReferenceIndex = read.read2ReferenceIndex;
+ this.read2Coordinate = read.read2Coordinate;
+
+ this.readGroup = read.getReadGroup();
+ this.tile = read.getTile();
+ this.x = read.x;
+ this.y = read.y;
+
+ this.orientationForOpticalDuplicates = read.orientationForOpticalDuplicates;
+
+ this.score = read.score;
+
+ this.read1IndexInFile = read.read1IndexInFile;
+ this.read2IndexInFile = read.read2IndexInFile;
+ }
}
\ No newline at end of file
diff --git a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java
index 09c8377..8b9d464 100644
--- a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java
+++ b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java
@@ -30,8 +30,8 @@ import java.io.*;
/** Codec for ReadEnds that just outputs the primitive fields and reads them back. */
public class ReadEndsForMarkDuplicatesCodec implements SortingCollection.Codec<ReadEndsForMarkDuplicates> {
- private DataInputStream in;
- private DataOutputStream out;
+ protected DataInputStream in;
+ protected DataOutputStream out;
public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
return new ReadEndsForMarkDuplicatesCodec();
diff --git a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodes.java b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodes.java
new file mode 100644
index 0000000..09a2c8b
--- /dev/null
+++ b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodes.java
@@ -0,0 +1,41 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates.util;
+
+public class ReadEndsForMarkDuplicatesWithBarcodes extends ReadEndsForMarkDuplicates {
+ public int barcode = 0; // primary barcode for this read (and pair)
+ public int readOneBarcode = 0; // read one barcode, 0 if not present
+ public int readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
+
+ public ReadEndsForMarkDuplicatesWithBarcodes() { }
+
+ public ReadEndsForMarkDuplicatesWithBarcodes(final ReadEndsForMarkDuplicates read) {
+ super(read);
+ }
+
+ public static int getSizeOf() {
+ return ReadEndsForMarkDuplicates.getSizeOf() + (3 * 4);
+ }
+}
diff --git a/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodesCodec.java b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodesCodec.java
new file mode 100644
index 0000000..f230a41
--- /dev/null
+++ b/src/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodesCodec.java
@@ -0,0 +1,75 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates.util;
+
+import htsjdk.samtools.util.SortingCollection;
+import picard.PicardException;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+/**
+ * Created by nhomer on 9/13/15.
+ */
+public class ReadEndsForMarkDuplicatesWithBarcodesCodec extends ReadEndsForMarkDuplicatesCodec {
+
+ @Override
+ public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
+ return new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+ }
+
+ @Override
+ public void encode(final ReadEndsForMarkDuplicates read) {
+ if (!(read instanceof ReadEndsForMarkDuplicatesWithBarcodes)) {
+ throw new PicardException("Read was not a ReadEndsForMarkDuplicatesWithBarcodes");
+ }
+ super.encode(read);
+
+ try {
+ final ReadEndsForMarkDuplicatesWithBarcodes val = (ReadEndsForMarkDuplicatesWithBarcodes)read;
+ out.writeInt(val.barcode);
+ out.writeInt(val.readOneBarcode);
+ out.writeInt(val.readTwoBarcode);
+ } catch (final IOException ioe) {
+ throw new PicardException("Exception writing ReadEnds to file.", ioe);
+ }
+ }
+
+ @Override
+ public ReadEndsForMarkDuplicates decode() {
+ final ReadEndsForMarkDuplicates parentRead = super.decode();
+ if (null == parentRead) return null; // EOF
+ final ReadEndsForMarkDuplicatesWithBarcodes read = new ReadEndsForMarkDuplicatesWithBarcodes(parentRead);
+ try {
+ read.barcode = in.readInt();
+ read.readOneBarcode = in.readInt();
+ read.readTwoBarcode = in.readInt();
+ return read;
+ } catch (final IOException ioe) {
+ throw new PicardException("Exception writing ReadEnds to file.", ioe);
+ }
+ }
+
+}
diff --git a/src/java/picard/util/MathUtil.java b/src/java/picard/util/MathUtil.java
index 92019bc..2cfebe7 100644
--- a/src/java/picard/util/MathUtil.java
+++ b/src/java/picard/util/MathUtil.java
@@ -144,7 +144,16 @@ final public class MathUtil {
return min;
}
+ /** Returns the smallest value stored in the array. */
+ public static int min(final int[] nums) {
+ int min = nums[0];
+ for (int i = 1; i < nums.length; ++i) {
+ if (nums[i] < min) min = nums[i];
+ }
+ return min;
+ }
+
/** Returns the smallest value stored in the array. */
public static short min(final short[] nums) {
short min = nums[0];
diff --git a/src/java/picard/util/QuerySortedReadPairIteratorUtil.java b/src/java/picard/util/QuerySortedReadPairIteratorUtil.java
new file mode 100644
index 0000000..989010c
--- /dev/null
+++ b/src/java/picard/util/QuerySortedReadPairIteratorUtil.java
@@ -0,0 +1,65 @@
+package picard.util;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.util.PeekableIterator;
+
+/**
+ * A collection of helper utilities for iterating through reads that are in query-name sorted
+ * read order as pairs
+ */
+public class QuerySortedReadPairIteratorUtil {
+
+ public static class ReadPair {
+ public SAMRecord read1 = null;
+ public SAMRecord read2 = null;
+ }
+
+ /**
+ * Get the next read pair (where both have the same read name).
+ * If we encounter an unpaired read, the second read in the pair will be set to null.
+ *
+ * @param iterator iterator of reads
+ * @return ReadPair object holding the reads, or null if there are no more reads in the iterator
+ */
+ public static ReadPair getNextReadPair(final PeekableIterator<SAMRecord> iterator) {
+
+ final ReadPair readPair = new ReadPair();
+ readPair.read1 = getNextUsableRead(iterator, false);
+ if (readPair.read1 == null) {
+ return null;
+ }
+
+ final SAMRecord peekedNextRead = getNextUsableRead(iterator, true);
+ if (peekedNextRead != null && peekedNextRead.getReadName().equals(readPair.read1.getReadName())) {
+ readPair.read2 = getNextUsableRead(iterator, false);
+ }
+
+ return readPair;
+ }
+
+ /**
+ * Return the next usable read in the iterator
+ *
+ * @param iterator the iterator to pull from
+ * @param justPeek if true, just peek the next usable read rather than pulling it (note: it may remove unusable reads from the iterator)
+ * @return the next read or null if none are left
+ */
+ private static SAMRecord getNextUsableRead(final PeekableIterator<SAMRecord> iterator, final boolean justPeek) {
+
+ while (iterator.hasNext()) {
+ // trash the next read if it fails PF, is secondary, or is supplementary
+ final SAMRecord nextRead = iterator.peek();
+ if (nextRead.getReadFailsVendorQualityCheckFlag() || nextRead.isSecondaryOrSupplementary()) {
+ iterator.next();
+ }
+ // otherwise, return it
+ else {
+ return justPeek ? nextRead : iterator.next();
+ }
+ }
+
+ // no good reads left
+ return null;
+ }
+
+}
\ No newline at end of file
diff --git a/src/java/picard/vcf/GenotypeConcordance.java b/src/java/picard/vcf/GenotypeConcordance.java
index ca15069..f10e973 100644
--- a/src/java/picard/vcf/GenotypeConcordance.java
+++ b/src/java/picard/vcf/GenotypeConcordance.java
@@ -358,7 +358,7 @@ public class GenotypeConcordance extends CommandLineProgram {
scheme.validateScheme();
for (final TruthState truthState : TruthState.values()) {
for (final CallState callState : CallState.values()) {
- final int count = counter.getCount(truthState, callState);
+ final long count = counter.getCount(truthState, callState);
final String contingencyValues = scheme.getContingencyStateString(truthState, callState);
if (count > 0 || OUTPUT_ALL_ROWS) {
final GenotypeConcordanceDetailMetrics detailMetrics = new GenotypeConcordanceDetailMetrics();
diff --git a/src/java/picard/vcf/GenotypeConcordanceContingencyMetrics.java b/src/java/picard/vcf/GenotypeConcordanceContingencyMetrics.java
index c9c5901..6290b24 100644
--- a/src/java/picard/vcf/GenotypeConcordanceContingencyMetrics.java
+++ b/src/java/picard/vcf/GenotypeConcordanceContingencyMetrics.java
@@ -27,7 +27,7 @@ public class GenotypeConcordanceContingencyMetrics extends MetricBase {
scheme.validateScheme();
concordanceCounts.validateCountsAgainstScheme(scheme);
- Map<ContingencyState, Integer> counts = concordanceCounts.getContingencyStateCounts(scheme);
+ Map<ContingencyState, Long> counts = concordanceCounts.getContingencyStateCounts(scheme);
this.TP_COUNT = counts.get(ContingencyState.TP);
this.TN_COUNT = counts.get(ContingencyState.TN);
this.FP_COUNT = counts.get(ContingencyState.FP);
@@ -45,17 +45,17 @@ public class GenotypeConcordanceContingencyMetrics extends MetricBase {
public String CALL_SAMPLE;
/** The TP (true positive) count across all variants */
- public int TP_COUNT;
+ public long TP_COUNT;
/** The TN (true negative) count across all variants */
- public int TN_COUNT;
+ public long TN_COUNT;
/** The FP (false positive) count across all variants */
- public int FP_COUNT;
+ public long FP_COUNT;
/** The FN (false negative) count across all variants */
- public int FN_COUNT;
+ public long FN_COUNT;
/** The empty (no contingency info) count across all variants */
- public int EMPTY_COUNT;
+ public long EMPTY_COUNT;
}
diff --git a/src/java/picard/vcf/GenotypeConcordanceCounts.java b/src/java/picard/vcf/GenotypeConcordanceCounts.java
index 29340ee..18d169f 100644
--- a/src/java/picard/vcf/GenotypeConcordanceCounts.java
+++ b/src/java/picard/vcf/GenotypeConcordanceCounts.java
@@ -103,7 +103,7 @@ public class GenotypeConcordanceCounts {
}
else if (includeHomRef || isVar(truthState, callState)) {
final TruthAndCallStates truthAndCallStates = new TruthAndCallStates(truthState, callState);
- final int count = getCount(truthAndCallStates);
+ final long count = getCount(truthAndCallStates);
if (truthState.getCode()==callState.getCode()) {
//If we enter this, we are 'on the diagonal'
numerator += count;
@@ -146,7 +146,7 @@ public class GenotypeConcordanceCounts {
for (final TruthState truthState : truthStateArray) {
for (final CallState callState : CallState.values()) {
final TruthAndCallStates truthAndCallStates = new TruthAndCallStates(truthState, callState);
- final int count = getCount(truthAndCallStates);
+ final long count = getCount(truthAndCallStates);
for (final ContingencyState contingencyState : scheme.getConcordanceStateArray(truthAndCallStates)) {
if (ContingencyState.TP == contingencyState) {
numerator += count;
@@ -176,7 +176,7 @@ public class GenotypeConcordanceCounts {
for (final CallState callState : callStateList) {
for (final TruthState truthState : TruthState.values()) {
final TruthAndCallStates truthAndCallStates = new TruthAndCallStates(truthState, callState);
- final int count = getCount(truthAndCallStates);
+ final long count = getCount(truthAndCallStates);
for (final ContingencyState contingencyState : scheme.getConcordanceStateArray(truthAndCallStates)) {
if (ContingencyState.TP == contingencyState) {
numerator += count;
@@ -206,7 +206,7 @@ public class GenotypeConcordanceCounts {
for (final TruthState truthState : truthStateArray) {
for (final CallState callState : CallState.values()) {
final TruthAndCallStates truthAndCallStates = new TruthAndCallStates(truthState, callState);
- final int count = getCount(truthAndCallStates);
+ final long count = getCount(truthAndCallStates);
for (final ContingencyState contingencyState : scheme.getConcordanceStateArray(truthAndCallStates)) {
if (ContingencyState.TN == contingencyState) {
numerator += count;
@@ -223,16 +223,16 @@ public class GenotypeConcordanceCounts {
/**
* Returns the count defined by the truth state set and call state set.
*/
- public int getCount(final TruthState truthState, final CallState callState) {
+ public long getCount(final TruthState truthState, final CallState callState) {
return getCount(new TruthAndCallStates(truthState, callState));
}
/**
* Returns the count defined by the truth state set and call state set.
*/
- public int getCount(final TruthAndCallStates truthAndCallStates) {
+ public long getCount(final TruthAndCallStates truthAndCallStates) {
final Histogram<TruthAndCallStates>.Bin bin = this.counter.get(truthAndCallStates);
- return (bin == null ? 0 : (int) bin.getValue());
+ return (bin == null ? 0L : (long) bin.getValue());
}
/**
@@ -259,8 +259,8 @@ public class GenotypeConcordanceCounts {
/**
* Returns the sum of all pairs of tuples defined by the truth state set and call state set.
*/
- public int getSum(final Set<TruthState> truthStateSet, final Set<CallState> callStateSet) {
- int count = 0;
+ public long getSum(final Set<TruthState> truthStateSet, final Set<CallState> callStateSet) {
+ long count = 0;
for (final TruthState truthState : truthStateSet) {
for (final CallState callState : callStateSet) {
count += getCount(truthState, callState);
@@ -272,19 +272,19 @@ public class GenotypeConcordanceCounts {
/**
* Returns the sum of all pairs of tuples defined by the truth state set and call state set.
*/
- public int getSum() {
+ public long getSum() {
return getSum(new HashSet<TruthState>(Arrays.asList(TruthState.values())), new HashSet<CallState>(Arrays.asList(CallState.values())));
}
/**
* Returns the total number of times each contingency state is encountered, summed across all truth/call state pairs.
*/
- public Map<ContingencyState, Integer> getContingencyStateCounts(final GenotypeConcordanceScheme scheme) {
+ public Map<ContingencyState, Long> getContingencyStateCounts(final GenotypeConcordanceScheme scheme) {
scheme.validateScheme();
- final Map<ContingencyState, Integer> counts = new HashMap<ContingencyState, Integer>();
+ final Map<ContingencyState, Long> counts = new HashMap<ContingencyState, Long>();
for (final ContingencyState contingencyState : ContingencyState.values()) {
- counts.put(contingencyState, 0);
+ counts.put(contingencyState, 0L);
}
for (final TruthState truthState : TruthState.values()) {
@@ -292,7 +292,7 @@ public class GenotypeConcordanceCounts {
final TruthAndCallStates truthAndCallStates = new TruthAndCallStates(truthState, callState);
final ContingencyState[] contingencyStateArray = scheme.getConcordanceStateArray(truthAndCallStates);
for (final ContingencyState contingencyState : contingencyStateArray) {
- final int newCount = counts.get(contingencyState) + getCount(truthAndCallStates);
+ final long newCount = counts.get(contingencyState) + getCount(truthAndCallStates);
counts.put(contingencyState, newCount);
}
}
diff --git a/src/java/picard/vcf/LiftoverVcf.java b/src/java/picard/vcf/LiftoverVcf.java
index 0d8cb26..4b5346e 100644
--- a/src/java/picard/vcf/LiftoverVcf.java
+++ b/src/java/picard/vcf/LiftoverVcf.java
@@ -2,6 +2,7 @@ package picard.vcf;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
@@ -13,9 +14,7 @@ import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
@@ -71,14 +70,18 @@ public class LiftoverVcf extends CommandLineProgram {
public File REFERENCE_SEQUENCE = Defaults.REFERENCE_FASTA;
/** Filter name to use when a target cannot be lifted over. */
- public static final String FILTER_CANNOT_LIFTOVER = "FailedLiftover";
+ public static final String FILTER_CANNOT_LIFTOVER_INDEL = "ReverseComplementedIndel";
+
+ /** Filter name to use when a target cannot be lifted over. */
+ public static final String FILTER_NO_TARGET = "NoTarget";
/** Filter name to use when a target is lifted over, but the reference allele doesn't match the new reference. */
public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele";
/** Filters to be added to the REJECT file. */
private static final List<VCFFilterHeaderLine> FILTERS = CollectionUtil.makeList(
- new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER, "Variant could not be lifted between genome builds."),
+ new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_INDEL, "Indel falls into a reverse complemented region in the target genome."),
+ new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."),
new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover.")
);
@@ -135,31 +138,39 @@ public class LiftoverVcf extends CommandLineProgram {
log.info("Lifting variants over and sorting.");
final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(VariantContext.class,
- new VCFRecordCodec(outHeader),
+ new VCFRecordCodec(outHeader, VALIDATION_STRINGENCY != ValidationStringency.STRICT),
outHeader.getVCFRecordComparator(),
MAX_RECORDS_IN_RAM,
TMP_DIR);
ProgressLogger progress = new ProgressLogger(log, 1000000, "read");
+ // a mapping from original allele to reverse complemented allele
+ final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(10);
for (final VariantContext ctx : in) {
++total;
final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
final Interval target = liftOver.liftOver(source, 1.0);
- if (target == null) {
- rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
+ // if the target is null OR (the target is reverse complemented AND the variant is an indel or mixed), then we cannot lift it over
+ if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel()))) {
+ final String reason = (target == null) ? FILTER_NO_TARGET : FILTER_CANNOT_LIFTOVER_INDEL;
+ rejects.add(new VariantContextBuilder(ctx).filter(reason).make());
failedLiftover++;
}
else {
// Fix the alleles if we went from positive to negative strand
+ reverseComplementAlleleMap.clear();
final List<Allele> alleles = new ArrayList<Allele>();
+
for (final Allele oldAllele : ctx.getAlleles()) {
if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
alleles.add(oldAllele);
}
else {
- alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
+ final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
+ alleles.add(fixedAllele);
+ reverseComplementAlleleMap.put(oldAllele, fixedAllele);
}
}
@@ -173,7 +184,7 @@ public class LiftoverVcf extends CommandLineProgram {
builder.id(ctx.getID());
builder.attributes(ctx.getAttributes());
- builder.genotypes(ctx.getGenotypes());
+ builder.genotypes(fixGenotypes(ctx.getGenotypes(), reverseComplementAlleleMap));
builder.filters(ctx.getFilters());
builder.log10PError(ctx.getLog10PError());
@@ -230,4 +241,22 @@ public class LiftoverVcf extends CommandLineProgram {
return 0;
}
+
+ protected static GenotypesContext fixGenotypes(final GenotypesContext originals, final Map<Allele, Allele> reverseComplementAlleleMap) {
+ // optimization: if nothing needs to be fixed then don't bother
+ if ( reverseComplementAlleleMap.isEmpty() ) {
+ return originals;
+ }
+
+ final GenotypesContext fixedGenotypes = GenotypesContext.create(originals.size());
+ for ( final Genotype genotype : originals ) {
+ final List<Allele> fixedAlleles = new ArrayList<Allele>();
+ for ( final Allele allele : genotype.getAlleles() ) {
+ final Allele fixedAllele = reverseComplementAlleleMap.containsKey(allele) ? reverseComplementAlleleMap.get(allele) : allele;
+ fixedAlleles.add(fixedAllele);
+ }
+ fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make());
+ }
+ return fixedGenotypes;
+ }
}
\ No newline at end of file
diff --git a/src/java/picard/vcf/SortVcf.java b/src/java/picard/vcf/SortVcf.java
index 8af8ed8..005a018 100644
--- a/src/java/picard/vcf/SortVcf.java
+++ b/src/java/picard/vcf/SortVcf.java
@@ -2,6 +2,7 @@ package picard.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
@@ -143,7 +144,7 @@ public class SortVcf extends CommandLineProgram {
final SortingCollection<VariantContext> sorter =
SortingCollection.newInstance(
VariantContext.class,
- new VCFRecordCodec(outputHeader),
+ new VCFRecordCodec(outputHeader, VALIDATION_STRINGENCY != ValidationStringency.STRICT),
outputHeader.getVCFRecordComparator(),
MAX_RECORDS_IN_RAM,
TMP_DIR);
diff --git a/src/java/picard/vcf/filter/FilterVcf.java b/src/java/picard/vcf/filter/FilterVcf.java
index 886f16e..0c277f5 100644
--- a/src/java/picard/vcf/filter/FilterVcf.java
+++ b/src/java/picard/vcf/filter/FilterVcf.java
@@ -23,6 +23,7 @@
*/
package picard.vcf.filter;
+import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
@@ -33,6 +34,7 @@ import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
+import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -94,8 +96,18 @@ public class FilterVcf extends CommandLineProgram {
final VCFFileReader in = new VCFFileReader(INPUT, false);
final FilterApplyingVariantIterator iterator = new FilterApplyingVariantIterator(in.iterator(), variantFilters, genotypeFilters);
- final VariantContextWriter out = new VariantContextWriterBuilder().setOutputFile(OUTPUT).build();
final VCFHeader header = in.getFileHeader();
+ // If the user is writing to a .bcf or .vcf, VariantContextWriterBuilder requires a Sequence Dictionary. Make sure that the
+ // Input VCF has one.
+ final VariantContextWriterBuilder variantContextWriterBuilder = new VariantContextWriterBuilder();
+ if (isVcfOrBcf(OUTPUT)) {
+ final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary();
+ if (sequenceDictionary == null) {
+ throw new PicardException("The input vcf must have a sequence dictionary in order to create indexed vcf or bcfs.");
+ }
+ variantContextWriterBuilder.setReferenceDictionary(sequenceDictionary);
+ }
+ final VariantContextWriter out = variantContextWriterBuilder.setOutputFile(OUTPUT).build();
header.addMetaDataLine(new VCFFilterHeaderLine("AllGtsFiltered", "Site filtered out because all genotypes are filtered out."));
header.addMetaDataLine(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters."));
for (final VariantFilter filter : variantFilters) {
@@ -114,4 +126,9 @@ public class FilterVcf extends CommandLineProgram {
in.close();
return 0;
}
+
+ private boolean isVcfOrBcf(final File file) {
+ final String fileName = file.getName();
+ return fileName.endsWith(".vcf") || fileName.endsWith(".bcf");
+ }
}
diff --git a/src/scripts/picard/analysis/insertSizeHistogram.R b/src/scripts/picard/analysis/insertSizeHistogram.R
index a2cdd32..b6a0bf1 100644
--- a/src/scripts/picard/analysis/insertSizeHistogram.R
+++ b/src/scripts/picard/analysis/insertSizeHistogram.R
@@ -25,6 +25,16 @@ for (i in 1:length(startFinder)) {
}
}
+getCumulative <- function(y, yrange) {
+ yNew <- rep(0, nrow(y));
+ yLength <- nrow(y)
+ ySum <- sum(y[,1])
+ for (i in 1:yLength) {
+ yNew[i] <- (yrange * sum(y[i:yLength,1]) / ySum)
+ }
+ return (yNew)
+}
+
histogram <- read.table(metricsFile, header=TRUE, sep="\t", skip=secondBlankLine, comment.char="", quote='', check.names=FALSE)
## The histogram has a fr_count/rf_count/tandem_count for each metric "level"
@@ -33,11 +43,10 @@ headers <- sapply(sub(".fr_count","",names(histogram),fixed=TRUE), "[[" ,1)
headers <- sapply(sub(".rf_count","",headers,fixed=TRUE), "[[" ,1)
headers <- sapply(sub(".tandem_count","",headers,fixed=TRUE), "[[" ,1)
-## Duplicated header names cause this to barf. KT & Yossi report that this is going to be extremely difficult to
-## resolve and it's unlikely that anyone cares anyways. Trap this situation and avoid the PDF so it won't cause
-## the workflow to fail
+## Duplicate header names could cause this to barf. But it really shouldn't when we have "All_reads.fr_count" and
+## "All_reads.rf_count", for example. If duplicates do occur, deduplicate the headers and plot anyway instead of skipping the PDF.
if (any(duplicated(headers))) {
- print(paste("Not creating insert size PDF as there are duplicated header names:", headers[which(duplicated(headers))]))
+ levels = unique(headers[2:length(headers)]);
} else {
levels <- c()
for (i in 2:length(headers)) {
@@ -45,54 +54,58 @@ if (any(duplicated(headers))) {
levels[length(levels)+1] <- headers[i]
}
}
+}
- pdf(pdfFile)
-
- for (i in 1:length(levels)) {
- ## Reconstitutes the histogram column headers for this level
- fr <- paste(levels[i], "fr_count", sep=".")
- rf <- paste(levels[i], "rf_count", sep=".")
- tandem <- paste(levels[i], "tandem_count", sep=".")
-
- frrange = ifelse(fr %in% names(histogram), max(histogram[fr]), 0)
- rfrange = ifelse(rf %in% names(histogram), max(histogram[rf]), 0)
- tandemrange = ifelse(tandem %in% names(histogram), max(histogram[tandem]), 0)
-
- yrange <- max(frrange, rfrange, tandemrange)
- xrange <- ifelse(histoWidth > 0, histoWidth, max(histogram$insert_size))
-
- plot(x=NULL, y=NULL,
- type="n",
- main=paste("Insert Size Histogram for", levels[i], "\nin file", bamName),
- xlab="Insert Size",
- ylab="Count",
- xlim=range(0, xrange),
- ylim=range(0, yrange))
-
- colors <- c()
- labels <- c()
-
- if (fr %in% names(histogram) ) {
- lines(histogram$insert_size, as.matrix(histogram[fr]), type="h", col="red")
- colors <- c(colors, "red")
- labels <- c(labels, "FR")
- }
- if (rf %in% names(histogram)) {
- lines(histogram$insert_size, as.matrix(histogram[rf]), type="h", col="blue")
- colors <- c(colors, "blue")
- labels <- c(labels, "RF")
- }
-
- if (tandem %in% names(histogram)) {
- lines(histogram$insert_size, as.matrix(histogram[tandem]), type="h", col="orange")
- colors <- c(colors, "orange")
- labels <- c(labels, "TANDEM")
- }
-
- ## Create the legend
- legend("topright", labels, fill=colors, col=colors, cex=0.7)
+pdf(pdfFile)
+
+for (i in 1:length(levels)) {
+ ## Reconstitutes the histogram column headers for this level
+ fr <- paste(levels[i], "fr_count", sep=".")
+ rf <- paste(levels[i], "rf_count", sep=".")
+ tandem <- paste(levels[i], "tandem_count", sep=".")
+
+ frrange = ifelse(fr %in% names(histogram), max(histogram[fr]), 0)
+ rfrange = ifelse(rf %in% names(histogram), max(histogram[rf]), 0)
+ tandemrange = ifelse(tandem %in% names(histogram), max(histogram[tandem]), 0)
+
+ yrange <- max(frrange, rfrange, tandemrange)
+ xrange <- ifelse(histoWidth > 0, histoWidth, max(histogram$insert_size))
+
+ par(mar=c(5,4,4,4));
+ plot(x=NULL, y=NULL,
+ type="n",
+ main=paste("Insert Size Histogram for", levels[i], "\nin file", bamName),
+ xlab="Insert Size",
+ ylab="Count",
+ xlim=range(0, xrange),
+ ylim=range(0, yrange))
+ axis(side=4, at=seq(from=0, to=1, by=0.1)*yrange, labels=seq(from=0, to=1, by=0.10));
+ mtext(side=4, line=2, text="cumulative fraction of reads > insert size");
+
+ colors <- c()
+ labels <- c()
+
+ if (fr %in% names(histogram) ) {
+ lines(histogram$insert_size, as.matrix(histogram[fr]), type="h", col="red")
+ lines(histogram$insert_size, getCumulative(histogram[fr], yrange), col="darkred", lty=2)
+ colors <- c(colors, "red")
+ labels <- c(labels, "FR")
+ }
+ if (rf %in% names(histogram)) {
+ lines(histogram$insert_size, as.matrix(histogram[rf]), type="h", col="blue")
+ lines(histogram$insert_size, getCumulative(histogram[rf], yrange), col="darkblue", lty=2)
+ colors <- c(colors, "blue")
+ labels <- c(labels, "RF")
+ }
+ if (tandem %in% names(histogram)) {
+ lines(histogram$insert_size, as.matrix(histogram[tandem]), type="h", col="orange")
+ lines(histogram$insert_size, getCumulative(histogram[tandem], yrange), col="darkorange", lty=2)
+ colors <- c(colors, "orange")
+ labels <- c(labels, "TANDEM")
}
- dev.off()
+ ## Create the legend
+ legend("topright", labels, fill=colors, col=colors, cex=0.7)
}
+dev.off()
diff --git a/src/scripts/picard/docker_helper.sh b/src/scripts/picard/docker_helper.sh
index 405c3f1..7020441 100755
--- a/src/scripts/picard/docker_helper.sh
+++ b/src/scripts/picard/docker_helper.sh
@@ -24,4 +24,4 @@ done
shift $(expr $OPTIND - 1)
TOOL_WITH_ARGS=$@
-java ${JVM_ARGS} -jar picard.jar ${TOOL_WITH_ARGS}
\ No newline at end of file
+java ${JVM_ARGS} -jar /usr/picard/picard.jar ${TOOL_WITH_ARGS}
diff --git a/src/tests/java/picard/analysis/CollectGcBiasMetricsTest.java b/src/tests/java/picard/analysis/CollectGcBiasMetricsTest.java
index 33c27a3..ba8fe96 100644
--- a/src/tests/java/picard/analysis/CollectGcBiasMetricsTest.java
+++ b/src/tests/java/picard/analysis/CollectGcBiasMetricsTest.java
@@ -6,24 +6,21 @@ import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordSetBuilder;
-import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.metrics.MetricsFile;
-import htsjdk.samtools.util.BufferedLineReader;
-import htsjdk.samtools.util.IOUtil;
+import htsjdk.samtools.SAMException;
+import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import org.testng.Assert;
-import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.sam.SortSam;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Random;
-import static org.testng.Assert.assertEquals;
/**
* Created by kbergin on 3/26/15 to test GcBias MultiLevel Collector.
@@ -38,136 +35,87 @@ public class CollectGcBiasMetricsTest extends CommandLineProgramTest {
private final static String library1 = "TestLibrary1";
private final static String library2 = "TestLibrary2";
private final static String library3 = "TestLibrary3";
+ private final static int LENGTH = 99;
+ private final static int NUM_READS = 100;
+ private final static String READ_NAME = "TESTBARCODE";
private final static File TEST_DIR = new File("testdata/picard/sam/CollectGcBiasMetrics/");
- private final File dict = new File(TEST_DIR, "Mheader.dict");
+ private final File dict = new File(TEST_DIR, "MNOheader.dict");
- File tempSamFile;
+ File tempSamFileChrM_O;
+ File tempSamFileAllChr;
SAMRecordSetBuilder setBuilder1 = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
SAMRecordSetBuilder setBuilder2 = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
SAMRecordSetBuilder setBuilder3 = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
+ SAMRecordSetBuilder setBuilder4 = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
+
SAMReadGroupRecord readGroupRecord1 = new SAMReadGroupRecord(readGroupId1);
SAMReadGroupRecord readGroupRecord2 = new SAMReadGroupRecord(readGroupId2);
SAMReadGroupRecord readGroupRecord3 = new SAMReadGroupRecord(readGroupId3);
- //create a samfile with different samples, read groups and libraries that overlap for testing.
+ /////////////////////////////////////////////////////////////////////////////
+ //create two Sam Files.
+ //One with different samples, read groups and libraries that overlap for runGcBiasMultiLevelTest. Reads will align to chrM and O, not N.
+ //Second Sam file is one sample/read group/library but has reads that align to all three chr (M,N,O). For runWindowsComparisonTest.
+ /////////////////////////////////////////////////////////////////////////////
+
@BeforeTest
void setupBuilder() throws IOException {
- final int numReads = 100;
- final String flowCellBarcode = "TESTBARCODE";
- final String readName = flowCellBarcode;
+ tempSamFileChrM_O = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
+ tempSamFileAllChr = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
+ tempSamFileChrM_O.deleteOnExit();
+ tempSamFileAllChr.deleteOnExit();
- tempSamFile = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
- File tempSamFileUnsorted = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
+ final File tempSamFileUnsorted = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
tempSamFileUnsorted.deleteOnExit();
- tempSamFile.deleteOnExit();
- BufferedLineReader bufferedLineReader = null;
+
+ final SAMFileHeader header = new SAMFileHeader();
+
try {
- bufferedLineReader = new BufferedLineReader(new FileInputStream(dict));
- } catch (FileNotFoundException e) {
+ header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(dict));
+ header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
+ } catch (final SAMException e) {
e.printStackTrace();
}
- final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
- final SAMFileHeader header = codec.decode(bufferedLineReader, dict.toString());
- header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
-
//build different levels to put into the same bam file for testing multi level collection
- setup(numReads, readName, 1, readGroupId1, readGroupRecord1, sample1, library1, header, setBuilder1); //Sample 1, Library 1, RG 1
- setup(numReads, readName, 2, readGroupId2, readGroupRecord2, sample1, library2, header, setBuilder2); //Sample 1, Library 2, RG 2
- setup(numReads, readName, 3, readGroupId3, readGroupRecord3, sample2, library3, header, setBuilder3); //Sample 2, Library 3, RG 3
+ setupTest1(1, readGroupId1, readGroupRecord1, sample1, library1, header, setBuilder1); //Sample 1, Library 1, RG 1
+ setupTest1(2, readGroupId2, readGroupRecord2, sample1, library2, header, setBuilder2); //Sample 1, Library 2, RG 2
+ setupTest1(3, readGroupId3, readGroupRecord3, sample2, library3, header, setBuilder3); //Sample 2, Library 3, RG 3
- final SAMFileWriter writer = new SAMFileWriterFactory()
- .setCreateIndex(true).makeBAMWriter(header, false, tempSamFileUnsorted);
+ //build one last readgroup for comparing that window count stays the same whether you use all contigs or not
+ setupTest2(1, readGroupId1, readGroupRecord1, sample1, library1, header, setBuilder4);
- for (final SAMRecord record : setBuilder1) {
- writer.addAlignment(record);
- }
- for (final SAMRecord record : setBuilder2) {
- writer.addAlignment(record);
- }
- for (final SAMRecord record : setBuilder3) {
- writer.addAlignment(record);
- }
- writer.close();
+ final List<SAMRecordSetBuilder> test1Builders = new ArrayList<SAMRecordSetBuilder>();
+ test1Builders.add(setBuilder1);
+ test1Builders.add(setBuilder2);
+ test1Builders.add(setBuilder3);
- //sort the temp file
- final SortSam sorter = new SortSam();
- final String[] args = new String[]{"INPUT=" + tempSamFileUnsorted.getAbsolutePath(), "OUTPUT=" + tempSamFile.getAbsolutePath(), "SORT_ORDER=coordinate"};
+ final List<SAMRecordSetBuilder> test2Builders = new ArrayList<SAMRecordSetBuilder>();
+ test2Builders.add(setBuilder4);
- sorter.instanceMain(args);
- }
-
- void setup(final int numReads,
- final String readName,
- final int ID,
- final String readGroupId,
- final SAMReadGroupRecord readGroupRecord,
- final String sample,
- final String library,
- final SAMFileHeader header,
- final SAMRecordSetBuilder setBuilder) throws IOException {
-
- final String separator = ":";
- readGroupRecord.setSample(sample);
- readGroupRecord.setPlatform(platform);
- readGroupRecord.setLibrary(library);
- readGroupRecord.setPlatformUnit(readGroupId);
- header.addReadGroup(readGroupRecord);
- setBuilder.setReadGroup(readGroupRecord);
- setBuilder.setUseNmFlag(true);
-
- setBuilder.setHeader(header);
-
- final int max = 15000;
- final int min = 1;
- final Random rg = new Random(5);
-
- for (int i = 0; i < numReads; i++) {
- final int start = rg.nextInt(max) + min;
- final String newReadName = readName + separator + ID + separator + i;
- setBuilder.addPair(newReadName, 0, start+ID, start+ID+99);
- }
+ tempSamFileChrM_O = build(test1Builders, tempSamFileUnsorted, header);
+ tempSamFileAllChr = build(test2Builders, tempSamFileUnsorted, header);
}
public String getCommandLineProgramName() {
return CollectGcBiasMetrics.class.getSimpleName();
}
- @Test //test all collection levels
- public void test() throws IOException{
- runTest(tempSamFile);
- }
-
- public void runTest(final File input) throws IOException {
+ /////////////////////////////////////////////////////////////////////////////
+ //This test checks the functionality of the gc bias code. Compares values from running a generated temporary Sam file through
+ // CollectGcBiasMetrics to manually-calculated values.
+ /////////////////////////////////////////////////////////////////////////////
+ @Test
+ public void runGcBiasMultiLevelTest() throws IOException {
final File outfile = File.createTempFile("test", ".gc_bias_summary_metrics");
final File detailsOutfile = File.createTempFile("test", ".gc_bias_detail_metrics");
- final File pdf = File.createTempFile("test", ".pdf");
- final String referenceFile = "testdata/picard/quality/chrM.reference.fasta";
- final int windowSize = 100;
- final double minGenFraction = 1.0E-5;
- final boolean biSulfiteSeq = false;
- final boolean assumeSorted = false;
outfile.deleteOnExit();
detailsOutfile.deleteOnExit();
- pdf.deleteOnExit();
- final String[] args = new String[]{
- "INPUT=" + input.getAbsolutePath(),
- "OUTPUT=" + detailsOutfile.getAbsolutePath(),
- "REFERENCE_SEQUENCE=" + referenceFile,
- "SUMMARY_OUTPUT=" + outfile.getAbsolutePath(),
- "CHART_OUTPUT=" + pdf.getAbsolutePath(),
- "WINDOW_SIZE=" + windowSize,
- "MINIMUM_GENOME_FRACTION=" + minGenFraction,
- "IS_BISULFITE_SEQUENCED=" + biSulfiteSeq,
- "LEVEL=ALL_READS",
- "LEVEL=SAMPLE",
- "LEVEL=READ_GROUP",
- "ASSUME_SORTED=" + assumeSorted
- };
- Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ runGcBias(tempSamFileChrM_O, outfile, detailsOutfile);
final MetricsFile<GcBiasSummaryMetrics, Comparable<?>> output = new MetricsFile<GcBiasSummaryMetrics, Comparable<?>>();
output.read(new FileReader(outfile));
@@ -176,36 +124,211 @@ public class CollectGcBiasMetricsTest extends CommandLineProgramTest {
if (metrics.ACCUMULATION_LEVEL.equals("All Reads")) { //ALL_READS level
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 300);
Assert.assertEquals(metrics.ALIGNED_READS, 600);
- Assert.assertEquals(metrics.AT_DROPOUT, 7.234062);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.086217);
+ Assert.assertEquals(metrics.AT_DROPOUT, 21.624498);
+ Assert.assertEquals(metrics.GC_DROPOUT, 3.525922);
} else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup1")) { //Library 1
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 9.20674);
- Assert.assertEquals(metrics.GC_DROPOUT, 3.834244);
+ Assert.assertEquals(metrics.AT_DROPOUT, 23.627784);
+ Assert.assertEquals(metrics.GC_DROPOUT, 2.582877);
} else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup2")) {//Library 2
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 10.144505);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.08986);
+ Assert.assertEquals(metrics.AT_DROPOUT, 23.784958);
+ Assert.assertEquals(metrics.GC_DROPOUT, 4.025922);
} else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup3")) {//Library 3
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 9.229205);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.977838);
+ Assert.assertEquals(metrics.AT_DROPOUT, 21.962578);
+ Assert.assertEquals(metrics.GC_DROPOUT, 4.559328);
} else if (metrics.SAMPLE != null && metrics.SAMPLE.equals("TestSample1")) {//Library 1 and 2
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 200);
Assert.assertEquals(metrics.ALIGNED_READS, 400);
- Assert.assertEquals(metrics.AT_DROPOUT, 7.410747);
- Assert.assertEquals(metrics.GC_DROPOUT, 3.83986);
+ Assert.assertEquals(metrics.AT_DROPOUT, 23.194597);
+ Assert.assertEquals(metrics.GC_DROPOUT, 3.275922);
} else if (metrics.SAMPLE != null && metrics.SAMPLE.equals("TestSample2")) {//Library 3
Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 9.229205);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.977838);
+ Assert.assertEquals(metrics.AT_DROPOUT, 21.962578);
+ Assert.assertEquals(metrics.GC_DROPOUT, 4.559328);
} else {
Assert.fail("Unexpected metric: " + metrics);
}
}
}
+
+ /////////////////////////////////////////////////////////////////////////////
+ //Compare GcBiasDetailMetrics output file from test1 which only has reads that align to chrM and chrO, but not chrN (in the middle) to
+ // a GcBiasDetailMetrics output file that has reads aligned to all three chromosomes in this reference file. The number of 100bp windows
+ // found across the whole reference should be the same regardless of where records align.
+ //This test ensures that there is not a bug in calculating the gc windows.
+ /////////////////////////////////////////////////////////////////////////////
+ @Test
+ public void runWindowsComparisonTest() throws IOException {
+ final File outfile = File.createTempFile("test", ".gc_bias_summary_metrics");
+ final File allChrOutFile = File.createTempFile("testAllChr", ".gc_bias_summary_metrics");
+ final File detailsOutfile = File.createTempFile("test", ".gc_bias_detail_metrics");
+ final File allChrDetailsOutfile = File.createTempFile("testAllChrDetails", ".gc_bias_detail_metrics");
+ outfile.deleteOnExit();
+ allChrOutFile.deleteOnExit();
+ detailsOutfile.deleteOnExit();
+ allChrDetailsOutfile.deleteOnExit();
+
+ runGcBias(tempSamFileChrM_O, outfile, detailsOutfile);
+ runGcBias(tempSamFileAllChr, allChrOutFile, allChrDetailsOutfile);
+
+ final MetricsFile<GcBiasDetailMetrics, Comparable<?>> outputDetails = new MetricsFile<GcBiasDetailMetrics, Comparable<?>>();
+ outputDetails.read(new FileReader(detailsOutfile));
+ final List<GcBiasDetailMetrics> details = outputDetails.getMetrics();
+
+ final MetricsFile<GcBiasDetailMetrics, Comparable<?>> outputAllChrDetails = new MetricsFile<GcBiasDetailMetrics, Comparable<?>>();
+ outputAllChrDetails.read(new FileReader(allChrDetailsOutfile));
+
+ int i = 0;
+
+ //Output for the two sam files are only the same for the "All Reads" level
+ for (final GcBiasDetailMetrics metrics : outputAllChrDetails.getMetrics()) {
+ if (metrics.ACCUMULATION_LEVEL.equals("All Reads")) {
+ Assert.assertEquals(metrics.WINDOWS, details.get(i).WINDOWS);
+ i++;
+ }
+ else {break;}
+ }
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Writes the setBuilders to a SAMFileWriter and sorts the sam.
+ // Takes in a list of SAMRecordSetBuilders because of the multi-level collection: setBuilders cannot take in more than one read group
+ // or library or sample, so there are separate ones for each type when testing multi-level collection.
+ /////////////////////////////////////////////////////////////////////////////
+ public File build (final List<SAMRecordSetBuilder> setBuilder, final File unsortedSam, final SAMFileHeader header) throws IOException {
+ final File sortedSam = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
+ sortedSam.deleteOnExit();
+
+ final SAMFileWriter writer = new SAMFileWriterFactory()
+ .setCreateIndex(true).makeBAMWriter(header, false, unsortedSam);
+
+ for( final SAMRecordSetBuilder subSetBuilder : setBuilder){
+ for (final SAMRecord record : subSetBuilder) {
+ writer.addAlignment(record);
+ }
+ }
+ writer.close();
+
+ final SortSam sorter = new SortSam();
+ final String[] args = new String[] {
+ "INPUT=" + unsortedSam.getAbsolutePath(),
+ "OUTPUT=" + sortedSam.getAbsolutePath(),
+ "SORT_ORDER=coordinate"
+ };
+
+ sorter.instanceMain(args);
+
+ return sortedSam;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Runs CollectGcBias with input Sam file and outputs details and summary files for truth assertion.
+ /////////////////////////////////////////////////////////////////////////////
+ public void runGcBias (final File input, final File outfile, final File detailsOutfile) throws IOException {
+ final String referenceFile = "testdata/picard/metrics/chrMNO.reference.fasta";
+ final File pdf = File.createTempFile("test", ".pdf");
+ pdf.deleteOnExit();
+
+ final int windowSize = 100;
+ final double minGenFraction = 1.0E-5;
+ final boolean biSulfiteSeq = false;
+ final boolean assumeSorted = false;
+
+ final String[] args = new String[]{
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + detailsOutfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + referenceFile,
+ "SUMMARY_OUTPUT=" + outfile.getAbsolutePath(),
+ "CHART_OUTPUT=" + pdf.getAbsolutePath(),
+ "SCAN_WINDOW_SIZE=" + windowSize,
+ "MINIMUM_GENOME_FRACTION=" + minGenFraction,
+ "IS_BISULFITE_SEQUENCED=" + biSulfiteSeq,
+ "LEVEL=ALL_READS",
+ "LEVEL=SAMPLE",
+ "LEVEL=READ_GROUP",
+ "ASSUME_SORTED=" + assumeSorted
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ //Used to generate the Sam Record Sets with SamRecordSetBuilder.addPair().
+ //testNumber 1: runGcBiasMultiLevelTest, generates records aligning to chrM and chrO
+ //testNumber 2: runWindowsComparisonTest, generates records aligning to chrM,N,O.
+ /////////////////////////////////////////////////////////////////////////////
+ public void setupTest1(final int ID, final String readGroupId, final SAMReadGroupRecord readGroupRecord, final String sample,
+ final String library, final SAMFileHeader header, final SAMRecordSetBuilder setBuilder)
+ throws IOException {
+
+ final String separator = ":";
+ final int contig1 = 0;
+ final int contig2 = 1;
+ readGroupRecord.setSample(sample);
+ readGroupRecord.setPlatform(platform);
+ readGroupRecord.setLibrary(library);
+ readGroupRecord.setPlatformUnit(readGroupId);
+ header.addReadGroup(readGroupRecord);
+ setBuilder.setReadGroup(readGroupRecord);
+ setBuilder.setUseNmFlag(true);
+
+ setBuilder.setHeader(header);
+
+ final int max = 800;
+ final int min = 1;
+ final Random rg = new Random(5);
+
+ //add records that align to chrM and O but not N
+ for (int i = 0; i < NUM_READS; i++) {
+ final int start = rg.nextInt(max) + min;
+ final String newReadName = READ_NAME + separator + ID + separator + i;
+
+ if (i != NUM_READS - 1) {
+ setBuilder.addPair(newReadName, contig1, start + ID, start + ID + LENGTH);
+ } else {
+ setBuilder.addPair(newReadName, contig2, start + ID, start + ID + LENGTH);
+ }
+ }
+ }
+
+ public void setupTest2(final int ID, final String readGroupId, final SAMReadGroupRecord readGroupRecord, final String sample,
+ final String library, final SAMFileHeader header, final SAMRecordSetBuilder setBuilder)
+ throws IOException {
+
+ final String separator = ":";
+ final int contig1 = 0;
+ final int contig2 = 1;
+ final int contig3 = 2;
+ readGroupRecord.setSample(sample);
+ readGroupRecord.setPlatform(platform);
+ readGroupRecord.setLibrary(library);
+ readGroupRecord.setPlatformUnit(readGroupId);
+ setBuilder.setReadGroup(readGroupRecord);
+ setBuilder.setUseNmFlag(true);
+
+ setBuilder.setHeader(header);
+
+ final int max = 800;
+ final int min = 1;
+ final Random rg = new Random(5);
+
+ //add records that align to all 3 chr in reference file
+ for (int i = 0; i < NUM_READS; i++) {
+ final int start = rg.nextInt(max) + min;
+ final String newReadName = READ_NAME + separator + ID + separator + i;
+
+ if (i<=NUM_READS/3) {
+ setBuilder.addPair(newReadName, contig1, start + ID, start + ID + LENGTH);
+ } else if (i< (NUM_READS - (NUM_READS/3))) {
+ setBuilder.addPair(newReadName, contig2, start + ID, start + ID + LENGTH);
+ } else {
+ setBuilder.addPair(newReadName, contig3, start + ID, start + ID + LENGTH);
+ }
+ }
+ }
}
diff --git a/src/tests/java/picard/analysis/CollectInsertSizeMetricsTest.java b/src/tests/java/picard/analysis/CollectInsertSizeMetricsTest.java
index 37c2d51..295b458 100755
--- a/src/tests/java/picard/analysis/CollectInsertSizeMetricsTest.java
+++ b/src/tests/java/picard/analysis/CollectInsertSizeMetricsTest.java
@@ -26,7 +26,9 @@ package picard.analysis;
import htsjdk.samtools.metrics.MetricsFile;
import org.testng.Assert;
import org.testng.annotations.Test;
+import picard.PicardException;
import picard.cmdline.CommandLineProgramTest;
+import picard.util.RExecutor;
import java.io.File;
import java.io.FileReader;
@@ -223,4 +225,20 @@ public class CollectInsertSizeMetricsTest extends CommandLineProgramTest {
Assert.assertEquals(output.getAllHistograms().size(), 5);
}
+
+ @Test
+ public void testMultipleOrientationsForHistogram() throws IOException {
+ final File output = new File("testdata/picard/analysis/directed/CollectInsertSizeMetrics", "multiple_orientation.sam.insert_size_metrics");
+ final File pdf = File.createTempFile("test", ".pdf");
+ pdf.deleteOnExit();
+
+ final int rResult;
+ rResult = RExecutor.executeFromClasspath(
+ CollectInsertSizeMetrics.Histogram_R_SCRIPT,
+ output.getAbsolutePath(),
+ pdf.getAbsolutePath(),
+ "Flags of Chad and Romania");
+
+ Assert.assertEquals(rResult, 0);
+ }
}
diff --git a/src/tests/java/picard/analysis/CollectMultipleMetricsTest.java b/src/tests/java/picard/analysis/CollectMultipleMetricsTest.java
index 8e8c077..9b42ea0 100644
--- a/src/tests/java/picard/analysis/CollectMultipleMetricsTest.java
+++ b/src/tests/java/picard/analysis/CollectMultipleMetricsTest.java
@@ -48,6 +48,7 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
"INPUT=" + input.getAbsolutePath(),
"OUTPUT=" + outfile.getAbsolutePath(),
"REFERENCE_SEQUENCE=" + reference.getAbsolutePath(),
+ "METRIC_ACCUMULATION_LEVEL="+MetricAccumulationLevel.ALL_READS.name(),
"PROGRAM=null",
"PROGRAM="+CollectMultipleMetrics.Program.CollectAlignmentSummaryMetrics.name(),
"PROGRAM="+CollectMultipleMetrics.Program.CollectInsertSizeMetrics.name()
@@ -112,6 +113,7 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
"INPUT=" + input.getAbsolutePath(),
"OUTPUT=" + outfile.getAbsolutePath(),
"REFERENCE_SEQUENCE=" + reference.getAbsolutePath(),
+ "METRIC_ACCUMULATION_LEVEL="+MetricAccumulationLevel.ALL_READS.name(),
"PROGRAM=null",
"PROGRAM="+CollectMultipleMetrics.Program.CollectAlignmentSummaryMetrics.name(),
"PROGRAM="+CollectMultipleMetrics.Program.CollectInsertSizeMetrics.name()
@@ -259,6 +261,7 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
}
}
}
+
@Test //test all gcBias collection levels
public void testGcBiasMetrics() throws IOException{
runGcTest(tempSamFile);
@@ -272,6 +275,7 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
"INPUT=" + input.getAbsolutePath(),
"OUTPUT=" + outfile.getAbsolutePath(),
"REFERENCE_SEQUENCE=" + referenceFile,
+ "METRIC_ACCUMULATION_LEVEL="+MetricAccumulationLevel.ALL_READS.name(),
"PROGRAM=null",
"PROGRAM="+CollectMultipleMetrics.Program.CollectAlignmentSummaryMetrics.name(),
"PROGRAM="+CollectMultipleMetrics.Program.CollectInsertSizeMetrics.name(),
@@ -288,21 +292,6 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
Assert.assertEquals(metrics.ALIGNED_READS, 600);
Assert.assertEquals(metrics.AT_DROPOUT, 7.234062);
Assert.assertEquals(metrics.GC_DROPOUT, 4.086217);
- } else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup1")) { //Library 1
- Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
- Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 9.20674);
- Assert.assertEquals(metrics.GC_DROPOUT, 3.834244);
- } else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup2")) {//Library 2
- Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
- Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 10.144505);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.08986);
- } else if (metrics.READ_GROUP != null && metrics.READ_GROUP.equals("TestReadGroup3")) {//Library 3
- Assert.assertEquals(metrics.TOTAL_CLUSTERS, 100);
- Assert.assertEquals(metrics.ALIGNED_READS, 200);
- Assert.assertEquals(metrics.AT_DROPOUT, 9.229205);
- Assert.assertEquals(metrics.GC_DROPOUT, 4.977838);
} else {
Assert.fail("Unexpected metric: " + metrics);
}
@@ -412,4 +401,4 @@ public class CollectMultipleMetricsTest extends CommandLineProgramTest {
}
}
-}
\ No newline at end of file
+}
diff --git a/src/tests/java/picard/analysis/CollectRnaSeqMetricsTest.java b/src/tests/java/picard/analysis/CollectRnaSeqMetricsTest.java
index 72dba84..d587791 100644
--- a/src/tests/java/picard/analysis/CollectRnaSeqMetricsTest.java
+++ b/src/tests/java/picard/analysis/CollectRnaSeqMetricsTest.java
@@ -68,6 +68,8 @@ public class CollectRnaSeqMetricsTest extends CommandLineProgramTest {
builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
+ samFile.deleteOnExit();
+
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
@@ -82,6 +84,7 @@ public class CollectRnaSeqMetricsTest extends CommandLineProgramTest {
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
+ metricsFile.deleteOnExit();
final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
@@ -139,6 +142,7 @@ public class CollectRnaSeqMetricsTest extends CommandLineProgramTest {
builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
+ samFile.deleteOnExit();
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
@@ -153,6 +157,7 @@ public class CollectRnaSeqMetricsTest extends CommandLineProgramTest {
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
+ metricsFile.deleteOnExit();
final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
diff --git a/src/tests/java/picard/analysis/CollectWgsMetricsFromQuerySortedTest.java b/src/tests/java/picard/analysis/CollectWgsMetricsFromQuerySortedTest.java
new file mode 100644
index 0000000..584d6c5
--- /dev/null
+++ b/src/tests/java/picard/analysis/CollectWgsMetricsFromQuerySortedTest.java
@@ -0,0 +1,52 @@
+package picard.analysis;
+
+import htsjdk.samtools.metrics.MetricsFile;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+
+import java.io.*;
+
+/**
+ * Tests for methods in CollectWgsMetricsFromQuerySorted
+ *
+ *
+ * @author Eric Banks
+ */
+
+public class CollectWgsMetricsFromQuerySortedTest extends CommandLineProgramTest {
+
+ private static final File TEST_DATA_DIR = new File("testdata/picard/sam");
+
+ public String getCommandLineProgramName() {
+ return CollectWgsMetricsFromQuerySorted.class.getSimpleName();
+ }
+
+
+ @Test
+ public void testMetricsFromClippedOverhangs() throws IOException {
+ final File input = new File(TEST_DATA_DIR, "namesorted.test.sam");
+ final File outfile = File.createTempFile("metrics", ".txt");
+ outfile.deleteOnExit();
+ final String[] args = new String[] {
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath()
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<CollectWgsMetricsFromQuerySorted.QuerySortedSeqMetrics, Comparable<?>> output = new MetricsFile<CollectWgsMetricsFromQuerySorted.QuerySortedSeqMetrics, Comparable<?>>();
+ output.read(new FileReader(outfile));
+
+ for (final CollectWgsMetricsFromQuerySorted.QuerySortedSeqMetrics metrics : output.getMetrics()) {
+ Assert.assertEquals(metrics.TOTAL_BASES, 606);
+ Assert.assertEquals(metrics.TOTAL_USABLE_BASES, 238);
+ Assert.assertEquals(metrics.PCT_EXC_OVERLAP, 0.085809); // 52 of 606 bases
+ Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.188119); // 114 of 606 bases
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.333333); // 202 of 606 bases
+ Assert.assertEquals(metrics.TOTAL_READ_PAIRS, 3);
+ Assert.assertEquals(metrics.TOTAL_DUPE_PAIRS, 1);
+ Assert.assertEquals(metrics.TOTAL_ORIENTED_PAIRS, 2);
+ Assert.assertEquals(metrics.MEAN_INSERT_SIZE, 118.0);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/tests/java/picard/analysis/CollectWgsMetricsFromSampledSitesTest.java b/src/tests/java/picard/analysis/CollectWgsMetricsFromSampledSitesTest.java
new file mode 100755
index 0000000..4b956b4
--- /dev/null
+++ b/src/tests/java/picard/analysis/CollectWgsMetricsFromSampledSitesTest.java
@@ -0,0 +1,98 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package picard.analysis;
+
+import htsjdk.samtools.metrics.MetricsFile;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+
+/**
+ * Tests CollectWgsMetricsFromSampledSites
+ */
+public class CollectWgsMetricsFromSampledSitesTest extends CommandLineProgramTest {
+ private static final File TEST_DATA_DIR = new File("testdata/picard/sam/");
+
+ public String getCommandLineProgramName() {
+ return CollectWgsMetricsFromSampledSites.class.getSimpleName();
+ }
+
+ @Test
+ public void testOnePos() throws IOException {
+ final File input = new File(TEST_DATA_DIR, "forMetrics.sam");
+ final File outfile = File.createTempFile("test", ".wgs_metrics");
+ final File ref = new File(TEST_DATA_DIR, "merger.fasta");
+ final File intervals = new File(TEST_DATA_DIR, "onePos.interval_list");
+ outfile.deleteOnExit();
+ final String[] args = new String[] {
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(),
+ "INTERVALS=" + intervals.getAbsolutePath()
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<CollectWgsMetricsFromSampledSites.SampledWgsMetrics, Comparable<?>> output = new MetricsFile<CollectWgsMetricsFromSampledSites.SampledWgsMetrics, Comparable<?>>();
+ output.read(new FileReader(outfile));
+
+ for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 1);
+ Assert.assertEquals(metrics.MEAN_COVERAGE, 3.0);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.272727); // 3 of 11
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.181818); // 2 of 11
+ Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.090909); // 1 of 11
+ Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.090909); // 1 of 11
+ }
+ }
+
+ @Test
+ public void testContiguousIntervals() throws IOException {
+ final File input = new File(TEST_DATA_DIR, "forMetrics.sam");
+ final File outfile = File.createTempFile("test", ".wgs_metrics");
+ final File ref = new File(TEST_DATA_DIR, "merger.fasta");
+ final File intervals = new File(TEST_DATA_DIR, "contiguous.interval_list");
+ outfile.deleteOnExit();
+ final String[] args = new String[] {
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(),
+ "INTERVALS=" + intervals.getAbsolutePath()
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<CollectWgsMetrics.WgsMetrics, Comparable<?>> output = new MetricsFile<CollectWgsMetrics.WgsMetrics, Comparable<?>>();
+ output.read(new FileReader(outfile));
+
+ for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 5);
+ Assert.assertEquals(metrics.MEAN_COVERAGE, 2.6);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.0);
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.066667);
+ }
+ }
+}
diff --git a/src/tests/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/tests/java/picard/analysis/directed/CollectTargetedMetricsTest.java
new file mode 100644
index 0000000..a72517c
--- /dev/null
+++ b/src/tests/java/picard/analysis/directed/CollectTargetedMetricsTest.java
@@ -0,0 +1,158 @@
+package picard.analysis.directed;
+
+import htsjdk.samtools.SAMException;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMFileWriter;
+import htsjdk.samtools.SAMFileWriterFactory;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordSetBuilder;
+import htsjdk.samtools.SAMTextHeaderCodec;
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.BufferedLineReader;
+import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
+import htsjdk.samtools.util.IOUtil;
+import htsjdk.samtools.util.Log;
+import org.testng.Assert;
+import org.testng.annotations.AfterTest;
+import org.testng.annotations.BeforeTest;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+import picard.sam.SortSam;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Random;
+
+public class CollectTargetedMetricsTest extends CommandLineProgramTest {
+ private final static File TEST_DIR = new File("testdata/picard/sam/CollectGcBiasMetrics/");
+ private final File dict = new File(TEST_DIR, "Mheader.dict");
+ private File tempSamFile;
+ private File outfile;
+ private File perTargetOutfile;
+ private final static int LENGTH = 99;
+
+ private final static String sample = "TestSample1";
+ private final static String readGroupId = "TestReadGroup1";
+ private final static String platform = "ILLUMINA";
+ private final static String library = "TestLibrary1";
+ private final static int numReads = 40000;
+
+ @Override
+ public String getCommandLineProgramName() {
+ return CollectTargetedPcrMetrics.class.getSimpleName();
+ }
+
+ //create a samfile with 40000 reads for testing whether a cap is found.
+ @BeforeTest
+ void setupBuilder() throws IOException {
+ final String readName = "TESTBARCODE";
+
+ //Create Sam Files
+ tempSamFile = File.createTempFile("CollectTargetedMetrics", ".bam", TEST_DIR);
+ final File tempSamFileUnsorted = File.createTempFile("CollectTargetedMetrics", ".bam", TEST_DIR);
+ tempSamFileUnsorted.deleteOnExit();
+ tempSamFile.deleteOnExit();
+ final SAMFileHeader header = new SAMFileHeader();
+
+ //Check that dictionary file is readable and then set header dictionary
+ try {
+ header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(dict));
+ header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
+ } catch (final SAMException e) {
+ e.printStackTrace();
+ }
+
+ //Set readGroupRecord
+ final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(readGroupId);
+ readGroupRecord.setSample(sample);
+ readGroupRecord.setPlatform(platform);
+ readGroupRecord.setLibrary(library);
+ readGroupRecord.setPlatformUnit(readGroupId);
+ header.addReadGroup(readGroupRecord);
+
+ //Add to setBuilder
+ final SAMRecordSetBuilder setBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
+ setBuilder.setReadGroup(readGroupRecord);
+ setBuilder.setUseNmFlag(true);
+ setBuilder.setHeader(header);
+
+ //Read settings
+ final String separator = ":";
+ final int ID = 1;
+ final int max = 15000;
+ final int min = 1;
+ final Random rg = new Random(5);
+
+ for (int i = 0; i < numReads; i++) {
+ final int start = rg.nextInt(max) + min;
+ final String newReadName = readName + separator + ID + separator + i;
+ setBuilder.addPair(newReadName, 0, start + ID, start + ID + LENGTH);
+ }
+
+ //Write SAM file
+ final SAMFileWriter writer = new SAMFileWriterFactory()
+ .setCreateIndex(true).makeBAMWriter(header, false, tempSamFileUnsorted);
+
+ for (final SAMRecord record : setBuilder) {
+ writer.addAlignment(record);
+ }
+ writer.close();
+
+ //sort the temp file
+ final SortSam sorter = new SortSam();
+ final String[] args = new String[]{
+ "INPUT=" + tempSamFileUnsorted.getAbsolutePath(),
+ "OUTPUT=" + tempSamFile.getAbsolutePath(),
+ "SORT_ORDER=coordinate"
+ };
+
+ sorter.instanceMain(args);
+
+ //create output files for tests
+ outfile = File.createTempFile("test", ".TargetedMetrics_Coverage");
+ perTargetOutfile = File.createTempFile("perTarget", ".perTargetCoverage");
+ outfile.deleteOnExit();
+ perTargetOutfile.deleteOnExit();
+ }
+
+ @DataProvider(name = "targetedIntervalDataProvider")
+ public Object[][] targetedIntervalDataProvider() {
+ final String referenceFile = "testdata/picard/quality/chrM.reference.fasta";
+ final String emptyIntervals = "testdata/picard/quality/chrM.empty.interval_list";
+ final String singleIntervals = "testdata/picard/quality/chrM.single.interval_list";
+
+ return new Object[][] {
+ {tempSamFile, outfile, perTargetOutfile, referenceFile, singleIntervals},
+ {tempSamFile, outfile, perTargetOutfile, referenceFile, emptyIntervals}
+ };
+ }
+
+ @Test(dataProvider = "targetedIntervalDataProvider")
+ public void runCollectTargetedMetricsTest(final File input, final File outfile, final File perTargetOutfile, final String referenceFile,
+ final String targetIntervals) throws IOException {
+
+ final String[] args = new String[] {
+ "TARGET_INTERVALS=" + targetIntervals,
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + referenceFile,
+ "PER_TARGET_COVERAGE=" + perTargetOutfile.getAbsolutePath(),
+ "LEVEL=ALL_READS",
+ "AMPLICON_INTERVALS=" + targetIntervals
+ };
+
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<TargetedPcrMetrics, Comparable<?>> output = new MetricsFile<TargetedPcrMetrics, Comparable<?>>();
+ output.read(new FileReader(outfile));
+
+ for (final TargetedPcrMetrics metrics : output.getMetrics()) {
+ Assert.assertEquals(metrics.TOTAL_READS, numReads * 2);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/tests/java/picard/cmdline/PicardCommandLineTest.java b/src/tests/java/picard/cmdline/PicardCommandLineTest.java
new file mode 100644
index 0000000..f62d750
--- /dev/null
+++ b/src/tests/java/picard/cmdline/PicardCommandLineTest.java
@@ -0,0 +1,18 @@
+package picard.cmdline;
+
+import org.testng.annotations.Test;
+
+import java.util.Collections;
+
+/**
+ * Created by farjoun on 9/10/15.
+ */
+public class PicardCommandLineTest {
+
+ @Test
+ public void TestPicardPublic() { // this test fails if any CLP in picard is missing its @CommandLineProgramProperties annotation
+ PicardCommandLine picardCommandLine = new PicardCommandLine();
+ picardCommandLine.instanceMain(new String[]{""});
+ }
+
+}
\ No newline at end of file
diff --git a/src/tests/java/picard/illumina/CheckIlluminaDirectoryTest.java b/src/tests/java/picard/illumina/CheckIlluminaDirectoryTest.java
index 30f55e9..7828b22 100644
--- a/src/tests/java/picard/illumina/CheckIlluminaDirectoryTest.java
+++ b/src/tests/java/picard/illumina/CheckIlluminaDirectoryTest.java
@@ -76,6 +76,7 @@ public class CheckIlluminaDirectoryTest extends CommandLineProgramTest {
IOUtil.deleteDirectoryTree(dataDir);
IOUtil.deleteDirectoryTree(basecallDir);
IOUtil.deleteDirectoryTree(intensityDir);
+ IOUtil.deleteDirectoryTree(illuminaDir);
}
public void makeFiles(final SupportedIlluminaFormat[] formats, final int lane, final List<Integer> tiles,
diff --git a/src/tests/java/picard/illumina/IlluminaBasecallsToFastqTest.java b/src/tests/java/picard/illumina/IlluminaBasecallsToFastqTest.java
index 953665d..c0d5e6f 100644
--- a/src/tests/java/picard/illumina/IlluminaBasecallsToFastqTest.java
+++ b/src/tests/java/picard/illumina/IlluminaBasecallsToFastqTest.java
@@ -28,6 +28,7 @@ import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.LineReader;
import htsjdk.samtools.util.StringUtil;
import htsjdk.samtools.util.TestUtil;
+import org.testng.annotations.AfterMethod;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.illumina.parser.ReadStructure;
@@ -121,12 +122,17 @@ public class IlluminaBasecallsToFastqTest extends CommandLineProgramTest {
* This test utility takes a libraryParamsFile and generates output sam files through IlluminaBasecallsToFastq to compare against
* preloaded test data
*
- * @param jobName
- * @param libraryParamsFile
- * @param concatNColumnFields
- * @param readStructureString
+ * @param lane lane number to use
+ * @param jobName name of job for the temp file
+ * @param libraryParamsFile the params file to use for the de-multiplexing
+ * @param concatNColumnFields how many columns to concatenate to get the barcode
+ * @param readStructureString what read-structure string to use
+ * @param baseCallsDir what directory can I find the BCLs in
+ * @param testDataDir what directory can I find the expected resulting files
+ *
* @throws Exception
*/
+
private void runStandardTest(final int lane, final String jobName, final String libraryParamsFile,
final int concatNColumnFields, final String readStructureString, final File baseCallsDir,
final File testDataDir) throws Exception {
@@ -134,6 +140,7 @@ public class IlluminaBasecallsToFastqTest extends CommandLineProgramTest {
try {
outputDir.delete();
outputDir.mkdir();
+
outputDir.deleteOnExit();
// Create barcode.params with output files in the temp directory
final File libraryParams = new File(outputDir, libraryParamsFile);
diff --git a/src/tests/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java b/src/tests/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java
index ac4f64f..0e79419 100644
--- a/src/tests/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java
+++ b/src/tests/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java
@@ -84,6 +84,7 @@ public class IlluminaBasecallsToSamAdapterClippingTest extends CommandLineProgra
}
}
samReader.close();
+ samFile.delete();
}
@DataProvider(name="data")
diff --git a/src/tests/java/picard/illumina/IlluminaBasecallsToSamTest.java b/src/tests/java/picard/illumina/IlluminaBasecallsToSamTest.java
index c9cc60e..675f472 100644
--- a/src/tests/java/picard/illumina/IlluminaBasecallsToSamTest.java
+++ b/src/tests/java/picard/illumina/IlluminaBasecallsToSamTest.java
@@ -27,6 +27,7 @@ import htsjdk.samtools.util.BufferedLineReader;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.LineReader;
import htsjdk.samtools.util.StringUtil;
+import htsjdk.samtools.util.TestUtil;
import org.testng.Assert;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
@@ -164,5 +165,6 @@ public class IlluminaBasecallsToSamTest extends CommandLineProgramTest {
for (final File outputSam : samFiles) {
IOUtil.assertFilesEqual(outputSam, new File(testDataDir, outputSam.getName()));
}
+ TestUtil.recursiveDelete(outputDir);
}
}
diff --git a/src/tests/java/picard/illumina/IlluminaLaneMetricsCollectorTest.java b/src/tests/java/picard/illumina/IlluminaLaneMetricsCollectorTest.java
index a986279..59c90f6 100644
--- a/src/tests/java/picard/illumina/IlluminaLaneMetricsCollectorTest.java
+++ b/src/tests/java/picard/illumina/IlluminaLaneMetricsCollectorTest.java
@@ -61,7 +61,8 @@ public class IlluminaLaneMetricsCollectorTest {
final File laneMetricsFile = buildOutputFile(clp.OUTPUT_DIRECTORY, clp.OUTPUT_PREFIX, IlluminaLaneMetrics.getExtension());
final File canonicalLaneFile = buildOutputFile(runDirectory, testRun, IlluminaLaneMetrics.getExtension());
IOUtil.assertFilesEqual(canonicalLaneFile, laneMetricsFile);
- IOUtil.deleteDirectoryTree(clp.OUTPUT_DIRECTORY); }
+ IOUtil.deleteDirectoryTree(clp.OUTPUT_DIRECTORY);
+ }
@DataProvider(name = "testCollectIlluminaLaneMetrics")
public Object[][] testCollectIlluminaLaneMetricsDataProvider() {
diff --git a/src/tests/java/picard/illumina/parser/IlluminaDataProviderTest.java b/src/tests/java/picard/illumina/parser/IlluminaDataProviderTest.java
index 4cbbccb..de4cf0a 100644
--- a/src/tests/java/picard/illumina/parser/IlluminaDataProviderTest.java
+++ b/src/tests/java/picard/illumina/parser/IlluminaDataProviderTest.java
@@ -259,10 +259,12 @@ public class IlluminaDataProviderTest {
}
@Test(dataProvider = "badData", expectedExceptions = {PicardException.class, IllegalArgumentException.class})
- public void testIlluminaDataProviderMissingDatas(final int lane,
- final IlluminaDataType[] actualDts,
- final String illuminaConfigStr,
- final File basecallsDirectory)
+ public void testIlluminaDataProviderMissingDatas(
+ final String testName, final int lane, final int size,
+ final List<Integer> tiles,
+ final IlluminaDataType[] actualDts,
+ final String illuminaConfigStr,
+ final File basecallsDirectory)
throws Exception {
final IlluminaDataProviderFactory factory = new IlluminaDataProviderFactory(basecallsDirectory, lane, new ReadStructure(illuminaConfigStr), bclQualityEvaluationStrategy, actualDts);
factory.makeDataProvider();
diff --git a/src/tests/java/picard/sam/AddCommentsToBamTest.java b/src/tests/java/picard/sam/AddCommentsToBamTest.java
index ca542af..bf61a89 100644
--- a/src/tests/java/picard/sam/AddCommentsToBamTest.java
+++ b/src/tests/java/picard/sam/AddCommentsToBamTest.java
@@ -29,6 +29,7 @@ public class AddCommentsToBamTest extends CommandLineProgramTest {
@Test
public void testAddCommentsToBam() throws Exception {
final File outputFile = File.createTempFile("addCommentsToBamTest.", BamFileIoUtils.BAM_FILE_EXTENSION);
+ outputFile.deleteOnExit();
runIt(INPUT_FILE, outputFile, commentList);
final SAMFileHeader newHeader = SamReaderFactory.makeDefault().getFileHeader(outputFile);
@@ -41,19 +42,24 @@ public class AddCommentsToBamTest extends CommandLineProgramTest {
}
Assert.assertEquals(newHeader.getComments(), massagedComments);
+ outputFile.delete();
}
@Test(expectedExceptions = PicardException.class)
public void testUsingSam() throws Exception {
final File outputFile = File.createTempFile("addCommentsToBamTest.samFile", BamFileIoUtils.BAM_FILE_EXTENSION);
+ outputFile.deleteOnExit();
runIt(SAM_FILE, outputFile, commentList);
+ outputFile.delete();
throw new IllegalStateException("We shouldn't be here!");
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testUsingNewlines() throws Exception {
final File outputFile = File.createTempFile("addCommentsToBamTest.mewLine", BamFileIoUtils.BAM_FILE_EXTENSION);
+ outputFile.deleteOnExit();
runIt(SAM_FILE, outputFile, new String[]{"this is\n a crazy\n test"});
+ outputFile.delete();
throw new IllegalStateException("We shouldn't be here!");
}
diff --git a/src/tests/java/picard/sam/GatherBamFilesTest.java b/src/tests/java/picard/sam/GatherBamFilesTest.java
index 865c74e..1f8aa23 100644
--- a/src/tests/java/picard/sam/GatherBamFilesTest.java
+++ b/src/tests/java/picard/sam/GatherBamFilesTest.java
@@ -32,6 +32,7 @@ public class GatherBamFilesTest extends CommandLineProgramTest {
@Test
public void testTheGathering() throws Exception {
final File outputFile = File.createTempFile("gatherBamFilesTest.samFile.", BamFileIoUtils.BAM_FILE_EXTENSION);
+ outputFile.deleteOnExit();
final List<String> args = new ArrayList<String>();
for (final File splitBam : SPLIT_BAMS) {
args.add("INPUT=" + splitBam.getAbsolutePath());
@@ -49,6 +50,7 @@ public class GatherBamFilesTest extends CommandLineProgramTest {
@Test
public void sanityCheckTheGathering() throws Exception {
final File outputFile = File.createTempFile("gatherBamFilesTest.samFile.", BamFileIoUtils.BAM_FILE_EXTENSION);
+ outputFile.deleteOnExit();
final List<String> args = new ArrayList<String>();
for (final File splitBam : SPLIT_BAMS) {
args.add("INPUT=" + splitBam.getAbsolutePath());
diff --git a/src/tests/java/picard/sam/MergeBamAlignmentTest.java b/src/tests/java/picard/sam/MergeBamAlignmentTest.java
index 8dd4bf3..2a2914f 100644
--- a/src/tests/java/picard/sam/MergeBamAlignmentTest.java
+++ b/src/tests/java/picard/sam/MergeBamAlignmentTest.java
@@ -41,6 +41,7 @@ import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
+import htsjdk.samtools.util.IOUtil;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@@ -96,8 +97,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
@Test
public void testMergerWithSupplemental() throws Exception {
final File outputWithSupplemental = File.createTempFile("mergeWithSupplementalTest", ".sam");
- System.out.println(outputWithSupplemental.getAbsolutePath());
- // outputWithSupplemental.deleteOnExit();
+ outputWithSupplemental.deleteOnExit();
doMergeAlignment(unmappedBam,
Collections.singletonList(supplementalReadAlignedBam),
@@ -105,8 +105,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, outputWithSupplemental,
- SamPairUtil.PairOrientation.FR, null, null, null
- );
+ SamPairUtil.PairOrientation.FR, null, null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(outputWithSupplemental);
@@ -177,8 +176,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null
- );
+ SamPairUtil.PairOrientation.FR, null, null, null, null);
SamReader result = SamReaderFactory.makeDefault().open(output);
Assert.assertEquals(result.getFileHeader().getSequenceDictionary().getSequences().size(), 8,
@@ -239,8 +237,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
null, null, null, null,
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null
- );
+ SamPairUtil.PairOrientation.FR, null, null, null, null);
CloserUtil.close(result);
@@ -269,8 +266,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null
- );
+ SamPairUtil.PairOrientation.FR, null, null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(output);
@@ -322,7 +318,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
final SamAlignmentMerger merger = new SamAlignmentMerger(unmapped, target, fasta, null, true, false,
false, Arrays.asList(aligned), 1, null, null, null, null, null, null,
Arrays.asList(SamPairUtil.PairOrientation.FR), SAMFileHeader.SortOrder.coordinate,
- new BestMapqPrimaryAlignmentSelectionStrategy(), false);
+ new BestMapqPrimaryAlignmentSelectionStrategy(), false, false, 30);
merger.mergeAlignment(Defaults.REFERENCE_FASTA);
Assert.assertEquals(sorted, !merger.getForceSort());
@@ -358,8 +354,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null
- );
+ SamPairUtil.PairOrientation.FR, null, null, null, null);
SamReaderFactory factory = SamReaderFactory.makeDefault();
final SamReader result = factory.open(output);
@@ -424,8 +419,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
"0", "1.0", "align!", "myAligner",
true, fasta, merged,
SamPairUtil.PairOrientation.FR, null,
- null, null
- );
+ null, null, null);
Assert.fail("Merger should have failed because unmapped reads are not in queryname order but didn't");
}
@@ -441,8 +435,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, merged,
- null, null, null, null
- );
+ null, null, null, null, null);
// Iterate over the merged output and gather some statistics
final Map<String, AlignmentAccumulator> accumulatorMap = new HashMap<String, AlignmentAccumulator>();
@@ -604,8 +597,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, mergedSam,
- null, null, null, null
- );
+ null, null, null, null, null);
assertSamValid(mergedSam);
@@ -867,7 +859,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
false, fasta, mergedSam,
- null, null, null, null);
+ null, null, null, null, null);
assertSamValid(mergedSam);
@@ -1014,7 +1006,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
fasta, output,
SamPairUtil.PairOrientation.FR,
MergeBamAlignment.PrimaryAlignmentStrategy.EarliestFragment,
- null, null);
+ null, null, null);
Assert.fail("Exception was not thrown");
}
@@ -1037,8 +1029,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
true, fasta, output,
SamPairUtil.PairOrientation.FR, MergeBamAlignment.PrimaryAlignmentStrategy.EarliestFragment,
ONE_OF_THE_BEST_TAG,
- null
- );
+ null, false);
final SamReader mergedReader = SamReaderFactory.makeDefault().open(output);
@@ -1180,7 +1171,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
true,
new File(TEST_DATA_DIR, "cliptest.fasta"), output,
SamPairUtil.PairOrientation.FR, null,
- null, null);
+ null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(output);
final Map<String, SAMRecord> firstReadEncountered = new HashMap<String, SAMRecord>();
@@ -1240,8 +1231,6 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
secondUnmappedRead.setReadPairedFlag(true);
secondUnmappedRead.setSecondOfPairFlag(true);
-
-
final SAMFileWriter unmappedWriter = factory.makeSAMWriter(header, false, unmappedSam);
unmappedWriter.addAlignment(firstUnmappedRead);
unmappedWriter.addAlignment(secondUnmappedRead);
@@ -1270,8 +1259,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
new File(TEST_DATA_DIR, "cliptest.fasta"), output,
SamPairUtil.PairOrientation.FR,
MergeBamAlignment.PrimaryAlignmentStrategy.BestEndMapq,
- null, includeSecondary
- );
+ null, includeSecondary, null);
final SamReader reader = SamReaderFactory.makeDefault().open(output);
int numFirstRecords = 0;
@@ -1315,7 +1303,8 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
final boolean pairedRun, final File refSeq, final File output,
final SamPairUtil.PairOrientation expectedOrientation, final MergeBamAlignment.PrimaryAlignmentStrategy primaryAlignmentStrategy,
final String attributesToRetain,
- final Boolean includeSecondary) {
+ final Boolean includeSecondary,
+ final Boolean unmapContaminantReads) {
final List<String> args = new ArrayList<String>(Arrays.asList(
"UNMAPPED_BAM=" + unmappedBam.getAbsolutePath(),
@@ -1372,6 +1361,9 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
if (includeSecondary != null) {
args.add("INCLUDE_SECONDARY_ALIGNMENTS=" + includeSecondary);
}
+ if (unmapContaminantReads != null) {
+ args.add("UNMAP_CONTAMINANT_READS=" + unmapContaminantReads);
+ }
Assert.assertEquals(runPicardCommandLine(args), 0, "Merge did not succeed");
}
@@ -1509,7 +1501,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
"0", "1.0", "align!", "myAligner",
true, fasta, output,
SamPairUtil.PairOrientation.FR, MergeBamAlignment.PrimaryAlignmentStrategy.MostDistant,
- null, includeSecondary);
+ null, includeSecondary, null);
final SamReader reader = SamReaderFactory.makeDefault().open(output);
int numFirstRecords = 0;
@@ -1673,4 +1665,24 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
},
};
}
+
+ @Test
+ public void testContaminationDetection() throws IOException {
+ final File unmappedSam = new File(TEST_DATA_DIR, "contam.unmapped.sam");
+ final File alignedSam = new File(TEST_DATA_DIR, "contam.aligned.sam");
+ final File expectedSam = new File(TEST_DATA_DIR, "contam.expected.sam");
+ final File refFasta = new File(TEST_DATA_DIR, "cliptest.fasta");
+ final File mergedSam = File.createTempFile("merged", ".sam");
+ mergedSam.deleteOnExit();
+
+ doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam),
+ null, null, null, null,
+ false, true, false, 1,
+ "0", "1.0", "align!", "myAligner",
+ true, refFasta, mergedSam,
+ null, null, null, null, true);
+
+ assertSamValid(mergedSam);
+ IOUtil.assertFilesEqual(expectedSam, mergedSam);
+ }
}
diff --git a/src/tests/java/picard/sam/RevertSamTest.java b/src/tests/java/picard/sam/RevertSamTest.java
index e6493fd..76ae2f3 100755
--- a/src/tests/java/picard/sam/RevertSamTest.java
+++ b/src/tests/java/picard/sam/RevertSamTest.java
@@ -66,6 +66,7 @@ public class RevertSamTest extends CommandLineProgramTest {
final List<String> attributesToClear) throws Exception {
final File output = File.createTempFile("reverted", ".sam");
+ output.deleteOnExit();
final RevertSam reverter = new RevertSam();
final String args[] = new String[5 + (so != null ? 1 : 0) + attributesToClear.size() + (sample != null ? 1 : 0) + (library != null ? 1 : 0)];
int index = 0;
@@ -152,6 +153,7 @@ public class RevertSamTest extends CommandLineProgramTest {
public void basicNegativeTest(final String sample, final String library) throws Exception {
final File output = File.createTempFile("bad", ".sam");
+ output.deleteOnExit();
final RevertSam reverter = new RevertSam();
final String args[] = new String[2 + (sample != null ? 1 : 0) + (library != null ? 1 : 0)];
int index = 0;
diff --git a/src/tests/java/picard/sam/SamFileConverterTest.java b/src/tests/java/picard/sam/SamFileConverterTest.java
index 3628047..7c567aa 100644
--- a/src/tests/java/picard/sam/SamFileConverterTest.java
+++ b/src/tests/java/picard/sam/SamFileConverterTest.java
@@ -81,6 +81,7 @@ public class SamFileConverterTest {
samFormatConverter.INPUT = inputFile;
try {
samFormatConverter.OUTPUT = File.createTempFile("SamFileConverterTest." + inputFile.getName(), extension);
+ samFormatConverter.OUTPUT.deleteOnExit();
} catch (final IOException e) {
e.printStackTrace();
}
diff --git a/src/tests/java/picard/sam/markduplicates/MarkDuplicatesTest.java b/src/tests/java/picard/sam/markduplicates/MarkDuplicatesTest.java
index 351ae1f..a6b7b77 100644
--- a/src/tests/java/picard/sam/markduplicates/MarkDuplicatesTest.java
+++ b/src/tests/java/picard/sam/markduplicates/MarkDuplicatesTest.java
@@ -33,6 +33,7 @@ import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
+import htsjdk.samtools.util.IterableAdapter;
import htsjdk.samtools.util.TestUtil;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
@@ -196,6 +197,8 @@ public class MarkDuplicatesTest extends AbstractMarkDuplicatesCommandLineProgram
markDuplicates.PROGRAM_RECORD_ID = null;
Assert.assertEquals(markDuplicates.doWork(), 0);
Assert.assertEquals(markDuplicates.numOpticalDuplicates(), expectedNumOpticalDuplicates);
+ TestUtil.recursiveDelete(outputDir);
+
}
@DataProvider(name="testOpticalDuplicateDetectionDataProvider")
@@ -205,4 +208,103 @@ public class MarkDuplicatesTest extends AbstractMarkDuplicatesCommandLineProgram
{new File(TEST_DATA_DIR, "optical_dupes_casava.sam"), 1L},
};
}
+
+ @Test
+ public void testWithBarcodeFragmentDuplicate() {
+ final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+ tester.addMappedFragment(2, 41212324, false, "50M", DEFAULT_BASE_QUALITY);
+ tester.addMappedFragment(2, 41212324, true, "50M", DEFAULT_BASE_QUALITY);
+ final String barcodeTag = "BC";
+ for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
+ record.setAttribute(barcodeTag, "Barcode1");
+ }
+ tester.addArg("BARCODE_TAG=" + barcodeTag);
+ tester.runTest();
+ }
+
+ public void addMappedFragment(final int referenceSequenceIndex, final int alignmentStart, final boolean isDuplicate, final String cigar,
+ final int defaultQualityScore) {
+ final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+ tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair("RUNID:2:2:15993:13362", 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ final String barcodeTag = "BC";
+ for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
+ record.setAttribute(barcodeTag, "Barcode1");
+ }
+ tester.addArg("BARCODE_TAG=" + barcodeTag);
+ tester.runTest();
+ }
+
+ @Test
+ public void testWithBarcodeDuplicate() {
+ final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+ tester.addMatePair("RUNID:1:1:15993:13361", 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair("RUNID:2:2:15993:13362", 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ final String barcodeTag = "BC";
+ for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
+ record.setAttribute(barcodeTag, "Barcode1");
+ }
+ tester.addArg("BARCODE_TAG=" + barcodeTag);
+ tester.runTest();
+ }
+
+ @Test
+ public void testWithBarcodeComplex() {
+ final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+ final String readNameOne = "RUNID:1:1:15993:13361";
+ final String readNameTwo = "RUNID:2:2:15993:13362";
+ final String readNameThree = "RUNID:3:3:15993:13362";
+
+ // first two reads have the same barcode, third read has a different barcode
+ tester.addMatePair(readNameOne, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair(readNameTwo, 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); // same barcode as the first
+ tester.addMatePair(readNameThree, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+
+ final String barcodeTag = "BC";
+ for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
+ if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) {
+ record.setAttribute(barcodeTag, "Barcode1");
+ }
+ else if (record.getReadName().equals(readNameThree)) {
+ record.setAttribute(barcodeTag, "Barcode2");
+ }
+ }
+ tester.addArg("BARCODE_TAG=" + barcodeTag);
+ tester.runTest();
+ }
+
+ @Test
+ public void testWithIndividualReadBarcodes() {
+ final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+ final String readNameOne = "RUNID:1:1:15993:13361";
+ final String readNameTwo = "RUNID:2:2:15993:13362";
+ final String readNameThree = "RUNID:3:3:15993:13362";
+
+ // first two reads have the same barcode (all three), third read has a different barcode for the second end
+ tester.addMatePair(readNameOne, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+ tester.addMatePair(readNameTwo, 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); // same barcode as the first
+ tester.addMatePair(readNameThree, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY);
+
+ final String barcodeTag = "BC";
+ final String readOneBarcodeTag = "BX"; // want the same tag as the second end, since this is allowed
+ final String readTwoBarcodeTag = "BX";
+ for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
+ record.setAttribute(barcodeTag, "Barcode1"); // same barcode
+ if (record.getFirstOfPairFlag()) { // always the same value for the first end
+ record.setAttribute(readOneBarcodeTag, "readOne1");
+ }
+ else { // second end
+ if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) {
+ record.setAttribute(readTwoBarcodeTag, "readTwo1");
+ } else if (record.getReadName().equals(readNameThree)) {
+ record.setAttribute(readTwoBarcodeTag, "readTwo2");
+ }
+ }
+ }
+ tester.addArg("BARCODE_TAG=" + barcodeTag);
+ tester.addArg("READ_ONE_BARCODE_TAG=" + readOneBarcodeTag);
+ tester.addArg("READ_TWO_BARCODE_TAG=" + readTwoBarcodeTag);
+
+ tester.runTest();
+ }
}
diff --git a/src/tests/java/picard/util/FifoBufferTest.java b/src/tests/java/picard/util/FifoBufferTest.java
index 76adab1..5f65bd6 100644
--- a/src/tests/java/picard/util/FifoBufferTest.java
+++ b/src/tests/java/picard/util/FifoBufferTest.java
@@ -41,7 +41,7 @@ public class FifoBufferTest {
*/
public void test(final double megabytes) throws IOException {
final File inputFile = File.createTempFile("fifo_input.", ".foo");
-
+ inputFile.deleteOnExit();
// Generate a file with a set number of megabytes of random data
final int nBytes = (int) (megabytes * 1024 * 1024);
{
diff --git a/src/tests/java/picard/util/QuerySortedReadPairIteratorUtilTest.java b/src/tests/java/picard/util/QuerySortedReadPairIteratorUtilTest.java
new file mode 100644
index 0000000..29c277a
--- /dev/null
+++ b/src/tests/java/picard/util/QuerySortedReadPairIteratorUtilTest.java
@@ -0,0 +1,93 @@
+package picard.util;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordSetBuilder;
+import htsjdk.samtools.util.PeekableIterator;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * Unit tests for QuerySortedReadPairIteratorUtil
+ */
+public class QuerySortedReadPairIteratorUtilTest {
+ private static final int READ_LENGTH = 20;
+
+ @Test
+ public void testBasicPairedRead() {
+ SAMRecordSetBuilder builder = new SAMRecordSetBuilder(false, SAMFileHeader.SortOrder.queryname);
+ builder.setReadLength(READ_LENGTH);
+ builder.addPair("mapped_paired", 1, 1, 31);
+ PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(builder.iterator());
+
+ QuerySortedReadPairIteratorUtil.ReadPair pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNotNull(pair);
+ Assert.assertNotNull(pair.read1);
+ Assert.assertNotNull(pair.read2);
+ Assert.assertEquals("mapped_paired", pair.read1.getReadName());
+ Assert.assertEquals("mapped_paired", pair.read2.getReadName());
+
+ pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNull(pair);
+ }
+
+ @Test
+ public void testBasicUnmappedReadPair() {
+ SAMRecordSetBuilder builder = new SAMRecordSetBuilder(false, SAMFileHeader.SortOrder.queryname);
+ builder.setReadLength(READ_LENGTH);
+ builder.addUnmappedPair("unmapped_paired");
+ PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(builder.iterator());
+
+ QuerySortedReadPairIteratorUtil.ReadPair pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNotNull(pair);
+ Assert.assertNotNull(pair.read1);
+ Assert.assertNotNull(pair.read2);
+ Assert.assertEquals("unmapped_paired", pair.read1.getReadName());
+ Assert.assertEquals("unmapped_paired", pair.read2.getReadName());
+
+ pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNull(pair);
+ }
+
+ @Test
+ public void testBasicHalfmappedReadPair() {
+ SAMRecordSetBuilder builder = new SAMRecordSetBuilder(false, SAMFileHeader.SortOrder.queryname);
+ builder.setReadLength(READ_LENGTH);
+ builder.addPair("halfmapped_paired", 1, 1, 31, false, true, "20M", "20M", true, false, 20);
+ PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(builder.iterator());
+
+ QuerySortedReadPairIteratorUtil.ReadPair pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNotNull(pair);
+ Assert.assertNotNull(pair.read1);
+ Assert.assertNotNull(pair.read2);
+ Assert.assertEquals("halfmapped_paired", pair.read1.getReadName());
+ Assert.assertEquals("halfmapped_paired", pair.read2.getReadName());
+
+ pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNull(pair);
+ }
+
+ @Test
+ public void testFragmentNoReadPair() {
+ SAMRecordSetBuilder builder = new SAMRecordSetBuilder(false, SAMFileHeader.SortOrder.queryname);
+ builder.setReadLength(READ_LENGTH);
+ builder.addFrag("mapped_frag_a", 1, 1, false);
+ builder.addFrag("mapped_frag_b", 1, 1, false);
+ PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(builder.iterator());
+
+ QuerySortedReadPairIteratorUtil.ReadPair pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNotNull(pair);
+ Assert.assertNotNull(pair.read1);
+ Assert.assertNull(pair.read2);
+ Assert.assertEquals("mapped_frag_a", pair.read1.getReadName());
+
+ pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNotNull(pair);
+ Assert.assertNotNull(pair.read1);
+ Assert.assertNull(pair.read2);
+ Assert.assertEquals("mapped_frag_b", pair.read1.getReadName());
+
+ pair = QuerySortedReadPairIteratorUtil.getNextReadPair(iterator);
+ Assert.assertNull(pair);
+ }
+}
diff --git a/src/tests/java/picard/vcf/LiftoverVcfTest.java b/src/tests/java/picard/vcf/LiftoverVcfTest.java
new file mode 100644
index 0000000..f71cf55
--- /dev/null
+++ b/src/tests/java/picard/vcf/LiftoverVcfTest.java
@@ -0,0 +1,95 @@
+package picard.vcf;
+
+import htsjdk.samtools.util.IOUtil;
+import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.vcf.VCFFileReader;
+import org.testng.Assert;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Test class for LiftoverVcf.
+ *
+ * Created by ebanks on 8/11/15.
+ */
+public class LiftoverVcfTest extends CommandLineProgramTest {
+
+ private static final File TEST_DATA_PATH = new File("testdata/picard/vcf/");
+ private static final File CHAIN_FILE = new File(TEST_DATA_PATH, "test.over.chain");
+ private static final File REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.reference.fasta");
+ private static final File OUTPUT_DATA_PATH = IOUtil.createTempDir("LiftoverVcfsTest", null);
+
+ public String getCommandLineProgramName() {
+ return LiftoverVcf.class.getSimpleName();
+ }
+
+ @AfterClass
+ public void teardown() {
+ IOUtil.deleteDirectoryTree(OUTPUT_DATA_PATH);
+ }
+
+ @Test
+ public void testDoNotFixReverseComplementedIndels() {
+ final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf");
+ final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf");
+ final File input = new File(TEST_DATA_PATH, "testLiftover.vcf");
+
+ liftOutputFile.deleteOnExit();
+ rejectOutputFile.deleteOnExit();
+
+ final String[] args = new String[]{
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + liftOutputFile.getAbsolutePath(),
+ "REJECT=" + rejectOutputFile.getAbsolutePath(),
+ "CHAIN=" + CHAIN_FILE,
+ "REFERENCE_SEQUENCE=" + REFERENCE_FILE,
+ "CREATE_INDEX=false"
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final VCFFileReader liftReader = new VCFFileReader(liftOutputFile, false);
+ for (final VariantContext inputContext : liftReader) {
+ Assert.fail("there should be no passing indels in the liftover");
+ }
+ final VCFFileReader rejectReader = new VCFFileReader(rejectOutputFile, false);
+ int counter = 0;
+ for (final VariantContext inputContext : rejectReader) {
+ counter++;
+ }
+ Assert.assertEquals(counter, 2, "the wrong number of rejected indels faile the liftover");
+ }
+
+ @Test
+ public void testFixReverseComplementedGenotypes() {
+
+ final Allele refA = Allele.create("A", true);
+ final Allele altC = Allele.create("C", false);
+ final GenotypesContext originalGenotypes = GenotypesContext.create(3);
+ originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make());
+ originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make());
+ originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make());
+
+ final Allele refT = Allele.create("T", true);
+ final Allele altG = Allele.create("G", false);
+ final GenotypesContext expectedGenotypes = GenotypesContext.create(3);
+ expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make());
+ expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make());
+ expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make());
+
+ final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(2);
+ reverseComplementAlleleMap.put(refA, refT);
+ reverseComplementAlleleMap.put(altC, altG);
+ final GenotypesContext actualGenotypes = LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap);
+
+ for ( final String sample : Arrays.asList("homref", "het", "homvar") ) {
+ final List<Allele> expected = expectedGenotypes.get(sample).getAlleles();
+ final List<Allele> actual = actualGenotypes.get(sample).getAlleles();
+ Assert.assertEquals(expected.get(0), actual.get(0));
+ Assert.assertEquals(expected.get(1), actual.get(1));
+ }
+ }
+}
diff --git a/src/tests/java/picard/vcf/TestFilterVcf.java b/src/tests/java/picard/vcf/TestFilterVcf.java
index 0909807..8629746 100644
--- a/src/tests/java/picard/vcf/TestFilterVcf.java
+++ b/src/tests/java/picard/vcf/TestFilterVcf.java
@@ -40,10 +40,11 @@ import java.util.TreeSet;
*/
public class TestFilterVcf {
private final File INPUT = new File("testdata/picard/vcf/filter/testFiltering.vcf");
+ private final File BAD_INPUT = new File("testdata/picard/vcf/filter/testFilteringNoSeqDictionary.vcf");
/** Tests that all records get PASS set as their filter when extreme values are used for filtering. */
@Test public void testNoFiltering() throws Exception {
- final File out = testFiltering(INPUT, 0, 0, 0, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 0, 0, Double.MAX_VALUE);
final VCFFileReader in = new VCFFileReader(out, false);
for (final VariantContext ctx : in) {
if (!ctx.filtersWereApplied() || ctx.isFiltered()) {
@@ -55,7 +56,7 @@ public class TestFilterVcf {
/** Tests that sites with a het allele balance < 0.4 are marked as filtered out. */
@Test public void testAbFiltering() throws Exception {
final Set<String> fails = CollectionUtil.makeSet("tf2", "rs28566954", "rs28548431");
- final File out = testFiltering(INPUT, 0.4, 0, 0, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0.4, 0, 0, Double.MAX_VALUE);
final ListMap<String,String> filters = slurpFilters(out);
Assert.assertEquals(filters.keySet(), fails, "Failed sites did not match expected set of failed sites.");
}
@@ -63,7 +64,15 @@ public class TestFilterVcf {
/** Tests that genotypes with DP < 18 are marked as failed, but not >= 18. */
@Test public void testDpFiltering() throws Exception {
final Set<String> fails = CollectionUtil.makeSet("rs71509448", "rs71628926", "rs13302979", "rs2710876");
- final File out = testFiltering(INPUT, 0, 18, 0, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 18, 0, Double.MAX_VALUE);
+ final ListMap<String,String> filters = slurpFilters(out);
+ Assert.assertEquals(filters.keySet(), fails, "Failed sites did not match expected set of failed sites.");
+ }
+
+ /** Tests that genotypes with DP < 18 are marked as failed, but not >= 18. */
+ @Test public void testDpFilteringToVcf() throws Exception {
+ final Set<String> fails = CollectionUtil.makeSet("rs71509448", "rs71628926", "rs13302979", "rs2710876");
+ final File out = testFiltering(INPUT, ".vcf", 0, 18, 0, Double.MAX_VALUE);
final ListMap<String,String> filters = slurpFilters(out);
Assert.assertEquals(filters.keySet(), fails, "Failed sites did not match expected set of failed sites.");
}
@@ -73,17 +82,17 @@ public class TestFilterVcf {
final Set<String> fails = CollectionUtil.makeSet("rs71509448"); // SNP with GQ=21; lowest GQ in file
{
- final File out = testFiltering(INPUT, 0, 0, 20, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 0, 20, Double.MAX_VALUE);
final ListMap<String, String> filters = slurpFilters(out);
Assert.assertEquals(filters.size(), 0, "Should not have filtered sites: " + filters);
}
{
- final File out = testFiltering(INPUT, 0, 0, 21, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 0, 21, Double.MAX_VALUE);
final ListMap<String, String> filters = slurpFilters(out);
Assert.assertEquals(filters.size(), 0, "Should not have filtered sites: " + filters);
}
{
- final File out = testFiltering(INPUT, 0, 0, 22, Double.MAX_VALUE);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 0, 22, Double.MAX_VALUE);
final ListMap<String, String> filters = slurpFilters(out);
Assert.assertEquals(filters.keySet(), fails, "Failed sites did not match expected set of failed sites.");
}
@@ -92,21 +101,21 @@ public class TestFilterVcf {
/** Tests that genotypes with DP < 18 are marked as failed, but not >= 18. */
@Test public void testFsFiltering() throws Exception {
final Set<String> fails = CollectionUtil.makeSet("rs13303033", "rs28548431", "rs2799066");
- final File out = testFiltering(INPUT, 0, 0, 0, 5.0d);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0, 0, 0, 5.0d);
final ListMap<String,String> filters = slurpFilters(out);
Assert.assertEquals(filters.keySet(), fails, "Failed sites did not match expected set of failed sites.");
}
@Test public void testCombinedFiltering() throws Exception {
final TreeSet<String> fails = new TreeSet<String>(CollectionUtil.makeSet("rs13302979", "rs13303033", "rs2710876" , "rs2799066" , "rs28548431", "rs28566954", "rs71509448", "rs71628926", "tf2"));
- final File out = testFiltering(INPUT, 0.4, 18, 22, 5.0d);
+ final File out = testFiltering(INPUT, ".vcf.gz", 0.4, 18, 22, 5.0d);
final ListMap<String,String> filters = slurpFilters(out);
Assert.assertEquals(new TreeSet<String>(filters.keySet()), fails, "Failed sites did not match expected set of failed sites.");
}
/** Utility method that takes a a VCF and a set of parameters and filters the VCF. */
- File testFiltering(final File vcf, final double minAb, final int minDp, final int minGq, final double maxFs) throws Exception {
- final File out = File.createTempFile("filterVcfTest.", ".vcf.gz");
+ File testFiltering(final File vcf, final String outputExtension, final double minAb, final int minDp, final int minGq, final double maxFs) throws Exception {
+ final File out = File.createTempFile("filterVcfTest.", outputExtension);
out.deleteOnExit();
final FilterVcf filterer = new FilterVcf();
@@ -126,6 +135,25 @@ public class TestFilterVcf {
return out;
}
+ /** Tests that attempting to write to an uncompressed vcf fails if the input has no sequence dictionary */
+ @Test(expectedExceptions = PicardException.class)
+ public void testFilteringToVcfWithNoSequenceDictionary() throws Exception {
+ final File out = File.createTempFile("filterVcfTest.", ".vcf");
+ out.deleteOnExit();
+
+ final FilterVcf filterer = new FilterVcf();
+ filterer.CREATE_INDEX = true;
+ filterer.INPUT = BAD_INPUT;
+ filterer.OUTPUT = out;
+ filterer.MIN_AB = 0;
+ filterer.MIN_DP = 18;
+ filterer.MIN_GQ = 0;
+ filterer.MAX_FS = Double.MAX_VALUE;
+
+ filterer.doWork();
+ }
+
+
/** Consumes a VCF and returns a ListMap where each they keys are the IDs of filtered out sites and the values are the set of filters. */
ListMap<String,String> slurpFilters(final File vcf) {
final ListMap<String,String> map = new ListMap<String, String>();
diff --git a/testdata/picard/analysis/directed/CollectInsertSizeMetrics/multiple_orientation.sam.insert_size_metrics b/testdata/picard/analysis/directed/CollectInsertSizeMetrics/multiple_orientation.sam.insert_size_metrics
new file mode 100644
index 0000000..db774e5
--- /dev/null
+++ b/testdata/picard/analysis/directed/CollectInsertSizeMetrics/multiple_orientation.sam.insert_size_metrics
@@ -0,0 +1,312 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.CollectInsertSizeMetrics INPUT=/path/to/bam REFERENCE_SEQUENCE=/path/to/reference ASSUME_SORTED=true OUTPUT=/path/to/output H=/path/to/histogram STOP_AFTER=0 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Some Day of the Week
+
+## METRICS CLASS picard.analysis.InsertSizeMetrics
+MEDIAN_INSERT_SIZE MEDIAN_ABSOLUTE_DEVIATION MIN_INSERT_SIZE MAX_INSERT_SIZE MEAN_INSERT_SIZE STANDARD_DEVIATION READ_PAIRS PAIR_ORIENTATION WIDTH_OF_10_PERCENT WIDTH_OF_20_PERCENT WIDTH_OF_30_PERCENT WIDTH_OF_40_PERCENT WIDTH_OF_50_PERCENT WIDTH_OF_60_PERCENT WIDTH_OF_70_PERCENT WIDTH_OF_80_PERCENT WIDTH_OF_90_PERCENT WIDTH_OF_99_PERCENT SAMPLE LIBRARY READ_GROUP
+1 1 1 1 1.1 1.1 1 FR 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1.1 1.1 1 RF 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1.1 1.1 1 TANDEM 1 1 1 1 1 1 1 1 1 1
+
+## HISTOGRAM java.lang.Integer
+insert_size All_Reads.fr_count All_Reads.rf_count All_Reads.tandem_count
+1 0 100 0
+2 0 100 0
+3 0 100 0
+4 0 100 0
+5 0 100 0
+6 0 100 0
+7 0 100 0
+8 0 100 0
+9 0 100 0
+10 0 100 0
+11 0 100 0
+12 0 100 0
+13 0 100 0
+14 0 100 0
+15 0 100 0
+16 0 100 0
+17 0 100 0
+18 0 100 0
+19 0 100 0
+20 0 100 0
+21 0 100 0
+22 0 100 0
+23 0 100 0
+24 0 100 0
+25 0 100 0
+26 0 100 0
+27 0 100 0
+28 0 100 0
+29 0 100 0
+30 0 100 0
+31 0 100 0
+32 0 100 0
+33 0 100 0
+34 0 100 0
+35 0 100 0
+36 0 100 0
+37 0 100 0
+38 0 100 0
+39 0 100 0
+40 0 100 0
+41 0 100 0
+42 0 100 0
+43 0 100 0
+44 0 100 0
+45 0 100 0
+46 0 100 0
+47 0 100 0
+48 0 100 0
+49 0 100 0
+50 0 100 0
+51 0 100 0
+52 0 100 0
+53 0 100 0
+54 0 100 0
+55 0 100 0
+56 0 100 0
+57 0 100 0
+58 0 100 0
+59 0 100 0
+60 0 100 0
+61 0 100 0
+62 0 100 0
+63 0 100 0
+64 0 100 0
+65 0 100 0
+66 0 100 0
+67 0 100 0
+68 0 100 0
+69 0 100 0
+70 0 100 0
+71 0 100 0
+72 0 100 0
+73 0 100 0
+74 0 100 0
+75 0 100 0
+76 0 100 0
+77 0 100 0
+78 0 100 0
+79 0 100 0
+80 0 100 0
+81 0 100 0
+82 0 100 0
+83 0 100 0
+84 0 100 0
+85 0 100 0
+86 0 100 0
+87 0 100 0
+88 0 100 0
+89 0 100 0
+90 0 100 0
+91 0 100 0
+92 0 100 0
+93 0 100 0
+94 0 100 0
+95 0 100 0
+96 0 100 0
+97 0 100 0
+98 0 100 0
+99 0 100 0
+100 0 0 100
+101 0 0 100
+102 0 0 100
+103 0 0 100
+104 0 0 100
+105 0 0 100
+106 0 0 100
+107 0 0 100
+108 0 0 100
+109 0 0 100
+110 0 0 100
+111 0 0 100
+112 0 0 100
+113 0 0 100
+114 0 0 100
+115 0 0 100
+116 0 0 100
+117 0 0 100
+118 0 0 100
+119 0 0 100
+120 0 0 100
+121 0 0 100
+122 0 0 100
+123 0 0 100
+124 0 0 100
+125 0 0 100
+126 0 0 100
+127 0 0 100
+128 0 0 100
+129 0 0 100
+130 0 0 100
+131 0 0 100
+132 0 0 100
+133 0 0 100
+134 0 0 100
+135 0 0 100
+136 0 0 100
+137 0 0 100
+138 0 0 100
+139 0 0 100
+140 0 0 100
+141 0 0 100
+142 0 0 100
+143 0 0 100
+144 0 0 100
+145 0 0 100
+146 0 0 100
+147 0 0 100
+148 0 0 100
+149 0 0 100
+150 0 0 100
+151 0 0 100
+152 0 0 100
+153 0 0 100
+154 0 0 100
+155 0 0 100
+156 0 0 100
+157 0 0 100
+158 0 0 100
+159 0 0 100
+160 0 0 100
+161 0 0 100
+162 0 0 100
+163 0 0 100
+164 0 0 100
+165 0 0 100
+166 0 0 100
+167 0 0 100
+168 0 0 100
+169 0 0 100
+170 0 0 100
+171 0 0 100
+172 0 0 100
+173 0 0 100
+174 0 0 100
+175 0 0 100
+176 0 0 100
+177 0 0 100
+178 0 0 100
+179 0 0 100
+180 0 0 100
+181 0 0 100
+182 0 0 100
+183 0 0 100
+184 0 0 100
+185 0 0 100
+186 0 0 100
+187 0 0 100
+188 0 0 100
+189 0 0 100
+190 0 0 100
+191 0 0 100
+192 0 0 100
+193 0 0 100
+194 0 0 100
+195 0 0 100
+196 0 0 100
+197 0 0 100
+198 0 0 100
+199 0 0 100
+200 100 0 0
+201 100 0 0
+202 100 0 0
+203 100 0 0
+204 100 0 0
+205 100 0 0
+206 100 0 0
+207 100 0 0
+208 100 0 0
+209 100 0 0
+210 100 0 0
+211 100 0 0
+212 100 0 0
+213 100 0 0
+214 100 0 0
+215 100 0 0
+216 100 0 0
+217 100 0 0
+218 100 0 0
+219 100 0 0
+220 100 0 0
+221 100 0 0
+222 100 0 0
+223 100 0 0
+224 100 0 0
+225 100 0 0
+226 100 0 0
+227 100 0 0
+228 100 0 0
+229 100 0 0
+230 100 0 0
+231 100 0 0
+232 100 0 0
+233 100 0 0
+234 100 0 0
+235 100 0 0
+236 100 0 0
+237 100 0 0
+238 100 0 0
+239 100 0 0
+240 100 0 0
+241 100 0 0
+242 100 0 0
+243 100 0 0
+244 100 0 0
+245 100 0 0
+246 100 0 0
+247 100 0 0
+248 100 0 0
+249 100 0 0
+250 100 0 0
+251 100 0 0
+252 100 0 0
+253 100 0 0
+254 100 0 0
+255 100 0 0
+256 100 0 0
+257 100 0 0
+258 100 0 0
+259 100 0 0
+260 100 0 0
+261 100 0 0
+262 100 0 0
+263 100 0 0
+264 100 0 0
+265 100 0 0
+266 100 0 0
+267 100 0 0
+268 100 0 0
+269 100 0 0
+270 100 0 0
+271 100 0 0
+272 100 0 0
+273 100 0 0
+274 100 0 0
+275 100 0 0
+276 100 0 0
+277 100 0 0
+278 100 0 0
+279 100 0 0
+280 100 0 0
+281 100 0 0
+282 100 0 0
+283 100 0 0
+284 100 0 0
+285 100 0 0
+286 100 0 0
+287 100 0 0
+288 100 0 0
+289 100 0 0
+290 100 0 0
+291 100 0 0
+292 100 0 0
+293 100 0 0
+294 100 0 0
+295 100 0 0
+296 100 0 0
+297 100 0 0
+298 100 0 0
+299 100 0 0
diff --git a/testdata/picard/metrics/chrMNO.reference.fasta b/testdata/picard/metrics/chrMNO.reference.fasta
new file mode 100644
index 0000000..ebce079
--- /dev/null
+++ b/testdata/picard/metrics/chrMNO.reference.fasta
@@ -0,0 +1,65 @@
+>chrM
+GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
+GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
+CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
+AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT
+GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA
+AACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGC
+CAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAAT
+TTTATCTTTAGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACA
+TTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCC
+GCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAAC
+CAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCA
+AAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAA
+ATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGC
+AAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAG
+GGACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCC
+ACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGT
+TTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACC
+GCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTT
+TAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAA
+>chrN
+CAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCC
+CTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTA
+GCAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCG
+CCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTC
+TAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGA
+CAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAAC
+TATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTA
+TAAACTAATACACCAGTCTTGTAAACCGGAGACGAAAACCTTTTTCCAAG
+GACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAA
+GATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGG
+TACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTAC
+ATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCA
+CCTGTAGTACATAAAAACCCAACCCACATCAAACCCCCCCCCCCCATGCT
+TACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAAC
+TCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTT
+AACAGTACATAGTACATAAAGTCATTTACCGTACATAGCACATTACAGTC
+AAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTG
+ACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCT
+CGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGAC
+ATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCC
+CTTAAATAAGACATCACGATG
+>chrO
+CAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCC
+CTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTA
+GCAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCG
+CCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTC
+TAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGA
+CAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAAC
+TATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTA
+TAAACTAATACACCAGTCTTGTAAACCGGAGACGAAAACCTTTTTCCAAG
+GACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAA
+GATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGG
+TACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTAC
+ATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCA
+CCTGTAGTACATAAAAACCCAACCCACATCAAACCCCCCCCCCCCATGCT
+TACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAAC
+TCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTT
+AACAGTACATAGTACATAAAGTCATTTACCGTACATAGCACATTACAGTC
+AAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTG
+ACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCT
+CGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGAC
+ATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCC
+CTTAAATAAGACATCACGATG
diff --git a/testdata/picard/quality/chrM.empty.interval_list b/testdata/picard/quality/chrM.empty.interval_list
new file mode 100644
index 0000000..44daa55
--- /dev/null
+++ b/testdata/picard/quality/chrM.empty.interval_list
@@ -0,0 +1,4 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chrM LN:16903 SP:Mus musculus
+chrM 2 1 + interval-1
+chrM 20 100 + interval-2
\ No newline at end of file
diff --git a/testdata/picard/quality/chrM.reference.fasta.fai b/testdata/picard/quality/chrM.reference.fasta.fai
new file mode 100644
index 0000000..51af2c9
--- /dev/null
+++ b/testdata/picard/quality/chrM.reference.fasta.fai
@@ -0,0 +1 @@
+chrM 16571 6 50 51
diff --git a/testdata/picard/quality/chrM.single.interval_list b/testdata/picard/quality/chrM.single.interval_list
new file mode 100644
index 0000000..11993be
--- /dev/null
+++ b/testdata/picard/quality/chrM.single.interval_list
@@ -0,0 +1,3 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chrM LN:16903 AS:mm9 SP:Mus musculus
+chrM 1 1000 + interval-1
\ No newline at end of file
diff --git a/testdata/picard/sam/CollectGcBiasMetrics/MNOheader.dict b/testdata/picard/sam/CollectGcBiasMetrics/MNOheader.dict
new file mode 100644
index 0000000..062b708
--- /dev/null
+++ b/testdata/picard/sam/CollectGcBiasMetrics/MNOheader.dict
@@ -0,0 +1,4 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chrM LN:1019 UR:chrM.reference.fasta
+ at SQ SN:chrN LN:1041 UR:chrM.reference.fasta
+ at SQ SN:chrO LN:1041 UR:chrM.reference.fasta
\ No newline at end of file
diff --git a/testdata/picard/sam/MergeBamAlignment/contam.aligned.sam b/testdata/picard/sam/MergeBamAlignment/contam.aligned.sam
new file mode 100644
index 0000000..1d01448
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/contam.aligned.sam
@@ -0,0 +1,25 @@
+ at HD VN:1.0 SO:queryname
+ at SQ SN:chr1 LN:1000
+ at RG ID:0 SM:Hi,Mom! PL:ILLUMINA
+ at CO frag_multiple_primary_1 should be marked contaminant because the overclipped alignment has higher MAPQ, and the other alignment should be omitted
+ at CO frag_multiple_primary_2 should NOT be marked contaminant because the good alignment has higher MAPQ, and the overclipped alignment should be marked as secondary
+ at CO frag_primary_clipped should be marked contaminant because primary alignment is overclipped, and the secondary / supplementary should be omitted
+ at CO frag_secondary_clipped should NOT be marked contaminant because only secondary is overclipped, and will be preserved as-is
+ at CO r1_clipped_r2_clipped should be marked contaminant because at least one segment is overclipped
+ at CO r1_clipped_r2_perfect should be marked contaminant because at least one segment is overclipped
+ at CO r1_clipped_r2_unmapped should be marked contaminant because at least one segment is overclipped
+frag_multiple_primary_1 0 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_multiple_primary_1 0 chr1 1 15 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_multiple_primary_2 0 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_primary_clipped 0 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_primary_clipped 256 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_primary_clipped 2048 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_clipped 97 chr1 1 30 20S10M20S chr1 51 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_clipped 145 chr1 51 30 20S10M20S chr1 1 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_perfect 97 chr1 1 30 20S10M20S chr1 51 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_perfect 145 chr1 51 30 50M chr1 1 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_unmapped 73 chr1 1 30 20S10M20S chr1 51 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_unmapped 133 chr1 51 0 * chr1 1 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
diff --git a/testdata/picard/sam/MergeBamAlignment/contam.expected.sam b/testdata/picard/sam/MergeBamAlignment/contam.expected.sam
new file mode 100644
index 0000000..607579d
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/contam.expected.sam
@@ -0,0 +1,16 @@
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734
+ at RG ID:0 SM:Hi,Mom! PL:ILLUMINA
+ at PG ID:0 VN:1.0 CL:align! PN:myAligner
+frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
+frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
+frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
+frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
+r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+r1_clipped_r2_unmapped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0
+r1_clipped_r2_clipped 157 * 0 0 20S10M20S * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
+r1_clipped_r2_perfect 157 * 0 0 50M * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
diff --git a/testdata/picard/sam/MergeBamAlignment/contam.unmapped.sam b/testdata/picard/sam/MergeBamAlignment/contam.unmapped.sam
new file mode 100644
index 0000000..ceb5f19
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/contam.unmapped.sam
@@ -0,0 +1,12 @@
+ at HD VN:1.0 SO:queryname
+ at RG ID:0 SM:Hi,Mom! PL:ILLUMINA
+frag_multiple_primary_1 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_multiple_primary_2 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_primary_clipped 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+frag_secondary_clipped 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_clipped 77 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_clipped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_perfect 77 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_perfect 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_unmapped 77 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? RG:Z:0
+r1_clipped_r2_unmapped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? RG:Z:0
diff --git a/testdata/picard/sam/contiguous.interval_list b/testdata/picard/sam/contiguous.interval_list
new file mode 100644
index 0000000..a9d1d20
--- /dev/null
+++ b/testdata/picard/sam/contiguous.interval_list
@@ -0,0 +1,11 @@
+ at HD VN:1.0
+ at SQ SN:chr1 LN:101 UR:merger.fasta M5:bd01f7e11515bb6beda8f7257902aa67
+ at SQ SN:chr2 LN:101 UR:merger.fasta M5:31c33e2155b3de5e2554b693c475b310
+ at SQ SN:chr3 LN:101 UR:merger.fasta M5:631593c6dd2048ae88dcce2bd505d295
+ at SQ SN:chr4 LN:101 UR:merger.fasta M5:c60cb92f1ee5b78053c92bdbfa19abf1
+ at SQ SN:chr5 LN:101 UR:merger.fasta M5:07ebc213c7611db0eacbb1590c3e9bda
+ at SQ SN:chr6 LN:101 UR:merger.fasta M5:7be2f5e7ee39e60a6c3b5b6a41178c6d
+ at SQ SN:chr7 LN:404 UR:merger.fasta M5:da488fc432cdaf2c20c96da473a7b630
+ at SQ SN:chr8 LN:202 UR:merger.fasta M5:d339678efce576d5546e88b49a487b63
+chr7 10 11 + .
+chr7 16 18 + .
diff --git a/testdata/picard/sam/forMetrics.sam b/testdata/picard/sam/forMetrics.sam
new file mode 100644
index 0000000..2874c76
--- /dev/null
+++ b/testdata/picard/sam/forMetrics.sam
@@ -0,0 +1,23 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chr1 LN:101
+ at SQ SN:chr2 LN:101
+ at SQ SN:chr3 LN:101
+ at SQ SN:chr4 LN:101
+ at SQ SN:chr5 LN:101
+ at SQ SN:chr6 LN:101
+ at SQ SN:chr7 LN:404
+ at SQ SN:chr8 LN:202
+ at RG ID:0 SM:Hi,Mom! PL:ILLUMINA
+ at PG ID:1 PN:Hey! VN:2.0
+read1 83 chr7 1 60 99M2H = 302 201 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read1Dup 1107 chr7 1 60 99M2H = 302 201 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read2 89 chr7 1 60 1H100M * 0 0 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read3 83 chr7 1 60 101M = 302 201 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read4 147 chr7 16 60 98M3H = 21 -96 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCC AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read4 99 chr7 21 60 101M = 16 96 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read5 147 chr7 25 10 101M = 25 101 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read5 99 chr7 25 10 101M = 25 -101 CAACAGAAGCAGGAATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGATTATCGAATCAAAAAAAAATCCCGATTTCATTCCGCAGCTAACCTCCCAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read1 163 chr7 30 60 101M = 1 -201 ACGCGGCATCACGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCAAGAGCATACA AA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read1Dup 1187 chr7 30 60 101M = 1 -201 ACGCGGCATCACGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCAAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read3 163 chr7 30 5 10M1D10M5I76M = 1 -201 ACGCGGCATCACGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCAAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
+read2 165 * 0 0 * chr7 1 0 ACGCGGCATCACGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCAAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:0
diff --git a/testdata/picard/sam/namesorted.test.sam b/testdata/picard/sam/namesorted.test.sam
new file mode 100644
index 0000000..645f140
--- /dev/null
+++ b/testdata/picard/sam/namesorted.test.sam
@@ -0,0 +1,33 @@
+ at HD VN:1.5 GO:none SO:queryname
+ at SQ SN:1 LN:249250621
+ at SQ SN:2 LN:243199373
+ at SQ SN:3 LN:198022430
+ at SQ SN:4 LN:191154276
+ at SQ SN:5 LN:180915260
+ at SQ SN:6 LN:171115067
+ at SQ SN:7 LN:159138663
+ at SQ SN:8 LN:146364022
+ at SQ SN:9 LN:141213431
+ at SQ SN:10 LN:135534747
+ at SQ SN:11 LN:135006516
+ at SQ SN:12 LN:133851895
+ at SQ SN:13 LN:115169878
+ at SQ SN:14 LN:107349540
+ at SQ SN:15 LN:102531392
+ at SQ SN:16 LN:90354753
+ at SQ SN:17 LN:81195210
+ at SQ SN:18 LN:78077248
+ at SQ SN:19 LN:59128983
+ at SQ SN:20 LN:63025520
+ at SQ SN:21 LN:48129895
+ at SQ SN:22 LN:51304566
+ at SQ SN:X LN:155270560
+ at SQ SN:Y LN:59373566
+ at SQ SN:MT LN:16569
+ at RG ID:20FUK.1 PL:illumina PU:20FUKAAXX100202.1 LB:Solexa-18483 SM:NA12878 CN:BI
+readpair1 99 1 1811000 29 58M43S = 1811263 118 CGTGGTGGCAGGCACCTGTAGTCCCAGCTATTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCAGGAGGCGGACCTTGCAGTGAGCCAAGATCGACCCA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA############################################ X0:i:4 X1:i:200 XC:i:58 MD:Z:58 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:0 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+readpair1 147 1 1811030 37 12S89M = 1811004 -118 GATCGTCACATCGTGTCTGCTCTCAGCCTGATGCAATACACAGTTTTAGCTGAAGTCTAAGAGGAAAATTCAGTCTCACATAGATATGCAGCTGGAAAAAG #############AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA X0:i:1 X1:i:0 XC:i:89 MD:Z:89 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:37 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@>YV[Z`X[aXQWXRBGUTX^V]^^VNXRXXSVVUHWQWVUZZX]WHVXUXRUUUV[VJ\XRPXZXTQWWWTVTT_X^_]XOXWWZZY[Z
+readpair2 99 1 1811000 29 58M43S3H = 1811263 118 CGTGGTGGCAGGCACCTGTAGTCCCAGCTATTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCAGGAGGCGGACCTTGCAGTGAGCCAAGATCGACCCA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA############################################ X0:i:4 X1:i:200 XC:i:58 MD:Z:58 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:0 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+readpair2 147 1 1811030 37 4H12S89M = 1811004 -118 GATCGTCACATCGTGTCTGCTCTCAGCCTGATGCAATACACAGTTTTAGCTGAAGTCTAAGAGGAAAATTCAGTCTCACATAGATATGCAGCTGGAAAAAG #############AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA X0:i:1 X1:i:0 XC:i:89 MD:Z:89 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:37 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@>YV[Z`X[aXQWXRBGUTX^V]^^VNXRXXSVVUHWQWVUZZX]WHVXUXRUUUV[VJ\XRPXZXTQWWWTVTT_X^_]XOXWWZZY[Z
+readpair2Dup 1123 1 1811000 29 58M43S3H = 1811263 118 CGTGGTGGCAGGCACCTGTAGTCCCAGCTATTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCAGGAGGCGGACCTTGCAGTGAGCCAAGATCGACCCA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA############################################ X0:i:4 X1:i:200 XC:i:58 MD:Z:58 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:0 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+readpair2Dup 1171 1 1811030 37 4H12S89M = 1811004 -118 GATCGTCACATCGTGTCTGCTCTCAGCCTGATGCAATACACAGTTTTAGCTGAAGTCTAAGAGGAAAATTCAGTCTCACATAGATATGCAGCTGGAAAAAG #############AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA X0:i:1 X1:i:0 XC:i:89 MD:Z:89 RG:Z:20FUK.1 XG:i:0 AM:i:0 NM:i:0 SM:i:37 XM:i:0 XO:i:0 BQ:Z:@@@@@@@@@@@@>YV[Z`X[aXQWXRBGUTX^V]^^VNXRXXSVVUHWQWVUZZX]WHVXUXRUUUV[VJ\XRPXZXTQWWWTVTT_X^_]XOXWWZZY[Z
diff --git a/testdata/picard/sam/onePos.interval_list b/testdata/picard/sam/onePos.interval_list
new file mode 100644
index 0000000..68b7aaf
--- /dev/null
+++ b/testdata/picard/sam/onePos.interval_list
@@ -0,0 +1,10 @@
+ at HD VN:1.0
+ at SQ SN:chr1 LN:101 UR:merger.fasta M5:bd01f7e11515bb6beda8f7257902aa67
+ at SQ SN:chr2 LN:101 UR:merger.fasta M5:31c33e2155b3de5e2554b693c475b310
+ at SQ SN:chr3 LN:101 UR:merger.fasta M5:631593c6dd2048ae88dcce2bd505d295
+ at SQ SN:chr4 LN:101 UR:merger.fasta M5:c60cb92f1ee5b78053c92bdbfa19abf1
+ at SQ SN:chr5 LN:101 UR:merger.fasta M5:07ebc213c7611db0eacbb1590c3e9bda
+ at SQ SN:chr6 LN:101 UR:merger.fasta M5:7be2f5e7ee39e60a6c3b5b6a41178c6d
+ at SQ SN:chr7 LN:404 UR:merger.fasta M5:da488fc432cdaf2c20c96da473a7b630
+ at SQ SN:chr8 LN:202 UR:merger.fasta M5:d339678efce576d5546e88b49a487b63
+chr7 32 32 + .
diff --git a/testdata/picard/vcf/dummy.reference.dict b/testdata/picard/vcf/dummy.reference.dict
new file mode 100644
index 0000000..7d43142
--- /dev/null
+++ b/testdata/picard/vcf/dummy.reference.dict
@@ -0,0 +1,2 @@
+ at HD VN:1.5 SO:unsorted
+ at SQ SN:chr1 LN:540 M5:8d1131bdad4dc1a11abd8d8e11c69909 UR:file:testdata/picard/vcf/dummy.reference.fasta
diff --git a/testdata/picard/vcf/dummy.reference.fasta b/testdata/picard/vcf/dummy.reference.fasta
new file mode 100644
index 0000000..176e153
--- /dev/null
+++ b/testdata/picard/vcf/dummy.reference.fasta
@@ -0,0 +1,10 @@
+>chr1 dna:chromosome chromosome:GRCh37:1:1:540:1
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
diff --git a/testdata/picard/vcf/filter/testFilteringNoSeqDictionary.vcf b/testdata/picard/vcf/filter/testFilteringNoSeqDictionary.vcf
new file mode 100644
index 0000000..9b9d600
--- /dev/null
+++ b/testdata/picard/vcf/filter/testFilteringNoSeqDictionary.vcf
@@ -0,0 +1,65 @@
+##fileformat=VCFv4.1
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
+##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GATKCommandLine=<ID=GenotypeGVCFs,Version=3.2-77-gbf8aa36,Date="Fri Aug 29 09:15:21 EDT 2014",Epoch=1409318121531,CommandLineOptions="analysis_type=GenotypeGVCFs input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/seq/references/Homo_sapiens_assembly19/v1/variant_calling/exome_calling_regions.v1.interval_list] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq [...]
+##GATKCommandLine=<ID=HaplotypeCaller,Version=3.2-77-gbf8aa36,Date="Thu Aug 28 10:36:15 EDT 2014",Epoch=1409236575395,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[RP697.NA12878.bam] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/seq/references/Homo_sapiens_assembly19/v1/variant_calling/exome_calling_regions.v1.interval_list] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 re [...]
+##GVCFBlock=minGQ=0(inclusive),maxGQ=1(exclusive)
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=CCC,Number=1,Type=Integer,Description="Number of called chromosomes">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of all GQ values">
+##INFO=<ID=GQ_STDDEV,Number=1,Type=Float,Description="Standard deviation of all GQ values">
+##INFO=<ID=HWP,Number=1,Type=Float,Description="P value from test of Hardy Weinberg Equilibrium">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=NCC,Number=1,Type=Integer,Description="Number of no-called samples">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
+##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
+1 324822 tf1 A T 2213.77 . AC=2;AF=1.00;AN=2;DP=79;FS=0.000;GQ_MEAN=234.00;MLEAC=2;MLEAF=1.00;MQ=25.61;MQ0=0;NCC=0;QD=28.38;SOR=0.855 GT:AD:DP:GQ:PL 1/1:0,78:78:99:2242,234,0
+1 883899 rs72631890 T G 1315.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=3.58;ClippingRankSum=-1.034e+00;DB;DP=123;FS=0.000;GQ_MEAN=1344.00;MLEAC=1;MLEAF=0.500;MQ=57.87;MQ0=0;MQRankSum=-3.820e-01;NCC=0;QD=10.70;ReadPosRankSum=0.085;SOR=0.765 GT:AD:DP:GQ:PL 0/1:72,51:123:99:1344,0,1858
+1 899942 rs71509448 G A 172.80 . AC=2;AF=1.00;AN=2;DB;DP=11;FS=0.000;GQ_MEAN=21.00;MLEAC=2;MLEAF=1.00;MQ=50.96;MQ0=0;NCC=0;QD=15.71;SOR=4.977 GT:AD:DP:GQ:PGT:PID:PL 1/1:0,11:11:21:1|1:899928_G_C:201,21,0
+1 900298 rs71628926 C G 165.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-2.793e+00;ClippingRankSum=0.135;DB;DP=12;FS=0.000;GQ_MEAN=147.00;MLEAC=1;MLEAF=0.500;MQ=56.04;MQ0=0;MQRankSum=0.135;NCC=0;QD=13.81;ReadPosRankSum=0.135;SOR=1.022 GT:AD:DP:GQ:PL 0/1:5,7:12:99:194,0,147
+1 909419 rs28548431 C T 522.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=2.61;ClippingRankSum=-2.420e-01;DB;DP=54;FS=5.418;GQ_MEAN=551.00;MLEAC=1;MLEAF=0.500;MQ=60.00;MQ0=0;MQRankSum=1.28;NCC=0;QD=9.68;ReadPosRankSum=-5.110e-01;SOR=2.066 GT:AD:DP:GQ:PL 0/1:34,20:54:99:551,0,978
+1 912049 rs9803103 T C 336.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.522e+00;ClippingRankSum=-2.082e+00;DB;DP=41;FS=0.000;GQ_MEAN=365.00;MLEAC=1;MLEAF=0.500;MQ=58.87;MQ0=0;MQRankSum=0.612;NCC=0;QD=9.90;ReadPosRankSum=1.07;SOR=1.179 GT:AD:DP:GQ:PL 0/1:20,14:34:99:365,0,564
+1 914333 rs13302979 C G 151.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=1.23;ClippingRankSum=-2.130e+00;DB;DP=14;FS=0.000;GQ_MEAN=180.00;MLEAC=1;MLEAF=0.500;MQ=60.00;MQ0=0;MQRankSum=-9.680e-01;NCC=0;QD=10.84;ReadPosRankSum=-1.226e+00;SOR=0.412 GT:AD:DP:GQ:PL 0/1:8,6:14:99:180,0,224
+1 914414 tf2 CGAA C 97.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-9.030e-01;ClippingRankSum=-2.650e-01;DP=21;FS=0.000;GQ_MEAN=126.00;MLEAC=1;MLEAF=0.500;MQ=57.06;MQ0=0;MQRankSum=-2.650e-01;NCC=0;QD=5.43;ReadPosRankSum=-6.230e-01;SOR=0.892 GT:AD:DP:GQ:PL 0/1:14,4:18:99:126,0,575
+1 914852 rs13303368 G C 664.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-9.500e-02;ClippingRankSum=1.13;DB;DP=40;FS=3.468;GQ_MEAN=546.00;MLEAC=1;MLEAF=0.500;MQ=58.36;MQ0=0;MQRankSum=0.829;NCC=0;QD=16.62;ReadPosRankSum=-9.380e-01;SOR=2.019 GT:AD:DP:GQ:PL 0/1:18,22:40:99:693,0,546
+1 914876 rs13302983 T C 2241.77 . AC=2;AF=1.00;AN=2;DB;DP=61;FS=0.000;GQ_MEAN=189.00;MLEAC=2;MLEAF=1.00;MQ=59.62;MQ0=0;NCC=0;QD=30.09;SOR=2.948 GT:AD:DP:GQ:PL 1/1:0,61:61:99:2270,189,0
+1 914940 rs13303033 T C 1330.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=2.60;ClippingRankSum=-1.980e-01;DB;DP=98;FS=5.001;GQ_MEAN=1170.00;MLEAC=1;MLEAF=0.500;MQ=58.76;MQ0=0;MQRankSum=0.344;NCC=0;QD=13.86;ReadPosRankSum=2.22;SOR=0.603 GT:AD:DP:GQ:PL 0/1:47,49:96:99:1359,0,1170
+1 915227 rs13303355 A G 6323.77 . AC=2;AF=1.00;AN=2;DB;DP=168;FS=0.000;GQ_MEAN=508.00;MLEAC=2;MLEAF=1.00;MQ=59.73;MQ0=0;NCC=0;QD=31.26;SOR=3.456 GT:AD:DP:GQ:PL 1/1:0,168:168:99:6352,508,0
+1 916549 rs6660139 A G 1033.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=2.85;ClippingRankSum=0.722;DB;DP=98;FS=0.000;GQ_MEAN=1062.00;MLEAC=1;MLEAF=0.500;MQ=59.34;MQ0=0;MQRankSum=-3.550e-01;NCC=0;QD=10.66;ReadPosRankSum=-7.950e-01;SOR=0.773 GT:AD:DP:GQ:PL 0/1:57,40:97:99:1062,0,1626
+1 916590 rs28566954 G A 793.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-4.441e+00;ClippingRankSum=-1.191e+00;DB;DP=41;FS=0.000;GQ_MEAN=372.00;MLEAC=1;MLEAF=0.500;MQ=59.44;MQ0=0;MQRankSum=0.154;NCC=0;QD=19.36;ReadPosRankSum=0.042;SOR=0.346 GT:AD:DP:GQ:PL 0/1:13,28:41:99:822,0,372
+1 935222 rs2298214 C A 1381.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-6.667e+00;ClippingRankSum=-1.069e+00;DB;DP=104;FS=2.093;GQ_MEAN=1410.00;MLEAC=1;MLEAF=0.500;MQ=59.78;MQ0=0;MQRankSum=0.595;NCC=0;QD=13.29;ReadPosRankSum=1.23;SOR=1.673 GT:AD:DP:GQ:PL 0/1:52,52:104:99:1410,0,1574
+1 948921 rs15842 T C 1963.77 . AC=2;AF=1.00;AN=2;DB;DP=62;FS=0.000;GQ_MEAN=186.00;MLEAC=2;MLEAF=1.00;MQ=51.79;MQ0=0;NCC=0;QD=31.67;SOR=1.057 GT:AD:DP:GQ:PL 1/1:0,62:62:99:1992,186,0
+1 948929 tf3 GGCCCACA G 777.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=1.94;ClippingRankSum=-5.070e-01;DP=67;FS=0.000;GQ_MEAN=806.00;MLEAC=1;MLEAF=0.500;MQ=51.26;MQ0=0;MQRankSum=-4.852e+00;NCC=0;QD=6.94;ReadPosRankSum=0.209;SOR=0.730 GT:AD:DP:GQ:PL 0/1:26,22:48:99:806,0,981
+1 949608 rs1921 G A 1741.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-6.267e+00;ClippingRankSum=-4.930e-01;DB;DP=131;FS=3.479;GQ_MEAN=1770.00;MLEAC=1;MLEAF=0.500;MQ=58.32;MQ0=0;MQRankSum=-1.115e+00;NCC=0;QD=13.50;ReadPosRankSum=0.884;SOR=1.050 GT:AD:DP:GQ:PL 0/1:67,62:129:99:1770,0,2109
+1 949654 rs8997 A G 7066.77 . AC=2;AF=1.00;AN=2;DB;DP=201;FS=0.000;GQ_MEAN=596.00;MLEAC=2;MLEAF=1.00;MQ=59.03;MQ0=0;NCC=0;QD=29.00;SOR=1.863 GT:AD:DP:GQ:PL 1/1:0,198:198:99:7095,596,0
+1 977330 rs2799066 T C 1397.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=5.53;ClippingRankSum=0.116;DB;DP=128;FS=8.556;GQ_MEAN=1426.00;MLEAC=1;MLEAF=0.500;MQ=59.23;MQ0=0;MQRankSum=1.03;NCC=0;QD=11.75;ReadPosRankSum=2.08;SOR=1.516 GT:AD:DP:GQ:PL 0/1:69,50:119:99:1426,0,1861
+1 977570 rs2710876 G A 251.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.002e+00;ClippingRankSum=-7.660e-01;DB;DP=15;FS=0.000;GQ_MEAN=174.00;MLEAC=1;MLEAF=0.500;MQ=60.00;MQ0=0;MQRankSum=0.059;NCC=0;QD=16.78;ReadPosRankSum=-1.355e+00;SOR=1.112 GT:AD:DP:GQ:PL 0/1:6,9:15:99:280,0,174
+1 978603 rs138543546 CCT C 1281.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.863e+00;ClippingRankSum=0.545;DB;DP=65;FS=0.000;GQ_MEAN=1253.00;MLEAC=1;MLEAF=0.500;MQ=58.98;MQ0=0;MQRankSum=-1.083e+00;NCC=0;QD=20.03;ReadPosRankSum=-2.830e-01;SOR=0.263 GT:AD:DP:GQ:PL 0/1:30,34:64:99:1310,0,1253
+1 981087 rs3128098 A G 712.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=0.874;ClippingRankSum=-2.870e-01;DB;DP=69;FS=3.654;GQ_MEAN=741.00;MLEAC=1;MLEAF=0.500;MQ=59.39;MQ0=0;MQRankSum=0.336;NCC=0;QD=10.33;ReadPosRankSum=-1.770e-01;SOR=1.296 GT:AD:DP:GQ:PL 0/1:41,28:69:99:741,0,1074
diff --git a/testdata/picard/vcf/test.over.chain b/testdata/picard/vcf/test.over.chain
new file mode 100644
index 0000000..fb0a9c8
--- /dev/null
+++ b/testdata/picard/vcf/test.over.chain
@@ -0,0 +1,3 @@
+chain 540 chr1 540 + 0 540 chr1 540 - 0 540 2
+540
+
diff --git a/testdata/picard/vcf/testLiftover.vcf b/testdata/picard/vcf/testLiftover.vcf
new file mode 100644
index 0000000..e6161ed
--- /dev/null
+++ b/testdata/picard/vcf/testLiftover.vcf
@@ -0,0 +1,4 @@
+##fileformat=VCFv4.1
+#CHROM POS ID REF ALT QUAL FILTER INFO
+chr1 1 . C CCCCT 15676.17 PASS .
+chr1 61 . CA C 724.43 PASS .
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/picard-tools.git
More information about the debian-med-commit
mailing list