[med-svn] [picard-tools] 01/02: Imported Upstream version 2.7.1+dfsg
Andreas Tille
tille at debian.org
Thu Oct 27 16:11:02 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository picard-tools.
commit de94da5bdffb7f09b560b25fececb5fbd4ab34ea
Author: Andreas Tille <tille at debian.org>
Date: Thu Oct 27 17:07:49 2016 +0200
Imported Upstream version 2.7.1+dfsg
---
README.md | 4 +
.../analysis/AlignmentSummaryMetricsCollector.java | 12 +-
src/main/java/picard/analysis/ChimeraUtil.java | 5 +-
.../picard/analysis/CollectMultipleMetrics.java | 22 +-
.../java/picard/analysis/CollectOxoGMetrics.java | 7 +-
.../analysis/CollectQualityYieldMetrics.java | 4 +-
.../java/picard/analysis/CollectRawWgsMetrics.java | 55 ++-
.../java/picard/analysis/CollectWgsMetrics.java | 430 ++++++++++++++++-----
.../CollectWgsMetricsFromSampledSites.java | 52 ++-
.../CollectWgsMetricsWithNonZeroCoverage.java | 98 ++++-
.../picard/analysis/GcBiasMetricsCollector.java | 4 +-
.../java/picard/analysis/GcBiasSummaryMetrics.java | 2 +-
.../{replicates => }/MergeableMetricBase.java | 22 +-
.../picard/analysis/TheoreticalSensitivity.java | 4 +-
.../CollectSequencingArtifactMetrics.java | 91 +++--
.../analysis/artifacts/ErrorSummaryMetrics.java | 36 ++
.../artifacts/SequencingArtifactMetrics.java | 5 +-
.../analysis/directed/CalculateHsMetrics.java | 4 +-
.../picard/analysis/directed/CollectHsMetrics.java | 4 +-
.../analysis/directed/CollectTargetedMetrics.java | 4 +-
.../directed/InsertSizeMetricsCollector.java | 2 +-
.../analysis/directed/TargetMetricsCollector.java | 18 +-
.../CollectIndependentReplicateMetrics.java | 4 +-
.../replicates/IndependentReplicateMetric.java | 2 +
.../java/picard/cmdline/PicardCommandLine.java | 2 +-
.../programgroups/Fingerprinting.java} | 38 +-
.../java/picard/fingerprint/CheckFingerprint.java | 137 +++++--
.../CrosscheckReadGroupFingerprints.java | 4 +-
.../picard/fingerprint/FingerprintChecker.java | 73 +++-
.../picard/fingerprint/FingerprintResults.java | 18 +-
.../picard/illumina/parser/readers/BclReader.java | 2 +-
.../java/picard/sam/AbstractAlignmentMerger.java | 30 +-
src/main/java/picard/sam/DuplicationMetrics.java | 56 +--
src/main/java/picard/sam/FastqToSam.java | 26 +-
src/main/java/picard/sam/MergeBamAlignment.java | 4 +-
src/main/java/picard/sam/RevertSam.java | 14 +-
src/main/java/picard/sam/SetNmAndUqTags.java | 79 +---
.../{SetNmAndUqTags.java => SetNmMdAndUqTags.java} | 21 +-
.../markduplicates/EstimateLibraryComplexity.java | 2 +-
.../picard/sam/markduplicates/MarkDuplicates.java | 10 +-
.../SimpleMarkDuplicatesWithMateCigar.java | 19 +-
.../UmiAwareDuplicateSetIterator.java | 124 ++++++
.../UmiAwareMarkDuplicatesWithMateCigar.java | 89 +++++
.../java/picard/sam/markduplicates/UmiGraph.java | 218 +++++++++++
.../AbstractMarkDuplicatesCommandLineProgram.java | 2 +-
...ctOpticalDuplicateFinderCommandLineProgram.java | 4 +-
src/main/java/picard/util/DbSnpBitSetUtil.java | 29 +-
.../picard/vcf/CollectVariantCallingMetrics.java | 3 +-
src/main/java/picard/vcf/filter/FilterVcf.java | 2 +-
src/main/resources/picard/analysis/wgsHistogram.R | 4 +-
.../analysis/CollectInsertSizeMetricsTest.java | 52 +--
.../picard/analysis/CollectWgsMetricsTest.java | 8 +-
.../CollectWgsMetricsWithNonZeroCoverageTest.java | 127 ++++++
.../{replicates => }/MergeableMetricBaseTest.java | 27 +-
.../analysis/TheoreticalSensitivityTest.java | 15 +-
src/test/java/picard/analysis/WgsMetricsTest.java | 108 ++++++
.../CollectSequencingArtifactMetricsTest.java | 1 +
.../directed/CollectTargetedMetricsTest.java | 4 +-
.../picard/fingerprint/FingerprintCheckerTest.java | 43 ++-
.../java/picard/sam/DuplicationMetricsTest.java | 91 +++++
.../java/picard/sam/MergeBamAlignmentTest.java | 100 +++--
src/test/java/picard/sam/RevertSamTest.java | 2 +-
...ndUqTagsTest.java => SetNmMdAndUqTagsTest.java} | 6 +-
...ractMarkDuplicatesCommandLineProgramTester.java | 2 +-
.../UmiAwareMarkDuplicatesWithMateCigarTest.java | 166 ++++++++
.../UmiAwareMarkDuplicatesWithMateCigarTester.java | 167 ++++++++
.../java/picard/sam/testers/SamFileTester.java | 4 +-
src/test/java/picard/vcf/VcfTestUtils.java | 40 ++
.../no_bq_cutoff.error_summary_metrics | 15 +
.../no_mq_cutoff.error_summary_metrics | 15 +
.../unmapped_mate.error_summary_metrics | 15 +
.../with_context.error_summary_metrics | 15 +
.../with_dbsnp.error_summary_metrics | 15 +
.../with_intervals.error_summary_metrics | 15 +
...apiens_assembly19.haplotype_database.subset.txt | 92 +++++
testdata/picard/fingerprint/NA12891.fp.vcf | 101 +++++
testdata/picard/fingerprint/NA12891.vcf | 101 +++++
testdata/picard/fingerprint/NA12892.fp.vcf | 101 +++++
testdata/picard/fingerprint/NA12892.vcf | 101 +++++
.../sam/MergeBamAlignment/cliptest.aligned.sam | 20 +-
.../sam/MergeBamAlignment/cliptest.unmapped.sam | 8 +-
.../sam/MergeBamAlignment/contam.expected.sam | 8 +-
.../sam/MergeBamAlignment/removetags.aligned.sam | 10 +
.../picard/sam/MergeBamAlignment/removetags.dict | 2 +
.../picard/sam/MergeBamAlignment/removetags.fasta | 21 +
.../sam/MergeBamAlignment/removetags.fasta.fai | 1 +
.../sam/MergeBamAlignment/removetags.unmapped.sam | 10 +
87 files changed, 2979 insertions(+), 551 deletions(-)
diff --git a/README.md b/README.md
index d6e303c..7c45cbd 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,10 @@ During development in Picard, it is sometimes necessary to build locally against
* Run `./gradlew install printVersion` in your htsjdk clone to install that version to your local maven repository. Take note of the version number that gets printed at the end.
* Switch back to your Picard clone, and run `./gradlew shadowJar -Dhtsjdk.version=VERSION`, where VERSION is the version of HTSJDK you installed to your local maven repository.
+####Releasing Picard
+
+Full instructions on how to create a new release of Picard are [here](https://github.com/broadinstitute/picard/wiki/How-to-release-Picard)
+
----
It's also possible to build a version of Picard that supports reading from
diff --git a/src/main/java/picard/analysis/AlignmentSummaryMetricsCollector.java b/src/main/java/picard/analysis/AlignmentSummaryMetricsCollector.java
index 6d9507c..1dba3fe 100644
--- a/src/main/java/picard/analysis/AlignmentSummaryMetricsCollector.java
+++ b/src/main/java/picard/analysis/AlignmentSummaryMetricsCollector.java
@@ -24,13 +24,7 @@
package picard.analysis;
-import htsjdk.samtools.AlignmentBlock;
-import htsjdk.samtools.BAMRecord;
-import htsjdk.samtools.CigarElement;
-import htsjdk.samtools.CigarOperator;
-import htsjdk.samtools.ReservedTagConstants;
-import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CoordMath;
@@ -253,7 +247,7 @@ public class AlignmentSummaryMetricsCollector extends SAMRecordAndReferenceMulti
metrics.READS_ALIGNED_IN_PAIRS++;
// Check that both ends have mapq > minimum
- final Integer mateMq = record.getIntegerAttribute("MQ");
+ final Integer mateMq = record.getIntegerAttribute(SAMTag.MQ.toString());
if (mateMq == null || mateMq >= MAPPING_QUALITY_THRESOLD && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) {
++this.chimerasDenominator;
@@ -267,7 +261,7 @@ public class AlignmentSummaryMetricsCollector extends SAMRecordAndReferenceMulti
// Consider chimeras that occur *within* the read using the SA tag
if (record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) {
++this.chimerasDenominator;
- if (record.getAttribute("SA") != null) ++this.chimeras;
+ if (record.getAttribute(SAMTag.SA.toString()) != null) ++this.chimeras;
}
}
}
diff --git a/src/main/java/picard/analysis/ChimeraUtil.java b/src/main/java/picard/analysis/ChimeraUtil.java
index 2235ccd..91e65b8 100644
--- a/src/main/java/picard/analysis/ChimeraUtil.java
+++ b/src/main/java/picard/analysis/ChimeraUtil.java
@@ -25,6 +25,7 @@
package picard.analysis;
import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamPairUtil.PairOrientation;
@@ -66,7 +67,7 @@ public class ChimeraUtil {
(Math.abs(r1.getInferredInsertSize()) > maxInsertSize || // either far apart on the same contig
!r1.getReferenceIndex().equals(r2.getReferenceIndex()) || // or on different contigs
!matchesExpectedOrientations(r1, expectedOrientations) || // or in unexpected orientations
- r2.getAttribute("SA") != null); // (another check for an unexpected orientation here)
+ r2.getAttribute(SAMTag.SA.toString()) != null); // (another check for an unexpected orientation here)
}
private static boolean isMappedPair(final SAMRecord rec) {
@@ -74,6 +75,6 @@ public class ChimeraUtil {
}
private static boolean matchesExpectedOrientations(final SAMRecord rec, final Set<PairOrientation> expectedOrientations) {
- return expectedOrientations.contains(SamPairUtil.getPairOrientation(rec)) && rec.getAttribute("SA") == null;
+ return expectedOrientations.contains(SamPairUtil.getPairOrientation(rec)) && rec.getAttribute(SAMTag.SA.toString()) == null;
}
}
diff --git a/src/main/java/picard/analysis/CollectMultipleMetrics.java b/src/main/java/picard/analysis/CollectMultipleMetrics.java
index 4b53813..3c5cf6a 100644
--- a/src/main/java/picard/analysis/CollectMultipleMetrics.java
+++ b/src/main/java/picard/analysis/CollectMultipleMetrics.java
@@ -61,7 +61,8 @@ public class CollectMultipleMetrics extends CommandLineProgram {
static final String USAGE_DETAILS ="This 'meta-metrics' tool runs one or more of the metrics collection modules at the same" +
" time to cut down on the time spent reading in data from input files. Available modules include " +
"CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, QualityScoreDistribution, MeanQualityByCycle, " +
- "and CollectBaseDistributionByCycle. The tool produces outputs of '.pdf' and '.txt' files for each module, except for the " +
+ "CollectBaseDistributionByCycle, CollectGcBiasMetrics, RnaSeqMetrics, CollectSequencingArtifactMetrics, and CollectQualityYieldMetrics. " +
+ "The tool produces outputs of '.pdf' and '.txt' files for each module, except for the " +
"CollectAlignmentSummaryMetrics module, which outputs only a '.txt' file. Output files are named by specifying a base name " +
"(without any file extensions).<br /><br />" +
"" +
@@ -88,8 +89,13 @@ public class CollectMultipleMetrics extends CommandLineProgram {
"</pre>" +
"<hr />";
public static interface ProgramInterface {
+ /** By default, this method calls the {@link #makeInstance(String, String, File, File, Set, File, File)} method without 'includeUnpaired' parameter. */
+ default SinglePassSamProgram makeInstance(final String outbase, final String outext, final File input, final File reference,
+ final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals, final boolean includeUnpaired) {
+ return makeInstance(outbase, outext, input, reference, metricAccumulationLevel, dbSnp, intervals);
+ }
SinglePassSamProgram makeInstance(final String outbase, final String outext, final File input, final File reference,
- final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals);
+ final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals);
public boolean needsReferenceSequence();
public boolean supportsMetricAccumulationLevel();
}
@@ -275,11 +281,16 @@ public class CollectMultipleMetrics extends CommandLineProgram {
public boolean supportsMetricAccumulationLevel() { return false; }
@Override
public SinglePassSamProgram makeInstance(final String outbase, final String outext, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
+ return makeInstance(outbase, outext, input, reference, metricAccumulationLevel, dbSnp, intervals, false);
+ }
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase, final String outext, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals, final boolean includeUnpaired) {
final CollectSequencingArtifactMetrics program = new CollectSequencingArtifactMetrics();
program.OUTPUT = new File(outbase);
program.FILE_EXTENSION = outext;
program.DB_SNP = dbSnp;
program.INTERVALS = intervals;
+ program.INCLUDE_UNPAIRED = includeUnpaired;
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway. These are set here to make sure that in case of a the derived class
// overrides
@@ -342,9 +353,12 @@ public class CollectMultipleMetrics extends CommandLineProgram {
public File INTERVALS;
@Option(doc = "VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis " +
- "by some PROGRAMs, PROGRAMS whose CLP doesn't allow for this argument will quetly ignore it.", optional = true)
+ "by some PROGRAMs; PROGRAMs whose CLP doesn't allow for this argument will quietly ignore it.", optional = true)
public File DB_SNP;
+ @Option(shortName = "UNPAIRED", doc = "Include unpaired reads in CollectSequencingArtifactMetrics. If set to true then all paired reads will be included as well - " +
+ "MINIMUM_INSERT_SIZE and MAXIMUM_INSERT_SIZE will be ignored in CollectSequencingArtifactMetrics.")
+ public boolean INCLUDE_UNPAIRED = false;
/**
* Contents of PROGRAM set is transferred to this set during command-line validation, so that an outside
* developer can invoke this class programmatically and provide alternative Programs to run by calling
@@ -395,7 +409,7 @@ public class CollectMultipleMetrics extends CommandLineProgram {
}
final String outext = (null != FILE_EXTENSION) ? FILE_EXTENSION : ""; // Add a file extension if desired
- final SinglePassSamProgram instance = program.makeInstance(OUTPUT, outext, INPUT, REFERENCE_SEQUENCE, METRIC_ACCUMULATION_LEVEL, DB_SNP, INTERVALS);
+ final SinglePassSamProgram instance = program.makeInstance(OUTPUT, outext, INPUT, REFERENCE_SEQUENCE, METRIC_ACCUMULATION_LEVEL, DB_SNP, INTERVALS, INCLUDE_UNPAIRED);
// Generally programs should not be accessing these directly but it might make things smoother
// to just set them anyway
diff --git a/src/main/java/picard/analysis/CollectOxoGMetrics.java b/src/main/java/picard/analysis/CollectOxoGMetrics.java
index 5ff0878..ae3c101 100644
--- a/src/main/java/picard/analysis/CollectOxoGMetrics.java
+++ b/src/main/java/picard/analysis/CollectOxoGMetrics.java
@@ -60,6 +60,7 @@ import java.util.Set;
import static htsjdk.samtools.util.CodeUtil.getOrElse;
import static htsjdk.samtools.util.SequenceUtil.generateAllKmers;
import static java.lang.Math.log10;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* Class for trying to quantify the CpCG->CpCA error rate.
@@ -117,7 +118,7 @@ public class CollectOxoGMetrics extends CommandLineProgram {
doc = "The minimum base quality score for a base to be included in analysis.")
public int MINIMUM_QUALITY_SCORE = 20;
- @Option(shortName = "MQ",
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME,
doc = "The minimum mapping quality score for a base to be included in analysis.")
public int MINIMUM_MAPPING_QUALITY = 30;
@@ -129,6 +130,9 @@ public class CollectOxoGMetrics extends CommandLineProgram {
doc = "The maximum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.")
public int MAXIMUM_INSERT_SIZE = 600;
+ @Option(shortName = "NON_PF", doc = "Whether or not to include non-PF reads.")
+ public boolean INCLUDE_NON_PF_READS = true;
+
@Option(doc = "When available, use original quality scores for filtering.")
public boolean USE_OQ = true;
@@ -283,6 +287,7 @@ public class CollectOxoGMetrics extends CommandLineProgram {
}
iterator.setEmitUncoveredLoci(false);
iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
+ iterator.setIncludeNonPfReads(INCLUDE_NON_PF_READS);
final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
filters.add(new NotPrimaryAlignmentFilter());
diff --git a/src/main/java/picard/analysis/CollectQualityYieldMetrics.java b/src/main/java/picard/analysis/CollectQualityYieldMetrics.java
index e2f9a58..f3d16f6 100644
--- a/src/main/java/picard/analysis/CollectQualityYieldMetrics.java
+++ b/src/main/java/picard/analysis/CollectQualityYieldMetrics.java
@@ -156,10 +156,10 @@ public class CollectQualityYieldMetrics extends SinglePassSamProgram {
public static class QualityYieldMetrics extends MetricBase {
/** The total number of reads in the input file */
- public int TOTAL_READS = 0;
+ public long TOTAL_READS = 0;
/** The number of reads that are PF - pass filter */
- public int PF_READS = 0;
+ public long PF_READS = 0;
/** The average read length of all the reads (will be fixed for a lane) */
public int READ_LENGTH = 0;
diff --git a/src/main/java/picard/analysis/CollectRawWgsMetrics.java b/src/main/java/picard/analysis/CollectRawWgsMetrics.java
index bde9b13..1dc534e 100644
--- a/src/main/java/picard/analysis/CollectRawWgsMetrics.java
+++ b/src/main/java/picard/analysis/CollectRawWgsMetrics.java
@@ -24,10 +24,14 @@
package picard.analysis;
+import htsjdk.samtools.util.Histogram;
+import htsjdk.samtools.util.IntervalList;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
/**
* Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing
* experiments, same implementation as CollectWgsMetrics, with different defaults: lacks baseQ and mappingQ filters
@@ -71,7 +75,7 @@ public class CollectRawWgsMetrics extends CollectWgsMetrics{
"<a href='https://broadinstitute.github.io/picard/picard-metric-definitions.html#CollectWgsMetrics.WgsMetrics'>" +
"the WgsMetrics documentation</a> for detailed explanations of the output metrics." +
"<hr />";
- @Option(shortName="MQ", doc="Minimum mapping quality for a read to contribute coverage.")
+ @Option(shortName=MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc="Minimum mapping quality for a read to contribute coverage.")
public int MINIMUM_MAPPING_QUALITY = 0;
@Option(shortName="Q", doc="Minimum base quality for a base to contribute coverage.")
@@ -84,11 +88,54 @@ public class CollectRawWgsMetrics extends CollectWgsMetrics{
public int LOCUS_ACCUMULATION_CAP = 200000;
// rename the class so that in the metric file it is annotated differently.
- public static class RawWgsMetrics extends WgsMetrics {}
+ public static class RawWgsMetrics extends WgsMetrics {
+ public RawWgsMetrics() {
+ super();
+ }
+
+ public RawWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ super(intervals, depthHistogram, pctExcludedByMapq, pctExcludedByDupes, pctExcludedByPairing, pctExcludedByBaseq,
+ pctExcludedByOverlap, pctExcludedByCapping, pctTotal, coverageCap, baseQHistogram, sampleSize);
+ }
+ }
@Override
- protected WgsMetrics generateWgsMetrics() {
- return new RawWgsMetrics();
+ protected WgsMetrics generateWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ return new RawWgsMetrics(
+ intervals,
+ depthHistogram,
+ pctExcludedByMapq,
+ pctExcludedByDupes,
+ pctExcludedByPairing,
+ pctExcludedByBaseq,
+ pctExcludedByOverlap,
+ pctExcludedByCapping,
+ pctTotal,
+ coverageCap,
+ baseQHistogram,
+ sampleSize);
}
}
diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java
index 0b76735..9a9db29 100644
--- a/src/main/java/picard/analysis/CollectWgsMetrics.java
+++ b/src/main/java/picard/analysis/CollectWgsMetrics.java
@@ -24,22 +24,16 @@
package picard.analysis;
import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryAlignmentFilter;
-import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
-import htsjdk.samtools.util.Histogram;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.IntervalList;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.ProgressLogger;
-import htsjdk.samtools.util.QualityUtil;
-import htsjdk.samtools.util.SamLocusIterator;
-import htsjdk.samtools.util.SequenceUtil;
+import htsjdk.samtools.util.*;
+import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -55,7 +49,8 @@ import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
-import java.util.stream.IntStream;
+
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments.
@@ -96,7 +91,7 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
@Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "The reference sequence fasta aligned to.")
public File REFERENCE_SEQUENCE;
- @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 20;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage. N bases will be treated as having a base quality " +
@@ -134,74 +129,278 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
private static final double LOG_ODDS_THRESHOLD = 3.0;
/** Metrics for evaluating the performance of whole genome sequencing experiments. */
- public static class WgsMetrics extends MetricBase {
+ public static class WgsMetrics extends MergeableMetricBase {
+
+ /** The intervals over which this metric was computed. */
+ @MergingIsManual
+ protected IntervalList intervals;
+
+ /** The count of sites with a given observed depth. */
+ @MergingIsManual
+ protected final Histogram<Integer> depthHistogram;
+
+ /** The count of bases observed with a given base quality. */
+ @MergingIsManual
+ protected final Histogram<Integer> baseQHistogram;
+
+ /** The maximum depth/coverage to consider. */
+ @MergeByAssertEquals
+ protected final int coverageCap;
+
+ /** The sample size used for theoretical het sensitivity. */
+ @NoMergingKeepsValue
+ protected final int theoreticalHetSensitivitySampleSize;
+
+ /**
+ * Create an instance of this metric that is not mergeable.
+ */
+ public WgsMetrics() {
+ intervals = null;
+ depthHistogram = null;
+ baseQHistogram = null;
+ theoreticalHetSensitivitySampleSize = -1;
+ coverageCap = -1;
+ }
+
+ /**
+ * Create an instance of this metric that is mergeable.
+ *
+ * @param depthHistogram the count of genomic positions observed for each observed depth.
+ * @param pctExcludedByMapq the fraction of aligned bases that were filtered out because they were in reads with low mapping quality.
+ * @param pctExcludedByDupes the fraction of aligned bases that were filtered out because they were in reads marked as duplicates.
+ * @param pctExcludedByPairing the fraction of bases that were filtered out because they were in reads without a mapped mate pair.
+ * @param pctExcludedByBaseq the fraction of aligned bases that were filtered out because they were of low base quality.
+ * @param pctExcludedByOverlap the fraction of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads.
+ * @param pctExcludedByCapping the fraction of aligned bases that were filtered out because they would have raised coverage above the capped value.
+ * @param pctExcludeTotal the fraction of bases excluded across all filters.
+ * @param coverageCap Treat positions with coverage exceeding this value as if they had coverage at this value.
+ * @param baseQHistogram the count of bases observed with a given quality.
+ * @param theoreticalHetSensitivitySampleSize the sample size used for theoretical het sensitivity sampling.
+ */
+ public WgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctExcludeTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int theoreticalHetSensitivitySampleSize) {
+ this.intervals = intervals.uniqued();
+ this.depthHistogram = depthHistogram;
+ this.baseQHistogram = baseQHistogram;
+ this.coverageCap = coverageCap;
+ this.theoreticalHetSensitivitySampleSize = theoreticalHetSensitivitySampleSize;
+
+ PCT_EXC_MAPQ = pctExcludedByMapq;
+ PCT_EXC_DUPE = pctExcludedByDupes;
+ PCT_EXC_UNPAIRED = pctExcludedByPairing;
+ PCT_EXC_BASEQ = pctExcludedByBaseq;
+ PCT_EXC_OVERLAP = pctExcludedByOverlap;
+ PCT_EXC_CAPPED = pctExcludedByCapping;
+ PCT_EXC_TOTAL = pctExcludeTotal;
+
+ calculateDerivedFields();
+ }
/** The number of non-N bases in the genome reference over which coverage will be evaluated. */
+ @NoMergingIsDerived
public long GENOME_TERRITORY;
/** The mean coverage in bases of the genome territory, after all filters are applied. */
+ @NoMergingIsDerived
public double MEAN_COVERAGE;
/** The standard deviation of coverage of the genome after all filters are applied. */
+ @NoMergingIsDerived
public double SD_COVERAGE;
/** The median coverage in bases of the genome territory, after all filters are applied. */
+ @NoMergingIsDerived
public double MEDIAN_COVERAGE;
/** The median absolute deviation of coverage of the genome after all filters are applied. */
+ @NoMergingIsDerived
public double MAD_COVERAGE;
/** The fraction of aligned bases that were filtered out because they were in reads with low mapping quality (default is < 20). */
+ @NoMergingIsDerived
public double PCT_EXC_MAPQ;
/** The fraction of aligned bases that were filtered out because they were in reads marked as duplicates. */
+ @NoMergingIsDerived
public double PCT_EXC_DUPE;
/** The fraction of aligned bases that were filtered out because they were in reads without a mapped mate pair. */
+ @NoMergingIsDerived
public double PCT_EXC_UNPAIRED;
/** The fraction of aligned bases that were filtered out because they were of low base quality (default is < 20). */
+ @NoMergingIsDerived
public double PCT_EXC_BASEQ;
/** The fraction of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads. */
+ @NoMergingIsDerived
public double PCT_EXC_OVERLAP;
/** The fraction of aligned bases that were filtered out because they would have raised coverage above the capped value (default cap = 250x). */
+ @NoMergingIsDerived
public double PCT_EXC_CAPPED;
/** The total fraction of aligned bases excluded due to all filters. */
+ @NoMergingIsDerived
public double PCT_EXC_TOTAL;
/** The fraction of bases that attained at least 1X sequence coverage in post-filtering bases. */
- public double PCT_1X;
+ @NoMergingIsDerived
+ public double PCT_1X;
/** The fraction of bases that attained at least 5X sequence coverage in post-filtering bases. */
- public double PCT_5X;
+ @NoMergingIsDerived
+ public double PCT_5X;
/** The fraction of bases that attained at least 10X sequence coverage in post-filtering bases. */
- public double PCT_10X;
+ @NoMergingIsDerived
+ public double PCT_10X;
/** The fraction of bases that attained at least 15X sequence coverage in post-filtering bases. */
- public double PCT_15X;
+ @NoMergingIsDerived
+ public double PCT_15X;
/** The fraction of bases that attained at least 20X sequence coverage in post-filtering bases. */
- public double PCT_20X;
+ @NoMergingIsDerived
+ public double PCT_20X;
/** The fraction of bases that attained at least 25X sequence coverage in post-filtering bases. */
- public double PCT_25X;
+ @NoMergingIsDerived
+ public double PCT_25X;
/** The fraction of bases that attained at least 30X sequence coverage in post-filtering bases. */
- public double PCT_30X;
+ @NoMergingIsDerived
+ public double PCT_30X;
/** The fraction of bases that attained at least 40X sequence coverage in post-filtering bases. */
- public double PCT_40X;
+ @NoMergingIsDerived
+ public double PCT_40X;
/** The fraction of bases that attained at least 50X sequence coverage in post-filtering bases. */
- public double PCT_50X;
+ @NoMergingIsDerived
+ public double PCT_50X;
/** The fraction of bases that attained at least 60X sequence coverage in post-filtering bases. */
- public double PCT_60X;
+ @NoMergingIsDerived
+ public double PCT_60X;
/** The fraction of bases that attained at least 70X sequence coverage in post-filtering bases. */
- public double PCT_70X;
+ @NoMergingIsDerived
+ public double PCT_70X;
/** The fraction of bases that attained at least 80X sequence coverage in post-filtering bases. */
- public double PCT_80X;
+ @NoMergingIsDerived
+ public double PCT_80X;
/** The fraction of bases that attained at least 90X sequence coverage in post-filtering bases. */
- public double PCT_90X;
+ @NoMergingIsDerived
+ public double PCT_90X;
/** The fraction of bases that attained at least 100X sequence coverage in post-filtering bases. */
- public double PCT_100X;
+ @NoMergingIsDerived
+ public double PCT_100X;
/** The theoretical HET SNP sensitivity. */
+ @NoMergingIsDerived
public double HET_SNP_SENSITIVITY;
/** The Phred Scaled Q Score of the theoretical HET SNP sensitivity. */
+ @NoMergingIsDerived
public double HET_SNP_Q;
+
+ /**
+ * Merges another WgsMetrics into this one: unions the interval lists, re-weights the PCT_EXC_* metrics, and adds the histograms.
+ * @param other metric to merge into this one.
+ */
+ @Override
+ public void merge(final MergeableMetricBase other) {
+ final WgsMetrics otherMetric = (WgsMetrics) other;
+
+ if (depthHistogram == null || otherMetric.depthHistogram == null) {
+ throw new PicardException("Depth histogram is required when deriving metrics.");
+ }
+
+ // Union the intervals over which bases are called. They should have no overlaps!
+ // NB: interval lists are already uniqued.
+ final long genomeTerritory = this.intervals.getBaseCount() + otherMetric.intervals.getBaseCount();
+ this.intervals.addall(otherMetric.intervals.getIntervals());
+ this.intervals = this.intervals.uniqued();
+ if (this.intervals.getBaseCount() != genomeTerritory) {
+ throw new PicardException("Trying to merge WgsMetrics calculated on intervals that overlap.");
+ }
+
+ // NB:
+ // Since: PCT_EXC_TOTAL = (totalWithExcludes - thisMetricTotal) / totalWithExcludes;
+ // Thus: totalWithExcludes = total / (1 - PCT_EXC_TOTAL)
+ // Proof: Exercise is left to the reader.
+ final long thisMetricTotal = (long) depthHistogram.getSum();
+ final long otherMetricTotal = (long) otherMetric.depthHistogram.getSum();
+ final long total = thisMetricTotal + otherMetricTotal;
+ final long thisTotalWithExcludes = (long) (thisMetricTotal / (1.0 - PCT_EXC_TOTAL));
+ final long otherTotalWithExcludes = (long) (otherMetricTotal / (1.0 - otherMetric.PCT_EXC_TOTAL));
+ final double totalWithExcludes = thisTotalWithExcludes + otherTotalWithExcludes;
+
+ if (0 < totalWithExcludes) {
+ PCT_EXC_DUPE = (PCT_EXC_DUPE * thisTotalWithExcludes + otherMetric.PCT_EXC_DUPE * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_MAPQ = (PCT_EXC_MAPQ * thisTotalWithExcludes + otherMetric.PCT_EXC_MAPQ * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_UNPAIRED = (PCT_EXC_UNPAIRED * thisTotalWithExcludes + otherMetric.PCT_EXC_UNPAIRED * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_BASEQ = (PCT_EXC_BASEQ * thisTotalWithExcludes + otherMetric.PCT_EXC_BASEQ * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_OVERLAP = (PCT_EXC_OVERLAP * thisTotalWithExcludes + otherMetric.PCT_EXC_OVERLAP * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_CAPPED = (PCT_EXC_CAPPED * thisTotalWithExcludes + otherMetric.PCT_EXC_CAPPED * otherTotalWithExcludes) / totalWithExcludes;
+ PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;
+ }
+
+ // do any merging that is dictated by the field annotations.
+ super.merge(other);
+
+ // merge the histograms
+ this.depthHistogram.addHistogram(otherMetric.depthHistogram);
+ if (baseQHistogram != null && otherMetric.baseQHistogram != null) this.baseQHistogram.addHistogram(otherMetric.baseQHistogram);
+ }
+
+ @Override
+ public void calculateDerivedFields() {
+ if (depthHistogram == null) throw new PicardException("Depth histogram is required when deriving metrics.");
+ if (baseQHistogram != null && theoreticalHetSensitivitySampleSize <= 0) {
+ throw new PicardException("Sample size is required when a baseQ histogram is given when deriving metrics.");
+ }
+
+ final long[] depthHistogramArray = new long[coverageCap+1];
+ for (final Histogram.Bin<Integer> bin : depthHistogram.values()) {
+ final int depth = Math.min((int) bin.getIdValue(), coverageCap);
+ depthHistogramArray[depth] += bin.getValue();
+ }
+
+ GENOME_TERRITORY = (long) depthHistogram.getSumOfValues();
+ MEAN_COVERAGE = depthHistogram.getMean();
+ SD_COVERAGE = depthHistogram.getStandardDeviation();
+ MEDIAN_COVERAGE = depthHistogram.getMedian();
+ MAD_COVERAGE = depthHistogram.getMedianAbsoluteDeviation();
+
+ PCT_1X = MathUtil.sum(depthHistogramArray, 1, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_5X = MathUtil.sum(depthHistogramArray, 5, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_10X = MathUtil.sum(depthHistogramArray, 10, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_15X = MathUtil.sum(depthHistogramArray, 15, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_20X = MathUtil.sum(depthHistogramArray, 20, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_25X = MathUtil.sum(depthHistogramArray, 25, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_30X = MathUtil.sum(depthHistogramArray, 30, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_40X = MathUtil.sum(depthHistogramArray, 40, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_50X = MathUtil.sum(depthHistogramArray, 50, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_60X = MathUtil.sum(depthHistogramArray, 60, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_70X = MathUtil.sum(depthHistogramArray, 70, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_80X = MathUtil.sum(depthHistogramArray, 80, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_90X = MathUtil.sum(depthHistogramArray, 90, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+ PCT_100X = MathUtil.sum(depthHistogramArray, 100, depthHistogramArray.length) / (double) GENOME_TERRITORY;
+
+ // Get Theoretical Het SNP Sensitivity
+ if (baseQHistogram != null) {
+ final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram);
+ final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHistogram);
+ HET_SNP_SENSITIVITY = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, LOG_ODDS_THRESHOLD);
+ HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - HET_SNP_SENSITIVITY));
+ }
+ }
}
public static void main(final String[] args) {
new CollectWgsMetrics().instanceMainWithExit(args);
}
+ /** Gets the SamReader from which records will be examined. This will also set the header so that it is
+ * available via {@link #getSamFileHeader()}. */
+ protected SamReader getSamReader() {
+ final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
+ this.header = in.getFileHeader();
+ return in;
+ }
+
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
@@ -220,13 +419,12 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
// Setup all the inputs
final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
- final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
+ final SamReader in = getSamReader();
final SamLocusIterator iterator = getLocusIterator(in);
- this.header = in.getFileHeader();
final List<SamRecordFilter> filters = new ArrayList<>();
- final CountingFilter dupeFilter = new CountingDuplicateFilter();
final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
+ final CountingFilter dupeFilter = new CountingDuplicateFilter();
final CountingPairedFilter pairFilter = new CountingPairedFilter();
// The order in which filters are added matters!
filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
@@ -242,7 +440,7 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
iterator.setIncludeNonPfReads(false);
iterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP);
- final WgsMetricsCollector collector = getCollector(COVERAGE_CAP);
+ final WgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine());
final boolean usingStopAfter = STOP_AFTER > 0;
final long stopAfter = STOP_AFTER - 1;
@@ -273,12 +471,92 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
return 0;
}
+ /** Gets the intervals over which we will calculate metrics. */
+ protected IntervalList getIntervalsToExamine() {
+ final IntervalList intervals;
+ if (INTERVALS != null) {
+ IOUtil.assertFileIsReadable(INTERVALS);
+ intervals = IntervalList.fromFile(INTERVALS);
+ } else {
+ intervals = new IntervalList(this.header);
+ for (final SAMSequenceRecord rec : this.header.getSequenceDictionary().getSequences()) {
+ final Interval interval = new Interval(rec.getSequenceName(), 1, rec.getSequenceLength());
+ intervals.add(interval);
+ }
+ }
+ return intervals;
+ }
+
+ /** This method should only be called after {@link #getSamReader()} has been called. */
protected SAMFileHeader getSamFileHeader() {
+ if (this.header == null) throw new IllegalStateException("getSamFileHeader() was called but this.header is null");
return this.header;
}
- protected WgsMetrics generateWgsMetrics() {
- return new WgsMetrics();
+ protected WgsMetrics generateWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int theoreticalHetSensitivitySampleSize) {
+ return new WgsMetrics(
+ intervals,
+ depthHistogram,
+ pctExcludedByMapq,
+ pctExcludedByDupes,
+ pctExcludedByPairing,
+ pctExcludedByBaseq,
+ pctExcludedByOverlap,
+ pctExcludedByCapping,
+ pctTotal,
+ coverageCap,
+ baseQHistogram,
+ theoreticalHetSensitivitySampleSize
+ );
+ }
+
+ private WgsMetrics generateWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final long basesExcludedByMapq,
+ final long basesExcludedByDupes,
+ final long basesExcludedByPairing,
+ final long basesExcludedByBaseq,
+ final long basesExcludedByOverlap,
+ final long basesExcludedByCapping,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int theoreticalHetSensitivitySampleSize) {
+ final double total = depthHistogram.getSum();
+ final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
+
+ final double pctExcludedByMapq = (0 == totalWithExcludes) ? 0 : (basesExcludedByMapq / totalWithExcludes);
+ final double pctExcludedByDupes = (0 == totalWithExcludes) ? 0 : (basesExcludedByDupes / totalWithExcludes);
+ final double pctExcludedByPairing = (0 == totalWithExcludes) ? 0 : (basesExcludedByPairing / totalWithExcludes);
+ final double pctExcludedByBaseq = (0 == totalWithExcludes) ? 0 : (basesExcludedByBaseq / totalWithExcludes);
+ final double pctExcludedByOverlap = (0 == totalWithExcludes) ? 0 : (basesExcludedByOverlap / totalWithExcludes);
+ final double pctExcludedByCapping = (0 == totalWithExcludes) ? 0 : (basesExcludedByCapping / totalWithExcludes);
+ final double pctTotal = (0 == totalWithExcludes) ? 0 : ((totalWithExcludes - total) / totalWithExcludes);
+
+ return generateWgsMetrics(
+ intervals,
+ depthHistogram,
+ pctExcludedByMapq,
+ pctExcludedByDupes,
+ pctExcludedByPairing,
+ pctExcludedByBaseq,
+ pctExcludedByOverlap,
+ pctExcludedByCapping,
+ pctTotal,
+ coverageCap,
+ baseQHistogram,
+ theoreticalHetSensitivitySampleSize
+ );
}
/**
@@ -293,8 +571,13 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
return (INTERVALS != null) ? new SamLocusIterator(in, IntervalList.fromFile(INTERVALS)) : new SamLocusIterator(in);
}
- protected WgsMetricsCollector getCollector(final int coverageCap) {
- return new WgsMetricsCollector(coverageCap);
+ /**
+ * @param coverageCap the maximum depth/coverage to consider.
+ * @param intervals the intervals over which metrics are collected.
+ * @return a new WgsMetricsCollector configured with the given coverage cap and intervals.
+ */
+ protected WgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) {
+ return new WgsMetricsCollector(coverageCap, intervals);
}
protected class WgsMetricsCollector {
@@ -305,12 +588,14 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
private long basesExcludedByBaseq = 0;
private long basesExcludedByOverlap = 0;
private long basesExcludedByCapping = 0;
+ protected final IntervalList intervals;
protected final int coverageCap;
- public WgsMetricsCollector(final int coverageCap) {
+ public WgsMetricsCollector(final int coverageCap, final IntervalList intervals) {
depthHistogramArray = new long[coverageCap + 1];
baseQHistogramArray = new long[Byte.MAX_VALUE];
- this.coverageCap = coverageCap;
+ this.coverageCap = coverageCap;
+ this.intervals = intervals;
}
public void addInfo(final SamLocusIterator.LocusInfo info, final ReferenceSequence ref) {
@@ -352,9 +637,9 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
}
protected void addMetricsToFile(final MetricsFile<WgsMetrics, Integer> file,
- final CountingFilter dupeFilter,
- final CountingFilter mapqFilter,
- final CountingPairedFilter pairFilter) {
+ final CountingFilter dupeFilter,
+ final CountingFilter mapqFilter,
+ final CountingPairedFilter pairFilter) {
// get the depth histogram and metrics
final Histogram<Integer> depthHistogram = getDepthHistogram();
final WgsMetrics metrics = getMetrics(depthHistogram, dupeFilter, mapqFilter, pairFilter);
@@ -372,7 +657,7 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
return getHistogram(baseQHistogramArray, "value", "baseq_count");
}
- private Histogram<Integer> getHistogram(final long[] array, final String binLabel, final String valueLabel) {
+ protected Histogram<Integer> getHistogram(final long[] array, final String binLabel, final String valueLabel) {
final Histogram<Integer> histogram = new Histogram<>(binLabel, valueLabel);
for (int i = 0; i < array.length; ++i) {
histogram.increment(i, array[i]);
@@ -381,55 +666,22 @@ static final String USAGE_DETAILS = "<p>This tool collects metrics about the fra
}
protected WgsMetrics getMetrics(final Histogram<Integer> depthHistogram,
- final CountingFilter dupeFilter,
- final CountingFilter mapqFilter,
- final CountingPairedFilter pairFilter) {
-
- // the base q het histogram
-
- final WgsMetrics metrics = generateWgsMetrics();
- metrics.GENOME_TERRITORY = (long) depthHistogram.getSumOfValues();
- metrics.MEAN_COVERAGE = depthHistogram.getMean();
- metrics.SD_COVERAGE = depthHistogram.getStandardDeviation();
- metrics.MEDIAN_COVERAGE = depthHistogram.getMedian();
- metrics.MAD_COVERAGE = depthHistogram.getMedianAbsoluteDeviation();
-
- final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
- final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
- final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
- final double total = depthHistogram.getSum();
- final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
-
- metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
- metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
- metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
- metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
- metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
- metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
- metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;
-
- metrics.PCT_1X = MathUtil.sum(depthHistogramArray, 1, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_5X = MathUtil.sum(depthHistogramArray, 5, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_10X = MathUtil.sum(depthHistogramArray, 10, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_15X = MathUtil.sum(depthHistogramArray, 15, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_20X = MathUtil.sum(depthHistogramArray, 20, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_25X = MathUtil.sum(depthHistogramArray, 25, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_30X = MathUtil.sum(depthHistogramArray, 30, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_40X = MathUtil.sum(depthHistogramArray, 40, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_50X = MathUtil.sum(depthHistogramArray, 50, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_60X = MathUtil.sum(depthHistogramArray, 60, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_70X = MathUtil.sum(depthHistogramArray, 70, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_80X = MathUtil.sum(depthHistogramArray, 80, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_90X = MathUtil.sum(depthHistogramArray, 90, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
- metrics.PCT_100X = MathUtil.sum(depthHistogramArray, 100, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY;
-
- // Get Theoretical Het SNP Sensitivity
- final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram);
- final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(getBaseQHistogram());
- metrics.HET_SNP_SENSITIVITY = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, SAMPLE_SIZE, LOG_ODDS_THRESHOLD);
- metrics.HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - metrics.HET_SNP_SENSITIVITY));
-
- return metrics;
+ final CountingFilter dupeFilter,
+ final CountingFilter mapqFilter,
+ final CountingPairedFilter pairFilter) {
+ return generateWgsMetrics(
+ this.intervals,
+ depthHistogram,
+ getBasesExcludedBy(mapqFilter),
+ getBasesExcludedBy(dupeFilter),
+ getBasesExcludedBy(pairFilter),
+ basesExcludedByBaseq,
+ basesExcludedByOverlap,
+ basesExcludedByCapping,
+ coverageCap,
+ getBaseQHistogram(),
+ SAMPLE_SIZE
+ );
}
}
}
diff --git a/src/main/java/picard/analysis/CollectWgsMetricsFromSampledSites.java b/src/main/java/picard/analysis/CollectWgsMetricsFromSampledSites.java
index 35c73f2..d580354 100644
--- a/src/main/java/picard/analysis/CollectWgsMetricsFromSampledSites.java
+++ b/src/main/java/picard/analysis/CollectWgsMetricsFromSampledSites.java
@@ -81,11 +81,53 @@ public class CollectWgsMetricsFromSampledSites extends CollectWgsMetrics {
}
// rename the class so that in the metric file it is annotated differently.
- public static class SampledWgsMetrics extends WgsMetrics {}
+ public static class SampledWgsMetrics extends WgsMetrics {
+ public SampledWgsMetrics() {
+ super();
+ }
- @Override
- protected WgsMetrics generateWgsMetrics() {
- return new SampledWgsMetrics();
+ public SampledWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ super(intervals, depthHistogram, pctExcludedByMapq, pctExcludedByDupes, pctExcludedByPairing, pctExcludedByBaseq,
+ pctExcludedByOverlap, pctExcludedByCapping, pctTotal, coverageCap, baseQHistogram, sampleSize);
+ }
}
-}
+
+ @Override
+ protected WgsMetrics generateWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ return new SampledWgsMetrics(
+ intervals,
+ depthHistogram,
+ pctExcludedByMapq,
+ pctExcludedByDupes,
+ pctExcludedByPairing,
+ pctExcludedByBaseq,
+ pctExcludedByOverlap,
+ pctExcludedByCapping,
+ pctTotal,
+ coverageCap,
+ baseQHistogram,
+ sampleSize);
+ }}
diff --git a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java
index a829cc5..6144a41 100644
--- a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java
+++ b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java
@@ -25,11 +25,9 @@
package picard.analysis;
import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.samtools.SamReader;
import htsjdk.samtools.metrics.MetricsFile;
-import htsjdk.samtools.util.Histogram;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.StringUtil;
+import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@@ -39,7 +37,11 @@ import picard.filter.CountingPairedFilter;
import picard.util.RExecutor;
import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.util.List;
+import java.util.stream.Collectors;
@CommandLineProgramProperties(
usage = CollectWgsMetricsWithNonZeroCoverage.USAGE_SUMMARY + CollectWgsMetricsWithNonZeroCoverage.USAGE_DETAILS,
@@ -75,12 +77,35 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
// Store this here since we need access to it in the doWork method
private WgsMetricsWithNonZeroCoverageCollector collector = null;
+ private SamReader samReader = null;
+
/** Metrics for evaluating the performance of whole genome sequencing experiments. */
public static class WgsMetricsWithNonZeroCoverage extends WgsMetrics {
public enum Category { WHOLE_GENOME, NON_ZERO_REGIONS }
/** One of either WHOLE_GENOME or NON_ZERO_REGIONS */
+ @MergeByAssertEquals
public Category CATEGORY;
+
+ public WgsMetricsWithNonZeroCoverage() {
+ super();
+ }
+
+ public WgsMetricsWithNonZeroCoverage(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ super(intervals, depthHistogram, pctExcludedByMapq, pctExcludedByDupes, pctExcludedByPairing, pctExcludedByBaseq,
+ pctExcludedByOverlap, pctExcludedByCapping, pctTotal, coverageCap, baseQHistogram, sampleSize);
+ }
}
public static void main(final String[] args) {
@@ -88,16 +113,28 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
}
@Override
+ protected SamReader getSamReader() {
+ if (this.samReader == null) {
+ this.samReader = super.getSamReader();
+ }
+ return this.samReader;
+ }
+
+ @Override
protected int doWork() {
IOUtil.assertFileIsWritable(CHART_OUTPUT);
+ IOUtil.assertFileIsReadable(INPUT);
- this.collector = new WgsMetricsWithNonZeroCoverageCollector(COVERAGE_CAP);
+ // Initialize the SamReader so the header is available prior to super.doWork(), for the getIntervalsToExamine() call.
+ getSamReader();
- final List<SAMReadGroupRecord> readGroups = this.getSamFileHeader().getReadGroups();
- final String plotSubtitle = (readGroups.size() == 1) ? StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary()) : "";
+ this.collector = new WgsMetricsWithNonZeroCoverageCollector(COVERAGE_CAP, getIntervalsToExamine());
super.doWork();
+ final List<SAMReadGroupRecord> readGroups = getSamFileHeader().getReadGroups();
+ final String plotSubtitle = (readGroups.size() == 1) ? StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary()) : "";
+
if (collector.areHistogramsEmpty()) {
log.warn("No valid bases found in input file. No plot will be produced.");
} else {
@@ -115,12 +152,34 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
}
@Override
- protected WgsMetricsWithNonZeroCoverage generateWgsMetrics() {
- return new WgsMetricsWithNonZeroCoverage();
+ protected WgsMetrics generateWgsMetrics(final IntervalList intervals,
+ final Histogram<Integer> depthHistogram,
+ final double pctExcludedByMapq,
+ final double pctExcludedByDupes,
+ final double pctExcludedByPairing,
+ final double pctExcludedByBaseq,
+ final double pctExcludedByOverlap,
+ final double pctExcludedByCapping,
+ final double pctTotal,
+ final int coverageCap,
+ final Histogram<Integer> baseQHistogram,
+ final int sampleSize) {
+ return new WgsMetricsWithNonZeroCoverage(
+ intervals,
+ depthHistogram,
+ pctExcludedByMapq,
+ pctExcludedByDupes,
+ pctExcludedByPairing,
+ pctExcludedByBaseq,
+ pctExcludedByOverlap,
+ pctExcludedByCapping,
+ pctTotal,
+ coverageCap,
+ baseQHistogram,
+ sampleSize);
}
-
@Override
- protected WgsMetricsCollector getCollector(final int coverageCap) {
+ protected WgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) {
assert(coverageCap == this.collector.coverageCap);
return this.collector;
}
@@ -128,8 +187,8 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
protected class WgsMetricsWithNonZeroCoverageCollector extends WgsMetricsCollector {
Histogram<Integer> depthHistogram = null;
- public WgsMetricsWithNonZeroCoverageCollector(final int coverageCap) {
- super(coverageCap);
+ public WgsMetricsWithNonZeroCoverageCollector(final int coverageCap, final IntervalList intervals) {
+ super(coverageCap, intervals);
}
@Override
@@ -139,7 +198,7 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
final CountingFilter mapqFilter,
final CountingPairedFilter pairFilter) {
this.depthHistogram = getDepthHistogram();
- final Histogram<Integer> depthHistogramNonZero = depthHistogramNonZero();
+ final Histogram<Integer> depthHistogramNonZero = getDepthHistogramNonZero();
final WgsMetricsWithNonZeroCoverage metrics = (WgsMetricsWithNonZeroCoverage) getMetrics(depthHistogram, dupeFilter, mapqFilter, pairFilter);
final WgsMetricsWithNonZeroCoverage metricsNonZero = (WgsMetricsWithNonZeroCoverage) getMetrics(depthHistogramNonZero, dupeFilter, mapqFilter, pairFilter);
@@ -149,14 +208,21 @@ public class CollectWgsMetricsWithNonZeroCoverage extends CollectWgsMetrics {
file.addMetric(metrics);
file.addMetric(metricsNonZero);
+ file.addHistogram(depthHistogram);
+ file.addHistogram(depthHistogramNonZero);
if (includeBQHistogram) {
addBaseQHistogram(file);
}
}
- private Histogram<Integer> depthHistogramNonZero() {
- final Histogram<Integer> depthHistogram = new Histogram<>("coverage", "count");
+ @Override
+ protected Histogram<Integer> getDepthHistogram() {
+ return getHistogram(depthHistogramArray, "coverage", "count_WHOLE_GENOME");
+ }
+
+ private Histogram<Integer> getDepthHistogramNonZero() {
+ final Histogram<Integer> depthHistogram = new Histogram<>("coverage", "count_NON_ZERO_REGIONS");
// do not include the zero-coverage bin
for (int i = 1; i < depthHistogramArray.length; ++i) {
depthHistogram.increment(i, depthHistogramArray[i]);
diff --git a/src/main/java/picard/analysis/GcBiasMetricsCollector.java b/src/main/java/picard/analysis/GcBiasMetricsCollector.java
index e372da6..f2d3a2f 100644
--- a/src/main/java/picard/analysis/GcBiasMetricsCollector.java
+++ b/src/main/java/picard/analysis/GcBiasMetricsCollector.java
@@ -190,7 +190,7 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
final int[] readsByGc = gcCur.readsByGc;
final long[] errorsByGc = gcCur.errorsByGc;
final long[] basesByGc = gcCur.basesByGc;
- final int totalClusters = gcCur.totalClusters;
+ final long totalClusters = gcCur.totalClusters;
final long totalAlignedReads = gcCur.totalAlignedReads;
final String group = gcCur.group;
@@ -308,7 +308,7 @@ public class GcBiasMetricsCollector extends MultiLevelCollector<GcBiasMetrics, I
//Keeps track of each level of GcCalculation
/////////////////////////////////////////////////////////////////////////////
class GcObject {
- int totalClusters = 0;
+ long totalClusters = 0;
long totalAlignedReads = 0;
int[] readsByGc = new int[BINS];
long[] basesByGc = new long[BINS];
diff --git a/src/main/java/picard/analysis/GcBiasSummaryMetrics.java b/src/main/java/picard/analysis/GcBiasSummaryMetrics.java
index 0808776..2b9ef41 100644
--- a/src/main/java/picard/analysis/GcBiasSummaryMetrics.java
+++ b/src/main/java/picard/analysis/GcBiasSummaryMetrics.java
@@ -38,7 +38,7 @@ public class GcBiasSummaryMetrics extends MultilevelMetrics {
public int WINDOW_SIZE;
/** The total number of clusters that were seen in the gc bias calculation. */
- public int TOTAL_CLUSTERS;
+ public long TOTAL_CLUSTERS;
/** The total number of aligned reads used to compute the gc bias metrics. */
public long ALIGNED_READS;
diff --git a/src/main/java/picard/analysis/replicates/MergeableMetricBase.java b/src/main/java/picard/analysis/MergeableMetricBase.java
similarity index 90%
rename from src/main/java/picard/analysis/replicates/MergeableMetricBase.java
rename to src/main/java/picard/analysis/MergeableMetricBase.java
index 0f94d24..4f686a7 100644
--- a/src/main/java/picard/analysis/replicates/MergeableMetricBase.java
+++ b/src/main/java/picard/analysis/MergeableMetricBase.java
@@ -20,8 +20,9 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
+ *
*/
-package picard.analysis.replicates;
+package picard.analysis;
import htsjdk.samtools.metrics.MetricBase;
@@ -44,18 +45,33 @@ import java.lang.reflect.Field;
*/
public class MergeableMetricBase extends MetricBase {
+ /** Metrics whose values can be merged by adding. */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
protected @interface MergeByAdding {}
+ /** Metrics whose values should be equal when merging. */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
protected @interface MergeByAssertEquals {}
+ /** Metrics that are not merged, but are subsequently derived from other metrics, for example by
+ * {@link #calculateDerivedFields()}. */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
protected @interface NoMergingIsDerived {}
+ /** Metrics that are merged manually in {@link #merge(MergeableMetricBase)}. Typically these metrics need
+ * access to both metrics being merged. */
+ @Retention(RetentionPolicy.RUNTIME)
+ @Target(ElementType.FIELD)
+ protected @interface MergingIsManual {}
+
+ /** Metrics that are not merged. */
+ @Retention(RetentionPolicy.RUNTIME)
+ @Target(ElementType.FIELD)
+ protected @interface NoMergingKeepsValue {}
+
/** checks if this instance can be merged with another
*
* Other must have all the fields that this instance has, and
@@ -115,7 +131,9 @@ public class MergeableMetricBase extends MetricBase {
if (field.getAnnotationsByType(MergeByAdding.class).length +
field.getAnnotationsByType(MergeByAssertEquals.class).length +
- field.getAnnotationsByType(NoMergingIsDerived.class).length == 0) {
+ field.getAnnotationsByType(NoMergingIsDerived.class).length +
+ field.getAnnotationsByType(MergingIsManual.class).length +
+ field.getAnnotationsByType(NoMergingKeepsValue.class).length == 0) {
throw new IllegalStateException("All fields of this class must be annotated with @MergeByAdding, @NoMergingIsDerived, or @MergeByAssertEquals. " +
"Field " + field.getName() + " isn't annotated.");
}
diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java
index 48ebd9b..9b93ab4 100644
--- a/src/main/java/picard/analysis/TheoreticalSensitivity.java
+++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java
@@ -198,7 +198,9 @@ public class TheoreticalSensitivity {
final double[] normalizedHistogram = new double[histogram.size()];
for (int i = 0; i < histogram.size(); i++) {
- normalizedHistogram[i] = histogram.get(i).getValue() / histogramSumOfValues;
+ if (histogram.get(i) != null) {
+ normalizedHistogram[i] = histogram.get(i).getValue() / histogramSumOfValues;
+ }
}
return normalizedHistogram;
}
diff --git a/src/main/java/picard/analysis/artifacts/CollectSequencingArtifactMetrics.java b/src/main/java/picard/analysis/artifacts/CollectSequencingArtifactMetrics.java
index 9bfc561..25a3269 100644
--- a/src/main/java/picard/analysis/artifacts/CollectSequencingArtifactMetrics.java
+++ b/src/main/java/picard/analysis/artifacts/CollectSequencingArtifactMetrics.java
@@ -4,37 +4,28 @@ import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.filter.AggregateFilter;
-import htsjdk.samtools.filter.AlignedFilter;
-import htsjdk.samtools.filter.DuplicateReadFilter;
-import htsjdk.samtools.filter.FailsVendorReadQualityFilter;
-import htsjdk.samtools.filter.InsertSizeFilter;
-import htsjdk.samtools.filter.MappingQualityFilter;
-import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
-import htsjdk.samtools.filter.SamRecordFilter;
+import htsjdk.samtools.filter.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.IntervalList;
-import htsjdk.samtools.util.IntervalListReferenceSequenceMask;
-import htsjdk.samtools.util.StringUtil;
+import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.analysis.SinglePassSamProgram;
+import picard.analysis.artifacts.SequencingArtifactMetrics.BaitBiasDetailMetrics;
+import picard.analysis.artifacts.SequencingArtifactMetrics.BaitBiasSummaryMetrics;
+import picard.analysis.artifacts.SequencingArtifactMetrics.PreAdapterDetailMetrics;
+import picard.analysis.artifacts.SequencingArtifactMetrics.PreAdapterSummaryMetrics;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import picard.util.DbSnpBitSetUtil;
-import picard.analysis.artifacts.SequencingArtifactMetrics.*;
import java.io.File;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
+import java.util.stream.Collectors;
+import static com.sun.tools.doclets.formats.html.markup.HtmlStyle.header;
import static htsjdk.samtools.util.CodeUtil.getOrElse;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* Quantify substitution errors caused by mismatched base pairings during various
@@ -105,7 +96,7 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
@Option(shortName = "Q", doc = "The minimum base quality score for a base to be included in analysis.")
public int MINIMUM_QUALITY_SCORE = 20;
- @Option(shortName = "MQ", doc = "The minimum mapping quality score for a base to be included in analysis.")
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "The minimum mapping quality score for a base to be included in analysis.")
public int MINIMUM_MAPPING_QUALITY = 30;
@Option(shortName = "MIN_INS", doc = "The minimum insert size for a read to be included in analysis.")
@@ -118,6 +109,12 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
"MINIMUM_INSERT_SIZE and MAXIMUM_INSERT_SIZE will be ignored.")
public boolean INCLUDE_UNPAIRED = false;
+ @Option(shortName = "DUPES", doc = "Include duplicate reads. If set to true then all reads flagged as duplicates will be included as well.")
+ public boolean INCLUDE_DUPLICATES = false;
+
+ @Option(shortName = "NON_PF", doc = "Whether or not to include non-PF reads.")
+ public boolean INCLUDE_NON_PF_READS = false;
+
@Option(shortName = "TANDEM", doc = "Set to true if mate pairs are being sequenced from the same strand, " +
"i.e. they're expected to face the same direction.")
public boolean TANDEM_READS = false;
@@ -142,6 +139,7 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
private File preAdapterDetailsOut;
private File baitBiasSummaryOut;
private File baitBiasDetailsOut;
+ private File errorSummaryFile;
private IntervalListReferenceSequenceMask intervalMask;
private DbSnpBitSetUtil dbSnpMask;
@@ -154,10 +152,6 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
private final Set<String> libraries = new HashSet<String>();
private final Map<String, ArtifactCounter> artifactCounters = new HashMap<String, ArtifactCounter>();
- public static void main(final String[] args) {
- new CollectSequencingArtifactMetrics().instanceMainWithExit(args);
- }
-
@Override
protected String[] customCommandLineValidation() {
final List<String> messages = new ArrayList<String>();
@@ -176,6 +170,8 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
messages.add("MAXIMUM_INSERT_SIZE cannot be less than MINIMUM_INSERT_SIZE unless set to 0");
}
+ if (REFERENCE_SEQUENCE == null) messages.add("REFERENCE_SEQUENCE must be provided.");
+
return messages.isEmpty() ? null : messages.toArray(new String[messages.size()]);
}
@@ -184,13 +180,10 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
final String outext = (null != FILE_EXTENSION) ? FILE_EXTENSION : ""; // Add a file extension if desired
preAdapterSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_SUMMARY_EXT + outext);
preAdapterDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT + outext);
- baitBiasSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT + outext);
- baitBiasDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT + outext);
-
- IOUtil.assertFileIsWritable(preAdapterSummaryOut);
- IOUtil.assertFileIsWritable(preAdapterDetailsOut);
- IOUtil.assertFileIsWritable(baitBiasSummaryOut);
- IOUtil.assertFileIsWritable(baitBiasDetailsOut);
+ baitBiasSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT + outext);
+ baitBiasDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT + outext);
+ errorSummaryFile = new File(OUTPUT + SequencingArtifactMetrics.ERROR_SUMMARY_EXT + outext);
+ IOUtil.assertFilesAreWritable(Arrays.asList(preAdapterSummaryOut, preAdapterDetailsOut, baitBiasSummaryOut, baitBiasDetailsOut, errorSummaryFile));
for (final SAMReadGroupRecord rec : header.getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
@@ -209,9 +202,9 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
// set record-level filters
final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
- filters.add(new FailsVendorReadQualityFilter());
+ if (!INCLUDE_NON_PF_READS) filters.add(new FailsVendorReadQualityFilter());
filters.add(new NotPrimaryAlignmentFilter());
- filters.add(new DuplicateReadFilter());
+ if (!INCLUDE_DUPLICATES) filters.add(new DuplicateReadFilter());
filters.add(new AlignedFilter(true)); // discard unmapped reads
filters.add(new MappingQualityFilter(MINIMUM_MAPPING_QUALITY));
if (!INCLUDE_UNPAIRED) {
@@ -308,6 +301,7 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
final MetricsFile<PreAdapterDetailMetrics, Integer> preAdapterDetailMetricsFile = getMetricsFile();
final MetricsFile<BaitBiasSummaryMetrics, Integer> baitBiasSummaryMetricsFile = getMetricsFile();
final MetricsFile<BaitBiasDetailMetrics, Integer> baitBiasDetailMetricsFile = getMetricsFile();
+ final MetricsFile<ErrorSummaryMetrics,?> errorSummaryMetricsFile = getMetricsFile();
for (final ArtifactCounter counter : artifactCounters.values()) {
// build metrics
@@ -327,13 +321,42 @@ static final String USAGE_DETAILS = "<p>This tool examines two sources of sequen
baitBiasDetailMetricsFile.addMetric(baitBiasDetailMetrics);
}
}
-
}
preAdapterDetailMetricsFile.write(preAdapterDetailsOut);
preAdapterSummaryMetricsFile.write(preAdapterSummaryOut);
baitBiasDetailMetricsFile.write(baitBiasDetailsOut);
baitBiasSummaryMetricsFile.write(baitBiasSummaryOut);
+
+ // Calculate the summary error rates - it's CRITICAL that the other files are written out
+ // first as this code modifies the pre-adapter detail metrics!
+ if (!preAdapterDetailMetricsFile.getMetrics().isEmpty()) {
+ final List<PreAdapterDetailMetrics> in = preAdapterDetailMetricsFile.getMetrics();
+ in.forEach(m -> {
+ if (m.REF_BASE == 'G' || m.REF_BASE == 'T') {
+ m.REF_BASE = (char) SequenceUtil.complement((byte) m.REF_BASE);
+ m.ALT_BASE = (char) SequenceUtil.complement((byte) m.ALT_BASE);
+ }
+ });
+
+ // Group the metrics by error type
+ final Map<String,List<PreAdapterDetailMetrics>> byError =
+ in.stream().collect(Collectors.groupingBy(m -> m.REF_BASE + ">" + m.ALT_BASE));
+
+ for (final String error : new TreeSet<>(byError.keySet())) {
+ final List<PreAdapterDetailMetrics> ms = byError.get(error);
+ final ErrorSummaryMetrics summary = new ErrorSummaryMetrics();
+ summary.REF_BASE = ms.get(0).REF_BASE;
+ summary.ALT_BASE = ms.get(0).ALT_BASE;
+ summary.SUBSTITUTION = error;
+ summary.REF_COUNT = ms.stream().mapToLong(m -> m.PRO_REF_BASES + m.CON_REF_BASES).sum();
+ summary.ALT_COUNT = ms.stream().mapToLong(m -> m.PRO_ALT_BASES + m.CON_ALT_BASES).sum();
+ summary.calculateDerivedFields();
+ errorSummaryMetricsFile.addMetric(summary);
+ }
+ }
+
+ errorSummaryMetricsFile.write(errorSummaryFile);
}
@Override
diff --git a/src/main/java/picard/analysis/artifacts/ErrorSummaryMetrics.java b/src/main/java/picard/analysis/artifacts/ErrorSummaryMetrics.java
new file mode 100644
index 0000000..c18ccb4
--- /dev/null
+++ b/src/main/java/picard/analysis/artifacts/ErrorSummaryMetrics.java
@@ -0,0 +1,36 @@
+package picard.analysis.artifacts;
+
+import picard.analysis.MergeableMetricBase;
+
+/**
+ * Summary metrics produced by {@link CollectSequencingArtifactMetrics} as a roll up of the
+ * context-specific error rates, to provide global error rates per type of base substitution.
+ *
+ * Errors are normalized to the lexically lower reference base and summarized together. E.g.
+ * G>T is converted to C>A and merged with data from C>A for reporting.
+ */
+public class ErrorSummaryMetrics extends MergeableMetricBase {
+ /** The reference base (or it's complement). */
+ @MergeByAssertEquals public char REF_BASE;
+
+ /** The alternative base (or it's complement). */
+ @MergeByAssertEquals public char ALT_BASE;
+
+ /** A single string representing the substitution from REF_BASE to ALT_BASE for convenience. */
+ @MergeByAssertEquals public String SUBSTITUTION;
+
+ /** The number of reference bases observed. */
+ @MergeByAdding public long REF_COUNT;
+
+ /** The number of alt bases observed. */
+ @MergeByAdding public long ALT_COUNT;
+
+ /** The rate of the substitution in question. */
+ @NoMergingIsDerived public double SUBSTITUTION_RATE;
+
+ @Override
+ public void calculateDerivedFields() {
+ final double total = REF_COUNT + ALT_COUNT;
+ this.SUBSTITUTION_RATE = (total == 0) ? 0 : ALT_COUNT / total;
+ }
+}
diff --git a/src/main/java/picard/analysis/artifacts/SequencingArtifactMetrics.java b/src/main/java/picard/analysis/artifacts/SequencingArtifactMetrics.java
index 02eb22c..12c9a23 100644
--- a/src/main/java/picard/analysis/artifacts/SequencingArtifactMetrics.java
+++ b/src/main/java/picard/analysis/artifacts/SequencingArtifactMetrics.java
@@ -8,8 +8,9 @@ import java.util.Comparator;
public class SequencingArtifactMetrics {
public static final String PRE_ADAPTER_SUMMARY_EXT = ".pre_adapter_summary_metrics";
public static final String PRE_ADAPTER_DETAILS_EXT = ".pre_adapter_detail_metrics";
- public static final String BAIT_BIAS_SUMMARY_EXT = ".bait_bias_summary_metrics";
- public static final String BAIT_BIAS_DETAILS_EXT = ".bait_bias_detail_metrics";
+ public static final String BAIT_BIAS_SUMMARY_EXT = ".bait_bias_summary_metrics";
+ public static final String BAIT_BIAS_DETAILS_EXT = ".bait_bias_detail_metrics";
+ public static final String ERROR_SUMMARY_EXT = ".error_summary_metrics";
private static final double MIN_ERROR = 1e-10; // minimum error rate to report
diff --git a/src/main/java/picard/analysis/directed/CalculateHsMetrics.java b/src/main/java/picard/analysis/directed/CalculateHsMetrics.java
index d41c394..68a465c 100644
--- a/src/main/java/picard/analysis/directed/CalculateHsMetrics.java
+++ b/src/main/java/picard/analysis/directed/CalculateHsMetrics.java
@@ -28,6 +28,8 @@ import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
/**
* Calculates a set of HS metrics from a sam or bam file. See HsMetricsCollector and CollectTargetedMetrics for more details.
*
@@ -45,7 +47,7 @@ import picard.cmdline.programgroups.Metrics;
@Deprecated
public class CalculateHsMetrics extends CollectHsMetrics {
- @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 1;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true)
diff --git a/src/main/java/picard/analysis/directed/CollectHsMetrics.java b/src/main/java/picard/analysis/directed/CollectHsMetrics.java
index ba3905b..4c66a62 100644
--- a/src/main/java/picard/analysis/directed/CollectHsMetrics.java
+++ b/src/main/java/picard/analysis/directed/CollectHsMetrics.java
@@ -39,6 +39,8 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
/**
* Collects a set of HS metrics from a sam or bam file. See HsMetricsCollector and CollectTargetedMetrics for more details.
*
@@ -94,7 +96,7 @@ static final String USAGE_DETAILS = "This tool takes a SAM/BAM file input and co
@Option(shortName = "N", doc = "Bait set name. If not provided it is inferred from the filename of the bait intervals.", optional = true)
public String BAIT_SET_NAME;
- @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 20;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true)
diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java
index 5078071..9b35003 100644
--- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java
+++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java
@@ -25,6 +25,8 @@ import java.io.File;
import java.util.List;
import java.util.Set;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
/**
* <p>Both CollectTargetedPCRMetrics and CollectHsSelection share virtually identical program structures except
* for the name of their targeting mechanisms (e.g. bait set or amplicon set). The shared behavior of these programs
@@ -82,7 +84,7 @@ public abstract class CollectTargetedMetrics<METRIC extends MultilevelMetrics, C
"considered 'near probe' and included in percent selected.")
public int NEAR_DISTANCE = TargetedPcrMetricsCollector.NEAR_PROBE_DISTANCE_DEFAULT;
- @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 1;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true)
diff --git a/src/main/java/picard/analysis/directed/InsertSizeMetricsCollector.java b/src/main/java/picard/analysis/directed/InsertSizeMetricsCollector.java
index 5a2d3bf..d278ae8 100644
--- a/src/main/java/picard/analysis/directed/InsertSizeMetricsCollector.java
+++ b/src/main/java/picard/analysis/directed/InsertSizeMetricsCollector.java
@@ -149,7 +149,7 @@ public class InsertSizeMetricsCollector extends MultiLevelCollector<InsertSizeMe
double low = median;
double high = median;
- while (low >= histogram.getMin() || high <= histogram.getMax()) {
+ while (low >= histogram.getMin()-1 || high <= histogram.getMax()+1) {
final Histogram.Bin<Integer> lowBin = histogram.get((int) low);
if (lowBin != null) covered += lowBin.getValue();
diff --git a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java
index e5d16ff..f7c93a5 100644
--- a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java
+++ b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java
@@ -58,14 +58,7 @@ import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Field;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
/**
* TargetMetrics, are metrics to measure how well we hit specific targets (or baits) when using a targeted sequencing process like hybrid selection
@@ -406,6 +399,15 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
this.metrics.PROBE_SET = name;
}
+ /**
+ * Returns the accumulated coverage per target. Note that while the returned Map is
+ * an unmodifiable view, the underlying Map may continue to be mutated if
+ * the view is retrieved prior to additional calls to {@link #acceptRecord(SAMRecord)}.
+ */
+ public Map<Interval, Coverage> getCoverageByTarget() {
+ return Collections.unmodifiableMap(this.coverageByTarget);
+ }
+
/** Adds information about an individual SAMRecord to the statistics. */
public void acceptRecord(final SAMRecord record) {
// Just ignore secondary alignments altogether
diff --git a/src/main/java/picard/analysis/replicates/CollectIndependentReplicateMetrics.java b/src/main/java/picard/analysis/replicates/CollectIndependentReplicateMetrics.java
index e48d7d2..59322a3 100644
--- a/src/main/java/picard/analysis/replicates/CollectIndependentReplicateMetrics.java
+++ b/src/main/java/picard/analysis/replicates/CollectIndependentReplicateMetrics.java
@@ -74,6 +74,8 @@ import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
/**
* A CLP that, given a BAM and a VCF with genotypes of the same sample, estimates the rate of independent replication of reads within the bam.
* That is, it estimates the fraction of the reads which look like duplicates (in the MarkDuplicates sense of the word) but are actually
@@ -124,7 +126,7 @@ public class CollectIndependentReplicateMetrics extends CommandLineProgram {
@Option(shortName = "GQ", doc = "minimal value for the GQ field in the VCF to use variant site.", optional = true)
public Integer MINIMUM_GQ = 90;
- @Option(shortName = "MQ", doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true)
+ @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true)
public Integer MINIMUM_MQ = 40;
@Option(shortName = "BQ", doc = "minimal value for the base quality of a base to be used in the estimation.", optional = true)
diff --git a/src/main/java/picard/analysis/replicates/IndependentReplicateMetric.java b/src/main/java/picard/analysis/replicates/IndependentReplicateMetric.java
index 06db4cc..89dbd9a 100644
--- a/src/main/java/picard/analysis/replicates/IndependentReplicateMetric.java
+++ b/src/main/java/picard/analysis/replicates/IndependentReplicateMetric.java
@@ -24,6 +24,8 @@
package picard.analysis.replicates;
+import picard.analysis.MergeableMetricBase;
+
/**
* A class to store information relevant for biological rate estimation
*
diff --git a/src/main/java/picard/cmdline/PicardCommandLine.java b/src/main/java/picard/cmdline/PicardCommandLine.java
index f3e4d3a..c35198c 100644
--- a/src/main/java/picard/cmdline/PicardCommandLine.java
+++ b/src/main/java/picard/cmdline/PicardCommandLine.java
@@ -303,7 +303,7 @@ public class PicardCommandLine {
}
// Output similar matches
- System.err.println(String.format("'%s' is not a valid command. See PicardCommandLine --help for more information.", command));
+ System.err.println(String.format("'%s' is not a valid command. See PicardCommandLine -h for more information.", command));
if (bestDistance < HELP_SIMILARITY_FLOOR) {
System.err.println(String.format("Did you mean %s?", (bestN < 2) ? "this" : "one of these"));
for (final Class clazz : classes) {
diff --git a/src/main/java/picard/fingerprint/FingerprintResults.java b/src/main/java/picard/cmdline/programgroups/Fingerprinting.java
similarity index 53%
copy from src/main/java/picard/fingerprint/FingerprintResults.java
copy to src/main/java/picard/cmdline/programgroups/Fingerprinting.java
index 5d6b0bc..9b19b39 100644
--- a/src/main/java/picard/fingerprint/FingerprintResults.java
+++ b/src/main/java/picard/cmdline/programgroups/Fingerprinting.java
@@ -1,7 +1,7 @@
/*
* The MIT License
*
- * Copyright (c) 2010 The Broad Institute
+ * Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -21,35 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
+package picard.cmdline.programgroups;
-package picard.fingerprint;
+import picard.cmdline.CommandLineProgramGroup;
-import java.io.File;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-/**
- * Class that is used to represent the results of comparing a read group within a SAM file
- * against one or more set of fingerprint genotypes.
- *
- * @author Tim Fennell
- */
-public class FingerprintResults {
- private final File samFile;
- private final String readGroup;
- private final SortedSet<MatchResults> matchResults = new TreeSet<>();
-
- public FingerprintResults(final File samFile, final String readGroup) {
- this.samFile = samFile;
- this.readGroup = readGroup;
- }
-
- public void addResults(final MatchResults matchResults) {
- this.matchResults.add(matchResults);
- }
-
- public File getSamFile() { return samFile; }
- public String getReadGroup() { return readGroup; }
- public SortedSet<MatchResults> getMatchResults() { return matchResults; }
+public class Fingerprinting implements CommandLineProgramGroup {
+ @Override
+ public String getName() { return "Fingerprinting Tools"; }
+ @Override
+ public String getDescription() { return "Tools for manipulating fingerprints, or related data."; }
}
-
diff --git a/src/main/java/picard/fingerprint/CheckFingerprint.java b/src/main/java/picard/fingerprint/CheckFingerprint.java
index bf11400..9a8e926 100644
--- a/src/main/java/picard/fingerprint/CheckFingerprint.java
+++ b/src/main/java/picard/fingerprint/CheckFingerprint.java
@@ -1,7 +1,7 @@
/*
* The MIT License
*
- * Copyright (c) 2010 The Broad Institute
+ * Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,7 @@
package picard.fingerprint;
+import htsjdk.samtools.BamFileIoUtils;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
@@ -33,6 +34,8 @@ import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
+import htsjdk.variant.vcf.VCFFileReader;
+import htsjdk.variant.vcf.VCFHeader;
import picard.PicardException;
import picard.analysis.FingerprintingDetailMetrics;
import picard.analysis.FingerprintingSummaryMetrics;
@@ -40,30 +43,31 @@ import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
-import picard.cmdline.programgroups.Alpha;
+import picard.cmdline.programgroups.Fingerprinting;
import java.io.File;
-import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
/**
- * Attempts to check the sample identity of the sequence data in the provided SAM/BAM file
+ * Attempts to check the sample identity of the sequence/genotype data in the provided file (SAM/BAM or VCF)
* against a set of known genotypes in the supplied genotype file (in either GELI or VCF format).
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = CheckFingerprint.USAGE_DETAILS,
- usageShort = "Computes a fingerprint from the supplied SAM/BAM file and compares it to the provided genotypes",
- programGroup = Alpha.class // TODO -- when mature please move to a to-be-created Fingerprinting.class
+ usageShort = "Computes a fingerprint from the supplied input (SAM/BAM or VCF) file and compares it to the provided genotypes",
+ programGroup = Fingerprinting.class
)
public class CheckFingerprint extends CommandLineProgram {
- static final String USAGE_DETAILS = "Computes a fingerprint from the supplied SAM/BAM file and " +
+ static final String USAGE_DETAILS = "Computes a fingerprint from the supplied input file (SAM/BAM or VCF) file and " +
"compares it to the expected fingerprint genotypes provided. The key output is a LOD score " +
"which represents the relative likelihood of the sequence data originating from the same " +
"sample as the genotypes vs. from a random sample. Two outputs are produced: (1) a summary " +
- "metrics file that gives metrics related single read group (lane or index within a lane) " +
+ "metrics file that gives metrics at the single sample level (if the input was a VCF) or at the read " +
+ "level (lane or index within a lane) (if the input was a SAM/BAM) " +
"versus a set of known genotypes for the expected sample, and (2) a detail metrics file that " +
"contains an individual SNP/Haplotype comparison within a fingerprint comparison. The two " +
"files may be specified individually using the SUMMARY_OUTPUT and DETAIL_OUTPUT options. " +
@@ -71,10 +75,15 @@ public class CheckFingerprint extends CommandLineProgram {
"files, with the summary metrics having a file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' " +
"and the detail metrics having a file extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'.";
- @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
+ @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file SAM/BAM or VCF. If a VCF is used, " +
+ "it must have at least one sample. If there is more than one sample in the VCF, the parameter OBSERVED_SAMPLE_ALIAS must " +
+ "be provided in order to indicate which sample's data to use. If there are no samples in the VCF, an exception will be thrown.")
public File INPUT;
- @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The base of output files to write. The summary metrics " +
+ @Option(optional = true, doc = "If the input is a VCF, this parameter is used to select which sample's data in the VCF to use.")
+ public String OBSERVED_SAMPLE_ALIAS;
+
+ @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The base prefix of output files to write. The summary metrics " +
"will have the file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' and the detail metrics will have " +
"the extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'.", mutex = {"SUMMARY_OUTPUT", "DETAIL_OUTPUT"})
public String OUTPUT;
@@ -89,10 +98,10 @@ public class CheckFingerprint extends CommandLineProgram {
"any number of genotypes; CheckFingerprint will use only those that are usable for fingerprinting.")
public File GENOTYPES;
- @Option(optional=true, doc = "If using VCF format genotypes, this parameter can be used to specify which sample's " +
- "genotypes to use from the VCF file. If not supplied the sample name from the BAM read group header " +
- "is used instead.")
- public String SAMPLE_ALIAS;
+ @Option(shortName = "SAMPLE_ALIAS", optional=true, doc = "This parameter can be used to specify which sample's genotypes to use from the " +
+ "expected VCF file (the GENOTYPES file). If it is not supplied, the sample name from the input " +
+ "(VCF or BAM read group header) will be used.")
+ public String EXPECTED_SAMPLE_ALIAS;
@Option(shortName="H", doc = "A file of haplotype information produced by the CheckFingerprint program.")
public File HAPLOTYPE_MAP;
@@ -101,7 +110,8 @@ public class CheckFingerprint extends CommandLineProgram {
"where the most likely haplotype achieves at least this LOD.")
public double GENOTYPE_LOD_THRESHOLD = 5;
- @Option(shortName="IGNORE_RG", doc = "If true, treat the entire input BAM as one single read group in the calculation, " +
+ @Option(optional=true, shortName="IGNORE_RG", doc = "If the input is a SAM/BAM, and this parameter is true, treat the " +
+ "entire input BAM as one single read group in the calculation, " +
"ignoring RG annotations, and producing a single fingerprint metric for the entire BAM.")
public boolean IGNORE_READ_GROUPS = false;
@@ -136,30 +146,72 @@ public class CheckFingerprint extends CommandLineProgram {
IOUtil.assertFileIsWritable(outputSummaryMetricsFile);
final FingerprintChecker checker = new FingerprintChecker(HAPLOTYPE_MAP);
+ List<FingerprintResults> results;
- SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), SAMSequenceDictionaryExtractor.extractDictionary(GENOTYPES), true);
- SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), checker.getHeader().getSequenceDictionary(), true);
+ String observedSampleAlias = null;
+ final boolean isBamOrSamFile = isBamOrSamFile(INPUT);
+ if (isBamOrSamFile) {
+ SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), SAMSequenceDictionaryExtractor.extractDictionary(GENOTYPES), true);
+ SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), checker.getHeader().getSequenceDictionary(), true);
- // If sample alias isn't supplied, assume it's the one from the INPUT file's RGs
- if (SAMPLE_ALIAS == null) {
+ // Verify that there's only one sample in the SAM/BAM.
final SamReader in = SamReaderFactory.makeDefault().open(INPUT);
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
- if (SAMPLE_ALIAS == null) {
- SAMPLE_ALIAS = rec.getSample();
+ if (observedSampleAlias == null) {
+ observedSampleAlias = rec.getSample();
}
- else if (!SAMPLE_ALIAS.equals(rec.getSample())) {
- throw new PicardException("SAM File must not contain data from multiple samples.");
+ else if (!observedSampleAlias.equals(rec.getSample())) {
+ throw new PicardException("INPUT SAM/BAM file must not contain data from multiple samples.");
}
}
CloserUtil.close(in);
- }
+ // If expected sample alias isn't supplied, assume it's the one from the INPUT file's RGs
+ if (EXPECTED_SAMPLE_ALIAS == null) {
+ EXPECTED_SAMPLE_ALIAS = observedSampleAlias;
+ }
+
+ results = checker.checkFingerprints(
+ Collections.singletonList(INPUT),
+ Collections.singletonList(GENOTYPES),
+ EXPECTED_SAMPLE_ALIAS,
+ IGNORE_READ_GROUPS);
+ } else { // Input is a VCF
+ // Note that FingerprintChecker.loadFingerprints() verifies that the VCF's Sequence Dictionaries agree with that of the Haplotype Map File
+
+ // Verify that there is only one sample in the VCF
+ final VCFFileReader fileReader = new VCFFileReader(INPUT, false);
+ final VCFHeader fileHeader = fileReader.getFileHeader();
+ if (fileHeader.getNGenotypeSamples() < 1) {
+ throw new PicardException("INPUT VCF file must contain at least one sample.");
+ }
+ if ((fileHeader.getNGenotypeSamples() > 1) && (OBSERVED_SAMPLE_ALIAS == null)) {
+ throw new PicardException("INPUT VCF file contains multiple samples and yet the OBSERVED_SAMPLE_ALIAS parameter is not set.");
+ }
+ // set observedSampleAlias to the parameter, if set. Otherwise, if here, this must be a single sample VCF, get its sample
+ observedSampleAlias = (OBSERVED_SAMPLE_ALIAS != null) ? OBSERVED_SAMPLE_ALIAS : fileHeader.getGenotypeSamples().get(0);
- final List<FingerprintResults> results = checker.checkFingerprints(
- Arrays.asList(INPUT),
- Arrays.asList(GENOTYPES),
- SAMPLE_ALIAS,
- IGNORE_READ_GROUPS);
+ // Now verify that observedSampleAlias is, in fact, in the VCF
+ if (!fileHeader.getGenotypeSamples().contains(observedSampleAlias)) {
+ throw new PicardException("INPUT VCF file does not contain OBSERVED_SAMPLE_ALIAS: " + observedSampleAlias);
+ }
+
+ if (OBSERVED_SAMPLE_ALIAS == null) {
+ observedSampleAlias = fileHeader.getGenotypeSamples().get(0);
+ }
+ fileReader.close();
+
+ // If expected sample alias isn't supplied, assume it's the one from the INPUT file
+ if (EXPECTED_SAMPLE_ALIAS == null) {
+ EXPECTED_SAMPLE_ALIAS = observedSampleAlias;
+ }
+
+ results = checker.checkFingerprints(
+ Collections.singletonList(INPUT),
+ Collections.singletonList(GENOTYPES),
+ observedSampleAlias,
+ EXPECTED_SAMPLE_ALIAS);
+ }
final MetricsFile<FingerprintingSummaryMetrics,?> summaryFile = getMetricsFile();
final MetricsFile<FingerprintingDetailMetrics,?> detailsFile = getMetricsFile();
@@ -169,8 +221,8 @@ public class CheckFingerprint extends CommandLineProgram {
final FingerprintingSummaryMetrics metrics = new FingerprintingSummaryMetrics();
metrics.READ_GROUP = fpr.getReadGroup();
- metrics.SAMPLE = SAMPLE_ALIAS;
- metrics.LL_EXPECTED_SAMPLE = mr.getSampleLikelihood();
+ metrics.SAMPLE = EXPECTED_SAMPLE_ALIAS;
+ metrics.LL_EXPECTED_SAMPLE = mr.getSampleLikelihood();
metrics.LL_RANDOM_SAMPLE = mr.getPopulationLikelihood();
metrics.LOD_EXPECTED_SAMPLE = mr.getLOD();
@@ -202,8 +254,8 @@ public class CheckFingerprint extends CommandLineProgram {
// Build the detail metrics
final FingerprintingDetailMetrics details = new FingerprintingDetailMetrics();
- details.READ_GROUP = fpr.getReadGroup();
- details.SAMPLE = SAMPLE_ALIAS;
+ details.READ_GROUP = fpr.getReadGroup();
+ details.SAMPLE = EXPECTED_SAMPLE_ALIAS;
details.SNP = lr.getSnp().getName();
details.SNP_ALLELES = lr.getSnp().getAlleleString();
details.CHROM = lr.getSnp().getChrom();
@@ -217,7 +269,7 @@ public class CheckFingerprint extends CommandLineProgram {
}
summaryFile.addMetric(metrics);
- log.info(metrics.READ_GROUP + " vs. " + metrics.SAMPLE + ": LOD = " + metrics.LOD_EXPECTED_SAMPLE);
+ log.info("Read Group: " + metrics.READ_GROUP + " / " + observedSampleAlias + " vs. " + metrics.SAMPLE + ": LOD = " + metrics.LOD_EXPECTED_SAMPLE);
}
summaryFile.write(outputSummaryMetricsFile);
@@ -225,4 +277,21 @@ public class CheckFingerprint extends CommandLineProgram {
return 0;
}
+
+ protected String[] customCommandLineValidation() {
+ IOUtil.assertFileIsReadable(INPUT);
+
+ boolean isBamOrSamFile = isBamOrSamFile(INPUT);
+ if (!isBamOrSamFile && IGNORE_READ_GROUPS) {
+ return new String[]{"The parameter IGNORE_READ_GROUPS can only be used with BAM/SAM inputs."};
+ }
+ if (isBamOrSamFile && OBSERVED_SAMPLE_ALIAS != null) {
+ return new String[]{"The parameter OBSERVED_SAMPLE_ALIAS can only be used with a VCF input."};
+ }
+ return super.customCommandLineValidation();
+ }
+
+ private boolean isBamOrSamFile(final File f) {
+ return (BamFileIoUtils.isBamFile(f) || f.getName().endsWith(IOUtil.SAM_FILE_EXTENSION));
+ }
}
diff --git a/src/main/java/picard/fingerprint/CrosscheckReadGroupFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckReadGroupFingerprints.java
index a54ffa1..e4c419f 100644
--- a/src/main/java/picard/fingerprint/CrosscheckReadGroupFingerprints.java
+++ b/src/main/java/picard/fingerprint/CrosscheckReadGroupFingerprints.java
@@ -34,7 +34,7 @@ import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.FormatUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.SAMReadGroupRecord;
-import picard.cmdline.programgroups.Alpha;
+import picard.cmdline.programgroups.Fingerprinting;
import java.io.File;
import java.io.PrintStream;
@@ -50,7 +50,7 @@ import java.util.concurrent.TimeUnit;
@CommandLineProgramProperties(
usage = "Checks if all read groups within a set of BAM files appear to come from the same individual",
usageShort = "Checks if all read groups appear to come from the same individual",
- programGroup = Alpha.class // TODO -- when mature please move to a to-be-created Fingerprinting.class
+ programGroup = Fingerprinting.class
)
public class CrosscheckReadGroupFingerprints extends CommandLineProgram {
diff --git a/src/main/java/picard/fingerprint/FingerprintChecker.java b/src/main/java/picard/fingerprint/FingerprintChecker.java
index 4b4b55e..690dc7b 100644
--- a/src/main/java/picard/fingerprint/FingerprintChecker.java
+++ b/src/main/java/picard/fingerprint/FingerprintChecker.java
@@ -29,18 +29,13 @@ import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
import htsjdk.samtools.filter.SamRecordFilter;
-import htsjdk.samtools.util.Interval;
-import htsjdk.samtools.util.IntervalList;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.SamLocusIterator;
-import htsjdk.samtools.SAMFileReader;
+import htsjdk.samtools.util.*;
import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.util.SequenceUtil;
-import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFFileReader;
import picard.PicardException;
import java.io.File;
@@ -130,13 +125,12 @@ public class FingerprintChecker {
* of an individual sample to load (and exclude all others).
* @return a Map of Sample name to Fingerprint
*/
- public Map<String,Fingerprint> loadFingerprints(final File fingerprintFile, final String specificSample) {
- final IntervalList loci = this.haplotypes.getIntervalList();
- final GenotypeReader reader = new GenotypeReader();
- final GenotypeReader.VariantIterator iterator = reader.read(fingerprintFile, loci);
+ public Map<String, Fingerprint> loadFingerprints(final File fingerprintFile, final String specificSample) {
+ final VCFFileReader reader = new VCFFileReader(fingerprintFile, false);
+ final CloseableIterator<VariantContext> iterator = reader.iterator();
SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(),
- iterator.getSequenceDictionary());
+ reader.getSequenceDictionary(fingerprintFile));
final Map<String, Fingerprint> fingerprints = new HashMap<>();
Set<String> samples = null;
@@ -272,8 +266,10 @@ public class FingerprintChecker {
* the interval list.
*/
public Map<SAMReadGroupRecord, Fingerprint> fingerprintSamFile(final File samFile, final IntervalList loci) {
- final SAMFileReader in = new SAMFileReader(samFile);
- in.enableIndexCaching(true);
+ final SamReader in = SamReaderFactory.makeDefault()
+ .enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES)
+ .open(samFile);
+
SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(),
in.getFileHeader().getSequenceDictionary());
@@ -531,7 +527,7 @@ public class FingerprintChecker {
final Fingerprint combinedFp = new Fingerprint(specificSample, f, null);
fingerprintsByReadGroup.values().forEach(combinedFp::merge);
- final FingerprintResults results = new FingerprintResults(f, specificSample);
+ final FingerprintResults results = new FingerprintResults(f, null, specificSample);
for (final Fingerprint expectedFp : expectedFingerprints) {
final MatchResults result = calculateMatchResults(combinedFp, expectedFp, 0, pLossofHet);
results.addResults(result);
@@ -541,7 +537,7 @@ public class FingerprintChecker {
} else {
for (final SAMReadGroupRecord rg : fingerprintsByReadGroup.keySet()) {
- final FingerprintResults results = new FingerprintResults(f, rg.getPlatformUnit());
+ final FingerprintResults results = new FingerprintResults(f, rg.getPlatformUnit(), specificSample);
for (final Fingerprint expectedFp : expectedFingerprints) {
final MatchResults result = calculateMatchResults(fingerprintsByReadGroup.get(rg), expectedFp, 0, pLossofHet);
results.addResults(result);
@@ -556,6 +552,51 @@ public class FingerprintChecker {
}
/**
+ * Top level method to take a set of one or more observed genotype (VCF) files and one or more expected genotype (VCF) files and compare
+ * one or more samples in the observed genotype file with one or more in the expected file and generate results for each set.
+ *
+ * @param observedGenotypeFiles The list of genotype files containing observed calls, from which to pull fingerprint genotypes
+ * @param expectedGenotypeFiles The list of genotype files containing expected calls, from which to pull fingerprint genotypes
+ * @param observedSample an optional single sample whose genotypes to load from the observed genotype file (if null, use all)
+ * @param expectedSample an optional single sample whose genotypes to load from the expected genotype file (if null, use all)
+ */
+ public List<FingerprintResults> checkFingerprints(final List<File> observedGenotypeFiles,
+ final List<File> expectedGenotypeFiles,
+ final String observedSample,
+ final String expectedSample) {
+
+ // Load the expected fingerprint genotypes
+ final List<Fingerprint> expectedFingerprints = new ArrayList<>();
+ for (final File f : expectedGenotypeFiles) {
+ expectedFingerprints.addAll(loadFingerprints(f, expectedSample).values());
+ }
+
+ if (expectedFingerprints.isEmpty()) {
+ throw new IllegalStateException("Could not find any fingerprints in: " + expectedGenotypeFiles);
+ }
+
+ final List<FingerprintResults> resultsList = new ArrayList<>();
+
+ for (final File f : observedGenotypeFiles) {
+ final Map<String, Fingerprint> observedFingerprintsBySample = loadFingerprints(f, observedSample);
+ if (observedFingerprintsBySample.isEmpty()) {
+ throw new IllegalStateException("Found no fingerprints in observed genotypes file: " + observedGenotypeFiles);
+ }
+
+ for (final String sample : observedFingerprintsBySample.keySet()) {
+ final FingerprintResults results = new FingerprintResults(f, null, sample);
+ for (Fingerprint expectedFp : expectedFingerprints) {
+ final MatchResults result = calculateMatchResults(observedFingerprintsBySample.get(sample), expectedFp, 0, pLossofHet);
+ results.addResults(result);
+ }
+ resultsList.add(results);
+ }
+ }
+ return resultsList;
+ }
+
+
+ /**
* Compares two fingerprints and calculates a MatchResults object which contains detailed
* information about the match (or mismatch) between fingerprints including the LOD score
* for whether or not the two are likely from the same sample.
diff --git a/src/main/java/picard/fingerprint/FingerprintResults.java b/src/main/java/picard/fingerprint/FingerprintResults.java
index 5d6b0bc..a4943b3 100644
--- a/src/main/java/picard/fingerprint/FingerprintResults.java
+++ b/src/main/java/picard/fingerprint/FingerprintResults.java
@@ -29,27 +29,31 @@ import java.util.SortedSet;
import java.util.TreeSet;
/**
- * Class that is used to represent the results of comparing a read group within a SAM file
- * against one or more set of fingerprint genotypes.
+ * Class that is used to represent the results of comparing a read group within a SAM file, or a sample
+ * within a VCF against one or more sets of fingerprint genotypes.
*
* @author Tim Fennell
*/
public class FingerprintResults {
- private final File samFile;
- private final String readGroup;
+ private final File inputFile;
+ private final String readGroup; // null if the input is a VCF.
+ private final String sampleAlias;
private final SortedSet<MatchResults> matchResults = new TreeSet<>();
- public FingerprintResults(final File samFile, final String readGroup) {
- this.samFile = samFile;
+ public FingerprintResults(final File inputFile, final String readGroup, final String sampleAlias) {
+ this.inputFile = inputFile;
this.readGroup = readGroup;
+ this.sampleAlias = sampleAlias;
}
public void addResults(final MatchResults matchResults) {
this.matchResults.add(matchResults);
}
- public File getSamFile() { return samFile; }
+ public File getInputFile() { return inputFile; }
public String getReadGroup() { return readGroup; }
+ public String getSampleAlias() { return sampleAlias; }
+
public SortedSet<MatchResults> getMatchResults() { return matchResults; }
}
diff --git a/src/main/java/picard/illumina/parser/readers/BclReader.java b/src/main/java/picard/illumina/parser/readers/BclReader.java
index 777a565..06c6b80 100644
--- a/src/main/java/picard/illumina/parser/readers/BclReader.java
+++ b/src/main/java/picard/illumina/parser/readers/BclReader.java
@@ -208,7 +208,7 @@ public class BclReader implements CloseableIterator<BclData> {
final long elementsInFile = file.length() - HEADER_SIZE;
if (numClusters != elementsInFile) {
CloserUtil.close(stream);
- throw new PicardException("Expected " + numClusters + " in file but found " + elementsInFile);
+ throw new PicardException("Expected " + numClusters + " in file " + file.getAbsolutePath() + " but found " + elementsInFile);
}
}
diff --git a/src/main/java/picard/sam/AbstractAlignmentMerger.java b/src/main/java/picard/sam/AbstractAlignmentMerger.java
index 1797d52..fbad3a4 100644
--- a/src/main/java/picard/sam/AbstractAlignmentMerger.java
+++ b/src/main/java/picard/sam/AbstractAlignmentMerger.java
@@ -45,7 +45,6 @@ import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringIterator;
-import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CigarUtil;
@@ -73,7 +72,7 @@ import java.util.List;
* 2. Merge the alignment information and public tags ONLY from the aligned SAMRecords
* 3. Do additional modifications -- handle clipping, trimming, etc.
* 4. Fix up mate information on paired reads
- * 5. Do a final calculation of the NM and UQ tags.
+ * 5. Do a final calculation of the NM and UQ tags (coordinate sorted only)
* 6. Write the records to the output file.
* <p/>
* Concrete subclasses which extend AbstractAlignmentMerger should implement getQueryNameSortedAlignedRecords.
@@ -477,7 +476,7 @@ public abstract class AbstractAlignmentMerger {
for (final SAMRecord rec : sink.sorter) {
if (!rec.getReadUnmappedFlag() && refSeq != null) {
- fixNMandUQ(rec, refSeq, bisulfiteSequence);
+ fixNmMdAndUq(rec, refSeq, bisulfiteSequence);
}
writer.addAlignment(rec);
finalProgress.record(rec);
@@ -490,7 +489,7 @@ public abstract class AbstractAlignmentMerger {
log.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}
- /** Calculates and sets the NM and UQ tags from the record and the reference
+ /** Calculates and sets the NM, MD, and UQ tags from the record and the reference
*
* @param record the record to be fixed
* @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
@@ -499,10 +498,13 @@ public abstract class AbstractAlignmentMerger {
*
* No return value, modifies the provided record.
*/
- public static void fixNMandUQ(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
+ public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
final byte[] referenceBases = refSeqWalker.get(refSeqWalker.getSequenceDictionary().getSequenceIndex(record.getReferenceName())).getBases();
- record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
-
+ // only recalculate NM if it isn't bisulfite, since it needs to be treated specially below
+ SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence);
+ if (isBisulfiteSequence) { // recalculate the NM tag for bisulfite data
+ record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
+ }
if (record.getBaseQualities() != SAMRecord.NULL_QUALS) {
record.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(record, referenceBases, 0, isBisulfiteSequence));
}
@@ -605,11 +607,13 @@ public abstract class AbstractAlignmentMerger {
if (posDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(pos, Math.min(pos.getReadLength(),
pos.getReadLength() - posDiff + 1));
+ removeNmMdAndUqTags(pos); // these tags are now invalid!
}
if (negDiff > 0) {
CigarUtil.softClip3PrimeEndOfRead(neg, Math.min(neg.getReadLength(),
neg.getReadLength() - negDiff + 1));
+ removeNmMdAndUqTags(neg); // these tags are now invalid!
}
}
@@ -736,6 +740,7 @@ public abstract class AbstractAlignmentMerger {
// If the adapter sequence is marked and clipAdapter is true, clip it
if (this.clipAdapters && rec.getAttribute(ReservedTagConstants.XT) != null) {
CigarUtil.softClip3PrimeEndOfRead(rec, rec.getIntegerAttribute(ReservedTagConstants.XT));
+ removeNmMdAndUqTags(rec); // these tags are now invalid!
}
}
@@ -797,4 +802,15 @@ public abstract class AbstractAlignmentMerger {
public void close() {
CloserUtil.close(this.refSeq);
}
+
+
+ /** Removes the NM, MD, and UQ tags. This is useful if we modify the read and are not able to recompute these tags,
+ * for example when no reference is available.
+ * @param rec the record to modify.
+ */
+ private static void removeNmMdAndUqTags(final SAMRecord rec) {
+ rec.setAttribute(SAMTag.NM.name(), null);
+ rec.setAttribute(SAMTag.MD.name(), null);
+ rec.setAttribute(SAMTag.UQ.name(), null);
+ }
}
diff --git a/src/main/java/picard/sam/DuplicationMetrics.java b/src/main/java/picard/sam/DuplicationMetrics.java
index 7544031..e668ba8 100644
--- a/src/main/java/picard/sam/DuplicationMetrics.java
+++ b/src/main/java/picard/sam/DuplicationMetrics.java
@@ -24,62 +24,84 @@
package picard.sam;
-import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.util.Histogram;
+import picard.analysis.MergeableMetricBase;
/**
* Metrics that are calculated during the process of marking duplicates
* within a stream of SAMRecords.
*/
-public class DuplicationMetrics extends MetricBase {
+public class DuplicationMetrics extends MergeableMetricBase {
/** The library on which the duplicate marking was performed. */
+ @MergeByAssertEquals
public String LIBRARY;
/**
* The number of mapped reads examined which did not have a mapped mate pair,
* either because the read is unpaired, or the read is paired to an unmapped mate.
*/
+ @MergeByAdding
public long UNPAIRED_READS_EXAMINED;
/** The number of mapped read pairs examined. (Primary, non-supplemental) */
+ @MergeByAdding
public long READ_PAIRS_EXAMINED;
/** The number of reads that were either secondary or supplementary */
+ @MergeByAdding
public long SECONDARY_OR_SUPPLEMENTARY_RDS;
/** The total number of unmapped reads examined. (Primary, non-supplemental) */
+ @MergeByAdding
public long UNMAPPED_READS;
/** The number of fragments that were marked as duplicates. */
+ @MergeByAdding
public long UNPAIRED_READ_DUPLICATES;
/** The number of read pairs that were marked as duplicates. */
+ @MergeByAdding
public long READ_PAIR_DUPLICATES;
/**
* The number of read pairs duplicates that were caused by optical duplication.
* Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source.
*/
+ @MergeByAdding
public long READ_PAIR_OPTICAL_DUPLICATES;
/** The fraction of mapped sequence that is marked as duplicate. */
+ @NoMergingIsDerived
public Double PERCENT_DUPLICATION;
/** The estimated number of unique molecules in the library based on PE duplication. */
+ @NoMergingIsDerived
public Long ESTIMATED_LIBRARY_SIZE;
/**
* Fills in the ESTIMATED_LIBRARY_SIZE based on the paired read data examined where
* possible and the PERCENT_DUPLICATION.
*/
- public void calculateDerivedMetrics() {
+ @Override
+ public void calculateDerivedFields() {
this.ESTIMATED_LIBRARY_SIZE = estimateLibrarySize(this.READ_PAIRS_EXAMINED - this.READ_PAIR_OPTICAL_DUPLICATES,
- this.READ_PAIRS_EXAMINED - this.READ_PAIR_DUPLICATES);
+ this.READ_PAIRS_EXAMINED - this.READ_PAIR_DUPLICATES);
PERCENT_DUPLICATION = (UNPAIRED_READ_DUPLICATES + READ_PAIR_DUPLICATES *2) /(double) (UNPAIRED_READS_EXAMINED + READ_PAIRS_EXAMINED *2);
}
/**
+ * Fills in the ESTIMATED_LIBRARY_SIZE based on the paired read data examined where
+ * possible and the PERCENT_DUPLICATION.
+ *
+ * Deprecated, use {@link #calculateDerivedFields()} instead.
+ */
+ @Deprecated
+ public void calculateDerivedMetrics() {
+ this.calculateDerivedFields();
+ }
+
+ /**
* Estimates the size of a library based on the number of paired end molecules observed
* and the number of unique pairs observed.
*
@@ -149,7 +171,7 @@ public class DuplicationMetrics extends MetricBase {
public Histogram<Double> calculateRoiHistogram() {
if (ESTIMATED_LIBRARY_SIZE == null) {
try {
- calculateDerivedMetrics();
+ calculateDerivedFields();
if (ESTIMATED_LIBRARY_SIZE == null) return null;
}
catch (IllegalStateException ise) { return null; }
@@ -171,7 +193,7 @@ public class DuplicationMetrics extends MetricBase {
DuplicationMetrics m = new DuplicationMetrics();
m.READ_PAIRS_EXAMINED = Integer.parseInt(args[0]);
m.READ_PAIR_DUPLICATES = Integer.parseInt(args[1]);
- m.calculateDerivedMetrics();
+ m.calculateDerivedFields();
System.out.println("Percent Duplication: " + m.PERCENT_DUPLICATION);
System.out.println("Est. Library Size : " + m.ESTIMATED_LIBRARY_SIZE);
System.out.println();
@@ -180,27 +202,5 @@ public class DuplicationMetrics extends MetricBase {
for (Histogram.Bin<Double> bin : m.calculateRoiHistogram().values()) {
System.out.println(bin.getId() + "\t" + bin.getValue());
}
-
-// DuplicationMetrics m = new DuplicationMetrics();
-// m.READ_PAIRS_EXAMINED = Long.parseLong(args[0]);
-// m.READ_PAIR_DUPLICATES = Long.parseLong(args[1]);
-// final long UNIQUE_READ_PAIRS = m.READ_PAIRS_EXAMINED - m.READ_PAIR_DUPLICATES;
-// final double xCoverage = Double.parseDouble(args[2]);
-// final double uniqueXCoverage = xCoverage * ((double) UNIQUE_READ_PAIRS / (double) m.READ_PAIRS_EXAMINED);
-// final double oneOverCoverage = 1 / xCoverage;
-//
-// m.calculateDerivedMetrics();
-// System.out.println("Percent Duplication: " + m.PERCENT_DUPLICATION);
-// System.out.println("Est. Library Size : " + m.ESTIMATED_LIBRARY_SIZE);
-// System.out.println();
-//
-//
-// System.out.println("Coverage\tUnique Coverage\tDuplication");
-// for (double d = oneOverCoverage; (int) (d*xCoverage)<=50; d+=oneOverCoverage) {
-// double coverage = d * xCoverage;
-// double uniqueCoverage = uniqueXCoverage * m.estimateRoi(m.ESTIMATED_LIBRARY_SIZE, d, m.READ_PAIRS_EXAMINED, UNIQUE_READ_PAIRS);
-// double duplication = (coverage - uniqueCoverage) / coverage;
-// System.out.println(coverage + "\t" + uniqueCoverage + "\t" + duplication);
-// }
}
}
diff --git a/src/main/java/picard/sam/FastqToSam.java b/src/main/java/picard/sam/FastqToSam.java
index 2b8bd84..4d89d85 100644
--- a/src/main/java/picard/sam/FastqToSam.java
+++ b/src/main/java/picard/sam/FastqToSam.java
@@ -42,6 +42,7 @@ import htsjdk.samtools.util.Iso8601Date;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.QualityEncodingDetector;
+import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SolexaQualityConverter;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
@@ -145,7 +146,8 @@ public class FastqToSam extends CommandLineProgram {
@Option(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.")
public int MAX_Q = SAMUtils.MAX_PHRED_SCORE;
- @Option(doc="If true and this is an unpaired fastq any occurrence of '/1' will be removed from the end of a read name.")
+ @Deprecated
+ @Option(doc="Deprecated (No longer used). If true and this is an unpaired fastq any occurrence of '/1' or '/2' will be removed from the end of a read name.")
public Boolean STRIP_UNPAIRED_MATE_NUMBER = false;
@Option(doc="Allow (and ignore) empty lines")
@@ -314,7 +316,7 @@ public class FastqToSam extends CommandLineProgram {
final ProgressLogger progress = new ProgressLogger(LOG);
for ( ; freader.hasNext() ; readCount++) {
final FastqRecord frec = freader.next();
- final SAMRecord srec = createSamRecord(writer.getFileHeader(), getReadName(frec.getReadHeader(), false) , frec, false) ;
+ final SAMRecord srec = createSamRecord(writer.getFileHeader(), SequenceUtil.getSamReadNameFromFastqHeader(frec.getReadHeader()) , frec, false) ;
srec.setReadPairedFlag(false);
writer.addAlignment(srec);
progress.record(srec);
@@ -331,8 +333,8 @@ public class FastqToSam extends CommandLineProgram {
final FastqRecord frec1 = freader1.next();
final FastqRecord frec2 = freader2.next();
- final String frec1Name = getReadName(frec1.getReadHeader(), true);
- final String frec2Name = getReadName(frec2.getReadHeader(), true);
+ final String frec1Name = SequenceUtil.getSamReadNameFromFastqHeader(frec1.getReadHeader());
+ final String frec2Name = SequenceUtil.getSamReadNameFromFastqHeader(frec2.getReadHeader());
final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2);
final SAMRecord srec1 = createSamRecord(writer.getFileHeader(), baseName, frec1, true) ;
@@ -487,22 +489,6 @@ public class FastqToSam extends CommandLineProgram {
return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath();
}
- // Read names cannot contain blanks
- private String getReadName(final String fastqHeader, final boolean paired) {
- final int idx = fastqHeader.indexOf(' ');
- String readName = (idx == -1) ? fastqHeader : fastqHeader.substring(0,idx);
-
- // NOTE: the while loop isn't necessarily the most efficient way to handle this but we don't
- // expect this to ever happen more than once, just trapping pathological cases
- while (STRIP_UNPAIRED_MATE_NUMBER && !paired && (readName.endsWith("/1") || readName.endsWith("/2"))) {
- // If this is an unpaired run we want to make sure that "/1" isn't tacked on the end of the read name,
- // as this can cause problems down the road in MergeBamAlignment
- readName = readName.substring(0, readName.length() - 2);
- }
-
- return readName;
- }
-
@Override
protected String[] customCommandLineValidation() {
if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"};
diff --git a/src/main/java/picard/sam/MergeBamAlignment.java b/src/main/java/picard/sam/MergeBamAlignment.java
index 18a54f8..b51b166 100644
--- a/src/main/java/picard/sam/MergeBamAlignment.java
+++ b/src/main/java/picard/sam/MergeBamAlignment.java
@@ -58,7 +58,9 @@ public class MergeBamAlignment extends CommandLineProgram {
" The purpose of this tool is to use information from the unmapped BAM to fix up aligner output. The resulting file will be valid " +
"for use by other Picard tools. For simple BAM file merges, use MergeSamFiles. Note that MergeBamAlignment expects to " +
"find a sequence dictionary in the same directory as REFERENCE_SEQUENCE and expects it " +
- "to have the same base name as the reference FASTA except with the extension \".dict\". " +
+ "to have the same base name as the reference FASTA except with the extension \".dict\". If " +
+ "the output sort order is not coordinate, then reads that are clipped due to adapters or overlapping " +
+ "will not contain the NM, MD, or UQ tags." +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar MergeBamAlignment \\<br /> " +
diff --git a/src/main/java/picard/sam/RevertSam.java b/src/main/java/picard/sam/RevertSam.java
index c2d0820..00fff44 100644
--- a/src/main/java/picard/sam/RevertSam.java
+++ b/src/main/java/picard/sam/RevertSam.java
@@ -108,6 +108,7 @@ public class RevertSam extends CommandLineProgram {
"</pre>" +
"Will output a BAM/SAM file per read group. By default, all outputs will be in BAM format. " +
"However, outputs will be in SAM format if the input path ends with '.sam', or CRAM format if it ends with '.cram'." +
+ " This behaviour can be overridden with the OUTPUT_BY_READGROUP_FILE_FORMAT option."+
"<hr />";
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM/BAM file to revert the state of.")
public File INPUT;
@@ -121,6 +122,10 @@ public class RevertSam extends CommandLineProgram {
@Option(shortName = "OBR", doc = "When true, outputs each read group in a separate file.")
public boolean OUTPUT_BY_READGROUP = false;
+ public static enum FileType {sam, bam, cram,dynamic}
+ @Option(shortName = "OBRFF", doc = "When using OUTPUT_BY_READGROUP, the output file format can be set to a certain format." )
+ public FileType OUTPUT_BY_READGROUP_FILE_FORMAT=FileType.dynamic;
+
@Option(shortName = "SO", doc = "The sort order to create the reverted output file with.")
public SortOrder SORT_ORDER = SortOrder.queryname;
@@ -209,7 +214,14 @@ public class RevertSam extends CommandLineProgram {
final Map<String, File> outputMap;
final Map<String, SAMFileHeader> headerMap;
if (OUTPUT_BY_READGROUP) {
- final String defaultExtension = getDefaultExtension(INPUT.toString());
+
+ final String defaultExtension;
+ if (OUTPUT_BY_READGROUP_FILE_FORMAT==FileType.dynamic) {
+ defaultExtension = getDefaultExtension(INPUT.toString());
+ } else {
+ defaultExtension= "." + OUTPUT_BY_READGROUP_FILE_FORMAT.toString();
+ }
+
outputMap = createOutputMap(OUTPUT_MAP, OUTPUT, defaultExtension, inHeader.getReadGroups());
ValidationUtil.assertAllReadGroupsMapped(outputMap, inHeader.getReadGroups());
headerMap = createHeaderMap(inHeader, SORT_ORDER, REMOVE_ALIGNMENT_INFORMATION);
diff --git a/src/main/java/picard/sam/SetNmAndUqTags.java b/src/main/java/picard/sam/SetNmAndUqTags.java
index 8a1b9c5..46decad 100644
--- a/src/main/java/picard/sam/SetNmAndUqTags.java
+++ b/src/main/java/picard/sam/SetNmAndUqTags.java
@@ -21,43 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
+
package picard.sam;
-import htsjdk.samtools.SAMException;
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMFileWriter;
-import htsjdk.samtools.SAMFileWriterFactory;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SamReader;
-import htsjdk.samtools.SamReaderFactory;
-import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
-import htsjdk.samtools.util.CloserUtil;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.ProgressLogger;
-import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
-import picard.cmdline.Option;
-import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
-import java.io.File;
-import java.util.stream.StreamSupport;
-
/**
* @author Yossi Farjoun
*/
+@Deprecated
@CommandLineProgramProperties(
- usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmAndUqTags.USAGE_DETAILS,
+ usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmMdAndUqTags.USAGE_DETAILS,
usageShort = SetNmAndUqTags.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
-public class SetNmAndUqTags extends CommandLineProgram {
- static final String USAGE_SUMMARY = "Fixes the UQ and NM tags in a SAM file. ";
- static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM and UQ tags by comparing with the reference."+
+public class SetNmAndUqTags extends SetNmMdAndUqTags {
+ static final String USAGE_SUMMARY = "DEPRECATED: Use SetNmMdAndUqTags instead.";
+ static final String USAGE_DETAILS = "DEPRECATED: Use SetNmMdAndUqTags instead. This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+
"<br />" +
- "This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
- "these tags then.<br />"+
+ "This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
+ "these tags then.<br />"+
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar SetNmAndUqTags \\<br />" +
@@ -65,50 +49,5 @@ public class SetNmAndUqTags extends CommandLineProgram {
" O=fixed.bam \\<br />"+
"</pre>" +
"<hr />";
- @Option(doc = "The BAM or SAM file to fix.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
- public File INPUT;
-
- @Option(doc = "The fixed BAM or SAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
- public File OUTPUT;
-
- @Option(doc = "Whether the file contains bisulfite sequence (used when calculating the NM tag).")
- public boolean IS_BISULFITE_SEQUENCE = false;
-
- @Override
- protected String[] customCommandLineValidation() {
- if (REFERENCE_SEQUENCE == null) {
- return new String[]{"Must have a non-null REFERENCE_SEQUENCE"};
- }
- return super.customCommandLineValidation();
- }
-
- private final Log log = Log.getInstance(SetNmAndUqTags.class);
-
- public static void main(final String[] argv) {
- new SetNmAndUqTags().instanceMainWithExit(argv);
- }
-
- protected int doWork() {
- IOUtil.assertFileIsReadable(INPUT);
- IOUtil.assertFileIsWritable(OUTPUT);
- final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
-
- if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
- throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder());
- }
-
- final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
- writer.setProgressLogger(
- new ProgressLogger(log, (int) 1e7, "Wrote", "records"));
-
- final ReferenceSequenceFileWalker refSeq = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
-
- StreamSupport.stream(reader.spliterator(),false)
- .peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNMandUQ(rec, refSeq, IS_BISULFITE_SEQUENCE);})
- .forEach(writer::addAlignment);
-
- CloserUtil.close(reader);
- writer.close();
- return 0;
- }
}
+
diff --git a/src/main/java/picard/sam/SetNmAndUqTags.java b/src/main/java/picard/sam/SetNmMdAndUqTags.java
similarity index 85%
copy from src/main/java/picard/sam/SetNmAndUqTags.java
copy to src/main/java/picard/sam/SetNmMdAndUqTags.java
index 8a1b9c5..fa52e14 100644
--- a/src/main/java/picard/sam/SetNmAndUqTags.java
+++ b/src/main/java/picard/sam/SetNmMdAndUqTags.java
@@ -27,7 +27,6 @@ import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
-import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
@@ -48,19 +47,19 @@ import java.util.stream.StreamSupport;
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
- usage = SetNmAndUqTags.USAGE_SUMMARY + SetNmAndUqTags.USAGE_DETAILS,
- usageShort = SetNmAndUqTags.USAGE_SUMMARY,
+ usage = SetNmMdAndUqTags.USAGE_SUMMARY + SetNmMdAndUqTags.USAGE_DETAILS,
+ usageShort = SetNmMdAndUqTags.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
-public class SetNmAndUqTags extends CommandLineProgram {
- static final String USAGE_SUMMARY = "Fixes the UQ and NM tags in a SAM file. ";
- static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM and UQ tags by comparing with the reference."+
+public class SetNmMdAndUqTags extends CommandLineProgram {
+ static final String USAGE_SUMMARY = "Fixes the NM, MD, and UQ tags in a SAM file. ";
+ static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+
"<br />" +
"This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+
"these tags then.<br />"+
"<h4>Usage example:</h4>" +
"<pre>" +
- "java -jar picard.jar SetNmAndUqTags \\<br />" +
+ "java -jar picard.jar SetNmMdAndUqTags \\<br />" +
" I=sorted.bam \\<br />" +
" O=fixed.bam \\<br />"+
"</pre>" +
@@ -82,10 +81,10 @@ public class SetNmAndUqTags extends CommandLineProgram {
return super.customCommandLineValidation();
}
- private final Log log = Log.getInstance(SetNmAndUqTags.class);
+ private final Log log = Log.getInstance(SetNmMdAndUqTags.class);
public static void main(final String[] argv) {
- new SetNmAndUqTags().instanceMainWithExit(argv);
+ new SetNmMdAndUqTags().instanceMainWithExit(argv);
}
protected int doWork() {
@@ -103,8 +102,8 @@ public class SetNmAndUqTags extends CommandLineProgram {
final ReferenceSequenceFileWalker refSeq = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
- StreamSupport.stream(reader.spliterator(),false)
- .peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNMandUQ(rec, refSeq, IS_BISULFITE_SEQUENCE);})
+ StreamSupport.stream(reader.spliterator(),false)
+ .peek(rec->{if(!rec.getReadUnmappedFlag()) AbstractAlignmentMerger.fixNmMdAndUq(rec, refSeq, IS_BISULFITE_SEQUENCE);})
.forEach(writer::addAlignment);
CloserUtil.close(reader);
diff --git a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
index 4bf41b0..24b4073 100644
--- a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
+++ b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
@@ -588,7 +588,7 @@ public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCom
}
}
- metrics.calculateDerivedMetrics();
+ metrics.calculateDerivedFields();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
index e343c9e..0141c4b 100644
--- a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
+++ b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -86,10 +86,12 @@ public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
"duplicate. To do this, a new tag called the duplicate type (DT) tag was recently added as an optional output in " +
"the 'optional field' section of a SAM/BAM file. Invoking the TAGGING_POLICY option," +
" you can instruct the program to mark all the duplicates (All), only the optical duplicates (OpticalOnly), or no " +
- "duplicates (DontTag). This tool uses the READ_NAME_REGEX and the OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the primary " +
- "methods to identify and differentiate duplicate types. The records within the output of a SAM/BAM file will have values " +
- "for the 'DT' tag (depending on the invoked TAGGING_POLICY), as either library/PCR-generated duplicates (LB), or " +
- "sequencing-platform artifact duplicates (SQ).</p> "+
+ "duplicates (DontTag). The records within the output of a SAM/BAM file will have values for the 'DT' tag (depending on the invoked " +
+ "TAGGING_POLICY), as either library/PCR-generated duplicates (LB), or sequencing-platform artifact duplicates (SQ). " +
+ "This tool uses the READ_NAME_REGEX and the OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the primary methods to identify " +
+ "and differentiate duplicate types. Set READ_NAME_REGEX to null to skip optical duplicate detection, e.g. for RNA-seq " +
+ "or other data where duplicate sets are extremely large and estimating library complexity is not an aim. " +
+ "Note that without optical duplicate counts, library size estimation will be inaccurate.</p> "+
"<p>MarkDuplicates also produces a metrics file indicating the numbers of duplicates for both single- and paired-end reads.</p> "+
diff --git a/src/test/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java
similarity index 95%
rename from src/test/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java
rename to src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java
index 006dd33..e8fe4d0 100644
--- a/src/test/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java
+++ b/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java
@@ -32,10 +32,7 @@ import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordDuplicateComparator;
import htsjdk.samtools.SAMTag;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.IterableAdapter;
-import htsjdk.samtools.util.Log;
-import htsjdk.samtools.util.ProgressLogger;
+import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.programgroups.Testing;
@@ -120,10 +117,7 @@ public class SimpleMarkDuplicatesWithMateCigar extends MarkDuplicates {
final SAMRecordDuplicateComparator comparator = new SAMRecordDuplicateComparator(Collections.singletonList(headerAndIterator.header));
comparator.setScoringStrategy(this.DUPLICATE_SCORING_STRATEGY);
- final DuplicateSetIterator iterator = new DuplicateSetIterator(headerAndIterator.iterator,
- headerAndIterator.header,
- false,
- comparator);
+ final CloseableIterator<DuplicateSet> iterator = getDuplicateSetIterator(headerAndIterator, comparator);
// progress logger!
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
@@ -233,4 +227,11 @@ public class SimpleMarkDuplicatesWithMateCigar extends MarkDuplicates {
return 0;
}
-}
\ No newline at end of file
+
+ protected CloseableIterator<DuplicateSet> getDuplicateSetIterator(final SamHeaderAndIterator headerAndIterator, final SAMRecordDuplicateComparator comparator) {
+ return new DuplicateSetIterator(headerAndIterator.iterator,
+ headerAndIterator.header,
+ false,
+ comparator);
+ }
+}
diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java
new file mode 100644
index 0000000..ee1ddfa
--- /dev/null
+++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java
@@ -0,0 +1,124 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/**
+ * This acts as an iterator over duplicate sets. If a particular duplicate
+ * set consists of records that contain UMIs, this iterator breaks up a single
+ * duplicate set into multiple duplicate sets based on the content of the UMIs.
+ * Since there may be sequencing errors in the UMIs, this class allows for
+ * simple error correction based on edit distances between the UMIs.
+ *
+ * @author fleharty
+ */
+
+package picard.sam.markduplicates;
+
+import htsjdk.samtools.DuplicateSet;
+import htsjdk.samtools.DuplicateSetIterator;
+import htsjdk.samtools.util.CloseableIterator;
+import picard.PicardException;
+
+import java.util.*;
+
+/**
+ * UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator
+ * in such a way that each duplicate set may be broken up into subsets according
+ * to UMIs in the records. Some tolerance for errors in the UMIs is allowed, and
+ * the degree of this is controlled by the maxEditDistanceToJoin parameter.
+ */
+class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
+ private final DuplicateSetIterator wrappedIterator;
+ private Iterator<DuplicateSet> nextSetsIterator;
+ private final int maxEditDistanceToJoin;
+ private final String umiTag;
+ private final String inferredUmiTag;
+ private final boolean allowMissingUmis;
+ private boolean isOpen = false;
+
+ /**
+ * Creates a UMI aware duplicate set iterator
+ *
+ * @param wrappedIterator UMI aware duplicate set iterator is a wrapper
+ * @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups
+ * @param umiTag The tag used in the bam file that designates the UMI
+ * @param assignedUmiTag The tag in the bam file that designates the assigned UMI
+ */
+ UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin,
+ final String umiTag, final String assignedUmiTag, final boolean allowMissingUmis) {
+ this.wrappedIterator = wrappedIterator;
+ this.maxEditDistanceToJoin = maxEditDistanceToJoin;
+ this.umiTag = umiTag;
+ this.inferredUmiTag = assignedUmiTag;
+ this.allowMissingUmis = allowMissingUmis;
+ isOpen = true;
+ nextSetsIterator = Collections.emptyIterator();
+ }
+
+ @Override
+ public void close() {
+ isOpen = false;
+ wrappedIterator.close();
+ }
+
+ @Override
+ public boolean hasNext() {
+ if(!isOpen) {
+ return false;
+ }
+ else {
+ if(nextSetsIterator.hasNext() || wrappedIterator.hasNext()) {
+ return true;
+ }
+ else {
+ isOpen = false;
+ return false;
+ }
+ }
+ }
+
+ @Override
+ public DuplicateSet next() {
+ if (!nextSetsIterator.hasNext()) {
+ process(wrappedIterator.next());
+ }
+ return nextSetsIterator.next();
+ }
+
+ /**
+ * Takes a duplicate set and breaks it up into possible smaller sets according to the UMI,
+ * and updates nextSetsIterator to be an iterator on that set of DuplicateSets.
+ *
+ * @param set Duplicate set that may be broken up into subsets according the UMIs
+ */
+ private void process(final DuplicateSet set) {
+
+ // Ensure that the nextSetsIterator isn't already occupied
+ if (nextSetsIterator.hasNext()) {
+ throw new PicardException("nextSetsIterator is expected to be empty, but already contains data.");
+ }
+
+ final UmiGraph umiGraph = new UmiGraph(set, umiTag, inferredUmiTag, allowMissingUmis);
+ nextSetsIterator = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin).iterator();
+ }
+}
diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java
new file mode 100644
index 0000000..0821baf
--- /dev/null
+++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java
@@ -0,0 +1,89 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates;
+
+import htsjdk.samtools.DuplicateSet;
+import htsjdk.samtools.DuplicateSetIterator;
+import htsjdk.samtools.SAMRecordDuplicateComparator;
+import htsjdk.samtools.util.*;
+import picard.cmdline.CommandLineProgramProperties;
+import picard.cmdline.Option;
+import picard.cmdline.programgroups.Alpha;
+
+/**
+ * This is a simple tool to mark duplicates making use of UMIs in the reads.
+ *
+ * It makes use of the fact that duplicate sets with UMIs can be broken up into subsets based on
+ * information contained in the UMI. Since UMIs may contain sequencing errors, this tool allows
+ * for UMIs that are different but within a given edit distance to be considered to be part of the
+ * same duplicate set.
+ *
+ * Users should continue to use MarkDuplicates in general, the main motivation for this tool is to provide a way to
+ * mark duplicates using information from UMIs.
+ *
+ * @author fleharty
+ */
+@CommandLineProgramProperties(
+ usage = UmiAwareMarkDuplicatesWithMateCigar.USAGE_SUMMARY + UmiAwareMarkDuplicatesWithMateCigar.USAGE_DETAILS,
+ usageShort = UmiAwareMarkDuplicatesWithMateCigar.USAGE_SUMMARY,
+ programGroup = Alpha.class
+)
+public class UmiAwareMarkDuplicatesWithMateCigar extends SimpleMarkDuplicatesWithMateCigar {
+ static final String USAGE_SUMMARY = "Identifies duplicate reads using information from read positions and UMIs. " +
+ "All records are then written to the output file with the duplicate records flagged.";
+ static final String USAGE_DETAILS = "<p>UmiAwareMarkDuplicatesWithMateCigar locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are " +
+ "defined as originating from a single fragment of DNA. </p>" +
+ "<p>This tool identifies a duplicate set by assuming that all members of a duplicate set must have the same start and end position, " +
+ "and must also have sufficiently similar UMIs. Sufficiently similar is parameterized by MAX_EDIT_DISTANCE_TO_JOIN, which indicates " +
+ "the edit distance between UMIs that shall be considered to be part of the same original molecule.</p>" +
+ "<p>This tool is not intended to be used on data without UMIs; see MarkDuplicates for marking duplicates that " +
+ "do not have UMIs.</p>";
+
+ @Option(shortName = "MAX_EDIT_DISTANCE_TO_JOIN", doc = "Largest edit distance that UMIs may have in order to be considered as coming from the same source molecule.", optional = true)
+ public int MAX_EDIT_DISTANCE_TO_JOIN = 1;
+
+ @Option(shortName = "UMI_TAG_NAME", doc = "Tag name to use for UMI", optional = true)
+ public String UMI_TAG_NAME = "RX";
+
+ @Option(shortName = "ASSIGNED_UMI_TAG", doc = "Tag name to use for assigned UMI", optional = true)
+ public String ASSIGNED_UMI_TAG = "MI";
+
+ // Since we inherit from SimpleMarkDuplicatesWithMateCigar, it is useful for us to also inherit the tests
+ // which do not contain UMIs. By default, we don't allow for missing UMIs, but for the inherited tests
+ // we allow for missing UMIs.
+ @Option(doc = "Allow for missing UMIs if data doesn't have UMIs. This option is intended to be used only for testing the code. Use SimpleMarkDuplicatesWithMateCigar if data has missing UMIs.", optional = true)
+ public boolean ALLOW_MISSING_UMIS = false;
+
+ private final Log log = Log.getInstance(UmiAwareMarkDuplicatesWithMateCigar.class);
+
+ @Override
+ protected CloseableIterator<DuplicateSet> getDuplicateSetIterator(final SamHeaderAndIterator headerAndIterator, final SAMRecordDuplicateComparator comparator) {
+ return new UmiAwareDuplicateSetIterator(
+ new DuplicateSetIterator(headerAndIterator.iterator,
+ headerAndIterator.header,
+ false,
+ comparator), MAX_EDIT_DISTANCE_TO_JOIN, UMI_TAG_NAME, ASSIGNED_UMI_TAG, ALLOW_MISSING_UMIS);
+ }
+}
diff --git a/src/main/java/picard/sam/markduplicates/UmiGraph.java b/src/main/java/picard/sam/markduplicates/UmiGraph.java
new file mode 100644
index 0000000..12b2359
--- /dev/null
+++ b/src/main/java/picard/sam/markduplicates/UmiGraph.java
@@ -0,0 +1,218 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates;
+
+import htsjdk.samtools.DuplicateSet;
+import htsjdk.samtools.SAMRecord;
+import picard.PicardException;
+
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.util.stream.Collectors.counting;
+/**
+ * UmiGraph is used to identify UMIs that come from the same original source molecule. The assumption
+ * is that UMIs with small edit distances are likely to be read errors on the sequencer rather than
+ * distinct molecules.
+ *
+ * The algorithm used here is to join all pairs of UMIs that are within maxEditDistanceToJoin. It is possible
+ * for a set of UMIs A, B and C to all be considered as part of the same source molecule even if two of the UMIs
+ * have a Hamming distance larger than maxEditDistanceToJoin. Suppose A = "ATCC", B = "AACC", and C = "AACG"
+ * and maxEditDistanceToJoin = 1. In this case, A and B are 1 Hamming distance so they are joined, and B and C
+ * are 1 Hamming distance so they are joined. Because A and B are joined and because B and C are joined, this results
+ * in A and C being joined even though they have a distance of 2.
+ *
+ * @author fleharty
+ */
+public class UmiGraph {
+ private final List<SAMRecord> records; // SAMRecords from the original duplicate set considered to break up by UMI
+ private final Map<String, Long> umiCounts; // Map of UMI sequences and how many times they have been observed
+ private final int[] duplicateSetID; // ID of the duplicate set that the UMI belongs to, the index is the UMI ID
+ private final String[] umi; // Sequence of actual UMI, the index is the UMI ID
+ private final int numUmis; // Number of observed UMIs
+ private final String umiTag; // UMI tag used in the SAM/BAM/CRAM file ie. RX
+ private final String assignedUmiTag; // Assigned UMI tag used in the SAM/BAM/CRAM file ie. MI
+ private final boolean allowMissingUmis; // Allow for missing UMIs
+
+ public UmiGraph(DuplicateSet set, String umiTag, String assignedUmiTag, boolean allowMissingUmis) {
+ this.umiTag = umiTag;
+ this.assignedUmiTag = assignedUmiTag;
+ this.allowMissingUmis = allowMissingUmis;
+ records = set.getRecords();
+
+ // First ensure that all the reads have a UMI, if any reads are missing a UMI throw an exception unless allowMissingUmis is true
+ for (SAMRecord rec : records) {
+ if(rec.getStringAttribute(umiTag) == null) {
+ if(allowMissingUmis) {
+ rec.setAttribute(umiTag, "");
+ } else {
+ throw new PicardException("Read " + rec.getReadName() + " does not contain a UMI with the " + umiTag + " attribute.");
+ }
+ }
+ }
+
+ // Count the number of times each UMI occurs
+ umiCounts = records.stream().collect(Collectors.groupingBy(p -> p.getStringAttribute(umiTag), counting()));
+
+ // At first we consider every UMI as if it were its own duplicate set
+ numUmis = umiCounts.size();
+ umi = new String[numUmis];
+
+ duplicateSetID = IntStream.rangeClosed(0, numUmis-1).toArray();
+
+ int i = 0;
+ for (String key : umiCounts.keySet()) {
+ umi[i] = key;
+ i++;
+ }
+ }
+
+ // Part of Union-Find with Path Compression to determine the duplicate set a particular UMI belongs to.
+ private int findRepresentativeUmi(int umiID) {
+ int representativeUmi = umiID; // All UMIs of a duplicate set will have the same representativeUmi.
+ while (representativeUmi != duplicateSetID[representativeUmi]) {
+ representativeUmi = duplicateSetID[representativeUmi];
+ }
+ while (umiID != representativeUmi) {
+ int newUmiID = duplicateSetID[umiID];
+ duplicateSetID[umiID] = representativeUmi;
+ umiID = newUmiID;
+ }
+ return representativeUmi;
+ }
+
+ // Part of Union-Find with Path Compression that joins two UMIs into the same duplicate set.
+ private void joinUmisIntoDuplicateSet(final int umi1ID, final int umi2ID) {
+ int representativeUmi1 = findRepresentativeUmi(umi1ID);
+ int representativeUmi2 = findRepresentativeUmi(umi2ID);
+ if (representativeUmi1 == representativeUmi2) return;
+ duplicateSetID[representativeUmi1] = representativeUmi2;
+ }
+
+ List<DuplicateSet> joinUmisIntoDuplicateSets(final int maxEditDistanceToJoin) {
+ // Compare all UMIs to each other. If they are within maxEditDistanceToJoin
+ // join them to the same duplicate set using the union-find algorithm.
+ for (int i = 0; i < numUmis; i++) {
+ for (int j = i + 1; j < numUmis; j++) {
+ if (isWithinEditDistance(umi[i], umi[j], maxEditDistanceToJoin)) {
+ joinUmisIntoDuplicateSet(i, j);
+ }
+ }
+ }
+
+ // This ensures that all duplicate sets have unique IDs. During Union-Find a tree is constructed
+ // where each UMI points to parent UMI. This ensures that all UMIs that belong to the same duplicate
+ // set point to the same parent UMI. Note that the parent UMI is only used as a representative UMI and
+ // is not at all related to the assigned UMI.
+ for (int i = 0; i < numUmis; i++) {
+ duplicateSetID[i] = findRepresentativeUmi(i);
+ }
+
+ final Map<Integer, List<SAMRecord>> duplicateSets = new HashMap<>();
+
+ // Assign UMIs to duplicateSets
+ final Map<String, Integer> duplicateSetsFromUmis = getDuplicateSetsFromUmis();
+ for (SAMRecord rec : records) {
+ final String umi = rec.getStringAttribute(umiTag);
+ final Integer duplicateSetIndex = duplicateSetsFromUmis.get(umi);
+
+ if (duplicateSets.containsKey(duplicateSetIndex)) {
+ duplicateSets.get(duplicateSetIndex).add(rec);
+ }
+ else {
+ final List<SAMRecord> n = new ArrayList<>();
+ n.add(rec);
+ duplicateSets.put(duplicateSetIndex, n);
+ }
+ }
+
+ final List<DuplicateSet> duplicateSetList = new ArrayList<>();
+ for (final Map.Entry<Integer, List<SAMRecord>> entry : duplicateSets.entrySet()) {
+ final DuplicateSet ds = new DuplicateSet();
+ final List<SAMRecord> recordList = entry.getValue();
+
+ // Add records to the DuplicateSet
+ for (final SAMRecord rec : recordList) {
+ ds.add(rec);
+ }
+
+ // For a particular duplicate set, identify the most common UMI
+ // and use this as an assigned UMI.
+ long maxCount = 0;
+ String assignedUmi = null;
+ for (SAMRecord rec : recordList) {
+ final String umi = rec.getStringAttribute(umiTag);
+
+ if (umiCounts.get(umi) > maxCount) {
+ maxCount = umiCounts.get(umi);
+ assignedUmi = umi;
+ }
+ }
+
+ // Set the records to contain the assigned UMI
+ for (final SAMRecord rec : recordList) {
+ if (allowMissingUmis && rec.getStringAttribute(umiTag) == "") {
+ // The SAM spec doesn't support empty tags, so we set it to null if it is empty.
+ rec.setAttribute(umiTag, null);
+ } else {
+ rec.setAttribute(assignedUmiTag, assignedUmi);
+ }
+ }
+
+ duplicateSetList.add(ds);
+ }
+
+ return duplicateSetList;
+ }
+
+ // Determine if the two strings s1 and s2 are within edit distance of editDistance.
+ // TODO: use HTSJDK version when this becomes available
+ private boolean isWithinEditDistance(final String s1, final String s2, final int editDistance) {
+ // Comparing edit distance of strings with different lengths is not supported
+ if (s1.length() != s2.length()) {
+ throw new PicardException("Attempting to determine if two UMIs of different length were within a specified edit distance.");
+ }
+ int measuredDistance = 0;
+ for (int i = 0;i < s1.length();i++) {
+ if (s1.charAt(i) != s2.charAt(i)) {
+ measuredDistance++;
+ if (measuredDistance > editDistance) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ // Create a map that maps a umi to the duplicateSetID
+ private Map<String, Integer> getDuplicateSetsFromUmis() {
+ final Map<String, Integer> duplicateSetsFromUmis = new HashMap<>();
+ for (int i = 0; i < duplicateSetID.length; i++) {
+ duplicateSetsFromUmis.put(umi[i], duplicateSetID[i]);
+ }
+ return duplicateSetsFromUmis;
+ }
+}
diff --git a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java
index 669767d..5bf6972 100644
--- a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java
+++ b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java
@@ -176,7 +176,7 @@ public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractO
metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue();
}
}
- metrics.calculateDerivedMetrics();
+ metrics.calculateDerivedFields();
file.addMetric(metrics);
}
diff --git a/src/main/java/picard/sam/markduplicates/util/AbstractOpticalDuplicateFinderCommandLineProgram.java b/src/main/java/picard/sam/markduplicates/util/AbstractOpticalDuplicateFinderCommandLineProgram.java
index 0b66a24..b762398 100644
--- a/src/main/java/picard/sam/markduplicates/util/AbstractOpticalDuplicateFinderCommandLineProgram.java
+++ b/src/main/java/picard/sam/markduplicates/util/AbstractOpticalDuplicateFinderCommandLineProgram.java
@@ -40,7 +40,9 @@ public abstract class AbstractOpticalDuplicateFinderCommandLineProgram extends C
@Option(doc = "Regular expression that can be used to parse read names in the incoming SAM file. Read names are " +
"parsed to extract three variables: tile/region, x coordinate and y coordinate. These values are used " +
"to estimate the rate of optical duplication in order to give a more accurate estimated library size. " +
- "Set this option to null to disable optical duplicate detection. " +
+ "Set this option to null to disable optical duplicate detection, e.g. for RNA-seq " +
+ "or other data where duplicate sets are extremely large and estimating library complexity is not an aim. " +
+ "Note that without optical duplicate counts, library size estimation will be inaccurate. " +
"The regular expression should contain three capture groups for the three variables, in order. " +
"It must match the entire read name. " +
"Note that if the default regex is specified, a regex match is not actually done, but instead the read name " +
diff --git a/src/main/java/picard/util/DbSnpBitSetUtil.java b/src/main/java/picard/util/DbSnpBitSetUtil.java
index af0559c..b4e7097 100755
--- a/src/main/java/picard/util/DbSnpBitSetUtil.java
+++ b/src/main/java/picard/util/DbSnpBitSetUtil.java
@@ -26,18 +26,14 @@ package picard.util;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IntervalList;
+import htsjdk.samtools.util.Log;
+import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import picard.vcf.ByIntervalListVariantContextIterator;
import java.io.File;
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
/**
* Utility class to use with DbSnp files to determine is a locus is
@@ -90,7 +86,7 @@ public class DbSnpBitSetUtil {
if (dbSnpFile == null) throw new IllegalArgumentException("null dbSnpFile");
final Map<DbSnpBitSetUtil, Set<VariantType>> tmp = new HashMap<>();
tmp.put(this, EnumSet.copyOf(variantsToMatch));
- loadVcf(dbSnpFile, sequenceDictionary, tmp, intervals);
+ loadVcf(dbSnpFile, sequenceDictionary, tmp, intervals, Optional.empty());
}
/** Factory method to create both a SNP bitmask and an indel bitmask in a single pass of the VCF. */
@@ -104,6 +100,16 @@ public class DbSnpBitSetUtil {
public static DbSnpBitSets createSnpAndIndelBitSets(final File dbSnpFile,
final SAMSequenceDictionary sequenceDictionary,
final IntervalList intervals) {
+ return createSnpAndIndelBitSets(dbSnpFile, sequenceDictionary, intervals, Optional.empty());
+ }
+
+ /** Factory method to create both a SNP bitmask and an indel bitmask in a single pass of the VCF.
+ * If intervals are given, consider only SNP and indel sites that overlap the intervals. If log is given,
+ * progress loading the variants will be written to the log. */
+ public static DbSnpBitSets createSnpAndIndelBitSets(final File dbSnpFile,
+ final SAMSequenceDictionary sequenceDictionary,
+ final IntervalList intervals,
+ final Optional<Log> log) {
final DbSnpBitSets sets = new DbSnpBitSets();
sets.snps = new DbSnpBitSetUtil();
@@ -112,7 +118,7 @@ public class DbSnpBitSetUtil {
final Map<DbSnpBitSetUtil, Set<VariantType>> map = new HashMap<>();
map.put(sets.snps, EnumSet.of(VariantType.SNP));
map.put(sets.indels, EnumSet.of(VariantType.insertion, VariantType.deletion));
- loadVcf(dbSnpFile, sequenceDictionary, map, intervals);
+ loadVcf(dbSnpFile, sequenceDictionary, map, intervals, log);
return sets;
}
@@ -120,8 +126,10 @@ public class DbSnpBitSetUtil {
private static void loadVcf(final File dbSnpFile,
final SAMSequenceDictionary sequenceDictionary,
final Map<DbSnpBitSetUtil, Set<VariantType>> bitSetsToVariantTypes,
- final IntervalList intervals) {
+ final IntervalList intervals,
+ final Optional<Log> log) {
+ final Optional<ProgressLogger> progress = log.map(l -> new ProgressLogger(l, (int) 1e5, "Read", "variants"));
final VCFFileReader variantReader = new VCFFileReader(dbSnpFile, intervals != null);
final Iterator<VariantContext> variantIterator;
if (intervals != null) {
@@ -154,6 +162,7 @@ public class DbSnpBitSetUtil {
for (int i = kv.getStart(); i <= kv.getEnd(); i++) bits.set(i, true);
}
}
+ progress.ifPresent(p -> p.record(kv.getContig(), kv.getStart()));
}
CloserUtil.close(variantReader);
diff --git a/src/main/java/picard/vcf/CollectVariantCallingMetrics.java b/src/main/java/picard/vcf/CollectVariantCallingMetrics.java
index 33e0b93..49bc848 100644
--- a/src/main/java/picard/vcf/CollectVariantCallingMetrics.java
+++ b/src/main/java/picard/vcf/CollectVariantCallingMetrics.java
@@ -44,6 +44,7 @@ import picard.vcf.processor.VariantProcessor;
import java.io.File;
import java.util.Collection;
import java.util.HashSet;
+import java.util.Optional;
import java.util.Set;
/** Collects summary and per-sample metrics about variant calls in a VCF file. */
@@ -101,7 +102,7 @@ public class CollectVariantCallingMetrics extends CommandLineProgram {
final IntervalList targetIntervals = (TARGET_INTERVALS == null) ? null : IntervalList.fromFile(TARGET_INTERVALS).uniqued();
log.info("Loading dbSNP file ...");
- final DbSnpBitSetUtil.DbSnpBitSets dbsnp = DbSnpBitSetUtil.createSnpAndIndelBitSets(DBSNP, sequenceDictionary, targetIntervals);
+ final DbSnpBitSetUtil.DbSnpBitSets dbsnp = DbSnpBitSetUtil.createSnpAndIndelBitSets(DBSNP, sequenceDictionary, targetIntervals, Optional.of(log));
log.info("Starting iteration of variants.");
diff --git a/src/main/java/picard/vcf/filter/FilterVcf.java b/src/main/java/picard/vcf/filter/FilterVcf.java
index ee9798b..b94096a 100644
--- a/src/main/java/picard/vcf/filter/FilterVcf.java
+++ b/src/main/java/picard/vcf/filter/FilterVcf.java
@@ -89,7 +89,7 @@ public class FilterVcf extends CommandLineProgram {
+ " The script puts the following variables in the script context: "
+ " 'variant' a VariantContext ( https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/variantcontext/VariantContext.html ) and "
+ " 'header' a VCFHeader ( https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/vcf/VCFHeader.html )."
- + " Last value of the script should be a boolean to tell wether we should accept or reject the record.",
+ + " Last value of the script should be a boolean to tell whether we should accept or reject the record.",
optional = true)
public File JAVASCRIPT_FILE = null;
diff --git a/src/main/resources/picard/analysis/wgsHistogram.R b/src/main/resources/picard/analysis/wgsHistogram.R
index 6a05076..bdcbbdb 100644
--- a/src/main/resources/picard/analysis/wgsHistogram.R
+++ b/src/main/resources/picard/analysis/wgsHistogram.R
@@ -76,10 +76,10 @@ for (i in 1:2) {
percentOfMean <- coverage / meanCoverage; # x-axis
percentCovered <- rep(0, length(count)); # y-axis
- # must do a cumulative sume of percentCovered
+ # must do a cumulative sum of percentCovered
totalCount = sum(as.numeric(count));
for (j in 1:length(percentCovered)) {
- percentCovered[j] = 100.0# sum(as.numeric(count[j:length(percentCovered)])) / totalCount;
+ percentCovered[j] = sum(as.numeric(count[j:length(percentCovered)])) / totalCount;
}
ymin = percentCovered[round(meanCoverage+1)]
diff --git a/src/test/java/picard/analysis/CollectInsertSizeMetricsTest.java b/src/test/java/picard/analysis/CollectInsertSizeMetricsTest.java
index 295b458..6c7c972 100755
--- a/src/test/java/picard/analysis/CollectInsertSizeMetricsTest.java
+++ b/src/test/java/picard/analysis/CollectInsertSizeMetricsTest.java
@@ -54,7 +54,10 @@ public class CollectInsertSizeMetricsTest extends CommandLineProgramTest {
final String[] args = new String[] {
"INPUT=" + input.getAbsolutePath(),
"OUTPUT=" + outfile.getAbsolutePath(),
- "HISTOGRAM_FILE=" + pdf.getAbsolutePath()
+ "HISTOGRAM_FILE=" + pdf.getAbsolutePath(),
+ "LEVEL=SAMPLE",
+ "LEVEL=LIBRARY",
+ "LEVEL=READ_GROUP"
};
Assert.assertEquals(runPicardCommandLine(args), 0);
@@ -112,39 +115,38 @@ public class CollectInsertSizeMetricsTest extends CommandLineProgramTest {
Assert.assertEquals(metrics.WIDTH_OF_80_PERCENT, 9);
Assert.assertEquals(metrics.WIDTH_OF_90_PERCENT, 11);
Assert.assertEquals(metrics.WIDTH_OF_99_PERCENT, 11);
-
}
else if (metrics.LIBRARY.equals("Solexa-41734") && metrics.READ_GROUP == null) {
- Assert.assertEquals((int)metrics.MEDIAN_INSERT_SIZE, 26);
+ Assert.assertEquals((int)metrics.MEDIAN_INSERT_SIZE, 38);
Assert.assertEquals(metrics.MIN_INSERT_SIZE, 36);
Assert.assertEquals(metrics.MAX_INSERT_SIZE, 41);
- Assert.assertEquals(metrics.READ_PAIRS, 9);
- Assert.assertEquals(metrics.WIDTH_OF_10_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_20_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_30_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_40_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_50_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_60_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_70_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_80_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_90_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_99_PERCENT, 11);
+ Assert.assertEquals(metrics.READ_PAIRS, 2);
+ Assert.assertEquals(metrics.WIDTH_OF_10_PERCENT, 5);
+ Assert.assertEquals(metrics.WIDTH_OF_20_PERCENT, 5);
+ Assert.assertEquals(metrics.WIDTH_OF_30_PERCENT, 5);
+ Assert.assertEquals(metrics.WIDTH_OF_40_PERCENT, 5);
+ Assert.assertEquals(metrics.WIDTH_OF_50_PERCENT, 5);
+ Assert.assertEquals(metrics.WIDTH_OF_60_PERCENT, 7);
+ Assert.assertEquals(metrics.WIDTH_OF_70_PERCENT, 7);
+ Assert.assertEquals(metrics.WIDTH_OF_80_PERCENT, 7);
+ Assert.assertEquals(metrics.WIDTH_OF_90_PERCENT, 7);
+ Assert.assertEquals(metrics.WIDTH_OF_99_PERCENT, 7);
}
else if (metrics.READ_GROUP.equals("62A79AAXX100907.7")) {
- Assert.assertEquals((int)metrics.MEDIAN_INSERT_SIZE, 36);
+ Assert.assertEquals((int)metrics.MEDIAN_INSERT_SIZE, 37);
Assert.assertEquals(metrics.MIN_INSERT_SIZE, 36);
Assert.assertEquals(metrics.MAX_INSERT_SIZE, 41);
Assert.assertEquals(metrics.READ_PAIRS, 4);
- Assert.assertEquals(metrics.WIDTH_OF_10_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_20_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_30_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_40_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_50_PERCENT, 1);
- Assert.assertEquals(metrics.WIDTH_OF_60_PERCENT, 5);
- Assert.assertEquals(metrics.WIDTH_OF_70_PERCENT, 5);
- Assert.assertEquals(metrics.WIDTH_OF_80_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_90_PERCENT, 11);
- Assert.assertEquals(metrics.WIDTH_OF_99_PERCENT, 11);
+ Assert.assertEquals(metrics.WIDTH_OF_10_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_20_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_30_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_40_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_50_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_60_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_70_PERCENT, 3);
+ Assert.assertEquals(metrics.WIDTH_OF_80_PERCENT, 9);
+ Assert.assertEquals(metrics.WIDTH_OF_90_PERCENT, 9);
+ Assert.assertEquals(metrics.WIDTH_OF_99_PERCENT, 9);
}
else if (metrics.READ_GROUP.equals("62A79AAXX100907.6")) {
Assert.assertEquals((int)metrics.MEDIAN_INSERT_SIZE, 41);
diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTest.java b/src/test/java/picard/analysis/CollectWgsMetricsTest.java
index 87d5f93..518e066 100644
--- a/src/test/java/picard/analysis/CollectWgsMetricsTest.java
+++ b/src/test/java/picard/analysis/CollectWgsMetricsTest.java
@@ -233,12 +233,12 @@ public class CollectWgsMetricsTest extends CommandLineProgramTest {
setBuilder.setReadLength(10);
- int expectedSingltonCoverage = 0;
+ int expectedSingletonCoverage = 0;
- expectedSingltonCoverage += 13;
+ expectedSingletonCoverage += 13;
setBuilder.addPair("overlappingReads", 0, 2, 5, false, false, "10M", "10M", true, false, 30);
- expectedSingltonCoverage += 2 * 5; // 5 bases for each mate are good (see AAA!!!AA!! below).
+ expectedSingletonCoverage += 2 * 5; // 5 bases for each mate are good (see AAA!!!AA!! below).
setBuilder.addPair("poorQualityReads", 1, 2, 20, false, false, "10M", "10M", true, false, -1);
for(int i = 1; i < 5; i++) {
@@ -283,7 +283,7 @@ public class CollectWgsMetricsTest extends CommandLineProgramTest {
Assert.assertEquals((long) depthHistogram.getSumOfValues(), metrics.GENOME_TERRITORY);
Assert.assertEquals(baseQHistogram.getSumOfValues(), depthHistogram.getSum());
- Assert.assertEquals((long) depthHistogram.get(1).getValue(), expectedSingltonCoverage);
+ Assert.assertEquals((long) depthHistogram.get(1).getValue(), expectedSingletonCoverage);
Assert.assertEquals((long) depthHistogram.get(3).getValue(), 2*10);
}
diff --git a/src/test/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverageTest.java b/src/test/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverageTest.java
new file mode 100644
index 0000000..f82ad2e
--- /dev/null
+++ b/src/test/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverageTest.java
@@ -0,0 +1,127 @@
+package picard.analysis;
+
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.Histogram;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+import picard.analysis.CollectWgsMetricsWithNonZeroCoverage.*;
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+
+/**
+ * Tests for CollectWgsMetricsWithNonZeroCoverage.
+ */
+public class CollectWgsMetricsWithNonZeroCoverageTest extends CommandLineProgramTest {
+
+ private final static File TEST_DIR = new File("testdata/picard/sam/");
+
+ public String getCommandLineProgramName() {
+ return CollectWgsMetricsWithNonZeroCoverage.class.getSimpleName();
+ }
+
+ @Test
+ public void testWithoutIntervals() throws IOException {
+ final File input = new File(TEST_DIR, "forMetrics.sam");
+ final File outfile = File.createTempFile("test", ".wgs_metrics");
+ final File pdffile = File.createTempFile("test", ".wgs_metrics.pdf");
+ final File ref = new File(TEST_DIR, "merger.fasta");
+ final int sampleSize = 1000;
+ outfile.deleteOnExit();
+ pdffile.deleteOnExit();
+ final String[] args = new String[] {
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(),
+ "SAMPLE_SIZE=" + sampleSize,
+ "CHART=" + pdffile.getAbsolutePath()
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<WgsMetricsWithNonZeroCoverage , Integer> output = new MetricsFile<>();
+ output.read(new FileReader(outfile));
+
+ for (final WgsMetricsWithNonZeroCoverage metrics : output.getMetrics()) {
+ if (metrics.CATEGORY == WgsMetricsWithNonZeroCoverage.Category.WHOLE_GENOME) {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 1210);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403);
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149);
+ Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075);
+ Assert.assertEquals(metrics.PCT_1X, 0.107438);
+ } else {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 130);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403);
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149);
+ Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075);
+ Assert.assertEquals(metrics.PCT_1X, 1.0);
+ }
+ }
+
+ for (final Histogram<Integer> histogram : output.getAllHistograms()) {
+ if (histogram.getValueLabel().equals("count_WHOLE_GENOME")) {
+ Assert.assertEquals(histogram.get(0).getValue(), 1080d);
+ } else {
+ Assert.assertEquals(histogram.get(0).getValue(), 0d);
+ }
+ Assert.assertEquals(histogram.get(1).getValue(), 9d);
+ Assert.assertEquals(histogram.get(2).getValue(), 35d);
+ Assert.assertEquals(histogram.get(3).getValue(), 86d);
+ Assert.assertEquals(histogram.get(4).getValue(), 0d);
+ }
+ }
+
+ @Test
+ public void testWithIntervals() throws IOException {
+ final File input = new File(TEST_DIR, "forMetrics.sam");
+ final File outfile = File.createTempFile("test", ".wgs_metrics");
+ final File pdffile = File.createTempFile("test", ".wgs_metrics.pdf");
+ final File ref = new File(TEST_DIR, "merger.fasta");
+ final File intervals = new File(TEST_DIR, "largeIntervals.interval_list");
+ final int sampleSize = 1000;
+ outfile.deleteOnExit();
+ pdffile.deleteOnExit();
+ final String[] args = new String[] {
+ "INPUT=" + input.getAbsolutePath(),
+ "OUTPUT=" + outfile.getAbsolutePath(),
+ "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(),
+ "INTERVALS=" + intervals.getAbsolutePath(),
+ "SAMPLE_SIZE=" + sampleSize,
+ "CHART=" + pdffile.getAbsolutePath()
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile<WgsMetricsWithNonZeroCoverage , Integer> output = new MetricsFile<>();
+ output.read(new FileReader(outfile));
+
+ for (final WgsMetricsWithNonZeroCoverage metrics : output.getMetrics()) {
+ if (metrics.CATEGORY == WgsMetricsWithNonZeroCoverage.Category.WHOLE_GENOME) {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 404);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403);
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149);
+ Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075);
+ Assert.assertEquals(metrics.PCT_1X, 0.321782);
+ } else {
+ Assert.assertEquals(metrics.GENOME_TERRITORY, 130);
+ Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403);
+ Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149);
+ Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075);
+ Assert.assertEquals(metrics.PCT_1X, 1.0);
+ }
+ }
+
+ for (final Histogram<Integer> histogram : output.getAllHistograms()) {
+ if (histogram.getValueLabel().equals("count_WHOLE_GENOME")) {
+ Assert.assertEquals(histogram.get(0).getValue(), 274d);
+ } else {
+ Assert.assertEquals(histogram.get(0).getValue(), 0d);
+ }
+ Assert.assertEquals(histogram.get(1).getValue(), 9d);
+ Assert.assertEquals(histogram.get(2).getValue(), 35d);
+ Assert.assertEquals(histogram.get(3).getValue(), 86d);
+ Assert.assertEquals(histogram.get(4).getValue(), 0d);
+
+ }
+ }
+}
diff --git a/src/test/java/picard/analysis/replicates/MergeableMetricBaseTest.java b/src/test/java/picard/analysis/MergeableMetricBaseTest.java
similarity index 81%
rename from src/test/java/picard/analysis/replicates/MergeableMetricBaseTest.java
rename to src/test/java/picard/analysis/MergeableMetricBaseTest.java
index bdcc8ac..1fcbfa5 100644
--- a/src/test/java/picard/analysis/replicates/MergeableMetricBaseTest.java
+++ b/src/test/java/picard/analysis/MergeableMetricBaseTest.java
@@ -1,4 +1,29 @@
-package picard.analysis.replicates;
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 Fulcrum Genomics
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+package picard.analysis;
import org.testng.Assert;
import org.testng.annotations.Test;
diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java
index 4fa9dd2..66b4aa4 100644
--- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java
+++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java
@@ -188,7 +188,7 @@ public class TheoreticalSensitivityTest {
qualityDistribution[j] = scanBaseQ.nextDouble();
}
- final int sampleSize = 1000;
+ final int sampleSize = 1_000;
final double logOddsThreshold = 3.0;
final double result = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, logOddsThreshold);
Assert.assertEquals(result, expectedResult, tolerance);
@@ -201,17 +201,18 @@ public class TheoreticalSensitivityTest {
final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics");
final File wgsSampledMetricsFile = new File(TEST_DIR, "test_Solexa-316269_sampled.wgs_metrics");
+ //These magic numbers come from a separate implementation of the code in R.
return new Object[][] {
- {.9130, wgsMetricsFile},
- {.9784, hsMetricsFile},
- {.9562, targetedMetricsFile},
- {.9892, wgsSampledMetricsFile}
+ {0.897_342_54, wgsMetricsFile},
+ {0.967_707_04, hsMetricsFile},
+ {0.956_186_66, targetedMetricsFile},
+ {0.995_084_32, wgsSampledMetricsFile}
};
}
@Test(dataProvider = "hetSensDataProvider")
public void testHetSensTargeted(final double expected, final File metricsFile) throws Exception{
- final double tolerance = 0.02;
+ final double tolerance = 0.000_000_01;
final MetricsFile Metrics = new MetricsFile();
Metrics.read(new FileReader(metricsFile));
@@ -222,7 +223,7 @@ public class TheoreticalSensitivityTest {
final double [] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram);
final double [] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram);
- final int sampleSize = 1000;
+ final int sampleSize = 1_000;
final double logOddsThreshold = 3.0;
final double result = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, logOddsThreshold);
diff --git a/src/test/java/picard/analysis/WgsMetricsTest.java b/src/test/java/picard/analysis/WgsMetricsTest.java
new file mode 100644
index 0000000..4cbcfef
--- /dev/null
+++ b/src/test/java/picard/analysis/WgsMetricsTest.java
@@ -0,0 +1,108 @@
+package picard.analysis;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.util.Histogram;
+import htsjdk.samtools.util.Interval;
+import htsjdk.samtools.util.IntervalList;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import picard.PicardException;
+
+/**
+ * Tests for WgsMetrics.
+ */
+public class WgsMetricsTest {
+
+ private Histogram<Integer> emptyDepthHistogram() { return new Histogram<>(); }
+ private Histogram<Integer> singleDepthHistogram(final int depth, final int count) {
+ final Histogram<Integer> histogram = new Histogram<>();
+ histogram.increment(depth, count);
+ return histogram;
+ }
+ private Histogram<Integer> twoSiteDepthHistogram(final int depth1, final int count1, final int depth2, final int count2) {
+ final Histogram<Integer> histogram = new Histogram<>();
+ if (0 < depth1) histogram.increment(depth1, count1);
+ if (0 < depth2) histogram.increment(depth2, count2);
+ return histogram;
+ }
+
+ private IntervalList buildIntervalList(final int start, final int end) {
+ final SAMFileHeader header = new SAMFileHeader();
+ header.addSequence(new SAMSequenceRecord("CONTIG", 100000000));
+ final IntervalList intervals = new IntervalList(header);
+ if (0 < start) intervals.add(new Interval("CONTIG", start, end));
+ return intervals;
+ }
+
+ private CollectWgsMetrics.WgsMetrics emptyMetrics() {
+ return new CollectWgsMetrics.WgsMetrics(
+ buildIntervalList(-1, -1),
+ emptyDepthHistogram(),
+ 0, 0, 0, 0, 0, 0, 0, 1000000,
+ null, -1
+ );
+ }
+
+ private CollectWgsMetrics.WgsMetrics singleDepthMetrics(final int depth, final int countScale, final int start) {
+ final int count = 100000 * countScale;
+ final int totalExcluded = (10 + 20 + 30 + 40 + 50 + 60) * countScale;
+ return new CollectWgsMetrics.WgsMetrics(
+ buildIntervalList(start, start),
+ singleDepthHistogram(depth, count),
+ 10d * countScale / count, 20d * countScale / count, 30d * countScale / count,
+ 40d * countScale / count, 50d * countScale / count, 60d * countScale / count,
+ totalExcluded / (double) (count + totalExcluded),
+ 1000000,
+ null, -1
+ );
+ }
+
+ private CollectWgsMetrics.WgsMetrics twoSiteDepthMetrics(final int depth1, final int countScale1,
+ final int depth2, final int countScale2,
+ final int start) {
+ final int count1 = 100000 * countScale1;
+ final int count2 = 100000 * countScale2;
+ final int count = count1 + count2;
+ final int countScale = countScale1 + countScale2;
+ final int totalExcluded = (10 + 20 + 30 + 40 + 50 + 60) * countScale;
+ return new CollectWgsMetrics.WgsMetrics(
+ buildIntervalList(start, start+1),
+ twoSiteDepthHistogram(depth1, count1, depth2, count2),
+ 10d * countScale / count, 20d * countScale / count, 30d * countScale / count,
+ 40d * countScale / count, 50d * countScale / count, 60d * countScale / count,
+ totalExcluded / (double) (count + totalExcluded),
+ 100000,
+ null, -1
+ );
+ }
+
+ @Test(dataProvider = "testWgsMetricsMergeDataProvider")
+ public void testWgsMetricsMerge(final CollectWgsMetrics.WgsMetrics left,
+ final CollectWgsMetrics.WgsMetrics right,
+ final CollectWgsMetrics.WgsMetrics expected) {
+ left.merge(right);
+ left.calculateDerivedFields();
+ Assert.assertTrue(left.equals(expected));
+ }
+
+ @DataProvider(name = "testWgsMetricsMergeDataProvider")
+ public Object[][] testWgsMetricsMergeDataProvider() {
+ return new Object[][] {
+ {emptyMetrics(), emptyMetrics(), emptyMetrics()},
+ {emptyMetrics(), singleDepthMetrics(1, 1, 1), singleDepthMetrics(1, 1, 1)},
+ {singleDepthMetrics(1, 1, 1), emptyMetrics(), singleDepthMetrics(1, 1, 1)},
+ {singleDepthMetrics(1, 1, 1), singleDepthMetrics(1, 1, 2), twoSiteDepthMetrics(1, 2, 0, 0, 1)},
+ {singleDepthMetrics(1, 1, 1), singleDepthMetrics(1, 2, 2), twoSiteDepthMetrics(1, 3, 0, 0, 1)},
+ {singleDepthMetrics(1, 1, 1), singleDepthMetrics(1, 1, 2), twoSiteDepthMetrics(1, 2, 0, 0, 1)},
+ {singleDepthMetrics(1, 1, 1), singleDepthMetrics(2, 1, 2), twoSiteDepthMetrics(1, 1, 2, 1, 1)},
+ {twoSiteDepthMetrics(1, 1, 2, 1, 1), twoSiteDepthMetrics(1, 1, 2, 1, 3), twoSiteDepthMetrics(1, 2, 2, 2, 1)}
+ };
+ }
+
+ @Test(expectedExceptions = {PicardException.class})
+ public void testMergeOverlappingIntervals() {
+ singleDepthMetrics(1, 1, 1).merge(singleDepthMetrics(1, 1, 1));
+ }
+}
diff --git a/src/test/java/picard/analysis/artifacts/CollectSequencingArtifactMetricsTest.java b/src/test/java/picard/analysis/artifacts/CollectSequencingArtifactMetricsTest.java
index d044ad0..0b23b4b 100644
--- a/src/test/java/picard/analysis/artifacts/CollectSequencingArtifactMetricsTest.java
+++ b/src/test/java/picard/analysis/artifacts/CollectSequencingArtifactMetricsTest.java
@@ -72,6 +72,7 @@ public class CollectSequencingArtifactMetricsTest extends CommandLineProgramTest
Assert.assertTrue(areMetricsEqual(expectedBase, actualBase, SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT),"Pre-Adapter details files differ.");
Assert.assertTrue(areMetricsEqual(expectedBase, actualBase, SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT), "Bait-Bias summary files differ.");
Assert.assertTrue(areMetricsEqual(expectedBase, actualBase, SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT), "Bait-bias details files differ.");
+ Assert.assertTrue(areMetricsEqual(expectedBase, actualBase, SequencingArtifactMetrics.ERROR_SUMMARY_EXT), "Error-summary files differ.");
}
private boolean areMetricsEqual(final File expectedBase, final File actualBase, final String extension) {
diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java
index 5b99adc..00010f4 100644
--- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java
+++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java
@@ -24,7 +24,7 @@ import java.util.Random;
public class CollectTargetedMetricsTest extends CommandLineProgramTest {
private final static File TEST_DIR = new File("testdata/picard/sam/CollectGcBiasMetrics/");
private final File dict = new File(TEST_DIR, "Mheader.dict");
- private File tempSamFile;
+ private File tempSamFile, tempSamFileIndex;
private File outfile;
private File perTargetOutfile;
private final static int LENGTH = 99;
@@ -47,9 +47,11 @@ public class CollectTargetedMetricsTest extends CommandLineProgramTest {
//Create Sam Files
tempSamFile = File.createTempFile("CollectTargetedMetrics", ".bam", TEST_DIR);
+ tempSamFileIndex = new File(tempSamFile.toString().replaceAll("\\.bam$",".bai"));
final File tempSamFileUnsorted = File.createTempFile("CollectTargetedMetrics", ".bam", TEST_DIR);
tempSamFileUnsorted.deleteOnExit();
tempSamFile.deleteOnExit();
+ tempSamFileIndex.deleteOnExit();
final SAMFileHeader header = new SAMFileHeader();
//Check that dictionary file is readable and then set header dictionary
diff --git a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
index 119e530..e54539f 100644
--- a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
+++ b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
@@ -4,11 +4,11 @@ import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import picard.vcf.VcfTestUtils;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
/**
* Created by farjoun on 8/27/15.
@@ -19,6 +19,11 @@ public class FingerprintCheckerTest {
private final Snp snp = new Snp("test", "chr1", 1, (byte) 'A', (byte) 'C', maf, Collections.singletonList("dummy"));
private final HaplotypeBlock hb = new HaplotypeBlock(maf);
+ private static final double DELTA = 1e-6;
+
+ private static final File TEST_DATA_DIR = new File("testdata/picard/fingerprint/");
+ private static final File SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING = new File(TEST_DATA_DIR, "Homo_sapiens_assembly19.haplotype_database.subset.txt");
+
@BeforeClass
public void setup() {
hb.addSnp(snp);
@@ -75,4 +80,34 @@ public class FingerprintCheckerTest {
// (a hom normal isn't expected to be measured as a het in the tumor)
Assert.assertTrue(mr.getLodTN() > mr.getLodNT());
}
+
+ @DataProvider(name = "checkFingerprintsVcfDataProvider")
+ public Object[][] testCheckFingerprintsVcfDataProvider() {
+ return new Object[][] {
+ {new File(TEST_DATA_DIR, "NA12891.vcf"), new File(TEST_DATA_DIR, "NA12891.fp.vcf"), "NA12891", "NA12891", -0.02128, -1.026742, 1.005462},
+ {new File(TEST_DATA_DIR, "NA12892.vcf"), new File(TEST_DATA_DIR, "NA12892.fp.vcf"), "NA12892", "NA12892", -0.021945, -1.08308, 1.061135},
+ {new File(TEST_DATA_DIR, "NA12891.vcf"), new File(TEST_DATA_DIR, "NA12892.fp.vcf"), "NA12891", "NA12892", -5.941691, -1.026742, -4.914948},
+ {new File(TEST_DATA_DIR, "NA12892.vcf"), new File(TEST_DATA_DIR, "NA12891.fp.vcf"), "NA12892", "NA12891", -5.998029, -1.08308, -4.914948}
+ };
+ }
+
+ @Test(dataProvider = "checkFingerprintsVcfDataProvider")
+ public void testCheckFingerprints(File vcfFile, File genotypesFile, String observedSampleAlias, String expectedSampleAlias,
+ double llExpectedSample, double llRandomSample, double lodExpectedSample) throws IOException {
+
+ final FingerprintChecker fpChecker = new FingerprintChecker(SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING);
+ final List<FingerprintResults> results = fpChecker.checkFingerprints(Collections.singletonList(vcfFile),
+ Collections.singletonList(genotypesFile),
+ observedSampleAlias,
+ expectedSampleAlias);
+ Assert.assertEquals(results.size(), 1);
+ final FingerprintResults fpr = results.get(0);
+ Assert.assertNull(fpr.getReadGroup());
+ Assert.assertEquals(fpr.getSampleAlias(), observedSampleAlias);
+ final MatchResults mr = fpr.getMatchResults().first();
+ Assert.assertEquals(mr.getSample(), expectedSampleAlias);
+ Assert.assertEquals(mr.getSampleLikelihood(), llExpectedSample, DELTA);
+ Assert.assertEquals(mr.getPopulationLikelihood(), llRandomSample, DELTA);
+ Assert.assertEquals(mr.getLOD(), lodExpectedSample, DELTA);
+ }
}
\ No newline at end of file
diff --git a/src/test/java/picard/sam/DuplicationMetricsTest.java b/src/test/java/picard/sam/DuplicationMetricsTest.java
new file mode 100644
index 0000000..cc6b1ca
--- /dev/null
+++ b/src/test/java/picard/sam/DuplicationMetricsTest.java
@@ -0,0 +1,91 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 Nils Homer
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam;
+
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for DuplicationMetrics.
+ */
+public class DuplicationMetricsTest {
+
+ private DuplicationMetrics emptyMetrics() {
+ final DuplicationMetrics metric = new DuplicationMetrics();
+ metric.LIBRARY = "LIBRARY";
+ metric.UNPAIRED_READS_EXAMINED = 0;
+ metric.READ_PAIRS_EXAMINED = 0;
+ metric.SECONDARY_OR_SUPPLEMENTARY_RDS = 0;
+ metric.UNMAPPED_READS = 0;
+ metric.UNPAIRED_READ_DUPLICATES = 0;
+ metric.READ_PAIR_DUPLICATES = 0;
+ metric.READ_PAIR_OPTICAL_DUPLICATES = 0;
+ metric.calculateDerivedFields();
+ return metric;
+ }
+
+ private DuplicationMetrics nonEmptyMetrics(final int scale) {
+ final DuplicationMetrics metric = new DuplicationMetrics();
+ metric.LIBRARY = "LIBRARY";
+ metric.UNPAIRED_READS_EXAMINED = 1000 * scale;
+ metric.READ_PAIRS_EXAMINED = 1000 * scale;
+ metric.SECONDARY_OR_SUPPLEMENTARY_RDS = scale;
+ metric.UNMAPPED_READS = 10 * scale;
+ metric.UNPAIRED_READ_DUPLICATES = 100 * scale;
+ metric.READ_PAIR_DUPLICATES = 110 * scale;
+ metric.READ_PAIR_OPTICAL_DUPLICATES = 10 * scale;
+ metric.calculateDerivedFields();
+ return metric;
+ }
+
+ @Test(dataProvider="testMergeDataProvider")
+ public void testMerge(final DuplicationMetrics left, final DuplicationMetrics right, final DuplicationMetrics expected) {
+ left.merge(right);
+ left.calculateDerivedFields();
+
+ Assert.assertEquals(left.LIBRARY, expected.LIBRARY);
+ Assert.assertEquals(left.UNPAIRED_READS_EXAMINED, expected.UNPAIRED_READS_EXAMINED);
+ Assert.assertEquals(left.READ_PAIRS_EXAMINED, expected.READ_PAIRS_EXAMINED);
+ Assert.assertEquals(left.SECONDARY_OR_SUPPLEMENTARY_RDS, expected.SECONDARY_OR_SUPPLEMENTARY_RDS);
+ Assert.assertEquals(left.UNMAPPED_READS, expected.UNMAPPED_READS);
+ Assert.assertEquals(left.UNPAIRED_READ_DUPLICATES, expected.UNPAIRED_READ_DUPLICATES);
+ Assert.assertEquals(left.READ_PAIR_DUPLICATES, expected.READ_PAIR_DUPLICATES);
+ Assert.assertEquals(left.READ_PAIR_OPTICAL_DUPLICATES, expected.READ_PAIR_OPTICAL_DUPLICATES);
+ Assert.assertEquals(left.PERCENT_DUPLICATION, expected.PERCENT_DUPLICATION);
+ Assert.assertEquals(left.ESTIMATED_LIBRARY_SIZE, expected.ESTIMATED_LIBRARY_SIZE);
+ }
+
+ @DataProvider(name="testMergeDataProvider")
+ public Object[][] testMergeDataProvider() {
+ return new Object[][] {
+ {emptyMetrics(), emptyMetrics(), emptyMetrics()},
+ {emptyMetrics(), nonEmptyMetrics(1), nonEmptyMetrics(1)},
+ {nonEmptyMetrics(1), emptyMetrics(), nonEmptyMetrics(1)},
+ {nonEmptyMetrics(1), nonEmptyMetrics(1), nonEmptyMetrics(2)},
+ {nonEmptyMetrics(1), nonEmptyMetrics(2), nonEmptyMetrics(3)}
+ };
+ }
+}
diff --git a/src/test/java/picard/sam/MergeBamAlignmentTest.java b/src/test/java/picard/sam/MergeBamAlignmentTest.java
index 2a2914f..e0449ce 100644
--- a/src/test/java/picard/sam/MergeBamAlignmentTest.java
+++ b/src/test/java/picard/sam/MergeBamAlignmentTest.java
@@ -23,23 +23,7 @@
*/
package picard.sam;
-import htsjdk.samtools.BamFileIoUtils;
-import htsjdk.samtools.Cigar;
-import htsjdk.samtools.CigarElement;
-import htsjdk.samtools.CigarOperator;
-import htsjdk.samtools.Defaults;
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMFileWriter;
-import htsjdk.samtools.SAMFileWriterFactory;
-import htsjdk.samtools.SAMProgramRecord;
-import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SAMRecordIterator;
-import htsjdk.samtools.SAMSequenceRecord;
-import htsjdk.samtools.SAMTag;
-import htsjdk.samtools.SamPairUtil;
-import htsjdk.samtools.SamReader;
-import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import org.testng.Assert;
@@ -105,7 +89,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, outputWithSupplemental,
- SamPairUtil.PairOrientation.FR, null, null, null, null);
+ SamPairUtil.PairOrientation.FR, null, null, null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(outputWithSupplemental);
@@ -176,7 +160,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null, null);
+ SamPairUtil.PairOrientation.FR, null, null, null, null, null);
SamReader result = SamReaderFactory.makeDefault().open(output);
Assert.assertEquals(result.getFileHeader().getSequenceDictionary().getSequences().size(), 8,
@@ -237,7 +221,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
null, null, null, null,
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null, null);
+ SamPairUtil.PairOrientation.FR, null, null, null, null, null);
CloserUtil.close(result);
@@ -266,7 +250,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null, null);
+ SamPairUtil.PairOrientation.FR, null, null, null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(output);
@@ -310,14 +294,15 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
}
@Test(dataProvider="data")
- public void testSortingOnSamAlignmentMerger(final File unmapped, final File aligned, final boolean sorted, final String testName)
+ public void testSortingOnSamAlignmentMerger(final File unmapped, final File aligned, final boolean sorted, final boolean coordinateSorted, final String testName)
throws IOException {
final File target = File.createTempFile("target", "bam");
target.deleteOnExit();
final SamAlignmentMerger merger = new SamAlignmentMerger(unmapped, target, fasta, null, true, false,
false, Arrays.asList(aligned), 1, null, null, null, null, null, null,
- Arrays.asList(SamPairUtil.PairOrientation.FR), SAMFileHeader.SortOrder.coordinate,
+ Arrays.asList(SamPairUtil.PairOrientation.FR),
+ coordinateSorted ? SAMFileHeader.SortOrder.coordinate : SAMFileHeader.SortOrder.queryname,
new BestMapqPrimaryAlignmentSelectionStrategy(), false, false, 30);
merger.mergeAlignment(Defaults.REFERENCE_FASTA);
@@ -336,9 +321,10 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
@DataProvider(name="data")
public Object[][] getDataForSortingTest() {
return new Object[][] {
- {unmappedBam, alignedQuerynameSortedBam, true, "Basic test with pre-sorted alignment"},
- {unmappedBam, alignedBam, false, "Basic test with unsorted alignment"}
-
+ {unmappedBam, alignedQuerynameSortedBam, true, true, "Basic test with pre-sorted alignment"},
+ {unmappedBam, alignedBam, false, true, "Basic test with unsorted alignment"},
+ {unmappedBam, alignedQuerynameSortedBam, true, false, "Basic test with pre-sorted alignment"},
+ {unmappedBam, alignedBam, false, false, "Basic test with unsorted alignment"}
};
}
@@ -354,7 +340,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, output,
- SamPairUtil.PairOrientation.FR, null, null, null, null);
+ SamPairUtil.PairOrientation.FR, null, null, null, null, null);
SamReaderFactory factory = SamReaderFactory.makeDefault();
final SamReader result = factory.open(output);
@@ -419,7 +405,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
"0", "1.0", "align!", "myAligner",
true, fasta, merged,
SamPairUtil.PairOrientation.FR, null,
- null, null, null);
+ null, null, null, null);
Assert.fail("Merger should have failed because unmapped reads are not in queryname order but didn't");
}
@@ -435,7 +421,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, merged,
- null, null, null, null, null);
+ null, null, null, null, null, null);
// Iterate over the merged output and gather some statistics
final Map<String, AlignmentAccumulator> accumulatorMap = new HashMap<String, AlignmentAccumulator>();
@@ -597,7 +583,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, fasta, mergedSam,
- null, null, null, null, null);
+ null, null, null, null, null, null);
assertSamValid(mergedSam);
@@ -859,7 +845,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
false, fasta, mergedSam,
- null, null, null, null, null);
+ null, null, null, null, null, null);
assertSamValid(mergedSam);
@@ -1006,7 +992,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
fasta, output,
SamPairUtil.PairOrientation.FR,
MergeBamAlignment.PrimaryAlignmentStrategy.EarliestFragment,
- null, null, null);
+ null, null, null, null);
Assert.fail("Exception was not thrown");
}
@@ -1029,7 +1015,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
true, fasta, output,
SamPairUtil.PairOrientation.FR, MergeBamAlignment.PrimaryAlignmentStrategy.EarliestFragment,
ONE_OF_THE_BEST_TAG,
- null, false);
+ null, false, null);
final SamReader mergedReader = SamReaderFactory.makeDefault().open(output);
@@ -1171,7 +1157,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
true,
new File(TEST_DATA_DIR, "cliptest.fasta"), output,
SamPairUtil.PairOrientation.FR, null,
- null, null, null);
+ null, null, null, null);
final SamReader result = SamReaderFactory.makeDefault().open(output);
final Map<String, SAMRecord> firstReadEncountered = new HashMap<String, SAMRecord>();
@@ -1259,7 +1245,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
new File(TEST_DATA_DIR, "cliptest.fasta"), output,
SamPairUtil.PairOrientation.FR,
MergeBamAlignment.PrimaryAlignmentStrategy.BestEndMapq,
- null, includeSecondary, null);
+ null, includeSecondary, null, null);
final SamReader reader = SamReaderFactory.makeDefault().open(output);
int numFirstRecords = 0;
@@ -1304,7 +1290,8 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
final SamPairUtil.PairOrientation expectedOrientation, final MergeBamAlignment.PrimaryAlignmentStrategy primaryAlignmentStrategy,
final String attributesToRetain,
final Boolean includeSecondary,
- final Boolean unmapContaminantReads) {
+ final Boolean unmapContaminantReads,
+ final SAMFileHeader.SortOrder sortOrder) {
final List<String> args = new ArrayList<String>(Arrays.asList(
"UNMAPPED_BAM=" + unmappedBam.getAbsolutePath(),
@@ -1364,6 +1351,9 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
if (unmapContaminantReads != null) {
args.add("UNMAP_CONTAMINANT_READS=" + unmapContaminantReads);
}
+ if (sortOrder != null) {
+ args.add("SORT_ORDER=" + sortOrder.name());
+ }
Assert.assertEquals(runPicardCommandLine(args), 0, "Merge did not succeed");
}
@@ -1501,7 +1491,7 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
"0", "1.0", "align!", "myAligner",
true, fasta, output,
SamPairUtil.PairOrientation.FR, MergeBamAlignment.PrimaryAlignmentStrategy.MostDistant,
- null, includeSecondary, null);
+ null, includeSecondary, null, null);
final SamReader reader = SamReaderFactory.makeDefault().open(output);
int numFirstRecords = 0;
@@ -1680,9 +1670,43 @@ public class MergeBamAlignmentTest extends CommandLineProgramTest {
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, refFasta, mergedSam,
- null, null, null, null, true);
+ null, null, null, null, true, null);
assertSamValid(mergedSam);
IOUtil.assertFilesEqual(expectedSam, mergedSam);
}
+
+ @Test
+ public void testRemoveNmMdAndUqOnOverlappingReads() throws IOException {
+ final File output = File.createTempFile("testRemoveNmMdAndUqOnOverlappingReads", ".sam");
+ output.deleteOnExit();
+ doMergeAlignment(new File(TEST_DATA_DIR, "removetags.unmapped.sam"),
+ Collections.singletonList(new File(TEST_DATA_DIR, "removetags.aligned.sam")),
+ null, null, null, null,
+ false, true, false, 1,
+ "0", "1.0", "align!", "myAligner",
+ true,
+ new File(TEST_DATA_DIR, "removetags.fasta"), output,
+ SamPairUtil.PairOrientation.FR, null,
+ null, null, null, SAMFileHeader.SortOrder.queryname);
+
+ final SamReader result = SamReaderFactory.makeDefault().open(output);
+ for (final SAMRecord rec : result) {
+ boolean hasTags = false;
+ if (rec.getReadName().startsWith("CLIPPED")) {
+ final String[] readNameFields = rec.getReadName().split(":");
+ final int index = rec.getFirstOfPairFlag() ? 1 : 2;
+ hasTags = Integer.parseInt(readNameFields[index]) == 1;
+ }
+ if (hasTags) {
+ Assert.assertNull(rec.getAttribute("MD"));
+ Assert.assertNull(rec.getAttribute("NM"));
+ }
+ else {
+ Assert.assertNotNull(rec.getAttribute("MD"));
+ Assert.assertNotNull(rec.getAttribute("NM"));
+ }
+ }
+ result.close();
+ }
}
diff --git a/src/test/java/picard/sam/RevertSamTest.java b/src/test/java/picard/sam/RevertSamTest.java
index d14046d..95e0411 100755
--- a/src/test/java/picard/sam/RevertSamTest.java
+++ b/src/test/java/picard/sam/RevertSamTest.java
@@ -250,7 +250,7 @@ public class RevertSamTest extends CommandLineProgramTest {
for (final SAMRecord.SAMTagAndValue attr : rec.getAttributes()) {
if (removeAlignmentInfo || (!attr.tag.equals("PG") && !attr.tag.equals("NM")
- && !attr.tag.equals("MQ"))) {
+ && !attr.tag.equals(SAMTag.MQ.toString()))) {
Assert.assertFalse(reverter.ATTRIBUTE_TO_CLEAR.contains(attr.tag),
attr.tag + " should have been cleared.");
}
diff --git a/src/test/java/picard/sam/SetNmAndUqTagsTest.java b/src/test/java/picard/sam/SetNmMdAndUqTagsTest.java
similarity index 92%
rename from src/test/java/picard/sam/SetNmAndUqTagsTest.java
rename to src/test/java/picard/sam/SetNmMdAndUqTagsTest.java
index 899e8ba..20a7606 100644
--- a/src/test/java/picard/sam/SetNmAndUqTagsTest.java
+++ b/src/test/java/picard/sam/SetNmMdAndUqTagsTest.java
@@ -7,7 +7,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
-public class SetNmAndUqTagsTest {
+public class SetNmMdAndUqTagsTest {
private static final File fasta = new File("testdata/picard/sam/merger.fasta");
@DataProvider(name="filesToFix")
@@ -64,7 +64,7 @@ public class SetNmAndUqTagsTest {
"OUTPUT="+output,
"REFERENCE_SEQUENCE="+reference };
- SetNmAndUqTags setNmAndUqTags = new SetNmAndUqTags();
- Assert.assertEquals(setNmAndUqTags.instanceMain(args), 0, "Fix did not succeed");
+ SetNmMdAndUqTags setNmMdAndUqTags = new SetNmMdAndUqTags();
+ Assert.assertEquals(setNmMdAndUqTags.instanceMain(args), 0, "Fix did not succeed");
}
}
diff --git a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java
index 0e1f2f0..c5d99e6 100644
--- a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java
+++ b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java
@@ -112,7 +112,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam
}
expectedMetrics.READ_PAIR_DUPLICATES = expectedMetrics.READ_PAIR_DUPLICATES / 2;
expectedMetrics.READ_PAIRS_EXAMINED = expectedMetrics.READ_PAIRS_EXAMINED / 2;
- expectedMetrics.calculateDerivedMetrics();
+ expectedMetrics.calculateDerivedFields();
// Have to run this Double value through the same format/parsing operations as during a file write/read
expectedMetrics.PERCENT_DUPLICATION = formatter.parseDouble(formatter.format(expectedMetrics.PERCENT_DUPLICATION));
diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java
new file mode 100644
index 0000000..b78bbab
--- /dev/null
+++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java
@@ -0,0 +1,166 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import picard.PicardException;
+
+import java.util.*;
+
+/**
+ * This class defines the individual test cases to run. The actual running of the test is done
+ * by UmiAwareMarkDuplicatesWithMateCigarTester (see getTester).
+ * @author fleharty
+ */
+public class UmiAwareMarkDuplicatesWithMateCigarTest extends SimpleMarkDuplicatesWithMateCigarTest {
+
+ @Override
+ protected UmiAwareMarkDuplicatesWithMateCigarTester getTester() {
+ return new UmiAwareMarkDuplicatesWithMateCigarTester();
+ }
+
+ protected UmiAwareMarkDuplicatesWithMateCigarTester getTester(final boolean allowMissingUmis) {
+ return new UmiAwareMarkDuplicatesWithMateCigarTester(allowMissingUmis);
+ }
+
+ @DataProvider(name = "testUmiSetsDataProvider")
+ private Object[][] testUmiSetsDataProvider() {
+ return new Object[][] {{
+ // Test basic error correction using edit distance of 1
+ Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), // Observed UMI
+ Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAA"}), // Expected inferred UMI
+ Arrays.asList(new Boolean[] {false, true, false, true, true}), // Should it be marked as duplicate?
+ 1 // Edit Distance to Join
+ }, {
+ // Test basic error correction using edit distance of 2
+ Arrays.asList(new String[] {"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}),
+ Arrays.asList(new String[] {"AAAA", "AAAA", "AAAA", "AAAA", "AAAA"}),
+ Arrays.asList(new Boolean[] {false, true, true, true, true}),
+ 2
+ }, {
+ // Test basic error correction using edit distance of 1 where UMIs
+ // form a chain in edit distance space so that a UMI with large
+ // edit distance will get error corrected to a distant but linked (in edit space) UMI
+ Arrays.asList(new String[] {"AAAA", "AAAA", "AAAT", "AAGT", "ACGT", "TCGT", "CCCC"}),
+ Arrays.asList(new String[] {"AAAA", "AAAA", "AAAA", "AAAA", "AAAA", "AAAA", "CCCC"}),
+ Arrays.asList(new Boolean[] {false, true, true, true, true, true, false}),
+ 1
+ }, {
+ // Test short UMIs
+ Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}),
+ Arrays.asList(new String[] {"A", "A", "A", "A", "A", "A", "A", "A"}), // All UMIs should get corrected to A
+ Arrays.asList(new Boolean[] {false, true, true, true, true, true, true, true}), // All mate pairs should be duplicates except the first
+ 1
+ }, {
+ // Test short UMIs with no allowance for errors
+ Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}),
+ Arrays.asList(new String[] {"A", "A", "T", "G", "G", "C", "C", "A"}), // No UMIs should get corrected
+ Arrays.asList(new Boolean[] {false, true, false, false, true, false, true, true}), // Only exactly duplicated UMIs will give rise to a new duplicate set
+ 0
+ }, {
+ // Test longish UMIs with relatively large allowance for error
+ // UMIs "TTGACATCCA", "ATGCCATCGA", "AAGTCACCGT" should belong to the same duplicate set since
+ // they are within edit distance of 4 of each other. TTGACATCCA should be chosen as the inferred
+ // UMI even though it only occurs once. Since all UMIs only occur once, we choose the UMI that
+ // is not marked as duplicate to be the inferred UMI.
+ Arrays.asList(new String[] {"TTGACATCCA", "ATGCCATCGA", "AAGTCACCGT"}),
+ Arrays.asList(new String[] {"TTGACATCCA", "TTGACATCCA", "TTGACATCCA"}), // All UMIs should get corrected to TTGACATCCA
+ Arrays.asList(new Boolean[] {false, true, true}), // All mate pairs should be duplicates except the first
+ 4
+ }, };
+ }
+
+ @DataProvider(name = "testBadUmiSetsDataProvider")
+ private Object[][] testBadUmiSetsDataProvider() {
+ return new Object[][] {{
+ // The code should not support variable length UMIs, if we observe variable length UMIs
+ // ensure that an exception is thrown.
+ Arrays.asList(new String[] {"AAAA", "A"}),
+ Arrays.asList(new String[] {"AAAA", "A"}),
+ Arrays.asList(new Boolean[] {false, false}),
+ 4
+ }, {
+ // The code should not support variable length UMIs, if we observe variable length UMIs
+ // ensure that an exception is thrown.
+ // Variable-length UMIs ("T" vs "GG") must be rejected with an exception, not error-corrected:
+ Arrays.asList(new String[] {"T", "GG"}),
+ Arrays.asList(new String[] {"T", "GG"}),
+ Arrays.asList(new Boolean[] {false, false}),
+ 1
+ }, {
+ // Test to make sure that we throw an exception with missing UMIs when allowMissingUmis is false
+ // This throws an exception because the UMIs have differing lengths.
+ Arrays.asList(new String[] {"TTGA", "TTAT", null}),
+ Arrays.asList(new String[] {"TTGA", "TTAT", null}),
+ Arrays.asList(new Boolean[] {false, false, false}),
+ 4
+ }};
+ }
+
+ @DataProvider(name = "testEmptyUmiDataProvider")
+ private Object[][] testEmptyUmiDataProvider() {
+ return new Object[][] {{
+ // Test to make sure we treat empty UMIs correctly when they are allowed
+ Arrays.asList(new String[] {null, null, null}),
+ Arrays.asList(new String[] {null, null, null}),
+ Arrays.asList(new Boolean[] {false, true, true}),
+ 4
+ }};
+ }
+
+ @Test(dataProvider = "testUmiSetsDataProvider")
+ public void testUmi(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
+ UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false);
+ tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
+
+ for(int i = 0;i < umis.size();i++) {
+ tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
+ }
+ tester.setExpectedAssignedUmis(assignedUmi).runTest();
+ }
+
+ @Test(dataProvider = "testEmptyUmiDataProvider")
+ public void testEmptyUmis(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
+ UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(true);
+ tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
+
+ for(int i = 0;i < umis.size();i++) {
+ tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
+ }
+ tester.setExpectedAssignedUmis(assignedUmi).runTest();
+ }
+
+ @Test(dataProvider = "testBadUmiSetsDataProvider", expectedExceptions = PicardException.class)
+ public void testBadUmis(List<String> umis, List<String> assignedUmi, final List<Boolean> isDuplicate, final int editDistanceToJoin) {
+ UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false);
+ tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin);
+
+ for(int i = 0;i < umis.size();i++) {
+ tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i));
+ }
+ tester.setExpectedAssignedUmis(assignedUmi).runTest();
+ }
+}
diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java
new file mode 100644
index 0000000..bdf166b
--- /dev/null
+++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java
@@ -0,0 +1,167 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.sam.markduplicates;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import org.testng.Assert;
+import picard.cmdline.CommandLineProgram;
+
+import java.util.List;
+
+/**
+ * This class is an extension of AbstractMarkDuplicatesCommandLineProgramTester used to test
+ * AbstractMarkDuplicatesCommandLineProgram subclasses with SAM files generated on the fly. This performs the underlying tests
+ * defined by classes such as AbstractMarkDuplicatesCommandLineProgramTest.
+ * @author fleharty
+ */
+
+public class UmiAwareMarkDuplicatesWithMateCigarTester extends AbstractMarkDuplicatesCommandLineProgramTester {
+ private int readNameCounter = 0;
+ private List<String> expectedAssignedUmis;
+
+ // This tag is only used for testing, it indicates what we expect to see in the inferred UMI tag.
+ private final String expectedUmiTag = "RE";
+
+ // This default constructor is intended to be used by tests inherited from
+ // AbstractMarkDuplicatesCommandLineProgramTester. Since those tests use
+ // reads that don't have UMIs we enable the ALLOW_MISSING_UMIS option.
+ UmiAwareMarkDuplicatesWithMateCigarTester() {
+ addArg("ALLOW_MISSING_UMIS=" + true);
+ }
+
+ UmiAwareMarkDuplicatesWithMateCigarTester(final boolean allowMissingUmis) {
+ if (allowMissingUmis) {
+ addArg("ALLOW_MISSING_UMIS=" + true);
+ }
+ }
+
+ public void addMatePairWithUmi(final String umi, final String assignedUMI, final boolean isDuplicate1, final boolean isDuplicate2) {
+
+ final String readName = "READ" + readNameCounter++;
+ final String cigar1 = null;
+ final String cigar2 = null;
+ final boolean strand1 = false;
+ final boolean strand2 = true;
+
+ final int referenceSequenceIndex1 = 0;
+ final int referenceSequenceIndex2 = 0;
+ final int alignmentStart1 = 20;
+ final int alignmentStart2 = 20;
+
+ final boolean record1Unmapped = false;
+ final boolean record2Unmapped = false;
+
+ final boolean firstOnly = false;
+ final boolean record1NonPrimary = false;
+ final boolean record2NonPrimary = false;
+
+ final int defaultQuality = 10;
+
+ addMatePairWithUmi(readName, referenceSequenceIndex1, referenceSequenceIndex2, alignmentStart1, alignmentStart2, record1Unmapped,
+ record2Unmapped, isDuplicate1, isDuplicate2, cigar1, cigar2, strand1, strand2, firstOnly, record1NonPrimary, record2NonPrimary,
+ defaultQuality, umi, assignedUMI);
+
+ }
+
+ public void addMatePairWithUmi(final String readName,
+ final int referenceSequenceIndex1,
+ final int referenceSequenceIndex2,
+ final int alignmentStart1,
+ final int alignmentStart2,
+ final boolean record1Unmapped,
+ final boolean record2Unmapped,
+ final boolean isDuplicate1,
+ final boolean isDuplicate2,
+ final String cigar1,
+ final String cigar2,
+ final boolean strand1,
+ final boolean strand2,
+ final boolean firstOnly,
+ final boolean record1NonPrimary,
+ final boolean record2NonPrimary,
+ final int defaultQuality,
+ final String umi,
+ final String assignedUMI) {
+ final List<SAMRecord> samRecordList = samRecordSetBuilder.addPair(readName, referenceSequenceIndex1, referenceSequenceIndex2, alignmentStart1, alignmentStart2,
+ record1Unmapped, record2Unmapped, cigar1, cigar2, strand1, strand2, record1NonPrimary, record2NonPrimary, defaultQuality);
+
+ final SAMRecord record1 = samRecordList.get(0);
+ final SAMRecord record2 = samRecordList.get(1);
+
+ if (this.noMateCigars) {
+ record1.setAttribute("MC", null);
+ record2.setAttribute("MC", null);
+ }
+
+ if (firstOnly) {
+ samRecordSetBuilder.getRecords().remove(record2);
+ }
+
+ final String key1 = samRecordToDuplicatesFlagsKey(record1);
+ Assert.assertFalse(this.duplicateFlags.containsKey(key1));
+ this.duplicateFlags.put(key1, isDuplicate1);
+
+ final String key2 = samRecordToDuplicatesFlagsKey(record2);
+ Assert.assertFalse(this.duplicateFlags.containsKey(key2));
+ this.duplicateFlags.put(key2, isDuplicate2);
+
+ if (umi != null) {
+ // TODO: Replace "RX" with SAMTag.RX once this tag is available in HTSJDK
+ record1.setAttribute("RX", umi);
+ record2.setAttribute("RX", umi);
+ }
+ if (assignedUMI != null) {
+ // Set the expected UMI, this is a special tag used only for testing.
+ record1.setAttribute(expectedUmiTag, assignedUMI);
+ record2.setAttribute(expectedUmiTag, assignedUMI);
+ }
+ }
+
+ UmiAwareMarkDuplicatesWithMateCigarTester setExpectedAssignedUmis(final List<String> expectedAssignedUmis) {
+ this.expectedAssignedUmis = expectedAssignedUmis;
+ return this;
+ }
+
+ @Override
+ public void test() {
+ final SamReader reader = SamReaderFactory.makeDefault().open(getOutput());
+ for (final SAMRecord record : reader) {
+ // If there are expected assigned UMIs, check to make sure they match
+ if (expectedAssignedUmis != null) {
+ Assert.assertEquals(record.getAttribute("MI"), record.getAttribute(expectedUmiTag));
+ }
+ }
+ // Also do tests from AbstractMarkDuplicatesCommandLineProgramTester
+ super.test();
+ }
+
+ @Override
+ protected CommandLineProgram getProgram() {
+ UmiAwareMarkDuplicatesWithMateCigar uamdwmc = new UmiAwareMarkDuplicatesWithMateCigar();
+ return uamdwmc;
+ }
+}
diff --git a/src/test/java/picard/sam/testers/SamFileTester.java b/src/test/java/picard/sam/testers/SamFileTester.java
index bd83ba7..f92972b 100644
--- a/src/test/java/picard/sam/testers/SamFileTester.java
+++ b/src/test/java/picard/sam/testers/SamFileTester.java
@@ -23,12 +23,12 @@ import java.util.Map;
*/
public abstract class SamFileTester extends CommandLineProgramTest {
- private final SAMRecordSetBuilder samRecordSetBuilder;
+ protected final SAMRecordSetBuilder samRecordSetBuilder;
protected final Map<String, Boolean> duplicateFlags = new HashMap<>();
private File outputDir;
private File output;
private int readNameCounter = 0;
- private boolean noMateCigars = false;
+ protected boolean noMateCigars = false;
private boolean deleteOnExit = true;
private final ArrayList<String> args = new ArrayList<>();
diff --git a/src/test/java/picard/vcf/VcfTestUtils.java b/src/test/java/picard/vcf/VcfTestUtils.java
new file mode 100644
index 0000000..58db96b
--- /dev/null
+++ b/src/test/java/picard/vcf/VcfTestUtils.java
@@ -0,0 +1,40 @@
+package picard.vcf;
+
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.writer.Options;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
+import htsjdk.variant.vcf.VCFFileReader;
+import htsjdk.variant.vcf.VCFHeader;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.EnumSet;
+
+public class VcfTestUtils {
+ /**
+ * Useful test method. Creates a (temporary) indexed VCF so that we don't have to store the index file in the testdata set.
+ * @param vcfFile the vcf file to index
+ * @return File a vcf file (index file is created in same path).
+ */
+ public static File createIndexedVcf(final File vcfFile, final String tempFilePrefix) throws IOException {
+ final File output = File.createTempFile(tempFilePrefix, ".vcf");
+ output.deleteOnExit();
+ final File indexFile = new File(output.getAbsolutePath() + ".idx");
+ indexFile.deleteOnExit();
+ final VCFFileReader in = new VCFFileReader(vcfFile, false);
+ final VCFHeader header = in.getFileHeader();
+
+ final VariantContextWriter out = new VariantContextWriterBuilder().
+ setReferenceDictionary(header.getSequenceDictionary()).
+ setOptions(EnumSet.of(Options.INDEX_ON_THE_FLY)).
+ setOutputFile(output).build();
+ out.writeHeader(header);
+ for (final VariantContext ctx : in) {
+ out.add(ctx);
+ }
+ out.close();
+ in.close();
+ return output;
+ }
+}
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_bq_cutoff.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_bq_cutoff.error_summary_metrics
new file mode 100644
index 0000000..0a6f04b
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_bq_cutoff.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=no_bq_cutoff.pre_adapter_detail_metrics OUTPUT=no_bq_cutoff.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:13:59 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 134 0 0
+A G A>G 134 0 0
+A T A>T 134 0 0
+C A C>A 92 22 0.192982
+C G C>G 92 0 0
+C T C>T 92 32 0.258065
+
+
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_mq_cutoff.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_mq_cutoff.error_summary_metrics
new file mode 100644
index 0000000..df7c21b
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/no_mq_cutoff.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=no_mq_cutoff.pre_adapter_detail_metrics OUTPUT=no_mq_cutoff.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:14:00 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 136 0 0
+A G A>G 136 0 0
+A T A>T 136 0 0
+C A C>A 100 22 0.180328
+C G C>G 100 0 0
+C T C>T 100 32 0.242424
+
+
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/unmapped_mate.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/unmapped_mate.error_summary_metrics
new file mode 100644
index 0000000..89b6661
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/unmapped_mate.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=unmapped_mate.pre_adapter_detail_metrics OUTPUT=unmapped_mate.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:14:00 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 138 0 0
+A G A>G 138 0 0
+A T A>T 138 0 0
+C A C>A 98 22 0.183333
+C G C>G 98 0 0
+C T C>T 98 32 0.246154
+
+
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_context.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_context.error_summary_metrics
new file mode 100644
index 0000000..3ac4023
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_context.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=with_context.pre_adapter_detail_metrics OUTPUT=with_context.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:14:01 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 116 0 0
+A G A>G 116 0 0
+A T A>T 116 0 0
+C A C>A 86 22 0.203704
+C G C>G 86 0 0
+C T C>T 86 32 0.271186
+
+
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_dbsnp.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_dbsnp.error_summary_metrics
new file mode 100644
index 0000000..611d6d0
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_dbsnp.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=with_dbsnp.pre_adapter_detail_metrics OUTPUT=with_dbsnp.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:14:01 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 126 0 0
+A G A>G 126 0 0
+A T A>T 126 0 0
+C A C>A 80 18 0.183673
+C G C>G 80 0 0
+C T C>T 80 24 0.230769
+
+
diff --git a/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_intervals.error_summary_metrics b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_intervals.error_summary_metrics
new file mode 100644
index 0000000..4a7f862
--- /dev/null
+++ b/testdata/picard/analysis/artifacts/CollectSequencingArtifactMetrics/ExpectedMetricsOutput/with_intervals.error_summary_metrics
@@ -0,0 +1,15 @@
+## htsjdk.samtools.metrics.StringHeader
+# picard.analysis.artifacts.SummarizeErrors INPUT=with_intervals.pre_adapter_detail_metrics OUTPUT=with_intervals.error_summary_metrics VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
+## htsjdk.samtools.metrics.StringHeader
+# Started on: Mon Oct 24 14:14:02 EDT 2016
+
+## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics
+REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE
+A C A>C 62 0 0
+A G A>G 62 0 0
+A T A>T 62 0 0
+C A C>A 40 4 0.090909
+C G C>G 40 0 0
+C T C>T 40 32 0.444444
+
+
diff --git a/testdata/picard/fingerprint/Homo_sapiens_assembly19.haplotype_database.subset.txt b/testdata/picard/fingerprint/Homo_sapiens_assembly19.haplotype_database.subset.txt
new file mode 100755
index 0000000..4aed53b
--- /dev/null
+++ b/testdata/picard/fingerprint/Homo_sapiens_assembly19.haplotype_database.subset.txt
@@ -0,0 +1,92 @@
+@HD VN:1.4 GO:none SO:coordinate
+@SQ SN:1 LN:249250621 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Homo Sapiens
+@SQ SN:2 LN:243199373 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a0d9851da00400dec1098a9255ac712e SP:Homo Sapiens
+@SQ SN:3 LN:198022430 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdfd811849cc2fadebc929bb925902e5 SP:Homo Sapiens
+@SQ SN:4 LN:191154276 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:23dccd106897542ad87d2765d28a19a1 SP:Homo Sapiens
+@SQ SN:5 LN:180915260 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0740173db9ffd264d728f32784845cd7 SP:Homo Sapiens
+@SQ SN:6 LN:171115067 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d3a93a248d92a729ee764823acbbc6b SP:Homo Sapiens
+@SQ SN:7 LN:159138663 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:618366e953d6aaad97dbe4777c29375e SP:Homo Sapiens
+@SQ SN:8 LN:146364022 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96f514a9929e410c6651697bded59aec SP:Homo Sapiens
+@SQ SN:9 LN:141213431 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e273117f15e0a400f01055d9f393768 SP:Homo Sapiens
+@SQ SN:10 LN:135534747 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:988c28e000e84c26d552359af1ea2e1d SP:Homo Sapiens
+@SQ SN:11 LN:135006516 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:98c59049a2df285c76ffb1c6db8f8b96 SP:Homo Sapiens
+@SQ SN:12 LN:133851895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:51851ac0e1a115847ad36449b0015864 SP:Homo Sapiens
+@SQ SN:13 LN:115169878 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:283f8d7892baa81b510a015719ca7b0b SP:Homo Sapiens
+@SQ SN:14 LN:107349540 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:98f3cae32b2a2e9524bc19813927542e SP:Homo Sapiens
+@SQ SN:15 LN:102531392 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e5645a794a8238215b2cd77acb95a078 SP:Homo Sapiens
+@SQ SN:16 LN:90354753 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc9b1a7b42b97a864f56b348b06095e6 SP:Homo Sapiens
+@SQ SN:17 LN:81195210 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:351f64d4f4f9ddd45b35336ad97aa6de SP:Homo Sapiens
+@SQ SN:18 LN:78077248 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c SP:Homo Sapiens
+@SQ SN:19 LN:59128983 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1aacd71f30db8e561810913e0b72636d SP:Homo Sapiens
+@SQ SN:20 LN:63025520 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens
+@SQ SN:21 LN:48129895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2979a6085bfe28e3ad6f552f361ed74d SP:Homo Sapiens
+@SQ SN:22 LN:51304566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a718acaa6135fdca8357d5bfe94211dd SP:Homo Sapiens
+@SQ SN:X LN:155270560 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7e0e2e580297b7764e31dbc80c2540dd SP:Homo Sapiens
+@SQ SN:Y LN:59373566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1fa3474750af0948bdf97d5a0ee52e51 SP:Homo Sapiens
+@SQ SN:MT LN:16569 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c68f52674c9fb33aef52dcf399755519 SP:Homo Sapiens
+@SQ SN:GL000207.1 LN:4262 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f3814841f1939d3ca19072d9e89f3fd7 SP:Homo Sapiens
+@SQ SN:GL000226.1 LN:15008 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1c1b2cd1fccbc0a99b6a447fa24d1504 SP:Homo Sapiens
+@SQ SN:GL000229.1 LN:19913 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d0f40ec87de311d8e715b52e4c7062e1 SP:Homo Sapiens
+@SQ SN:GL000231.1 LN:27386 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ba8882ce3a1efa2080e5d29b956568a4 SP:Homo Sapiens
+@SQ SN:GL000210.1 LN:27682 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:851106a74238044126131ce2a8e5847c SP:Homo Sapiens
+@SQ SN:GL000239.1 LN:33824 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:99795f15702caec4fa1c4e15f8a29c07 SP:Homo Sapiens
+@SQ SN:GL000235.1 LN:34474 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:118a25ca210cfbcdfb6c2ebb249f9680 SP:Homo Sapiens
+@SQ SN:GL000201.1 LN:36148 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dfb7e7ec60ffdcb85cb359ea28454ee9 SP:Homo Sapiens
+@SQ SN:GL000247.1 LN:36422 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7de00226bb7df1c57276ca6baabafd15 SP:Homo Sapiens
+@SQ SN:GL000245.1 LN:36651 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:89bc61960f37d94abf0df2d481ada0ec SP:Homo Sapiens
+@SQ SN:GL000197.1 LN:37175 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6f5efdd36643a9b8c8ccad6f2f1edc7b SP:Homo Sapiens
+@SQ SN:GL000203.1 LN:37498 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96358c325fe0e70bee73436e8bb14dbd SP:Homo Sapiens
+@SQ SN:GL000246.1 LN:38154 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e4afcd31912af9d9c2546acf1cb23af2 SP:Homo Sapiens
+@SQ SN:GL000249.1 LN:38502 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d78abec37c15fe29a275eb08d5af236 SP:Homo Sapiens
+@SQ SN:GL000196.1 LN:38914 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d92206d1bb4c3b4019c43c0875c06dc0 SP:Homo Sapiens
+@SQ SN:GL000248.1 LN:39786 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5a8e43bec9be36c7b49c84d585107776 SP:Homo Sapiens
+@SQ SN:GL000244.1 LN:39929 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0996b4475f353ca98bacb756ac479140 SP:Homo Sapiens
+@SQ SN:GL000238.1 LN:39939 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:131b1efc3270cc838686b54e7c34b17b SP:Homo Sapiens
+@SQ SN:GL000202.1 LN:40103 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:06cbf126247d89664a4faebad130fe9c SP:Homo Sapiens
+@SQ SN:GL000234.1 LN:40531 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:93f998536b61a56fd0ff47322a911d4b SP:Homo Sapiens
+@SQ SN:GL000232.1 LN:40652 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e06b6741061ad93a8587531307057d8 SP:Homo Sapiens
+@SQ SN:GL000206.1 LN:41001 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:43f69e423533e948bfae5ce1d45bd3f1 SP:Homo Sapiens
+@SQ SN:GL000240.1 LN:41933 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:445a86173da9f237d7bcf41c6cb8cc62 SP:Homo Sapiens
+@SQ SN:GL000236.1 LN:41934 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdcd739913efa1fdc64b6c0cd7016779 SP:Homo Sapiens
+@SQ SN:GL000241.1 LN:42152 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ef4258cdc5a45c206cea8fc3e1d858cf SP:Homo Sapiens
+@SQ SN:GL000243.1 LN:43341 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:cc34279a7e353136741c9fce79bc4396 SP:Homo Sapiens
+@SQ SN:GL000242.1 LN:43523 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2f8694fc47576bc81b5fe9e7de0ba49e SP:Homo Sapiens
+@SQ SN:GL000230.1 LN:43691 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b4eb71ee878d3706246b7c1dbef69299 SP:Homo Sapiens
+@SQ SN:GL000237.1 LN:45867 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e0c82e7751df73f4f6d0ed30cdc853c0 SP:Homo Sapiens
+@SQ SN:GL000233.1 LN:45941 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7fed60298a8d62ff808b74b6ce820001 SP:Homo Sapiens
+@SQ SN:GL000204.1 LN:81310 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:efc49c871536fa8d79cb0a06fa739722 SP:Homo Sapiens
+@SQ SN:GL000198.1 LN:90085 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:868e7784040da90d900d2d1b667a1383 SP:Homo Sapiens
+@SQ SN:GL000208.1 LN:92689 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:aa81be49bf3fe63a79bdc6a6f279abf6 SP:Homo Sapiens
+@SQ SN:GL000191.1 LN:106433 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d75b436f50a8214ee9c2a51d30b2c2cc SP:Homo Sapiens
+@SQ SN:GL000227.1 LN:128374 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a4aead23f8053f2655e468bcc6ecdceb SP:Homo Sapiens
+@SQ SN:GL000228.1 LN:129120 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c5a17c97e2c1a0b6a9cc5a6b064b714f SP:Homo Sapiens
+@SQ SN:GL000214.1 LN:137718 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:46c2032c37f2ed899eb41c0473319a69 SP:Homo Sapiens
+@SQ SN:GL000221.1 LN:155397 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3238fb74ea87ae857f9c7508d315babb SP:Homo Sapiens
+@SQ SN:GL000209.1 LN:159169 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f40598e2a5a6b26e84a3775e0d1e2c81 SP:Homo Sapiens
+@SQ SN:GL000218.1 LN:161147 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d708b54644c26c7e01c2dad5426d38c SP:Homo Sapiens
+@SQ SN:GL000220.1 LN:161802 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc35de963c57bf7648429e6454f1c9db SP:Homo Sapiens
+@SQ SN:GL000213.1 LN:164239 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:9d424fdcc98866650b58f004080a992a SP:Homo Sapiens
+@SQ SN:GL000211.1 LN:166566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7daaa45c66b288847b9b32b964e623d3 SP:Homo Sapiens
+@SQ SN:GL000199.1 LN:169874 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:569af3b73522fab4b40995ae4944e78e SP:Homo Sapiens
+@SQ SN:GL000217.1 LN:172149 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6d243e18dea1945fb7f2517615b8f52e SP:Homo Sapiens
+@SQ SN:GL000216.1 LN:172294 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:642a232d91c486ac339263820aef7fe0 SP:Homo Sapiens
+@SQ SN:GL000215.1 LN:172545 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5eb3b418480ae67a997957c909375a73 SP:Homo Sapiens
+@SQ SN:GL000205.1 LN:174588 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d22441398d99caf673e9afb9a1908ec5 SP:Homo Sapiens
+@SQ SN:GL000219.1 LN:179198 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f977edd13bac459cb2ed4a5457dba1b3 SP:Homo Sapiens
+@SQ SN:GL000224.1 LN:179693 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d5b2fc04f6b41b212a4198a07f450e20 SP:Homo Sapiens
+@SQ SN:GL000223.1 LN:180455 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:399dfa03bf32022ab52a846f7ca35b30 SP:Homo Sapiens
+@SQ SN:GL000195.1 LN:182896 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5d9ec007868d517e73543b005ba48535 SP:Homo Sapiens
+@SQ SN:GL000212.1 LN:186858 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:563531689f3dbd691331fd6c5730a88b SP:Homo Sapiens
+@SQ SN:GL000222.1 LN:186861 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6fe9abac455169f50470f5a6b01d0f59 SP:Homo Sapiens
+@SQ SN:GL000200.1 LN:187035 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:75e4c8d17cd4addf3917d1703cacaf25 SP:Homo Sapiens
+@SQ SN:GL000193.1 LN:189789 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dbb6e8ece0b5de29da56601613007c2a SP:Homo Sapiens
+@SQ SN:GL000194.1 LN:191469 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6ac8f815bf8e845bb3031b73f812c012 SP:Homo Sapiens
+@SQ SN:GL000225.1 LN:211173 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:63945c3e6962f28ffd469719a747e73c SP:Homo Sapiens
+@SQ SN:GL000192.1 LN:547496 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:325ba9e808f669dfeee210fdd7b470ac SP:Homo Sapiens
+@SQ SN:NC_007605 LN:171823 AS:NC_007605.1 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6743bd63b3ff2b5b8985d8933c53290a SP:Epstein-Barr virus
+#CHROMOSOME POSITION NAME MAJOR_ALLELE MINOR_ALLELE MAF ANCHOR_SNP PANELS
+1 14804874 rs7555566 A G 0.223794
+3 17077268 rs17272796 C T 0.623026
+4 57194525 rs6834736 C G 0.512884
+5 156355375 rs2862058 A G 0.349127
+7 4490854 rs314605 C T 0.786367
diff --git a/testdata/picard/fingerprint/NA12891.fp.vcf b/testdata/picard/fingerprint/NA12891.fp.vcf
new file mode 100755
index 0000000..bdf117e
--- /dev/null
+++ b/testdata/picard/fingerprint/NA12891.fp.vcf
@@ -0,0 +1,101 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##GATKCommandLine.SelectVariants=<ID=SelectVariants,Version=nightly-2015-07-31-g3c929b0,Date="Wed Sep 14 12:14:59 EDT 2016",Epoch=1473869699168,CommandLineOptions="analysis_type=SelectVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] disable_read_filter=[] intervals=[Fingerprint.test.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sa [...]
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=MT,length=16569>
+##contig=<ID=GL000207.1,length=4262>
+##contig=<ID=GL000226.1,length=15008>
+##contig=<ID=GL000229.1,length=19913>
+##contig=<ID=GL000231.1,length=27386>
+##contig=<ID=GL000210.1,length=27682>
+##contig=<ID=GL000239.1,length=33824>
+##contig=<ID=GL000235.1,length=34474>
+##contig=<ID=GL000201.1,length=36148>
+##contig=<ID=GL000247.1,length=36422>
+##contig=<ID=GL000245.1,length=36651>
+##contig=<ID=GL000197.1,length=37175>
+##contig=<ID=GL000203.1,length=37498>
+##contig=<ID=GL000246.1,length=38154>
+##contig=<ID=GL000249.1,length=38502>
+##contig=<ID=GL000196.1,length=38914>
+##contig=<ID=GL000248.1,length=39786>
+##contig=<ID=GL000244.1,length=39929>
+##contig=<ID=GL000238.1,length=39939>
+##contig=<ID=GL000202.1,length=40103>
+##contig=<ID=GL000234.1,length=40531>
+##contig=<ID=GL000232.1,length=40652>
+##contig=<ID=GL000206.1,length=41001>
+##contig=<ID=GL000240.1,length=41933>
+##contig=<ID=GL000236.1,length=41934>
+##contig=<ID=GL000241.1,length=42152>
+##contig=<ID=GL000243.1,length=43341>
+##contig=<ID=GL000242.1,length=43523>
+##contig=<ID=GL000230.1,length=43691>
+##contig=<ID=GL000237.1,length=45867>
+##contig=<ID=GL000233.1,length=45941>
+##contig=<ID=GL000204.1,length=81310>
+##contig=<ID=GL000198.1,length=90085>
+##contig=<ID=GL000208.1,length=92689>
+##contig=<ID=GL000191.1,length=106433>
+##contig=<ID=GL000227.1,length=128374>
+##contig=<ID=GL000228.1,length=129120>
+##contig=<ID=GL000214.1,length=137718>
+##contig=<ID=GL000221.1,length=155397>
+##contig=<ID=GL000209.1,length=159169>
+##contig=<ID=GL000218.1,length=161147>
+##contig=<ID=GL000220.1,length=161802>
+##contig=<ID=GL000213.1,length=164239>
+##contig=<ID=GL000211.1,length=166566>
+##contig=<ID=GL000199.1,length=169874>
+##contig=<ID=GL000217.1,length=172149>
+##contig=<ID=GL000216.1,length=172294>
+##contig=<ID=GL000215.1,length=172545>
+##contig=<ID=GL000205.1,length=174588>
+##contig=<ID=GL000219.1,length=179198>
+##contig=<ID=GL000224.1,length=179693>
+##contig=<ID=GL000223.1,length=180455>
+##contig=<ID=GL000195.1,length=182896>
+##contig=<ID=GL000212.1,length=186858>
+##contig=<ID=GL000222.1,length=186861>
+##contig=<ID=GL000200.1,length=187035>
+##contig=<ID=GL000193.1,length=189789>
+##contig=<ID=GL000194.1,length=191469>
+##contig=<ID=GL000225.1,length=211173>
+##contig=<ID=GL000192.1,length=547496>
+##contig=<ID=NC_007605,length=171823>
+##fileDate=Wed Sep 14 11:47:53 EDT 2016
+##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+##source=SelectVariants
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12891
+1 14804874 rs7555566 A G . PASS . GT 0/0
+3 17077268 rs17272796 C T . PASS . GT 0/1
+4 57194525 rs6834736 C G . PASS . GT 1/1
+5 156355375 rs2862058 A G . PASS . GT 0/0
+7 4490854 rs314605 C T . PASS . GT 0/1
diff --git a/testdata/picard/fingerprint/NA12891.vcf b/testdata/picard/fingerprint/NA12891.vcf
new file mode 100755
index 0000000..5dc85b8
--- /dev/null
+++ b/testdata/picard/fingerprint/NA12891.vcf
@@ -0,0 +1,101 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=IGC,Number=1,Type=Float,Description="Illumina GenCall Confidence Score">
+##GATKCommandLine.SelectVariants=<ID=SelectVariants,Version=nightly-2015-07-31-g3c929b0,Date="Wed Sep 14 12:13:49 EDT 2016",Epoch=1473869629940,CommandLineOptions="analysis_type=SelectVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] disable_read_filter=[] intervals=[Fingerprint.test.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sa [...]
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=MT,length=16569>
+##contig=<ID=GL000207.1,length=4262>
+##contig=<ID=GL000226.1,length=15008>
+##contig=<ID=GL000229.1,length=19913>
+##contig=<ID=GL000231.1,length=27386>
+##contig=<ID=GL000210.1,length=27682>
+##contig=<ID=GL000239.1,length=33824>
+##contig=<ID=GL000235.1,length=34474>
+##contig=<ID=GL000201.1,length=36148>
+##contig=<ID=GL000247.1,length=36422>
+##contig=<ID=GL000245.1,length=36651>
+##contig=<ID=GL000197.1,length=37175>
+##contig=<ID=GL000203.1,length=37498>
+##contig=<ID=GL000246.1,length=38154>
+##contig=<ID=GL000249.1,length=38502>
+##contig=<ID=GL000196.1,length=38914>
+##contig=<ID=GL000248.1,length=39786>
+##contig=<ID=GL000244.1,length=39929>
+##contig=<ID=GL000238.1,length=39939>
+##contig=<ID=GL000202.1,length=40103>
+##contig=<ID=GL000234.1,length=40531>
+##contig=<ID=GL000232.1,length=40652>
+##contig=<ID=GL000206.1,length=41001>
+##contig=<ID=GL000240.1,length=41933>
+##contig=<ID=GL000236.1,length=41934>
+##contig=<ID=GL000241.1,length=42152>
+##contig=<ID=GL000243.1,length=43341>
+##contig=<ID=GL000242.1,length=43523>
+##contig=<ID=GL000230.1,length=43691>
+##contig=<ID=GL000237.1,length=45867>
+##contig=<ID=GL000233.1,length=45941>
+##contig=<ID=GL000204.1,length=81310>
+##contig=<ID=GL000198.1,length=90085>
+##contig=<ID=GL000208.1,length=92689>
+##contig=<ID=GL000191.1,length=106433>
+##contig=<ID=GL000227.1,length=128374>
+##contig=<ID=GL000228.1,length=129120>
+##contig=<ID=GL000214.1,length=137718>
+##contig=<ID=GL000221.1,length=155397>
+##contig=<ID=GL000209.1,length=159169>
+##contig=<ID=GL000218.1,length=161147>
+##contig=<ID=GL000220.1,length=161802>
+##contig=<ID=GL000213.1,length=164239>
+##contig=<ID=GL000211.1,length=166566>
+##contig=<ID=GL000199.1,length=169874>
+##contig=<ID=GL000217.1,length=172149>
+##contig=<ID=GL000216.1,length=172294>
+##contig=<ID=GL000215.1,length=172545>
+##contig=<ID=GL000205.1,length=174588>
+##contig=<ID=GL000219.1,length=179198>
+##contig=<ID=GL000224.1,length=179693>
+##contig=<ID=GL000223.1,length=180455>
+##contig=<ID=GL000195.1,length=182896>
+##contig=<ID=GL000212.1,length=186858>
+##contig=<ID=GL000222.1,length=186861>
+##contig=<ID=GL000200.1,length=187035>
+##contig=<ID=GL000193.1,length=189789>
+##contig=<ID=GL000194.1,length=191469>
+##contig=<ID=GL000225.1,length=211173>
+##contig=<ID=GL000192.1,length=547496>
+##contig=<ID=NC_007605,length=171823>
+##date created=Wed Sep 14 14:40:30 UTC 2016
+##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+##source=BPM file
+##source=SelectVariants
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12891
+1 14804874 exm-rs7555566 A G . . AC=0;AF=0.00;AN=2 GT:IGC 0/0:0.8139954
+3 17077268 exm-rs17272796 C T . . AC=1;AF=0.500;AN=2 GT:IGC 1/0:0.8534497
+7 4490854 rs314605 C T . . AC=1;AF=0.500;AN=2 GT:IGC 1/0:0.9196903
diff --git a/testdata/picard/fingerprint/NA12892.fp.vcf b/testdata/picard/fingerprint/NA12892.fp.vcf
new file mode 100755
index 0000000..b226128
--- /dev/null
+++ b/testdata/picard/fingerprint/NA12892.fp.vcf
@@ -0,0 +1,101 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##GATKCommandLine.SelectVariants=<ID=SelectVariants,Version=nightly-2015-07-31-g3c929b0,Date="Wed Sep 14 12:19:11 EDT 2016",Epoch=1473869951892,CommandLineOptions="analysis_type=SelectVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] disable_read_filter=[] intervals=[Fingerprint.test.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sa [...]
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=MT,length=16569>
+##contig=<ID=GL000207.1,length=4262>
+##contig=<ID=GL000226.1,length=15008>
+##contig=<ID=GL000229.1,length=19913>
+##contig=<ID=GL000231.1,length=27386>
+##contig=<ID=GL000210.1,length=27682>
+##contig=<ID=GL000239.1,length=33824>
+##contig=<ID=GL000235.1,length=34474>
+##contig=<ID=GL000201.1,length=36148>
+##contig=<ID=GL000247.1,length=36422>
+##contig=<ID=GL000245.1,length=36651>
+##contig=<ID=GL000197.1,length=37175>
+##contig=<ID=GL000203.1,length=37498>
+##contig=<ID=GL000246.1,length=38154>
+##contig=<ID=GL000249.1,length=38502>
+##contig=<ID=GL000196.1,length=38914>
+##contig=<ID=GL000248.1,length=39786>
+##contig=<ID=GL000244.1,length=39929>
+##contig=<ID=GL000238.1,length=39939>
+##contig=<ID=GL000202.1,length=40103>
+##contig=<ID=GL000234.1,length=40531>
+##contig=<ID=GL000232.1,length=40652>
+##contig=<ID=GL000206.1,length=41001>
+##contig=<ID=GL000240.1,length=41933>
+##contig=<ID=GL000236.1,length=41934>
+##contig=<ID=GL000241.1,length=42152>
+##contig=<ID=GL000243.1,length=43341>
+##contig=<ID=GL000242.1,length=43523>
+##contig=<ID=GL000230.1,length=43691>
+##contig=<ID=GL000237.1,length=45867>
+##contig=<ID=GL000233.1,length=45941>
+##contig=<ID=GL000204.1,length=81310>
+##contig=<ID=GL000198.1,length=90085>
+##contig=<ID=GL000208.1,length=92689>
+##contig=<ID=GL000191.1,length=106433>
+##contig=<ID=GL000227.1,length=128374>
+##contig=<ID=GL000228.1,length=129120>
+##contig=<ID=GL000214.1,length=137718>
+##contig=<ID=GL000221.1,length=155397>
+##contig=<ID=GL000209.1,length=159169>
+##contig=<ID=GL000218.1,length=161147>
+##contig=<ID=GL000220.1,length=161802>
+##contig=<ID=GL000213.1,length=164239>
+##contig=<ID=GL000211.1,length=166566>
+##contig=<ID=GL000199.1,length=169874>
+##contig=<ID=GL000217.1,length=172149>
+##contig=<ID=GL000216.1,length=172294>
+##contig=<ID=GL000215.1,length=172545>
+##contig=<ID=GL000205.1,length=174588>
+##contig=<ID=GL000219.1,length=179198>
+##contig=<ID=GL000224.1,length=179693>
+##contig=<ID=GL000223.1,length=180455>
+##contig=<ID=GL000195.1,length=182896>
+##contig=<ID=GL000212.1,length=186858>
+##contig=<ID=GL000222.1,length=186861>
+##contig=<ID=GL000200.1,length=187035>
+##contig=<ID=GL000193.1,length=189789>
+##contig=<ID=GL000194.1,length=191469>
+##contig=<ID=GL000225.1,length=211173>
+##contig=<ID=GL000192.1,length=547496>
+##contig=<ID=NC_007605,length=171823>
+##fileDate=Wed Sep 14 12:02:21 EDT 2016
+##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+##source=SelectVariants
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12892
+1 14804874 rs7555566 A G . PASS . GT 0/1
+3 17077268 rs17272796 C T . PASS . GT 1/1
+4 57194525 rs6834736 C G . PASS . GT 0/1
+5 156355375 rs2862058 A G . PASS . GT 0/1
+7 4490854 rs314605 C T . PASS . GT 1/1
diff --git a/testdata/picard/fingerprint/NA12892.vcf b/testdata/picard/fingerprint/NA12892.vcf
new file mode 100755
index 0000000..7189fdd
--- /dev/null
+++ b/testdata/picard/fingerprint/NA12892.vcf
@@ -0,0 +1,101 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=IGC,Number=1,Type=Float,Description="Illumina GenCall Confidence Score">
+##GATKCommandLine.SelectVariants=<ID=SelectVariants,Version=nightly-2015-07-31-g3c929b0,Date="Wed Sep 14 12:18:26 EDT 2016",Epoch=1473869906846,CommandLineOptions="analysis_type=SelectVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] disable_read_filter=[] intervals=[Fingerprint.test.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sa [...]
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=MT,length=16569>
+##contig=<ID=GL000207.1,length=4262>
+##contig=<ID=GL000226.1,length=15008>
+##contig=<ID=GL000229.1,length=19913>
+##contig=<ID=GL000231.1,length=27386>
+##contig=<ID=GL000210.1,length=27682>
+##contig=<ID=GL000239.1,length=33824>
+##contig=<ID=GL000235.1,length=34474>
+##contig=<ID=GL000201.1,length=36148>
+##contig=<ID=GL000247.1,length=36422>
+##contig=<ID=GL000245.1,length=36651>
+##contig=<ID=GL000197.1,length=37175>
+##contig=<ID=GL000203.1,length=37498>
+##contig=<ID=GL000246.1,length=38154>
+##contig=<ID=GL000249.1,length=38502>
+##contig=<ID=GL000196.1,length=38914>
+##contig=<ID=GL000248.1,length=39786>
+##contig=<ID=GL000244.1,length=39929>
+##contig=<ID=GL000238.1,length=39939>
+##contig=<ID=GL000202.1,length=40103>
+##contig=<ID=GL000234.1,length=40531>
+##contig=<ID=GL000232.1,length=40652>
+##contig=<ID=GL000206.1,length=41001>
+##contig=<ID=GL000240.1,length=41933>
+##contig=<ID=GL000236.1,length=41934>
+##contig=<ID=GL000241.1,length=42152>
+##contig=<ID=GL000243.1,length=43341>
+##contig=<ID=GL000242.1,length=43523>
+##contig=<ID=GL000230.1,length=43691>
+##contig=<ID=GL000237.1,length=45867>
+##contig=<ID=GL000233.1,length=45941>
+##contig=<ID=GL000204.1,length=81310>
+##contig=<ID=GL000198.1,length=90085>
+##contig=<ID=GL000208.1,length=92689>
+##contig=<ID=GL000191.1,length=106433>
+##contig=<ID=GL000227.1,length=128374>
+##contig=<ID=GL000228.1,length=129120>
+##contig=<ID=GL000214.1,length=137718>
+##contig=<ID=GL000221.1,length=155397>
+##contig=<ID=GL000209.1,length=159169>
+##contig=<ID=GL000218.1,length=161147>
+##contig=<ID=GL000220.1,length=161802>
+##contig=<ID=GL000213.1,length=164239>
+##contig=<ID=GL000211.1,length=166566>
+##contig=<ID=GL000199.1,length=169874>
+##contig=<ID=GL000217.1,length=172149>
+##contig=<ID=GL000216.1,length=172294>
+##contig=<ID=GL000215.1,length=172545>
+##contig=<ID=GL000205.1,length=174588>
+##contig=<ID=GL000219.1,length=179198>
+##contig=<ID=GL000224.1,length=179693>
+##contig=<ID=GL000223.1,length=180455>
+##contig=<ID=GL000195.1,length=182896>
+##contig=<ID=GL000212.1,length=186858>
+##contig=<ID=GL000222.1,length=186861>
+##contig=<ID=GL000200.1,length=187035>
+##contig=<ID=GL000193.1,length=189789>
+##contig=<ID=GL000194.1,length=191469>
+##contig=<ID=GL000225.1,length=211173>
+##contig=<ID=GL000192.1,length=547496>
+##contig=<ID=NC_007605,length=171823>
+##date created=Wed Sep 14 14:48:11 UTC 2016
+##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+##source=BPM file
+##source=SelectVariants
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12892
+1 14804874 exm-rs7555566 A G . . AC=1;AF=0.500;AN=2 GT:IGC 0/1:0.8139954
+3 17077268 exm-rs17272796 C T . . AC=2;AF=1.00;AN=2 GT:IGC 1/1:0.8534497
+7 4490854 rs314605 C T . . AC=2;AF=1.00;AN=2 GT:IGC 1/1:0.9196903
diff --git a/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.sam b/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.sam
index 70df153..52bb4fb 100644
--- a/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.sam
+++ b/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.sam
@@ -1,12 +1,12 @@
@HD VN:1.0 SO:queryname
@SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734
-FF:80:175 65 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9
-FF:80:175 129 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD
-FR_clip:100:155 97 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9
-FR_clip:100:155 145 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD
-FR_noclip:100:575 97 chr1 100 20 76M chr1 500 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9
-FR_noclip:100:575 145 chr1 500 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD
-RF:100:575 81 chr1 100 20 76M chr1 500 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9
-RF:100:575 161 chr1 500 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD
-RR:80:175 113 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9
-RR:80:175 177 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD
+FF:80:175 65 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
+FF:80:175 129 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
+FR_clip:100:155:56:0 97 chr1 100 20 76M chr1 80 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
+FR_clip:100:155:56:0 145 chr1 80 20 76M chr1 100 0 TTAGAGTACGTTAACACTCCATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAA GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
+FR_noclip:100:575:75C0:1 97 chr1 100 20 76M chr1 500 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAC ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
+FR_noclip:100:575:75C0:1 145 chr1 500 20 76M chr1 100 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
+RF:100:575 81 chr1 100 20 76M chr1 500 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
+RF:100:575 161 chr1 500 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
+RR:80:175 113 chr1 100 20 76M chr1 80 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 MD:Z:76 NM:i:0
+RR:80:175 177 chr1 80 20 76M chr1 100 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD MD:Z:76 NM:i:0
\ No newline at end of file
diff --git a/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.sam b/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.sam
index d18395d..b9aca0b 100644
--- a/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.sam
+++ b/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.sam
@@ -2,10 +2,10 @@
@RG ID:0 SM:Hi,Mom! PL:ILLUMINA
FF:80:175 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
FF:80:175 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
-FR_clip:100:155 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
-FR_clip:100:155 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
-FR_noclip:100:575 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
-FR_noclip:100:575 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
+FR_clip:100:155:56:0 77 * 0 0 * * 0 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
+FR_clip:100:155:56:0 141 * 0 0 * * 0 0 TTAAAGGTTTGTTAATATTTGCATCTGTACGATCGTAAGAGGGCTTCAGCATGAATGGAGTGTTAACGTACTCTAA GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
+FR_noclip:100:575:75C0:1 77 * 0 0 * * 0 0 ATTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACCTTTAAGGGCAAAAAAAAAACAATAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
+FR_noclip:100:575:75C0:1 141 * 0 0 * * 0 0 CTATTGTTTTTTTTTTGCCCTTAAAGGTTTGTTAATATTTGCATCTGTACGATCGTAAGAGGGCTTCAGCATGAAT GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
RF:100:575 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
RF:100:575 141 * 0 0 * * 0 0 TCGACTCTAGAGGATCCCACGAGTTTCACTGTTGTCACATATGCTGGAGTGCAGTGGTGCAATCTTGGCTTACTGC GGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGFGGFGGDGEEE-:CECEECFEDE?FBCFD=@CBBADCF=CC:CCD RG:Z:0
RR:80:175 77 * 0 0 * * 0 0 GTACCCGGGGATCCCACTCTCTCCTTGCCCTTCATGATCTTGGCACTTTCGGAGTAATGGTCATAACATCAGTAAA ECEEEEEDDCCCDDDCDDDDEBDCCCCDBBD@DBCBCCCC:ACAA?CBCCCABCBBBBBBB?BBB?<?A?<7<<=9 RG:Z:0
diff --git a/testdata/picard/sam/MergeBamAlignment/contam.expected.sam b/testdata/picard/sam/MergeBamAlignment/contam.expected.sam
index 607579d..3e28922 100644
--- a/testdata/picard/sam/MergeBamAlignment/contam.expected.sam
+++ b/testdata/picard/sam/MergeBamAlignment/contam.expected.sam
@@ -3,11 +3,11 @@
@RG ID:0 SM:Hi,Mom! PL:ILLUMINA
@PG ID:0 VN:1.0 CL:align! PN:myAligner
frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
-frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
-frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
+frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
+frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
-frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
-frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
+frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0
+frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240
r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination
diff --git a/testdata/picard/sam/MergeBamAlignment/removetags.aligned.sam b/testdata/picard/sam/MergeBamAlignment/removetags.aligned.sam
new file mode 100644
index 0000000..ac68d72
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/removetags.aligned.sam
@@ -0,0 +1,10 @@
+@HD VN:1.0 SO:queryname
+@SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734
+CLIPPED:0:1 97 chr1 52 20 48M chr1 51 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:48 NM:i:0
+CLIPPED:0:1 145 chr1 51 20 49M chr1 52 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
+CLIPPED:1:0 97 chr1 52 20 49M chr1 52 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
+CLIPPED:1:0 145 chr1 52 20 48M chr1 52 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:48 NM:i:0
+CLIPPED:1:1 97 chr1 52 20 49M chr1 51 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
+CLIPPED:1:1 145 chr1 51 20 49M chr1 52 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
+UNCLIPPED 65 chr1 52 20 49M chr1 51 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
+UNCLIPPED 129 chr1 51 20 49M chr1 52 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII MD:Z:49 NM:i:0
diff --git a/testdata/picard/sam/MergeBamAlignment/removetags.dict b/testdata/picard/sam/MergeBamAlignment/removetags.dict
new file mode 100644
index 0000000..3b3f5d9
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/removetags.dict
@@ -0,0 +1,2 @@
+@HD VN:1.5 SO:unsorted
+@SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734
diff --git a/testdata/picard/sam/MergeBamAlignment/removetags.fasta b/testdata/picard/sam/MergeBamAlignment/removetags.fasta
new file mode 100644
index 0000000..5eccdcd
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/removetags.fasta
@@ -0,0 +1,21 @@
+>chr1
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
+TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
diff --git a/testdata/picard/sam/MergeBamAlignment/removetags.fasta.fai b/testdata/picard/sam/MergeBamAlignment/removetags.fasta.fai
new file mode 100644
index 0000000..1e6a48d
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/removetags.fasta.fai
@@ -0,0 +1 @@
+chr1 1000 6 50 51
diff --git a/testdata/picard/sam/MergeBamAlignment/removetags.unmapped.sam b/testdata/picard/sam/MergeBamAlignment/removetags.unmapped.sam
new file mode 100644
index 0000000..e530b21
--- /dev/null
+++ b/testdata/picard/sam/MergeBamAlignment/removetags.unmapped.sam
@@ -0,0 +1,10 @@
+@HD VN:1.0 SO:queryname
+@RG ID:0 SM:Hi,Mom! PL:ILLUMINA
+CLIPPED:0:1 77 * 0 0 * * 0 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+CLIPPED:0:1 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+CLIPPED:1:0 77 * 0 0 * * 0 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+CLIPPED:1:0 141 * 0 0 * * 0 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+CLIPPED:1:1 77 * 0 0 * * 0 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+CLIPPED:1:1 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+UNCLIPPED 77 * 0 0 * * 0 0 TTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
+UNCLIPPED 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:0
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/picard-tools.git
More information about the debian-med-commit
mailing list