[med-svn] [picard-tools] 03/04: Imported Upstream version 1.108
Charles Plessy
plessy at moszumanska.debian.org
Sat Mar 22 02:05:04 UTC 2014
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch upstream
in repository picard-tools.
commit 5430f798ca989824887faa0aaecee3d53bbbfe54
Author: Charles Plessy <plessy at debian.org>
Date: Sat Mar 22 10:46:21 2014 +0900
Imported Upstream version 1.108
---
Picard-public.ipr | 2 +
build.xml | 3 +-
src/java/net/sf/picard/fastq/BasicFastqWriter.java | 4 +-
.../sf/picard/illumina/MarkIlluminaAdapters.java | 204 +++++++-------
.../net/sf/picard/illumina/parser/BclParser.java | 16 +-
.../parser/ClusterIntensityFileReader.java | 2 +-
.../illumina/parser/CycleIlluminaFileMap.java | 59 ++--
.../parser/IlluminaDataProviderFactory.java | 69 +++--
.../picard/illumina/parser/IlluminaFileUtil.java | 212 ++++++++++++++-
.../illumina/parser/IlluminaIntensityParser.java | 2 +-
.../parser/MultiTileBclCycleFilesIterator.java} | 45 +--
.../picard/illumina/parser/MultiTileBclParser.java | 111 ++++++++
.../illumina/parser/MultiTileFilterParser.java} | 59 ++--
.../illumina/parser/MultiTileLocsParser.java | 77 ++++++
.../sf/picard/illumina/parser/MultiTileParser.java | 128 +++++++++
.../sf/picard/illumina/parser/PerTileParser.java | 6 +-
.../illumina/parser/PerTilePerCycleParser.java | 5 +-
.../net/sf/picard/illumina/parser/TileIndex.java | 153 +++++++++++
.../AbstractIlluminaPositionFileReader.java | 32 ++-
.../illumina/parser/readers/BclIndexReader.java | 75 +++++
.../picard/illumina/parser/readers/BclReader.java | 30 +-
.../illumina/parser/readers/FilterFileReader.java | 4 +
.../illumina/parser/readers/LocsFileReader.java | 16 +-
.../parser/readers/MMapBackedIteratorFactory.java | 20 ++
src/java/net/sf/picard/io/IoUtil.java | 23 +-
src/java/net/sf/picard/sam/MergeBamAlignment.java | 10 +-
src/java/net/sf/picard/sam/RevertSam.java | 218 ++++++++++++---
src/java/net/sf/picard/sam/SamAlignmentMerger.java | 50 ++--
src/java/net/sf/picard/sam/SamToFastq.java | 301 +++++++++++++--------
src/java/net/sf/picard/util/IntervalListTools.java | 2 +-
.../sf/picard/util/ListMap.java} | 51 ++--
src/java/net/sf/picard/util/SamLocusIterator.java | 2 +
src/java/net/sf/picard/vcf/MakeSitesOnlyVcf.java | 28 +-
src/java/net/sf/picard/vcf/VcfFormatConverter.java | 8 +-
.../samtools/util/BlockCompressedInputStream.java | 15 +-
src/java/net/sf/samtools/util/Lazy.java | 4 +
src/java/org/broad/tribble/util/TabixUtils.java | 50 ++++
.../org/broadinstitute/variant/bcf2/BCF2Utils.java | 5 +
.../variant/variantcontext/VariantContext.java | 30 +-
.../writer/AsyncVariantContextWriter.java | 49 ++++
.../variant/variantcontext/writer/Options.java | 3 +-
.../writer/VariantContextWriterFactory.java | 164 ++++++++---
.../variant/vcf/AbstractVCFCodec.java | 5 +-
.../broadinstitute/variant/vcf/VCFFileReader.java | 2 +-
.../illumina/parser/IlluminaFileUtilTest.java | 36 ++-
.../illumina/parser/PerTilePerCycleParserTest.java | 2 +-
.../AbstractIlluminaPositionFileReaderTest.java | 4 +-
.../reference/ReferenceSequenceFileWalkerTest.java | 60 ++++
.../net/sf/picard/sam/MergeBamAlignmentTest.java | 2 +-
.../variantcontext/VariantContextUnitTest.java | 41 ++-
50 files changed, 1931 insertions(+), 568 deletions(-)
diff --git a/Picard-public.ipr b/Picard-public.ipr
index 1cc83ca..19d8680 100644
--- a/Picard-public.ipr
+++ b/Picard-public.ipr
@@ -91,6 +91,8 @@
<inspection_tool class="LocalCanBeFinal" enabled="true" level="WARNING" enabled_by_default="true">
<option name="REPORT_VARIABLES" value="true" />
<option name="REPORT_PARAMETERS" value="true" />
+ <option name="REPORT_CATCH_PARAMETERS" value="true" />
+ <option name="REPORT_FOREACH_PARAMETERS" value="true" />
</inspection_tool>
<inspection_tool class="SqlNoDataSourceInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="UnusedDeclaration" enabled="false" level="WARNING" enabled_by_default="false">
diff --git a/build.xml b/build.xml
index 473f51b..f88bf7b 100755
--- a/build.xml
+++ b/build.xml
@@ -43,7 +43,7 @@
<!-- Get SVN revision, if available, otherwise leave it blank. -->
<exec executable="svnversion" outputproperty="repository.revision" failifexecutionfails="false"/>
<property name="repository.revision" value=""/>
- <property name="sam-version" value="1.107"/>
+ <property name="sam-version" value="1.108"/>
<property name="picard-version" value="${sam-version}"/>
<property name="tribble-version" value="${sam-version}"/>
<property name="variant-version" value="${sam-version}"/>
@@ -66,6 +66,7 @@
</target>
<target name="set_excluded_test_groups" depends="set_excluded_test_groups_unix,set_excluded_test_groups_non_unix"/>
+
<!-- INIT -->
<target name="init">
<path id="classpath">
diff --git a/src/java/net/sf/picard/fastq/BasicFastqWriter.java b/src/java/net/sf/picard/fastq/BasicFastqWriter.java
index 07ee661..3e6cf52 100644
--- a/src/java/net/sf/picard/fastq/BasicFastqWriter.java
+++ b/src/java/net/sf/picard/fastq/BasicFastqWriter.java
@@ -25,7 +25,9 @@ package net.sf.picard.fastq;
import net.sf.picard.PicardException;
import net.sf.picard.io.IoUtil;
+import net.sf.samtools.Defaults;
+import java.io.BufferedOutputStream;
import java.io.File;
import java.io.OutputStream;
import java.io.PrintStream;
@@ -43,7 +45,7 @@ public class BasicFastqWriter implements FastqWriter {
}
public BasicFastqWriter(final File file, final boolean createMd5) {
- this(file, new PrintStream(maybeMd5Wrap(file, createMd5)));
+ this(file, new PrintStream(new BufferedOutputStream(maybeMd5Wrap(file, createMd5), Defaults.BUFFER_SIZE)));
}
private BasicFastqWriter(final File file, final PrintStream writer) {
diff --git a/src/java/net/sf/picard/illumina/MarkIlluminaAdapters.java b/src/java/net/sf/picard/illumina/MarkIlluminaAdapters.java
index ea60091..d5f971d 100644
--- a/src/java/net/sf/picard/illumina/MarkIlluminaAdapters.java
+++ b/src/java/net/sf/picard/illumina/MarkIlluminaAdapters.java
@@ -34,11 +34,14 @@ import net.sf.picard.metrics.MetricsFile;
import net.sf.picard.sam.ReservedTagConstants;
import net.sf.picard.util.*;
import net.sf.samtools.*;
+import net.sf.samtools.util.CollectionUtil;
import net.sf.samtools.util.SequenceUtil;
import net.sf.samtools.util.StringUtil;
+import static net.sf.picard.util.IlluminaUtil.IlluminaAdapterPair;
import java.io.File;
-import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.List;
/**
* Command line program to mark the location of adapter sequences.
@@ -58,54 +61,56 @@ public class MarkIlluminaAdapters extends CommandLineProgram {
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;
+
@Option(doc="If output is not specified, just the metrics are generated",
shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional=true)
public File OUTPUT;
+
@Option(doc="Histogram showing counts of bases_clipped in how many reads", shortName="M")
public File METRICS;
- @Option(doc="The minimum number of bases that must match the adapter that will be clipped. Defaults to " +
- ClippingUtility.MIN_MATCH_PE_BASES + " if paired-end, otherwise" + ClippingUtility.MIN_MATCH_BASES +
- "/nThe stricter match used when matching 2 reads will be twice this.",
- optional=true)
- public Integer MIN_MATCH_BASES;
- @Option(doc="The percentage of errors allowed when matching the adapter sequence. Defaults to " +
- ClippingUtility.MAX_PE_ERROR_RATE + " if paired-end, otherwise " + ClippingUtility.MAX_ERROR_RATE,
- optional=true)
- public Double MAX_ERROR_RATE;
- @Option(doc="Whether this is a paired-end run. ", shortName="PE")
+
+ @Option(doc="The minimum number of bases to match over when clipping single-end reads.")
+ public int MIN_MATCH_BASES_SE = ClippingUtility.MIN_MATCH_BASES;
+
+ @Option(doc="The minimum number of bases to match over (per-read) when clipping paired-end reads.")
+ public int MIN_MATCH_BASES_PE = ClippingUtility.MIN_MATCH_PE_BASES;
+
+ @Option(doc="The maximum mismatch error rate to tolerate when clipping single-end reads.")
+ public double MAX_ERROR_RATE_SE = ClippingUtility.MAX_ERROR_RATE;
+
+ @Option(doc="The maximum mismatch error rate to tolerate when clipping paired-end reads.")
+ public double MAX_ERROR_RATE_PE = ClippingUtility.MAX_PE_ERROR_RATE;
+
+ @Option(doc="DEPRECATED. Whether this is a paired-end run. No longer used.", shortName="PE", optional=true)
public Boolean PAIRED_RUN;
- @Option(doc="Which adapters to use, PAIRED_END, INDEXED, or SINGLE_END",
- mutex={"FIVE_PRIME_ADAPTER", "THREE_PRIME_ADAPTER"})
- // this probably only makes sense for paired_run where you need to specify either PAIRED_END or INDEXED?
- // or for non-paired_run where you need to specify either SINGLE_END or INDEXED?
- // but we won't enforce this.
- public IlluminaUtil.IlluminaAdapterPair ADAPTERS;
-
- @Option(doc="For specifying adapters other than standard Illumina", mutex = {"ADAPTERS"})
+
+ @Option(doc="Which adapters sequences to attempt to identify and clip.")
+ public List<IlluminaAdapterPair> ADAPTERS =
+ CollectionUtil.makeList(IlluminaAdapterPair.INDEXED,
+ IlluminaAdapterPair.DUAL_INDEXED,
+ IlluminaAdapterPair.PAIRED_END
+ );
+
+ @Option(doc="For specifying adapters other than standard Illumina", optional=true)
public String FIVE_PRIME_ADAPTER;
- @Option(doc="For specifying adapters other than standard Illumina", mutex = {"ADAPTERS"})
+ @Option(doc="For specifying adapters other than standard Illumina", optional=true)
public String THREE_PRIME_ADAPTER;
private static final Log log = Log.getInstance(MarkIlluminaAdapters.class);
+ // Stock main method
+ public static void main(final String[] args) {
+ System.exit(new MarkIlluminaAdapters().instanceMain(args));
+ }
+
@Override
protected String[] customCommandLineValidation() {
- // set default thresholds based on what kind of run
- if (PAIRED_RUN){
- if (MIN_MATCH_BASES == null) MIN_MATCH_BASES = ClippingUtility.MIN_MATCH_PE_BASES;
- if (MAX_ERROR_RATE == null) MAX_ERROR_RATE = ClippingUtility.MAX_PE_ERROR_RATE;
- // For paired runs, you may actually want to specify all 4 thresholds
- // so the stricter test when mismatch can be controlled.
- // We'll assume that the stricter test will be twice the min_match_bases
- } else {
- if (MIN_MATCH_BASES == null) MIN_MATCH_BASES = ClippingUtility.MIN_MATCH_BASES;
- if (MAX_ERROR_RATE == null) MAX_ERROR_RATE = ClippingUtility.MAX_ERROR_RATE;
+ if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) {
+ return new String[] {"Either both or neither of THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must be set."};
+ }
+ else {
+ return null;
}
- return null;
- }
-
- public static void main(String[] args) {
- System.exit(new MarkIlluminaAdapters().instanceMain(args));
}
@Override
@@ -113,94 +118,91 @@ public class MarkIlluminaAdapters extends CommandLineProgram {
IoUtil.assertFileIsReadable(INPUT);
IoUtil.assertFileIsWritable(METRICS);
- SAMFileReader in = new SAMFileReader(INPUT);
+ final SAMFileReader in = new SAMFileReader(INPUT);
+ final SAMFileHeader.SortOrder order = in.getFileHeader().getSortOrder();
SAMFileWriter out = null;
if (OUTPUT != null) {
IoUtil.assertFileIsWritable(OUTPUT);
out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
}
- Histogram<Integer> histo = new Histogram<Integer>("clipped_bases", "read_count");
+ final Histogram<Integer> histo = new Histogram<Integer>("clipped_bases", "read_count");
- // check sort order in the header - must be queryName for paired end runs
- if (PAIRED_RUN && !in.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
- throw new PicardException("Input BAM file must be sorted by queryname");
+ // Combine any adapters and custom adapter pairs from the command line into an array for use in clipping
+ final AdapterPair[] adapters;
+ {
+ final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
+ tmp.addAll(ADAPTERS);
+ if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
+ tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
+ }
+ adapters = tmp.toArray(new AdapterPair[tmp.size()]);
}
- final AdapterPair adapters;
- if (ADAPTERS != null) {
- adapters = ADAPTERS;
- } else {
- adapters = new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER);
- }
- // The following loop is roughly the same as "for (SAMRecord rec : in){"
+ ////////////////////////////////////////////////////////////////////////
+ // Main loop that consumes reads, clips them and writes them to the output
+ ////////////////////////////////////////////////////////////////////////
final ProgressLogger progress = new ProgressLogger(log, 1000000, "Read");
- for (Iterator<SAMRecord> iter = in.iterator(); iter.hasNext();) {
- SAMRecord rec = iter.next();
+ final SAMRecordIterator iterator = in.iterator();
- // clear any existing trim on rec
+ while (iterator.hasNext()) {
+ final SAMRecord rec = iterator.next();
+ final SAMRecord rec2 = rec.getReadPairedFlag() && iterator.hasNext() ? iterator.next() : null;
rec.setAttribute(ReservedTagConstants.XT, null);
- SAMRecord rec2 = null;
- if (PAIRED_RUN) {
- if (rec.getFirstOfPairFlag() || rec.getSecondOfPairFlag()) {
- // the secondOfPair should be the next record
- rec2 = iter.hasNext() ? iter.next() : null;
- if (rec2 == null) {
- throw new PicardException("Missing second read for " + rec);
- }
-
- // clear any existing trim on rec2
- rec2.setAttribute(ReservedTagConstants.XT, null);
- if (!rec.getReadName().equals(rec2.getReadName())){
- throw new PicardException("read names of two paired reads differs : " +
- rec.getReadName() + ", " + rec2.getReadName());
- }
-
- // establish which of pair is first and which second
- SAMRecord firstRead;
- SAMRecord secondRead;
- if (rec.getFirstOfPairFlag()){
- firstRead = rec;
- secondRead = rec2;
- } else {
- firstRead = rec2;
- secondRead = rec;
- }
- if (!firstRead.getFirstOfPairFlag()){
- throw new PicardException("first of two reads doesn't have getFirstOfPairFlag()");
- }
- if (!secondRead.getSecondOfPairFlag()){
- throw new PicardException("second of two reads doesn't have getSecondOfPairFlag()");
- }
-
- String warnString = ClippingUtility.adapterTrimIlluminaPairedReads(firstRead, secondRead,
- adapters, MIN_MATCH_BASES, MAX_ERROR_RATE);
- if (warnString != null) {
- log.info("Adapter trimming " + warnString);
- }
- } else {
- throw new PicardException("Non-paired reads in a paired run " + rec);
+ // Do the clipping one way for PE and another for SE reads
+ if (rec.getReadPairedFlag()) {
+ // Assert that the input file is in query name order only if we see some PE reads
+ if (order != SAMFileHeader.SortOrder.queryname) {
+ throw new PicardException("Input BAM file must be sorted by queryname");
}
- } else { // not a paired run
- ClippingUtility.adapterTrimIlluminaSingleRead(rec,
- adapters, MIN_MATCH_BASES, MAX_ERROR_RATE);
- }
- if (out != null) out.addAlignment(rec);
- if (out != null && rec2 != null) out.addAlignment(rec2);
+ if (rec2 == null) throw new PicardException("Missing mate pair for paired read: " + rec.getReadName());
+ rec2.setAttribute(ReservedTagConstants.XT, null);
- Integer trimPoint = rec.getIntegerAttribute(ReservedTagConstants.XT);
- if (trimPoint != null) {
- histo.increment(rec.getReadLength() - trimPoint + 1);
+ // Assert that we did in fact just get two mate pairs
+ if (!rec.getReadName().equals(rec2.getReadName())){
+ throw new PicardException("Adjacent reads expected to be mate-pairs have different names: " +
+ rec.getReadName() + ", " + rec2.getReadName());
+ }
+
+ // establish which of pair is first and which second
+ final SAMRecord first, second;
+
+ if (rec.getFirstOfPairFlag() && rec2.getSecondOfPairFlag()){
+ first = rec;
+ second = rec2;
+ }
+ else if (rec.getSecondOfPairFlag() && rec2.getFirstOfPairFlag()) {
+ first = rec2;
+ second = rec;
+ }
+ else {
+ throw new PicardException("Two reads with same name but not correctly marked as 1st/2nd of pair: " + rec.getReadName());
+ }
+
+ ClippingUtility.adapterTrimIlluminaPairedReads(first, second, MIN_MATCH_BASES_PE, MAX_ERROR_RATE_PE, adapters);
+ }
+ else {
+ ClippingUtility.adapterTrimIlluminaSingleRead(rec, MIN_MATCH_BASES_SE, MAX_ERROR_RATE_SE, adapters);
}
- progress.record(rec);
+ // Then output the records, update progress and metrics
+ for (final SAMRecord r : new SAMRecord[] {rec, rec2}) {
+ if (r != null) {
+ progress.record(r);
+ if (out != null) out.addAlignment(r);
+
+ final Integer clip = rec.getIntegerAttribute(ReservedTagConstants.XT);
+ if (clip != null) histo.increment(rec.getReadLength() - clip + 1);
+ }
+ }
}
if (out != null) out.close();
- MetricsFile<?,Integer> metricsFile = getMetricsFile();
+ // Lastly output the metrics to file
+ final MetricsFile<?,Integer> metricsFile = getMetricsFile();
metricsFile.setHistogram(histo);
metricsFile.write(METRICS);
diff --git a/src/java/net/sf/picard/illumina/parser/BclParser.java b/src/java/net/sf/picard/illumina/parser/BclParser.java
index ef34a2a..22262e9 100644
--- a/src/java/net/sf/picard/illumina/parser/BclParser.java
+++ b/src/java/net/sf/picard/illumina/parser/BclParser.java
@@ -26,6 +26,7 @@ package net.sf.picard.illumina.parser;
import net.sf.picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import net.sf.picard.illumina.parser.readers.BclReader;
+import net.sf.samtools.util.CloseableIterator;
import java.io.File;
import java.util.Collections;
@@ -46,7 +47,7 @@ class BclParser extends PerTilePerCycleParser<BclData>{
private static final Set<IlluminaDataType> SUPPORTED_TYPES = Collections.unmodifiableSet(makeSet(IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores));
- private final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
+ protected final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
private final boolean applyEamssFilter;
public BclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles, final OutputMapping outputMapping, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy) {
@@ -66,17 +67,24 @@ class BclParser extends PerTilePerCycleParser<BclData>{
return new BclData(outputLengths);
}
- /** Create a Bcl parser for an individual cycle and wrap it with the CycleFileParser interaface which populates
+ /**
+ * Allow for overriding in derived classes.
+ */
+ protected CloseableIterator<BclReader.BclValue> makeReader(final File file, final int cycle, final int tileNumber) {
+ return new BclReader(file, bclQualityEvaluationStrategy);
+ }
+
+ /** Create a Bcl parser for an individual cycle and wrap it with the CycleFileParser interface which populates
* the correct cycle in BclData.
* @param file The file to parse
* @param cycle The cycle that file represents
* @return A CycleFileParser that populates a BclData object with data for a single cycle
*/
@Override
- protected CycleFileParser<BclData> makeCycleFileParser(final File file, final int cycle) {
+ protected CycleFileParser<BclData> makeCycleFileParser(final File file, final int cycle, final int tileNumber) {
return new CycleFileParser<BclData>(){
final OutputMapping.TwoDIndex cycleOutputIndex = outputMapping.getOutputIndexForCycle(cycle);
- BclReader reader = new BclReader(file, bclQualityEvaluationStrategy);
+ CloseableIterator<BclReader.BclValue> reader = makeReader(file, cycle, tileNumber);
@Override
public void close() {
diff --git a/src/java/net/sf/picard/illumina/parser/ClusterIntensityFileReader.java b/src/java/net/sf/picard/illumina/parser/ClusterIntensityFileReader.java
index 507b074..2f31cc3 100644
--- a/src/java/net/sf/picard/illumina/parser/ClusterIntensityFileReader.java
+++ b/src/java/net/sf/picard/illumina/parser/ClusterIntensityFileReader.java
@@ -146,7 +146,7 @@ class ClusterIntensityFileReader {
header.firstCycle + "; numCycles=" + header.numCycles);
}
if (cluster < 0 || cluster >= header.numClusters) {
- throw new IllegalArgumentException("Requested cluster (" + cluster + ") number out of range. numClusters=" + header.numClusters);
+ throw new IllegalArgumentException("Requested cluster (" + cluster + ") number out of range. numClustersInTile=" + header.numClusters);
}
final int relativeCycle = cycle - header.firstCycle;
final int position = HEADER_SIZE + relativeCycle * cycleSize + channel.ordinal() * channelSize + cluster * header.elementSize;
diff --git a/src/java/net/sf/picard/illumina/parser/CycleIlluminaFileMap.java b/src/java/net/sf/picard/illumina/parser/CycleIlluminaFileMap.java
index d622b87..a3deae7 100644
--- a/src/java/net/sf/picard/illumina/parser/CycleIlluminaFileMap.java
+++ b/src/java/net/sf/picard/illumina/parser/CycleIlluminaFileMap.java
@@ -66,31 +66,7 @@ class CycleIlluminaFileMap extends TreeMap<Integer, CycleFilesIterator> {
File curFile = null;
for (final int tile : expectedTiles) {
- final CycleFilesIterator cycleFiles = new CycleFilesIterator(get(tile), null); //so as not to expend the iterator
- int total;
- for(total = 0; cycleFiles.hasNext(); total++) {
- if(cycleFiles.getNextCycle() != expectedCycles[total]) {
- if(curFile == null) {
- curFile = cycleFiles.next();
- }
- cycleFiles.reset();
- throw new PicardException("Cycles in iterator(" + remainingCyclesToString(cycleFiles) +
- ") do not match those expected (" + StringUtil.intValuesToString(expectedCycles) +
- ") Last file checked (" + curFile.getAbsolutePath() + ")");
- }
- curFile = cycleFiles.next();
- if(!curFile.exists()) {
- throw new PicardException("Missing cycle file " + curFile.getName() + " in CycledIlluminaFileMap");
- }
- }
- if(total != expectedCycles.length) {
- String message = "Expected tile " + tile + " of CycledIlluminaFileMap to contain " + expectedCycles + " cycles but " + total + " were found!";
-
- if(curFile != null) {
- message += "Check to see if the following bcl exists: " + incrementCycleCount(curFile).getAbsolutePath();
- }
- throw new PicardException(message);
- }
+ get(tile).assertValid(expectedCycles);
}
}
@@ -129,10 +105,10 @@ class CycleIlluminaFileMap extends TreeMap<Integer, CycleFilesIterator> {
* given file extension while lane/tile stay the same. Stop if the next file does not exist.
*/
class CycleFilesIterator implements Iterator<File>, Iterable<File> {
- private final File parentDir;
- private final int lane;
+ protected final File parentDir;
+ protected final int lane;
private final int tile;
- private final String fileExt;
+ protected final String fileExt;
protected final int [] cycles;
protected int nextCycleIndex;
@@ -164,13 +140,23 @@ class CycleIlluminaFileMap extends TreeMap<Integer, CycleFilesIterator> {
throw new NoSuchElementException(summarizeIterator());
}
- final File cycleDir = new File(parentDir, "C" + cycles[nextCycleIndex] + ".1");
- final File curFile = new File(cycleDir, "s_" + lane + "_" + tile + fileExt);
+ final File curFile = getCurrentFile();
nextCycleIndex++;
return curFile;
}
+ /** Allow for overriding */
+ protected File getCurrentFile() {
+ return getFileForCycle(cycles[nextCycleIndex]);
+ }
+
+ /** Allow for overriding */
+ protected File getFileForCycle(final int cycle) {
+ final File cycleDir = new File(parentDir, "C" + cycle + ".1");
+ return new File(cycleDir, "s_" + lane + "_" + tile + fileExt);
+ }
+
public int getNextCycle() {
return cycles[nextCycleIndex];
}
@@ -198,4 +184,17 @@ class CycleIlluminaFileMap extends TreeMap<Integer, CycleFilesIterator> {
public Iterator<File> iterator() {
return this;
}
+
+ public void assertValid(final int[] expectedCycles) {
+ if (!Arrays.equals(cycles, expectedCycles)) {
+ // TODO: Make more informative if necessary
+ throw new PicardException(summarizeIterator() + " does not have expected cycles");
+ }
+ for (final int cycle : expectedCycles) {
+ final File file = getFileForCycle(cycle);
+ if (!file.exists()) {
+ throw new PicardException(file.getAbsolutePath() + " does not exist for " + summarizeIterator());
+ }
+ }
+ }
}
diff --git a/src/java/net/sf/picard/illumina/parser/IlluminaDataProviderFactory.java b/src/java/net/sf/picard/illumina/parser/IlluminaDataProviderFactory.java
index 664f39c..ae0b274 100644
--- a/src/java/net/sf/picard/illumina/parser/IlluminaDataProviderFactory.java
+++ b/src/java/net/sf/picard/illumina/parser/IlluminaDataProviderFactory.java
@@ -62,11 +62,15 @@ public class IlluminaDataProviderFactory {
/** For types found in Qseq, we prefer the NON-Qseq file formats first. However, if we end up using Qseqs then we use Qseqs for EVERY type it provides,
* see determineFormats
*/
- DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.BaseCalls, makeList(SupportedIlluminaFormat.Bcl, SupportedIlluminaFormat.Qseq));
- DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.QualityScores, makeList(SupportedIlluminaFormat.Bcl, SupportedIlluminaFormat.Qseq));
- DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.PF, makeList(SupportedIlluminaFormat.Filter, SupportedIlluminaFormat.Qseq));
- DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.Position, makeList(SupportedIlluminaFormat.Locs, SupportedIlluminaFormat.Clocs,
- SupportedIlluminaFormat.Pos, SupportedIlluminaFormat.Qseq));
+ DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.BaseCalls, makeList(
+ SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl, SupportedIlluminaFormat.Qseq));
+ DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.QualityScores, makeList(
+ SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl, SupportedIlluminaFormat.Qseq));
+ DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.PF, makeList(
+ SupportedIlluminaFormat.MultiTileFilter, SupportedIlluminaFormat.Filter, SupportedIlluminaFormat.Qseq));
+ DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.Position, makeList(
+ SupportedIlluminaFormat.MultiTileLocs, SupportedIlluminaFormat.Locs, SupportedIlluminaFormat.Clocs,
+ SupportedIlluminaFormat.Pos, SupportedIlluminaFormat.Qseq));
DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.Barcodes, makeList(SupportedIlluminaFormat.Barcode));
DATA_TYPE_TO_PREFERRED_FORMATS.put(IlluminaDataType.RawIntensities, makeList(SupportedIlluminaFormat.Cif));
@@ -81,13 +85,6 @@ public class IlluminaDataProviderFactory {
//rawIntensity directory is assumed to be the parent of basecallDirectory
private final File intensitiesDirectory;
- /** The types of data that will be returned by any IlluminaDataProviders created by this factory.
- *
- * Note: In previous version, data of types not specified might be returned if a data type was specified
- * for data residing in QSeqs (since QSeqs span multiple data types). This is no longer the case, you
- * MUST specify all data types that should be returned.*/
- private final Set<IlluminaDataType> dataTypes;
-
/**
* Whether or not to apply EAMSS filtering if parsing BCLs for the bases and quality scores.
*/
@@ -115,17 +112,24 @@ public class IlluminaDataProviderFactory {
* @param readStructure The read structure to which output clusters will conform. When not using QSeqs, EAMSS masking(see BclParser) is run on individual reads as found in the readStructure, if
* the readStructure specified does not match the readStructure implied by the sequencer's output than the quality scores output may differ than what would be found
* in a run's QSeq files
- * @param dataTypes Which data types to read
+ * @param dataTypesArg Which data types to read
*/
- public IlluminaDataProviderFactory(final File basecallDirectory, final int lane, final ReadStructure readStructure, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy, final IlluminaDataType... dataTypes) {
+ public IlluminaDataProviderFactory(final File basecallDirectory, final int lane, final ReadStructure readStructure,
+ final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
+ final IlluminaDataType... dataTypesArg) {
this.basecallDirectory = basecallDirectory;
this.intensitiesDirectory = basecallDirectory.getParentFile();
this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
this.lane = lane;
- this.dataTypes = Collections.unmodifiableSet(new HashSet<IlluminaDataType>(Arrays.asList(dataTypes)));
+ /* The types of data that will be returned by any IlluminaDataProviders created by this factory.
+
+ Note: In previous version, data of types not specified might be returned if a data type was specified
+ for data residing in QSeqs (since QSeqs span multiple data types). This is no longer the case, you
+ MUST specify all data types that should be returned.*/
+ final Set<IlluminaDataType> dataTypes = Collections.unmodifiableSet(new HashSet<IlluminaDataType>(Arrays.asList(dataTypesArg)));
- if (this.dataTypes.isEmpty()) {
+ if (dataTypes.isEmpty()) {
throw new PicardException("No data types have been specified for basecall output " + basecallDirectory +
", lane " + lane);
}
@@ -133,18 +137,15 @@ public class IlluminaDataProviderFactory {
this.fileUtil = new IlluminaFileUtil(basecallDirectory, lane);
//find what request IlluminaDataTypes we have files for and select the most preferred file format available for that type
- formatToDataTypes = determineFormats(this.dataTypes, fileUtil);
+ formatToDataTypes = determineFormats(dataTypes, fileUtil);
//find if we have any IlluminaDataType with NO available file formats and, if any exist, throw an exception
- final Set<IlluminaDataType> unmatchedDataTypes = findUnmatchedTypes(this.dataTypes, formatToDataTypes);
+ final Set<IlluminaDataType> unmatchedDataTypes = findUnmatchedTypes(dataTypes, formatToDataTypes);
if(unmatchedDataTypes.size() > 0) {
throw new PicardException("Could not find a format with available files for the following data types: " + StringUtil.join(", ", new ArrayList<IlluminaDataType>(unmatchedDataTypes)));
}
- final StringBuilder sb = new StringBuilder(400);
- sb.append("The following file formats will be used by IlluminaDataProvier: ");
- sb.append(StringUtil.join("," + formatToDataTypes.keySet()));
- log.debug(sb.toString());
+ log.debug("The following file formats will be used by IlluminaDataProvider: " + StringUtil.join("," + formatToDataTypes.keySet()));
availableTiles = fileUtil.getActualTiles(new ArrayList<SupportedIlluminaFormat>(formatToDataTypes.keySet()));
if(availableTiles.isEmpty()) {
@@ -203,10 +204,7 @@ public class IlluminaDataProviderFactory {
parsersToDataType.put(makeParser(fmToDt.getKey(), requestedTiles), fmToDt.getValue());
}
- final StringBuilder sb = new StringBuilder(400);
- sb.append("The following parsers will be used by IlluminaDataProvier: ");
- sb.append(StringUtil.join("," + parsersToDataType.keySet()));
- log.debug(sb.toString());
+ log.debug("The following parsers will be used by IlluminaDataProvider: " + StringUtil.join("," + parsersToDataType.keySet()));
return new IlluminaDataProvider(outputMapping, parsersToDataType, basecallDirectory, lane);
}
@@ -307,11 +305,12 @@ public class IlluminaDataProviderFactory {
parser = new BarcodeParser(fileUtil.barcode().getFiles(requestedTiles));
break;
- case Bcl:
+ case Bcl: {
final CycleIlluminaFileMap bclFileMap = fileUtil.bcl().getFiles(requestedTiles, outputMapping.getOutputCycles());
bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
parser = new BclParser(basecallDirectory, lane, bclFileMap, outputMapping, this.applyEamssFiltering, bclQualityEvaluationStrategy);
break;
+ }
case Cif:
final CycleIlluminaFileMap cifFileMap = fileUtil.cif().getFiles(requestedTiles, outputMapping.getOutputCycles());
@@ -342,6 +341,22 @@ public class IlluminaDataProviderFactory {
parser = new QseqParser(lane, readTileMap, outputMapping);
break;
+ case MultiTileFilter:
+ parser = fileUtil.multiTileFilter().makeParser(requestedTiles);
+ break;
+
+ case MultiTileLocs:
+ parser = fileUtil.multiTileLocs().makeParser(requestedTiles);
+ break;
+
+ case MultiTileBcl: {
+ final CycleIlluminaFileMap bclFileMap = fileUtil.multiTileBcl().getFiles(requestedTiles, outputMapping.getOutputCycles());
+ bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
+ parser = new MultiTileBclParser(basecallDirectory, lane, bclFileMap, outputMapping,
+ this.applyEamssFiltering, bclQualityEvaluationStrategy, fileUtil.multiTileBcl().tileIndex);
+ break;
+ }
+
default:
throw new PicardException("Unrecognized data type(" + format + ") found by IlluminaDataProviderFactory!");
}
diff --git a/src/java/net/sf/picard/illumina/parser/IlluminaFileUtil.java b/src/java/net/sf/picard/illumina/parser/IlluminaFileUtil.java
index 9d06e7c..2bbe9a7 100644
--- a/src/java/net/sf/picard/illumina/parser/IlluminaFileUtil.java
+++ b/src/java/net/sf/picard/illumina/parser/IlluminaFileUtil.java
@@ -28,7 +28,7 @@ import net.sf.picard.illumina.parser.readers.TileMetricsOutReader;
import net.sf.picard.io.IoUtil;
import net.sf.samtools.util.StringUtil;
-import java.io.File;
+import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -54,7 +54,10 @@ public class IlluminaFileUtil {
Clocs,
Pos,
Filter,
- Barcode
+ Barcode,
+ MultiTileFilter,
+ MultiTileLocs,
+ MultiTileBcl
}
private final File intensityLaneDir;
@@ -72,6 +75,9 @@ public class IlluminaFileUtil {
private final PerTileFileUtil clocs;
private final PerTileFileUtil filter;
private final PerTileFileUtil barcode;
+ private final MultiTileFilterFileUtil multiTileFilter;
+ private final MultiTileLocsFileUtil multiTileLocs;
+ private final MultiTileBclFileUtil multiTileBcl;
private final File tileMetricsOut;
private final Map<SupportedIlluminaFormat, ParameterizedFileUtil> utils;
@@ -115,6 +121,15 @@ public class IlluminaFileUtil {
barcode = new PerTileFileUtil("_barcode.txt", true, basecallDir);
utils.put(SupportedIlluminaFormat.Barcode, barcode);
+ multiTileFilter = new MultiTileFilterFileUtil(basecallLaneDir);
+ utils.put(SupportedIlluminaFormat.MultiTileFilter, multiTileFilter);
+
+ multiTileLocs = new MultiTileLocsFileUtil(new File(intensityDir, basecallLaneDir.getName()), basecallLaneDir);
+ utils.put(SupportedIlluminaFormat.MultiTileLocs, multiTileLocs);
+
+ multiTileBcl = new MultiTileBclFileUtil(basecallLaneDir);
+ utils.put(SupportedIlluminaFormat.MultiTileBcl, multiTileBcl);
+
tileMetricsOut = new File(interopDir, "TileMetricsOut.bin");
}
@@ -206,19 +221,31 @@ public class IlluminaFileUtil {
return barcode;
}
+ public MultiTileFilterFileUtil multiTileFilter() {
+ return multiTileFilter;
+ }
+
+ public MultiTileLocsFileUtil multiTileLocs() {
+ return multiTileLocs;
+ }
+
+ public MultiTileBclFileUtil multiTileBcl() {
+ return multiTileBcl;
+ }
+
public File tileMetricsOut() {
return tileMetricsOut;
}
- public static String UNPARAMETERIZED_PER_TILE_PATTERN = "s_(\\d+)_(\\d{1,4})";
- public static String UNPARAMETERIZED_QSEQ_PATTERN = "s_(\\d+)_(\\d)_(\\d{4})_qseq\\.txt(\\.gz|\\.bz2)?";
+ public static final String UNPARAMETERIZED_PER_TILE_PATTERN = "s_(\\d+)_(\\d{1,5})";
+ public static final String UNPARAMETERIZED_QSEQ_PATTERN = "s_(\\d+)_(\\d)_(\\d{4})_qseq\\.txt(\\.gz|\\.bz2)?";
private static final Pattern CYCLE_SUBDIRECTORY_PATTERN = Pattern.compile("^C(\\d+)\\.1$");
public static String makeParameterizedLaneAndTileRegex(final int lane) {
if(lane < 0) {
throw new PicardException("Lane (" + lane + ") cannot be negative");
}
- return "s_" + lane + "_(\\d{1,4})";
+ return "s_" + lane + "_(\\d{1,5})";
}
public static String makeParameterizedQseqRegex(final int lane) {
@@ -434,7 +461,6 @@ public class IlluminaFileUtil {
final File laneDir = base;
final File[] tempCycleDirs;
- File firstCycleDir = null;
tempCycleDirs = IoUtil.getFilesMatchingRegexp(laneDir, CYCLE_SUBDIRECTORY_PATTERN);
if (tempCycleDirs == null || tempCycleDirs.length == 0) {
return cycledMap;
@@ -451,7 +477,7 @@ public class IlluminaFileUtil {
}
}
- firstCycleDir = tempCycleDirs[lowestCycleDirIndex];
+ final File firstCycleDir = tempCycleDirs[lowestCycleDirIndex];
Arrays.sort(cycles);
detectedCycles = cycles;
@@ -736,6 +762,170 @@ public class IlluminaFileUtil {
}
}
+ /**
+ * For file types for which there is one file per lane, with fixed record size, and all the tiles in it,
+ * so the s_<lane>.bci file can be used to figure out where each tile starts and ends.
+ */
+ abstract class MultiTileFileUtil<OUTPUT_RECORD extends IlluminaData> extends ParameterizedFileUtil {
+ protected final File bci;
+ protected final TileIndex tileIndex;
+ protected final File dataFile;
+
+ MultiTileFileUtil(final String extension, final File base, final File bciDir) {
+ super(makeLaneRegex(extension), makeLaneRegex(extension, lane), extension, base);
+ bci = new File(bciDir, "s_" + lane + ".bci");
+ if (bci.exists()) {
+ tileIndex = new TileIndex(bci);
+ } else {
+ tileIndex = null;
+ }
+ final File[] filesMatchingRegexp = IoUtil.getFilesMatchingRegexp(base, pattern);
+ if (filesMatchingRegexp == null || filesMatchingRegexp.length == 0) dataFile = null;
+ else if (filesMatchingRegexp.length == 1) dataFile = filesMatchingRegexp[0];
+ else throw new PicardException("More than one filter file found in " + base.getAbsolutePath());
+ }
+
+ @Override
+ public boolean filesAvailable() {
+ return tileIndex != null && dataFile != null && dataFile.exists();
+ }
+
+ @Override
+ public LaneTileEnd fileToLaneTileEnd(final String fileName) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public List<Integer> getTiles() {
+ if (tileIndex == null) return Collections.emptyList();
+ return tileIndex.getTiles();
+ }
+
+ /**
+ * expectedCycles are not checked in this implementation.
+ */
+ @Override
+ public List<String> verify(final List<Integer> expectedTiles, final int[] expectedCycles) {
+ if (tileIndex == null) {
+ return Collections.singletonList("Tile index(" + bci.getAbsolutePath() + ") does not exist!");
+ }
+ return tileIndex.verify(expectedTiles);
+ }
+
+ abstract IlluminaParser<OUTPUT_RECORD> makeParser(List<Integer> requestedTiles);
+ }
+
+ class MultiTileFilterFileUtil extends MultiTileFileUtil<PfData> {
+
+ /**
+ * @param basecallLaneDir location of .filter file and also .bci file
+ */
+ MultiTileFilterFileUtil(final File basecallLaneDir) {
+ super(".filter", basecallLaneDir, basecallLaneDir);
+ }
+
+ @Override
+ IlluminaParser<PfData> makeParser(final List<Integer> requestedTiles) {
+ return new MultiTileFilterParser(tileIndex, requestedTiles, dataFile);
+ }
+ }
+
+ class MultiTileLocsFileUtil extends MultiTileFileUtil<PositionalData> {
+
+ MultiTileLocsFileUtil(final File basecallLaneDir, final File bciDir) {
+ super(".locs", basecallLaneDir, bciDir);
+ }
+
+ @Override
+ IlluminaParser<PositionalData> makeParser(final List<Integer> requestedTiles) {
+ return new MultiTileLocsParser(tileIndex, requestedTiles, dataFile, lane);
+ }
+ }
+
+ /**
+ * NextSeq-style bcl's have all tiles for a cycle in a single file.
+ */
+ class MultiTileBclFileUtil extends ParameterizedFileUtil {
+ final File basecallLaneDir;
+ final File bci;
+ final TileIndex tileIndex;
+ final SortedMap<Integer, File> cycleFileMap = new TreeMap<Integer, File>();
+
+ MultiTileBclFileUtil(final File basecallLaneDir) {
+ // Since these file names do not contain lane number, first two args to ctor are the same.
+ super("^(\\d{4})\\.bcl\\.bgzf$", "^(\\d{4})\\.bcl\\.bgzf$", ".bcl.bgzf", basecallLaneDir);
+ this.basecallLaneDir = basecallLaneDir;
+ bci = new File(basecallLaneDir, "s_" + lane + ".bci");
+ // Do this once rather than when deciding if these files exist and again later.
+ final File[] cycleFiles = IoUtil.getFilesMatchingRegexp(base, pattern);
+ if (cycleFiles != null) {
+ for (final File file : cycleFiles) {
+ final String fileName = file.getName();
+ final String cycleNum = fileName.substring(0, fileName.indexOf('.'));
+ cycleFileMap.put(Integer.valueOf(cycleNum), file);
+ }
+ }
+ if (bci.exists()) {
+ tileIndex = new TileIndex(bci);
+ } else {
+ tileIndex = null;
+ }
+
+ }
+
+ public CycleIlluminaFileMap getFiles(final List<Integer> tiles, final int [] cycles) {
+ // Filter input list of cycles according to which actually exist
+ final ArrayList<Integer> goodCycleList = new ArrayList<Integer>(cycles.length);
+ for (final int cycle : cycles) {
+ if (cycleFileMap.containsKey(cycle)) goodCycleList.add(cycle);
+ }
+ // Ensure cycles are sorted.
+ Collections.sort(goodCycleList);
+ final int[] goodCycles = new int[goodCycleList.size()];
+ for (int i = 0; i < goodCycles.length; ++i) goodCycles[i] = goodCycleList.get(i);
+
+ // Create the map.
+ final CycleIlluminaFileMap cycledMap = new CycleIlluminaFileMap();
+ if (goodCycles.length > 0) {
+ for(final int tile : tiles) {
+ cycledMap.put(tile, new MultiTileBclCycleFilesIterator(basecallLaneDir, lane, tile, goodCycles, extension));
+ }
+ }
+ return cycledMap;
+ }
+
+ @Override
+ public boolean filesAvailable() {
+ return bci.exists() && cycleFileMap.size()> 0;
+ }
+
+ @Override
+ public LaneTileEnd fileToLaneTileEnd(final String fileName) {
+ throw new UnsupportedOperationException();
+ }
+
+
+ @Override
+ public List<Integer> getTiles() {
+ if (tileIndex == null) return Collections.emptyList();
+ return tileIndex.getTiles();
+ }
+
+ @Override
+ public List<String> verify(final List<Integer> expectedTiles, final int[] expectedCycles) {
+ if (tileIndex == null) {
+ return Collections.singletonList("Tile index(" + bci.getAbsolutePath() + ") does not exist!");
+ }
+ final List<String> ret = tileIndex.verify(expectedTiles);
+ for (final int expectedCycle : expectedCycles) {
+ if (!cycleFileMap.containsKey(expectedCycle)) {
+ ret.add(expectedCycle + ".bcl.bgzf not found in " + base);
+ }
+ }
+ return ret;
+ }
+ }
+
/** A support class for return lane tile and end information for a given file */
static class LaneTileEnd {
public final Integer lane;
@@ -763,6 +953,14 @@ public class IlluminaFileUtil {
return "^" + makeParameterizedLaneAndTileRegex(lane) + fileNameEndPattern + "$";
}
+ private static String makeLaneRegex(final String fileNameEndPattern) {
+ return "^s_(\\d+)" + fileNameEndPattern + "$";
+ }
+
+ private static String makeLaneRegex(final String fileNameEndPattern, final int lane) {
+ return "^s_" + lane + fileNameEndPattern + "$";
+ }
+
private static int getCycleFromDir(final File tempCycleDir) {
final char [] name = tempCycleDir.getName().toCharArray();
if(name[0] != 'C') {
diff --git a/src/java/net/sf/picard/illumina/parser/IlluminaIntensityParser.java b/src/java/net/sf/picard/illumina/parser/IlluminaIntensityParser.java
index f61f74a..72cc846 100644
--- a/src/java/net/sf/picard/illumina/parser/IlluminaIntensityParser.java
+++ b/src/java/net/sf/picard/illumina/parser/IlluminaIntensityParser.java
@@ -57,7 +57,7 @@ abstract class IlluminaIntensityParser<T extends IlluminaData> extends PerTilePe
* @return CycleFileParser
*/
@Override
- protected CycleFileParser<T> makeCycleFileParser(final File file, final int cycle) {
+ protected CycleFileParser<T> makeCycleFileParser(final File file, final int cycle, final int tileNumber) {
return new IntensityFileParser(file, cycle);
}
diff --git a/src/java/org/broad/tribble/util/TabixUtils.java b/src/java/net/sf/picard/illumina/parser/MultiTileBclCycleFilesIterator.java
similarity index 53%
copy from src/java/org/broad/tribble/util/TabixUtils.java
copy to src/java/net/sf/picard/illumina/parser/MultiTileBclCycleFilesIterator.java
index c1acad2..15403a0 100644
--- a/src/java/org/broad/tribble/util/TabixUtils.java
+++ b/src/java/net/sf/picard/illumina/parser/MultiTileBclCycleFilesIterator.java
@@ -1,7 +1,7 @@
/*
* The MIT License
*
- * Copyright (c) 2013 The Broad Institute
+ * Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -21,45 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-package org.broad.tribble.util;
+package net.sf.picard.illumina.parser;
-import java.util.HashMap;
+import java.io.File;
/**
- * classes that have anything to do with tabix
+ * Overrides standard file naming scheme for multi-tile BCL files.
*/
-public class TabixUtils {
-
- public static class TPair64 implements Comparable<TPair64> {
- public long u, v;
-
- public TPair64(final long _u, final long _v) {
- u = _u;
- v = _v;
- }
-
- public TPair64(final TPair64 p) {
- u = p.u;
- v = p.v;
- }
-
- public int compareTo(final TPair64 p) {
- return u == p.u ? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0)) ? -1 : 1; // unsigned 64-bit comparison
- }
- }
-
- public static class TIndex {
- public HashMap<Integer, TPair64[]> b; // binning index
- public long[] l; // linear index
+public class MultiTileBclCycleFilesIterator extends CycleFilesIterator {
+ public MultiTileBclCycleFilesIterator(File laneDir, int lane, int tile, int[] cycles, String fileExt) {
+ super(laneDir, lane, tile, cycles, fileExt);
}
-
- public static class TIntv {
- public int tid, beg, end;
- }
-
-
- public static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
- return (u < v) ^ (u < 0) ^ (v < 0);
+ @Override
+ protected File getFileForCycle(final int cycle) {
+ return new File(parentDir, String.format("%04d%s", cycle, fileExt));
}
}
diff --git a/src/java/net/sf/picard/illumina/parser/MultiTileBclParser.java b/src/java/net/sf/picard/illumina/parser/MultiTileBclParser.java
new file mode 100644
index 0000000..d8c71e2
--- /dev/null
+++ b/src/java/net/sf/picard/illumina/parser/MultiTileBclParser.java
@@ -0,0 +1,111 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2014 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.illumina.parser;
+
+import net.sf.picard.PicardException;
+import net.sf.picard.illumina.parser.readers.BclIndexReader;
+import net.sf.picard.illumina.parser.readers.BclQualityEvaluationStrategy;
+import net.sf.picard.illumina.parser.readers.BclReader;
+import net.sf.samtools.util.CloseableIterator;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+/**
+ * Parse .bcl.bgzf files that contain multiple tiles in a single file. This requires an index file that tells
+ * the bgzf virtual file offset of the start of each tile in the block-compressed bcl file.
+ */
+public class MultiTileBclParser extends BclParser {
+ private final TileIndex tileIndex;
+ public MultiTileBclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles,
+ final OutputMapping outputMapping, final boolean applyEamssFilter,
+ final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
+ final TileIndex tileIndex) {
+ super(directory, lane, tilesToCycleFiles, outputMapping, applyEamssFilter, bclQualityEvaluationStrategy);
+ this.tileIndex = tileIndex;
+ super.initialize();
+ }
+
+ /**
+ * Defer initialization until after this class is fully constructed. This is necessary because superclass
+ * ctor calls makeReader, which is overridden below, and the override requires that this.tileIndex is initialized,
+ * and that doesn't happen until after superclass has been constructed.
+ */
+ @Override
+ protected void initialize() {
+ }
+
+ @Override
+ protected CloseableIterator<BclReader.BclValue> makeReader(final File file, final int cycle, final int tileNumber) {
+ final TileIndex.TileIndexRecord tileIndexRecord = tileIndex.findTile(tileNumber);
+
+ final BclIndexReader bclIndexReader = new BclIndexReader(file);
+ if (tileIndex.getNumTiles() != bclIndexReader.getNumTiles()) {
+ throw new PicardException(String.format("%s.getNumTiles(%d) != %s.getNumTiles(%d)",
+ tileIndex.getFile().getAbsolutePath(), tileIndex.getNumTiles(), bclIndexReader.getBciFile().getAbsolutePath(), bclIndexReader.getNumTiles()));
+ }
+
+ final BclReader bclReader = new BclReader(file, bclQualityEvaluationStrategy);
+ bclReader.seek(bclIndexReader.get(tileIndexRecord.zeroBasedTileNumber));
+
+ return new CountLimitedIterator(bclReader, tileIndexRecord.numClustersInTile);
+ }
+
+ /**
+ * An iterator wrapper that stops when it has return a pre-determined number of records even if the underlying
+ * iterator still had more records.
+ */
+ static class CountLimitedIterator implements CloseableIterator<BclReader.BclValue> {
+ private final CloseableIterator<BclReader.BclValue> underlyingIterator;
+ private final int recordLimit;
+ private int numRecordsRead = 0;
+
+ CountLimitedIterator(final CloseableIterator<BclReader.BclValue> underlyingIterator, final int recordLimit) {
+ this.underlyingIterator = underlyingIterator;
+ this.recordLimit = recordLimit;
+ }
+
+ @Override
+ public void close() {
+ underlyingIterator.close();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return numRecordsRead < recordLimit && underlyingIterator.hasNext();
+ }
+
+ @Override
+ public BclReader.BclValue next() {
+ if (!hasNext()) throw new NoSuchElementException();
+ ++numRecordsRead;
+ return underlyingIterator.next();
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+}
diff --git a/src/java/org/broad/tribble/util/TabixUtils.java b/src/java/net/sf/picard/illumina/parser/MultiTileFilterParser.java
similarity index 50%
copy from src/java/org/broad/tribble/util/TabixUtils.java
copy to src/java/net/sf/picard/illumina/parser/MultiTileFilterParser.java
index c1acad2..3927fc5 100644
--- a/src/java/org/broad/tribble/util/TabixUtils.java
+++ b/src/java/net/sf/picard/illumina/parser/MultiTileFilterParser.java
@@ -1,7 +1,7 @@
/*
* The MIT License
*
- * Copyright (c) 2013 The Broad Institute
+ * Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -21,45 +21,38 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-package org.broad.tribble.util;
+package net.sf.picard.illumina.parser;
-import java.util.HashMap;
+import net.sf.picard.illumina.parser.readers.FilterFileReader;
+
+import java.io.File;
+import java.util.Collections;
+import java.util.List;
/**
- * classes that have anything to do with tabix
+ * Read filter file that contains multiple tiles in a single file. A tile index is needed to parse this
+ * file so that {tile number, cluster number} can be converted into absolute record number in file.
*/
-public class TabixUtils {
-
- public static class TPair64 implements Comparable<TPair64> {
- public long u, v;
-
- public TPair64(final long _u, final long _v) {
- u = _u;
- v = _v;
- }
-
- public TPair64(final TPair64 p) {
- u = p.u;
- v = p.v;
- }
-
- public int compareTo(final TPair64 p) {
- return u == p.u ? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0)) ? -1 : 1; // unsigned 64-bit comparison
- }
+public class MultiTileFilterParser extends MultiTileParser<PfData> {
+ private final FilterFileReader reader;
+ public MultiTileFilterParser(final TileIndex tileIndex, final List<Integer> requestedTiles, final File filterFile) {
+ super(tileIndex, requestedTiles, Collections.singleton(IlluminaDataType.PF));
+ reader = new FilterFileReader(filterFile);
}
- public static class TIndex {
- public HashMap<Integer, TPair64[]> b; // binning index
- public long[] l; // linear index
+ @Override
+ PfData readNext() {
+ final boolean nextVal = reader.next();
+ return new PfData() {
+ @Override
+ public boolean isPf() {
+ return nextVal;
+ }
+ };
}
-
- public static class TIntv {
- public int tid, beg, end;
- }
-
-
- public static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
- return (u < v) ^ (u < 0) ^ (v < 0);
+ @Override
+ void skipRecords(final int numToSkip) {
+ reader.skipRecords(numToSkip);
}
}
diff --git a/src/java/net/sf/picard/illumina/parser/MultiTileLocsParser.java b/src/java/net/sf/picard/illumina/parser/MultiTileLocsParser.java
new file mode 100644
index 0000000..ac05fe8
--- /dev/null
+++ b/src/java/net/sf/picard/illumina/parser/MultiTileLocsParser.java
@@ -0,0 +1,77 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2014 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.illumina.parser;
+
+import net.sf.picard.illumina.parser.readers.AbstractIlluminaPositionFileReader;
+import net.sf.picard.illumina.parser.readers.LocsFileReader;
+
+import java.io.File;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Read locs file that contains multiple tiles in a single file. A tile index is needed to parse this
+ * file so that {tile number, cluster number} can be converted into absolute record number in file.
+ */
+public class MultiTileLocsParser extends MultiTileParser<PositionalData> {
+ private final LocsFileReader reader;
+ private final int lane;
+
+ public MultiTileLocsParser(final TileIndex tileIndex, final List<Integer> requestedTiles, final File locsFile, final int lane) {
+ super(tileIndex, requestedTiles, Collections.singleton(IlluminaDataType.Position));
+ final int tileNumber;
+ if (requestedTiles.size() == 1) tileNumber = requestedTiles.get(0);
+ else tileNumber = -1;
+ this.reader = new LocsFileReader(locsFile, lane, tileNumber);
+ this.lane = lane;
+ }
+
+ @Override
+ PositionalData readNext() {
+ final int tile = getTileOfNextCluster();
+ final AbstractIlluminaPositionFileReader.PositionInfo nextVal = reader.next();
+ return new PositionalData() {
+ public int getXCoordinate() {
+ return nextVal.xQseqCoord;
+ }
+
+ public int getYCoordinate() {
+ return nextVal.yQseqCoord;
+ }
+
+ public int getLane() {
+ return lane;
+ }
+
+ public int getTile() {
+ return tile;
+ }
+ };
+ }
+
+ @Override
+ void skipRecords(final int numToSkip) {
+ reader.skipRecords(numToSkip);
+ }
+}
diff --git a/src/java/net/sf/picard/illumina/parser/MultiTileParser.java b/src/java/net/sf/picard/illumina/parser/MultiTileParser.java
new file mode 100644
index 0000000..1621834
--- /dev/null
+++ b/src/java/net/sf/picard/illumina/parser/MultiTileParser.java
@@ -0,0 +1,128 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2014 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.illumina.parser;
+
+import net.sf.picard.PicardException;
+import net.sf.samtools.util.PeekIterator;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+/**
+ * Abstract class for files with fixed-length records for multiple tiles, e.g. .locs and .filter files.
+ * @param <OUTPUT_RECORD> The kind of record to be returned (as opposed to the type of the record stored in the file).
+ */
+public abstract class MultiTileParser<OUTPUT_RECORD extends IlluminaData> implements IlluminaParser<OUTPUT_RECORD> {
+ private final TileIndex tileIndex;
+ private final Iterator<TileIndex.TileIndexRecord> tileIndexIterator;
+ private final PeekIterator<Integer> requestedTilesIterator;
+ private final Set<IlluminaDataType> supportedTypes;
+ private int nextRecordIndex = 0;
+ private int nextClusterInTile;
+ private TileIndex.TileIndexRecord currentTile = null;
+
+ /**
+ * @param tileIndex Enables conversion from tile number to record number in this file.
+ * @param requestedTiles Iterate over these tile numbers, which must be in ascending order.
+ * @param supportedTypes The data types(s) that are provided by this file type, used to decide what file types to read.
+ */
+ public MultiTileParser(final TileIndex tileIndex,
+ final List<Integer> requestedTiles,
+ final Set<IlluminaDataType> supportedTypes) {
+ this.tileIndex = tileIndex;
+ this.tileIndexIterator = tileIndex.iterator();
+ this.requestedTilesIterator = new PeekIterator<Integer>(requestedTiles.iterator());
+ this.supportedTypes = supportedTypes;
+ }
+
+ @Override
+ public void seekToTile(final int oneBasedTileNumber) {
+ while (tileIndexIterator.hasNext()) {
+ final TileIndex.TileIndexRecord next = tileIndexIterator.next();
+ if (next.tile > oneBasedTileNumber) {
+ throw new PicardException(
+ String.format("Cannot seek backwards: next tile %d > tile sought %d", next.tile, oneBasedTileNumber));
+ } else if (next.tile == oneBasedTileNumber) {
+ currentTile = next;
+ break;
+ }
+ }
+ if (nextRecordIndex > currentTile.indexOfFirstClusterInTile) {
+ throw new PicardException(
+ String.format("Seem to be in wrong position %d > %d", nextRecordIndex, currentTile.indexOfFirstClusterInTile));
+ }
+ skipRecords(currentTile.indexOfFirstClusterInTile - nextRecordIndex);
+ nextRecordIndex = currentTile.indexOfFirstClusterInTile;
+ nextClusterInTile = 0;
+ }
+
+ @Override
+ public OUTPUT_RECORD next() {
+ if (!hasNext()) throw new NoSuchElementException();
+ OUTPUT_RECORD ret = readNext();
+ ++nextClusterInTile;
+ ++nextRecordIndex;
+ return ret;
+ }
+
+ @Override
+ public boolean hasNext() {
+ // Skip over any empty tiles
+ while ((currentTile == null || nextClusterInTile >= currentTile.numClustersInTile) && requestedTilesIterator.hasNext()) {
+ seekToTile(requestedTilesIterator.next());
+ }
+ return currentTile != null && nextClusterInTile < currentTile.numClustersInTile;
+ }
+
+ @Override
+ public int getTileOfNextCluster() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ if (currentTile != null && nextClusterInTile < currentTile.numClustersInTile) return currentTile.tile;
+ else return requestedTilesIterator.peek();
+ }
+
+ @Override
+ public void verifyData(final List<Integer> tiles, final int[] cycles) {
+ final List<String> tileErrors = tileIndex.verify(tiles);
+ if (!tileErrors.isEmpty()) throw new PicardException(tileErrors.get(0));
+ //No need to validate cycles until such time as this class is used for cycle-oriented data types
+ }
+
+ @Override
+ public Set<IlluminaDataType> supportedTypes() {
+ return supportedTypes;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ abstract OUTPUT_RECORD readNext();
+ abstract void skipRecords(int numToSkip);
+}
diff --git a/src/java/net/sf/picard/illumina/parser/PerTileParser.java b/src/java/net/sf/picard/illumina/parser/PerTileParser.java
index 4e47953..94e614b 100644
--- a/src/java/net/sf/picard/illumina/parser/PerTileParser.java
+++ b/src/java/net/sf/picard/illumina/parser/PerTileParser.java
@@ -116,7 +116,11 @@ public abstract class PerTileParser<ILLUMINA_DATA extends IlluminaData> implemen
}
public boolean hasNext() {
- return nextTile != null || (currentIterator != null && currentIterator.hasNext());
+ // Skip over empty tiles
+ while ((currentIterator == null || !currentIterator.hasNext()) && nextTile != null) {
+ advanceTile();
+ }
+ return currentIterator != null && currentIterator.hasNext();
}
public void close() {
diff --git a/src/java/net/sf/picard/illumina/parser/PerTilePerCycleParser.java b/src/java/net/sf/picard/illumina/parser/PerTilePerCycleParser.java
index 333ece2..df75751 100644
--- a/src/java/net/sf/picard/illumina/parser/PerTilePerCycleParser.java
+++ b/src/java/net/sf/picard/illumina/parser/PerTilePerCycleParser.java
@@ -87,9 +87,10 @@ abstract class PerTilePerCycleParser<ILLUMINA_DATA extends IlluminaData> impleme
* For a given cycle, return a CycleFileParser.
* @param file The file to parse
* @param cycle The cycle that file represents
+ * @param tileNumber For files that contain multiple tiles, need to specify tile of interest.
* @return A CycleFileParser that will populate the correct position in the IlluminaData object with that cycle's data.
*/
- protected abstract CycleFileParser<ILLUMINA_DATA> makeCycleFileParser(final File file, final int cycle);
+ protected abstract CycleFileParser<ILLUMINA_DATA> makeCycleFileParser(final File file, final int cycle, final int tileNumber);
/**
* CycleFileParsers iterate through the clusters of a file and populate an IlluminaData object with a single cycle's
@@ -118,7 +119,7 @@ abstract class PerTilePerCycleParser<ILLUMINA_DATA extends IlluminaData> impleme
int totalCycles = 0;
while(filesIterator.hasNext()) {
final int nextCycle = filesIterator.getNextCycle();
- cycleFileParsers.add(makeCycleFileParser(filesIterator.next(), nextCycle));
+ cycleFileParsers.add(makeCycleFileParser(filesIterator.next(), nextCycle, tileNumber));
++totalCycles;
}
diff --git a/src/java/net/sf/picard/illumina/parser/TileIndex.java b/src/java/net/sf/picard/illumina/parser/TileIndex.java
new file mode 100644
index 0000000..0750b7c
--- /dev/null
+++ b/src/java/net/sf/picard/illumina/parser/TileIndex.java
@@ -0,0 +1,153 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2014 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.illumina.parser;
+
+import net.sf.picard.PicardException;
+import net.sf.samtools.Defaults;
+import net.sf.samtools.util.CloserUtil;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.*;
+
+/**
+ * Load a file containing 8-byte records like this:
+ * tile number: 4-byte int
+ * number of clusters in tile: 4-byte int
+ * Number of records to read is determined by reaching EOF.
+ */
+class TileIndex implements Iterable<TileIndex.TileIndexRecord> {
+ private final File tileIndexFile;
+ private final List<TileIndexRecord> tiles = new ArrayList<TileIndexRecord>();
+
+ TileIndex(final File tileIndexFile) {
+ try {
+ this.tileIndexFile = tileIndexFile;
+ final InputStream is = new BufferedInputStream(new FileInputStream(tileIndexFile), Defaults.BUFFER_SIZE);
+ final ByteBuffer buf = ByteBuffer.allocate(8);
+ buf.order(ByteOrder.LITTLE_ENDIAN);
+ int absoluteRecordIndex = 0;
+ int numTiles = 0;
+ while (readTileIndexRecord(buf.array(), buf.capacity(), is)) {
+ buf.rewind();
+ buf.limit(buf.capacity());
+ final int tile = buf.getInt();
+ // Note: not handling unsigned ints > 2^31, but could if one of these exceptions is thrown.
+ if (tile < 0) throw new PicardException("Tile number too large in " + tileIndexFile.getAbsolutePath());
+ final int numClusters = buf.getInt();
+ if (numClusters < 0) throw new PicardException("Cluster size too large in " + tileIndexFile.getAbsolutePath());
+ tiles.add(new TileIndexRecord(tile, numClusters, absoluteRecordIndex, numTiles++));
+ absoluteRecordIndex += numClusters;
+ }
+ CloserUtil.close(is);
+ } catch (final IOException e) {
+ throw new PicardException("Problem reading " + tileIndexFile.getAbsolutePath(), e);
+ }
+ }
+
+ public File getFile() {
+ return tileIndexFile;
+ }
+
+ public int getNumTiles() {
+ return tiles.size();
+ }
+
+ private boolean readTileIndexRecord(final byte[] buf, final int numBytes, final InputStream is) throws IOException {
+ int totalBytesRead = 0;
+ while (totalBytesRead < numBytes) {
+ final int bytesRead = is.read(buf, totalBytesRead, numBytes - totalBytesRead);
+ if (bytesRead == -1) {
+ if (totalBytesRead != 0) {
+ throw new PicardException(tileIndexFile.getAbsolutePath() + " has incomplete last block");
+ } else return false;
+ }
+ totalBytesRead += bytesRead;
+ }
+ return true;
+ }
+
+ public List<Integer> getTiles() {
+ final List<Integer> ret = new ArrayList<Integer>(tiles.size());
+ for (final TileIndexRecord rec : tiles) ret.add(rec.tile);
+ return ret;
+ }
+
+ public List<String> verify(final List<Integer> expectedTiles) {
+ final Set<Integer> tileSet = new HashSet<Integer>(tiles.size());
+ for (final TileIndexRecord rec : tiles) tileSet.add(rec.tile);
+ final List<String> failures = new LinkedList<String>();
+ for (final int expectedTile : expectedTiles) {
+ if (!tileSet.contains(expectedTile)) {
+ failures.add("Tile " + expectedTile + " not found in " + tileIndexFile.getAbsolutePath());
+ }
+ }
+ return failures;
+ }
+
+ @Override
+ public Iterator<TileIndexRecord> iterator() {
+ return tiles.iterator();
+ }
+
+ /**
+ * @throws java.util.NoSuchElementException if tile is not found
+ */
+ public TileIndexRecord findTile(final int tileNumber) {
+ for (final TileIndexRecord rec : this) {
+ if (rec.tile == tileNumber) return rec;
+ if (rec.tile > tileNumber) {
+ break;
+ }
+ }
+ throw new NoSuchElementException(String.format("Tile %d not found in %s", tileNumber, tileIndexFile));
+ }
+
+ static class TileIndexRecord {
+ /**
+ * Number of the tile, e.g. 11101. These don't necessarily start at 0, and there may be gaps.
+ */
+ final int tile;
+
+ final int numClustersInTile;
+
+ /**
+ * I.e. the sum of numClustersInTile for all tiles preceding this one.
+ */
+ final int indexOfFirstClusterInTile;
+
+ /**
+ * A contiguous numbering of tiles starting at 0.
+ */
+ final int zeroBasedTileNumber;
+
+ private TileIndexRecord(final int tile, final int numClustersInTile, final int indexOfFirstClusterInTile, final int zeroBasedTileNumber) {
+ this.tile = tile;
+ this.numClustersInTile = numClustersInTile;
+ this.indexOfFirstClusterInTile = indexOfFirstClusterInTile;
+ this.zeroBasedTileNumber = zeroBasedTileNumber;
+ }
+ }
+}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReader.java b/src/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReader.java
index 40e92fd..0473a40 100644
--- a/src/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReader.java
+++ b/src/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReader.java
@@ -27,7 +27,6 @@ import net.sf.picard.PicardException;
import net.sf.samtools.util.CloseableIterator;
import java.io.File;
-import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -46,6 +45,12 @@ import java.util.regex.Pattern;
public abstract class AbstractIlluminaPositionFileReader implements CloseableIterator<AbstractIlluminaPositionFileReader.PositionInfo> {
public static final float MAX_POS = 9999999.99f;
+ /**
+ * At least one NextSeq run produced a small negative value for y coordinate (-5), so allow small
+ * negative values and see what happens.
+ */
+ public static final float MIN_POS = -10.0f;
+
public class PositionInfo {
/** The x-position as it occurs in the file being read */
public final float xPos;
@@ -66,12 +71,11 @@ public abstract class AbstractIlluminaPositionFileReader implements CloseableIte
public final int yQseqCoord;
public PositionInfo(final float x, final float y, final int lane, final int tile) {
- if(x < 0 || y < 0) {
- throw new IllegalArgumentException("X and Y position values must be positive (x,y)=(" + x + ", y) lane(" + lane + ") tile(" + tile + ")");
- }
+ if(x < MIN_POS || y < MIN_POS || x > MAX_POS || y > MAX_POS) {
- if(x > MAX_POS || y > MAX_POS) {
- throw new IllegalArgumentException("X and Y position values must be less than " + MAX_POS + " (x,y)=(" + x + ", y) lane(" + lane + ") tile(" + tile + ")");
+ throw new IllegalArgumentException(
+ String.format("Cluster location not in the range %f..%f. x: %f; y: %f; lane: %d; tile: %d",
+ MIN_POS, MAX_POS, x, y, lane, tile));
}
this.xPos = x;
@@ -100,7 +104,7 @@ public abstract class AbstractIlluminaPositionFileReader implements CloseableIte
}
//Note: Perhaps use the IlluminaFileUtil to do this part
- private static Pattern FileNamePattern = Pattern.compile("^s_(\\d+)_(\\d+)(_pos\\.txt|\\.locs|\\.clocs|_pos\\.txt.gz|_pos\\.txt.bz2)$");
+ private static final Pattern FileNamePattern = Pattern.compile("^s_(\\d+)_(\\d+)(_pos\\.txt|\\.locs|\\.clocs|_pos\\.txt.gz|_pos\\.txt.bz2)$");
private final File file;
private final int lane;
@@ -109,11 +113,23 @@ public abstract class AbstractIlluminaPositionFileReader implements CloseableIte
public AbstractIlluminaPositionFileReader(final File file) {
this.file = file;
- int [] laneAndTile = fileNameToLaneAndTile(file.getName());
+ final int [] laneAndTile = fileNameToLaneAndTile(file.getName());
lane = laneAndTile[0];
tile = laneAndTile[1];
}
+ /**
+ * Use this ctor if lane and tile are not discernible from file name.
+ * @param file
+ * @param lane
+ * @param tile
+ */
+ public AbstractIlluminaPositionFileReader(final File file, final int lane, final int tile) {
+ this.file = file;
+ this.lane = lane;
+ this.tile = tile;
+ }
+
public int getTile() {
return tile;
}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/BclIndexReader.java b/src/java/net/sf/picard/illumina/parser/readers/BclIndexReader.java
new file mode 100644
index 0000000..51171e2
--- /dev/null
+++ b/src/java/net/sf/picard/illumina/parser/readers/BclIndexReader.java
@@ -0,0 +1,75 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2014 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.illumina.parser.readers;
+
+import net.sf.picard.PicardException;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+
+/**
+ * Annoyingly, there are two different files with extension .bci in NextSeq output. This reader handles
+ * the file that contains virtual file pointers into a .bcl.bgzf file. After the header, there is a 64-bit record
+ * per tile.
+ */
+public class BclIndexReader {
+ private static final int BCI_HEADER_SIZE = 8;
+ private static final int BCI_VERSION = 0;
+
+ private final BinaryFileIterator<Long> bciIterator;
+ private final int numTiles;
+ private final File bciFile;
+ private int nextRecordNumber = 0;
+
+ public BclIndexReader(final File bclFile) {
+ bciFile = new File(bclFile.getAbsolutePath() + ".bci");
+ bciIterator = MMapBackedIteratorFactory.getLongIterator(BCI_HEADER_SIZE, bciFile);
+ final ByteBuffer headerBytes = bciIterator.getHeaderBytes();
+ final int actualVersion = headerBytes.getInt();
+ if (actualVersion != BCI_VERSION) {
+ throw new PicardException(String.format("Unexpected version number %d in %s", actualVersion, bciFile.getAbsolutePath()));
+ }
+ numTiles = headerBytes.getInt();
+ }
+
+ public int getNumTiles() {
+ return numTiles;
+ }
+
+ public long get(final int recordNumber) {
+ if (recordNumber < nextRecordNumber) {
+ throw new IllegalArgumentException("Can only read forward");
+ }
+ if (recordNumber > nextRecordNumber) {
+ bciIterator.skipElements(recordNumber - nextRecordNumber);
+ nextRecordNumber = recordNumber;
+ }
+ ++nextRecordNumber;
+ return bciIterator.getElement();
+ }
+
+ public File getBciFile() {
+ return bciFile;
+ }
+}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/BclReader.java b/src/java/net/sf/picard/illumina/parser/readers/BclReader.java
index 7c87318..058a387 100644
--- a/src/java/net/sf/picard/illumina/parser/readers/BclReader.java
+++ b/src/java/net/sf/picard/illumina/parser/readers/BclReader.java
@@ -26,6 +26,8 @@ package net.sf.picard.illumina.parser.readers;
import net.sf.picard.PicardException;
import net.sf.picard.util.UnsignedTypeUtil;
import net.sf.samtools.Defaults;
+import net.sf.samtools.util.BlockCompressedInputStream;
+import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.CloserUtil;
import java.io.*;
@@ -62,7 +64,7 @@ import java.util.zip.GZIPInputStream;
*
* So the output base/quality will be a (T/34)
*/
-public class BclReader implements Iterator<BclReader.BclValue> {
+public class BclReader implements CloseableIterator<BclReader.BclValue> {
/** The size of the opening header (consisting solely of numClusters*/
private static final int HEADER_SIZE = 4;
@@ -99,13 +101,18 @@ public class BclReader implements Iterator<BclReader.BclValue> {
filePath = file.getAbsolutePath();
final boolean isGzip = filePath.endsWith(".gz");
+ final boolean isBgzf = filePath.endsWith(".bgzf");
// Open up a buffered stream to read from the file and optionally wrap it in a gzip stream
// if necessary
final BufferedInputStream bufferedInputStream;
try {
- bufferedInputStream = new BufferedInputStream(new FileInputStream(file), Defaults.BUFFER_SIZE);
- inputStream = isGzip ? new GZIPInputStream(bufferedInputStream) : bufferedInputStream;
+ if (isBgzf) {
+ inputStream = new BlockCompressedInputStream(file);
+ } else {
+ bufferedInputStream = new BufferedInputStream(new FileInputStream(file), Defaults.BUFFER_SIZE);
+ inputStream = isGzip ? new GZIPInputStream(bufferedInputStream) : bufferedInputStream;
+ }
} catch (FileNotFoundException fnfe) {
throw new PicardException("File not found: (" + filePath + ")", fnfe);
} catch (IOException ioe) {
@@ -118,7 +125,7 @@ public class BclReader implements Iterator<BclReader.BclValue> {
if (file.length() == 0) {
throw new PicardException("Zero length BCL file detected: " + filePath);
}
- if (!isGzip) {
+ if (!isGzip && !isBgzf) {
// The file structure checks rely on the file size (as information is stored as individual bytes) but
// we can't reliably know the number of uncompressed bytes in the file ahead of time for gzip files. Only
// run the main check
@@ -145,7 +152,7 @@ public class BclReader implements Iterator<BclReader.BclValue> {
return UnsignedTypeUtil.uIntToLong(headerBuf.getInt());
}
- private void assertProperFileStructure(final File file) {
+ protected void assertProperFileStructure(final File file) {
final long elementsInFile = file.length() - HEADER_SIZE;
if (numClusters != elementsInFile) {
throw new PicardException("Expected " + numClusters + " in file but found " + elementsInFile);
@@ -212,6 +219,19 @@ public class BclReader implements Iterator<BclReader.BclValue> {
CloserUtil.close(inputStream);
}
+ public void seek(final long virtualFilePointer) {
+ if (!(inputStream instanceof BlockCompressedInputStream)) {
+ throw new UnsupportedOperationException("Seeking only allowed on bgzf");
+ } else {
+ final BlockCompressedInputStream bcis = (BlockCompressedInputStream)inputStream;
+ try {
+ ((BlockCompressedInputStream) inputStream).seek(virtualFilePointer);
+ } catch (IOException e) {
+ throw new PicardException("Problem seeking in " + filePath, e);
+ }
+ }
+ }
+
public void remove() {
throw new UnsupportedOperationException();
}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/FilterFileReader.java b/src/java/net/sf/picard/illumina/parser/readers/FilterFileReader.java
index adfe291..ecd493c 100644
--- a/src/java/net/sf/picard/illumina/parser/readers/FilterFileReader.java
+++ b/src/java/net/sf/picard/illumina/parser/readers/FilterFileReader.java
@@ -104,6 +104,10 @@ public class FilterFileReader implements Iterator<Boolean> {
}
}
+ public void skipRecords(final int numToSkip) {
+ bbIterator.skipElements(numToSkip);
+ }
+
public void remove() {
throw new UnsupportedOperationException();
}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/LocsFileReader.java b/src/java/net/sf/picard/illumina/parser/readers/LocsFileReader.java
index 1773d4f..0497f2d 100644
--- a/src/java/net/sf/picard/illumina/parser/readers/LocsFileReader.java
+++ b/src/java/net/sf/picard/illumina/parser/readers/LocsFileReader.java
@@ -56,7 +56,7 @@ public class LocsFileReader extends AbstractIlluminaPositionFileReader {
private BinaryFileIterator<Float> bbIterator;
/** Total clusters in the file as read in the file header */
- private final long numClusters;
+ private long numClusters;
/** The index of the next cluster to be returned */
private int nextCluster;
@@ -64,6 +64,16 @@ public class LocsFileReader extends AbstractIlluminaPositionFileReader {
public LocsFileReader(final File file) {
super(file);
+ initialize(file);
+ }
+
+ public LocsFileReader(final File file, final int lane, final int tile) {
+ super(file, lane, tile);
+
+ initialize(file);
+ }
+
+ private void initialize(final File file) {
bbIterator = MMapBackedIteratorFactory.getFloatIterator(HEADER_SIZE, file);
final ByteBuffer headerBuf = bbIterator.getHeaderBytes();
@@ -102,4 +112,8 @@ public class LocsFileReader extends AbstractIlluminaPositionFileReader {
public void close() {
bbIterator = null;
}
+
+ public void skipRecords(final int numToSkip) {
+ bbIterator.skipElements(numToSkip * 2);
+ }
}
diff --git a/src/java/net/sf/picard/illumina/parser/readers/MMapBackedIteratorFactory.java b/src/java/net/sf/picard/illumina/parser/readers/MMapBackedIteratorFactory.java
index 9562e8f..74fee58 100644
--- a/src/java/net/sf/picard/illumina/parser/readers/MMapBackedIteratorFactory.java
+++ b/src/java/net/sf/picard/illumina/parser/readers/MMapBackedIteratorFactory.java
@@ -51,6 +51,7 @@ public class MMapBackedIteratorFactory {
private static int BYTE_SIZE = 1;
private static int INT_SIZE = 4;
private static int FLOAT_SIZE = 4;
+ private static int LONG_SIZE = 8;
public static BinaryFileIterator<Integer> getIntegerIterator(final int headerSize, final File binaryFile) {
checkFactoryVars(headerSize, binaryFile);
@@ -76,6 +77,14 @@ public class MMapBackedIteratorFactory {
return new FloatMMapIterator(header, binaryFile, buf);
}
+ public static BinaryFileIterator<Long> getLongIterator(final int headerSize, final File binaryFile) {
+ checkFactoryVars(headerSize, binaryFile);
+ final ByteBuffer buf = getBuffer(binaryFile);
+ final byte [] header = getHeader(buf, headerSize);
+
+ return new LongMMapIterator(header, binaryFile, buf);
+ }
+
public static BinaryFileIterator<ByteBuffer> getByteBufferIterator(final int headerSize, final int elementSize, final File binaryFile) {
checkFactoryVars(headerSize, binaryFile);
final ByteBuffer buf = getBuffer(binaryFile);
@@ -180,6 +189,17 @@ public class MMapBackedIteratorFactory {
}
}
+ private static class LongMMapIterator extends MMapBackedIterator<Long> {
+ public LongMMapIterator(final byte[] header, final File file, final ByteBuffer buf) {
+ super(header, file, LONG_SIZE, buf);
+ }
+
+ @Override
+ protected Long getElement() {
+ return buffer.getLong();
+ }
+ }
+
//TODO: Add test
//TODO: Make a note that if you want to multithread over this then you have to copy the contents
private static class ByteBufferMMapIterator extends MMapBackedIterator<ByteBuffer> {
diff --git a/src/java/net/sf/picard/io/IoUtil.java b/src/java/net/sf/picard/io/IoUtil.java
index 89fd1c7..8479327 100644
--- a/src/java/net/sf/picard/io/IoUtil.java
+++ b/src/java/net/sf/picard/io/IoUtil.java
@@ -44,6 +44,9 @@ import org.apache.tools.bzip2.CBZip2OutputStream;
* @author Tim Fennell
*/
public class IoUtil extends net.sf.samtools.util.IOUtil {
+ /** Possible extensions for VCF files and related formats. */
+ public static final String[] VCF_EXTENSIONS = new String[] {".vcf", ".vcf.gz", ".bcf"};
+
/**
* Checks that a file is non-null, exists, is not a directory and is readable. If any
* condition is false then a runtime exception is thrown.
@@ -642,17 +645,20 @@ public class IoUtil extends net.sf.samtools.util.IOUtil {
final List<File> output = new ArrayList<File>();
stack.addAll(inputs);
- final Set<String> exts = new HashSet<String>();
- Collections.addAll(exts, extensions);
-
while (!stack.empty()) {
final File f = stack.pop();
- final String ext = IoUtil.fileSuffix(f);
+ final String name = f.getName();
+ boolean matched = false;
- if (exts.contains(ext)) {
- output.add(f);
+ for (final String ext : extensions) {
+ if (!matched && name.endsWith(ext)) {
+ output.add(f);
+ matched = true;
+ }
}
- else {
+
+ // If the file didn't match a given extension, treat it as a list of files
+ if (!matched) {
IoUtil.assertFileIsReadable(f);
for (final String s : IoUtil.readLines(f)) {
@@ -661,6 +667,9 @@ public class IoUtil extends net.sf.samtools.util.IOUtil {
}
}
+ // Preserve input order (since we're using a stack above) for things that care
+ Collections.reverse(output);
+
return output;
}
}
diff --git a/src/java/net/sf/picard/sam/MergeBamAlignment.java b/src/java/net/sf/picard/sam/MergeBamAlignment.java
index 5cb647f..bd21347 100644
--- a/src/java/net/sf/picard/sam/MergeBamAlignment.java
+++ b/src/java/net/sf/picard/sam/MergeBamAlignment.java
@@ -67,7 +67,7 @@ public class MergeBamAlignment extends CommandLineProgram {
doc="SAM or BAM file(s) with alignment data from the first read of a pair.",
mutex={"ALIGNED_BAM"},
optional=true)
- public List<File> READ1_ALIGNED_BAM;
+ public List<File> READ1_ALIGNED_BAM ;
@Option(shortName="R2_ALIGNED",
doc="SAM or BAM file(s) with alignment data from the second read of a pair.",
@@ -103,8 +103,8 @@ public class MergeBamAlignment extends CommandLineProgram {
optional=true)
public String PROGRAM_GROUP_NAME;
- @Option(doc="Whether this is a paired-end run.",
- shortName="PE")
+ @Deprecated
+ @Option(doc="This argument is ignored and will be removed.", shortName="PE")
public Boolean PAIRED_RUN;
@Option(doc="The expected jump size (required if this is a jumping library). Deprecated. Use EXPECTED_ORIENTATIONS instead",
@@ -224,8 +224,8 @@ public class MergeBamAlignment extends CommandLineProgram {
}
final SamAlignmentMerger merger = new SamAlignmentMerger(UNMAPPED_BAM, OUTPUT,
- REFERENCE_SEQUENCE, prod, CLIP_ADAPTERS, IS_BISULFITE_SEQUENCE, PAIRED_RUN,
- ALIGNED_READS_ONLY, ALIGNED_BAM, MAX_INSERTIONS_OR_DELETIONS,
+ REFERENCE_SEQUENCE, prod, CLIP_ADAPTERS, IS_BISULFITE_SEQUENCE,
+ ALIGNED_READS_ONLY, ALIGNED_BAM, MAX_INSERTIONS_OR_DELETIONS,
ATTRIBUTES_TO_RETAIN, READ1_TRIM, READ2_TRIM,
READ1_ALIGNED_BAM, READ2_ALIGNED_BAM, EXPECTED_ORIENTATIONS, SORT_ORDER,
PRIMARY_ALIGNMENT_STRATEGY.newInstance());
diff --git a/src/java/net/sf/picard/sam/RevertSam.java b/src/java/net/sf/picard/sam/RevertSam.java
index 1d46ae2..563fec1 100644
--- a/src/java/net/sf/picard/sam/RevertSam.java
+++ b/src/java/net/sf/picard/sam/RevertSam.java
@@ -30,13 +30,19 @@ import net.sf.picard.cmdline.Option;
import net.sf.picard.cmdline.StandardOptionDefinitions;
import net.sf.picard.cmdline.Usage;
import net.sf.picard.io.IoUtil;
+import net.sf.picard.util.FormatUtil;
import net.sf.picard.util.Log;
+import net.sf.picard.util.PeekableIterator;
import net.sf.picard.util.ProgressLogger;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader.SortOrder;
+import net.sf.samtools.util.SortingCollection;
import java.io.File;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
import java.util.ArrayList;
+import java.util.LinkedList;
import java.util.List;
/**
@@ -77,6 +83,16 @@ public class RevertSam extends CommandLineProgram {
add("SA"); // Supplementary alignment metadata
}};
+ @Option(doc="WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " +
+ "a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " +
+ "mates, duplicated records, records with mismatches in length of bases and qualities. This option can " +
+ "only be enabled if the output sort order is queryname and will always cause sorting to occur.")
+ public boolean SANITIZE = false;
+
+ @Option(doc="If SANITIZE=true and higher than MAX_DISCARD_FRACTION reads are discarded due to sanitization then " +
+ "the program will exit with an Exception instead of exiting cleanly. Output BAM will still be valid.")
+ public double MAX_DISCARD_FRACTION = 0.01;
+
@Option(doc="The sample alias to use in the reverted output file. This will override the existing " +
"sample alias in the file and is used only if all the read groups in the input file have the " +
"same sample alias ", shortName=StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional=true)
@@ -94,10 +110,22 @@ public class RevertSam extends CommandLineProgram {
System.exit(new RevertSam().instanceMain(args));
}
+ /**
+ * Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort.
+ */
+ @Override protected String[] customCommandLineValidation() {
+ if (SANITIZE && SORT_ORDER != SortOrder.queryname) {
+ return new String[] {"SORT_ORDER must be queryname when sanitization is enabled with SANITIZE=true."};
+ }
+
+ return null;
+ }
+
protected int doWork() {
IoUtil.assertFileIsReadable(INPUT);
IoUtil.assertFileIsWritable(OUTPUT);
+ final boolean sanitizing = SANITIZE;
final SAMFileReader in = new SAMFileReader(INPUT, true);
final SAMFileHeader inHeader = in.getFileHeader();
@@ -125,9 +153,10 @@ public class RevertSam extends CommandLineProgram {
}
}
-
+ ////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
- final boolean presorted = inHeader.getSortOrder() == SORT_ORDER;
+ ////////////////////////////////////////////////////////////////////////////
+ final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
@@ -146,62 +175,169 @@ public class RevertSam extends CommandLineProgram {
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, presorted, OUTPUT);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Build a sorting collection to use if we are sanitizing
+ ////////////////////////////////////////////////////////////////////////////
+ final SortingCollection<SAMRecord> sorter;
+ if (sanitizing) {
+ sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
+ }
+ else {
+ sorter = null;
+ }
+
final ProgressLogger progress = new ProgressLogger(log, 1000000, "Reverted");
for (final SAMRecord rec : in) {
+ // Weed out non-primary and supplementary reads as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary()) continue;
- if (RESTORE_ORIGINAL_QUALITIES) {
- final byte[] oq = rec.getOriginalBaseQualities();
- if (oq != null) {
- rec.setBaseQualities(oq);
- rec.setOriginalBaseQualities(null);
- }
- }
- if (REMOVE_DUPLICATE_INFORMATION) {
- rec.setDuplicateReadFlag(false);
- }
-
- if (REMOVE_ALIGNMENT_INFORMATION) {
- if (rec.getReadNegativeStrandFlag()) {
- SAMRecordUtil.reverseComplement(rec);
- rec.setReadNegativeStrandFlag(false);
- }
+ // Actually do the reverting of the remaining records
+ revertSamRecord(rec);
- // Remove all alignment based information about the read itself
- rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
- rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
- rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
- rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
+ if (sanitizing) sorter.add(rec);
+ else out.addAlignment(rec);
+ progress.record(rec);
+ }
- if (!rec.getReadUnmappedFlag()) {
- rec.setInferredInsertSize(0);
- rec.setNotPrimaryAlignmentFlag(false);
- rec.setProperPairFlag(false);
- rec.setReadUnmappedFlag(true);
+ ////////////////////////////////////////////////////////////////////////////
+ // Now if we're sanitizing, clean up the records and write them to the output
+ ////////////////////////////////////////////////////////////////////////////
+ if (!sanitizing) {
+ out.close();
+ }
+ else {
+ long total = 0, discarded = 0;
+ final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(sorter.iterator());
+ final ProgressLogger sanitizerProgress = new ProgressLogger(log, 1000000, "Sanitized");
+
+ readNameLoop: while (iterator.hasNext()) {
+ final List<SAMRecord> recs = fetchByReadName(iterator);
+ total += recs.size();
+
+ // Check that all the reads have bases and qualities of the same length
+ for (final SAMRecord rec : recs) {
+ if (rec.getReadBases().length != rec.getBaseQualities().length) {
+ log.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
+ discarded += recs.size();
+ continue readNameLoop;
+ }
+ }
+ // Check that if the first read is marked as unpaired that there is in fact only one read
+ if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
+ log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
+ discarded += recs.size();
+ continue readNameLoop;
}
- // Then remove any mate flags and info related to alignment
- if (rec.getReadPairedFlag()) {
- rec.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
- rec.setMateNegativeStrandFlag(false);
- rec.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
- rec.setMateUnmappedFlag(true);
+ // Check that if we have paired reads there is exactly one first of pair and one second of pair
+ if (recs.get(0).getReadPairedFlag()) {
+ int firsts=0, seconds=0, unpaired=0;
+ for (final SAMRecord rec : recs) {
+ if (!rec.getReadPairedFlag()) ++unpaired;
+ if (rec.getFirstOfPairFlag()) ++firsts;
+ if (rec.getSecondOfPairFlag()) ++seconds;
+ }
+
+ if (unpaired > 0 || firsts != 1 || seconds != 1) {
+ log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information is corrupt.");
+ discarded += recs.size();
+ continue readNameLoop;
+ }
}
- // And then remove any tags that are calculated from the alignment
- for (final String tag : ATTRIBUTE_TO_CLEAR) {
- rec.setAttribute(tag, null);
+ // If we've made it this far spit the records into the output!
+ for (final SAMRecord rec : recs) {
+ out.addAlignment(rec);
+ sanitizerProgress.record(rec);
}
+ }
+
+ out.close();
+ final double discardRate = discarded / (double) total;
+ final NumberFormat fmt = new DecimalFormat("0.000%");
+ log.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
+
+ if (discarded / (double) total > MAX_DISCARD_FRACTION) {
+ throw new PicardException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
+ }
- out.addAlignment(rec);
- progress.record(rec);
+ return 0;
+ }
+
+ /**
+ * Generates a list by consuming from the iterator in order starting with the first available
+ * read and continuing while subsequent reads share the same read name. If there are no reads
+ * remaining returns an empty list.
+ */
+ private List<SAMRecord> fetchByReadName(final PeekableIterator<SAMRecord> iterator) {
+ final List<SAMRecord> out = new LinkedList<SAMRecord>();
+
+ if (iterator.hasNext()) {
+ final SAMRecord first = iterator.next();
+ out.add(first);
+
+ while (iterator.hasNext() && iterator.peek().getReadName().equals(first.getReadName())) {
+ out.add(iterator.next());
+ }
}
- out.close();
+ return out;
+ }
- return 0;
+ /**
+ * Takes an individual SAMRecord and applies the set of changes/reversions to it that
+ * have been requested by program level options.
+ */
+ public void revertSamRecord(final SAMRecord rec) {
+ if (RESTORE_ORIGINAL_QUALITIES) {
+ final byte[] oq = rec.getOriginalBaseQualities();
+ if (oq != null) {
+ rec.setBaseQualities(oq);
+ rec.setOriginalBaseQualities(null);
+ }
+ }
+
+ if (REMOVE_DUPLICATE_INFORMATION) {
+ rec.setDuplicateReadFlag(false);
+ }
+
+ if (REMOVE_ALIGNMENT_INFORMATION) {
+ if (rec.getReadNegativeStrandFlag()) {
+ SAMRecordUtil.reverseComplement(rec);
+ rec.setReadNegativeStrandFlag(false);
+ }
+
+ // Remove all alignment based information about the read itself
+ rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
+ rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+ rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
+ rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
+
+ if (!rec.getReadUnmappedFlag()) {
+ rec.setInferredInsertSize(0);
+ rec.setNotPrimaryAlignmentFlag(false);
+ rec.setProperPairFlag(false);
+ rec.setReadUnmappedFlag(true);
+
+ }
+
+ // Then remove any mate flags and info related to alignment
+ if (rec.getReadPairedFlag()) {
+ rec.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+ rec.setMateNegativeStrandFlag(false);
+ rec.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
+ rec.setMateUnmappedFlag(true);
+ }
+
+ // And then remove any tags that are calculated from the alignment
+ for (final String tag : ATTRIBUTE_TO_CLEAR) {
+ rec.setAttribute(tag, null);
+ }
+ }
}
+
}
diff --git a/src/java/net/sf/picard/sam/SamAlignmentMerger.java b/src/java/net/sf/picard/sam/SamAlignmentMerger.java
index ded5782..0d3b66d 100644
--- a/src/java/net/sf/picard/sam/SamAlignmentMerger.java
+++ b/src/java/net/sf/picard/sam/SamAlignmentMerger.java
@@ -41,45 +41,43 @@ public class SamAlignmentMerger extends AbstractAlignmentMerger {
* @param referenceFasta The reference sequence for the map files. Required.
* @param programRecord Program record for taget file SAMRecords created.
* @param clipAdapters Whether adapters marked in unmapped BAM file should be marked as
- * soft clipped in the merged bam. Required.
+* soft clipped in the merged bam. Required.
* @param bisulfiteSequence Whether the reads are bisulfite sequence (used when calculating the
- * NM and UQ tags). Required.
- * @param pairedRun Whether the run is a paired-end run. Required.
+* NM and UQ tags). Required.
* @param alignedReadsOnly Whether to output only those reads that have alignment data
* @param alignedSamFile The SAM file(s) with alignment information. Optional. If this is
- * not provided, then read1AlignedSamFile and read2AlignedSamFile must be.
+* not provided, then read1AlignedSamFile and read2AlignedSamFile must be.
* @param maxGaps The maximum number of insertions or deletions permitted in an
- * alignment. Alignments with more than this many gaps will be ignored.
- * -1 means to allow any number of gaps.
+* alignment. Alignments with more than this many gaps will be ignored.
+* -1 means to allow any number of gaps.
* @param attributesToRetain private attributes from the alignment record that should be
- * included when merging. This overrides the exclusion of
- * attributes whose tags start with the reserved characters
- * of X, Y, and Z
+* included when merging. This overrides the exclusion of
+* attributes whose tags start with the reserved characters
+* of X, Y, and Z
* @param read1BasesTrimmed The number of bases trimmed from start of read 1 prior to alignment. Optional.
* @param read2BasesTrimmed The number of bases trimmed from start of read 2 prior to alignment. Optional.
* @param read1AlignedSamFile The alignment records for read1. Used when the two ends of a read are
- * aligned separately. This is optional, but must be specified if
- * alignedSamFile is not.
+* aligned separately. This is optional, but must be specified if
+* alignedSamFile is not.
* @param read2AlignedSamFile The alignment records for read1. Used when the two ends of a read are
- * aligned separately. This is optional, but must be specified if
- * alignedSamFile is not.
+* aligned separately. This is optional, but must be specified if
+* alignedSamFile is not.
* @param expectedOrientations A List of SamPairUtil.PairOrientations that are expected for
- * aligned pairs. Used to determine the properPair flag.
+* aligned pairs. Used to determine the properPair flag.
* @param sortOrder The order in which the merged records should be output. If null,
- * output will be coordinate-sorted
+* output will be coordinate-sorted
* @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair,
- * in which none are primary, or more than one is marked primary
- * by the aligner.
+* in which none are primary, or more than one is marked primary
*/
- public SamAlignmentMerger (final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
- final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
- final boolean pairedRun, final boolean alignedReadsOnly,
- final List<File> alignedSamFile, final int maxGaps, final List<String> attributesToRetain,
- final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
- final List<File> read1AlignedSamFile, final List<File> read2AlignedSamFile,
- final List<SamPairUtil.PairOrientation> expectedOrientations,
- final SortOrder sortOrder,
- final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy) {
+ public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
+ final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
+ final boolean alignedReadsOnly,
+ final List<File> alignedSamFile, final int maxGaps, final List<String> attributesToRetain,
+ final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
+ final List<File> read1AlignedSamFile, final List<File> read2AlignedSamFile,
+ final List<SamPairUtil.PairOrientation> expectedOrientations,
+ final SortOrder sortOrder,
+ final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy) {
super(unmappedBamFile, targetBamFile, referenceFasta, clipAdapters, bisulfiteSequence,
alignedReadsOnly, programRecord, attributesToRetain, read1BasesTrimmed,
diff --git a/src/java/net/sf/picard/sam/SamToFastq.java b/src/java/net/sf/picard/sam/SamToFastq.java
index e56d98d..c81dd1a 100755
--- a/src/java/net/sf/picard/sam/SamToFastq.java
+++ b/src/java/net/sf/picard/sam/SamToFastq.java
@@ -35,6 +35,7 @@ import net.sf.picard.io.IoUtil;
import net.sf.picard.util.Log;
import net.sf.picard.util.ProgressLogger;
import net.sf.samtools.*;
+import net.sf.samtools.util.Lazy;
import net.sf.samtools.util.SequenceUtil;
import net.sf.samtools.util.StringUtil;
@@ -53,67 +54,75 @@ import java.util.*;
public class SamToFastq extends CommandLineProgram {
@Usage
public String USAGE = getStandardUsagePreamble() + "Extracts read sequences and qualities from the input SAM/BAM file and writes them into " +
- "the output file in Sanger fastq format. In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, " +
- "the read's sequence from input SAM file will be reverse-complemented prior to writing it to fastq in order restore correctly " +
- "the original read sequence as it was generated by the sequencer.";
+ "the output file in Sanger fastq format. In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, " +
+ "the read's sequence from input SAM file will be reverse-complemented prior to writing it to fastq in order to restore correctly " +
+ "the original read sequence as it was generated by the sequencer.";
- @Option(doc="Input SAM/BAM file to extract reads from", shortName=StandardOptionDefinitions.INPUT_SHORT_NAME)
- public File INPUT ;
+ @Option(doc = "Input SAM/BAM file to extract reads from", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
+ public File INPUT;
- @Option(shortName="F", doc="Output fastq file (single-end fastq or, if paired, first end of the pair fastq).", mutex={"OUTPUT_PER_RG"})
- public File FASTQ ;
+ @Option(shortName = "F", doc = "Output fastq file (single-end fastq or, if paired, first end of the pair fastq).",
+ mutex = {"OUTPUT_PER_RG"})
+ public File FASTQ;
- @Option(shortName="F2", doc="Output fastq file (if paired, second end of the pair fastq).", optional=true, mutex={"OUTPUT_PER_RG"})
- public File SECOND_END_FASTQ ;
+ @Option(shortName = "F2", doc = "Output fastq file (if paired, second end of the pair fastq).", optional = true,
+ mutex = {"OUTPUT_PER_RG"})
+ public File SECOND_END_FASTQ;
- @Option(shortName="OPRG", doc="Output a fastq file per read group (two fastq files per read group if the group is paired).", optional=true, mutex={"FASTQ", "SECOND_END_FASTQ"})
- public boolean OUTPUT_PER_RG ;
+ @Option(shortName = "FU", doc = "Output fastq file for unpaired reads; may only be provided in paired-fastq mode", optional = true, mutex = {"OUTPUT_PER_RG"})
+ public File UNPAIRED_FASTQ;
- @Option(shortName="ODIR", doc="Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true.", optional=true)
+ @Option(shortName = "OPRG", doc = "Output a fastq file per read group (two fastq files per read group if the group is paired).",
+ optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"})
+ public boolean OUTPUT_PER_RG;
+
+ @Option(shortName = "ODIR", doc = "Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true.",
+ optional = true)
public File OUTPUT_DIR;
- @Option(shortName="RC", doc="Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq", optional=true)
+ @Option(shortName = "RC", doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq",
+ optional = true)
public boolean RE_REVERSE = true;
- @Option(shortName="INTER", doc="Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from")
+ @Option(shortName = "INTER", doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from")
public boolean INTERLEAVE = false;
- @Option(shortName="NON_PF", doc="Include non-PF reads from the SAM file into the output FASTQ files.")
+ @Option(shortName = "NON_PF", doc = "Include non-PF reads from the SAM file into the output FASTQ files.")
public boolean INCLUDE_NON_PF_READS = false;
- @Option(shortName="CLIP_ATTR", doc="The attribute that stores the position at which " +
- "the SAM record should be clipped", optional=true)
+ @Option(shortName = "CLIP_ATTR", doc = "The attribute that stores the position at which " +
+ "the SAM record should be clipped", optional = true)
public String CLIPPING_ATTRIBUTE;
- @Option(shortName="CLIP_ACT", doc="The action that should be taken with clipped reads: " +
+ @Option(shortName = "CLIP_ACT", doc = "The action that should be taken with clipped reads: " +
"'X' means the reads and qualities should be trimmed at the clipped position; " +
"'N' means the bases should be changed to Ns in the clipped region; and any " +
"integer means that the base qualities should be set to that value in the " +
- "clipped region.", optional=true)
+ "clipped region.", optional = true)
public String CLIPPING_ACTION;
- @Option(shortName="R1_TRIM", doc="The number of bases to trim from the beginning of read 1.")
+ @Option(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.")
public int READ1_TRIM = 0;
- @Option(shortName="R1_MAX_BASES", doc="The maximum number of bases to write from read 1 after trimming. " +
+ @Option(shortName = "R1_MAX_BASES", doc = "The maximum number of bases to write from read 1 after trimming. " +
"If there are fewer than this many bases left after trimming, all will be written. If this " +
- "value is null then all bases left after trimming will be written.", optional=true)
+ "value is null then all bases left after trimming will be written.", optional = true)
public Integer READ1_MAX_BASES_TO_WRITE;
- @Option(shortName="R2_TRIM", doc="The number of bases to trim from the beginning of read 2.")
+ @Option(shortName = "R2_TRIM", doc = "The number of bases to trim from the beginning of read 2.")
public int READ2_TRIM = 0;
- @Option(shortName="R2_MAX_BASES", doc="The maximum number of bases to write from read 2 after trimming. " +
+ @Option(shortName = "R2_MAX_BASES", doc = "The maximum number of bases to write from read 2 after trimming. " +
"If there are fewer than this many bases left after trimming, all will be written. If this " +
- "value is null then all bases left after trimming will be written.", optional=true)
+ "value is null then all bases left after trimming will be written.", optional = true)
public Integer READ2_MAX_BASES_TO_WRITE;
- @Option(doc="If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq " +
- "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.")
- public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS=false;
+ @Option(doc = "If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq " +
+ "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.")
+ public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS = false;
private final Log log = Log.getInstance(SamToFastq.class);
-
+
public static void main(final String[] argv) {
System.exit(new SamToFastq().instanceMain(argv));
}
@@ -121,10 +130,10 @@ public class SamToFastq extends CommandLineProgram {
protected int doWork() {
IoUtil.assertFileIsReadable(INPUT);
final SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(INPUT));
- final Map<String,SAMRecord> firstSeenMates = new HashMap<String,SAMRecord>();
+ final Map<String, SAMRecord> firstSeenMates = new HashMap<String, SAMRecord>();
final FastqWriterFactory factory = new FastqWriterFactory();
factory.setCreateMd5(CREATE_MD5_FILE);
- final Map<SAMReadGroupRecord, List<FastqWriter>> writers = getWriters(reader.getFileHeader().getReadGroups(), factory);
+ final Map<SAMReadGroupRecord, FastqWriters> writers = generateWriters(reader.getFileHeader().getReadGroups(), factory);
final ProgressLogger progress = new ProgressLogger(log);
for (final SAMRecord currentRecord : reader) {
@@ -135,8 +144,7 @@ public class SamToFastq extends CommandLineProgram {
if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS)
continue;
- final List<FastqWriter> fq = writers.get(currentRecord.getReadGroup());
-
+ final FastqWriters fq = writers.get(currentRecord.getReadGroup());
if (currentRecord.getReadPairedFlag()) {
final String currentReadName = currentRecord.getReadName();
final SAMRecord firstRecord = firstSeenMates.remove(currentReadName);
@@ -145,27 +153,19 @@ public class SamToFastq extends CommandLineProgram {
} else {
assertPairedMates(firstRecord, currentRecord);
- if (fq.size() == 1 && !INTERLEAVE) {
- if (OUTPUT_PER_RG) {
- fq.add(factory.newWriter(makeReadGroupFile(currentRecord.getReadGroup(), "_2")));
- } else {
- throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified.");
- }
- }
-
- final FastqWriter firstPairWriter = fq.get(0);
- final FastqWriter secondPairWriter = INTERLEAVE ? firstPairWriter : fq.get(1);
-
final SAMRecord read1 =
- currentRecord.getFirstOfPairFlag() ? currentRecord : firstRecord;
+ currentRecord.getFirstOfPairFlag() ? currentRecord : firstRecord;
final SAMRecord read2 =
- currentRecord.getFirstOfPairFlag() ? firstRecord : currentRecord;
- writeRecord(read1, 1, firstPairWriter, READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
- writeRecord(read2, 2, secondPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE);
-
+ currentRecord.getFirstOfPairFlag() ? firstRecord : currentRecord;
+ writeRecord(read1, 1, fq.getFirstOfPair(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
+ final FastqWriter secondOfPairWriter = fq.getSecondOfPair();
+ if (secondOfPairWriter == null) {
+ throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified.");
+ }
+ writeRecord(read2, 2, secondOfPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE);
}
} else {
- writeRecord(currentRecord, null, fq.get(0), READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
+ writeRecord(currentRecord, null, fq.getUnpaired(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
}
progress.record(currentRecord);
@@ -174,14 +174,8 @@ public class SamToFastq extends CommandLineProgram {
reader.close();
// Close all the fastq writers being careful to close each one only once!
- final IdentityHashMap<FastqWriter,FastqWriter> seen = new IdentityHashMap<FastqWriter, FastqWriter>();
- for (final List<FastqWriter> listOfWriters : writers.values()) {
- for (final FastqWriter w : listOfWriters) {
- if (!seen.containsKey(w)) {
- w.close();
- seen.put(w,w);
- }
- }
+ for (final FastqWriters writerMapping : new HashSet<FastqWriters>(writers.values())) {
+ writerMapping.closeAll();
}
if (firstSeenMates.size() > 0) {
@@ -193,40 +187,54 @@ public class SamToFastq extends CommandLineProgram {
}
/**
- * Gets the pair of writers for a given read group or, if we are not sorting by read group,
- * just returns the single pair of writers.
+ * Generates the writers for the given read groups or, if we are not emitting per-read-group, just returns the single set of writers.
*/
- private Map<SAMReadGroupRecord, List<FastqWriter>> getWriters(final List<SAMReadGroupRecord> samReadGroupRecords,
+ private Map<SAMReadGroupRecord, FastqWriters> generateWriters(final List<SAMReadGroupRecord> samReadGroupRecords,
final FastqWriterFactory factory) {
- final Map<SAMReadGroupRecord, List<FastqWriter>> writerMap = new HashMap<SAMReadGroupRecord, List<FastqWriter>>();
+ final Map<SAMReadGroupRecord, FastqWriters> writerMap = new HashMap<SAMReadGroupRecord, FastqWriters>();
+ final FastqWriters fastqWriters;
if (!OUTPUT_PER_RG) {
- // If we're not outputting by read group, there's only
- // one writer for each end.
- final List<FastqWriter> fqw = new ArrayList<FastqWriter>();
-
IoUtil.assertFileIsWritable(FASTQ);
IoUtil.openFileForWriting(FASTQ);
- fqw.add(factory.newWriter(FASTQ));
+ final FastqWriter firstOfPairWriter = factory.newWriter(FASTQ);
- if (SECOND_END_FASTQ != null) {
+ final FastqWriter secondOfPairWriter;
+ if (INTERLEAVE) {
+ secondOfPairWriter = firstOfPairWriter;
+ } else if (SECOND_END_FASTQ != null) {
IoUtil.assertFileIsWritable(SECOND_END_FASTQ);
IoUtil.openFileForWriting(SECOND_END_FASTQ);
- fqw.add(factory.newWriter(SECOND_END_FASTQ));
+ secondOfPairWriter = factory.newWriter(SECOND_END_FASTQ);
+ } else {
+ secondOfPairWriter = null;
}
- // Store in map with null key, in case there are reads without read group.
- writerMap.put(null, fqw);
- // Also store for every read group in header.
+
+ /** Prepare the writer that will accept unpaired reads. If we're emitting a single fastq - and assuming single-ended reads -
+ * then this is simply that one fastq writer. Otherwise, if we're doing paired-end, we emit to a third new writer, since
+ * the other two fastqs are accepting only paired end reads. */
+ final FastqWriter unpairedWriter = UNPAIRED_FASTQ == null ? firstOfPairWriter : factory.newWriter(UNPAIRED_FASTQ);
+ fastqWriters = new FastqWriters(firstOfPairWriter, secondOfPairWriter, unpairedWriter);
+
+ // For all read groups we may find in the bam, register this single set of writers for them.
+ writerMap.put(null, fastqWriters);
for (final SAMReadGroupRecord rg : samReadGroupRecords) {
- writerMap.put(rg, fqw);
+ writerMap.put(rg, fastqWriters);
}
} else {
+ // When we're creating a fastq-group per readgroup, by convention we do not emit a special fastq for unpaired reads.
for (final SAMReadGroupRecord rg : samReadGroupRecords) {
- final List<FastqWriter> fqw = new ArrayList<FastqWriter>();
-
- fqw.add(factory.newWriter(makeReadGroupFile(rg, "_1")));
- writerMap.put(rg, fqw);
+ final FastqWriter firstOfPairWriter = factory.newWriter(makeReadGroupFile(rg, "_1"));
+ // Create this writer on-the-fly; if we find no second-of-pair reads, don't bother making a writer (or delegating,
+ // if we're interleaving).
+ final Lazy<FastqWriter> lazySecondOfPairWriter = new Lazy<FastqWriter>(new Lazy.LazyInitializer<FastqWriter>() {
+ @Override
+ public FastqWriter make() {
+ return INTERLEAVE ? firstOfPairWriter : factory.newWriter(makeReadGroupFile(rg, "_2"));
+ }
+ });
+ writerMap.put(rg, new FastqWriters(firstOfPairWriter, lazySecondOfPairWriter, firstOfPairWriter));
}
}
return writerMap;
@@ -237,25 +245,25 @@ public class SamToFastq extends CommandLineProgram {
String fileName = readGroup.getPlatformUnit();
if (fileName == null) fileName = readGroup.getReadGroupId();
fileName = IoUtil.makeFileNameSafe(fileName);
- if(preExtSuffix != null) fileName += preExtSuffix;
+ if (preExtSuffix != null) fileName += preExtSuffix;
fileName += ".fastq";
final File result = (OUTPUT_DIR != null)
- ? new File(OUTPUT_DIR, fileName)
- : new File(fileName);
+ ? new File(OUTPUT_DIR, fileName)
+ : new File(fileName);
IoUtil.assertFileIsWritable(result);
return result;
}
void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer,
final int basesToTrim, final Integer maxBasesToWrite) {
- final String seqHeader = mateNumber==null ? read.getReadName() : read.getReadName() + "/"+ mateNumber;
+ final String seqHeader = mateNumber == null ? read.getReadName() : read.getReadName() + "/" + mateNumber;
String readString = read.getReadString();
String baseQualities = read.getBaseQualityString();
// If we're clipping, do the right thing to the bases or qualities
if (CLIPPING_ATTRIBUTE != null) {
- final Integer clipPoint = (Integer)read.getAttribute(CLIPPING_ATTRIBUTE);
+ final Integer clipPoint = (Integer) read.getAttribute(CLIPPING_ATTRIBUTE);
if (clipPoint != null) {
if (CLIPPING_ACTION.equalsIgnoreCase("X")) {
readString = clip(readString, clipPoint, null,
@@ -263,20 +271,18 @@ public class SamToFastq extends CommandLineProgram {
baseQualities = clip(baseQualities, clipPoint, null,
!read.getReadNegativeStrandFlag());
- }
- else if (CLIPPING_ACTION.equalsIgnoreCase("N")) {
+ } else if (CLIPPING_ACTION.equalsIgnoreCase("N")) {
readString = clip(readString, clipPoint, 'N',
!read.getReadNegativeStrandFlag());
- }
- else {
+ } else {
final char newQual = SAMUtils.phredToFastq(
- new byte[] { (byte)Integer.parseInt(CLIPPING_ACTION)}).charAt(0);
+ new byte[]{(byte) Integer.parseInt(CLIPPING_ACTION)}).charAt(0);
baseQualities = clip(baseQualities, clipPoint, newQual,
!read.getReadNegativeStrandFlag());
}
}
}
- if ( RE_REVERSE && read.getReadNegativeStrandFlag() ) {
+ if (RE_REVERSE && read.getReadNegativeStrandFlag()) {
readString = SequenceUtil.reverseComplement(readString);
baseQualities = StringUtil.reverseString(baseQualities);
}
@@ -298,24 +304,23 @@ public class SamToFastq extends CommandLineProgram {
* Utility method to handle the changes required to the base/quality strings by the clipping
* parameters.
*
- * @param src The string to clip
- * @param point The 1-based position of the first clipped base in the read
- * @param replacement If non-null, the character to replace in the clipped positions
- * in the string (a quality score or 'N'). If null, just trim src
- * @param posStrand Whether the read is on the positive strand
+ * @param src The string to clip
+ * @param point The 1-based position of the first clipped base in the read
+ * @param replacement If non-null, the character to replace in the clipped positions
+ * in the string (a quality score or 'N'). If null, just trim src
+ * @param posStrand Whether the read is on the positive strand
* @return String The clipped read or qualities
*/
private String clip(final String src, final int point, final Character replacement, final boolean posStrand) {
final int len = src.length();
- String result = posStrand ? src.substring(0, point-1) : src.substring(len-point+1);
+ String result = posStrand ? src.substring(0, point - 1) : src.substring(len - point + 1);
if (replacement != null) {
if (posStrand) {
- for (int i = point; i <= len; i++ ) {
+ for (int i = point; i <= len; i++) {
result += replacement;
}
- }
- else {
- for (int i = 0; i <= len-point; i++) {
+ } else {
+ for (int i = 0; i <= len - point; i++) {
result = replacement + result;
}
}
@@ -324,54 +329,110 @@ public class SamToFastq extends CommandLineProgram {
}
private void assertPairedMates(final SAMRecord record1, final SAMRecord record2) {
- if (! (record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() ||
- record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag() ) ) {
+ if (!(record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() ||
+ record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag())) {
throw new PicardException("Illegal mate state: " + record1.getReadName());
}
}
/**
- * Put any custom command-line validation in an override of this method.
- * clp is initialized at this point and can be used to print usage and access argv.
+ * Put any custom command-line validation in an override of this method.
+ * clp is initialized at this point and can be used to print usage and access argv.
* Any options set by command-line parser can be validated.
- * @return null if command line is valid. If command line is invalid, returns an array of error
- * messages to be written to the appropriate place.
- */
+ *
+ * @return null if command line is valid. If command line is invalid, returns an array of error
+ * messages to be written to the appropriate place.
+ */
protected String[] customCommandLineValidation() {
- if (INTERLEAVE && SECOND_END_FASTQ != null) {
- return new String[] {
- "Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ"
- };
- }
+ if (INTERLEAVE && SECOND_END_FASTQ != null) {
+ return new String[]{
+ "Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ"
+ };
+ }
+
+ if (UNPAIRED_FASTQ != null && SECOND_END_FASTQ == null) {
+ return new String[]{
+ "UNPAIRED_FASTQ may only be set when also emitting read1 and read2 fastqs (so SECOND_END_FASTQ must also be set)."
+ };
+ }
if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) ||
- (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) {
- return new String[] {
- "Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set." };
+ (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) {
+ return new String[]{
+ "Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set."};
}
if (CLIPPING_ACTION != null) {
if (CLIPPING_ACTION.equals("N") || CLIPPING_ACTION.equals("X")) {
// Do nothing, this is fine
- }
- else {
+ } else {
try {
Integer.parseInt(CLIPPING_ACTION);
- }
- catch (NumberFormatException nfe) {
- return new String[] {"CLIPPING ACTION must be one of: N, X, or an integer"};
+ } catch (NumberFormatException nfe) {
+ return new String[]{"CLIPPING ACTION must be one of: N, X, or an integer"};
}
}
}
if ((OUTPUT_PER_RG && OUTPUT_DIR == null) || ((!OUTPUT_PER_RG) && OUTPUT_DIR != null)) {
- return new String[] {
+ return new String[]{
"If OUTPUT_PER_RG is true, then OUTPUT_DIR should be set. " +
- "If " };
+ "If OUTPUT_PER_RG is false, then OUTPUT_DIR should not be set."};
}
return null;
}
+
+ /**
+ * A collection of {@link net.sf.picard.fastq.FastqWriter}s for particular types of reads.
+ * <p/>
+ * Allows for lazy construction of the second-of-pair writer, since when we are in the "output per read group mode", we only wish to
+ * generate a second-of-pair fastq if we encounter a second-of-pair read.
+ */
+ static class FastqWriters {
+ private final FastqWriter firstOfPair, unpaired;
+ private final Lazy<FastqWriter> secondOfPair;
+
+ /** Constructor if the consumer wishes for the second-of-pair writer to be built on-the-fly. */
+ private FastqWriters(final FastqWriter firstOfPair, final Lazy<FastqWriter> secondOfPair, final FastqWriter unpaired) {
+ this.firstOfPair = firstOfPair;
+ this.unpaired = unpaired;
+ this.secondOfPair = secondOfPair;
+ }
+
+ /** Simple constructor; all writers are pre-initialized. */
+ private FastqWriters(final FastqWriter firstOfPair, final FastqWriter secondOfPair, final FastqWriter unpaired) {
+ this(firstOfPair, new Lazy<FastqWriter>(new Lazy.LazyInitializer<FastqWriter>() {
+ @Override
+ public FastqWriter make() {
+ return secondOfPair;
+ }
+ }), unpaired);
+ }
+
+ public FastqWriter getFirstOfPair() {
+ return firstOfPair;
+ }
+
+ public FastqWriter getSecondOfPair() {
+ return secondOfPair.get();
+ }
+
+ public FastqWriter getUnpaired() {
+ return unpaired;
+ }
+
+ public void closeAll() {
+ final Set<FastqWriter> fastqWriters = new HashSet<FastqWriter>();
+ fastqWriters.add(firstOfPair);
+ fastqWriters.add(unpaired);
+ // Make sure this is a no-op if the second writer was never fetched.
+ if (secondOfPair.isInitialized()) fastqWriters.add(secondOfPair.get());
+ for (final FastqWriter fastqWriter : fastqWriters) {
+ fastqWriter.close();
+ }
+ }
+ }
}
diff --git a/src/java/net/sf/picard/util/IntervalListTools.java b/src/java/net/sf/picard/util/IntervalListTools.java
index c355d5b..9befcf9 100644
--- a/src/java/net/sf/picard/util/IntervalListTools.java
+++ b/src/java/net/sf/picard/util/IntervalListTools.java
@@ -27,7 +27,7 @@ public class IntervalListTools extends CommandLineProgram {
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME,
doc="One or more interval lists. If multiple interval lists are provided the output is the" +
- "result of merging the inputs.")
+ "result of merging the inputs.", minElements = 1)
public List<File> INPUT;
@Option(doc="The output interval list file to write (if SCATTER_COUNT is 1) or the directory into which " +
diff --git a/src/java/org/broad/tribble/util/TabixUtils.java b/src/java/net/sf/picard/util/ListMap.java
old mode 100644
new mode 100755
similarity index 54%
copy from src/java/org/broad/tribble/util/TabixUtils.java
copy to src/java/net/sf/picard/util/ListMap.java
index c1acad2..7d0d54e
--- a/src/java/org/broad/tribble/util/TabixUtils.java
+++ b/src/java/net/sf/picard/util/ListMap.java
@@ -1,7 +1,7 @@
/*
* The MIT License
*
- * Copyright (c) 2013 The Broad Institute
+ * Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -21,45 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-package org.broad.tribble.util;
+package net.sf.picard.util;
+
+import java.util.List;
import java.util.HashMap;
+import java.util.ArrayList;
/**
- * classes that have anything to do with tabix
+ * A Map class that holds a list of entries under each key instead of a single entry, and
+ * provides utility methods for adding an entry under a key.
+ *
+ * @author Tim Fennell
*/
-public class TabixUtils {
-
- public static class TPair64 implements Comparable<TPair64> {
- public long u, v;
-
- public TPair64(final long _u, final long _v) {
- u = _u;
- v = _v;
- }
-
- public TPair64(final TPair64 p) {
- u = p.u;
- v = p.v;
- }
-
- public int compareTo(final TPair64 p) {
- return u == p.u ? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0)) ? -1 : 1; // unsigned 64-bit comparison
+public class ListMap<K,V> extends HashMap<K, List<V>> {
+ /** Adds a single value to the list stored under a key. */
+ public void add(K key, V value) {
+ List<V> values = get(key);
+ if (values == null) {
+ values = new ArrayList<V>();
+ put(key, values);
}
- }
-
- public static class TIndex {
- public HashMap<Integer, TPair64[]> b; // binning index
- public long[] l; // linear index
- }
-
-
- public static class TIntv {
- public int tid, beg, end;
- }
-
- public static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
- return (u < v) ^ (u < 0) ^ (v < 0);
+ values.add(value);
}
}
diff --git a/src/java/net/sf/picard/util/SamLocusIterator.java b/src/java/net/sf/picard/util/SamLocusIterator.java
index 94e280a..baf5571 100644
--- a/src/java/net/sf/picard/util/SamLocusIterator.java
+++ b/src/java/net/sf/picard/util/SamLocusIterator.java
@@ -111,6 +111,8 @@ public class SamLocusIterator implements Iterable<SamLocusIterator.LocusInfo>, C
public String toString() {
return referenceSequence.getSequenceName() + ":" + position;
}
+
+ public int getSequenceLength(){return referenceSequence.getSequenceLength();}
}
diff --git a/src/java/net/sf/picard/vcf/MakeSitesOnlyVcf.java b/src/java/net/sf/picard/vcf/MakeSitesOnlyVcf.java
index 289f506..c6d3398 100644
--- a/src/java/net/sf/picard/vcf/MakeSitesOnlyVcf.java
+++ b/src/java/net/sf/picard/vcf/MakeSitesOnlyVcf.java
@@ -4,6 +4,7 @@ import net.sf.picard.PicardException;
import net.sf.picard.cmdline.CommandLineProgram;
import net.sf.picard.cmdline.Option;
import net.sf.picard.cmdline.StandardOptionDefinitions;
+import net.sf.picard.cmdline.Usage;
import net.sf.picard.io.IoUtil;
import net.sf.picard.util.Log;
import net.sf.picard.util.ProgressLogger;
@@ -29,13 +30,19 @@ import java.util.Set;
* @author Tim Fennell
*/
public class MakeSitesOnlyVcf extends CommandLineProgram {
+ @Usage
+ public final String usage = "Reads a VCF/VCF.gz/BCF and removes all genotype information from it while retaining " +
+ "all site level information, including annotations based on genotypes (e.g. AN, AF). Output can be " +
+ "any supported variant format including .vcf, .vcf.gz or .bcf.";
+
@Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input VCF or BCF")
public File INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output VCF or BCF to emit without per-sample info.")
public File OUTPUT;
- @Option(shortName=StandardOptionDefinitions.SEQUENCE_DICTIONARY_SHORT_NAME, doc="Sequence dictionary to use when indexing the VCF.", optional = true)
+ @Option(shortName=StandardOptionDefinitions.SEQUENCE_DICTIONARY_SHORT_NAME,
+ doc="Sequence dictionary to use when indexing the VCF if the VCF header does not contain contig information.", optional = true)
public File SEQUENCE_DICTIONARY;
private static final Set<String> NO_SAMPLES = Collections.emptySet();
@@ -55,29 +62,28 @@ public class MakeSitesOnlyVcf extends CommandLineProgram {
if (SEQUENCE_DICTIONARY != null) IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
IoUtil.assertFileIsWritable(OUTPUT);
- final VCFFileReader reader = new VCFFileReader(INPUT);
+ final VCFFileReader reader = new VCFFileReader(INPUT, false);
final VCFHeader header = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
final SAMSequenceDictionary sequenceDictionary =
SEQUENCE_DICTIONARY != null
? SAMFileReader.getSequenceDictionary(SEQUENCE_DICTIONARY)
: header.getSequenceDictionary();
+
if (CREATE_INDEX && sequenceDictionary == null) {
throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
}
- final EnumSet<Options> options = CREATE_INDEX ? EnumSet.of(Options.INDEX_ON_THE_FLY) : EnumSet.noneOf(Options.class);
- final VariantContextWriter writer = VariantContextWriterFactory.create(OUTPUT, sequenceDictionary, options);
-
- writer.writeHeader(header);
final ProgressLogger progress = new ProgressLogger(Log.getInstance(MakeSitesOnlyVcf.class), 10000);
+ final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterFactory.DEFAULT_OPTIONS);
+ if (CREATE_INDEX) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY);
+
+ final VariantContextWriter writer = VariantContextWriterFactory.create(OUTPUT, sequenceDictionary, options);
+ writer.writeHeader(header);
+ final CloseableIterator<VariantContext> iterator = reader.iterator();
- final CloseableIterator<VariantContext> iterator = reader.iterator();
while (iterator.hasNext()) {
final VariantContext context = iterator.next();
- writer.add(context.subContextFromSamples(
- NO_SAMPLES,
- false // Do not re-derive the alleles from the new, subsetted genotypes: our site-only VCF should retain these values.
- ));
+ writer.add(context.subContextFromSamples(NO_SAMPLES, false)); // Do not re-derive the alleles from the new, subsetted genotypes: our site-only VCF should retain these values.
progress.record(context.getChr(), context.getStart());
}
diff --git a/src/java/net/sf/picard/vcf/VcfFormatConverter.java b/src/java/net/sf/picard/vcf/VcfFormatConverter.java
index e2fa253..72b35b9 100644
--- a/src/java/net/sf/picard/vcf/VcfFormatConverter.java
+++ b/src/java/net/sf/picard/vcf/VcfFormatConverter.java
@@ -90,12 +90,14 @@ public class VcfFormatConverter extends CommandLineProgram {
if (CREATE_INDEX && sequenceDictionary == null) {
throw new PicardException("A sequence dictionary must be available in the input file when creating indexed output.");
}
- final EnumSet<Options> options = CREATE_INDEX ? EnumSet.of(Options.INDEX_ON_THE_FLY) : EnumSet.noneOf(Options.class);
- final VariantContextWriter writer = VariantContextWriterFactory.create(OUTPUT, sequenceDictionary, options);
- writer.writeHeader(header);
+ final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterFactory.DEFAULT_OPTIONS);
+ if (CREATE_INDEX) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY);
+ final VariantContextWriter writer = VariantContextWriterFactory.create(OUTPUT, sequenceDictionary, options);
+ writer.writeHeader(header);
final CloseableIterator<VariantContext> iterator = reader.iterator();
+
while (iterator.hasNext()) {
final VariantContext context = iterator.next();
writer.add(context);
diff --git a/src/java/net/sf/samtools/util/BlockCompressedInputStream.java b/src/java/net/sf/samtools/util/BlockCompressedInputStream.java
index 7b5d9a0..6067641 100755
--- a/src/java/net/sf/samtools/util/BlockCompressedInputStream.java
+++ b/src/java/net/sf/samtools/util/BlockCompressedInputStream.java
@@ -63,7 +63,20 @@ public class BlockCompressedInputStream extends InputStream {
* Note that seek() is not supported if this ctor is used.
*/
public BlockCompressedInputStream(final InputStream stream) {
- mStream = IOUtil.toBufferedStream(stream);
+ this(stream, true);
+ }
+
+ /**
+ * Note that seek() is not supported if this ctor is used.
+ */
+ public BlockCompressedInputStream(final InputStream stream, final boolean allowBuffering) {
+ if (allowBuffering) {
+ mStream = IOUtil.toBufferedStream(stream);
+ }
+ else {
+ mStream = stream;
+ }
+
mFile = null;
}
diff --git a/src/java/net/sf/samtools/util/Lazy.java b/src/java/net/sf/samtools/util/Lazy.java
index 0d8c3ec..04a592b 100644
--- a/src/java/net/sf/samtools/util/Lazy.java
+++ b/src/java/net/sf/samtools/util/Lazy.java
@@ -32,4 +32,8 @@ public class Lazy<T> {
/** Returns the desired object instance. */
T make();
}
+
+ public boolean isInitialized() {
+ return isInitialized;
+ }
}
diff --git a/src/java/org/broad/tribble/util/TabixUtils.java b/src/java/org/broad/tribble/util/TabixUtils.java
index c1acad2..07cae06 100644
--- a/src/java/org/broad/tribble/util/TabixUtils.java
+++ b/src/java/org/broad/tribble/util/TabixUtils.java
@@ -23,13 +23,25 @@
*/
package org.broad.tribble.util;
+
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+import net.sf.samtools.util.BlockCompressedInputStream;
+import org.broad.tribble.TribbleException;
+import org.broad.tribble.readers.TabixReader;
+
+import java.io.File;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
/**
* classes that have anything to do with tabix
*/
public class TabixUtils {
+ public static final String STANDARD_INDEX_EXTENSION = ".tbi";
+
public static class TPair64 implements Comparable<TPair64> {
public long u, v;
@@ -62,4 +74,42 @@ public class TabixUtils {
public static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
return (u < v) ^ (u < 0) ^ (v < 0);
}
+
+ /**
+ * Generates the SAMSequenceDictionary from the given tabix index file
+ *
+ * @param tabixIndex the tabix index file
+ * @return non-null sequence dictionary
+ */
+ public static SAMSequenceDictionary getSequenceDictionary(final File tabixIndex) {
+ if (tabixIndex == null) throw new IllegalArgumentException();
+
+ try {
+ final BlockCompressedInputStream is = new BlockCompressedInputStream(tabixIndex);
+
+ // read preliminary bytes
+ byte[] buf = new byte[32];
+ is.read(buf, 0, 32);
+
+ // read sequence dictionary
+ int i, j, len = TabixReader.readInt(is);
+ buf = new byte[len];
+ is.read(buf);
+
+ final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();
+ for (i = j = 0; i < buf.length; ++i) {
+ if (buf[i] == 0) {
+ byte[] b = new byte[i - j];
+ System.arraycopy(buf, j, b, 0, b.length);
+ sequences.add(new SAMSequenceRecord(new String(b)));
+ j = i + 1;
+ }
+ }
+ is.close();
+
+ return new SAMSequenceDictionary(sequences);
+ } catch (Exception e) {
+ throw new TribbleException("Unable to read tabix index: " + e.getMessage());
+ }
+ }
}
diff --git a/src/java/org/broadinstitute/variant/bcf2/BCF2Utils.java b/src/java/org/broadinstitute/variant/bcf2/BCF2Utils.java
index a0eba5a..d699242 100644
--- a/src/java/org/broadinstitute/variant/bcf2/BCF2Utils.java
+++ b/src/java/org/broadinstitute/variant/bcf2/BCF2Utils.java
@@ -283,6 +283,11 @@ public final class BCF2Utils {
public static List<Object> toList(final Object o) {
if ( o == null ) return Collections.emptyList();
else if ( o instanceof List ) return (List<Object>)o;
+ else if ( o.getClass().isArray() ) {
+ final List<Object> l = new ArrayList<Object>();
+ Collections.addAll(l, (int[])o);
+ return l;
+ }
else return Collections.singletonList(o);
}
diff --git a/src/java/org/broadinstitute/variant/variantcontext/VariantContext.java b/src/java/org/broadinstitute/variant/variantcontext/VariantContext.java
index a18fd32..cc4a369 100644
--- a/src/java/org/broadinstitute/variant/variantcontext/VariantContext.java
+++ b/src/java/org/broadinstitute/variant/variantcontext/VariantContext.java
@@ -405,8 +405,17 @@ public class VariantContext implements Feature { // to enable tribble integratio
VariantContextBuilder builder = new VariantContextBuilder(this);
GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames);
- if ( rederiveAllelesFromGenotypes )
- builder.alleles(allelesOfGenotypes(newGenotypes));
+ if ( rederiveAllelesFromGenotypes ) {
+ Set<Allele> allelesFromGenotypes = allelesOfGenotypes(newGenotypes);
+
+ // ensure original order of genotypes
+ List<Allele> rederivedAlleles = new ArrayList<Allele>(allelesFromGenotypes.size());
+ for (Allele allele : alleles)
+ if (allelesFromGenotypes.contains(allele))
+ rederivedAlleles.add(allele);
+
+ builder.alleles(rederivedAlleles);
+ }
else {
builder.alleles(alleles);
}
@@ -1362,6 +1371,13 @@ public class VariantContext implements Feature { // to enable tribble integratio
}
public String toString() {
+ // Note: passing genotypes to String.format() will implicitly decode the genotypes
+ // This may not be desirable, so don't decode by default
+
+ return genotypes.isLazyWithData() ? toStringUnparsedGenotypes() : toStringDecodeGenotypes();
+ }
+
+ public String toStringDecodeGenotypes() {
return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s GT=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".",
@@ -1371,6 +1387,16 @@ public class VariantContext implements Feature { // to enable tribble integratio
this.getGenotypes());
}
+ private String toStringUnparsedGenotypes() {
+ return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s GT=%s",
+ getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
+ hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".",
+ this.getType(),
+ ParsingUtils.sortList(this.getAlleles()),
+ ParsingUtils.sortedString(this.getAttributes()),
+ ((LazyGenotypesContext)this.genotypes).getUnparsedGenotypeData());
+ }
+
public String toStringWithoutGenotypes() {
return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
diff --git a/src/java/org/broadinstitute/variant/variantcontext/writer/AsyncVariantContextWriter.java b/src/java/org/broadinstitute/variant/variantcontext/writer/AsyncVariantContextWriter.java
new file mode 100644
index 0000000..4bc2c2e
--- /dev/null
+++ b/src/java/org/broadinstitute/variant/variantcontext/writer/AsyncVariantContextWriter.java
@@ -0,0 +1,49 @@
+package org.broadinstitute.variant.variantcontext.writer;
+
+import net.sf.samtools.util.AbstractAsyncWriter;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.vcf.VCFHeader;
+
+/**
+ * AsyncVariantContextWriter that can be wrapped around an underlying AsyncVariantContextWriter to provide asynchronous output. Records
+ * added are placed into a queue, the queue is then drained into the underlying VariantContextWriter by a thread owned
+ * by the instance.
+ *
+ * Exceptions experienced by the writer thread will be emitted back to the caller in subsequent calls to either
+ * add() or close().
+ *
+ * @author George Grant
+ */
+public class AsyncVariantContextWriter extends AbstractAsyncWriter<VariantContext> implements VariantContextWriter {
+ private final VariantContextWriter underlyingWriter;
+
+ /**
+ * Creates a new AsyncVariantContextWriter wrapping the provided VariantContextWriter.
+ */
+ public AsyncVariantContextWriter(final VariantContextWriter out) {
+ this(out, DEFAULT_QUEUE_SIZE);
+ }
+
+ /**
+ * Creates an AsyncVariantContextWriter wrapping the provided VariantContextWriter and using the specified
+ * queue size for buffer VariantContexts.
+ */
+ public AsyncVariantContextWriter(final VariantContextWriter out, final int queueSize) {
+ super(queueSize);
+ this.underlyingWriter = out;
+ }
+
+ @Override protected void synchronouslyWrite(final VariantContext item) { this.underlyingWriter.add(item); }
+
+ @Override protected void synchronouslyClose() { this.underlyingWriter.close(); }
+
+ @Override protected final String getThreadNamePrefix() { return "VariantContextWriterThread-"; }
+
+ public void add(final VariantContext vc) {
+ write(vc);
+ }
+
+ public void writeHeader(final VCFHeader header) {
+ this.underlyingWriter.writeHeader(header);
+ }
+}
diff --git a/src/java/org/broadinstitute/variant/variantcontext/writer/Options.java b/src/java/org/broadinstitute/variant/variantcontext/writer/Options.java
index 3b6d464..eac2eb8 100644
--- a/src/java/org/broadinstitute/variant/variantcontext/writer/Options.java
+++ b/src/java/org/broadinstitute/variant/variantcontext/writer/Options.java
@@ -35,5 +35,6 @@ public enum Options {
INDEX_ON_THE_FLY,
DO_NOT_WRITE_GENOTYPES,
ALLOW_MISSING_FIELDS_IN_HEADER,
- FORCE_BCF
+ FORCE_BCF,
+ USE_ASYNC_IO // Turn on or off the use of asynchronous IO for writing output VCF files.
}
diff --git a/src/java/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java b/src/java/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java
index a4961b8..ec7f41e 100644
--- a/src/java/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java
+++ b/src/java/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java
@@ -25,6 +25,7 @@
package org.broadinstitute.variant.variantcontext.writer;
+import net.sf.samtools.Defaults;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.broad.tribble.index.IndexCreator;
@@ -42,6 +43,13 @@ public class VariantContextWriterFactory {
public static final EnumSet<Options> DEFAULT_OPTIONS = EnumSet.of(Options.INDEX_ON_THE_FLY);
public static final EnumSet<Options> NO_OPTIONS = EnumSet.noneOf(Options.class);
+ public static final String[] BLOCK_COMPRESSED_EXTENSIONS = {".gz", ".bgz", ".bgzf"};
+
+ static {
+ if (Defaults.USE_ASYNC_IO) {
+ DEFAULT_OPTIONS.add(Options.USE_ASYNC_IO);
+ }
+ }
private VariantContextWriterFactory() {}
@@ -65,21 +73,110 @@ public class VariantContextWriterFactory {
return create(null, output, refDict, options);
}
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages, and for naming the index,
+ * but does not control where the file is written
+ * @param output This is where the BCF is actually written.
+ */
+ public static VariantContextWriter createBcf2(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new BCF2Writer(location, output, refDict,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES)), options);
+ }
+
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages, and for naming the index,
+ * but does not control where the file is written
+ * @param output This is where the BCF is actually written.
+ */
+ public static VariantContextWriter createBcf2(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final IndexCreator indexCreator,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new BCF2Writer(location, output, refDict, indexCreator,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES)), options);
+ }
+
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages, and for naming the index,
+ * but does not control where the file is written
+ * @param output This is where the VCF is actually written.
+ */
+ public static VariantContextWriter createVcf(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new VCFWriter(location, output, refDict,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES),
+ options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)), options);
+ }
+
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages, and for naming the index,
+ * but does not control where the file is written
+ * @param output This is where the VCF is actually written.
+ */
+ public static VariantContextWriter createVcf(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final IndexCreator indexCreator,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new VCFWriter(location, output, refDict, indexCreator,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES),
+ options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)), options);
+ }
+
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages,
+ * but does not control where the file is written
+ * @param output This is where the VCF is actually written.
+ */
+ public static VariantContextWriter createBlockCompressedVcf(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new VCFWriter(location, maybeBgzfWrapOutputStream(location, output, options),
+ refDict,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES),
+ options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)), options);
+ }
+
+ /**
+ * @param location Note that this parameter is used to producing intelligent log messages,
+ * but does not control where the file is written
+ * @param output This is where the VCF is actually written.
+ */
+ public static VariantContextWriter createBlockCompressedVcf(final File location,
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final IndexCreator indexCreator,
+ final EnumSet<Options> options) {
+ return maybeWrapWithAsyncWriter(new VCFWriter(location, maybeBgzfWrapOutputStream(location, output, options),
+ refDict, indexCreator,
+ options.contains(Options.INDEX_ON_THE_FLY),
+ options.contains(Options.DO_NOT_WRITE_GENOTYPES),
+ options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)), options);
+ }
+
public static VariantContextWriter create(final File location,
- final OutputStream output,
- final SAMSequenceDictionary refDict,
- final EnumSet<Options> options) {
- final boolean enableBCF = isBCFOutput(location, options);
-
- if ( enableBCF )
- return new BCF2Writer(location, output, refDict,
- options.contains(Options.INDEX_ON_THE_FLY),
- options.contains(Options.DO_NOT_WRITE_GENOTYPES));
- else {
- return new VCFWriter(location, maybeBgzfWrapOutputStream(location, output, options), refDict,
- options.contains(Options.INDEX_ON_THE_FLY),
- options.contains(Options.DO_NOT_WRITE_GENOTYPES),
- options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
+ final OutputStream output,
+ final SAMSequenceDictionary refDict,
+ final EnumSet<Options> options) {
+
+ if (isBCFOutput(location, options)) {
+ return createBcf2(location, output, refDict, options);
+ } else if (isCompressedVcf(location)) {
+ return createBlockCompressedVcf(location, output, refDict, options);
+ } else {
+ return createVcf(location, output, refDict, options);
}
}
@@ -88,34 +185,35 @@ public class VariantContextWriterFactory {
final SAMSequenceDictionary refDict,
final IndexCreator indexCreator,
final EnumSet<Options> options) {
- final boolean enableBCF = isBCFOutput(location, options);
-
- if ( enableBCF )
- return new BCF2Writer(location, output, refDict, indexCreator,
- options.contains(Options.INDEX_ON_THE_FLY),
- options.contains(Options.DO_NOT_WRITE_GENOTYPES));
- else {
- return new VCFWriter(location, maybeBgzfWrapOutputStream(location, output, options), refDict, indexCreator,
- options.contains(Options.INDEX_ON_THE_FLY),
- options.contains(Options.DO_NOT_WRITE_GENOTYPES),
- options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
+
+ if (isBCFOutput(location, options)) {
+ return createBcf2(location, output, refDict, indexCreator, options);
+ } else if (isCompressedVcf(location)) {
+ return createBlockCompressedVcf(location, output, refDict, indexCreator, options);
+ } else {
+ return createVcf(location, output, refDict, indexCreator, options);
}
}
private static OutputStream maybeBgzfWrapOutputStream(final File location, OutputStream output,
final EnumSet<Options> options) {
- if (options.contains(Options.INDEX_ON_THE_FLY) &&
- (isCompressedVcf(location) || output instanceof BlockCompressedOutputStream)) {
- throw new IllegalArgumentException("VCF index creation not supported for vcf.gz output format.");
+ if (options.contains(Options.INDEX_ON_THE_FLY)) {
+ throw new IllegalArgumentException("VCF index creation not supported for block-compressed output format.");
}
if (!(output instanceof BlockCompressedOutputStream)) {
- if (isCompressedVcf(location)) {
output = new BlockCompressedOutputStream(output, location);
- }
}
return output;
}
+ private static VariantContextWriter maybeWrapWithAsyncWriter(final VariantContextWriter writer,
+ final EnumSet<Options> options) {
+ if (options.contains(Options.USE_ASYNC_IO)) {
+ return new AsyncVariantContextWriter(writer, AsyncVariantContextWriter.DEFAULT_QUEUE_SIZE);
+ }
+ else return writer;
+ }
+
/**
* Should we output a BCF file based solely on the name of the file at location?
*
@@ -131,7 +229,11 @@ public class VariantContextWriterFactory {
}
public static boolean isCompressedVcf(final File location) {
- return location != null && location.getName().endsWith(".gz");
+ if (location == null) return false;
+ for (final String extension : BLOCK_COMPRESSED_EXTENSIONS) {
+ if (location.getName().endsWith(extension)) return true;
+ }
+ return false;
}
public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, final int maxCachingStartDistance) {
diff --git a/src/java/org/broadinstitute/variant/vcf/AbstractVCFCodec.java b/src/java/org/broadinstitute/variant/vcf/AbstractVCFCodec.java
index 8071181..314a23f 100644
--- a/src/java/org/broadinstitute/variant/vcf/AbstractVCFCodec.java
+++ b/src/java/org/broadinstitute/variant/vcf/AbstractVCFCodec.java
@@ -717,9 +717,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
-
- private final static String[] INT_DECODE_ARRAY = new String[10000];
- private final static int[] decodeInts(final String string) {
+ private final String[] INT_DECODE_ARRAY = new String[10000];
+ private final int[] decodeInts(final String string) {
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
final int[] values = new int[nValues];
try {
diff --git a/src/java/org/broadinstitute/variant/vcf/VCFFileReader.java b/src/java/org/broadinstitute/variant/vcf/VCFFileReader.java
index 6ed9251..a9f03e1 100644
--- a/src/java/org/broadinstitute/variant/vcf/VCFFileReader.java
+++ b/src/java/org/broadinstitute/variant/vcf/VCFFileReader.java
@@ -31,7 +31,7 @@ public class VCFFileReader implements Closeable, Iterable<VariantContext> {
* Returns the SAMSequenceDictionary from the provided VCF file.
*/
public static SAMSequenceDictionary getSequenceDictionary(final File file) {
- final SAMSequenceDictionary dict = new VCFFileReader(file).getFileHeader().getSequenceDictionary();
+ final SAMSequenceDictionary dict = new VCFFileReader(file, false).getFileHeader().getSequenceDictionary();
CloserUtil.close(file);
return dict;
}
diff --git a/src/tests/java/net/sf/picard/illumina/parser/IlluminaFileUtilTest.java b/src/tests/java/net/sf/picard/illumina/parser/IlluminaFileUtilTest.java
index 8306556..e7b0492 100644
--- a/src/tests/java/net/sf/picard/illumina/parser/IlluminaFileUtilTest.java
+++ b/src/tests/java/net/sf/picard/illumina/parser/IlluminaFileUtilTest.java
@@ -11,9 +11,7 @@ import net.sf.picard.illumina.parser.IlluminaFileUtil.SupportedIlluminaFormat;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -26,6 +24,18 @@ public class IlluminaFileUtilTest {
private static final int [] DEFAULT_CYCLES = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
private static final int DEFAULT_LAST_CYCLE = 20;
+ // We have data for these that should agree
+ private static final List<SupportedIlluminaFormat> FORMATS_TO_TEST = Arrays.asList(
+ SupportedIlluminaFormat.Bcl,
+ SupportedIlluminaFormat.Cif,
+ SupportedIlluminaFormat.Cnf,
+ SupportedIlluminaFormat.Locs,
+ SupportedIlluminaFormat.Clocs,
+ SupportedIlluminaFormat.Pos,
+ SupportedIlluminaFormat.Filter,
+ SupportedIlluminaFormat.Barcode);
+
+
private File intensityDir;
private File basecallDir;
@@ -101,7 +111,7 @@ public class IlluminaFileUtilTest {
IlluminaFileUtil.makeParameterizedQseqRegex(lane);
}
- public void assertDefaults(final IlluminaFileUtil fileUtil, final Integer lane) {
+ public void assertDefaults(final IlluminaFileUtil fileUtil, final Integer lane, final List<SupportedIlluminaFormat> formatsToTest) {
if(lane == null) {
Assert.assertEquals(fileUtil.getLane(), DEFAULT_LANE);
} else {
@@ -143,7 +153,7 @@ public class IlluminaFileUtilTest {
Assert.assertEquals(detectedCycles[i], DEFAULT_CYCLES[i], "Elements differ at index " + i);
}
- Assert.assertEquals(fileUtil.getActualTiles(Arrays.asList(SupportedIlluminaFormat.values())), DEFAULT_TILES);
+ Assert.assertEquals(fileUtil.getActualTiles(formatsToTest), DEFAULT_TILES);
}
@Test
@@ -154,10 +164,18 @@ public class IlluminaFileUtilTest {
makeFiles(format, intensityDir, DEFAULT_LANE+2, DEFAULT_TILES, DEFAULT_CYCLES, DEFAULT_NUM_ENDS, ".bz2");
}
+ final Set<SupportedIlluminaFormat> formatsToTest = new HashSet<SupportedIlluminaFormat>();
+ // TODO: I can't be bothered to build files for these. AW
+ Collections.addAll(formatsToTest, SupportedIlluminaFormat.values());
+ formatsToTest.remove(SupportedIlluminaFormat.MultiTileBcl);
+ formatsToTest.remove(SupportedIlluminaFormat.MultiTileFilter);
+ formatsToTest.remove(SupportedIlluminaFormat.MultiTileLocs);
+ ArrayList<SupportedIlluminaFormat> formatsList = new ArrayList<SupportedIlluminaFormat>(formatsToTest);
+
for(int i = 0; i < 3; i++) {
final IlluminaFileUtil fileUtil = new IlluminaFileUtil(new File(intensityDir, "BaseCalls"), DEFAULT_LANE + i);
- Assert.assertEquals(fileUtil.getActualTiles(Arrays.asList(SupportedIlluminaFormat.values())), DEFAULT_TILES);
- assertDefaults(fileUtil, DEFAULT_LANE+i);
+ Assert.assertEquals(fileUtil.getActualTiles(formatsList), DEFAULT_TILES);
+ assertDefaults(fileUtil, DEFAULT_LANE+i, formatsList);
}
}
@@ -173,7 +191,7 @@ public class IlluminaFileUtilTest {
final IlluminaFileUtil fileUtil = new IlluminaFileUtil(new File(intensityDir, "BaseCalls"), DEFAULT_LANE + i);
- for(final SupportedIlluminaFormat format : SupportedIlluminaFormat.values()) {
+ for(final SupportedIlluminaFormat format : FORMATS_TO_TEST) {
if(format != SupportedIlluminaFormat.Qseq) {
Assert.assertEquals(new ArrayList<String>(), fileUtil.getUtil(format).verify(DEFAULT_TILES, DEFAULT_CYCLES));
}
@@ -246,7 +264,7 @@ public class IlluminaFileUtilTest {
int totalDifferences = 0;
List<String> differences = new ArrayList<String>();
- for(final SupportedIlluminaFormat format : SupportedIlluminaFormat.values()) {
+ for(final SupportedIlluminaFormat format : FORMATS_TO_TEST) {
if(format != SupportedIlluminaFormat.Qseq) {
final List<String> curDiffs = fileUtil.getUtil(format).verify(DEFAULT_TILES, DEFAULT_CYCLES);
differences.addAll(curDiffs);
diff --git a/src/tests/java/net/sf/picard/illumina/parser/PerTilePerCycleParserTest.java b/src/tests/java/net/sf/picard/illumina/parser/PerTilePerCycleParserTest.java
index 4e57a2c..ab6bfcd 100644
--- a/src/tests/java/net/sf/picard/illumina/parser/PerTilePerCycleParserTest.java
+++ b/src/tests/java/net/sf/picard/illumina/parser/PerTilePerCycleParserTest.java
@@ -48,7 +48,7 @@ public class PerTilePerCycleParserTest {
}
@Override
- protected CycleFileParser<MockCycledIlluminaData> makeCycleFileParser(final File file, final int cycle) {
+ protected CycleFileParser<MockCycledIlluminaData> makeCycleFileParser(final File file, final int cycle, final int tileNumber) {
return new CycleFileParser<MockCycledIlluminaData>() {
int currentCluster = 0;
final String filePrefix = str_del(file.getName(), cycle);
diff --git a/src/tests/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReaderTest.java b/src/tests/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReaderTest.java
index b8ab21d..f245e2e 100644
--- a/src/tests/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReaderTest.java
+++ b/src/tests/java/net/sf/picard/illumina/parser/readers/AbstractIlluminaPositionFileReaderTest.java
@@ -101,8 +101,8 @@ public class AbstractIlluminaPositionFileReaderTest {
@DataProvider(name="invalidPositions")
public Object [][] invalidPositions() {
return new Object[][] {
- {new float[]{0f, 5f, -1f}, new float[]{1f, 12f, 13f}},
- {new float[]{-5.05f}, new float[]{19801f}},
+ {new float[]{0f, 5f, -11f}, new float[]{1f, 12f, 13f}},
+ {new float[]{-15.05f}, new float[]{19801f}},
{new float[]{10.05f, 3f, 8f}, new float[]{-120899.723f, 4f, 9f}},
{new float[]{9.0f, 2.3f, AbstractIlluminaPositionFileReader.MAX_POS + 1}, new float[]{3.2f, 8.1f, 99.1f}},
{new float[]{0.01f}, new float[]{AbstractIlluminaPositionFileReader.MAX_POS + 1000}}
diff --git a/src/tests/java/net/sf/picard/reference/ReferenceSequenceFileWalkerTest.java b/src/tests/java/net/sf/picard/reference/ReferenceSequenceFileWalkerTest.java
new file mode 100644
index 0000000..5149376
--- /dev/null
+++ b/src/tests/java/net/sf/picard/reference/ReferenceSequenceFileWalkerTest.java
@@ -0,0 +1,60 @@
+package net.sf.picard.reference;
+
+import net.sf.picard.PicardException;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.File;
+
+/**
+ * Created by farjoun on 2/14/14.
+ */
+public class ReferenceSequenceFileWalkerTest {
+
+
+ @DataProvider(name = "TestReference")
+ public Object[][] TestReference() {
+ return new Object[][]{
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", 0, 1},
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", 1, 1},
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", 0, 0}
+ };
+ }
+
+
+ @Test(dataProvider = "TestReference")
+ public void testGet(final String fileName, final int index1, final int index2) throws PicardException {
+ final File refFile = new File(fileName);
+ final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile);
+
+ ReferenceSequence sequence = refWalker.get(index1);
+ Assert.assertEquals(sequence.getContigIndex(), index1);
+
+ sequence = refWalker.get(index2);
+ Assert.assertEquals(sequence.getContigIndex(), index2);
+ }
+
+
+ @DataProvider(name = "TestFailReference")
+ public Object[][] TestFailReference() {
+ return new Object[][]{
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", 2,3},
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", 1,0},
+ new Object[]{"testdata/net/sf/picard/reference/Homo_sapiens_assembly18.trimmed.fasta", -1,0}
+ };
+ }
+
+
+ @Test(expectedExceptions = {PicardException.class}, dataProvider = "TestFailReference")
+ public void testFailGet(final String fileName, final int index1, final int index2) throws PicardException {
+ final File refFile = new File(fileName);
+ final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile);
+
+ refWalker.get(index1);
+
+ refWalker.get(index2);
+ }
+
+
+}
diff --git a/src/tests/java/net/sf/picard/sam/MergeBamAlignmentTest.java b/src/tests/java/net/sf/picard/sam/MergeBamAlignmentTest.java
index df61c10..b3b558e 100644
--- a/src/tests/java/net/sf/picard/sam/MergeBamAlignmentTest.java
+++ b/src/tests/java/net/sf/picard/sam/MergeBamAlignmentTest.java
@@ -321,7 +321,7 @@ public class MergeBamAlignmentTest {
final File target = File.createTempFile("target", "bam");
target.deleteOnExit();
final SamAlignmentMerger merger = new SamAlignmentMerger(unmapped, target, fasta, null, true, false,
- true, false, Arrays.asList(aligned), 1, null, null, null, null, null,
+ false, Arrays.asList(aligned), 1, null, null, null, null, null,
Arrays.asList(SamPairUtil.PairOrientation.FR), SAMFileHeader.SortOrder.coordinate,
new BestMapqPrimaryAlignmentSelectionStrategy());
diff --git a/src/tests/java/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java b/src/tests/java/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java
index 4564496..e7c4a93 100644
--- a/src/tests/java/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java
+++ b/src/tests/java/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java
@@ -536,26 +536,29 @@ public class VariantContextUnitTest extends VariantBaseTest {
Assert.assertEquals(1, vc.getNoCallCount());
}
- @Test
+ @Test
public void testVCFfromGenotypes() {
- List<Allele> alleles = Arrays.asList(Aref, T);
+ List<Allele> alleles = Arrays.asList(Aref, C, T);
Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
Genotype g4 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
- VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4).make();
+ Genotype g5 = GenotypeBuilder.create("AC", Arrays.asList(Aref, C));
+ VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
VariantContext vc12 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName(), g2.getSampleName())), true);
VariantContext vc1 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName())), true);
VariantContext vc23 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g2.getSampleName(), g3.getSampleName())), true);
VariantContext vc4 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g4.getSampleName())), true);
VariantContext vc14 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName(), g4.getSampleName())), true);
+ VariantContext vc125 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName(), g2.getSampleName(), g5.getSampleName())), true);
Assert.assertTrue(vc12.isPolymorphicInSamples());
Assert.assertTrue(vc23.isPolymorphicInSamples());
Assert.assertTrue(vc1.isMonomorphicInSamples());
Assert.assertTrue(vc4.isMonomorphicInSamples());
Assert.assertTrue(vc14.isMonomorphicInSamples());
+ Assert.assertTrue(vc125.isPolymorphicInSamples());
Assert.assertTrue(vc12.isSNP());
Assert.assertTrue(vc12.isVariant());
@@ -577,11 +580,16 @@ public class VariantContextUnitTest extends VariantBaseTest {
Assert.assertFalse(vc14.isVariant());
Assert.assertFalse(vc14.isBiallelic());
+ Assert.assertTrue(vc125.isSNP());
+ Assert.assertTrue(vc125.isVariant());
+ Assert.assertFalse(vc125.isBiallelic());
+
Assert.assertEquals(3, vc12.getCalledChrCount(Aref));
Assert.assertEquals(1, vc23.getCalledChrCount(Aref));
Assert.assertEquals(2, vc1.getCalledChrCount(Aref));
Assert.assertEquals(0, vc4.getCalledChrCount(Aref));
Assert.assertEquals(2, vc14.getCalledChrCount(Aref));
+ Assert.assertEquals(4, vc125.getCalledChrCount(Aref));
}
public void testGetGenotypeMethods() {
@@ -798,6 +806,7 @@ public class VariantContextUnitTest extends VariantBaseTest {
tests.add(new Object[]{new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles)});
tests.add(new Object[]{new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles)});
tests.add(new Object[]{new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles)});
+ tests.add(new Object[]{new SubContextTest(Arrays.asList("AA", "AT", "AC"), updateAlleles)});
}
return tests.toArray(new Object[][]{});
@@ -808,9 +817,10 @@ public class VariantContextUnitTest extends VariantBaseTest {
Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
+ Genotype g4 = GenotypeBuilder.create("AC", Arrays.asList(Aref, C));
- GenotypesContext gc = GenotypesContext.create(g1, g2, g3);
- VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
+ GenotypesContext gc = GenotypesContext.create(g1, g2, g3, g4);
+ VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, C, T)).genotypes(gc).make();
VariantContext sub = vc.subContextFromSamples(cfg.samples, cfg.updateAlleles);
// unchanged attributes should be the same
@@ -826,22 +836,33 @@ public class VariantContextUnitTest extends VariantBaseTest {
if ( cfg.samples.contains(g1.getSampleName()) ) expectedGenotypes.add(g1);
if ( cfg.samples.contains(g2.getSampleName()) ) expectedGenotypes.add(g2);
if ( cfg.samples.contains(g3.getSampleName()) ) expectedGenotypes.add(g3);
+ if ( cfg.samples.contains(g4.getSampleName()) ) expectedGenotypes.add(g4);
GenotypesContext expectedGC = GenotypesContext.copy(expectedGenotypes);
// these values depend on the results of sub
if ( cfg.updateAlleles ) {
// do the work to see what alleles should be here, and which not
- Set<Allele> alleles = new HashSet<Allele>();
- for ( final Genotype g : expectedGC ) alleles.addAll(g.getAlleles());
- if ( ! alleles.contains(Aref) ) alleles.add(Aref); // always have the reference
- Assert.assertEquals(new HashSet<Allele>(sub.getAlleles()), alleles);
+ List<Allele> expectedAlleles = new ArrayList<Allele>();
+ expectedAlleles.add(Aref);
+
+ Set<Allele> genotypeAlleles = new HashSet<Allele>();
+ for ( final Genotype g : expectedGC )
+ genotypeAlleles.addAll(g.getAlleles());
+ genotypeAlleles.remove(Aref);
+
+ // ensure original allele order
+ for (Allele allele: vc.getAlleles())
+ if (genotypeAlleles.contains(allele))
+ expectedAlleles.add(allele);
+
+ Assert.assertEquals(sub.getAlleles(), expectedAlleles);
} else {
// not updating alleles -- should be the same
Assert.assertEquals(sub.getAlleles(), vc.getAlleles());
}
// same sample names => success
- Assert.assertEquals(sub.getGenotypes().getSampleNames(), expectedGC.getSampleNames());
+ Assert.assertTrue(sub.getGenotypes().getSampleNames().equals(expectedGC.getSampleNames()));
}
// --------------------------------------------------------------------------------
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/picard-tools.git
More information about the debian-med-commit
mailing list