[med-svn] [SCM] picard-tools branch, upstream, updated. upstream/1.80-1-gb78487e
Olivier Sallou
olivier.sallou at debian.org
Thu Jan 3 09:47:28 UTC 2013
The following commit has been merged in the upstream branch:
commit b78487ee745a707f4c8604053e25d1526ed2751f
Author: Olivier Sallou <olivier.sallou at debian.org>
Date: Thu Jan 3 10:34:04 2013 +0100
Imported Upstream version 1.82
diff --git a/Picard-public.ipr b/Picard-public.ipr
index 68b3a40..fd4cd67 100644
--- a/Picard-public.ipr
+++ b/Picard-public.ipr
@@ -35,7 +35,11 @@
<entry name="?*.tld" />
<entry name="?*.ftl" />
</wildcardResourcePatterns>
- <annotationProcessing enabled="false" useClasspath="true" />
+ <annotationProcessing>
+ <profile default="true" name="Default" enabled="false">
+ <processorPath useClasspath="true" />
+ </profile>
+ </annotationProcessing>
</component>
<component name="CopyrightManager" default="">
<module2copyright />
@@ -92,6 +96,21 @@
<option name="REPORT_VARIABLES" value="true" />
<option name="REPORT_PARAMETERS" value="true" />
</inspection_tool>
+ <inspection_tool class="UnusedDeclaration" enabled="false" level="WARNING" enabled_by_default="false">
+ <option name="ADD_MAINS_TO_ENTRIES" value="true" />
+ <option name="ADD_APPLET_TO_ENTRIES" value="true" />
+ <option name="ADD_SERVLET_TO_ENTRIES" value="true" />
+ <option name="ADD_NONJAVA_TO_ENTRIES" value="true" />
+ </inspection_tool>
+ <inspection_tool class="groupsTestNG" enabled="true" level="WARNING" enabled_by_default="true">
+ <option name="groups">
+ <value>
+ <list size="1">
+ <item index="0" class="java.lang.String" itemvalue="unix" />
+ </list>
+ </value>
+ </option>
+ </inspection_tool>
</profile>
</profiles>
<option name="PROJECT_PROFILE" value="Project Default" />
@@ -247,6 +266,15 @@
<component name="ProjectDetails">
<option name="projectName" value="Picard-public" />
</component>
+ <component name="ProjectDictionaryState">
+ <dictionary name="mccowan">
+ <words>
+ <w>inferer</w>
+ <w>inferrer</w>
+ <w>phread</w>
+ </words>
+ </dictionary>
+ </component>
<component name="ProjectKey">
<option name="state" value="https://picard.svn.sourceforge.net/svnroot/picard/trunk/Picard-public.ipr" />
</component>
diff --git a/build.xml b/build.xml
index 4c82fe7..e379da4 100755
--- a/build.xml
+++ b/build.xml
@@ -38,7 +38,7 @@
<!-- Get SVN revision, if available, otherwise leave it blank. -->
<exec executable="svnversion" outputproperty="repository.revision" failifexecutionfails="false"/>
<property name="repository.revision" value=""/>
- <property name="sam-version" value="1.80"/>
+ <property name="sam-version" value="1.82"/>
<property name="picard-version" value="${sam-version}"/>
<property name="command_tmp" value=".command_tmp"/>
<property name="command-line-html-dir" value="${dist}/html"/>
diff --git a/src/java/net/sf/picard/cmdline/CommandLineProgram.java b/src/java/net/sf/picard/cmdline/CommandLineProgram.java
index 2860fe4..df6192d 100644
--- a/src/java/net/sf/picard/cmdline/CommandLineProgram.java
+++ b/src/java/net/sf/picard/cmdline/CommandLineProgram.java
@@ -247,10 +247,21 @@ public abstract class CommandLineProgram {
return commandLineParser;
}
+ /**
+ * @return This is a little-used version string that can be put in the @Usage annotation
+ */
public String getProgramVersion() {
return commandLineParser.getProgramVersion();
}
+
+ /**
+ * @return Version stored in the manifest of the jarfile.
+ */
+ public String getVersion() {
+ return getCommandLineParser().getVersion();
+ }
+
public String getCommandLine() {
return commandLine;
}
diff --git a/src/java/net/sf/picard/fastq/FastqRecord.java b/src/java/net/sf/picard/fastq/FastqRecord.java
index 74fdd0c..e4830c6 100755
--- a/src/java/net/sf/picard/fastq/FastqRecord.java
+++ b/src/java/net/sf/picard/fastq/FastqRecord.java
@@ -28,14 +28,16 @@ package net.sf.picard.fastq;
*/
public class FastqRecord {
- private String seqHeaderPrefix;
- private String seqLine;
- private String qualHeaderPrefix;
- private String qualLine;
+ private final String seqHeaderPrefix;
+ private final String seqLine;
+ private final String qualHeaderPrefix;
+ private final String qualLine;
public FastqRecord(final String seqHeaderPrefix, final String seqLine, final String qualHeaderPrefix, final String qualLine) {
- if (seqHeaderPrefix.length() > 0) this.seqHeaderPrefix = seqHeaderPrefix;
- if (qualHeaderPrefix.length() > 0) this.qualHeaderPrefix = qualHeaderPrefix;
+ if (seqHeaderPrefix != null && seqHeaderPrefix.length() > 0) this.seqHeaderPrefix = seqHeaderPrefix;
+ else this.seqHeaderPrefix = null;
+ if (qualHeaderPrefix != null && qualHeaderPrefix.length() > 0) this.qualHeaderPrefix = qualHeaderPrefix;
+ else this.qualHeaderPrefix = null;
this.seqLine = seqLine ;
this.qualLine = qualLine ;
}
diff --git a/src/java/net/sf/picard/sam/AbstractAlignmentMerger.java b/src/java/net/sf/picard/sam/AbstractAlignmentMerger.java
index 214ff4c..692a955 100644
--- a/src/java/net/sf/picard/sam/AbstractAlignmentMerger.java
+++ b/src/java/net/sf/picard/sam/AbstractAlignmentMerger.java
@@ -181,6 +181,20 @@ public abstract class AbstractAlignmentMerger {
}
/**
+ * Do this unconditionally, not just for aligned records, for two reasons:
+ * - An unaligned read has been processed by the aligner, so it is more truthful.
+ * - When chaining additional PG records, having all the records in the output file refer to the same PG
+ * record means that only one chain will need to be created, rather than a chain for the mapped reads
+ * and a separate chain for the unmapped reads.
+ */
+ private void maybeSetPgTag(final SAMRecord rec) {
+ if (this.programRecord != null) {
+ rec.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, this.programRecord.getProgramGroupId());
+ }
+ }
+ /**
+
+ /**
* Merges the alignment data with the non-aligned records from the source BAM file.
*/
public void mergeAlignment() {
@@ -206,12 +220,15 @@ public abstract class AbstractAlignmentMerger {
while (unmappedIterator.hasNext()) {
// Load next unaligned read or read pair.
final SAMRecord rec = unmappedIterator.next();
+
rec.setHeader(this.header);
+ maybeSetPgTag(rec);
final SAMRecord secondOfPair;
if (rec.getReadPairedFlag()) {
secondOfPair = unmappedIterator.next();
secondOfPair.setHeader(this.header);
+ maybeSetPgTag(secondOfPair);
// Validate that paired reads arrive as first of pair followed by second of pair
if (!rec.getReadName().equals(secondOfPair.getReadName()))
@@ -362,10 +379,6 @@ public abstract class AbstractAlignmentMerger {
private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SAMRecord aligned) {
setValuesFromAlignment(unaligned, aligned);
updateCigarForTrimmedOrClippedBases(unaligned, aligned);
- if (this.programRecord != null) {
- unaligned.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID,
- this.programRecord.getProgramGroupId());
- }
if (SAMUtils.cigarMapsNoBasesToRef(unaligned.getCigar())) {
SAMUtils.makeReadUnmapped(unaligned);
}
diff --git a/src/java/net/sf/picard/sam/FastqToSam.java b/src/java/net/sf/picard/sam/FastqToSam.java
index c23dd93..0d95806 100644
--- a/src/java/net/sf/picard/sam/FastqToSam.java
+++ b/src/java/net/sf/picard/sam/FastqToSam.java
@@ -31,10 +31,7 @@ import net.sf.picard.cmdline.Usage;
import net.sf.picard.fastq.FastqReader;
import net.sf.picard.fastq.FastqRecord;
import net.sf.picard.io.IoUtil;
-import net.sf.picard.util.FastqQualityFormat;
-import net.sf.picard.util.Log;
-import net.sf.picard.util.ProgressLogger;
-import net.sf.picard.util.SolexaQualityConverter;
+import net.sf.picard.util.*;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader.SortOrder;
import net.sf.samtools.util.Iso8601Date;
@@ -66,7 +63,7 @@ public class FastqToSam extends CommandLineProgram {
@Option(shortName="V", doc="A value describing how the quality values are encoded in the fastq. Either Solexa for pre-pipeline 1.3 " +
"style scores (solexa scaling + 66), Illumina for pipeline 1.3 and above (phred scaling + 64) or Standard for phred scaled " +
- "scores with a character shift of 33.")
+ "scores with a character shift of 33. If this value is not specified, the quality format will be detected automatically.", optional = true)
public FastqQualityFormat QUALITY_FORMAT;
@Option(doc="Output SAM/BAM file. ", shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME)
@@ -120,6 +117,23 @@ public class FastqToSam extends CommandLineProgram {
/* Simply invokes the right method for unpaired or paired data. */
protected int doWork() {
+ if (QUALITY_FORMAT == null) {
+ final QualityEncodingDetector detector = new QualityEncodingDetector();
+ final FastqReader reader = new FastqReader(FASTQ);
+ if (FASTQ2 == null) {
+ detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader);
+ } else {
+ final FastqReader reader2 = new FastqReader(FASTQ2);
+ detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader, reader2);
+ reader2.close();
+ }
+ reader.close();
+
+ QUALITY_FORMAT = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ);
+ if (detector.isDeterminationAmbiguous())
+ LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities.");
+ LOG.info(String.format("Auto-detected quality format as: %s.", QUALITY_FORMAT));
+ }
final int readCount = (FASTQ2 == null) ? doUnpaired() : doPaired();
LOG.info("Processed " + readCount + " fastq reads");
return 0;
diff --git a/src/java/net/sf/picard/sam/MarkDuplicates.java b/src/java/net/sf/picard/sam/MarkDuplicates.java
index 265afcc..3dd9ab1 100644
--- a/src/java/net/sf/picard/sam/MarkDuplicates.java
+++ b/src/java/net/sf/picard/sam/MarkDuplicates.java
@@ -69,6 +69,26 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
@Option(shortName="M", doc="File to write duplication metrics to")
public File METRICS_FILE;
+ @Option(shortName=StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
+ doc="The program record ID for the @PG record(s) created by this program. " +
+ "Set to null to disable PG record creation. This string may have a suffix appended to avoid " +
+ "collision with other program record IDs.",
+ optional=true)
+ public String PROGRAM_RECORD_ID = "MarkDuplicates";
+
+ @Option(shortName="PG_VERSION",
+ doc="Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.",
+ optional=true)
+ public String PROGRAM_GROUP_VERSION;
+
+ @Option(shortName="PG_COMMAND",
+ doc="Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.",
+ optional=true)
+ public String PROGRAM_GROUP_COMMAND_LINE;
+
+ @Option(shortName="PG_NAME", doc="Value of PN tag of PG record to be created.")
+ public String PROGRAM_GROUP_NAME = "MarkDuplicates";
+
@Option(doc="Comment(s) to include in the output file's header.", optional=true, shortName="CO")
public List<String> COMMENT = new ArrayList<String>();
@@ -102,6 +122,13 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
// Variables used for optical duplicate detection and tracking
private final Histogram<Short> opticalDupesByLibraryId = new Histogram<Short>();
+ // All PG IDs seen in merged input files in first pass. These are gather for two reasons:
+ // - to know how many different PG records to create to represent this program invocation.
+ // - to know what PG IDs are already used to avoid collisions when creating new ones.
+ // Note that if there are one or more records that do not have a PG tag, then a null value
+ // will be stored in this set.
+ private final Set<String> pgIdsSeen = new HashSet<String>();
+
/** Stock main method. */
public static void main(final String[] args) {
System.exit(new MarkDuplicates().instanceMain(args));
@@ -134,6 +161,33 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
final SAMFileHeader outputHeader = header.clone();
outputHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
for (final String comment : COMMENT) outputHeader.addComment(comment);
+
+ // Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it.
+ final Map<String, String> chainedPgIds;
+ // Generate new PG record(s)
+ if (PROGRAM_RECORD_ID != null) {
+ final PgIdGenerator pgIdGenerator = new PgIdGenerator(outputHeader);
+ if (PROGRAM_GROUP_VERSION == null) {
+ PROGRAM_GROUP_VERSION = this.getVersion();
+ }
+ if (PROGRAM_GROUP_COMMAND_LINE == null) {
+ PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine();
+ }
+ chainedPgIds = new HashMap<String, String>();
+ for (final String existingId : pgIdsSeen) {
+ final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID);
+ chainedPgIds.put(existingId, newPgId);
+ final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId);
+ programRecord.setProgramVersion(PROGRAM_GROUP_VERSION);
+ programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
+ programRecord.setProgramName(PROGRAM_GROUP_NAME);
+ programRecord.setPreviousProgramGroupId(existingId);
+ outputHeader.addProgramRecord(programRecord);
+ }
+ } else {
+ chainedPgIds = null;
+ }
+
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader,
true,
OUTPUT);
@@ -206,6 +260,9 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
// do nothing
}
else {
+ if (PROGRAM_RECORD_ID != null) {
+ rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name())));
+ }
out.addAlignment(rec);
progress.record(rec);
}
@@ -327,6 +384,13 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
+
+ // This doesn't have anything to do with building sorted ReadEnd lists, but it can be done in the same pass
+ // over the input
+ if (PROGRAM_RECORD_ID != null) {
+ pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name()));
+ }
+
if (rec.getReadUnmappedFlag()) {
if (rec.getReferenceIndex() == -1) {
// When we hit the unmapped reads with no coordinate, no reason to continue.
@@ -607,7 +671,7 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
final boolean[] opticalDuplicateFlags = findOpticalDuplicates(list, OPTICAL_DUPLICATE_PIXEL_DISTANCE);
int opticalDuplicates = 0;
- for (boolean b: opticalDuplicateFlags) if (b) ++opticalDuplicates;
+ for (final boolean b: opticalDuplicateFlags) if (b) ++opticalDuplicates;
if (opticalDuplicates > 0) {
this.opticalDupesByLibraryId.increment(list.get(0).libraryId, opticalDuplicates);
}
@@ -658,4 +722,38 @@ public class MarkDuplicates extends AbstractDuplicateFindingAlgorithm {
return retval;
}
}
+
+ static class PgIdGenerator {
+ private int recordCounter;
+
+ private final Set<String> idsThatAreAlreadyTaken = new HashSet<String>();
+
+ PgIdGenerator(final SAMFileHeader header) {
+ for (final SAMProgramRecord pgRecord : header.getProgramRecords()) {
+ idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId());
+ }
+ recordCounter = idsThatAreAlreadyTaken.size();
+ }
+
+ String getNonCollidingId(final String recordId) {
+ if(!idsThatAreAlreadyTaken.contains(recordId)) {
+ // don't remap 1st record. If there are more records
+ // with this id, they will be remapped in the 'else'.
+ idsThatAreAlreadyTaken.add(recordId);
+ ++recordCounter;
+ return recordId;
+ } else {
+ String newId;
+ // Below we tack on one of roughly 1.7 million possible 4 digit base36 at random. We do this because
+ // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of
+ // calls idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly
+ // processed bams.
+ while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++)));
+
+ idsThatAreAlreadyTaken.add( newId );
+ return newId;
+ }
+
+ }
+ }
}
diff --git a/src/java/net/sf/picard/sam/SamFileValidator.java b/src/java/net/sf/picard/sam/SamFileValidator.java
index 1fe5e6c..2aa93fa 100644
--- a/src/java/net/sf/picard/sam/SamFileValidator.java
+++ b/src/java/net/sf/picard/sam/SamFileValidator.java
@@ -30,9 +30,7 @@ import net.sf.picard.metrics.MetricsFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileWalker;
-import net.sf.picard.util.Histogram;
-import net.sf.picard.util.Log;
-import net.sf.picard.util.ProgressLogger;
+import net.sf.picard.util.*;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMValidationError.Type;
@@ -48,15 +46,15 @@ import java.util.*;
* <li>checks sam file header for read groups</li>
* <li>for each sam record
* <ul>
- * <li>reports error detected by SAMRecord.isValid()</li>
- * <li>validates NM (nucleotide differences) exists and matches reality</li>
- * <li>validates mate fields agree with data in the mate record</li>
+ * <li>reports error detected by SAMRecord.isValid()</li>
+ * <li>validates NM (nucleotide differences) exists and matches reality</li>
+ * <li>validates mate fields agree with data in the mate record</li>
* </ul>
* </li>
* </ul>
*
- * @see SAMRecord#isValid()
* @author Doug Voet
+ * @see SAMRecord#isValid()
*/
public class SamFileValidator {
private Histogram<Type> errorsByType = new Histogram<Type>();
@@ -80,7 +78,9 @@ public class SamFileValidator {
this.maxTempFiles = maxTempFiles;
}
- /** Sets one or more error types that should not be reported on. */
+ /**
+ * Sets one or more error types that should not be reported on.
+ */
public void setErrorsToIgnore(final Collection<Type> types) {
if (!types.isEmpty()) {
this.errorsToIgnore = EnumSet.copyOf(types);
@@ -93,7 +93,7 @@ public class SamFileValidator {
/**
* Outputs validation summary report to out.
- *
+ *
* @param samReader records to validate
* @param reference if null, NM tag validation is skipped
* @return boolean true if there are no validation errors, otherwise false
@@ -123,10 +123,10 @@ public class SamFileValidator {
/**
* Outputs validation error details to out.
- *
+ *
* @param samReader records to validate
* @param reference if null, NM tag validation is skipped
- * processing will stop after this threshold has been reached
+ * processing will stop after this threshold has been reached
* @return boolean true if there are no validation errors, otherwise false
*/
public boolean validateSamFileVerbose(final SAMFileReader samReader, final ReferenceSequenceFile reference) {
@@ -174,16 +174,16 @@ public class SamFileValidator {
samReader.setValidationStringency(ValidationStringency.SILENT);
validateHeader(samReader.getFileHeader());
orderChecker = new SAMSortOrderChecker(samReader.getFileHeader().getSortOrder());
- validateSamRecords(samReader, samReader.getFileHeader());
+ validateSamRecordsAndQualityFormat(samReader, samReader.getFileHeader());
validateUnmatchedPairs();
- if (validateIndex){
+ if (validateIndex) {
try {
- BamIndexValidator.exhaustivelyTestIndex(samReader);
- } catch (Exception e){
+ BamIndexValidator.exhaustivelyTestIndex(samReader);
+ } catch (Exception e) {
addError(new SAMValidationError(Type.INVALID_INDEX_FILE_POINTER, e.getMessage(), null));
}
}
-
+
if (errorsByType.isEmpty()) {
out.println("No errors found");
}
@@ -202,7 +202,7 @@ public class SamFileValidator {
// For the coordinate-sorted map, need to detect mate pairs in which the mateReferenceIndex on one end
// does not match the readReference index on the other end, so the pairs weren't united and validated.
inMemoryPairMap = new InMemoryPairEndInfoMap();
- CloseableIterator<Map.Entry<String, PairEndInfo>> it = ((CoordinateSortedPairEndInfoMap)pairEndInfoByName).iterator();
+ CloseableIterator<Map.Entry<String, PairEndInfo>> it = ((CoordinateSortedPairEndInfoMap) pairEndInfoByName).iterator();
while (it.hasNext()) {
Map.Entry<String, PairEndInfo> entry = it.next();
PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey());
@@ -219,20 +219,28 @@ public class SamFileValidator {
}
it.close();
} else {
- inMemoryPairMap = (InMemoryPairEndInfoMap)pairEndInfoByName;
+ inMemoryPairMap = (InMemoryPairEndInfoMap) pairEndInfoByName;
}
// At this point, everything in InMemoryMap is a read marked as a pair, for which a mate was not found.
- for (final Map.Entry<String, PairEndInfo> entry: inMemoryPairMap) {
+ for (final Map.Entry<String, PairEndInfo> entry : inMemoryPairMap) {
addError(new SAMValidationError(Type.MATE_NOT_FOUND, "Mate not found for paired read", entry.getKey()));
}
}
- private void validateSamRecords(final Iterable<SAMRecord> samRecords, final SAMFileHeader header) {
+ /**
+ * SAM record and quality format validations are combined into a single method because validation must be completed
+ * in only a single pass of the SamRecords (because a SamReader's iterator() method may not return the same
+ * records on a subsequent call).
+ */
+ private void validateSamRecordsAndQualityFormat(final Iterable<SAMRecord> samRecords, final SAMFileHeader header) {
SAMRecordIterator iter = (SAMRecordIterator) samRecords.iterator();
final ProgressLogger progress = new ProgressLogger(log, 10000000, "Validated Read");
+ final QualityEncodingDetector qualityDetector = new QualityEncodingDetector();
try {
- while(iter.hasNext()){
- SAMRecord record = iter.next();
+ while (iter.hasNext()) {
+ final SAMRecord record = iter.next();
+
+ qualityDetector.add(record);
final long recordNumber = progress.getCount() + 1;
final Collection<SAMValidationError> errors = record.isValid();
@@ -242,7 +250,7 @@ public class SamFileValidator {
addError(error);
}
}
-
+
validateMateFields(record, recordNumber);
validateSortOrder(record, recordNumber);
validateReadGroup(record, header);
@@ -259,6 +267,17 @@ public class SamFileValidator {
}
progress.record(record);
}
+
+ try {
+ if (progress.getCount() > 0) { // Avoid exception being thrown as a result of no qualities being read
+ final FastqQualityFormat format = qualityDetector.generateBestGuess(QualityEncodingDetector.FileContext.SAM);
+ if (format != FastqQualityFormat.Standard) {
+ addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, String.format("Detected %s quality score encoding, but expected %s.", format, FastqQualityFormat.Standard), null));
+ }
+ }
+ } catch (PicardException e) {
+ addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, e.getMessage(), null));
+ }
} catch (SAMFormatException e) {
// increment record number because the iterator behind the SAMFileReader
// reads one record ahead so we will get this failure one record ahead
@@ -277,8 +296,7 @@ public class SamFileValidator {
if (rg == null) {
addError(new SAMValidationError(Type.RECORD_MISSING_READ_GROUP,
"A record is missing a read group", record.getReadName()));
- }
- else if (!header.getReadGroups().contains(rg)) {
+ } else if (!header.getReadGroups().contains(rg)) {
addError(new SAMValidationError(Type.READ_GROUP_NOT_FOUND,
"A record has a read group not found in the header: ",
record.getReadName() + ", " + rg.getReadGroupId()));
@@ -299,7 +317,7 @@ public class SamFileValidator {
}
private void validateSecondaryBaseCalls(final SAMRecord record, final long recordNumber) {
- final String e2 = (String)record.getAttribute(SAMTag.E2.name());
+ final String e2 = (String) record.getAttribute(SAMTag.E2.name());
if (e2 != null) {
if (e2.length() != record.getReadLength()) {
addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_E2_LENGTH,
@@ -315,13 +333,13 @@ public class SamFileValidator {
if (SequenceUtil.basesEqual(bases[i], secondaryBases[i])) {
addError(new SAMValidationError(Type.E2_BASE_EQUALS_PRIMARY_BASE,
String.format("Secondary base call (%c) == primary base call (%c)",
- (char)secondaryBases[i], (char)bases[i]),
+ (char) secondaryBases[i], (char) bases[i]),
record.getReadName(), recordNumber));
break;
}
}
}
- final String u2 = (String)record.getAttribute(SAMTag.U2.name());
+ final String u2 = (String) record.getAttribute(SAMTag.U2.name());
if (u2 != null && u2.length() != record.getReadLength()) {
addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_U2_LENGTH,
String.format("U2 tag length (%d) != read length (%d)", u2.length(), record.getReadLength()),
@@ -352,18 +370,18 @@ public class SamFileValidator {
final SAMRecord prev = orderChecker.getPreviousRecord();
if (!orderChecker.isSorted(record)) {
addError(new SAMValidationError(
- Type.RECORD_OUT_OF_ORDER,
+ Type.RECORD_OUT_OF_ORDER,
String.format(
"The record is out of [%s] order, prior read name [%s], prior coodinates [%d:%d]",
record.getHeader().getSortOrder().name(),
prev.getReadName(),
prev.getReferenceIndex(),
prev.getAlignmentStart()),
- record.getReadName(),
+ record.getReadName(),
recordNumber));
}
}
-
+
private void init(final ReferenceSequenceFile reference, final SAMFileHeader header) {
if (header.getSortOrder() == SAMFileHeader.SortOrder.coordinate) {
this.pairEndInfoByName = new CoordinateSortedPairEndInfoMap();
@@ -386,8 +404,8 @@ public class SamFileValidator {
final Integer tagNucleotideDiffs = record.getIntegerAttribute(ReservedTagConstants.NM);
if (tagNucleotideDiffs == null) {
addError(new SAMValidationError(
- Type.MISSING_TAG_NM,
- "NM tag (nucleotide differences) is missing",
+ Type.MISSING_TAG_NM,
+ "NM tag (nucleotide differences) is missing",
record.getReadName(),
recordNumber));
} else if (refFileWalker != null) {
@@ -397,27 +415,27 @@ public class SamFileValidator {
if (!tagNucleotideDiffs.equals(actualNucleotideDiffs)) {
addError(new SAMValidationError(
- Type.INVALID_TAG_NM,
+ Type.INVALID_TAG_NM,
"NM tag (nucleotide differences) in file [" + tagNucleotideDiffs +
- "] does not match reality [" + actualNucleotideDiffs + "]",
+ "] does not match reality [" + actualNucleotideDiffs + "]",
record.getReadName(),
recordNumber));
}
}
}
}
-
+
private void validateMateFields(final SAMRecord record, final long recordNumber) {
if (!record.getReadPairedFlag() || record.getNotPrimaryAlignmentFlag()) {
return;
}
-
+
final PairEndInfo pairEndInfo = pairEndInfoByName.remove(record.getReferenceIndex(), record.getReadName());
if (pairEndInfo == null) {
pairEndInfoByName.put(record.getMateReferenceIndex(), record.getReadName(), new PairEndInfo(record, recordNumber));
} else {
final List<SAMValidationError> errors =
- pairEndInfo.validateMates(new PairEndInfo(record, recordNumber), record.getReadName());
+ pairEndInfo.validateMates(new PairEndInfo(record, recordNumber), record.getReadName());
for (final SAMValidationError error : errors) {
addError(error);
}
@@ -443,8 +461,8 @@ public class SamFileValidator {
addError(new SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null));
}
List<SAMProgramRecord> pgs = fileHeader.getProgramRecords();
- for (int i = 0; i < pgs.size()-1; i++) {
- for (int j=i+1; j < pgs.size(); j++) {
+ for (int i = 0; i < pgs.size() - 1; i++) {
+ for (int j = i + 1; j < pgs.size(); j++) {
if (pgs.get(i).getProgramGroupId().equals(pgs.get(j).getProgramGroupId())) {
addError(new SAMValidationError(Type.DUPLICATE_PROGRAM_GROUP_ID, "Duplicate " +
"program group id: " + pgs.get(i).getProgramGroupId(), null));
@@ -453,8 +471,8 @@ public class SamFileValidator {
}
List<SAMReadGroupRecord> rgs = fileHeader.getReadGroups();
- for (int i = 0; i < rgs.size()-1; i++) {
- for (int j=i+1; j < rgs.size(); j++) {
+ for (int i = 0; i < rgs.size() - 1; i++) {
+ for (int j = i + 1; j < rgs.size(); j++) {
if (rgs.get(i).getReadGroupId().equals(rgs.get(j).getReadGroupId())) {
addError(new SAMValidationError(Type.DUPLICATE_READ_GROUP_ID, "Duplicate " +
"read group id: " + rgs.get(i).getReadGroupId(), null));
@@ -482,7 +500,8 @@ public class SamFileValidator {
/**
* Control verbosity
- * @param verbose True in order to emit a message per error or warning.
+ *
+ * @param verbose True in order to emit a message per error or warning.
* @param maxVerboseOutput If verbose, emit no more than this many messages. Ignored if !verbose.
*/
public void setVerbose(final boolean verbose, final int maxVerboseOutput) {
@@ -490,8 +509,13 @@ public class SamFileValidator {
this.maxVerboseOutput = maxVerboseOutput;
}
- public boolean isBisulfiteSequenced() { return bisulfiteSequenced; }
- public void setBisulfiteSequenced(boolean bisulfiteSequenced) { this.bisulfiteSequenced = bisulfiteSequenced; }
+ public boolean isBisulfiteSequenced() {
+ return bisulfiteSequenced;
+ }
+
+ public void setBisulfiteSequenced(boolean bisulfiteSequenced) {
+ this.bisulfiteSequenced = bisulfiteSequenced;
+ }
public SamFileValidator setValidateIndex(boolean validateIndex) {
// The SAMFileReader must also have IndexCaching enabled to have the index validated,
@@ -502,8 +526,8 @@ public class SamFileValidator {
public static class ValidationMetrics extends MetricBase {
}
-
- /**
+
+ /**
* This class is used so we don't have to store the entire SAMRecord in memory while we wait
* to find a record's mate and also to store the record number.
*/
@@ -519,17 +543,17 @@ public class SamFileValidator {
private final boolean mateUnmappedFlag;
private final boolean firstOfPairFlag;
-
+
private final long recordNumber;
-
+
public PairEndInfo(final SAMRecord record, final long recordNumber) {
this.recordNumber = recordNumber;
-
+
this.readAlignmentStart = record.getAlignmentStart();
this.readNegStrandFlag = record.getReadNegativeStrandFlag();
this.readReferenceIndex = record.getReferenceIndex();
this.readUnmappedFlag = record.getReadUnmappedFlag();
-
+
this.mateAlignmentStart = record.getMateAlignmentStart();
this.mateNegStrandFlag = record.getMateNegativeStrandFlag();
this.mateReferenceIndex = record.getMateReferenceIndex();
@@ -559,7 +583,7 @@ public class SamFileValidator {
validateMateFields(mate, this, readName, errors);
// Validations that should not be repeated on both ends
if (this.firstOfPairFlag == mate.firstOfPairFlag) {
- final String whichEnd = this.firstOfPairFlag? "first": "second";
+ final String whichEnd = this.firstOfPairFlag ? "first" : "second";
errors.add(new SAMValidationError(
Type.MATES_ARE_SAME_END,
"Both mates are marked as " + whichEnd + " of pair",
@@ -569,40 +593,42 @@ public class SamFileValidator {
}
return errors;
}
-
+
private void validateMateFields(final PairEndInfo end1, final PairEndInfo end2, final String readName, final List<SAMValidationError> errors) {
if (end1.mateAlignmentStart != end2.readAlignmentStart) {
errors.add(new SAMValidationError(
- Type.MISMATCH_MATE_ALIGNMENT_START,
- "Mate alignment does not match alignment start of mate",
+ Type.MISMATCH_MATE_ALIGNMENT_START,
+ "Mate alignment does not match alignment start of mate",
readName,
end1.recordNumber));
}
if (end1.mateNegStrandFlag != end2.readNegStrandFlag) {
errors.add(new SAMValidationError(
- Type.MISMATCH_FLAG_MATE_NEG_STRAND,
- "Mate negative strand flag does not match read negative strand flag of mate",
+ Type.MISMATCH_FLAG_MATE_NEG_STRAND,
+ "Mate negative strand flag does not match read negative strand flag of mate",
readName,
end1.recordNumber));
}
if (end1.mateReferenceIndex != end2.readReferenceIndex) {
errors.add(new SAMValidationError(
- Type.MISMATCH_MATE_REF_INDEX,
- "Mate reference index (MRNM) does not match reference index of mate",
+ Type.MISMATCH_MATE_REF_INDEX,
+ "Mate reference index (MRNM) does not match reference index of mate",
readName,
end1.recordNumber));
}
if (end1.mateUnmappedFlag != end2.readUnmappedFlag) {
errors.add(new SAMValidationError(
- Type.MISMATCH_FLAG_MATE_UNMAPPED,
- "Mate unmapped flag does not match read unmapped flag of mate",
+ Type.MISMATCH_FLAG_MATE_UNMAPPED,
+ "Mate unmapped flag does not match read unmapped flag of mate",
readName,
end1.recordNumber));
}
}
}
-
- /** Thrown in addError indicating that maxVerboseOutput has been exceeded and processing should stop */
+
+ /**
+ * Thrown in addError indicating that maxVerboseOutput has been exceeded and processing should stop
+ */
private static class MaxOutputExceededException extends PicardException {
MaxOutputExceededException() {
super("maxVerboseOutput exceeded.");
@@ -611,7 +637,9 @@ public class SamFileValidator {
interface PairEndInfoMap extends Iterable<Map.Entry<String, PairEndInfo>> {
void put(int mateReferenceIndex, String key, PairEndInfo value);
+
PairEndInfo remove(int mateReferenceIndex, String key);
+
CloseableIterator<Map.Entry<String, PairEndInfo>> iterator();
}
@@ -624,7 +652,7 @@ public class SamFileValidator {
}
public PairEndInfo remove(int mateReferenceIndex, String key) {
- return onDiskMap.remove(mateReferenceIndex, key);
+ return onDiskMap.remove(mateReferenceIndex, key);
}
public CloseableIterator<Map.Entry<String, PairEndInfo>> iterator() {
@@ -635,8 +663,13 @@ public class SamFileValidator {
private DataInputStream in;
private DataOutputStream out;
- public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }
- public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); }
+ public void setOutputStream(final OutputStream os) {
+ this.out = new DataOutputStream(os);
+ }
+
+ public void setInputStream(final InputStream is) {
+ this.in = new DataInputStream(is);
+ }
public void encode(String key, PairEndInfo record) {
try {
diff --git a/src/java/net/sf/picard/util/IlluminaUtil.java b/src/java/net/sf/picard/util/IlluminaUtil.java
index 5e5961a..6736457 100644
--- a/src/java/net/sf/picard/util/IlluminaUtil.java
+++ b/src/java/net/sf/picard/util/IlluminaUtil.java
@@ -158,7 +158,11 @@ public class IlluminaUtil {
"AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"),
FLUIDIGM("AATGATACGGCGACCACCGAGATCTACACTGACGACATGGTTCTACA",
- "AGACCAAGTCTCTGCTACCGTANNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG");
+ "AGACCAAGTCTCTGCTACCGTANNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"),
+
+ TRUSEQ_SMALLRNA("AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC",
+ "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG")
+ ;
final String fivePrime, threePrime, fivePrimeReadOrder;
final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes;
diff --git a/src/java/net/sf/picard/util/QualityEncodingDetector.java b/src/java/net/sf/picard/util/QualityEncodingDetector.java
new file mode 100644
index 0000000..def47fe
--- /dev/null
+++ b/src/java/net/sf/picard/util/QualityEncodingDetector.java
@@ -0,0 +1,337 @@
+package net.sf.picard.util;
+
+import net.sf.picard.PicardException;
+import net.sf.picard.fastq.FastqReader;
+import net.sf.picard.fastq.FastqRecord;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMRecordIterator;
+
+import java.util.*;
+
+import static java.util.Arrays.asList;
+
+/**
+ * Utility for determining the type of quality encoding/format (see FastqQualityFormat) used in a SAM/BAM or Fastq.
+ * <p/>
+ * To use this class, invoke the detect() method with a SAMFileReader or FastqReader, as appropriate. The consumer is
+ * responsible for closing readers.
+ *
+ * @author mccowan at broadinstitute.org
+ */
+public class QualityEncodingDetector {
+
+ private QualityRecordAggregator qualityAggregator = new QualityRecordAggregator();
+
+ /**
+ * The maximum number of records over which the detector will iterate before making a determination, by default.
+ */
+ public static final long DEFAULT_MAX_RECORDS_TO_ITERATE = 10000;
+ private static final Log log = Log.getInstance(QualityEncodingDetector.class);
+
+ public enum FileContext {FASTQ, SAM}
+
+ static class Range {
+ final int low, high;
+
+ Range(final int low, final int high) {
+ this.low = low;
+ this.high = high;
+ }
+
+ boolean contains(final int value) {
+ return value <= high && value >= low;
+ }
+ }
+
+ /**
+ * Collection of data about the different quality formats and how they are interpreted.
+ */
+ enum QualityScheme {
+ Phred(
+ new Range(0, 93), // Raw value range
+ new Range(33, 126), // ASCII value range
+ asList(new Range(33, 58)), // Ranges into which we expect at least one ASCII value to fall
+ FastqQualityFormat.Standard // Associated quality format
+ ),
+ Solexa(
+ new Range(-5, 62),
+ new Range(59, 126),
+ new ArrayList<Range>(),
+ FastqQualityFormat.Solexa
+ ),
+ Illumina(
+ new Range(0, 62),
+ new Range(64, 126),
+ new ArrayList<Range>(),
+ FastqQualityFormat.Illumina
+ );
+ final Range rawRange, asciiRange;
+ /**
+ * Ranges into which we expect at least one value to fall if this formatting is being used. For example, for
+ * Standard encoding, we expect to at least one ASCII value between 33 and 58 (0 and 25); otherwise, it's
+ * probably not Standard-encoded.
+ */
+ final List<Range> expectedAsciiRanges;
+ final FastqQualityFormat qualityFormat;
+
+ QualityScheme(final Range rawRange, final Range asciiRange, final List<Range> expectedAsciiRanges, final FastqQualityFormat qualityFormat) {
+ this.rawRange = rawRange;
+ this.asciiRange = asciiRange;
+ this.expectedAsciiRanges = expectedAsciiRanges;
+ this.qualityFormat = qualityFormat;
+ }
+ }
+
+ /**
+ * Collecting reads and their quality scores for later analysis. Uses ASCII values since those are the inherently
+ * "raw-est", read unmodified from the file.
+ */
+ private static class QualityRecordAggregator {
+ private Set<Integer> observedAsciiQualities = new HashSet<Integer>();
+
+ public Set<Integer> getObservedAsciiQualities() {
+ return Collections.unmodifiableSet(observedAsciiQualities);
+ }
+
+ /**
+ * Adds the FastqRecord's quality scores.
+ */
+ public void add(final FastqRecord fastqRecord) {
+ addAsciiQuality(fastqRecord.getBaseQualityString().getBytes());
+ }
+
+ /**
+ * Adds the SAMRecord's quality scores.
+ * <p/>
+ * Does not assume Phred quality encoding (for obvious reasons); getBaseQualityString() is used to read the
+ * unmodified ASCII score. To elaborate, SAMFileReader, which is generating these SAMRecords, builds the
+ * SAMRecord by subtracting a value from each quality score and storing that transformed value internally.
+ * Since we desire original scores here (whatever was in the file to begin with), we effectively undo this
+ * transformation by asking SAMRecord to convert the quality back into the ASCII that was read in the file.
+ */
+ public void add(final SAMRecord samRecord) {
+ addAsciiQuality(samRecord.getBaseQualityString().getBytes());
+ }
+
+ private void addAsciiQuality(final byte... asciiQualities) {
+ for (final byte asciiQuality : asciiQualities) {
+ observedAsciiQualities.add((int) asciiQuality);
+ }
+ }
+ }
+
+ /**
+ * Adds the provided reader's records to the detector.
+ * @return The number of records read
+ */
+ public long add(final long maxRecords, final FastqReader... readers) {
+ final Iterator<FastqRecord> iterator = generateInterleavedFastqIterator(readers);
+ long recordCount = 0;
+ while (iterator.hasNext() && recordCount++ != maxRecords) {
+ this.add(iterator.next());
+ }
+ log.debug(String.format("Read %s records from %s.", recordCount, Arrays.toString(readers)));
+ return recordCount;
+ }
+
+ /**
+ * Adds the provided reader's records to the detector.
+ * @return The number of records read
+ */
+ public long add(final long maxRecords, final SAMFileReader reader) {
+ final SAMRecordIterator iterator = reader.iterator();
+ long recordCount = 0;
+ try {
+ while (iterator.hasNext() && recordCount++ != maxRecords) {
+ this.add(iterator.next());
+ }
+
+ return recordCount;
+ } finally {
+ iterator.close();
+ }
+ }
+
+ /**
+ * Adds the provided record's qualities to the detector.
+ */
+ public void add(final FastqRecord fastqRecord) {
+ this.qualityAggregator.add(fastqRecord);
+ }
+
+ /**
+ * Adds the provided record's qualities to the detector.
+ */
+ public void add(final SAMRecord samRecord) {
+ this.qualityAggregator.add(samRecord);
+ }
+
+ /**
+ * Tests whether or not the detector can make a determination without guessing (i.e., if all but one quality format
+ * can be excluded using established exclusion conventions).
+ *
+ * @return True if more than one format is possible after exclusions; false otherwise
+ */
+ public boolean isDeterminationAmbiguous() {
+ return this.generateCandidateQualities().size() > 1;
+ }
+
+ /**
+ * Processes collected quality data and applies rules to determine which quality formats are possible.
+ * <p/>
+ * Specifically, for each format's known range of possible values (its "quality scheme"), exclude formats if any
+ * observed values fall outside of that range. Additionally, exclude formats for which we expect to see at
+ * least one quality in a range of values, but do not. (For example, for Phred, we expect to eventually see
+ * a value below 58. If we never see such a value, we exclude Phred as a possible format.)
+ */
+ public EnumSet<FastqQualityFormat> generateCandidateQualities() {
+ final EnumSet<FastqQualityFormat> candidateFormats = EnumSet.allOf(FastqQualityFormat.class);
+ final Set<Integer> observedAsciiQualities = this.qualityAggregator.getObservedAsciiQualities();
+ if (observedAsciiQualities.isEmpty())
+ throw new PicardException("Cannot determine candidate qualities: no qualities found.");
+
+ for (final QualityScheme scheme : QualityScheme.values()) {
+ final Iterator<Integer> qualityBinIterator = observedAsciiQualities.iterator();
+ final Collection<Range> remainingExpectedValueRanges = new ArrayList<Range>(scheme.expectedAsciiRanges);
+ while (qualityBinIterator.hasNext()) {
+ final int quality = qualityBinIterator.next();
+ if (!scheme.asciiRange.contains(quality)) {
+ candidateFormats.remove(scheme.qualityFormat);
+ }
+
+ final Iterator<Range> expectedValueRangeIterator = remainingExpectedValueRanges.iterator();
+ while (expectedValueRangeIterator.hasNext()) {
+ if (expectedValueRangeIterator.next().contains(quality)) {
+ expectedValueRangeIterator.remove();
+ }
+ }
+ }
+
+ /**
+ * We remove elements from this list as we observe values in the corresponding range; if the list isn't
+ * empty, we haven't seen a value in that range. In other words, we haven't seen a value we expected.
+ * Consequently, we remove the corresponding format from the running possibilities.
+ */
+ if (!remainingExpectedValueRanges.isEmpty()) {
+ candidateFormats.remove(scheme.qualityFormat);
+ }
+ }
+
+ return candidateFormats;
+ }
+
+ /**
+ * Based on the quality scores accumulated in the detector as well as the context in which this guess applies (a BAM
+ * or fastq), determines the best guess as to the quality format.
+ * <p/>
+ * This method does not exclude any quality formats based on observed quality values; even if there remain multiple
+ * candidate qualities (see generateCandidateQualities()), picks a single one based on a high-level logic.
+ *
+ * @param context The type of file for which the guess is being made, Fastq or Bam
+ */
+ public FastqQualityFormat generateBestGuess(final FileContext context) {
+ final EnumSet<FastqQualityFormat> possibleFormats = this.generateCandidateQualities();
+ switch (possibleFormats.size()) {
+ case 1:
+ return possibleFormats.iterator().next();
+ case 2:
+ if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Illumina, FastqQualityFormat.Solexa))) {
+ return FastqQualityFormat.Illumina;
+ } else if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Illumina, FastqQualityFormat.Standard))) {
+ switch (context) {
+ case FASTQ:
+ return FastqQualityFormat.Illumina;
+ case SAM:
+ return FastqQualityFormat.Standard;
+ }
+ } else if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Standard, FastqQualityFormat.Solexa))) {
+ throw new PicardException("The quality format cannot be determined: both Phred and Solexa formats are possible; this application's logic does not handle this scenario.");
+ } else throw new PicardException("Unreachable code.");
+ case 3:
+ throw new PicardException("The quality format cannot be determined: no formats were excluded.");
+ case 0:
+ throw new PicardException("The quality format cannot be determined: all formats were excluded.");
+ default:
+ throw new PicardException("Unreachable code.");
+ }
+ }
+
+ /**
+ * Interleaves FastqReader iterators so that serial-iteration of the result cycles between the constituent iterators.
+ */
+ private static Iterator<FastqRecord> generateInterleavedFastqIterator(final FastqReader... readers) {
+ return new Iterator<FastqRecord>() {
+ private Queue<Iterator<FastqRecord>> queue = new LinkedList<Iterator<FastqRecord>>();
+
+ {
+ for (final FastqReader reader : readers) {
+ queue.add(reader.iterator());
+ }
+ }
+
+ public boolean hasNext() {
+ // If this returns true, the head of the queue will have a next element
+ while (!queue.isEmpty()) {
+ if (queue.peek().hasNext()) {
+ return true;
+ }
+ queue.poll();
+ }
+ return false;
+ }
+
+ public FastqRecord next() {
+ if (!hasNext()) throw new NoSuchElementException();
+ final Iterator<FastqRecord> i = queue.poll();
+ final FastqRecord result = i.next();
+ queue.offer(i);
+ return result;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+
+ /**
+ * Reads through the records in the provided fastq reader and uses their quality scores to determine the quality
+ * format used in the fastq.
+ *
+ * @param readers The fastq readers from which qualities are to be read; at least one must be provided
+ * @param maxRecords The maximum number of records to read from the reader before making a determination (a guess,
+ * so more records is better)
+ * @return The determined quality format
+ */
+ public static FastqQualityFormat detect(final long maxRecords, final FastqReader... readers) {
+ final QualityEncodingDetector detector = new QualityEncodingDetector();
+ final long recordCount = detector.add(maxRecords, readers);
+ log.debug(String.format("Read %s records from %s.", recordCount, Arrays.toString(readers)));
+ return detector.generateBestGuess(FileContext.FASTQ);
+ }
+
+ public static FastqQualityFormat detect(final FastqReader... readers) {
+ return detect(DEFAULT_MAX_RECORDS_TO_ITERATE, readers);
+ }
+
+ /**
+ * Reads through the records in the provided SAM reader and uses their quality scores to determine the quality
+ * format used in the SAM.
+ *
+ * @param reader The SAM reader from which records are to be read
+ * @param maxRecords The maximum number of records to read from the reader before making a determination (a guess,
+ * so more records is better)
+ * @return The determined quality format
+ */
+ public static FastqQualityFormat detect(final long maxRecords, final SAMFileReader reader) {
+ final QualityEncodingDetector detector = new QualityEncodingDetector();
+ final long recordCount = detector.add(maxRecords, reader);
+ log.debug(String.format("Read %s records from %s.", recordCount, reader));
+ return detector.generateBestGuess(FileContext.SAM);
+ }
+
+ public static FastqQualityFormat detect(final SAMFileReader reader) {
+ return detect(DEFAULT_MAX_RECORDS_TO_ITERATE, reader);
+ }
+}
\ No newline at end of file
diff --git a/src/java/net/sf/samtools/SAMFileTruncatedReader.java b/src/java/net/sf/samtools/SAMFileTruncatedReader.java
new file mode 100644
index 0000000..7b2c1bd
--- /dev/null
+++ b/src/java/net/sf/samtools/SAMFileTruncatedReader.java
@@ -0,0 +1,68 @@
+package net.sf.samtools;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+/**
+ * A truncated form of a SAMFileReader that iterates over a limited number of records.
+ *
+ * @author mccowan at broadinstitute.org
+ */
+public class SAMFileTruncatedReader extends SAMFileReader {
+ private class TruncatedIterator implements SAMRecordIterator {
+ final SAMRecordIterator i;
+ final long max;
+ long currentRecord = 0;
+
+ TruncatedIterator(final SAMRecordIterator i, final long max) {
+ this.i = i;
+ this.max = max;
+ }
+
+ public boolean hasNext() {
+ return i.hasNext() && max != currentRecord;
+ }
+
+ public SAMRecord next() {
+ if (this.hasNext()) {
+ currentRecord += 1;
+ return i.next();
+ } else {
+ throw new NoSuchElementException();
+ }
+ }
+
+ public void remove() {
+ i.remove();
+ }
+
+ public void close() {
+ i.close();
+ }
+
+ public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) {
+ return i.assertSorted(sortOrder);
+ }
+ }
+
+ private final long maxRecordsToIterate;
+
+ /**
+ * @param input The SAM file
+ * @param max The maximum number of records to read from the file via iterator() methods
+ */
+ public SAMFileTruncatedReader(final File input, final long max) {
+ super(input);
+ this.maxRecordsToIterate = max;
+ }
+
+ @Override
+ public SAMRecordIterator iterator() {
+ return new TruncatedIterator(super.iterator(), maxRecordsToIterate);
+ }
+
+ @Override
+ public SAMRecordIterator iterator(final SAMFileSpan chunks) {
+ return new TruncatedIterator(super.iterator(chunks), maxRecordsToIterate);
+ }
+}
diff --git a/src/java/net/sf/samtools/SAMFileWriterFactory.java b/src/java/net/sf/samtools/SAMFileWriterFactory.java
index 3788bec..5b3e675 100644
--- a/src/java/net/sf/samtools/SAMFileWriterFactory.java
+++ b/src/java/net/sf/samtools/SAMFileWriterFactory.java
@@ -172,13 +172,13 @@ public class SAMFileWriterFactory {
private void initializeBAMWriter(final BAMFileWriter writer, final SAMFileHeader header, final boolean presorted, final boolean createIndex) {
writer.setSortOrder(header.getSortOrder(), presorted);
+ if (maxRecordsInRam != null) {
+ writer.setMaxRecordsInRam(maxRecordsInRam);
+ }
writer.setHeader(header);
if (createIndex && writer.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)){
writer.enableBamIndexConstruction();
}
- if (maxRecordsInRam != null) {
- writer.setMaxRecordsInRam(maxRecordsInRam);
- }
}
/**
diff --git a/src/java/net/sf/samtools/SAMLineParser.java b/src/java/net/sf/samtools/SAMLineParser.java
new file mode 100644
index 0000000..91b963e
--- /dev/null
+++ b/src/java/net/sf/samtools/SAMLineParser.java
@@ -0,0 +1,457 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.samtools;
+
+import java.io.File;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import net.sf.samtools.util.StringUtil;
+
+/**
+ * this class enables creation of a SAMRecord object from a String in SAM text format.
+ */
+public class SAMLineParser {
+
+ // From SAM specification
+ private static final int QNAME_COL = 0;
+ private static final int FLAG_COL = 1;
+ private static final int RNAME_COL = 2;
+ private static final int POS_COL = 3;
+ private static final int MAPQ_COL = 4;
+ private static final int CIGAR_COL = 5;
+ private static final int MRNM_COL = 6;
+ private static final int MPOS_COL = 7;
+ private static final int ISIZE_COL = 8;
+ private static final int SEQ_COL = 9;
+ private static final int QUAL_COL = 10;
+
+ private static final int NUM_REQUIRED_FIELDS = 11;
+
+ // Read string must contain only these characters
+ private static final Pattern VALID_BASES = Pattern
+ .compile("^[acmgrsvtwyhkdbnACMGRSVTWYHKDBN.=]+$");
+
+ /**
+ * Allocate this once rather than for every line as a performance
+ * optimization. The size is arbitrary -- merely large enough to handle the
+ * maximum number of fields we might expect from a reasonable SAM file.
+ */
+ private final String[] mFields = new String[10000];
+
+ /**
+ * Add information about the origin (reader and position) to SAM records.
+ */
+ private final SAMFileReader mParentReader;
+ private final SAMRecordFactory samRecordFactory;
+ private final SAMFileReader.ValidationStringency validationStringency;
+ private final SAMFileHeader mFileHeader;
+ private final File mFile;
+
+ private final TextTagCodec tagCodec = new TextTagCodec();
+
+ private int currentLineNumber;
+ private String currentLine;
+
+ //
+ // Constructors
+ //
+
+ /**
+ * Public constructor. Use the default SAMRecordFactory and stringency.
+ * @param samFileHeader SAM file header
+ */
+ public SAMLineParser(final SAMFileHeader samFileHeader) {
+
+ this(new DefaultSAMRecordFactory(),
+ SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
+ null, null);
+ }
+
+ /**
+ * Public constructor. Use the default SAMRecordFactory and stringency.
+ * @param samFileHeader SAM file header
+ * @param samFileReader SAM file reader For passing to SAMRecord.setFileSource, may be null.
+ * @param samFile SAM file being read (for error message only, may be null)
+ */
+ public SAMLineParser(final SAMFileHeader samFileHeader,
+ final SAMFileReader samFileReader, final File samFile) {
+
+ this(new DefaultSAMRecordFactory(),
+ SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
+ samFileReader, samFile);
+ }
+
+ /**
+ * Public constructor.
+ * @param samRecordFactory SamRecord Factory
+ * @param validationStringency validation stringency
+ * @param samFileHeader SAM file header
+ * @param samFileReader SAM file reader For passing to SAMRecord.setFileSource, may be null.
+ * @param samFile SAM file being read (for error message only, may be null)
+ */
+ public SAMLineParser(final SAMRecordFactory samRecordFactory,
+ final SAMFileReader.ValidationStringency validationStringency,
+ final SAMFileHeader samFileHeader, final SAMFileReader samFileReader,
+ final File samFile) {
+
+ if (samRecordFactory == null)
+ throw new NullPointerException("The SamRecordFactory must be set");
+
+ if (validationStringency == null)
+ throw new NullPointerException("The validationStringency must be set");
+
+ if (samFileHeader == null)
+ throw new NullPointerException("The mFileHeader must be set");
+
+ this.samRecordFactory = samRecordFactory;
+ this.validationStringency = validationStringency;
+ this.mFileHeader = samFileHeader;
+
+ // Can be null
+ this.mParentReader = samFileReader;
+
+ // Can be null
+ this.mFile = samFile;
+ }
+
+ /**
+ * Get the File header.
+ * @return the SAM file header
+ */
+ private SAMFileHeader getFileHeader() {
+
+ return this.mFileHeader;
+ }
+
+ /**
+ * Get validation stringency.
+ * @return validation stringency
+ */
+ private SAMFileReader.ValidationStringency getValidationStringency() {
+
+ return this.validationStringency;
+ }
+
+ private int parseInt(final String s, final String fieldName) {
+ final int ret;
+ try {
+ ret = Integer.parseInt(s);
+ } catch (NumberFormatException e) {
+ throw reportFatalErrorParsingLine("Non-numeric value in "
+ + fieldName + " column");
+ }
+ return ret;
+ }
+
+ private void validateReferenceName(final String rname, final String fieldName) {
+ if (rname.equals("=")) {
+ if (fieldName.equals("MRNM")) {
+ return;
+ }
+ reportErrorParsingLine("= is not a valid value for "
+ + fieldName + " field.");
+ }
+ if (this.mFileHeader.getSequenceDictionary().size() != 0) {
+ if (this.mFileHeader.getSequence(rname) == null) {
+ reportErrorParsingLine(fieldName
+ + " '" + rname + "' not found in any SQ record");
+ }
+ }
+ }
+
+ /**
+ * Parse a SAM line.
+ * @param line line to parse
+ * @return a new SAMRecord object
+ */
+ public SAMRecord parseLine(final String line) {
+
+ return parseLine(line, -1);
+ }
+
+ /**
+ * Parse a SAM line.
+ * @param line line to parse
+ * @param lineNumber line number in the file. If the line number is not known
+ * can be <=0.
+ * @return a new SAMRecord object
+ */
+ public SAMRecord parseLine(final String line, final int lineNumber) {
+
+ final String mCurrentLine = line;
+ this.currentLineNumber = lineNumber;
+ this.currentLine = line;
+
+ final int numFields = StringUtil.split(mCurrentLine, mFields, '\t');
+ if (numFields < NUM_REQUIRED_FIELDS) {
+ throw reportFatalErrorParsingLine("Not enough fields");
+ }
+ if (numFields == mFields.length) {
+ reportErrorParsingLine("Too many fields in SAM text record.");
+ }
+ for (int i = 0; i < numFields; ++i) {
+ if (mFields[i].length() == 0) {
+ reportErrorParsingLine("Empty field at position " + i + " (zero-based)");
+ }
+ }
+ final SAMRecord samRecord =
+ samRecordFactory.createSAMRecord(this.mFileHeader);
+ samRecord.setValidationStringency(this.validationStringency);
+ if (mParentReader != null)
+ samRecord.setFileSource(new SAMFileSource(mParentReader, null));
+ samRecord.setHeader(this.mFileHeader);
+ samRecord.setReadName(mFields[QNAME_COL]);
+
+ final int flags = parseInt(mFields[FLAG_COL], "FLAG");
+ samRecord.setFlags(flags);
+
+ String rname = mFields[RNAME_COL];
+ if (!rname.equals("*")) {
+ rname = SAMSequenceRecord.truncateSequenceName(rname);
+ validateReferenceName(rname, "RNAME");
+ samRecord.setReferenceName(rname);
+ } else if (!samRecord.getReadUnmappedFlag()) {
+ reportErrorParsingLine("RNAME is not specified but flags indicate mapped");
+ }
+
+ final int pos = parseInt(mFields[POS_COL], "POS");
+ final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ");
+ final String cigar = mFields[CIGAR_COL];
+ if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(samRecord
+ .getReferenceName())) {
+ if (pos == 0) {
+ reportErrorParsingLine("POS must be non-zero if RNAME is specified");
+ }
+ if (!samRecord.getReadUnmappedFlag() && cigar.equals("*")) {
+ reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified");
+ }
+ } else {
+ if (pos != 0) {
+ reportErrorParsingLine("POS must be zero if RNAME is not specified");
+ }
+ if (mapq != 0) {
+ reportErrorParsingLine("MAPQ must be zero if RNAME is not specified");
+ }
+ if (!cigar.equals("*")) {
+ reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified");
+ }
+ }
+ samRecord.setAlignmentStart(pos);
+ samRecord.setMappingQuality(mapq);
+ samRecord.setCigarString(cigar);
+
+ String mateRName = mFields[MRNM_COL];
+ if (mateRName.equals("*")) {
+ if (samRecord.getReadPairedFlag() && !samRecord.getMateUnmappedFlag()) {
+ reportErrorParsingLine("MRNM not specified but flags indicate mate mapped");
+ }
+ } else {
+ if (!samRecord.getReadPairedFlag()) {
+ reportErrorParsingLine("MRNM specified but flags indicate unpaired");
+ }
+ if (!"=".equals(mateRName)) {
+ mateRName = SAMSequenceRecord.truncateSequenceName(mateRName);
+ }
+ validateReferenceName(mateRName, "MRNM");
+ if (mateRName.equals("=")) {
+ if (samRecord.getReferenceName() == null) {
+ reportErrorParsingLine("MRNM is '=', but RNAME is not set");
+ }
+ samRecord.setMateReferenceName(samRecord.getReferenceName());
+ } else {
+ samRecord.setMateReferenceName(mateRName);
+ }
+ }
+
+ final int matePos = parseInt(mFields[MPOS_COL], "MPOS");
+ final int isize = parseInt(mFields[ISIZE_COL], "ISIZE");
+ if (!samRecord.getMateReferenceName().equals(
+ SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
+ if (matePos == 0) {
+ reportErrorParsingLine("MPOS must be non-zero if MRNM is specified");
+ }
+ } else {
+ if (matePos != 0) {
+ reportErrorParsingLine("MPOS must be zero if MRNM is not specified");
+ }
+ if (isize != 0) {
+ reportErrorParsingLine("ISIZE must be zero if MRNM is not specified");
+ }
+ }
+ samRecord.setMateAlignmentStart(matePos);
+ samRecord.setInferredInsertSize(isize);
+ if (!mFields[SEQ_COL].equals("*")) {
+ validateReadBases(mFields[SEQ_COL]);
+ samRecord.setReadString(mFields[SEQ_COL]);
+ } else {
+ samRecord.setReadBases(SAMRecord.NULL_SEQUENCE);
+ }
+ if (!mFields[QUAL_COL].equals("*")) {
+ if (samRecord.getReadBases() == SAMRecord.NULL_SEQUENCE) {
+ reportErrorParsingLine("QUAL should not be specified if SEQ is not specified");
+ }
+ if (samRecord.getReadString().length() != mFields[QUAL_COL].length()) {
+ reportErrorParsingLine("length(QUAL) != length(SEQ)");
+ }
+ samRecord.setBaseQualityString(mFields[QUAL_COL]);
+ } else {
+ samRecord.setBaseQualities(SAMRecord.NULL_QUALS);
+ }
+
+ for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) {
+ parseTag(samRecord, mFields[i]);
+ }
+
+ final List<SAMValidationError> validationErrors = samRecord.isValid();
+ if (validationErrors != null) {
+ for (final SAMValidationError errorMessage : validationErrors) {
+ reportErrorParsingLine(errorMessage.getMessage());
+ }
+ }
+ return samRecord;
+ }
+
+ private void validateReadBases(final String bases) {
+ /*
+ * Using regex is slow, so check for invalid characters via
+ * isValidReadBase(), which hopefully the JIT will optimize. if
+ * (!VALID_BASES.matcher(bases).matches()) {
+ * reportErrorParsingLine("Invalid character in read bases"); }
+ */
+ for (int i = 0; i < bases.length(); ++i) {
+ if (!isValidReadBase(bases.charAt(i))) {
+ reportErrorParsingLine("Invalid character in read bases");
+ return;
+ }
+ }
+ }
+
+ private boolean isValidReadBase(final char base) {
+ switch (base) {
+ case 'a':
+ case 'c':
+ case 'm':
+ case 'g':
+ case 'r':
+ case 's':
+ case 'v':
+ case 't':
+ case 'w':
+ case 'y':
+ case 'h':
+ case 'k':
+ case 'd':
+ case 'b':
+ case 'n':
+ case 'A':
+ case 'C':
+ case 'M':
+ case 'G':
+ case 'R':
+ case 'S':
+ case 'V':
+ case 'T':
+ case 'W':
+ case 'Y':
+ case 'H':
+ case 'K':
+ case 'D':
+ case 'B':
+ case 'N':
+ case '.':
+ case '=':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ private void parseTag(final SAMRecord samRecord, final String tag) {
+ Map.Entry<String, Object> entry = null;
+ try {
+ entry = tagCodec.decode(tag);
+ } catch (SAMFormatException e) {
+ reportErrorParsingLine(e);
+ }
+ if (entry != null) {
+ if (entry.getValue() instanceof TagValueAndUnsignedArrayFlag) {
+ final TagValueAndUnsignedArrayFlag valueAndFlag =
+ (TagValueAndUnsignedArrayFlag) entry.getValue();
+ if (valueAndFlag.isUnsignedArray) {
+ samRecord.setUnsignedArrayAttribute(entry.getKey(),
+ valueAndFlag.value);
+ } else {
+ samRecord.setAttribute(entry.getKey(), valueAndFlag.value);
+ }
+ } else {
+ samRecord.setAttribute(entry.getKey(), entry.getValue());
+ }
+ }
+ }
+
+ //
+ // Error methods
+ //
+
+ private RuntimeException reportFatalErrorParsingLine(final String reason) {
+ return new SAMFormatException(makeErrorString(reason));
+ }
+
+ private void reportErrorParsingLine(final String reason) {
+ final String errorMessage = makeErrorString(reason);
+
+ if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
+ throw new SAMFormatException(errorMessage);
+ } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
+ System.err
+ .println("Ignoring SAM validation error due to lenient parsing:");
+ System.err.println(errorMessage);
+ }
+ }
+
+ private void reportErrorParsingLine(final Exception e) {
+ final String errorMessage = makeErrorString(e.getMessage());
+ if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
+ throw new SAMFormatException(errorMessage);
+ } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
+ System.err
+ .println("Ignoring SAM validation error due to lenient parsing:");
+ System.err.println(errorMessage);
+ }
+ }
+
+ private String makeErrorString(final String reason) {
+ String fileMessage = "";
+ if (mFile != null) {
+ fileMessage = "File " + mFile + "; ";
+ }
+ return "Error parsing text SAM file. "
+ + reason + "; " + fileMessage + "Line "
+ + (this.currentLineNumber <= 0 ? "unknown" : this.currentLineNumber)
+ + "\nLine: " + this.currentLine;
+ }
+
+}
diff --git a/src/java/net/sf/samtools/SAMTextHeaderCodec.java b/src/java/net/sf/samtools/SAMTextHeaderCodec.java
index 530c637..f7f4bab 100644
--- a/src/java/net/sf/samtools/SAMTextHeaderCodec.java
+++ b/src/java/net/sf/samtools/SAMTextHeaderCodec.java
@@ -32,6 +32,7 @@ import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
+import java.util.regex.Pattern;
/**
* Parser for a SAM text header, and a generator of SAM text header.
@@ -59,7 +60,10 @@ public class SAMTextHeaderCodec {
private BufferedWriter writer;
private static final String TAG_KEY_VALUE_SEPARATOR = ":";
+ private static final char TAG_KEY_VALUE_SEPARATOR_CHAR = ':';
private static final String FIELD_SEPARATOR = "\t";
+ private static final char FIELD_SEPARATOR_CHAR = '\t';
+ private static final Pattern FIELD_SEPARATOR_RE = Pattern.compile(FIELD_SEPARATOR);
public static final String COMMENT_PREFIX = HEADER_LINE_START + HeaderRecordType.CO.name() + FIELD_SEPARATOR;
@@ -250,7 +254,13 @@ public class SAMTextHeaderCodec {
assert(line.startsWith(HEADER_LINE_START));
// Tab-separate
- final String[] fields = line.split(FIELD_SEPARATOR);
+ String[] fields = new String[1024];
+ int numFields = StringUtil.split(line, fields, FIELD_SEPARATOR_CHAR);
+ if (numFields == fields.length) {
+ // Lots of fields, so fall back
+ fields = FIELD_SEPARATOR_RE.split(line);
+ numFields = fields.length;
+ }
// Parse the HeaderRecordType
try {
@@ -267,10 +277,10 @@ public class SAMTextHeaderCodec {
return;
}
+ final String[] keyAndValue = new String[2];
// Parse they key:value pairs
- for (int i = 1; i < fields.length; ++i) {
- final String[] keyAndValue = fields[i].split(TAG_KEY_VALUE_SEPARATOR, 2);
- if (keyAndValue.length != 2) {
+ for (int i = 1; i < numFields; ++i) {
+ if (StringUtil.splitConcatenateExcessTokens(fields[i], keyAndValue, TAG_KEY_VALUE_SEPARATOR_CHAR) != 2) {
reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType +
" key:value pair", SAMValidationError.Type.POORLY_FORMATTED_HEADER_TAG, null);
continue;
diff --git a/src/java/net/sf/samtools/SAMTextReader.java b/src/java/net/sf/samtools/SAMTextReader.java
index 5e1c664..adef9d8 100644
--- a/src/java/net/sf/samtools/SAMTextReader.java
+++ b/src/java/net/sf/samtools/SAMTextReader.java
@@ -26,35 +26,16 @@ package net.sf.samtools;
import net.sf.samtools.util.BufferedLineReader;
import net.sf.samtools.util.CloseableIterator;
-import net.sf.samtools.util.StringUtil;
import java.io.File;
import java.io.InputStream;
-import java.util.Map;
-import java.util.List;
-import java.util.regex.Pattern;
+
/**
* Internal class for reading SAM text files.
*/
class SAMTextReader extends SAMFileReader.ReaderImplementation {
- // From SAM specification
- private static final int QNAME_COL = 0;
- private static final int FLAG_COL = 1;
- private static final int RNAME_COL = 2;
- private static final int POS_COL = 3;
- private static final int MAPQ_COL = 4;
- private static final int CIGAR_COL = 5;
- private static final int MRNM_COL = 6;
- private static final int MPOS_COL = 7;
- private static final int ISIZE_COL = 8;
- private static final int SEQ_COL = 9;
- private static final int QUAL_COL = 10;
-
- private static final int NUM_REQUIRED_FIELDS = 11;
-
- // Read string must contain only these characters
- private static final Pattern VALID_BASES = Pattern.compile("^[acmgrsvtwyhkdbnACMGRSVTWYHKDBN.=]+$");
+
private SAMRecordFactory samRecordFactory;
private BufferedLineReader mReader;
@@ -62,7 +43,7 @@ class SAMTextReader extends SAMFileReader.ReaderImplementation {
private String mCurrentLine = null;
private RecordIterator mIterator = null;
private File mFile = null;
- private final TextTagCodec tagCodec = new TextTagCodec();
+
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY;
/**
@@ -116,7 +97,7 @@ class SAMTextReader extends SAMFileReader.ReaderImplementation {
}
boolean hasIndex() {
- return false;
+ return false;
}
BAMIndex getIndex() {
@@ -210,51 +191,17 @@ class SAMTextReader extends SAMFileReader.ReaderImplementation {
return mCurrentLine;
}
- private String makeErrorString(final String reason) {
- String fileMessage = "";
- if (mFile != null) {
- fileMessage = "File " + mFile + "; ";
- }
- return "Error parsing text SAM file. " + reason + "; " + fileMessage +
- "Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine;
- }
- private RuntimeException reportFatalErrorParsingLine(final String reason) {
- return new SAMFormatException(makeErrorString(reason));
- }
- private void reportErrorParsingLine(final String reason) {
- final String errorMessage = makeErrorString(reason);
- if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
- throw new SAMFormatException(errorMessage);
- } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
- System.err.println("Ignoring SAM validation error due to lenient parsing:");
- System.err.println(errorMessage);
- }
- }
-
- private void reportErrorParsingLine(final Exception e) {
- final String errorMessage = makeErrorString(e.getMessage());
- if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
- throw new SAMFormatException(errorMessage);
- } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
- System.err.println("Ignoring SAM validation error due to lenient parsing:");
- System.err.println(errorMessage);
- }
- }
/**
* SAMRecord iterator for SAMTextReader
*/
private class RecordIterator implements CloseableIterator<SAMRecord> {
- /**
- * Allocate this once rather than for every line as a performance optimization.
- * The size is arbitrary -- merely large enough to handle the maximum number
- * of fields we might expect from a reasonable SAM file.
- */
- private final String[] mFields = new String[10000];
+ private final SAMLineParser parser = new SAMLineParser(samRecordFactory, validationStringency,
+ mFileHeader, mParentReader, mFile);
private RecordIterator() {
if (mReader == null) {
@@ -285,234 +232,11 @@ class SAMTextReader extends SAMFileReader.ReaderImplementation {
throw new UnsupportedOperationException("Not supported: remove");
}
- int parseInt(final String s, final String fieldName) {
- final int ret;
- try {
- ret = Integer.parseInt(s);
- } catch (NumberFormatException e) {
- throw reportFatalErrorParsingLine("Non-numeric value in " + fieldName + " column");
- }
- return ret;
- }
-
- void validateReferenceName(final String rname, final String fieldName) {
- if (rname.equals("=")) {
- if (fieldName.equals("MRNM")) {
- return;
- }
- reportErrorParsingLine("= is not a valid value for " + fieldName + " field.");
- }
- if (getFileHeader().getSequenceDictionary().size() != 0) {
- if (getFileHeader().getSequence(rname) == null) {
- reportErrorParsingLine(fieldName + " '" + rname + "' not found in any SQ record");
- }
- }
- }
-
private SAMRecord parseLine() {
- final int numFields = StringUtil.split(mCurrentLine, mFields, '\t');
- if (numFields < NUM_REQUIRED_FIELDS) {
- throw reportFatalErrorParsingLine("Not enough fields");
- }
- if (numFields == mFields.length) {
- reportErrorParsingLine("Too many fields in SAM text record.");
- }
- for (int i = 0; i < numFields; ++i) {
- if (mFields[i].length() == 0) {
- reportErrorParsingLine("Empty field at position " + i + " (zero-based)");
- }
- }
- final SAMRecord samRecord = samRecordFactory.createSAMRecord(mFileHeader);
- samRecord.setValidationStringency(getValidationStringency());
- if(mParentReader != null)
- samRecord.setFileSource(new SAMFileSource(mParentReader,null));
- samRecord.setHeader(mFileHeader);
- samRecord.setReadName(mFields[QNAME_COL]);
-
- final int flags = parseInt(mFields[FLAG_COL], "FLAG");
- samRecord.setFlags(flags);
-
- String rname = mFields[RNAME_COL];
- if (!rname.equals("*")) {
- rname = SAMSequenceRecord.truncateSequenceName(rname);
- validateReferenceName(rname, "RNAME");
- samRecord.setReferenceName(rname);
- } else if (!samRecord.getReadUnmappedFlag()) {
- reportErrorParsingLine("RNAME is not specified but flags indicate mapped");
- }
-
- final int pos = parseInt(mFields[POS_COL], "POS");
- final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ");
- final String cigar = mFields[CIGAR_COL];
- if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(samRecord.getReferenceName())) {
- if (pos == 0) {
- reportErrorParsingLine("POS must be non-zero if RNAME is specified");
- }
- if (!samRecord.getReadUnmappedFlag() && cigar.equals("*")) {
- reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified");
- }
- } else {
- if (pos != 0) {
- reportErrorParsingLine("POS must be zero if RNAME is not specified");
- }
- if (mapq != 0) {
- reportErrorParsingLine("MAPQ must be zero if RNAME is not specified");
- }
- if (!cigar.equals("*")) {
- reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified");
- }
- }
- samRecord.setAlignmentStart(pos);
- samRecord.setMappingQuality(mapq);
- samRecord.setCigarString(cigar);
-
- String mateRName = mFields[MRNM_COL];
- if (mateRName.equals("*")) {
- if (samRecord.getReadPairedFlag() && !samRecord.getMateUnmappedFlag()) {
- reportErrorParsingLine("MRNM not specified but flags indicate mate mapped");
- }
- }
- else {
- if (!samRecord.getReadPairedFlag()) {
- reportErrorParsingLine("MRNM specified but flags indicate unpaired");
- }
- if (!"=".equals(mateRName)) {
- mateRName = SAMSequenceRecord.truncateSequenceName(mateRName);
- }
- validateReferenceName(mateRName, "MRNM");
- if (mateRName.equals("=")) {
- if (samRecord.getReferenceName() == null) {
- reportErrorParsingLine("MRNM is '=', but RNAME is not set");
- }
- samRecord.setMateReferenceName(samRecord.getReferenceName());
- } else {
- samRecord.setMateReferenceName(mateRName);
- }
- }
- final int matePos = parseInt(mFields[MPOS_COL], "MPOS");
- final int isize = parseInt(mFields[ISIZE_COL], "ISIZE");
- if (!samRecord.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
- if (matePos == 0) {
- reportErrorParsingLine("MPOS must be non-zero if MRNM is specified");
- }
- } else {
- if (matePos != 0) {
- reportErrorParsingLine("MPOS must be zero if MRNM is not specified");
- }
- if (isize != 0) {
- reportErrorParsingLine("ISIZE must be zero if MRNM is not specified");
- }
- }
- samRecord.setMateAlignmentStart(matePos);
- samRecord.setInferredInsertSize(isize);
- if (!mFields[SEQ_COL].equals("*")) {
- validateReadBases(mFields[SEQ_COL]);
- samRecord.setReadString(mFields[SEQ_COL]);
- } else {
- samRecord.setReadBases(SAMRecord.NULL_SEQUENCE);
- }
- if (!mFields[QUAL_COL].equals("*")) {
- if (samRecord.getReadBases() == SAMRecord.NULL_SEQUENCE) {
- reportErrorParsingLine("QUAL should not be specified if SEQ is not specified");
- }
- if (samRecord.getReadString().length() != mFields[QUAL_COL].length()) {
- reportErrorParsingLine("length(QUAL) != length(SEQ)");
- }
- samRecord.setBaseQualityString(mFields[QUAL_COL]);
- } else {
- samRecord.setBaseQualities(SAMRecord.NULL_QUALS);
- }
-
- for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) {
- parseTag(samRecord, mFields[i]);
- }
-
- final List<SAMValidationError> validationErrors = samRecord.isValid();
- if (validationErrors != null) {
- for (final SAMValidationError errorMessage : validationErrors) {
- reportErrorParsingLine(errorMessage.getMessage());
- }
- }
- return samRecord;
+ return parser.parseLine(mCurrentLine, mReader.getLineNumber());
}
- private void validateReadBases(final String bases) {
-/*
- * Using regex is slow, so check for invalid characters via isValidReadBase(), which hopefully the JIT will optimize.
- if (!VALID_BASES.matcher(bases).matches()) {
- reportErrorParsingLine("Invalid character in read bases");
- }
-*/
- for (int i = 0; i < bases.length(); ++i) {
- if (!isValidReadBase(bases.charAt(i))) {
- reportErrorParsingLine("Invalid character in read bases");
- return;
- }
- }
- }
-
- private boolean isValidReadBase(final char base) {
- switch (base) {
- case 'a':
- case 'c':
- case 'm':
- case 'g':
- case 'r':
- case 's':
- case 'v':
- case 't':
- case 'w':
- case 'y':
- case 'h':
- case 'k':
- case 'd':
- case 'b':
- case 'n':
- case 'A':
- case 'C':
- case 'M':
- case 'G':
- case 'R':
- case 'S':
- case 'V':
- case 'T':
- case 'W':
- case 'Y':
- case 'H':
- case 'K':
- case 'D':
- case 'B':
- case 'N':
- case '.':
- case '=':
- return true;
- default:
- return false;
- }
- }
-
- private void parseTag(final SAMRecord samRecord, final String tag) {
- Map.Entry<String, Object> entry = null;
- try {
- entry = tagCodec.decode(tag);
- } catch (SAMFormatException e) {
- reportErrorParsingLine(e);
- }
- if (entry != null) {
- if (entry.getValue() instanceof TagValueAndUnsignedArrayFlag) {
- final TagValueAndUnsignedArrayFlag valueAndFlag = (TagValueAndUnsignedArrayFlag) entry.getValue();
- if (valueAndFlag.isUnsignedArray) {
- samRecord.setUnsignedArrayAttribute(entry.getKey(), valueAndFlag.value);
- }
- else {
- samRecord.setAttribute(entry.getKey(), valueAndFlag.value);
- }
- } else {
- samRecord.setAttribute(entry.getKey(), entry.getValue());
- }
- }
- }
}
}
diff --git a/src/java/net/sf/samtools/SAMValidationError.java b/src/java/net/sf/samtools/SAMValidationError.java
index eec12eb..f7392d4 100644
--- a/src/java/net/sf/samtools/SAMValidationError.java
+++ b/src/java/net/sf/samtools/SAMValidationError.java
@@ -35,6 +35,9 @@ public class SAMValidationError {
}
public enum Type {
+ /** quality encodings out of range; appear to be Solexa or Illumina when Phread expected */
+ INVALID_QUALITY_FORMAT(Severity.WARNING),
+
/** proper pair flag set for unpaired read */
INVALID_FLAG_PROPER_PAIR,
diff --git a/src/java/net/sf/samtools/util/DateParser.java b/src/java/net/sf/samtools/util/DateParser.java
index 6211252..eab68db 100644
--- a/src/java/net/sf/samtools/util/DateParser.java
+++ b/src/java/net/sf/samtools/util/DateParser.java
@@ -95,14 +95,11 @@ public class DateParser {
private static boolean check(StringTokenizer st, String token)
throws InvalidDateException
{
- try {
- if (st.nextToken().equals(token)) {
- return true;
- } else {
- throw new InvalidDateException("Missing ["+token+"]");
- }
- } catch (NoSuchElementException ex) {
- return false;
+ if (!st.hasMoreElements()) return false;
+ if (st.nextToken().equals(token)) {
+ return true;
+ } else {
+ throw new InvalidDateException("Missing ["+token+"]");
}
}
diff --git a/src/java/net/sf/samtools/util/StringLineReader.java b/src/java/net/sf/samtools/util/StringLineReader.java
index 4284a1a..a7e240c 100644
--- a/src/java/net/sf/samtools/util/StringLineReader.java
+++ b/src/java/net/sf/samtools/util/StringLineReader.java
@@ -23,20 +23,24 @@
*/
package net.sf.samtools.util;
+import java.util.regex.Pattern;
+
/**
* Implementation of LineReader that gets its input from a String. No charset conversion
* is necessary because the String is in unicode. Handles CR, LF or CRLF line termination,
* but if asked to return the line terminator, it always comes back as LF.
*/
public class StringLineReader implements LineReader {
-
+ private static final Pattern CRLF = Pattern.compile("\r\n");
private final String theString;
private int curPos = 0;
private int lineNumber = 0;
public StringLineReader(final String s) {
- // Simplify later processing by replacing crlf with just lf, and replacing solo cr with lf
- this.theString = s.replaceAll("\r\n", "\n").replaceAll("\r", "\n");
+ // Simplify later processing by replacing crlf with just lf, and replacing solo cr with lf.
+ // Note that String.replace(String, String) causes a regex to be used, so precompilation should be
+ // the best we can do short of handling the string directly.
+ this.theString = CRLF.matcher(s).replaceAll("\n").replace('\r', '\n');
}
/**
diff --git a/src/tests/java/net/sf/picard/sam/MarkDuplicatesTest.java b/src/tests/java/net/sf/picard/sam/MarkDuplicatesTest.java
new file mode 100644
index 0000000..8cc0ea2
--- /dev/null
+++ b/src/tests/java/net/sf/picard/sam/MarkDuplicatesTest.java
@@ -0,0 +1,139 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.sam;
+
+import net.sf.picard.io.IoUtil;
+import net.sf.samtools.*;
+import net.sf.samtools.util.TestUtil;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.*;
+
+public class MarkDuplicatesTest {
+ private static final File TEST_DATA_DIR = new File("testdata/net/sf/picard/sam/MarkDuplicates");
+
+
+ /**
+ * Test that PG header records are created & chained appropriately (or not created), and that the PG record chains
+ * are as expected. MarkDuplicates is used both to merge and to mark dupes in this case.
+ * @param suppressPg If true, do not create PG header record.
+ * @param expectedPnVnByReadName For each read, info about the expect chain of PG records.
+ */
+ @Test(dataProvider = "pgRecordChainingTest")
+ public void pgRecordChainingTest(final boolean suppressPg,
+ final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
+ final File outputDir = IoUtil.createTempDir("MarkDuplicatesTest.", ".tmp");
+ outputDir.deleteOnExit();
+ try {
+ // Run MarkDuplicates, merging the 3 input files, and either enabling or suppressing PG header
+ // record creation according to suppressPg.
+ final MarkDuplicates markDuplicates = new MarkDuplicates();
+ final ArrayList<String> args = new ArrayList<String>();
+ for (int i = 1; i <= 3; ++i) {
+ args.add("INPUT=" + new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
+ }
+ final File outputSam = new File(outputDir, "markDuplicatesTest.sam");
+ args.add("OUTPUT=" + outputSam.getAbsolutePath());
+ args.add("METRICS_FILE=" + new File(outputDir, "markDuplicatesTest.duplicate_metrics").getAbsolutePath());
+ if (suppressPg) args.add("PROGRAM_RECORD_ID=null");
+
+ // I generally prefer to call doWork rather than invoking the argument parser, but it is necessary
+ // in this case to initialize the command line.
+ // Note that for the unit test, version won't come through because it is obtained through jar
+ // manifest, and unit test doesn't run code from a jar.
+ Assert.assertEquals(markDuplicates.instanceMain(args.toArray(new String[args.size()])), 0);
+
+ // Read the MarkDuplicates output file, and get the PG ID for each read. In this particular test,
+ // the PG ID should be the same for both ends of a pair.
+ final SAMFileReader reader = new SAMFileReader(outputSam);
+
+ final Map<String, String> pgIdForReadName = new HashMap<String, String>();
+ for (final SAMRecord rec : reader) {
+ final String existingPgId = pgIdForReadName.get(rec.getReadName());
+ final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
+ if (existingPgId != null) {
+ Assert.assertEquals(thisPgId, existingPgId);
+ } else {
+ pgIdForReadName.put(rec.getReadName(), thisPgId);
+ }
+ }
+ final SAMFileHeader header = reader.getFileHeader();
+ reader.close();
+
+ // Confirm that for each read name, the chain of PG records contains exactly the number that is expected,
+ // and that values in the PG chain are as expected.
+ for (final Map.Entry<String, List<ExpectedPnAndVn>> entry : expectedPnVnByReadName.entrySet()) {
+ final String readName = entry.getKey();
+ final List<ExpectedPnAndVn> expectedList = entry.getValue();
+ String pgId = pgIdForReadName.get(readName);
+ for (final ExpectedPnAndVn expected : expectedList) {
+ final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
+ if (expected.expectedPn != null) Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
+ if (expected.expectedVn != null) Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
+ pgId = programRecord.getPreviousProgramGroupId();
+ }
+ Assert.assertNull(pgId);
+ }
+
+ } finally {
+ TestUtil.recursiveDelete(outputDir);
+ }
+ }
+
+ /**
+ * Represents an expected PN value and VN value for a PG record. If one of thexe is null, any value is allowed
+ * in the PG record being tested.
+ */
+ private static class ExpectedPnAndVn {
+ final String expectedPn;
+ final String expectedVn;
+
+ private ExpectedPnAndVn(final String expectedPn, final String expectedVn) {
+ this.expectedPn = expectedPn;
+ this.expectedVn = expectedVn;
+ }
+ }
+
+ @DataProvider(name = "pgRecordChainingTest")
+ public Object[][] pgRecordChainingTestDataProvider() {
+ // Two test cases: One in which PG record generation is enabled, the other in which it is turned off.
+ final Map<String, List<ExpectedPnAndVn>> withPgMap = new HashMap<String, List<ExpectedPnAndVn>>();
+ withPgMap.put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn("MarkDuplicates", null), new ExpectedPnAndVn("MarkDuplicates", "1"), new ExpectedPnAndVn("bwa", "1")));
+ withPgMap.put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn("MarkDuplicates", null), new ExpectedPnAndVn("bwa", "2")));
+ withPgMap.put("1AAXX.3.1", Arrays.asList(new ExpectedPnAndVn("MarkDuplicates", null)));
+
+
+ final Map<String, List<ExpectedPnAndVn>> suppressPgMap = new HashMap<String, List<ExpectedPnAndVn>>();
+ suppressPgMap .put("1AAXX.1.1", Arrays.asList(new ExpectedPnAndVn("MarkDuplicates", "1"), new ExpectedPnAndVn("bwa", "1")));
+ suppressPgMap .put("1AAXX.2.1", Arrays.asList(new ExpectedPnAndVn("bwa", "2")));
+ suppressPgMap .put("1AAXX.3.1", new ArrayList<ExpectedPnAndVn>(0));
+ return new Object[][] {
+ { false, withPgMap},
+ { true, suppressPgMap}
+ };
+ }
+}
diff --git a/src/tests/java/net/sf/picard/sam/ValidateSamFileTest.java b/src/tests/java/net/sf/picard/sam/ValidateSamFileTest.java
index ca85dde..0150445 100644
--- a/src/tests/java/net/sf/picard/sam/ValidateSamFileTest.java
+++ b/src/tests/java/net/sf/picard/sam/ValidateSamFileTest.java
@@ -284,6 +284,13 @@ public class ValidateSamFileTest {
}
@Test
+ public void testQualityFormatValidation() throws Exception {
+ final SAMFileReader samReader = new SAMFileReader(new File("./testdata/net/sf/picard/util/QualityEncodingDetectorTest/illumina-as-standard.bam"));
+ final Histogram<String> results = executeValidation(samReader, null);
+ Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_QUALITY_FORMAT.getHistogramString()).getValue(), 1.0);
+ }
+
+ @Test
public void testCigarOffEndOfReferenceValidation() throws Exception {
final SAMRecordSetBuilder samBuilder = new SAMRecordSetBuilder();
samBuilder.addFrag(String.valueOf(0), 0, 1, false);
diff --git a/src/tests/java/net/sf/picard/util/QualityEncodingDetectorTest.java b/src/tests/java/net/sf/picard/util/QualityEncodingDetectorTest.java
new file mode 100644
index 0000000..996d4af
--- /dev/null
+++ b/src/tests/java/net/sf/picard/util/QualityEncodingDetectorTest.java
@@ -0,0 +1,71 @@
+package net.sf.picard.util;
+
+import net.sf.picard.fastq.FastqReader;
+import net.sf.samtools.SAMFileReader;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+
+public class QualityEncodingDetectorTest {
+
+ private static class Testcase {
+ private final File f;
+ private final FastqQualityFormat q;
+
+ Testcase(final File file, final FastqQualityFormat qualityFormat) {
+ this.f = file;
+ this.q = qualityFormat;
+ }
+ }
+
+ final static List<Testcase> FASTQ_TESTCASES = Arrays.asList(
+ // Need to use full-range quality here, as Solexa and Illumina are near indistinguishable
+ new Testcase(new File("./testdata/net/sf/picard/sam/fastq2bam/fastq-solexa/solexa_full_range_as_solexa.fastq"), FastqQualityFormat.Solexa),
+ new Testcase(new File("./testdata/net/sf/picard/sam/fastq2bam/fastq-illumina/s_1_sequence.txt"), FastqQualityFormat.Illumina),
+ new Testcase(new File("./testdata/net/sf/picard/sam/fastq2bam/fastq-sanger/5k-30BB2AAXX.3.aligned.sam.fastq"), FastqQualityFormat.Standard)
+ );
+ final static List<Testcase> BAM_TESTCASES = Arrays.asList(
+ new Testcase(new File("./testdata/net/sf/picard/sam/unmapped.sam"), FastqQualityFormat.Standard),
+ new Testcase(new File("./testdata/net/sf/samtools/BAMFileIndexTest/index_test.bam"), FastqQualityFormat.Standard),
+ new Testcase(new File("./testdata/net/sf/picard/util/QualityEncodingDetectorTest/solexa-as-standard.bam"), FastqQualityFormat.Solexa),
+ new Testcase(new File("./testdata/net/sf/picard/util/QualityEncodingDetectorTest/illumina-as-standard.bam"), FastqQualityFormat.Illumina)
+
+ );
+
+ Object[][] renderObjectArrayArray(final List<Testcase> testcaseList) {
+ final Object[][] data = new Object[testcaseList.size()][];
+ for (int i = 0; i < data.length; i++) {
+ final Testcase testcase = testcaseList.get(i);
+ data[i] = new Object[]{testcase.f, testcase.q};
+ }
+ return data;
+ }
+
+ @DataProvider(name = "BAM_TESTCASES")
+ Object[][] bamTestcases() {
+ return renderObjectArrayArray(BAM_TESTCASES);
+ }
+
+ @DataProvider(name = "FASTQ_TESTCASES")
+ Object[][] fastqTestcases() {
+ return renderObjectArrayArray(FASTQ_TESTCASES);
+ }
+
+ @Test(dataProvider = "FASTQ_TESTCASES", groups = {"unix"})
+ public void testFastqQualityInference(final File input, final FastqQualityFormat expectedQualityFormat) {
+ final FastqReader reader = new FastqReader(input);
+ Assert.assertEquals(QualityEncodingDetector.detect(reader), expectedQualityFormat);
+ reader.close();
+ }
+
+ @Test(dataProvider = "BAM_TESTCASES", groups = {"unix"})
+ public void testBamQualityInference(final File input, final FastqQualityFormat expectedQualityFormat) {
+ final SAMFileReader reader = new SAMFileReader(input);
+ Assert.assertEquals(QualityEncodingDetector.detect(reader), expectedQualityFormat);
+ reader.close();
+ }
+}
diff --git a/testdata/net/sf/picard/sam/MarkDuplicates/merge1.sam b/testdata/net/sf/picard/sam/MarkDuplicates/merge1.sam
new file mode 100644
index 0000000..7e1c583
--- /dev/null
+++ b/testdata/net/sf/picard/sam/MarkDuplicates/merge1.sam
@@ -0,0 +1,14 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chr1 LN:101
+ at SQ SN:chr2 LN:101
+ at SQ SN:chr3 LN:101
+ at SQ SN:chr4 LN:101
+ at SQ SN:chr5 LN:101
+ at SQ SN:chr6 LN:101
+ at SQ SN:chr7 LN:404
+ at SQ SN:chr8 LN:202
+ at RG ID:1AAXX.1 SM:Hi,Mom! LB:mylib
+ at PG ID:MarkDuplicates PN:MarkDuplicates VN:1 CL:MarkDuplicates merge1.sam PP:bwa
+ at PG ID:bwa PN:bwa VN:1 CL:bwa aln
+1AAXX.1.1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1AAXX.1 PG:Z:MarkDuplicates
+1AAXX.1.1 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:1AAXX.1 PG:Z:MarkDuplicates
diff --git a/testdata/net/sf/picard/sam/MarkDuplicates/merge2.sam b/testdata/net/sf/picard/sam/MarkDuplicates/merge2.sam
new file mode 100644
index 0000000..10f4ed3
--- /dev/null
+++ b/testdata/net/sf/picard/sam/MarkDuplicates/merge2.sam
@@ -0,0 +1,13 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chr1 LN:101
+ at SQ SN:chr2 LN:101
+ at SQ SN:chr3 LN:101
+ at SQ SN:chr4 LN:101
+ at SQ SN:chr5 LN:101
+ at SQ SN:chr6 LN:101
+ at SQ SN:chr7 LN:404
+ at SQ SN:chr8 LN:202
+ at RG ID:1AAXX.2 SM:Hi,Mom! LB:mylib
+ at PG ID:bwa PN:bwa VN:2 CL:bwa aln
+1AAXX.2.1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1AAXX.2 PG:Z:bwa
+1AAXX.2.1 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:1AAXX.2 PG:Z:bwa
diff --git a/testdata/net/sf/picard/sam/MarkDuplicates/merge3.sam b/testdata/net/sf/picard/sam/MarkDuplicates/merge3.sam
new file mode 100644
index 0000000..05cdbe1
--- /dev/null
+++ b/testdata/net/sf/picard/sam/MarkDuplicates/merge3.sam
@@ -0,0 +1,13 @@
+ at HD VN:1.0 SO:coordinate
+ at SQ SN:chr1 LN:101
+ at SQ SN:chr2 LN:101
+ at SQ SN:chr3 LN:101
+ at SQ SN:chr4 LN:101
+ at SQ SN:chr5 LN:101
+ at SQ SN:chr6 LN:101
+ at SQ SN:chr7 LN:404
+ at SQ SN:chr8 LN:202
+ at RG ID:1AAXX.3 SM:Hi,Mom! LB:mylib
+ at PG ID:bwa PN:bwa VN:3 CL:bwa aln
+1AAXX.3.1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1AAXX.3
+1AAXX.3.1 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:1AAXX.3
diff --git a/testdata/net/sf/picard/util/QualityEncodingDetectorTest/illumina-as-standard.bam b/testdata/net/sf/picard/util/QualityEncodingDetectorTest/illumina-as-standard.bam
new file mode 100644
index 0000000..a98d864
Binary files /dev/null and b/testdata/net/sf/picard/util/QualityEncodingDetectorTest/illumina-as-standard.bam differ
diff --git a/testdata/net/sf/picard/util/QualityEncodingDetectorTest/solexa-as-standard.bam b/testdata/net/sf/picard/util/QualityEncodingDetectorTest/solexa-as-standard.bam
new file mode 100644
index 0000000..7235dfa
Binary files /dev/null and b/testdata/net/sf/picard/util/QualityEncodingDetectorTest/solexa-as-standard.bam differ
--
manipulate SAM and BAM files
More information about the debian-med-commit
mailing list