[med-svn] [Git][med-team/pilon][master] 4 commits: New upstream version 1.23+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Oct 20 07:45:08 BST 2021
Andreas Tille pushed to branch master at Debian Med / pilon
Commits:
2323bede by Andreas Tille at 2021-10-20T08:35:53+02:00
New upstream version 1.23+dfsg
- - - - -
375557de by Andreas Tille at 2021-10-20T08:36:33+02:00
Upload old version as long as we can not provide the new one
- - - - -
23facf13 by Andreas Tille at 2021-10-20T08:38:38+02:00
Upload to unstable
- - - - -
4ea71093 by Andreas Tille at 2021-10-20T08:44:26+02:00
Add todo fo new upstream version which does not build without more recent scala
- - - - -
13 changed files:
- build.sbt
- build.sh
- debian/changelog
- project/plugins.sbt
- src/main/scala/org/broadinstitute/pilon/BamFile.scala
- src/main/scala/org/broadinstitute/pilon/GapFiller.scala
- src/main/scala/org/broadinstitute/pilon/GenomeFile.scala
- src/main/scala/org/broadinstitute/pilon/GenomeRegion.scala
- src/main/scala/org/broadinstitute/pilon/PileUp.scala
- src/main/scala/org/broadinstitute/pilon/PileUpRegion.scala
- src/main/scala/org/broadinstitute/pilon/Pilon.scala
- src/main/scala/org/broadinstitute/pilon/Scaffold.scala
- src/main/scala/org/broadinstitute/pilon/Vcf.scala
Changes:
=====================================
build.sbt
=====================================
@@ -1,11 +1,12 @@
name := "pilon"
-version := "1.24"
+version := "1.23"
-scalaVersion := "2.13.4"
+scalaVersion := "2.12.7"
scalacOptions += "-deprecation"
scalacOptions += "-feature"
-libraryDependencies += "com.github.samtools" % "htsjdk" % "2.23.0"
+libraryDependencies += "commons-lang" % "commons-lang" % "2.6"
+
=====================================
build.sh
=====================================
@@ -12,5 +12,5 @@ sed -e "s/\(date.*=\).*/\\1 \"$commitdate\"/" \
<$tmp >$f
#sbt $* package
sbt $* assembly
-ln -sf `pwd`/target/scala-2.13/pilon-assembly-$version.jar ~/lib/pilon/pilon-dev.jar
+ln -sf `pwd`/target/scala-2.12/pilon-assembly-$version.jar ~/lib/pilon/pilon-dev.jar
mv $tmp $f
=====================================
debian/changelog
=====================================
@@ -1,5 +1,13 @@
pilon (1.24-1) UNRELEASED; urgency=medium
+ TODO: New version needs more recent scala
+ see: https://lists.debian.org/debian-med/2021/10/msg00019.html
+ https://bugs.debian.org/845113
+
+ -- Andreas Tille <tille at debian.org> Wed, 20 Oct 2021 08:39:15 +0200
+
+pilon (1.23+dfsg-3) unstable; urgency=medium
+
[ Andreas Tille ]
* New upstream version
* Standards-Version: 4.6.0 (routine-update)
@@ -7,17 +15,7 @@ pilon (1.24-1) UNRELEASED; urgency=medium
[ Steffen Moeller ]
* Fix watchfile to detect new versions on github (routine-update)
- TODO:
-...
-src/main/scala/org/broadinstitute/pilon/PileUpRegion.scala:136: error: value asScala is not a member of java.util.List[htsjdk.samtools.CigarElement]
- val cigarElements = cigar.getCigarElements.asScala
- ^
-src/main/scala/org/broadinstitute/pilon/Scaffold.scala:351: error: value asScala is not a member of htsjdk.samtools.SAMRecordIterator
- for (read <- reader.iterator.asScala if (!read.getReadUnmappedFlag)) {
- ^
-10 errors found
-
- -- Andreas Tille <tille at debian.org> Thu, 09 Sep 2021 08:21:06 +0200
+ -- Andreas Tille <tille at debian.org> Wed, 20 Oct 2021 08:37:00 +0200
pilon (1.23+dfsg-2) unstable; urgency=medium
=====================================
project/plugins.sbt
=====================================
@@ -1 +1 @@
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
=====================================
src/main/scala/org/broadinstitute/pilon/BamFile.scala
=====================================
@@ -21,12 +21,12 @@ package org.broadinstitute.pilon
import collection.mutable.{HashMap,Map}
import java.io.File
-import scala.jdk.CollectionConverters._
+import scala.collection.JavaConverters._
import htsjdk.samtools._
object BamFile {
val indexSuffix = ".bai"
- val maxInsertSizes = Map(("frags" -> 500), ("jumps" -> 10000), ("unpaired" -> 10000), ("bam" -> 10000))
+ val maxInsertSizes = Map(('frags -> 500), ('jumps -> 10000), ('unpaired -> 10000), ('bam -> 10000))
val minOrientationPct = 10
val maxFragInsertSize = 700
// long read type codes
@@ -35,14 +35,14 @@ object BamFile {
val pacbioLongRead = 2
}
-class BamFile(val bamFile: File, var bamType: String, val subType: String = "none") {
+class BamFile(val bamFile: File, var bamType: Symbol, val subType: Symbol = 'none) {
import BamFile._
val path = bamFile.getCanonicalPath()
var baseCount: Long = 0
- val longReadType = if (bamType == "unpaired") {
- if (subType == "nanopore") nanoporeLongRead
- else if (subType == "pacbio") pacbioLongRead
+ val longReadType = if (bamType == 'unpaired) {
+ if (subType == 'nanopore) nanoporeLongRead
+ else if (subType == 'pacbio) pacbioLongRead
else 0
} else 0
@@ -75,7 +75,7 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
}
def validateSeqs(seqsOfInterest: Set[String]) = {
- require(getSeqNames.intersect(seqsOfInterest).nonEmpty, bamFile.toString + " doesn't have sequence for any of " + seqsOfInterest.mkString(", "))
+ require(!getSeqNames.intersect(seqsOfInterest).isEmpty, bamFile + " doesn't have sequence for any of " + seqsOfInterest.mkString(", "))
}
@@ -112,8 +112,7 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
// the pairs for which only one end overlaps? Adding 10k slop until
// that's figured out.
//pileUpRegion.addReads(bam.reader.queryOverlapping(name, start, stop))
- val longRead = (bamType == "unpaired") && (subType == "nanopore" || subType == "pacbio")
-
+ val longRead = (bamType == 'unpaired) && (subType == 'nanopore || subType == 'pacbio)
val r = reader
val reads = r.queryOverlapping(region.name,
(region.start-10000) max 0, (region.stop+10000) min region.contig.length).asScala
@@ -273,22 +272,22 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
}
- def nStrays = pairs().length
+ def nStrays = pairs.length
def printDebug = println("mm: " + readMap1.size + "+" + readMap2.size + "=" + nStrays/2)
}
val strayMateMap = new MateMap()
- def autoBam(): String = {
+ def autoBam(): Symbol = {
val fr = pctFR
val rf = pctRF
val un = pctUnpaired
- if (un >= fr && un >= rf) "unpaired"
+ if (un >= fr && un >= rf) 'unpaired
else {
val insertSize = if (rf > fr) insertStatsRF.mean else insertStatsFR.mean
- if (insertSize >= maxFragInsertSize) "jumps" else "frags"
+ if (insertSize >= maxFragInsertSize) 'jumps else 'frags
}
}
@@ -319,7 +318,7 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
r.close
val totalReads = mapped + unmapped + filtered
- var summary = bamFile.toString + ": %d reads, %d filtered, %d mapped, %d proper".format(totalReads, filtered, mapped, proper)
+ var summary = bamFile + ": %d reads, %d filtered, %d mapped, %d proper".format(totalReads, filtered, mapped, proper)
if (Pilon.strays) summary += ", " + strayMateMap.nStrays + " stray"
val insertCount = insertStatsFR.count + insertStatsRF.count
if (pctFR >= minOrientationPct)
@@ -329,9 +328,9 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
if (pctUnpaired >= minOrientationPct)
summary += ", Unpaired " + pctUnpaired + "% " + insertStatsUnpaired
summary += ", max " + maxInsertSize
- if (bamType == "bam") {
- bamType = autoBam()
- summary += " " + bamType
+ if (bamType == 'bam) {
+ bamType = autoBam
+ summary += " " + bamType.name
}
println(summary)
}
@@ -347,7 +346,7 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
val flanks = flankRegion(region)
var reads = readsInRegion(flanks)
if (Pilon.debug) println("readsInRegion flanks: " + flanks + " " + reads.length + " reads")
- if (Pilon.strays && bamType != "unpaired") {
+ if (Pilon.strays && bamType != 'unpaired) {
val mm = new MateMap(reads)
if (Pilon.debug) mm.printDebug
reads ++= mm.findStrays
@@ -378,7 +377,7 @@ class BamFile(val bamFile: File, var bamType: String, val subType: String = "non
val frPct = pctFR
var mates: List[SAMRecord] = Nil
- for ((r1, r2) <- mateMap.pairs()) {
+ for ((r1, r2) <- mateMap.pairs) {
if (!(r1.getReadUnmappedFlag || r1.getProperPairFlag)) {
val rc = r1.getReadNegativeStrandFlag
val start = r1.getAlignmentStart
=====================================
src/main/scala/org/broadinstitute/pilon/GapFiller.scala
=====================================
@@ -73,7 +73,7 @@ class GapFiller(val region: GenomeRegion) {
else if (Pilon.debug) println("Gap closed but bad size: " + break + " " + closedLength)
}
if (solutionOK) solution
- else if (isGap || (Pilon.fixBreaks && loop.length == 0)) {
+ else if (isGap || ((Pilon.fixList contains 'breaks) && loop.length == 0)) {
// build partial solution using consensus from each side, opening gap if necessary
val fromRight = consensusFromRight(pathsFromRight)
val fromLeft = consensusFromLeft(pathsFromLeft)
@@ -127,10 +127,12 @@ class GapFiller(val region: GenomeRegion) {
assembler.addReads(reads)
if (Pilon.dumpReads) writeBam(break.toString, reads)
assembler.buildGraph
- if (Pilon.fixNovel && Pilon.novelContigs != Nil) {
+ if (Pilon.fixList.contains('novel) && Pilon.novelContigs != Nil) {
assembler.addGraphSeqs(Pilon.novelContigs)
- }
+ }
if (Pilon.debug) println("assembleIntoBreak: " + break + " " + assembler)
+
+ //if (Pilon.fixList contains 'novelbreaks) assembler.novel
val startOffset = breakRadius
var start = (break.start - startOffset) max region.start
@@ -167,22 +169,22 @@ class GapFiller(val region: GenomeRegion) {
solutionSet += s
}
}
-
- val solutions = solutionSet.toList sortWith { (a,b) => a._3.length + a._2.length < b._3.length + b._2.length }
- var solutionLengths = solutionSet map {s => s._3.length - s._2.length}
+ val solutions = solutionSet.toList sortWith { (a,b) => a._3.length - a._2.length > b._3.length - b._2.length }
+ var solutionLengths = Set[Int]()
+ solutions map {s => solutionLengths += s._3.length - s._2.length}
if (Pilon.debug) {
println("breakJoins: " + solutions.length + " solutions; lengths " + solutionLengths)
for (s <- solutions) println(" " + s)
}
if (solutionLengths.size == 1) {
- if (Pilon.debug) println("...all solutions of same length, using smallest total change above")
+ if (Pilon.debug) println("...all solutions of same length, using first above")
List(solutions.head)
} else{
solutions
}
}
-
+
def joinBreak(startArg: Int, forward: String, reverse: String, stopArg: Int) = {
var start = startArg
var stop = stopArg
@@ -201,13 +203,13 @@ class GapFiller(val region: GenomeRegion) {
if (patch != "") {
val solution = trimPatch(start, patch, stop)
start = solution._1
-
+
if (solution._2 == solution._3) {
if (Pilon.debug) println("...no change")
solution
} else {
if (Pilon.debug)
- println("...start=" + start + " ref=(" + solution._2.length + ")" + solution._2
+ println("...start=" + start + " ref=(" + solution._2.length + ")" + solution._2
+ " new=(" + solution._3.length + ")" + solution._3)
solution
}
@@ -216,7 +218,7 @@ class GapFiller(val region: GenomeRegion) {
noSolution
}
}
-
+
def properOverlap(left: String, right: String, minOverlap: Int): String = {
def substrMatch(a: String, aOffset: Int, b: String, bOffset: Int, len: Int): Boolean = {
@@ -302,7 +304,7 @@ class GapFiller(val region: GenomeRegion) {
if (estimatedLength == 0) {
reads = recruitReads(region)
} else {
- val unpairedBams = region.bamsOfType("unpaired")
+ val unpairedBams = region.bamsOfType('unpaired)
for (bam <- unpairedBams) {
reads ++= bam.readsInRegion(leftFlank)
reads ++= bam.readsInRegion(rightFlank)
@@ -322,14 +324,14 @@ class GapFiller(val region: GenomeRegion) {
assembler.addReads(reads)
assembler.buildGraph
val (forward, reverse, loop) = assembler.multiBridge(left, right)
- if (Pilon.verbose && loop.nonEmpty)
+ if (Pilon.verbose && !loop.isEmpty)
println(" loop: " + loop.length + " " + loop)
var patches = Set[String]()
for (f <- forward; r <- reverse) {
val patch = properOverlap(f, r, GapFiller.k)
// We should be very close in our closure estimation from alignments. Don't allow
- if (patch.nonEmpty && (estimatedLength == 0 || (patch.length - 2 * breakRadius).abs < estimatedLengthSlop)) {
+ if (!patch.isEmpty && (estimatedLength == 0 || (patch.length - 2 * breakRadius).abs < estimatedLengthSlop)) {
patches += patch
}
}
@@ -338,7 +340,7 @@ class GapFiller(val region: GenomeRegion) {
}
val lengths = patches.map(_.length).toSet
if (Pilon.verbose)
- println(s"{$patches.size} patches; lengths $lengths")
+ println(patches.size + " patches; lengths " + lengths)
if (lengths.size == 1 && (estimatedLength == 0 || loop.isEmpty || (loop.length - estimatedLength).abs < estimatedLengthSlop)) {
val patch = patches.head
@@ -356,11 +358,11 @@ class GapFiller(val region: GenomeRegion) {
def breakRadius = {
val minRadius = 3 * Assembler.K
- val inserts = region.bamsOfType("frags") map {bam => bam.insertSizeMean /*+ bam.insertSizeSigma*/}
+ val inserts = region.bamsOfType('frags) map {bam => bam.insertSizeMean /*+ bam.insertSizeSigma*/}
val insertMean = if (inserts.length > 0) (inserts.sum / inserts.length).round.toInt else 0
minRadius max insertMean
}
-
+
def recruitReadsFromBams(reg: Region, bams: List[BamFile]) = {
var reads = List[SAMRecord]()
for (b <- bams) {
@@ -371,37 +373,37 @@ class GapFiller(val region: GenomeRegion) {
reads
}
- def recruitReadsOfType(reg: Region, bamType: String) = {
+ def recruitReadsOfType(reg: Region, bamType: Symbol) = {
recruitReadsFromBams(reg, region.bamsOfType(bamType))
}
//def recruitFrags(reg: Region) = mateMapOfType(reg, 'frags).values.toList
- def recruitFrags(reg: Region) = recruitReadsOfType(reg, "frags")
+ def recruitFrags(reg: Region) = recruitReadsOfType(reg, 'frags)
def recruitJumps(reg: Region) = {
var reads = List[SAMRecord]()
- for (b <- region.bamsOfType("jumps")) {
+ for (b <- region.bamsOfType('jumps)) {
reads ++= b.recruitBadMates(reg)
}
- if (Pilon.debug)
+ if (Pilon.debug)
println("# Recruiting jump bad mates: count=" + reads.length)
reads
}
def recruitUnpaired(reg: Region) =
- if (!Pilon.longread) recruitReadsOfType(reg, "unpaired") else Nil
+ if (!Pilon.longread) recruitReadsOfType(reg, 'unpaired) else Nil
def recruitLocalReads(reg: Region) = recruitFrags(reg) ++ recruitUnpaired(reg)
def recruitLongReads(reg: Region) = {
- recruitReadsFromBams(reg, region.bamsOfType("unpaired").filter(_.longReadType > 0))
+ recruitReadsFromBams(reg, region.bamsOfType('unpaired).filter(_.longReadType > 0))
}
def recruitReads(reg: Region) = {
recruitLocalReads(reg) ++ recruitJumps(reg) // ++ recruitLongReads(reg)
}
- def writeBam(fileName: String, reads: List[SAMRecord]) = {
+ def writeBam(fileName: String, reads: List[SAMRecord]) {
val file = new File(fileName + ".sam")
val header = new SAMFileHeader()
header.addSequence(new SAMSequenceRecord(region.contig.getName, region.contig.getBases.length))
=====================================
src/main/scala/org/broadinstitute/pilon/GenomeFile.scala
=====================================
@@ -86,36 +86,43 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
bamFiles foreach {_.validateSeqs(contigsOfInterest)}
- if (Pilon.strays || Pilon.fixCircles) {
+ if (Pilon.strays || Pilon.fixList.contains('circles)) {
println("Scanning BAMs")
- bamFiles.map(_.scan(contigsOfInterest))
+ // Scan BAMs in parallel
+ //bamFiles.filter({_.bamType != 'unpaired}).par.map(_.scan(contigsOfInterest))
+ bamFiles.par.map(_.scan(contigsOfInterest))
}
val circles = Scaffold.findHgapCircles(bamFiles)
// If assemble novel sequence up front, so that we can potentially place the
// contigs into scaffolds when we process them.
- if (Pilon.fixNovel)
+ if (Pilon.fixList contains 'novel)
Pilon.novelContigs = assembleNovel(bamFiles)
var chunks = regions.map(_._2).flatten
- chunks foreach { r =>
+ if (Pilon.threads > 1) {
+ // Do parallel processing randomly to even out load if all the
+ // big chunks are early in the file
+ chunks = Random.shuffle(chunks)
+ }
+ chunks.par foreach { r =>
println("Processing " + r)
r.initializePileUps
bamFiles foreach { r.processBam(_) }
r.postProcess
- if ((Pilon.fixCircles) &&
+ if ((Pilon.fixList contains 'circles) &&
/*(r.contig.length < 5000 && r.contig.length >= 1000) ||*/ (circles contains r.contig.getName)) {
- if (Pilon.verbose) println(s"$r might be a circle!")
+ if (Pilon.verbose) println(r + " might be a circle!")
r.closeCircle(circles.getOrElse(r.contig.getName, 0))
}
- if (Pilon.vcf || Pilon.fixSnps || Pilon.fixIndels || Pilon.fixGaps || Pilon.fixLocal) {
+ if (Pilon.vcf || !Pilon.fixList.intersect(Set('snps, 'indels, 'gaps, 'local)).isEmpty) {
r.identifyAndFixIssues
// If we don't need pileups for VCF later, free up the memory now!
if (!Pilon.vcf) r.finalizePileUps
}
- println(s"$r log:")
- r.printLog()
+ println(r + " log:")
+ r.printLog
println("Finished processing " + r)
}
@@ -124,7 +131,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
new PrintWriter(new BufferedWriter(new FileWriter(changesFile)))
else null
val fastaFile = Pilon.outputFile(".fasta")
- val fastaWriter = if (Pilon.fixSnps || Pilon.fixIndels || Pilon.fixGaps || Pilon.fixLocal || Pilon.fixNovel)
+ val fastaWriter = if (!Pilon.fixList.isEmpty)
new PrintWriter(new BufferedWriter(new FileWriter(fastaFile)))
else null
@@ -153,7 +160,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
}
}
// Write the FASTA all at once rather than in chunks for formatting reasons
- if (fastaWriter != null) {
+ if (!Pilon.fixList.isEmpty) {
println("Writing updated " + newName + " to " + fastaFile)
val fixedRegions = reg._2 map { _.bases }
val bases = fixedRegions reduceLeft {_ ++ _} map {_.toChar} mkString ""
@@ -161,7 +168,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
}
}
- if (Pilon.fixNovel) {
+ if (Pilon.fixList contains 'novel) {
val novelContigs = Pilon.novelContigs
for (n <- 0 until novelContigs.length) {
val header = "pilon_novel_%03d".format(n + 1)
@@ -169,7 +176,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
writeFastaElement(fastaWriter, header, novelContigs(n))
}
}
- if (fastaWriter != null) fastaWriter.close
+ if (!Pilon.fixList.isEmpty) fastaWriter.close
if (Pilon.vcf) vcf.close
if (Pilon.changes) changesWriter.close
coverageSummary(bamFiles)
@@ -180,7 +187,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
var totalBaseCount: Long = 0
for ((t,files) <- bamTypes) {
val typeBaseCount = files.map(_.baseCount).sum
- println("Mean " + t + " coverage: " + roundDiv(typeBaseCount, genomeSize))
+ println("Mean " + t.name + " coverage: " + roundDiv(typeBaseCount, genomeSize))
totalBaseCount += typeBaseCount
}
println("Mean total coverage: " + roundDiv(totalBaseCount, genomeSize))
@@ -197,16 +204,16 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
if (Pilon.verbose) print("..." + contig.getName)
}
//genomeGraph.buildGraph
- if (Pilon.verbose) println()
+ if (Pilon.verbose) println
val assembler = new Assembler()
- bamFiles filter {_.bamType != "jumps"} foreach { bam =>
+ bamFiles filter {_.bamType != 'jumps} foreach { bam =>
val reads = bam.getUnaligned
print("..." + reads.length + " unmapped reads")
assembler.addReads(reads)
}
print("...assembling contigs")
val contigs = assembler.novel(genomeGraph)
- println()
+ println
val contigLengths = contigs map {_.length}
println("Assembled %d novel contigs containing %d bases".format(contigs.length, contigLengths.sum))
contigs
@@ -237,7 +244,7 @@ class GenomeFile(val referenceFile: File, val targets : String = "") {
def parseTargetFile(fileName: String) = {
try {
- Source.fromFile(fileName).getLines().flatMap({parseTargetString(_)}).toList
+ Source.fromFile(fileName).getLines.flatMap({parseTargetString(_)}).toList
} catch {
case ioe: Exception => Nil
}
=====================================
src/main/scala/org/broadinstitute/pilon/GenomeRegion.scala
=====================================
@@ -63,27 +63,25 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
var logString = ""
+ // Hack alert...if we're running multithreaded, buffer our output for atomic
+ // write when we're done. Otherwise, we would rather see output as it happens.
def log(s: String) = {
- print(s)
+ if (Pilon.threads > 1)
+ logString += s
+ else
+ print(s)
}
def logln(s: String = "") = {
log(s + "\n")
}
- def printLog() = print(logString)
+ def printLog() = if (Pilon.threads > 1) print(logString)
+ var changeMap = Map.empty[Int, (Symbol, PileUp)]
- object ChangeKind extends Enumeration {
- type ChangeKind = Value
- val SNP, INS, DEL, AMB = Value
- }
- import ChangeKind._
-
- var changeMap = Map.empty[Int, (ChangeKind, PileUp)]
-
- def addChange(loc: Int, kind: ChangeKind, pu: PileUp) = {
- if (kind == AMB) ambiguous(loc) = true
+ def addChange(loc: Int, kind: Symbol, pu: PileUp) = {
+ if (kind == 'amb) ambiguous(loc) = true
else changed(loc) = true
changeMap += (loc -> (kind, pu))
}
@@ -136,13 +134,13 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
//val bams = Map.empty[Symbol, BamRegion]
var bams = List[BamFile]()
- def bamsOfType(bamType: String) = bams filter { _.bamType == bamType }
+ def bamsOfType(bamType: Symbol) = bams filter { _.bamType == bamType }
- def nanoporeBams = bamsOfType("unpaired") filter { _.subType == "nanopore"}
+ def nanoporeBams = bamsOfType('unpaired) filter { _.subType == 'nanopore}
- def pacbioBams = bamsOfType("unpaired") filter { _.subType == "pacbio" }
+ def pacbioBams = bamsOfType('unpaired) filter { _.subType == 'pacbio }
- def fragBams = bamsOfType("frags")
+ def fragBams = bamsOfType('frags)
def longReadOnly = Pilon.longread && fragBams.isEmpty
@@ -232,7 +230,7 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
//meanReadLength = pileUpRegion.meanReadLength
// Pass 1: pull out values from pileups & call base changes
- val fixamb = Pilon.iupac || Pilon.fixAmb
+ val fixamb = Pilon.iupac || (Pilon.fixList contains 'amb)
for (i <- 0 until size) {
val pu = pileUpRegion(i)
@@ -255,9 +253,9 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
if (n >= minDepth && r != 'N' && !deleted(i) && bc.called) {
if (homo && b == r && bc.highConfidence && !bc.indel)
confirmed(i) = true
- else if (bc.isInsertion && bc.homoIndel) addChange(i, INS, pu)
+ else if (bc.isInsertion && bc.homoIndel) addChange(i, 'ins, pu)
else if (bc.isDeletion && bc.homoIndel) {
- addChange(i, DEL, pu)
+ addChange(i, 'del, pu)
for (j <- 1 until bc.deletion.length) {
deleted(i + j) = true
pileUpRegion(i + j).deletions += pu.deletions
@@ -265,8 +263,8 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
} else if (b != r && bc.score > 0) {
// for ambiguous bases, fix them if --fix fixamb or if original base
// not one of the top two alternatives
- if (homo) addChange(i, SNP, pu)
- else if (fixamb || bc.altBase != r) addChange(i, AMB, pu)
+ if (homo) addChange(i, 'snp, pu)
+ else if (fixamb || bc.altBase != r) addChange(i, 'amb, pu)
}
}
}
@@ -285,15 +283,15 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
}
def processBam(bam: BamFile) = {
- log(bam.bamType + " " + bam.bamFile + ": ")
+ log(bam.bamType.name + " " + bam.bamFile + ": ")
// This is a real kludge...
val covBefore = new Array[Int](size)
- if (bam.bamType != "jumps")
+ if (bam.bamType != 'jumps)
for (i <- 0 until size)
covBefore(i) = pileUpRegion.pileups(i).depth.toInt
val coverage = bam.process(this)
logln("coverage " + coverage.toString)
- if (bam.bamType != "jumps")
+ if (bam.bamType != 'jumps)
for (i <- 0 until size)
fragCoverage(i) += pileUpRegion.pileups(i).depth.toInt - covBefore(i)
bams ::= bam
@@ -309,8 +307,8 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
logln("# IdentifyIssues: " + this)
identifyIssueRegions
}
- val fixSnps = Pilon.fixSnps
- val fixIndels = Pilon.fixIndels
+ val fixSnps = Pilon.fixList contains 'snps
+ val fixIndels = Pilon.fixList contains 'indels
var snps = 0
var ins = 0
var dels = 0
@@ -325,10 +323,10 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
val cBase = bc.base
if (!excluded(i)) {
kind match {
- case SNP =>
+ case 'snp =>
if (fixSnps) snpFixList ::= (loc, rBase.toString, cBase.toString)
snps += 1
- case AMB =>
+ case 'amb =>
if (fixSnps && !Pilon.longread) {
if (Pilon.iupac) {
// we put these on the small fix list because iupac codes can mess up assembly
@@ -339,12 +337,12 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
}
amb += 1
}
- case INS =>
+ case 'ins =>
val insert = bc.insertion
if (fixIndels) smallFixList ::= (loc, "", insert)
ins += 1
insBases += insert.length
- case DEL =>
+ case 'del =>
val deletion = bc.deletion
if (fixIndels) smallFixList ::= (loc, deletion, "")
dels += 1
@@ -359,15 +357,15 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
val nonN = originalBases count {x => x != 'N'}
logln("Confirmed " + nConfirmed + " of " + nonN + " bases (" +
(nConfirmed * 100.0 / nonN).formatted("%.2f") + "%)")
- if (Pilon.fixSnps) log("Corrected ") else log("Found ")
- if (Pilon.diploid) log(s"${snps+amb} snps")
+ if (Pilon.fixList contains 'snps) log("Corrected ") else log("Found ")
+ if (Pilon.diploid) log((snps + amb) + " snps")
else {
- log(s"$snps snps; ")
- log(s"$amb ambiguous bases")
+ log(snps + " snps; ")
+ log(amb + " ambiguous bases")
}
- if (Pilon.fixIndels) log("; corrected ") else log("; found ")
- log(s"$ins small insertions totaling $insBases bases")
- logln(s", $dels small deletions totaling $delBases bases")
+ if (Pilon.fixList contains 'indels) log("; corrected ") else log("; found ")
+ log(ins + " small insertions totaling " + insBases + " bases")
+ logln(", " + dels + " small deletions totaling " + delBases + " bases")
// Report large collapsed regions (possible segmental duplication)
val duplications = duplicationEvents
@@ -380,7 +378,7 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
fixIssues(snpFixList)
// Try to fill gaps
- if (Pilon.fixGaps && gaps.nonEmpty) {
+ if ((Pilon.fixList contains 'gaps) && gaps.nonEmpty) {
logln("# Attempting to fill gaps")
for (gap <- gaps) {
val filler = new GapFiller(this)
@@ -394,7 +392,7 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
// Try to reassemble around possible contiguity breaks, but stay away from gaps
val breaks = possibleBreaks
- if (Pilon.fixLocal && breaks.nonEmpty) {
+ if ((Pilon.fixList contains 'local) && breaks.nonEmpty) {
logln("# Attempting to fix local continuity breaks")
for (break <- breaks) {
val filler = new GapFiller(this)
@@ -522,21 +520,21 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
val rBase = refBase(locus(i))
val cBase = pu.baseCall.base
kind match {
- case SNP =>
- log(name + " " + locus(i) + " snp " + rBase + " " + cBase)
+ case 'snp =>
+ log(name + " " + locus(i) + " " + kind.name + " " + rBase + " " + cBase)
if (Pilon.debug) log(" " + pu)
log(endLine)
- case INS =>
- log(name + " " + locus(i) + " ins " + "." + " " + pu.baseCall.insertion)
+ case 'ins =>
+ log(name + " " + locus(i) + " " + kind.name + " " + "." + " " + pu.baseCall.insertion)
if (Pilon.debug) log(" " + pu)
log(endLine)
- case DEL =>
- log(name + " " + locus(i) + " del " + pu.baseCall.deletion + " " + ".")
+ case 'del =>
+ log(name + " " + locus(i) + " " + kind.name + " " + pu.baseCall.deletion + " " + ".")
if (Pilon.debug) log(" " + pu)
log(endLine)
- case AMB =>
+ case 'amb =>
if (Pilon.verbose && rBase != cBase) {
- log(name + " " + locus(i) + " amb " + rBase + " " + cBase)
+ log(name + " " + locus(i) + " " + kind.name + " " + rBase + " " + cBase)
if (Pilon.debug) log(" " + pu)
log(endLine)
}
@@ -642,7 +640,7 @@ class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
}
}
- def writeChanges(changes: PrintWriter, newName: String = name, offset: Int = 0) = {
+ def writeChanges(changes: PrintWriter, newName: String = name, offset: Int = 0) {
val fixes = fixFixList(snpFixList ++ smallFixList ++ bigFixList)
var delta = 0
for (fix <- fixes) {
=====================================
src/main/scala/org/broadinstitute/pilon/PileUp.scala
=====================================
@@ -172,7 +172,7 @@ class PileUp {
else base.toString //+ (if (!homo) "/" + altBase else "")
}
- def baseMatch(refBase: Char) = {//
+ def baseMatch(refBase: Char) {//
refBase == base // TODO: handle IUPAC codes
}
=====================================
src/main/scala/org/broadinstitute/pilon/PileUpRegion.scala
=====================================
@@ -19,7 +19,7 @@
package org.broadinstitute.pilon
-import scala.jdk.CollectionConverters._
+import scala.collection.JavaConverters._
import htsjdk.samtools._
import Utils._
@@ -220,7 +220,7 @@ class PileUpRegion(name: String, start: Int, stop: Int)
}
def dump = {
- for (i <- 0 to size - 1) println(s"${locus(i)}: ${pileups(i)}")
+ for (i <- 0 to size - 1) println(locus(i) + ": " + pileups(i))
}
def postProcess = {
=====================================
src/main/scala/org/broadinstitute/pilon/Pilon.scala
=====================================
@@ -22,18 +22,9 @@ package org.broadinstitute.pilon
import java.io.File
object Pilon {
- val fixChoices= Set("snps", "indels", "gaps", "local")
- val experimentalFixChoices = Set("amb", "breaks", "circles", "novel", "scaffolds")
-
- var fixSnps = false
- var fixIndels = false
- var fixGaps = false
- var fixLocal = false
- var fixAmb = false
- var fixBreaks = false
- var fixCircles = false
- var fixNovel = false
- var fixScaffolds = false
+ // types of fixing we know about
+ val fixChoices = Set('snps, 'indels, 'gaps, 'local)
+ val experimentalFixChoices = Set('amb, 'breaks, 'circles, 'novel, 'scaffolds)
// input parameters
var bamFiles = List[BamFile]()
@@ -70,6 +61,7 @@ object Pilon {
var pacbio = false
var nanopore = false
var strays = true
+ var threads = 1
var trSafe = true
// Global computed data
@@ -78,7 +70,7 @@ object Pilon {
// for logging to output files
var commandArgs = Array[String]()
- def main(args: Array[String]) = {
+ def main(args: Array[String]) {
commandArgs = args
println(Version.version)
optionParse(args.toList)
@@ -92,9 +84,11 @@ object Pilon {
}
}
+ setDefaultParallelism(threads)
+
// Stray computation is expensive up front, so only turn it on
// if we're doing local reassembly
- strays &= fixGaps || fixLocal || fixScaffolds
+ strays &= (fixList contains 'gaps) || (fixList contains 'local) || (fixList contains 'scaffolds)
if (bamFiles.length == 0) {
println(usage)
@@ -107,7 +101,7 @@ object Pilon {
println("Genome: " + genomePath)
val genome = new GenomeFile(new File(genomePath), targets)
- println("Fixing " + (fixList mkString(", ")))
+ println("Fixing " + (fixList map {_.name} mkString(", ")))
genome.processRegions(bamFiles)
if (tracks) {
@@ -117,8 +111,9 @@ object Pilon {
}
- def optionParse(list: List[String]) : Unit = {
+ def optionParse(list: List[String]) : Unit = {
list match {
+ case Nil => Nil
case "--help" :: tail =>
println(usage)
print(help)
@@ -128,7 +123,7 @@ object Pilon {
// println(Version.version)
sys.exit(0)
case "--bam" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "bam")
+ bamFiles ::= new BamFile(new File(value), 'bam)
optionParse(tail)
case "--changes" :: tail =>
changes = true
@@ -162,7 +157,7 @@ object Pilon {
flank = value.toInt
optionParse(tail)
case "--frags" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "frags")
+ bamFiles ::= new BamFile(new File(value), 'frags)
optionParse(tail)
case "--gapmargin" :: value :: tail =>
gapMargin = value.toInt
@@ -171,7 +166,7 @@ object Pilon {
genomePath = value
optionParse(tail)
case "--jumps" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "jumps")
+ bamFiles ::= new BamFile(new File(value), 'jumps)
optionParse(tail)
case "--K" :: value :: tail =>
Assembler.K = value.toInt
@@ -192,7 +187,7 @@ object Pilon {
multiClosure = true
optionParse(tail)
case "--nanopore" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "unpaired", "nanopore")
+ bamFiles ::= new BamFile(new File(value), 'unpaired, 'nanopore)
longread = true
nanopore = true
optionParse(tail)
@@ -209,7 +204,7 @@ object Pilon {
outdir = value
optionParse(tail)
case "--pacbio" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "unpaired", "pacbio")
+ bamFiles ::= new BamFile(new File(value), 'unpaired, 'pacbio)
longread = true
pacbio = true
optionParse(tail)
@@ -217,8 +212,7 @@ object Pilon {
targets = value
optionParse(tail)
case "--threads" :: value :: tail =>
- println("--threads argument no longer supported; ignoring!")
- //threads = value.toInt
+ threads = value.toInt
optionParse(tail)
case "--tracks" :: tail =>
tracks = true
@@ -227,12 +221,12 @@ object Pilon {
// trSafe = true
// optionParse(tail)
case "--unpaired" :: value :: tail =>
- bamFiles ::= new BamFile(new File(value), "unpaired")
+ bamFiles ::= new BamFile(new File(value), 'unpaired)
optionParse(tail)
case "--variant" :: tail =>
// variant calling mode
vcf = true
- fixList += "breaks"
+ fixList += 'breaks
optionParse(tail)
//multiClosure = true
case "--vcf" :: tail =>
@@ -250,21 +244,10 @@ object Pilon {
case option :: tail =>
println("Unknown option " + option)
sys.exit(1)
- case Nil =>
}
- fixSnps = fixList contains "snps"
- fixIndels = fixList contains "indels"
- fixGaps = fixList contains "gaps"
- fixLocal = fixList contains "local"
- fixAmb = fixList contains "amb"
- fixBreaks = fixList contains "breaks"
- fixCircles = fixList contains "circles"
- fixNovel = fixList contains "novel"
- fixScaffolds = fixList contains "scaffolds"
}
def parseFixList(fix: String) = {
- // types of fixing we know about
val fixes = fix.split(",")
if (fix(0) != '+' && fix(0) != '-') fixList = Set.empty
for (f <- fixes) {
@@ -272,7 +255,7 @@ object Pilon {
else if (f == "none") fixList = Set.empty
else {
val plusMinus = if (f(0) == '-') f(0) else '+'
- val fsym = if (f(0) == plusMinus) f.substring(1) else f
+ val fsym = if (f(0) == plusMinus) Symbol(f.substring(1)) else Symbol(f)
if (fixChoices contains fsym) {
if (plusMinus == '+') fixList += fsym
else fixList -= fsym
@@ -281,14 +264,14 @@ object Pilon {
println("Warning: experimental fix option " + f)
if (plusMinus == '+') fixList += fsym
else fixList -= fsym
- } else if (fsym == "bases") {
+ } else if (fsym == 'bases) {
// for backward compatibility, 'bases applies to both 'snps and 'indels
if (plusMinus == '+') {
- fixList += "snps"
- fixList += "indels"
+ fixList += 'snps
+ fixList += 'indels
} else {
- fixList -= "snps"
- fixList -= "indels"
+ fixList -= 'snps
+ fixList -= 'indels
}
} else {
println("Error: unknown fix option " + f)
@@ -303,8 +286,21 @@ object Pilon {
val filePath = if (Pilon.outdir == "") fileName else Pilon.outdir + "/" + fileName
new File(filePath)
}
+
+ // Ugly, but thank you http://stackoverflow.com/questions/17865823
+ def setDefaultParallelism(numThreads: Int): Unit = {
+ val parPkgObj = scala.collection.parallel.`package`
+ val defaultTaskSupportField = parPkgObj.getClass.getDeclaredFields.find{
+ _.getName == "defaultTaskSupport"
+ }.get
-
+ defaultTaskSupportField.setAccessible(true)
+ defaultTaskSupportField.set(
+ parPkgObj,
+ new scala.collection.parallel.ForkJoinTaskSupport(
+ new scala.concurrent.forkjoin.ForkJoinPool(numThreads)))
+ }
+
val usage = """
Usage: pilon --genome genome.fasta [--frags frags.bam] [--jumps jumps.bam] [--unpaired unpaired.bam]
[...other options...]
@@ -328,10 +324,6 @@ object Pilon {
--bam any.bam
A bam file of unknown type; Pilon will scan it and attempt to classify it as one
of the above bam types.
- --nanopore ont.bam
- A bam file containing Oxford Nanopore read alignments. Experimental.
- --pacbio pb.bam
- A bam file containing Pacific Biosciences read alignments. Experimental.
OUTPUTS:
--output prefix
Prefix for output files
@@ -385,6 +377,8 @@ object Pilon {
scaffold00001 and coordinates 10000-20000 of scaffold00002.
If "targetlist" is the name of a file, each line will be treated as a target
specification.
+ --threads
+ Degree of parallelism to use for certain processing (default 1). Experimental.
--verbose
More verbose output.
--debug
=====================================
src/main/scala/org/broadinstitute/pilon/Scaffold.scala
=====================================
@@ -28,7 +28,7 @@ package org.broadinstitute.pilon
*/
-import scala.jdk.CollectionConverters._
+import scala.collection.JavaConverters._
import collection.mutable.{ Map, HashMap }
import htsjdk.samtools._
@@ -221,7 +221,7 @@ object Scaffold {
}
def analyzeStrays(bam: BamFile) = {
- val mm = bam.strayMateMap.pairs()
+ val mm = bam.strayMateMap.pairs
val sigma = bam.insertSizeSigma
val scaffolds = bam.getSeqs
val scaffoldSizes = scaffolds.map({_.getSequenceLength})
@@ -269,16 +269,16 @@ object Scaffold {
}
}
- def analyze(bamFiles: List[BamFile]) = {
+ def analyze(bamFiles: List[BamFile]) {
println("Analyze scaffolds")
- for (bam <- bamFiles filter {_.bamType == "jumps"})
+ for (bam <- bamFiles filter {_.bamType == 'jumps})
analyzeStrays(bam)
}
def findHgapCircles(bamFiles: List[BamFile]): Map[String, Int] = {
- if (Pilon.fixCircles) {
- val endAlignments = bamFiles filter {_.bamType == "unpaired"} flatMap findEndAlignments
+ if (Pilon.fixList contains 'circles) {
+ val endAlignments = bamFiles filter {_.bamType == 'unpaired} flatMap findEndAlignments
findCircles(endAlignments)
} else return new HashMap()
}
=====================================
src/main/scala/org/broadinstitute/pilon/Vcf.scala
=====================================
@@ -90,7 +90,7 @@ class Vcf(val file: File, val contigsWithSizes: List[(String, Int)] = Nil) {
val p = pileUp.delPct
//(rBase + bcString, rBase.toString, "1/1", depth - pileUp.deletions, pileUp.deletions)
//(rBase + bcString, rBase.toString, callType, pileUp.mqSum - pileUp.delQual, pileUp.delQual)
- (rBase.toString + bcString, rBase.toString, callType, 100 - p, p)
+ (rBase + bcString, rBase.toString, callType, 100 - p, p)
} else if (indelOk && !embedded && bc.isInsertion) {
loc -= 1
val rBase = region.refBase(loc)
@@ -98,7 +98,7 @@ class Vcf(val file: File, val contigsWithSizes: List[(String, Int)] = Nil) {
val p = pileUp.insPct
//(rBase.toString, rBase + bcString, "1/1", depth - pileUp.insertions, pileUp.insertions)
//(rBase.toString, rBase + bcString, callType, pileUp.mqSum - pileUp.insQual, pileUp.insQual)
- (rBase.toString, rBase.toString + bcString, callType, 100 - p, p)
+ (rBase.toString, rBase + bcString, callType, 100 - p, p)
} else if (bc.homo) {
val rBase = region.refBase(loc)
if (rBase == bc.base || bcString == "N")
@@ -178,8 +178,8 @@ class Vcf(val file: File, val contigsWithSizes: List[(String, Int)] = Nil) {
def writeFixRecord(region: GenomeRegion, fix: GenomeRegion.Fix) = {
val loc = fix._1 - 1
val rBase = region.refBase(loc)
- val ref = rBase.toString + fix._2
- val alt = rBase.toString + fix._3
+ val ref = rBase + fix._2
+ val alt = rBase + fix._3
val svlen = alt.length - ref.length
val svend = loc + ref.length - 1
val svtype = if (svlen < 0) "DEL" else "INS"
@@ -195,7 +195,7 @@ class Vcf(val file: File, val contigsWithSizes: List[(String, Int)] = Nil) {
var loc = dup.start - 1
val rBase = region.refBase(loc)
var line = region.name + tab + loc + tab + "." + tab
- line += rBase.toString + tab + "<DUP>" + tab + "." + tab + "PASS" + tab
+ line += rBase + tab + "<DUP>" + tab + "." + tab + "PASS" + tab
line += "SVTYPE=DUP;SVLEN=" + dup.size + ";END=" + dup.stop + ";IMPRECISE"
line += tab + "GT" + tab + "./."
writer.println(line)
View it on GitLab: https://salsa.debian.org/med-team/pilon/-/compare/2eada9a2760773c70d5cfa0b2623fc30578fdb95...4ea71093bea4df1d5dadeef1a6b8c3a9f2e13815
--
View it on GitLab: https://salsa.debian.org/med-team/pilon/-/compare/2eada9a2760773c70d5cfa0b2623fc30578fdb95...4ea71093bea4df1d5dadeef1a6b8c3a9f2e13815
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211020/24699b58/attachment-0001.htm>
More information about the debian-med-commit
mailing list