[med-svn] [pbsuite] 01/05: Update pbhoney for compatibility with blasr 5
Afif Elghraoui
afif at moszumanska.debian.org
Thu Dec 29 06:58:11 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository pbsuite.
commit 6b5ace3381e783a4071f249535b5aa70aa4fea56
Author: Afif Elghraoui <afif at debian.org>
Date: Sun Dec 11 14:05:10 2016 -0800
Update pbhoney for compatibility with blasr 5
Closes: #844034
---
debian/control | 4 +-
debian/patches/blasr-5.patch | 278 ++++++++++++++++++++++++++
debian/patches/intermediate-reads-files.patch | 77 +++++++
debian/patches/series | 2 +
4 files changed, 359 insertions(+), 2 deletions(-)
diff --git a/debian/control b/debian/control
index 6b41c64..3166f37 100644
--- a/debian/control
+++ b/debian/control
@@ -49,7 +49,7 @@ Depends:
${python:Depends},
python-pbsuite-utils (= ${source:Version}),
python-networkx,
- blasr,
+ blasr (>= 5.3),
Description: genome assembly upgrading tool
PBJelly is a highly automated pipeline that aligns long sequencing
reads (such as PacBio RS reads or long 454 reads in fasta format)
@@ -69,7 +69,7 @@ Depends:
python-networkx,
python-h5py (>= 2.0.1),
python-numpy,
- blasr,
+ blasr (>= 5.3),
samtools,
Recommends:
pbdagcon,
diff --git a/debian/patches/blasr-5.patch b/debian/patches/blasr-5.patch
new file mode 100644
index 0000000..4b37758
--- /dev/null
+++ b/debian/patches/blasr-5.patch
@@ -0,0 +1,278 @@
+Description: Update interface to blasr 5
+ blasr 5.1 and up began following the GNU convention for command-line flags.
+Author: Afif Elghraoui <afif at debian.org>
+Bug: https://sourceforge.net/p/pb-jelly/tickets/5/
+Bug-Debian: https://bugs.debian.org/844034
+Last-Update: 2016-12-11
+--- pbsuite.orig/docs/TemplateProtocol.xml
++++ pbsuite/docs/TemplateProtocol.xml
+@@ -6,7 +6,7 @@
+ <command notes="For PBS/Moab">echo '${CMD}' | msub -N "${JOBNAME}" -o ${STDOUT} -e ${STDERR} -l nodes=1:ppn=8,mem=48000mb</command>
+ <nJobs>1</nJobs>
+ </cluster>
+- <blasr>-minMatch 8 -sdpTupleSize 8 -minPctIdentity 75 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 8 -noSplitSubreads</blasr>
++ <blasr>--minMatch 8 --sdpTupleSize 8 --minPctIdentity 75 --bestn 1 --nCandidates 10 --maxScore -500 --nproc 8 --noSplitSubreads</blasr>
+ <input baseDir="/FULL/PATH/TO__/PBJelly/lambdaExample/data/reads/">
+ <job>pacbioReads.fasta</job>
+ </input>
+--- pbsuite.orig/docs/jellyExample/Protocol.xml
++++ pbsuite/docs/jellyExample/Protocol.xml
+@@ -1,7 +1,7 @@
+ <jellyProtocol>
+ <reference>/__PATH__/_TO_/jellyExample/data/reference/lambda.fasta</reference>
+ <outputDir>/__PATH__/_TO_/jellyExample/</outputDir>
+- <blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 20 -maxScore -500 -nproc 4 -noSplitSubreads</blasr>
++ <blasr>--minMatch 8 --minPctIdentity 70 --bestn 1 --nCandidates 20 --maxScore -500 --nproc 4 --noSplitSubreads</blasr>
+ <input baseDir="/__PATH__/_TO_/jellyExample/data/reads/">
+ <job>filtered_subreads.fastq</job>
+ </input>
+--- pbsuite.orig/pbsuite/banana/chunkyBlasr.py
++++ pbsuite/pbsuite/banana/chunkyBlasr.py
+@@ -10,10 +10,10 @@
+
+
+ #These are your default blasr parameters. Adjust at will.
+-parameters = "-maxScore -1000 -bestn 24 -maxLCPLength 16 -nCandidates 24 -noSplitSubreads"
++parameters = "--maxScore -1000 --bestn 24 --maxLCPLength 16 --nCandidates 24 --noSplitSubreads"
+
+
+-command = Template("blasr ${FAS} ${REF} ${SA} -m 4 -out ${OUT} -start ${START} -stride ${STRIDE} ${EXTRAPARAMS}")
++command = Template("blasr ${FAS} ${REF} ${SA} -m 4 --out ${OUT} --start ${START} --stride ${STRIDE} ${EXTRAPARAMS}")
+
+ USAGE="""%prog <reads.fasta> --output <outName> [ <options> ]
+ Creates All vs All alignment of reads.fasta
+--- pbsuite.orig/pbsuite/banana/Banana.py
++++ pbsuite/pbsuite/banana/Banana.py
+@@ -23,8 +23,8 @@
+ """
+ Simple overlapper
+ """
+- r,o,e = exe(("blasr %s %s -m 5 -bestn 200 -nCandidates 200 -minMatch 12 "
+- "-affineExtend 3 -nproc %d -noSplitSubreads -out %s -maxScore -1000") % \
++ r,o,e = exe(("blasr %s %s -m 5 --bestn 200 --nCandidates 200 --minMatch 12 "
++ "--affineExtend 3 --nproc %d --noSplitSubreads --out %s --maxScore -1000") % \
+ (query, target, nproc, outname))
+
+ def m5ToOvlGraph(readNames, fileName):
+--- pbsuite.orig/pbsuite/banana/Polish.py
++++ pbsuite/pbsuite/banana/Polish.py
+@@ -17,7 +17,7 @@
+ """
+ runs blasr
+ """
+- r,o,e = exe("blasr %s %s -bestn %d -affineAlign -m 5 -nproc %d -out %s" \
++ r,o,e = exe("blasr %s %s --bestn %d --affineAlign -m 5 --nproc %d --out %s" \
+ % (query, target, bestn, nproc, outName))
+
+
+--- pbsuite.orig/pbsuite/banana/OLCAssembly.py
++++ pbsuite/pbsuite/banana/OLCAssembly.py
+@@ -158,9 +158,9 @@
+ Takes all of the pools and generates their fastq and alignments in their own folder.
+ """
+ logging.info("Creating Overlap")
+- log = _exe(("blasr inputReads.fastq reference.fasta -nproc %d -m 4 "
+- "-out temp.rm4 -noSplitSubreads -useGuidedAlign -allowAdjacentIndels "#-minFrac 0.01
+- "-nCandidates 20 -bestn 15 -minMatch 8 -maxLCPLength 16 ") % (self.options.nproc) )
++ log = _exe(("blasr inputReads.fastq reference.fasta --nproc %d -m 4 "
++ "--out temp.rm4 --noSplitSubreads --useGuidedAlign --allowAdjacentIndels "#-minFrac 0.01
++ "--nCandidates 20 --bestn 15 --minMatch 8 --maxLCPLength 16 ") % (self.options.nproc) )
+ logging.debug(log)
+ logging.info("Sorting the alignments")
+ logging.debug(_exe("sort temp.rm4 > alignments.rm4"))
+--- pbsuite.orig/pbsuite/jelly/QFix.py
++++ pbsuite/pbsuite/jelly/QFix.py
+@@ -16,7 +16,7 @@
+ fout.write(">%s\n%s\n" % (i.name, i.seq))
+ fout.close()
+
+- print exe(("blasr input.fastq ref.fasta -bestn 2 -m 5 -noSplitSubreads > out.m5"))
++ print exe(("blasr input.fastq ref.fasta --bestn 2 -m 5 --noSplitSubreads > out.m5"))
+ print exe(("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
+ "DevJelly/branches/consensusDev/GetSubs.py out.m5 input.fastq"))
+ print exe(("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
+--- pbsuite.orig/pbsuite/jelly/Stages.py
++++ pbsuite/pbsuite/jelly/Stages.py
+@@ -56,7 +56,7 @@
+ logging.basicConfig( stream=sys.stderr, level=level, format=logFormat )
+ logging.info("Running blasr")
+
+- mappingTemplate = Template("blasr ${fasta} ${ref} ${sa} -m 4 -out ${outFile} ${parameters} ")
++ mappingTemplate = Template("blasr ${fasta} ${ref} ${sa} -m 4 --out ${outFile} ${parameters} ")
+ tailTemplate = Template("m4pie.py ${outFile} ${fasta} ${ref} --nproc ${nproc} -i ${extras}")
+
+ ret = []
+@@ -79,7 +79,7 @@
+ logging.warning("Output File %s already exists and will be overwritten." % (outFile))
+
+ #Build Blasr Command
+- nprocRe = re.compile("-nproc (\d+)")
++ nprocRe = re.compile("--nproc (\d+)")
+ np = nprocRe.findall(parameters + extras)
+ if len(np) == 0:
+ np = '1'
+--- pbsuite.orig/pbsuite/jelly/Assembly.py
++++ pbsuite/pbsuite/jelly/Assembly.py
+@@ -18,8 +18,8 @@
+ """
+ Simple overlapper
+ """
+- c = ("blasr %s %s -m %s -bestn %d -nCandidates %d -minMatch 8 -sdpTupleSize 6 -affineAlign "
+- "-nproc %d -noSplitSubreads -out %s -minPctIdentity 60 -minReadLength 5") % \
++ c = ("blasr %s %s -m %s --bestn %d --nCandidates %d --minMatch 8 --sdpTupleSize 6 --affineAlign "
++ "--nproc %d --noSplitSubreads --out %s --minPctIdentity 60 --minReadLength 5") % \
+ (query, target, fmt, bestn, nCandidates, nproc, outname)
+ logging.debug(c)
+ r,o,e = exe(c)
+--- pbsuite.orig/pbsuite/jelly/m4pie.py
++++ pbsuite/pbsuite/jelly/m4pie.py
+@@ -75,8 +75,8 @@
+ sa = "-sa " + ref + ".sa"
+ else:
+ sa = ""
+- cmd = ("blasr %s %s %s -nproc %d -m 4 -bestn 1 -nCandidates 20 -out %s"
+- " -minPctIdentity 75 -sdpTupleSize 6 -noSplitSubreads") \
++ cmd = ("blasr %s %s %s --nproc %d -m 4 --bestn 1 --nCandidates 20 --out %s"
++ " --minPctIdentity 75 --sdpTupleSize 6 --noSplitSubreads") \
+ % (fq, ref, sa, nproc, out)
+
+ logging.debug(cmd)
+--- pbsuite.orig/pbsuite/honey/HSpots.pyx
++++ pbsuite/pbsuite/honey/HSpots.pyx
+@@ -510,7 +510,7 @@
+
+ #map consensus to refregion
+ varSam = NamedTemporaryFile(suffix=".sam")
+- cmd = "blasr %s %s -sam -bestn 1 -affineAlign -out %s" % (conOut.name, refOut.name, varSam.name)
++ cmd = "blasr %s %s --sam --bestn 1 --affineAlign --out %s" % (conOut.name, refOut.name, varSam.name)
+ logging.debug(cmd)
+ logging.debug(exe(cmd))
+
+--- pbsuite.orig/pbsuite/honey/HSpots.py
++++ pbsuite/pbsuite/honey/HSpots.py
+@@ -200,17 +200,17 @@
+ """
+ Simple mapper
+ """
+- cmd = ("blasr %s %s %s -nproc %d -bestn 1 -out %s ") \
++ cmd = ("blasr %s %s %s --nproc %d --bestn 1 --out %s ") \
+ % (query, target, format, nproc, outname)
+ #need to figure out how to m5-pie it...maybe
+ if consensus:
+- r, o, e = exe(cmd + " -noSplitSubreads -minMatch 5 " + \
+- "-nCandidates 20 -sdpTupleSize 6 -insertion 1 -deletion 1 -bestn 1")
++ r, o, e = exe(cmd + " --noSplitSubreads --minMatch 5 " + \
++ "--nCandidates 20 --sdpTupleSize 6 --insertion 1 --deletion 1 --bestn 1")
+ else:
+- r, o, e = exe(cmd + " -maxAnchorsPerPosition 100 "
+- "-affineAlign -affineOpen 100 -affineExtend 0 "
+- "-insertion 10 -deletion 10 "
+- "-noSplitSubreads -nCandidates 20 ")
++ r, o, e = exe(cmd + " --maxAnchorsPerPosition 100 "
++ "--affineAlign --affineOpen 100 --affineExtend 0 "
++ "--insertion 10 --deletion 10 "
++ "--noSplitSubreads --nCandidates 20 ")
+
+ logging.debug("blasr - %d - %s - %s" % (r, o, e))
+
+@@ -1000,7 +1000,7 @@
+
+ #map consensus to refregion
+ varSam = NamedTemporaryFile(suffix=".sam")
+- blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name,\
++ blasr(conOut.name, refOut.name, format="--sam", outname=varSam.name,\
+ consensus=False) #-- would this help?
+ #or what if I fed it through leftalign?
+ #os.system("cp %s ." % (refOut.name))
+--- pbsuite.orig/pbsuite/honey/bakH.py
++++ pbsuite/pbsuite/honey/bakH.py
+@@ -187,7 +187,7 @@
+ """
+ Simple mapper
+ """
+- cmd = ("blasr %s %s %s -nproc %d -bestn 1 -out %s ") \
++ cmd = ("blasr %s %s %s --nproc %d --bestn 1 --out %s ") \
+ % (query, target, format, nproc, outname)
+ #need to figure out how to m5-pie it...maybe
+ if consensus:
+@@ -1013,7 +1013,7 @@
+
+ #map consensus to refregion
+ varSam = NamedTemporaryFile(suffix=".sam")
+- blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name)
++ blasr(conOut.name, refOut.name, format="--sam", outname=varSam.name)
+ #consensus=False) -- would this help?
+ #or what if I fed it through leftalign?
+
+--- pbsuite.orig/pbsuite/honey/bampie.py
++++ pbsuite/pbsuite/honey/bampie.py
+@@ -11,14 +11,14 @@
+ #Edit this string to set which parameters blasr will use by default
+ #DO NOT! Set -nproc, -bestn, -clipping, or any output (e.g. -out -m 5)
+ #Remove -noSpotSubreads if your inputs are bax.h5 files [i think]
+-BLASRPARAMS = (" -affineAlign -noSplitSubreads -nCandidates 20 "
+- "-minPctIdentity 75 -sdpTupleSize 6")
++BLASRPARAMS = (" --affineAlign --noSplitSubreads --nCandidates 20 "
++ "--minPctIdentity 75 --sdpTupleSize 6")
+ #Parameters used in the eichler experiments
+-EEBLASRPARAMS = (" -maxAnchorsPerPosition 100 -advanceExactMatches 10 "
+- "-affineAlign -affineOpen 100 -affineExtend 0 "
+- "-insertion 5 -deletion 5 -extend -maxExtendDropoff 20 "
+- "-noSplitSubreads -nCandidates 20 ")
+- #"-minPctIdentity 75 ") #didn't use this, but aybe should
++EEBLASRPARAMS = (" --maxAnchorsPerPosition 100 --advanceExactMatches 10 "
++ "--affineAlign --affineOpen 100 --affineExtend 0 "
++ "--insertion 5 --deletion 5 --extend --maxExtendDropoff 20 "
++ "--noSplitSubreads --nCandidates 20 ")
++ #"--minPctIdentity 75 ") #didn't use this, but aybe should
+ #have
+
+
+@@ -42,9 +42,9 @@
+ """
+ def checkBlasrParams(bp):
+ """
+- Ensure -bestn, -nproc, -clipping, -out are not specified
++ Ensure --bestn, --nproc, --clipping, --out are not specified
+ """
+- args = [" -bestn ", " -nproc ", " -clipping ", " -out ", " -m "]
++ args = [" --bestn ", " --nproc ", " --clipping ", " --out ", " -m "]
+ for i in args:
+ if bp.count(i):
+ logging.error("Do not specify %s through Honey.py pie" % (i))
+@@ -57,18 +57,18 @@
+ automatically search for .sa
+ """
+ if os.path.exists(refFile+".sa"):
+- sa = "-sa " + refFile + ".sa"
++ sa = "--sa " + refFile + ".sa"
+ else:
+ sa = ""
+ logging.info("Running Blasr")
+- cmd = ("blasr %s %s %s -nproc %d -bestn 1 "
+- "-sam -clipping subread -out %s ") \
++ cmd = ("blasr %s %s %s --nproc %d --bestn 1 "
++ "--sam --clipping subread --out %s ") \
+ % (inFile, refFile, sa, nproc, outFile)
+ r, o, e = exe(cmd + params)
+
+- #r,o,e = exe(("blasr %s %s %s -nproc %d -sam -bestn 1 -nCandidates 20 "
+- #"-out %s -clipping soft -minPctIdentity 75 "
+- #" -noSplitSubreads") % (fq, ref, sa, nproc, out))
++ #r,o,e = exe(("blasr %s %s %s --nproc %d --sam --bestn 1 --nCandidates 20 "
++ #"--out %s --clipping soft --minPctIdentity 75 "
++ #" --noSplitSubreads") % (fq, ref, sa, nproc, out))
+
+ if r != 0:
+ logging.error("blasr mapping failed!")
+--- pbsuite.orig/pbsuite/honey/Valid.py
++++ pbsuite/pbsuite/honey/Valid.py
+@@ -203,7 +203,7 @@
+ remaps reads to the provided reference (only setup for hg19 -- see
+ global variable reference)
+ """
+- return exe("blasr {0} {1} -sa {1}.sa -nproc 4 -out {2} -sam -bestn 1"\
++ return exe("blasr {0} {1} --sa {1}.sa --nproc 4 --out {2} --sam --bestn 1"\
+ .format(reads, reference, outName))
+
+ @exeLog
diff --git a/debian/patches/intermediate-reads-files.patch b/debian/patches/intermediate-reads-files.patch
new file mode 100644
index 0000000..c83ed9c
--- /dev/null
+++ b/debian/patches/intermediate-reads-files.patch
@@ -0,0 +1,77 @@
+Description: work around blasr 5's strict fasta/fastq header requirement
+ blasr 5 can only produce sam/bam output if the fasta/fastq header is
+ "PacBio compatible", or in the format <string>/<number>/<number>_<number>. See
+ https://github.com/PacificBiosciences/blasr/issues/312#issuecomment-269055488
+ If blasr's input requirements for fasta and fastq files can be relaxed,
+ this patch can be dropped. Alternatively, pbsuite could be revised to
+ not store results in the fastq headers, but that is a more intrusive change.
+Author: Afif Elghraoui <afif at debian.org>
+Bug: https://sourceforge.net/p/pb-jelly/tickets/5/
+Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=844034
+Last-Update: 2016-12-28
+--- pbsuite.orig/pbsuite/honey/bampie.py
++++ pbsuite/pbsuite/honey/bampie.py
+@@ -125,8 +125,8 @@
+
+ maq = int(read.mapq)
+ loc = mateplace + ":" + str(pos)
+- fout.write("@%s_%d%s%d%s\n%s\n+\n%s\n" % (read.qname, \
+- maq, tai, strand, loc, seq, qal))
++ fout.write("@%d%s%d%s__%s\n%s\n+\n%s\n" % \
++ (maq, tai, strand, loc, read.qname, seq, qal))
+
+ code, length = read.cigar[-1]
+ if code == 4 and length >= minLength:
+@@ -143,8 +143,8 @@
+ qal = read.qual[-length:][::-1]
+ maq = int(read.mapq)
+ loc = mateplace + ":" + str(pos)
+- fout.write("@%s_%d%s%d%s\n%s\n+\n%s\n" % (read.qname, \
+- maq, tai, strand, loc, seq, qal))
++ fout.write("@%d%s%d%s__%s\n%s\n+\n%s\n" % \
++ (maq, tai, strand, loc, read.qname, seq, qal))
+ fout.close()
+ return nreads, ntails, nmultitails
+
+@@ -163,7 +163,7 @@
+ prolog and eplog will only point to the primary and the primary will point to both
+ """
+
+- datGrab = re.compile("^(?P<rn>.*)_(?P<maq>\d+)(?P<log>[pe])(?P<strand>[01])(?P<ref>.*):(?P<pos>\d+)$")
++ datGrab = re.compile("^(?P<maq>\d+)(?P<log>[pe])(?P<strand>[01])(?P<ref>.*):(?P<pos>\d+)__(?P<rn>.*)$")
+ bout = None
+ for ibam, tbam in mappedFiles:
+ sam = pysam.Samfile(tbam, 'r')
+--- pbsuite.orig/pbsuite/jelly/m4pie.py
++++ pbsuite/pbsuite/jelly/m4pie.py
+@@ -52,8 +52,8 @@
+ ntails += 1
+ seq = reads[read.qname][:pTail]
+ shift = 0
+- fout.write(">%s_%d%s%d\n%s\n" % (read.qname, \
+- shift, 'p', len, seq))
++ fout.write(">%d%s%d__%s\n%s\n" % \
++ (shift, 'p', len, read.qname, seq))
+
+ if eTail >= minLength:
+ if hasTail:
+@@ -61,8 +61,8 @@
+ ntails += 1
+ seq = reads[read.qname][-eTail:]
+ shift = read.qend
+- fout.write(">%s_%d%s%d\n%s\n" % (read.qname, \
+- shift, 'e', len, seq))
++ fout.write(">%d%s%d__%s\n%s\n" % \
++ (shift, 'e', len, read.qname, seq))
+
+ fout.close()
+ return nreads, ntails, nmultitails
+@@ -105,7 +105,7 @@
+
+ prolog and eplog will only point to the primary and the primary will point to both
+ """
+- datGrab = re.compile("^(?P<rn>.*)_(?P<shift>\d+)(?P<log>[pe])(?P<length>\d+)$")
++ datGrab = re.compile("^(?P<shift>\d+)(?P<log>[pe])(?P<length>\d+)__(?P<rn>.*)$")
+
+ aligns = M4File(tailMapFn)
+ mode = 'a' if inplace else 'w'
diff --git a/debian/patches/series b/debian/patches/series
index 3435ec7..a46d2d0 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -3,3 +3,5 @@ intervaltree-import-statement.patch
rm-intervaltree-dependency.patch
fix-example.patch
fix-shebang-lines.patch
+blasr-5.patch
+intermediate-reads-files.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbsuite.git
More information about the debian-med-commit
mailing list