[med-svn] [Git][med-team/bbmap][master] 3 commits: routine-update: New upstream version

Dylan Aïssi gitlab at salsa.debian.org
Thu May 14 10:40:26 BST 2020



Dylan Aïssi pushed to branch master at Debian Med / bbmap


Commits:
425a5e0c by Dylan Aïssi at 2020-05-14T11:29:56+02:00
routine-update: New upstream version

- - - - -
52393578 by Dylan Aïssi at 2020-05-14T11:29:57+02:00
New upstream version 38.84+dfsg
- - - - -
427f9b40 by Dylan Aïssi at 2020-05-14T11:30:10+02:00
Update upstream source from tag 'upstream/38.84+dfsg'

Update to upstream version '38.84+dfsg'
with Debian dir 46e2acfa789d9d38932367b7d2aa072287318f1f
- - - - -


25 changed files:

- README.md
- bbduk.sh
- current/dna/AminoAcid.java
- current/icecream/IceCreamFinder.java
- current/jgi/BBDuk.java
- current/jgi/CoveragePileup.java
- current/jgi/TestFormat.java
- current/shared/Parser.java
- current/shared/ReadStats.java
- current/shared/Shared.java
- current/shared/Tools.java
- current/stream/FASTQ.java
- current/structures/EntropyTracker.java
- debian/changelog
- docs/changelog.txt
- icecreamfinder.sh
- icecreammaker.sh
- pileup.sh
- pipelines/covid/makeSummary.sh
- pipelines/covid/processCorona.sh
- − pipelines/covid/processCoronaR1.sh
- − pipelines/covid/processCoronaSE.sh
- pipelines/covid/processCoronaWrapper.sh
- pipelines/covid/readme.txt
- pipelines/covid/recal.sh


Changes:

=====================================
README.md
=====================================
@@ -3,4 +3,4 @@
 # Language: Java, Bash
 # Information about documentation is in /docs/readme.txt.
 
-# Version 38.82
+# Version 38.84


=====================================
bbduk.sh
=====================================
@@ -81,6 +81,7 @@ bqhist=<file>       Quality histogram designed for box plots.
 lhist=<file>        Read length histogram.
 phist=<file>        Polymer length histogram.
 gchist=<file>       Read GC content histogram.
+enthist=<file>      Read entropy histogram.
 ihist=<file>        Insert size histogram, for paired reads in mapped sam.
 gcbins=100          Number gchist bins.  Set to 'auto' to use read length.
 maxhistlen=6000     Set an upper bound for histogram lengths; higher uses 


=====================================
current/dna/AminoAcid.java
=====================================
@@ -805,7 +805,13 @@ public final class AminoAcid {
 		baseToComplementNumber0['T']=baseToComplementNumber0['t']=0;
 		baseToComplementNumber0['U']=baseToComplementNumber0['u']=0;
 		
-		Arrays.fill(baseToComplementExtended, (byte)-1);
+		//Invalid symbols are unchanged.
+		//This prevents crashes from -1 being out of bounds, and allows
+		//consecutive rcomp operations to restore the original sequence.
+		for(int i=0; i<baseToComplementExtended.length; i++){
+			baseToComplementExtended[i]=(byte)i;
+		}
+//		Arrays.fill(baseToComplementExtended, (byte)-1);
 		for(int i=0; i<numberToBaseExtended.length; i++){
 			char x=(char)numberToBaseExtended[i];
 			char x2=(char)numberToComplementaryBaseExtended[i];


=====================================
current/icecream/IceCreamFinder.java
=====================================
@@ -31,6 +31,7 @@ import stream.Read;
 import stream.SamLine;
 import stream.SamReadStreamer;
 import structures.ByteBuilder;
+import structures.EntropyTracker;
 import structures.IntList;
 import structures.ListNum;
 
@@ -86,9 +87,10 @@ public final class IceCreamFinder {
 		FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
 		SamLine.SET_FROM_OK=true;
 		Shared.setBufferData(1000000);
-		Shared.FASTA_WRAP=511;
+//		Shared.FASTA_WRAP=511;
 		Data.USE_SAMBAMBA=false;//Sambamba changes PacBio headers.
 		Read.CHANGE_QUALITY=false;
+		EntropyTracker.defaultK=3;
 		
 		{//Parse the arguments
 			final Parser parser=parse(args);
@@ -213,7 +215,7 @@ public final class IceCreamFinder {
 				minLengthAfterTrimming=Integer.parseInt(b);
 			}else if(a.equals("realign")){
 				realign=Tools.parseBoolean(b);
-			}else if(a.equals("aligninverse") || a.equals("alignrc")){
+			}else if(a.equals("aligninverse") || a.equals("alignrc") || a.equals("findicecream")){
 				alignRC=Tools.parseBoolean(b);
 			}else if(a.equals("alignadapter")){
 				alignAdapter=Tools.parseBoolean(b);
@@ -283,6 +285,22 @@ public final class IceCreamFinder {
 				trimPolyA=Tools.parseBoolean(b);
 			}else if(PolymerTrimmer.parse(arg, a, b)){
 				//do nothing
+			}else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
+				if(b==null || Character.isLetter(b.charAt(0))){
+					if(Tools.parseBoolean(b)){
+						entropyCutoff=0.55f;
+					}else{
+						entropyCutoff=-1;
+					}
+				}else{
+					entropyCutoff=Float.parseFloat(b);
+				}
+			}else if(a.equals("entropyblock") || a.equals("entropylength") || a.equals("entropylen") || a.equals("entlen")){
+				entropyLength=Tools.parseIntKMG(b);
+			}else if(a.equals("entropyfraction") || a.equals("entfraction")){
+				entropyFraction=Float.parseFloat(b);
+			}else if(a.equals("monomerfraction") || a.equals("maxmonomerfraction") || a.equals("mmf")){
+				maxMonomerFraction=Float.parseFloat(b);
 			}else if(a.equals("parse_flag_goes_here")){
 				long fake_variable=Tools.parseKMG(b);
 				//Set a variable here
@@ -423,6 +441,9 @@ public final class IceCreamFinder {
 		bb.appendln(Tools.numberPercent("Absent Adapter:", missingAdapterZMWs, missingAdapterZMWs*100.0/(ZMWs), 3, 8));
 		bb.appendln(Tools.numberPercent("Hidden Adapter:", hiddenAdapterZMWs, hiddenAdapterZMWs*100.0/(ZMWs), 3, 8));
 		bb.appendln(Tools.numberPercent("Ambiguous IR:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 3, 8));
+//		bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyReads, lowEntropyReads*100.0/(readsProcessed), 3, 8));
+		bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyZMWs, lowEntropyZMWs*100.0/(ZMWs), 3, 8));
+		
 		bb.appendln(Tools.number("Avg Score Ratio:", iceCreamRatio/ratiosCounted, 3, 8));
 		bb.appendln(Tools.number("Score Cutoff:", minRatio2, 3, 8));
 		
@@ -486,11 +507,15 @@ public final class IceCreamFinder {
 		jo.add("Absent_Adapter_Pct", missingAdapterZMWs*100.0/(ZMWs));
 		jo.add("Hidden_Adapter", hiddenAdapterZMWs);
 		jo.add("Hidden_Adapter_Pct", hiddenAdapterZMWs*100.0/(ZMWs));
+//		jo.add("Low_Entropy", lowEntropyReads);
+//		jo.add("Low_Entropy_Pct", lowEntropyReads*100.0/(readsProcessed));
+		jo.add("Low_Entropy", lowEntropyZMWs);
+		jo.add("Low_Entropy_Pct", lowEntropyZMWs*100.0/(ZMWs));
 		jo.add("Ambiguous_Inverted_Repeat", ambiguousZMWs);
 		jo.add("Ambiguous_Inverted_Repeat_Pct", ambiguousZMWs*100.0/(ZMWs));
 		jo.add("Avg_Score_Ratio_IR", iceCreamRatio/ratiosCounted);
 		jo.add("Score_Cutoff_IR", minRatio2);
-
+		
 		jo.add("Alignment_Iterations_IR", alignmentIters);
 		jo.add("Short_Alignment_Iterations_IR", alignmentItersShort);
 		
@@ -770,6 +795,9 @@ public final class IceCreamFinder {
 			ratiosCounted+=pt.ratiosCountedT;
 			missingAdapterZMWs+=pt.missingAdapterZMWsT;
 			hiddenAdapterZMWs+=pt.hiddenAdapterZMWsT;
+			lowEntropyZMWs+=pt.lowEntropyZMWsT;
+			lowEntropyReads+=pt.lowEntropyReadsT;
+			
 			basesTrimmed+=pt.basesTrimmedT;
 			readsTrimmed+=pt.readsTrimmedT;
 			
@@ -820,8 +848,7 @@ public final class IceCreamFinder {
 	/*----------------         Inner Classes        ----------------*/
 	/*--------------------------------------------------------------*/
 	
-	/** This class is static to prevent accidental writing to shared variables.
-	 * It is safe to remove the static modifier. */
+	/** Processes reads. */
 	private class ProcessThread extends Thread {
 		
 		//Constructor
@@ -835,6 +862,12 @@ public final class IceCreamFinder {
 			
 			Arrays.fill(tipBufferLeft, (byte)'N');
 			Arrays.fill(tipBufferRight, (byte)'N');
+			
+			if(entropyCutoff>=0){
+				eTracker=new EntropyTracker(false, entropyCutoff, true);
+			}else{
+				eTracker=null;
+			}
 		}
 		
 		//Called by start()
@@ -874,6 +907,24 @@ public final class IceCreamFinder {
 			return median;
 		}
 		
+		int flagLowEntropyReads(final ArrayList<Read> reads, final float minEnt, 
+				final int minLen0, final float minFract){
+			int found=0;
+			for(Read r : reads){
+				if(!r.discarded()){
+					int minLen=Tools.min((int)(r.length()*minFract), minLen0);
+					int maxBlock=eTracker.longestLowEntropyBlock(r.bases, false, maxMonomerFraction);
+					if(maxBlock>=minLen){
+						r.setDiscarded(true);
+						r.setJunk(true);
+						found++;
+//						System.err.println(r.toFasta());
+					}
+				}
+			}
+			return found;
+		}
+		
 		int flagLongReads(final ArrayList<Read> reads, int median){
 			int found=0;
 			for(Read r : reads){
@@ -886,6 +937,7 @@ public final class IceCreamFinder {
 			return found;
 		}
 		
+		/** Each list is presumed to be all reads from a ZMW, in order */
 		void processList(final ArrayList<Read> reads){
 			long numBases=0;
 			
@@ -916,7 +968,7 @@ public final class IceCreamFinder {
 						trimmed+=trimRead(r, 80);
 					}
 					
-					if(trimReads){
+					if(trimReads && adapter!=null){
 						int leftAdapterBases=alignLeftTipAdapter(r);
 						int rightAdapterBases=alignRightTipAdapter(r);
 						if(leftAdapterBases+rightAdapterBases>0){
@@ -935,6 +987,7 @@ public final class IceCreamFinder {
 						readsTrimmedT++;
 					}
 					
+					//TODO: Note again, removing intermediate reads messes up forward-rev ordering
 					if(r.length()<minLengthAfterTrimming){//Discard short trash
 						reads.set(i, null);
 						removed++;
@@ -945,6 +998,18 @@ public final class IceCreamFinder {
 				}
 			}
 			
+			if(entropyCutoff>0){
+				int bad=flagLowEntropyReads(reads, entropyCutoff, entropyLength, entropyFraction);
+				if(bad>0){
+					lowEntropyZMWsT++;
+					lowEntropyReadsT+=bad;
+					if(bad>=reads.size()){
+						if(!reads.isEmpty()){outputReads(reads, null);}
+						return; //No point in continuing...
+					}
+				}
+			}
+			
 			if(reads.isEmpty()){return;}
 
 			Read sample=null;//Read that will be searched for inverted repeat, typically median length
@@ -1079,6 +1144,8 @@ public final class IceCreamFinder {
 			if(!reads.isEmpty()){outputReads(reads, junctions);}
 		}
 		
+		//TODO: Now that I think about it, the order of the reads is important.
+		//Since they go forward-rev-forward-rev it's imprudent to discard inner reads.
 		private void removeShortTrash(ArrayList<Read> reads) {
 			int removed=0;
 			for(int i=0; i<reads.size(); i++){
@@ -1286,6 +1353,7 @@ public final class IceCreamFinder {
 				boolean trueIceCream=(parseCustom ? ReadBuilder.isIceCream(r.id) : false);
 				trueIceCreamReads+=(trueIceCream ? 1 : 0);
 				if(r.discarded() || sendAllToDiscarded){
+//					assert(false);
 					if(bad!=null){bad.add(r);}
 					if(trueIceCream){truePositiveReadsT++;}
 					else{falsePositiveReadsT++;}
@@ -1723,6 +1791,9 @@ public final class IceCreamFinder {
 		/** Number of junctions detected in IR reads by this thread */
 		protected long junctionsOutT=0;
 		
+		protected long lowEntropyZMWsT=0;
+		protected long lowEntropyReadsT=0;
+		
 		/** True only if this thread has completed successfully */
 		boolean success=false;
 		
@@ -1745,10 +1816,12 @@ public final class IceCreamFinder {
 		final FlatAligner fla=new FlatAligner();
 //		final MultiStateAligner9PacBioAdapter msa=alignAdapter ? new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS) : null;
 		/** For slowly aligning adapter sequence sectionwise */
-		final SingleStateAlignerPacBioAdapter ssa=alignAdapter ? new SingleStateAlignerPacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS, adapter.length) : null;
+		final SingleStateAlignerPacBioAdapter ssa=(alignAdapter || trimReads) ? new SingleStateAlignerPacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS, adapter.length) : null;
 		/** Alternate scoring for slow adapter alignment */
-		final MultiStateAligner9PacBioAdapter2 msa2=alignAdapter ? new MultiStateAligner9PacBioAdapter2() : null;
+		final MultiStateAligner9PacBioAdapter2 msa2=(alignAdapter || trimReads) ? new MultiStateAligner9PacBioAdapter2() : null;
 
+		final EntropyTracker eTracker;
+		
 		final long[] adapterScoresT=new long[201];
 		final long[] repeatScoresT=new long[201];
 		final byte[] tipBufferLeft=new byte[adapterTipLen+adapterTipPad];
@@ -1912,6 +1985,9 @@ public final class IceCreamFinder {
 	protected long basesTrimmed=0;
 	protected long readsTrimmed=0;
 	
+	protected long lowEntropyZMWs=0;
+	protected long lowEntropyReads=0;
+	
 	/** Total ZMWs observed */
 	protected long ZMWs=0;
 	
@@ -1953,6 +2029,20 @@ public final class IceCreamFinder {
 	
 	boolean trimPolyA=false;
 	
+	/*--------------------------------------------------------------*/
+	/*----------------        Entropy Fields        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Minimum entropy to be considered "complex", on a scale of 0-1 */
+	float entropyCutoff=-1; //suggested 0.55
+	/** Minimum length of a low-entropy block to fail a read */
+	int entropyLength=350;
+	/** Minimum length of a low-entropy block as a fraction of read length;
+	 * the minimum of the two will be used */
+	float entropyFraction=0.5f;
+	
+	float maxMonomerFraction=0.74f; //Suggested...  0.74
+	
 	/*--------------------------------------------------------------*/
 	/*----------------         Final Fields         ----------------*/
 	/*--------------------------------------------------------------*/


=====================================
current/jgi/BBDuk.java
=====================================
@@ -146,8 +146,6 @@ public class BBDuk {
 		float minBaseFrequency_=0;
 		float minKmerFraction_=0;
 		float minCoveredFraction_=0;
-		boolean setEntropyK=false;
-		boolean setEntropyWindow=false;
 		boolean setk=false;
 		
 		scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
@@ -401,16 +399,10 @@ public class BBDuk {
 				initialSize=Tools.parseIntKMG(b);
 			}else if(a.equals("dump")){
 				dump=b;
-			}else if(a.equals("entropyk") || a.equals("ek")){
-				entropyK=Integer.parseInt(b);
-				setEntropyK=true;
-			}else if(a.equals("entropywindow") || a.equals("ew")){
-				entropyWindowBases=Integer.parseInt(b);
-				setEntropyWindow=true;
 			}else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
 				entropyCutoff=Float.parseFloat(b);
 			}else if(a.equals("verifyentropy")){
-				verifyEntropy=EntropyTracker.verify=Tools.parseBoolean(b);
+				EntropyTracker.verify=Tools.parseBoolean(b);
 //			}else if(a.equals("entropytracker") || a.equals("usetracker")){
 //				useEntropyTracker=Tools.parseBoolean(b);
 			}else if(a.equals("entropymask") || a.equals("maskentropy")){
@@ -735,6 +727,7 @@ public class BBDuk {
 		MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
 		MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
 		MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
+		MAKE_ENTROPYHIST=ReadStats.COLLECT_ENTROPY_STATS;
 		MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
 		MAKE_IHIST=ReadStats.COLLECT_INSERT_STATS;
 		
@@ -830,8 +823,6 @@ public class BBDuk {
 			shift2=shift-bitsPerBase;
 			mask=(shift>63 ? -1L : ~((-1L)<<shift));
 			kmask=lengthMasks[k];
-			entropyK=(setEntropyK ? entropyK : amino ? 2 : 5);
-			entropyWindowBases=(setEntropyWindow ? entropyWindowBases : amino ? 25 : 50);
 		}
 		
 		minKmerFraction=Tools.max(minKmerFraction_, 0);
@@ -997,7 +988,7 @@ public class BBDuk {
 		//Initialize entropy
 		calcEntropy=(entropyCutoff>=0 || entropyMark);
 		if(calcEntropy){
-			assert(entropyWindowBases>0 && (entropyMark || (entropyCutoff>=0 && entropyCutoff<=1)));
+			assert(EntropyTracker.defaultWindowBases>0 && (entropyMark || (entropyCutoff>=0 && entropyCutoff<=1)));
 		}
 		assert(calcEntropy || (!entropyMask && !entropyMark && entropyTrim==0)) : "Entropy masking/trimming operations require the entropy flag to be set.";
 		
@@ -2412,8 +2403,7 @@ public class BBDuk {
 			}
 			
 			if(calcEntropy){
-				eTrackerT=new EntropyTracker(entropyK, entropyWindowBases, amino, 
-						Tools.max(0, entropyCutoff), entropyHighpass);
+				eTrackerT=new EntropyTracker(amino, Tools.max(0, entropyCutoff), entropyHighpass);
 			}else{
 				eTrackerT=null;
 			}
@@ -3156,6 +3146,7 @@ public class BBDuk {
 				if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
 				if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
 				if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
+				if(MAKE_ENTROPYHIST){readstats.addToEntropyHistogram(r1);}
 				if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
 				
 				if(MAKE_IHIST){
@@ -4543,17 +4534,11 @@ public class BBDuk {
 	/*--------------------------------------------------------------*/
 	/*----------------        Entropy Fields        ----------------*/
 	/*--------------------------------------------------------------*/
-	
-	/** Kmer length for entropy calculation */
-	int entropyK=5;
-	/** Window length for entropy calculation */
-	int entropyWindowBases=50;
+
 	/** Minimum entropy to be considered "complex", on a scale of 0-1 */
 	float entropyCutoff=-1;
 	/** Mask entropy with a highpass filter */
 	boolean entropyHighpass=true;
-	/** Verify consistency of related data structures (slow) */
-	private boolean verifyEntropy=false;
 	
 	/** Change the quality scores to be proportional to the entropy */
 	boolean entropyMark=false;
@@ -4905,6 +4890,7 @@ public class BBDuk {
 	final boolean MAKE_INDELHIST;
 	final boolean MAKE_LHIST;
 	final boolean MAKE_GCHIST;
+	final boolean MAKE_ENTROPYHIST;
 	final boolean MAKE_IDHIST;
 	
 	final boolean MAKE_IHIST;


=====================================
current/jgi/CoveragePileup.java
=====================================
@@ -308,6 +308,10 @@ public class CoveragePileup {
 			assert(covstats==null || covstats.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
 		}
 		
+		if(!Tools.testInputFiles(false, true, in1, in2, reference)){
+			throw new RuntimeException("\nCan't read some input files.\n");  
+		}
+		
 		if(!Tools.testOutputFiles(overwrite, append, false, basecov, bincov, normcov, normcovOverall, histogram, covstats, outrpkm)){
 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+
 					basecov+", "+bincov+", "+normcov+", "+normcovOverall+", "+histogram+", "+covstats+", "+outrpkm+"\n");


=====================================
current/jgi/TestFormat.java
=====================================
@@ -73,6 +73,7 @@ public class TestFormat {
 		Read.NULLIFY_BROKEN_QUALITY=true;
 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
 		SketchObject.defaultParams.minProb=0;
+		FileFormat.PRINT_WARNING=false;
 		
 		Parser parser=new Parser();
 		for(int i=0; i<args.length; i++){
@@ -219,10 +220,14 @@ public class TestFormat {
 				}else if(ff.vcf()){
 					variant=true;
 					loadVcf(ff);
+				}else{
+					System.err.println("Does not seem to be a sequence or variant format: "+ff.rawExtension());
 				}
 			}
+//			System.err.println(ff);
 		}
 		
+		
 		if(sequence){
 			printSequenceResults();
 		}else if(variant){


=====================================
current/shared/Parser.java
=====================================
@@ -29,6 +29,7 @@ import stream.ReadStreamByteWriter;
 import stream.ReadStreamWriter;
 import stream.SamLine;
 import stream.SamStreamer;
+import structures.EntropyTracker;
 import structures.IntList;
 import tax.TaxTree;
 import var2.CallVariants;
@@ -725,6 +726,14 @@ public class Parser {
 			Shared.awsServers=!Tools.parseBoolean(b);
 		}
 		
+		else if(a.equals("entropyk") || a.equals("ek")){
+			EntropyTracker.defaultK=Integer.parseInt(b);
+			EntropyTracker.setDefaultK=true;
+		}else if(a.equals("entropywindow") || a.equals("ew")){
+			EntropyTracker.defaultWindowBases=Integer.parseInt(b);
+			EntropyTracker.setDefaultWindow=true;
+		}
+		
 		else{
 			return false;
 		}
@@ -850,6 +859,18 @@ public class Parser {
 			}
 		}else if(a.equals("gcchart") || a.equals("gcplot")){
 			ReadStats.GC_PLOT_X=Tools.parseBoolean(b);
+		}else if(a.equals("entropyhistogram") || a.equals("entropyhist") || a.equals("enhist") || a.equals("enthist")){
+			ReadStats.ENTROPY_HIST_FILE=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+			ReadStats.COLLECT_ENTROPY_STATS=(ReadStats.ENTROPY_HIST_FILE!=null);
+			if(ReadStats.COLLECT_ENTROPY_STATS){System.err.println("Set entropy histogram output to "+ReadStats.ENTROPY_HIST_FILE);}
+		}else if(a.equals("entropybins") || a.equals("entropyhistbins") || a.equals("entbins") || a.equals("enthistbins")){
+			if("auto".equalsIgnoreCase(b)){
+				ReadStats.ENTROPY_BINS=1000;
+//				ReadStats.ENTROPY_BINS_AUTO=true;
+			}else{
+				ReadStats.ENTROPY_BINS=Integer.parseInt(b);
+//				ReadStats.ENTROPY_BINS_AUTO=false;
+			}
 		}else if(a.equals("timehistogram") || a.equals("thist")){
 			ReadStats.TIME_HIST_FILE=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
 			ReadStats.COLLECT_TIME_STATS=(ReadStats.TIME_HIST_FILE!=null);


=====================================
current/shared/ReadStats.java
=====================================
@@ -11,6 +11,7 @@ import fileIO.TextStreamWriter;
 import stream.Read;
 import stream.SamLine;
 import structures.ByteBuilder;
+import structures.EntropyTracker;
 import structures.LongList;
 
 
@@ -120,6 +121,14 @@ public class ReadStats {
 			gcHist=null;
 		}
 		
+		if(COLLECT_ENTROPY_STATS){
+			entropyHist=new long[ENTROPY_BINS+1];
+			eTracker=new EntropyTracker(Shared.AMINO_IN, 0, true);
+		}else{
+			entropyHist=null;
+			eTracker=null;
+		}
+		
 		if(COLLECT_ERROR_STATS){
 			errorHist=new LongList(100);
 		}else{
@@ -250,6 +259,12 @@ public class ReadStats {
 				}
 			}
 			
+			if(COLLECT_ENTROPY_STATS){
+				for(int i=0; i<rs.entropyHist.length; i++){
+					x.entropyHist[i]+=rs.entropyHist[i];
+				}
+			}
+			
 			if(COLLECT_IDENTITY_STATS){
 				for(int i=0; i<rs.idHist.length; i++){
 					x.idHist[i]+=rs.idHist[i];
@@ -443,6 +458,32 @@ public class ReadStats {
 		gcMaxReadLen=Tools.max(len, gcMaxReadLen);
 	}
 	
+	public void addToEntropyHistogram(final Read r1){
+		if(r1==null){return;}
+		final Read r2=r1.mate;
+		final int len1=r1.length(), len2=r1.mateLength();
+		
+		final float entropy1=(len1>0 ? eTracker.averageEntropy(r1.bases, false) : -1);
+		final float entropy2=(len2>0 ? eTracker.averageEntropy(r2.bases, false) : -1);
+		if(/* usePairEntropy */ false){
+			final float entropy;
+			if(r2==null){
+				entropy=entropy1;
+			}else{
+				entropy=(entropy1*len1+entropy2*len2)/(len1+len2);
+			}
+			addToEntropyHistogram(entropy, len1+len2);
+		}else{
+			addToEntropyHistogram(entropy1, len1);
+			addToEntropyHistogram(entropy2, len2);
+		}
+	}
+	
+	private void addToEntropyHistogram(final float entropy, final int len){
+		if(entropy<0 || len<1){return;}
+		entropyHist[Tools.min(ENTROPY_BINS, (int)(entropy*(ENTROPY_BINS+1)))]++;
+	}
+	
 	public void addToIdentityHistogram(final Read r){
 		if(r==null){return;}
 		addToIdentityHistogram(r, 0);
@@ -748,7 +789,7 @@ public class ReadStats {
 		return Tools.testOutputFiles(overwrite, append, allowDuplicates,
 				AVG_QUAL_HIST_FILE, QUAL_HIST_FILE, BQUAL_HIST_FILE, BQUAL_HIST_OVERALL_FILE, QUAL_COUNT_HIST_FILE,
 				MATCH_HIST_FILE, INSERT_HIST_FILE, BASE_HIST_FILE, QUAL_ACCURACY_FILE, INDEL_HIST_FILE, ERROR_HIST_FILE, LENGTH_HIST_FILE,
-				GC_HIST_FILE, IDENTITY_HIST_FILE, TIME_HIST_FILE);
+				GC_HIST_FILE, ENTROPY_HIST_FILE, IDENTITY_HIST_FILE, TIME_HIST_FILE);
 	}
 	
 	public static boolean writeAll(){
@@ -770,6 +811,7 @@ public class ReadStats {
 			if(ERROR_HIST_FILE!=null){rs.writeErrorToFile(ERROR_HIST_FILE);}
 			if(LENGTH_HIST_FILE!=null){rs.writeLengthToFile(LENGTH_HIST_FILE);}
 			if(GC_HIST_FILE!=null){rs.writeGCToFile(GC_HIST_FILE, true);}
+			if(ENTROPY_HIST_FILE!=null){rs.writeEntropyToFile(ENTROPY_HIST_FILE, true);}
 			if(IDENTITY_HIST_FILE!=null){rs.writeIdentityToFile(IDENTITY_HIST_FILE, true);}
 			if(TIME_HIST_FILE!=null){rs.writeTimeToFile(TIME_HIST_FILE);}
 			
@@ -1354,6 +1396,46 @@ public class ReadStats {
 		errorState|=bsw.errorState;
 	}
 	
+	public void writeEntropyToFile(String fname, boolean printZeros){
+		final long[] hist=entropyHist;
+		
+		final int bins=hist.length;
+		final double mult=1.0/Tools.max(1, bins-1);
+		final long total=Tools.sum(hist);
+		final long max=Tools.max(hist);
+		final double countsPerX=Tools.max(1, ((max*1000.0)/40));
+		final double fractionMult=1.0/Tools.max(1, total);
+		long sum=0;	
+		
+		double mean=Tools.averageHistogram(hist)*mult;
+		double median=Tools.percentileHistogram(hist, 0.5)*mult;
+		double mode=Tools.calcModeHistogram(hist)*mult;
+		double stdev=Tools.standardDeviationHistogram(hist)*mult;
+		
+		ByteBuilder bb=new ByteBuilder(256);
+		bb.append("#Mean\t").append(mean, 6).nl();
+		bb.append("#Median\t").append(median, 6).nl();
+		bb.append("#Mode\t").append(mode, 6).nl();
+		bb.append("#STDev\t").append(stdev, 6).nl();
+		bb.append("#Value\tCount\n");
+		
+		ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
+		bsw.start();
+		bsw.print(bb);
+		
+		for(int i=0; i<bins; i++){
+			long x=hist[i];
+			sum+=x;
+			if(x>0 || printZeros){
+				bb.clear().append(i*mult, 4).tab().append(x).nl();
+				bsw.print(bb);
+			}
+		}
+		bsw.poison();
+		bsw.waitForFinish();
+		errorState|=bsw.errorState;
+	}
+	
 	public void writeIdentityToFile(String fname, boolean printZeros){
 		final long[] hist, histb;
 		if(ID_BINS_AUTO && idMaxReadLen+1<idHist.length){
@@ -1420,8 +1502,10 @@ public class ReadStats {
 	public final long[] qualSub;
 	public final long[] qualIns;
 	public final long[] qualDel;
-	
+
 	public final long[] gcHist;
+	public final long[] entropyHist;
+	public final EntropyTracker eTracker;
 	public final long[] idHist;
 	public final long[] idBaseHist;
 	private int gcMaxReadLen=1;
@@ -1453,16 +1537,22 @@ public class ReadStats {
 	public static final int MAXDELLEN=1000;
 	public static final int MAXDELLEN2=1000000;
 	public static final int DEL_BIN=100;
-	public static int GC_BINS=100;
 	public static int ID_BINS=100;
 	public static boolean ID_BINS_AUTO=false;
+	public static int GC_BINS=100;
 	public static boolean GC_BINS_AUTO=false;
 	public static boolean GC_PLOT_X=false;
+	public static int ENTROPY_BINS=1000;
 
 	public static double GCMean;
 	public static double GCMedian;
 	public static double GCMode;
 	public static double GCSTDev;
+
+//	public static double entropyMean;
+//	public static double entropyMedian;
+//	public static double entropyMode;
+//	public static double entropySTDev;
 	
 	public boolean errorState=false;
 	
@@ -1478,14 +1568,16 @@ public class ReadStats {
 	public static boolean COLLECT_BASE_STATS=false;
 	public static boolean COLLECT_INDEL_STATS=false;
 	public static boolean COLLECT_GC_STATS=false;
+	public static boolean COLLECT_ENTROPY_STATS=false;
 	public static boolean COLLECT_ERROR_STATS=false;
 	public static boolean COLLECT_LENGTH_STATS=false;
 	public static boolean COLLECT_IDENTITY_STATS=false;
 	public static boolean COLLECT_TIME_STATS=false;
 	
 	public static boolean collectingStats(){
-		return COLLECT_QUALITY_STATS || COLLECT_QUALITY_ACCURACY || COLLECT_MATCH_STATS || COLLECT_INSERT_STATS || COLLECT_BASE_STATS
-				|| COLLECT_INDEL_STATS || COLLECT_GC_STATS || COLLECT_ERROR_STATS || COLLECT_LENGTH_STATS || COLLECT_IDENTITY_STATS || COLLECT_TIME_STATS;
+		return COLLECT_QUALITY_STATS || COLLECT_QUALITY_ACCURACY || COLLECT_MATCH_STATS || COLLECT_INSERT_STATS
+				|| COLLECT_BASE_STATS || COLLECT_INDEL_STATS || COLLECT_GC_STATS || COLLECT_ENTROPY_STATS
+				|| COLLECT_ERROR_STATS || COLLECT_LENGTH_STATS || COLLECT_IDENTITY_STATS || COLLECT_TIME_STATS;
 	}
 	
 	public static boolean usePairGC=true;
@@ -1503,6 +1595,7 @@ public class ReadStats {
 	public static String ERROR_HIST_FILE=null;
 	public static String LENGTH_HIST_FILE=null;
 	public static String GC_HIST_FILE=null;
+	public static String ENTROPY_HIST_FILE=null;
 	public static String IDENTITY_HIST_FILE=null;
 	public static String TIME_HIST_FILE=null;
 	


=====================================
current/shared/Shared.java
=====================================
@@ -28,8 +28,8 @@ public class Shared {
 	//https://stackoverflow.com/questions/14288185/detecting-windows-or-linux
 	public static boolean LINUX=envContainsPair("OS", "nix", true) || envContainsPair("OS", "nux", true) || envContainsPair("OS", "aix", true);
 	public static boolean SOLARIS=envContainsPair("OS", "sunos", true);
-	public static boolean GENEPOOL=envContainsPair("NERSC_HOST", "genepool", false);
-	public static boolean DENOVO=envContainsPair("NERSC_HOST", "denovo", false);
+	public static boolean GENEPOOL=envContainsPair("NERSC_HOST", "genepool", false);//Probably deprecated
+	public static boolean DENOVO=envContainsPair("NERSC_HOST", "denovo", false);//Almost certainly deprecated
 	public static boolean CORI=envContainsPair("NERSC_HOST", "cori", false);
 	public static boolean NERSC=envContainsKey("NERSC_HOST");
 	public static boolean AWS=envContainsKey("EC2_HOME");
@@ -124,8 +124,8 @@ public class Shared {
 	public static final int GAPCOST=Tools.max(1, GAPLEN/64);
 	public static final byte GAPC='-';
 	
-	public static String BBMAP_VERSION_STRING="38.82";
-	public static String BBMAP_VERSION_NAME="Covid Crusade";
+	public static String BBMAP_VERSION_STRING="38.84";
+	public static String BBMAP_VERSION_NAME="Erroneous Entropy";
 	
 	public static boolean TRIM_READ_COMMENTS=false;
 	public static boolean TRIM_RNAME=false; //For mapped sam reads


=====================================
current/shared/Tools.java
=====================================
@@ -3409,6 +3409,17 @@ public final class Tools {
 		return maxIndex;
 	}
 
+	public static final int max(short[] array){return array[maxIndex(array)];}
+	
+	public static final int maxIndex(short[] array){
+		short max=array[0];
+		int maxIndex=0;
+		for(int i=1; i<array.length; i++){
+			if(array[i]>max){max=array[i];maxIndex=i;}
+		}
+		return maxIndex;
+	}
+
 	public static final int max(int[] array){return array[maxIndex(array)];}
 	
 	public static final int maxIndex(int[] array){


=====================================
current/stream/FASTQ.java
=====================================
@@ -135,7 +135,8 @@ public class FASTQ {
 		if(FORCE_INTERLEAVED){return true;}
 		if(PARSE_CUSTOM && fname.contains("_interleaved.")){return true;}
 		
-		return testPairNames(oct.get(0), oct.get(4), allowIdentical);
+		boolean b=testPairNames(oct.get(0), oct.get(4), allowIdentical);
+		return b;
 	}
 	
 	public static boolean testInterleavedFasta(String fname, boolean allowIdentical){
@@ -244,6 +245,7 @@ public class FASTQ {
 
 		final int len1=id1.length();
 		final int len2=id2.length();
+		if(len1!=len2){return false;} //Can happen in PacBio names, but never Illumina
 		final int idxSlash1=id1.lastIndexOf('/');
 		final int idxSlash2=id2.lastIndexOf('/');
 		final int idxSpace1=id1.indexOf(' ');
@@ -251,6 +253,8 @@ public class FASTQ {
 		
 //		System.out.println("idxSlash1="+idxSlash1+", idxSlash2="+idxSlash2+", idxSpace1="+idxSpace1+", idxSpace2="+idxSpace2);
 		
+//		System.err.println("A:");
+		
 		if(idxSpace1==idxSpace2 && idxSpace1>0 && len1>=idxSpace1+3 && len2>=idxSpace2+3){
 			if(id1.charAt(idxSpace1+1)=='1' && id1.charAt(idxSpace1+2)==':' && id2.charAt(idxSpace2+1)=='2' && id2.charAt(idxSpace2+2)==':'){
 				for(int i=0; i<idxSpace1; i++){
@@ -258,6 +262,7 @@ public class FASTQ {
 						return false;
 					}
 				}
+//				System.err.println("B");
 				return true;
 			}
 		}
@@ -270,10 +275,18 @@ public class FASTQ {
 						return false;
 					}
 				}
+				//Here we try to weed out PacBio, which will differ after the last slash:
+				for(int i=idxSlash1+2; i<len1; i++){
+					if(id1.charAt(i)!=id2.charAt(i)){
+						return false;
+					}
+				}
+//				System.err.println("C");
 				return true;
 			}
 		}
-		
+
+//		System.err.println("D");
 		return (allowIdentical && id1.equals(id2));
 	}
 	


=====================================
current/structures/EntropyTracker.java
=====================================
@@ -37,6 +37,19 @@ public class EntropyTracker {
 		this(k_, window_, amino_, -1, true);
 	}
 	
+	/**
+	 * Allows the use of passes() based on entropy.
+	 * @param amino_ True for amino acid sequence, false for nucleotides.
+	 * @param cutoff_ Entropy cutoff, 0 (no entropy) to 1 (max entropy).
+	 * @param highPass_ True passes entropy of at least cutoff; false fails.
+	 */
+	public EntropyTracker(boolean amino_, float cutoff_, boolean highPass_){
+		this((setDefaultK ? defaultK : amino_ ? 2 : defaultK), 
+				(setDefaultWindow ? defaultWindowBases : amino_ ? 25 : 50),
+				amino_, cutoff_, highPass_);
+	}
+	
 	/**
 	 * Allows the use of passes() based on entropy.
 	 * @param k_ Kmer length.
@@ -65,10 +78,12 @@ public class EntropyTracker {
 		
 		entropy=makeEntropyArray(windowKmers);
 		entropyMult=-1/Math.log(windowKmers);
+		baseCountMult=1f/windowBases;
+		baseCounts=new short[4];
 		counts=new short[kmerSpace];
 		countCounts=new short[windowKmers+2];
 		countCounts[0]=(short)windowKmers;
-		bases=new byte[windowBases];
+		baseRingBuffer=new byte[windowBases];
 		
 //		assert(false) : "\namino="+amino+", windowBases="+windowBases+", windowKmers="+windowKmers+", cutoff="+entropyCutoff+", bpb="+bitsPerBase+", mask="+
 //			Integer.toBinaryString(mask)+", space="+kmerSpace+", mult="+entropyMult+"\n"+"entropy="+Arrays.toString(entropy);
@@ -145,6 +160,12 @@ public class EntropyTracker {
 	/*----------------      Entropy Calculation     ----------------*/
 	/*--------------------------------------------------------------*/
 	
+	public float calcMaxMonomerFraction(){
+		//int total=sum(baseCounts); should be window length
+		int max=Tools.max(baseCounts);
+		return baseCountMult*max;
+	}
+	
 	/**
 	 * Calculate entropy in current window.
 	 * @return Entropy in current window.
@@ -335,6 +356,73 @@ public class EntropyTracker {
 		return (float)avg;
 	}
 	
+	/**
+	 * Reports the longest block of consecutive bases in which all windows
+	 * are below the entropy cutoff and at least the (optional) monomer fraction.
+	 * @param bases Sequence to scan.
+	 * @param allowNs True to also measure windows containing undefined symbols (Ns).
+	 * @param maxMonomerFraction Windows must additionally have at least this fraction of identical bases to count as low-complexity.
+	 * @return Length in bases of the longest low-entropy block; 0 if none found.
+	 */
+	public int longestLowEntropyBlock(byte[] bases, boolean allowNs, float maxMonomerFraction){
+		//Reset the initial state
+		clear();
+		
+		//Position in sequence
+		int i=0;
+		
+		//Number of entropy measurements
+		int totalWindows=0;
+		
+		double sum=0;
+		int totalLowWindows=0;
+		int currentLowWindows=0;
+		int maxLowWindows=0;
+		
+		//Prefill the first window
+		for(final int lim=Tools.min(bases.length, windowBases); i<lim; i++){add(bases[i]);}
+		
+		//Calculate entropy for the first window.
+		//This allows entropy to pass if it is high enough even though the sequence is shorter than window length.
+		if(allowNs || ns==0){
+			totalWindows++;
+			double e=calcEntropy();
+			float mmf=calcMaxMonomerFraction();
+			sum+=e;
+			if(e<entropyCutoff && mmf>=maxMonomerFraction){
+				totalLowWindows++;
+				currentLowWindows++;
+				maxLowWindows=Tools.max(maxLowWindows, currentLowWindows);
+			}else{
+				currentLowWindows=0;
+			}
+		}
+		
+		//Calculate entropy for remaining windows
+		for(; i<bases.length; i++){
+			add(bases[i]);
+			if(allowNs || ns==0){
+				totalWindows++;
+				double e=calcEntropy();
+				float mmf=calcMaxMonomerFraction();
+				sum+=e;
+				if(e<entropyCutoff && mmf>maxMonomerFraction){
+					totalLowWindows++;
+					currentLowWindows++;
+					maxLowWindows=Tools.max(maxLowWindows, currentLowWindows);
+				}else{
+					currentLowWindows=0;
+				}
+			}
+		}
+		
+		//Calculate the average; not needed
+		double avg=(sum/(Tools.max(1, totalWindows)));
+		
+		int maxLowBlock=maxLowWindows<1 ? 0 : Tools.min(bases.length, maxLowWindows+windowBases-1);
+		return maxLowBlock;
+	}
+	
 	/**
 	 * Calculate entropy in the window and compare to the cutoff.
 	 * If Ns are important they should be handled externally with ns().
@@ -377,7 +465,7 @@ public class EntropyTracker {
 		//Test initial state
 		assert(!verify || verify()) : this;
 		
-		final byte oldBase=bases[pos]; //Leftmost base, about to be overwritten
+		final byte oldBase=baseRingBuffer[pos]; //Leftmost base, about to be overwritten
 		
 		if(verbose){System.err.println("\nAdding "+Character.toString((char)b)+
 				"; evicting "+Character.toString((char)oldBase)+"; counts="+Arrays.toString(counts)+"; countcounts="+Arrays.toString(countCounts)+", pos="+pos+", pos2="+pos2);}
@@ -386,8 +474,9 @@ public class EntropyTracker {
 		len++;
 		
 		{//Add a new rightmost base
-			bases[pos]=b;
+			baseRingBuffer[pos]=b;
 			final int n=symbolToNumber0[b];
+			baseCounts[n]++;
 			kmer=((kmer<<bitsPerBase)|n)&mask; //Generate new rightmost kmer using the new base
 			
 			//Update number of Ns in current window
@@ -435,7 +524,7 @@ public class EntropyTracker {
 		//At this point the state is inconsistent as it may have one too many kmers.
 		
 		if(pos2>=0){//Remove the leftmost base
-			final byte b2=(k>1 ? bases[pos2] : oldBase);//This is not the leftmost base, but the base to the right of the leftmost kmer
+			final byte b2=(k>1 ? baseRingBuffer[pos2] : oldBase);//This is not the leftmost base, but the base to the right of the leftmost kmer
 			final int n2=symbolToNumber0[b2];
 			//Generate old leftmost kmer using an internal base near the left end
 			kmer2=((kmer2<<bitsPerBase)|n2)&mask;
@@ -443,6 +532,7 @@ public class EntropyTracker {
 			if(verbose){System.err.println("B2: pos="+pos+", pos2="+pos2+"; b2="+Character.toString((char)b2)+"; kmer2="+kmer2);}
 
 			if(len>windowBases){//Remove a kmer, only if a base is leaving the window
+				baseCounts[n2]--;
 				//System.err.println("Removing "+kmer2);
 				
 				//Update number of Ns in current window
@@ -507,6 +597,7 @@ public class EntropyTracker {
 		currentEsum=0;
 		
 		//Clear mutable arrays.  Bases does not need to be cleared.
+		Arrays.fill(baseCounts, (short)0); //4 operations
 		Arrays.fill(counts, (short)0); //Time proportional to kmer space
 		
 		//Note - countCounts are only needed for medium speed mode.
@@ -529,6 +620,7 @@ public class EntropyTracker {
 	 * @return True.
 	 */
 	public boolean verifyClear(){
+		for(int c : baseCounts){assert(c==0) : this;}
 		for(int c : counts){assert(c==0) : this;}
 		for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0) : this;}
 		assert(kmer==0 && kmer2==0) : this;
@@ -572,7 +664,7 @@ public class EntropyTracker {
 		}
 		
 		//Count undefined symbols
-		for(byte b : bases){
+		for(byte b : baseRingBuffer){
 			if(!isFullyDefined(b)){nSum++;}
 		}
 		
@@ -636,9 +728,9 @@ public class EntropyTracker {
 	
 	/** Returns the ring buffer as a String in its correct order. */
 	public String basesToString(){
-		StringBuilder sb=new StringBuilder(bases.length);
-		for(int i=0; i<bases.length; i++){
-			byte b=(bases[(i+pos)%bases.length]);
+		StringBuilder sb=new StringBuilder(baseRingBuffer.length);
+		for(int i=0; i<baseRingBuffer.length; i++){
+			byte b=(baseRingBuffer[(i+pos)%baseRingBuffer.length]);
 			if(b==0){b='N';}
 			sb.append((char)b);
 		}
@@ -671,6 +763,10 @@ public class EntropyTracker {
 	/*----------------        Mutable Arrays        ----------------*/
 	/*--------------------------------------------------------------*/
 	
+	/** Number of times each base occurs in current window.
+	 * Equivalent to counts if k=1. */
+	private final short[] baseCounts;
+	
 	/** Number of times each kmer occurs in current window.
 	 * Indexed by the kmer's numeric value.
 	 * counts[0] stores the count of the kmer AAA, if k=3. */
@@ -683,7 +779,7 @@ public class EntropyTracker {
 	
 	/** Ring buffer of bases in current window.
 	 * Not strictly necessary, but convenient. */
-	private final byte[] bases;
+	private final byte[] baseRingBuffer;
 	
 	/*--------------------------------------------------------------*/
 	/*----------------         Final Fields         ----------------*/
@@ -714,6 +810,9 @@ public class EntropyTracker {
 	private final double entropyMult;
 	/** Array of precalculated constants */
 	private final double[] entropy;
+	
+	/** Precalculated constant equal to 1f/windowBases */
+	private final float baseCountMult;
 
 	/** Translation table yielding 0 if undefined */
 	private final byte[] symbolToNumber0;
@@ -745,6 +844,15 @@ public class EntropyTracker {
 	/*----------------        Static Fields         ----------------*/
 	/*--------------------------------------------------------------*/
 	
+	/** Kmer length for entropy calculation */
+	public static int defaultK=5;
+	public static boolean setDefaultK=false;
+	/** Window length for entropy calculation */
+	public static int defaultWindowBases=50;
+	public static boolean setDefaultWindow=false;
+//	/** Minimum entropy to be considered "complex", on a scale of 0-1 */
+//	public static float defaultCutoff=-1;
+	
 	/** Entropy calculation mode */
 	public static int speed=FAST;
 	


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+bbmap (38.84+dfsg-1) UNRELEASED; urgency=medium
+
+  * Team upload.
+  * New upstream version
+
+ -- Dylan Aïssi <daissi at debian.org>  Thu, 14 May 2020 11:29:55 +0200
+
 bbmap (38.82+dfsg-1) unstable; urgency=medium
 
   * Team upload.


=====================================
docs/changelog.txt
=====================================
@@ -835,6 +835,17 @@ CallVariants and Pileup now apply border to reads mapped to the ends of a chromos
 Filtersam no longer discards sam headers.
 Wrote A_SampleBasic to act as a template for programs that do not do stream processing.
 Tweaked some variant-calling scripts.
+38.83
+Fixed fastq interleaving detection for a rare failure with PacBio reads.
+Added entropy histogram support to BBDuk.
+Extended EntropyTracker to always track monomer counts in a window.
+Added EntropyTracker longestLowEntropyBlock function.
+Added entropy/monomer fraction filter to IceCreamFinder.
+38.84
+Removed some IceCreamFinder debug code causing a crash.
+
+TODO: Add minor allele support.
+TODO: ApplyVariants should optionally examine AF to give IUPAC codes for mixed alleles.
 
 TODO: Dedupebymapping has trouble with reads marked paired whose mate is missing.
 TODO: Examine interplay of minedist and chromosome ends. 


=====================================
icecreamfinder.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified January 28, 2020
+Last modified May 4, 2020
 
 Description:  Finds PacBio reads containing inverted repeats.
 These are candidate triangle reads (ice cream cones).
@@ -59,6 +59,7 @@ trimpolya=f     Trim terminal poly-A and poly-T sequences, for some isoseq
 minpolymer=5    Don't trim poly-A sequence shorter than this.
 polyerror=0.2   Max error rate for trimming poly-A.
 
+
 Speed and sensitivity:
 jni=f           Enable C code for higher speed and identical results.
 minratio=       Fraction of maximal alignment score to consider as matching.
@@ -72,6 +73,18 @@ adapterratio=0.18   Initial adapter detection sensitivity; affects speed.
 adapterratio2=.325  Final adapter detection sensitivity.
 minscore=-800   Exit alignment early if score drops below this.
 
+Entropy parameters (recommended setting is 'entropy=t'):
+minentropy=-1   Set to 0.4 or above to remove low-entropy reads;
+                range is 0-1, recommended value is 0.55.  0.7 is too high.
+                Negative numbers disable this function.
+entropyk=3      Kmer length for entropy calculation.
+entropylen=450  Reads with entropy below cutoff for at least this many
+                consecutive bases will be removed.
+entropyfraction=0.5     Alternative minimum length for short reads; the shorter
+                        of entropylen and entropyfraction*readlength will be used.
+entropywindow=50        Window size used for entropy calculation.
+maxmonomerfraction=0.75 (mmf) Also require this fraction of bases in each
+                        window to be the same base.
 
 Java Parameters:
 -Xmx            This will set Java's memory usage, overriding autodetection.


=====================================
icecreammaker.sh
=====================================
@@ -27,8 +27,8 @@ NOTE: "length" parameters dictate subread length (for normal reads).
 minlen=500      (minlength) Minimum length of genomic sequence molecules.
 maxlen=5000     (maxlength) Maximum length of genomic sequence molecules.
 len=            (length) Set minlen and maxlen to the same number.
-minmovie=       (minmov) Minimum length of movies.
-maxmovie=       (maxmov) Maximum length of movies.
+minmovie=500    (minmov) Minimum length of movies.
+maxmovie=40k    (maxmov) Maximum length of movies.
 movie=          (mov) Set minmov and maxmov to the same number.
 
 Ice cream parameters:


=====================================
pileup.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified April 17, 2020
+Last modified April 30, 2020
 
 Description:  Calculates per-scaffold or per-base coverage information from an unsorted sam or bam file.
 Supports SAM/BAM format for reads and FASTA for reference.


=====================================
pipelines/covid/makeSummary.sh
=====================================
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 ##Written by Brian Bushnell
-##Last modified April 8, 2020
+##Last modified May 4, 2020
 ##Description:  Summarizes all the runs of "processCorona.sh" in this directory.
 ##Then tars them if you want to send them somewhere.
 ##Should be run only after all samples are processed individually.
@@ -11,7 +11,7 @@
 callvariants.sh *_deduped_trimclip.sam.gz ref=NC_045512.fasta multisample out=allVars.vcf ow -Xmx4g usebias=f strandedcov minstrandratio=0 maf=0.6
 
 ##Make a summary of coverage at varous depth cutoffs for all libraries.
-summarizecoverage.sh *basecov.txt out=coverageSummary.txt
+summarizecoverage.sh *basecov_border5.txt out=coverageSummary.txt
 
 mkdir output
 cp *.sh output


=====================================
pipelines/covid/processCorona.sh
=====================================
@@ -1,21 +1,27 @@
 #!/bin/bash
 
 ##Written by Brian Bushnell
-##Last modified April 17, 2020
-##Description:  Calls coronavirus variants from interleaved PE amplicon data.
-##This script assumes input data is paired.
+##Last modified May 4, 2020
+##Description:  Calls SARS-CoV-2 variants from Illumina amplicon data.
+##              This script assumes input data is paired-end.
 
 ##Usage:  processCorona.sh <prefix>
 ##For example, "processCorona.sh Sample1" if the data is in Sample1.fq.gz
 
+
+##Grab the sample name from the command line.
+NAME="$1"
+
+##Specify the viral reference file.
+##NC_045512.fasta contains the SARS-CoV-2 genome, equivalent to bbmap/resources/Covid19_ref.fa
+REF="NC_045512.fasta"
+
 ##Set minimum coverage for genotype calls.
 ##Areas below this depth will be set to N in the consensus genome.
 MINCOV=3
 
-##Grab the sample name from the command line
-NAME="$1"
 
-##This line is in case the script is being re-run, to clear the old output
+##This line is in case the script is being re-run, to clear the old output.
 rm "$NAME"*.sam.gz "$NAME"*.bam "$NAME"*.bai "$NAME"*.txt "$NAME"*.fa "$NAME"*.vcf
 
 ##If data is paired in twin files, interleave it into a single file.
@@ -23,10 +29,14 @@ rm "$NAME"*.sam.gz "$NAME"*.bam "$NAME"*.bai "$NAME"*.txt "$NAME"*.fa "$NAME"*.v
 ##In this case, the files are assumed to be named "Sample1_R1.fq.gz" and "Sample1_R2.fq.gz"
 #reformat.sh in="$NAME"_R#.fq.gz out="$NAME".fq.gz
 
+##Split into Covid and non-Covid reads if this has not already been done.
+##This step can be skipped if non-Covid was already removed.
+bbduk.sh ow -Xmx1g in="$NAME".fq.gz ref="$REF" outm="$NAME"_viral.fq.gz outu="$NAME"_nonviral.fq.gz k=25
+
 ##Recalibrate quality scores prior to any trimming.
-##Requires a recalibration matrix in the working directory (see recal.sh for details).
+##*** Requires a recalibration matrix in the working directory (see recal.sh for details). ***
 ##This step is optional but useful for Illumina binned quality scores.
-bbduk.sh in="$NAME".fq.gz out="$NAME"_recal.fq.gz recalibrate -Xmx1g ow
+bbduk.sh in="$NAME"_viral.fq.gz out="$NAME"_recal.fq.gz recalibrate -Xmx1g ow
 
 ##Discover adapter sequence for this library based on read overlap.
 ##You can examine the adapters output file afterward if desired;
@@ -43,24 +53,19 @@ clumpify.sh in="$NAME"_recal.fq.gz out="$NAME"_clumped.fq.gz zl=9 dedupe s=2 pas
 ##If the prior adapter-detection step failed, use "ref=adapters"
 bbduk.sh in="$NAME"_clumped.fq.gz out="$NAME"_trimmed.fq.gz minlen=60 ktrim=r k=21 mink=9 hdist=2 hdist2=1 ref="$NAME"_adapters.fa maq=14 qtrim=r trimq=10 maxns=0 tbo tpe ow -Xmx1g
 
-##Split into Covid and non-Covid reads if this has not already been done..
-##This step can be skipped if non-Covid was already removed.
-##NC_045512.fasta is the coronavirus reference, equivalent to bbmap/resources/Covid19_ref.fa
-bbduk.sh ow -Xmx1g in="$NAME"_trimmed.fq.gz ref=NC_045512.fasta outm="$NAME"_covid.fq.gz outu="$NAME"_noncovid.fq.gz k=25
-
 ##Align reads to the reference.
-bbmap.sh ref=NC_045512.fasta in="$NAME"_trimmed.fq.gz outm="$NAME"_mapped.sam.gz nodisk local maxindel=500 -Xmx4g ow k=12
+bbmap.sh ref="$REF" in="$NAME"_trimmed.fq.gz outm="$NAME"_mapped.sam.gz nodisk local maxindel=500 -Xmx4g ow k=12
 
 ##Deduplicate based on mapping coordinates.
 ##Note that if you use single-ended amplicon data, you will lose most of your data here.
 dedupebymapping.sh in="$NAME"_mapped.sam.gz out="$NAME"_deduped.sam.gz -Xmx31g ow
 
-#Remove junk reads with unsupported unique deletions; these are often chimeric.
-filtersam.sh ref=NC_045512.fasta ow in="$NAME"_deduped.sam.gz out="$NAME"_filtered.sam.gz mbad=1 del sub=f mbv=0 -Xmx4g
+##Remove junk reads with unsupported unique deletions; these are often chimeric.
+filtersam.sh ref="$REF" ow in="$NAME"_deduped.sam.gz out="$NAME"_filtered.sam.gz mbad=1 del sub=f mbv=0 -Xmx4g
 
-#Remove junk reads with multiple unsupported unique substitutions; these are often junk, particularly on Novaseq.
-#This step is not essential but reduces noise.
-filtersam.sh ref=NC_045512.fasta ow in="$NAME"_filtered.sam.gz out="$NAME"_filtered2.sam.gz mbad=1 sub mbv=2 -Xmx4g
+##Remove junk reads with multiple unsupported unique substitutions; these are often junk, particularly on Novaseq.
+##This step is not essential but reduces noise.
+filtersam.sh ref="$REF" ow in="$NAME"_filtered.sam.gz out="$NAME"_filtered2.sam.gz mbad=1 sub mbv=2 -Xmx4g
 
 ##Trim soft-clipped bases.
 bbduk.sh in="$NAME"_filtered2.sam.gz trimclip out="$NAME"_trimclip.sam.gz -Xmx1g ow
@@ -68,10 +73,7 @@ bbduk.sh in="$NAME"_filtered2.sam.gz trimclip out="$NAME"_trimclip.sam.gz -Xmx1g
 ##Call variants from the sam files.
 ##The usebias=f/minstrandratio=0 flags are necessary due to amplicon strand bias,
 ##and should be removed if the data is exclusively shotgun/metagenomic or otherwise randomly fragmented.
-callvariants.sh in="$NAME"_trimclip.sam.gz ref=NC_045512.fasta out="$NAME"_vars.vcf -Xmx4g ow strandedcov usebias=f minstrandratio=0 maf=0.6 minreads="$MINCOV" mincov="$MINCOV" minedistmax=30 minedist=16
-
-##Calculate coverage.
-pileup.sh in="$NAME"_trimclip.sam.gz bincov="$NAME"_bincov.txt basecov="$NAME"_basecov.txt binsize=100 -Xmx4g ow
+callvariants.sh in="$NAME"_trimclip.sam.gz ref="$REF" out="$NAME"_vars.vcf -Xmx4g ow strandedcov usebias=f minstrandratio=0 maf=0.6 minreads="$MINCOV" mincov="$MINCOV" minedistmax=30 minedist=16
 
 ##Calculate reduced coverage as per CallVariants defaults (ignoring outermost 5bp of reads).
 pileup.sh in="$NAME"_trimclip.sam.gz basecov="$NAME"_basecov_border5.txt -Xmx4g ow border=5
@@ -79,12 +81,12 @@ pileup.sh in="$NAME"_trimclip.sam.gz basecov="$NAME"_basecov_border5.txt -Xmx4g
 ##Generate a mutant reference by applying the detected variants to the reference.
 ##This is essentially the reference-guided assembly of the strain.
 ##Also changes anything below depth MINCOV to N (via the mindepth flag).
-applyvariants.sh in=NC_045512.fasta out="$NAME"_genome.fa vcf="$NAME"_vars.vcf basecov="$NAME"_basecov.txt ow mindepth="$MINCOV"
+applyvariants.sh in="$REF" out="$NAME"_genome.fa vcf="$NAME"_vars.vcf basecov="$NAME"_basecov_border5.txt ow mindepth="$MINCOV"
 
 ##Make bam/bai files; requires samtools to be installed.
 ##This step is only necessary for visualization, not variant-calling.
-samtools view -bShu "$NAME"_trimclip.sam.gz | samtools sort -m 2G -@ 3 - -o "$NAME"_sorted.bam
-samtools index "$NAME"_sorted.bam
+#samtools view -bShu "$NAME"_trimclip.sam.gz | samtools sort -m 2G -@ 3 - -o "$NAME"_sorted.bam
+#samtools index "$NAME"_sorted.bam
 
-##At this point, "$NAME"_sorted.bam, "$NAME"_sorted.bam.bai, NC_045512.fasta, and "$NAME"_vars.vcf can be used for visualization in IGV.
+##At this point, "$NAME"_sorted.bam, "$NAME"_sorted.bam.bai, "$REF", and "$NAME"_vars.vcf can be used for visualization in IGV.
 


=====================================
pipelines/covid/processCoronaR1.sh deleted
=====================================
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-#Written by Brian Bushnell
-#Last modified April 4, 2020
-#Description:  Calls coronavirus variants from single-ended 300bp amplicon data.
-#Usage:  processCoronaR1.sh <prefix>
-#For example, "processCoronaR1.sh Sample1" if the data is in Sample1.fq.gz
-
-
-NAME="$1"
-
-#This line is in case the script is being re-run, clears the old junk
-rm "$NAME"*.sam.gz "$NAME"*.bam "$NAME"*.bai "$NAME"*.txt "$NAME"*.fq.gz "$NAME"*.fa "$NAME"*.vcf  
-
-#Recalibrate quality scores prior to any trimming (see recal.sh for details)
-bbduk.sh in="$NAME".fastq.gz out="$NAME"_recal.fq.gz recalibrate -Xmx1g ow
-
-#Perform adapter-trimming on the reads
-#Also do quality trimming and filtering
-bbduk.sh in="$NAME"_recal.fq.gz out="$NAME"_trimmed.fq.gz minlen=60 ktrim=r k=21 mink=9 hdist=2 hdist2=1 ref=truseq maq=14 qtrim=r trimq=10 maxns=0 ow -Xmx1g
-
-#Align singleton reads to the reference
-bbmap.sh ref=NC_045512.fasta in="$NAME"_trimmed.fq.gz outm="$NAME"_trimmed.sam.gz nodisk local maxindel=10 -Xmx4g
-
-#Deduplicate based on mapping coordinates
-dedupebymapping.sh in="$NAME"_trimmed.sam.gz out="$NAME"_deduped.sam.gz -Xmx4g
-
-bbduk.sh in="$NAME"_deduped.sam.gz trimclip out="$NAME"_deduped_trimclip.sam.gz -Xmx1g
-
-#Make bam/bai files for visualization
-samtools view -bShu "$NAME"_deduped_trimclip.sam.gz | samtools sort -m 2G -@ 3 - -o "$NAME"_deduped_sorted.bam
-samtools index "$NAME"_deduped_sorted.bam
-
-#Call variants from the sam files
-callvariants.sh in="$NAME"_deduped_trimclip.sam.gz ref=NC_045512.fasta out="$NAME"_vars.vcf -Xmx4g
-
-#Generate a mutant reference by applying the detected variants to the reference
-#This is essentially the reference-guided assembly of the strain
-applyvariants.sh in=NC_045512.fasta out="$NAME"_genome.fa vcf="$NAME"_vars.vcf
-
-#The remaining steps are just to allow visualization (for example, in IGV).
-
-#At this point, "$NAME"_deduped_sorted.bam, "$NAME"_deduped_sorted.bam.bai, NC_045512.fasta, and "$NAME"_vars.vcf can be used for visualization in IGV.
-
-#Optional step if you want to look at your own plot of coverage.
-#Some regions of the virus, particularly the end, have low or zero coverage.
-pileup.sh in="$NAME"_deduped_trimclip.sam.gz bincov="$NAME"_bincov.txt basecov="$NAME"_basecov.txt binsize=100 -Xmx4g
-


=====================================
pipelines/covid/processCoronaSE.sh deleted
=====================================
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-##Written by Brian Bushnell
-##Last modified April 8, 2020
-##Description:  Calls coronavirus variants from single-ended amplicon data.
-
-##Usage:  processCoronaSE.sh <prefix>
-##For example, "processCoronaSE.sh Sample1" if the data is in Sample1.fq.gz
-
-##Set minimum coverage for genotype calls.
-##Areas below this depth will be set to N in the consensus genome.
-MINCOV=3
-
-##Capture sample name from command line
-NAME="$1"
-
-##This line is in case the script is being re-run, to clears the old output
-rm "$NAME"*.sam.gz "$NAME"*.bam "$NAME"*.bai "$NAME"*.txt "$NAME"*.fa "$NAME"*.vcf  
-
-##Recalibrate quality scores prior to any trimming.
-##Requires a recalibration matrix in the working directory (see recal.sh for details).
-##This step is optional but useful for Illumina binned quality scores.
-bbduk.sh in="$NAME".fastq.gz out="$NAME"_recal.fq.gz recalibrate -Xmx1g ow
-
-
-##Perform adapter-trimming on the reads.
-##Also do quality trimming and filtering.
-##If desired, also do primer-trimming here by adding, e.g., 'ftl=20' to to trim the leftmost 20 bases.
-bbduk.sh in="$NAME"_recal.fq.gz out="$NAME"_trimmed.fq.gz minlen=60 ktrim=r k=21 mink=9 hdist=2 hdist2=1 ref=truseq maq=14 qtrim=r trimq=10 maxns=0 ow -Xmx1g
-
-##Align reads to the reference.
-##NC_045512.fasta is the coronavirus reference, equivalent to bbmap/resources/Covid19_ref.fa
-bbmap.sh ref=NC_045512.fasta in="$NAME"_trimmed.fq.gz outm="$NAME"_trimmed.sam.gz nodisk local maxindel=20 -Xmx4g ow
-
-##Deduplicate based on mapping coordinates.
-##Note that single-ended amplicon data will lose most of its data here.
-dedupebymapping.sh in="$NAME"_trimmed.sam.gz out="$NAME"_deduped.sam.gz -Xmx4g ow
-
-##Trim soft-clipped bases.
-bbduk.sh in="$NAME"_deduped.sam.gz trimclip out="$NAME"_deduped_trimclip.sam.gz -Xmx1g ow
-
-##Call variants from the sam files.
-##The usebias=f/minstrandratio=0 flags *may be* necessary due to amplicon due to strand bias,
-##and should be removed if the data is exclusively shotgun/metagenomic or otherwise randomly fragmented,
-##and exhibits no strand bias.
-callvariants.sh in="$NAME"_deduped_trimclip.sam.gz ref=NC_045512.fasta out="$NAME"_vars.vcf -Xmx4g ow strandedcov usebias=f minstrandratio=0 maf=0.6 minreads="$MINCOV" mincov="$MINCOV"
-
-##Calculate coverage.
-pileup.sh in="$NAME"_deduped_trimclip.sam.gz bincov="$NAME"_bincov.txt basecov="$NAME"_basecov.txt binsize=100 -Xmx4g ow
-
-##Calculate reduced coverage as per CallVariants defaults (ignoring outermost 5bp of reads).
-pileup.sh in="$NAME"_deduped_trimclip.sam.gz basecov="$NAME"_basecov_border5.txt -Xmx4g ow border=5
-
-##Generate a mutant reference by applying the detected variants to the reference.
-##This is essentially the reference-guided assembly of the strain.
-##Also changes anything below depth MINCOV to N (via the mindepth flag).
-applyvariants.sh in=NC_045512.fasta out="$NAME"_genome.fa vcf="$NAME"_vars.vcf basecov="$NAME"_basecov.txt ow mindepth="$MINCOV"
-
-##Make bam/bai files; requires samtools to be installed.
-##This step is only necessary for visualization, not variant-calling.
-samtools view -bShu "$NAME"_deduped_trimclip.sam.gz | samtools sort -m 2G -@ 3 - -o "$NAME"_deduped_sorted.bam
-samtools index "$NAME"_deduped_sorted.bam
-
-##At this point, "$NAME"_deduped_sorted.bam, "$NAME"_deduped_sorted.bam.bai, NC_045512.fasta, and "$NAME"_vars.vcf can be used for visualization in IGV.


=====================================
pipelines/covid/processCoronaWrapper.sh
=====================================
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 ##Written by Brian Bushnell
-##Last modified April 16, 2020
+##Last modified May 4, 2020
 ##Description:  Outer wrapper for processing a bunch of Covid samples in a directory.
 ##This is just a template that needs to be modified before use, according to the input file names.
 
@@ -21,5 +21,5 @@ sh processCorona.sh Sample1
 sh processCorona.sh Sample2
 ##etc.
 
-##Summarize the output.
+##Summarize the output (optional).
 sh makeSummary.sh 1>makeSummary.o 2>&1


=====================================
pipelines/covid/readme.txt
=====================================
@@ -9,26 +9,25 @@ Usage:
 These scripts are just guidelines showing how I am processing Covid data.
 If you want to use them without modification, follow these steps:
 
-1) Install BBTools (you need the latest version - 38.81+ - due to some new features I added for Covid!).
+1) Install BBTools (you need the latest version - 38.83+ - due to some new features I added for Covid!).
    a) If you don't have Java, install Java.
       i) If you run java --version on the command line and it reports version 8+, you're fine.
       ii) Otherwise, you can get it from https://openjdk.java.net/install/index.html
-   b) Download BBTools 38.81 or higher from https://sourceforge.net/projects/bbmap/
-   c) Unzip the archive: tar -xzf BBMap_38.81.tar.gz
+   b) Download BBTools 38.83 or higher from https://sourceforge.net/projects/bbmap/
+   c) Unzip the archive: tar -xzf BBMap_38.83.tar.gz
    d) Add it to the path: export PATH=/path/to/bbmap/:$PATH
       i) Now you can run the shell scripts from wherever.
    e) Samtools is not necessary, but recommended if you want to make the bam files for visualization.
 2) Rename and interleave the files if they are paired, so libraries are in the format "prefix.fq.gz" with one file per library.
 3) Copy the Covid reference to the directory.
 4) Modify the template processCoronaWrapper.sh:
-   a) Pick a library to use for quality-score calibration if desired (line 17).
-   b) Add lines, or make a loop, so that processCorona.sh is called on all of the libraries (lines 21-22).
+   a) Pick a library to use for quality-score calibration if desired (line 16).
+   b) Add lines, or make a loop, so that processCorona.sh is called on all of the libraries (lines 20-21).
    c) Delete lines 8-9 to let the script run.
 5) Run processCoronaWrapper.sh.
 
 Note:
 You can remove human reads with a command like this (where reads.fq can be single-ended or paired/interleaved):
-bbmap.sh ref=hg19.fa in=reads.fq outm=human.fq outu=nonhuman.fq bloom
+bbmap.sh ref=hg19.fa in=reads.fq outm=human.fq outu=nonhuman.fq bloom fast
 
-I don't provide a script for that because the data I am processing has already had human reads removed.
 


=====================================
pipelines/covid/recal.sh
=====================================
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 ##Written by Brian Bushnell
-##Last modified April 17, 2020
+##Last modified May 4, 2020
 ##Description:  Creates a recalibration matrix for Illumina data.
 
 ##Usage:  recal.sh <prefix>
@@ -18,6 +18,10 @@
 ##Grab the sample name from the command line
 NAME="$1"
 
+##Specify the viral reference file.
+##NC_045512.fasta contains the SARS-CoV-2 genome, equivalent to bbmap/resources/Covid19_ref.fa
+REF="NC_045512.fasta"
+
 ##Discover adapter sequence for this library based on read overlap.
 ##This step should be skipped for single-ended reads.
 bbmerge.sh in="$NAME".fq.gz outa="$NAME"_adapters.fa ow reads=1m
@@ -30,10 +34,10 @@ bbduk.sh -Xmx1g in="$NAME".fq.gz out=recal.fq.gz minlen=150 ktrim=r k=21 mink=9
 #bbduk.sh -Xmx1g in="$NAME".fq.gz out=trimmed.fq.gz minlen=150 ktrim=r k=21 mink=9 hdist=2 hdist2=1 ref=adapters ow
 
 ##Map the reads with very high sensitivity.
-bbmap.sh ref=NC_045512.fasta in=trimmed.fq.gz outm=mapped.sam.gz vslow -Xmx6g ow
+bbmap.sh ref="$REF" in=trimmed.fq.gz outm=mapped.sam.gz vslow -Xmx6g ow
 
 #Discover true variants.
-callvariants.sh in=mapped.sam.gz ref=NC_045512.fasta out=recal.vcf -Xmx6g ow
+callvariants.sh in=mapped.sam.gz ref="$REF" out=recal.vcf -Xmx6g ow
 
 ##Generate recalibration matrices.
 calctruequality.sh in=mapped.sam.gz vcf=recal.vcf



View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/compare/f940be8bf173d37e09aa98a5a2873c729c2cf835...427f9b40d39400c2172662db08c5e83ef7557478

-- 
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/compare/f940be8bf173d37e09aa98a5a2873c729c2cf835...427f9b40d39400c2172662db08c5e83ef7557478
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200514/c16f2a15/attachment-0001.html>


More information about the debian-med-commit mailing list