[med-svn] [Git][med-team/bbmap][upstream] New upstream version 38.97+dfsg

Wed Aug 3 12:54:28 BST 2022


Étienne Mollier pushed to branch upstream at Debian Med / bbmap


Commits:
1c662261 by Étienne Mollier at 2022-08-03T13:38:44+02:00
New upstream version 38.97+dfsg
- - - - -


30 changed files:

- README.md
- + Xcalcmem.sh
- bbduk.sh
- clumpify.sh
- + current/clump/Hasher.java
- current/clump/KmerSort.java
- current/clump/KmerSort1.java
- current/clump/KmerSort2.java
- current/clump/KmerSort3.java
- current/clump/KmerSplit.java
- current/icecream/IceCreamFinder.java
- current/jgi/BBDuk.java
- current/jgi/RQCFilter2.java
- current/jgi/RQCFilterStats.java
- current/shared/Parser.java
- current/shared/Shared.java
- current/stream/FASTQ.java
- current/tax/GiToTaxid.java
- + current/tax/GiToTaxidInt.java
- current/tax/TaxServer.java
- current/tax/TaxTree.java
- docs/changelog.txt
- removecatdogmousehuman.sh
- removehuman.sh
- removehuman2.sh
- removemicrobes.sh
- + resources/PacBioAdapter.fa
- resources/contents.txt
- resources/short.fa
- shred.sh


Changes:

=====================================
README.md
=====================================
@@ -3,4 +3,4 @@
 # Language: Java, Bash
 # Information about documentation is in /docs/readme.txt.
 
-# Version 38.96
+# Version 38.97


=====================================
Xcalcmem.sh
=====================================
@@ -0,0 +1,149 @@
+#!/bin/bash
+#calcmem
+
+#function usage(){
+#	echo "CalcMem v1.01"
+#	echo "Written by Brian Bushnell, Doug Jacobsen, Alex Copeland"
+#	echo "Calculates available memory in megabytes"
+#	echo "Last modified April 25, 2014"
+#}
+
+function parseXmx () {
+	local setxmx=0
+	local setxms=0
+	
+	for arg in "$@"
+	do
+		if [[ "$arg" == -Xmx* ]]; then
+			z="$arg"
+			setxmx=1
+		elif [[ "$arg" == Xmx* ]]; then
+			z="-$arg"
+			setxmx=1
+		elif [[ "$arg" == -Xms* ]]; then
+			z2="$arg"
+			setxms=1
+		elif [[ "$arg" == Xms* ]]; then
+			z2="-$arg"
+			setxms=1
+		elif [[ "$arg" == -da ]] || [[ "$arg" == -ea ]]; then
+			EA="$arg"
+		fi
+	done
+	
+	if [[ $setxmx == 1 ]] && [[ $setxms == 0 ]]; then
+		local substring=`echo $z| cut -d'x' -f 2`
+		z2="-Xms$substring"
+		setxms=1
+	elif [[ $setxmx == 0 ]] && [[ $setxms == 1 ]]; then
+		local substring=`echo $z2| cut -d's' -f 2`
+		z="-Xmx$substring"
+		setxmx=1
+	fi
+	
+	set=$setxmx
+}
+
+
+RAM=0;
+
+function freeRam(){
+	#Memory is in kilobytes.
+	local defaultMem=3200000
+	if [ $# -gt 0 ]; then
+		defaultMem=$1;
+		case $defaultMem in
+			*g)
+			defaultMem=`echo $defaultMem| cut -d'g' -f 1`
+			defaultMem=$(( $defaultMem * $(( 1024 * 1024 )) ))
+			;;
+			*m)
+			defaultMem=`echo $defaultMem| cut -d'm' -f 1`
+			defaultMem=$(( $defaultMem * 1024 ))
+			;;
+			*k)
+			defaultMem=`echo $defaultMem| cut -d'k' -f 1`
+			;;
+		esac
+	fi
+	
+	local mult=84
+	if [ $# -gt 1 ]; then
+		mult=$2;
+	fi
+	
+	#echo "mult =    $mult"
+	#echo "default = $defaultMem"
+	
+	local ulimit=$(ulimit -v)
+	local x=$ulimit
+	
+	if [ -e /proc/meminfo ]; then
+		local vfree=$(cat /proc/meminfo | awk -F: 'BEGIN{total=-1;used=-1} /^CommitLimit:/ { total=$2 }; /^Committed_AS:/ { used=$2 } END{ print (total-used) }')
+		local pfree=$(cat /proc/meminfo | awk -F: 'BEGIN{free=-1;cached=-1;buffers=-1} /^MemFree:/ { free=$2 }; /^Cached:/ { cached=$2}; /^Buffers:/ { buffers=$2} END{ print (free+cached+buffers) }')
+		
+		echo "vfree =   $vfree"
+		echo "pfree =   $pfree"
+		echo "ulimit =  $ulimit"
+
+		local x2=0;
+		
+		if [ $vfree -gt 0 ] && [ $pfree -gt 0 ]; then
+			if [ $vfree -gt $pfree ]; then x2=$pfree; 
+			else x2=$vfree; fi
+		elif [ $vfree -gt 0 ]; then x2=$vfree;
+		elif [ $pfree -gt 0 ]; then x2=$pfree;
+		fi
+		
+		echo $x
+		echo $x2
+		echo $vfree
+		echo $pfree
+		
+		if [ "$x" = "unlimited" ] || (($x > $x2)); then x=$x2; fi
+		
+	fi
+	
+	#echo "x=$x"
+	local HOSTNAME=`hostname`
+	if [ $x -lt 1 ] || [[ $HOSTNAME == genepool* ]]; then
+		#echo "hello 2"
+		#echo $x
+		#echo "ram is unlimited"
+		RAM=$((defaultMem/1024))
+		echo "Max memory cannot be determined.  Attempting to use $RAM MB." 1>&2
+		echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2
+	else
+		#echo "hello 1"
+		#echo $x
+		
+		#if [ $x -ge 1000000000 ]; then
+		#	echo "ram is 1000g+"
+		#elif [ $x -ge 500000000 ]; then
+		#	echo "ram is 500g+"
+		#elif [ $x -ge 250000000 ]; then
+		#	echo "ram is 250g+"
+		#elif [ $x -ge 144000000 ]; then
+		#	echo "ram is 144g+"
+		#elif [ $x -ge 120000000 ]; then
+		#	echo "ram is 120g+"
+		#elif [ $x -ge 40000000 ]; then
+		#	echo "ram is 40g+"
+		#else
+		#	echo "ram is under 40g"
+		#fi
+		#echo $x
+		RAM=$(( ((x-500000)*mult/100)/1024 ))
+		#echo $RAM
+	fi
+	
+	echo $set
+	echo $z
+	echo $z2
+	
+	#local z="-Xmx${RAM}m"
+	return 0
+}
+
+parseXmx "$@"
+freeRam "$@"


=====================================
bbduk.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified January 26, 2021
+Last modified July 28, 2022
 
 Description:  Compares reads to the kmers in a reference dataset, optionally 
 allowing an edit distance. Splits the reads into two outputs - those that 
@@ -172,11 +172,14 @@ Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
 All kmer processing modes are mutually exclusive.
 Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.
 
-ktrim=f             Trim reads to remove bases matching reference kmers.
+ktrim=f             Trim reads to remove bases matching reference kmers, plus
+                    all bases to the left or right.
                     Values: 
                        f (don't trim), 
                        r (trim to the right), 
                        l (trim to the left)
+ktrimtips=0         Set this to a positive number to perform ktrim on both
+                    ends, examining only the outermost X bases.
 kmask=              Replace bases matching ref kmers with another symbol.
                     Allows any non-whitespace character, and processes short
                     kmers on both ends if mink is set.  'kmask=lc' will
@@ -237,6 +240,8 @@ restrictleft=0      If positive, only look for kmer matches in the
                     leftmost X bases.
 restrictright=0     If positive, only look for kmer matches in the 
                     rightmost X bases.
+NOTE:  restrictleft and restrictright are mutually exclusive.  If trimming
+       both ends is desired, use ktrimtips.
 mingc=0             Discard reads with GC content below this.
 maxgc=1             Discard reads with GC content above this.
 gcpairs=t           Use average GC of paired reads.


=====================================
clumpify.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified October 30, 2019
+Last modified July 28, 2022
 
 Description:  Sorts sequences to put similar reads near each other.
 Can be used for increased compression or error correction.
@@ -95,6 +95,9 @@ allduplicates=f     Mark or remove all copies of duplicates, instead of
                     keeping the highest-quality copy.
 addcount=f          Append the number of copies to the read name.
                     Mutually exclusive with markduplicates or allduplicates.
+entryfilter=f       This assists in removing exact duplicates, which saves
+                    memory in libraries that split unevenly due to huge
+                    numbers of duplicates.  Enabled automatically as needed.
 subs=2              (s) Maximum substitutions allowed between duplicates.
 subrate=0.0         (dsr) If set, the number of substitutions allowed will be
                     max(subs, subrate*min(length1, length2)) for 2 sequences.


=====================================
current/clump/Hasher.java
=====================================
@@ -0,0 +1,75 @@
+package clump;
+
+import java.util.Random;
+
+import shared.Tools;
+import stream.Read;
+
+public class Hasher {
+
+	private static synchronized long[][] makeCodes2(int modes){
+		long[][] r=makeCodes(128, modes);
+		
+		for(int i=0; i<26; i++){
+			char c=(char)('A'+i);
+			r[Tools.toLowerCase(c)]=r[c];
+		}
+		return r;
+	}
+	
+	private static synchronized long[][] makeCodes(int symbols, int modes){
+		Random randy=new Random(1);
+		long[][] r=new long[symbols][modes];
+		for(int i=0; i<symbols; i++){
+			for(int j=0; j<modes; j++){
+				r[i][j]=randy.nextLong();
+			}
+		}
+		return r;
+	}
+	
+	public static long hash(byte[] bases){
+		long code=bases.length;
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			int mode=(int)(code&31);
+			assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'";
+			code=code^hashcodes[b][mode];
+			code=Long.rotateLeft(code, 1);
+		}
+		return code;
+	}
+	
+	public static final long hash(Read r){
+		return hash(r.bases);
+	}
+	
+	public static final long hashPair(Read r){
+		long a=hash(r);
+		if(r.mate==null){return a;}
+		long b=hash(r.mate);
+		return a^Long.rotateLeft(b, 1);
+	}
+	
+	public static final boolean equalsPaired(Read a, Read b){
+		return equals(a, b) && equals(a.mate, b.mate);
+	}
+	
+	public static final boolean equals(Read a, Read b){
+		if(a==b){return true;}
+		if(a==null || b==null){
+			assert(a!=null || b!=null);
+			return false;
+		}
+		if(a.length()!=b.length()){return false;}
+		if(a.length()==0){return true;}
+		byte[] ab=a.bases, bb=b.bases;
+		for(int i=0; i<ab.length; i++){
+			if(ab[i]!=bb[i]){return false;}
+		}
+		return true;
+	}
+	
+	private static final long[][] hashcodes=makeCodes2(32);
+	
+}


=====================================
current/clump/KmerSort.java
=====================================
@@ -3,6 +3,7 @@ package clump;
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 
 import bloom.KCountArray;
 import fileIO.ReadWrite;
@@ -66,7 +67,8 @@ public abstract class KmerSort {
 		
 		String cpstring=""+(groups==1 ? clumpsProcessedThisPass : clumpsProcessedTotal);
 		String epstring=""+correctionsTotal;
-		String dpstring=""+duplicatesTotal;
+		String efstring=""+(entryFiltered);
+		String dpstring=""+(duplicatesTotal + entryFiltered);
 
 		String rostring=""+readsOut;
 		String bostring=""+basesOut;
@@ -79,6 +81,7 @@ public abstract class KmerSort {
 		while(rpstring2.length()<12){rpstring2=" "+rpstring2;}
 		while(cpstring.length()<12){cpstring=" "+cpstring;}
 		while(epstring.length()<12){epstring=" "+epstring;}
+		while(efstring.length()<12){efstring=" "+efstring;}
 		while(dpstring.length()<12){dpstring=" "+dpstring;}
 
 		while(rostring.length()<12){rostring=" "+rostring;}
@@ -92,8 +95,11 @@ public abstract class KmerSort {
 		if(correct){
 			outstream.println("Errors Corrected: "+epstring);
 		}
-		if(dedupe){
+		if(dedupe || entryfilter){
 			outstream.println("Duplicates Found: "+dpstring);
+			if(entryfilter && verbose && false){
+				outstream.println(" -Entry Filtered: "+efstring);
+			}
 			outstream.println("Reads Out:        "+rostring);
 			outstream.println("Bases Out:        "+bostring);
 		}
@@ -271,7 +277,7 @@ public abstract class KmerSort {
 		ArrayList<FetchThread1> alft=new ArrayList<FetchThread1>(threads);
 		for(int i=0; i<threads; i++){alft.add(new FetchThread1(i, cris, kc, unpair));}
 		
-		readsThisPass=memThisPass=0;
+		readsThisPass=memThisPass=entryFilteredThisPass=0;
 		
 		if(verbose){outstream.println("Starting threads.");}
 		for(FetchThread1 ht : alft){ht.start();}
@@ -289,6 +295,7 @@ public abstract class KmerSort {
 					e.printStackTrace();
 				}
 			}
+			entryFilteredThisPass+=ht.entryFilteredT;
 			readsThisPass+=ht.readsProcessedT;
 			basesProcessed+=ht.basesProcessedT;
 			diskProcessed+=ht.diskProcessedT;
@@ -296,24 +303,28 @@ public abstract class KmerSort {
 		}
 		readsProcessed+=readsThisPass;
 		memProcessed+=memThisPass;
+		entryFiltered+=entryFilteredThisPass;
 
 		if(verbose){t.stop("Fetch time: ");}
 		if(verbose){System.err.println("Closing input stream.");}
 		errorState=ReadWrite.closeStream(cris)|errorState;
 		
 		if(verbose){t.start("Combining thread output.");}
-		assert(readsProcessed<=Integer.MAX_VALUE) :
+		long readsLeft=readsThisPass-entryFilteredThisPass;
+		long slotsLeft=cris.paired() && !unpair ? readsLeft/2 : readsLeft;
+		assert(slotsLeft<=Shared.MAX_ARRAY_LEN) :
 			"\nThe number of reads is greater than 2 billion, which is the limit for a single group. "
 			+ "\nPlease rerun and manually specify 'groups=7' or similar, "
 			+ "\nsuch that the number of reads per group is less than 2 billion.";
-		ArrayList<Read> list=new ArrayList<Read>((int)readsThisPass);
+		ArrayList<Read> list=new ArrayList<Read>((int)(slotsLeft));
 		for(int i=0; i<threads; i++){
 			FetchThread1 ft=alft.set(i, null);
 			list.addAll(ft.storage);
 		}
 		if(verbose){t.stop("Combine time: ");}
 		
-		assert(list.size()==readsThisPass || (cris.paired() && list.size()*2==readsThisPass)) : list.size()+", "+readsThisPass+", "+cris.paired();
+		assert(list.size()==slotsLeft) : list.size()+", "+readsThisPass+", "+readsLeft+", "+slotsLeft+", "+cris.paired();
+		//assert(list.size()==readsLeft || (cris.paired() && list.size()*2==readsLeft)) : list.size()+", "+readsThisPass+", "+readsLeft+", "+cris.paired();
 		ecco=false;
 		return list;
 	}
@@ -330,6 +341,7 @@ public abstract class KmerSort {
 			kc=kc_;
 			storage=new ArrayList<Read>();
 			unpairT=unpair_;
+			entryFilterTable=(entryfilter ? new HashMap<Long, Read>() : null);
 		}
 		
 		@Override
@@ -377,6 +389,27 @@ public abstract class KmerSort {
 					}
 				}
 				
+				if(entryFilterTable!=null){
+					int removed=0;
+					for(int i=0; i<reads.size(); i++){
+						Read r=reads.get(i);
+						final long key=Hasher.hashPair(r);
+						final Long key2=Long.valueOf(key);
+						final Read old=entryFilterTable.get(key2);
+						if(old==null){
+							entryFilterTable.put(key2, r);
+						}else{
+							boolean same=Hasher.equalsPaired(r, old);
+							if(same){
+								removed++;
+								entryFilteredT+=r.pairCount();
+								reads.set(i, null);
+							}
+						}
+					}
+					if(removed>0){Tools.condenseStrict(reads);}
+				}
+				
 				ArrayList<Read> hashList=reads;
 				if(paired && unpairT){
 					hashList=new ArrayList<Read>(reads.size()*2);
@@ -394,7 +427,7 @@ public abstract class KmerSort {
 				
 				kc.hash(hashList, table, minCount, true);
 				storage.addAll(hashList);
-				cris.returnList(ln);
+				cris.returnList(ln.id, false);
 				ln=cris.nextList();
 				reads=(ln!=null ? ln.list : null);
 			}
@@ -416,6 +449,8 @@ public abstract class KmerSort {
 		final KmerComparator kc;
 		final ArrayList<Read> storage;
 		final boolean unpairT;
+		final HashMap<Long, Read> entryFilterTable;
+		public long entryFilteredT=0;
 		
 		protected long readsProcessedT=0;
 		protected long basesProcessedT=0;
@@ -488,10 +523,12 @@ public abstract class KmerSort {
 	protected long basesProcessed=0;
 	protected long diskProcessed=0;
 	protected long memProcessed=0;
+	protected static long entryFiltered=0;
 
 	protected long readsOut=0;
 	protected long basesOut=0;
 
+	protected long entryFilteredThisPass=0;
 	protected long readsThisPass=0;
 	protected long memThisPass=0;
 	
@@ -520,7 +557,9 @@ public abstract class KmerSort {
 	boolean unpair=false;
 	boolean repair=false;
 	boolean namesort=false;
+	boolean entryfilter=false;
 	final boolean parallelSort=Shared.parallelSort;
+	boolean memWarned=false;
 	
 	boolean useSharedHeader=false;
 	int reorderMode=REORDER_FALSE;


=====================================
current/clump/KmerSort1.java
=====================================
@@ -119,6 +119,8 @@ public class KmerSort1 extends KmerSort {
 			
 			else if(a.equals("prefilter")){
 				KmerReduce.prefilter=Parse.parseBoolean(b);
+			}else if(a.equals("entryfilter")){
+				entryfilter=Parse.parseBoolean(b);
 			}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
 				groups=Integer.parseInt(b);
 				splitInput=(groups>1);


=====================================
current/clump/KmerSort2.java
=====================================
@@ -119,6 +119,8 @@ public class KmerSort2 extends KmerSort {
 			
 			else if(a.equals("prefilter")){
 				KmerReduce.prefilter=Parse.parseBoolean(b);
+			}else if(a.equals("entryfilter")){
+				entryfilter=Parse.parseBoolean(b);
 			}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
 				groups=Integer.parseInt(b);
 				splitInput=(groups>1);


=====================================
current/clump/KmerSort3.java
=====================================
@@ -3,6 +3,7 @@ package clump;
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.atomic.AtomicInteger;
 
@@ -135,6 +136,8 @@ public class KmerSort3 extends KmerSort {
 			
 			else if(a.equals("prefilter")){
 				KmerReduce.prefilter=Parse.parseBoolean(b);
+			}else if(a.equals("entryfilter")){
+				entryfilter=Parse.parseBoolean(b);
 			}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
 				groups=Integer.parseInt(b);
 				splitInput=(groups>1);
@@ -531,6 +534,7 @@ public class KmerSort3 extends KmerSort {
 	private long closeFetchThread3s(ArrayList<FetchThread3> alft){
 		readsThisPass=0;
 		memThisPass=0;
+		entryFilteredThisPass=0;
 		/* Wait for threads to die */
 		for(FetchThread3 ft : alft){
 
@@ -542,6 +546,7 @@ public class KmerSort3 extends KmerSort {
 					e.printStackTrace();
 				}
 			}
+			entryFilteredThisPass+=ft.entryFilteredT;
 			readsThisPass+=ft.readsProcessedT;
 			basesProcessed+=ft.basesProcessedT;
 			diskProcessed+=ft.diskProcessedT;
@@ -549,6 +554,7 @@ public class KmerSort3 extends KmerSort {
 			
 			errorState|=ft.errorStateT;
 		}
+		entryFiltered+=entryFilteredThisPass;
 		readsProcessed+=readsThisPass;
 		memProcessed+=memThisPass;
 		
@@ -627,27 +633,46 @@ public class KmerSort3 extends KmerSort {
 //					outstream.println("totalMem="+totalMem+", Shared.memFree()="+Shared.memFree()+", Shared.memUsed()="+Shared.memUsed());
 //					outstream.println("fileSize="+fileSize+", "+"fileMem="+fileMem+", "+"memRatio="+memRatio+", ");
 //					outstream.println((size-1000>expectedSizePerGroup*3)+", "+(expectedMem*3>totalMem));
-					
+
 					//TODO: This could also be based on the number of FetchThreads.
-					
 					if(expectedMem>0.3*totalMem && (fileMem<1 || fileMem>0.8*totalMem)){
-						outstream.println("\n***  Warning  ***\n"
-								+ "A temp file may be too big to store in memory, due to uneven splitting:");
+						synchronized(this.getClass()){
+							if(!memWarned){
+								outstream.println("\n***  Warning  ***\n"
+										+ "A temp file may be too big to store in memory, due to uneven splitting:");
 
-						outstream.println("expectedMem="+((long)expectedMem)+", fileMem="+fileMem+", available="+totalMem);
-
-						if(repair || namesort){
-							outstream.println("It cannot be streamed to output unaltered because "+(namesort ? " namesort=t" : "repair=t"));
-							outstream.println("If this causes the program to crash, please re-run with more memory or groups.\n");
-						}else{
-							outstream.println(
-									"It will be streamed to output unaltered.\n"
-											+ "To avoid this behavior, increase memory or increase groups.\n"
-											+ "Set the flag forceprocess=t to disable this check.\n");
-//							Timer t=new Timer();
-							ArrayList<Read> list=streamNext_inner(cris);
-//							t.stop("Stream time: ");
-							return list;
+								outstream.println("expectedMem="+((long)expectedMem)+", fileMem="+fileMem+", available="+totalMem);
+							}
+							if(entryfilter){
+								if(!memWarned){
+									outstream.println(
+											"If the program crashes, increase memory or manually increase groups.\n");
+								}
+							}else if(dedupe & !Clump.opticalOnly){
+								if(!memWarned){
+									outstream.println(
+											"entryfilter is automatically being set to true to reduce memory load while removing duplicates.\n"
+													+"If the program crashes, increase memory or manually increase groups.\n");
+								}
+								entryfilter=true;
+							}else if(repair || namesort){
+								if(!memWarned){
+									outstream.println("It cannot be streamed to output unaltered because "+(namesort ? " namesort=t" : "repair=t"));
+									outstream.println("If this causes the program to crash, please re-run with more memory or groups.\n");
+								}
+							}else{
+								if(!memWarned){
+									outstream.println(
+											"It will be streamed to output unaltered.\n"
+													+ "To avoid this behavior, increase memory or increase groups.\n"
+													+ "Set the flag forceprocess=t to disable this check.\n");
+								}
+								//							Timer t=new Timer();
+								ArrayList<Read> list=streamNext_inner(cris);
+								//							t.stop("Stream time: ");
+								return list;
+							}
+							memWarned=true;
 						}
 					}
 				}
@@ -673,7 +698,7 @@ public class KmerSort3 extends KmerSort {
 //			if(verbose){t.start("Making hash threads.");}
 			final int subthreads=Tools.mid(1, Shared.threads()/2, 8);
 			ArrayList<FetchSubThread> alht=new ArrayList<FetchSubThread>(subthreads);
-			long readsThisGroup=0, memThisGroup=0;
+			long readsThisGroup=0, memThisGroup=0, entryFilteredThisGroup=0;
 			for(int i=0; i<subthreads; i++){alht.add(new FetchSubThread(i, cris, kc, unpair));}
 			
 //			if(verbose){outstream.println("Starting threads.");}
@@ -691,11 +716,13 @@ public class KmerSort3 extends KmerSort {
 						e.printStackTrace();
 					}
 				}
+				entryFilteredThisGroup+=ht.entryFilteredT;
 				readsThisGroup+=ht.readsProcessedST;
 				basesProcessedT+=ht.basesProcessedST;
 				diskProcessedT+=ht.diskProcessedST;
 				memThisGroup+=ht.memProcessedST;
 			}
+			entryFilteredT+=entryFilteredThisGroup;
 			readsProcessedT+=readsThisGroup;
 			memProcessedT+=memThisGroup;
 
@@ -704,13 +731,15 @@ public class KmerSort3 extends KmerSort {
 			errorStateT=ReadWrite.closeStream(cris)|errorStateT;
 			
 //			if(verbose){t.start("Combining thread output.");}
-			assert(readsProcessedT<=Integer.MAX_VALUE && readsProcessedT>=0);
-			ArrayList<Read> list=new ArrayList<Read>((int)readsThisGroup);
+			long readsLeft=readsThisGroup-entryFilteredThisGroup;
+			long slotsLeft=cris.paired() && !unpair ? readsLeft/2 : readsLeft;
+			assert(slotsLeft<=Shared.MAX_ARRAY_LEN && readsProcessedT>=0);
+			ArrayList<Read> list=new ArrayList<Read>((int)slotsLeft);
 			for(int i=0; i<subthreads; i++){
 				FetchSubThread ht=alht.set(i, null);
 				list.addAll(ht.storage);
 			}
-			assert(list.size()==readsThisGroup || (cris.paired() && list.size()*2==readsThisGroup)) : list.size()+", "+readsThisGroup+", "+cris.paired();
+			assert(list.size()==slotsLeft) : list.size()+", "+readsThisGroup+", "+slotsLeft+", "+cris.paired();
 //			if(verbose){t.stop("Combine time: ");}
 			
 //			if(verbose){t.start("Sorting.");}
@@ -724,7 +753,8 @@ public class KmerSort3 extends KmerSort {
 		final AtomicInteger nextGroup;
 		final KmerComparator kc;
 		final ConcurrentReadOutputStream[] rosa;
-		
+
+		protected long entryFilteredT=0;
 		protected long readsProcessedT=0;
 		protected long basesProcessedT=0;
 		protected long diskProcessedT=0;
@@ -740,6 +770,7 @@ public class KmerSort3 extends KmerSort {
 				kcT=kc_;
 				storage=new ArrayList<Read>();
 				unpairT=unpair_;
+				entryFilterTable=(entryfilter ? new HashMap<Long, Read>() : null);
 			}
 			
 			@Override
@@ -783,6 +814,27 @@ public class KmerSort3 extends KmerSort {
 						}
 					}
 					
+					if(entryFilterTable!=null){
+						int removed=0;
+						for(int i=0; i<reads.size(); i++){
+							Read r=reads.get(i);
+							final long key=Hasher.hashPair(r);
+							final Long key2=Long.valueOf(key);
+							final Read old=entryFilterTable.get(key2);
+							if(old==null){
+								entryFilterTable.put(key2, r);
+							}else{
+								boolean same=Hasher.equalsPaired(r, old);
+								if(same){
+									removed++;
+									entryFilteredT+=r.pairCount();
+									reads.set(i, null);
+								}
+							}
+						}
+						if(removed>0){Tools.condenseStrict(reads);}
+					}
+					
 					ArrayList<Read> hashList=reads;
 					if(paired && unpairT){
 						hashList=new ArrayList<Read>(reads.size()*2);
@@ -800,7 +852,7 @@ public class KmerSort3 extends KmerSort {
 					
 					kcT.hash(hashList, table, minCount, true);
 					storage.addAll(hashList);
-					cris.returnList(ln);
+					cris.returnList(ln.id, false);
 					ln=cris.nextList();
 					reads=(ln!=null ? ln.list : null);
 				}
@@ -822,6 +874,8 @@ public class KmerSort3 extends KmerSort {
 			final KmerComparator kcT;
 			final ArrayList<Read> storage;
 			final boolean unpairT;
+			final HashMap<Long, Read> entryFilterTable;
+			public long entryFilteredT=0;
 
 			protected long readsProcessedST=0;
 			protected long basesProcessedST=0;


=====================================
current/clump/KmerSplit.java
=====================================
@@ -123,6 +123,8 @@ public class KmerSplit {
 			
 			else if(a.equals("dedupe")){
 				//ignore
+			}else if(a.equals("entryfilter")){
+				//ignore
 			}else if(a.equals("markduplicates")){
 				//ignore
 			}else if(a.equals("markall")){


=====================================
current/icecream/IceCreamFinder.java
=====================================
@@ -76,6 +76,8 @@ public final class IceCreamFinder {
 			outstream=pp.outstream;
 		}
 		
+		Parser.setQuality(33);
+		
 		//Set shared static variables prior to parsing
 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
 		ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;


=====================================
current/jgi/BBDuk.java
=====================================
@@ -321,9 +321,13 @@ public class BBDuk {
 				else{ksplit_=false;}
 			}else if(a.equals("ktrim")){
 				if(b==null){b="";}
-				if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;}
-				else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;}
-				else if(b.equalsIgnoreCase("n")){
+				if(b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr") || b.equalsIgnoreCase("tips")){
+					ktrimLeft_=ktrimRight_=true;ktrimN_=ksplit_=false;
+				}else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){
+					ktrimLeft_=true;ktrimRight_=false;ktrimN_=ksplit_=false;
+				}else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){
+					ktrimLeft_=false;ktrimRight_=true;ktrimN_=ksplit_=false;
+				}else if(b.equalsIgnoreCase("n")){
 					ktrimLeft_=ktrimRight_=ksplit_=false;
 					ktrimN_=true;
 				}else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
@@ -332,9 +336,15 @@ public class BBDuk {
 					TRIM_SYMBOL_=(byte)b.charAt(0);
 				}else{
 					assert(b!=null && (b.equalsIgnoreCase("f") || b.equalsIgnoreCase("false"))) :
-						"\nInvalid setting for ktrim - values must be f (false), l (left), r (right), or n.";
+						"\nInvalid setting for ktrim - values must be f (false), l (left), r (right), rl (tips), or n.";
 					ktrimRight_=ktrimLeft_=false;
 				}
+			}else if(a.equals("trimtips") || a.equals("ktrimtips")){
+				if(b==null){b="";}
+				if(b.length()>0){
+					ktrimLeft_=ktrimRight_=true;ktrimN_=ksplit_=false;
+					restrictLeft_=restrictRight_=Integer.parseInt(b);
+				}
 			}else if(a.equals("kmask") || a.equals("mask")){
 				if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
 					kmaskLowercase_=true;
@@ -2454,7 +2464,8 @@ public class BBDuk {
 			ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen()));
 			ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen());
 			
-			final boolean ktrimrl=ktrimLeft || ktrimRight;
+			final boolean ktrimLeftOrRight=ktrimLeft || ktrimRight;
+			final boolean ktrimTips=(ktrimLeft && ktrimRight);
 			final boolean doKmerTrimming=storedKmers>0 && (ktrimLeft || ktrimRight || ktrimN || ksplit);
 			final boolean doKmerFiltering=storedKmers>0 && !doKmerTrimming;
 			
@@ -2641,7 +2652,22 @@ public class BBDuk {
 							int xsum=0;
 							int rktsum=0;
 							
-							if(ktrimrl){
+							if(ktrimTips){
+								if(r1!=null){
+									int x=ktrimTips(r1, keySets);
+									xsum+=x;
+									rktsum+=(x>0 ? 1 : 0);
+									rlen1=r1.length();
+									if(rlen1<minlen1){setDiscarded(r1);}
+								}
+								if(r2!=null){
+									int x=ktrimTips(r2, keySets);
+									xsum+=x;
+									rktsum+=(x>0 ? 1 : 0);
+									rlen2=r2.length();
+									if(rlen2<minlen2){setDiscarded(r2);}
+								}
+							}else if(ktrimLeftOrRight){
 								if(r1!=null){
 									int x=ktrim(r1, keySets);
 									xsum+=x;
@@ -3555,13 +3581,195 @@ public class BBDuk {
 			return found;
 		}
 		
+		private final int ktrim(final Read r, final AbstractKmerTable[] sets){
+			final int len=r.length();
+			final int start=(restrictRight<1 ? 0 : Tools.max(0, len-restrictRight));
+			final int stop=(restrictLeft<1 ? len : Tools.min(len, restrictLeft));
+			return ktrim(r, sets, start, stop);
+		}
+		
+		private final int ktrimTips(final Read r, final AbstractKmerTable[] sets){
+			final int len=r.length();
+			final int mid=len/2-(k-1)/2;
+			int sum=0;
+			if(ktrimRight){
+				int start=Tools.max(0, (restrictRight<1 ? mid : len-restrictRight));
+				sum+=ktrimTip(r, sets, start, len, true, false);
+			}
+			if(ktrimLeft){
+				int stop=Tools.min(r.length(), (restrictLeft<1 ? mid+k-1 : restrictLeft));
+				sum+=ktrimTip(r, sets, 0, stop, false, true);
+			}
+			return sum;
+		}
+		
+		/**
+		 * Trim a read to remove matching kmers and everything to their left or right, on one end.
+		 * Allows both trimRight and trimLeft, in 2 passes.
+		 * @param r Read to process
+		 * @param sets Kmer tables
+		 * @return Number of bases trimmed
+		 */
+		private final int ktrimTip(final Read r, final AbstractKmerTable[] sets, final int start, final int stop,
+				final boolean right, final boolean left){
+			assert(left || right);
+			assert(!(left && right));
+			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
+			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+			if(verbose){outstream.println("KTrimming read "+r.id);}
+			final byte[] bases=r.bases, quals=r.quality;
+			long kmer=0;
+			long rkmer=0;
+			int found=0;
+			int len=0;
+			int id0=-1; //ID of first kmer found.
+			
+			int minLoc=999999999, minLocExclusive=999999999;
+			int maxLoc=-1, maxLocExclusive=-1;
+			final int initialLength=r.length();
+			
+			//Scan for normal kmers
+			for(int i=start; i<stop; i++){
+				byte b=bases[i];
+				long x=symbolToNumber0[b];
+				long x2=symbolToComplementNumber0[b];
+				kmer=((kmer<<bitsPerBase)|x)&mask;
+				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
+				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
+				if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+				if(len>=minlen2 && i>=minlen){
+					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+					if(id>0){
+						if(id0<0){id0=id;}
+						minLoc=Tools.min(minLoc, i-k+1);
+						assert(minLoc>=0);
+						maxLoc=i;
+						found++;
+					}
+				}
+			}
+			
+			if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;}
+			if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;}
+			
+			//If nothing was found, scan for short kmers.  Only used for trimming..
+			if(useShortKmers && found==0){
+				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+				
+				//Look for short kmers on left side
+				if(left){
+					kmer=0;
+					rkmer=0;
+					len=0;
+					final int lim=Tools.min(k, stop);
+					for(int i=start; i<lim; i++){
+						byte b=bases[i];
+						long x=symbolToNumber0[b];
+						long x2=symbolToComplementNumber0[b];
+						kmer=((kmer<<bitsPerBase)|x)&mask;
+						rkmer=rkmer|(x2<<(bitsPerBase*len));
+						len++;
+						if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+						if(len>=mink){
+							
+							if(verbose){
+								outstream.println("Looking for left kmer  "+kmerToString(kmer, len));
+								outstream.println("Looking for left rkmer "+kmerToString(rkmer, len));
+							}
+							
+							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+							if(id>0){
+								if(id0<0){id0=id;}
+								if(verbose){outstream.println("Found "+kmer);}
+								minLoc=0;
+								minLocExclusive=Tools.min(minLocExclusive, i+1);
+								maxLoc=Tools.max(maxLoc, i);
+								maxLocExclusive=Tools.max(maxLocExclusive, 0);
+								found++;
+							}
+						}
+					}
+				}
+
+				//Look for short kmers on right side
+				if(right){
+					kmer=0;
+					rkmer=0;
+					len=0;
+					final int lim=Tools.max(-1, stop-k);
+					for(int i=stop-1; i>lim; i--){
+						byte b=bases[i];
+						long x=symbolToNumber0[b];
+						long x2=symbolToComplementNumber0[b];
+						kmer=kmer|(x<<(bitsPerBase*len));
+						rkmer=((rkmer<<bitsPerBase)|x2)&mask;
+						len++;
+						if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+						if(len>=mink){
+							if(verbose){
+								outstream.println("Looking for right kmer "+
+										AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
+							}
+							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+							if(id>0){
+								if(id0<0){id0=id;}
+								if(verbose){outstream.println("Found "+kmer);}
+								minLoc=i;
+								minLocExclusive=Tools.min(minLocExclusive, bases.length);
+								maxLoc=bases.length-1;
+								maxLocExclusive=Tools.max(maxLocExclusive, i-1);
+								found++;
+							}
+						}
+					}
+				}
+			}
+			
+			
+			if(verbose){outstream.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+
+					", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
+			
+			if(found==0){return 0;}
+			assert(found>0) : "Overflow in 'found' variable.";
+			
+			{//Increment counter for the scaffold whose kmer was first detected
+				if(scaffoldReadCountsT!=null){
+					scaffoldReadCountsT[id0]++;
+					scaffoldBaseCountsT[id0]+=bases.length;
+				}else{
+					scaffoldReadCounts.addAndGet(id0, 1);
+					scaffoldBaseCounts.addAndGet(id0, bases.length);
+				}
+			}
+			
+			if(trimPad!=0){
+				maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
+				minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
+				maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
+				minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
+			}
+			
+			if(left){ //Trim from the read start to the rightmost kmer base
+				if(verbose){outstream.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
+				int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
+				if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
+				return x;
+			}else{ //Trim from the leftmost kmer base to the read stop
+				assert(right);
+				if(verbose){outstream.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
+				int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
+				if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
+				return x;
+			}
+		}
+		
 		/**
 		 * Trim a read to remove matching kmers and everything to their left or right.
 		 * @param r Read to process
 		 * @param sets Kmer tables
 		 * @return Number of bases trimmed
 		 */
-		private final int ktrim(final Read r, final AbstractKmerTable[] sets){
+		private final int ktrim(final Read r, final AbstractKmerTable[] sets, final int start, final int stop){
 			assert(ktrimLeft || ktrimRight);
 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
@@ -3577,9 +3785,6 @@ public class BBDuk {
 			int maxLoc=-1, maxLocExclusive=-1;
 			final int initialLength=r.length();
 			
-			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
-			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
-			
 			//Scan for normal kmers
 			for(int i=start; i<stop; i++){
 				byte b=bases[i];
@@ -4807,9 +5012,9 @@ public class BBDuk {
 	final boolean usePairGC;
 	
 	/** If positive, only look for kmer matches in the leftmost X bases */
-	int restrictLeft;
+	final int restrictLeft;
 	/** If positive, only look for kmer matches the rightmost X bases */
-	int restrictRight;
+	final int restrictRight;
 	
 	/** Skip this many initial input reads */
 	private final long skipreads;


=====================================
current/jgi/RQCFilter2.java
=====================================
@@ -585,6 +585,7 @@ public class RQCFilter2 {
 			microbesUsed=appendOutDir(microbesUsed);
 			chloroStatsFile=appendOutDir(chloroStatsFile);
 			sipStatsFile=appendOutDir(sipStatsFile);
+			sipScafStats=appendOutDir(sipScafStats);
 
 			cardinalityName=appendOutDir(cardinalityName);
 			adaptersOutFile=appendOutDir(adaptersOutFile);
@@ -816,8 +817,13 @@ public class RQCFilter2 {
 				sb.append("filteredByTile="+removeOutDir(fbtOutFile)).append('\n');
 			}
 			
-			if(sipFlag && sipStatsFile!=null){
-				sb.append("sipSpikeinStats="+removeOutDir(sipStatsFile)).append('\n');
+			if(sipFlag){
+				if(sipStatsFile!=null){
+					sb.append("sipSpikeinStats="+removeOutDir(sipStatsFile)).append('\n');
+				}
+				if(sipScafStats!=null){
+					sb.append("sipScafStats="+removeOutDir(sipScafStats)).append('\n');
+				}
 			}
 			
 			if((chloroMapFlag || mitoMapFlag || riboMapFlag) && chloroOutFile!=null){
@@ -1136,15 +1142,15 @@ public class RQCFilter2 {
 				}
 				
 				//Filter the file to only include specified organisms in the taxlist..
-				final String ref=Data.findPath("?SIP_spikeins.fa.gz");//TODO add to RQCFilterData like others
+				final String ref=Data.findPath("?SIP_spikeins.fa.gz");
 				
 				{
 					ArrayList<String> args=new ArrayList<String>();
 					args.add("covstats="+sipStatsFile);
 					final boolean addToOtherStats=(sipStatsFile==null);
-					decontamByMapping(in1z, in2z, out1z, out2z, null, "sipScafStats.txt", inPrefix, outPrefix, ref, step, addToOtherStats, args);
+					decontamByMapping(in1z, in2z, out1z, out2z, null, sipScafStats, inPrefix, outPrefix, ref, step, addToOtherStats, args);
 					if(!addToOtherStats){
-						filterstats.parseSip("sipScafStats.txt");
+						filterstats.parseSip(sipScafStats);
 						assert(filterstats.toString()!=null);
 					}
 				}
@@ -3982,6 +3988,7 @@ public class RQCFilter2 {
 	private String microbesUsed="microbesUsed.txt";
 	private String chloroStatsFile="chloroStats.txt";
 	private String sipStatsFile="sipSpikeinStats.txt";
+	private String sipScafStats="sipScafStats.txt";
 	private String nexteraStats="nexteraStats.txt";
 	private String ihistName="ihist_merge.txt";
 	private String khistName="khist.txt";


=====================================
current/jgi/RQCFilterStats.java
=====================================
@@ -93,10 +93,6 @@ public class RQCFilterStats {
 	}
 	
 	public String toString(boolean skipAssertion){
-		assert(skipAssertion || readsIn>=readsOut) : toString(true);
-		assert(skipAssertion || basesIn>=basesOut) : toString(true);
-		assert(skipAssertion || readsIn-totalReadsRemoved()==readsOut) : toString(true)+"\n\ntrr="+totalReadsRemoved()+"\nri="+readsIn+"\nro="+readsOut;
-		assert(skipAssertion || basesIn-totalBasesRemoved()==basesOut) : toString(true)+"\n"+basesIn+"-"+totalBasesRemoved()+"!="+basesOut;
 		StringBuilder sb=new StringBuilder(1000);
 		sb.append("#Class\tReads\tBases\tReadPct\tBasePct\tNotes\n");
 		sb.append(format("Input", readsIn, basesIn, readsIn, basesIn));
@@ -120,6 +116,12 @@ public class RQCFilterStats {
 		sb.append(format("Dog", readsDog, basesDog, readsIn, basesIn));
 		sb.append(format("Microbe", readsMicrobe, basesMicrobe, readsIn, basesIn));
 		sb.append(format("Other", readsOther, basesOther, readsIn, basesIn));
+		
+		assert(skipAssertion || readsIn>=readsOut) : toString(true)+"\n\nsb:\n"+sb+"\n";
+		assert(skipAssertion || basesIn>=basesOut) : toString(true)+"\n\n"+sb+"\n";
+		assert(skipAssertion || readsIn-totalReadsRemoved()==readsOut) : toString(true)+"\n\ntrr="+totalReadsRemoved()+"\nri="+readsIn+"\nro="+readsOut+"\n\nsb:\n"+sb+"\n";
+		assert(skipAssertion || basesIn-totalBasesRemoved()==basesOut) : toString(true)+"\n"+basesIn+"-"+totalBasesRemoved()+"!="+basesOut+"\n\nsb:\n"+sb+"\n";
+		
 		return sb.toString();
 	}
 	


=====================================
current/shared/Parser.java
=====================================
@@ -1418,6 +1418,10 @@ public class Parser {
 	private static byte qout=-1;
 	private static boolean parsedQuality=false;
 	
+	public static void setQuality(int x){
+		qin=(byte)x;
+		parsedQuality=(x>-1);
+	}
 	public static void processQuality(){
 //		assert(parsedQuality);
 		if(!parsedQuality){return;}


=====================================
current/shared/Shared.java
=====================================
@@ -125,8 +125,8 @@ public class Shared {
 	public static final int GAPCOST=Tools.max(1, GAPLEN/64);
 	public static final byte GAPC='-';
 	
-	public static String BBMAP_VERSION_STRING="38.96";
-	public static String BBMAP_VERSION_NAME="Return True";
+	public static String BBMAP_VERSION_STRING="38.97";
+	public static String BBMAP_VERSION_NAME="Honeybee";
 	
 	public static boolean TRIM_READ_COMMENTS=false;
 	public static boolean TRIM_RNAME=false; //For mapped sam reads


=====================================
current/stream/FASTQ.java
=====================================
@@ -178,6 +178,7 @@ public class FASTQ {
 				}else{
 					if(warnQualityChange){System.err.println("Changed from ASCII-64 to ASCII-33 due to read of length "+bases.length+" while prescanning.");}
 					ASCII_OFFSET=33;
+					SET_QIN=true;
 				}
 				DETECT_QUALITY=false;
 			}
@@ -549,12 +550,15 @@ public class FASTQ {
 						ASCII_OFFSET=33;
 					}
 					DETECT_QUALITY=false;
+//					System.err.println("A: "+numericID+": "+bases.length);
 				}
 				
 //				assert(false) : Arrays.toString(quals);
 				for(int i=0; i<quals.length; i++){
 					quals[i]-=ASCII_OFFSET; //Convert from ASCII33 to native.
 					if(DETECT_QUALITY && ASCII_OFFSET==33 && (quals[i]>QUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){
+
+//						System.err.println("B: "+numericID+": "+bases.length);
 						if(warnQualityChange){
 							if(numericID<1){
 								System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
@@ -651,6 +655,29 @@ public class FASTQ {
 				}
 			}
 		}else{
+			
+			//Prevents problems with PacBio's weird quality numbers.  Usually.
+			if(DETECT_QUALITY && numericID==0){//first read
+				for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
+					if(cntr==1 && s.length>=MIN_LENGTH_TO_FORCE_ASCII_33){//Chances of ASCII-64 are vanishingly small
+//						SET_QIN=true;
+						DETECT_QUALITY=false;
+						ASCII_OFFSET=33;
+					}
+					quad[cntr]=s;
+					cntr++;
+					if(cntr==4){
+						cntr=0;
+						final Read r=quadToRead_slow(quad, false, bf, numericID, flag);
+						
+						list.add(r);
+						added++;
+						numericID++;
+						if(added>=maxReadsToReturn){return list;}
+					}
+				}
+			}
+			
 			for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
 				quad[cntr]=s;
 				cntr++;


=====================================
current/tax/GiToTaxid.java
=====================================
@@ -88,36 +88,24 @@ public class GiToTaxid {
 	
 	public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
 	public static int parseGiToTaxid(String s, char delimiter){
-		int x=parseGiNumber(s, delimiter);
-		assert(x>=0) : s;
-		assert(array!=null) : "To use gi numbers, you must load a gi table.";
-//		if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
-		if(x>=0 && x<array.length){return array[x];}
-		assert(x<array.length) : "The GI number "+x+" is too big.\n"
-				+ "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
-				+ "To ignore this problem, please run with the -da flag.\n";
-		return -1;
+		long x=parseGiNumber(s, delimiter);
+		assert(x>=0) : x+", "+s;
+		return getID(x);
 	}
 	
 
 	public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
 	public static int parseGiToTaxid(byte[] s, char delimiter){
-		int x=parseGiNumber(s, delimiter);
-		if(x>=0){return array[x];}
-		return -1;
+		long x=parseGiNumber(s, delimiter);
+		return x<0 ? -1 : getID(x);
 	}
 	
 	/** Parse a gi number, or return -1 if formatted incorrectly. */
-	static int parseGiNumber(String s, char delimiter){
+	static long parseGiNumber(String s, char delimiter){
 		if(s==null || s.length()<4){return -1;}
-//		System.err.println("a");
 		if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
-//		System.err.println("b");
 		if(!s.startsWith("gi")){return -1;}
-//		System.err.println("c");
-//		System.err.println("d");
 		int initial=s.indexOf(delimiter);
-//		System.err.println("e");
 		if(initial<0){
 			if(delimiter!='~'){
 				delimiter='~';
@@ -128,21 +116,16 @@ public class GiToTaxid {
 				initial=s.indexOf(delimiter);
 			}
 			if(initial<0){return -1;}
-//			System.err.println("f");
-//			System.err.println("g");
 		}
-//		System.err.println("h");
 		if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
-//		System.err.println("i");
 		
-		int number=0;
+		long number=0;
 		for(int i=initial+1; i<s.length(); i++){
 			char c=s.charAt(i);
 			if(c==delimiter){break;}
 			assert(Tools.isDigit(c));
 			number=(number*10)+(c-'0');
 		}
-//		System.err.println("j: "+number);
 		return number;
 	}
 	
@@ -173,15 +156,14 @@ public class GiToTaxid {
 	public static int getID(String s){return getID(s, '|');}
 	/** Get the taxID from a header starting with a taxID or gi number */
 	public static int getID(String s, char delimiter){
-		int x=parseTaxidNumber(s, delimiter);
-		if(x>=0){return x;}
+		long x=parseTaxidNumber(s, delimiter);
+		if(x>=0){return (int)x;}
 		x=parseGiNumber(s, delimiter);
-		if(x>=0){return array[x];}
-		return -1;
+		return x<0 ? -1 : getID(x);
 	}
 	
 	/** Parse a gi number, or return -1 if formatted incorrectly. */
-	static int parseGiNumber(byte[] s, char delimiter){
+	static long parseGiNumber(byte[] s, char delimiter){
 		if(s==null || s.length<4){return -1;}
 		if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
 		int initial=Tools.indexOf(s, (byte)delimiter);
@@ -192,7 +174,7 @@ public class GiToTaxid {
 		}
 		if(!Tools.isDigit(s[initial+1])){return -1;}
 		
-		int number=0;
+		long number=0;
 		for(int i=initial+1; i<s.length; i++){
 			byte c=s[i];
 			if(c==delimiter){break;}
@@ -227,16 +209,37 @@ public class GiToTaxid {
 	public static int getID(byte[] s){return getID(s, '|');}
 	/** Get the taxID from a header starting with a taxID or gi number */
 	public static int getID(byte[] s, char delimiter){
-		int x=parseGiNumber(s, delimiter);
-		if(x>=0){return array[x];}
+		long x=parseGiNumber(s, delimiter);
+		if(x>=0){return getID(x, true);}
 		return parseNcbiNumber(s, delimiter);
 	}
 	
-	/** Get the taxID from a gi number */
-	public static int getID(int gi){
-		assert(gi>=0) : gi;
-		assert(gi<array.length) : gi+", "+array.length;
-		return array[gi];
+	/** Get the taxID from a gi number;
+	 * -1 if not present or invalid (negative input),
+	 * -2 if out of range (too high) */
+	public static int getID(long gi){
+		return getID(gi, true);
+	}
+	
+	/** Get the taxID from a gi number;
+	 * 0 if not present,
+	 * -1 if invalid (negative input),
+	 * -2 if out of range (too high) */
+	public static int getID(long gi, boolean assertInRange){
+		assert(initialized) : "To use gi numbers, you must load a gi table.";
+		if(gi<0 || gi>maxGiLoaded){
+			assert(!assertInRange) : gi<0 ? "gi number "+gi+" is invalid." : 
+				"The gi number "+gi+" is too big: Max loaded gi number is "+maxGiLoaded+".\n"
+				+ "Please update the gi table with the latest version from NCBI"
+				+ " as per the instructions in gitable.sh.\n"
+				+ "To ignore this problem, please run with the -da flag.\n";
+			return gi<0 ? -1 : -2;
+		}
+		final long upper=gi>>>SHIFT;
+		final int lower=(int)(gi&LOWERMASK);
+		assert(upper<Shared.MAX_ARRAY_LEN && upper<array.length) : gi+", "+upper+", "+array.length;
+		final int[] slice=array[(int)upper];
+		return slice==null || slice.length<=lower ? 0 : slice[lower];
 	}
 	
 	public static void initialize(String fname){
@@ -245,8 +248,19 @@ public class GiToTaxid {
 			synchronized(GiToTaxid.class){
 				if(!initialized || fileString==null || !fileString.equals(fname)){
 					fileString=fname;
-					if(fname.contains(".int1d")){
-						array=ReadWrite.read(int[].class, fname, true);
+					if(fname.contains(".int2d")){
+						array=ReadWrite.read(int[][].class, fname, true);
+						maxGiLoaded=-1;
+						if(array!=null && array.length>0){
+							int upper=array.length-1;
+							int[] section=array[upper];
+							int lower=section.length-1;
+							maxGiLoaded=(((long)upper)<<SHIFT)|lower;
+						}
+					}else if(fname.contains(".int1d")){
+						throw new RuntimeException("Old gi table format filename "+fname+".\n"
+								+ "Current files should end in .int2d.");
+						
 					}else{
 						array=makeArray(fname);
 					}
@@ -259,17 +273,19 @@ public class GiToTaxid {
 	public static boolean isInitialized(){return initialized;}
 	
 	public static synchronized void unload(){
+		maxGiLoaded=-1;
 		array=null;
 		fileString=null;
 		initialized=false;
 	}
 	
-	private static int[] makeArray(String fnames){
+	private static int[][] makeArray(String fnames){
 		String[] split;
 		if(new File(fnames).exists()){split=new String[] {fnames};}
 		else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
 		else if(fnames.indexOf('#')>=0){
-			assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
+			assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for "
+					+ "relative paths in present working directory.";
 			File dir=new File(System.getProperty("user.dir"));
 			String prefix=fnames.substring(0, fnames.indexOf('#'));
 			String suffix=fnames.substring(fnames.indexOf('#')+1);
@@ -290,20 +306,28 @@ public class GiToTaxid {
 			throw new RuntimeException("Invalid file: "+fnames);
 		}
 		
-		IntList list=new IntList();
-//		assert(max<Integer.MAX_VALUE) : "Overflow.";
-//		int[] x=new int[(int)max+1];
-//		Arrays.fill(x, -1);
+		int numLists=32;
+		IntList[] lists=new IntList[numLists];
 		
 		long total=0;
 		for(String s : split){
-			long count=addToList(s, list);
+			long count=addToList(s, lists);
 			total+=count;
 		}
-		return list.shrink().array;
+		for(int i=0; i<lists.length; i++){
+			if(lists[i]!=null && lists[i].size>0){
+				lists[i].shrink();
+				numLists=i+1;
+			}
+		}
+		int[][] table=new int[numLists][];
+		for(int i=0; i<numLists; i++){
+			table[i]=lists[i].array;
+		}
+		return table;
 	}
 	
-	private static long addToList(String fname, IntList list){
+	private static long addToList(String fname, IntList[] lists){
 		boolean warned=false;
 		ByteFile bf=ByteFile.makeByteFile(fname, true);
 		long count=0, invalid=0;
@@ -316,34 +340,18 @@ public class GiToTaxid {
 				assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
 				assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
 				assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
-//				assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
+				//assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
 				int tid=Parse.parseInt(line, tab2+1, tab3);
 				int gi=Parse.parseInt(line, tab3+1, line.length);
-				if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
+				if(gi<0){
 					invalid++;
 				}else{
-				assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
-				int old=list.get(gi);
-				assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
-				
-				list.set(gi, tid);
-				
-				//assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
-//				if(x[gi]!=-1 && x[gi]!=ncbi){
-//					if(!warned){
-//						System.err.println("***WARNING*** For file "+fname+":\n"+
-//								("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
-//								"\nThis may be an error from NCBI and you may wish to report it, but it is\n"
-//								+ "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
-//								+ "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
-//						warned=true;
-//					}
-//				}else{
-//					x[gi]=ncbi;
-//				}
+					assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
+					int old=setID(gi, tid, lists);
+					assert(old<1 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
 				}
 			}else{
-				if(line.length==0){System.err.println(fname+", "+count);}//debug
+				//if(line.length==0){System.err.println(fname+", "+count);}//debug
 				invalid++;
 			}
 			line=bf.nextLine();
@@ -353,6 +361,28 @@ public class GiToTaxid {
 		return count;
 	}
 	
+	private static int getID(long gi, IntList[] lists){
+		assert(gi>=0) : "gi number "+gi+" is invalid.";
+		final long upper=gi>>>SHIFT;
+		final int lower=(int)(gi&LOWERMASK);
+		assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper;
+		IntList list=lists[(int)upper];
+		return lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower);
+	}
+	
+	private static int setID(long gi, int tid, IntList[] lists){
+		assert(gi>=0) : "gi number "+gi+" is invalid.";
+		final long upper=gi>>>SHIFT;
+		final int lower=(int)(gi&LOWERMASK);
+		assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper;
+		IntList list=lists[(int)upper];
+		if(list==null){list=lists[(int)upper]=new IntList();}
+		int old=lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower);
+		list.set(lower, tid);
+		maxGiLoaded=Tools.max(gi, maxGiLoaded);
+		return old;
+	}
+	
 //	private static int[] makeArrayOld(String fnames){
 //		String[] split;
 //		if(new File(fnames).exists()){split=new String[] {fnames};}
@@ -420,7 +450,12 @@ public class GiToTaxid {
 //		return count;
 //	}
 	
-	private static int[] array;
+	private static long maxGiLoaded=-1;
+	private static int[][] array;
+	private static final int SHIFT=30;
+	private static final long UPPERMASK=(-1L)<<SHIFT;
+	private static final long LOWERMASK=~UPPERMASK;
+	
 	private static String fileString;
 	
 	public static boolean verbose=false;


=====================================
current/tax/GiToTaxidInt.java
=====================================
@@ -0,0 +1,431 @@
+package tax;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import shared.Parse;
+import shared.Shared;
+import shared.Tools;
+import structures.IntList;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 10, 2015
+ *
+ */
+public class GiToTaxidInt {
+	
+	public static void main(String[] args){
+		ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.USE_PIGZ=true;
+		ReadWrite.ZIPLEVEL=9;
+		ReadWrite.PIGZ_BLOCKSIZE=256;
+//		ReadWrite.PIGZ_ITERATIONS=30;
+		
+		for(String arg : args){
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			shared.Parser.parseZip(arg, a, b);
+		}
+//		if(args.length>2 && false){//Run a test
+//			test(args);
+//		}else 
+		if(args.length>=2){//Write array
+			initialize(args[0]);
+			ReadWrite.write(array, args[1], true);
+		}
+	}
+	
+	public static void test(String[] args){
+		System.err.println(getID(1000));
+		System.err.println(getID(10000));
+		System.err.println(getID(10001));
+		System.err.println(getID(10002));
+		System.err.println(getID(10003));
+		System.err.println(getID(10004));
+		System.err.println(getID(10005));
+		System.err.println(getID(100000));
+		System.err.println(getID(1000000));
+		System.err.println(getID(10000000));
+		
+		TaxTree tree=null;
+		if(args.length>1){
+			tree=TaxTree.loadTaxTree(args[0], System.err, true, true);
+		}
+		
+		System.err.println("Strings:");
+		int x;
+		x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44");
+		System.err.println(x);
+		if(tree!=null){
+			System.err.println(tree.getNode(x));
+			tree.incrementRaw(x, 30);
+		}
+		x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds");
+		System.err.println(x);
+		if(tree!=null){
+			System.err.println(tree.getNode(x));
+			tree.incrementRaw(x, 40);
+		}
+		x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence");
+		System.err.println(x);
+		if(tree!=null){
+			System.err.println(tree.getNode(x));
+			tree.incrementRaw(x, 20);
+		}
+		
+		if(tree!=null){
+			tree.percolateUp();
+			ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35);
+			for(TaxNode n : nodes){
+				System.err.println(n);
+			}
+		}
+	}
+	
+	public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
+	public static int parseGiToTaxid(String s, char delimiter){
+		int x=parseGiNumber(s, delimiter);
+		assert(x>=0) : s;
+		assert(array!=null) : "To use gi numbers, you must load a gi table.";
+//		if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
+		if(x>=0 && x<array.length){return array[x];}
+		assert(x<array.length) : "The GI number "+x+" is too big.\n"
+				+ "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
+				+ "To ignore this problem, please run with the -da flag.\n";
+		return -1;
+	}
+	
+
+	public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
+	public static int parseGiToTaxid(byte[] s, char delimiter){
+		long x=parseGiNumber(s, delimiter);
+		if(x>=0 && x<array.length){return array[(int)x];}
+		if(x<0){return -1;}
+		assert(false) : x;
+		return -1;
+	}
+	
+	/** Parse a gi number, or return -1 if formatted incorrectly. */
+	static int parseGiNumber(String s, char delimiter){
+		if(s==null || s.length()<4){return -1;}
+//		System.err.println("a");
+		if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
+//		System.err.println("b");
+		if(!s.startsWith("gi")){return -1;}
+//		System.err.println("c");
+//		System.err.println("d");
+		int initial=s.indexOf(delimiter);
+//		System.err.println("e");
+		if(initial<0){
+			if(delimiter!='~'){
+				delimiter='~';
+				initial=s.indexOf(delimiter);
+			}
+			if(initial<0){
+				delimiter='_';
+				initial=s.indexOf(delimiter);
+			}
+			if(initial<0){return -1;}
+//			System.err.println("f");
+//			System.err.println("g");
+		}
+//		System.err.println("h");
+		if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
+//		System.err.println("i");
+		
+		int number=0;
+		for(int i=initial+1; i<s.length(); i++){
+			char c=s.charAt(i);
+			if(c==delimiter){break;}
+			assert(Tools.isDigit(c));
+			number=(number*10)+(c-'0');
+		}
+//		System.err.println("j: "+number);
+		return number;
+	}
+	
+	/** Parse a ncbi number, or return -1 if formatted incorrectly. */
+	public static int parseTaxidNumber(String s, char delimiter){
+		if(s==null || s.length()<5){return -1;}
+		if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);}
+		if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;}
+		int initial=s.indexOf(delimiter);
+		if(initial<0){
+			delimiter='_';
+			initial=s.indexOf(delimiter);
+			if(initial<0){return -1;}
+		}
+		if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
+		
+		int number=0;
+		for(int i=initial+1; i<s.length(); i++){
+			char c=s.charAt(i);
+			if(c==delimiter || c==' '){break;}
+			assert(Tools.isDigit(c)) : c+"\n"+s;
+			number=(number*10)+(c-'0');
+		}
+		return number;
+	}
+	
+
+	public static int getID(String s){return getID(s, '|');}
+	/** Get the taxID from a header starting with a taxID or gi number */
+	public static int getID(String s, char delimiter){
+		int x=parseTaxidNumber(s, delimiter);
+		if(x>=0){return x;}
+		x=parseGiNumber(s, delimiter);
+		if(x>=0){return array[x];}
+		return -1;
+	}
+	
+	/** Parse a gi number, or return -1 if formatted incorrectly. */
+	static int parseGiNumber(byte[] s, char delimiter){
+		if(s==null || s.length<4){return -1;}
+		if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
+		int initial=Tools.indexOf(s, (byte)delimiter);
+		if(initial<0){
+			delimiter='_';
+			initial=Tools.indexOf(s, (byte)delimiter);
+			if(initial<0){return -1;}
+		}
+		if(!Tools.isDigit(s[initial+1])){return -1;}
+		
+		long number=0;
+		for(int i=initial+1; i<s.length; i++){
+			byte c=s[i];
+			if(c==delimiter){break;}
+			assert(Tools.isDigit(c));
+			number=(number*10)+(c-'0');
+		}
+		return (int)number;
+	}
+	
+	/** Parse a gi number, or return -1 if formatted incorrectly. */
+	static int parseNcbiNumber(byte[] s, char delimiter){
+		if(s==null || s.length<3){return -1;}
+		if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;}
+		int initial=Tools.indexOf(s, (byte)delimiter);
+		if(initial<0){
+			delimiter='_';
+			initial=Tools.indexOf(s, (byte)delimiter);
+			if(initial<0){return -1;}
+		}
+		if(!Tools.isDigit(s[initial+1])){return -1;}
+		
+		int number=0;
+		for(int i=initial+1; i<s.length; i++){
+			byte c=s[i];
+			if(c==delimiter){break;}
+			assert(Tools.isDigit(c));
+			number=(number*10)+(c-'0');
+		}
+		return number;
+	}
+
+	public static int getID(byte[] s){return getID(s, '|');}
+	/** Get the taxID from a header starting with a taxID or gi number */
+	public static int getID(byte[] s, char delimiter){
+		int x=parseGiNumber(s, delimiter);
+		if(x>=0){return array[x];}
+		return parseNcbiNumber(s, delimiter);
+	}
+	
+	/** Get the taxID from a gi number */
+	public static int getID(long gi){
+		assert(gi>=0) : gi;
+		assert(gi<Integer.MAX_VALUE) : gi+" > "+Integer.MAX_VALUE;
+		assert(gi<array.length) : gi+", "+array.length;
+		return array[(int)gi];
+	}
+	
+	public static void initialize(String fname){
+		assert(fname!=null);
+		if(fileString==null || !fileString.equals(fname)){
+			synchronized(GiToTaxid.class){
+				if(!initialized || fileString==null || !fileString.equals(fname)){
+					fileString=fname;
+					if(fname.contains(".int1d")){
+						array=ReadWrite.read(int[].class, fname, true);
+					}else{
+						array=makeArray(fname);
+					}
+				}
+				initialized=true;
+			}
+		}
+	}
+	
+	public static boolean isInitialized(){return initialized;}
+	
+	public static synchronized void unload(){
+		array=null;
+		fileString=null;
+		initialized=false;
+	}
+	
+	private static int[] makeArray(String fnames){
+		String[] split;
+		if(new File(fnames).exists()){split=new String[] {fnames};}
+		else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
+		else if(fnames.indexOf('#')>=0){
+			assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
+			File dir=new File(System.getProperty("user.dir"));
+			String prefix=fnames.substring(0, fnames.indexOf('#'));
+			String suffix=fnames.substring(fnames.indexOf('#')+1);
+			
+			File[] array=dir.listFiles();
+			StringBuilder sb=new StringBuilder();
+			String comma="";
+			for(File f : array){
+				String s=f.getName();
+				if(s.startsWith(prefix) && s.startsWith(suffix)){
+					sb.append(comma);
+					sb.append(s);
+					comma=",";
+				}
+			}
+			split=sb.toString().split(",");
+		}else{
+			throw new RuntimeException("Invalid file: "+fnames);
+		}
+		
+		IntList list=new IntList();
+//		assert(max<Integer.MAX_VALUE) : "Overflow.";
+//		int[] x=new int[(int)max+1];
+//		Arrays.fill(x, -1);
+		
+		long total=0;
+		for(String s : split){
+			long count=addToList(s, list);
+			total+=count;
+		}
+		return list.shrink().array;
+	}
+	
+	private static long addToList(String fname, IntList list){
+		boolean warned=false;
+		ByteFile bf=ByteFile.makeByteFile(fname, true);
+		long count=0, invalid=0;
+		byte[] line=bf.nextLine();
+		while(line!=null){
+			if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na
+				count++;
+				int tab2=Tools.indexOfNth(line, '\t', 2);
+				int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1);
+				assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
+				assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
+				assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
+//				assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
+				int tid=Parse.parseInt(line, tab2+1, tab3);
+				int gi=Parse.parseInt(line, tab3+1, line.length);
+				if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
+					invalid++;
+				}else{
+				assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
+				int old=list.get(gi);
+				assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
+				
+				list.set(gi, tid);
+				
+				//assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
+//				if(x[gi]!=-1 && x[gi]!=ncbi){
+//					if(!warned){
+//						System.err.println("***WARNING*** For file "+fname+":\n"+
+//								("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
+//								"\nThis may be an error from NCBI and you may wish to report it, but it is\n"
+//								+ "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
+//								+ "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
+//						warned=true;
+//					}
+//				}else{
+//					x[gi]=ncbi;
+//				}
+				}
+			}else{
+				if(line.length==0){System.err.println(fname+", "+count);}//debug
+				invalid++;
+			}
+			line=bf.nextLine();
+		}
+		if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);}
+		bf.close();
+		return count;
+	}
+	
+//	private static int[] makeArrayOld(String fnames){
+//		String[] split;
+//		if(new File(fnames).exists()){split=new String[] {fnames};}
+//		else{split=fnames.split(",");}
+//		
+//		long max=0;
+//		for(String s : split){
+//			max=Tools.max(max, findMaxID(s));
+//		}
+//		
+//		assert(max<Integer.MAX_VALUE) : "Overflow.";
+//		int[] x=new int[(int)max+1];
+//		Arrays.fill(x, -1);
+//		
+//		long total=0;
+//		for(String s : split){
+//			long count=fillArray(s, x);
+//			total+=count;
+//		}
+//		return x;
+//	}
+//	
+//	private static long findMaxID(String fname){
+//		ByteFile bf=ByteFile.makeByteFile(fname, true);
+//		long count=0, max=0;
+//		byte[] line=bf.nextLine();
+//		while(line!=null){
+//			count++;
+//			int tab=Tools.indexOf(line, (byte)'\t');
+//			long gi=Parse.parseLong(line, 0, tab);
+//			max=Tools.max(max, gi);
+//			line=bf.nextLine();
+//		}
+//		bf.close();
+//		return max;
+//	}
+//	
+//	private static long fillArray(String fname, int[] x){
+//		boolean warned=false;
+//		ByteFile bf=ByteFile.makeByteFile(fname, true);
+//		long count=0;
+//		byte[] line=bf.nextLine();
+//		while(line!=null){
+//			count++;
+//			int tab=Tools.indexOf(line, (byte)'\t');
+//			int gi=Parse.parseInt(line, 0, tab);
+//			int ncbi=Parse.parseInt(line, tab+1, line.length);
+//			//assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
+//			if(x[gi]!=-1 && x[gi]!=ncbi){
+//				if(!warned){
+//					System.err.println("***WARNING*** For file "+fname+":\n"+
+//							("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
+//							"\nThis may be an error from NCBI and you may wish to report it, but it is\n"
+//							+ "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
+//							+ "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
+//					warned=true;
+//				}
+//			}else{
+//				x[gi]=ncbi;
+//			}
+//			line=bf.nextLine();
+//		}
+//		if(verbose){System.err.println("Count: "+count);}
+//		bf.close();
+//		return count;
+//	}
+	
+	private static int[] array;
+	private static String fileString;
+	
+	public static boolean verbose=false;
+	private static boolean initialized=false;
+}


=====================================
current/tax/TaxServer.java
=====================================
@@ -1243,7 +1243,7 @@ public class TaxServer {
 		if(type2==GI){
 			for(String name : names){
 				sb.append(comma);
-				TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+				TaxNode tn=getTaxNodeGi(Long.parseLong(name));
 				if(tn==null){sb.append("-1");}
 				else{sb.append(tn.id);}
 				comma=",";
@@ -1297,7 +1297,7 @@ public class TaxServer {
 		int type2=type&15;
 		final TaxNode tn;
 		if(type2==GI){
-			tn=getTaxNodeGi(Integer.parseInt(name));
+			tn=getTaxNodeGi(Long.parseLong(name));
 		}else if(type2==NAME){
 			tn=getTaxNodeByName(name);
 		}else if(type2==TAXID){
@@ -1348,7 +1348,7 @@ public class TaxServer {
 		if(type2==GI){
 			for(String name : names){
 				sb.append(comma);
-				TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+				TaxNode tn=getTaxNodeGi(Long.parseLong(name));
 				if(tn==null){sb.append("Not found");}
 				else{sb.append(tree.toSemicolon(tn, skipNonCanonical, mononomial));}
 				comma=",";
@@ -1414,7 +1414,7 @@ public class TaxServer {
 		
 		long img=-1;
 		if(type==GI){
-			tn0=getTaxNodeGi(Integer.parseInt(name));
+			tn0=getTaxNodeGi(Long.parseLong(name));
 		}else if(type==NAME){
 			tn0=getTaxNodeByName(name);
 		}else if(type==TAXID){
@@ -1546,7 +1546,7 @@ public class TaxServer {
 		int type2=type&15;
 		if(type2==GI){
 			for(String name : names){
-				TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+				TaxNode tn=getTaxNodeGi(Long.parseLong(name));
 				if(tn!=null){list.add(tn.id);}
 				else{notFound.incrementAndGet();}
 			}
@@ -1620,7 +1620,7 @@ public class TaxServer {
 	}
 	
 	/** Look up a TaxNode from the gi number */
-	TaxNode getTaxNodeGi(int gi){
+	TaxNode getTaxNodeGi(long gi){
 		int ncbi=-1;
 		try {
 			ncbi=GiToTaxid.getID(gi);


=====================================
current/tax/TaxTree.java
=====================================
@@ -2491,7 +2491,8 @@ public class TaxTree implements Serializable{
 	private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
 	
 	private static final String defaultImgFile="TAX_PATH/imgDump.txt";
-	private static final String defaultTableFile="TAX_PATH/gitable.int1d.gz";
+	private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz";
+	private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz";
 	private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz";
 	private static final String defaultPatternFile="TAX_PATH/patterns.txt";
 	private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz";
@@ -2526,7 +2527,7 @@ public class TaxTree implements Serializable{
 	/** Path to all taxonomy files, substituted in to make specific file paths */
 	public static String TAX_PATH=defaultTaxPath();
 	
-	/** Location of gitable.int1d.gz for gi lookups */
+	/** Location of gitable.int2d.gz for gi lookups */
 	public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);}
 	/** Location of tree.taxtree.gz */
 	public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);}


=====================================
docs/changelog.txt
=====================================
@@ -897,6 +897,10 @@ Wrapped instances of byte array instantiation in allocByte1D.
 38.91
 Fixed SendSketch json array format for very large input, on client side.
 Added maxload flag to BBCMS.
+38.97
+Added trimtips to BBDuk, mainly for trimming adapters on both ends of PacBio reads.
+Changed processing of reads longer than 200bp to force ASCII-33 quality.
+Enable automatic entryfilter in Clumpify to handle libraries with mostly identical reads.
 
 
 todo: outshort for fungalrelease.


=====================================
removecatdogmousehuman.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
 This script requires at least 52GB RAM.
 It is designed for NERSC and uses hard-coded paths.
 
@@ -68,7 +68,7 @@ calcXmx () {
 calcXmx "$@"
 
 function removecatdogmousehuman() {
-	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/projectb/sandbox/gaag/bbtools/mousecatdoghuman/ pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
+	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/cfs/cdirs/bbtools/mousecatdoghuman/ pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
 	echo $CMD >&2
 	eval $CMD
 }


=====================================
removehuman.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
 This script requires at least 16GB RAM.
 It is designed for NERSC and uses hard-coded paths.
 
@@ -66,7 +66,7 @@ calcXmx () {
 calcXmx "$@"
 
 function removehuman() {
-	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/projectb/sandbox/gaag/bbtools/hg19 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
+	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/cfs/cdirs/bbtools/hg19 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
 	echo $CMD >&2
 	eval $CMD
 }


=====================================
removehuman2.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
 This script requires at least 17GB RAM.
 It is designed for NERSC and uses hard-coded paths.
 
@@ -62,7 +62,7 @@ calcXmx () {
 calcXmx "$@"
 
 function removehuman() {
-	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.75 maxindel=8 bwr=0.22 bw=26 minhits=1 path=/global/projectb/sandbox/gaag/bbtools/hg19 build=2 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 maxsites=1 k=14 tipsearch=0 kfilter=25 bloomfilter $@"
+	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.75 maxindel=8 bwr=0.22 bw=26 minhits=1 path=/global/cfs/cdirs/bbtools/hg19 build=2 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 maxsites=1 k=14 tipsearch=0 kfilter=25 bloomfilter $@"
 	echo $CMD >&2
 	eval $CMD
 }


=====================================
removemicrobes.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
 This script requires at least 10GB RAM.
 It is designed for NERSC and uses hard-coded paths.
 
@@ -73,7 +73,7 @@ calcXmx () {
 calcXmx "$@"
 
 function removemicrobes() {
-	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap strictmaxindel=4 bwr=0.16 bw=12 ef=0.001 minhits=2 path=/global/projectb/sandbox/gaag/bbtools/commonMicrobes pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag printunmappedcount ztd=2 kfilter=25 maxsites=1 k=13 minid=0.95 idfilter=0.95 minhits=2 build=1 bloomfilter $@"
+	local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap strictmaxindel=4 bwr=0.16 bw=12 ef=0.001 minhits=2 path=/global/cfs/cdirs/bbtools/commonMicrobes pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag printunmappedcount ztd=2 kfilter=25 maxsites=1 k=13 minid=0.95 idfilter=0.95 minhits=2 build=1 bloomfilter $@"
 	echo $CMD >&2
 	eval $CMD
 }


=====================================
resources/PacBioAdapter.fa
=====================================
@@ -0,0 +1,2 @@
+>PacBio_gDNA_Amplification_Adapter
+AAGCAGTGGTATCAACGCAGAGTACT


=====================================
resources/contents.txt
=====================================
@@ -82,3 +82,6 @@ These files contain consensus sequences of various conserved RNAs (rRNAs and tRN
 model.pgm
 Prokaryotic gene model, used by CallGenes and BBSketch in protein mode.
 
+PacBioAdapter.fa
+A sequence found at read tips in some PacBio low input libraries.
+


=====================================
resources/short.fa
=====================================
@@ -12,3 +12,10 @@ GTTCAGAGTTCTACAGTCCGACGATC
 TCGTATGCCGTCTTCTGCTTGT
 >contam_241
 TGGAATTCTCGGGTGCCAAGG
+>adapter_19mer_with_extensions
+NAATGATACGGCGACCACCGNN
+AAATGATACGGCGACCACCGAN
+CAATGATACGGCGACCACCGCN
+GAATGATACGGCGACCACCGGN
+TAATGATACGGCGACCACCGTN
+


=====================================
shred.sh
=====================================
@@ -3,7 +3,7 @@
 usage(){
 echo "
 Written by Brian Bushnell
-Last modified September 1, 2016
+Last modified January 13, 2022
 Description:  Shreds sequences into shorter, potentially overlapping sequences.
 
 Usage:	shred.sh in=<file> out=<file> length=<number> minlength=<number> overlap=<number>
@@ -17,6 +17,7 @@ reads=-1        If nonnegative, stop after this many input sequences.
 equal=f         Shred each sequence into subsequences of equal size of at most 'length', instead of a fixed size.
 median=0        If nonzero, randomly shred reads to a length with this median.
 variance=0      If median is nonzero, shred to lengths of median +-variance.
+qfake=30        Quality score, if using fastq output.
 
 Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
 "



View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/1c662261847da712e5b04b94277f8aa712c37619

-- 
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/1c662261847da712e5b04b94277f8aa712c37619
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220803/2086d088/attachment-0001.htm>