[med-svn] [Git][med-team/bbmap][upstream] New upstream version 38.95+dfsg

Andreas Tille (@tille) gitlab at salsa.debian.org
Sun Jan 16 07:44:32 GMT 2022



Andreas Tille pushed to branch upstream at Debian Med / bbmap


Commits:
fe78528e by Andreas Tille at 2022-01-16T08:26:27+01:00
New upstream version 38.95+dfsg
- - - - -


14 changed files:

- README.md
- + current/bbmin/LongHashSet.java
- + current/bbmin/LongList.java
- + current/bbmin/Minimizer.java
- current/dna/AminoAcid.java
- current/dna/Data.java
- current/jgi/BBQC.java
- current/jgi/FungalRelease.java
- current/jgi/RQCFilter2.java
- current/shared/Shared.java
- current/sketch/SketchObject.java
- current/tax/ImgRecord.java
- current/tax/TaxServer.java
- current/tax/TaxTree.java


Changes:

=====================================
README.md
=====================================
@@ -3,4 +3,4 @@
 # Language: Java, Bash
 # Information about documentation is in /docs/readme.txt.
 
-# Version 38.94
+# Version 38.95


=====================================
current/bbmin/LongHashSet.java
=====================================
@@ -0,0 +1,301 @@
+package bbmin;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * @author Brian Bushnell
+ * @date July 6, 2016
+ *
+ */
+public final class LongHashSet{
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	public LongHashSet(){
+		this(256);
+	}
+	
+	public LongHashSet(int initialSize){
+		this(initialSize, 0.7f);
+	}
+	
+	public LongHashSet(int initialSize, float loadFactor_){//Builds an empty set; the table gets at least initialSize cells
+		invalid=randy.nextLong()|MINMASK;//Random empty-cell sentinel; OR-ing MINMASK sets the sign bit
+		assert(invalid<0);
+		assert(initialSize>0) : "Attempting to initialize a "+getClass().getSimpleName()+" of size<1.";
+		assert(loadFactor_>0 && loadFactor_<1) : "Attempting to initialize a "+getClass().getSimpleName()+" with invalid load factor: "+loadFactor_;
+		loadFactor=mid(0.25f, loadFactor_, 0.90f);//Clamp the requested load factor to [0.25, 0.90]
+		resize(initialSize);//Allocates the table and fills it with the sentinel
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Public Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	public void clear(){
+		if(size<1){return;}
+		Arrays.fill(array, invalid);
+		size=0;
+	}
+	
+	public boolean contains(long value){//The empty-cell sentinel is never reported as a member
+		return value!=invalid && findCell(value)>=0;
+	}
+	
+	/**
+	 * Add this value to the set.
+	 * @param value Value to insert; it may equal the current empty-cell sentinel, in which case a new sentinel is chosen first.
+	 * @return true if the value was added, false if it was already contained.
+	 */
+	public boolean add(long value){
+		if(value==invalid){resetInvalid();}//Free up this value by switching to a different sentinel
+		int cell=findCellOrEmpty(value);
+		if(array[cell]==invalid){//Empty cell: the value is new
+			array[cell]=value;
+			size++;
+			if(size>sizeLimit){resize();}//Grow once the load-factor limit is exceeded
+			return true;
+		}
+		assert(array[cell]==value);
+		return false;
+	}
+	
+	/**
+	 * Remove this value from the set.
+	 * @param value Value to remove; the empty-cell sentinel is reported as absent.
+	 * @return true if the value was removed, false if it was not present.
+	 */
+	public boolean remove(long value){
+		if(value==invalid){return false;}
+		final int cell=findCell(value);
+		if(cell<0){return false;}
+		assert(array[cell]==value);
+		array[cell]=invalid;//Mark the cell empty
+		size--;
+		
+		rehashFrom(cell);//Re-seat the entries after the hole so later probes still reach them
+		return true;
+	}
+	
+	public int size(){return size;}
+	
+	public boolean isEmpty(){return size==0;}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        String Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	public String toString(){
+		return toStringListView();
+	}
+	
+	public String toStringSetView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<array.length; i++){
+			if(array[i]!=invalid){
+				sb.append(comma+"("+i+", "+array[i]+")");
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	public String toStringListView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<array.length; i++){
+			if(array[i]!=invalid){
+				sb.append(comma+array[i]);
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/** Returns the stored values (never the empty-cell sentinel) in table order; the length equals size(). */
+	public long[] toArray(){
+		long[] x=new long[size];
+		int i=0;
+		for(long v : array){
+			if(v!=invalid){x[i++]=v;}
+		}
+		return x;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Private Methods       ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Consistency check of the table.
+	 * @return true if every stored value is found at its own cell by findCell()
+	 * and the number of stored values equals size.
+	 */
+	public boolean verify(){
+		int occupied=0;
+		for(int idx=0; idx<array.length; idx++){
+			final long stored=array[idx];
+			if(stored==invalid){continue;}
+			//A value must be reachable by probing from its hash position.
+			if(findCell(stored)!=idx){return false;}
+			occupied++;
+		}
+		//Every occupied cell was found, so only the count remains to be checked.
+		return occupied==size;
+	}
+	
+	private void rehashFrom(int initial){//Re-seats the run of occupied cells after 'initial', wrapping past the array end
+		if(size<1){return;}
+		final int limit=array.length;
+		for(int cell=initial+1; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==invalid){return;}//An empty cell ends the run
+			rehashCell(cell);
+		}
+		for(int cell=0; cell<initial; cell++){//Continue from index 0 if the run reached the end of the array
+			final long x=array[cell];
+			if(x==invalid){return;}
+			rehashCell(cell);
+		}
+	}
+	
+	private boolean rehashCell(final int cell){//Returns true if the value stored at 'cell' was moved to another cell
+		final long value=array[cell];
+		assert(value!=invalid);
+		if(value==invalid){resetInvalid();}
+		final int dest=findCellOrEmpty(value);//First cell on the probe path holding this value or the sentinel
+		if(cell==dest){return false;}//Already in the right place
+		assert(array[dest]==invalid);
+		array[cell]=invalid;
+		array[dest]=value;
+		return true;
+	}
+	
+	private void resetInvalid(){//Replaces the empty-cell sentinel with a new negative value that is not stored in the set
+		final long old=invalid;
+		long x=invalid;
+		while(x==old || contains(x)){x=randy.nextLong()|MINMASK;}//Redraw until the candidate differs from the old sentinel and is unused
+		assert(x<0);
+		invalid=x;
+		for(int i=0; i<array.length; i++){
+			if(array[i]==old){array[i]=invalid;}//Rewrite empty cells with the new sentinel
+		}
+	}
+	
+	private int findCell(final long value){//Returns the index of the cell holding value, or -1 if it is absent
+		if(value==invalid){return -1;}
+		
+		final int limit=array.length, initial=(int)((value&MASK)%modulus);//MASK clears the sign bit so the remainder is nonnegative
+		for(int cell=initial; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==value){return cell;}
+			if(x==invalid){return -1;}//Reached an empty cell, so the value is not stored
+		}
+		for(int cell=0; cell<initial; cell++){//Wrap around to the start of the array
+			final long x=array[cell];
+			if(x==value){return cell;}
+			if(x==invalid){return -1;}
+		}
+		return -1;
+	}
+	
+	private int findCellOrEmpty(final long value){//Returns the first probed cell that holds value or is empty
+		assert(value!=invalid) : "Collision - this should have been intercepted.";
+		
+		final int limit=array.length, initial=(int)((value&MASK)%modulus);
+		for(int cell=initial; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==value || x==invalid){return cell;}
+		}
+		for(int cell=0; cell<initial; cell++){//Wrap around to the start of the array
+			final long x=array[cell];
+			if(x==value || x==invalid){return cell;}
+		}
+		throw new RuntimeException("No empty cells - size="+size+", limit="+limit);//Every cell holds some other value
+	}
+	
+	/** Discards all contents and reallocates for newSize; modulus is zeroed so resize() also accepts a smaller or equal table. */
+	public final void resizeDestructive(int newSize){
+		size=sizeLimit=modulus=0;
+		array=null;
+		resize(newSize);
+	}
+	
+	private final void resize(){
+		assert(size>=sizeLimit);
+		resize(array.length*2L+1);
+	}
+	
+	private final void resize(final long size2){//Reallocates the table for about size2 cells and re-adds any stored values
+		assert(size2>size) : size+", "+size2;
+		
+		//This is supposed to be a prime but the primes code is ripped out in this version.
+		//Any odd number is fine in most cases.
+		long newPrime=size2|1;//Force the low bit so the modulus is odd
+		if(newPrime+extra>Integer.MAX_VALUE){//Keep modulus plus the extra cells within int range
+			newPrime=(Integer.MAX_VALUE-extra-2)|1;
+		}
+		assert(newPrime>modulus) : "Overflow: "+size+", "+size2+", "+modulus+", "+newPrime;
+		modulus=(int)newPrime;
+		
+		final int size3=(int)(newPrime+extra);//The array is 'extra' cells longer than the modulus
+		sizeLimit=(int)(modulus*loadFactor);
+		final long[] old=array;
+		array=new long[size3];
+		Arrays.fill(array, invalid);
+		
+//		System.err.println("Resizing "+(old==null ? "null" : ""+old.length)+" to "+size3);
+		
+		if(size<1){return;}//Nothing to carry over (also covers the initial allocation, where old is null)
+		
+		size=0;
+		for(long value : old){//add() recounts size as each value is reinserted
+			if(value!=invalid){
+				add(value);
+			}
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------      Stuff From BBTools      ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	public static final float mid(float x, float y, float z){
+		return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);
+	}
+	public static final float min(float x, float y){return x<y ? x : y;}
+	public static final float max(float x, float y){return x>y ? x : y;}
+	
+	/** Number of values that can be held without resizing */
+	public int capacity(){
+		return sizeLimit;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private long[] array;
+	private int size=0;
+	/** Value for empty cells */
+	private long invalid;
+	private int modulus;
+	private int sizeLimit;
+	private final float loadFactor;
+	
+	private static final Random randy=new Random(1);
+	private static final long MASK=Long.MAX_VALUE;
+	private static final long MINMASK=Long.MIN_VALUE;
+	
+	private static final int extra=10;
+	
+}


=====================================
current/bbmin/LongList.java
=====================================
@@ -0,0 +1,405 @@
+package bbmin;
+
+import java.util.Arrays;
+
+public final class LongList{
+	
+	public LongList(){this(256);}
+	
+	public LongList(int initial){
+		assert(initial>0);
+		array=allocLong1D(initial);
+	}
+	
+	public void clear(){
+		size=0;
+	}
+	
+	public final void set(int loc, long value){
+		if(loc>=array.length){
+			resize(loc*2L+1);
+		}
+		array[loc]=value;
+		size=max(size, loc+1);
+	}
+	
+	public final void setLast(long value){
+		assert(size>0);
+		array[size-1]=value;
+	}
+	
+	public final void increment(int loc, long value){
+		if(loc>=array.length){
+			resize(loc*2L+1);
+		}
+		array[loc]+=value;
+		size=max(size, loc+1);
+	}
+	
+	public final void increment(int loc){
+		increment(loc, 1);
+	}
+	
+	public final void incrementBy(LongList b){
+		for(int i=b.size-1; i>=0; i--){
+			increment(i, b.get(i));
+		}
+	}
+	
+	public final void incrementBy(long[] b){
+		for(int i=b.length-1; i>=0; i--){
+			increment(i, b[i]);
+		}
+	}
+	
+	public final void append(LongList b){
+		for(int i=0; i<b.size; i++){
+			add(b.get(i));
+		}
+	}
+	
+	public final void append(long[] b){
+		for(int i=0; i<b.length; i++){
+			add(b[i]);
+		}
+	}
+	
+	public final long get(int loc){
+		return(loc>=size ? 0 : array[loc]);
+	}
+	
+	public final void add(long x){
+		if(size>=array.length){
+			resize(size*2L+1);
+		}
+		array[size]=x;
+		size++;
+	}
+	
+	private final void resize(final long size2){
+		assert(size2>size) : size+", "+size2;
+		final int size3=(int)min(MAX_ARRAY_LEN, size2);
+		assert(size3>size) : "Overflow: "+size+", "+size2+" -> "+size3;
+		array=copyOf(array, size3);
+	}
+	
+	public final void shrink(){
+		if(size==array.length){return;}
+		array=copyOf(array, size);
+	}
+	
+	public final double stdev(){
+		if(size<2){return 0;}
+		double sum=sum();
+		double avg=sum/size;
+		double sumdev2=0;
+		for(int i=0; i<size; i++){
+			long x=array[i];
+			double dev=avg-x;
+			sumdev2+=(dev*dev);
+		}
+		return Math.sqrt(sumdev2/size);
+	}
+	
+	public final double avgDif(final double x){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=absdif(x, array[i]);
+		}
+		return sum/(max(1, size));
+	}
+	
+	public final double rmsDif(final double x){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			double dif=absdif(x, array[i]);
+			sum+=dif*dif;
+		}
+		return Math.sqrt(sum/(max(1, size)));
+	}
+	
+	public final long sumLong(){
+		long sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+		}
+		return sum;
+	}
+	
+	public final double sum(){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+		}
+		return sum;
+	}
+	
+	public final double mean(){
+		return size<1 ? 0 : sum()/size;
+	}
+	
+	//Ignores elements below 1
+	public final double harmonicMean(){
+		double sum=0;
+		int count=0;
+		for(int i=0; i<size; i++){
+			if(array[i]>0){
+				sum+=1.0/array[i];
+				count++;
+			}
+		}
+		double avg=sum/max(1, count);
+		return 1.0/avg;
+	}
+	
+	//Ignores elements below 1
+	public final double geometricMean(){
+		double sum=0;
+		int count=0;
+		for(int i=0; i<size; i++){
+			if(array[i]>0){
+				sum+=Math.log(array[i]);
+				count++;
+			}
+		}
+		double avg=sum/max(1, count);
+		return Math.exp(avg);
+	}
+	
+	/** Assumes list is sorted. Weighted mean in which the outermost pair has weight 1 and weights grow by 1 per step toward the middle. */
+	public final double medianWeightedAverage(){
+		if(size<1){return 0;}
+		int half=size/2;
+		long count=0;
+		double sum=0;
+		for(int i=0, j=size-1; i<half; i++, j--){
+			int mult=i+1;
+			double incr=(array[i]+array[j])*mult;
+			sum+=incr;
+			count+=2*mult;//Two elements, each with weight mult
+		}
+		if((size&1)==1){//odd length
+			int mult=half+1;
+			double incr=(array[half])*mult;
+			sum+=incr;
+			count+=mult;//The middle element is a single value, so its weight is counted once
+		}
+		return sum/count;
+	}
+	
+	/** Assumes list is sorted */
+	public final long median(){
+		if(size<1){return 0;}
+		int idx=percentileIndex(0.5);
+		return array[idx];
+	}
+	
+	/** Allows unsorted list */
+	public final long min(){
+		if(size<1){return 0;}
+		long x=array[0];
+		for(int i=1; i<size; i++){
+			x=min(x, array[i]);
+		}
+		return x;
+	}
+	
+	/** Allows unsorted list */
+	public final long max(){
+		if(size<1){return 0;}
+		long x=array[0];
+		for(int i=1; i<size; i++){
+			x=max(x, array[i]);
+		}
+		return x;
+	}
+	
+	/** Assumes list is sorted. Returns the most frequent value (the smallest such value on ties), or 0 if the list is empty. */
+	public final long mode(){
+		if(size<1){return 0;}
+		assert(sorted());
+		int streak=1, bestStreak=0;
+		long prev=array[0];
+		long best=prev;
+		for(int i=1; i<size; i++){//Start at 1: array[0] is already counted by streak=1
+			long x=array[i];
+			if(x==prev){streak++;}
+			else{
+				if(streak>bestStreak){
+					bestStreak=streak;
+					best=prev;
+				}
+				streak=1;
+				prev=x;
+			}
+		}
+		if(streak>bestStreak){//Account for the final run
+			bestStreak=streak;
+			best=prev;
+		}
+		return best;
+	}
+	
+	public long percentile(double fraction){
+		if(size<1){return 0;}
+		int idx=percentileIndex(fraction);
+		return array[idx];
+	}
+	
+	public int percentileIndex(double fraction){//Index at which the running total of values first reaches fraction*sum()
+		if(size<2){return size-1;}//-1 for an empty list, 0 for a single element
+		assert(sorted());
+		double target=(sum()*fraction);
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+			if(sum>=target){
+				return i;
+			}
+		}
+		return size-1;//Target never reached; fall back to the last index
+	}
+	
+	public final void shrinkToUnique(){
+		condense();
+		shrink();
+	}
+	
+	//In-place: keeps one copy of each distinct value at the front of the array and reduces size accordingly.
+	//Assumes sorted (ascending); this is checked by assertion only.
+	public final void condense(){
+		if(size<=1){return;}
+		
+		int i=0, j=1;
+		for(; j<size && array[i]<array[j]; i++, j++){}//skip while strictly ascending 
+		
+		int dupes=0;
+		for(; j<size; j++){//This only enters at the first nonascending pair
+			long a=array[i], b=array[j];//a is the last value kept, b is the next candidate
+			assert(a<=b) : "Unsorted: "+i+", "+j+", "+a+", "+b;
+			if(b>a){
+				i++;
+				array[i]=b;//Move the new distinct value down next to the kept ones
+			}else{
+				//Duplicate of the last kept value: it is skipped and only counted
+				dupes++;
+				assert(a==b);
+			}
+		}
+		assert(dupes==(size-(i+1)));
+		assert(size>=(i+1));
+		size=i+1;
+	}
+	
+	@Override
+	public String toString(){
+		return toStringListView();
+	}
+	
+	public String toStringSetView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<size; i++){
+			if(array[i]!=0){
+				sb.append(comma+"("+i+", "+array[i]+")");
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	public String toStringListView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<size; i++){
+				sb.append(comma+array[i]);
+				comma=", ";
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/** Returns a newly allocated array holding the first size elements. */
+	public long[] toArray(){
+		final long[] copy=allocLong1D(size);
+		//Bulk copy instead of an element-by-element loop.
+		System.arraycopy(array, 0, copy, 0, size);
+		return copy;
+	}
+	
+	public void sort() {
+		if(size>1){Arrays.sort(array, 0, size);}
+	}
+	
+	public void sortSerial() {
+		if(size>1){Arrays.sort(array, 0, size);}
+	}
+	
+	public void reverse() {
+		if(size>1){reverseInPlace(array, 0, size);}
+	}
+	
+	public boolean sorted(){
+		for(int i=1; i<size; i++){
+			if(array[i]<array[i-1]){return false;}
+		}
+		return true;
+	}
+	
+	public int size() {
+		return size;
+	}
+	
+	public int capacity() {
+		return array.length;
+	}
+	
+	public int freeSpace() {
+		return array.length-size;
+	}
+	
+	/** Reverses array[from, to) in place; a null array is ignored. */
+	private static void reverseInPlace(final long[] array, final int from, final int to){
+		if(array==null){return;}
+		final int max=from+(to-from)/2, mirror=from+to-1;//mirror-i is the partner of i within [from, to)
+		for(int i=from; i<max; i++){
+			long temp=array[i];
+			array[i]=array[mirror-i];
+			array[mirror-i]=temp;
+		}
+	}
+	
+	private static final long min(long x, long y){return x<y ? x : y;}
+	private static final long max(long x, long y){return x>y ? x : y;}
+	
+	private static final int min(int x, int y){return x<y ? x : y;}
+	private static final int max(int x, int y){return x>y ? x : y;}
+	
+	private static double absdif(double a, double b) {return a>b ? a-b : b-a;}
+	
+	private static final long[] allocLong1D(int x){return new long[x];}
+	/**
+	 * Copies buffer into a new array of the requested length, capped at MAX_ARRAY_LEN.
+	 * @throws RuntimeException if the request exceeds the cap and the capped length would not grow the array.
+	 */
+	private static long[] copyOf(long[] buffer, long newLength) {
+		final int oldLen=buffer.length;
+		final int capped=(int)min(newLength, MAX_ARRAY_LEN);
+		final boolean truncated=(newLength>capped);
+		if(truncated && capped<=oldLen){
+			throw new RuntimeException("Tried to create an array above length limit: "+oldLen+"," +newLength);
+		}
+		//An OutOfMemoryError from the allocation propagates to the caller unchanged.
+		return Arrays.copyOf(buffer, capped);
+	}
+	
+	private static final long MAX_ARRAY_LEN=Integer.MAX_VALUE-20;
+	
+	public long[] array;
+	/** Highest occupied index plus 1, i.e., lowest unoccupied index */
+	public int size=0;
+	
+}


=====================================
current/bbmin/Minimizer.java
=====================================
@@ -0,0 +1,159 @@
+package bbmin;
+
+import java.util.Arrays;
+
+/**
+ * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
+ * The resulting array is guaranteed to contain the minimal hash code<br>
+ * for every window, with no duplicates.
+ * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
+ * 
+ * @author Brian Bushnell
+ * @date October 8, 2021
+ *
+ */
+public class Minimizer {
+	
+	/** Demo: prints the minimizers of args[2] for k=args[0] and window=args[1] to stderr; prints usage and exits with status 1 on bad arguments. */
+	public static void main(String[] args){
+		int k=4, w=7;
+		String seq="ACGTCTGAGCCTTGACACATGACT";
+		try {
+			k=Integer.parseInt(args[0]);
+			w=Integer.parseInt(args[1]);
+			seq=args[2];
+		} catch (RuntimeException e) {//NumberFormatException for a bad number, ArrayIndexOutOfBoundsException for a missing argument
+			System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
+					+ "E.G.\n"
+					+ "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
+			System.exit(1);
+		}
+		Minimizer minnow=new Minimizer(k, w);
+		long[] array=minnow.minimize(seq.getBytes());
+		System.err.println(Arrays.toString(array));
+	}
+
+	public Minimizer(int k_, int window_){this(k_, window_, 2);}
+	public Minimizer(int k_, int window_, int bitsPerSymbol_){//k_: symbols per k-mer; window_: k-mers per window
+		k=k_;
+		window=window_;
+		bitsPerSymbol=bitsPerSymbol_;
+		shift=bitsPerSymbol*k;//Total bits occupied by one k-mer
+		shift2=shift-bitsPerSymbol;//Bit offset of the highest symbol, where the reverse k-mer inserts new symbols
+		mask=(shift>63 ? -1L : ~((-1L)<<shift));//Keeps the low 'shift' bits; all 64 bits when shift exceeds 63
+	}
+	
+	public long[] minimize(String str){
+		return minimize(str.getBytes());
+	}
+
+	public long[] minimize(byte[] bases){
+		return minimize(bases, new LongList(16), new LongHashSet(16));
+	}
+
+	/** This method is typically faster since you don't need to construct a new set each time. */
+	public long[] minimize(byte[] bases, LongList list, LongHashSet set){
+		list.clear();
+		//If the set is way too big, resize it
+		if(set.capacity()*(long)window>100L+16L*bases.length){
+			set.resizeDestructive(16);
+		}else{
+			set.clear();
+		}
+		
+		long kmersProcessed=0;
+		long kmer=0;
+		long rkmer=0;
+		int len=0;
+		
+		long bestCode=Long.MAX_VALUE;
+		int bestPosition=-1;
+		long bestKmer=-1;
+		long bestRkmer=-1;
+		int currentWindow=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			long x=baseToNumber[b];
+			long x2=baseToComplementNumber[b];
+			
+			kmer=((kmer<<2)|x)&mask;
+			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
+			if(x<0){
+				len=0;
+				rkmer=0;
+			}else{
+				len++;
+			}
+			
+			if(len>=k){
+				kmersProcessed++;
+				currentWindow++;
+
+				final long hashcode=hash(kmer, rkmer);
+				System.err.println("i="+i+", code="+hashcode);
+
+				//Track the best code in the window and its state
+				if(hashcode>=minCode && hashcode<=bestCode){
+					bestCode=hashcode;
+					bestPosition=i;
+					bestKmer=kmer;
+					bestRkmer=rkmer;
+				}
+				
+				//Once the window size is met, store the best code,
+				//and backtrack to its position to start the next window
+				if(currentWindow>=window && bestPosition>=0){
+					if(!set.contains(bestCode)){
+						set.add(bestCode);
+						list.add(bestCode);
+					}
+					i=bestPosition;
+					kmer=bestKmer;
+					rkmer=bestRkmer;
+					len=k;
+					
+					bestCode=Long.MAX_VALUE;
+					bestPosition=-1;
+					currentWindow=0;
+				}
+			}
+		}
+		list.sort();//optional
+		return list.toArray();
+	}
+
+	public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
+	public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
+	public static long hash(long key) {//Mixes the bits of key with shifts, adds and xors; no sign masking is applied to the result
+		key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+		key = key ^ (key >>> 24);
+		key = (key + (key << 3)) + (key << 8); // key * 265
+		key = key ^ (key >>> 14);
+		key = (key + (key << 2)) + (key << 4); // key * 21
+		key = key ^ (key >>> 28);
+		key = key + (key << 31);
+		return key;
+	}
+	private static final long max(long x, long y){return x>y ? x : y;}
+
+	public final int k;
+	public final int window;
+	public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
+	private final int shift;
+	private final int shift2;
+	private final long mask;
+	private final long minCode=0;
+	
+	static final byte[] baseToNumber = new byte[128];
+	static final byte[] baseToComplementNumber = new byte[128];
+	
+	static {//Encode A,C,G,T as 0..3 in either case (complement table: T,G,C,A as 0..3); every other symbol stays -1
+		Arrays.fill(baseToNumber, (byte)-1);
+		Arrays.fill(baseToComplementNumber, (byte)-1);
+		baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+		baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['g']=1;
+		baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['c']=2;
+		baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+	}
+}


=====================================
current/dna/AminoAcid.java
=====================================
@@ -94,7 +94,7 @@ public final class AminoAcid {
 	}
 	
 	public String canonicalCodon(){
-		return codeStrings[0];
+		return codeStrings==null || codeStrings.length<1 ? null : codeStrings[0];
 	}
 	
 	
@@ -275,6 +275,10 @@ public final class AminoAcid {
 	public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG");
 	public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC");
 	public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG");
+	
+	public static final AminoAcid Selenocysteine=new AminoAcid("Selenocysteine, Sec, U"); //UGA sometimes
+	public static final AminoAcid Pyrrolysine=new AminoAcid("Pyrrolysine, Pyl, O");
+	
 	public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG");
 	public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX");
 	
@@ -849,7 +853,9 @@ public final class AminoAcid {
 		AlphabeticalAAs[18]=Tyrosine;
 		AlphabeticalAAs[19]=Valine;
 		AlphabeticalAAs[20]=END;
-//		AlphabeticalAAs[21]=ANY;
+//		AlphabeticalAAs[21]=Selenocysteine;
+//		AlphabeticalAAs[22]=Pyrrolysine;
+//		AlphabeticalAAs[23]=ANY;
 
 		Arrays.fill(aminoToCode, (byte)-1);
 		Arrays.fill(acidToNumber, (byte)-1);
@@ -896,11 +902,13 @@ public final class AminoAcid {
 		{
 			byte anySym=(byte)(Tools.max(acidToNumberExtended)+1);
 			byte dash=(byte)(anySym+1);
-			acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym;
+			acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym; //Unknown
 			acidToNumberExtended['b']=acidToNumberExtended['B']=anySym;
 			acidToNumberExtended['z']=acidToNumberExtended['Z']=anySym;
 			acidToNumberExtended['j']=acidToNumberExtended['J']=anySym;
-			acidToNumberExtended['-']=dash;
+			acidToNumberExtended['u']=acidToNumberExtended['U']=anySym; //Selenocysteine
+			acidToNumberExtended['o']=acidToNumberExtended['O']=anySym; //Pyrrolysine
+			acidToNumberExtended['-']=dash; //Deletion
 		}
 		
 		acidToNumber8['H']=acidToNumber8['K']=acidToNumber8['R']=0;
@@ -913,7 +921,8 @@ public final class AminoAcid {
 		acidToNumber8['B']=acidToNumber8['Z']=7;
 
 		aminoToCode['X']=aminoToCode['x']=aminoToCode['B']=aminoToCode['b']=
-				aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=65;
+				aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=
+				aminoToCode['O']=aminoToCode['o']=aminoToCode['U']=aminoToCode['u']=65;
 		codeToAA[65]=ANY;
 		codeToChar[65]='X';
 		codeToByte[65]='X';


=====================================
current/dna/Data.java
=====================================
@@ -1236,7 +1236,7 @@ public class Data {
 				}
 			}
 			if(!f.exists() && !path.startsWith("jar:")){
-				String hardlink="/global/projectb/sandbox/gaag/bbtools/resources/"+fname;
+				String hardlink="/global/cfs/cdirs/bbtools/resources/"+fname;
 				f=new File(hardlink);
 				if(f.exists()){path=hardlink;}
 				else{if(vb){System.err.println("Did not find "+fname+" at "+hardlink);}}


=====================================
current/jgi/BBQC.java
=====================================
@@ -1080,9 +1080,9 @@ public class BBQC {
 	private String pjetRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/pJET1.2.fasta";
 
 	private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa";
-	private String fragAdapters = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa";
-	private String rnaAdapter = "/global/projectb/sandbox/gaag/bbtools/data/truseq_rna.fa.gz";
-	private String indexPath = "/global/projectb/sandbox/gaag/bbtools/hg19/";
+	private String fragAdapters = "/global/cfs/cdirs/bbtools/data/adapters.fa";
+	private String rnaAdapter = "/global/cfs/cdirs/bbtools/data/truseq_rna.fa.gz";
+	private String indexPath = "/global/cfs/cdirs/bbtools/hg19/";
 	private String mapRef = null;
 	
 	/*--------------------------------------------------------------*/


=====================================
current/jgi/FungalRelease.java
=====================================
@@ -7,6 +7,7 @@ import fileIO.ByteFile;
 import fileIO.ByteStreamWriter;
 import fileIO.FileFormat;
 import fileIO.ReadWrite;
+import shared.KillSwitch;
 import shared.MetadataWriter;
 import shared.Parse;
 import shared.Parser;
@@ -401,7 +402,12 @@ public class FungalRelease {
 	 *         discarded.
 	 */
 	boolean processRead(final Read r1) {
-		assert (!banIupac || !r1.containsNonACGTN()) : "Non-ACGTN base found in scaffold " + r1.id;
+		//assert (!banIupac || !r1.containsNonACGTN()) : "Non-ACGTN base found in scaffold " + r1.id;
+		if(banIupac){
+			if(r1.containsNonACGTN()){
+				KillSwitch.exceptionKill(new RuntimeException("Non-ACGTN base found in scaffold " + r1.id));
+			}
+		}
 		r1.inflateGaps(minGapIn, minGapOut);
 		return r1.length() >= minScaf;
 	}


=====================================
current/jgi/RQCFilter2.java
=====================================
@@ -196,7 +196,7 @@ public class RQCFilter2 {
 				catFlag=Parse.parseBoolean(b);
 			}else if(a.equals("removemouse") || a.equals("mouse")){
 				mouseFlag=Parse.parseBoolean(b);
-			}else if(a.equals("catdoghumanmouse") || a.equals("mousecatdoghuman") || a.equals("catdogmousehuman")){
+			}else if(a.equals("catdoghumanmouse") || a.equals("mousecatdoghuman") || a.equals("")){
 				mouseCatDogHumanFlag=Parse.parseBoolean(b);
 			}else if(a.equals("keephumanreads") || a.equals("keephuman")){
 				keepHumanReads=Parse.parseBoolean(b);
@@ -4062,7 +4062,7 @@ public class RQCFilter2 {
 		return s;
 	}
 	
-	private String RQCFilterData="/global/projectb/sandbox/gaag/bbtools/RQCFilterData_Local";
+	private String RQCFilterData="/global/cfs/cdirs/bbtools/RQCFilterData_Local";
 	
 	private String shortArtifactFile = "RQCFILTER_PATH/short.fa.gz";
 	private String mainArtifactFile = "RQCFILTER_PATH/Illumina.artifacts.fa.gz";


=====================================
current/shared/Shared.java
=====================================
@@ -23,6 +23,7 @@ public class Shared {
 
 
 	public static boolean ENV=(System.getenv()!=null);
+	private static String HOSTNAME;
 	public static boolean WINDOWS=envContainsPair("OS", "Win", true);
 	public static boolean MAC=envContainsPair("OS", "Mac", true);
 	//https://stackoverflow.com/questions/14288185/detecting-windows-or-linux
@@ -33,8 +34,8 @@ public class Shared {
 	public static boolean CORI=envContainsPair("NERSC_HOST", "cori", false);
 	public static boolean NERSC=envContainsKey("NERSC_HOST");
 	public static boolean AWS=envContainsKey("EC2_HOME");
+	public static boolean IGBVM="taxonomy-vm".equals(HOSTNAME()) || "taxonomy-vm-2".equals(HOSTNAME());
 	public static boolean AMD64="amd64".equalsIgnoreCase(System.getProperty("os.arch"));
-	private static String HOSTNAME;
 
 	public static void setTaxServer(String path){
 		taxServerNersc=taxServerAws=path;
@@ -124,8 +125,8 @@ public class Shared {
 	public static final int GAPCOST=Tools.max(1, GAPLEN/64);
 	public static final byte GAPC='-';
 	
-	public static String BBMAP_VERSION_STRING="38.94";
-	public static String BBMAP_VERSION_NAME="Potato Parser";
+	public static String BBMAP_VERSION_STRING="38.95";
+	public static String BBMAP_VERSION_NAME="CFS Migration";
 	
 	public static boolean TRIM_READ_COMMENTS=false;
 	public static boolean TRIM_RNAME=false; //For mapped sam reads


=====================================
current/sketch/SketchObject.java
=====================================
@@ -1056,16 +1056,27 @@ public class SketchObject {
 		return list;
 	}
 	
-	private static final String IMG_PATH="/global/projectb/sandbox/gaag/bbtools/img/current/img#.sketch";
-	private static final String NT_PATH="/global/projectb/sandbox/gaag/bbtools/nt/current/taxa#.sketch";
-	private static final String NR_PATH="/global/projectb/sandbox/gaag/bbtools/nr/current/taxa#.sketch";
-	private static final String REFSEQ_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/taxa#.sketch";
-	private static final String REFSEQ_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/big#.sketch";
-	private static final String SILVA_PATH="/global/projectb/sandbox/gaag/bbtools/silva/latest/both_taxa#.sketch";
-	private static final String PROKPROT_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/taxa#.sketch";
-	private static final String PROKPROT_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/big#.sketch";
-	private static final String MITO_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/taxa#.sketch";
-	private static final String FUNGI_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/fungi#.sketch";
+	private static final String IMG_PATH="/global/cfs/cdirs/bbtools/img/current/img#.sketch";
+	private static final String NT_PATH="/global/cfs/cdirs/bbtools/nt/current/taxa#.sketch";
+	private static final String NR_PATH="/global/cfs/cdirs/bbtools/nr/current/taxa#.sketch";
+	private static final String REFSEQ_PATH="/global/cfs/cdirs/bbtools/refseq/current/taxa#.sketch";
+	private static final String REFSEQ_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/big#.sketch";
+	private static final String SILVA_PATH="/global/cfs/cdirs/bbtools/silva/latest/both_taxa#.sketch";
+	private static final String PROKPROT_PATH="/global/cfs/cdirs/bbtools/refseq/current/prot/taxa#.sketch";
+	private static final String PROKPROT_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/prot/big#.sketch";
+	private static final String MITO_PATH="/global/cfs/cdirs/bbtools/mito2/taxa#.sketch";
+	private static final String FUNGI_PATH="/global/cfs/cdirs/bbtools/mito2/fungi#.sketch";
+
+	private static final String IMG_PATH_IGBVM="/data/sketch/img/current/img#.sketch";
+	private static final String NT_PATH_IGBVM="/data/sketch/nt/current/taxa#.sketch";
+	private static final String NR_PATH_IGBVM="/data/sketch/nr/current/taxa#.sketch";
+	private static final String REFSEQ_PATH_IGBVM="/data/sketch/refseq/current/taxa#.sketch";
+	private static final String REFSEQ_PATH_BIG_IGBVM="/data/sketch/refseq/current/big#.sketch";
+	private static final String SILVA_PATH_IGBVM="/data/sketch/silva/current/both_taxa#.sketch";
+	private static final String PROKPROT_PATH_IGBVM="/data/sketch/refseq/current/prot/taxa#.sketch";
+	private static final String PROKPROT_PATH_BIG_IGBVM="/data/sketch/refseq/current/prot/big#.sketch";
+	private static final String MITO_PATH_IGBVM="/data/sketch/mito2/taxa#.sketch";
+	private static final String FUNGI_PATH_IGBVM="/data/sketch/mito2/fungi#.sketch";
 
 	private static final String IMG_PATH_AWS=null;
 	private static final String NT_PATH_AWS="/test1/sketch/latest/nt/taxa#.sketch";
@@ -1076,16 +1087,16 @@ public class SketchObject {
 	private static final String MITO_PATH_AWS=null;
 	private static final String FUNGI_PATH_AWS=null;
 
-	public static final String IMG_PATH(){return Shared.AWS ? IMG_PATH_AWS : IMG_PATH;}
-	public static final String NT_PATH(){return Shared.AWS ? NT_PATH_AWS : NT_PATH;}
-	public static final String NR_PATH(){return Shared.AWS ? NR_PATH_AWS : NR_PATH;}
-	public static final String REFSEQ_PATH(){return Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;}
-	public static final String REFSEQ_PATH_BIG(){return REFSEQ_PATH_BIG;}
-	public static final String SILVA_PATH(){return Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;}
-	public static final String PROKPROT_PATH(){return Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;}
-	public static final String PROKPROT_PATH_BIG(){return PROKPROT_PATH_BIG;}
-	public static final String MITO_PATH(){return Shared.AWS ? MITO_PATH_AWS : MITO_PATH;}
-	public static final String FUNGI_PATH(){return Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;}
+	public static final String IMG_PATH(){return Shared.IGBVM ? IMG_PATH_IGBVM : Shared.AWS ? IMG_PATH_AWS : IMG_PATH;}
+	public static final String NT_PATH(){return Shared.IGBVM ? NT_PATH_IGBVM : Shared.AWS ? NT_PATH_AWS : NT_PATH;}
+	public static final String NR_PATH(){return Shared.IGBVM ? NR_PATH_IGBVM : Shared.AWS ? NR_PATH_AWS : NR_PATH;}
+	public static final String REFSEQ_PATH(){return Shared.IGBVM ? REFSEQ_PATH_IGBVM : Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;}
+	public static final String REFSEQ_PATH_BIG(){return Shared.IGBVM ? REFSEQ_PATH_BIG_IGBVM : REFSEQ_PATH_BIG;}
+	public static final String SILVA_PATH(){return Shared.IGBVM ? SILVA_PATH_IGBVM : Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;}
+	public static final String PROKPROT_PATH(){return Shared.IGBVM ? PROKPROT_PATH_IGBVM : Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;}
+	public static final String PROKPROT_PATH_BIG(){return Shared.IGBVM ? PROKPROT_PATH_BIG_IGBVM : PROKPROT_PATH_BIG;}
+	public static final String MITO_PATH(){return Shared.IGBVM ? MITO_PATH_IGBVM : Shared.AWS ? MITO_PATH_AWS : MITO_PATH;}
+	public static final String FUNGI_PATH(){return Shared.IGBVM ? FUNGI_PATH_IGBVM : Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;}
 	
 	/*--------------------------------------------------------------*/
 	/*----------------           Getters            ----------------*/


=====================================
current/tax/ImgRecord.java
=====================================
@@ -122,7 +122,7 @@ public class ImgRecord implements Serializable {
 	
 	public static boolean storeName=true;
 	public static HashMap<Long, ImgRecord> imgMap;
-//	public static final String DefaultDumpFile="/global/projectb/sandbox/gaag/bbtools/tax/imgTaxDump.txt.gz";
+//	public static final String DefaultDumpFile="/global/cfs/cdirs/bbtools/tax/imgTaxDump.txt.gz";
 	public static final String DefaultDumpFile="/global/u1/i/img/adhocDumps/taxonDumpForBrian.txt";
 	
 }


=====================================
current/tax/TaxServer.java
=====================================
@@ -2275,7 +2275,7 @@ public class TaxServer {
 	private String sizeFile=null;
 
 	/** Location of sequence directory tree */
-	private String basePath="/global/projectb/sandbox/gaag/bbtools/tree/";
+	private String basePath="/global/cfs/cdirs/bbtools/tree/";
 	
 	/** Used for taxonomic tree traversal */
 	private final TaxTree tree;


=====================================
current/tax/TaxTree.java
=====================================
@@ -2479,13 +2479,16 @@ public class TaxTree implements Serializable{
 	
 	/* For these fields, see the corresponding functions, below.
 	 * They define the default paths to various data on NERSC. */
-	
-	private static final String defaultTaxPathNersc="/global/projectb/sandbox/gaag/bbtools/tax/latest";
+
+	private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest";
 	private static final String defaultTaxPathAws="/test1/tax/latest";
-	private static final String default16SFileNersc="/global/projectb/sandbox/gaag/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
+	private static final String defaultTaxPathIGBVM="/data/tax/latest";
+	private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
 	private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
-	private static final String default18SFileNersc="/global/projectb/sandbox/gaag/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
+	private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
+	private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
 	private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz";
+	private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
 	
 	private static final String defaultImgFile="TAX_PATH/imgDump.txt";
 	private static final String defaultTableFile="TAX_PATH/gitable.int1d.gz";
@@ -2507,17 +2510,17 @@ public class TaxTree implements Serializable{
 
 	/** For setting TAX_PATH, the root to taxonomy files */
 	public static final String defaultTaxPath(){
-		return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : defaultTaxPathNersc;
+		return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc;
 	}
 
 	/** 16S consensus sequences per TaxID */
 	public static final String default16SFile(){
-		return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : default16SFileNersc;
+		return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc;
 	}
 
 	/** 18S consensus sequences per TaxID */
 	public static final String default18SFile(){
-		return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : default18SFileNersc;
+		return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc;
 	}
 
 	/** Path to all taxonomy files, substituted in to make specific file paths */



View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/fe78528ef3612e697050a49f9a33783057920aaf

-- 
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/fe78528ef3612e697050a49f9a33783057920aaf
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220116/86a0774c/attachment-0001.htm>


More information about the debian-med-commit mailing list