[med-svn] [Git][med-team/bbmap][upstream] New upstream version 38.95+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Jan 16 07:44:32 GMT 2022
Andreas Tille pushed to branch upstream at Debian Med / bbmap
Commits:
fe78528e by Andreas Tille at 2022-01-16T08:26:27+01:00
New upstream version 38.95+dfsg
- - - - -
14 changed files:
- README.md
- + current/bbmin/LongHashSet.java
- + current/bbmin/LongList.java
- + current/bbmin/Minimizer.java
- current/dna/AminoAcid.java
- current/dna/Data.java
- current/jgi/BBQC.java
- current/jgi/FungalRelease.java
- current/jgi/RQCFilter2.java
- current/shared/Shared.java
- current/sketch/SketchObject.java
- current/tax/ImgRecord.java
- current/tax/TaxServer.java
- current/tax/TaxTree.java
Changes:
=====================================
README.md
=====================================
@@ -3,4 +3,4 @@
# Language: Java, Bash
# Information about documentation is in /docs/readme.txt.
-# Version 38.94
+# Version 38.95
=====================================
current/bbmin/LongHashSet.java
=====================================
@@ -0,0 +1,301 @@
+package bbmin;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * A set of primitive {@code long} values stored in a single array and
+ * resolved with linear probing.<br>
+ * Empty cells hold a negative sentinel value ({@code invalid}).  If a caller
+ * adds a value equal to the current sentinel, a new sentinel is chosen and all
+ * empty cells are relabeled, so any long may be stored.<br>
+ * The backing array has {@code modulus+extra} cells; a value's home cell is
+ * {@code (value&MASK)%modulus} and probing wraps around the end of the array.<br>
+ * No synchronization is performed by this class.
+ *
+ * @author Brian Bushnell
+ * @date July 6, 2016
+ *
+ */
+public final class LongHashSet{
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Creates a set with an initial size of 256. */
+	public LongHashSet(){
+		this(256);
+	}
+	
+	/**
+	 * Creates a set with a load factor of 0.7.
+	 * @param initialSize Requested initial modulus; must be positive.
+	 */
+	public LongHashSet(int initialSize){
+		this(initialSize, 0.7f);
+	}
+	
+	/**
+	 * @param initialSize Requested initial modulus; must be positive.
+	 * @param loadFactor_ Fraction of the modulus that may be filled before the
+	 * table grows; asserted to be in (0, 1) and then clamped to [0.25, 0.90].
+	 */
+	public LongHashSet(int initialSize, float loadFactor_){
+		//Forcing the sign bit makes the sentinel negative.
+		invalid=randy.nextLong()|MINMASK;
+		assert(invalid<0);
+		assert(initialSize>0) : "Attempting to initialize a "+getClass().getSimpleName()+" of size<1.";
+		assert(loadFactor_>0 && loadFactor_<1) : "Attempting to initialize a "+getClass().getSimpleName()+" with invalid load factor: "+loadFactor_;
+		loadFactor=mid(0.25f, loadFactor_, 0.90f);
+		resize(initialSize);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Public Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Removes all values without changing the size of the backing array. */
+	public void clear(){
+		if(size<1){return;}
+		Arrays.fill(array, invalid);
+		size=0;
+	}
+	
+	/**
+	 * @param value Value to look for.
+	 * @return true if the value is present.
+	 */
+	public boolean contains(long value){
+		return value==invalid ? false : findCell(value)>=0;
+	}
+	
+	/**
+	 * Add this value to the set.
+	 * @param value
+	 * @return true if the value was added, false if it was already contained.
+	 */
+	public boolean add(long value){
+		//The sentinel itself is being stored, so pick a different sentinel first.
+		if(value==invalid){resetInvalid();}
+		int cell=findCellOrEmpty(value);
+		if(array[cell]==invalid){
+			array[cell]=value;
+			size++;
+			if(size>sizeLimit){resize();}
+			return true;
+		}
+		assert(array[cell]==value);
+		return false;
+	}
+	
+	/**
+	 * Remove this value from the set.
+	 * @param value
+	 * @return true if the value was removed, false if it was not present.
+	 */
+	public boolean remove(long value){
+		if(value==invalid){return false;}
+		final int cell=findCell(value);
+		if(cell<0){return false;}
+		assert(array[cell]==value);
+		array[cell]=invalid;
+		size--;
+		
+		//Close the gap so later values in the same probe run stay reachable.
+		rehashFrom(cell);
+		return true;
+	}
+	
+	/** @return Number of values currently stored. */
+	public int size(){return size;}
+	
+	/** @return true if no values are stored. */
+	public boolean isEmpty(){return size==0;}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        String Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	public String toString(){
+		return toStringListView();
+	}
+	
+	/** @return The occupied cells formatted as "(cell index, value)" pairs. */
+	public String toStringSetView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<array.length; i++){
+			if(array[i]!=invalid){
+				sb.append(comma+"("+i+", "+array[i]+")");
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/** @return The stored values, in cell order, as a bracketed list. */
+	public String toStringListView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<array.length; i++){
+			if(array[i]!=invalid){
+				sb.append(comma+array[i]);
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/**
+	 * @return A new array of length {@link #size()} holding the stored values
+	 * in cell order.  Empty cells are skipped, so the sentinel never appears.
+	 */
+	public long[] toArray(){
+		final long[] x=new long[size];
+		int i=0;
+		for(long v : array){
+			if(v!=invalid){
+				x[i]=v;
+				i++;
+			}
+		}
+		return x;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Private Methods       ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Consistency check.
+	 * @return true if every stored value is found in the cell it occupies and
+	 * the number of occupied cells equals {@code size}.
+	 */
+	public boolean verify(){
+		int numValues=0;
+		int numFound=0;
+		for(int i=0; i<array.length; i++){
+			final long value=array[i];
+			if(value!=invalid){
+				numValues++;
+				final int cell=findCell(value);
+				if(i==cell){
+					numFound++;
+				}else{
+					return false;
+				}
+			}
+		}
+		return numValues==numFound && numValues==size;
+	}
+	
+	/**
+	 * Re-places every value in the probe run that follows a freshly emptied cell,
+	 * wrapping around the array, and stops at the first empty cell.
+	 * @param initial Index of the cell that was just emptied.
+	 */
+	private void rehashFrom(int initial){
+		if(size<1){return;}
+		final int limit=array.length;
+		for(int cell=initial+1; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==invalid){return;}
+			rehashCell(cell);
+		}
+		for(int cell=0; cell<initial; cell++){
+			final long x=array[cell];
+			if(x==invalid){return;}
+			rehashCell(cell);
+		}
+	}
+	
+	/**
+	 * Moves the value in this cell to the cell a fresh probe selects for it.
+	 * @param cell An occupied cell.
+	 * @return true if the value moved.
+	 */
+	private boolean rehashCell(final int cell){
+		final long value=array[cell];
+		assert(value!=invalid);
+		if(value==invalid){resetInvalid();}
+		final int dest=findCellOrEmpty(value);
+		if(cell==dest){return false;}
+		assert(array[dest]==invalid);
+		array[cell]=invalid;
+		array[dest]=value;
+		return true;
+	}
+	
+	/** Picks a new negative sentinel that is not a stored value and relabels all empty cells. */
+	private void resetInvalid(){
+		final long old=invalid;
+		long x=invalid;
+		while(x==old || contains(x)){x=randy.nextLong()|MINMASK;}
+		assert(x<0);
+		invalid=x;
+		for(int i=0; i<array.length; i++){
+			if(array[i]==old){array[i]=invalid;}
+		}
+	}
+	
+	/**
+	 * @param value Value to locate.
+	 * @return Index of the cell holding the value, or -1 if an empty cell is
+	 * reached first or the whole array has been scanned.
+	 */
+	private int findCell(final long value){
+		if(value==invalid){return -1;}
+		
+		final int limit=array.length, initial=(int)((value&MASK)%modulus);
+		for(int cell=initial; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==value){return cell;}
+			if(x==invalid){return -1;}
+		}
+		for(int cell=0; cell<initial; cell++){
+			final long x=array[cell];
+			if(x==value){return cell;}
+			if(x==invalid){return -1;}
+		}
+		return -1;
+	}
+	
+	/**
+	 * @param value Value to locate; must differ from the sentinel.
+	 * @return Index of the first cell on the probe path that holds the value or is empty.
+	 * @throws RuntimeException if the array has no such cell.
+	 */
+	private int findCellOrEmpty(final long value){
+		assert(value!=invalid) : "Collision - this should have been intercepted.";
+		
+		final int limit=array.length, initial=(int)((value&MASK)%modulus);
+		for(int cell=initial; cell<limit; cell++){
+			final long x=array[cell];
+			if(x==value || x==invalid){return cell;}
+		}
+		for(int cell=0; cell<initial; cell++){
+			final long x=array[cell];
+			if(x==value || x==invalid){return cell;}
+		}
+		throw new RuntimeException("No empty cells - size="+size+", limit="+limit);
+	}
+	
+	/**
+	 * Discards all contents and reallocates the table at the requested size,
+	 * which may be smaller than the current one.
+	 * @param newSize Requested modulus; must be positive.
+	 */
+	public final void resizeDestructive(int newSize){
+		size=0;
+		sizeLimit=0;
+		//Without this, shrinking would violate the growth assertion in resize(long).
+		modulus=0;
+		array=null;
+		resize(newSize);
+	}
+	
+	/** Grows the table to roughly twice its current array length. */
+	private final void resize(){
+		assert(size>=sizeLimit);
+		resize(array.length*2L+1);
+	}
+	
+	/**
+	 * Allocates a new backing array and reinserts any existing values.
+	 * @param size2 Requested modulus; made odd and capped so that
+	 * {@code modulus+extra} fits in an int.
+	 */
+	private final void resize(final long size2){
+		assert(size2>size) : size+", "+size2;
+		
+		//This is supposed to be a prime but the primes code is ripped out in this version.
+		//Any odd number is fine in most cases.
+		long newPrime=size2|1;
+		if(newPrime+extra>Integer.MAX_VALUE){
+			newPrime=(Integer.MAX_VALUE-extra-2)|1;
+		}
+		assert(newPrime>modulus) : "Overflow: "+size+", "+size2+", "+modulus+", "+newPrime;
+		modulus=(int)newPrime;
+		
+		final int size3=(int)(newPrime+extra);
+		sizeLimit=(int)(modulus*loadFactor);
+		final long[] old=array;
+		array=new long[size3];
+		Arrays.fill(array, invalid);
+		
+//		System.err.println("Resizing "+(old==null ? "null" : ""+old.length)+" to "+size3);
+		
+		if(size<1){return;}
+		
+		//add() recounts the values as they are reinserted.
+		size=0;
+		for(long value : old){
+			if(value!=invalid){
+				add(value);
+			}
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------      Stuff From BBTools      ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** @return The middle of the three values. */
+	public static final float mid(float x, float y, float z){
+		return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);
+	}
+	public static final float min(float x, float y){return x<y ? x : y;}
+	public static final float max(float x, float y){return x>y ? x : y;}
+	
+	/** Number of values that can be held without resizing */
+	public int capacity(){
+		return sizeLimit;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Backing store; cells equal to {@code invalid} are empty */
+	private long[] array;
+	/** Number of values currently stored */
+	private int size=0;
+	/** Value for empty cells */
+	private long invalid;
+	/** Divisor used to pick a value's home cell */
+	private int modulus;
+	/** The table grows once size exceeds this */
+	private int sizeLimit;
+	private final float loadFactor;
+	
+	/** Source of sentinel values; fixed seed */
+	private static final Random randy=new Random(1);
+	/** Clears the sign bit before taking the modulus */
+	private static final long MASK=Long.MAX_VALUE;
+	/** Sign bit; forces sentinels to be negative */
+	private static final long MINMASK=Long.MIN_VALUE;
+	
+	/** Cells allocated beyond the modulus */
+	private static final int extra=10;
+	
+}
=====================================
current/bbmin/LongList.java
=====================================
@@ -0,0 +1,405 @@
+package bbmin;
+
+import java.util.Arrays;
+
+/**
+ * A growable list of primitive {@code long} values with simple statistics.<br>
+ * {@link #get(int)} returns 0 for any index at or beyond {@code size}.<br>
+ * {@link #clear()} only resets {@code size}; the backing array is not zeroed,
+ * so a later {@link #increment(int, long)} adds to whatever is still stored
+ * at that index.<br>
+ * Several methods ({@link #median()}, {@link #mode()}, {@link #condense()},
+ * {@link #percentileIndex(double)}) expect the list to be sorted ascending.
+ */
+public final class LongList{
+	
+	/** Creates a list with an initial capacity of 256. */
+	public LongList(){this(256);}
+	
+	/** @param initial Initial capacity; must be positive. */
+	public LongList(int initial){
+		assert(initial>0);
+		array=allocLong1D(initial);
+	}
+	
+	/** Sets size to 0 without touching the backing array. */
+	public void clear(){
+		size=0;
+	}
+	
+	/**
+	 * Stores a value at this index, growing the array and size if needed.
+	 * @param loc Index to write.
+	 * @param value Value to store.
+	 */
+	public final void set(int loc, long value){
+		if(loc>=array.length){
+			resize(loc*2L+1);
+		}
+		array[loc]=value;
+		size=max(size, loc+1);
+	}
+	
+	/** Overwrites the final element; the list must not be empty. */
+	public final void setLast(long value){
+		assert(size>0);
+		array[size-1]=value;
+	}
+	
+	/**
+	 * Adds an amount to the value at this index, growing the array and size if needed.
+	 * @param loc Index to modify.
+	 * @param value Amount to add.
+	 */
+	public final void increment(int loc, long value){
+		if(loc>=array.length){
+			resize(loc*2L+1);
+		}
+		array[loc]+=value;
+		size=max(size, loc+1);
+	}
+	
+	/** Adds 1 to the value at this index. */
+	public final void increment(int loc){
+		increment(loc, 1);
+	}
+	
+	/** Elementwise addition of b into this list. */
+	public final void incrementBy(LongList b){
+		//Highest index first, so any growth happens once.
+		for(int i=b.size-1; i>=0; i--){
+			increment(i, b.get(i));
+		}
+	}
+	
+	/** Elementwise addition of b into this list. */
+	public final void incrementBy(long[] b){
+		//Highest index first, so any growth happens once.
+		for(int i=b.length-1; i>=0; i--){
+			increment(i, b[i]);
+		}
+	}
+	
+	/** Appends every element of b to the end of this list. */
+	public final void append(LongList b){
+		for(int i=0; i<b.size; i++){
+			add(b.get(i));
+		}
+	}
+	
+	/** Appends every element of b to the end of this list. */
+	public final void append(long[] b){
+		for(int i=0; i<b.length; i++){
+			add(b[i]);
+		}
+	}
+	
+	/** @return The value at this index, or 0 if the index is at or beyond size. */
+	public final long get(int loc){
+		return(loc>=size ? 0 : array[loc]);
+	}
+	
+	/** Appends a value, growing the array if it is full. */
+	public final void add(long x){
+		if(size>=array.length){
+			resize(size*2L+1);
+		}
+		array[size]=x;
+		size++;
+	}
+	
+	/** Grows the backing array to size2, capped at MAX_ARRAY_LEN. */
+	private final void resize(final long size2){
+		assert(size2>size) : size+", "+size2;
+		final int size3=(int)min(MAX_ARRAY_LEN, size2);
+		assert(size3>size) : "Overflow: "+size+", "+size2+" -> "+size3;
+		array=copyOf(array, size3);
+	}
+	
+	/** Trims the backing array to exactly size elements. */
+	public final void shrink(){
+		if(size==array.length){return;}
+		array=copyOf(array, size);
+	}
+	
+	/** @return Standard deviation dividing by size (not size-1), or 0 for fewer than 2 elements. */
+	public final double stdev(){
+		if(size<2){return 0;}
+		double sum=sum();
+		double avg=sum/size;
+		double sumdev2=0;
+		for(int i=0; i<size; i++){
+			long x=array[i];
+			double dev=avg-x;
+			sumdev2+=(dev*dev);
+		}
+		return Math.sqrt(sumdev2/size);
+	}
+	
+	/** @return Mean absolute difference between x and the elements. */
+	public final double avgDif(final double x){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=absdif(x, array[i]);
+		}
+		return sum/(max(1, size));
+	}
+	
+	/** @return Root-mean-square difference between x and the elements. */
+	public final double rmsDif(final double x){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			double dif=absdif(x, array[i]);
+			sum+=dif*dif;
+		}
+		return Math.sqrt(sum/(max(1, size)));
+	}
+	
+	/** @return Sum of the elements, accumulated as a long. */
+	public final long sumLong(){
+		long sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+		}
+		return sum;
+	}
+	
+	/** @return Sum of the elements, accumulated as a double. */
+	public final double sum(){
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+		}
+		return sum;
+	}
+	
+	/** @return Arithmetic mean, or 0 for an empty list. */
+	public final double mean(){
+		return size<1 ? 0 : sum()/size;
+	}
+	
+	//Ignores elements below 1
+	public final double harmonicMean(){
+		double sum=0;
+		int count=0;
+		for(int i=0; i<size; i++){
+			if(array[i]>0){
+				sum+=1.0/array[i];
+				count++;
+			}
+		}
+		double avg=sum/max(1, count);
+		return 1.0/avg;
+	}
+	
+	//Ignores elements below 1
+	public final double geometricMean(){
+		double sum=0;
+		int count=0;
+		for(int i=0; i<size; i++){
+			if(array[i]>0){
+				sum+=Math.log(array[i]);
+				count++;
+			}
+		}
+		double avg=sum/max(1, count);
+		return Math.exp(avg);
+	}
+	
+	/**
+	 * Assumes list is sorted.
+	 * Weighted average in which the weight grows by 1 per step from either end
+	 * toward the middle; the two elements of each outer pair share a weight and
+	 * the middle element of an odd-length list gets the largest weight.
+	 * @return The weighted average, or 0 for an empty list.
+	 */
+	public final double medianWeightedAverage(){
+		if(size<1){return 0;}
+		int half=size/2;
+		long count=0;
+		double sum=0;
+		for(int i=0, j=size-1; i<half; i++, j--){
+			int mult=i+1;
+			double incr=(array[i]+array[j])*mult;
+			sum+=incr;
+			count+=2*mult;//two elements carry this weight
+		}
+		if((size&1)==1){//odd length
+			int mult=half+1;
+			double incr=(array[half])*mult;
+			sum+=incr;
+			count+=mult;//only one middle element
+		}
+		return sum/count;
+	}
+	
+	/**
+	 * Assumes list is sorted.
+	 * @return The element at percentileIndex(0.5), i.e. where the running sum
+	 * first reaches half of the total; 0 for an empty list.
+	 */
+	public final long median(){
+		if(size<1){return 0;}
+		int idx=percentileIndex(0.5);
+		return array[idx];
+	}
+	
+	/** Allows unsorted list */
+	public final long min(){
+		if(size<1){return 0;}
+		long x=array[0];
+		for(int i=1; i<size; i++){
+			x=min(x, array[i]);
+		}
+		return x;
+	}
+	
+	/** Allows unsorted list */
+	public final long max(){
+		if(size<1){return 0;}
+		long x=array[0];
+		for(int i=1; i<size; i++){
+			x=max(x, array[i]);
+		}
+		return x;
+	}
+	
+	/**
+	 * Assumes list is sorted.
+	 * @return The value with the longest run; the earliest such value on a tie;
+	 * 0 for an empty list.
+	 */
+	public final long mode(){
+		if(size<1){return 0;}
+		assert(sorted());
+		int streak=1, bestStreak=0;
+		long prev=array[0];
+		long best=prev;
+		//Element 0 is already counted by streak=1, so start at 1.
+		for(int i=1; i<size; i++){
+			long x=array[i];
+			if(x==prev){streak++;}
+			else{
+				if(streak>bestStreak){
+					bestStreak=streak;
+					best=prev;
+				}
+				streak=1;
+				prev=x;
+			}
+		}
+		if(streak>bestStreak){
+			bestStreak=streak;
+			best=prev;
+		}
+		return best;
+	}
+	
+	/** @return The element at percentileIndex(fraction), or 0 for an empty list. */
+	public long percentile(double fraction){
+		if(size<1){return 0;}
+		int idx=percentileIndex(fraction);
+		return array[idx];
+	}
+	
+	/**
+	 * Assumes list is sorted.
+	 * @param fraction Fraction of the total sum to reach.
+	 * @return The first index at which the running sum reaches
+	 * fraction times the total sum; size-1 if the list has fewer than 2 elements
+	 * (so -1 when empty) or the target is never reached.
+	 */
+	public int percentileIndex(double fraction){
+		if(size<2){return size-1;}
+		assert(sorted());
+		double target=(sum()*fraction);
+		double sum=0;
+		for(int i=0; i<size; i++){
+			sum+=array[i];
+			if(sum>=target){
+				return i;
+			}
+		}
+		return size-1;
+	}
+	
+	/** Removes duplicates from a sorted list, then trims the backing array. */
+	public final void shrinkToUnique(){
+		condense();
+		shrink();
+	}
+	
+	//In-place.
+	//Assumes sorted.
+	public final void condense(){
+		if(size<=1){return;}
+		
+		int i=0, j=1;
+		for(; j<size && array[i]<array[j]; i++, j++){}//skip while strictly ascending
+		
+		int dupes=0;
+		for(; j<size; j++){//This only enters at the first nonascending pair
+			long a=array[i], b=array[j];
+			assert(a<=b) : "Unsorted: "+i+", "+j+", "+a+", "+b;
+			if(b>a){
+				i++;
+				array[i]=b;
+			}else{
+				//do nothing
+				dupes++;
+				assert(a==b);
+			}
+		}
+		assert(dupes==(size-(i+1)));
+		assert(size>=(i+1));
+		size=i+1;
+	}
+	
+	@Override
+	public String toString(){
+		return toStringListView();
+	}
+	
+	/** @return The nonzero elements formatted as "(index, value)" pairs. */
+	public String toStringSetView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<size; i++){
+			if(array[i]!=0){
+				sb.append(comma+"("+i+", "+array[i]+")");
+				comma=", ";
+			}
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/** @return All elements as a bracketed, comma-delimited list. */
+	public String toStringListView(){
+		StringBuilder sb=new StringBuilder();
+		sb.append('[');
+		String comma="";
+		for(int i=0; i<size; i++){
+			sb.append(comma+array[i]);
+			comma=", ";
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+	
+	/** @return A new array of length size holding a copy of the elements. */
+	public long[] toArray(){
+		long[] x=allocLong1D(size);
+		for(int i=0; i<x.length; i++){
+			x[i]=array[i];
+		}
+		return x;
+	}
+	
+	/** Sorts the elements ascending. */
+	public void sort() {
+		if(size>1){Arrays.sort(array, 0, size);}
+	}
+	
+	/** Same as sort() in this version. */
+	public void sortSerial() {
+		if(size>1){Arrays.sort(array, 0, size);}
+	}
+	
+	/** Reverses the order of the elements in place. */
+	public void reverse() {
+		if(size>1){reverseInPlace(array, 0, size);}
+	}
+	
+	/** @return true if the elements are in nondescending order. */
+	public boolean sorted(){
+		for(int i=1; i<size; i++){
+			if(array[i]<array[i-1]){return false;}
+		}
+		return true;
+	}
+	
+	public int size() {
+		return size;
+	}
+	
+	/** @return Length of the backing array. */
+	public int capacity() {
+		return array.length;
+	}
+	
+	/** @return Number of elements that can be added before the array must grow. */
+	public int freeSpace() {
+		return array.length-size;
+	}
+	
+	/**
+	 * Reverses array[from] through array[to-1] in place.
+	 * @param array Array to modify; null is ignored.
+	 * @param from First index, inclusive.
+	 * @param to Last index, exclusive.
+	 */
+	private static void reverseInPlace(final long[] array, final int from, final int to){
+		if(array==null){return;}
+		//Walk inward from both ends; correct for any value of from.
+		for(int i=from, j=to-1; i<j; i++, j--){
+			long temp=array[i];
+			array[i]=array[j];
+			array[j]=temp;
+		}
+	}
+	
+	private static final long min(long x, long y){return x<y ? x : y;}
+	private static final long max(long x, long y){return x>y ? x : y;}
+	
+	private static final int min(int x, int y){return x<y ? x : y;}
+	private static final int max(int x, int y){return x>y ? x : y;}
+	
+	private static double absdif(double a, double b) {return a>b ? a-b : b-a;}
+	
+	private static final long[] allocLong1D(int x){return new long[x];}
+	
+	/**
+	 * Copies the buffer into a new array of the requested length, capped at MAX_ARRAY_LEN.
+	 * @throws RuntimeException if the request exceeds the cap and the capped
+	 * length would not be larger than the current buffer.
+	 */
+	private static long[] copyOf(long[] buffer, long newLength) {
+		final int len=buffer.length;
+		final int len2=(int)min(newLength, MAX_ARRAY_LEN);
+		if(newLength>len2 && len2<=len){
+			throw new RuntimeException("Tried to create an array above length limit: "+len+"," +newLength);
+		}
+		return Arrays.copyOf(buffer, len2);
+	}
+	
+	/** Largest array length this class will allocate */
+	private static final long MAX_ARRAY_LEN=Integer.MAX_VALUE-20;
+	
+	/** Backing store; only the first size entries are part of the list */
+	public long[] array;
+	/** Highest occupied index plus 1, i.e., lowest unoccupied index */
+	public int size=0;
+	
+}
=====================================
current/bbmin/Minimizer.java
=====================================
@@ -0,0 +1,159 @@
+package bbmin;
+
+import java.util.Arrays;
+
+/**
+ * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
+ * The resulting array is guaranteed to contain the minimal hash code<br>
+ * for every window, with no duplicates.
+ * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
+ *
+ * @author Brian Bushnell
+ * @date October 8, 2021
+ *
+ */
+public class Minimizer {
+
+ public static void main(String[] args){
+ int k=4, w=7;
+ String seq="ACGTCTGAGCCTTGACACATGACT";
+ try {
+ k=Integer.parseInt(args[0]);
+ w=Integer.parseInt(args[1]);
+ seq=args[2];
+ } catch (NumberFormatException e) {
+ //e.printStackTrace();
+ System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
+ + "E.G.\n"
+ + "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
+ System.exit(1);
+ }
+ Minimizer minnow=new Minimizer(k, w);
+ long[] array=minnow.minimize(seq.getBytes());
+ System.err.println(Arrays.toString(array));
+ }
+
+ public Minimizer(int k_, int window_){this(k_, window_, 2);}
+ public Minimizer(int k_, int window_, int bitsPerSymbol_){
+ k=k_;
+ window=window_;
+ bitsPerSymbol=bitsPerSymbol_;
+ shift=bitsPerSymbol*k;
+ shift2=shift-bitsPerSymbol;
+ mask=(shift>63 ? -1L : ~((-1L)<<shift));
+ }
+
+ public long[] minimize(String str){
+ return minimize(str.getBytes());
+ }
+
+ public long[] minimize(byte[] bases){
+ return minimize(bases, new LongList(16), new LongHashSet(16));
+ }
+
+ /** This method is typically faster since you don't need to construct a new set each time. */
+ public long[] minimize(byte[] bases, LongList list, LongHashSet set){
+ list.clear();
+ //If the set is way too big, resize it
+ if(set.capacity()*(long)window>100L+16L*bases.length){
+ set.resizeDestructive(16);
+ }else{
+ set.clear();
+ }
+
+ long kmersProcessed=0;
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ long bestCode=Long.MAX_VALUE;
+ int bestPosition=-1;
+ long bestKmer=-1;
+ long bestRkmer=-1;
+ int currentWindow=0;
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
+ if(x<0){
+ len=0;
+ rkmer=0;
+ }else{
+ len++;
+ }
+
+ if(len>=k){
+ kmersProcessed++;
+ currentWindow++;
+
+ final long hashcode=hash(kmer, rkmer);
+ System.err.println("i="+i+", code="+hashcode);
+
+ //Track the best code in the window and its state
+ if(hashcode>=minCode && hashcode<=bestCode){
+ bestCode=hashcode;
+ bestPosition=i;
+ bestKmer=kmer;
+ bestRkmer=rkmer;
+ }
+
+ //Once the window size is met, store the best code,
+ //and backtrack to its position to start the next window
+ if(currentWindow>=window && bestPosition>=0){
+ if(!set.contains(bestCode)){
+ set.add(bestCode);
+ list.add(bestCode);
+ }
+ i=bestPosition;
+ kmer=bestKmer;
+ rkmer=bestRkmer;
+ len=k;
+
+ bestCode=Long.MAX_VALUE;
+ bestPosition=-1;
+ currentWindow=0;
+ }
+ }
+ }
+ list.sort();//optional
+ return list.toArray();
+ }
+
+ public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
+ public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
+ public static long hash(long key) {
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = key ^ (key >>> 24);
+ key = (key + (key << 3)) + (key << 8); // key * 265
+ key = key ^ (key >>> 14);
+ key = (key + (key << 2)) + (key << 4); // key * 21
+ key = key ^ (key >>> 28);
+ key = key + (key << 31);
+ return key;
+ }
+ private static final long max(long x, long y){return x>y ? x : y;}
+
+ public final int k;
+ public final int window;
+ public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
+ private final int shift;
+ private final int shift2;
+ private final long mask;
+ private final long minCode=0;
+
+ static final byte[] baseToNumber = new byte[128];
+ static final byte[] baseToComplementNumber = new byte[128];
+
+ static {
+ Arrays.fill(baseToNumber, (byte)-1);
+ Arrays.fill(baseToComplementNumber, (byte)-1);
+ baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+ baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
+ baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
+ baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+ }
+}
=====================================
current/dna/AminoAcid.java
=====================================
@@ -94,7 +94,7 @@ public final class AminoAcid {
}
public String canonicalCodon(){
- return codeStrings[0];
+ return codeStrings==null || codeStrings.length<1 ? null : codeStrings[0];
}
@@ -275,6 +275,10 @@ public final class AminoAcid {
public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG");
public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC");
public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG");
+
+ public static final AminoAcid Selenocysteine=new AminoAcid("Selenocysteine, Sec, U"); //UGA sometimes
+ public static final AminoAcid Pyrrolysine=new AminoAcid("Pyrrolysine, Pyl, O");
+
public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG");
public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX");
@@ -849,7 +853,9 @@ public final class AminoAcid {
AlphabeticalAAs[18]=Tyrosine;
AlphabeticalAAs[19]=Valine;
AlphabeticalAAs[20]=END;
-// AlphabeticalAAs[21]=ANY;
+// AlphabeticalAAs[21]=Selenocysteine;
+// AlphabeticalAAs[22]=Pyrrolysine;
+// AlphabeticalAAs[23]=ANY;
Arrays.fill(aminoToCode, (byte)-1);
Arrays.fill(acidToNumber, (byte)-1);
@@ -896,11 +902,13 @@ public final class AminoAcid {
{
byte anySym=(byte)(Tools.max(acidToNumberExtended)+1);
byte dash=(byte)(anySym+1);
- acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym;
+ acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym; //Unknown
acidToNumberExtended['b']=acidToNumberExtended['B']=anySym;
acidToNumberExtended['z']=acidToNumberExtended['Z']=anySym;
acidToNumberExtended['j']=acidToNumberExtended['J']=anySym;
- acidToNumberExtended['-']=dash;
+ acidToNumberExtended['u']=acidToNumberExtended['U']=anySym; //Selenocysteine
+ acidToNumberExtended['o']=acidToNumberExtended['O']=anySym; //Pyrrolysine
+ acidToNumberExtended['-']=dash; //Deletion
}
acidToNumber8['H']=acidToNumber8['K']=acidToNumber8['R']=0;
@@ -913,7 +921,8 @@ public final class AminoAcid {
acidToNumber8['B']=acidToNumber8['Z']=7;
aminoToCode['X']=aminoToCode['x']=aminoToCode['B']=aminoToCode['b']=
- aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=65;
+ aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=
+ aminoToCode['O']=aminoToCode['o']=aminoToCode['U']=aminoToCode['u']=65;
codeToAA[65]=ANY;
codeToChar[65]='X';
codeToByte[65]='X';
=====================================
current/dna/Data.java
=====================================
@@ -1236,7 +1236,7 @@ public class Data {
}
}
if(!f.exists() && !path.startsWith("jar:")){
- String hardlink="/global/projectb/sandbox/gaag/bbtools/resources/"+fname;
+ String hardlink="/global/cfs/cdirs/bbtools/resources/"+fname;
f=new File(hardlink);
if(f.exists()){path=hardlink;}
else{if(vb){System.err.println("Did not find "+fname+" at "+hardlink);}}
=====================================
current/jgi/BBQC.java
=====================================
@@ -1080,9 +1080,9 @@ public class BBQC {
private String pjetRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/pJET1.2.fasta";
private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa";
- private String fragAdapters = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa";
- private String rnaAdapter = "/global/projectb/sandbox/gaag/bbtools/data/truseq_rna.fa.gz";
- private String indexPath = "/global/projectb/sandbox/gaag/bbtools/hg19/";
+ private String fragAdapters = "/global/cfs/cdirs/bbtools/data/adapters.fa";
+ private String rnaAdapter = "/global/cfs/cdirs/bbtools/data/truseq_rna.fa.gz";
+ private String indexPath = "/global/cfs/cdirs/bbtools/hg19/";
private String mapRef = null;
/*--------------------------------------------------------------*/
=====================================
current/jgi/FungalRelease.java
=====================================
@@ -7,6 +7,7 @@ import fileIO.ByteFile;
import fileIO.ByteStreamWriter;
import fileIO.FileFormat;
import fileIO.ReadWrite;
+import shared.KillSwitch;
import shared.MetadataWriter;
import shared.Parse;
import shared.Parser;
@@ -401,7 +402,12 @@ public class FungalRelease {
* discarded.
*/
boolean processRead(final Read r1) {
- assert (!banIupac || !r1.containsNonACGTN()) : "Non-ACGTN base found in scaffold " + r1.id;
+ //assert (!banIupac || !r1.containsNonACGTN()) : "Non-ACGTN base found in scaffold " + r1.id;
+ if(banIupac){
+ if(r1.containsNonACGTN()){
+ KillSwitch.exceptionKill(new RuntimeException("Non-ACGTN base found in scaffold " + r1.id));
+ }
+ }
r1.inflateGaps(minGapIn, minGapOut);
return r1.length() >= minScaf;
}
=====================================
current/jgi/RQCFilter2.java
=====================================
@@ -196,7 +196,7 @@ public class RQCFilter2 {
catFlag=Parse.parseBoolean(b);
}else if(a.equals("removemouse") || a.equals("mouse")){
mouseFlag=Parse.parseBoolean(b);
- }else if(a.equals("catdoghumanmouse") || a.equals("mousecatdoghuman") || a.equals("catdogmousehuman")){
+ }else if(a.equals("catdoghumanmouse") || a.equals("mousecatdoghuman") || a.equals("")){
mouseCatDogHumanFlag=Parse.parseBoolean(b);
}else if(a.equals("keephumanreads") || a.equals("keephuman")){
keepHumanReads=Parse.parseBoolean(b);
@@ -4062,7 +4062,7 @@ public class RQCFilter2 {
return s;
}
- private String RQCFilterData="/global/projectb/sandbox/gaag/bbtools/RQCFilterData_Local";
+ private String RQCFilterData="/global/cfs/cdirs/bbtools/RQCFilterData_Local";
private String shortArtifactFile = "RQCFILTER_PATH/short.fa.gz";
private String mainArtifactFile = "RQCFILTER_PATH/Illumina.artifacts.fa.gz";
=====================================
current/shared/Shared.java
=====================================
@@ -23,6 +23,7 @@ public class Shared {
public static boolean ENV=(System.getenv()!=null);
+ private static String HOSTNAME;
public static boolean WINDOWS=envContainsPair("OS", "Win", true);
public static boolean MAC=envContainsPair("OS", "Mac", true);
//https://stackoverflow.com/questions/14288185/detecting-windows-or-linux
@@ -33,8 +34,8 @@ public class Shared {
public static boolean CORI=envContainsPair("NERSC_HOST", "cori", false);
public static boolean NERSC=envContainsKey("NERSC_HOST");
public static boolean AWS=envContainsKey("EC2_HOME");
+ public static boolean IGBVM="taxonomy-vm".equals(HOSTNAME()) || "taxonomy-vm-2".equals(HOSTNAME());
public static boolean AMD64="amd64".equalsIgnoreCase(System.getProperty("os.arch"));
- private static String HOSTNAME;
public static void setTaxServer(String path){
taxServerNersc=taxServerAws=path;
@@ -124,8 +125,8 @@ public class Shared {
public static final int GAPCOST=Tools.max(1, GAPLEN/64);
public static final byte GAPC='-';
- public static String BBMAP_VERSION_STRING="38.94";
- public static String BBMAP_VERSION_NAME="Potato Parser";
+ public static String BBMAP_VERSION_STRING="38.95";
+ public static String BBMAP_VERSION_NAME="CFS Migration";
public static boolean TRIM_READ_COMMENTS=false;
public static boolean TRIM_RNAME=false; //For mapped sam reads
=====================================
current/sketch/SketchObject.java
=====================================
@@ -1056,16 +1056,27 @@ public class SketchObject {
return list;
}
- private static final String IMG_PATH="/global/projectb/sandbox/gaag/bbtools/img/current/img#.sketch";
- private static final String NT_PATH="/global/projectb/sandbox/gaag/bbtools/nt/current/taxa#.sketch";
- private static final String NR_PATH="/global/projectb/sandbox/gaag/bbtools/nr/current/taxa#.sketch";
- private static final String REFSEQ_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/taxa#.sketch";
- private static final String REFSEQ_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/big#.sketch";
- private static final String SILVA_PATH="/global/projectb/sandbox/gaag/bbtools/silva/latest/both_taxa#.sketch";
- private static final String PROKPROT_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/taxa#.sketch";
- private static final String PROKPROT_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/big#.sketch";
- private static final String MITO_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/taxa#.sketch";
- private static final String FUNGI_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/fungi#.sketch";
+ private static final String IMG_PATH="/global/cfs/cdirs/bbtools/img/current/img#.sketch";
+ private static final String NT_PATH="/global/cfs/cdirs/bbtools/nt/current/taxa#.sketch";
+ private static final String NR_PATH="/global/cfs/cdirs/bbtools/nr/current/taxa#.sketch";
+ private static final String REFSEQ_PATH="/global/cfs/cdirs/bbtools/refseq/current/taxa#.sketch";
+ private static final String REFSEQ_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/big#.sketch";
+ private static final String SILVA_PATH="/global/cfs/cdirs/bbtools/silva/latest/both_taxa#.sketch";
+ private static final String PROKPROT_PATH="/global/cfs/cdirs/bbtools/refseq/current/prot/taxa#.sketch";
+ private static final String PROKPROT_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/prot/big#.sketch";
+ private static final String MITO_PATH="/global/cfs/cdirs/bbtools/mito2/taxa#.sketch";
+ private static final String FUNGI_PATH="/global/cfs/cdirs/bbtools/mito2/fungi#.sketch";
+
+ private static final String IMG_PATH_IGBVM="/data/sketch/img/current/img#.sketch";
+ private static final String NT_PATH_IGBVM="/data/sketch/nt/current/taxa#.sketch";
+ private static final String NR_PATH_IGBVM="/data/sketch/nr/current/taxa#.sketch";
+ private static final String REFSEQ_PATH_IGBVM="/data/sketch/refseq/current/taxa#.sketch";
+ private static final String REFSEQ_PATH_BIG_IGBVM="/data/sketch/refseq/current/big#.sketch";
+ private static final String SILVA_PATH_IGBVM="/data/sketch/silva/current/both_taxa#.sketch";
+ private static final String PROKPROT_PATH_IGBVM="/data/sketch/refseq/current/prot/taxa#.sketch";
+ private static final String PROKPROT_PATH_BIG_IGBVM="/data/sketch/refseq/current/prot/big#.sketch";
+ private static final String MITO_PATH_IGBVM="/data/sketch/mito2/taxa#.sketch";
+ private static final String FUNGI_PATH_IGBVM="/data/sketch/mito2/fungi#.sketch";
private static final String IMG_PATH_AWS=null;
private static final String NT_PATH_AWS="/test1/sketch/latest/nt/taxa#.sketch";
@@ -1076,16 +1087,16 @@ public class SketchObject {
private static final String MITO_PATH_AWS=null;
private static final String FUNGI_PATH_AWS=null;
- public static final String IMG_PATH(){return Shared.AWS ? IMG_PATH_AWS : IMG_PATH;}
- public static final String NT_PATH(){return Shared.AWS ? NT_PATH_AWS : NT_PATH;}
- public static final String NR_PATH(){return Shared.AWS ? NR_PATH_AWS : NR_PATH;}
- public static final String REFSEQ_PATH(){return Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;}
- public static final String REFSEQ_PATH_BIG(){return REFSEQ_PATH_BIG;}
- public static final String SILVA_PATH(){return Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;}
- public static final String PROKPROT_PATH(){return Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;}
- public static final String PROKPROT_PATH_BIG(){return PROKPROT_PATH_BIG;}
- public static final String MITO_PATH(){return Shared.AWS ? MITO_PATH_AWS : MITO_PATH;}
- public static final String FUNGI_PATH(){return Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;}
+ public static final String IMG_PATH(){return Shared.IGBVM ? IMG_PATH_IGBVM : Shared.AWS ? IMG_PATH_AWS : IMG_PATH;}
+ public static final String NT_PATH(){return Shared.IGBVM ? NT_PATH_IGBVM : Shared.AWS ? NT_PATH_AWS : NT_PATH;}
+ public static final String NR_PATH(){return Shared.IGBVM ? NR_PATH_IGBVM : Shared.AWS ? NR_PATH_AWS : NR_PATH;}
+ public static final String REFSEQ_PATH(){return Shared.IGBVM ? REFSEQ_PATH_IGBVM : Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;}
+ public static final String REFSEQ_PATH_BIG(){return Shared.IGBVM ? REFSEQ_PATH_BIG_IGBVM : REFSEQ_PATH_BIG;}
+ public static final String SILVA_PATH(){return Shared.IGBVM ? SILVA_PATH_IGBVM : Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;}
+ public static final String PROKPROT_PATH(){return Shared.IGBVM ? PROKPROT_PATH_IGBVM : Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;}
+ public static final String PROKPROT_PATH_BIG(){return Shared.IGBVM ? PROKPROT_PATH_BIG_IGBVM : PROKPROT_PATH_BIG;}
+ public static final String MITO_PATH(){return Shared.IGBVM ? MITO_PATH_IGBVM : Shared.AWS ? MITO_PATH_AWS : MITO_PATH;}
+ public static final String FUNGI_PATH(){return Shared.IGBVM ? FUNGI_PATH_IGBVM : Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;}
/*--------------------------------------------------------------*/
/*---------------- Getters ----------------*/
=====================================
current/tax/ImgRecord.java
=====================================
@@ -122,7 +122,7 @@ public class ImgRecord implements Serializable {
public static boolean storeName=true;
public static HashMap<Long, ImgRecord> imgMap;
-// public static final String DefaultDumpFile="/global/projectb/sandbox/gaag/bbtools/tax/imgTaxDump.txt.gz";
+// public static final String DefaultDumpFile="/global/cfs/cdirs/bbtools/tax/imgTaxDump.txt.gz";
public static final String DefaultDumpFile="/global/u1/i/img/adhocDumps/taxonDumpForBrian.txt";
}
=====================================
current/tax/TaxServer.java
=====================================
@@ -2275,7 +2275,7 @@ public class TaxServer {
private String sizeFile=null;
/** Location of sequence directory tree */
- private String basePath="/global/projectb/sandbox/gaag/bbtools/tree/";
+ private String basePath="/global/cfs/cdirs/bbtools/tree/";
/** Used for taxonomic tree traversal */
private final TaxTree tree;
=====================================
current/tax/TaxTree.java
=====================================
@@ -2479,13 +2479,16 @@ public class TaxTree implements Serializable{
/* For these fields, see the corresponding functions, below.
* They define the default paths to various data on NERSC. */
-
- private static final String defaultTaxPathNersc="/global/projectb/sandbox/gaag/bbtools/tax/latest";
+
+ private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest";
private static final String defaultTaxPathAws="/test1/tax/latest";
- private static final String default16SFileNersc="/global/projectb/sandbox/gaag/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
+ private static final String defaultTaxPathIGBVM="/data/tax/latest";
+ private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
- private static final String default18SFileNersc="/global/projectb/sandbox/gaag/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
+ private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
+ private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz";
+ private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
private static final String defaultImgFile="TAX_PATH/imgDump.txt";
private static final String defaultTableFile="TAX_PATH/gitable.int1d.gz";
@@ -2507,17 +2510,17 @@ public class TaxTree implements Serializable{
/** For setting TAX_PATH, the root to taxonomy files */
public static final String defaultTaxPath(){
- return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : defaultTaxPathNersc;
+ return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc;
}
/** 16S consensus sequences per TaxID */
public static final String default16SFile(){
- return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : default16SFileNersc;
+ return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc;
}
/** 18S consensus sequences per TaxID */
public static final String default18SFile(){
- return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : default18SFileNersc;
+ return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc;
}
/** Path to all taxonomy files, substituted in to make specific file paths */
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/fe78528ef3612e697050a49f9a33783057920aaf
--
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/fe78528ef3612e697050a49f9a33783057920aaf
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220116/86a0774c/attachment-0001.htm>
More information about the debian-med-commit mailing list