[med-svn] [Git][med-team/bbmap][upstream] New upstream version 38.97+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Wed Aug 3 12:54:28 BST 2022
Étienne Mollier pushed to branch upstream at Debian Med / bbmap
Commits:
1c662261 by Étienne Mollier at 2022-08-03T13:38:44+02:00
New upstream version 38.97+dfsg
- - - - -
30 changed files:
- README.md
- + Xcalcmem.sh
- bbduk.sh
- clumpify.sh
- + current/clump/Hasher.java
- current/clump/KmerSort.java
- current/clump/KmerSort1.java
- current/clump/KmerSort2.java
- current/clump/KmerSort3.java
- current/clump/KmerSplit.java
- current/icecream/IceCreamFinder.java
- current/jgi/BBDuk.java
- current/jgi/RQCFilter2.java
- current/jgi/RQCFilterStats.java
- current/shared/Parser.java
- current/shared/Shared.java
- current/stream/FASTQ.java
- current/tax/GiToTaxid.java
- + current/tax/GiToTaxidInt.java
- current/tax/TaxServer.java
- current/tax/TaxTree.java
- docs/changelog.txt
- removecatdogmousehuman.sh
- removehuman.sh
- removehuman2.sh
- removemicrobes.sh
- + resources/PacBioAdapter.fa
- resources/contents.txt
- resources/short.fa
- shred.sh
Changes:
=====================================
README.md
=====================================
@@ -3,4 +3,4 @@
# Language: Java, Bash
# Information about documentation is in /docs/readme.txt.
-# Version 38.96
+# Version 38.97
=====================================
Xcalcmem.sh
=====================================
@@ -0,0 +1,149 @@
+#!/bin/bash
+#calcmem
+
+#function usage(){
+# echo "CalcMem v1.01"
+# echo "Written by Brian Bushnell, Doug Jacobsen, Alex Copeland"
+# echo "Calculates available memory in megabytes"
+# echo "Last modified April 25, 2014"
+#}
+
+function parseXmx () {
+ local setxmx=0
+ local setxms=0
+
+ for arg in "$@"
+ do
+ if [[ "$arg" == -Xmx* ]]; then
+ z="$arg"
+ setxmx=1
+ elif [[ "$arg" == Xmx* ]]; then
+ z="-$arg"
+ setxmx=1
+ elif [[ "$arg" == -Xms* ]]; then
+ z2="$arg"
+ setxms=1
+ elif [[ "$arg" == Xms* ]]; then
+ z2="-$arg"
+ setxms=1
+ elif [[ "$arg" == -da ]] || [[ "$arg" == -ea ]]; then
+ EA="$arg"
+ fi
+ done
+
+ if [[ $setxmx == 1 ]] && [[ $setxms == 0 ]]; then
+ local substring=`echo $z| cut -d'x' -f 2`
+ z2="-Xms$substring"
+ setxms=1
+ elif [[ $setxmx == 0 ]] && [[ $setxms == 1 ]]; then
+ local substring=`echo $z2| cut -d's' -f 2`
+ z="-Xmx$substring"
+ setxmx=1
+ fi
+
+ set=$setxmx
+}
+
+
+RAM=0;
+
+function freeRam(){
+ #Memory is in kilobytes.
+ local defaultMem=3200000
+ if [ $# -gt 0 ]; then
+ defaultMem=$1;
+ case $defaultMem in
+ *g)
+ defaultMem=`echo $defaultMem| cut -d'g' -f 1`
+ defaultMem=$(( $defaultMem * $(( 1024 * 1024 )) ))
+ ;;
+ *m)
+ defaultMem=`echo $defaultMem| cut -d'm' -f 1`
+ defaultMem=$(( $defaultMem * 1024 ))
+ ;;
+ *k)
+ defaultMem=`echo $defaultMem| cut -d'k' -f 1`
+ ;;
+ esac
+ fi
+
+ local mult=84
+ if [ $# -gt 1 ]; then
+ mult=$2;
+ fi
+
+ #echo "mult = $mult"
+ #echo "default = $defaultMem"
+
+ local ulimit=$(ulimit -v)
+ local x=$ulimit
+
+ if [ -e /proc/meminfo ]; then
+ local vfree=$(cat /proc/meminfo | awk -F: 'BEGIN{total=-1;used=-1} /^CommitLimit:/ { total=$2 }; /^Committed_AS:/ { used=$2 } END{ print (total-used) }')
+ local pfree=$(cat /proc/meminfo | awk -F: 'BEGIN{free=-1;cached=-1;buffers=-1} /^MemFree:/ { free=$2 }; /^Cached:/ { cached=$2}; /^Buffers:/ { buffers=$2} END{ print (free+cached+buffers) }')
+
+ echo "vfree = $vfree"
+ echo "pfree = $pfree"
+ echo "ulimit = $ulimit"
+
+ local x2=0;
+
+ if [ $vfree -gt 0 ] && [ $pfree -gt 0 ]; then
+ if [ $vfree -gt $pfree ]; then x2=$pfree;
+ else x2=$vfree; fi
+ elif [ $vfree -gt 0 ]; then x2=$vfree;
+ elif [ $pfree -gt 0 ]; then x2=$pfree;
+ fi
+
+ echo $x
+ echo $x2
+ echo $vfree
+ echo $pfree
+
+ if [ "$x" = "unlimited" ] || (($x > $x2)); then x=$x2; fi
+
+ fi
+
+ #echo "x=$x"
+ local HOSTNAME=`hostname`
+ if [ $x -lt 1 ] || [[ $HOSTNAME == genepool* ]]; then
+ #echo "hello 2"
+ #echo $x
+ #echo "ram is unlimited"
+ RAM=$((defaultMem/1024))
+ echo "Max memory cannot be determined. Attempting to use $RAM MB." 1>&2
+ echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2
+ else
+ #echo "hello 1"
+ #echo $x
+
+ #if [ $x -ge 1000000000 ]; then
+ # echo "ram is 1000g+"
+ #elif [ $x -ge 500000000 ]; then
+ # echo "ram is 500g+"
+ #elif [ $x -ge 250000000 ]; then
+ # echo "ram is 250g+"
+ #elif [ $x -ge 144000000 ]; then
+ # echo "ram is 144g+"
+ #elif [ $x -ge 120000000 ]; then
+ # echo "ram is 120g+"
+ #elif [ $x -ge 40000000 ]; then
+ # echo "ram is 40g+"
+ #else
+ # echo "ram is under 40g"
+ #fi
+ #echo $x
+ RAM=$(( ((x-500000)*mult/100)/1024 ))
+ #echo $RAM
+ fi
+
+ echo $set
+ echo $z
+ echo $z2
+
+ #local z="-Xmx${RAM}m"
+ return 0
+}
+
+parseXmx "$@"
+freeRam "$@"
=====================================
bbduk.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified January 26, 2021
+Last modified July 28, 2022
Description: Compares reads to the kmers in a reference dataset, optionally
allowing an edit distance. Splits the reads into two outputs - those that
@@ -172,11 +172,14 @@ Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
All kmer processing modes are mutually exclusive.
Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.
-ktrim=f Trim reads to remove bases matching reference kmers.
+ktrim=f Trim reads to remove bases matching reference kmers, plus
+ all bases to the left or right.
Values:
f (don't trim),
r (trim to the right),
l (trim to the left)
+ktrimtips=0 Set this to a positive number to perform ktrim on both
+ ends, examining only the outermost X bases.
kmask= Replace bases matching ref kmers with another symbol.
Allows any non-whitespace character, and processes short
kmers on both ends if mink is set. 'kmask=lc' will
@@ -237,6 +240,8 @@ restrictleft=0 If positive, only look for kmer matches in the
leftmost X bases.
restrictright=0 If positive, only look for kmer matches in the
rightmost X bases.
+NOTE: restrictleft and restrictright are mutually exclusive. If trimming
+ both ends is desired, use ktrimtips.
mingc=0 Discard reads with GC content below this.
maxgc=1 Discard reads with GC content above this.
gcpairs=t Use average GC of paired reads.
=====================================
clumpify.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified October 30, 2019
+Last modified July 28, 2022
Description: Sorts sequences to put similar reads near each other.
Can be used for increased compression or error correction.
@@ -95,6 +95,9 @@ allduplicates=f Mark or remove all copies of duplicates, instead of
keeping the highest-quality copy.
addcount=f Append the number of copies to the read name.
Mutually exclusive with markduplicates or allduplicates.
+entryfilter=f This assists in removing exact duplicates, which saves
+ memory in libraries that split unevenly due to huge
+ numbers of duplicates. Enabled automatically as needed.
subs=2 (s) Maximum substitutions allowed between duplicates.
subrate=0.0 (dsr) If set, the number of substitutions allowed will be
max(subs, subrate*min(length1, length2)) for 2 sequences.
=====================================
current/clump/Hasher.java
=====================================
@@ -0,0 +1,75 @@
+package clump;
+
+import java.util.Random;
+
+import shared.Tools;
+import stream.Read;
+
+public class Hasher {
+
+ private static synchronized long[][] makeCodes2(int modes){
+ long[][] r=makeCodes(128, modes);
+
+ for(int i=0; i<26; i++){
+ char c=(char)('A'+i);
+ r[Tools.toLowerCase(c)]=r[c];
+ }
+ return r;
+ }
+
+ private static synchronized long[][] makeCodes(int symbols, int modes){
+ Random randy=new Random(1);
+ long[][] r=new long[symbols][modes];
+ for(int i=0; i<symbols; i++){
+ for(int j=0; j<modes; j++){
+ r[i][j]=randy.nextLong();
+ }
+ }
+ return r;
+ }
+
+ public static long hash(byte[] bases){
+ long code=bases.length;
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int mode=(int)(code&31);
+ assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'";
+ code=code^hashcodes[b][mode];
+ code=Long.rotateLeft(code, 1);
+ }
+ return code;
+ }
+
+ public static final long hash(Read r){
+ return hash(r.bases);
+ }
+
+ public static final long hashPair(Read r){
+ long a=hash(r);
+ if(r.mate==null){return a;}
+ long b=hash(r.mate);
+ return a^Long.rotateLeft(b, 1);
+ }
+
+ public static final boolean equalsPaired(Read a, Read b){
+ return equals(a, b) && equals(a.mate, b.mate);
+ }
+
+ public static final boolean equals(Read a, Read b){
+ if(a==b){return true;}
+ if(a==null || b==null){
+ assert(a!=null || b!=null);
+ return false;
+ }
+ if(a.length()!=b.length()){return false;}
+ if(a.length()==0){return true;}
+ byte[] ab=a.bases, bb=b.bases;
+ for(int i=0; i<ab.length; i++){
+ if(ab[i]!=bb[i]){return false;}
+ }
+ return true;
+ }
+
+ private static final long[][] hashcodes=makeCodes2(32);
+
+}
=====================================
current/clump/KmerSort.java
=====================================
@@ -3,6 +3,7 @@ package clump;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashMap;
import bloom.KCountArray;
import fileIO.ReadWrite;
@@ -66,7 +67,8 @@ public abstract class KmerSort {
String cpstring=""+(groups==1 ? clumpsProcessedThisPass : clumpsProcessedTotal);
String epstring=""+correctionsTotal;
- String dpstring=""+duplicatesTotal;
+ String efstring=""+(entryFiltered);
+ String dpstring=""+(duplicatesTotal + entryFiltered);
String rostring=""+readsOut;
String bostring=""+basesOut;
@@ -79,6 +81,7 @@ public abstract class KmerSort {
while(rpstring2.length()<12){rpstring2=" "+rpstring2;}
while(cpstring.length()<12){cpstring=" "+cpstring;}
while(epstring.length()<12){epstring=" "+epstring;}
+ while(efstring.length()<12){efstring=" "+efstring;}
while(dpstring.length()<12){dpstring=" "+dpstring;}
while(rostring.length()<12){rostring=" "+rostring;}
@@ -92,8 +95,11 @@ public abstract class KmerSort {
if(correct){
outstream.println("Errors Corrected: "+epstring);
}
- if(dedupe){
+ if(dedupe || entryfilter){
outstream.println("Duplicates Found: "+dpstring);
+ if(entryfilter && verbose && false){
+ outstream.println(" -Entry Filtered: "+efstring);
+ }
outstream.println("Reads Out: "+rostring);
outstream.println("Bases Out: "+bostring);
}
@@ -271,7 +277,7 @@ public abstract class KmerSort {
ArrayList<FetchThread1> alft=new ArrayList<FetchThread1>(threads);
for(int i=0; i<threads; i++){alft.add(new FetchThread1(i, cris, kc, unpair));}
- readsThisPass=memThisPass=0;
+ readsThisPass=memThisPass=entryFilteredThisPass=0;
if(verbose){outstream.println("Starting threads.");}
for(FetchThread1 ht : alft){ht.start();}
@@ -289,6 +295,7 @@ public abstract class KmerSort {
e.printStackTrace();
}
}
+ entryFilteredThisPass+=ht.entryFilteredT;
readsThisPass+=ht.readsProcessedT;
basesProcessed+=ht.basesProcessedT;
diskProcessed+=ht.diskProcessedT;
@@ -296,24 +303,28 @@ public abstract class KmerSort {
}
readsProcessed+=readsThisPass;
memProcessed+=memThisPass;
+ entryFiltered+=entryFilteredThisPass;
if(verbose){t.stop("Fetch time: ");}
if(verbose){System.err.println("Closing input stream.");}
errorState=ReadWrite.closeStream(cris)|errorState;
if(verbose){t.start("Combining thread output.");}
- assert(readsProcessed<=Integer.MAX_VALUE) :
+ long readsLeft=readsThisPass-entryFilteredThisPass;
+ long slotsLeft=cris.paired() && !unpair ? readsLeft/2 : readsLeft;
+ assert(slotsLeft<=Shared.MAX_ARRAY_LEN) :
"\nThe number of reads is greater than 2 billion, which is the limit for a single group. "
+ "\nPlease rerun and manually specify 'groups=7' or similar, "
+ "\nsuch that the number of reads per group is less than 2 billion.";
- ArrayList<Read> list=new ArrayList<Read>((int)readsThisPass);
+ ArrayList<Read> list=new ArrayList<Read>((int)(slotsLeft));
for(int i=0; i<threads; i++){
FetchThread1 ft=alft.set(i, null);
list.addAll(ft.storage);
}
if(verbose){t.stop("Combine time: ");}
- assert(list.size()==readsThisPass || (cris.paired() && list.size()*2==readsThisPass)) : list.size()+", "+readsThisPass+", "+cris.paired();
+ assert(list.size()==slotsLeft) : list.size()+", "+readsThisPass+", "+readsLeft+", "+slotsLeft+", "+cris.paired();
+ //assert(list.size()==readsLeft || (cris.paired() && list.size()*2==readsLeft)) : list.size()+", "+readsThisPass+", "+readsLeft+", "+cris.paired();
ecco=false;
return list;
}
@@ -330,6 +341,7 @@ public abstract class KmerSort {
kc=kc_;
storage=new ArrayList<Read>();
unpairT=unpair_;
+ entryFilterTable=(entryfilter ? new HashMap<Long, Read>() : null);
}
@Override
@@ -377,6 +389,27 @@ public abstract class KmerSort {
}
}
+ if(entryFilterTable!=null){
+ int removed=0;
+ for(int i=0; i<reads.size(); i++){
+ Read r=reads.get(i);
+ final long key=Hasher.hashPair(r);
+ final Long key2=Long.valueOf(key);
+ final Read old=entryFilterTable.get(key2);
+ if(old==null){
+ entryFilterTable.put(key2, r);
+ }else{
+ boolean same=Hasher.equalsPaired(r, old);
+ if(same){
+ removed++;
+ entryFilteredT+=r.pairCount();
+ reads.set(i, null);
+ }
+ }
+ }
+ if(removed>0){Tools.condenseStrict(reads);}
+ }
+
ArrayList<Read> hashList=reads;
if(paired && unpairT){
hashList=new ArrayList<Read>(reads.size()*2);
@@ -394,7 +427,7 @@ public abstract class KmerSort {
kc.hash(hashList, table, minCount, true);
storage.addAll(hashList);
- cris.returnList(ln);
+ cris.returnList(ln.id, false);
ln=cris.nextList();
reads=(ln!=null ? ln.list : null);
}
@@ -416,6 +449,8 @@ public abstract class KmerSort {
final KmerComparator kc;
final ArrayList<Read> storage;
final boolean unpairT;
+ final HashMap<Long, Read> entryFilterTable;
+ public long entryFilteredT=0;
protected long readsProcessedT=0;
protected long basesProcessedT=0;
@@ -488,10 +523,12 @@ public abstract class KmerSort {
protected long basesProcessed=0;
protected long diskProcessed=0;
protected long memProcessed=0;
+ protected static long entryFiltered=0;
protected long readsOut=0;
protected long basesOut=0;
+ protected long entryFilteredThisPass=0;
protected long readsThisPass=0;
protected long memThisPass=0;
@@ -520,7 +557,9 @@ public abstract class KmerSort {
boolean unpair=false;
boolean repair=false;
boolean namesort=false;
+ boolean entryfilter=false;
final boolean parallelSort=Shared.parallelSort;
+ boolean memWarned=false;
boolean useSharedHeader=false;
int reorderMode=REORDER_FALSE;
=====================================
current/clump/KmerSort1.java
=====================================
@@ -119,6 +119,8 @@ public class KmerSort1 extends KmerSort {
else if(a.equals("prefilter")){
KmerReduce.prefilter=Parse.parseBoolean(b);
+ }else if(a.equals("entryfilter")){
+ entryfilter=Parse.parseBoolean(b);
}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
groups=Integer.parseInt(b);
splitInput=(groups>1);
=====================================
current/clump/KmerSort2.java
=====================================
@@ -119,6 +119,8 @@ public class KmerSort2 extends KmerSort {
else if(a.equals("prefilter")){
KmerReduce.prefilter=Parse.parseBoolean(b);
+ }else if(a.equals("entryfilter")){
+ entryfilter=Parse.parseBoolean(b);
}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
groups=Integer.parseInt(b);
splitInput=(groups>1);
=====================================
current/clump/KmerSort3.java
=====================================
@@ -3,6 +3,7 @@ package clump;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashMap;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.atomic.AtomicInteger;
@@ -135,6 +136,8 @@ public class KmerSort3 extends KmerSort {
else if(a.equals("prefilter")){
KmerReduce.prefilter=Parse.parseBoolean(b);
+ }else if(a.equals("entryfilter")){
+ entryfilter=Parse.parseBoolean(b);
}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
groups=Integer.parseInt(b);
splitInput=(groups>1);
@@ -531,6 +534,7 @@ public class KmerSort3 extends KmerSort {
private long closeFetchThread3s(ArrayList<FetchThread3> alft){
readsThisPass=0;
memThisPass=0;
+ entryFilteredThisPass=0;
/* Wait for threads to die */
for(FetchThread3 ft : alft){
@@ -542,6 +546,7 @@ public class KmerSort3 extends KmerSort {
e.printStackTrace();
}
}
+ entryFilteredThisPass+=ft.entryFilteredT;
readsThisPass+=ft.readsProcessedT;
basesProcessed+=ft.basesProcessedT;
diskProcessed+=ft.diskProcessedT;
@@ -549,6 +554,7 @@ public class KmerSort3 extends KmerSort {
errorState|=ft.errorStateT;
}
+ entryFiltered+=entryFilteredThisPass;
readsProcessed+=readsThisPass;
memProcessed+=memThisPass;
@@ -627,27 +633,46 @@ public class KmerSort3 extends KmerSort {
// outstream.println("totalMem="+totalMem+", Shared.memFree()="+Shared.memFree()+", Shared.memUsed()="+Shared.memUsed());
// outstream.println("fileSize="+fileSize+", "+"fileMem="+fileMem+", "+"memRatio="+memRatio+", ");
// outstream.println((size-1000>expectedSizePerGroup*3)+", "+(expectedMem*3>totalMem));
-
+
//TODO: This could also be based on the number of FetchThreads.
-
if(expectedMem>0.3*totalMem && (fileMem<1 || fileMem>0.8*totalMem)){
- outstream.println("\n*** Warning ***\n"
- + "A temp file may be too big to store in memory, due to uneven splitting:");
+ synchronized(this.getClass()){
+ if(!memWarned){
+ outstream.println("\n*** Warning ***\n"
+ + "A temp file may be too big to store in memory, due to uneven splitting:");
- outstream.println("expectedMem="+((long)expectedMem)+", fileMem="+fileMem+", available="+totalMem);
-
- if(repair || namesort){
- outstream.println("It cannot be streamed to output unaltered because "+(namesort ? " namesort=t" : "repair=t"));
- outstream.println("If this causes the program to crash, please re-run with more memory or groups.\n");
- }else{
- outstream.println(
- "It will be streamed to output unaltered.\n"
- + "To avoid this behavior, increase memory or increase groups.\n"
- + "Set the flag forceprocess=t to disable this check.\n");
-// Timer t=new Timer();
- ArrayList<Read> list=streamNext_inner(cris);
-// t.stop("Stream time: ");
- return list;
+ outstream.println("expectedMem="+((long)expectedMem)+", fileMem="+fileMem+", available="+totalMem);
+ }
+ if(entryfilter){
+ if(!memWarned){
+ outstream.println(
+ "If the program crashes, increase memory or manually increase groups.\n");
+ }
+ }else if(dedupe & !Clump.opticalOnly){
+ if(!memWarned){
+ outstream.println(
+ "entryfilter is automatically being set to true to reduce memory load while removing duplicates.\n"
+ +"If the program crashes, increase memory or manually increase groups.\n");
+ }
+ entryfilter=true;
+ }else if(repair || namesort){
+ if(!memWarned){
+ outstream.println("It cannot be streamed to output unaltered because "+(namesort ? " namesort=t" : "repair=t"));
+ outstream.println("If this causes the program to crash, please re-run with more memory or groups.\n");
+ }
+ }else{
+ if(!memWarned){
+ outstream.println(
+ "It will be streamed to output unaltered.\n"
+ + "To avoid this behavior, increase memory or increase groups.\n"
+ + "Set the flag forceprocess=t to disable this check.\n");
+ }
+ // Timer t=new Timer();
+ ArrayList<Read> list=streamNext_inner(cris);
+ // t.stop("Stream time: ");
+ return list;
+ }
+ memWarned=true;
}
}
}
@@ -673,7 +698,7 @@ public class KmerSort3 extends KmerSort {
// if(verbose){t.start("Making hash threads.");}
final int subthreads=Tools.mid(1, Shared.threads()/2, 8);
ArrayList<FetchSubThread> alht=new ArrayList<FetchSubThread>(subthreads);
- long readsThisGroup=0, memThisGroup=0;
+ long readsThisGroup=0, memThisGroup=0, entryFilteredThisGroup=0;
for(int i=0; i<subthreads; i++){alht.add(new FetchSubThread(i, cris, kc, unpair));}
// if(verbose){outstream.println("Starting threads.");}
@@ -691,11 +716,13 @@ public class KmerSort3 extends KmerSort {
e.printStackTrace();
}
}
+ entryFilteredThisGroup+=ht.entryFilteredT;
readsThisGroup+=ht.readsProcessedST;
basesProcessedT+=ht.basesProcessedST;
diskProcessedT+=ht.diskProcessedST;
memThisGroup+=ht.memProcessedST;
}
+ entryFilteredT+=entryFilteredThisGroup;
readsProcessedT+=readsThisGroup;
memProcessedT+=memThisGroup;
@@ -704,13 +731,15 @@ public class KmerSort3 extends KmerSort {
errorStateT=ReadWrite.closeStream(cris)|errorStateT;
// if(verbose){t.start("Combining thread output.");}
- assert(readsProcessedT<=Integer.MAX_VALUE && readsProcessedT>=0);
- ArrayList<Read> list=new ArrayList<Read>((int)readsThisGroup);
+ long readsLeft=readsThisGroup-entryFilteredThisGroup;
+ long slotsLeft=cris.paired() && !unpair ? readsLeft/2 : readsLeft;
+ assert(slotsLeft<=Shared.MAX_ARRAY_LEN && readsProcessedT>=0);
+ ArrayList<Read> list=new ArrayList<Read>((int)slotsLeft);
for(int i=0; i<subthreads; i++){
FetchSubThread ht=alht.set(i, null);
list.addAll(ht.storage);
}
- assert(list.size()==readsThisGroup || (cris.paired() && list.size()*2==readsThisGroup)) : list.size()+", "+readsThisGroup+", "+cris.paired();
+ assert(list.size()==slotsLeft) : list.size()+", "+readsThisGroup+", "+slotsLeft+", "+cris.paired();
// if(verbose){t.stop("Combine time: ");}
// if(verbose){t.start("Sorting.");}
@@ -724,7 +753,8 @@ public class KmerSort3 extends KmerSort {
final AtomicInteger nextGroup;
final KmerComparator kc;
final ConcurrentReadOutputStream[] rosa;
-
+
+ protected long entryFilteredT=0;
protected long readsProcessedT=0;
protected long basesProcessedT=0;
protected long diskProcessedT=0;
@@ -740,6 +770,7 @@ public class KmerSort3 extends KmerSort {
kcT=kc_;
storage=new ArrayList<Read>();
unpairT=unpair_;
+ entryFilterTable=(entryfilter ? new HashMap<Long, Read>() : null);
}
@Override
@@ -783,6 +814,27 @@ public class KmerSort3 extends KmerSort {
}
}
+ if(entryFilterTable!=null){
+ int removed=0;
+ for(int i=0; i<reads.size(); i++){
+ Read r=reads.get(i);
+ final long key=Hasher.hashPair(r);
+ final Long key2=Long.valueOf(key);
+ final Read old=entryFilterTable.get(key2);
+ if(old==null){
+ entryFilterTable.put(key2, r);
+ }else{
+ boolean same=Hasher.equalsPaired(r, old);
+ if(same){
+ removed++;
+ entryFilteredT+=r.pairCount();
+ reads.set(i, null);
+ }
+ }
+ }
+ if(removed>0){Tools.condenseStrict(reads);}
+ }
+
ArrayList<Read> hashList=reads;
if(paired && unpairT){
hashList=new ArrayList<Read>(reads.size()*2);
@@ -800,7 +852,7 @@ public class KmerSort3 extends KmerSort {
kcT.hash(hashList, table, minCount, true);
storage.addAll(hashList);
- cris.returnList(ln);
+ cris.returnList(ln.id, false);
ln=cris.nextList();
reads=(ln!=null ? ln.list : null);
}
@@ -822,6 +874,8 @@ public class KmerSort3 extends KmerSort {
final KmerComparator kcT;
final ArrayList<Read> storage;
final boolean unpairT;
+ final HashMap<Long, Read> entryFilterTable;
+ public long entryFilteredT=0;
protected long readsProcessedST=0;
protected long basesProcessedST=0;
=====================================
current/clump/KmerSplit.java
=====================================
@@ -123,6 +123,8 @@ public class KmerSplit {
else if(a.equals("dedupe")){
//ignore
+ }else if(a.equals("entryfilter")){
+ //ignore
}else if(a.equals("markduplicates")){
//ignore
}else if(a.equals("markall")){
=====================================
current/icecream/IceCreamFinder.java
=====================================
@@ -76,6 +76,8 @@ public final class IceCreamFinder {
outstream=pp.outstream;
}
+ Parser.setQuality(33);
+
//Set shared static variables prior to parsing
ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;
=====================================
current/jgi/BBDuk.java
=====================================
@@ -321,9 +321,13 @@ public class BBDuk {
else{ksplit_=false;}
}else if(a.equals("ktrim")){
if(b==null){b="";}
- if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;}
- else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;}
- else if(b.equalsIgnoreCase("n")){
+ if(b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr") || b.equalsIgnoreCase("tips")){
+ ktrimLeft_=ktrimRight_=true;ktrimN_=ksplit_=false;
+ }else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){
+ ktrimLeft_=true;ktrimRight_=false;ktrimN_=ksplit_=false;
+ }else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){
+ ktrimLeft_=false;ktrimRight_=true;ktrimN_=ksplit_=false;
+ }else if(b.equalsIgnoreCase("n")){
ktrimLeft_=ktrimRight_=ksplit_=false;
ktrimN_=true;
}else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
@@ -332,9 +336,15 @@ public class BBDuk {
TRIM_SYMBOL_=(byte)b.charAt(0);
}else{
assert(b!=null && (b.equalsIgnoreCase("f") || b.equalsIgnoreCase("false"))) :
- "\nInvalid setting for ktrim - values must be f (false), l (left), r (right), or n.";
+ "\nInvalid setting for ktrim - values must be f (false), l (left), r (right), rl (tips), or n.";
ktrimRight_=ktrimLeft_=false;
}
+ }else if(a.equals("trimtips") || a.equals("ktrimtips")){
+ if(b==null){b="";}
+ if(b.length()>0){
+ ktrimLeft_=ktrimRight_=true;ktrimN_=ksplit_=false;
+ restrictLeft_=restrictRight_=Integer.parseInt(b);
+ }
}else if(a.equals("kmask") || a.equals("mask")){
if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
kmaskLowercase_=true;
@@ -2454,7 +2464,8 @@ public class BBDuk {
ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen()));
ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen());
- final boolean ktrimrl=ktrimLeft || ktrimRight;
+ final boolean ktrimLeftOrRight=ktrimLeft || ktrimRight;
+ final boolean ktrimTips=(ktrimLeft && ktrimRight);
final boolean doKmerTrimming=storedKmers>0 && (ktrimLeft || ktrimRight || ktrimN || ksplit);
final boolean doKmerFiltering=storedKmers>0 && !doKmerTrimming;
@@ -2641,7 +2652,22 @@ public class BBDuk {
int xsum=0;
int rktsum=0;
- if(ktrimrl){
+ if(ktrimTips){
+ if(r1!=null){
+ int x=ktrimTips(r1, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen1=r1.length();
+ if(rlen1<minlen1){setDiscarded(r1);}
+ }
+ if(r2!=null){
+ int x=ktrimTips(r2, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen2=r2.length();
+ if(rlen2<minlen2){setDiscarded(r2);}
+ }
+ }else if(ktrimLeftOrRight){
if(r1!=null){
int x=ktrim(r1, keySets);
xsum+=x;
@@ -3555,13 +3581,195 @@ public class BBDuk {
return found;
}
+ private final int ktrim(final Read r, final AbstractKmerTable[] sets){
+ final int len=r.length();
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, len-restrictRight));
+ final int stop=(restrictLeft<1 ? len : Tools.min(len, restrictLeft));
+ return ktrim(r, sets, start, stop);
+ }
+
+ private final int ktrimTips(final Read r, final AbstractKmerTable[] sets){
+ final int len=r.length();
+ final int mid=len/2-(k-1)/2;
+ int sum=0;
+ if(ktrimRight){
+ int start=Tools.max(0, (restrictRight<1 ? mid : len-restrictRight));
+ sum+=ktrimTip(r, sets, start, len, true, false);
+ }
+ if(ktrimLeft){
+ int stop=Tools.min(r.length(), (restrictLeft<1 ? mid+k-1 : restrictLeft));
+ sum+=ktrimTip(r, sets, 0, stop, false, true);
+ }
+ return sum;
+ }
+
+ /**
+ * Trim a read to remove matching kmers and everything to their left or right, on one end.
+ * Allows both trimRight and trimLeft, in 2 passes.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases trimmed
+ */
+ private final int ktrimTip(final Read r, final AbstractKmerTable[] sets, final int start, final int stop,
+ final boolean right, final boolean left){
+ assert(left || right);
+ assert(!(left && right));
+ if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ if(verbose){outstream.println("KTrimming read "+r.id);}
+ final byte[] bases=r.bases, quals=r.quality;
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int id0=-1; //ID of first kmer found.
+
+ int minLoc=999999999, minLocExclusive=999999999;
+ int maxLoc=-1, maxLocExclusive=-1;
+ final int initialLength=r.length();
+
+ //Scan for normal kmers
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=symbolToNumber0[b];
+ long x2=symbolToComplementNumber0[b];
+ kmer=((kmer<<bitsPerBase)|x)&mask;
+ rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
+ if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
+ if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ minLoc=Tools.min(minLoc, i-k+1);
+ assert(minLoc>=0);
+ maxLoc=i;
+ found++;
+ }
+ }
+ }
+
+ if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;}
+ if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;}
+
+ //If nothing was found, scan for short kmers. Only used for trimming..
+ if(useShortKmers && found==0){
+ assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+
+ //Look for short kmers on left side
+ if(left){
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.min(k, stop);
+ for(int i=start; i<lim; i++){
+ byte b=bases[i];
+ long x=symbolToNumber0[b];
+ long x2=symbolToComplementNumber0[b];
+ kmer=((kmer<<bitsPerBase)|x)&mask;
+ rkmer=rkmer|(x2<<(bitsPerBase*len));
+ len++;
+ if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+
+ if(verbose){
+ outstream.println("Looking for left kmer "+kmerToString(kmer, len));
+ outstream.println("Looking for left rkmer "+kmerToString(rkmer, len));
+ }
+
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){outstream.println("Found "+kmer);}
+ minLoc=0;
+ minLocExclusive=Tools.min(minLocExclusive, i+1);
+ maxLoc=Tools.max(maxLoc, i);
+ maxLocExclusive=Tools.max(maxLocExclusive, 0);
+ found++;
+ }
+ }
+ }
+ }
+
+ //Look for short kmers on right side
+ if(right){
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.max(-1, stop-k);
+ for(int i=stop-1; i>lim; i--){
+ byte b=bases[i];
+ long x=symbolToNumber0[b];
+ long x2=symbolToComplementNumber0[b];
+ kmer=kmer|(x<<(bitsPerBase*len));
+ rkmer=((rkmer<<bitsPerBase)|x2)&mask;
+ len++;
+ if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+ if(verbose){
+ outstream.println("Looking for right kmer "+
+ AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
+ }
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){outstream.println("Found "+kmer);}
+ minLoc=i;
+ minLocExclusive=Tools.min(minLocExclusive, bases.length);
+ maxLoc=bases.length-1;
+ maxLocExclusive=Tools.max(maxLocExclusive, i-1);
+ found++;
+ }
+ }
+ }
+ }
+ }
+
+
+ if(verbose){outstream.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+
+ ", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
+
+ if(found==0){return 0;}
+ assert(found>0) : "Overflow in 'found' variable.";
+
+ {//Increment counter for the scaffold whose kmer was first detected
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id0]++;
+ scaffoldBaseCountsT[id0]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id0, 1);
+ scaffoldBaseCounts.addAndGet(id0, bases.length);
+ }
+ }
+
+ if(trimPad!=0){
+ maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
+ minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
+ maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
+ minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
+ }
+
+ if(left){ //Trim from the read start to the rightmost kmer base
+ if(verbose){outstream.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
+ int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
+ if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
+ return x;
+ }else{ //Trim from the leftmost kmer base to the read stop
+ assert(right);
+ if(verbose){outstream.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
+ int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
+ if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
+ return x;
+ }
+ }
+
/**
* Trim a read to remove matching kmers and everything to their left or right.
* @param r Read to process
* @param sets Kmer tables
* @return Number of bases trimmed
*/
- private final int ktrim(final Read r, final AbstractKmerTable[] sets){
+ private final int ktrim(final Read r, final AbstractKmerTable[] sets, final int start, final int stop){
assert(ktrimLeft || ktrimRight);
if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
@@ -3577,9 +3785,6 @@ public class BBDuk {
int maxLoc=-1, maxLocExclusive=-1;
final int initialLength=r.length();
- final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
- final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
-
//Scan for normal kmers
for(int i=start; i<stop; i++){
byte b=bases[i];
@@ -4807,9 +5012,9 @@ public class BBDuk {
final boolean usePairGC;
/** If positive, only look for kmer matches in the leftmost X bases */
- int restrictLeft;
+ final int restrictLeft;
/** If positive, only look for kmer matches the rightmost X bases */
- int restrictRight;
+ final int restrictRight;
/** Skip this many initial input reads */
private final long skipreads;
=====================================
current/jgi/RQCFilter2.java
=====================================
@@ -585,6 +585,7 @@ public class RQCFilter2 {
microbesUsed=appendOutDir(microbesUsed);
chloroStatsFile=appendOutDir(chloroStatsFile);
sipStatsFile=appendOutDir(sipStatsFile);
+ sipScafStats=appendOutDir(sipScafStats);
cardinalityName=appendOutDir(cardinalityName);
adaptersOutFile=appendOutDir(adaptersOutFile);
@@ -816,8 +817,13 @@ public class RQCFilter2 {
sb.append("filteredByTile="+removeOutDir(fbtOutFile)).append('\n');
}
- if(sipFlag && sipStatsFile!=null){
- sb.append("sipSpikeinStats="+removeOutDir(sipStatsFile)).append('\n');
+ if(sipFlag){
+ if(sipStatsFile!=null){
+ sb.append("sipSpikeinStats="+removeOutDir(sipStatsFile)).append('\n');
+ }
+ if(sipScafStats!=null){
+ sb.append("sipScafStats="+removeOutDir(sipScafStats)).append('\n');
+ }
}
if((chloroMapFlag || mitoMapFlag || riboMapFlag) && chloroOutFile!=null){
@@ -1136,15 +1142,15 @@ public class RQCFilter2 {
}
//Filter the file to only include specified organisms in the taxlist..
- final String ref=Data.findPath("?SIP_spikeins.fa.gz");//TODO add to RQCFilterData like others
+ final String ref=Data.findPath("?SIP_spikeins.fa.gz");
{
ArrayList<String> args=new ArrayList<String>();
args.add("covstats="+sipStatsFile);
final boolean addToOtherStats=(sipStatsFile==null);
- decontamByMapping(in1z, in2z, out1z, out2z, null, "sipScafStats.txt", inPrefix, outPrefix, ref, step, addToOtherStats, args);
+ decontamByMapping(in1z, in2z, out1z, out2z, null, sipScafStats, inPrefix, outPrefix, ref, step, addToOtherStats, args);
if(!addToOtherStats){
- filterstats.parseSip("sipScafStats.txt");
+ filterstats.parseSip(sipScafStats);
assert(filterstats.toString()!=null);
}
}
@@ -3982,6 +3988,7 @@ public class RQCFilter2 {
private String microbesUsed="microbesUsed.txt";
private String chloroStatsFile="chloroStats.txt";
private String sipStatsFile="sipSpikeinStats.txt";
+ private String sipScafStats="sipScafStats.txt";
private String nexteraStats="nexteraStats.txt";
private String ihistName="ihist_merge.txt";
private String khistName="khist.txt";
=====================================
current/jgi/RQCFilterStats.java
=====================================
@@ -93,10 +93,6 @@ public class RQCFilterStats {
}
public String toString(boolean skipAssertion){
- assert(skipAssertion || readsIn>=readsOut) : toString(true);
- assert(skipAssertion || basesIn>=basesOut) : toString(true);
- assert(skipAssertion || readsIn-totalReadsRemoved()==readsOut) : toString(true)+"\n\ntrr="+totalReadsRemoved()+"\nri="+readsIn+"\nro="+readsOut;
- assert(skipAssertion || basesIn-totalBasesRemoved()==basesOut) : toString(true)+"\n"+basesIn+"-"+totalBasesRemoved()+"!="+basesOut;
StringBuilder sb=new StringBuilder(1000);
sb.append("#Class\tReads\tBases\tReadPct\tBasePct\tNotes\n");
sb.append(format("Input", readsIn, basesIn, readsIn, basesIn));
@@ -120,6 +116,12 @@ public class RQCFilterStats {
sb.append(format("Dog", readsDog, basesDog, readsIn, basesIn));
sb.append(format("Microbe", readsMicrobe, basesMicrobe, readsIn, basesIn));
sb.append(format("Other", readsOther, basesOther, readsIn, basesIn));
+
+ assert(skipAssertion || readsIn>=readsOut) : toString(true)+"\n\nsb:\n"+sb+"\n";
+ assert(skipAssertion || basesIn>=basesOut) : toString(true)+"\n\n"+sb+"\n";
+ assert(skipAssertion || readsIn-totalReadsRemoved()==readsOut) : toString(true)+"\n\ntrr="+totalReadsRemoved()+"\nri="+readsIn+"\nro="+readsOut+"\n\nsb:\n"+sb+"\n";
+ assert(skipAssertion || basesIn-totalBasesRemoved()==basesOut) : toString(true)+"\n"+basesIn+"-"+totalBasesRemoved()+"!="+basesOut+"\n\nsb:\n"+sb+"\n";
+
return sb.toString();
}
=====================================
current/shared/Parser.java
=====================================
@@ -1418,6 +1418,10 @@ public class Parser {
private static byte qout=-1;
private static boolean parsedQuality=false;
+ public static void setQuality(int x){
+ qin=(byte)x;
+ parsedQuality=(x>-1);
+ }
public static void processQuality(){
// assert(parsedQuality);
if(!parsedQuality){return;}
=====================================
current/shared/Shared.java
=====================================
@@ -125,8 +125,8 @@ public class Shared {
public static final int GAPCOST=Tools.max(1, GAPLEN/64);
public static final byte GAPC='-';
- public static String BBMAP_VERSION_STRING="38.96";
- public static String BBMAP_VERSION_NAME="Return True";
+ public static String BBMAP_VERSION_STRING="38.97";
+ public static String BBMAP_VERSION_NAME="Honeybee";
public static boolean TRIM_READ_COMMENTS=false;
public static boolean TRIM_RNAME=false; //For mapped sam reads
=====================================
current/stream/FASTQ.java
=====================================
@@ -178,6 +178,7 @@ public class FASTQ {
}else{
if(warnQualityChange){System.err.println("Changed from ASCII-64 to ASCII-33 due to read of length "+bases.length+" while prescanning.");}
ASCII_OFFSET=33;
+ SET_QIN=true;
}
DETECT_QUALITY=false;
}
@@ -549,12 +550,15 @@ public class FASTQ {
ASCII_OFFSET=33;
}
DETECT_QUALITY=false;
+// System.err.println("A: "+numericID+": "+bases.length);
}
// assert(false) : Arrays.toString(quals);
for(int i=0; i<quals.length; i++){
quals[i]-=ASCII_OFFSET; //Convert from ASCII33 to native.
if(DETECT_QUALITY && ASCII_OFFSET==33 && (quals[i]>QUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){
+
+// System.err.println("B: "+numericID+": "+bases.length);
if(warnQualityChange){
if(numericID<1){
System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
@@ -651,6 +655,29 @@ public class FASTQ {
}
}
}else{
+
+ //Prevents problems with PacBio's weird quality numbers. Usually.
+ if(DETECT_QUALITY && numericID==0){//first read
+ for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
+ if(cntr==1 && s.length>=MIN_LENGTH_TO_FORCE_ASCII_33){//Chances of ASCII-64 are vanishingly small
+// SET_QIN=true;
+ DETECT_QUALITY=false;
+ ASCII_OFFSET=33;
+ }
+ quad[cntr]=s;
+ cntr++;
+ if(cntr==4){
+ cntr=0;
+ final Read r=quadToRead_slow(quad, false, bf, numericID, flag);
+
+ list.add(r);
+ added++;
+ numericID++;
+ if(added>=maxReadsToReturn){return list;}
+ }
+ }
+ }
+
for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
quad[cntr]=s;
cntr++;
=====================================
current/tax/GiToTaxid.java
=====================================
@@ -88,36 +88,24 @@ public class GiToTaxid {
public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
public static int parseGiToTaxid(String s, char delimiter){
- int x=parseGiNumber(s, delimiter);
- assert(x>=0) : s;
- assert(array!=null) : "To use gi numbers, you must load a gi table.";
-// if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
- if(x>=0 && x<array.length){return array[x];}
- assert(x<array.length) : "The GI number "+x+" is too big.\n"
- + "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
- + "To ignore this problem, please run with the -da flag.\n";
- return -1;
+ long x=parseGiNumber(s, delimiter);
+ assert(x>=0) : x+", "+s;
+ return getID(x);
}
public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
public static int parseGiToTaxid(byte[] s, char delimiter){
- int x=parseGiNumber(s, delimiter);
- if(x>=0){return array[x];}
- return -1;
+ long x=parseGiNumber(s, delimiter);
+ return x<0 ? -1 : getID(x);
}
/** Parse a gi number, or return -1 if formatted incorrectly. */
- static int parseGiNumber(String s, char delimiter){
+ static long parseGiNumber(String s, char delimiter){
if(s==null || s.length()<4){return -1;}
-// System.err.println("a");
if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
-// System.err.println("b");
if(!s.startsWith("gi")){return -1;}
-// System.err.println("c");
-// System.err.println("d");
int initial=s.indexOf(delimiter);
-// System.err.println("e");
if(initial<0){
if(delimiter!='~'){
delimiter='~';
@@ -128,21 +116,16 @@ public class GiToTaxid {
initial=s.indexOf(delimiter);
}
if(initial<0){return -1;}
-// System.err.println("f");
-// System.err.println("g");
}
-// System.err.println("h");
if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
-// System.err.println("i");
- int number=0;
+ long number=0;
for(int i=initial+1; i<s.length(); i++){
char c=s.charAt(i);
if(c==delimiter){break;}
assert(Tools.isDigit(c));
number=(number*10)+(c-'0');
}
-// System.err.println("j: "+number);
return number;
}
@@ -173,15 +156,14 @@ public class GiToTaxid {
public static int getID(String s){return getID(s, '|');}
/** Get the taxID from a header starting with a taxID or gi number */
public static int getID(String s, char delimiter){
- int x=parseTaxidNumber(s, delimiter);
- if(x>=0){return x;}
+ long x=parseTaxidNumber(s, delimiter);
+ if(x>=0){return (int)x;}
x=parseGiNumber(s, delimiter);
- if(x>=0){return array[x];}
- return -1;
+ return x<0 ? -1 : getID(x);
}
/** Parse a gi number, or return -1 if formatted incorrectly. */
- static int parseGiNumber(byte[] s, char delimiter){
+ static long parseGiNumber(byte[] s, char delimiter){
if(s==null || s.length<4){return -1;}
if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
int initial=Tools.indexOf(s, (byte)delimiter);
@@ -192,7 +174,7 @@ public class GiToTaxid {
}
if(!Tools.isDigit(s[initial+1])){return -1;}
- int number=0;
+ long number=0;
for(int i=initial+1; i<s.length; i++){
byte c=s[i];
if(c==delimiter){break;}
@@ -227,16 +209,37 @@ public class GiToTaxid {
public static int getID(byte[] s){return getID(s, '|');}
/** Get the taxID from a header starting with a taxID or gi number */
public static int getID(byte[] s, char delimiter){
- int x=parseGiNumber(s, delimiter);
- if(x>=0){return array[x];}
+ long x=parseGiNumber(s, delimiter);
+ if(x>=0){return getID(x, true);}
return parseNcbiNumber(s, delimiter);
}
- /** Get the taxID from a gi number */
- public static int getID(int gi){
- assert(gi>=0) : gi;
- assert(gi<array.length) : gi+", "+array.length;
- return array[gi];
+ /** Get the taxID from a gi number;
+ * -1 if not present or invalid (negative input),
+ * -2 if out of range (too high) */
+ public static int getID(long gi){
+ return getID(gi, true);
+ }
+
+ /** Get the taxID from a gi number;
+ * 0 if not present,
+ * -1 if invalid (negative input),
+ * -2 if out of range (too high) */
+ public static int getID(long gi, boolean assertInRange){
+ assert(initialized) : "To use gi numbers, you must load a gi table.";
+ if(gi<0 || gi>maxGiLoaded){
+ assert(!assertInRange) : gi<0 ? "gi number "+gi+" is invalid." :
+ "The gi number "+gi+" is too big: Max loaded gi number is "+maxGiLoaded+".\n"
+ + "Please update the gi table with the latest version from NCBI"
+ + " as per the instructions in gitable.sh.\n"
+ + "To ignore this problem, please run with the -da flag.\n";
+ return gi<0 ? -1 : -2;
+ }
+ final long upper=gi>>>SHIFT;
+ final int lower=(int)(gi&LOWERMASK);
+ assert(upper<Shared.MAX_ARRAY_LEN && upper<array.length) : gi+", "+upper+", "+array.length;
+ final int[] slice=array[(int)upper];
+ return slice==null || slice.length<=lower ? 0 : slice[lower];
}
public static void initialize(String fname){
@@ -245,8 +248,19 @@ public class GiToTaxid {
synchronized(GiToTaxid.class){
if(!initialized || fileString==null || !fileString.equals(fname)){
fileString=fname;
- if(fname.contains(".int1d")){
- array=ReadWrite.read(int[].class, fname, true);
+ if(fname.contains(".int2d")){
+ array=ReadWrite.read(int[][].class, fname, true);
+ maxGiLoaded=-1;
+ if(array!=null && array.length>0){
+ int upper=array.length-1;
+ int[] section=array[upper];
+ int lower=section.length-1;
+ maxGiLoaded=(((long)upper)<<SHIFT)|lower;
+ }
+ }else if(fname.contains(".int1d")){
+ throw new RuntimeException("Old gi table format filename "+fname+".\n"
+ + "Current files should end in .int2d.");
+
}else{
array=makeArray(fname);
}
@@ -259,17 +273,19 @@ public class GiToTaxid {
public static boolean isInitialized(){return initialized;}
public static synchronized void unload(){
+ maxGiLoaded=-1;
array=null;
fileString=null;
initialized=false;
}
- private static int[] makeArray(String fnames){
+ private static int[][] makeArray(String fnames){
String[] split;
if(new File(fnames).exists()){split=new String[] {fnames};}
else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
else if(fnames.indexOf('#')>=0){
- assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
+ assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for "
+ + "relative paths in present working directory.";
File dir=new File(System.getProperty("user.dir"));
String prefix=fnames.substring(0, fnames.indexOf('#'));
String suffix=fnames.substring(fnames.indexOf('#')+1);
@@ -290,20 +306,28 @@ public class GiToTaxid {
throw new RuntimeException("Invalid file: "+fnames);
}
- IntList list=new IntList();
-// assert(max<Integer.MAX_VALUE) : "Overflow.";
-// int[] x=new int[(int)max+1];
-// Arrays.fill(x, -1);
+ int numLists=32;
+ IntList[] lists=new IntList[numLists];
long total=0;
for(String s : split){
- long count=addToList(s, list);
+ long count=addToList(s, lists);
total+=count;
}
- return list.shrink().array;
+ for(int i=0; i<lists.length; i++){
+ if(lists[i]!=null && lists[i].size>0){
+ lists[i].shrink();
+ numLists=i+1;
+ }
+ }
+ int[][] table=new int[numLists][];
+ for(int i=0; i<numLists; i++){
+ table[i]=lists[i].array;
+ }
+ return table;
}
- private static long addToList(String fname, IntList list){
+ private static long addToList(String fname, IntList[] lists){
boolean warned=false;
ByteFile bf=ByteFile.makeByteFile(fname, true);
long count=0, invalid=0;
@@ -316,34 +340,18 @@ public class GiToTaxid {
assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
-// assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
+ //assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
int tid=Parse.parseInt(line, tab2+1, tab3);
int gi=Parse.parseInt(line, tab3+1, line.length);
- if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
+ if(gi<0){
invalid++;
}else{
- assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
- int old=list.get(gi);
- assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
-
- list.set(gi, tid);
-
- //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
-// if(x[gi]!=-1 && x[gi]!=ncbi){
-// if(!warned){
-// System.err.println("***WARNING*** For file "+fname+":\n"+
-// ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
-// "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
-// + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
-// + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
-// warned=true;
-// }
-// }else{
-// x[gi]=ncbi;
-// }
+ assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
+ int old=setID(gi, tid, lists);
+ assert(old<1 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
}
}else{
- if(line.length==0){System.err.println(fname+", "+count);}//debug
+ //if(line.length==0){System.err.println(fname+", "+count);}//debug
invalid++;
}
line=bf.nextLine();
@@ -353,6 +361,28 @@ public class GiToTaxid {
return count;
}
+ private static int getID(long gi, IntList[] lists){
+ assert(gi>=0) : "gi number "+gi+" is invalid.";
+ final long upper=gi>>>SHIFT;
+ final int lower=(int)(gi&LOWERMASK);
+ assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper;
+ IntList list=lists[(int)upper];
+ return lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower);
+ }
+
+ private static int setID(long gi, int tid, IntList[] lists){
+ assert(gi>=0) : "gi number "+gi+" is invalid.";
+ final long upper=gi>>>SHIFT;
+ final int lower=(int)(gi&LOWERMASK);
+ assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper;
+ IntList list=lists[(int)upper];
+ if(list==null){list=lists[(int)upper]=new IntList();}
+ int old=lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower);
+ list.set(lower, tid);
+ maxGiLoaded=Tools.max(gi, maxGiLoaded);
+ return old;
+ }
+
// private static int[] makeArrayOld(String fnames){
// String[] split;
// if(new File(fnames).exists()){split=new String[] {fnames};}
@@ -420,7 +450,12 @@ public class GiToTaxid {
// return count;
// }
- private static int[] array;
+ private static long maxGiLoaded=-1;
+ private static int[][] array;
+ private static final int SHIFT=30;
+ private static final long UPPERMASK=(-1L)<<SHIFT;
+ private static final long LOWERMASK=~UPPERMASK;
+
private static String fileString;
public static boolean verbose=false;
=====================================
current/tax/GiToTaxidInt.java
=====================================
@@ -0,0 +1,431 @@
+package tax;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import shared.Parse;
+import shared.Shared;
+import shared.Tools;
+import structures.IntList;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 10, 2015
+ *
+ */
+public class GiToTaxidInt {
+
+ public static void main(String[] args){
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ ReadWrite.ZIPLEVEL=9;
+ ReadWrite.PIGZ_BLOCKSIZE=256;
+// ReadWrite.PIGZ_ITERATIONS=30;
+
+ for(String arg : args){
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ shared.Parser.parseZip(arg, a, b);
+ }
+// if(args.length>2 && false){//Run a test
+// test(args);
+// }else
+ if(args.length>=2){//Write array
+ initialize(args[0]);
+ ReadWrite.write(array, args[1], true);
+ }
+ }
+
+ public static void test(String[] args){
+ System.err.println(getID(1000));
+ System.err.println(getID(10000));
+ System.err.println(getID(10001));
+ System.err.println(getID(10002));
+ System.err.println(getID(10003));
+ System.err.println(getID(10004));
+ System.err.println(getID(10005));
+ System.err.println(getID(100000));
+ System.err.println(getID(1000000));
+ System.err.println(getID(10000000));
+
+ TaxTree tree=null;
+ if(args.length>1){
+ tree=TaxTree.loadTaxTree(args[0], System.err, true, true);
+ }
+
+ System.err.println("Strings:");
+ int x;
+ x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44");
+ System.err.println(x);
+ if(tree!=null){
+ System.err.println(tree.getNode(x));
+ tree.incrementRaw(x, 30);
+ }
+ x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds");
+ System.err.println(x);
+ if(tree!=null){
+ System.err.println(tree.getNode(x));
+ tree.incrementRaw(x, 40);
+ }
+ x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence");
+ System.err.println(x);
+ if(tree!=null){
+ System.err.println(tree.getNode(x));
+ tree.incrementRaw(x, 20);
+ }
+
+ if(tree!=null){
+ tree.percolateUp();
+ ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35);
+ for(TaxNode n : nodes){
+ System.err.println(n);
+ }
+ }
+ }
+
+ public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
+ public static int parseGiToTaxid(String s, char delimiter){
+ int x=parseGiNumber(s, delimiter);
+ assert(x>=0) : s;
+ assert(array!=null) : "To use gi numbers, you must load a gi table.";
+// if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
+ if(x>=0 && x<array.length){return array[x];}
+ assert(x<array.length) : "The GI number "+x+" is too big.\n"
+ + "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
+ + "To ignore this problem, please run with the -da flag.\n";
+ return -1;
+ }
+
+
+ public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
+ public static int parseGiToTaxid(byte[] s, char delimiter){
+ long x=parseGiNumber(s, delimiter);
+ if(x>=0 && x<array.length){return array[(int)x];}
+ if(x<0){return -1;}
+ assert(false) : x;
+ return -1;
+ }
+
+ /** Parse a gi number, or return -1 if formatted incorrectly. */
+ static int parseGiNumber(String s, char delimiter){
+ if(s==null || s.length()<4){return -1;}
+// System.err.println("a");
+ if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
+// System.err.println("b");
+ if(!s.startsWith("gi")){return -1;}
+// System.err.println("c");
+// System.err.println("d");
+ int initial=s.indexOf(delimiter);
+// System.err.println("e");
+ if(initial<0){
+ if(delimiter!='~'){
+ delimiter='~';
+ initial=s.indexOf(delimiter);
+ }
+ if(initial<0){
+ delimiter='_';
+ initial=s.indexOf(delimiter);
+ }
+ if(initial<0){return -1;}
+// System.err.println("f");
+// System.err.println("g");
+ }
+// System.err.println("h");
+ if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
+// System.err.println("i");
+
+ int number=0;
+ for(int i=initial+1; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(c==delimiter){break;}
+ assert(Tools.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+// System.err.println("j: "+number);
+ return number;
+ }
+
+ /** Parse a ncbi number, or return -1 if formatted incorrectly. */
+ public static int parseTaxidNumber(String s, char delimiter){
+ if(s==null || s.length()<5){return -1;}
+ if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);}
+ if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;}
+ int initial=s.indexOf(delimiter);
+ if(initial<0){
+ delimiter='_';
+ initial=s.indexOf(delimiter);
+ if(initial<0){return -1;}
+ }
+ if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
+
+ int number=0;
+ for(int i=initial+1; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(c==delimiter || c==' '){break;}
+ assert(Tools.isDigit(c)) : c+"\n"+s;
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+
+ public static int getID(String s){return getID(s, '|');}
+ /** Get the taxID from a header starting with a taxID or gi number */
+ public static int getID(String s, char delimiter){
+ int x=parseTaxidNumber(s, delimiter);
+ if(x>=0){return x;}
+ x=parseGiNumber(s, delimiter);
+ if(x>=0){return array[x];}
+ return -1;
+ }
+
+ /** Parse a gi number, or return -1 if formatted incorrectly. */
+ static int parseGiNumber(byte[] s, char delimiter){
+ if(s==null || s.length<4){return -1;}
+ if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
+ int initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){
+ delimiter='_';
+ initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){return -1;}
+ }
+ if(!Tools.isDigit(s[initial+1])){return -1;}
+
+ long number=0;
+ for(int i=initial+1; i<s.length; i++){
+ byte c=s[i];
+ if(c==delimiter){break;}
+ assert(Tools.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return (int)number;
+ }
+
+ /** Parse a gi number, or return -1 if formatted incorrectly. */
+ static int parseNcbiNumber(byte[] s, char delimiter){
+ if(s==null || s.length<3){return -1;}
+ if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;}
+ int initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){
+ delimiter='_';
+ initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){return -1;}
+ }
+ if(!Tools.isDigit(s[initial+1])){return -1;}
+
+ int number=0;
+ for(int i=initial+1; i<s.length; i++){
+ byte c=s[i];
+ if(c==delimiter){break;}
+ assert(Tools.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+ public static int getID(byte[] s){return getID(s, '|');}
+ /** Get the taxID from a header starting with a taxID or gi number */
+ public static int getID(byte[] s, char delimiter){
+ int x=parseGiNumber(s, delimiter);
+ if(x>=0){return array[x];}
+ return parseNcbiNumber(s, delimiter);
+ }
+
+ /** Get the taxID from a gi number */
+ public static int getID(long gi){
+ assert(gi>=0) : gi;
+ assert(gi<Integer.MAX_VALUE) : gi+" > "+Integer.MAX_VALUE;
+ assert(gi<array.length) : gi+", "+array.length;
+ return array[(int)gi];
+ }
+
+ public static void initialize(String fname){
+ assert(fname!=null);
+ if(fileString==null || !fileString.equals(fname)){
+ synchronized(GiToTaxid.class){
+ if(!initialized || fileString==null || !fileString.equals(fname)){
+ fileString=fname;
+ if(fname.contains(".int1d")){
+ array=ReadWrite.read(int[].class, fname, true);
+ }else{
+ array=makeArray(fname);
+ }
+ }
+ initialized=true;
+ }
+ }
+ }
+
+ public static boolean isInitialized(){return initialized;}
+
+ public static synchronized void unload(){
+ array=null;
+ fileString=null;
+ initialized=false;
+ }
+
+ private static int[] makeArray(String fnames){
+ String[] split;
+ if(new File(fnames).exists()){split=new String[] {fnames};}
+ else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
+ else if(fnames.indexOf('#')>=0){
+ assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
+ File dir=new File(System.getProperty("user.dir"));
+ String prefix=fnames.substring(0, fnames.indexOf('#'));
+ String suffix=fnames.substring(fnames.indexOf('#')+1);
+
+ File[] array=dir.listFiles();
+ StringBuilder sb=new StringBuilder();
+ String comma="";
+ for(File f : array){
+ String s=f.getName();
+ if(s.startsWith(prefix) && s.startsWith(suffix)){
+ sb.append(comma);
+ sb.append(s);
+ comma=",";
+ }
+ }
+ split=sb.toString().split(",");
+ }else{
+ throw new RuntimeException("Invalid file: "+fnames);
+ }
+
+ IntList list=new IntList();
+// assert(max<Integer.MAX_VALUE) : "Overflow.";
+// int[] x=new int[(int)max+1];
+// Arrays.fill(x, -1);
+
+ long total=0;
+ for(String s : split){
+ long count=addToList(s, list);
+ total+=count;
+ }
+ return list.shrink().array;
+ }
+
+ private static long addToList(String fname, IntList list){
+ boolean warned=false;
+ ByteFile bf=ByteFile.makeByteFile(fname, true);
+ long count=0, invalid=0;
+ byte[] line=bf.nextLine();
+ while(line!=null){
+ if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na
+ count++;
+ int tab2=Tools.indexOfNth(line, '\t', 2);
+ int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1);
+ assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
+ assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
+ assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
+// assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
+ int tid=Parse.parseInt(line, tab2+1, tab3);
+ int gi=Parse.parseInt(line, tab3+1, line.length);
+ if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
+ invalid++;
+ }else{
+ assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
+ int old=list.get(gi);
+ assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
+
+ list.set(gi, tid);
+
+ //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
+// if(x[gi]!=-1 && x[gi]!=ncbi){
+// if(!warned){
+// System.err.println("***WARNING*** For file "+fname+":\n"+
+// ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
+// "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
+// + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
+// + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
+// warned=true;
+// }
+// }else{
+// x[gi]=ncbi;
+// }
+ }
+ }else{
+ if(line.length==0){System.err.println(fname+", "+count);}//debug
+ invalid++;
+ }
+ line=bf.nextLine();
+ }
+ if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);}
+ bf.close();
+ return count;
+ }
+
+// private static int[] makeArrayOld(String fnames){
+// String[] split;
+// if(new File(fnames).exists()){split=new String[] {fnames};}
+// else{split=fnames.split(",");}
+//
+// long max=0;
+// for(String s : split){
+// max=Tools.max(max, findMaxID(s));
+// }
+//
+// assert(max<Integer.MAX_VALUE) : "Overflow.";
+// int[] x=new int[(int)max+1];
+// Arrays.fill(x, -1);
+//
+// long total=0;
+// for(String s : split){
+// long count=fillArray(s, x);
+// total+=count;
+// }
+// return x;
+// }
+//
+// private static long findMaxID(String fname){
+// ByteFile bf=ByteFile.makeByteFile(fname, true);
+// long count=0, max=0;
+// byte[] line=bf.nextLine();
+// while(line!=null){
+// count++;
+// int tab=Tools.indexOf(line, (byte)'\t');
+// long gi=Parse.parseLong(line, 0, tab);
+// max=Tools.max(max, gi);
+// line=bf.nextLine();
+// }
+// bf.close();
+// return max;
+// }
+//
+// private static long fillArray(String fname, int[] x){
+// boolean warned=false;
+// ByteFile bf=ByteFile.makeByteFile(fname, true);
+// long count=0;
+// byte[] line=bf.nextLine();
+// while(line!=null){
+// count++;
+// int tab=Tools.indexOf(line, (byte)'\t');
+// int gi=Parse.parseInt(line, 0, tab);
+// int ncbi=Parse.parseInt(line, tab+1, line.length);
+// //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
+// if(x[gi]!=-1 && x[gi]!=ncbi){
+// if(!warned){
+// System.err.println("***WARNING*** For file "+fname+":\n"+
+// ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
+// "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
+// + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
+// + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
+// warned=true;
+// }
+// }else{
+// x[gi]=ncbi;
+// }
+// line=bf.nextLine();
+// }
+// if(verbose){System.err.println("Count: "+count);}
+// bf.close();
+// return count;
+// }
+
+ private static int[] array;
+ private static String fileString;
+
+ public static boolean verbose=false;
+ private static boolean initialized=false;
+}
=====================================
current/tax/TaxServer.java
=====================================
@@ -1243,7 +1243,7 @@ public class TaxServer {
if(type2==GI){
for(String name : names){
sb.append(comma);
- TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+ TaxNode tn=getTaxNodeGi(Long.parseLong(name));
if(tn==null){sb.append("-1");}
else{sb.append(tn.id);}
comma=",";
@@ -1297,7 +1297,7 @@ public class TaxServer {
int type2=type&15;
final TaxNode tn;
if(type2==GI){
- tn=getTaxNodeGi(Integer.parseInt(name));
+ tn=getTaxNodeGi(Long.parseLong(name));
}else if(type2==NAME){
tn=getTaxNodeByName(name);
}else if(type2==TAXID){
@@ -1348,7 +1348,7 @@ public class TaxServer {
if(type2==GI){
for(String name : names){
sb.append(comma);
- TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+ TaxNode tn=getTaxNodeGi(Long.parseLong(name));
if(tn==null){sb.append("Not found");}
else{sb.append(tree.toSemicolon(tn, skipNonCanonical, mononomial));}
comma=",";
@@ -1414,7 +1414,7 @@ public class TaxServer {
long img=-1;
if(type==GI){
- tn0=getTaxNodeGi(Integer.parseInt(name));
+ tn0=getTaxNodeGi(Long.parseLong(name));
}else if(type==NAME){
tn0=getTaxNodeByName(name);
}else if(type==TAXID){
@@ -1546,7 +1546,7 @@ public class TaxServer {
int type2=type&15;
if(type2==GI){
for(String name : names){
- TaxNode tn=getTaxNodeGi(Integer.parseInt(name));
+ TaxNode tn=getTaxNodeGi(Long.parseLong(name));
if(tn!=null){list.add(tn.id);}
else{notFound.incrementAndGet();}
}
@@ -1620,7 +1620,7 @@ public class TaxServer {
}
/** Look up a TaxNode from the gi number */
- TaxNode getTaxNodeGi(int gi){
+ TaxNode getTaxNodeGi(long gi){
int ncbi=-1;
try {
ncbi=GiToTaxid.getID(gi);
=====================================
current/tax/TaxTree.java
=====================================
@@ -2491,7 +2491,8 @@ public class TaxTree implements Serializable{
private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
private static final String defaultImgFile="TAX_PATH/imgDump.txt";
- private static final String defaultTableFile="TAX_PATH/gitable.int1d.gz";
+ private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz";
+ private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz";
private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz";
private static final String defaultPatternFile="TAX_PATH/patterns.txt";
private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz";
@@ -2526,7 +2527,7 @@ public class TaxTree implements Serializable{
/** Path to all taxonomy files, substituted in to make specific file paths */
public static String TAX_PATH=defaultTaxPath();
- /** Location of gitable.int1d.gz for gi lookups */
+ /** Location of gitable.int2d.gz for gi lookups */
public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);}
/** Location of tree.taxtree.gz */
public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);}
=====================================
docs/changelog.txt
=====================================
@@ -897,6 +897,10 @@ Wrapped instances of byte array instantiation in allocByte1D.
38.91
Fixed SendSketch json array format for very large input, on client side.
Added maxload flag to BBCMS.
+38.97
+Added trimtips to BBDuk, mainly for trimming adapters on both ends of PacBio reads.
+Changed processing of reads longer than 200bp to force ASCII-33 quality.
+Enable automatic entryfilter in Clumpify to handle libraries with mostly identical reads.
todo: outshort for fungalrelease.
=====================================
removecatdogmousehuman.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
This script requires at least 52GB RAM.
It is designed for NERSC and uses hard-coded paths.
@@ -68,7 +68,7 @@ calcXmx () {
calcXmx "$@"
function removecatdogmousehuman() {
- local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/projectb/sandbox/gaag/bbtools/mousecatdoghuman/ pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
+ local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/cfs/cdirs/bbtools/mousecatdoghuman/ pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
echo $CMD >&2
eval $CMD
}
=====================================
removehuman.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
This script requires at least 16GB RAM.
It is designed for NERSC and uses hard-coded paths.
@@ -66,7 +66,7 @@ calcXmx () {
calcXmx "$@"
function removehuman() {
- local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/projectb/sandbox/gaag/bbtools/hg19 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
+ local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.9 maxindel=3 bwr=0.16 bw=12 quickmatch fast minhits=2 path=/global/cfs/cdirs/bbtools/hg19 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 kfilter=25 maxsites=1 k=14 bloomfilter $@"
echo $CMD >&2
eval $CMD
}
=====================================
removehuman2.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
This script requires at least 17GB RAM.
It is designed for NERSC and uses hard-coded paths.
@@ -62,7 +62,7 @@ calcXmx () {
calcXmx "$@"
function removehuman() {
- local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.75 maxindel=8 bwr=0.22 bw=26 minhits=1 path=/global/projectb/sandbox/gaag/bbtools/hg19 build=2 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 maxsites=1 k=14 tipsearch=0 kfilter=25 bloomfilter $@"
+ local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap minratio=0.75 maxindel=8 bwr=0.22 bw=26 minhits=1 path=/global/cfs/cdirs/bbtools/hg19 build=2 pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag usemodulo printunmappedcount ztd=2 maxsites=1 k=14 tipsearch=0 kfilter=25 bloomfilter $@"
echo $CMD >&2
eval $CMD
}
=====================================
removemicrobes.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified September 17, 2018
+Last modified December 22, 2021
This script requires at least 10GB RAM.
It is designed for NERSC and uses hard-coded paths.
@@ -73,7 +73,7 @@ calcXmx () {
calcXmx "$@"
function removemicrobes() {
- local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap strictmaxindel=4 bwr=0.16 bw=12 ef=0.001 minhits=2 path=/global/projectb/sandbox/gaag/bbtools/commonMicrobes pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag printunmappedcount ztd=2 kfilter=25 maxsites=1 k=13 minid=0.95 idfilter=0.95 minhits=2 build=1 bloomfilter $@"
+ local CMD="java $EA $EOOM $z $z2 $JNI -cp $CP align2.BBMap strictmaxindel=4 bwr=0.16 bw=12 ef=0.001 minhits=2 path=/global/cfs/cdirs/bbtools/commonMicrobes pigz unpigz zl=6 qtrim=r trimq=10 untrim idtag printunmappedcount ztd=2 kfilter=25 maxsites=1 k=13 minid=0.95 idfilter=0.95 minhits=2 build=1 bloomfilter $@"
echo $CMD >&2
eval $CMD
}
=====================================
resources/PacBioAdapter.fa
=====================================
@@ -0,0 +1,2 @@
+>PacBio_gDNA_Amplification_Adapter
+AAGCAGTGGTATCAACGCAGAGTACT
=====================================
resources/contents.txt
=====================================
@@ -82,3 +82,6 @@ These files contain consensus sequences of various conserved RNAs (rRNAs and tRN
model.pgm
Prokaryotic gene model, used by CallGenes and BBSketch in protein mode.
+PacBioAdapter.fa
+A sequence found at read tips in some PacBio low input libraries.
+
=====================================
resources/short.fa
=====================================
@@ -12,3 +12,10 @@ GTTCAGAGTTCTACAGTCCGACGATC
TCGTATGCCGTCTTCTGCTTGT
>contam_241
TGGAATTCTCGGGTGCCAAGG
+>adapter_19mer_with_extensions
+NAATGATACGGCGACCACCGNN
+AAATGATACGGCGACCACCGAN
+CAATGATACGGCGACCACCGCN
+GAATGATACGGCGACCACCGGN
+TAATGATACGGCGACCACCGTN
+
=====================================
shred.sh
=====================================
@@ -3,7 +3,7 @@
usage(){
echo "
Written by Brian Bushnell
-Last modified September 1, 2016
+Last modified January 13, 2022
Description: Shreds sequences into shorter, potentially overlapping sequences.
Usage: shred.sh in=<file> out=<file> length=<number> minlength=<number> overlap=<number>
@@ -17,6 +17,7 @@ reads=-1 If nonnegative, stop after this many input sequences.
equal=f Shred each sequence into subsequences of equal size of at most 'length', instead of a fixed size.
median=0 If nonzero, randomly shred reads to a length with this median.
variance=0 If median is nonzero, shred to lengths of median +-variance.
+qfake=30 Quality score, if using fastq output.
Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
"
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/1c662261847da712e5b04b94277f8aa712c37619
--
View it on GitLab: https://salsa.debian.org/med-team/bbmap/-/commit/1c662261847da712e5b04b94277f8aa712c37619
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220803/2086d088/attachment-0001.htm>
More information about the debian-med-commit
mailing list