[med-svn] [Git][med-team/cd-hit][master] 9 commits: New upstream version 4.8.1

Andreas Tille gitlab at salsa.debian.org
Fri Jul 19 18:38:12 BST 2019



Andreas Tille pushed to branch master at Debian Med / cd-hit


Commits:
dbd76db7 by Andreas Tille at 2019-07-19T17:21:26Z
New upstream version 4.8.1
- - - - -
d8ea28f9 by Andreas Tille at 2019-07-19T17:21:29Z
Update upstream source from tag 'upstream/4.8.1'

Update to upstream version '4.8.1'
with Debian dir eed5e76590ad0786c8d3b7d121cccb5b977fbb62
- - - - -
6a92f72d by Andreas Tille at 2019-07-19T17:21:29Z
New upstream version

- - - - -
2cd7c628 by Andreas Tille at 2019-07-19T17:21:29Z
debhelper 12

- - - - -
92f1ef9d by Andreas Tille at 2019-07-19T17:21:32Z
Standards-Version: 4.4.0

- - - - -
6eff93ea by Andreas Tille at 2019-07-19T17:21:32Z
Remove trailing whitespace in debian/rules

- - - - -
4044d76b by Andreas Tille at 2019-07-19T17:26:28Z
Do not parse d/changelog

- - - - -
d46a324b by Andreas Tille at 2019-07-19T17:33:40Z
Build-Depends: zlib1g-dev

- - - - -
6a193fc4 by Andreas Tille at 2019-07-19T17:35:57Z
Upload to unstable

- - - - -


20 changed files:

- Makefile
- README
- + cd-hit-clstr_2_blm8.pl
- cdhit-common.c++
- cdhit-common.h
- cdhit-utility.c++
- debian/changelog
- debian/compat
- debian/control
- debian/rules
- psi-cd-hit/psi-cd-hit-local.pl
- usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
- + usecases/Miseq-16S/NG-Omics-Miseq-16S.py
- + usecases/Miseq-16S/NG-Omics-WF.py
- usecases/Miseq-16S/README
- usecases/Miseq-16S/pool_samples.pl
- + usecases/Miseq-16S/silva-ana1.pl
- + usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
- + usecases/miRNA-seq/clstr_2_miRNA-table.pl
- + usecases/miRNA-seq/filter-small-cluster.pl


Changes:

=====================================
Makefile
=====================================
@@ -1,10 +1,8 @@
-
 CC = g++ -Wall -ggdb
 CC = g++ -pg
 CC = g++
 
-# without OpenMP
-
+# default with OpenMP
 # with OpenMP
 # in command line: 
 # make openmp=yes
@@ -14,6 +12,21 @@ else
   CCFLAGS = -fopenmp
 endif
 
+#LDFLAGS = -static -lz -o
+#LDFLAGS = /usr/lib/x86_64-linux-gnu/libz.a -o
+
+# default with zlib
+# without zlib
+# in command line:
+# make zlib=no
+ifeq ($(zlib),no)
+  CCFLAGS += 
+  LDFLAGS += -o
+else
+  CCFLAGS += -DWITH_ZLIB
+  LDFLAGS += -lz -o
+endif
+
 # support debugging
 # in command line:
 # make debug=yes
@@ -28,9 +41,6 @@ ifdef MAX_SEQ
 CCFLAGS += -DMAX_SEQ=$(MAX_SEQ)
 endif
 
-#LDFLAGS = -static -o
-LDFLAGS += -o
-
 PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
 
 # Propagate hardening flags


=====================================
README
=====================================
@@ -1,8 +1,22 @@
 For cd-hit
 
-How to compile?
+Requirements
+Since 4.8.1, cd-hit supports .gz format input file. This requires zlib library. zlib should
+be install in most Linux systems, so cd-hit should be compiled without issue. If your system
+don't have zlib, please install it first. 
+    * On Ubuntu, to install zlib:
+        sudo apt install zlib1g-dev
+    * On CentOS, to install zlib:
+        sudo yum install zlib-devel 
+
+
+How to compile
   1. Compile with multi-threading support (default): make
   2. Compile without multi-threading support (if you are on very old systems): make openmp=no
+  3. Compile without zlib (if you can not install zlib): make zlib=no
+
+Having problems to compile
+Please contact the author
 
 
 For cd-hit-auxtools
@@ -10,9 +24,16 @@ For cd-hit-auxtools
   make
 
 
-For psi-cd-hit
-  please download legacy BLAST (not BLAST+) and install the executables in your $PATH
+Compile cd-hit on MacOS
+To install CD-HIT on MacOS, first install gcc on your system.
+To use Homebrew (https://brew.sh/), see https://brewformulas.org/gcc. 
+Then locate the path to your g++ executable, (e.g. /usr/local/Cellar/gcc/6.3.0_1/bin/g++-6, 
+note: yours g++ path is likely to be different), then use command like this:
+  make CC=/usr/local/Cellar/gcc/6.3.0_1/bin/g++-6
+
 
+For psi-cd-hit
+  please download BLAST+ (not legacy BLAST) and install the executables in your $PATH
 
 For more information, please visit http://cd-hit.org
 


=====================================
cd-hit-clstr_2_blm8.pl
=====================================
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+#
+
+my $rep;
+my @non_reps = ();
+my @aln_info = ();
+while($ll=<>){
+  if ($ll =~ /^>/ ) {
+    if (@non_reps) {
+      for ($i=0; $i<@non_reps; $i++) {
+        print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
+      }
+    }
+    $rep = "";
+    @non_reps = ();
+    @aln_info = ();
+  }
+  else {
+    chop($ll);
+    if ($ll =~ />(.+)\.\.\./ ) {
+      my $id = $1;
+      if ($ll =~ /\*$/) { $rep = $id}
+      else              { 
+        push(@non_reps, $id);
+        my @lls = split(/\s+/, $ll);
+        my ($a, $iden) = split(/\//, $lls[-1]);
+        chop($iden); ### removing % sign
+        my ($qb, $qe, $sb, $se) = split(/:/, $a);
+        my $alnln = $qe-$qb+1;
+        my $mis = int($alnln * (100-$iden) / 100);
+        my $gap = 0;
+        my $exp = 0;
+        my $bit = $alnln*3 - $mis*6; #### very rough
+        push(@aln_info, "$iden\t$alnln\t$mis\t$gap\t$qb\t$qe\t$sb\t$se\t$exp\t$bit");
+      }
+    }
+#>Cluster 582
+#0       6671aa, >gi|302514050|uid|51... *
+#1       339aa, >SRS584717|scaffold|... at 2:332:4020:4356/89.32%
+#2       182aa, >SRS584717|scaffold|... at 1:182:6490:6671/100.00%
+#3       367aa, >SRS584717|scaffold|... at 1:332:4543:4874/90.66%
+#4       109aa, >SRS584717|scaffold|... at 1:108:5782:5889/97.22%
+
+  }
+}
+    if (@non_reps) {
+      for ($i=0; $i<@non_reps; $i++) {
+        print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
+      }
+    }
+
+#query                          subject         %       alnln   mis     gap     q_b     q_e     s_b     s_e     expect  bits
+##0                              1               2       3       4       5       6       7       8       9       10      11
+#mHE-SRS012902|scaffold|86.16    gnl|CDD|226997  47.62   42      17      2       164     201     210     250     5e-04   37.6
+#mHE-SRS012902|scaffold|109.23   gnl|CDD|225183  47.46   236     122     1       1       236     475     708     1e-92    284
+#mHE-SRS012902|scaffold|109.23   gnl|CDD|224055  44.35   239     130     2       1       239     332     567     2e-84    259
+#mHE-SRS012902|scaffold|109.23   gnl|CDD|227321  39.50   238     140     3       1       238     324     557     9e-69    218
+


=====================================
cdhit-common.c++
=====================================
@@ -804,6 +804,7 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
 	int band_width = band_right - band_left + 1;
 	int band_width1 = band_width + 1;
 
+    // score_mat, back_mat [i][j]: i index of seqi (0 to len(seqi)-1), j index of band (0 to band_width-1)
 	MatrixInt64 & score_mat = buffer.score_mat;
 	MatrixInt   & back_mat = buffer.back_mat;
 
@@ -823,24 +824,25 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
 	}
 
 	best_score = 0;
-	/*
-	   seq2 len2 = 17            seq2 len2 = 17      seq2 len2 = 17
-	   01234567890123456       01234567890123456    01234567890123456
-	   0     xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx    xXXXXXXXxxxxxxxxx
-	   1\\\\\Xxxxxxxxxxxxxxxxx  \\\\\Xxx\xxxxxxxxxxxxx    xx\xxxxx\xxxxxxxx
-	   2 \\\\X\xxxxxxxxxxxxxxx   \\\\Xxxx\xxxxxxxxxxxx    xxx\xxxxx\xxxxxxx
-	   seq1 3  \\\Xx\xxxxxxxxxxxxxx    \\\Xxxxx\xxxxxxxxxxx    xxxx\xxxxx\xxxxxx
-	   len1 4   \\Xxx\xxxxxxxxxxxxx     \\Xxxxxx\xxxxxxxxxx    xxxxx\xxxxx\xxxxx
-	   = 11 5    \Xxxx\xxxxxxxxxxxx      \Xxxxxxx\xxxxxxxxx    xxxxxx\xxxxx\xxxx
-	   6     Xxxxx\xxxxxxxxxxx       Xxxxxxxx\xxxxxxxx    xxxxxxx\xxxxx\xxx
-	   7     x\xxxx\xxxxxxxxxx       x\xxxxxxx\xxxxxxx    xxxxxxxx\xxxxx\xx
-	   8     xx\xxxx\xxxxxxxxx       xx\xxxxxxx\xxxxxx    xxxxxxxxx\xxxxx\x
-	   9     xxx\xxxx\xxxxxxxx       xxx\xxxxxxx\xxxxx    xxxxxxxxxx\xxxxx\
-	   0     xxxx\xxxx\xxxxxxx       xxxx\xxxxxxx\xxxx    xxxxxxxxxxx\xxxxx
-	   band_left < 0           band_left < 0        band_left >=0
-	   band_right < 0          band_right >=0       band_right >=0
-	   init score_mat, and iden_mat (place with upper 'X')
-	 */
+	/* seq1 is query, seq2 is rep
+                  seq2    len2 = 17       seq2    len2 = 17    seq2    len2 = 17
+                  01234567890123456       01234567890123456    01234567890123456
+       0          xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx    xXXXXXXXxxxxxxxxx
+       1     \\\\\Xxxxxxxxxxxxxxxxx  \\\\\Xxx\xxxxxxxxxxxxx    xx\xxxxx\xxxxxxxx
+       2      \\\\X\xxxxxxxxxxxxxxx   \\\\Xxxx\xxxxxxxxxxxx    xxx\xxxxx\xxxxxxx
+  seq1 3       \\\Xx\xxxxxxxxxxxxxx    \\\Xxxxx\xxxxxxxxxxx    xxxx\xxxxx\xxxxxx
+  len1 4        \\Xxx\xxxxxxxxxxxxx     \\Xxxxxx\xxxxxxxxxx    xxxxx\xxxxx\xxxxx
+  = 11 5         \Xxxx\xxxxxxxxxxxx      \Xxxxxxx\xxxxxxxxx    xxxxxx\xxxxx\xxxx
+       6          Xxxxx\xxxxxxxxxxx       Xxxxxxxx\xxxxxxxx    xxxxxxx\xxxxx\xxx
+       7          x\xxxx\xxxxxxxxxx       x\xxxxxxx\xxxxxxx    xxxxxxxx\xxxxx\xx
+       8          xx\xxxx\xxxxxxxxx       xx\xxxxxxx\xxxxxx    xxxxxxxxx\xxxxx\x
+       9          xxx\xxxx\xxxxxxxx       xxx\xxxxxxx\xxxxx    xxxxxxxxxx\xxxxx\
+       0          xxxx\xxxx\xxxxxxx       xxxx\xxxxxxx\xxxx    xxxxxxxxxxx\xxxxx
+                  band_left < 0 (-6)      band_left < 0 (-6)   band_left >=0 (1)
+                  band_right < 0 (-1)     band_right >=0 (2)   band_right >=0(7)
+                  band_width 6            band_width 9         band_width 7
+       init score_mat, and iden_mat (place with upper 'X')
+     */
 
 	if (band_left < 0) {  //set score to left border of the matrix within band
 		int tband = (band_right < 0) ? band_right : 0;
@@ -849,6 +851,7 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
 			i = -k;
 			j1 = k-band_left;
 			// penalty for leading gap opening = penalty for gap extension
+            // each of the left side query hunging residues give ext_gap (-1)
 			score_mat[i][j1] =  mat.ext_gap * i;
 			back_mat[i][j1] = DP_BACK_TOP;
 		}
@@ -1678,6 +1681,101 @@ void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf
 	}
 }
 
+// by liwz gzip version 2019-02
+// by liwz
+// disable swap option
+// change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
+// where des_begin is the FILE pointer of sequence record start
+//       tot_length is the total bytes of sequence record 
+void SequenceDB::Readgz( const char *file, const Options & options )
+{
+#ifdef WITH_ZLIB
+    Sequence one;
+    Sequence des;
+    gzFile fin = gzopen(file, "r");
+    char *buffer = NULL;
+    char *res = NULL;
+    int option_l = options.min_length;
+    if( fin == NULL ) bomb_error( "Failed to open the database file" );
+    Clear();
+    buffer = new char[ MAX_LINE_SIZE+1 ];
+
+    while (not gzeof( fin ) || one.size) { /* do not break when the last sequence is not handled */
+        buffer[0] = '>';
+        if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL && one.size == 0) break;
+        if( buffer[0] == '+' ){
+            int len = strlen( buffer );
+            int len2 = len;
+            while( len2 && buffer[len2-1] != '\n' ){
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            one.tot_length += len;
+
+            // read next line quality score
+            if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+            len = strlen( buffer );
+            len2 = len;
+            while( len2 && buffer[len2-1] != '\n' ){
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            one.tot_length += len;
+        }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+            if ( one.size ) { // write previous record
+                if( one.identifier == NULL || one.Format() ){
+                    printf( "Warning: from file \"%s\",\n", file );
+                    printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+                    if( one.identifier ) printf( "%s\n", one.identifier );
+                    printf( "%s\n", one.data );
+                    one.size = 0;
+                }
+                one.index = sequences.size();
+                if( one.size > option_l ) {
+                    if (options.trim_len    > 0) one.trim(options.trim_len);
+                    sequences.Append( new Sequence( one ) ); 
+                }
+            }
+            one.size = 0;
+            one.tot_length = 0;
+
+            int len = strlen( buffer );
+            int len2 = len;
+            des.size = 0;
+            des += buffer;
+            while( len2 && buffer[len2-1] != '\n' ){
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                des += buffer;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            size_t offset = gztell( fin );
+            one.des_begin = offset - len;
+            one.tot_length += len;              // count first line
+
+            int i = 0;
+            if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+            if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+            if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+            while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+            des.data[i] = 0;
+            one.identifier = des.data;
+        } else {
+            one.tot_length += strlen(buffer);  one += buffer;
+        }
+    }
+    one.identifier = NULL;
+    delete[] buffer;
+    gzclose( fin );
+#else
+    bomb_error("this program was not compiled with zlib");
+#endif
+}
+
+
+
 // by liwz
 // disable swap option
 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
@@ -1685,6 +1783,12 @@ void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf
 //       tot_length is the total bytes of sequence record 
 void SequenceDB::Read( const char *file, const Options & options )
 {
+    int f_len = strlen(file);
+    if (strcmp(file + f_len - 3, ".gz") == 0 ) {
+        Readgz(file, options);
+        return;
+    }
+
     Sequence one;
     Sequence des;
     FILE *fin = fopen( file, "rb" );
@@ -1773,9 +1877,172 @@ void SequenceDB::Read( const char *file, const Options & options )
     fclose( fin );
 }
 
+
+// by liwz gzip version 2019-02
+// PE reads liwz, disable swap option
+void SequenceDB::Readgz( const char *file, const char *file2, const Options & options )
+{
+#ifdef WITH_ZLIB
+    Sequence one, two;
+    Sequence des;
+    gzFile fin = gzopen(file, "r");
+    gzFile fin2= gzopen(file2,"r");
+    char *buffer = NULL;
+    char *buffer2= NULL;
+    char *res = NULL;
+    char *res2= NULL;
+    int option_l = options.min_length;
+    if( fin == NULL ) bomb_error( "Failed to open the database file" );
+    if( fin2== NULL ) bomb_error( "Failed to open the database file" );
+    Clear();
+    buffer = new char[ MAX_LINE_SIZE+1 ];
+    buffer2= new char[ MAX_LINE_SIZE+1 ];
+
+    while (((not gzeof( fin )) && (not gzeof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
+        buffer[0] = '>'; res =gzgets(fin,  buffer,  MAX_LINE_SIZE);
+        buffer2[0]= '>'; res2=gzgets(fin2, buffer2, MAX_LINE_SIZE);
+
+        if ( (res      == NULL) && (res2     != NULL)) bomb_error( "Paired input files have different number sequences" );
+        if ( (res      != NULL) && (res2     == NULL)) bomb_error( "Paired input files have different number sequences" );
+        if ( (one.size == 0   ) && (two.size >     0)) bomb_error( "Paired input files have different number sequences" );
+        if ( (one.size >  0   ) && (two.size ==    0)) bomb_error( "Paired input files have different number sequences" );
+        if ( (res      == NULL) && (one.size ==    0)) break;
+
+        if( buffer[0] == '+' ){ // fastq 3rd line
+            // file 1
+            int len = strlen( buffer ); 
+            int len2 = len;
+            while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            one.tot_length += len;
+
+            // read next line quality score
+            if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+            len = strlen( buffer );
+            len2 = len;
+            while( len2 && buffer[len2-1] != '\n' ){
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            one.tot_length += len;
+
+            // file 2
+            len = strlen( buffer2 );
+            len2 = len;
+            while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
+                if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer2 );
+                len += len2;
+            }
+            two.tot_length += len;
+
+            // read next line quality score
+            if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+            len = strlen( buffer2 );
+            len2 = len;
+            while( len2 && buffer2[len2-1] != '\n' ){
+                if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer2 );
+                len += len2;
+            }
+            two.tot_length += len;
+
+        }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+            if ( one.size && two.size ) { // write previous record
+                if( one.identifier == NULL || one.Format() ){
+                    printf( "Warning: from file \"%s\",\n", file );
+                    printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+                    if( one.identifier ) printf( "%s\n", one.identifier );
+                    printf( "%s\n", one.data );
+                    one.size=0; two.size=0;
+                }
+                if( two.identifier == NULL || two.Format() ){
+                    printf( "Warning: from file \"%s\",\n", file2 );
+                    printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+                    if( two.identifier ) printf( "%s\n", two.identifier );
+                    printf( "%s\n", two.data );
+                    one.size=0; two.size = 0;
+                }
+                one.index = sequences.size();
+                if( (one.size + two.size)> option_l ) {
+                    if (options.trim_len    > 0) one.trim(options.trim_len);
+                    if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
+                    sequences.Append( new Sequence( one, two, 1 ) ); 
+                }
+            }
+            // R1
+            one.size = 0;
+            one.tot_length = 0;
+
+            int len = strlen( buffer );
+            int len2 = len;
+            des.size = 0;
+            des += buffer;
+            while( len2 && buffer[len2-1] != '\n' ){
+                if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+                des += buffer;
+                len2 = strlen( buffer );
+                len += len2;
+            }
+            size_t offset = gztell( fin );    
+            one.des_begin = offset - len; // offset of ">" or "@" 
+            one.tot_length += len;              // count first line
+
+            int i = 0;
+            if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+            if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+            if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+            while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+            des.data[i] = 0;                   // find first non-space letter
+            one.identifier = des.data;
+
+            // R2
+            two.size = 0;
+            two.tot_length = 0;
+
+            len = strlen( buffer2 );
+            len2 = len;
+            while( len2 && buffer2[len2-1] != '\n' ){
+                if ( (res=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+                len2 = strlen( buffer2 );
+                len += len2;
+            }
+            offset = gztell( fin2 );
+            two.des_begin = offset - len;
+            two.tot_length += len;              // count first line
+            two.identifier = des.data;
+        } else {
+            one.tot_length += strlen(buffer);  one += buffer;
+            two.tot_length+= strlen(buffer2); two+= buffer2;
+        }
+    }
+    one.identifier = NULL;
+    two.identifier = NULL;
+    delete[] buffer;
+    gzclose( fin );
+    delete[] buffer2;
+    gzclose( fin2 );
+#else
+    bomb_error("this program was not compiled with zlib");
+#endif
+
+}
+
 // PE reads liwz, disable swap option
 void SequenceDB::Read( const char *file, const char *file2, const Options & options )
 {
+    int f_len = strlen(file);
+    int f_len2= strlen(file2);
+    if (strcmp(file + f_len - 3, ".gz") == 0 ) {
+        if ( strcmp(file2 + f_len2 - 3, ".gz") ) bomb_error( "Both input files need to be in .gz format" );
+        Readgz(file, file2, options);
+        return;
+    }
+
     Sequence one, two;
     Sequence des;
     FILE *fin = fopen( file, "rb" );
@@ -2076,8 +2343,53 @@ void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Opt
 	fclose( fout );
 	delete []buf;
 }
+
+// input db is gzipped
+void SequenceDB::WriteClustersgz( const char *db, const char *newdb, const Options & options )
+{
+#ifdef WITH_ZLIB
+    gzFile fin = gzopen(db, "r");
+	FILE *fout = fopen( newdb, "w+" );
+	int i, j, n = rep_seqs.size();
+	int count, rest;
+	char *buf = new char[MAX_LINE_SIZE+1];
+	vector<uint64_t> sorting( n );
+	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
+	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
+	std::sort( sorting.begin(), sorting.end() );
+	for (i=0; i<n; i++){
+		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
+		gzseek( fin, seq->des_begin, SEEK_SET );
+
+		count = seq->tot_length / MAX_LINE_SIZE;
+		rest  = seq->tot_length % MAX_LINE_SIZE;
+		//printf( "count = %6i,  rest = %6i\n", count, rest );
+		for (j=0; j<count; j++){
+			if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, MAX_LINE_SIZE, fout );
+		}
+		if( rest ){
+			if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, rest, fout );
+		}
+	}
+	gzclose( fin );
+	fclose( fout );
+	delete []buf;
+#else
+    bomb_error("this program was not compiled with zlib");
+#endif
+
+}
+
 void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options & options )
 {
+    int f_len = strlen(db);
+    if (strcmp(db + f_len - 3, ".gz") == 0 ) {
+        WriteClustersgz(db, newdb, options);
+        return;
+    }
+
 	FILE *fin = fopen( db, "rb" );
 	FILE *fout = fopen( newdb, "w+" );
 	int i, j, n = rep_seqs.size();
@@ -2107,9 +2419,101 @@ void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options
 	fclose( fout );
 	delete []buf;
 }
+
+
+// input db is gzipped
+// liwz PE output
+void SequenceDB::WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
+{
+#ifdef WITH_ZLIB
+    gzFile fin    = gzopen(db,    "r");
+	gzFile fin_pe = gzopen(db_pe, "r");
+	FILE *fout = fopen( newdb, "w+" );
+	FILE *fout_pe = fopen( newdb_pe, "w+" );
+	int i, j, n = rep_seqs.size();
+	int count, rest;
+	char *buf = new char[MAX_LINE_SIZE+1];
+	vector<uint64_t> sorting( n );
+	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
+	if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
+	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
+	std::sort( sorting.begin(), sorting.end() );
+
+        //sort fasta / fastq
+        int *clstr_size;
+        int *clstr_idx1;
+        if (options.sort_outputf) {
+            clstr_size = new int[n];
+            clstr_idx1 = new int[n];
+            for (i=0; i<n; i++) { 
+                clstr_size[i] = 0;
+                clstr_idx1[i]  = i;
+            }
+
+            int N = sequences.size();
+            for (i=0; i<N; i++) { 
+                int id = sequences[i]->cluster_id;
+                if (id < 0) continue;
+                if (id >=n) continue;
+                clstr_size[id]++;
+            }
+            quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
+        }
+
+	for (i=0; i<n; i++){
+		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
+                if (options.sort_outputf) seq = sequences[  rep_seqs[ clstr_idx1[i] ] ];
+                //R1
+		gzseek( fin, seq->des_begin, SEEK_SET );
+
+		count = seq->tot_length / MAX_LINE_SIZE;
+		rest  = seq->tot_length % MAX_LINE_SIZE;
+		//printf( "count = %6i,  rest = %6i\n", count, rest );
+		for (j=0; j<count; j++){
+			if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, MAX_LINE_SIZE, fout );
+		}
+		if( rest ){
+			if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, rest, fout );
+		}
+
+                //R2
+		gzseek( fin_pe, seq->des_begin2, SEEK_SET );
+
+		count = seq->tot_length2 / MAX_LINE_SIZE;
+		rest  = seq->tot_length2 % MAX_LINE_SIZE;
+		//printf( "count = %6i,  rest = %6i\n", count, rest );
+		for (j=0; j<count; j++){
+			if( gzread(fin_pe, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
+		}
+		if( rest ){
+			if( gzread(fin_pe, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+			fwrite( buf, 1, rest, fout_pe );
+		}
+
+	}
+	gzclose( fin );
+	gzclose( fin_pe );
+	fclose( fout );
+	fclose( fout_pe );
+	delete []buf;
+#else
+    bomb_error("this program was not compiled with zlib");
+#endif
+}
+
+
 // liwz PE output
 void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
 {
+    int f_len = strlen(db); 
+    if (strcmp(db + f_len - 3, ".gz") == 0 ) {
+        WriteClustersgz(db, db_pe, newdb, newdb_pe, options);
+        return;
+    }
+
 	FILE *fin = fopen( db, "rb" );
 	FILE *fout = fopen( newdb, "w+" );
 	FILE *fin_pe = fopen( db_pe, "rb" );
@@ -2816,7 +3220,7 @@ void SequenceDB::DoClustering( int T, const Options & options )
 	}
 	printf( "\n%9i  finished  %9i  clusters\n", sequences.size(), rep_seqs.size() );
 	mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
-	printf( "\nApprixmated maximum memory consumption: %zuM\n", mem );
+	printf( "\nApproximated maximum memory consumption: %zuM\n", mem );
 	last_table.Clear();
 	word_table.Clear();
 }
@@ -2865,9 +3269,9 @@ int SequenceDB::CheckOneAA( Sequence *seq, WordTable & table, WorkingParam & par
 		}
 	}
 
-        //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
-        //longer seqeunce * -aL -band_width
-        if ( S ) {
+    //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
+    //longer seqeunce * -aL -band_width
+    if ( S ) {
 		int min = table.sequences[S-1]->size;
 		int min_red = min * options.long_coverage - options.band_width;
 		if (len < min_red) return 0; // return flag=0
@@ -3310,6 +3714,14 @@ void SequenceDB::DoClustering( const Options & options )
 		size_t max_items = opts.max_entries;
 		size_t max_seqs = opts.max_sequences;
 		size_t items = 0;
+
+// find a block from i to m, so that this block can fit into a word table
+//     ...
+//  i  ++++++++++++++++++++++++++
+//     ++++++++++++++++++++
+//     ++++++++++++++++
+//  m  +++++++++++++
+//     ...
 		while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
 			Sequence *seq = sequences[m];
 			if( ! (seq->state & IS_REDUNDANT) ){
@@ -3322,7 +3734,7 @@ void SequenceDB::DoClustering( const Options & options )
 		if( m > N ) m = N;
 		printf( "\rcomparing sequences from  %9i  to  %9i\n", i, m );
 		fflush( stdout );
-		for(int ks=i; ks<m; ks++){
+		for(int ks=i; ks<m; ks++){ // clustering this block
 			Sequence *seq = sequences[ks];
 			i = ks + 1;
 			if (seq->state & IS_REDUNDANT) continue;
@@ -3332,11 +3744,11 @@ void SequenceDB::DoClustering( const Options & options )
 			if( word_table.size >= max_items ) break;
 			int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
 			if( word_table.sequences.size() >= tmax ) break;
-		}
+		} // finishing word table from this block
 		m = i;
 		if( word_table.size == 0 ) continue;
 		float p0 = 0;
-		for(int j=m; j<N; j++){
+		for(int j=m; j<N; j++){ // use this word table to screen rest sequences m->N
 			Sequence *seq = sequences[j];
 			if (seq->state & IS_REDUNDANT) continue;
 			if ( options.store_disk ) seq->SwapIn();
@@ -3360,7 +3772,7 @@ void SequenceDB::DoClustering( const Options & options )
 	}
 	printf( "\n%9i  finished  %9i  clusters\n", sequences.size(), rep_seqs.size() );
 	mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
-	printf( "\nApprixmated maximum memory consumption: %liM\n", mem );
+	printf( "\nApproximated maximum memory consumption: %liM\n", mem );
 	temp_files.Clear();
 	word_table.Clear();
 


=====================================
cdhit-common.h
=====================================
@@ -35,11 +35,15 @@
 #include<stdint.h>
 #include<time.h>
 
+#ifdef WITH_ZLIB
+#include<zlib.h>
+#endif
+
 #include<valarray>
 #include<vector>
 #include<map>
 
-#define CDHIT_VERSION  "4.7"
+#define CDHIT_VERSION  "4.8.1"
 
 #ifndef MAX_SEQ
 #define MAX_SEQ 655360
@@ -280,10 +284,10 @@ struct Options
 	int     frag_size;
 	int     option_r;
 	int     threads;
-	int	PE_mode;     // -P
-        int     trim_len;    // -cx
-        int     trim_len_R2; // -cy
-        int     align_pos;   // -ap for alignment position
+	int     PE_mode;     // -P
+    int     trim_len;    // -cx
+    int     trim_len_R2; // -cy
+    int     align_pos;   // -ap for alignment position
 
 	size_t  max_entries;
 	size_t  max_sequences;
@@ -303,8 +307,8 @@ struct Options
 	string  output;
 	string  output_pe;
         
-        int     sort_output;  // -sc
-        int     sort_outputf; // -sf
+    int     sort_output;  // -sc
+    int     sort_outputf; // -sf
 
 	Options(){
 		backupFile = false;
@@ -342,12 +346,12 @@ struct Options
 		frag_size = 0;
 		des_len = 20;
 		threads = 1;
-                PE_mode = 0;
-                trim_len = 0;
-                trim_len_R2 = 0;
-                align_pos = 0;
-                sort_output = 0;
-                sort_outputf = 0;
+        PE_mode = 0;
+        trim_len = 0;
+        trim_len_R2 = 0;
+        align_pos = 0;
+        sort_output = 0;
+        sort_outputf = 0;
 		max_entries = 0;
 		max_sequences = 1<<20;
 		mem_limit = 100000000;
@@ -374,7 +378,7 @@ struct Sequence
 	// length of the sequence:
 	int   size;
 	int   bufsize;
-        int   size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
+    int   size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
 
 	//uint32_t stats;
 
@@ -387,8 +391,8 @@ struct Sequence
 
 	// stream offset of the description string in the database:
 	size_t   des_begin, des_begin2;
-        // total record length
-        int   tot_length, tot_length2;
+    // total record length
+    int   tot_length, tot_length2;
 
 	char *identifier;
 
@@ -417,7 +421,7 @@ struct Sequence
 	int Format();
 
 	void ConvertBases();
-        void trim(int trim_len);
+    void trim(int trim_len);
 
 	void SwapIn();
 	void SwapOut();
@@ -559,9 +563,17 @@ class SequenceDB
 		~SequenceDB(){ Clear(); }
 
 		void Read( const char *file, const Options & options );
+		void Readgz( const char *file, const Options & options );
+
 		void Read( const char *file, const char *file2, const Options & options );
+		void Readgz( const char *file, const char *file2, const Options & options );
+
 		void WriteClusters( const char *db, const char *newdb, const Options & options );
+		void WriteClustersgz( const char *db, const char *newdb, const Options & options );
+
 		void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
+		void WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
+
 		void WriteExtra1D( const Options & options );
 		void WriteExtra2D( SequenceDB & other, const Options & options );
 		void DivideSave( const char *db, const char *newdb, int n, const Options & options );


=====================================
cdhit-utility.c++
=====================================
@@ -13,18 +13,19 @@ char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artif
 //
 
 char contacts[] =
-  "   Questions, bugs, contact Limin Fu at l2fu at ucsd.edu, or Weizhong Li at liwz at sdsc.edu\n"
-  "   For updated versions and information, please visit: http://cd-hit.org\n\n"
+  "   Questions, bugs, contact Weizhong Li at liwz at sdsc.edu\n"
+  "   For updated versions and information, please visit: http://cd-hit.org\n"
+  "                                                    or https://github.com/weizhongli/cdhit\n\n"
   "   cd-hit web server is also available from http://cd-hit.org\n\n"
   "   If you find cd-hit useful, please kindly cite:\n\n";
 
-char txt_option_i[] = "\tinput filename in fasta format, required\n";
+char txt_option_i[] = "\tinput filename in fasta format, required, can be in .gz format\n";
 char txt_option_j[] = 
 "\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
 \t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
 \t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
-char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
-char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
+char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required, can be in .gz format\n";
+char txt_option_i2[] = "\tinput filename for db2 in fasta format, required, can be in .gz format\n";
 char txt_option_j2[] = 
 "\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
 \t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
@@ -34,12 +35,12 @@ char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (
 char txt_option_c[] = 
 "\tsequence identity threshold, default 0.9\n \
 \tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
 \tdivided by the full length of the shorter sequence\n";
 char txt_option_G[] = 
 "\tuse global sequence identity, default 1\n \
 \tif set to 0, then use local sequence identity, calculated as :\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
 \tdivided by the length of the alignment\n \
 \tNOTE!!! don't use -G 0 unless you use alignment coverage controls\n \
 \tsee options -aL, -AL, -aS, -AS\n";
@@ -135,7 +136,8 @@ char txt_option_sc[] =
 \tif set to 1, output clusters by decreasing size\n";
 char txt_option_sf[] =
 "\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
-\tif set to 1, output sequences by decreasing cluster size\n";
+\tif set to 1, output sequences by decreasing cluster size\n \
+\tthis can be very slow if the input is in .gz format\n";
 
 char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
 char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
@@ -337,7 +339,7 @@ int print_usage_div (char *arg) {
 char mytxt_option_c[] =
 "\tsequence identity threshold, default 0.98\n \
 \tthis is a \"global sequence identity\" calculated as :\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
 \tdivided by the full length of the shorter sequence + gaps\n";
 char mytxt_option_b[] = "\tband_width of alignment, default 10\n";
 char mytxt_option_n_est[] = "\tword_length, default 10, see user's guide for choosing it\n";


=====================================
debian/changelog
=====================================
@@ -1,9 +1,18 @@
-cd-hit (4.6.8-3) UNRELEASED; urgency=medium
+cd-hit (4.8.1-1) unstable; urgency=medium
 
+  [ Jelmer Vernooij ]
   * Use secure copyright file specification URI.
   * Trim trailing whitespace.
 
- -- Jelmer Vernooij <jelmer at debian.org>  Sat, 20 Oct 2018 13:28:38 +0000
+  [ Andreas Tille ]
+  * New upstream version
+  * debhelper 12
+  * Standards-Version: 4.4.0
+  * Remove trailing whitespace in debian/rules
+  * Do not parse d/changelog
+  * Build-Depends: zlib1g-dev
+
+ -- Andreas Tille <tille at debian.org>  Fri, 19 Jul 2019 19:34:25 +0200
 
 cd-hit (4.6.8-2) unstable; urgency=medium
 


=====================================
debian/compat
=====================================
@@ -1 +1 @@
-11
+12


=====================================
debian/control
=====================================
@@ -4,9 +4,10 @@ Uploaders: Tim Booth <tbooth at ceh.ac.uk>,
            Andreas Tille <tille at debian.org>
 Section: science
 Priority: optional
-Build-Depends: debhelper (>= 11~),
-               help2man
-Standards-Version: 4.2.1
+Build-Depends: debhelper (>= 12~),
+               help2man,
+               zlib1g-dev
+Standards-Version: 4.4.0
 Vcs-Browser: https://salsa.debian.org/med-team/cd-hit
 Vcs-Git: https://salsa.debian.org/med-team/cd-hit.git
 Homepage: http://weizhong-lab.ucsd.edu/cd-hit/


=====================================
debian/rules
=====================================
@@ -9,10 +9,9 @@
 
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all
 
-pkg := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
-ver := $(shell dpkg-parsechangelog | sed -ne 's/^Version: \(\([0-9]\+\):\)\?\(.*\)-.*/\3/p')
+include /usr/share/dpkg/default.mk
 
-mandir=$(CURDIR)/debian/$(pkg)/usr/share/man/man1/
+mandir=$(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/
 
 %:
 	dh $@
@@ -23,54 +22,54 @@ override_dh_auto_build:
 override_dh_installman:
 	mkdir -p $(mandir)
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='quickly group sequences' \
 	   $(CURDIR)/cd-hit > $(mandir)/cd-hit.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='quickly group sequences, optimised for 454 data' \
 	   $(CURDIR)/cd-hit-454 > $(mandir)/cd-hit-454.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='quickly group sequences in db1 or db2 format' \
 	   $(CURDIR)/cd-hit-2d > $(mandir)/cd-hit-2d.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='run CD-HIT algorithm on RNA/DNA sequences' \
 	   $(CURDIR)/cd-hit-est > $(mandir)/cd-hit-est.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='run CD-HIT algorithm on RNA/DNA sequences in db1 or db2 format' \
 	   $(CURDIR)/cd-hit-est-2d > $(mandir)/cd-hit-est-2d.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='divide a big clustering job into pieces to run cd-hit-2d or cd-hit-est-2d jobs' \
 	   $(CURDIR)/cd-hit-2d-para.pl	> $(mandir)/cd-hit-2d-para.1
 
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	   --name='divide a big clustering job into pieces to run cd-hit or cd-hit-est jobs' \
 	    $(CURDIR)/cd-hit-para.pl	> $(mandir)/cd-hit-para.1
-	
-	help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+
+	help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
            --name='divide a FASTA file into pieces' \
             $(CURDIR)/cd-hit-div.pl    > $(mandir)/cd-hit-div.1
 
 	# psi-cd-hit.pl is throwing some errors which are fixed using sed
 	# ... at least in versions <= 4.6.4 - now help2man does not work at all
-	#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	#   --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities' \
 	#   $(CURDIR)/psi-cd-hit.pl	| \
 	#   sed -e '/^Name "main::.*" used only once:/d' \
 	#	> $(mandir)/psi-cd-hit.1
 
 	# help2man does not work at all since version 4.6.4
-	#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	#   --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
 	#      $(CURDIR)/psi-cd-hit-2d.pl	> $(mandir)/psi-cd-hit-2d.1
 
 	# FIXME: what is the difference between psi-cd-hit-2d.pl and psi-cd-hit-2d-g1.pl ?
 	# help2man does not work at all since version 4.6.4
-	#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+	#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
 	#   --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
 	#      $(CURDIR)/psi-cd-hit-2d-g1.pl	> $(mandir)/psi-cd-hit-2d-g1.1
 
@@ -87,7 +86,7 @@ override_dh_installman:
 	#   psi-cd-hit-local.pl    -> even throws several "used only once: possible typo" errors
 
 override_dh_auto_install:
-	dh_auto_install -- PREFIX=debian/$(pkg)/usr/lib/cd-hit
+	dh_auto_install -- PREFIX=debian/$(DEB_SOURCE)/usr/lib/cd-hit
 
 override_dh_compress:
 	dh_compress --exclude=.pdf


=====================================
psi-cd-hit/psi-cd-hit-local.pl
=====================================
@@ -13,21 +13,22 @@ our $opt_aS    = 0.0;  #
 our $opt_aL    = 0.0;  #
 our $circle    = 0;    #
 our $opt_g     = 1;    ####################
-our $blast_exe = "blastall -p blastp -m 8";               #########################
-our $prof_exe  = "blastpgp -m 8";                         #
-our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500";      #
-our $prof_db   = "";                                      #
-our $bl_para   = "-F T -e 0.000001 -b 100000 -v 100000";  #  program
-our $bl_STDIN  = 1;                                       #
-our $keep_bl   = 0;                                       #
-our $blast_prog= "blastp";                                #
-our $formatdb  = "formatdb";                              #########################
+
+our $blast_exe = "blastp";                                             #########################
+our $bl_para   = "-seg yes -evalue 0.000001 -max_target_seqs 100000";  #  program
+our $bl_para_u = "";                                                   #
+our $bl_STDIN  = 1;                                                    #
+our $keep_bl   = 0;                                                    #
+our $blast_prog= "blastp";                                             #
+our $formatdb  = "makeblastdb";                                        #########################
+
 our $exec_mode = "local";      #######################
-our $num_qsub   = 1;           #
+our $num_qsub  = 1;            #
 our $para_no   = 1;            # compute
 our $sh_file   = "";           #
 our $num_multi_seq = 50;       #
 our $batch_no_per_node = 100;  #######################
+
 our $reformat_seg = 50000;
 our $restart_seg  = 20000;
 our $job          = "";
@@ -73,9 +74,7 @@ sub parse_para_etc {
     elsif ($arg eq "-sl")         { $skip_long = shift; }
     ## program
     elsif ($arg eq "-prog")       { $blast_prog= shift; }
-    elsif ($arg eq "-p")          { $prof_para = shift; }
-    elsif ($arg eq "-dprof")      { $prof_db   = shift; die "option -dprof no longer supported!";}
-    elsif ($arg eq "-s")          { $bl_para   = shift; }
+    elsif ($arg eq "-s")          { $bl_para_u = shift; }
     elsif ($arg eq "-k")          { $keep_bl   = shift; }
     elsif ($arg eq "-bs")         { $bl_STDIN  = shift; }
     ## compute
@@ -103,38 +102,35 @@ sub parse_para_etc {
     print_usage(); exit();
   }
 
-  if ($blast_prog eq "blastn") {
-    $formatdb  = "formatdb -p F";
-    $blast_exe    = "blastall -p blastn -m 8";
-  }
-  elsif ($blast_prog eq "megablast") {
-    $blast_prog = "blastn"; #### back to blastn for blast parser type
-    $formatdb  = "formatdb -p F";
-    $blast_exe    = "megablast -H 100 -D 2 -m 8";
-  }
-  elsif ($blast_prog eq "blastpgp") {
-    $blast_exe  = "blastpgp -m 8 -j 3";
-  }
-
   #### for blast+
   if ($bl_plus) {
-    $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
-    $blast_exe = "blastp -outfmt 6";
-    $bl_para   = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
-
-    if ($blast_prog eq "blastn") {
-      $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
-      $blast_exe    = "blastn -task blastn -outfmt 6";
-      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
+    if ($blast_prog eq "blastp") {
+      $formatdb  = "makeblastdb -dbtype prot -max_file_sz 2GB";
+      $blast_exe = "blastp -outfmt 6";
+      $bl_para   = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads";  #  program
+    }
+    elsif ($blast_prog eq "psiblast") {
+      $formatdb  = "makeblastdb -dbtype prot -max_file_sz 2GB";
+      $blast_exe = "psiblast -outfmt 6 -num_iterations 3";
+      $bl_para   = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads";  #  program
+    }
+    elsif ($blast_prog eq "blastn") {
+      $formatdb  = "makeblastdb -dbtype nucl -max_file_sz 2GB";
+      $blast_exe = "blastn -task blastn -outfmt 6";
+      $bl_para   = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads";  #  program
     }
     elsif ($blast_prog eq "megablast") {
-      $blast_prog = "blastn"; #### back to blastn for blast parser type
-      $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
-      $blast_exe    = "blastn -task megablast -outfmt 6";
-      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
+      $formatdb  = "makeblastdb -dbtype nucl -max_file_sz 2GB";
+      $blast_exe = "blastn -task megablast -outfmt 6";
+      $bl_para   = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads";  #  program
+      $blast_prog= "blastn"; #### back to blastn for blast parser type
+    }
+    else {
+      print "Unknown blast program: $blast_prog\n";
+      print_usage(); exit();
     }
-    elsif ($blast_prog eq "blastpgp") {
-      $blast_exe  = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
+    if ($bl_para_u) {
+      $bl_para = "$bl_para_u -num_threads $bl_threads";
     }
   }
 
@@ -142,6 +138,13 @@ sub parse_para_etc {
     $blast_exe = "$bl_path/$blast_exe";
     $formatdb  = "$bl_path/$formatdb";
   }
+  my $bl_ver = `$blast_exe -version`;
+  if ($bl_ver =~ /blast/) {
+    print "BLAST version:\n$bl_ver\n\n";
+  }
+  else {
+    die "BLAST program not found: $blast_exe\n";
+  }
 
   (-e $db_in) || die "No input";
   ($db_out)   || die "No output";
@@ -576,7 +579,7 @@ sub keep_strand_with_top_hsp {
 }
 ########## END keep_strand_with_top_hsp
 
-########## for blastpgp -j no (no>1)
+########## for psiblast -j no (no>1)
 ########## keep hits from the last round
 sub keep_hsp_of_last_round {
   my $self = shift;
@@ -691,7 +694,7 @@ sub process_blout_blastp_blastn {
   my $len_rep = 0;
   my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
   if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
-  if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
+  if ($blast_prog eq "psiblast" ) {keep_hsp_of_last_round($bl); }
 
   if ($g_iden == 0 ) { #### Local identity
     keep_top_hsp($bl); #### local alignment, only the top HSP
@@ -1084,11 +1087,7 @@ Options
              long sequences will not be clustered anyway.
         -sl 0 means no skipping
    program:
-     -prog (blastp, blastn, megablast, blastpgp), default $blast_prog
-     -p  profile search para, default 
-         "$prof_para"
-     -dprof database for building PSSM, default using input
-         you can also use another database that is more comprehensive like NR80
+     -prog (blastp, blastn, megablast, psiblast), default $blast_prog
      -s  blast search para, default 
          "$bl_para"
      -bs (1/0) default $bl_STDIN


=====================================
usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
=====================================
@@ -105,7 +105,7 @@ $CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5
 $CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
 $CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
 $CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
-$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
+$CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
 rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
 
 EOD


=====================================
usecases/Miseq-16S/NG-Omics-Miseq-16S.py
=====================================
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+################################################################################
+# NGS workflow by Weizhong Li, http://weizhongli-lab.org
+################################################################################
+
+queue_system = 'SGE'
+
+########## local variables etc. Please edit
+ENV={
+  'CD_HIT_dir'           : '/home/oasis/data/NGS-ann-project/apps/cdhit.git',
+  'NGS_prog_trimmomatic' : '/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar',
+}
+
+########## computation resources for execution of jobs
+NGS_executions = {}
+NGS_executions['qsub_1'] = {
+  'type'                : 'qsub-pe',
+  'cores_per_node'      : 8,
+  'number_nodes'        : 64,
+  'user'                : 'weizhong', #### I will use command such as qstat -u weizhong to query submitted jobs
+  'command'             : 'qsub',
+  'command_name_opt'    : '-N',
+  'command_err_opt'     : '-e',
+  'command_out_opt'     : '-o',
+  'template'            : '''#!/bin/bash
+##$ -q RNA.q
+#$ -v PATH
+#$ -V
+
+'''
+}
+
+NGS_executions['sh_1'] = {
+  'type'                : 'sh',
+  'cores_per_node'      : 8,
+  'number_nodes'        : 1,
+  'template'            : '''#!/bin/bash
+
+'''
+}
+
+
+NGS_batch_jobs = {}
+NGS_batch_jobs['qc'] = {
+  'CMD_opts'         : ['100'],
+  'execution'        : 'qsub_1',               # where to execute
+  'cores_per_cmd'    : 4,                    # number of threads used by command below
+  'no_parallel'      : 1,                    # number of total jobs to run using command below
+  'command'          : '''
+java -jar $ENV.NGS_prog_trimmomatic PE -threads 4 -phred33 $DATA.0 $DATA.1 $SELF/R1.fq $SELF/R1-s.fq $SELF/R2.fq $SELF/R2-s.fq \\
+    SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$CMDOPTS.0 MAXINFO:80:0.5 1>$SELF/qc.stdout 2>$SELF/qc.stderr
+
+perl -e '$i=0; while(<>){ if (/^@/) {$i++;  print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa &
+perl -e '$i=0; while(<>){ if (/^@/) {$i++;  print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa &
+
+wait
+rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
+'''
+}
+
+
+NGS_batch_jobs['otu'] = {
+  'injobs'            : ['qc'],
+  'CMD_opts'          : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'],
+  'execution'         : 'qsub_1',               # where to execute
+  'cores_per_cmd'     : 2,                    # number of threads used by command below
+  'no_parallel'       : 1,                    # number of total jobs to run using command below
+  'command'           : '''
+#### cluster at 100% PE
+$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0  -n 10 -G 1 -b 1  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.nr.log
+#### cluster at 99% PE and SE for R1,R2 
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr   -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.99.log
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\
+    -i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\
+    -a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f
+$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr       $SELF/seq.99f.clstr     > $SELF/seq.99f-all.clstr
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.97.log
+$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.97.ref.log
+$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr
+$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr
+
+rm -f $SELF/seq.chimeric-clstr.R1    $SELF/seq.chimeric-clstr.R1.log    $SELF/seq.chimeric-clstr.R2    $SELF/seq.chimeric-clstr.R2.log 
+rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log
+mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt
+
+'''
+}
+
+
+NGS_batch_jobs['otu-pooled'] = {
+  'CMD_opts'          : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'],
+  'execution'         : 'qsub_1',               # where to execute
+  'cores_per_cmd'     : 2,                    # number of threads used by command below
+  'no_parallel'       : 1,                    # number of total jobs to run using command below
+  'command'           : '''
+#### before running
+#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
+$ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.log
+$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
+    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.ref.log
+$ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
+$ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
+rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
+
+'''
+}
+


=====================================
usecases/Miseq-16S/NG-Omics-WF.py
=====================================
@@ -0,0 +1,888 @@
+#!/usr/bin/env python
+# =============================== NG-Omics-WF ==================================
+#  _   _  _____         ____            _              __          ________ 
+# | \ | |/ ____|       / __ \          (_)             \ \        / /  ____|
+# |  \| | |  __ ______| |  | |_ __ ___  _  ___ ___ _____\ \  /\  / /| |__   
+# | . ` | | |_ |______| |  | | '_ ` _ \| |/ __/ __|______\ \/  \/ / |  __|  
+# | |\  | |__| |      | |__| | | | | | | | (__\__ \       \  /\  /  | |     
+# |_| \_|\_____|       \____/|_| |_| |_|_|\___|___/        \/  \/   |_|     
+#                                                                           
+# Workflow tools for next generation genomics, metagenomics, RNA-seq 
+# and other type of omics data analysis, 
+#
+# Software originally developed since 2010 by Weizhong Li at UCSD
+#                                               currently at JCVI
+#
+# https://github.com/weizhongli/ngomicswf       liwz at sdsc.edu
+# ==============================================================================
+
+import os
+import sys
+import re
+import argparse 
+from argparse import RawTextHelpFormatter
+import math
+import subprocess
+import time
+import logging
+import textwrap
+import imp
+import collections
+import xml.etree.ElementTree as ET
+
+__author__ = 'Weizhong Li'
+
+############## Global variables
+banner = '''
+            ==================================================================
+            Workflow tools for next generation genomics, metagenomics, RNA-seq
+            and other type of omics data analyiss,
+        
+            Software originally developed since 2010 by Weizhong Li at UCSD
+                                                          currently at JCVI
+        
+            http://weizhongli-lab.org/ngomicswf           liwz at sdsc.edu
+            ==================================================================
+'''
+NGS_config = None
+NGS_samples = []
+NGS_sample_data = {}
+NGS_opts = {}
+pwd = os.path.abspath('.')
+subset_flag = False
+subset_jobs = []
+qstat_xml_data = collections.defaultdict(dict)
+job_list = collections.defaultdict(dict)  # as job_list[$t_job_id][$t_sample_id] = {}
+execution_submitted = {}                  # number of submitted jobs (qsub) or threads (local sh)
+############## END Global variables
+
+
+def fatal_error(message, exit_code=1):
+  print message
+  exit(exit_code)
+
+
+def read_parameters(args):
+  '''read option parameters from file or command line'''
+  if args.parameter_file:
+    try:
+      ##format example
+      ##JobID_A opt0 opt1 opt2
+      ##JobID_B opt0 opt1
+      ##JobID_C opt0 opt1 opt2 opt3
+      f = open(args.parameter_file, 'r')
+      for line in f:
+        if line[0] == '#':
+          continue
+        if not re.match('^\w', line):
+          continue
+        ll = re.split('\s+', line.rstrip());
+        NGS_opts[ ll[0] ] = ll[1:]
+      f.close()
+    except IOError:
+      fatal_error('cannot open ' + args.parameter_file, exit_code=1)
+
+  elif args.parameter_name:
+    for line in re.split(',', args.parameter_name):
+      ll = re.split(':', line);
+      NGS_opts[ ll[0] ] = ll[1:]
+  return
+
+
+def read_samples(args):
+  '''read sample and sample data from file or command line'''
+  if args.sample_file:
+    try:
+      f = open(args.sample_file, 'r')
+      for line in f:
+        if line[0] == '#':
+          continue
+        if not re.match('^\w', line):
+          continue
+        ll = re.split('\s+', line.rstrip());
+        NGS_samples.append(ll[0]);
+        NGS_sample_data[ ll[0] ] = ll[1:]
+      f.close()
+    except IOError:
+      fatal_error('cannot open ' + args.sample_file, exit_code=1)
+
+  elif args.sample_name:
+    for line in re.split(',', args.sample_name):
+      ll = re.split(':', line);
+      NGS_samples.append(ll[0]);
+      NGS_sample_data[ ll[0] ] = ll[1:]
+  else:
+    fatal_error('no input sample', exit_code=1)
+
+  for sample in NGS_samples:
+    if os.path.exists(sample):
+      if os.path.isdir(sample):
+        pass
+      else:
+        fatal_error('file exist: ' + sample, exit_code=1)
+    else:
+      if os.system("mkdir " + sample):
+        fatal_error('can not mkdir: ' + sample, exit_code=1)
+  return
+
+
+def task_level_jobs(NGS_config):
+  '''assign each job a dependency level: jobs with no injobs get 1, others get max(injob levels)+1'''
+  job_level = {}
+  while True:
+    change_flag = False
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      t_injobs = []
+      if 'injobs' in t_job.keys():
+        t_injobs = t_job['injobs']
+
+      if len(t_injobs) > 0:
+        max_level_injob = 0
+        for j in t_injobs:
+          if not j in job_level.keys():
+            continue
+          if job_level[j] > max_level_injob:    # was unreachable dead code after the continue
+            max_level_injob = job_level[j]
+        if max_level_injob == 0:                # no injob leveled yet; retry on a later pass (was == 1)
+          continue
+        max_level_injob +=1  #### one more level 
+        if (t_job_id in job_level.keys()) and (job_level[t_job_id] >= max_level_injob):
+          continue
+        job_level[t_job_id]=max_level_injob
+        change_flag = True
+      else:
+        if not t_job_id in job_level.keys():
+          job_level[t_job_id]=1
+          change_flag = True
+
+    if not change_flag: break
+
+  # print job_level
+  for t_job_id in NGS_config.NGS_batch_jobs.keys():
+    NGS_config.NGS_batch_jobs[t_job_id]['job_level'] = job_level[t_job_id]
+      
+  return
+#### END task_level_jobs(NGS_config)
+
+
+def add_subset_jobs_by_dependency(NGS_config):
+  '''add dependant jobs'''
+  while True:
+    num_subset_jobs = len(subset_jobs)
+    for t_job_id in subset_jobs:
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      if 'injobs' in t_job.keys():
+        for j in t_job['injobs']:
+          if not (j in subset_jobs):
+            subset_jobs.append(j)
+    if num_subset_jobs == len(subset_jobs): break
+  return
+#### END add_subset_jobs_by_dependency()
+
+       
+def make_job_list(NGS_config):
+  '''make sh script for each job / sample'''
+
+  verify_flag = False
+  for t_job_id in NGS_config.NGS_batch_jobs:
+    if subset_flag and not (t_job_id in subset_jobs):
+      continue
+
+    t_job = NGS_config.NGS_batch_jobs[ t_job_id ]
+    t_execution = NGS_config.NGS_executions[ t_job["execution"] ]
+
+    pe_parameter = ''
+    if t_execution[ 'type' ] == 'qsub-pe':
+      t_cores_per_cmd  = t_job[ 'cores_per_cmd' ]
+      pe_parameter = "#$ -pe orte " + str(t_cores_per_cmd)
+      if 'pe_para' in t_execution.keys():
+        pe_parameter = "#$ " + t_execution[ 'pe_para' ] + " " +  str(t_cores_per_cmd)
+
+    if t_job[ 'cores_per_cmd' ] > t_execution[ 'cores_per_node' ]:
+      fatal_error('not enough cores ' + t_job_id, exit_code=1)
+
+    t_job[ 'cmds_per_node' ] = t_execution[ 'cores_per_node' ] / t_job[ 'cores_per_cmd' ]
+    t_job[ 'nodes_total' ] = math.ceil( t_job[ 'no_parallel' ] / float(t_job[ 'cmds_per_node' ]))
+ 
+    if t_job[ 'nodes_total' ] > t_execution[ 'number_nodes' ]:
+      fatal_error('not enough nodes ' + t_job, exit_code=1)
+
+    CMD_opts = []
+    if 'CMD_opts' in t_job.keys():  
+      CMD_opts = t_job[ 'CMD_opts' ]
+    if t_job_id in NGS_opts.keys():
+      CMD_opts = NGS_opts[ t_job_id ]
+
+    for t_sample_id in NGS_samples:
+      t_command = t_job[ 'command' ]
+      t_command = re.sub('\$SAMPLE', t_sample_id, t_command)
+      t_command = re.sub('\$SELF'  , t_job_id, t_command)
+
+      for i in NGS_config.ENV.keys():
+        t_command = re.sub('\$ENV.'+i, NGS_config.ENV[i], t_command)
+
+      for i_data in range(0, len(NGS_sample_data[ t_sample_id ])):
+        t_data = NGS_sample_data[ t_sample_id ][i_data]
+        t_re = '\$DATA\.' + str(i_data)
+        t_command = re.sub(t_re, t_data, t_command)
+
+      t_injobs = []
+      if 'injobs' in t_job.keys():
+        t_injobs = t_job[ 'injobs' ]
+        for i_data in range(0, len(t_job[ 'injobs' ])):
+          t_data = t_job[ 'injobs' ][i_data]
+          t_re = '\$INJOBS\.' + str(i_data)
+          t_command = re.sub(t_re, t_data, t_command)
+
+      for i_data in range(0, len(CMD_opts)):
+        t_data = CMD_opts[i_data]
+        t_re = '\$CMDOPTS\.' + str(i_data)
+        t_command = re.sub(t_re, t_data, t_command)
+
+      v_command = ''
+      if 'non_zero_files' in t_job.keys():
+        for t_data in t_job[ 'non_zero_files' ]:
+          v_command = v_command + \
+            'if ! [ -s {0}/{1} ]; then echo "zero size {2}/{3}"; exit; fi\n'.format(t_job_id, t_data, t_job_id, t_data)
+
+      f_start    = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.start.date'
+      f_complete = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.complete.date'
+      f_cpu      = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.cpu'
+      t_sh_file  = '{0}/WF-sh/{1}.{2}.sh'.format(pwd, t_job_id, t_sample_id)
+      t_infiles = []
+      if 'infiles' in t_job.keys():
+        t_infiles = map(lambda x: t_sample_id + "/" + x, t_job[ 'infiles' ])
+      job_list[ t_job_id ][ t_sample_id ] = {
+        'sample_id'    : t_sample_id,
+        'job_id'       : t_job_id,
+        'status'       : 'wait',       #### status can be wait (input not ready), ready (input ready), submitted (submitted or running), completed
+        'command'      : t_command,
+        'sh_file'      : t_sh_file, 
+        'infiles'      : t_infiles,
+        'injobs'       : t_injobs,
+        'start_file'   : f_start,
+        'complete_file': f_complete,
+        'cpu_file'     : f_cpu }
+
+      if not os.path.exists( t_sh_file ):
+        try:
+          tsh = open(t_sh_file, 'w')
+          tsh.write('''{0}
+{1}
+
+my_host=`hostname`
+my_pid=$$
+my_core={2}
+my_queue={3}
+my_time_start=`date +%s`
+
+cd {4}/{5}
+mkdir {6}
+if ! [ -f {7} ]; then date +%s > {7};  fi
+{8}
+{9}
+date +%s > {10}
+my_time_end=`date +%s`;
+my_time_spent=$((my_time_end-my_time_start))
+echo "sample={5} job={6} host=$my_host pid=$my_pid queue=$my_queue cores=$my_core time_start=$my_time_start time_end=$my_time_end time_spent=$my_time_spent" >> {11}
+
+'''.format(t_execution['template'], pe_parameter, t_job['cores_per_cmd'], t_job['execution'], pwd, t_sample_id, t_job_id, f_start, t_command, v_command, f_complete, f_cpu ))
+          tsh.close()
+          os.system('chmod u+x '+ t_sh_file)
+        except IOError:
+          fatal_error('cannot write to ' + job_list[ t_job_id ][ t_sample_id ][ 'sh_file' ], exit_code=1)
+  return
+### END def make_job_list(NGS_config):
+
+
+def time_str1(s):
+  str1 = str(s/3600) + 'h'
+  s = s % 3600
+  str1 = str1 + str(s/60) + 'm'
+  s = s % 60
+  str1 = str1 + str(s) + 's'
+  return str1
+
+
+def task_list_jobs(NGS_config):
+  for t_job_id in NGS_config.NGS_batch_jobs.keys():
+    t_job = NGS_config.NGS_batch_jobs[t_job_id]
+
+    t_injobs = []
+    if 'injobs' in t_job.keys():
+      t_injobs  = t_job['injobs']
+    print '{0}\tIn_jobs:[ {1} ]\tJob_level:{2}\n'.format(t_job_id, ','.join(t_injobs), t_job['job_level'] )
+
+
+def task_snapshot(NGS_config):
+  '''print job status'''
+  queue_system = NGS_config.queue_system   #### default "SGE"
+  this_task = True
+  if this_task:
+    flag_qstat_xml_call = False
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      t_execution = NGS_config.NGS_executions[ t_job['execution']]
+      exe_type = t_execution['type']
+      if (queue_system == 'SGE') and (exe_type in ['qsub','qsub-pe']):
+        flag_qstat_xml_call = True
+
+    if flag_qstat_xml_call:
+      SGE_qstat_xml_query()
+
+    for t_sample_id in NGS_samples:
+      for t_job_id in NGS_config.NGS_batch_jobs.keys():
+        if subset_flag:
+          if not (t_job_id in subset_jobs):
+            continue
+        check_submitted_job(NGS_config, t_job_id, t_sample_id)
+
+  max_len_sample = 0;
+  for t_sample_id in NGS_samples:
+    if len(t_sample_id) > max_len_sample:
+      max_len_sample = len(t_sample_id)
+  max_len_job = 0;
+  for t_job_id in NGS_config.NGS_batch_jobs.keys():
+    if len(t_job_id) > max_len_job:
+      max_len_job = len(t_job_id)
+
+  print '''
+Job status: 
+.\twait
+-\tsubmitted
+r\trunning  
++\tcompleted
+!\terror
+x\tunselected job
+'''
+
+  for i1 in range(max_len_job):
+    i = max_len_job - i1 - 1
+    print ' ' * max_len_sample + "\t", 
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      if i < len(t_job_id):
+        print ' ' + t_job_id[-i-1],
+      else:
+        print '  ',
+    print ''
+  print 'Sample\t' + ('-' * 30)
+
+  for t_sample_id in NGS_samples:
+    print t_sample_id + '\t',
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      if subset_flag:
+        if not (t_job_id in subset_jobs):
+          print ' x',
+          continue
+      t_sample_job = job_list[t_job_id][t_sample_id]
+      status = t_sample_job['status']
+      if   status == 'completed': print ' +',
+      elif status == 'submitted': print ' -',
+      elif status == 'running':   print ' r',
+      elif status == 'wait':      print ' .',
+      elif status == 'error':     print ' !',
+      else:                       print ' _',
+    print ''
+
+  print '\n\n'
+  return
+### def task_snapshot():
+
+
+def task_delete_jobs(NGS_config, opt):
+  '''manually delete jobs and its dependant jobs'''
+
+  mode, c = re.split(':', opt)
+  tmp_sh = 'NGS-{0}.sh'.format(os.getpid())
+
+  try:
+    tsh = open(tmp_sh, 'w')
+  except IOError:
+    fatal_error('cannot write to ' + tmp_sh, exit_code=1)
+  
+  tsh.write('#Please execute the following commands\n')
+
+  for t_sample_id in NGS_samples:
+    job_to_delete_ids =[]
+    if mode == 'jobids':
+      job_to_delete_ids = re.split(',', c)
+    elif mode == 'run_after':
+      if not os.path.exists(c):
+        fatal_error('File does not exist:' + c, exit_code=1)
+      for t_job_id in NGS_config.NGS_batch_jobs.keys():
+        if subset_flag:
+          if not (t_job_id in subset_jobs): continue
+        t_sample_job = job_list[t_job_id][t_sample_id]
+
+        t_sh_pid  = t_sample_job['sh_file'] + '.pids'
+        if not os.path.exists(t_sh_pid): continue
+        if file1_same_or_after_file2(t_sh_pid, c):
+          job_to_delete_ids.append(t_job_id)
+    else:
+      fatal_error('unknown option for deleting jobs: ' + opt, exit_code=1)
+
+    # now job_to_delete_ids are jobs need to be deleted
+    # next find all jobs that depend on them, recursively
+    no_jobs_to_delete = len(job_to_delete_ids)
+    while True:
+      for t_job_id in NGS_config.NGS_batch_jobs.keys():
+        if subset_flag:
+          if not (t_job_id in subset_jobs): continue
+        t_sample_job = job_list[t_job_id][t_sample_id]
+
+        t_sh_pid  = t_sample_job['sh_file'] + '.pids'
+        if not os.path.exists(t_sh_pid): continue
+
+        for t_job_id_2 in t_sample_job['injobs']:
+          if t_job_id_2 in job_to_delete_ids:
+            if not t_job_id in job_to_delete_ids:
+              job_to_delete_ids.append(t_job_id)
+
+      if no_jobs_to_delete == len(job_to_delete_ids): break
+      no_jobs_to_delete = len(job_to_delete_ids)
+
+    if not no_jobs_to_delete: continue
+    tsh.write('#jobs to be deleted for ' + t_sample_id + ': ' + ' '.join(job_to_delete_ids) + '\n'), 
+
+    for t_job_id in job_to_delete_ids:
+      t_sample_job = job_list[t_job_id][t_sample_id]
+      t_sh_pid  = t_sample_job['sh_file'] + '.pids'
+      tsh.write('\\rm -rf {0}/{1}\n'.format(t_sample_id, t_job_id))
+      tsh.write('\\rm '+ t_sh_pid + '\n')
+
+      #### find the qsub ids to be deleted 
+      pids = [] #### either pids, or qsub ids
+      try:
+        f = open(t_sh_pid, 'r')
+        pids = f.readlines()
+        f.close()
+        pids = [x.strip() for x in pids]
+      except IOError:
+        fatal_error('cannot open ' + t_sh_pid, exit_code=1)
+      tsh.write('qdel ' + ' '.join([str(x) for x in pids]) + '\n')
+
+    tsh.write('\n\n')
+  tsh.close()
+  print 'The script does not delete the files, please run ' + tmp_sh + ' to delete files!!!\n\n';
+  return
+#### END def task_delete_jobs()
+
+
+def file1_same_or_after_file2(file1, file2):
+  # if not exist file1, assume it is in future, so it is newer
+  if not os.path.exists(file1): return True
+  # otherwise file1 exist 
+  # if file2 not exist
+  if not os.path.exists(file2): return False
+  if os.path.getmtime(file1) >= os.path.getmtime(file2): return True
+  else:                                                  return False
+
+
+def SGE_qstat_xml_query():
+  '''run qstat -f -xml and get xml tree'''
+  global qstat_xml_data
+  qstat_xml_data = collections.defaultdict(dict)
+  t_out = ''
+  try:
+    t_out  = subprocess.check_output(['qstat -f -xml'], shell=True)
+  except:
+    fatal_error("can not run qstat", exit_code=1)
+
+  qstat_xml_root = ET.fromstring(t_out)
+  #qstat_xml_root = qstat_xml.getroot()
+  for job_list in qstat_xml_root.iter('job_list'):
+    job_id    = job_list.find('JB_job_number').text
+    job_name  = job_list.find('JB_name').text
+    job_state = job_list.find('state').text
+    qstat_xml_data[job_id] = [job_name, job_state]
+
+  return
+#### END def SGE_qstat_xml_query()
+
+
+def print_job_status_summary(NGS_config):
+  '''print jobs status'''
+  job_status = {}
+  job_total = 0
+
+  for t_job_id in NGS_config.NGS_batch_jobs.keys():
+    if subset_flag:
+      if not (t_job_id in subset_jobs):
+        continue
+    for t_sample_id in NGS_samples:
+      status = job_list[t_job_id][t_sample_id]['status']
+      if status in job_status.keys():
+        job_status[status] +=1
+      else:
+        job_status[status] =1
+      job_total +=1
+
+  print 'total jobs: {0},'.format(job_total) , 
+  for i in job_status.keys():
+    print '{0}: {1}, '.format(i, job_status[i]), 
+  print '\n'
+
+
+def run_workflow(NGS_config):
+  '''major loop for workflow run'''
+  queue_system = NGS_config.queue_system   #### default "SGE"
+  sleep_time_min = 15
+  sleep_time_max = 120
+  sleep_time = sleep_time_min
+
+  while 1:
+    flag_job_done = True
+    ########## reset execution_submitted to 0
+    for i in NGS_config.NGS_executions.keys():
+      execution_submitted[ i ] = False
+
+    flag_qstat_xml_call = False
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      t_execution = NGS_config.NGS_executions[ t_job['execution']]
+      exe_type = t_execution['type']
+      if (queue_system == 'SGE') and (exe_type in ['qsub','qsub-pe']):
+        flag_qstat_xml_call = True
+
+    if flag_qstat_xml_call:
+      SGE_qstat_xml_query()
+
+    ########## check and update job status for submitted jobs
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      if subset_flag:
+        if not (t_job_id in subset_jobs):
+          continue
+      for t_sample_id in NGS_samples:
+        t_sample_job = job_list[t_job_id][t_sample_id]
+        if t_sample_job['status'] == 'completed':
+          continue
+
+        check_submitted_job(NGS_config, t_job_id, t_sample_id)
+        if t_sample_job['status'] == 'completed':
+          continue
+        flag_job_done = False
+
+    if flag_job_done:
+      print_job_status_summary(NGS_config)
+      print 'job completed!'
+      break
+
+    ########## check and update job status based on dependance 
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      if subset_flag:
+        if not (t_job_id in subset_jobs):
+          continue
+      for t_sample_id in NGS_samples:
+        t_sample_job = job_list[t_job_id][t_sample_id]
+        if t_sample_job['status'] != 'wait':
+          continue
+
+        t_ready_flag = True
+        for i in t_sample_job['infiles']:
+          if os.path.exists(i) and os.path.getsize(i) > 0:
+            continue
+          t_ready_flag = False
+          break
+
+        for i in t_sample_job['injobs']:
+          if job_list[i][t_sample_id]['status'] == 'completed':
+            continue
+          t_ready_flag = False
+          break
+
+        if t_ready_flag:
+          t_sample_job['status'] = 'ready'
+          print '{0},{1}: change status to ready\n'.format(t_job_id, t_sample_id)
+    ########## END check and update job status based on dependance 
+
+    ########## submit local sh jobs
+    has_submitted_some_jobs = False
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      if subset_flag:
+        if not (t_job_id in subset_jobs):
+          continue
+      t_execution = NGS_config.NGS_executions[ t_job['execution']]
+      t_execution_id = t_job['execution']
+      if t_execution['type'] != 'sh': 
+        continue
+      if execution_submitted[t_execution_id] >= t_execution['cores_per_node']:
+        continue
+      for t_sample_id in NGS_samples:
+        t_sample_job = job_list[t_job_id][t_sample_id]
+        if t_sample_job['status'] != 'ready':
+          continue
+        if (execution_submitted[t_execution_id] + t_job['cores_per_cmd'] * t_job['no_parallel']) > \
+            t_execution['cores_per_node']: #### no enough available cores
+          continue
+
+        #### now submitting 
+        pid_file = open( t_sample_job['sh_file'] + '.pids', 'w')
+        for i in range(0, t_job['no_parallel']):
+          p = subprocess.Popen(['/bin/bash', t_sample_job['sh_file']])
+          pid_file.write(str(p.pid)+'\n')
+        pid_file.close()
+        t_sample_job['status'] = 'submitted'
+        print '{0},{1}: change status to submitted\n'.format(t_job_id, t_sample_id)
+        execution_submitted[ t_execution_id ] += t_job['cores_per_cmd'] * t_job['no_parallel'] 
+        has_submitted_some_jobs = True
+    ########## END submit local sh jobs
+
+    ########## submit qsub-pe jobs, multiple jobs may share same node
+    for t_job_id in NGS_config.NGS_batch_jobs.keys():
+      t_job = NGS_config.NGS_batch_jobs[t_job_id]
+      if subset_flag:
+        if not (t_job_id in subset_jobs):
+          continue
+      t_execution = NGS_config.NGS_executions[ t_job['execution']]
+      t_execution_id = t_job['execution']
+
+      if t_execution['type'] != 'qsub-pe':
+        continue
+      if execution_submitted[t_execution_id] >= t_execution['number_nodes']:
+        continue
+      t_cores_per_node = t_execution['cores_per_node']
+      t_cores_per_cmd  = t_job['cores_per_cmd']
+      t_cores_per_job  = t_cores_per_cmd * t_job['no_parallel']
+      t_nodes_per_job  = t_cores_per_job / t_cores_per_node
+
+      for t_sample_id in NGS_samples:
+        t_sample_job = job_list[t_job_id][t_sample_id]
+        if t_sample_job['status'] != 'ready':
+          continue
+
+        #### now submitting 
+        pid_file = open( t_sample_job['sh_file'] + '.pids', 'w')
+        for i in range(0, t_job['no_parallel']):
+          t_stderr = t_sample_job['sh_file'] + '.' + str(i) + '.stderr'
+          t_stdout = t_sample_job['sh_file'] + '.' + str(i) + '.stdout'
+
+          qsub_exe = 'qsub'
+          if 'qsub_exe' in t_execution.keys(): qsub_exe = t_execution['qsub_exe']
+          command_line = qsub_exe + ' {0} {1} {2} {3} {4} {5} {6}'.format(t_execution['command_name_opt'], t_job_id,
+                                                                          t_execution['command_err_opt'], t_stderr, 
+                                                                          t_execution['command_out_opt'], t_stdout, t_sample_job['sh_file'])
+          cmd = subprocess.check_output([command_line], shell=True)
+          if re.search('\d+', cmd):
+            pid = re.search('\d+', cmd).group(0)
+            pid_file.write(pid + '\n')
+          else:
+            fatal_error('error submitting jobs')
+          execution_submitted[t_execution_id] += t_nodes_per_job
+          print '{0} submitted for {1}\n'.format(t_sample_job['sh_file'], t_sample_id)
+
+        pid_file.close()
+        t_sample_job['status'] = 'submitted'
+        has_submitted_some_jobs = True
+    ########## END submit qsub-pe jobs, multiple jobs may share same node
+   
+    ########## submit qsub jobs, job bundles disabled here, if need, check the original Perl script
+
+    #### if has submitted some jobs, reset waiting time, otherwise double waiting time
+    print_job_status_summary(NGS_config)
+    if has_submitted_some_jobs:
+      sleep_time = sleep_time_min
+    else:
+      sleep_time  *= 2
+      if sleep_time > sleep_time_max:
+        sleep_time = sleep_time_max
+    time.sleep(sleep_time);
+  #### END while 1:
+  return
+#### END def run_workflow(NGS_config)
+
+
+def check_pid(pid):        
+  '''Check For the existence of a unix pid. '''
+# opt 1, this doesn't print OSError to stderr
+  return os.path.exists('/proc/' + str(pid))
+
+# opt 2,  
+#  try:
+#    os.kill(pid, 0)
+#  except OSError: return False
+#  else:           return True
+
+
+def check_any_pids(pids):
+  '''Check For the existence of a list of unix pids. return True if any one exist'''
+  for pid in pids:
+    if check_pid(pid):
+      return True
+  return False
+
+
+def check_any_qsub_pids(pids):
+  '''Check For the existence of a list of qsub pids. return True if any one exist'''
+  for pid in pids:
+    if pid in qstat_xml_data.keys():
+      return True
+  return False
+
+
+def validate_job_files(t_job_id, t_sample_id):
+  '''return True if necessary file exist'''
+  t_sample_job = job_list[t_job_id][t_sample_id]
+  if not (os.path.exists(t_sample_job['start_file'])    and os.path.getsize(t_sample_job['start_file']) > 0):    return False
+  if not (os.path.exists(t_sample_job['complete_file']) and os.path.getsize(t_sample_job['complete_file']) > 0): return False
+  if not (os.path.exists(t_sample_job['cpu_file'])      and os.path.getsize(t_sample_job['cpu_file']) > 0):      return False
+  return True
+
+
+#### def check_submitted_job()
+def check_submitted_job(NGS_config, t_job_id, t_sample_id):
+  '''
+  check submitted jobs by checking pids or qsub ids
+  update job status from wait|ready -> submitted if pid file exit (in case of restart of this script)
+  update job status from wait|ready|submitted -> completed if sh calls or qsub calls finished
+  '''
+  t_sample_job = job_list[t_job_id][t_sample_id]
+  t_job = NGS_config.NGS_batch_jobs[t_job_id]
+  t_execution = NGS_config.NGS_executions[ t_job['execution']]
+
+  t_sh_pid = t_sample_job['sh_file'] + '.pids'
+  if not os.path.exists(t_sh_pid): return
+
+  status = t_sample_job['status']
+  if ((status == 'wait') or (status == 'ready')):
+    t_sample_job['status'] = 'submitted'
+    print '{0},{1}: change status to submitted\n'.format(t_job_id, t_sample_id)
+
+  pids = [] #### either pids, or qsub ids
+  try:
+    f = open(t_sh_pid, 'r')
+    pids = f.readlines()
+    f.close()
+    pids = [x.strip() for x in pids]
+  except IOError:
+    fatal_error('cannot open ' + t_sh_pid, exit_code=1)
+
+  if len(pids) == 0:
+    fatal_error('empty file ' + t_sh_pid, exit_code=1)
+  
+  exe_type = t_execution['type']
+  if (exe_type == 'sh'):
+    if check_any_pids(pids):    #### still running
+      execution_submitted[ t_job['execution'] ] += t_job['cores_per_cmd'] * t_job['no_parallel']
+    elif validate_job_files(t_job_id, t_sample_id):                       #### job finished
+      t_sample_job['status'] = 'completed'
+      print '{0},{1}: change status to completed\n'.format(t_job_id, t_sample_id)
+    else:
+      t_sample_job['status'] = 'error'
+      print '{0},{1}: change status to error\n'.format(t_job_id, t_sample_id)
+    return
+
+  elif ((exe_type == 'qsub') or (exe_type == 'qsub-pe')):
+    if check_any_qsub_pids(pids):    #### still running
+      pass
+    elif validate_job_files(t_job_id, t_sample_id):                       #### job finished
+      t_sample_job['status'] = 'completed'
+      print '{0},{1}: change status to completed\n'.format(t_job_id, t_sample_id)
+    else:
+      t_sample_job['status'] = 'error'
+      print '{0},{1}: change status to error\n'.format(t_job_id, t_sample_id)
+  else:
+    fatal_error('unknown execution type: '+ exe_type, exit_code=1)
+  return
+#### END def check_submitted_job()
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(formatter_class = RawTextHelpFormatter,
+                                   description     = textwrap.dedent(banner))
+
+  parser.add_argument('-i', '--input',       help="workflow configuration file, required", required=True)
+  parser.add_argument('-s', '--sample_file', help='''
+sample data file, required unless -S is present
+File format example:
+#Sample data file example, TAB or space delimited for following lines
+Sample_ID1 sample_data_0 sample_data_1
+Sample_ID2 sample_data_0 sample_data_1
+Sample_ID3 sample_data_0 sample_data_1
+  ''')
+  parser.add_argument('-S', '--sample_name', help='''
+sample data from command line, required unless -s is present
+format:
+Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
+  ''')
+  parser.add_argument('-t', '--parameter_file', help='''
+replace default parameters in workflow configuration file
+File format example:
+#parameter file example, TAB or space delimited for following lines
+JobID_A opt0 opt1 opt2
+JobID_B opt0 opt1
+  ''')
+  parser.add_argument('-T', '--parameter_name', help='''
+parameter from command line
+format:
+JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
+  ''')
+  parser.add_argument('-j', '--jobs', help='''run sub set of jobs, optional
+the workflow will run all jobs by default.
+to run sub set of jobs: -j qc or -j qc,fastqc
+  ''')
+  parser.add_argument('-J', '--task', help='''optional tasks
+write-sh: write sh files and quit
+log-cpu: gathering cpu time for each run for each sample
+list-jobs: list jobs
+snapshot: snapshot current job status
+delete-jobs: delete jobs, must supply jobs delete syntax by option -Z
+  e.g. -J delete-jobs -Z jobids:assembly,blast  ---delete assembly,blast and all jobs depends on them
+       -J delete-jobs -Z run_after:filename     ---delete jobs that has start time (WF.start.date) after this file, and all depending jobs
+  ''')
+  parser.add_argument('-Z', '--second_parameter', help='secondary parameter used by other options, such as -J')
+  parser.add_argument('-Q', '--queye', help='queue system, e.g. PBS, SGE', default='SGE')
+
+  args = parser.parse_args()
+
+  if (args.sample_file is None) and (args.sample_name is None) :
+    parser.error('No sample file or sample name')
+  NGS_config = imp.load_source('NGS_config', args.input)
+
+  print banner
+  read_samples(args)
+  read_parameters(args)
+
+  if args.jobs:
+    subset_flag = True
+    subset_jobs = re.split(',', args.jobs)
+    add_subset_jobs_by_dependency(NGS_config)
+    print 'Running subset jobs:',
+    print subset_jobs
+    print '\n'
+
+  if not os.path.exists('WF-sh'): os.system('mkdir WF-sh')
+
+  task_level_jobs(NGS_config)
+  make_job_list(NGS_config)
+
+  if args.task:
+    if args.task == 'list-jobs':
+      task_list_jobs(NGS_config)
+      exit(0)
+    elif args.task == 'snapshot':
+      task_snapshot(NGS_config)
+      exit(0)
+    elif args.task == 'delete-jobs':
+      task_delete_jobs(NGS_config, args.second_parameter)
+      exit(0)
+    elif args.task == 'write-sh':
+      exit(0)
+    else:
+      fatal_error('undefined task' + args.task, exit_code=1)
+
+  run_workflow(NGS_config)
+################################################################################################
+#  _____               _   _  _____  _____  _           _       _           _       _         
+# |  __ \             | \ | |/ ____|/ ____|| |         | |     | |         (_)     | |        
+# | |__) |   _ _ __   |  \| | |  __| (___  | |__   __ _| |_ ___| |__        _  ___ | |__  ___ 
+# |  _  / | | | '_ \  | . ` | | |_ |\___ \ | '_ \ / _` | __/ __| '_ \      | |/ _ \| '_ \/ __|
+# | | \ \ |_| | | | | | |\  | |__| |____) || |_) | (_| | || (__| | | |     | | (_) | |_) \__ \
+# |_|  \_\__,_|_| |_| |_| \_|\_____|_____/ |_.__/ \__,_|\__\___|_| |_|     | |\___/|_.__/|___/
+#                                      ______                      ______ _/ |                
+#                                     |______|                    |______|__/                 
+########## Run NGS_batch_jobs for each samples http://patorjk.com/software/taag
+################################################################################################


=====================================
usecases/Miseq-16S/README
=====================================
@@ -51,10 +51,17 @@ http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/t
 We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
 
 
-3. Modify NG-Omics-Miseq-16S.pl
-Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
-    $CD_HIT_dir   = "PATH_to_cd-hit";
-    $NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar";  #### where you have installed Trimmomatic
+3. Modify NG-Omics-Miseq-16S.py
+Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.py, in the top few lines:
+
+from
+  'CD_HIT_dir'           : '/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621',
+  'NGS_prog_trimmomatic' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar',
+
+to
+  'CD_HIT_dir'           : 'PATH_to_cd-hit_directory',
+  'NGS_prog_trimmomatic' : 'PATH_to_Trimmomatic/trimmomatic-0.32.jar',   #### where you have installed Trimmomatic 
+  
 
 4. Download reference dataset
 Reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
@@ -124,7 +131,7 @@ This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE
 4. Run sequence QC and OTU clustering for each sample
 In the working directory, run
 
-    PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+    PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
 
     where: 150 and 100 are the effective length, 
            see next section for suggestions in choose effective clustering read length
@@ -138,13 +145,13 @@ This command will generate shell scripts for QC and for OTU for each sample.
 The scripts will be in WF-sh folder. You can first run all the qc.sample_name.sh and after all 
 these jobs finished you then run all otu.sample_name.sh
 
-NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline 
+NG-Omics-WF.py https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline 
 tool developed in our group. It is not fully released yet, since we need more time to document 
-this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples. 
-First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the 
+this tool. However, you can try to use NG-Omics-WF.py to automatically run all your samples. 
+First edit NG-Omics-Miseq-16S.py and modify cores_per_node around line #36 to match the 
 number of CPU cores of your computer, then run
 
-    nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
+    nohup PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
 
 After the job finished, the OTU results will be in sample_name/otu folder, important files include
     OTU.clstr: file lists all clusters and sequences
@@ -156,14 +163,14 @@ If you have multiple samples, you don't just want to stop here. It is important
 to pool all sample together and re-run OTU clustering so that all samples can be
 compared, run
 
-    PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled 
+    PATH_to_cd-hit-dir/usecases/Miseq-16S/pool_samples.pl -s SAMPLE_file -o pooled 
 
 This will pool sequences from all samples. We can handle hundred and more sample without problem. 
 
 
 6. Cluster pooled samples, run
  
-    PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+    PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
 
 This command will generate a script WF-sh/otu-pooled.pooled.sh, you can
 run this sh script. When it is finished, OTUs will be in the pooled directory:


=====================================
usecases/Miseq-16S/pool_samples.pl
=====================================
@@ -9,10 +9,13 @@ my $output            = $opts{o};
    $output            = "pooled" unless ($output);
 my $sample_in         = $opts{s};
 my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq;sample2:R1.fq:R2.fq   or sample1;sample2;sample3
+my $file_list         = $opts{f};
+my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
+   @file_list = split(/,/, $file_list) if ($file_list); 
+
 my $job               = $opts{j};
    $job = "otu" unless ($job);
 
-my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
 
 my ($i, $j, $k, $cmd);
 $cmd = `mkdir $output` unless (-e $output);
@@ -62,7 +65,7 @@ foreach $i (@file_list) {
 
 sub usage {
 <<EOD;
-    $0 -s sample_file -o output_dir
+    $0 -s sample_file -o output_dir -j job -f list_files
     -s sample data file, required unless -S is present
        File format example
 #Sample data file example, TAB or space delimited for following lines
@@ -73,6 +76,8 @@ Sample_ID3 sample_data_0 sample_data_1
     -S sample data from command line, required unless -s is present
            format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
 
+    -j job name
+    -f list of files (delimited by ,) to pool (cat) 
 EOD
 }
 


=====================================
usecases/Miseq-16S/silva-ana1.pl
=====================================
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+## =========================== NGS tools ==========================================
+## NGS tools for metagenomic sequence analysis
+## May also be used for other types of NGS data analysis
+##
+##                                      Weizhong Li, UCSD
+##                                      liwz at sdsc.edu
+## http://weizhongli-lab.org/
+## ================================================================================
+
+use Getopt::Std;
+getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
+die usage() unless ($opts{j} and $opts{o});
+my ($i, $j, $k, $cmd);
+my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
+my ($len, $lena, $lenb);
+
+my $fasta = $opts{j};
+my $output  = $opts{o};
+
+my %id_2_seq = ();
+my $id = "";
+my $ann;
+open(TMP, $fasta) || die "can not open $fasta";
+while($ll=<TMP>){
+  if ($ll =~ /^>/) {
+    chop($ll);
+    ($id, $ann) = split(/\s+/, substr($ll,1), 2);
+    $ann =~ s/\s/_/g;
+    $id = "$id|$ann";
+  }
+  else {
+    $ll =~ s/U/T/g;
+    $ll =~ s/u/T/g;
+    $id_2_seq{$id} .= $ll;
+  }
+}
+
+close(TMP);
+
+my @ids = keys %id_2_seq;
+   @ids = sort {length($b) <=> length($a) } @ids;
+
+open(OUT, "> $output") || die "can not write to $output";
+foreach $id (@ids) {
+  print OUT ">$id\n$id_2_seq{$id}";
+}
+close(OUT);
+
+
+
+sub usage {
+<<EOD;
+This script formats the Silva FASTA file for CD-HIT-OTU-MiSeq. You should download the Silva sequences
+from https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva.fasta.gz
+or a similar file, then gunzip it
+
+Run this script as $0 -j SILVA_128_SSURef_Nr99_tax_silva.fasta -o silva_128_SSURef_processed.fasta
+
+Options:
+======================
+        -j path for SILVA_128_SSURef_Nr99_tax_silva.fasta
+        -o output FASTA file of formatted Silva reference DB
+EOD
+}


=====================================
usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
=====================================
@@ -0,0 +1,132 @@
+#!/usr/bin/perl
+################################################################################
+# NGS workflow by Weizhong Li, http://weizhongli-lab.org
+################################################################################
+
+########## local variables etc. Please edit
+$CD_HIT_dir           = "/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621";
+$NGS_prog_trimmomatic = "/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar";
+
+########## computation resources for execution of jobs
+%NGS_executions = ();
+$NGS_executions{"qsub_1"} = {
+  "type"                => "qsub-pe",
+  "cores_per_node"      => 8,
+  "number_nodes"        => 64,
+  "user"                => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
+  "command"             => "qsub",
+  "command_name_opt"    => "-N",
+  "command_err_opt"     => "-e",
+  "command_out_opt"     => "-o",
+  "template"            => <<EOD,
+#!/bin/sh
+#PBS -v PATH
+#PBS -V
+
+#\$ -q RNA.q
+#\$ -v PATH
+#\$ -V
+
+EOD
+};
+
+
+$NGS_executions{"sh_1"} = {
+  "type"                => "sh",
+  "cores_per_node"      => 8,
+  "number_nodes"        => 1,
+};
+
+
+# $NGS_batch_jobs{"qc-pe"} = {
+#   "CMD_opts"          => ["20"],
+#   "execution"         => "qsub_1",               # where to execute
+#   "cores_per_cmd"     => 4,                     # number of threads used by command below
+#   "no_parallel"       => 1,                     # number of total jobs to run using command below
+#   "command"           => <<EOD,
+# java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
+#     SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
+# 
+# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++;  print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
+# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++;  print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
+# 
+# wait
+# rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
+# EOD
+# };
+
+ 
+$NGS_batch_jobs{"qc"} = {
+  "CMD_opts"          => ["20"],
+  "execution"         => "qsub_1",               # where to execute
+  "cores_per_cmd"     => 4,                     # number of threads used by command below
+  "no_parallel"       => 1,                     # number of total jobs to run using command below
+  "command"           => <<EOD,
+java -jar $NGS_prog_trimmomatic SE -threads 4 -phred33 \\DATA.0          \\SELF/R1.fq                                            \\
+    SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
+perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++;  print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
+rm -f \\SELF/R1.fq
+EOD
+};
+
+
+$NGS_batch_jobs{"clstr"} = {
+  "injobs"            => ["qc"],
+  "CMD_opts"          => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
+  "execution"         => "qsub_1",               # where to execute
+  "cores_per_cmd"     => 2,                    # number of threads used by command below
+  "no_parallel"       => 1,                    # number of total jobs to run using command below
+  "command"           => <<EOD,
+#### cluster at 100%
+$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -o \\SELF/seq.nr -sf 1 -sc 1 -r 0 -c 1.00        -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.nr.log
+$CD_HIT_dir/usecases/miRNA-seq/filter-small-cluster.pl -i \\SELF/seq.nr.clstr -s \\SELF/seq.nr -o \\SELF/seq.nr-filtered.clstr -f \\SELF/seq.nr-filtered -c 1
+
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr-filtered  -o \\SELF/seq.95  -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.95.log
+$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr-filtered.clstr \\SELF/seq.95.clstr     > \\SELF/seq.95-full.clstr
+
+$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95  -i2 \\CMDOPTS.1 -o \\SELF/seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.ref.log
+$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95  -i2 \\CMDOPTS.2 -o \\SELF/seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.spk.log
+
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.ref.clstr > \\SELF/seq.95.reftop.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.spk.clstr > \\SELF/seq.95.spktop.clstr
+$CD_HIT_dir/clstr_merge.pl \\SELF/seq.95-full.clstr \\SELF/seq.95.reftop.clstr > \\SELF/tmp.clstr
+$CD_HIT_dir/clstr_merge.pl \\SELF/tmp.clstr         \\SELF/seq.95.spktop.clstr > \\SELF/miRNA.clstr
+
+$CD_HIT_dir/clstr_sort_by.pl < \\SELF/miRNA.clstr > \\SELF/miRNA.clstr.s
+mv \\SELF/miRNA.clstr.s \\SELF/miRNA.clstr
+rm -f \\SELF/tmp.clstr
+EOD
+};
+
+
+$NGS_batch_jobs{"clstr-pooled"} = {
+  "CMD_opts"          => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
+  "execution"         => "qsub_1",               # where to execute
+  "cores_per_cmd"     => 2,                    # number of threads used by command below
+  "no_parallel"       => 1,                    # number of total jobs to run using command below
+  "command"           => <<EOD,
+#### concat seq.nr-filtered seq.nr-filtered.clstr 
+
+$CD_HIT_dir/cd-hit-est -i seq.nr-filtered    -o seq.95             -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > seq.95.log
+$CD_HIT_dir/clstr_rev.pl seq.nr-filtered.clstr       seq.95.clstr     > seq.95-full.clstr
+
+$CD_HIT_dir/cd-hit-est-2d -i seq.95  -i2 \\CMDOPTS.1 -o seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.ref.log
+$CD_HIT_dir/cd-hit-est-2d -i seq.95  -i2 \\CMDOPTS.2 -o seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.spk.log
+
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.ref.clstr > seq.95.reftop.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.spk.clstr > seq.95.spktop.clstr
+$CD_HIT_dir/clstr_merge.pl seq.95-full.clstr seq.95.reftop.clstr > tmp.clstr
+$CD_HIT_dir/clstr_merge.pl tmp.clstr         seq.95.spktop.clstr > miRNA.clstr
+
+$CD_HIT_dir/clstr_sort_by.pl < miRNA.clstr > miRNA.clstr.s
+mv miRNA.clstr.s miRNA.clstr
+
+$CD_HIT_dir/usecases/miRNA-seq/clstr_2_miRNA-table.pl -i miRNA.clstr -o miRNA.txt
+
+EOD
+};
+
+##############################################################################################
+########## END
+1;
+


=====================================
usecases/miRNA-seq/clstr_2_miRNA-table.pl
=====================================
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+#
+use Getopt::Std;
+getopts("i:s:S:o:f:j:",\%opts);
+
+my $input             = $opts{i}; $input   = "Cluster.clstr" unless $input;
+my $output            = $opts{o}; $output  = "Cluster.txt" unless ($output);
+my ($i, $j, $k, $str, $cmd, $ll);
+
+my %count = ();
+my %count_t = ();
+my %count_s = ();
+my $Cluster_2_ann = ();
+# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
+open(TMP, $input) || die "can not open $input";
+my $Cluster=0;
+while($ll=<TMP>){
+  if ($ll =~ /^>/) {
+    $Cluster++;
+  }
+  else {
+    chop($ll);
+    if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+      my $id = $2;
+      if ($id =~ /^Sample\|([^\|]+)\|/) {
+        $sample_id = $1;
+        $sample_id{$sample_id}=1;
+        $count{$Cluster}{$sample_id}++;
+        $count_t{$Cluster}++;
+        $count_s{$sample_id}++;
+      }
+      else {
+        $Cluster_2_ann{$Cluster} .= "$id\t";
+      }
+    }
+    else {
+      die "format error $ll";
+    }
+  }
+}
+close(TMP);
+
+my @sample_ids = sort keys %sample_id;
+
+open(OUT1, "> $output") || die "can not write $output";
+print OUT1 "Cluster";
+foreach $sample_id (@sample_ids){
+  print OUT1 "\t$sample_id";
+}
+#print OUT1 "\tTotal\n";
+print OUT1 "\tAnnotation\tSpike\n";
+
+for ($i=1; $i<=$Cluster; $i++){
+  $ann = "None";
+  if ($Cluster_2_ann{$i}) { $ann = $Cluster_2_ann{$i}; }
+  print OUT1 "Cluster$i";
+  foreach $sample_id (@sample_ids){
+    $k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
+    print OUT1 "\t$k";
+  }
+  #print OUT1 "\t$count_t{$i}";
+  print OUT1 "\t$ann\n";
+}
+close(OUT1);
+
+


=====================================
usecases/miRNA-seq/filter-small-cluster.pl
=====================================
@@ -0,0 +1,94 @@
+#!/usr/bin/perl
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+   $script_dir =~ s/[^\/]+$//;
+   chop($script_dir);
+   $script_dir = "./" unless ($script_dir);
+
+getopts("i:s:o:f:c:",\%opts);
+die usage() unless ($opts{i} and $opts{s} and $opts{o} and $opts{f});
+
+my $input      = $opts{i}; ## nr.clstr
+my $fa         = $opts{s}; ## R1.fa
+my $output     = $opts{o}; ## nr-filtered.clstr
+my $output_fa  = $opts{f}; ## nr-filtered
+my $cutoff     = $opts{c}; $cutoff = 1  unless ($cutoff);
+
+my ($i, $j, $k, $str, $cmd, $ll);
+
+open(TMP, $input) || die "can not open $input";
+open(OUT, "> $output") || die "can not write to $output";
+$no = 0;
+$clstr = "";
+$rep = "";
+my %good_ids = ();
+
+while($ll=<TMP>){
+  if ($ll =~ /^>/) {
+    if ($no > $cutoff) {
+      print OUT $clstr;
+      $good_ids{$rep}=1;
+    }
+    $clstr = $ll;
+    $no = 0;
+  }
+  else {
+    $clstr .= $ll;
+    chop($ll);
+    if ($ll =~ /\*$/) {
+      $rep = "";
+      if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+        $rep = $2;
+      }
+      else {
+        die "format error $ll";
+      }
+    }
+    $no++;
+  }
+}
+    if ($no > $cutoff) {
+      print OUT $clstr;
+      $good_ids{$rep}=1;
+    }
+close(OUT);
+close(TMP);
+
+open(TMP, $fa) || die "can not open $fa";
+open(OUT, ">$output_fa") || die "can not write to $output_fa";
+
+my $flag = 0;
+while($ll = <TMP>) {
+  if ($ll =~ /^>/) {
+    $gi = substr($ll,1);
+    chop($gi);
+    $gi =~ s/\s.+$//;
+    $flag = ( $good_ids{$gi} ) ? 1 : 0;
+  }
+  print OUT $ll  if ($flag);
+}
+
+close(TMP);
+close(OUT);
+
+
+
+sub usage {
+<<EOF
+Usage:
+$script_name -i seq.nr.clstr -s R1.fa -o output.clstr -f output.fa -c 1 
+
+Options:
+    -i input seq.nr.clstr
+    -s R1.fa
+    -o output.clstr
+    -f output.fa
+    -c abundance cutoff, default $cutoff
+       small clusters <= this size will be considered as noise and will be removed
+
+EOF
+}
+###### END usage
+



View it on GitLab: https://salsa.debian.org/med-team/cd-hit/compare/668f0f6f3ab70b07968a2252a81d1417ecef313a...6a193fc4eddc51ab6704f2d97e5ad68b7c2dec88

-- 
View it on GitLab: https://salsa.debian.org/med-team/cd-hit/compare/668f0f6f3ab70b07968a2252a81d1417ecef313a...6a193fc4eddc51ab6704f2d97e5ad68b7c2dec88
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190719/f9ce2aed/attachment-0001.html>


More information about the debian-med-commit mailing list