[med-svn] [Git][med-team/cd-hit][master] 9 commits: New upstream version 4.8.1
Andreas Tille
gitlab at salsa.debian.org
Fri Jul 19 18:38:12 BST 2019
Andreas Tille pushed to branch master at Debian Med / cd-hit
Commits:
dbd76db7 by Andreas Tille at 2019-07-19T17:21:26Z
New upstream version 4.8.1
- - - - -
d8ea28f9 by Andreas Tille at 2019-07-19T17:21:29Z
Update upstream source from tag 'upstream/4.8.1'
Update to upstream version '4.8.1'
with Debian dir eed5e76590ad0786c8d3b7d121cccb5b977fbb62
- - - - -
6a92f72d by Andreas Tille at 2019-07-19T17:21:29Z
New upstream version
- - - - -
2cd7c628 by Andreas Tille at 2019-07-19T17:21:29Z
debhelper 12
- - - - -
92f1ef9d by Andreas Tille at 2019-07-19T17:21:32Z
Standards-Version: 4.4.0
- - - - -
6eff93ea by Andreas Tille at 2019-07-19T17:21:32Z
Remove trailing whitespace in debian/rules
- - - - -
4044d76b by Andreas Tille at 2019-07-19T17:26:28Z
Do not parse d/changelog
- - - - -
d46a324b by Andreas Tille at 2019-07-19T17:33:40Z
Build-Depends: zlib1g-dev
- - - - -
6a193fc4 by Andreas Tille at 2019-07-19T17:35:57Z
Upload to unstable
- - - - -
20 changed files:
- Makefile
- README
- + cd-hit-clstr_2_blm8.pl
- cdhit-common.c++
- cdhit-common.h
- cdhit-utility.c++
- debian/changelog
- debian/compat
- debian/control
- debian/rules
- psi-cd-hit/psi-cd-hit-local.pl
- usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
- + usecases/Miseq-16S/NG-Omics-Miseq-16S.py
- + usecases/Miseq-16S/NG-Omics-WF.py
- usecases/Miseq-16S/README
- usecases/Miseq-16S/pool_samples.pl
- + usecases/Miseq-16S/silva-ana1.pl
- + usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
- + usecases/miRNA-seq/clstr_2_miRNA-table.pl
- + usecases/miRNA-seq/filter-small-cluster.pl
Changes:
=====================================
Makefile
=====================================
@@ -1,10 +1,8 @@
-
CC = g++ -Wall -ggdb
CC = g++ -pg
CC = g++
-# without OpenMP
-
+# default with OpenMP
# with OpenMP
# in command line:
# make openmp=yes
@@ -14,6 +12,21 @@ else
CCFLAGS = -fopenmp
endif
+#LDFLAGS = -static -lz -o
+#LDFLAGS = /usr/lib/x86_64-linux-gnu/libz.a -o
+
+# default with zlib
+# without zlib
+# in command line:
+# make zlib=no
+ifeq ($(zlib),no)
+ CCFLAGS +=
+ LDFLAGS += -o
+else
+ CCFLAGS += -DWITH_ZLIB
+ LDFLAGS += -lz -o
+endif
+
# support debugging
# in command line:
# make debug=yes
@@ -28,9 +41,6 @@ ifdef MAX_SEQ
CCFLAGS += -DMAX_SEQ=$(MAX_SEQ)
endif
-#LDFLAGS = -static -o
-LDFLAGS += -o
-
PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
# Propagate hardening flags
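The zlib block added above only toggles a -DWITH_ZLIB preprocessor define plus the -lz link flag; the C++ sources then gate every gzip code path on that define and bomb out otherwise. A minimal self-contained sketch of that pattern (an illustrative helper, not part of cd-hit; build with g++ -DWITH_ZLIB ... -lz, or without either to see the fallback):

  #include <cstdio>
  #include <cstdlib>
  #ifdef WITH_ZLIB
  #include <zlib.h>
  #endif

  /* Print a (possibly gzipped) file to stdout; without -DWITH_ZLIB it
     fails loudly, mirroring the bomb_error() fallback in the sources. */
  static void dump_maybe_gz(const char *path)
  {
  #ifdef WITH_ZLIB
      gzFile fin = gzopen(path, "r");   /* zlib inflates transparently */
      if (fin == NULL) { perror(path); exit(1); }
      char buf[4096];
      int n;
      while ((n = gzread(fin, buf, sizeof buf)) > 0)
          fwrite(buf, 1, (size_t)n, stdout);
      gzclose(fin);
  #else
      fprintf(stderr, "this program was not compiled with zlib\n");
      exit(1);
  #endif
  }

  int main(int argc, char **argv)
  {
      if (argc > 1) dump_maybe_gz(argv[1]);
      return 0;
  }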
=====================================
README
=====================================
@@ -1,8 +1,22 @@
For cd-hit
-How to compile?
+Requirements
+Since 4.8.1, cd-hit supports input files in .gz format. This requires the zlib library. zlib
+should be installed on most Linux systems, so cd-hit should compile without issue. If your
+system does not have zlib, please install it first.
+ * On Ubuntu, to install zlib:
+ sudo apt install zlib1g-dev
+ * On CentOS, to install zlib:
+ sudo yum install zlib-devel
+
+
+How to compile
1. Compile with multi-threading support (default): make
2. Compile without multi-threading support (if you are on very old systems): make openmp=no
+ 3. Compile without zlib (if you cannot install zlib): make zlib=no
+
+Having problems compiling?
+Please contact the author.
For cd-hit-auxtools
@@ -10,9 +24,16 @@ For cd-hit-auxtools
make
-For psi-cd-hit
- please download legacy BLAST (not BLAST+) and install the executables in your $PATH
+Compile cd-hit on MacOS
+To install CD-HIT on MacOS, first install gcc on your system, e.g. via Homebrew
+(https://brew.sh/); see https://brewformulas.org/gcc.
+Then locate the path to your g++ executable (e.g. /usr/local/Cellar/gcc/6.3.0_1/bin/g++-6;
+note: your g++ path is likely to be different), then use a command like this:
+ make CC=/usr/local/Cellar/gcc/6.3.0_1/bin/g++-6
+
+For psi-cd-hit
+ please download BLAST+ (not legacy BLAST) and install the executables in your $PATH
For more information, please visit http://cd-hit.org
=====================================
cd-hit-clstr_2_blm8.pl
=====================================
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+#
+
+my $rep;
+my @non_reps = ();
+my @aln_info = ();
+while($ll=<>){
+ if ($ll =~ /^>/ ) {
+ if (@non_reps) {
+ for ($i=0; $i<@non_reps; $i++) {
+ print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
+ }
+ }
+ $rep = "";
+ @non_reps = ();
+ @aln_info = ();
+ }
+ else {
+ chop($ll);
+ if ($ll =~ />(.+)\.\.\./ ) {
+ my $id = $1;
+ if ($ll =~ /\*$/) { $rep = $id}
+ else {
+ push(@non_reps, $id);
+ my @lls = split(/\s+/, $ll);
+ my ($a, $iden) = split(/\//, $lls[-1]);
+ chop($iden); ### removing % sign
+ my ($qb, $qe, $sb, $se) = split(/:/, $a);
+ my $alnln = $qe-$qb+1;
+ my $mis = int($alnln * (100-$iden) / 100);
+ my $gap = 0;
+ my $exp = 0;
+ my $bit = $alnln*3 - $mis*6; #### very rough
+ push(@aln_info, "$iden\t$alnln\t$mis\t$gap\t$qb\t$qe\t$sb\t$se\t$exp\t$bit");
+ }
+ }
+#>Cluster 582
+#0 6671aa, >gi|302514050|uid|51... *
+#1 339aa, >SRS584717|scaffold|... at 2:332:4020:4356/89.32%
+#2 182aa, >SRS584717|scaffold|... at 1:182:6490:6671/100.00%
+#3 367aa, >SRS584717|scaffold|... at 1:332:4543:4874/90.66%
+#4 109aa, >SRS584717|scaffold|... at 1:108:5782:5889/97.22%
+
+ }
+}
+ if (@non_reps) {
+ for ($i=0; $i<@non_reps; $i++) {
+ print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
+ }
+ }
+
+#query subject % alnln mis gap q_b q_e s_b s_e expect bits
+##0 1 2 3 4 5 6 7 8 9 10 11
+#mHE-SRS012902|scaffold|86.16 gnl|CDD|226997 47.62 42 17 2 164 201 210 250 5e-04 37.6
+#mHE-SRS012902|scaffold|109.23 gnl|CDD|225183 47.46 236 122 1 1 236 475 708 1e-92 284
+#mHE-SRS012902|scaffold|109.23 gnl|CDD|224055 44.35 239 130 2 1 239 332 567 2e-84 259
+#mHE-SRS012902|scaffold|109.23 gnl|CDD|227321 39.50 238 140 3 1 238 324 557 9e-69 218
+
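The new cd-hit-clstr_2_blm8.pl script reads a .clstr file on stdin and prints one BLAST m8-style row per non-representative member, estimating mismatches and a bit score from the alignment span and identity on each member line. A worked restatement of that arithmetic in C++ (not part of the script), using the sample line "at 2:332:4020:4356/89.32%" from the comments above:

  #include <cstdio>

  int main()
  {
      /* member line: "339aa, >SRS584717|... at 2:332:4020:4356/89.32%" */
      int    qb = 2, qe = 332;   /* query begin/end of the alignment  */
      double iden = 89.32;       /* percent identity, '%' chopped off */
      int alnln = qe - qb + 1;                           /* 331 */
      int mis   = (int)(alnln * (100.0 - iden) / 100.0); /* int(35.35) = 35 */
      int bit   = alnln * 3 - mis * 6;                   /* 993 - 210 = 783, very rough */
      printf("alnln=%d mis=%d bit=%d\n", alnln, mis, bit);
      return 0;
  }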
=====================================
cdhit-common.c++
=====================================
@@ -804,6 +804,7 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
int band_width = band_right - band_left + 1;
int band_width1 = band_width + 1;
+ // score_mat, back_mat [i][j]: i index of seqi (0 to len(seqi)-1), j index of band (0 to band_width-1)
MatrixInt64 & score_mat = buffer.score_mat;
MatrixInt & back_mat = buffer.back_mat;
@@ -823,24 +824,25 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
}
best_score = 0;
- /*
- seq2 len2 = 17 seq2 len2 = 17 seq2 len2 = 17
- 01234567890123456 01234567890123456 01234567890123456
- 0 xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx xXXXXXXXxxxxxxxxx
- 1\\\\\Xxxxxxxxxxxxxxxxx \\\\\Xxx\xxxxxxxxxxxxx xx\xxxxx\xxxxxxxx
- 2 \\\\X\xxxxxxxxxxxxxxx \\\\Xxxx\xxxxxxxxxxxx xxx\xxxxx\xxxxxxx
- seq1 3 \\\Xx\xxxxxxxxxxxxxx \\\Xxxxx\xxxxxxxxxxx xxxx\xxxxx\xxxxxx
- len1 4 \\Xxx\xxxxxxxxxxxxx \\Xxxxxx\xxxxxxxxxx xxxxx\xxxxx\xxxxx
- = 11 5 \Xxxx\xxxxxxxxxxxx \Xxxxxxx\xxxxxxxxx xxxxxx\xxxxx\xxxx
- 6 Xxxxx\xxxxxxxxxxx Xxxxxxxx\xxxxxxxx xxxxxxx\xxxxx\xxx
- 7 x\xxxx\xxxxxxxxxx x\xxxxxxx\xxxxxxx xxxxxxxx\xxxxx\xx
- 8 xx\xxxx\xxxxxxxxx xx\xxxxxxx\xxxxxx xxxxxxxxx\xxxxx\x
- 9 xxx\xxxx\xxxxxxxx xxx\xxxxxxx\xxxxx xxxxxxxxxx\xxxxx\
- 0 xxxx\xxxx\xxxxxxx xxxx\xxxxxxx\xxxx xxxxxxxxxxx\xxxxx
- band_left < 0 band_left < 0 band_left >=0
- band_right < 0 band_right >=0 band_right >=0
- init score_mat, and iden_mat (place with upper 'X')
- */
+ /* seq1 is query, seq2 is rep
+ seq2 len2 = 17 seq2 len2 = 17 seq2 len2 = 17
+ 01234567890123456 01234567890123456 01234567890123456
+ 0 xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx xXXXXXXXxxxxxxxxx
+ 1 \\\\\Xxxxxxxxxxxxxxxxx \\\\\Xxx\xxxxxxxxxxxxx xx\xxxxx\xxxxxxxx
+ 2 \\\\X\xxxxxxxxxxxxxxx \\\\Xxxx\xxxxxxxxxxxx xxx\xxxxx\xxxxxxx
+ seq1 3 \\\Xx\xxxxxxxxxxxxxx \\\Xxxxx\xxxxxxxxxxx xxxx\xxxxx\xxxxxx
+ len1 4 \\Xxx\xxxxxxxxxxxxx \\Xxxxxx\xxxxxxxxxx xxxxx\xxxxx\xxxxx
+ = 11 5 \Xxxx\xxxxxxxxxxxx \Xxxxxxx\xxxxxxxxx xxxxxx\xxxxx\xxxx
+ 6 Xxxxx\xxxxxxxxxxx Xxxxxxxx\xxxxxxxx xxxxxxx\xxxxx\xxx
+ 7 x\xxxx\xxxxxxxxxx x\xxxxxxx\xxxxxxx xxxxxxxx\xxxxx\xx
+ 8 xx\xxxx\xxxxxxxxx xx\xxxxxxx\xxxxxx xxxxxxxxx\xxxxx\x
+ 9 xxx\xxxx\xxxxxxxx xxx\xxxxxxx\xxxxx xxxxxxxxxx\xxxxx\
+ 0 xxxx\xxxx\xxxxxxx xxxx\xxxxxxx\xxxx xxxxxxxxxxx\xxxxx
+ band_left < 0 (-6) band_left < 0 (-6) band_left >=0 (1)
+ band_right < 0 (-1) band_right >=0 (2) band_right >=0(7)
+ band_width 6 band_width 9 band_width 7
+ init score_mat, and iden_mat (place with upper 'X')
+ */
if (band_left < 0) { //set score to left border of the matrix within band
int tband = (band_right < 0) ? band_right : 0;
@@ -849,6 +851,7 @@ int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatri
i = -k;
j1 = k-band_left;
// penalty for leading gap opening = penalty for gap extension
+ // each left-side hanging query residue incurs ext_gap (-1)
score_mat[i][j1] = mat.ext_gap * i;
back_mat[i][j1] = DP_BACK_TOP;
}
@@ -1678,6 +1681,101 @@ void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf
}
}
+// by liwz gzip version 2019-02
+// by liwz
+// disable swap option
+// change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
+// where des_begin is the FILE pointer of sequence record start
+// tot_length is the total bytes of sequence record
+void SequenceDB::Readgz( const char *file, const Options & options )
+{
+#ifdef WITH_ZLIB
+ Sequence one;
+ Sequence des;
+ gzFile fin = gzopen(file, "r");
+ char *buffer = NULL;
+ char *res = NULL;
+ int option_l = options.min_length;
+ if( fin == NULL ) bomb_error( "Failed to open the database file" );
+ Clear();
+ buffer = new char[ MAX_LINE_SIZE+1 ];
+
+ while (not gzeof( fin ) || one.size) { /* do not break when the last sequence is not handled */
+ buffer[0] = '>';
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL && one.size == 0) break;
+ if( buffer[0] == '+' ){
+ int len = strlen( buffer );
+ int len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // read next line quality score
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer );
+ len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+ }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+ if ( one.size ) { // write previous record
+ if( one.identifier == NULL || one.Format() ){
+ printf( "Warning: from file \"%s\",\n", file );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( one.identifier ) printf( "%s\n", one.identifier );
+ printf( "%s\n", one.data );
+ one.size = 0;
+ }
+ one.index = sequences.size();
+ if( one.size > option_l ) {
+ if (options.trim_len > 0) one.trim(options.trim_len);
+ sequences.Append( new Sequence( one ) );
+ }
+ }
+ one.size = 0;
+ one.tot_length = 0;
+
+ int len = strlen( buffer );
+ int len2 = len;
+ des.size = 0;
+ des += buffer;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ des += buffer;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ size_t offset = gztell( fin );
+ one.des_begin = offset - len;
+ one.tot_length += len; // count first line
+
+ int i = 0;
+ if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+ if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+ if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+ while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+ des.data[i] = 0;
+ one.identifier = des.data;
+ } else {
+ one.tot_length += strlen(buffer); one += buffer;
+ }
+ }
+ one.identifier = NULL;
+ delete[] buffer;
+ gzclose( fin );
+#else
+ bomb_error("this program was not compiled with zlib");
+#endif
+}
+
+
+
// by liwz
// disable swap option
// change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
@@ -1685,6 +1783,12 @@ void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf
// tot_length is the total bytes of sequence record
void SequenceDB::Read( const char *file, const Options & options )
{
+ int f_len = strlen(file);
+ if (strcmp(file + f_len - 3, ".gz") == 0 ) {
+ Readgz(file, options);
+ return;
+ }
+
Sequence one;
Sequence des;
FILE *fin = fopen( file, "rb" );
@@ -1773,9 +1877,172 @@ void SequenceDB::Read( const char *file, const Options & options )
fclose( fin );
}
+
+// by liwz gzip version 2019-02
+// PE reads liwz, disable swap option
+void SequenceDB::Readgz( const char *file, const char *file2, const Options & options )
+{
+#ifdef WITH_ZLIB
+ Sequence one, two;
+ Sequence des;
+ gzFile fin = gzopen(file, "r");
+ gzFile fin2= gzopen(file2,"r");
+ char *buffer = NULL;
+ char *buffer2= NULL;
+ char *res = NULL;
+ char *res2= NULL;
+ int option_l = options.min_length;
+ if( fin == NULL ) bomb_error( "Failed to open the database file" );
+ if( fin2== NULL ) bomb_error( "Failed to open the database file" );
+ Clear();
+ buffer = new char[ MAX_LINE_SIZE+1 ];
+ buffer2= new char[ MAX_LINE_SIZE+1 ];
+
+ while (((not gzeof( fin )) && (not gzeof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
+ buffer[0] = '>'; res =gzgets(fin, buffer, MAX_LINE_SIZE);
+ buffer2[0]= '>'; res2=gzgets(fin2, buffer2, MAX_LINE_SIZE);
+
+ if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
+ if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
+ if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
+ if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
+ if ( (res == NULL) && (one.size == 0)) break;
+
+ if( buffer[0] == '+' ){ // fastq 3rd line
+ // file 1
+ int len = strlen( buffer );
+ int len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // read next line quality score
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer );
+ len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // file 2
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
+ if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ two.tot_length += len;
+
+ // read next line quality score
+ if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){
+ if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ two.tot_length += len;
+
+ }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+ if ( one.size && two.size ) { // write previous record
+ if( one.identifier == NULL || one.Format() ){
+ printf( "Warning: from file \"%s\",\n", file );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( one.identifier ) printf( "%s\n", one.identifier );
+ printf( "%s\n", one.data );
+ one.size=0; two.size=0;
+ }
+ if( two.identifier == NULL || two.Format() ){
+ printf( "Warning: from file \"%s\",\n", file2 );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( two.identifier ) printf( "%s\n", two.identifier );
+ printf( "%s\n", two.data );
+ one.size=0; two.size = 0;
+ }
+ one.index = sequences.size();
+ if( (one.size + two.size)> option_l ) {
+ if (options.trim_len > 0) one.trim(options.trim_len);
+ if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
+ sequences.Append( new Sequence( one, two, 1 ) );
+ }
+ }
+ // R1
+ one.size = 0;
+ one.tot_length = 0;
+
+ int len = strlen( buffer );
+ int len2 = len;
+ des.size = 0;
+ des += buffer;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
+ des += buffer;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ size_t offset = gztell( fin );
+ one.des_begin = offset - len; // offset of ">" or "@"
+ one.tot_length += len; // count first line
+
+ int i = 0;
+ if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+ if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+ if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+ while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+ des.data[i] = 0; // find first non-space letter
+ one.identifier = des.data;
+
+ // R2
+ two.size = 0;
+ two.tot_length = 0;
+
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){
+ if ( (res=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ offset = gztell( fin2 );
+ two.des_begin = offset - len;
+ two.tot_length += len; // count first line
+ two.identifier = des.data;
+ } else {
+ one.tot_length += strlen(buffer); one += buffer;
+ two.tot_length+= strlen(buffer2); two+= buffer2;
+ }
+ }
+ one.identifier = NULL;
+ two.identifier = NULL;
+ delete[] buffer;
+ gzclose( fin );
+ delete[] buffer2;
+ gzclose( fin2 );
+#else
+ bomb_error("this program was not compiled with zlib");
+#endif
+
+}
+
// PE reads liwz, disable swap option
void SequenceDB::Read( const char *file, const char *file2, const Options & options )
{
+ int f_len = strlen(file);
+ int f_len2= strlen(file2);
+ if (strcmp(file + f_len - 3, ".gz") == 0 ) {
+ if ( strcmp(file2 + f_len2 - 3, ".gz") ) bomb_error( "Both input files need to be in .gz format" );
+ Readgz(file, file2, options);
+ return;
+ }
+
Sequence one, two;
Sequence des;
FILE *fin = fopen( file, "rb" );
@@ -2076,8 +2343,53 @@ void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Opt
fclose( fout );
delete []buf;
}
+
+// input db is gzipped
+void SequenceDB::WriteClustersgz( const char *db, const char *newdb, const Options & options )
+{
+#ifdef WITH_ZLIB
+ gzFile fin = gzopen(db, "r");
+ FILE *fout = fopen( newdb, "w+" );
+ int i, j, n = rep_seqs.size();
+ int count, rest;
+ char *buf = new char[MAX_LINE_SIZE+1];
+ vector<uint64_t> sorting( n );
+ if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
+ for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
+ std::sort( sorting.begin(), sorting.end() );
+ for (i=0; i<n; i++){
+ Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
+ gzseek( fin, seq->des_begin, SEEK_SET );
+
+ count = seq->tot_length / MAX_LINE_SIZE;
+ rest = seq->tot_length % MAX_LINE_SIZE;
+ //printf( "count = %6i, rest = %6i\n", count, rest );
+ for (j=0; j<count; j++){
+ if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, MAX_LINE_SIZE, fout );
+ }
+ if( rest ){
+ if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, rest, fout );
+ }
+ }
+ gzclose( fin );
+ fclose( fout );
+ delete []buf;
+#else
+ bomb_error("this program was not compiled with zlib");
+#endif
+
+}
+
void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options & options )
{
+ int f_len = strlen(db);
+ if (strcmp(db + f_len - 3, ".gz") == 0 ) {
+ WriteClustersgz(db, newdb, options);
+ return;
+ }
+
FILE *fin = fopen( db, "rb" );
FILE *fout = fopen( newdb, "w+" );
int i, j, n = rep_seqs.size();
@@ -2107,9 +2419,101 @@ void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options
fclose( fout );
delete []buf;
}
+
+
+// input db is gzipped
+// liwz PE output
+void SequenceDB::WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
+{
+#ifdef WITH_ZLIB
+ gzFile fin = gzopen(db, "r");
+ gzFile fin_pe = gzopen(db_pe, "r");
+ FILE *fout = fopen( newdb, "w+" );
+ FILE *fout_pe = fopen( newdb_pe, "w+" );
+ int i, j, n = rep_seqs.size();
+ int count, rest;
+ char *buf = new char[MAX_LINE_SIZE+1];
+ vector<uint64_t> sorting( n );
+ if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
+ if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
+ for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
+ std::sort( sorting.begin(), sorting.end() );
+
+ //sort fasta / fastq
+ int *clstr_size;
+ int *clstr_idx1;
+ if (options.sort_outputf) {
+ clstr_size = new int[n];
+ clstr_idx1 = new int[n];
+ for (i=0; i<n; i++) {
+ clstr_size[i] = 0;
+ clstr_idx1[i] = i;
+ }
+
+ int N = sequences.size();
+ for (i=0; i<N; i++) {
+ int id = sequences[i]->cluster_id;
+ if (id < 0) continue;
+ if (id >=n) continue;
+ clstr_size[id]++;
+ }
+ quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
+ }
+
+ for (i=0; i<n; i++){
+ Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
+ if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
+ //R1
+ gzseek( fin, seq->des_begin, SEEK_SET );
+
+ count = seq->tot_length / MAX_LINE_SIZE;
+ rest = seq->tot_length % MAX_LINE_SIZE;
+ //printf( "count = %6i, rest = %6i\n", count, rest );
+ for (j=0; j<count; j++){
+ if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, MAX_LINE_SIZE, fout );
+ }
+ if( rest ){
+ if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, rest, fout );
+ }
+
+ //R2
+ gzseek( fin_pe, seq->des_begin2, SEEK_SET );
+
+ count = seq->tot_length2 / MAX_LINE_SIZE;
+ rest = seq->tot_length2 % MAX_LINE_SIZE;
+ //printf( "count = %6i, rest = %6i\n", count, rest );
+ for (j=0; j<count; j++){
+ if( gzread(fin_pe, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
+ }
+ if( rest ){
+ if( gzread(fin_pe, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, rest, fout_pe );
+ }
+
+ }
+ gzclose( fin );
+ gzclose( fin_pe );
+ fclose( fout );
+ fclose( fout_pe );
+ delete []buf;
+#else
+ bomb_error("this program was not compiled with zlib");
+#endif
+}
+
+
// liwz PE output
void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
{
+ int f_len = strlen(db);
+ if (strcmp(db + f_len - 3, ".gz") == 0 ) {
+ WriteClustersgz(db, db_pe, newdb, newdb_pe, options);
+ return;
+ }
+
FILE *fin = fopen( db, "rb" );
FILE *fout = fopen( newdb, "w+" );
FILE *fin_pe = fopen( db_pe, "rb" );
@@ -2816,7 +3220,7 @@ void SequenceDB::DoClustering( int T, const Options & options )
}
printf( "\n%9i finished %9i clusters\n", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "\nApprixmated maximum memory consumption: %zuM\n", mem );
+ printf( "\nApproximated maximum memory consumption: %zuM\n", mem );
last_table.Clear();
word_table.Clear();
}
@@ -2865,9 +3269,9 @@ int SequenceDB::CheckOneAA( Sequence *seq, WordTable & table, WorkingParam & par
}
}
- //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
- //longer seqeunce * -aL -band_width
- if ( S ) {
+ //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
+ //longer sequence * -aL -band_width
+ if ( S ) {
int min = table.sequences[S-1]->size;
int min_red = min * options.long_coverage - options.band_width;
if (len < min_red) return 0; // return flag=0
@@ -3310,6 +3714,14 @@ void SequenceDB::DoClustering( const Options & options )
size_t max_items = opts.max_entries;
size_t max_seqs = opts.max_sequences;
size_t items = 0;
+
+// find a block from i to m, so that this block can fit into a word table
+// ...
+// i ++++++++++++++++++++++++++
+// ++++++++++++++++++++
+// ++++++++++++++++
+// m +++++++++++++
+// ...
while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
Sequence *seq = sequences[m];
if( ! (seq->state & IS_REDUNDANT) ){
@@ -3322,7 +3734,7 @@ void SequenceDB::DoClustering( const Options & options )
if( m > N ) m = N;
printf( "\rcomparing sequences from %9i to %9i\n", i, m );
fflush( stdout );
- for(int ks=i; ks<m; ks++){
+ for(int ks=i; ks<m; ks++){ // clustering this block
Sequence *seq = sequences[ks];
i = ks + 1;
if (seq->state & IS_REDUNDANT) continue;
@@ -3332,11 +3744,11 @@ void SequenceDB::DoClustering( const Options & options )
if( word_table.size >= max_items ) break;
int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
if( word_table.sequences.size() >= tmax ) break;
- }
+ } // finishing word table from this block
m = i;
if( word_table.size == 0 ) continue;
float p0 = 0;
- for(int j=m; j<N; j++){
+ for(int j=m; j<N; j++){ // use this word table to screen the remaining sequences m->N
Sequence *seq = sequences[j];
if (seq->state & IS_REDUNDANT) continue;
if ( options.store_disk ) seq->SwapIn();
@@ -3360,7 +3772,7 @@ void SequenceDB::DoClustering( const Options & options )
}
printf( "\n%9i finished %9i clusters\n", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "\nApprixmated maximum memory consumption: %liM\n", mem );
+ printf( "\nApproximated maximum memory consumption: %liM\n", mem );
temp_files.Clear();
word_table.Clear();
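One caveat in the .gz dispatch added to SequenceDB::Read and SequenceDB::WriteClusters above: strcmp(file + strlen(file) - 3, ".gz") indexes before the start of the buffer when the path is shorter than three characters. A defensive variant of the suffix test (an illustrative sketch, not part of this commit):

  #include <cstring>

  /* True when path ends in ".gz"; safe for paths shorter than 3 chars. */
  static bool has_gz_suffix(const char *path)
  {
      size_t len = strlen(path);
      return len >= 3 && strcmp(path + len - 3, ".gz") == 0;
  }

  int main()
  {
      return (has_gz_suffix("reads.fq.gz") && !has_gz_suffix("gz")) ? 0 : 1;
  }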
=====================================
cdhit-common.h
=====================================
@@ -35,11 +35,15 @@
#include<stdint.h>
#include<time.h>
+#ifdef WITH_ZLIB
+#include<zlib.h>
+#endif
+
#include<valarray>
#include<vector>
#include<map>
-#define CDHIT_VERSION "4.7"
+#define CDHIT_VERSION "4.8.1"
#ifndef MAX_SEQ
#define MAX_SEQ 655360
@@ -280,10 +284,10 @@ struct Options
int frag_size;
int option_r;
int threads;
- int PE_mode; // -P
- int trim_len; // -cx
- int trim_len_R2; // -cy
- int align_pos; // -ap for alignment position
+ int PE_mode; // -P
+ int trim_len; // -cx
+ int trim_len_R2; // -cy
+ int align_pos; // -ap for alignment position
size_t max_entries;
size_t max_sequences;
@@ -303,8 +307,8 @@ struct Options
string output;
string output_pe;
- int sort_output; // -sc
- int sort_outputf; // -sf
+ int sort_output; // -sc
+ int sort_outputf; // -sf
Options(){
backupFile = false;
@@ -342,12 +346,12 @@ struct Options
frag_size = 0;
des_len = 20;
threads = 1;
- PE_mode = 0;
- trim_len = 0;
- trim_len_R2 = 0;
- align_pos = 0;
- sort_output = 0;
- sort_outputf = 0;
+ PE_mode = 0;
+ trim_len = 0;
+ trim_len_R2 = 0;
+ align_pos = 0;
+ sort_output = 0;
+ sort_outputf = 0;
max_entries = 0;
max_sequences = 1<<20;
mem_limit = 100000000;
@@ -374,7 +378,7 @@ struct Sequence
// length of the sequence:
int size;
int bufsize;
- int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
+ int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
//uint32_t stats;
@@ -387,8 +391,8 @@ struct Sequence
// stream offset of the description string in the database:
size_t des_begin, des_begin2;
- // total record length
- int tot_length, tot_length2;
+ // total record length
+ int tot_length, tot_length2;
char *identifier;
@@ -417,7 +421,7 @@ struct Sequence
int Format();
void ConvertBases();
- void trim(int trim_len);
+ void trim(int trim_len);
void SwapIn();
void SwapOut();
@@ -559,9 +563,17 @@ class SequenceDB
~SequenceDB(){ Clear(); }
void Read( const char *file, const Options & options );
+ void Readgz( const char *file, const Options & options );
+
void Read( const char *file, const char *file2, const Options & options );
+ void Readgz( const char *file, const char *file2, const Options & options );
+
void WriteClusters( const char *db, const char *newdb, const Options & options );
+ void WriteClustersgz( const char *db, const char *newdb, const Options & options );
+
void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
+ void WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
+
void WriteExtra1D( const Options & options );
void WriteExtra2D( SequenceDB & other, const Options & options );
void DivideSave( const char *db, const char *newdb, int n, const Options & options );
=====================================
cdhit-utility.c++
=====================================
@@ -13,18 +13,19 @@ char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artif
//
char contacts[] =
- " Questions, bugs, contact Limin Fu at l2fu at ucsd.edu, or Weizhong Li at liwz at sdsc.edu\n"
- " For updated versions and information, please visit: http://cd-hit.org\n\n"
+ " Questions, bugs, contact Weizhong Li at liwz at sdsc.edu\n"
+ " For updated versions and information, please visit: http://cd-hit.org\n"
+ " or https://github.com/weizhongli/cdhit\n\n"
" cd-hit web server is also available from http://cd-hit.org\n\n"
" If you find cd-hit useful, please kindly cite:\n\n";
-char txt_option_i[] = "\tinput filename in fasta format, required\n";
+char txt_option_i[] = "\tinput filename in fasta format, required, can be in .gz format\n";
char txt_option_j[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
-char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
-char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
+char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required, can be in .gz format\n";
+char txt_option_i2[] = "\tinput filename for db2 in fasta format, required, can be in .gz format\n";
char txt_option_j2[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
@@ -34,12 +35,12 @@ char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (
char txt_option_c[] =
"\tsequence identity threshold, default 0.9\n \
\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence\n";
char txt_option_G[] =
"\tuse global sequence identity, default 1\n \
\tif set to 0, then use local sequence identity, calculated as :\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the length of the alignment\n \
\tNOTE!!! don't use -G 0 unless you use alignment coverage controls\n \
\tsee options -aL, -AL, -aS, -AS\n";
@@ -135,7 +136,8 @@ char txt_option_sc[] =
\tif set to 1, output clusters by decreasing size\n";
char txt_option_sf[] =
"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
-\tif set to 1, output sequences by decreasing cluster size\n";
+\tif set to 1, output sequences by decreasing cluster size\n \
+\tthis can be very slow if the input is in .gz format\n";
char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
@@ -337,7 +339,7 @@ int print_usage_div (char *arg) {
char mytxt_option_c[] =
"\tsequence identity threshold, default 0.98\n \
\tthis is a \"global sequence identity\" calculated as :\n \
-\tnumber of identical amino acids in alignment\n \
+\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence + gaps\n";
char mytxt_option_b[] = "\tband_width of alignment, default 10\n";
char mytxt_option_n_est[] = "\tword_length, default 10, see user's guide for choosing it\n";
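The updated -c and -G help strings describe two identity definitions: global identity divides the identical residues by the full length of the shorter sequence, while local identity (-G 0) divides by the alignment length. A small numeric illustration with made-up counts:

  #include <cstdio>

  int main()
  {
      /* 285 identical positions in a 290-column alignment between a
         300-residue sequence and a longer cluster representative */
      double identical = 285, aln_len = 290, shorter_len = 300;
      double global_id = identical / shorter_len; /* 0.950, passes -c 0.9 */
      double local_id  = identical / aln_len;     /* 0.983, the -G 0 view */
      printf("global=%.3f local=%.3f\n", global_id, local_id);
      return 0;
  }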
=====================================
debian/changelog
=====================================
@@ -1,9 +1,18 @@
-cd-hit (4.6.8-3) UNRELEASED; urgency=medium
+cd-hit (4.8.1-1) unstable; urgency=medium
+ [ Jelmer Vernooij ]
* Use secure copyright file specification URI.
* Trim trailing whitespace.
- -- Jelmer Vernooij <jelmer at debian.org> Sat, 20 Oct 2018 13:28:38 +0000
+ [ Andreas Tille ]
+ * New upstream version
+ * debhelper 12
+ * Standards-Version: 4.4.0
+ * Remove trailing whitespace in debian/rules
+ * Do not parse d/changelog
+ * Build-Depends: zlib1g-dev
+
+ -- Andreas Tille <tille at debian.org> Fri, 19 Jul 2019 19:34:25 +0200
cd-hit (4.6.8-2) unstable; urgency=medium
=====================================
debian/compat
=====================================
@@ -1 +1 @@
-11
+12
=====================================
debian/control
=====================================
@@ -4,9 +4,10 @@ Uploaders: Tim Booth <tbooth at ceh.ac.uk>,
Andreas Tille <tille at debian.org>
Section: science
Priority: optional
-Build-Depends: debhelper (>= 11~),
- help2man
-Standards-Version: 4.2.1
+Build-Depends: debhelper (>= 12~),
+ help2man,
+ zlib1g-dev
+Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/cd-hit
Vcs-Git: https://salsa.debian.org/med-team/cd-hit.git
Homepage: http://weizhong-lab.ucsd.edu/cd-hit/
=====================================
debian/rules
=====================================
@@ -9,10 +9,9 @@
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
-pkg := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
-ver := $(shell dpkg-parsechangelog | sed -ne 's/^Version: \(\([0-9]\+\):\)\?\(.*\)-.*/\3/p')
+include /usr/share/dpkg/default.mk
-mandir=$(CURDIR)/debian/$(pkg)/usr/share/man/man1/
+mandir=$(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/
%:
dh $@
@@ -23,54 +22,54 @@ override_dh_auto_build:
override_dh_installman:
mkdir -p $(mandir)
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences' \
$(CURDIR)/cd-hit > $(mandir)/cd-hit.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences, optimised for 454 data' \
$(CURDIR)/cd-hit-454 > $(mandir)/cd-hit-454.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-2d > $(mandir)/cd-hit-2d.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences' \
$(CURDIR)/cd-hit-est > $(mandir)/cd-hit-est.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-est-2d > $(mandir)/cd-hit-est-2d.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit-2d or cd-hit-est-2d jobs' \
$(CURDIR)/cd-hit-2d-para.pl > $(mandir)/cd-hit-2d-para.1
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit or cd-hit-est jobs' \
$(CURDIR)/cd-hit-para.pl > $(mandir)/cd-hit-para.1
-
- help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+
+ help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a FASTA file into pieces' \
$(CURDIR)/cd-hit-div.pl > $(mandir)/cd-hit-div.1
# psi-cd-hit.pl is throwing some errors which are fixed using sed
# ... at least in versions <= 4.6.4 - now help2man does not work at all
- #help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ #help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities' \
# $(CURDIR)/psi-cd-hit.pl | \
# sed -e '/^Name "main::.*" used only once:/d' \
# > $(mandir)/psi-cd-hit.1
# help2man does not work at all since version 4.6.4
- #help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ #help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d.pl > $(mandir)/psi-cd-hit-2d.1
# FIXME: what is the difference between psi-cd-hit-2d.pl and psi-cd-hit-2d-g1.pl ?
# help2man does not work at all since version 4.6.4
- #help2man --no-info --no-discard-stderr --version-string='$(ver)' \
+ #help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d-g1.pl > $(mandir)/psi-cd-hit-2d-g1.1
@@ -87,7 +86,7 @@ override_dh_installman:
# psi-cd-hit-local.pl -> even throws several "used only once: possible typo" errors
override_dh_auto_install:
- dh_auto_install -- PREFIX=debian/$(pkg)/usr/lib/cd-hit
+ dh_auto_install -- PREFIX=debian/$(DEB_SOURCE)/usr/lib/cd-hit
override_dh_compress:
dh_compress --exclude=.pdf
=====================================
psi-cd-hit/psi-cd-hit-local.pl
=====================================
@@ -13,21 +13,22 @@ our $opt_aS = 0.0; #
our $opt_aL = 0.0; #
our $circle = 0; #
our $opt_g = 1; ####################
-our $blast_exe = "blastall -p blastp -m 8"; #########################
-our $prof_exe = "blastpgp -m 8"; #
-our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
-our $prof_db = ""; #
-our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
-our $bl_STDIN = 1; #
-our $keep_bl = 0; #
-our $blast_prog= "blastp"; #
-our $formatdb = "formatdb"; #########################
+
+our $blast_exe = "blastp"; #########################
+our $bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000"; # program
+our $bl_para_u = ""; #
+our $bl_STDIN = 1; #
+our $keep_bl = 0; #
+our $blast_prog= "blastp"; #
+our $formatdb = "makeblastdb"; #########################
+
our $exec_mode = "local"; #######################
-our $num_qsub = 1; #
+our $num_qsub = 1; #
our $para_no = 1; # compute
our $sh_file = ""; #
our $num_multi_seq = 50; #
our $batch_no_per_node = 100; #######################
+
our $reformat_seg = 50000;
our $restart_seg = 20000;
our $job = "";
@@ -73,9 +74,7 @@ sub parse_para_etc {
elsif ($arg eq "-sl") { $skip_long = shift; }
## program
elsif ($arg eq "-prog") { $blast_prog= shift; }
- elsif ($arg eq "-p") { $prof_para = shift; }
- elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
- elsif ($arg eq "-s") { $bl_para = shift; }
+ elsif ($arg eq "-s") { $bl_para_u = shift; }
elsif ($arg eq "-k") { $keep_bl = shift; }
elsif ($arg eq "-bs") { $bl_STDIN = shift; }
## compute
@@ -103,38 +102,35 @@ sub parse_para_etc {
print_usage(); exit();
}
- if ($blast_prog eq "blastn") {
- $formatdb = "formatdb -p F";
- $blast_exe = "blastall -p blastn -m 8";
- }
- elsif ($blast_prog eq "megablast") {
- $blast_prog = "blastn"; #### back to blastn for blast parser type
- $formatdb = "formatdb -p F";
- $blast_exe = "megablast -H 100 -D 2 -m 8";
- }
- elsif ($blast_prog eq "blastpgp") {
- $blast_exe = "blastpgp -m 8 -j 3";
- }
-
#### for blast+
if ($bl_plus) {
- $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
- $blast_exe = "blastp -outfmt 6";
- $bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
-
- if ($blast_prog eq "blastn") {
- $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
- $blast_exe = "blastn -task blastn -outfmt 6";
- $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ if ($blast_prog eq "blastp") {
+ $formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
+ $blast_exe = "blastp -outfmt 6";
+ $bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "psiblast") {
+ $formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
+ $blast_exe = "psiblast -outfmt 6 -num_iterations 3";
+ $bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "blastn") {
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
+ $blast_exe = "blastn -task blastn -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "megablast") {
- $blast_prog = "blastn"; #### back to blastn for blast parser type
- $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
- $blast_exe = "blastn -task megablast -outfmt 6";
- $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
+ $blast_exe = "blastn -task megablast -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
+ $blast_prog= "blastn"; #### back to blastn for blast parser type
+ }
+ else {
+ print "Unknown blast program: $blast_prog\n";
+ print_usage(); exit();
}
- elsif ($blast_prog eq "blastpgp") {
- $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
+ if ($bl_para_u) {
+ $bl_para = "$bl_para_u -num_threads $bl_threads";
}
}
@@ -142,6 +138,13 @@ sub parse_para_etc {
$blast_exe = "$bl_path/$blast_exe";
$formatdb = "$bl_path/$formatdb";
}
+ my $bl_ver = `$blast_exe -version`;
+ if ($bl_ver =~ /blast/) {
+ print "BLAST version:\n$bl_ver\n\n";
+ }
+ else {
+ die "BLAST program not found: $blast_exe\n";
+ }
(-e $db_in) || die "No input";
($db_out) || die "No output";
@@ -576,7 +579,7 @@ sub keep_strand_with_top_hsp {
}
########## END keep_strand_with_top_hsp
-########## for blastpgp -j no (no>1)
+########## for psiblast -j no (no>1)
########## keep hits from the last round
sub keep_hsp_of_last_round {
my $self = shift;
@@ -691,7 +694,7 @@ sub process_blout_blastp_blastn {
my $len_rep = 0;
my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
- if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
+ if ($blast_prog eq "psiblast" ) {keep_hsp_of_last_round($bl); }
if ($g_iden == 0 ) { #### Local identity
keep_top_hsp($bl); #### local alignment, only the top HSP
@@ -1084,11 +1087,7 @@ Options
long sequences will not be clustered anyway.
-sl 0 means no skipping
program:
- -prog (blastp, blastn, megablast, blastpgp), default $blast_prog
- -p profile search para, default
- "$prof_para"
- -dprof database for building PSSM, default using input
- you can also use another database that is more comprehensive like NR80
+ -prog (blastp, blastn, megablast, psiblast), default $blast_prog
-s blast search para, default
"$bl_para"
-bs (1/0) default $bl_STDIN
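The rewritten script now also probes the configured BLAST+ binary by running `$blast_exe -version` and dies unless the output mentions blast. The same probe expressed in C++ (a sketch assuming POSIX popen and a blastp executable on $PATH):

  #include <cstdio>
  #include <cstring>

  int main()
  {
      /* run "blastp -version" and require "blast" in its output,
         as the Perl code does before clustering starts */
      FILE *p = popen("blastp -version", "r");
      if (p == NULL) { perror("popen"); return 1; }
      char line[256];
      int found = 0;
      while (fgets(line, sizeof line, p) != NULL)
          if (strstr(line, "blast") != NULL) found = 1;
      pclose(p);
      if (!found) { fprintf(stderr, "BLAST program not found: blastp\n"); return 1; }
      puts("BLAST+ detected");
      return 0;
  }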
=====================================
usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
=====================================
@@ -105,7 +105,7 @@ $CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5
$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
-$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
+$CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
EOD
=====================================
usecases/Miseq-16S/NG-Omics-Miseq-16S.py
=====================================
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+################################################################################
+# NGS workflow by Weizhong Li, http://weizhongli-lab.org
+################################################################################
+
+queue_system = 'SGE'
+
+########## local variables etc. Please edit
+ENV={
+ 'CD_HIT_dir' : '/home/oasis/data/NGS-ann-project/apps/cdhit.git',
+ 'NGS_prog_trimmomatic' : '/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar',
+}
+
+########## computation resources for execution of jobs
+NGS_executions = {}
+NGS_executions['qsub_1'] = {
+ 'type' : 'qsub-pe',
+ 'cores_per_node' : 8,
+ 'number_nodes' : 64,
+ 'user' : 'weizhong', #### I will use a command such as qstat -u weizhong to query submitted jobs
+ 'command' : 'qsub',
+ 'command_name_opt' : '-N',
+ 'command_err_opt' : '-e',
+ 'command_out_opt' : '-o',
+ 'template' : '''#!/bin/bash
+##$ -q RNA.q
+#$ -v PATH
+#$ -V
+
+'''
+}
+
+NGS_executions['sh_1'] = {
+ 'type' : 'sh',
+ 'cores_per_node' : 8,
+ 'number_nodes' : 1,
+ 'template' : '''#!/bin/bash
+
+'''
+}
+
+
+NGS_batch_jobs = {}
+NGS_batch_jobs['qc'] = {
+ 'CMD_opts' : ['100'],
+ 'execution' : 'qsub_1', # where to execute
+ 'cores_per_cmd' : 4, # number of threads used by command below
+ 'no_parallel' : 1, # number of total jobs to run using command below
+ 'command' : '''
+java -jar $ENV.NGS_prog_trimmomatic PE -threads 4 -phred33 $DATA.0 $DATA.1 $SELF/R1.fq $SELF/R1-s.fq $SELF/R2.fq $SELF/R2-s.fq \\
+ SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$CMDOPTS.0 MAXINFO:80:0.5 1>$SELF/qc.stdout 2>$SELF/qc.stderr
+
+perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa &
+perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa &
+
+wait
+rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
+'''
+}
+
+
+NGS_batch_jobs['otu'] = {
+ 'injobs' : ['qc'],
+ 'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
+ 'execution' : 'qsub_1', # where to execute
+ 'cores_per_cmd' : 2, # number of threads used by command below
+ 'no_parallel' : 1, # number of total jobs to run using command below
+ 'command' : '''
+#### cluster at 100% PE
+$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.nr.log
+#### cluster at 99% PE and SE for R1,R2
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.99.log
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\
+ -i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\
+ -a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f
+$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr $SELF/seq.99f.clstr > $SELF/seq.99f-all.clstr
+$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.log
+$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.ref.log
+$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr
+$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr
+
+rm -f $SELF/seq.chimeric-clstr.R1 $SELF/seq.chimeric-clstr.R1.log $SELF/seq.chimeric-clstr.R2 $SELF/seq.chimeric-clstr.R2.log
+rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log
+mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt
+
+'''
+}
+
+
+NGS_batch_jobs['otu-pooled'] = {
+ 'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
+ 'execution' : 'qsub_1', # where to execute
+ 'cores_per_cmd' : 2, # number of threads used by command below
+ 'no_parallel' : 1, # number of total jobs to run using command below
+ 'command' : '''
+#### before running
+#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
+$ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
+$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
+ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
+$ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
+$ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
+$ENV.CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
+rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
+
+'''
+}
+
=====================================
usecases/Miseq-16S/NG-Omics-WF.py
=====================================
@@ -0,0 +1,888 @@
+#!/usr/bin/env python
+# =============================== NG-Omics-WF ==================================
+# _ _ _____ ____ _ __ ________
+# | \ | |/ ____| / __ \ (_) \ \ / / ____|
+# | \| | | __ ______| | | |_ __ ___ _ ___ ___ _____\ \ /\ / /| |__
+# | . ` | | |_ |______| | | | '_ ` _ \| |/ __/ __|______\ \/ \/ / | __|
+# | |\ | |__| | | |__| | | | | | | | (__\__ \ \ /\ / | |
+# |_| \_|\_____| \____/|_| |_| |_|_|\___|___/ \/ \/ |_|
+#
+# Workflow tools for next generation genomics, metagenomics, RNA-seq
+# and other types of omics data analysis,
+#
+# Software originally developed since 2010 by Weizhong Li at UCSD
+# currently at JCVI
+#
+# https://github.com/weizhongli/ngomicswf liwz at sdsc.edu
+# ==============================================================================
+
+import os
+import sys
+import re
+import argparse
+from argparse import RawTextHelpFormatter
+import math
+import subprocess
+import time
+import logging
+import textwrap
+import imp
+import collections
+import xml.etree.ElementTree as ET
+
+__author__ = 'Weizhong Li'
+
+############## Global variables
+banner = '''
+ ==================================================================
+ Workflow tools for next generation genomics, metagenomics, RNA-seq
+ and other types of omics data analysis,
+
+ Software originally developed since 2010 by Weizhong Li at UCSD
+ currently at JCVI
+
+ http://weizhongli-lab.org/ngomicswf liwz at sdsc.edu
+ ==================================================================
+'''
+NGS_config = None
+NGS_samples = []
+NGS_sample_data = {}
+NGS_opts = {}
+pwd = os.path.abspath('.')
+subset_flag = False
+subset_jobs = []
+qstat_xml_data = collections.defaultdict(dict)
+job_list = collections.defaultdict(dict) # as job_list[$t_job_id][$t_sample_id] = {}
+execution_submitted = {} # number of submitted jobs (qsub) or threads (local sh)
+############## END Global variables
+
+
+def fatal_error(message, exit_code=1):
+ print message
+ exit(exit_code)
+
+
+def read_parameters(args):
+ '''read option parameters from file or command line'''
+ if args.parameter_file:
+ try:
+ ##format example
+ ##JobID_A opt0 opt1 opt2
+ ##JobID_B opt0 opt1
+ ##JobID_C opt0 opt1 opt2 opt3
+ f = open(args.parameter_file, 'r')
+ for line in f:
+ if line[0] == '#':
+ continue
+ if not re.match('^\w', line):
+ continue
+ ll = re.split('\s+', line.rstrip());
+ NGS_opts[ ll[0] ] = ll[1:]
+ f.close()
+ except IOError:
+ fatal_error('cannot open ' + args.parameter_file, exit_code=1)
+
+ elif args.parameter_name:
+ for line in re.split(',', args.parameter_name):
+ ll = re.split(':', line);
+ NGS_opts[ ll[0] ] = ll[1:]
+ return
+
+
+def read_samples(args):
+ '''read sample and sample data from file or command line'''
+ if args.sample_file:
+ try:
+ f = open(args.sample_file, 'r')
+ for line in f:
+ if line[0] == '#':
+ continue
+ if not re.match('^\w', line):
+ continue
+ ll = re.split('\s+', line.rstrip());
+ NGS_samples.append(ll[0]);
+ NGS_sample_data[ ll[0] ] = ll[1:]
+ f.close()
+ except IOError:
+ fatal_error('cannot open ' + args.sample_file, exit_code=1)
+
+ elif args.sample_name:
+ for line in re.split(',', args.sample_name):
+ ll = re.split(':', line);
+ NGS_samples.append(ll[0]);
+ NGS_sample_data[ ll[0] ] = ll[1:]
+ else:
+ fatal_error('no input sample', exit_code=1)
+
+ for sample in NGS_samples:
+ if os.path.exists(sample):
+ if os.path.isdir(sample):
+ pass
+ else:
+ fatal_error('file exist: ' + sample, exit_code=1)
+ else:
+ if os.system("mkdir " + sample):
+ fatal_error('can not mkdir: ' + sample, exit_code=1)
+ return
+
+
+def task_level_jobs(NGS_config):
+ '''according to dependency, assign a level to each job'''
+ job_level = {}
+ while True:
+ change_flag = False
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ t_injobs = []
+ if 'injobs' in t_job.keys():
+ t_injobs = t_job['injobs']
+
+ if len(t_injobs) > 0:
+ max_level_injob = 0
+ for j in t_injobs:
+ if not j in job_level.keys():
+ continue
+ if job_level[j] > max_level_injob:
+ max_level_injob = job_level[j]
+ if max_level_injob == 1:
+ continue
+ max_level_injob +=1 #### one more level
+ if (t_job_id in job_level.keys()) and (job_level[t_job_id] >= max_level_injob):
+ continue
+ job_level[t_job_id]=max_level_injob
+ change_flag = 1
+ else:
+ if not t_job_id in job_level.keys():
+ job_level[t_job_id]=1
+ change_flag = True
+
+ if not change_flag: break
+
+ # print job_level
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ NGS_config.NGS_batch_jobs[t_job_id]['job_level'] = job_level[t_job_id]
+
+ return
+#### END task_level_jobs(NGS_config)
+
+
+def add_subset_jobs_by_dependency(NGS_config):
+ '''add dependent jobs'''
+ while True:
+ num_subset_jobs = len(subset_jobs)
+ for t_job_id in subset_jobs:
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ if 'injobs' in t_job.keys():
+ for j in t_job['injobs']:
+ if not (j in subset_jobs):
+ subset_jobs.append(j)
+ if num_subset_jobs == len(subset_jobs): break
+ return
+#### END add_subset_jobs_by_dependency()
+
+
+def make_job_list(NGS_config):
+ '''make sh script for each job / sample'''
+
+ verify_flag = False
+ for t_job_id in NGS_config.NGS_batch_jobs:
+ if subset_flag and not (t_job_id in subset_jobs):
+ continue
+
+ t_job = NGS_config.NGS_batch_jobs[ t_job_id ]
+ t_execution = NGS_config.NGS_executions[ t_job["execution"] ]
+
+ pe_parameter = ''
+ if t_execution[ 'type' ] == 'qsub-pe':
+ t_cores_per_cmd = t_job[ 'cores_per_cmd' ]
+ pe_parameter = "#$ -pe orte " + str(t_cores_per_cmd)
+ if 'pe_para' in t_execution.keys():
+ pe_parameter = "#$ " + t_execution[ 'pe_para' ] + " " + str(t_cores_per_cmd)
+
+ if t_job[ 'cores_per_cmd' ] > t_execution[ 'cores_per_node' ]:
+ fatal_error('not enough cores ' + t_job_id, exit_code=1)
+
+ t_job[ 'cmds_per_node' ] = t_execution[ 'cores_per_node' ] / t_job[ 'cores_per_cmd' ]
+ t_job[ 'nodes_total' ] = math.ceil( t_job[ 'no_parallel' ] / float(t_job[ 'cmds_per_node' ]))
+
+ if t_job[ 'nodes_total' ] > t_execution[ 'number_nodes' ]:
+ fatal_error('not enough nodes ' + t_job, exit_code=1)
+
+ CMD_opts = []
+ if 'CMD_opts' in t_job.keys():
+ CMD_opts = t_job[ 'CMD_opts' ]
+ if t_job_id in NGS_opts.keys():
+ CMD_opts = NGS_opts[ t_job_id ]
+
+ for t_sample_id in NGS_samples:
+ t_command = t_job[ 'command' ]
+ t_command = re.sub('\$SAMPLE', t_sample_id, t_command)
+ t_command = re.sub('\$SELF' , t_job_id, t_command)
+
+ for i in NGS_config.ENV.keys():
+ t_command = re.sub('\$ENV.'+i, NGS_config.ENV[i], t_command)
+
+ for i_data in range(0, len(NGS_sample_data[ t_sample_id ])):
+ t_data = NGS_sample_data[ t_sample_id ][i_data]
+ t_re = '\$DATA\.' + str(i_data)
+ t_command = re.sub(t_re, t_data, t_command)
+
+ t_injobs = []
+ if 'injobs' in t_job.keys():
+ t_injobs = t_job[ 'injobs' ]
+ for i_data in range(0, len(t_job[ 'injobs' ])):
+ t_data = t_job[ 'injobs' ][i_data]
+ t_re = '\$INJOBS\.' + str(i_data)
+ t_command = re.sub(t_re, t_data, t_command)
+
+ for i_data in range(0, len(CMD_opts)):
+ t_data = CMD_opts[i_data]
+ t_re = '\$CMDOPTS\.' + str(i_data)
+ t_command = re.sub(t_re, t_data, t_command)
+
+ v_command = ''
+ if 'non_zero_files' in t_job.keys():
+ for t_data in t_job[ 'non_zero_files' ]:
+ v_command = v_command + \
+ 'if ! [ -s {0}/{1} ]; then echo "zero size {2}/{3}"; exit; fi\n'.format(t_job_id, t_data, t_job_id, t_data)
+
+ f_start = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.start.date'
+ f_complete = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.complete.date'
+ f_cpu = pwd + '/' + t_sample_id + '/' + t_job_id + '/WF.cpu'
+ t_sh_file = '{0}/WF-sh/{1}.{2}.sh'.format(pwd, t_job_id, t_sample_id)
+ t_infiles = []
+ if 'infiles' in t_job.keys():
+ t_infiles = map(lambda x: t_sample_id + "/" + x, t_job[ 'infiles' ])
+ job_list[ t_job_id ][ t_sample_id ] = {
+ 'sample_id' : t_sample_id,
+ 'job_id' : t_job_id,
+ 'status' : 'wait', #### status can be wait (input not ready), ready (input ready), submitted (submitted or running), completed
+ 'command' : t_command,
+ 'sh_file' : t_sh_file,
+ 'infiles' : t_infiles,
+ 'injobs' : t_injobs,
+ 'start_file' : f_start,
+ 'complete_file': f_complete,
+ 'cpu_file' : f_cpu }
+
+ if not os.path.exists( t_sh_file ):
+ try:
+ tsh = open(t_sh_file, 'w')
+ tsh.write('''{0}
+{1}
+
+my_host=`hostname`
+my_pid=$$
+my_core={2}
+my_queue={3}
+my_time_start=`date +%s`
+
+cd {4}/{5}
+mkdir {6}
+if ! [ -f {7} ]; then date +%s > {7}; fi
+{8}
+{9}
+date +%s > {10}
+my_time_end=`date +%s`;
+my_time_spent=$((my_time_end-my_time_start))
+echo "sample={5} job={6} host=$my_host pid=$my_pid queue=$my_queue cores=$my_core time_start=$my_time_start time_end=$my_time_end time_spent=$my_time_spent" >> {11}
+
+'''.format(t_execution['template'], pe_parameter, t_job['cores_per_cmd'], t_job['execution'], pwd, t_sample_id, t_job_id, f_start, t_command, v_command, f_complete, f_cpu ))
+ tsh.close()
+ os.system('chmod u+x '+ t_sh_file)
+ except IOError:
+        fatal_error('cannot write to ' + t_sh_file, exit_code=1)
+ return
+### END def make_job_list(NGS_config):
+
+
+def time_str1(s):
+ str1 = str(s/3600) + 'h'
+ s = s % 3600
+ str1 = str1 + str(s/60) + 'm'
+ s = s % 60
+ str1 = str1 + str(s) + 's'
+ return str1
+
+
+def task_list_jobs(NGS_config):
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+
+ t_injobs = []
+ if 'injobs' in t_job.keys():
+ t_injobs = t_job['injobs']
+ print '{0}\tIn_jobs:[ {1} ]\tJob_level:{2}\n'.format(t_job_id, ','.join(t_injobs), t_job['job_level'] )
+
+
+def task_snapshot(NGS_config):
+ '''print job status'''
+ queue_system = NGS_config.queue_system #### default "SGE"
+  flag_qstat_xml_call = False
+  for t_job_id in NGS_config.NGS_batch_jobs.keys():
+    t_job = NGS_config.NGS_batch_jobs[t_job_id]
+    t_execution = NGS_config.NGS_executions[ t_job['execution']]
+    exe_type = t_execution['type']
+    if (queue_system == 'SGE') and (exe_type in ['qsub','qsub-pe']):
+      flag_qstat_xml_call = True
+
+  if flag_qstat_xml_call:
+    SGE_qstat_xml_query()
+
+ for t_sample_id in NGS_samples:
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ check_submitted_job(NGS_config, t_job_id, t_sample_id)
+
+  max_len_sample = 0
+ for t_sample_id in NGS_samples:
+ if len(t_sample_id) > max_len_sample:
+ max_len_sample = len(t_sample_id)
+  max_len_job = 0
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if len(t_job_id) > max_len_job:
+ max_len_job = len(t_job_id)
+
+ print '''
+Job status:
+.\twait
+-\tsubmitted
+r\trunning
++\tcompleted
+!\terror
+x\tunselected job
+'''
+
+ for i1 in range(max_len_job):
+ i = max_len_job - i1 - 1
+ print ' ' * max_len_sample + "\t",
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if i < len(t_job_id):
+ print ' ' + t_job_id[-i-1],
+ else:
+ print ' ',
+ print ''
+ print 'Sample\t' + ('-' * 30)
+
+ for t_sample_id in NGS_samples:
+ print t_sample_id + '\t',
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ print ' x',
+ continue
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ status = t_sample_job['status']
+ if status == 'completed': print ' +',
+ elif status == 'submitted': print ' -',
+ elif status == 'running': print ' r',
+ elif status == 'wait': print ' .',
+ elif status == 'error': print ' !',
+ else: print ' _',
+ print ''
+
+ print '\n\n'
+ return
+### def task_snapshot():
+
+
+def task_delete_jobs(NGS_config, opt):
+  '''manually delete jobs and the jobs that depend on them'''
+
+  mode, c = re.split(':', opt, 1)   #### split only on the first ':' so the value may contain ':'
+ tmp_sh = 'NGS-{0}.sh'.format(os.getpid())
+
+ try:
+ tsh = open(tmp_sh, 'w')
+ except IOError:
+ fatal_error('cannot write to ' + tmp_sh, exit_code=1)
+
+ tsh.write('#Please execute the following commands\n')
+
+ for t_sample_id in NGS_samples:
+ job_to_delete_ids =[]
+ if mode == 'jobids':
+ job_to_delete_ids = re.split(',', c)
+    elif mode == 'run_after':
+ if not os.path.exists(c):
+        fatal_error('file does not exist: ' + c, exit_code=1)
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if subset_flag:
+ if not (t_job_id in subset_jobs): continue
+ t_sample_job = job_list[t_job_id][t_sample_id]
+
+ t_sh_pid = t_sample_job['sh_file'] + '.pids'
+ if not os.path.exists(t_sh_pid): continue
+ if file1_same_or_after_file2(t_sh_pid, c):
+ job_to_delete_ids.append(t_job_id)
+ else:
+      fatal_error('unknown option for deleting jobs: ' + opt, exit_code=1)
+
+    # job_to_delete_ids now holds the jobs to be deleted;
+    # next, recursively find all jobs that depend on them
+ no_jobs_to_delete = len(job_to_delete_ids)
+ while True:
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if subset_flag:
+ if not (t_job_id in subset_jobs): continue
+ t_sample_job = job_list[t_job_id][t_sample_id]
+
+ t_sh_pid = t_sample_job['sh_file'] + '.pids'
+ if not os.path.exists(t_sh_pid): continue
+
+ for t_job_id_2 in t_sample_job['injobs']:
+ if t_job_id_2 in job_to_delete_ids:
+ if not t_job_id in job_to_delete_ids:
+ job_to_delete_ids.append(t_job_id)
+
+ if no_jobs_to_delete == len(job_to_delete_ids): break
+ no_jobs_to_delete = len(job_to_delete_ids)
+
+ if not no_jobs_to_delete: continue
+    tsh.write('#jobs to be deleted for ' + t_sample_id + ': ' + ' '.join(job_to_delete_ids) + '\n')
+
+ for t_job_id in job_to_delete_ids:
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ t_sh_pid = t_sample_job['sh_file'] + '.pids'
+ tsh.write('\\rm -rf {0}/{1}\n'.format(t_sample_id, t_job_id))
+ tsh.write('\\rm '+ t_sh_pid + '\n')
+
+ #### find the qsub ids to be deleted
+ pids = [] #### either pids, or qsub ids
+ try:
+ f = open(t_sh_pid, 'r')
+ pids = f.readlines()
+ f.close()
+ pids = [x.strip() for x in pids]
+ except IOError:
+ fatal_error('cannot open ' + t_sh_pid, exit_code=1)
+ tsh.write('qdel ' + ' '.join([str(x) for x in pids]) + '\n')
+
+ tsh.write('\n\n')
+ tsh.close()
+  print 'This script does not delete the files itself; please run ' + tmp_sh + ' to delete them\n\n'
+ return
+#### END def task_delete_jobs()
+
+
+def file1_same_or_after_file2(file1, file2):
+  # if file1 does not exist, assume it lies in the future, i.e. it is newer
+  if not os.path.exists(file1): return True
+  # file1 exists; if file2 does not exist, file1 is not considered same or after it
+  if not os.path.exists(file2): return False
+  return os.path.getmtime(file1) >= os.path.getmtime(file2)
+
+
+def SGE_qstat_xml_query():
+ '''run qstat -f -xml and get xml tree'''
+ global qstat_xml_data
+ qstat_xml_data = collections.defaultdict(dict)
+ t_out = ''
+ try:
+ t_out = subprocess.check_output(['qstat -f -xml'], shell=True)
+  except (subprocess.CalledProcessError, OSError):
+ fatal_error("can not run qstat", exit_code=1)
+
+ qstat_xml_root = ET.fromstring(t_out)
+  for t_node in qstat_xml_root.iter('job_list'):   #### avoid shadowing the global job_list
+    job_id = t_node.find('JB_job_number').text
+    job_name = t_node.find('JB_name').text
+    job_state = t_node.find('state').text
+ qstat_xml_data[job_id] = [job_name, job_state]
+
+ return
+#### END def SGE_qstat_xml_query()
+
+
+def print_job_status_summary(NGS_config):
+ '''print jobs status'''
+ job_status = {}
+ job_total = 0
+
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ for t_sample_id in NGS_samples:
+ status = job_list[t_job_id][t_sample_id]['status']
+ if status in job_status.keys():
+ job_status[status] +=1
+ else:
+ job_status[status] =1
+ job_total +=1
+
+  print 'total jobs: {0},'.format(job_total),
+ for i in job_status.keys():
+ print '{0}: {1}, '.format(i, job_status[i]),
+ print '\n'
+
+
+def run_workflow(NGS_config):
+ '''major loop for workflow run'''
+ queue_system = NGS_config.queue_system #### default "SGE"
+ sleep_time_min = 15
+ sleep_time_max = 120
+ sleep_time = sleep_time_min
+
+ while 1:
+ flag_job_done = True
+ ########## reset execution_submitted to 0
+ for i in NGS_config.NGS_executions.keys():
+ execution_submitted[ i ] = False
+
+ flag_qstat_xml_call = False
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ t_execution = NGS_config.NGS_executions[ t_job['execution']]
+ exe_type = t_execution['type']
+ if (queue_system == 'SGE') and (exe_type in ['qsub','qsub-pe']):
+ flag_qstat_xml_call = True
+
+ if flag_qstat_xml_call:
+ SGE_qstat_xml_query()
+
+ ########## check and update job status for submitted jobs
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ for t_sample_id in NGS_samples:
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ if t_sample_job['status'] == 'completed':
+ continue
+
+ check_submitted_job(NGS_config, t_job_id, t_sample_id)
+ if t_sample_job['status'] == 'completed':
+ continue
+ flag_job_done = False
+
+ if flag_job_done:
+ print_job_status_summary(NGS_config)
+ print 'job completed!'
+ break
+
+    ########## check and update job status based on dependency
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ for t_sample_id in NGS_samples:
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ if t_sample_job['status'] != 'wait':
+ continue
+
+ t_ready_flag = True
+ for i in t_sample_job['infiles']:
+ if os.path.exists(i) and os.path.getsize(i) > 0:
+ continue
+ t_ready_flag = False
+ break
+
+ for i in t_sample_job['injobs']:
+ if job_list[i][t_sample_id]['status'] == 'completed':
+ continue
+ t_ready_flag = False
+ break
+
+ if t_ready_flag:
+ t_sample_job['status'] = 'ready'
+ print '{0},{1}: change status to ready\n'.format(t_job_id, t_sample_id)
+        ########## END check and update job status based on dependency
+
+ ########## submit local sh jobs
+ has_submitted_some_jobs = False
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ t_execution = NGS_config.NGS_executions[ t_job['execution']]
+ t_execution_id = t_job['execution']
+ if t_execution['type'] != 'sh':
+ continue
+ if execution_submitted[t_execution_id] >= t_execution['cores_per_node']:
+ continue
+ for t_sample_id in NGS_samples:
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ if t_sample_job['status'] != 'ready':
+ continue
+ if (execution_submitted[t_execution_id] + t_job['cores_per_cmd'] * t_job['no_parallel']) > \
+             t_execution['cores_per_node']: #### not enough available cores
+ continue
+
+ #### now submitting
+ pid_file = open( t_sample_job['sh_file'] + '.pids', 'w')
+ for i in range(0, t_job['no_parallel']):
+            p = subprocess.Popen(['/bin/bash', t_sample_job['sh_file']])   #### pass the argv list directly; shell=True with a list would not run the script
+ pid_file.write(str(p.pid)+'\n')
+ pid_file.close()
+ t_sample_job['status'] = 'submitted'
+ print '{0},{1}: change status to submitted\n'.format(t_job_id, t_sample_id)
+ execution_submitted[ t_execution_id ] += t_job['cores_per_cmd'] * t_job['no_parallel']
+ has_submitted_some_jobs = True
+ ########## END submit local sh jobs
+
+ ########## submit qsub-pe jobs, multiple jobs may share same node
+ for t_job_id in NGS_config.NGS_batch_jobs.keys():
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ if subset_flag:
+ if not (t_job_id in subset_jobs):
+ continue
+ t_execution = NGS_config.NGS_executions[ t_job['execution']]
+ t_execution_id = t_job['execution']
+
+ if t_execution['type'] != 'qsub-pe':
+ continue
+ if execution_submitted[t_execution_id] >= t_execution['number_nodes']:
+ continue
+ t_cores_per_node = t_execution['cores_per_node']
+ t_cores_per_cmd = t_job['cores_per_cmd']
+ t_cores_per_job = t_cores_per_cmd * t_job['no_parallel']
+      t_nodes_per_job = t_cores_per_job / float(t_cores_per_node)   #### float, to avoid integer truncation
+
+ for t_sample_id in NGS_samples:
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ if t_sample_job['status'] != 'ready':
+ continue
+
+ #### now submitting
+ pid_file = open( t_sample_job['sh_file'] + '.pids', 'w')
+ for i in range(0, t_job['no_parallel']):
+ t_stderr = t_sample_job['sh_file'] + '.' + str(i) + '.stderr'
+ t_stdout = t_sample_job['sh_file'] + '.' + str(i) + '.stdout'
+
+ qsub_exe = 'qsub'
+ if 'qsub_exe' in t_execution.keys(): qsub_exe = t_execution['qsub_exe']
+ command_line = qsub_exe + ' {0} {1} {2} {3} {4} {5} {6}'.format(t_execution['command_name_opt'], t_job_id,
+ t_execution['command_err_opt'], t_stderr,
+ t_execution['command_out_opt'], t_stdout, t_sample_job['sh_file'])
+ cmd = subprocess.check_output([command_line], shell=True)
+ if re.search('\d+', cmd):
+ pid = re.search('\d+', cmd).group(0)
+ pid_file.write(pid + '\n')
+ else:
+            fatal_error('error submitting jobs', exit_code=1)
+ execution_submitted[t_execution_id] += t_nodes_per_job
+ print '{0} submitted for {1}\n'.format(t_sample_job['sh_file'], t_sample_id)
+
+ pid_file.close()
+ t_sample_job['status'] = 'submitted'
+ has_submitted_some_jobs = True
+ ########## END submit qsub-pe jobs, multiple jobs may share same node
+
+ ########## submit qsub jobs, job bundles disabled here, if need, check the original Perl script
+
+ #### if has submitted some jobs, reset waiting time, otherwise double waiting time
+ print_job_status_summary(NGS_config)
+ if has_submitted_some_jobs:
+ sleep_time = sleep_time_min
+ else:
+ sleep_time *= 2
+ if sleep_time > sleep_time_max:
+ sleep_time = sleep_time_max
+    time.sleep(sleep_time)
+ #### END while 1:
+ return
+#### END def run_workflow(NGS_config)
+
+
+def check_pid(pid):
+  '''check for the existence of a unix pid'''
+# opt 1, this doesn't print OSError to stderr
+ return os.path.exists('/proc/' + str(pid))
+
+# opt 2,
+# try:
+# os.kill(pid, 0)
+# except OSError: return False
+# else: return True
+
+
+def check_any_pids(pids):
+  '''check a list of unix pids; return True if any one exists'''
+ for pid in pids:
+ if check_pid(pid):
+ return True
+ return False
+
+
+def check_any_qsub_pids(pids):
+  '''check a list of qsub job ids; return True if any one exists'''
+ for pid in pids:
+ if pid in qstat_xml_data.keys():
+ return True
+ return False
+
+
+def validate_job_files(t_job_id, t_sample_id):
+  '''return True if the necessary job-tracking files exist'''
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ if not (os.path.exists(t_sample_job['start_file']) and os.path.getsize(t_sample_job['start_file']) > 0): return False
+ if not (os.path.exists(t_sample_job['complete_file']) and os.path.getsize(t_sample_job['complete_file']) > 0): return False
+ if not (os.path.exists(t_sample_job['cpu_file']) and os.path.getsize(t_sample_job['cpu_file']) > 0): return False
+ return True
+
+
+#### def check_submitted_job()
+def check_submitted_job(NGS_config, t_job_id, t_sample_id):
+ '''
+ check submitted jobs by checking pids or qsub ids
+  update job status from wait|ready -> submitted if the pid file exists (in case this script was restarted)
+ update job status from wait|ready|submitted -> completed if sh calls or qsub calls finished
+ '''
+ t_sample_job = job_list[t_job_id][t_sample_id]
+ t_job = NGS_config.NGS_batch_jobs[t_job_id]
+ t_execution = NGS_config.NGS_executions[ t_job['execution']]
+
+ t_sh_pid = t_sample_job['sh_file'] + '.pids'
+ if not os.path.exists(t_sh_pid): return
+
+ status = t_sample_job['status']
+ if ((status == 'wait') or (status == 'ready')):
+ t_sample_job['status'] = 'submitted'
+ print '{0},{1}: change status to submitted\n'.format(t_job_id, t_sample_id)
+
+ pids = [] #### either pids, or qsub ids
+ try:
+ f = open(t_sh_pid, 'r')
+ pids = f.readlines()
+ f.close()
+ pids = [x.strip() for x in pids]
+ except IOError:
+ fatal_error('cannot open ' + t_sh_pid, exit_code=1)
+
+ if len(pids) == 0:
+ fatal_error('empty file ' + t_sh_pid, exit_code=1)
+
+ exe_type = t_execution['type']
+ if (exe_type == 'sh'):
+ if check_any_pids(pids): #### still running
+ execution_submitted[ t_job['execution'] ] += t_job['cores_per_cmd'] * t_job['no_parallel']
+ elif validate_job_files(t_job_id, t_sample_id): #### job finished
+ t_sample_job['status'] = 'completed'
+ print '{0},{1}: change status to completed\n'.format(t_job_id, t_sample_id)
+ else:
+ t_sample_job['status'] = 'error'
+ print '{0},{1}: change status to error\n'.format(t_job_id, t_sample_id)
+ return
+
+ elif ((exe_type == 'qsub') or (exe_type == 'qsub-pe')):
+ if check_any_qsub_pids(pids): #### still running
+ pass
+ elif validate_job_files(t_job_id, t_sample_id): #### job finished
+ t_sample_job['status'] = 'completed'
+ print '{0},{1}: change status to completed\n'.format(t_job_id, t_sample_id)
+ else:
+ t_sample_job['status'] = 'error'
+ print '{0},{1}: change status to error\n'.format(t_job_id, t_sample_id)
+ else:
+ fatal_error('unknown execution type: '+ exe_type, exit_code=1)
+ return
+#### END def check_submitted_job()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(formatter_class = RawTextHelpFormatter,
+ description = textwrap.dedent(banner))
+
+    parser.add_argument('-i', '--input', help="workflow configuration file, required", required=True)
+ parser.add_argument('-s', '--sample_file', help='''
+sample data file, required unless -S is present
+File format example:
+#Sample data file example, TAB or space delimited for following lines
+Sample_ID1 sample_data_0 sample_data_1
+Sample_ID2 sample_data_0 sample_data_1
+Sample_ID3 sample_data_0 sample_data_1
+ ''')
+ parser.add_argument('-S', '--sample_name', help='''
+sample data from command line, required unless -s is present
+format:
+Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
+ ''')
+ parser.add_argument('-t', '--parameter_file', help='''
+replace default parameters in the workflow configuration file
+File format example:
+#parameter file example, TAB or space delimited for following lines
+JobID_A opt0 opt1 opt2
+JobID_B opt0 opt1
+ ''')
+ parser.add_argument('-T', '--parameter_name', help='''
+parameter from command line
+format:
+JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
+ ''')
+    parser.add_argument('-j', '--jobs', help='''run a subset of jobs, optional
+the workflow will run all jobs by default.
+to run a subset of jobs: -j qc or -j qc,fastqc
+ ''')
+ parser.add_argument('-J', '--task', help='''optional tasks
+write-sh: write sh files and quit
+log-cpu: gather cpu time for each run for each sample
+list-jobs: list jobs
+snapshot: snapshot current job status
+delete-jobs: delete jobs; supply the delete syntax via option -Z
+  e.g. -J delete-jobs -Z jobids:assembly,blast ---delete assembly,blast and all jobs that depend on them
+       -J delete-jobs -Z run_after:filename ---delete jobs whose start time (WF.start.date) is after this file, and all jobs that depend on them
+ ''')
+ parser.add_argument('-Z', '--second_parameter', help='secondary parameter used by other options, such as -J')
+    parser.add_argument('-Q', '--queue', help='queue system, e.g. PBS, SGE', default='SGE')
+
+ args = parser.parse_args()
+
+    if (args.sample_file is None) and (args.sample_name is None):
+ parser.error('No sample file or sample name')
+ NGS_config = imp.load_source('NGS_config', args.input)
+
+ print banner
+ read_samples(args)
+ read_parameters(args)
+
+ if args.jobs:
+ subset_flag = True
+ subset_jobs = re.split(',', args.jobs)
+ add_subset_jobs_by_dependency(NGS_config)
+ print 'Running subset jobs:',
+ print subset_jobs
+ print '\n'
+
+ if not os.path.exists('WF-sh'): os.system('mkdir WF-sh')
+
+ task_level_jobs(NGS_config)
+ make_job_list(NGS_config)
+
+ if args.task:
+ if args.task == 'list-jobs':
+ task_list_jobs(NGS_config)
+ exit(0)
+ elif args.task == 'snapshot':
+ task_snapshot(NGS_config)
+ exit(0)
+ elif args.task == 'delete-jobs':
+ task_delete_jobs(NGS_config, args.second_parameter)
+ exit(0)
+ elif args.task == 'write-sh':
+ exit(0)
+ else:
+            fatal_error('undefined task: ' + args.task, exit_code=1)
+
+ run_workflow(NGS_config)
+################################################################################################
+# _____ _ _ _____ _____ _ _ _ _ _
+# | __ \ | \ | |/ ____|/ ____|| | | | | | (_) | |
+# | |__) | _ _ __ | \| | | __| (___ | |__ __ _| |_ ___| |__ _ ___ | |__ ___
+# | _ / | | | '_ \ | . ` | | |_ |\___ \ | '_ \ / _` | __/ __| '_ \ | |/ _ \| '_ \/ __|
+# | | \ \ |_| | | | | | |\ | |__| |____) || |_) | (_| | || (__| | | | | | (_) | |_) \__ \
+# |_| \_\__,_|_| |_| |_| \_|\_____|_____/ |_.__/ \__,_|\__\___|_| |_| | |\___/|_.__/|___/
+# ______ ______ _/ |
+# |______| |______|__/
+########## Run NGS_batch_jobs for each samples http://patorjk.com/software/taag
+################################################################################################
=====================================
usecases/Miseq-16S/README
=====================================
@@ -51,10 +51,17 @@ http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/t
We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
-3. Modify NG-Omics-Miseq-16S.pl
-Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
- $CD_HIT_dir = "PATH_to_cd-hit";
- $NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
+3. Modify NG-Omics-Miseq-16S.py
+Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.py, in the top few lines:
+
+from
+ 'CD_HIT_dir' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621',
+ 'NGS_prog_trimmomatic' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar',
+
+to
+ 'CD_HIT_dir' : 'PATH_to_cd-hit_directory',
+ 'NGS_prog_trimmomatic' : 'PATH_to_Trimmomatic/trimmomatic-0.32.jar', #### where you have installed Trimmomatic
+
4. Download reference dataset
Reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
@@ -124,7 +131,7 @@ This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE
4. Run sequence QC and OTU clustering for each sample
In the working directory, run
- PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+ PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
where: 150 and 100 are the effective lengths,
see next section for suggestions on choosing the effective clustering read length
@@ -138,13 +145,13 @@ This command will generate shell scripts for QC and for OTU for each sample.
The scripts will be in WF-sh folder. You can first run all the qc.sample_name.sh and after all
these jobs finished you then run all otu.sample_name.sh
-NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
+NG-Omics-WF.py https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
tool developed in our group. It is not fully released yet, since we need more time to document
-this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
-First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
+this tool. However, you can try to use NG-Omics-WF.py to automatically run all your samples.
+First edit NG-Omics-Miseq-16S.py and modify cores_per_node around line #36 to match the
number of CPU cores of your computer, then run
- nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
+ nohup PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
After the job finished, the OTU results will be in sample_name/otu folder, important files include
OTU.clstr: file lists all clusters and sequences
@@ -156,14 +163,14 @@ If you have multiple samples, you don't just want to stop here. It is important
to pool all samples together and re-run OTU clustering so that all samples can be
compared, run
- PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
+ PATH_to_cd-hit-dir/usecases/Miseq-16S/pool_samples.pl -s SAMPLE_file -o pooled
This will pool sequences from all samples. We can handle hundreds of samples without problem.
6. Cluster pooled samples, run
- PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+ PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
This command will generate a script WF-sh/otu-pooled.pooled.sh, you can
run this sh script. When it is finished, OTUs will be in the pooled directory:
=====================================
usecases/Miseq-16S/pool_samples.pl
=====================================
@@ -9,10 +9,13 @@ my $output = $opts{o};
$output = "pooled" unless ($output);
my $sample_in = $opts{s};
my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
+my $file_list = $opts{f};
+my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
+ @file_list = split(/,/, $file_list) if ($file_list);
+
my $job = $opts{j};
$job = "otu" unless ($job);
-my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
my ($i, $j, $k, $cmd);
$cmd = `mkdir $output` unless (-e $output);
@@ -62,7 +65,7 @@ foreach $i (@file_list) {
sub usage {
<<EOD;
- $0 -s sample_file -o output_dir
+ $0 -s sample_file -o output_dir -j job -f list_files
-s sample data file, required unless -S is present
File format example
#Sample data file example, TAB or space delimited for following lines
@@ -73,6 +76,8 @@ Sample_ID3 sample_data_0 sample_data_1
-S sample data from command line, required unless -s is present
format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
+ -j job name
+  -f comma-delimited list of files to pool (concatenate)
EOD
}
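
For example (file names illustrative), to pool only two of the per-sample output files of
the otu job into the directory "pooled":

  PATH_to_cd-hit-dir/usecases/Miseq-16S/pool_samples.pl -s SAMPLE_file -o pooled -j otu -f seq.99f,seq.99f-all.clstr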
=====================================
usecases/Miseq-16S/silva-ana1.pl
=====================================
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+## =========================== NGS tools ==========================================
+## NGS tools for metagenomic sequence analysis
+## May also be used for other type NGS data analysis
+##
+## Weizhong Li, UCSD
+## liwz at sdsc.edu
+## http://weizhongli-lab.org/
+## ================================================================================
+
+use Getopt::Std;
+getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
+die usage() unless ($opts{j} and $opts{o});
+my ($i, $j, $k, $cmd);
+my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
+my ($len, $lena, $lenb);
+
+my $fasta = $opts{j};
+my $output = $opts{o};
+
+my %id_2_seq = ();
+$id = "";   #### $id is already declared above
+my $ann;
+open(TMP, $fasta) || die "can not open $fasta";
+while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ chop($ll);
+ ($id, $ann) = split(/\s+/, substr($ll,1), 2);
+ $ann =~ s/\s/_/g;
+ $id = "$id|$ann";
+ }
+ else {
+ $ll =~ s/U/T/g;
+ $ll =~ s/u/T/g;
+ $id_2_seq{$id} .= $ll;
+ }
+}
+
+close(TMP);
+
+my @ids = keys %id_2_seq;
+   @ids = sort {length($id_2_seq{$b}) <=> length($id_2_seq{$a})} @ids;   #### longest sequences first
+
+open(OUT, "> $output") || die "can not write to $output";
+foreach $id (@ids) {
+ print OUT ">$id\n$id_2_seq{$id}";
+}
+close(OUT);
+
+
+
+sub usage {
+<<EOD;
+This script formats a Silva FASTA file for CD-HIT-OTU-MiSeq. Download the Silva sequences
+from https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva.fasta.gz
+or a similar file, then gunzip it.
+
+Run this script as $0 -j SILVA_128_SSURef_Nr99_tax_silva.fasta -o silva_128_SSURef_processed.fasta
+
+Options:
+======================
+ -j path for SILVA_128_SSURef_Nr99_tax_silva.fasta
+ -o output FASTA file of formatted Silva reference DB
+EOD
+}
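
To illustrate the reformatting this script performs (a made-up record; real Silva headers
differ in detail): the ID and the whitespace-to-underscore annotation are joined with '|',
RNA bases U/u are converted to T, and records are written sorted by sequence length:

  input:   >X00001.1.1400 Bacteria;Firmicutes;uncultured bacterium
           GAUUGAACGCUGGCGGC...
  output:  >X00001.1.1400|Bacteria;Firmicutes;uncultured_bacterium
           GATTGAACGCTGGCGGC...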
=====================================
usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
=====================================
@@ -0,0 +1,132 @@
+#!/usr/bin/perl
+################################################################################
+# NGS workflow by Weizhong Li, http://weizhongli-lab.org
+################################################################################
+
+########## local variables etc. Please edit
+$CD_HIT_dir = "/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621";
+$NGS_prog_trimmomatic = "/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar";
+
+########## computation resources for execution of jobs
+%NGS_executions = ();
+$NGS_executions{"qsub_1"} = {
+ "type" => "qsub-pe",
+ "cores_per_node" => 8,
+ "number_nodes" => 64,
+ "user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
+ "command" => "qsub",
+ "command_name_opt" => "-N",
+ "command_err_opt" => "-e",
+ "command_out_opt" => "-o",
+ "template" => <<EOD,
+#!/bin/sh
+#PBS -v PATH
+#PBS -V
+
+#\$ -q RNA.q
+#\$ -v PATH
+#\$ -V
+
+EOD
+};
+
+
+$NGS_executions{"sh_1"} = {
+ "type" => "sh",
+ "cores_per_node" => 8,
+ "number_nodes" => 1,
+};
+
+
+# $NGS_batch_jobs{"qc-pe"} = {
+# "CMD_opts" => ["20"],
+# "execution" => "qsub_1", # where to execute
+# "cores_per_cmd" => 4, # number of threads used by command below
+# "no_parallel" => 1, # number of total jobs to run using command below
+# "command" => <<EOD,
+# java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
+# SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
+#
+# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
+# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
+#
+# wait
+# rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
+# EOD
+# };
+
+
+$NGS_batch_jobs{"qc"} = {
+ "CMD_opts" => ["20"],
+ "execution" => "qsub_1", # where to execute
+ "cores_per_cmd" => 4, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+java -jar $NGS_prog_trimmomatic SE -threads 4 -phred33 \\DATA.0 \\SELF/R1.fq \\
+ SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
+perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
+rm -f \\SELF/R1.fq
+EOD
+};
+
+
+$NGS_batch_jobs{"clstr"} = {
+ "injobs" => ["qc"],
+ "CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
+ "execution" => "qsub_1", # where to execute
+ "cores_per_cmd" => 2, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+#### cluster at 100%
+$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -o \\SELF/seq.nr -sf 1 -sc 1 -r 0 -c 1.00 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.nr.log
+$CD_HIT_dir/usecases/miRNA-seq/filter-small-cluster.pl -i \\SELF/seq.nr.clstr -s \\SELF/seq.nr -o \\SELF/seq.nr-filtered.clstr -f \\SELF/seq.nr-filtered -c 1
+
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr-filtered -o \\SELF/seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.95.log
+$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr-filtered.clstr \\SELF/seq.95.clstr > \\SELF/seq.95-full.clstr
+
+$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.1 -o \\SELF/seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.ref.log
+$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.2 -o \\SELF/seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.spk.log
+
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.ref.clstr > \\SELF/seq.95.reftop.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.spk.clstr > \\SELF/seq.95.spktop.clstr
+$CD_HIT_dir/clstr_merge.pl \\SELF/seq.95-full.clstr \\SELF/seq.95.reftop.clstr > \\SELF/tmp.clstr
+$CD_HIT_dir/clstr_merge.pl \\SELF/tmp.clstr \\SELF/seq.95.spktop.clstr > \\SELF/miRNA.clstr
+
+$CD_HIT_dir/clstr_sort_by.pl < \\SELF/miRNA.clstr > \\SELF/miRNA.clstr.s
+mv \\SELF/miRNA.clstr.s \\SELF/miRNA.clstr
+rm -f \\SELF/tmp.clstr
+EOD
+};
+
+
+$NGS_batch_jobs{"clstr-pooled"} = {
+ "CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
+ "execution" => "qsub_1", # where to execute
+ "cores_per_cmd" => 2, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+#### concat seq.nr-filtered seq.nr-filtered.clstr
+
+$CD_HIT_dir/cd-hit-est -i seq.nr-filtered -o seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > seq.95.log
+$CD_HIT_dir/clstr_rev.pl seq.nr-filtered.clstr seq.95.clstr > seq.95-full.clstr
+
+$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.1 -o seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.ref.log
+$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.2 -o seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.spk.log
+
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.ref.clstr > seq.95.reftop.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.spk.clstr > seq.95.spktop.clstr
+$CD_HIT_dir/clstr_merge.pl seq.95-full.clstr seq.95.reftop.clstr > tmp.clstr
+$CD_HIT_dir/clstr_merge.pl tmp.clstr seq.95.spktop.clstr > miRNA.clstr
+
+$CD_HIT_dir/clstr_sort_by.pl < miRNA.clstr > miRNA.clstr.s
+mv miRNA.clstr.s miRNA.clstr
+
+$CD_HIT_dir/usecases/miRNA-seq/clstr_2_miRNA-table.pl -i miRNA.clstr -o miRNA.txt
+
+EOD
+};
+
+##############################################################################################
+########## END
+1;
+
=====================================
usecases/miRNA-seq/clstr_2_miRNA-table.pl
=====================================
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+#
+use Getopt::Std;
+getopts("i:s:S:o:f:j:",\%opts);
+
+my $input = $opts{i}; $input = "Cluster.clstr" unless $input;
+my $output = $opts{o}; $output = "Cluster.txt" unless ($output);
+my ($i, $j, $k, $str, $cmd, $ll);
+
+my %count = ();
+my %count_t = ();
+my %count_s = ();
+my %Cluster_2_ann = ();   #### hash: cluster number -> annotation string
+# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
+open(TMP, $input) || die "can not open $input";
+my $Cluster=0;
+while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $Cluster++;
+ }
+ else {
+ chop($ll);
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ my $id = $2;
+ if ($id =~ /^Sample\|([^\|]+)\|/) {
+ $sample_id = $1;
+ $sample_id{$sample_id}=1;
+ $count{$Cluster}{$sample_id}++;
+ $count_t{$Cluster}++;
+ $count_s{$sample_id}++;
+ }
+ else {
+ $Cluster_2_ann{$Cluster} .= "$id\t";
+ }
+ }
+ else {
+ die "format error $ll";
+ }
+ }
+}
+close(TMP);
+
+my @sample_ids = sort keys %sample_id;
+
+open(OUT1, "> $output") || die "can not write $output";
+print OUT1 "Cluster";
+foreach $sample_id (@sample_ids){
+ print OUT1 "\t$sample_id";
+}
+#print OUT1 "\tTotal\n";
+print OUT1 "\tAnnotation\tSpike\n";
+
+for ($i=1; $i<=$Cluster; $i++){
+ $ann = "None";
+ if ($Cluster_2_ann{$i}) { $ann = $Cluster_2_ann{$i}; }
+ print OUT1 "Cluster$i";
+ foreach $sample_id (@sample_ids){
+ $k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
+ print OUT1 "\t$k";
+ }
+ #print OUT1 "\t$count_t{$i}";
+ print OUT1 "\t$ann\n";
+}
+close(OUT1);
+
+
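
For reference, a made-up fragment of the .clstr input this script expects: member IDs of
the form Sample|sample_id|n are counted per sample, while any other ID (e.g. a miRBase or
spike-in reference merged in by the clstr job) becomes the cluster's annotation:

  >Cluster 0
  0   22nt, >hsa-miR-21-5p... *
  1   22nt, >Sample|S1|103... at +/100.00%
  2   22nt, >Sample|S2|77... at +/100.00%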
=====================================
usecases/miRNA-seq/filter-small-cluster.pl
=====================================
@@ -0,0 +1,94 @@
+#!/usr/bin/perl
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+getopts("i:s:o:f:c:",\%opts);
+die usage() unless ($opts{i} and $opts{s} and $opts{o} and $opts{f});
+
+my $input = $opts{i}; ## nr.clstr
+my $fa = $opts{s}; ## R1.fa
+my $output = $opts{o}; ## nr-filtered.clstr
+my $output_fa = $opts{f}; ## nr-filtered
+my $cutoff = $opts{c}; $cutoff = 1 unless ($cutoff);
+
+my ($i, $j, $k, $str, $cmd, $ll);
+
+open(TMP, $input) || die "can not open $input";
+open(OUT, "> $output") || die "can not write to $output";
+$no = 0;
+$clstr = "";
+$rep = "";
+my %good_ids = ();
+
+while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ if ($no > $cutoff) {
+ print OUT $clstr;
+ $good_ids{$rep}=1;
+ }
+ $clstr = $ll;
+ $no = 0;
+ }
+ else {
+ $clstr .= $ll;
+ chop($ll);
+ if ($ll =~ /\*$/) {
+ $rep = "";
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $rep = $2;
+ }
+ else {
+ die "format error $ll";
+ }
+ }
+ $no++;
+ }
+}
+#### flush the last cluster
+if ($no > $cutoff) {
+  print OUT $clstr;
+  $good_ids{$rep}=1;
+}
+close(OUT);
+close(TMP);
+
+open(TMP, $fa) || die "can not open $fa";
+open(OUT, ">$output_fa") || die "can not write to $output_fa";
+
+my $flag = 0;
+while($ll = <TMP>) {
+ if ($ll =~ /^>/) {
+ $gi = substr($ll,1);
+ chop($gi);
+ $gi =~ s/\s.+$//;
+ $flag = ( $good_ids{$gi} ) ? 1 : 0;
+ }
+ print OUT $ll if ($flag);
+}
+
+close(TMP);
+close(OUT);
+
+
+
+sub usage {
+<<EOF
+Usage:
+$script_name -i seq.nr.clstr -s R1.fa -o output.clstr -f output.fa -c 1
+
+Options:
+ -i input seq.nr.clstr
+ -s R1.fa
+ -o output.clstr
+ -f output.fa
+ -c abundance cutoff, default $cutoff
+     small clusters <= this size will be considered as noise and will be removed
+
+EOF
+}
+###### END usage
+
View it on GitLab: https://salsa.debian.org/med-team/cd-hit/compare/668f0f6f3ab70b07968a2252a81d1417ecef313a...6a193fc4eddc51ab6704f2d97e5ad68b7c2dec88
You're receiving this email because of your account on salsa.debian.org.