[med-svn] [cd-hit] 02/05: New upstream version 4.6.8
Sascha Steinbiss
satta at debian.org
Sat Jul 1 09:12:25 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository cd-hit.
commit 08d8190e044b037e8c64797551ee7c44c38b303a
Author: Sascha Steinbiss <satta at debian.org>
Date: Sat Jul 1 10:43:01 2017 +0200
New upstream version 4.6.8
---
FET.pl | 109 ++
README | 5 +-
cdhit-common.c++ | 596 ++++++++--
cdhit-common.h | 38 +-
cdhit-est-2d.c++ | 17 +-
cdhit-est.c++ | 15 +-
cdhit-utility.c++ | 63 +-
clstr_list.pl | 89 ++
clstr_list_sort.pl | 51 +
doc/cd-hit-otu-miseq-Figure-1.png | Bin 0 -> 119495 bytes
doc/cdhit-user-guide.pdf | Bin 898823 -> 421458 bytes
doc/cdhit-user-guide.wiki | 432 +++++--
...psi-cd-hit-local.pl => psi-cd-hit-local-old.pl} | 245 +++-
psi-cd-hit/psi-cd-hit-local.pl | 549 +++++++--
psi-cd-hit/{psi-cd-hit.pl => psi-cd-hit-old.pl} | 72 +-
psi-cd-hit/psi-cd-hit.pl | 82 +-
usecases/Miseq-16S/16S-ref-db-PE-splice.pl | 458 ++++++++
usecases/Miseq-16S/NG-Omics-Miseq-16S.pl | 117 ++
usecases/Miseq-16S/NG-Omics-WF.pl | 1189 ++++++++++++++++++++
usecases/Miseq-16S/README | 200 ++++
usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl | 222 ++++
usecases/Miseq-16S/clstr_2_OTU_table.pl | 82 ++
usecases/Miseq-16S/filter-chimeric-and-small.pl | 237 ++++
usecases/Miseq-16S/filter-chimeric-by-ref.pl | 207 ++++
usecases/Miseq-16S/filter-nontop-ref.pl | 51 +
usecases/Miseq-16S/filter-refonly-cluster.pl | 34 +
usecases/Miseq-16S/greengene-ann1.pl | 75 ++
usecases/Miseq-16S/pool_samples.pl | 78 ++
28 files changed, 4980 insertions(+), 333 deletions(-)
diff --git a/FET.pl b/FET.pl
new file mode 100755
index 0000000..6db320f
--- /dev/null
+++ b/FET.pl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl
+
+use Storable;
+use strict;
+use Text::NSP::Measures::2D::Fisher::right;
+
+my $clstr_file = shift;
+my $anno_file = shift;
+my $store_file = shift;
+
+my @cls_list = ();
+my @fun_list = ();
+my $cur_cls = "";
+my %cls2rep = ();
+my @cur_anno = ();
+
+
+open(TMP, $clstr_file) || die;
+while(my $ll = <TMP>) { # read .clstr files
+ if ($ll =~ /^>/) { # the begin of a cluster
+ $cur_cls = $ll;
+ $cur_cls =~ s/^>(.*?)\s$/$1/;
+ # print "$cur_cls|\n";
+ }
+ else{
+ chop($ll);
+ if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./) {
+ my @tmp = split(/\|\|/,$4);
+ if ($#tmp == 0){
+ @cur_anno = ();
+ }
+ else{
+ @cur_anno = split(/,/, pop(@tmp));
+ }
+# print $cur_cls.$cur_anno[0]."|\n";
+ push(@cls_list, $cur_cls);
+ push(@fun_list, [@cur_anno]);
+ if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\.(.*)\*$/){
+ # print "$4\n";
+ $cls2rep{$cur_cls} = $4;
+# print "$cur_cls\t$4\n";
+ }
+ }
+ }
+}
+
+#print join("\n", @cls_list[0..10]);
+@cls_list = map {$cls2rep{$_}} @cls_list;
+#print join("\n", @cls_list[0..10]);
+#print "\n";
+#foreach my $i (0..10){
+# print join("\t",@{$fun_list[$i]});
+# print "\n";
+#}
+#print join("\n", @fun_list[0..10]);
+#exit(1);
+my %cls_size = ();
+my %cls_anno = ();
+my %anno_size = ();
+my $M = $#fun_list+1;
+#print $#fun_list."\t".$M."\n";
+#print $#cls_list."\t".$M."\n";
+foreach my $i (0..$#fun_list){
+ $cls_size{$cls_list[$i]}++;
+ if ($#{$fun_list[$i]} >= 0) { # have annotation
+ foreach my $anno (@{$fun_list[$i]}){
+# print "$i\t$cls_list[$i]\t$anno\n";
+ $anno_size{$anno}++;
+ $cls_anno{$cls_list[$i]}{$anno}++;
+ }
+ }
+}
+
+#while (my ($a,$b) = each %anno_size){
+# print "$a\t$b\n";
+#}
+
+#print "COG0171\t".$anno_
+
+my %resu = ();
+while(my ($cls, $cls_s) = each %cls_size){
+ my @tmp = ();
+# $resu{$cls} = [];
+ while (my ($anno,$anno_s) = each %{$cls_anno{$cls}}){
+# print "$cls\t$cls_s\t$anno\t$anno_s\t$anno_size{$anno}";
+# print "\n";
+ my $pvalue = calculateStatistic(n11=>$anno_s, n1p=>$cls_s, np1=>$anno_size{$anno}, npp=>$M);
+ # anno_term, anno_size, clsper, anno_total, backper, enrichment, pvalue
+ push @tmp, [$anno, $anno_s, $anno_s/$cls_s, $anno_size{$anno}, $anno_size{$anno}/$M, $anno_s*$M/($cls_s*$anno_size{$anno}), $pvalue];
+ # push $resu{$cls}, [sort{$a[0] <=> $b[0]} @tmp];
+ }
+ @tmp = sort {$$a[6] <=> $$b[6]} @tmp;
+ $resu{$cls} = [@tmp];
+}
+
+store \%resu, $store_file;
+open(TMP, "> $anno_file") || die;
+print TMP "ClsName\tClsSize\tAnno_term\tAnno_size\tClsPer\tAnno_total\tSeq_total\tBackPer\tEnrichment\tPvalue\n";
+while(my ($cls, $info) = each %resu){
+ foreach my $a (@{$info}){ #[$pvalue, $enrichment, $anno_s, $anno]
+ print TMP join("\t",($cls, $cls_size{$cls}, $a->[0], $a->[1], $a->[2], $a->[3],
+ $M, $a->[4], $a->[5], $a->[6]))."\n";
+# print "$cls\t".join("\t",@{$a})."\n";
+ }
+# print "$cls\t$#{$info}\n";
+}
+close(TMP)
+
+
diff --git a/README b/README
index 988ef6f..e5d47b6 100644
--- a/README
+++ b/README
@@ -14,9 +14,8 @@ For psi-cd-hit
please download legacy BLAST (not BLAST+) and install the executables in your $PATH
-For more information, please visit http://cd-hit.org or please read the cdhit-users-guide.pdf.
-Most up-to-date documents are available at http://weizhongli-lab.org/cd-hit/wiki/doku.php?id=cd-hit_user_guide.
+For more information, please visit http://cd-hit.org
-cd-hit was originally hosted at Google Code, some of the old releases are still available from https://code.google.com/p/cdhit/.
+Most up-to-date documents are available at https://github.com/weizhongli/cdhit/wiki
cd-hit is also available as web server, visit http://cd-hit.org for web server address.
diff --git a/cdhit-common.c++ b/cdhit-common.c++
index 6d0ecfa..a12db63 100644
--- a/cdhit-common.c++
+++ b/cdhit-common.c++
@@ -212,7 +212,9 @@ bool Options::SetOptionCommon( const char *flag, const char *value )
{
int intval = atoi( value );
if (strcmp(flag, "-i" ) == 0) input = value;
+ else if (strcmp(flag, "-j" ) == 0) input_pe = value;
else if (strcmp(flag, "-o" ) == 0) output = value;
+ else if (strcmp(flag, "-op") == 0) output_pe = value;
else if (strcmp(flag, "-M" ) == 0) max_memory = atoll(value) * 1000000;
else if (strcmp(flag, "-l" ) == 0) min_length = intval;
else if (strcmp(flag, "-c" ) == 0) cluster_thd = atof(value), useIdentity = true;
@@ -223,6 +225,12 @@ bool Options::SetOptionCommon( const char *flag, const char *value )
else if (strcmp(flag, "-s" ) == 0) diff_cutoff = atof(value);
else if (strcmp(flag, "-S" ) == 0) diff_cutoff_aa = intval;
else if (strcmp(flag, "-B" ) == 0) store_disk = intval;
+ else if (strcmp(flag, "-P" ) == 0) PE_mode = intval;
+ else if (strcmp(flag, "-cx") == 0) trim_len = intval;
+ else if (strcmp(flag, "-cy") == 0) trim_len_R2 = intval;
+ else if (strcmp(flag, "-ap") == 0) align_pos = intval;
+ else if (strcmp(flag, "-sc") == 0) sort_output = intval;
+ else if (strcmp(flag, "-sf") == 0) sort_outputf = intval;
else if (strcmp(flag, "-p" ) == 0) print = intval;
else if (strcmp(flag, "-g" ) == 0) cluster_best = intval;
else if (strcmp(flag, "-G" ) == 0) global_identity = intval;
@@ -280,6 +288,7 @@ bool Options::SetOption2D( const char *flag, const char *value )
{
if( SetOptionCommon( flag, value ) ) return true;
if (strcmp(flag, "-i2" ) == 0) input2 = value;
+ else if (strcmp(flag, "-j2" ) == 0) input2_pe = value;
else if (strcmp(flag, "-s2") == 0) diff_cutoff2 = atof(value);
else if (strcmp(flag, "-S2") == 0) diff_cutoff_aa2 = atoi(value);
else return false;
@@ -351,6 +360,14 @@ void Options::Validate()
if ((cluster_thd > 1.0) || (cluster_thd < 0.4)) bomb_error("invalid clstr");
}
+ if (input.size() == 0) bomb_error("no input file");
+ if (output.size() == 0) bomb_error("no output file");
+ if (PE_mode) {
+ if (input_pe.size() == 0) bomb_error("no input file for R2 sequences in PE mode");
+ if (output_pe.size() == 0) bomb_error("no output file for R2 sequences in PE mode");
+ }
+ if (isEST && (align_pos==1)) option_r = 0;
+
if (band_width < 1 ) bomb_error("invalid band width");
if (NAA < 2 || NAA > NAA_top_limit) bomb_error("invalid word length");
if (des_len < 0 ) bomb_error("too short description, not enough to identify sequences");
@@ -360,6 +377,9 @@ void Options::Validate()
if( has2D ){
if ((diff_cutoff2<0) || (diff_cutoff2>1)) bomb_error("invalid value for -s2");
if (diff_cutoff_aa2<0) bomb_error("invalid value for -S2");
+ if (PE_mode) {
+ if (input2_pe.size() == 0) bomb_error("no input file for R2 sequences for 2nd db in PE mode");
+ }
}
if (global_identity == 0) print = 1;
if (short_coverage < long_coverage) short_coverage = long_coverage;
@@ -468,6 +488,15 @@ void format_seq(char *seq)
seq[j] = 0;
} // END void format_seq
+void strrev(char *p)
+{
+ char *q = p;
+ while(q && *q) ++q;
+ for(--q; p < q; ++p, --q)
+ *p = *p ^ *q,
+ *q = *p ^ *q,
+ *p = *p ^ *q;
+}
////For smiple len1 <= len2, len2 is for existing representative
////walk along all diag path of two sequences,
@@ -1458,6 +1487,7 @@ Sequence::Sequence( const Sequence & other )
distance = 2.0;
if( other.data ){
size = bufsize = other.size;
+ size_R2 = 0;
data = new char[size+1];
//printf( "data: %p %p\n", data, other.data );
data[size] = 0;
@@ -1471,6 +1501,49 @@ Sequence::Sequence( const Sequence & other )
identifier[len] = 0;
}
}
+
+// back to back merge for PE
+// R1 -> XXXXXXABC ------------------- NMLYYYYYY <--R2
+// >R1 >R2
+// XXXXXXABC YYYYYYLMN =====> Merge into
+// >R12
+// NMLYYYYYYXXXXXXABC
+Sequence::Sequence( const Sequence & other, const Sequence & other2, int mode )
+{
+ int i;
+ if (mode != 1) bomb_error("unknown mode");
+
+ //printf( "new: %p %p\n", this, & other );
+ memcpy( this, & other, sizeof( Sequence ) );
+ distance = 2.0;
+
+ if( other.data && other2.data ){
+ size = bufsize = (other.size + other2.size);
+ size_R2 = other2.size;
+ data = new char[size+1];
+ //printf( "data: %p %p\n", data, other.data );
+ data[size] = 0;
+ data[size_R2] = 0;
+ memcpy( data, other2.data, size_R2); // copy R2 first
+ strrev( data ); // reverse R2 on data
+ memcpy( data+size_R2, other.data, size-size_R2 ); // copy R1 to end of R2
+ //for (i=0; i<size; i++) data[i] = other.data[i];
+ des_begin2 = other2.des_begin;
+ tot_length2= other2.tot_length;
+ }
+ else if ( other.data || other2.data ) {
+ bomb_error("Not both PE sequences have data");
+ }
+
+ if( other.identifier ){ // only use R1
+ int len = strlen( other.identifier );
+ identifier = new char[len+1];
+ memcpy( identifier, other.identifier, len );
+ identifier[len] = 0;
+ }
+}
+
+
Sequence::~Sequence()
{
//printf( "delete: %p\n", this );
@@ -1530,6 +1603,11 @@ void Sequence::Reserve( int n )
}
if( size ) data[size] = 0;
}
+void Sequence::trim(int trim_len) {
+ if (trim_len >= size) return;
+ size = trim_len;
+ if (size) data[size]=0;
+}
void Sequence::ConvertBases()
{
int i;
@@ -1600,110 +1678,254 @@ void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf
}
}
+// by liwz
+// disable swap option
+// change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
+// where des_begin is the FILE pointer of sequence record start
+// tot_length is the total bytes of sequence record
void SequenceDB::Read( const char *file, const Options & options )
{
- Sequence one;
- Sequence dummy;
- Sequence des;
- Sequence *last = NULL;
- FILE *swap = NULL;
- FILE *fin = fopen( file, "rb" );
- char *buffer = NULL;
- char *res = NULL;
- size_t swap_size = 0;
- int option_l = options.min_length;
- if( fin == NULL ) bomb_error( "Failed to open the database file" );
- if( options.store_disk ) swap = OpenTempFile( temp_dir );
- Clear();
- dummy.swap = swap;
- buffer = new char[ MAX_LINE_SIZE+1 ];
-
- while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
- buffer[0] = '>';
- if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
- if( buffer[0] == '+' ){
- int len = strlen( buffer );
- int len2 = len;
- while( len2 && buffer[len2-1] != '\n' ){
- if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
- len2 = strlen( buffer );
- len += len2;
- }
- one.des_length2 = len;
- dummy.des_length2 = len;
- fseek( fin, one.size, SEEK_CUR );
- }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
- if ( one.size ) { // write previous record
- one.dat_length = dummy.dat_length = one.size;
- if( one.identifier == NULL || one.Format() ){
- printf( "Warning: from file \"%s\",\n", file );
- printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
- if( one.identifier ) printf( "%s\n", one.identifier );
- printf( "%s\n", one.data );
- one.size = 0;
- }
- one.index = dummy.index = sequences.size();
- if( one.size > option_l ) {
- if ( swap ) {
- swap_size += one.size;
- // so that size of file < MAX_BIN_SWAP about 2GB
- if ( swap_size >= MAX_BIN_SWAP) {
- dummy.swap = swap = OpenTempFile( temp_dir );
- swap_size = one.size;
- }
- dummy.size = one.size;
- dummy.offset = ftell( swap );
- dummy.des_length = one.des_length;
- sequences.Append( new Sequence( dummy ) );
- one.ConvertBases();
- fwrite( one.data, 1, one.size, swap );
- }else{
- //printf( "==================\n" );
- sequences.Append( new Sequence( one ) );
- //printf( "------------------\n" );
- //if( sequences.size() > 10 ) break;
- }
- //if( sequences.size() >= 10000 ) break;
- }
- }
- one.size = 0;
- one.des_length2 = 0;
-
- int len = strlen( buffer );
- int len2 = len;
- des.size = 0;
- des += buffer;
- while( len2 && buffer[len2-1] != '\n' ){
- if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
- des += buffer;
- len2 = strlen( buffer );
- len += len2;
- }
- size_t offset = ftell( fin );
- one.des_begin = dummy.des_begin = offset - len;
- one.des_length = dummy.des_length = len;
-
- int i = 0;
- if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
- if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
- if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
- while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
- des.data[i] = 0;
- one.identifier = dummy.identifier = des.data;
- } else {
- one += buffer;
- }
- }
+ Sequence one;
+ Sequence des;
+ FILE *fin = fopen( file, "rb" );
+ char *buffer = NULL;
+ char *res = NULL;
+ int option_l = options.min_length;
+ if( fin == NULL ) bomb_error( "Failed to open the database file" );
+ Clear();
+ buffer = new char[ MAX_LINE_SIZE+1 ];
+
+ while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
+ buffer[0] = '>';
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
+ if( buffer[0] == '+' ){
+ int len = strlen( buffer );
+ int len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // read next line quality score
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer );
+ len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+ }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+ if ( one.size ) { // write previous record
+ if( one.identifier == NULL || one.Format() ){
+ printf( "Warning: from file \"%s\",\n", file );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( one.identifier ) printf( "%s\n", one.identifier );
+ printf( "%s\n", one.data );
+ one.size = 0;
+ }
+ one.index = sequences.size();
+ if( one.size > option_l ) {
+ if (options.trim_len > 0) one.trim(options.trim_len);
+ sequences.Append( new Sequence( one ) );
+ }
+ }
+ one.size = 0;
+ one.tot_length = 0;
+
+ int len = strlen( buffer );
+ int len2 = len;
+ des.size = 0;
+ des += buffer;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ des += buffer;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ size_t offset = ftell( fin );
+ one.des_begin = offset - len;
+ one.tot_length += len; // count first line
+
+ int i = 0;
+ if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+ if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+ if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+ while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+ des.data[i] = 0;
+ one.identifier = des.data;
+ } else {
+ one.tot_length += strlen(buffer); one += buffer;
+ }
+ }
#if 0
- int i, n = 0;
- for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
- cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
- int i;
- scanf( "%i", & i );
+ int i, n = 0;
+ for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
+ cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
+ int i;
+ scanf( "%i", & i );
#endif
- one.identifier = dummy.identifier = NULL;
- delete[] buffer;
- fclose( fin );
+ one.identifier = NULL;
+ delete[] buffer;
+ fclose( fin );
+}
+
+// PE reads liwz, disable swap option
+void SequenceDB::Read( const char *file, const char *file2, const Options & options )
+{
+ Sequence one, two;
+ Sequence des;
+ FILE *fin = fopen( file, "rb" );
+ FILE *fin2= fopen( file2,"rb" );
+ char *buffer = NULL;
+ char *buffer2= NULL;
+ char *res = NULL;
+ char *res2= NULL;
+ int option_l = options.min_length;
+ if( fin == NULL ) bomb_error( "Failed to open the database file" );
+ if( fin2== NULL ) bomb_error( "Failed to open the database file" );
+ Clear();
+ buffer = new char[ MAX_LINE_SIZE+1 ];
+ buffer2= new char[ MAX_LINE_SIZE+1 ];
+
+ while (((not feof( fin )) && (not feof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
+ buffer[0] = '>'; res =fgets( buffer, MAX_LINE_SIZE, fin );
+ buffer2[0]= '>'; res2=fgets( buffer2, MAX_LINE_SIZE, fin2 );
+
+ if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
+ if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
+ if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
+ if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
+ if ( (res == NULL) && (one.size == 0)) break;
+
+ if( buffer[0] == '+' ){ // fastq 3rd line
+ // file 1
+ int len = strlen( buffer );
+ int len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // read next line quality score
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer );
+ len2 = len;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ one.tot_length += len;
+
+ // file 2
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
+ if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ two.tot_length += len;
+
+ // read next line quality score
+ if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) bomb_error("can not read quality score after");
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){
+ if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ two.tot_length += len;
+
+ }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
+ if ( one.size && two.size ) { // write previous record
+ if( one.identifier == NULL || one.Format() ){
+ printf( "Warning: from file \"%s\",\n", file );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( one.identifier ) printf( "%s\n", one.identifier );
+ printf( "%s\n", one.data );
+ one.size=0; two.size=0;
+ }
+ if( two.identifier == NULL || two.Format() ){
+ printf( "Warning: from file \"%s\",\n", file2 );
+ printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
+ if( two.identifier ) printf( "%s\n", two.identifier );
+ printf( "%s\n", two.data );
+ one.size=0; two.size = 0;
+ }
+ one.index = sequences.size();
+ if( (one.size + two.size)> option_l ) {
+ if (options.trim_len > 0) one.trim(options.trim_len);
+ if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
+ sequences.Append( new Sequence( one, two, 1 ) );
+ }
+ }
+ // R1
+ one.size = 0;
+ one.tot_length = 0;
+
+ int len = strlen( buffer );
+ int len2 = len;
+ des.size = 0;
+ des += buffer;
+ while( len2 && buffer[len2-1] != '\n' ){
+ if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
+ des += buffer;
+ len2 = strlen( buffer );
+ len += len2;
+ }
+ size_t offset = ftell( fin );
+ one.des_begin = offset - len; // offset of ">" or "@"
+ one.tot_length += len; // count first line
+
+ int i = 0;
+ if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
+ if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
+ if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
+ while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
+ des.data[i] = 0; // find first non-space letter
+ one.identifier = des.data;
+
+ // R2
+ two.size = 0;
+ two.tot_length = 0;
+
+ len = strlen( buffer2 );
+ len2 = len;
+ while( len2 && buffer2[len2-1] != '\n' ){
+ if ( (res=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
+ len2 = strlen( buffer2 );
+ len += len2;
+ }
+ offset = ftell( fin2 );
+ two.des_begin = offset - len;
+ two.tot_length += len; // count first line
+ two.identifier = des.data;
+ } else {
+ one.tot_length += strlen(buffer); one += buffer;
+ two.tot_length+= strlen(buffer2); two+= buffer2;
+ }
+ }
+#if 0
+ int i, n = 0;
+ for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
+ cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
+ int i;
+ scanf( "%i", & i );
+#endif
+ one.identifier = NULL;
+ two.identifier = NULL;
+ delete[] buffer;
+ fclose( fin );
+ delete[] buffer2;
+ fclose( fin2 );
}
#if 0
@@ -1827,7 +2049,6 @@ void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Opt
n = sequences.size();
for (i=0; i<n; i++){
Sequence *seq = sequences[i];
- int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
fseek( fin, seq->des_begin, SEEK_SET );
seg_size += seq->size;
@@ -1839,8 +2060,8 @@ void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Opt
seg_size = seq->size;
}
- count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
- rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
+ count = seq->tot_length / MAX_LINE_SIZE;
+ rest = seq->tot_length % MAX_LINE_SIZE;
//printf( "count = %6i, rest = %6i\n", count, rest );
for (j=0; j<count; j++){
if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
@@ -1868,11 +2089,10 @@ void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options
std::sort( sorting.begin(), sorting.end() );
for (i=0; i<n; i++){
Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
- int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
fseek( fin, seq->des_begin, SEEK_SET );
- count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
- rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
+ count = seq->tot_length / MAX_LINE_SIZE;
+ rest = seq->tot_length % MAX_LINE_SIZE;
//printf( "count = %6i, rest = %6i\n", count, rest );
for (j=0; j<count; j++){
if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
@@ -1887,11 +2107,89 @@ void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options
fclose( fout );
delete []buf;
}
+// liwz PE output
+void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
+{
+ FILE *fin = fopen( db, "rb" );
+ FILE *fout = fopen( newdb, "w+" );
+ FILE *fin_pe = fopen( db_pe, "rb" );
+ FILE *fout_pe = fopen( newdb_pe, "w+" );
+ int i, j, n = rep_seqs.size();
+ int count, rest;
+ char *buf = new char[MAX_LINE_SIZE+1];
+ vector<uint64_t> sorting( n );
+ if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
+ if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
+ for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
+ std::sort( sorting.begin(), sorting.end() );
+
+ //sort fasta / fastq
+ int *clstr_size;
+ int *clstr_idx1;
+ if (options.sort_outputf) {
+ clstr_size = new int[n];
+ clstr_idx1 = new int[n];
+ for (i=0; i<n; i++) {
+ clstr_size[i] = 0;
+ clstr_idx1[i] = i;
+ }
+
+ int N = sequences.size();
+ for (i=0; i<N; i++) {
+ int id = sequences[i]->cluster_id;
+ if (id < 0) continue;
+ if (id >=n) continue;
+ clstr_size[id]++;
+ }
+ quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
+ }
+
+ for (i=0; i<n; i++){
+ Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
+ if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
+ //R1
+ fseek( fin, seq->des_begin, SEEK_SET );
+
+ count = seq->tot_length / MAX_LINE_SIZE;
+ rest = seq->tot_length % MAX_LINE_SIZE;
+ //printf( "count = %6i, rest = %6i\n", count, rest );
+ for (j=0; j<count; j++){
+ if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, MAX_LINE_SIZE, fout );
+ }
+ if( rest ){
+ if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, rest, fout );
+ }
+
+ //R2
+ fseek( fin_pe, seq->des_begin2, SEEK_SET );
+
+ count = seq->tot_length2 / MAX_LINE_SIZE;
+ rest = seq->tot_length2 % MAX_LINE_SIZE;
+ //printf( "count = %6i, rest = %6i\n", count, rest );
+ for (j=0; j<count; j++){
+ if( fread( buf, 1, MAX_LINE_SIZE, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
+ }
+ if( rest ){
+ if( fread( buf, 1, rest, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
+ fwrite( buf, 1, rest, fout_pe );
+ }
+
+ }
+ fclose( fin );
+ fclose( fout );
+ fclose( fin_pe );
+ fclose( fout_pe );
+ delete []buf;
+}
+
void SequenceDB::WriteExtra1D( const Options & options )
{
string db_clstr = options.output + ".clstr";
string db_clstr_bak = options.output + ".bak.clstr";
- int i, k, N = sequences.size();
+ int i, i0, k, N = sequences.size();
vector<long long> sorting( N );
for (i=0; i<N; i++) sorting[i] = ((long long)sequences[i]->index << 32) | i;
std::sort( sorting.begin(), sorting.end() );
@@ -1918,11 +2216,33 @@ void SequenceDB::WriteExtra1D( const Options & options )
}
fout = fopen( db_clstr.c_str(), "w+" );
- for (i=0; i<M; i++) {
+
+ if (options.sort_output) {
+ int *clstr_size = new int[M];
+ int *clstr_idx1 = new int[M];
+
+ for (i=0; i<M; i++) {
+ clstr_size[i] = (int)clusters[i].size();
+ clstr_idx1[i] = i;
+ }
+ quick_sort_idxr(clstr_size, clstr_idx1, 0, M-1);
+
+ for (i=0; i<M; i++) {
+ i0 = clstr_idx1[i];
+ fprintf( fout, ">Cluster %i\n", i );
+ for (k=0; k<(int)clusters[i0].size(); k++)
+ sequences[ clusters[i0][k] ]->PrintInfo( k, fout, options, buf );
+ }
+ }
+ else {
+ for (i=0; i<M; i++) {
fprintf( fout, ">Cluster %i\n", i );
for (k=0; k<(int)clusters[i].size(); k++)
sequences[ clusters[i][k] ]->PrintInfo( k, fout, options, buf );
- }
+ }
+
+ }
+
delete []buf;
}
void SequenceDB::WriteExtra2D( SequenceDB & other, const Options & options )
@@ -3284,7 +3604,63 @@ void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> &Comp_AAN_
}
} // make_comp_short_word_index
-
+//quick_sort_idx calling (a, idx, 0, no-1)
+//sort a with another array idx
+//so that idx rearranged
+int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
+ int lo = lo0;
+ int hi = hi0;
+ int mid;
+ int tmp;
+
+ if ( hi0 > lo0) {
+ mid = a[ ( lo0 + hi0 ) / 2 ];
+
+ while( lo <= hi ) {
+ while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
+ while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
+ if( lo <= hi ) {
+ tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
+ tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
+ lo++; hi--;
+ }
+ } // while
+
+ if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi );
+ if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 );
+ } // if ( hi0 > lo0)
+ return 0;
+} // quick_sort_idx
+
+
+//decreasing can not use reverse of quick_sort_idx due to tie
+//quick_sort_idxr calling (a, idx, 0, no-1)
+//sort a with another array idx
+//so that idx rearranged
+int quick_sort_idxr (int *a, int *idx, int lo0, int hi0 ) {
+ int lo = lo0;
+ int hi = hi0;
+ int mid;
+ int tmp;
+
+ if ( hi0 > lo0) {
+ mid = a[ ( lo0 + hi0 ) / 2 ];
+
+ while( lo <= hi ) {
+ while( ( lo < hi0 ) && ( a[lo] > mid ) ) lo++;
+ while( ( hi > lo0 ) && ( a[hi] < mid ) ) hi--;
+ if( lo <= hi ) {
+ tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
+ tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
+ lo++; hi--;
+ }
+ } // while
+
+ if( lo0 < hi ) quick_sort_idxr(a, idx, lo0, hi );
+ if( lo < hi0 ) quick_sort_idxr(a, idx, lo, hi0 );
+ } // if ( hi0 > lo0)
+ return 0;
+} // quick_sort_idxr
/////////////////////////// END ALL ////////////////////////
diff --git a/cdhit-common.h b/cdhit-common.h
index 4daa256..b400294 100644
--- a/cdhit-common.h
+++ b/cdhit-common.h
@@ -39,7 +39,7 @@
#include<vector>
#include<map>
-#define CDHIT_VERSION "4.6"
+#define CDHIT_VERSION "4.7"
#ifndef MAX_SEQ
#define MAX_SEQ 655360
@@ -280,6 +280,10 @@ struct Options
int frag_size;
int option_r;
int threads;
+ int PE_mode; // -P
+ int trim_len; // -cx
+ int trim_len_R2; // -cy
+ int align_pos; // -ap for alignment position
size_t max_entries;
size_t max_sequences;
@@ -293,8 +297,14 @@ struct Options
bool backupFile;
string input;
+ string input_pe;
string input2;
+ string input2_pe;
string output;
+ string output_pe;
+
+ int sort_output; // -sc
+ int sort_outputf; // -sf
Options(){
backupFile = false;
@@ -332,6 +342,12 @@ struct Options
frag_size = 0;
des_len = 20;
threads = 1;
+ PE_mode = 0;
+ trim_len = 0;
+ trim_len_R2 = 0;
+ align_pos = 0;
+ sort_output = 0;
+ sort_outputf = 0;
max_entries = 0;
max_sequences = 1<<20;
mem_limit = 100000000;
@@ -358,6 +374,7 @@ struct Sequence
// length of the sequence:
int size;
int bufsize;
+ int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
//uint32_t stats;
@@ -369,13 +386,9 @@ struct Sequence
int offset;
// stream offset of the description string in the database:
- size_t des_begin;
- // length of the description:
- int des_length;
- // length of the description in quality score part:
- int des_length2;
- // length of data in fasta file, including line wrapping:
- int dat_length;
+ size_t des_begin, des_begin2;
+ // total record length
+ int tot_length, tot_length2;
char *identifier;
@@ -389,6 +402,7 @@ struct Sequence
Sequence();
Sequence( const Sequence & other );
+ Sequence( const Sequence & other, const Sequence & other2, int mode );
~Sequence();
void Clear();
@@ -403,6 +417,7 @@ struct Sequence
int Format();
void ConvertBases();
+ void trim(int trim_len);
void SwapIn();
void SwapOut();
@@ -544,7 +559,9 @@ class SequenceDB
~SequenceDB(){ Clear(); }
void Read( const char *file, const Options & options );
+ void Read( const char *file, const char *file2, const Options & options );
void WriteClusters( const char *db, const char *newdb, const Options & options );
+ void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteExtra1D( const Options & options );
void WriteExtra2D( SequenceDB & other, const Options & options );
void DivideSave( const char *db, const char *newdb, int n, const Options & options );
@@ -590,6 +607,7 @@ int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix
int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
int band_left, int band_center, int band_right, WorkingBuffer & buffer);
+void strrev(char *p);
int print_usage_2d (char *arg);
int print_usage_est (char *arg);
int print_usage_div (char *arg);
@@ -606,3 +624,7 @@ void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutof
int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
float current_time();
+
+//some functions from very old cd-hit
+int quick_sort_idx(int *a, int *idx, int lo0, int hi0 );
+int quick_sort_idxr(int *a, int *idx, int lo0, int hi0 );
diff --git a/cdhit-est-2d.c++ b/cdhit-est-2d.c++
index c7ab211..70f1fbb 100644
--- a/cdhit-est-2d.c++
+++ b/cdhit-est-2d.c++
@@ -49,6 +49,10 @@ int main(int argc, char **argv)
string db_in;
string db_in2;
string db_out;
+ string db_in_pe;
+ string db_in2_pe;
+ string db_out_pe;
+
options.cluster_thd = 0.95;
options.NAA = 10;
@@ -67,8 +71,12 @@ int main(int argc, char **argv)
options.Validate();
db_in = options.input;
+ db_in_pe = options.input_pe;
db_in2 = options.input2;
+ db_in2_pe = options.input2_pe;
db_out = options.output;
+ db_out_pe = options.output_pe;
+
InitNAA( MAX_UAA );
options.NAAN = NAAN_array[options.NAA];
@@ -80,10 +88,12 @@ int main(int argc, char **argv)
make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
}
- seq_db.Read( db_in.c_str(), options );
+ if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
+ else {seq_db.Read( db_in.c_str(), options );}
cout << "total seq in db1: " << seq_db.sequences.size() << endl;
- seq_db2.Read( db_in2.c_str(), options );
+ if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
+ else { seq_db2.Read( db_in2.c_str(), options );}
cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
seq_db.SortDivide( options );
@@ -93,6 +103,9 @@ int main(int argc, char **argv)
cout << "writing non-redundant sequences from db2" << endl;
seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
+ if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
+ else { seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options ); }
+
seq_db2.WriteExtra2D( seq_db, options );
cout << "program completed !" << endl << endl;
end_time = current_time();
diff --git a/cdhit-est.c++ b/cdhit-est.c++
index 2915c26..893472d 100644
--- a/cdhit-est.c++
+++ b/cdhit-est.c++
@@ -43,6 +43,8 @@ int main(int argc, char **argv)
{
string db_in;
string db_out;
+ string db_in_pe;
+ string db_out_pe;
options.cluster_thd = 0.95;
options.NAA = 10;
@@ -60,8 +62,10 @@ int main(int argc, char **argv)
if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
options.Validate();
- db_in = options.input;
- db_out = options.output;
+ db_in = options.input;
+ db_in_pe = options.input_pe;
+ db_out = options.output;
+ db_out_pe = options.output_pe;
InitNAA( MAX_UAA );
seq_db.NAAN = NAAN_array[options.NAA];
@@ -71,13 +75,16 @@ int main(int argc, char **argv)
make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
}
- seq_db.Read( db_in.c_str(), options );
+ if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
+ else {seq_db.Read( db_in.c_str(), options );}
+
cout << "total seq: " << seq_db.sequences.size() << endl;
seq_db.SortDivide( options );
seq_db.DoClustering( options );
printf( "writing new database\n" );
- seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options );
+ if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
+ else { seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options ); }
// write a backup clstr file in case next step crashes
seq_db.WriteExtra1D( options );
diff --git a/cdhit-utility.c++ b/cdhit-utility.c++
index 9f92e03..4fb294b 100644
--- a/cdhit-utility.c++
+++ b/cdhit-utility.c++
@@ -7,10 +7,9 @@ using namespace std;
// information
char cd_hit_ver[] = "\t\t====== CD-HIT version " CDHIT_VERSION " (built on " __DATE__ ") ======";
-char cd_hit_ref1[] = "\"Clustering of highly homologous sequences to reduce thesize of large protein database\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2001) 17:282-283";
-char cd_hit_ref2[] = "\"Tolerating some redundancy significantly speeds up clustering of large protein databases\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2002) 18:77-82";
-char cd_hit_ref3[] = "\"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
-char cd_hit_ref4[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
+char cd_hit_ref1[] = "\"CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
+char cd_hit_ref2[] = "\"CD-HIT: accelerated for clustering the next generation sequencing data\", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. Bioinformatics, (2012) 28:3150-3152";
+char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
//
char contacts[] =
@@ -20,9 +19,18 @@ char contacts[] =
" If you find cd-hit useful, please kindly cite:\n\n";
char txt_option_i[] = "\tinput filename in fasta format, required\n";
+char txt_option_j[] =
+"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
+\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
+\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
+char txt_option_j2[] =
+"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
+\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
+\t -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 \n";
char txt_option_o[] = "\toutput filename, required\n";
+char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (PE) files\n";
char txt_option_c[] =
"\tsequence identity threshold, default 0.9\n \
\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
@@ -88,7 +96,21 @@ char txt_option_A[] =
char txt_option_B[] =
"\t1 or 0, default 0, by default, sequences are stored in RAM\n \
\tif set to 1, sequence are stored on hard drive\n \
-\tit is recommended to use -B 1 for huge databases\n";
+\t!! No longer supported !!\n";
+char txt_option_P[] =
+"\tinput paired end (PE) reads, default 0, single file\n \
+\tif set to 1, please use -i R1 -j R2 to input both PE files\n";
+char txt_option_cx[] =
+"\tlength to keep after trimming the tail of sequence, default 0, not trimming\n \
+\tif set to 50, the program only uses the first 50 letters of input sequence\n";
+char txt_option_cy[] =
+"\tlength to keep after trimming the tail of R2 sequence, default 0, not trimming\n \
+\tif set to 50, the program only uses the first 50 letters of input R2 sequence\n \
+\te.g. -cx 100 -cy 80 for paired end reads\n";
+char txt_option_ap[] =
+"\talignment position constrains, default 0, no constrain\n \
+\tif set to 1, the program will force sequences to align at beginings\n \
+\twhen set to 1, the program only does +/+ alignment\n";
char txt_option_uL[] =
"\tmaximum unmatched percentage for the longer sequence, default 1.0\n \
\tif set to 0.1, the unmatched region (excluding leading and tailing gaps)\n \
@@ -108,6 +130,12 @@ char txt_option_r[] =
\tif set to 0, only +/+ strand alignment\n";
char txt_option_bak[] =
"\twrite backup cluster file (1 or 0, default 0)\n";
+char txt_option_sc[] =
+"\tsort clusters by size (number of sequences), default 0, output clusters by decreasing length\n \
+\tif set to 1, output clusters by decreasing size\n";
+char txt_option_sf[] =
+"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
+\tif set to 1, output sequences by decreasing cluster size\n";
char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
@@ -145,6 +173,8 @@ int print_usage (char *arg) {
cout << " -B" << txt_option_B;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
+ cout << " -sc"<< txt_option_sc;
+ cout << " -sf"<< txt_option_sf;
cout << " -bak" << txt_option_bak;
cout << " -h\tprint this help\n\n";
cout << contacts;
@@ -190,7 +220,7 @@ int print_usage_2d (char *arg) {
cout << " Questions, bugs, contact Weizhong Li at liwz at sdsc.edu\n\n";
cout << " If you find cd-hit useful, please kindly cite:\n\n";
cout << " " << cd_hit_ref1 << "\n";
- cout << " " << cd_hit_ref3 << "\n\n\n";
+ cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_2d
@@ -199,7 +229,9 @@ int print_usage_est (char *arg) {
cout << cd_hit_ver << "\n\n" ;
cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
cout << " -i" << txt_option_i;
+ cout << " -j" << txt_option_j;
cout << " -o" << txt_option_o;
+ cout << " -op" << txt_option_op;
cout << " -c" << txt_option_c;
cout << " -G" << txt_option_G;
cout << " -b" << txt_option_b;
@@ -219,6 +251,10 @@ int print_usage_est (char *arg) {
cout << " -uS" << txt_option_uS;
cout << " -U" << txt_option_U;
cout << " -B" << txt_option_B;
+ cout << " -P" << txt_option_P;
+ cout << " -cx"<< txt_option_cx;
+ cout << " -cy"<< txt_option_cy;
+ cout << " -ap"<< txt_option_ap;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
cout << " -r" << txt_option_r;
@@ -228,10 +264,12 @@ int print_usage_est (char *arg) {
cout << " -gap" << txt_option_gap;
cout << " -gap-ext" << txt_option_gap_ext;
cout << " -bak" << txt_option_bak;
+ cout << " -sc"<< txt_option_sc;
+ cout << " -sf"<< txt_option_sf;
cout << " -h\tprint this help\n\n";
cout << contacts;
cout << " " << cd_hit_ref1 << "\n";
- cout << " " << cd_hit_ref3 << "\n\n\n";
+ cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_est
@@ -241,7 +279,9 @@ int print_usage_est_2d (char *arg) {
cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
cout << " -i" << txt_option_i_2d;
cout << " -i2"<< txt_option_i2;
+ cout << " -j, -j2"<< txt_option_j2;
cout << " -o" << txt_option_o;
+ cout << " -op" << txt_option_op;
cout << " -c" << txt_option_c;
cout << " -G" << txt_option_G;
cout << " -b" << txt_option_b;
@@ -263,6 +303,9 @@ int print_usage_est_2d (char *arg) {
cout << " -uS" << txt_option_uS;
cout << " -U" << txt_option_U;
cout << " -B" << txt_option_B;
+ cout << " -P" << txt_option_P;
+ cout << " -cx"<< txt_option_cx;
+ cout << " -cy"<< txt_option_cy;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
cout << " -r" << txt_option_r;
@@ -275,7 +318,7 @@ int print_usage_est_2d (char *arg) {
cout << " -h\tprint this help\n\n";
cout << contacts;
cout << " " << cd_hit_ref1 << "\n";
- cout << " " << cd_hit_ref3 << "\n\n\n";
+ cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_est_2d
@@ -326,8 +369,8 @@ int print_usage_454 (char *arg)
cout << " Questions, bugs, contact Weizhong Li at liwz at sdsc.edu\n\n";
cout << " If you find cd-hit useful, please kindly cite:\n\n";
cout << " " << cd_hit_ref1 << "\n";
- cout << " " << cd_hit_ref3 << "\n";
- cout << " " << cd_hit_ref4 << "\n\n\n";
+ cout << " " << cd_hit_ref2 << "\n";
+ cout << " " << cd_hit_ref3 << "\n\n\n";
exit(1);
}
diff --git a/clstr_list.pl b/clstr_list.pl
new file mode 100755
index 0000000..9c6639b
--- /dev/null
+++ b/clstr_list.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl
+
+use Storable;
+use strict;
+#my $sort_by_what = shift;
+# $sort_by_what = "no" unless $sort_by_what;
+
+my $clstr_file = shift;
+my $store_file = shift;
+
+my %clstr = (); # an array of hashes for all the cluster
+my $rep_len = 0;
+my $rep_acc = "";
+my @cur_sequences = (); # array of hashes for all sequences in a cluster
+my $ll = "";
+my @record = ();
+
+open(TMP, $clstr_file) || die;
+while($ll = <TMP>) { # read .clstr files
+ if ($ll =~ /^>/) { # the begin of a cluster
+ if (scalar(@cur_sequences)) { # not the first cluster, therefore collect the information of last clstr
+ #@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
+ @cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
+ @record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
+ $clstr{$rep_acc} = [@record];
+ }
+ @cur_sequences=();
+ }
+ else { # the sequence line
+ chop($ll);
+ if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./) {
+ @record = ($4, $2, 0, [], "");
+ if ($ll =~ /\*$/) { # representative sequence or not
+ $rep_acc = $record[0];
+ $rep_len = $record[1];
+ $record[4] = "100%";
+ }
+# elsif ($ll =~ / at (\d.+)$/ ) {
+ elsif ($ll =~ / at (.+\d.+)$/ ) {# because cd-hit-est have strand info
+ $record[4] = $1;
+ }
+ }
+ push(@cur_sequences, [@record]);
+ }
+}
+if (scalar(@cur_sequences)) {
+ #@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
+ @cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
+ @record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
+ $clstr{$rep_acc} = [@record];
+}
+close(TMP);
+
+if (-e $store_file){ # already have a cluster file
+ my %old_clstr = %{retrieve($store_file)};
+ foreach my $rep_acc (keys %clstr){
+ my $seqs = $clstr{$rep_acc}[3]; # $seqs a reference to the sequences;
+ my $tmp_size = scalar(@{$seqs}); # how many sequences in a top level cluster, each sequence should be a representative sequence for lower level cluster
+ #print "$rep_acc, $tmp_size\n";
+ my $i;
+ for $i (0..($tmp_size-1)){
+ my $seq = $$seqs[$i];
+ if ($old_clstr{$$seq[0]}){
+ $clstr{$rep_acc}[3][$i][3] = [@{$old_clstr{$$seq[0]}[3]}];
+ $clstr{$rep_acc}[3][$i][2] = 1;
+ }
+ }
+ }
+}
+
+store \%clstr, $store_file;
+
+#~ my $size = scalar(keys %clstr);
+#~ print "$size\n";
+
+#~ my $acc = 'D8F4YGO02FSTQP|range|2:370|frame|2|len|123';
+
+#~ my $temp = $clstr{$acc}[1];
+#~ print "$temp\n";
+
+#~ my $temp = scalar(@{$clstr{$acc}[3]});
+#~ print "$temp\n";
+
+#~ my $x;
+#~ for $x (@{$clstr{$acc}[3]} ){
+ #~ my $tmp_1 = scalar(@{$x->[3]});
+ #~ print "$x->[2], $x->[4], $x->[0], $x->[1], $tmp_1\n";
+#~ }
+
diff --git a/clstr_list_sort.pl b/clstr_list_sort.pl
new file mode 100755
index 0000000..e0d20d8
--- /dev/null
+++ b/clstr_list_sort.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+
+use Storable;
+use strict;
+
+my $input_file = shift;
+my $output_file = shift;
+my $sort_by_what = shift;
+ $sort_by_what = "no" unless $sort_by_what;
+
+my @clstr = values %{retrieve($input_file)};
+
+
+if ($sort_by_what eq "no") {
+
+ ### Added by liwz sort by No. sequences instead of No. nodes
+ my %rep2size = ();
+ my $clstr_no = scalar(@clstr);
+ my ($i);
+
+
+ for ($i=0; $i<$clstr_no; $i++){
+ my $node_size = 0;
+ foreach my $seq1 (@{$clstr[$i][3]}) {
+ if ($$seq1[2]) { # can be futher expanded
+ foreach my $seq2(@{$$seq1[3]}) {
+ if ($$seq2[2]) { $node_size += scalar(@{$$seq2[3]}); }
+ else { $node_size++; }
+ }
+ }
+ else {
+ $node_size++;
+ }
+ }
+ $rep2size{ $clstr[$i][0] } = $node_size;
+ }
+ ### END
+
+ #@clstr = sort {scalar(@{$b->[3]}) <=> scalar(@{$a->[3]})} @clstr;
+ @clstr = sort {$rep2size{$b->[0]} <=> $rep2size{$a->[0]}} @clstr;
+}
+elsif ($sort_by_what eq "len") {
+ @clstr = sort {$b->[1] <=> $a->[1]} @clstr;
+}
+elsif ($sort_by_what eq "des") {
+ @clstr = sort {$a->[0] cmp $b->[0]} @clstr;
+}
+
+store \@clstr, $output_file;
+
+
diff --git a/doc/cd-hit-otu-miseq-Figure-1.png b/doc/cd-hit-otu-miseq-Figure-1.png
new file mode 100644
index 0000000..7eb2e7a
Binary files /dev/null and b/doc/cd-hit-otu-miseq-Figure-1.png differ
diff --git a/doc/cdhit-user-guide.pdf b/doc/cdhit-user-guide.pdf
index d6b01d2..7bfa258 100644
Binary files a/doc/cdhit-user-guide.pdf and b/doc/cdhit-user-guide.pdf differ
diff --git a/doc/cdhit-user-guide.wiki b/doc/cdhit-user-guide.wiki
index 2d24505..a665477 100644
--- a/doc/cdhit-user-guide.wiki
+++ b/doc/cdhit-user-guide.wiki
@@ -4,7 +4,7 @@ Last updated: ~~LASTMOD~~
[[http://cd-hit.org]]
-Program developed by Weizhong Li's lab at UCSD [[http://weizhong-lab.ucsd.edu]] and JCVI [[http://jcvi.org]] [[liwz at sdsc.edu]]
+Program developed by Weizhong Li's lab at UCSD [[http://weizhongli-lab.org]] and JCVI [[http://jcvi.org]] [[liwz at sdsc.edu]]
===== Introduction =====
@@ -62,7 +62,7 @@ Based on this greedy method, we established several integrated heuristics that m
**Reduced alphabet (to be implemented)**: This is for protein clustering. In reduced alphabet, a group of exchangeable residues are reduced to a single residue (I/V/L==>I, S/T==>S, D/E==>D, K/R==>K, F/Y==>F), and then conservative mutations would appear as identities in sequence alignments. It improves the short word filter for clustering at low sequence identity below 50%.
-**Gapped word (to be implemented)**: Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
+**Gapped word (to be implemented)**: Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
@@ -95,9 +95,9 @@ Because of the algorithm, cd-hit may not be used for clustering proteins at <40%
It can be copied under the GNU General Public License version 2 (GPLv2).
Most CD-HIT programs were written in C++. Installing CD-HIT package is very simple:
- * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
- * unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
- * change dir by "cd cd-hit-v4.6.2-2015-0511"
+ * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.6-2016-0711.tar.gz
+ * unpack the file with " tar xvf cd-hit-v4.6.6-2016-0711.tar.gz --gunzip"
+ * change dir by "cd cd-hit-v4.6.6-2016-0711"
* compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
* cd cd-hit-auxtools
* compile cd-hit-auxtools by "make"
@@ -107,8 +107,8 @@ Most CD-HIT programs were written in C++. Installing CD-HIT package is very simp
CD-HIT clusters proteins into clusters that meet a user-defined similarity threshold, usually a sequence identity. Each cluster has one representative sequence. The input is a protein dataset in fasta format and the output are two files: a fasta file of representative sequences and a text file of list of clusters.
Basic command:
- cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 –d 0 -T 8
- cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 –d 0 -T 8,
+ cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 -d 0 -T 8
+ cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 -d 0 -T 8,
where\\
''db'' is the filename of input, \\
@@ -182,7 +182,7 @@ __**The most updated options are available from the command line version of the
must not be more than 10 bases
-B 1 or 0, default 0, by default, sequences are stored in RAM
if set to 1, sequence are stored on hard drive
- it is recommended to use -B 1 for huge databases
+ !! No longer supported !!
-p 1 or 0, default 0
if set to 1, print alignment overlap in .clstr file
-g 1 or 0, default 0
@@ -191,6 +191,10 @@ __**The most updated options are available from the command line version of the
will cluster it into the most similar cluster that meet the threshold
(accurate but slow mode)
but either 1 or 0 won't change the representatives of final clusters
+ -sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
+ if set to 1, output clusters by decreasing size
+ -sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
+ if set to 1, output sequences by decreasing cluster size
-bak write backup cluster file (1 or 0, default 0)
-h print this help
@@ -200,7 +204,8 @@ Alignment coverage control:
See the figure below, the -aL, -AL, -aS and -AS options can be used to specify the alignment coverage on both the representative sequence and other sequences. -s and -S can control the length difference between the representative sequence and other sequences.
-{{ :Figure2.png }}
+{{ :cd-hit-figure2.png }}
+
''
aL = R<sub>a</sub> / R\\
@@ -264,18 +269,76 @@ Choose of word size (same as cd-hit):
-n 2 for thresholds 0.4 ~ 0.5
</code>
-More options:
-
-Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
-are same to CD-HIT, here are few more cd-hit-2d specific options:
+Options:
<code>
--i2 input filename for db2 in fasta format, required
--s2 length difference cutoff for db1, default 1.0
- by default, seqs in db1 >= seqs in db2 in a same cluster
- if set to 0.9, seqs in db1 may just >= 90% seqs in db2
--S2 length difference cutoff, default 0
- by default, seqs in db1 >= seqs in db2 in a same cluster
- if set to 60, seqs in db2 may 60aa longer than seqs in db1
+ -i input filename for db1 in fasta format, required
+ -i2 input filename for db2 in fasta format, required
+ -o output filename, required
+ -c sequence identity threshold, default 0.9
+ this is the default cd-hit's "global sequence identity" calculated as:
+ number of identical amino acids in alignment
+ divided by the full length of the shorter sequence
+ -G use global sequence identity, default 1
+ if set to 0, then use local sequence identity, calculated as :
+ number of identical amino acids in alignment
+ divided by the length of the alignment
+ NOTE!!! don't use -G 0 unless you use alignment coverage controls
+ see options -aL, -AL, -aS, -AS
+ -b band_width of alignment, default 20
+ -M	memory limit (in MB) for the program, default 800; 0 for unlimited;
+ -T number of threads, default 1; with 0, all CPUs will be used
+ -n word_length, default 5, see user's guide for choosing it
+ -l length of throw_away_sequences, default 10
+ -t tolerance for redundance, default 2
+ -d length of description in .clstr file, default 20
+ if set to 0, it takes the fasta defline and stops at first space
+ -s length difference cutoff, default 0.0
+ if set to 0.9, the shorter sequences need to be
+ at least 90% length of the representative of the cluster
+ -S length difference cutoff in amino acid, default 999999
+ if set to 60, the length difference between the shorter sequences
+ and the representative of the cluster can not be bigger than 60
+ -s2 length difference cutoff for db1, default 1.0
+ by default, seqs in db1 >= seqs in db2 in a same cluster
+ if set to 0.9, seqs in db1 may just >= 90% seqs in db2
+ -S2 length difference cutoff, default 0
+ by default, seqs in db1 >= seqs in db2 in a same cluster
+ if set to 60, seqs in db2 may 60aa longer than seqs in db1
+ -aL alignment coverage for the longer sequence, default 0.0
+ if set to 0.9, the alignment must covers 90% of the sequence
+ -AL alignment coverage control for the longer sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -aS alignment coverage for the shorter sequence, default 0.0
+ if set to 0.9, the alignment must covers 90% of the sequence
+ -AS alignment coverage control for the shorter sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -A minimal alignment coverage control for the both sequences, default 0
+ alignment must cover >= this value for both sequences
+ -uL maximum unmatched percentage for the longer sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -uS maximum unmatched percentage for the shorter sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -U maximum unmatched length, default 99999999
+ if set to 10, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10 bases
+ -B 1 or 0, default 0, by default, sequences are stored in RAM
+ if set to 1, sequence are stored on hard drive
+ !! No longer supported !!
+ -p 1 or 0, default 0
+ if set to 1, print alignment overlap in .clstr file
+ -g 1 or 0, default 0
+ by cd-hit's default algorithm, a sequence is clustered to the first
+ cluster that meet the threshold (fast cluster). If set to 1, the program
+ will cluster it into the most similar cluster that meet the threshold
+ (accurate but slow mode)
+ but either 1 or 0 won't change the representatives of final clusters
+ -bak write backup cluster file (1 or 0, default 0)
+ -h print this help
+
</code>
==== CD-HIT-EST ====
@@ -289,7 +352,8 @@ difficult to make full-length alignments for these genes. So, CD-HIT-EST is
good for non-intron containing sequences like EST.
Basic command:
- cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 - T 8
+ cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 -T 8
+ cd-hit-est -i R1.fa -j R2.fa -o R1.95.fa -op R2.95.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
Choose of word size:
<code>
@@ -301,11 +365,79 @@ Choose of word size:
-n 4 for thresholds 0.75 ~ 0.8
</code>
-More options:
-
-Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
-are same to CD-HIT, here are few more cd-hit-est specific options:
+Options:
<code>
+ -i input filename in fasta format, required
+ -j input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
+ -i R1.fq -j R2.fq -o output_R1 -op output_R2 or
+ -i R1.fa -j R2.fa -o output_R1 -op output_R2
+ -o output filename, required
+ -op output filename for R2 reads if input are paired end (PE) files
+ -c sequence identity threshold, default 0.9
+ this is the default cd-hit's "global sequence identity" calculated as:
+ number of identical amino acids in alignment
+ divided by the full length of the shorter sequence
+ -G use global sequence identity, default 1
+ if set to 0, then use local sequence identity, calculated as :
+ number of identical amino acids in alignment
+ divided by the length of the alignment
+ NOTE!!! don't use -G 0 unless you use alignment coverage controls
+ see options -aL, -AL, -aS, -AS
+ -b band_width of alignment, default 20
+ -M	memory limit (in MB) for the program, default 800; 0 for unlimited;
+ -T number of threads, default 1; with 0, all CPUs will be used
+ -n word_length, default 10, see user's guide for choosing it
+ -l length of throw_away_sequences, default 10
+ -d length of description in .clstr file, default 20
+ if set to 0, it takes the fasta defline and stops at first space
+ -s length difference cutoff, default 0.0
+ if set to 0.9, the shorter sequences need to be
+ at least 90% length of the representative of the cluster
+ -S length difference cutoff in amino acid, default 999999
+ if set to 60, the length difference between the shorter sequences
+ and the representative of the cluster can not be bigger than 60
+ -aL alignment coverage for the longer sequence, default 0.0
+ if set to 0.9, the alignment must covers 90% of the sequence
+ -AL alignment coverage control for the longer sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -aS alignment coverage for the shorter sequence, default 0.0
+ if set to 0.9, the alignment must covers 90% of the sequence
+ -AS alignment coverage control for the shorter sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -A minimal alignment coverage control for the both sequences, default 0
+ alignment must cover >= this value for both sequences
+ -uL maximum unmatched percentage for the longer sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -uS maximum unmatched percentage for the shorter sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -U maximum unmatched length, default 99999999
+ if set to 10, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10 bases
+ -B 1 or 0, default 0, by default, sequences are stored in RAM
+ if set to 1, sequence are stored on hard drive
+ !! No longer supported !!
+ -P input paired end (PE) reads, default 0, single file
+ if set to 1, please use -i R1 -j R2 to input both PE files
+ -cx length to keep after trimming the tail of sequence, default 0, not trimming
+ if set to 50, the program only uses the first 50 letters of input sequence
+ -cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
+ if set to 50, the program only uses the first 50 letters of input R2 sequence
+ e.g. -cx 100 -cy 80 for paired end reads
+ -ap	alignment position constraints, default 0, no constraint
+ 	if set to 1, the program will force sequences to align at beginnings
+ 	when set to 1, the program only does +/+ alignment
+ -p 1 or 0, default 0
+ if set to 1, print alignment overlap in .clstr file
+ -g 1 or 0, default 0
+ by cd-hit's default algorithm, a sequence is clustered to the first
+ cluster that meet the threshold (fast cluster). If set to 1, the program
+ will cluster it into the most similar cluster that meet the threshold
+ (accurate but slow mode)
+ but either 1 or 0 won't change the representatives of final clusters
-r 1 or 0, default 1, by default do both +/+ & +/- alignments
if set to 0, only +/+ strand alignment
-mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')
@@ -313,6 +445,14 @@ are same to CD-HIT, here are few more cd-hit-est specific options:
-mismatch mismatching score, default -2
-gap gap opening score, default -6
-gap-ext gap extension score, default -1
+ -bak write backup cluster file (1 or 0, default 0)
+ -sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
+ if set to 1, output clusters by decreasing size
+ -sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
+ if set to 1, output sequences by decreasing cluster size
+ -h print this help
+
+
</code>
==== CD-HIT-EST-2D ====
@@ -326,18 +466,98 @@ For same reason as CD-HIT-EST, CD-HIT-EST-2D is good for non-intron containing
sequences like EST.
Basic command:
- cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 - T 8
-
+ cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 -T 8
+ cd-hit-est-2d -i db1.R1.fa -j db1.R2.fa -i2 db2.R1.fa -j2 db2.R2.fa -o db2_novel.R1.fa -op db2_novel.R2.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
+
Choose of word size and options are the same as CD-HIT-EST:
-cd-hit-est-2d specificnoptions:
+Options:
<code>
+ -i input filename for db1 in fasta format, required
+ -i2 input filename for db2 in fasta format, required
+ -j, -j2 input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
+ -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or
+ -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2
+ -o output filename, required
+ -op output filename for R2 reads if input are paired end (PE) files
+ -c sequence identity threshold, default 0.9
+ this is the default cd-hit's "global sequence identity" calculated as:
+ number of identical amino acids in alignment
+ divided by the full length of the shorter sequence
+ -G use global sequence identity, default 1
+ if set to 0, then use local sequence identity, calculated as :
+ number of identical amino acids in alignment
+ divided by the length of the alignment
+ NOTE!!! don't use -G 0 unless you use alignment coverage controls
+ see options -aL, -AL, -aS, -AS
+ -b band_width of alignment, default 20
+ -M memory limit (in MB) for the program, default 800; 0 for unlimited;
+ -T number of threads, default 1; with 0, all CPUs will be used
+ -n word_length, default 10, see user's guide for choosing it
+ -l length of throw_away_sequences, default 10
+ -d length of description in .clstr file, default 20
+ if set to 0, it takes the fasta defline and stops at first space
+ -s length difference cutoff, default 0.0
+ if set to 0.9, the shorter sequences need to be
+ at least 90% length of the representative of the cluster
+ -S length difference cutoff in amino acid, default 999999
+ if set to 60, the length difference between the shorter sequences
+ and the representative of the cluster can not be bigger than 60
-s2 length difference cutoff for db1, default 1.0
by default, seqs in db1 >= seqs in db2 in a same cluster
if set to 0.9, seqs in db1 may just >= 90% seqs in db2
-S2 length difference cutoff, default 0
by default, seqs in db1 >= seqs in db2 in a same cluster
if set to 60, seqs in db2 may 60aa longer than seqs in db1
+ -aL alignment coverage for the longer sequence, default 0.0
+ if set to 0.9, the alignment must cover 90% of the sequence
+ -AL alignment coverage control for the longer sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -aS alignment coverage for the shorter sequence, default 0.0
+ if set to 0.9, the alignment must cover 90% of the sequence
+ -AS alignment coverage control for the shorter sequence, default 99999999
+ if set to 60, and the length of the sequence is 400,
+ then the alignment must be >= 340 (400-60) residues
+ -A minimal alignment coverage control for both sequences, default 0
+ alignment must cover >= this value for both sequences
+ -uL maximum unmatched percentage for the longer sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -uS maximum unmatched percentage for the shorter sequence, default 1.0
+ if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10% of the sequence
+ -U maximum unmatched length, default 99999999
+ if set to 10, the unmatched region (excluding leading and tailing gaps)
+ must not be more than 10 bases
+ -B 1 or 0, default 0, by default, sequences are stored in RAM
+ if set to 1, sequence are stored on hard drive
+ !! No longer supported !!
+ -P input paired end (PE) reads, default 0, single file
+ if set to 1, please use -i R1 -j R2 to input both PE files
+ -cx length to keep after trimming the tail of sequence, default 0, not trimming
+ if set to 50, the program only uses the first 50 letters of input sequence
+ -cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
+ if set to 50, the program only uses the first 50 letters of input R2 sequence
+ e.g. -cx 100 -cy 80 for paired end reads
+ -p 1 or 0, default 0
+ if set to 1, print alignment overlap in .clstr file
+ -g 1 or 0, default 0
+ by cd-hit's default algorithm, a sequence is clustered to the first
+ cluster that meets the threshold (fast cluster). If set to 1, the program
+ will cluster it into the most similar cluster that meets the threshold
+ (accurate but slow mode)
+ but either 1 or 0 won't change the representatives of final clusters
+ -r 1 or 0, default 1, by default do both +/+ & +/- alignments
+ if set to 0, only +/+ strand alignment
+ -mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')
+ -match matching score, default 2 (1 for T-U and N-N)
+ -mismatch mismatching score, default -2
+ -gap gap opening score, default -6
+ -gap-ext gap extension score, default -1
+ -bak write backup cluster file (1 or 0, default 0)
+ -h print this help
+
</code>
@@ -348,7 +568,7 @@ We implemented a program called cd-hit-454 to identify duplicated 454 reads by r
Basic command:
cd-hit-454 -i 454_reads -o 454_reads_95 -c 0.95 -n 10 -d 0 -M 16000 - T 8
-Full list of options:
+Options:
<code>
-i input filename in fasta format, required
-o output filename, required
@@ -423,7 +643,7 @@ Implementation (see figure below)
- repeat cd-hit and cd-hit-2d runs till done
- Combine the results
-{{ :Figure3.png }}
+{{ :cd-hit-figure3.png }}
Basic command:
cd-hit-para.pl -i nr90 -o nr60 -c 0.6 -n 4 --B hosts --S 64
@@ -506,8 +726,12 @@ stable cluster structure.
With multiple-step, iterated runs of CD-HIT, you perform a clustering in a
neighbor-joining method, which generates a hierarchical structure. The third step use psi-cd-hit, please see psi-cd-hit section for details.
+
+This way is faster than one-step clustering. It can also be more accurate.
+
+There is a problem with one-step clustering. Two very similar sequences A and B may be clustered into different clusters. For example, let the clustering threshold be 60%, IAB (identity of AB) = 95%, IAC >= 60%, but IBC < 60%. If C was first selected as a cluster representative, then A will be in cluster "C", but "B" will not, resulting in the near-identical A and B being in different clusters. Hierarchical clustering will reduce this problem.
-{{ :Figure4.png }}
+{{ :cd-hit-figure4.png }}
Commands:
cd-hit -i nr -o nr80 -c 0.8 -n 5 -d 0 -M 16000 -T 16
@@ -525,10 +749,6 @@ nr60.clstr only lists sequences from nr80, script clstr_rev.pl add the original
clstr_rev.pl nr80-60.clstr nr30.clstr > nr80-60-30.clstr
nr30.clstr only lists sequences from nr60, script clstr_rev.pl add the original sequences into file nr80-60-30.clstr
-This way is faster than one-step run from nr directly to nr30. It can also
-more accurate.
-
-
===== CD-HIT AuxTools =====
@@ -541,7 +761,7 @@ read duplicates, finding pairs of overlapping reads or joining pair-end reads et
cd-hit-dup is a simple tool for removing duplicates from sequencing reads,
-with optional step to detect and remove chimeric reads.
+with optional step to detect and remove chimeric reads. When two files of paired end reads are used as inputs, each pair of reads will be concatenated into a single one.
A number of options are provided to tune how the duplicates are removed.
Running the program without arguments should print out the list of available options,
as the following:
@@ -571,42 +791,10 @@ Options:
</code>
=== Option details ===
-
-== Common options ==
-Here are the more detailed description of the options.
-<code>
- -i Input file;
-</code>
-Input file that must be in fasta or fastq format.
-
-<code>
- -i2 Second input file;
-</code>
-cd-hit-dup can take 2 files of paired end reads.
-"-i" can be used to specify the file for the R1;
-and "-i2" can be used to specify the file for R2.
-
-When two files of paired end reads are used as inputs, each pair of reads will
-be concatenated into a single one. And the following steps of duplicate and chimeric
-detection and removing.
-
-<code>
- -o Output file;
-</code>
-Output file which contains a list of reads without duplicates.
-
-<code>
- -o2 Output file for R2, with paired end reads;
-</code>
-
-<code>
- -d Description length (default 0, truncate at the first whitespace character)
-</code>
-The length of description line that should be written to the output.
-
<code>
-u Length of prefix to be used in the analysis (default 0, for full/maximum length);
</code>
+
For pair-end inputs, the program will take part (whole or prefix) of the first end
and part (whole or prefix) of the second read,
and join them together to form a single read to do the analysis.
@@ -621,8 +809,6 @@ It also allows the program to use only the prefix up to the specified length of
to do the analysis. In case that a read is shorter than this length, no 'N' is appended to
the read since it is not necessary.
-
-== Options for duplicate detection ==
<code>
-m Match length (true/false, default true);
</code>
@@ -637,8 +823,6 @@ for duplicate and chimeric detection. For duplicate detection, any two reads wit
no greater than the specified value are considered to be duplicates. For chimeric detection,
this option control how similar a read should be to either of its parents.
-
-== Options for chimeric filtering ==
<code>
-f Filter out chimeric clusters (true/false, default false);
</code>
@@ -883,8 +1067,8 @@ but using BLAST to calculate similarities. Below are the procedures of PSI-CD-HI
- Repeat until done
==== Installation ====
-please download legacy BLAST (not BLAST+) and install the executables in your $PATH. The programs
-required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb.
+please download either legacy BLAST or BLAST+ and install the executables in your $PATH. The programs
+required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb for legacy blast, and blastp, blastn, psiblast and makeblastdb for blast+.
==== Usage ====
@@ -941,11 +1125,11 @@ More options:
-------------circle-----------
| |
seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
- \\\\ /////////////
- \\\\ /////////////
+ \\\\\\\\ /////////////
+ \\\\\\\\ /////////////
HSP 2 -> ////HSP 1 /// <-HSP 2
- ///////////// \\\\
- ///////////// \\\\
+ ///////////// \\\\\\\\
+ ///////////// \\\\\\\\
seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
| |
-----------circle--------------
@@ -1164,7 +1348,95 @@ server performs hierarchical clustering up to 3 steps.
The CD-HIT-454 web server is also available from [[http://cd-hit.org]].
-
+===== Use cases =====
+Here, a use case is defined as a sequence clustering related problem or application that cannot be easily solved with existing clustering approaches, such as CD-HIT. However, it is feasible to solve such a use case by customizing current clustering algorithms or utilizing current approach in a very intelligent way or non-standard manner. In the last years, we have developed many use cases in addressing various problems. We will release these use cases after additional testing. These use [...]
+
+===== CD-HIT-OTU-MiSeq =====
+This use case is developed for clustering 16S rRNA genes into OTUs for microbiome studies. In recent years, Illumina MiSeq sequencers became dominant in 16S rRNA sequencing. The Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately assembled because of the poor quality at the 3' ends of both PE reads in the overlapping region. This causes many sequences to be discarded in the analysis. CD-HIT-OTU-MiSeq has unique features to cluster MiSeq 16S seq [...]
+ - The package can cluster PE reads without joining them into contigs.
+ - Users can choose a high quality portion of the PE reads for analysis (e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
+ - We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length 16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE reference database together with samples, so we can derive Operational Taxonomic Units (OTUs) and annotate these OTUs concurrently.
+ - Chimeric sequences are effectively identified through both de novo and reference-based approaches.
+
+The most important unique feature of CD-HIT-OTU-MiSeq is to only use the high quality region at the 5' ends of R1 and R2 reads. For example, the effective read length can be 200 bases for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with spliced PE sequences from the reference database to derive OTUs (Figure).
+
+{{:cd-hit-otu-miseq-figure-1.png|}}
+
+==== Installation ====
+First download and install full cd-hit package
+ * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
+ * unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
+ * change dir by "cd cd-hit-v4.6.2-2015-0511"
+ * compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
+ * cd cd-hit-auxtools
+ * compile cd-hit-auxtools by "make"
+ * CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
+
+CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from [[http://www.usadellab.org/cms/?page=trimmomatic]] or [[https://github.com/timflutre/trimmomatic]]. We also have a copy at [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
+
+ * modify NG-Omics-Miseq-16S.pl
+Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
+ $CD_HIT_dir = "PATH_to_cd-hit";
+ $NGS_prog_trimmomatic = "PATH/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
+
+==== Download reference and sample datasets ====
+Reference database and sample datasets can be downloaded from [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
+
+The reference database Greengene-13-5-99.fasta.gz was processed from original Greengene database, so that sequences with more specific annotations are at the beginning of the file. You need to download and gunzip it.
+
+You can also download Greengene and generate it. You should download Greengene from [[http://greengenes.secondgenome.com/downloads]], or [[ftp://greengenes.microbio.me/]]. Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta. There is a script: usecases/Miseq-16S/greengene-ann1.pl.
+
+Commands:
 ./greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
+
+The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
+
+==== Usage ====
+
+**Step 1. prepare fastq files and sample file:** Most projects have multiple samples sequenced at the same region. You should already have paired ended fastq files for these samples, put them in a working directory in similar way as the testing datasets, where the R1.fq and R2.fq are placed in separate folder for each sample. So in the working directory, you should have files:
+ sample_name_1/R1.fq
+ sample_name_1/R2.fq
+ sample_name_2/R1.fq
+ sample_name_2/R2.fq
+ ...
+ sample_name_N/R1.fq
+ sample_name_N/R2.fq
+
+Then, please prepare a sample file in the working directory. The file should look like:
+ sample_name_1 R1.fq R2.fq
+ sample_name_2 R1.fq R2.fq
+ sample_name_N R1.fq R2.fq
+
+**Step 2. Reference database preparation:** We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva, into PE sequences. If there are multiple samples in a project sequenced with the same amplicon of the same variable region, only one spliced reference database is needed. To run:
+
 path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_1/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
+Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file. This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
+
+**Step 3. Run sequence QC and OTU clustering for each sample:**. In the working directory, run
+ PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+where: 150 and 100 are the effective lengths, 0.97 is the OTU clustering cutoff, 0.0001 is the abundance cutoff, 75 is the length for chimeric checking at each R1 and R2 read
+
+This command will generate shell scripts for QC and for OTU for each sample. The scripts will be in WF-sh folder. You can first run the qc.sample_name.sh and then run otu.sample_name.sh
+
+NG-Omics-WF.pl [[https://github.com/weizhongli/ngomicswf]] is a very powerful workflow and pipeline tool developed in our group. It is not fully released yet, since we need more time to document this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples. First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36, then
+ nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
+
+After the job finished, the OTU results will be in sample_name/otu folder, important files include
+ * OTU.clstr: file lists all clusters and sequences
+ * removed_chimeric*: chimeric sequences removed
+ * small_clusters.list: low abundance small clusters removed
+
+**Step 4. pool all the samples together:** Please run
+ PATH_to_cd-hit-dir/usecases/pool_samples.pl -s sample_file -o pooled_sample.
+This will pool sequences from all samples and re-run OTU clustering. We can pool hundreds of samples without problem. After the job finishes, additional files will be available from the pooled_sample directory
+ * OTU.clstr: file lists all clusters and sequences from all samples
+ * removed_chimeric*: chimeric sequences removed
+ * small_clusters.list: low abundance small clusters removed
+ * OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
+ * OTU.biome: OTU.txt in biome format
+
+
+
===== References =====
If you find cd-hit helpful to your research and study, please kindly cite the
diff --git a/psi-cd-hit/psi-cd-hit-local.pl b/psi-cd-hit/psi-cd-hit-local-old.pl
similarity index 81%
copy from psi-cd-hit/psi-cd-hit-local.pl
copy to psi-cd-hit/psi-cd-hit-local-old.pl
index a77c7f7..f5ab1b1 100755
--- a/psi-cd-hit/psi-cd-hit-local.pl
+++ b/psi-cd-hit/psi-cd-hit-local-old.pl
@@ -15,16 +15,16 @@ our $circle = 0; #
our $opt_g = 1; ####################
our $blast_exe = "blastall -p blastp -m 8"; #########################
our $prof_exe = "blastpgp -m 8"; #
-our $prof_para = "-j 3 -F F -e 0.001 -b 500 -v 500"; #
+our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
our $prof_db = ""; #
-our $bl_para = "-F F -e 0.000001 -b 100000 -v 100000"; # program
+our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
our $bl_STDIN = 1; #
our $keep_bl = 0; #
our $blast_prog= "blastp"; #
our $formatdb = "formatdb"; #########################
our $exec_mode = "local"; #######################
-our $host_no = 1; #
-our $core_no = 1; # compute
+our $num_qsub = 1; #
+our $para_no = 1; # compute
our $sh_file = ""; #
our $batch_no_per_node = 50; #######################
our $reformat_seg = 50000;
@@ -44,6 +44,12 @@ our $tmp_db;
our $remote_perl_script;
our $remote_sh_script;
our $bl_path;
+our $bl_plus = 1; #### use blast+
+our $bl_threads = 1;
+our $skip_long = 0;
+our %qsub_ids = (); #### a list of qsub ids
+our %qstat_xml_data = ();
+
sub parse_para_etc {
my ($arg, $cmd);
@@ -60,18 +66,21 @@ sub parse_para_etc {
elsif ($arg eq "-aS") { $opt_aS = shift; }
elsif ($arg eq "-g") { $opt_g = shift; }
elsif ($arg eq "-circle") { $circle = shift; }
+ elsif ($arg eq "-sl") { $skip_long = shift; }
## program
elsif ($arg eq "-prog") { $blast_prog= shift; }
elsif ($arg eq "-p") { $prof_para = shift; }
- elsif ($arg eq "-dprof") { $prof_db = shift; }
+ elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
elsif ($arg eq "-s") { $bl_para = shift; }
elsif ($arg eq "-k") { $keep_bl = shift; }
elsif ($arg eq "-bs") { $bl_STDIN = shift; }
## compute
elsif ($arg eq "-exec") { $exec_mode = shift; }
- elsif ($arg eq "-host") { $host_no = shift; }
- elsif ($arg eq "-core") { $core_no = shift; }
+ elsif ($arg eq "-host") { $num_qsub = shift; }
+ elsif ($arg eq "-para") { $para_no = shift; }
elsif ($arg eq "-shf") { $sh_file = shift; }
+ elsif ($arg eq "-blp") { $bl_threads = shift; }
+ elsif ($arg eq "-bat") { $batch_no_per_node = shift; }
## job:
elsif ($arg eq "-rs") { $restart_seg = shift; }
elsif ($arg eq "-rf") { $reformat_seg= shift; }
@@ -95,7 +104,29 @@ sub parse_para_etc {
$blast_exe = "megablast -H 100 -D 2 -m 8";
}
elsif ($blast_prog eq "blastpgp") {
- $blast_exe = ($prof_db) ? "blastpgp -m 8" : "blastpgp -m 8 -j 3";
+ $blast_exe = "blastpgp -m 8 -j 3";
+ }
+
+ #### for blast+
+ if ($bl_plus) {
+ $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
+ $blast_exe = "blastp -outfmt 6";
+ $bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+
+ if ($blast_prog eq "blastn") {
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
+ $blast_exe = "blastp -task blastn -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "megablast") {
+ $blast_prog = "blastn"; #### back to blastn for blast parser type
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
+ $blast_exe = "blastp -task megablast -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "blastpgp") {
+ $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
+ }
}
if ($bl_path) {
@@ -138,6 +169,7 @@ sub read_db {
$seq =~ s/\s//g;
if (length($seq) > $len_t) { add_seq($des, $seq); }
$des = $ll; $seq = "";
+
}
else { $seq .= $ll; }
}
@@ -155,6 +187,7 @@ sub read_db {
sub add_seq {
my ($des, $seq) = @_;
+ $des =~ s/\s.+$//;
push(@seqs, $seq);
push(@dess, $des);
push(@lens, length($seq));
@@ -180,6 +213,10 @@ sub open_LOG {
}
########## END open_LOG
+sub write_LOG {
+ my $txt=shift;
+ print LOG "$txt\n";
+}
{## use static variables
my $last_NR90_no=0;
@@ -223,7 +260,7 @@ sub close_LOG {
sub total_remote_cpu {
my ($i, $j, $k, $ll);
my $tt = 0;
- for ($j=0; $j<$host_no; $j++) {
+ for ($j=0; $j<$num_qsub; $j++) {
open(TCPU, "$seq_dir/host.$j.cpu") || next;
while($ll = <TCPU>) {
chop($ll);
@@ -482,6 +519,7 @@ sub keep_top_hsp {
for ($i=0; $i<$self->{no}; $i++) {
my $p = $self->{sbj}->[$i];
my ($id1, $len_sub) = split(/\./, $p->{id});
+ next unless ($len_sub >0) ;
if (not defined($id_exist{$id1})) {
$id_exist{$id1} = 1;
@@ -569,6 +607,7 @@ sub process_blout_blastp_blastn {
my $frame = $p->{frame};
if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
my $iden = $p->{iden};
+ next unless (($len_sub >0) and ($len_rep>0));
my $cov_aS = $p->{alnln} / $len_sub;
my $cov_aL = $p->{alnln} / $len_rep;
my $exp1 = $p->{expect};
@@ -592,6 +631,7 @@ sub process_blout_blastp_blastn {
my ($id1, $len_sub) = split(/\./, $p->{id});
my $frame = $p->{frame};
if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
+ next unless (($len_sub >0) and ($len_rep>0));
if ($hsp_no) {
if ($id1 ne $hsp[0]->[0]) {
@@ -747,6 +787,7 @@ sub readblast_m8 {
my $frame = "";
$frame .= ($lls[6] < $lls[7]) ? "+" : "-";
$frame .= ($lls[8] < $lls[9]) ? "+" : "-";
+ next unless ($lls[0] and $lls[1]);
$this_sbj[$no] = {
'qid' => $lls[0],
'id' => $lls[1],
@@ -791,6 +832,7 @@ sub blast_formatdb {
for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
$i = $NR_idx[$i0];
last if ($idens[$i] eq "*"); ### last if reach rep
+ next if ($lens[$i] < $opt_aL_lower_band);
next if ($passeds[$i] and ($opt_g==0));
my $seq = $seqs[$i];
$seq =~ s/(.{70})/$1\n/g;
@@ -813,7 +855,10 @@ sub blast_formatdb {
return(0, 0) unless ($j > 0);
- my $cmd = `$formatdb -i $tmp_db`;
+ my $cmd_line = "$formatdb -i $tmp_db";
+ $cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
+ my $cmd = `$cmd_line`;
+
((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) ||
((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) ||
((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) ||
@@ -895,6 +940,10 @@ Options
seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
| |
-----------circle--------------
+ -sl, length of very long sequences to be skipped, default 0, no skipping
+ e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
+ without clustering, to save time, especially when there is -aL option in place, very
+ long sequences will not be clustered anyway.
program:
-prog (blastp, blastn, megablast, blastpgp), default blastp
-p profile search para, default
@@ -911,15 +960,32 @@ Options
this program writes a shell script to run blast, this script is
either performed locally by sh or remotely by qsub
with qsub, you can use PBS, SGE etc
- -host number of hosts for qsub
- -core number of cpu cores per computer, default 1
+ -host number of hosts, ie number of qsub jobs
+ -para number of parallel blast job per qsub job (each blast can use multi cores), default 1
+ -blp number of threads per blast job, default 1
+ number of threads per blast job X number of parallel blast job per qsub job
+ should <= the number of cores in your computer
+ if your computer grid has 32 cores / node, do either of the followings
+ -para 4 -blp 8
+ -para 8 -blp 4
+ -para 16 -blp 2
+ -para 32 -blp 1
+ -bat number of sequences a blast job to process
-shf a filename for add local settings into the job shell script
for example, when you run PBS jobs, you can add quene name etc in this
file and this script will add them into the job shell script
-e.g. your file may have followings
+e.g. template file for PBS
+#!/bin/sh
#PBS -v PATH
#PBS -l walltime=8:00:00
-#PBS -q jobqueue
+#PBS -q job_queue.q
+
+e.g. template file for SGE or OGE
+#!/bin/sh
+#\$ -v PATH
+#\$ -q job_queue.q
+#\$ -V
+#\$ -pe orte 8
job:
-rs steps of save restart file and clustering output, default 5000
@@ -932,7 +998,7 @@ e.g. your file may have followings
if program clustered 200,000 seqs, it remove them from seq
pool, and re format blast db to save time
-J job, job_file, exe specific jobs like parse blast outonly
- DON'T use it, it is only used by this program itself
+ DO NOT use it, it is only used by this program itself
-k (1/0) keep blast raw output file, default $keep_bl
-P path to executables
@@ -962,14 +1028,46 @@ EOD
## while let nodes run them autoly
sub run_batch_blast3 {
my $i0 = shift;
- my ($id, $i, $j, $k);
+ my ($id, $i, $j, $k, $cmd);
- my $total_jobs = $batch_no_per_node * $host_no * $core_no;
+ #### wait before qsubs
+ if ($exec_mode eq "qsub") {
+ while(1) {
+ SGE_qstat_xml_query();
+ last unless (%qsub_ids);
+
+ my $wait_flag = 0;
+ foreach my $qsub_id (keys %qsub_ids) {
+ if (defined($qstat_xml_data{$qsub_id})) { #### still running
+ $wait_flag = 1;
+ $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
+ print LOG "force delete un necessary job $qsub_id\n";
+ }
+ else {
+ delete $qsub_ids{$qsub_id};
+ }
+ }
+
+ if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
+ }
+
+ #### delete seq files from last batch
+ opendir(DIR1, $seq_dir);
+ my @files = grep { /^\d/ } readdir(DIR1);
+ closedir(DIR1);
+ foreach $i (@files) {
+ $cmd = `rm -f $seq_dir/$i`;
+ print LOG "remove un necessary seq file $i\n"
+ }
+ }
+
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
for ($k=0; $i0<$NR_no; $i0++) {
$id = $NR_idx[$i0];
next if ($passeds[$id]);
next if ($in_bg[$id]);
+ next if ($lens[$id] < $opt_aL_upper_band);
$in_bg[$id] = 1;
my $seq = $seqs[$id];
@@ -982,14 +1080,18 @@ sub run_batch_blast3 {
}
if ($exec_mode eq "qsub") {
- for ($j=0; $j<$host_no; $j++) {
+ for ($j=0; $j<$num_qsub; $j++) {
my $t = "psi-cd-hit-$j";
- print LOG "PBS querying $j\n";
my $cmd = `qsub -N $t $remote_sh_script`;
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+ print LOG "qsub querying $j, PID $qsub_id\n";
+ $qsub_ids{$qsub_id} = 1;
}
}
elsif ($exec_mode eq "local") {
- my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
+ #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
+ my $cmd = `sh $remote_sh_script`;
}
return;
@@ -999,23 +1101,24 @@ sub run_batch_blast3 {
sub write_remote_sh_script {
my ($i, $j, $k);
- my $local_sh = "";
+ my $local_sh = <<EOD;
+#!/bin/sh
+#PBS -v PATH
+#\$ -v PATH
+EOD
+
if ($sh_file) {
$local_sh = `cat $sh_file`;
}
open(RESH, "> $remote_sh_script") || die;
print RESH <<EOD;
-#!/bin/bash
-#\$ -S /bin/bash
-#\$ -v PATH
-#PBS -v PATH
$local_sh
cd $pwd
EOD
- for ($k=0; $k<$core_no; $k++){
+ for ($k=0; $k<$para_no; $k++){
print RESH "./$remote_perl_script $k&\n"
}
print RESH "wait\n\n";
@@ -1027,11 +1130,11 @@ EOD
sub write_remote_perl_script {
my $dir1 = ".";
- my $bl2 = ($prof_db) ?
- "$blast_exe -d $dir1/$tmp_db $bl_para -R $bl_dir/\$id.prof":
- "$blast_exe -d $dir1/$tmp_db $bl_para";
- my $cc = ($prof_db) ? 1 : 0;
- if ($prof_db) { my $cmd=`formatdb -i $prof_db`; }
+ my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
+ $bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
+
+ my $opti = "-i"; $opti = "-query" if ($bl_plus);
+ my $opto = "-o"; $opto = "-out" if ($bl_plus);
open(REPERL, "> $remote_perl_script") || die;
print REPERL <<EOD;
@@ -1063,20 +1166,15 @@ foreach \$id (\@ids) {
next if (-e "$seq_dir/\$id.lock");
\$cmd = `touch $seq_dir/\$id.lock`;
- if ($cc) {
- \$cmd = `$prof_exe -d $prof_db $prof_para -i $seq_dir/\$id -C $bl_dir/\$id.prof`;
- }
-
if ($bl_STDIN) {
- \$cmd = `$bl2 -i $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
+ \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
}
else {
- \$cmd = `$bl2 -i $seq_dir/\$id -o $bl_dir/\$id`;
+ \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
\$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
}
\$cmd = `rm -f $seq_dir/\$id`;
\$cmd = `rm -f $seq_dir/\$id.lock`;
- if ($cc) { \$cmd = `rm -f $bl_dir/\$id.prof`; }
}
(\$tu, \$ts, \$cu, \$cs) = times();
@@ -1111,6 +1209,77 @@ sub wait_blast_out {
########## END wait_blast_out
+sub SGE_qstat_xml_query {
+ my ($i, $j, $k, $cmd, $ll);
+ %qstat_xml_data = (); #### global
+ $cmd = `qstat -f -xml`;
+ if ($cmd =~ /<queue_info/) { #### dummy
+ $qstat_xml_data{"NULL"}= ["NULL","NULL"];
+ }
+ my $tmp = <<EOD;
+<?xml version='1.0'?>
+<job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
+ <queue_info>
+ <Queue-List>
+ <name>all.q\@master</name>
+ <qtype>BIP</qtype>
+ <slots_used>0</slots_used>
+ <slots_resv>0</slots_resv>
+ <slots_total>0</slots_total>
+ <load_avg>0.08000</load_avg>
+ <arch>linux-x64</arch>
+ </Queue-List>
+...
+ <Queue-List>
+ <name>all.q\@node016</name>
+ <qtype>BIP</qtype>
+ <slots_used>32</slots_used>
+ <slots_resv>0</slots_resv>
+ <slots_total>32</slots_total>
+ <load_avg>42.59000</load_avg>
+ <arch>linux-x64</arch>
+ <job_list state="running"> ####### running jobs in this section
+ <JB_job_number>3535</JB_job_number>
+ <JAT_prio>0.51468</JAT_prio>
+ <JB_name>cd-hit</JB_name>
+ <JB_owner>ubuntu</JB_owner>
+ <state>r</state>
+ <slots>4</slots>
+ </job_list>
+...
+ </queue_info>
+ <job_info>
+ <job_list state="pending"> ######## pending jobs in this section
+ <JB_job_number>3784</JB_job_number>
+ <JAT_prio>0.60500</JAT_prio>
+ <JB_name>cd-hit</JB_name>
+ <JB_owner>ubuntu</JB_owner>
+ <state>qw</state>
+ <slots>32</slots>
+ </job_list>
+...
+ </job_info>
+</job_info>
+
+EOD
+ my @lls = split(/\n/, $cmd);
+ $i = 2; #### skip first 2 lines
+ for (; $i<$#lls+1; $i++) {
+ if ($lls[$i] =~ /<job_list/) {
+ my ($id, $name, $state);
+ for (; $i<$#lls+1; $i++) {
+ last if ($lls[$i] =~ /<\/job_list/);
+ if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
+ if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
+ if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
+ }
+ if (defined($id) and defined($name) and defined($state)) {
+ $qstat_xml_data{$id} = [$name, $state];
+ }
+ }
+ }
+}
+
1;
diff --git a/psi-cd-hit/psi-cd-hit-local.pl b/psi-cd-hit/psi-cd-hit-local.pl
index a77c7f7..3122f17 100755
--- a/psi-cd-hit/psi-cd-hit-local.pl
+++ b/psi-cd-hit/psi-cd-hit-local.pl
@@ -3,8 +3,8 @@
######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
################################################################################
our $pid = $$;
-our $db_in = ""; ###################
-our $db_out = ""; # input / output
+our $db_in; ###################
+our $db_out; # input / output
our $len_t = 10; ###################
our $NR_clstr = 0.3; #
our $NR_clstre = -1; #thresholds
@@ -15,18 +15,19 @@ our $circle = 0; #
our $opt_g = 1; ####################
our $blast_exe = "blastall -p blastp -m 8"; #########################
our $prof_exe = "blastpgp -m 8"; #
-our $prof_para = "-j 3 -F F -e 0.001 -b 500 -v 500"; #
+our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
our $prof_db = ""; #
-our $bl_para = "-F F -e 0.000001 -b 100000 -v 100000"; # program
+our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
our $bl_STDIN = 1; #
our $keep_bl = 0; #
our $blast_prog= "blastp"; #
our $formatdb = "formatdb"; #########################
our $exec_mode = "local"; #######################
-our $host_no = 1; #
-our $core_no = 1; # compute
+our $num_qsub = 1; #
+our $para_no = 1; # compute
our $sh_file = ""; #
-our $batch_no_per_node = 50; #######################
+our $num_multi_seq = 50; #
+our $batch_no_per_node = 100; #######################
our $reformat_seg = 50000;
our $restart_seg = 20000;
our $job = "";
@@ -39,11 +40,20 @@ our $db_log;
our $db_out1;
our $seq_dir;
our $bl_dir;
+our $blm_dir;
our $restart_file;
our $tmp_db;
our $remote_perl_script;
our $remote_sh_script;
our $bl_path;
+our $bl_plus = 1; #### use blast+
+our $bl_threads = 1;
+our $skip_long = 0;
+our %qsub_ids = (); #### a list of qsub ids
+our %qstat_xml_data = ();
+our @blm8_buffer = ();
+our %blm8_data = ();
+
sub parse_para_etc {
my ($arg, $cmd);
@@ -60,18 +70,21 @@ sub parse_para_etc {
elsif ($arg eq "-aS") { $opt_aS = shift; }
elsif ($arg eq "-g") { $opt_g = shift; }
elsif ($arg eq "-circle") { $circle = shift; }
+ elsif ($arg eq "-sl") { $skip_long = shift; }
## program
elsif ($arg eq "-prog") { $blast_prog= shift; }
elsif ($arg eq "-p") { $prof_para = shift; }
- elsif ($arg eq "-dprof") { $prof_db = shift; }
+ elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
elsif ($arg eq "-s") { $bl_para = shift; }
elsif ($arg eq "-k") { $keep_bl = shift; }
elsif ($arg eq "-bs") { $bl_STDIN = shift; }
## compute
elsif ($arg eq "-exec") { $exec_mode = shift; }
- elsif ($arg eq "-host") { $host_no = shift; }
- elsif ($arg eq "-core") { $core_no = shift; }
+ elsif ($arg eq "-host") { $num_qsub = shift; }
+ elsif ($arg eq "-para") { $para_no = shift; }
elsif ($arg eq "-shf") { $sh_file = shift; }
+ elsif ($arg eq "-blp") { $bl_threads = shift; }
+ elsif ($arg eq "-bat") { $batch_no_per_node = shift; }
## job:
elsif ($arg eq "-rs") { $restart_seg = shift; }
elsif ($arg eq "-rf") { $reformat_seg= shift; }
@@ -83,7 +96,12 @@ sub parse_para_etc {
}
# speical jobs
- if ($job eq "parse_blout") { job_parse_blout(); exit();}
+ if ($job eq "parse_blout") { job_parse_blout(); exit();}
+ elsif ($job eq "parse_blout_multi") { job_parse_blout_multi(); exit();}
+
+ if (not (defined($db_in) and defined($db_out))) {
+ print_usage(); exit();
+ }
if ($blast_prog eq "blastn") {
$formatdb = "formatdb -p F";
@@ -95,7 +113,29 @@ sub parse_para_etc {
$blast_exe = "megablast -H 100 -D 2 -m 8";
}
elsif ($blast_prog eq "blastpgp") {
- $blast_exe = ($prof_db) ? "blastpgp -m 8" : "blastpgp -m 8 -j 3";
+ $blast_exe = "blastpgp -m 8 -j 3";
+ }
+
+ #### for blast+
+ if ($bl_plus) {
+ $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
+ $blast_exe = "blastp -outfmt 6";
+ $bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+
+ if ($blast_prog eq "blastn") {
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
+ $blast_exe = "blastn -task blastn -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "megablast") {
+ $blast_prog = "blastn"; #### back to blastn for blast parser type
+ $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
+ $blast_exe = "blastn -task megablast -outfmt 6";
+ $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
+ }
+ elsif ($blast_prog eq "blastpgp") {
+ $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
+ }
}
if ($bl_path) {
@@ -111,13 +151,14 @@ sub parse_para_etc {
$db_out1 = "$db_out.out";
$seq_dir = "$db_in-seq";
$bl_dir = "$db_in-bl";
+ $blm_dir = "$db_in-blm";
$restart_file =" $db_out.restart";
$tmp_db = "$db_in.$pid";
$remote_perl_script = "$tmp_db-bl.pl";
$remote_sh_script = "$tmp_db-bl.sh";
- $cmd = `mkdir $bl_dir $seq_dir`;
+ $cmd = `mkdir $bl_dir $blm_dir $seq_dir`;
write_remote_perl_script();
write_remote_sh_script();
@@ -138,6 +179,7 @@ sub read_db {
$seq =~ s/\s//g;
if (length($seq) > $len_t) { add_seq($des, $seq); }
$des = $ll; $seq = "";
+
}
else { $seq .= $ll; }
}
@@ -155,6 +197,7 @@ sub read_db {
sub add_seq {
my ($des, $seq) = @_;
+ $des =~ s/\s.+$//;
push(@seqs, $seq);
push(@dess, $des);
push(@lens, length($seq));
@@ -180,6 +223,10 @@ sub open_LOG {
}
########## END open_LOG
+sub write_LOG {
+ my $txt=shift;
+ print LOG "$txt\n";
+}
{## use static variables
my $last_NR90_no=0;
@@ -223,7 +270,7 @@ sub close_LOG {
sub total_remote_cpu {
my ($i, $j, $k, $ll);
my $tt = 0;
- for ($j=0; $j<$host_no; $j++) {
+ for ($j=0; $j<$num_qsub; $j++) {
open(TCPU, "$seq_dir/host.$j.cpu") || next;
while($ll = <TCPU>) {
chop($ll);
@@ -235,6 +282,52 @@ sub total_remote_cpu {
}
########## END total_remote_cpu
+#### process m8 format output from multi-query search
+sub job_parse_blout_multi{
+ my ($i, $j, $k, $tfh, $ll, $t1, $t2);
+
+ $tfh="BLM8";
+ open($tfh, $job_file) || die "can not open $job_file";
+
+ @blm8_buffer = ();
+ my $last_id = "";
+ my $this_id = "";
+ my $tquery;
+ while($ll = <$tfh>) {
+ next if ($ll =~ /^#/);
+ ($this_id, $t1) = split(/\s+/, $ll, 2);
+
+ if (@blm8_buffer and ($this_id ne $last_id)) { #### blast results of last query
+ my @hits = process_blout_blastp_blastn();
+ $tquery = (split(/\./, $last_id))[0];
+ my $no1 = $#hits+1;
+ print ">$tquery\t$no1\n";
+ foreach $i (@hits) {
+ print join("\t", @{$i}), "\n";
+ }
+ print "#\n";
+ @blm8_buffer = ();
+ }
+ push(@blm8_buffer, $ll);
+ $last_id = $this_id;
+ }
+
+ if (@blm8_buffer and ($this_id ne $last_id)) { #### blast results of last query
+ my @hits = process_blout_blastp_blastn();
+ $tquery = (split(/\./, $last_id))[0];
+ my $no1 = $#hits+1;
+ print ">$tquery\t$no1\n";
+ foreach $i (@hits) {
+ print join("\t", @{$i}), "\n";
+ }
+ print "#\n";
+ @blm8_buffer = ();
+ }
+ close($tfh);
+ return;
+}
+########## END job_parse_blout_multi
+
sub job_parse_blout {
my ($i, $j, $k);
@@ -377,6 +470,45 @@ sub remove_raw_blout_bg {
}
########## END remove_raw_blout_bg
+sub fish_other_homolog_multi {
+ my ($i, $j, $k, $i0, $j0, $k0);
+ $id = shift; # real idx, not sorted idx
+ my @hits = ();
+
+ if (defined($blm8_data{$id})) {
+ @hits = @{$blm8_data{$id}};
+ }
+
+ my $rep_len = $lens[$id];
+
+ foreach $i (@hits) {
+ my $id1 = $i->[0];
+ next unless ($id1 < $NR_no);
+ next if ($idens[$id1] eq "*"); #existing reps
+ next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep
+
+ if ( $passeds[$id1] ) { #### if this hit is better -g 1 mode
+ my $old_e = (split(/\//,$idens[$id1]))[0];
+ if ($i->[3] < $old_e) {
+ $idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%";
+ $passeds[$id1] = 1;
+ $NR_clstr_nos[$id1] = $NR90_no;
+ }
+ next;
+ }
+
+ $idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%";
+ $passeds[$id1] = 1;
+ $NR_clstr_nos[$id1] = $NR90_no;
+ $NR_passed++;
+ }
+ if (defined($blm8_data{$id})) {
+ delete $blm8_data{$id};
+ }
+ return;
+}
+########## END fish_other_homolog_multi
+
sub fish_other_homolog {
my ($i, $j, $k, $i0, $j0, $k0);
@@ -482,6 +614,7 @@ sub keep_top_hsp {
for ($i=0; $i<$self->{no}; $i++) {
my $p = $self->{sbj}->[$i];
my ($id1, $len_sub) = split(/\./, $p->{id});
+ next unless ($len_sub >0) ;
if (not defined($id_exist{$id1})) {
$id_exist{$id1} = 1;
@@ -556,7 +689,7 @@ sub process_blout_blastp_blastn {
#### need $len_rep
my $len_rep = 0;
- my $bl = readblast_m8("", $blout);
+ my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
@@ -569,6 +702,7 @@ sub process_blout_blastp_blastn {
my $frame = $p->{frame};
if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
my $iden = $p->{iden};
+ next unless (($len_sub >0) and ($len_rep>0));
my $cov_aS = $p->{alnln} / $len_sub;
my $cov_aL = $p->{alnln} / $len_rep;
my $exp1 = $p->{expect};
@@ -592,6 +726,7 @@ sub process_blout_blastp_blastn {
my ($id1, $len_sub) = split(/\./, $p->{id});
my $frame = $p->{frame};
if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
+ next unless (($len_sub >0) and ($len_rep>0));
if ($hsp_no) {
if ($id1 ne $hsp[0]->[0]) {
@@ -730,6 +865,49 @@ sub cross1_before_2013_0818 {
}
########## END cross1
+sub readblast_m8_buffer {
+ my ($i, $j, $k, $ll, $no);
+ my @this_sbj = ();
+ $no = 0;
+ while($ll = shift @blm8_buffer) {
+ chop($ll);
+ my @lls = split(/\t/,$ll);
+ my $frame = "";
+ $frame .= ($lls[6] < $lls[7]) ? "+" : "-";
+ $frame .= ($lls[8] < $lls[9]) ? "+" : "-";
+ next unless ($lls[0] and $lls[1]);
+ $this_sbj[$no] = {
+ 'qid' => $lls[0],
+ 'id' => $lls[1],
+ 'iden' => $lls[2],
+ 'alnln' => $lls[3],
+ 'ms' => $lls[4],
+ 'gap' => $lls[5],
+ 'qfrom' => $lls[6],
+ 'qend' => $lls[7],
+ 'sfrom' => $lls[8],
+ 'send' => $lls[9],
+ 'expect' => $lls[10],
+ 'score' => $lls[11],
+ 'frame' => $frame,
+ };
+
+ $no++;
+# BLASTP 2.2.24 [Aug-08-2010]
+# Query: gi|388328107|pdb|4DDG|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
+# Database: pdbaa.fa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81 171 9 3 6 171 1 171 6e-89 323
+#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51 86 3 0 235 320 155 240 2e-41 166
+ }
+ my $self = {
+ 'no' => $no,
+ 'sbj' => [@this_sbj],
+ };
+ return $self;
+}
+########## END readblast_m8
+
sub readblast_m8 {
my ($i, $j, $k, $ll, $no);
my ($q_seq, $filename) = @_;
@@ -747,6 +925,7 @@ sub readblast_m8 {
my $frame = "";
$frame .= ($lls[6] < $lls[7]) ? "+" : "-";
$frame .= ($lls[8] < $lls[9]) ? "+" : "-";
+ next unless ($lls[0] and $lls[1]);
$this_sbj[$no] = {
'qid' => $lls[0],
'id' => $lls[1],
@@ -791,6 +970,7 @@ sub blast_formatdb {
for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
$i = $NR_idx[$i0];
last if ($idens[$i] eq "*"); ### last if reach rep
+ next if ($lens[$i] < $opt_aL_lower_band);
next if ($passeds[$i] and ($opt_g==0));
my $seq = $seqs[$i];
$seq =~ s/(.{70})/$1\n/g;
@@ -813,7 +993,10 @@ sub blast_formatdb {
return(0, 0) unless ($j > 0);
- my $cmd = `$formatdb -i $tmp_db`;
+ my $cmd_line = "$formatdb -i $tmp_db";
+ $cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
+ my $cmd = `$cmd_line`;
+
((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) ||
((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) ||
((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) ||
@@ -842,15 +1025,15 @@ Options
input/output:
-i in_dbname, required
-o out_dbname, required
- -l length_of_throw_away_sequences, default 10
+ -l length_of_throw_away_sequences, default $len_t
thresholds:
- -c clustering threshold (sequence identity), default 0.3
- -ce clustering threshold (blast expect), default -1,
+ -c clustering threshold (sequence identity), default $NR_clstr
+ -ce clustering threshold (blast expect), default $NR_clstre,
it means by default it doesn't use expect threshold,
but with positive value, the program cluster seqs if similarities
meet either identity threshold or expect threshold
- -G (1/0) use global identity? default 1
+ -G (1/0) use global identity? default $g_iden
two sequences Long (i.e. representative) and Short (redunant) may have multiple
alignment fragments (i.e. HSPs), see:
seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence
@@ -869,17 +1052,17 @@ Options
Local identity = identity of the top high score HSP
if you prefer to use -G 0, it is suggested that you also
use -aS, -aL, such as -aS 0.8, to prevent very short matches.
- -aL alignment coverage for the longer sequence, default 0.0
+ -aL alignment coverage for the longer sequence, default $opt_aL
if set to 0.9, the alignment must covers 90% of the sequence
- -aS alignment coverage for the shorter sequence, default 0.0
+ -aS alignment coverage for the shorter sequence, default $opt_aS
if set to 0.9, the alignment must covers 90% of the sequence
- -g (1/0), default 0
+ -g (1/0), default $opt_g
by cd-hit's default algorithm, a sequence is clustered to the first
cluster that meet the threshold (fast cluster). If set to 1, the program
will cluster it into the most similar cluster that meet the threshold
(accurate but slow mode)
but either 1 or 0 won't change the representatives of final clusters
- -circle (1/0), default 0
+ -circle (1/0), default $circle
when set to 1, treat sequences as circular sequence.
bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary,
the 2 HSPs below will be treated as non co-linear with -circle 0
@@ -887,55 +1070,78 @@ Options
-------------circle-----------
| |
seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
- \\\\\\\\ /////////////
- \\\\\\\\ /////////////
+ \\\\\\\\\\\\\\\\ /////////////
+ \\\\\\\\\\\\\\\\ /////////////
HSP 2 -> ////HSP 1 /// <-HSP 2
- ///////////// \\\\\\\\
- ///////////// \\\\\\\\
+ ///////////// \\\\\\\\\\\\\\\\
+ ///////////// \\\\\\\\\\\\\\\\
seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
| |
-----------circle--------------
+ -sl, length of very long sequences to be skipped, default $skip_long,
+ e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
+ without clustering, to save time, especially when there is -aL option in place, very
+ long sequences will not be clustered anyway.
+ -sl 0 means no skipping
program:
- -prog (blastp, blastn, megablast, blastpgp), default blastp
- -p profile search para, default
- "-j 3 -F F -e 0.001 -b 500 -v 500"
+ -prog (blastp, blastn, megablast, blastpgp), default $blast_prog
+ -p profile search para, default
+ "$prof_para"
-dprof database for building PSSM, default using input
you can also use another database that is more comprehensive like NR80
- -s blast search para, default
- "-F F -e 0.000001 -b 100000 -v 100000"
- -bs (1/0) default 1
+ -s blast search para, default
+ "$bl_para"
+ -bs (1/0) default $bl_STDIN
pipe blast results from into parser instead of save in hard drive (save time)
compute:
- -exec (qsub, local) default local
+ -exec (qsub, local) default $exec_mode
this program writes a shell script to run blast, this script is
either performed locally by sh or remotely by qsub
with qsub, you can use PBS, SGE etc
- -host number of hosts for qsub
- -core number of cpu cores per computer, default 1
+ -host number of qsub jobs, default $num_qsub
+ -para number of parallel blast job per qsub job (each blast can use multi cores), default $para_no
+ one qsub script can run multiple blast jobs
+ -blp number of threads per blast job, default $bl_threads
+ number of threads per blast job (option -blp) X number of parallel blast job per qsub job (option -para)
+ should <= the number of cores in your computer
+ if your computer grid has 32 cores / node, do either of the followings
+ -para 4 -blp 8
+ -para 8 -blp 4 preferred
+ -para 16 -blp 2
+ -para 32 -blp 1
+ -bat number of sequences a blast job to process, $batch_no_per_node
-shf a filename for add local settings into the job shell script
for example, when you run PBS jobs, you can add quene name etc in this
file and this script will add them into the job shell script
-e.g. your file may have followings
+e.g. template file for PBS
+#!/bin/sh
#PBS -v PATH
#PBS -l walltime=8:00:00
-#PBS -q jobqueue
+#PBS -q job_queue.q
+
+e.g. template file for SGE or OGE
+#!/bin/sh
+#\$ -v PATH
+#\$ -q job_queue.q
+#\$ -V
+#\$ -pe orte 8
job:
- -rs steps of save restart file and clustering output, default 5000
+ -rs steps of save restart file and clustering output, default $restart_seg
everytime after process 5000 sequences, program write a
restart file and current clustering information
-restart restart file, readin a restart file
if program crash, stoped, termitated, you can restart it by
add a option "-restart sth.restart"
- -rf steps of re format blast database, default 200,000
+ -rf steps of re format blast database, default $reformat_seg
if program clustered 200,000 seqs, it remove them from seq
pool, and re format blast db to save time
-J job, job_file, exe specific jobs like parse blast outonly
- DON'T use it, it is only used by this program itself
+ DO NOT use it, it is only used by this program itself
-k (1/0) keep blast raw output file, default $keep_bl
- -P path to executables
+ -P path to blast executables
EOD
@@ -958,18 +1164,144 @@ EOD
########## END print_usage
-## like above, but don't assign seqs to specific node
-## while let nodes run them autoly
+## copied from run_batch_blast3
+## run multi seq per sample
+## wait for all jobs to finish
+sub run_batch_blast3_multi {
+ my $i0 = shift;
+ my ($id, $i, $j, $k, $cmd, $ll);
+
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
+
+ for ($k=0; $i0<$NR_no; $i0++) {
+ $id = $NR_idx[$i0];
+ next if ($passeds[$id]);
+ next if ($in_bg[$id]);
+ next if ($lens[$id] < $opt_aL_upper_band);
+ $in_bg[$id] = 1;
+
+ my $seq = $seqs[$id];
+
+ if (($k % $num_multi_seq) ==0) { #### reopen
+ close(SEQ) if ($k > 0);
+ open(SEQ, "> $seq_dir/$id") || die "Can not write";
+ }
+ #print SEQ "$dess[$id]\n$seq\n";
+ print SEQ ">$id.$lens[$id]\n$seq\n";
+ $k++;
+ last if ($k >= $total_jobs);
+ }
+ close(SEQ);
+
+ if ($exec_mode eq "qsub") {
+ for ($j=0; $j<$num_qsub; $j++) {
+ my $t = "psi-cd-hit-$j";
+ my $cmd = `qsub -N $t $remote_sh_script $j`; #### pass $j to qsub command
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+ print LOG "qsub querying $j, PID $qsub_id\n";
+ $qsub_ids{$qsub_id} = 1;
+ }
+ }
+ elsif ($exec_mode eq "local") {
+ #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
+ my $cmd = `sh $remote_sh_script`;
+ }
+
+ #### wait finish all submitted
+ if ($exec_mode eq "qsub") {
+ while(1) {
+ SGE_qstat_xml_query();
+ last unless (%qsub_ids);
+
+ my $wait_flag = 0;
+ foreach my $qsub_id (keys %qsub_ids) {
+ if (defined($qstat_xml_data{$qsub_id})) { #### still running
+ $wait_flag = 1;
+ }
+ else {
+ delete $qsub_ids{$qsub_id};
+ }
+ }
+
+ if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
+ }
+ }
+
+ #### read in all parsed blast output
+ %blm8_data =();
+ opendir(BLMDIR, $blm_dir) || die "can not open $blm_dir";
+ my @bl_files = grep { /^\d/ } readdir(BLMDIR);
+ closedir(BLMDIR);
+
+ foreach my $blf (@bl_files) {
+ open(BLMTMP, "$blm_dir/$blf") || next;
+ while($ll = <BLMTMP>) {
+ next if ($ll =~ /^#/);
+ chop($ll);
+ if ($ll =~ /^>/) {
+
+ my ($id, $no1) = split(/\s+/, substr($ll,1));
+ my @hits = ();
+ for ($j=0; $j<$no1; $j++) {
+ $ll=<BLMTMP>; chop($ll);
+ push(@hits, [split(/\t/,$ll)]);
+ }
+ if ($no1>=1) {
+ $blm8_data{$id} = [@hits];
+ }
+ }
+ }
+ close(BLMTMP);
+
+ $cmd = `rm -f $blm_dir/$blf`;
+ print LOG "parse and then rm $blm_dir/$blf\n";
+ }
+ return;
+}
+
sub run_batch_blast3 {
my $i0 = shift;
- my ($id, $i, $j, $k);
+ my ($id, $i, $j, $k, $cmd);
- my $total_jobs = $batch_no_per_node * $host_no * $core_no;
+ #### wait before qsubs
+ if ($exec_mode eq "qsub") {
+ while(1) {
+ SGE_qstat_xml_query();
+ last unless (%qsub_ids);
+
+ my $wait_flag = 0;
+ foreach my $qsub_id (keys %qsub_ids) {
+ if (defined($qstat_xml_data{$qsub_id})) { #### still running
+ $wait_flag = 1;
+ $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
+ print LOG "force delete un necessary job $qsub_id\n";
+ }
+ else {
+ delete $qsub_ids{$qsub_id};
+ }
+ }
+
+ if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
+ }
+
+ #### delete seq files from last batch
+ opendir(DIR1, $seq_dir);
+ my @files = grep { /^\d/ } readdir(DIR1);
+ closedir(DIR1);
+ foreach $i (@files) {
+ $cmd = `rm -f $seq_dir/$i`;
+ print LOG "remove un necessary seq file $i\n"
+ }
+ }
+
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
for ($k=0; $i0<$NR_no; $i0++) {
$id = $NR_idx[$i0];
next if ($passeds[$id]);
next if ($in_bg[$id]);
+ next if ($lens[$id] < $opt_aL_upper_band);
$in_bg[$id] = 1;
my $seq = $seqs[$id];
@@ -982,14 +1314,18 @@ sub run_batch_blast3 {
}
if ($exec_mode eq "qsub") {
- for ($j=0; $j<$host_no; $j++) {
+ for ($j=0; $j<$num_qsub; $j++) {
my $t = "psi-cd-hit-$j";
- print LOG "PBS querying $j\n";
my $cmd = `qsub -N $t $remote_sh_script`;
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+ print LOG "qsub querying $j, PID $qsub_id\n";
+ $qsub_ids{$qsub_id} = 1;
}
}
elsif ($exec_mode eq "local") {
- my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
+ #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
+ my $cmd = `sh $remote_sh_script`;
}
return;
@@ -999,24 +1335,26 @@ sub run_batch_blast3 {
sub write_remote_sh_script {
my ($i, $j, $k);
- my $local_sh = "";
+ my $local_sh = <<EOD;
+#!/bin/sh
+#PBS -v PATH
+#\$ -v PATH
+EOD
+
if ($sh_file) {
$local_sh = `cat $sh_file`;
}
open(RESH, "> $remote_sh_script") || die;
print RESH <<EOD;
-#!/bin/bash
-#\$ -S /bin/bash
-#\$ -v PATH
-#PBS -v PATH
$local_sh
+para=\$1
cd $pwd
EOD
- for ($k=0; $k<$core_no; $k++){
- print RESH "./$remote_perl_script $k&\n"
+ for ($k=0; $k<$para_no; $k++){
+ print RESH "./$remote_perl_script $k \$para &\n"
}
print RESH "wait\n\n";
@@ -1027,16 +1365,17 @@ EOD
sub write_remote_perl_script {
my $dir1 = ".";
- my $bl2 = ($prof_db) ?
- "$blast_exe -d $dir1/$tmp_db $bl_para -R $bl_dir/\$id.prof":
- "$blast_exe -d $dir1/$tmp_db $bl_para";
- my $cc = ($prof_db) ? 1 : 0;
- if ($prof_db) { my $cmd=`formatdb -i $prof_db`; }
+ my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
+ $bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
+
+ my $opti = "-i"; $opti = "-query" if ($bl_plus);
+ my $opto = "-o"; $opto = "-out" if ($bl_plus);
open(REPERL, "> $remote_perl_script") || die;
print REPERL <<EOD;
#!/usr/bin/perl
\$host = shift;
+\$instance = shift;
\$arg = shift;
#### random sleep, rand() can be a fraction of second
@@ -1063,25 +1402,24 @@ foreach \$id (\@ids) {
next if (-e "$seq_dir/\$id.lock");
\$cmd = `touch $seq_dir/\$id.lock`;
- if ($cc) {
- \$cmd = `$prof_exe -d $prof_db $prof_para -i $seq_dir/\$id -C $bl_dir/\$id.prof`;
+ if ($num_multi_seq) {
+ \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
+ \$cmd = `$script_name -J parse_blout_multi $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0 >> $blm_dir/\$host.\$instance`;
}
-
- if ($bl_STDIN) {
- \$cmd = `$bl2 -i $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
+ elsif ($bl_STDIN) {
+ \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
}
else {
- \$cmd = `$bl2 -i $seq_dir/\$id -o $bl_dir/\$id`;
+ \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
\$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
}
\$cmd = `rm -f $seq_dir/\$id`;
\$cmd = `rm -f $seq_dir/\$id.lock`;
- if ($cc) { \$cmd = `rm -f $bl_dir/\$id.prof`; }
}
(\$tu, \$ts, \$cu, \$cs) = times();
\$tt = \$tu + \$ts + \$cu + \$cs;
-\$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
+\$cmd = `echo \$tt >> $seq_dir/host.\$host.\$instance.cpu`;
EOD
close(REPERL);
@@ -1111,6 +1449,77 @@ sub wait_blast_out {
########## END wait_blast_out
+sub SGE_qstat_xml_query {
+ my ($i, $j, $k, $cmd, $ll);
+ %qstat_xml_data = (); #### global
+ $cmd = `qstat -f -xml`;
+ if ($cmd =~ /<queue_info/) { #### dummy
+ $qstat_xml_data{"NULL"}= ["NULL","NULL"];
+ }
+ my $tmp = <<EOD;
+<?xml version='1.0'?>
+<job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
+ <queue_info>
+ <Queue-List>
+ <name>all.q\@master</name>
+ <qtype>BIP</qtype>
+ <slots_used>0</slots_used>
+ <slots_resv>0</slots_resv>
+ <slots_total>0</slots_total>
+ <load_avg>0.08000</load_avg>
+ <arch>linux-x64</arch>
+ </Queue-List>
+...
+ <Queue-List>
+ <name>all.q\@node016</name>
+ <qtype>BIP</qtype>
+ <slots_used>32</slots_used>
+ <slots_resv>0</slots_resv>
+ <slots_total>32</slots_total>
+ <load_avg>42.59000</load_avg>
+ <arch>linux-x64</arch>
+ <job_list state="running"> ####### running jobs in this section
+ <JB_job_number>3535</JB_job_number>
+ <JAT_prio>0.51468</JAT_prio>
+ <JB_name>cd-hit</JB_name>
+ <JB_owner>ubuntu</JB_owner>
+ <state>r</state>
+ <slots>4</slots>
+ </job_list>
+...
+ </queue_info>
+ <job_info>
+ <job_list state="pending"> ######## pending jobs in this section
+ <JB_job_number>3784</JB_job_number>
+ <JAT_prio>0.60500</JAT_prio>
+ <JB_name>cd-hit</JB_name>
+ <JB_owner>ubuntu</JB_owner>
+ <state>qw</state>
+ <slots>32</slots>
+ </job_list>
+...
+ </job_info>
+</job_info>
+
+EOD
+ my @lls = split(/\n/, $cmd);
+ $i = 2; #### skip first 2 lines
+ for (; $i<$#lls+1; $i++) {
+ if ($lls[$i] =~ /<job_list/) {
+ my ($id, $name, $state);
+ for (; $i<$#lls+1; $i++) {
+ last if ($lls[$i] =~ /<\/job_list/);
+ if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
+ if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
+ if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
+ }
+ if (defined($id) and defined($name) and defined($state)) {
+ $qstat_xml_data{$id} = [$name, $state];
+ }
+ }
+ }
+}
+
1;
diff --git a/psi-cd-hit/psi-cd-hit.pl b/psi-cd-hit/psi-cd-hit-old.pl
similarity index 52%
copy from psi-cd-hit/psi-cd-hit.pl
copy to psi-cd-hit/psi-cd-hit-old.pl
index af99e52..bb4f512 100755
--- a/psi-cd-hit/psi-cd-hit.pl
+++ b/psi-cd-hit/psi-cd-hit-old.pl
@@ -7,7 +7,7 @@ our $script_name = $0;
our $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
$script_dir = "./" unless ($script_dir);
-require "$script_dir/psi-cd-hit-local.pl";
+require "$script_dir/psi-cd-hit-local-old.pl";
parse_para_etc(@ARGV);
open_LOG();
@@ -26,6 +26,10 @@ our $DB_len = 0;
our $DB_len0 = 0;
our $DB_len_reduced = 0;
our $DB_len_reduced2 = 0; #### for write_restart etc purpose
+
+our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
+our $opt_al_upper_bandi= 0;
+our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
my ($i, $j, $k, $i0, $j0, $k0, $ll);
read_db();
@@ -41,6 +45,54 @@ our @NR90_seq = ();
$i0 = 0;
if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
+elsif ($skip_long > 0) { #### skip very long seqs
+ for (; $i0<$NR_no; $i0++) {
+ $i = $NR_idx[$i0];
+ last if ($lens[$i] < $skip_long);
+
+ $NR_passed++;
+ $NR_clstr_nos[$i] = $NR90_no;
+ $idens[$i] = "*";
+ $passeds[$i] = 1;
+ $NR90_seq[$NR90_no] = [$i];
+ $NR90_no++;
+ $DB_len_reduced += $lens[$i];
+ }
+}
+
+#### set init opt_aL_uppper/lower_bands
+if ( ($opt_aL > 0.3) ) {
+ die ("option -aL > 1.0") if ($opt_aL > 1.0);
+
+####################
+###################
+##################
+#################
+################
+############### <-upper band
+############## <- seq below not submit, unless band change
+#############
+############
+###########
+########## <- lower band
+######### <- seq below not in format db
+########
+#######
+#####
+####
+###
+##
+#
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
+ my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
+ my $d1 = $i0+$space;
+ $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
+ $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
+ $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
+ $opt_aL_upper_bandi= $d1;
+ write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
+}
+
($DB_no, $DB_len) = blast_formatdb();
$DB_len0 = $DB_len;
@@ -68,7 +120,23 @@ for (; $i0<$NR_no; $i0++) {
write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
$DB_len_reduced2 = 0;
}
- if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10)) {
+
+ my $opt_aL_format_flag = 0;
+ if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
+ if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs
+
+ my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
+ my $d1 = $i0+$space;
+ $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
+ $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
+ $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
+ $opt_aL_upper_bandi= $d1;
+ $opt_aL_format_flag = 1;
+ write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
+ }
+ }
+ if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
($DB_no, $DB_len) = blast_formatdb();
$DB_len_reduced = 0;
}
diff --git a/psi-cd-hit/psi-cd-hit.pl b/psi-cd-hit/psi-cd-hit.pl
index af99e52..6d209c6 100755
--- a/psi-cd-hit/psi-cd-hit.pl
+++ b/psi-cd-hit/psi-cd-hit.pl
@@ -26,6 +26,10 @@ our $DB_len = 0;
our $DB_len0 = 0;
our $DB_len_reduced = 0;
our $DB_len_reduced2 = 0; #### for write_restart etc purpose
+
+our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
+our $opt_aL_upper_bandi= 0;
+our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
my ($i, $j, $k, $i0, $j0, $k0, $ll);
read_db();
@@ -41,6 +45,54 @@ our @NR90_seq = ();
$i0 = 0;
if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
+elsif ($skip_long > 0) { #### skip very long seqs
+ for (; $i0<$NR_no; $i0++) {
+ $i = $NR_idx[$i0];
+ last if ($lens[$i] < $skip_long);
+
+ $NR_passed++;
+ $NR_clstr_nos[$i] = $NR90_no;
+ $idens[$i] = "*";
+ $passeds[$i] = 1;
+ $NR90_seq[$NR90_no] = [$i];
+ $NR90_no++;
+ $DB_len_reduced += $lens[$i];
+ }
+}
+
+#### set initial opt_aL_upper/lower_bands
+if ( ($opt_aL > 0.3) ) {
+ die ("option -aL > 1.0") if ($opt_aL > 1.0);
+
+####################
+###################
+##################
+#################
+################
+############### <-upper band
+############## <- seq below not submit, unless band change
+#############
+############
+###########
+########## <- lower band
+######### <- seq below not in format db
+########
+#######
+#####
+####
+###
+##
+#
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
+ my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
+ my $d1 = $i0+$space;
+ $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
+ $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
+ $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
+ $opt_aL_upper_bandi= $d1;
+ write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
+}
+
($DB_no, $DB_len) = blast_formatdb();
$DB_len0 = $DB_len;
@@ -48,7 +100,7 @@ $DB_len_reduced = 0;
$DB_len_reduced2 = 0;
for (; $i0<$NR_no; $i0++) {
$i = $NR_idx[$i0];
- run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
+ run_batch_blast3_multi($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
if ( not $passeds[$i] ) { # this is a new representative
$NR_passed++;
@@ -56,7 +108,7 @@ for (; $i0<$NR_no; $i0++) {
$idens[$i] = "*";
$passeds[$i] = 1;
$NR90_seq[$NR90_no] = [$i];
- fish_other_homolog($i);
+ fish_other_homolog_multi($i);
$NR90_no++;
$DB_len_reduced += $lens[$i];
$DB_len_reduced2 += $lens[$i];
@@ -64,11 +116,29 @@ for (; $i0<$NR_no; $i0++) {
watch_progress($i0, $NR90_no, $NR_passed, $NR_no, 0);
- if ((($i0+1) % $restart_seg == 0) or ($DB_len_reduced2 > $DB_len0/10) ) {
- write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
- $DB_len_reduced2 = 0;
+ if (($i0+1) % $restart_seg == 0 ) {
+ write_restart(); write_db_clstr();
+ }
+
+ my $opt_aL_format_flag = 0;
+ if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
+ my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
+ my $opt_aL_upper_band_old = $opt_aL_upper_band;
+ if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs
+
+ my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
+ my $d1 = $i0+$space;
+ $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
+ $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
+ if ($opt_aL_upper_band < $opt_aL_upper_band_old) {
+ $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
+ $opt_aL_upper_bandi= $d1;
+ $opt_aL_format_flag = 1;
+ write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
+ }
+ }
}
- if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10)) {
+ if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
($DB_no, $DB_len) = blast_formatdb();
$DB_len_reduced = 0;
}
diff --git a/usecases/Miseq-16S/16S-ref-db-PE-splice.pl b/usecases/Miseq-16S/16S-ref-db-PE-splice.pl
new file mode 100755
index 0000000..dc51f93
--- /dev/null
+++ b/usecases/Miseq-16S/16S-ref-db-PE-splice.pl
@@ -0,0 +1,458 @@
+#!/usr/bin/perl
+## =========================== NGS tools ==========================================
+## NGS tools for metagenomic sequence analysis
+## May also be used for other type NGS data analysis
+##
+## Weizhong Li, UCSD
+## liwz at sdsc.edu
+## http://weizhongli-lab.org/
+## ================================================================================
+
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+use Getopt::Std;
+getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
+die usage() unless ($opts{i} and $opts{j} and $opts{o} and $opts{d});
+my ($i, $j, $k, $cmd);
+my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
+my ($len, $lena, $lenb);
+
+my $fastq = $opts{i};
+my $fastq2 = $opts{j};
+my $ref = $opts{d};
+my $output = $opts{o};
+my $trim_R1 = $opts{p}; $trim_R1 = 100 unless ($trim_R1);
+my $trim_R2 = $opts{q}; $trim_R2 = 100 unless ($trim_R2);
+my $clstr_cutoff = $opts{c}; #### post clustering
+my $full_frag = $opts{S};
+my $prime_len = 45;
+my $output_R1 = "$output-R1";
+my $output_R2 = "$output-R2";
+my $session = "OTU-session-$$";
+my $output_S = "$output-single";
+my $consensus_db = "$output-consensus";
+my $cd_hit_2d = "$script_dir/../../cd-hit-est-2d"; die "no $cd_hit_2d" unless (-e $cd_hit_2d);
+my $cd_hit_est = "$script_dir/../../cd-hit-est"; die "no $cd_hit_est" unless (-e $cd_hit_est);
+my $format = input_test($fastq); #fasta or fastq
+my $cdhit_opt_M = $opts{M}; $cdhit_opt_M = 16000 unless defined($cdhit_opt_M);
+
+if (defined($clstr_cutoff)) {
+ die "Clustering cutoff $clstr_cutoff is not reasonable, should be <=1.0 and >= 0.97" unless (($clstr_cutoff <=1.0) and ($clstr_cutoff>=0.97));
+}
+
+my %FHZ=();
+
+my %ref_map = ();
+foreach my $f (($fastq, $fastq2)) {
+ my $R = ( $f eq $fastq ) ? "R1" : "R2";
+ open(OUT, "> $consensus_db.$R") || die "can not write to $consensus_db.$R";
+
+ my %con = ();
+ my $num_seq = 0;
+ open_files_z_safe("TTTa", $f);
+
+ if ($format eq "fastq") {
+ while(1) {
+ ($lla, $ida, $seqa, $quaa, $lena) = read_next_fastq("TTTa");
+ last unless ($lla);
+ for ($i=0; $i<$prime_len; $i++) {
+ $c=uc(substr($seqa, $i, 1));
+ $con{$i}{$c}++;
+ }
+ $num_seq++;
+ }
+ }
+ else { #### fasta
+ my $seqa = "";
+ while($ll = <TTTa>) {
+ if ($ll =~ /^>/) {
+ if ($seqa) {
+ for ($i=0; $i<$prime_len; $i++) {
+ $c=uc(substr($seqa, $i, 1));
+ $con{$i}{$c}++;
+ }
+ $num_seq++;
+ }
+ chop($ll);
+ $seqa = "";
+ }
+ else {
+ chop($ll);
+ $seqa .= $ll;
+ }
+ }
+ if ($seqa) {
+ for ($i=0; $i<$prime_len; $i++) {
+ $c=uc(substr($seqa, $i, 1));
+ $con{$i}{$c}++;
+ }
+ $num_seq++;
+ }
+ } #### END fasta
+
+ close(TTTa);
+
+ my @cons = (); #which letter
+ my @cons_v = (); #abundance
+ for ($i=0; $i<$prime_len; $i++) {
+ my %t = %{ $con{$i} };
+ my @k = keys %t;
+ @k = sort { $t{$b} <=> $t{$a} } @k;
+ push(@cons, $k[0]);
+ push(@cons_v, $t{ $k[0] } / $num_seq);
+ }
+ ## set minimal consensus to be 30
+ for ($i=33; $i<$prime_len; $i++) {
+ if ( ($cons_v[$i ] <0.75) and
+ ($cons_v[$i-1] <0.75) and
+ ($cons_v[$i-2] <0.75) ) {
+ $i = $i-2; last;
+ }
+ }
+ my $trim_len_new = $i;
+
+ print OUT ">$R\n";
+ for ($i=0; $i<$trim_len_new; $i++) {
+ print OUT $cons[$i];
+ }
+ print OUT "\n";
+ close(OUT);
+
+ my $cmd_line = "$cd_hit_2d -i $consensus_db.$R -i2 $ref -d 0 -c 0.8 -n 5 -r 1 -p 1 -b 5 -o $session.$R-vs-ref -G 0 -A 30 -s2 0.01 -M $cdhit_opt_M > $session.$R-vs-ref.log";
+ print "running $cmd_line\n";
+ $cmd = `$cmd_line`;
+
+ my $parse_template=<<EOD;
+>Cluster 0
+0 45nt, >R1... *
+1 1479nt, >1111882... at 1:42:4:45/+/95.24%
+2 1500nt, >1111856... at 1:42:4:45/+/88.10%
+3 1426nt, >1111848... at 2:44:3:45/+/90.70%
+4 1530nt, >1111847... at 1:42:4:45/+/85.71%
+5 1497nt, >1111839... at 1:41:5:45/+/85.37%
+6 1492nt, >1111819... at 1:42:4:45/+/88.10%
+7 1482nt, >1111782... at 1:42:4:45/+/88.10%
+8 1496nt, >1111776... at 1:42:4:45/+/88.10%
+9 1500nt, >1111768... at 1:42:4:45/+/85.71%
+...
+>Cluster 0
+0 45nt, >R2... *
+1 1428nt, >1111883... at 483:440:2:45/-/84.09%
+2 1479nt, >1111882... at 511:468:2:45/-/88.64%
+3 1336nt, >1111879... at 435:399:2:38/-/86.49%
+4 1402nt, >1111874... at 469:426:2:45/-/84.09%
+5 1500nt, >1111856... at 513:470:2:45/-/84.09%
+6 1530nt, >1111847... at 532:489:2:45/-/86.36%
+7 1497nt, >1111839... at 509:473:2:38/-/86.49%
+8 1492nt, >1111819... at 514:471:2:45/-/88.64%
+9 1482nt, >1111782... at 502:464:2:40/-/84.62%
+10 1496nt, >1111776... at 516:473:2:45/-/84.09%
+EOD
+
+ open(TMP, "$session.$R-vs-ref.clstr") || die "can not open $session.$R-vs-ref.clstr";
+ while($ll=<TMP>){
+ next if ($ll =~ /^>/);
+ next if ($ll =~ /^0/);
+ chop($ll);
+ if ($ll =~ /^\d+\s+\d+(aa|nt), >(.+)\.\.\./) {
+ my $id = $2;
+ my @lls = split(/\s+/, $ll);
+      @lls = split(/\//, $lls[-1]); ##516:473:2:45/-/84.09%
+ my ($query_start, $query_end, $rep_star, $rep_end) = split(/:/, $lls[0]);
+ $ref_map{$id}{$R}=[$query_start, $query_end, $rep_star, $rep_end, $lls[1]];
+ }
+ }
+ close(TMP);
+}
+
+my %ref_cut;
+foreach $id (keys %ref_map) {
+ next unless (defined $ref_map{$id}{"R1"});
+ next unless (defined $ref_map{$id}{"R2"});
+
+ my @R1_info = @{$ref_map{$id}{"R1"}};
+ my @R2_info = @{$ref_map{$id}{"R2"}};
+
+ next unless ($R1_info[4] eq "+");
+ next unless ($R2_info[4] eq "-");
+
+ my $p1 = $R1_info[0] - ($R1_info[2]-1); #### 1-based, can be -1 value for V1
+ my $p2 = $R2_info[0] + ($R2_info[2]-1); #### 1-based, can be longer than len($seq)
+ $ref_cut{$id} = [$p1, $p2];
+}
+
+open(TMP, $ref) || die "can not open $ref";
+open(OUT1, "> $output_R1") || die "can not write to $output_R1";
+open(OUT2, "> $output_R2") || die "can not write to $output_R2";
+if ($full_frag) {
+ open(OUT3, "> $output_S") || die "can not write to $output_S";
+}
+my $seq;
+my $des;
+$id = "";
+
+while($ll = <TMP>) {
+ if ($ll =~ /^>/) {
+ if ($seq) {
+ if ($ref_cut{$id}) {
+ $seq =~ s/\s//g;
+ my ($p1, $p2) = @{$ref_cut{$id}};
+ my $len = length($seq);
+ my $seq1 = "";
+ my $seq2 = "";
+ if ($p1>=1) {
+ $seq1 = substr($seq, $p1-1, $trim_R1);
+ }
+ else {
+ my $pad = 1 - $p1; #### add NNN at 5'
+ $seq1 = "N" x $pad;
+ $seq1 .= substr($seq, 0, $trim_R1-$pad);
+ }
+
+ if ($p2 <= $len) {
+ my $p2a = $p2 - $trim_R2; #### 0 - based substr idx
+ if ($p2a < 0) { #### not long enough
+ $seq2 = substr($seq, 0, $p2);
+ }
+ else {
+ $seq2 = substr($seq, $p2a, $trim_R2);
+ }
+ }
+ else { #### add NNN at 5'
+ my $pad = $p2 - $len;
+          my $trim_R2_t = $trim_R2 - $pad;
+ $seq2 = "N" x $pad;
+
+ my $p2a = $len - $trim_R2_t; #### 0 - based substr idx
+ if ($p2a < 0) { #### not long enough
+ $seq2.= $seq;
+ }
+ else {
+ $seq2 .= substr($seq, $p2a, $trim_R2_t);
+ }
+ }
+ $seq2 = reverse_complement($seq2);
+ ### now have $seq1 $seq2
+ print OUT1 "$des loc=$p1 len=", length($seq1), "\n$seq1\n";
+ print OUT2 "$des loc=$p2 len=", length($seq2), "\n$seq2\n";
+ if ($full_frag) {
+ if ($p1 < 1 ) {$p1 = 1; }
+ if ($p2 > $len) {$p2 = $len;}
+ my $eff_len = $p2-$p1+1;
+ my $seq1 = substr($seq, $p1-1, $eff_len);
+ print OUT3 "$des loc=$p1:$p2 len=$eff_len\n$seq1\n";
+ }
+ }
+ }
+ chop($ll);
+ $des = $ll;
+ $id = substr($ll,1);
+ $id =~ s/\s.+$//;
+ $seq = "";
+ }
+ else {
+ $seq .= $ll;
+ }
+}
+
+ if ($seq) {
+ if ($ref_cut{$id}) {
+ $seq =~ s/\s//g;
+ my ($p1, $p2) = @{$ref_cut{$id}};
+ my $len = length($seq);
+ my $seq1 = "";
+ my $seq2 = "";
+ if ($p1>=1) {
+ $seq1 = substr($seq, $p1-1, $trim_R1);
+ }
+ else {
+ my $pad = 1 - $p1; #### add NNN at 5'
+ $seq1 = "N" x $pad;
+ $seq1 .= substr($seq, 0, $trim_R1-$pad);
+ }
+
+ if ($p2 <= $len) {
+ my $p2a = $p2 - $trim_R2; #### 0 - based substr idx
+ if ($p2a < 0) { #### not long enough
+ $seq2 = substr($seq, 0, $p2);
+ }
+ else {
+ $seq2 = substr($seq, $p2a, $trim_R2);
+ }
+ }
+ else { #### add NNN at 5'
+ my $pad = $p2 - $len;
+          my $trim_R2_t = $trim_R2 - $pad;
+ $seq2 = "N" x $pad;
+
+ my $p2a = $len - $trim_R2_t; #### 0 - based substr idx
+ if ($p2a < 0) { #### not long enough
+ $seq2.= $seq;
+ }
+ else {
+ $seq2 .= substr($seq, $p2a, $trim_R2_t);
+ }
+ }
+ $seq2 = reverse_complement($seq2);
+ ### now have $seq1 $seq2
+ print OUT1 "$des loc=$p1 len=", length($seq1), "\n$seq1\n";
+ print OUT2 "$des loc=$p2 len=", length($seq2), "\n$seq2\n";
+ if ($full_frag) {
+ if ($p1 < 1 ) {$p1 = 1; }
+ if ($p2 > $len) {$p2 = $len;}
+ my $eff_len = $p2-$p1+1;
+ my $seq1 = substr($seq, $p1-1, $eff_len);
+ print OUT3 "$des loc=$p1:$p2 len=$eff_len\n$seq1\n";
+ }
+ }
+ }
+
+close(OUT1);
+close(OUT2);
+if ($full_frag) { close(OUT3); }
+close(TMP);
+
+if (defined($clstr_cutoff)) {
+ my $output_R1_tmp = "$output_R1.$$";
+ my $output_R2_tmp = "$output_R2.$$";
+
+ my $cmd_line = "$cd_hit_est -i $output_R1 -j $output_R2 -d 0 -c $clstr_cutoff -n 10 -p 1 -b 5" .
+ " -o $output_R1_tmp -op $output_R2_tmp -G 1 -g 1 -M $cdhit_opt_M -P 1 -l 11 -sc 1 > $output_R1_tmp.log";
+ print "running $cmd_line\n";
+ $cmd = `$cmd_line`;
+
+ die "Can not run $cd_hit_est" unless (-e "$output_R1_tmp.clstr");
+ $cmd = `mv $output_R1_tmp $output_R1`;
+ $cmd = `mv $output_R2_tmp $output_R2`;
+ $cmd = `mv $output_R1_tmp.clstr $output.clstr`;
+
+ if ($full_frag) {
+ my $output_S_tmp = "$output_S.$$";
+ my $cmd_line = "$cd_hit_est -i $output_S -d 0 -c $clstr_cutoff -n 10 -p 1 -b 5" .
+ " -o $output_S_tmp -G 1 -g 1 -M $cdhit_opt_M -l 11 -sc 1 > $output_S_tmp.log";
+ print "running $cmd_line\n";
+ $cmd = `$cmd_line`;
+ die "Can not run $cd_hit_est" unless (-e "$output_S_tmp.clstr");
+ $cmd = `mv $output_S_tmp $output_S`;
+ $cmd = `mv $output_S_tmp.clstr $output_S.clstr`;
+ }
+}
+
+$cmd = `rm -f $session*`;
+
+# need %FHZ
+# open one or more files, including zipped files
+# the earlier open_files_z could hit broken-pipe problems,
+# so this safer sub opens each file individually
+sub open_files_z_safe {
+ my ($fh, @files) = @_;
+ my ($i, $j, $k);
+
+ my $no = $#files+1;
+
+ $FHZ{$fh} = {
+ 'files' => [@files],
+ 'no' => $no,
+ 'open_idx' => 0,
+ };
+
+ my $f0 = $files[0];
+ if ($f0 =~ /\.gz$/ ) { open($fh, "gunzip -c $f0 |") || die "can not gunzip -c $f0\n"; }
+ elsif ($f0 =~ /\.bz2$/) { open($fh, "bzcat $f0 |") || die "can not bzcat $f0\n"; }
+ else { open($fh, $f0 ) || die "can not open $f0\n"; }
+ return 0;
+}
+########## END open_files_z_safe
+
+
+sub read_FHZ {
+ my $fh = shift;
+ my $ll;
+
+ $ll = <$fh>;
+ if ($ll) { return $ll;} ##### read from existing opened file
+
+ #otherwise, last opened file reaches EOF
+ if ($FHZ{$fh}->{open_idx} < $FHZ{$fh}->{no} -1 ) { ### still file not opened yet
+ close($fh); #### close last open file
+
+ $FHZ{$fh}->{open_idx}++;
+ my $f0 = $FHZ{$fh}->{files}->[ $FHZ{$fh}->{open_idx} ];
+
+ if ($f0 =~ /\.gz$/ ) { open($fh, "gunzip -c $f0 |") || die "can not gunzip -c $f0\n"; }
+ elsif ($f0 =~ /\.bz2$/) { open($fh, "bzcat $f0 |") || die "can not bzcat $f0\n"; }
+ else { open($fh, $f0 ) || die "can not open $f0\n"; }
+
+ $ll = <$fh>;
+ return $ll;
+ }
+ else { #### no more file to open, return undef
+ return undef;
+ }
+}
+########### END read_FHZ
+
+
+########## read_next_fastq
+sub read_next_fastq {
+ my $fh = shift;
+ my ($lla, $seqa, $lla2, $quaa, $ida, $lena);
+ $lla = read_FHZ($fh); return unless ($lla);
+ chop($lla); $lla =~ s/\s.+$//;
+ $ida = substr($lla,1);
+ $seqa = read_FHZ($fh); $seqa =~ s/\s+$//g; $lena = length($seqa);
+ $lla2 = read_FHZ($fh); #read ID
+ $quaa = read_FHZ($fh); $quaa =~ s/\s+$//g;
+ return ($lla, $ida, $seqa, $quaa, $lena);
+}
+########## END read_next_fastq
+
+
+sub reverse_complement {
+ my ($in_seq) = @_;
+ my $opposite = reverse $in_seq;
+ $opposite =~ tr/ACGT/TGCA/;
+ return("$opposite");
+}
+
+
+sub input_test {
+ my $f = shift;
+ open(TTT, $f) || die "can not open $f\n";
+ my $ll = <TTT>;
+ close(TTT);
+
+ my $c = substr($ll,0,1);
+ if ($c eq ">") {return "fasta";}
+ else {return "fastq";}
+}
+########## END input_test
+
+
+sub usage {
+<<EOD;
+This script takes paired-end (PE) read files (Fastq or Fasta) for a 16S dataset, e.g. from the V3-V4
+region. It also takes a Fasta file of a full-length 16S reference database, e.g. Greengenes.
+The script identifies the sequenced region on the reference sequences, cuts out the forward
+and reverse segments, and outputs them as PE fasta files. The output PE reference database can be
+clustered together with 16S datasets.
+
+Options:
+======================
+ -i input fasta or fastq file for R1
+ -j input fasta or fastq file for R2
+ -d 16S reference sequence file in fasta format
+ -o output prefix
+   -p length of forward sequence in output file
+ -q length of reverse sequence in output file
+ -S also output full fragment
+   -c cutoff for clustering the output PE files to remove redundant reference sequences.
+ Suggested cutoffs: 1.00, 0.99, 0.98 and 0.97
+ The script will not cluster the output unless user specifies this cutoff.
+ -M available memory to use, default 16000, means 16000MB. This option will be passed to cd-hit.
+EOD
+}
diff --git a/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl b/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
new file mode 100644
index 0000000..19d4070
--- /dev/null
+++ b/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
@@ -0,0 +1,117 @@
+#!/usr/bin/perl
+################################################################################
+# NGS workflow by Weizhong Li, http://weizhongli-lab.org
+################################################################################
+
+########## local variables etc. Please edit
+$CD_HIT_dir = "/home/oasis/data/etc/git/cdhit";
+$NGS_prog_trimmomatic = "/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar";
+
+
+########## computation resources for execution of jobs
+%NGS_executions = ();
+$NGS_executions{"qsub_1"} = {
+ "type" => "qsub-pe",
+ "cores_per_node" => 8,
+ "number_nodes" => 64,
+ "user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
+ "command" => "qsub",
+ "command_name_opt" => "-N",
+ "command_err_opt" => "-e",
+ "command_out_opt" => "-o",
+ "template" => <<EOD,
+#!/bin/sh
+#PBS -v PATH
+#PBS -V
+
+#\$ -v PATH
+#\$ -V
+
+EOD
+};
+
+
+$NGS_executions{"sh_1"} = {
+ "type" => "sh",
+ "cores_per_node" => 8,
+ "number_nodes" => 1,
+};
+
+$NGS_batch_jobs{"qc"} = {
+ "CMD_opts" => ["100"],
+ "execution" => "sh_1", # where to execute
+ "cores_per_cmd" => 4, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
+ SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
+
+perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
+perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
+
+wait
+rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
+EOD
+};
+
+
+$NGS_batch_jobs{"otu"} = {
+ "injobs" => ["qc"],
+ "CMD_opts" => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R1", "75"],
+ "execution" => "sh_1", # where to execute
+ "cores_per_cmd" => 2, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+#### cluster at 100% PE
+$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -j \\INJOBS.0/R2.fa -o \\SELF/seq.nr -op \\SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.nr.log
+#### cluster at 99% PE and SE for R1,R2
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -o \\SELF/seq.chimeric-clstr.R1 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R1.log
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr.2 -o \\SELF/seq.chimeric-clstr.R2 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R2.log
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -j \\SELF/seq.nr.2 -o \\SELF/seq.99 -op \\SELF/seq.99.2 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.99.log
+$CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c \\CMDOPTS.3 -k \\SELF/seq.nr.clstr \\
+ -i \\SELF/seq.chimeric-clstr.R1.clstr -j \\SELF/seq.chimeric-clstr.R2.clstr \\
+ -a \\SELF/seq.99.clstr -f \\SELF/seq.99 -g \\SELF/seq.99.2 -o \\SELF/seq.99f
+$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr.clstr \\SELF/seq.99f.clstr > \\SELF/seq.99f-all.clstr
+$CD_HIT_dir/cd-hit-est -i \\SELF/seq.99f -j \\SELF/seq.99f.2 -o \\SELF/seq.97 -op \\SELF/seq.97.2 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.log
+$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.97 -j \\SELF/seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o \\SELF/seq.97.ref -op \\SELF/seq.97.ref.2 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.ref.log
+$CD_HIT_dir/clstr_rev.pl \\SELF/seq.99f-all.clstr \\SELF/seq.97.clstr > \\SELF/seq.97-all.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.97.ref.clstr > \\SELF/seq.97.reftop.clstr
+$CD_HIT_dir/clstr_merge.pl \\SELF/seq.97-all.clstr \\SELF/seq.97.reftop.clstr > \\SELF/OTU.clstr
+
+rm -f \\SELF/seq.chimeric-clstr.R1 \\SELF/seq.chimeric-clstr.R1.log \\SELF/seq.chimeric-clstr.R2 \\SELF/seq.chimeric-clstr.R2.log
+rm -f \\SELF/seq.97.ref \\SELF/seq.97.ref.2 \\SELF/seq.97.ref.log
+mv \\SELF/seq.99f.log \\SELF/chimeric-small-clusters-list.txt
+
+EOD
+};
+
+
+$NGS_batch_jobs{"otu-pooled"} = {
+ "CMD_opts" => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R1", "75"],
+ "execution" => "sh_1", # where to execute
+ "cores_per_cmd" => 2, # number of threads used by command below
+ "no_parallel" => 1, # number of total jobs to run using command below
+ "command" => <<EOD,
+#### before running
+#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
+$CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
+$CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
+ -cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
+$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
+$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
+$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
+$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
+rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
+
+EOD
+};
+
+##############################################################################################
+########## END
+1;
+
diff --git a/usecases/Miseq-16S/NG-Omics-WF.pl b/usecases/Miseq-16S/NG-Omics-WF.pl
new file mode 100755
index 0000000..2f46255
--- /dev/null
+++ b/usecases/Miseq-16S/NG-Omics-WF.pl
@@ -0,0 +1,1189 @@
+#!/usr/bin/perl
+# =============================== NG-Omics-WF ==================================
+# _ _ _____ ____ _ __ ________
+# | \ | |/ ____| / __ \ (_) \ \ / / ____|
+# | \| | | __ ______| | | |_ __ ___ _ ___ ___ _____\ \ /\ / /| |__
+# | . ` | | |_ |______| | | | '_ ` _ \| |/ __/ __|______\ \/ \/ / | __|
+# | |\ | |__| | | |__| | | | | | | | (__\__ \ \ /\ / | |
+# |_| \_|\_____| \____/|_| |_| |_|_|\___|___/ \/ \/ |_|
+#
+# =========================== Next Generation Omics data workflow tools ========
+#
+# Workflow tools for next generation genomics, metagenomics, RNA-seq
+# and other type of omics data analyiss,
+#
+# Software originally developed since 2010 by Weizhong Li at UCSD
+# currently at JCVI
+#
+# http://weizhongli-lab.org/ngomicswf liwz at sdsc.edu
+# ==============================================================================
+
+use Getopt::Std;
+use POSIX;
+
+getopts("i:R:s:J:Q:r:j:Z:t:S:T:",\%opts);
+die usage() unless ($opts{i} and ($opts{s} or $opts{S}));
+
+my $sample_in = $opts{s};
+my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
+my $input_conf = $opts{i};
+my $this_task = $opts{J};
+our $G_NGS_root = $opts{r};
+my $queue_system = $opts{Q}; $queue_system = "SGE" unless $queue_system;
+my $subset_wfs = $opts{R};
+my $subset_jobs = $opts{j};
+my $second_opt = $opts{Z};
+my $opt_file = $opts{t};
+my $opt_command_in = $opts{T}; #### ',' delimited jobs, ":" delimited entries, e.g. JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
+
+my $pwd = `pwd`; chop($pwd);
+my $sleep_time_min = 15;
+my $sleep_time_max = 120;
+my $log_dir = "$pwd/WF-LOG";
+my $log_file = "$log_dir/LOG";
+my $log_fileq = "$log_dir/LOGq";
+my $sh_dir = "$pwd/WF-sh";
+my $sh_bundle_dir = "$pwd/WF-sh-bundle";
+my $subset_flag = 0; #### run only one job, subset of jobs, or jobs in sub workflows
+my %subset_jobs = ();
+my %qstat_xml_data = ();
+my ($i, $j, $k, $ll, $cmd);
+
+######## scan through WF configration
+######## and generate job list
+require $input_conf;
+my %job_list = (); # as $job_list{$t_job_id}{$t_sample_id} = {};
+my ($t_sample_id, $t_job_id, $t_execution_id);
+my ($t_sample, $t_job, $t_execution);
+task_level_jobs();
+my @NGS_batch_jobs = sort {($NGS_batch_jobs{$a}->{'job_level'} <=> $NGS_batch_jobs{$b}->{'job_level'}) or ($a cmp $b)} keys %NGS_batch_jobs;
+
+$cmd = `mkdir -p $log_dir` unless (-e $log_dir);
+$cmd = `mkdir -p $sh_dir` unless (-e $sh_dir);
+$cmd = `mkdir -p $sh_bundle_dir` unless (-e $sh_bundle_dir);
+open(LOG, ">> $log_file") || die "can not write to $log_file";
+
+######## parse NGS_samples
+my %NGS_sample_data = ();
+my @NGS_samples = ();
+if (defined($sample_in)) {
+ open(TMP, $sample_in) || die "can not open $sample_in";
+ while($ll=<TMP>){
+ next if ($ll =~ /^#/);
+ next unless ($ll =~ /^\w/); chop($ll);
+ my ($id, @data) = split(/\s+/,$ll);
+ push(@NGS_samples, $id);
+ $NGS_sample_data{$id} = [@data];
+ if (not (-e $id)) { $cmd = `mkdir $id`;}
+ }
+ close(TMP);
+}
+elsif (defined($sample_command_in)) {
+ my @lls = split(/,/, $sample_command_in);
+ foreach $ll (@lls) {
+ my ($id, @data) = split(/:/, $ll);
+ push(@NGS_samples, $id);
+ $NGS_sample_data{$id} = [@data];
+ if (not (-e $id)) { $cmd = `mkdir $id`;}
+ }
+}
+else {
+ die "no input samples";
+}
+
+my %CMD_opts = ();
+if (-e $opt_file) {
+ ##format example
+ ##CMDOPT JobID_A:opt0:opt1:opt2
+ ##CMDOPT JobID_B:opt0:opt1
+ ##CMDOPT JobID_C:opt0:opt1:opt2:opt3
+ open(TMP, $opt_file) || die "can not open $opt_file";
+ while($ll = <TMP>){
+ next if ($ll =~ /^#/);
+ next unless ($ll =~ /^CMDOPT/);
+ chop($ll);
+ my ($i, $opt1) = split(/\s+/, $ll);
+ my ($job_id, @opts) = split(/:/, $opt1);
+ $CMD_opts{$job_id} = [@opts];
+ }
+ close(TMP);
+}
+elsif ($opt_command_in) {
+ my @lls = split(/,/, $opt_command_in);
+ foreach $ll (@lls) {
+ my ($job_id, @opts) = split(/:/, $ll);
+ $CMD_opts{$job_id} = [@opts];
+ }
+}
+
+########## processing subset of jobs
+if ($subset_wfs) {
+ my @wfs = split(/,/, $subset_wfs);
+ $subset_flag = 1;
+ foreach $i (@wfs) {
+ my @jobs = @{ $NGS_batch_sets{$i}->{"jobs"} };
+ foreach $j (@jobs) { $subset_jobs{$j} = 1; }
+ }
+}
+if ($subset_jobs) {
+ $subset_flag = 1;
+ my @jobs = split(/,/, $subset_jobs);
+ foreach $j (@jobs) { $subset_jobs{$j} = 1; }
+ add_subset_jobs_by_dependency();
+}
+if ($subset_flag) {
+ my $job_str = join(" ", keys %subset_jobs);
+ write_log("Running subset of jobs: $job_str");
+}
+
+my $verify_flag = 0;
+foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ $t_job = $NGS_batch_jobs{$t_job_id};
+ $t_execution = $NGS_executions{ $t_job->{"execution"} };
+
+ my $pe_parameter = ""; #### setup pe parameters
+ if ($t_execution->{'type'} eq "qsub-pe") {
+ my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
+ $t_cores_per_cmd = 1 unless ($t_cores_per_cmd);
+ $pe_parameter = "#\$ -pe orte $t_cores_per_cmd";
+ }
+
+ if ($t_job->{"cores_per_cmd"} > $t_execution->{"cores_per_node"} ) {
+ $verify_flag = 1;
+ write_log("$t_job_id needs $t_job->{\"cores_per_cmd\"} cores, but $t_job->{\"execution\"} only has $t_execution->{\"cores_per_node\"} cores");
+ }
+
+ my $cmds_per_node = POSIX::floor( $t_execution->{"cores_per_node"} / $t_job->{"cores_per_cmd"});
+ my $nodes_total = POSIX::ceil($t_job->{"no_parallel"} / $cmds_per_node);
+ $t_job->{"cmds_per_node"} = $cmds_per_node;
+ $t_job->{"nodes_total"} = $nodes_total;
+
+ if ($t_job->{"nodes_total"} > $t_execution->{"number_nodes"}) {
+ $verify_flag = 1;
+ write_log("$t_job_id needs $t_job->{\"nodes_total\"} nodes, but $t_job->{\"execution\"} only has $t_execution->{\"number_nodes\"} nodes");
+ }
+
+ my @CMD_opts = ();
+ @CMD_opts = @{$t_job->{CMD_opts}} if (defined($t_job->{CMD_opts} ));
+ @CMD_opts = @{$CMD_opts{$t_job_id}} if (defined($CMD_opts{$t_job_id})); #### command line take over default
+
+ foreach $t_sample_id (@NGS_samples) {
+ my @t_commands = split(/\t/, $t_job->{"command"});
+ my $t_command = "";
+ foreach my $c0 (@t_commands) {
+ my $c1 = $c0;
+ $c1 =~ s/\\SAMPLE/$t_sample_id/g;
+ $c1 =~ s/\\SELF/$t_job_id/g;
+ # take it easy, assuming maxium 20 input files
+ $c1 =~ s/\\INFILES\.0/$t_job->{"infiles"}->[0]/g; $c1 =~ s/\\INFILES\.10/$t_job->{"infiles"}->[10]/g;
+ $c1 =~ s/\\INFILES\.1/$t_job->{"infiles"}->[1]/g; $c1 =~ s/\\INFILES\.11/$t_job->{"infiles"}->[11]/g;
+ $c1 =~ s/\\INFILES\.2/$t_job->{"infiles"}->[2]/g; $c1 =~ s/\\INFILES\.12/$t_job->{"infiles"}->[12]/g;
+ $c1 =~ s/\\INFILES\.3/$t_job->{"infiles"}->[3]/g; $c1 =~ s/\\INFILES\.13/$t_job->{"infiles"}->[13]/g;
+ $c1 =~ s/\\INFILES\.4/$t_job->{"infiles"}->[4]/g; $c1 =~ s/\\INFILES\.14/$t_job->{"infiles"}->[14]/g;
+ $c1 =~ s/\\INFILES\.5/$t_job->{"infiles"}->[5]/g; $c1 =~ s/\\INFILES\.15/$t_job->{"infiles"}->[15]/g;
+ $c1 =~ s/\\INFILES\.6/$t_job->{"infiles"}->[6]/g; $c1 =~ s/\\INFILES\.16/$t_job->{"infiles"}->[16]/g;
+ $c1 =~ s/\\INFILES\.7/$t_job->{"infiles"}->[7]/g; $c1 =~ s/\\INFILES\.17/$t_job->{"infiles"}->[17]/g;
+ $c1 =~ s/\\INFILES\.8/$t_job->{"infiles"}->[8]/g; $c1 =~ s/\\INFILES\.18/$t_job->{"infiles"}->[18]/g;
+ $c1 =~ s/\\INFILES\.9/$t_job->{"infiles"}->[9]/g; $c1 =~ s/\\INFILES\.19/$t_job->{"infiles"}->[19]/g;
+
+ $c1 =~ s/\\DATA\.0/$NGS_sample_data{$t_sample_id}->[0]/g; $c1 =~ s/\\DATA\.10/$NGS_sample_data{$t_sample_id}->[10]/g;
+ $c1 =~ s/\\DATA\.1/$NGS_sample_data{$t_sample_id}->[1]/g; $c1 =~ s/\\DATA\.11/$NGS_sample_data{$t_sample_id}->[11]/g;
+ $c1 =~ s/\\DATA\.2/$NGS_sample_data{$t_sample_id}->[2]/g; $c1 =~ s/\\DATA\.12/$NGS_sample_data{$t_sample_id}->[12]/g;
+ $c1 =~ s/\\DATA\.3/$NGS_sample_data{$t_sample_id}->[3]/g; $c1 =~ s/\\DATA\.13/$NGS_sample_data{$t_sample_id}->[13]/g;
+ $c1 =~ s/\\DATA\.4/$NGS_sample_data{$t_sample_id}->[4]/g; $c1 =~ s/\\DATA\.14/$NGS_sample_data{$t_sample_id}->[14]/g;
+ $c1 =~ s/\\DATA\.5/$NGS_sample_data{$t_sample_id}->[5]/g; $c1 =~ s/\\DATA\.15/$NGS_sample_data{$t_sample_id}->[15]/g;
+ $c1 =~ s/\\DATA\.6/$NGS_sample_data{$t_sample_id}->[6]/g; $c1 =~ s/\\DATA\.16/$NGS_sample_data{$t_sample_id}->[16]/g;
+ $c1 =~ s/\\DATA\.7/$NGS_sample_data{$t_sample_id}->[7]/g; $c1 =~ s/\\DATA\.17/$NGS_sample_data{$t_sample_id}->[17]/g;
+ $c1 =~ s/\\DATA\.8/$NGS_sample_data{$t_sample_id}->[8]/g; $c1 =~ s/\\DATA\.18/$NGS_sample_data{$t_sample_id}->[18]/g;
+ $c1 =~ s/\\DATA\.9/$NGS_sample_data{$t_sample_id}->[9]/g; $c1 =~ s/\\DATA\.19/$NGS_sample_data{$t_sample_id}->[19]/g;
+
+ $c1 =~ s/\\INJOBS\.0/$t_job->{"injobs"}->[0]/g; $c1 =~ s/\\INJOBS\.10/$t_job->{"injobs"}->[10]/g;
+ $c1 =~ s/\\INJOBS\.1/$t_job->{"injobs"}->[1]/g; $c1 =~ s/\\INJOBS\.11/$t_job->{"injobs"}->[11]/g;
+ $c1 =~ s/\\INJOBS\.2/$t_job->{"injobs"}->[2]/g; $c1 =~ s/\\INJOBS\.12/$t_job->{"injobs"}->[12]/g;
+ $c1 =~ s/\\INJOBS\.3/$t_job->{"injobs"}->[3]/g; $c1 =~ s/\\INJOBS\.13/$t_job->{"injobs"}->[13]/g;
+ $c1 =~ s/\\INJOBS\.4/$t_job->{"injobs"}->[4]/g; $c1 =~ s/\\INJOBS\.14/$t_job->{"injobs"}->[14]/g;
+ $c1 =~ s/\\INJOBS\.5/$t_job->{"injobs"}->[5]/g; $c1 =~ s/\\INJOBS\.15/$t_job->{"injobs"}->[15]/g;
+ $c1 =~ s/\\INJOBS\.6/$t_job->{"injobs"}->[6]/g; $c1 =~ s/\\INJOBS\.16/$t_job->{"injobs"}->[16]/g;
+ $c1 =~ s/\\INJOBS\.7/$t_job->{"injobs"}->[7]/g; $c1 =~ s/\\INJOBS\.17/$t_job->{"injobs"}->[17]/g;
+ $c1 =~ s/\\INJOBS\.8/$t_job->{"injobs"}->[8]/g; $c1 =~ s/\\INJOBS\.18/$t_job->{"injobs"}->[18]/g;
+ $c1 =~ s/\\INJOBS\.9/$t_job->{"injobs"}->[9]/g; $c1 =~ s/\\INJOBS\.19/$t_job->{"injobs"}->[19]/g;
+
+ $c1 =~ s/\\CMDOPTS\.0/$CMD_opts[0]/g; $c1 =~ s/\\CMDOPTS\.10/$CMD_opts[10]/g;
+ $c1 =~ s/\\CMDOPTS\.1/$CMD_opts[1]/g; $c1 =~ s/\\CMDOPTS\.11/$CMD_opts[11]/g;
+ $c1 =~ s/\\CMDOPTS\.2/$CMD_opts[2]/g; $c1 =~ s/\\CMDOPTS\.12/$CMD_opts[12]/g;
+ $c1 =~ s/\\CMDOPTS\.3/$CMD_opts[3]/g; $c1 =~ s/\\CMDOPTS\.13/$CMD_opts[13]/g;
+ $c1 =~ s/\\CMDOPTS\.4/$CMD_opts[4]/g; $c1 =~ s/\\CMDOPTS\.14/$CMD_opts[14]/g;
+ $c1 =~ s/\\CMDOPTS\.5/$CMD_opts[5]/g; $c1 =~ s/\\CMDOPTS\.15/$CMD_opts[15]/g;
+ $c1 =~ s/\\CMDOPTS\.6/$CMD_opts[6]/g; $c1 =~ s/\\CMDOPTS\.16/$CMD_opts[16]/g;
+ $c1 =~ s/\\CMDOPTS\.7/$CMD_opts[7]/g; $c1 =~ s/\\CMDOPTS\.17/$CMD_opts[17]/g;
+ $c1 =~ s/\\CMDOPTS\.8/$CMD_opts[8]/g; $c1 =~ s/\\CMDOPTS\.18/$CMD_opts[18]/g;
+ $c1 =~ s/\\CMDOPTS\.9/$CMD_opts[9]/g; $c1 =~ s/\\CMDOPTS\.19/$CMD_opts[19]/g;
+ $t_command .= "$c1\n";
+ }
+
+
+ my @t_infiles = map { "$t_sample_id/$_" } @{$t_job->{"infiles"}};
+ my @t_injobs = @{$t_job->{"injobs"}};
+ my $t_sh_file = "$sh_dir/$t_job_id.$t_sample_id.sh";
+ my $f_start = "$pwd/$t_sample_id/$t_job_id/WF.start.date";
+ my $f_complete = "$pwd/$t_sample_id/$t_job_id/WF.complete.date";
+ my $f_cpu = "$pwd/$t_sample_id/$t_job_id/WF.cpu";
+ $job_list{$t_job_id}{$t_sample_id} = {
+ 'sample_id' => $t_sample_id,
+ 'job_id' => $t_job_id,
+ 'status' => 'wait', #### status can be wait (input not ready), ready (input ready), submitted (submitted or running), completed
+ 'command' => $t_command,
+ 'sh_file' => $t_sh_file,
+ 'infiles' => [@t_infiles],
+ 'injobs' => [@t_injobs],
+ 'start_file' => $f_start,
+ 'complete_file'=> $f_complete,
+ 'cpu_file' => $f_cpu,
+ };
+
+ my $v_command = "";
+ foreach my $vf (@{$t_job->{"non_zero_files"}}) {
+ $v_command .= "if ! [ -s $t_job_id/$vf ]; then echo \"zero size $t_job_id/$vf\"; exit; fi\n";
+ }
+
+
+ if (not -e $t_sh_file) {
+ write_log("Write sh file to $t_sh_file");
+ open(TSH, "> $t_sh_file") || die "can not write to $t_sh_file\n";
+ print TSH <<EOD;
+$t_execution->{"template"}
+$pe_parameter
+
+my_host=`hostname`
+my_pid=\$\$
+my_core=$t_job->{"cores_per_cmd"}
+my_queue=$t_job->{"execution"}
+my_time_start=`date +%s`;
+
+cd $pwd
+cd $t_sample_id
+mkdir $t_job_id
+if ! [ -f $f_start ]; then date +\%s > $f_start; fi
+$t_command
+$v_command
+date +\%s > $f_complete
+#times >> $f_cpu
+
+my_time_end=`date +%s`;
+my_time_spent=\$((my_time_end-my_time_start))
+echo "sample=$t_sample_id job=$t_job_id host=\$my_host pid=\$my_pid queue=\$my_queue cores=\$my_core time_start=\$my_time_start time_end=\$my_time_end time_spent=\$my_time_spent" >> $f_cpu
+
+EOD
+ close(TSH);
+ #validate_cmd_line($t_command, $t_sh_file, $t_sample_id);
+ }
+ } ########## foreach my $c0 (@t_commands)
+} ########## foreach $t_job (keys %NGS_batch_jobs)
+
+die if ($verify_flag);
+
+#### One-shot administrative tasks (selected with -J) run here and exit;
+#### otherwise control falls through to the main scheduling loop below.
+if ($this_task eq "log-cpu" ) { task_log_cpu(); exit 0;}
+elsif ($this_task eq "list-jobs" ) { task_list_jobs(); exit 0;}
+elsif ($this_task eq "snapshot" ) { task_snapshot(); exit 0;}
+elsif ($this_task eq "delete-jobs" ) { task_delete_jobs($second_opt); exit 0;}
+elsif ($this_task eq "write-sh" ) { exit 0;}
+elsif ($this_task ) { die "undefined task $this_task";}
+
+################################################################################################
+# _____ _ _ _____ _____ _ _ _ _ _
+# | __ \ | \ | |/ ____|/ ____|| | | | | | (_) | |
+# | |__) | _ _ __ | \| | | __| (___ | |__ __ _| |_ ___| |__ _ ___ | |__ ___
+# | _ / | | | '_ \ | . ` | | |_ |\___ \ | '_ \ / _` | __/ __| '_ \ | |/ _ \| '_ \/ __|
+# | | \ \ |_| | | | | | |\ | |__| |____) || |_) | (_| | || (__| | | | | | (_) | |_) \__ \
+# |_| \_\__,_|_| |_| |_| \_|\_____|_____/ |_.__/ \__,_|\__\___|_| |_| | |\___/|_.__/|___/
+# ______ ______ _/ |
+# |______| |______|__/
+########## Run NGS_batch_jobs for each samples http://patorjk.com/software/taag
+################################################################################################
+
+
+my %execution_submitted = (); # number of submitted jobs (qsub) or threads (local sh)
+my $sleep_time = $sleep_time_min;
+#### Main scheduling loop: each pass re-counts resources in use, refreshes job
+#### status from the file system / queue, promotes jobs whose inputs are ready,
+#### and submits ready jobs via local sh, qsub-pe or qsub. Exits when all
+#### (job, sample) pairs are completed.
+while(1) {
+ my $flag_job_done = 1;
+
+ ########## reset execution_submitted to 0
+ foreach $i (keys %NGS_executions) { $execution_submitted{$i} = 0; }
+
+ my $flag_qstat_xml_call = 0;
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+ my $exe_type = $t_execution->{type};
+ $flag_qstat_xml_call = 1 if (($queue_system eq "SGE") and (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")));
+ }
+ SGE_qstat_xml_query() if $flag_qstat_xml_call;
+
+ ########## check and update job status for submitted jobs
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+
+ next if ($status eq "completed");
+ ########## check file system to update job status
+ ########## in case this is a restart run
+ check_submitted_job($t_job_id, $t_sample_id);
+ next if ($t_sample_job->{'status'} eq "completed");
+ $flag_job_done = 0;
+ }
+ }
+
+ if ($flag_job_done) { write_log("job completed!"); last; }
+
+ ########## check and update job status based on dependance
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+
+ next unless ($status eq "wait");
+ my @t_infiles = @{ $t_sample_job->{'infiles'} };
+ my @t_injobs = @{ $t_sample_job->{'injobs'} };
+ my $t_ready_flag = 1;
+
+ foreach $i (@t_infiles) {
+ next if (-s $i); #### non-zero size file
+ $t_ready_flag = 0;
+ last;
+ }
+
+ foreach $i (@t_injobs) {
+ next if ( $job_list{$i}{$t_sample_id}->{'status'} eq "completed"); #### injob completed
+ $t_ready_flag = 0;
+ last;
+ }
+ if ($t_ready_flag) {
+ $t_sample_job->{"status"} = "ready";
+ write_log("$t_job_id,$t_sample_id: change status to ready");
+ }
+ }
+ }
+
+ ########## submit local sh jobs
+ my $has_submitted_some_jobs = 0;
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+ my $t_execution_id = $t_job->{"execution"};
+
+ next unless ($t_execution->{'type'} eq "sh");
+ next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"cores_per_node"} ); #### all cores are used
+
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ next unless ($status eq "ready");
+ next if ( ($execution_submitted{$t_execution_id} + $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"}) > $t_execution->{"cores_per_node"} ); #### no enough available cores
+ #### now submitting
+
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ for ($i=0; $i<$t_job->{"no_parallel"}; $i++) {
+ $cmd = `sh $t_sh_file >/dev/null 2>&1 &`;
+ }
+ $cmd = `touch $t_sh_pid`;
+ $t_sample_job->{'status'} = "submitted";
+ write_log("$t_job_id,$t_sample_id: change status to submitted");
+ $execution_submitted{ $t_execution_id } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
+ $has_submitted_some_jobs = 1;
+ }
+ }
+
+ ########## submit qsub-pe jobs, multiple jobs may share same node
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+ my $t_execution_id = $t_job->{"execution"};
+
+ next unless ($t_execution->{'type'} eq "qsub-pe");
+ next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full
+
+ my $t_cores_per_node = $t_execution->{"cores_per_node"};
+ my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
+ my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
+ my $t_nodes_per_job = $t_cores_per_job / $t_cores_per_node;
+
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ next unless ($status eq "ready");
+
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";
+
+ for ($i=0; $i<$t_job->{"no_parallel"}; $i++) {
+ my $t_stderr = "$t_sh_file.$i.stderr";
+ my $t_stdout = "$t_sh_file.$i.stdout";
+ $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_file 2>$log_fileq`;
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+ print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
+ $execution_submitted{$t_execution_id} += $t_nodes_per_job;
+ #### (fix) log the per-sample sh file; $t_sh_bundle is not defined in this loop
+ write_log("$t_sh_file submitted for sample $t_sample_id, qsubid $cmd");
+ }
+
+ close(TID);
+ $has_submitted_some_jobs = 1;
+ $t_sample_job->{'status'} = "submitted";
+ }
+ } ########## END foreach $t_job_id (keys %NGS_batch_jobs)
+
+ ########## submit qsub jobs
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+ my $t_execution_id = $t_job->{"execution"};
+
+ next unless ($t_execution->{'type'} eq "qsub");
+ next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full
+
+ my $t_cores_per_node = $t_execution->{"cores_per_node"};
+ my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
+ my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
+ my $t_nodes_per_job = POSIX::ceil($t_cores_per_job / $t_cores_per_node);
+ my $t_cmds_per_node = int($t_cores_per_node / $t_cores_per_cmd);
+ my $t_jobs_per_node = int($t_cores_per_node / $t_cores_per_job);
+
+ ########## 1. this loop process jobs need 1 or more nodes per sample, ie. bundle within a sample, e.g. blast against refseq
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ next unless ($status eq "ready");
+ next unless ($t_jobs_per_node <= 1); #### unless need >= 1 node, including jobs use between (51%-100%) cores per node
+ last if ( ($execution_submitted{$t_execution_id} + $t_nodes_per_job) > $t_execution->{"number_nodes"}); #### no enough available queues
+
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.$t_sample_id.$$.sh";
+ my $t_stderr = "$t_sh_bundle.stderr";
+ my $t_stdout = "$t_sh_bundle.stdout";
+ my $t_sh_pid = "$t_sh_file.pids";
+
+ open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";
+ open(BSH, "> $t_sh_bundle") || die "can not write to $t_sh_bundle";
+ print BSH <<EOD;
+$t_execution->{"template"}
+cd $pwd
+EOD
+ for ($i=0; $i<$t_cmds_per_node; $i++) {
+ print BSH "sh $t_sh_file &\n";
+ print BSH "sleep 3\n";
+ }
+ print BSH "wait\n";
+ close(BSH);
+
+ for ($i=0; $i<$t_nodes_per_job; $i++) {
+ $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+ print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
+ $execution_submitted{$t_execution_id}++;
+ write_log("$t_sh_bundle submitted for sample $t_sample_id, qsubid $cmd");
+ }
+ close(TID);
+ $has_submitted_some_jobs = 1;
+ $t_sample_job->{'status'} = "submitted";
+ } ########## END foreach $t_sample_id (@NGS_samples)
+
+
+ ########## 2. this loop process jobs need less than 1 node per sample, ie. bundle jobs across samples, e.g. qc
+ my @t_bundle = ();
+ my $available_nodes = $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id};
+ my $no_sample_can_be_processed = $available_nodes * $t_jobs_per_node;
+ my @t_samples = ();
+ my $t_batch_no = 0;
+
+ foreach $t_sample_id (@NGS_samples) { #### same loop as next, to find out @t_samples and last sample can run
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ next unless ($status eq "ready");
+ next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
+ last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
+ push(@t_samples, $t_sample_id);
+ }
+ my $last_sample_can_run = $t_samples[-1];
+ @t_samples = ();
+
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ next unless ($status eq "ready");
+ next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
+ last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
+ push(@t_samples, $t_sample_id);
+
+ #### bundle @t_samples to one qsub job
+ if ((($#t_samples+1) == $t_jobs_per_node) or ($t_sample_id eq $last_sample_can_run)) {
+ my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.samples-$t_batch_no.$$.sh";
+ my $t_stderr = "$t_sh_bundle.stderr";
+ my $t_stdout = "$t_sh_bundle.stdout";
+
+ open(BSH, "> $t_sh_bundle") || die "can not write to $t_sh_bundle";
+ print BSH <<EOD;
+$t_execution->{"template"}
+cd $pwd
+EOD
+ foreach $i (@t_samples) {
+ my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
+ for ($j=0; $j<$t_job->{"no_parallel"}; $j++) {
+ print BSH "sh $t_sh_file &\n";
+ print BSH "sleep 3\n";
+ }
+ }
+ print BSH "wait\n";
+ close(BSH);
+
+ $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
+ my $qsub_id = 0;
+ if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
+
+ foreach $i (@t_samples) {
+ my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";
+ print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
+ write_log("$t_sh_bundle submitted for sample $i, qsubid $cmd");
+ close(TID);
+ $job_list{$t_job_id}{$i}->{'status'} = "submitted";
+ }
+
+ $has_submitted_some_jobs = 1;
+ $execution_submitted{$t_execution_id}++;
+ @t_samples = (); #### clear
+ $t_batch_no++;
+ }
+ } ########## END foreach $t_sample_id (@NGS_samples)
+ } ########## END foreach $t_job_id (keys %NGS_batch_jobs)
+
+
+ #### if has submitted some jobs, reset waiting time, otherwise double waiting time
+ print_job_status_summary();
+ if ($has_submitted_some_jobs) {
+ $sleep_time = $sleep_time_min;
+ }
+ else {
+ $sleep_time = $sleep_time*2;
+ $sleep_time = $sleep_time_max if ($sleep_time > $sleep_time_max);
+ }
+ write_log("sleep $sleep_time seconds");
+ sleep($sleep_time);
+} ########## END while(1)
+
+#### final CPU/wall-time accounting once all jobs have completed
+task_log_cpu();
+################################################################################
+########## END Run NGS_batch_jobs for each samples
+################################################################################
+
+close(LOG);
+##########
+
+
+sub write_log {
+ #### Write each argument as a timestamped line to both the LOG filehandle
+ #### (opened at startup) and STDERR, followed by a blank separator line.
+ my @txt = @_;
+ my $i;
+ my $date = `date`; chop($date); # shell `date` output, trailing newline removed
+ foreach $i (@txt) {
+ print LOG "$date $i\n";
+ print STDERR "$date $i\n";
+ }
+ print LOG "\n";
+ print STDERR "\n";
+}
+########## END write_log
+
+sub SGE_qstat_xml_query {
+ #### Run `qstat -f -xml` once and rebuild the global %qstat_xml_data as
+ #### qsub_id => [job_name, state], parsed line-by-line with regexes
+ #### (no XML parser dependency).
+ my ($i, $j, $k, $cmd, $ll);
+ %qstat_xml_data = (); #### global
+ $cmd = `qstat -f -xml`;
+ if ($cmd =~ /<queue_info/) { #### dummy
+ # placeholder entry so callers can test %qstat_xml_data for truth even
+ # when the queue is empty but qstat itself answered
+ $qstat_xml_data{"NULL"}= ["NULL","NULL"];
+ }
+
+ my @lls = split(/\n/, $cmd);
+ $i = 2; #### skip first 2 lines
+ for (; $i<$#lls+1; $i++) {
+ if ($lls[$i] =~ /<job_list/) {
+ my ($id, $name, $state);
+ # consume lines until the closing tag, picking out the three fields
+ for (; $i<$#lls+1; $i++) {
+ last if ($lls[$i] =~ /<\/job_list/);
+ if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
+ if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
+ if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
+ }
+ if (defined($id) and defined($name) and defined($state)) {
+ $qstat_xml_data{$id} = [$name, $state];
+ }
+ }
+ }
+}
+
+########## check submitted job by checking pids, or qsub ids
+########## update job status from wait|ready -> submitted if pid file exit (in case of restart of this script)
+########## update job status from wait|ready|submitted -> completed if sh calls or qsub calls finished
+########## these pids or qsub ids are done
+sub check_submitted_job {
+ #### Re-examine one (job, sample) pair against the file system and queue and
+ #### update its entry in the global %job_list:
+ ####   wait|ready -> submitted  when a .pids file exists (handles restarts)
+ ####   submitted  -> running    (SGE xml path only, state "r")
+ ####   -> completed | error     once all processes/qsub ids are gone, based
+ ####                            on validate_job_files()
+ #### Side effect: re-accumulates %execution_submitted for still-running jobs.
+ my ($t_job_id, $t_sample_id) = @_;
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+
+ my ($i, $j, $k, $flag, $ll, $cmd);
+
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+
+ # status won't change unless there is a pid file
+ return unless (-e $t_sh_pid);
+
+ my $status = $t_sample_job->{'status'};
+ if (($status eq "wait") or ($status eq "ready")) {
+ $t_sample_job->{'status'} = "submitted";
+ write_log("$t_job_id,$t_sample_id: change status to submitted");
+ }
+
+ my $exe_type = $t_execution->{type};
+
+ if ($exe_type eq "sh") {
+ # local execution: a process whose command line mentions the sh file
+ # means the job is still running
+ $cmd = `ps -ef | grep "$t_sh_file" | grep -v grep`;
+ if ($cmd =~ /\w/) { # still running
+ $execution_submitted{ $t_job->{"execution"} } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
+ }
+ elsif (validate_job_files($t_job_id, $t_sample_id)) {
+ $t_sample_job->{'status'} = "completed";
+ write_log("$t_job_id,$t_sample_id: change status to completed");
+ }
+ else {
+ $t_sample_job->{'status'} = "error";
+ write_log("$t_job_id,$t_sample_id: change status to error");
+ }
+ return;
+ }
+ elsif (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")) {
+ # queue execution: the .pids file holds one qsub id per line
+ my @pids = ();
+ open(CHECK, $t_sh_pid) || die "Can not open $t_sh_pid\n";
+ while($ll = <CHECK>) {
+ chop($ll); next unless ($ll =~ /\w/);
+ push(@pids, $ll);
+ }
+ close(CHECK);
+
+ my $finish_flag = 1;
+ foreach $i (@pids) {
+ if (($queue_system eq "SGE") and %qstat_xml_data) {
+ # fast path: use the cached qstat -xml snapshot
+ if (defined($qstat_xml_data{$i})) {
+ $t_sample_job->{'status'} = "running" if (($qstat_xml_data{$i}->[1] eq "r") and ($t_sample_job->{'status'} eq "submitted"));
+ $finish_flag = 0;
+ $execution_submitted{ $t_job->{"execution"} } ++;
+ }
+ }
+ elsif ($queue_system eq "SGE") {
+ # slow path: one qstat call per id
+ $cmd = `qstat -j $i | grep job_number`;
+ if ($cmd =~ /$i/) {
+ $finish_flag = 0;
+ $execution_submitted{ $t_job->{"execution"} } ++;
+ }
+ }
+ else {
+ # PBS-style qstat; first run of digits on the matching line is the id
+ $cmd = `qstat -r $i | grep $i`;
+ $j = (split(/\D/,$cmd))[0];
+ if ($j == $i) { # this job is running
+ $finish_flag = 0;
+ $execution_submitted{ $t_job->{"execution"} } ++;
+ }
+ }
+ }
+ if ($finish_flag == 1) {
+ if (validate_job_files($t_job_id, $t_sample_id)) {
+ $t_sample_job->{'status'} = "completed";
+ write_log("$t_job_id,$t_sample_id: change status to completed");
+ }
+ else {
+ $t_sample_job->{'status'} = "error";
+ write_log("$t_job_id,$t_sample_id: change status to error");
+ }
+ }
+ return;
+ }
+ else {
+ die "unknown execution type: $exe_type\n";
+ }
+}
+########## END sub check_submitted_job
+
+
+# WF.start.date and WF.complete.date need to have non-zero size
+# WF.start.date and WF.complete.date need to have non-zero size
+sub validate_job_files {
+ #### Return 1 if the start/complete/cpu marker files recorded for this
+ #### (job, sample) pair all exist with non-zero size, else 0.
+ my ($t_job_id, $t_sample_id) = @_;
+ my ($i, $j, $k);
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+
+ return 0 unless (-s $t_sample_job->{'start_file'} );
+ return 0 unless (-s $t_sample_job->{'complete_file'} );
+ return 0 unless (-s $t_sample_job->{'cpu_file'} );
+
+ return 1; #### pass
+}
+########## END validate_job_files
+
+
+sub print_job_status_summary {
+ #### Print a one-line tally of (job, sample) statuses to STDERR,
+ #### e.g. "total jobs: 12,completed: 4,ready: 2,wait: 6,".
+ #### Honors the -j subset selection via $subset_flag / %subset_jobs.
+ my ($t_job_id, $t_sample_id);
+ my ($i, $j, $k);
+
+ my %job_status = ();
+ my $job_total = 0;
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ foreach $t_sample_id (@NGS_samples) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ $job_status{$status}++;
+ $job_total++;
+ }
+ }
+
+ print STDERR "total jobs: $job_total,";
+ foreach $i (sort keys %job_status) {
+ print STDERR "$i: $job_status{$i},";
+ }
+ print STDERR "\n";
+}
+########## END print_job_status_summary
+
+
+sub validate_cmd_line {
+ #### Best-effort sanity check of a generated command block: warn (to STDERR)
+ #### about any path-like token (contains "/") that does not exist on disk.
+ #### Purely advisory -- nothing is modified or aborted.
+ my ($i, $j, $k);
+ my ($t_command, $t_sh_file, $t_sample_id) = @_;
+ my @cmds = split(/\n/,$t_command);
+
+ my @warn_path = ();
+ foreach $i (@cmds) {
+ my ($key_cmd, @opts) = split(/\s+/, $i);
+ # first token: the program itself, checked only if given as a path
+ if ($key_cmd =~ /\//) {
+ if (not -e $key_cmd) { push(@warn_path, $key_cmd); }
+ }
+ @opts = grep {/\//} @opts;
+ foreach $j (@opts) {
+ # arguments may be lists/redirections; split on , ; > < | and check each
+ my @opts1 = split(/,|;|>|<|\|/,$j);
+ foreach $k (@opts1) {
+ # relative paths are resolved under the sample directory
+ $k = "$t_sample_id/$k" unless (($k =~ /^\//) or ($k =~ /^\./));
+ if (not -e $k) { push(@warn_path, $k); }
+ }
+ }
+ }
+
+ if (@warn_path) {
+ print STDERR "File or program doesn't exist in $t_sh_file: ", join(" ", @warn_path), "\n";
+ }
+
+}
+########## END validate_cmd_line
+
+sub add_subset_jobs_by_dependency {
+ #### Expand the global %subset_jobs to its transitive closure over the
+ #### "injobs" dependency edges: keep adding prerequisite jobs until a full
+ #### pass adds nothing new (fixed point).
+ my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
+
+ while(1) {
+ my $num_subset_jobs = scalar keys %subset_jobs;
+
+ foreach $t_job_id (keys %subset_jobs) {
+ $t_job = $NGS_batch_jobs{$t_job_id};
+ my @t_injobs = @{$t_job->{"injobs"}};
+
+ for $j (@t_injobs) {
+ $subset_jobs{$j} = 1;
+ }
+ }
+
+ # stop when the set stopped growing
+ last if ($num_subset_jobs == scalar keys %subset_jobs);
+ }
+}
+########## END add_subset_jobs_by_dependency
+
+
+sub task_level_jobs {
+ #### Assign each job a "job_level": 1 for jobs with no injobs, otherwise
+ #### 1 + max level of its injobs. Iterates to a fixed point rather than
+ #### doing an explicit topological sort; stores the result back into
+ #### $NGS_batch_jobs{...}->{"job_level"}.
+ my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
+ my %job_level = ();
+
+ while(1) {
+ my $change_flag = 0;
+
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ $t_job = $NGS_batch_jobs{$t_job_id};
+ my @t_injobs = @{$t_job->{"injobs"}};
+
+ if (@t_injobs) {
+ # NOTE(review): $max_level_injob starts undef and is compared with ">"
+ # (undef behaves as 0 and warns under "use warnings") -- works here
+ # because levels are >= 1, but worth confirming.
+ my $max_level_injob;
+ foreach $j (@t_injobs) {
+ next unless defined ($job_level{$j});
+ $max_level_injob = $job_level{$j} if ($job_level{$j} > $max_level_injob);
+ }
+
+ # skip until at least one prerequisite has a level assigned
+ next unless (defined($max_level_injob));
+ $max_level_injob++; #### one more level
+ if (not defined ($job_level{$t_job_id})) {
+ $job_level{$t_job_id}=$max_level_injob;
+ $change_flag = 1;
+ }
+ elsif ($max_level_injob > $job_level{$t_job_id}) {
+ $job_level{$t_job_id}=$max_level_injob;
+ $change_flag = 1;
+ }
+ }
+ else {
+ if (not defined ($job_level{$t_job_id})) {
+ $job_level{$t_job_id}=1;
+ $change_flag = 1;
+ }
+ }
+ }
+ last unless ($change_flag);
+ }
+
+ foreach $t_job_id (sort keys %NGS_batch_jobs) {
+ $NGS_batch_jobs{$t_job_id}->{"job_level"} = $job_level{$t_job_id};
+ }
+}
+########## END task_list_jobs
+
+sub task_snapshot {
+ #### Print a text matrix of job status per sample: one row per sample, one
+ #### column per job, with job names printed vertically above the grid.
+ #### When invoked as a live task (-J snapshot), first refreshes each
+ #### (job, sample) status via check_submitted_job().
+ #### NOTE(review): iterates @NGS_batch_jobs (an array) while status lookups
+ #### use %NGS_batch_jobs (a hash); assumes a parallel ordered array is
+ #### defined earlier in the script -- confirm.
+ my ($t_job_id, $t_sample_id);
+ my ($i, $j, $k);
+
+ if ($this_task) {
+ my $flag_qstat_xml_call = 0;
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_execution = $NGS_executions{ $t_job->{"execution"} };
+ my $exe_type = $t_execution->{type};
+ $flag_qstat_xml_call = 1 if (($queue_system eq "SGE") and (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")));
+ }
+ SGE_qstat_xml_query() if $flag_qstat_xml_call;
+
+ foreach $t_sample_id (@NGS_samples) {
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ check_submitted_job($t_job_id, $t_sample_id);
+ }
+ }
+ }
+
+ # column widths for the matrix layout
+ my $max_len_sample = 0;
+ foreach $t_sample_id (@NGS_samples) {
+ $max_len_sample = length($t_sample_id) if (length($t_sample_id) > $max_len_sample);
+ }
+ my $max_len_job = 0;
+ foreach $t_job_id (@NGS_batch_jobs) {
+ $max_len_job = length($t_job_id) if (length($t_job_id) > $max_len_job);
+ }
+
+ print <<EOD;
+Job status:
+.\twait
+-\tsubmitted
+r\trunning
++\tcompleted
+!\terror
+EOD
+
+ # print job names vertically, bottom-aligned, one character per row
+ for ($i=$max_len_job-1; $i>=0; $i--) {
+ print ' 'x$max_len_sample, "\t";
+ foreach $t_job_id (@NGS_batch_jobs) {
+ print " ", ($i<length($t_job_id) ? substr(reverse($t_job_id), $i, 1):" ");
+ }
+ print "\n";
+ }
+
+ foreach $t_sample_id (@NGS_samples) {
+ print "$t_sample_id\t";
+ foreach $t_job_id (@NGS_batch_jobs) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $status = $t_sample_job->{'status'};
+ if ($status eq "completed") { print " +";}
+ elsif ($status eq "submitted") { print " -";}
+ elsif ($status eq "running" ) { print " r";}
+ elsif ($status eq "wait" ) { print " .";}
+ elsif ($status eq "error" ) { print " !";}
+ else { print " _";}
+ }
+ print "\n";
+ }
+}
+########## END task_snapshot
+
+sub task_list_jobs {
+ #### List every job with its dependencies and computed job_level
+ #### (see task_level_jobs), one line per job, to STDOUT.
+ my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
+ foreach $t_job_id (@NGS_batch_jobs) {
+ $t_job = $NGS_batch_jobs{$t_job_id};
+ #my @t_infiles = @{$t_job->{"infiles"}};
+ my @t_injobs = @{$t_job->{"injobs"}};
+
+ #print "\tInput_files:", join(",", @t_infiles) if @t_infiles;
+ print "$t_job_id\tIn_jobs:[" , join(",", @t_injobs), "]\tJob_level:$t_job->{'job_level'}\n";
+ }
+}
+########## END task_list_jobs
+
+sub file1_after_file2 {
+ #### Return 1 if $file1's mtime is strictly later than $file2's, else 0.
+ my ($file1, $file2) = @_;
+
+ # if not exist file1, assume it is in future, so it is newer
+ # NOTE(review): the comment above says a missing file1 counts as newer, but
+ # the code returns 0 ("not after") when either file is missing -- confirm
+ # which behavior is intended before relying on this for missing files.
+ if (not -e ($file1)) {return 0;}
+ if (not -e ($file2)) {return 0;}
+
+ my $mtime1 = (stat($file1))[9]; # element 9 of stat = mtime
+ my $mtime2 = (stat($file2))[9];
+
+ return ( ($mtime1 > $mtime2) ? 1 : 0);
+}
+######## END file1_after_file2
+
+sub file1_same_or_after_file2 {
+ #### Return 1 if $file1's mtime is the same as or later than $file2's,
+ #### else 0; returns 0 if either file is missing (see file1_after_file2).
+ my ($file1, $file2) = @_;
+
+ # if not exist file1, assume it is in future, so it is newer
+ if (not -e ($file1)) {return 0;}
+ if (not -e ($file2)) {return 0;}
+
+ my $mtime1 = (stat($file1))[9]; # element 9 of stat = mtime
+ my $mtime2 = (stat($file2))[9];
+
+ return ( ($mtime1 >= $mtime2) ? 1 : 0);
+}
+######## END file1_after_file2
+
+
+sub task_delete_jobs {
+ #### Generate (but do not execute) a shell script NGS-<pid>.sh that removes
+ #### the output directories, pid files and stdout/stderr of the selected
+ #### jobs and qdels their qsub ids. $opt (from -Z) is either
+ #### "jobids:job1,job2" or "run_after:filename".
+ my $opt = shift;
+ my ($i, $j, $k, $ll, $t_job_id, $t_sample_id);
+ my ($mode, $c) = split(/:/, $opt);
+ my $tmp_sh = "NGS-$$.sh";
+
+ open(TMPSH, "> $tmp_sh") || die "can not write to file $tmp_sh";
+ print TMPSH "#Please execute the following commands\n";
+ foreach $t_sample_id (@NGS_samples) {
+ my %job_to_delete_ids = ();
+ if ($mode eq "jobids") {
+ %job_to_delete_ids = map {$_, 1} split(/,/,$c);
+ }
+ elsif ($mode eq "run_after") {
+ # select jobs whose pid file is at least as new as the reference file $c
+ die "file $c doesn't exist!" unless (-e $c);
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ next unless (-e $t_sh_pid); #### unless the job is submitted
+ #$job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sample_job->{'start_file'} , $c));
+ $job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sh_pid , $c));
+
+ }
+ }
+ else {
+ die "unknown option for deleting jobs: $opt";
+ }
+
+ # now %job_to_delete_ids are jobs need to be deleted
+ # next find all jobs that depend on them, recursively
+ my $no_jobs_to_delete = scalar keys %job_to_delete_ids;
+ while(1) {
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ next unless (-e $t_sh_pid); #### unless the job is submitted
+ my @t_injobs = @{ $t_sample_job->{'injobs'} };
+ foreach my $t_job_id_2 (@t_injobs) {
+ $job_to_delete_ids{$t_job_id} = 1 if ($job_to_delete_ids{$t_job_id_2});
+ }
+ }
+ last if ($no_jobs_to_delete == (scalar keys %job_to_delete_ids)); #### no more depending jobs
+ $no_jobs_to_delete = scalar keys %job_to_delete_ids;
+ }
+
+ if ($no_jobs_to_delete) {
+ print TMPSH "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
+ print "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
+ foreach $t_job_id (keys %job_to_delete_ids) {
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $t_sh_file = $t_sample_job->{'sh_file'};
+ my $t_sh_pid = "$t_sh_file.pids";
+ print TMPSH "\\rm -rf $pwd/$t_sample_id/$t_job_id\n";
+ print TMPSH "\\rm $t_sh_pid\n";
+ print TMPSH "\\rm $t_sh_file.*.std*\n";
+
+ #### find the qsub ids to be deleted
+ my $qids = `cat $t_sh_pid`; $qids =~ s/\n/ /g; $qids =~ s/\s+/ /g;
+ print TMPSH "qdel $qids\n";
+ }
+ }
+ }
+ close(TMPSH);
+ # (fix) user-facing message was garbled English
+ print "This script does not delete the files; please run $tmp_sh to delete them!!!\n\n";
+}
+########## END task_list_jobs
+
+sub task_log_cpu {
+ #### Write a per-sample WF.cpu summary: for each job, the core count, wall
+ #### time (from the WF.start.date / WF.complete.date stamps) and parsed CPU
+ #### time, plus a "total" line spanning earliest start to latest end.
+ #### (fix) removed a dead loop that populated %cpu_info with [$t_wall,$t_cpu]
+ #### before either variable was computed; %cpu_info was never read.
+ my ($i, $j, $k, $ll, $t_job_id, $t_sample_id);
+
+ foreach $t_sample_id (@NGS_samples) {
+ my $f_cpu = "$pwd/$t_sample_id/WF.cpu";
+ open(CPUOUT, "> $f_cpu") || die "Can not open $f_cpu";
+ print CPUOUT "#job_name\tCores\tWall(s)\tWall_time\tCPU(s)\tCPU_time\n";
+ my $min_start = 1402092131 * 999999; # arbitrary far-future sentinel
+ my $max_end = 0;
+ my $sum_cpu = 0;
+ foreach $t_job_id (keys %NGS_batch_jobs) {
+ if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
+ my $t_job = $NGS_batch_jobs{$t_job_id};
+ my $t_core = $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
+
+ my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
+ my $f_start = $t_sample_job->{'start_file'};
+ my $f_complete = $t_sample_job->{'complete_file'};
+ my $f_cpu = $t_sample_job->{'cpu_file'};
+ my $t_start = `cat $f_start`; $t_start =~ s/\s//g; $min_start = $t_start if ($t_start < $min_start);
+ my $t_end = `cat $f_complete`; $t_end =~ s/\s//g; $max_end = $t_end if ($t_end > $max_end);
+ my $t_wall = int($t_end - $t_start);
+ $t_wall = 0 unless ($t_wall>0);
+
+ #### NOTE(review): this parser expects `times`-style lines like "12m34.5s",
+ #### but the per-job WF.cpu files are written as "sample=... time_spent=..."
+ #### key=value lines, so $t_cpu likely stays 0 -- confirm intended format.
+ my $t_cpu = 0;
+ if (open(TCPU, $f_cpu)) {
+ while($ll = <TCPU>) {
+ chop($ll);
+ if ($ll =~ /^(\d+)m(\d+)/) {
+ $t_cpu += $1 * 60;
+ }
+ }
+ close(TCPU);
+ }
+ $sum_cpu += $t_cpu;
+
+ my $t_walls = time_str1($t_wall);
+ my $t_cpus = time_str1($t_cpu);
+ print CPUOUT "$t_job_id\t$t_core\t$t_wall\t$t_walls\t$t_cpu\t$t_cpus\n";
+ }
+ my $t_wall = ($max_end - $min_start); $t_wall = 0 unless ($t_wall>0);
+ my $t_walls = time_str1($t_wall);
+ my $sum_cpus= time_str1($sum_cpu);
+ print CPUOUT "total\t-\t$t_wall\t$t_walls\t$sum_cpu\t$sum_cpus\n";
+ close(CPUOUT);
+ }
+}
+######### END task_log_cpu
+
+sub time_str1 {
+ #### Format a duration in seconds as "<h>h<m>m<s>s", e.g. 3725 -> "1h2m5s".
+ my $s = shift;
+ my $str = "";
+
+ $str .= int($s/3600); $str .= "h"; $s = $s % 3600;
+ $str .= int($s/60); $str .= "m"; $s = $s % 60;
+ $str .= $s; $str .= "s";
+
+ return $str;
+}
+########## END time_str1;
+
+
+
+
+
+
+sub usage {
+ #### Return the usage/help text (the heredoc is the sub's last-evaluated
+ #### expression). The text is user-facing output -- do not edit casually.
+<<EOD;
+
+# =============================== NG-Omics-WF ==================================
+# _ _ _____ ____ _ __ ________
+# | \\ | |/ ____| / __ \\ (_) \\ \\ / / ____|
+# | \\| | | __ ______| | | |_ __ ___ _ ___ ___ _____\\ \\ /\\ / /| |__
+# | . ` | | |_ |______| | | | '_ ` _ \\| |/ __/ __|______\\ \\/ \\/ / | __|
+# | |\\ | |__| | | |__| | | | | | | | (__\\__ \\ \\ /\\ / | |
+# |_| \\_|\\_____| \\____/|_| |_| |_|_|\\___|___/ \\/ \\/ |_|
+#
+# =========================== Next Generation Omics data workflow tools ========
+
+To run workflow:
+ $0 -s sample_file -i workflow_file
+
+Options:
+
+ -i workflow configration file, required
+
+ -s sample data file, required unless -S is present
+ File format example
+#Sample data file example, TAB or space delimited for following lines
+Sample_ID1 sample_data_0 sample_data_1
+Sample_ID2 sample_data_0 sample_data_1
+Sample_ID3 sample_data_0 sample_data_1
+
+ -S sample data from command line, required unless -s is present
+ format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
+
+ -j run sub sets of jobs, optional, the workflow will run all jobs by default
+ e.g. -j qc or -j qc,fastqc
+
+ -t parameter file, optional, replace default paramters in workflow configration file
+ File format example
+#parameter file example, TAB or space delimited for following lines
+CMDOPT JobID_A:opt0:opt1:opt2
+CMDOPT JobID_B:opt0:opt1
+
+ -T parameter from command line
+ format: JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
+
+ -r root directory of NGS-tools
+
+ -J optional tasks
+ write-sh: write sh files and quite
+ log-cpu: gathering cpu time for each run for each sample
+ list-jobs: list jobs
+ snapshot: snapshot current job status
+ delete-jobs: delete jobs, must supply jobs delete syntax by option -Z
+ e.g. -J delete-jobs -Z jobids:assembly,blast ---delete assembly,blast and all jobs depends on them
+ -J delete-jobs -Z run_after:filename ---delete jobs that has start time (WF.start.date) after this file, and all depending jobs
+
+ -Z secondary parameter used by other options, such as -J
+
+ -Q queue system, default SGE
+ can be PBS, SGE
+
+Question and comments:
+ http://weizhongli-lab.org/ngomicswf liwz\@sdsc.edu
+
+EOD
+}
+
+
+
+############################################################################################
+# _______ ________ _________ ___________________ ________ .____ _________
+# \ \ / _____/ / _____/ \__ ___/\_____ \ \_____ \ | | / _____/
+# / | \/ \ ___ \_____ \ ______ | | / | \ / | \| | \_____ \
+#/ | \ \_\ \/ \ /_____/ | | / | \/ | \ |___ / \
+#\____|__ /\______ /_______ / |____| \_______ /\_______ /_______ \/_______ /
+# \/ \/ \/ \/ \/ \/ \/
+############################################################################################
+
diff --git a/usecases/Miseq-16S/README b/usecases/Miseq-16S/README
new file mode 100644
index 0000000..843e6b1
--- /dev/null
+++ b/usecases/Miseq-16S/README
@@ -0,0 +1,200 @@
+CD-HIT usecases: CD-HIT-OTU-MiSeq (http://cd-hit.org)
+
+Please also check https://github.com/weizhongli/cdhit/wiki,
+which offers most up-to-date documents.
+
+
+================================================================================================
+ Introduction of CD-HIT-OTU-MiSeq
+================================================================================================
+This use case is developed for clustering 16S rDNA sequences sequenced with MiSeq
+platform into OTUs for microbiome studies.
+In recent years, Illumina MiSeq sequencers became dominant in 16S rDNA sequencing. The
+Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately
+assembled because of the poor quality at the 3’ ends of both PE reads in the overlapping region.
+This causes many sequences to be discarded in the analysis. CD-HIT-OTU-MiSeq has unique
+features to cluster MiSeq 16S sequences.
+
+ * The package can cluster PE reads without joining them into contigs.
+ * Users can choose a high quality portion of the PE reads for analysis
+ (e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
+ * We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length
+ 16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE
+ reference database together with samples, so we can derive Operational Taxonomic Units (OTUs)
+ and annotate these OTUs concurrently.
+ * Chimeric sequences are effectively identified through a de novo approach.
+
+The most important unique feature of CD-HIT-OTU-MiSeq is to only use high quality region at
+the 5’ ends of R1 and R2 reads. For example, the effective clustering read length can be 200 bases
+for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with
+spliced PE sequences from the reference database to derive OTUs (Figure).
+
+
+================================================================================================
+ Installation
+================================================================================================
+1. Install CD-HIT package
+ * download current CD-HIT at https://github.com/weizhongli/cdhit/releases,
+ for example cd-hit-v4.6.2-2015-0511.tar.gz
+ * unpack the file with “tar xvf cd-hit-v4.6.2-2015-0511.tar.gz –gunzip”
+ * change dir by “cd cd-hit-v4.6.2-2015-0511”
+ * compile the programs by “make” with multi-threading (default),
+ or by “make openmp=no” without multi-threading (on old systems without OpenMP)
+ * cd cd-hit-auxtools
+ * compile cd-hit-auxtools by “make”
+ * CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
+
+
+2. Install Trimmomatic
+CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from
+http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/trimmomatic.
+We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
+
+
+3. Modify NG-Omics-Miseq-16S.pl
+Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
+ $CD_HIT_dir = "PATH_to_cd-hit";
+ $NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
+
+4. Download reference dataset
+Reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
+The reference database Greengene-13-5-99.fasta.gz was re-formatted from original Greengene database,
+so that sequences with more specific annotations are at the beginning of the file. Please gunzip after
+download.
+
+You can also download Greengene directly. You should download Greengene from
+http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
+Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file.
+You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta.
+
+There is a script: usecases/Miseq-16S/greengene-ann1.pl, please run this script to re-format greengene:
+ PATH_to_cd-hit/usecases/Miseq-16S/greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
+
+5. Download sample datasets
+Sample datasets can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
+The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
+
+
+================================================================================================
+ Usage of CD-HIT-OTU-MiSeq
+================================================================================================
+1. Prepare fastq files and sample file
+Most projects have multiple samples sequenced at the same variable regions.
+After your samples are sequenced, your sequencing center should give you two paired-end fastq files
+for each sample. Put them in a working directory in a similar way as the testing datasets,
+where the R1.fq and R2.fq are placed in a folder for each sample. The folder name is the sample name.
+So in the working directory, you should have files:
+
+sample_name_1/R1.fq
+sample_name_1/R2.fq
+sample_name_2/R1.fq
+sample_name_2/R2.fq
+...
+sample_name_N/R1.fq
+sample_name_N/R2.fq
+
+
+2. Prepare sample file
+Next is to prepare a SAMPLE_file, a text file, in the working directory. The file should look like:
+
+sample_name_1 R1.fq R2.fq
+sample_name_2 R1.fq R2.fq
+...
+sample_name_N R1.fq R2.fq
+
+
+3. Prepare reference database
+We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a
+full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva,
+into PE sequences. If there are multiple samples in a project sequenced with the same
+amplicon of same variable region, only one spliced reference database is needed.
+Please run:
+
+ Path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_1/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
+
+ Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file.
+ -p 150 specify the effective clustering read length for R1 to be 150
+ -q 100 specify the effective clustering read length for R2 to be 100
+ -p and -q options need to be consistent with parameters in OTU clustering in step 4
+ see the next section for suggestions on choosing the effective clustering read length
+
+This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
+
+
+4. Run sequence QC and OTU clustering for each sample
+In the working directory, run
+
+ PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+
+ where: 150 and 100 are the effective lengths,
+ see the next section for suggestions on choosing the effective clustering read length
+ 0.97 is the OTU clustering cutoff,
+ 0.0001 is the abundance cutoff,
+ 75 is the length for chimeric checking at each R1 and R2 read
+ PATH_to-gg_13_5-PE99.150-100-R1 and PATH_to-gg_13_5-PE99.150-100-R2 need to be full path
+ e.g. /home/user/myproj/PATH_to-gg_13_5-PE99.150-100-R1
+
+This command will generate shell scripts for QC and for OTU for each sample.
+The scripts will be in WF-sh folder. You can first run all the qc.sample_name.sh and after all
+these jobs finished you then run all otu.sample_name.sh
+
+NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
+tool developed in our group. It is not fully released yet, since we need more time to document
+this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
+First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
+number of CPU cores of your computer, then run
+
+ nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
+
+After the job finished, the OTU results will be in sample_name/otu folder, important files include
+ OTU.clstr: file lists all clusters and sequences
+ chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
+
+
+5. Pool all samples together
+If you have multiple samples, you don't just want to stop here. It is important
+to pool all sample together and re-run OTU clustering so that all samples can be
+compared, run
+
+ PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
+
+This will pool sequences from all samples. We can handle hundreds of samples without problems.
+
+
+6. Cluster pooled samples, run
+
+ PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
+
+This command will generate a script WF-sh/otu-pooled.pooled.sh, you can
+run this sh script. When it is finished, OTUs will be in the pooled directory:
+ OTU.clstr: file lists all clusters and sequences from all samples in CD-HIT format
+ OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
+ chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
+
+
+================================================================================================
+ Choose effective clustering read length
+================================================================================================
+The key of this method is to use the high quality portion of reads from both R1 and R2, so how
+to choose effective clustering read length depends on the actual quality of the PE reads. In our
+paper five pairs of effective clustering read lengths (225, 175), (200, 150), (175, 125),
+(150, 100) and (125, 75) were selected for samples sequenced at V34 or V45.
+Two pairs of effective clustering read lengths (150, 100) and (125, 75) were used for
+samples of V4 region. All these settings gave good results.
+
+You can try some different settings and compare the results. Also, programs such as FASTQC
+(http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) can be used to scan the raw reads
+to help choose the effective clustering read length of R1 and R2.
+
+
+
+
+================================================================================================
+ Other topics
+================================================================================================
+
+Questions, comments to the author Weizhong Li, liwz at sdsc.edu
+
+
+
+
diff --git a/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl b/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl
new file mode 100755
index 0000000..e746d60
--- /dev/null
+++ b/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl
@@ -0,0 +1,222 @@
+#!/usr/bin/perl
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+getopts("i:j:o:p:c:s:t:m:e:Z:a:f:d:R:",\%opts);
+die usage() unless ($opts{i} and $opts{o});
+
+my $input = $opts{i};
+my $input2 = $opts{j};
+my $dir = $opts{o};
+my $abs_cutoff = $opts{a}; $abs_cutoff = 0.00005 unless ($abs_cutoff); #5e-5
+my $otu_cutoff = $opts{c}; $otu_cutoff = 0.97 unless ($otu_cutoff);
+my $chimera_f = $opts{m}; $chimera_f = "true" unless ($chimera_f);
+my $debug_mode = $opts{Z};
+my $fast_mode = $opts{f}; #### use cd-hit-dup for stage 1 and 2 clustering
+my $cdhit_opt = $opts{d};
+my $restart_n = $opts{R}; $restart_n = 0 unless (defined($restart_n));
+my $LOGf = "$dir/OTU.log";
+my $cd_hit_dup = "$script_dir/../../cd-hit-auxtools/cd-hit-dup"; die "no $cd_hit_dup" unless (-e $cd_hit_dup);
+my $cd_hit_est = "$script_dir/../../cd-hit-est"; die "no $cd_hit_est" unless (-e $cd_hit_est);
+
+my ($i, $j, $k, $str, $cmd, $ll);
+$cmd = `mkdir -p $dir`;
+open(LOG, "> $LOGf") || die "can not write to $LOGf";
+my $f2 = "$dir/seq";
+
+################################################################################
+#### Stage 0 ----------- clustering at 100% - stage 0
+################################################################################
+my $clstr = "$f2.dup.clstr";
+my $clstr2 = "$f2.dup2.clstr";
+if ($restart_n <= 0) {
+ nice_run("$cd_hit_dup -i $input -i2 $input2 -o $f2.dup -o2 $f2.dup.2 -u 100 -d 0 -m false -f $chimera_f > $f2.dup2.log");
+ nice_run("cat $f2.dup.clstr $f2.dup2.clstr > $f2-stage0.clstr.tmp");
+ nice_run("$script_dir/cd-hit/clstr_sort_by.pl < $f2-stage0.clstr.tmp > $f2-stage0.clstr; rm -f $f2-stage0.clstr.tmp");
+ nice_run("$script_dir/clstr_sort_rep.pl $f2-stage0.clstr $input > $f2-stage0-rep.fa");
+#
+# /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 -f true > otu/seq.dup.log # no work
+# /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 > otu/seq.dup.log
+#
+# what if cd-hit-est
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i qc/R1.fa -j qc/R2.fa -o otu/seq.nr -op otu/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 -cx 100 -cy 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.log
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -o otu/seq.nr.R1 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R1.log
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr.2 -o otu/seq.nr.R2 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R2.log
+
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -j otu/seq.nr.2 -o otu/seq.99 -op otu/seq.99.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.99.log
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.99 -j otu/seq.99.2 -o otu/seq.97 -op otu/seq.97.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.97 -n 10 -G 1 -b 5 -T 1 -M 8000 -d 0 -p 1 > otu/seq.97.log
+# do not sort 99.clstr, always trust cd-hit-dup ordered sequences
+# /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.nr.clstr otu/seq.99.clstr | /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.99-full.clstr
+# /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.99-full.clstr otu/seq.97.clstr | /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.97-full.clstr
+#
+# combine ref
+# /home/oasis/data/etc/git/cdhit/cd-hit-est -i seq.99.wref.R1 -o seq.97.wref.R1only -r 0 -cx 100 -c 0.97 -n 10 -b 5 -T 1 -M 8000 -d 1 -p 1 -G 0 -A 50 -g 1
+#
+}
+if (not $debug_mode) {
+ my $no1 = count_seqs_from_fasta_file($input);
+ my $no_clstr = count_clstrs_from_clstr_file($clstr);
+ my $no_clstr2 = count_clstrs_from_clstr_file($clstr2);
+ print LOG "Number_contigs\t$no1\n";
+ print LOG "Number_unique_contigs\t$no_clstr\n";
+ print LOG "Number_unique_chimaric_contigs\t$no_clstr2\n";
+}
+
+################################################################################
+#### Stage 1 ---------- clustering at 99.25% #### distance 0.75%
+################################################################################
+my $seq_n = `grep -c "^>" $input`; $seq_n =~ s/\D//g;
+my $cutoff = int($seq_n * $abs_cutoff);
+my $c1 = 0.9925;
+if ($restart_n <= 1) {
+ if ($fast_mode) {
+ nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage0-rep.fa -o $f2-stage1 -d 0 -m false -e 3 > $f2-stage1.log");
+ }
+ else {
+ nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage0-rep.fa -o $f2-stage1 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage1.log");
+ }
+ nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage0.clstr $f2-stage1.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage1-all.clstr");
+ nice_run("$script_dir/clstr_sort_rep.pl $f2-stage1-all.clstr $f2-stage1 > $f2-stage1-rep.fa");
+}
+if (not $debug_mode) {
+ $no_clstr = count_clstrs_from_clstr_file("$f2-stage1.clstr");
+ print LOG "Stage1 clustering at $c1\n";
+ print LOG "Number_clusters_stage1\t$no_clstr\n";
+}
+
+################################################################################
+#### Stage 2 ---------- clustering at 98.50% #### distance 1.50%
+################################################################################
+ $c1 = 0.985;
+if ($restart_n <= 2) {
+ if ($fast_mode) {
+ nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage1-rep.fa -o $f2-stage2 -d 0 -m false -e 6 > $f2-stage2.log");
+ }
+ else {
+ nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage1-rep.fa -o $f2-stage2 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage2.log");
+ }
+ nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage1-all.clstr $f2-stage2.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage2-all.clstr");
+ nice_run("$script_dir/clstr_sort_rep.pl $f2-stage2-all.clstr $f2-stage2 > $f2-stage2-rep.fa");
+}
+if (not $debug_mode) {
+ $no_clstr = count_clstrs_from_clstr_file("$f2-stage2.clstr");
+ print LOG "Stage2 clustering at $c1\n";
+ print LOG "Number_clusters_stage2\t$no_clstr\n";
+}
+
+
+################################################################################
+#### Stage pre-3 ---------- filtering
+################################################################################
+
+if ($restart_n <= 3) {
+ nice_run("$script_dir/clstr_select_rep.pl size $cutoff 999999999 < $f2-stage2-all.clstr > $f2-stage2-rep-big.ids");
+ nice_run("$script_dir/fetch_fasta_by_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-big.fa");
+ nice_run("$script_dir/fetch_fasta_exclude_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-small.fa");
+
+ if (-s $clstr2) {
+ nice_run("$script_dir/clstr_select_rep.pl size 1 999999999 < $clstr2 > $dir/chimaric.ids"); ## save chimaric ids
+ nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-big.fa > $f2-stage2-rep-big-good.fa"); ## exclude chimaric reads from $t1-pri-rep.fa
+ nice_run("rm -f $f2-stage2-rep-big.fa");
+
+ nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-small.fa > $f2-stage2-rep-small-good.fa");
+ nice_run("rm -f $f2-stage2-rep-small.fa");
+ }
+ else {
+ nice_run("mv $f2-stage2-rep-big.fa $f2-stage2-rep-big-good.fa");
+ nice_run("mv $f2-stage2-rep-small.fa $f2-stage2-rep-small-good.fa");
+ }
+}
+
+if (not $debug_mode) {
+ print LOG "Min_clstr_size\t$cutoff\n";
+ my $no_seq = count_seqs_from_fasta_file("$f2-stage2-rep-big-good.fa");
+ print LOG "Number_clstrs_above_min_size\t$no_seq\n";
+}
+
+################################################################################
+#### Stage 3 ---------- clustering at 97%
+################################################################################
+ $c1 = $otu_cutoff;
+if ($restart_n <= 3) {
+ nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage2-rep-big-good.fa -o $f2-stage3 -c $c1 -n 8 -l 11 -p 1 -d 0 -g 1 -b 5 $cdhit_opt > $f2-stage3.log");
+ nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage2-all.clstr $f2-stage3.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage3-all.clstr");
+ nice_run("$script_dir/clstr_sort_rep.pl $f2-stage3-all.clstr $f2-stage3 > $f2-stage3-rep.fa");
+ nice_run("mv -f $f2-stage3-all.clstr $dir/OTU.clstr");
+ nice_run("$script_dir/cd-hit-otu-table-faa.pl -i $dir/OTU.clstr -s $f2-stage3-rep.fa -o $dir/OTU-dist.txt -f $dir/OTU.fa");
+}
+
+if (not $debug_mode) {
+ $no_clstr = count_clstrs_from_clstr_file("$dir/OTU.clstr");
+ $no_seq = count_seqs_from_clstr_file("$dir/OTU.clstr");
+ print LOG "OTU clustering at $c1\n";
+ print LOG "Number_OTUs\t$no_clstr\n";
+ print LOG "Number_seqs_in_OTUs\t$no_seq\n";
+ my ($tu,$ts,$cu,$cs)=times(); my $tt=$tu+$ts+$cu+$cs;
+ print LOG "Total_CPU_time\t$tt\n";
+}
+close(LOG);
+
+
+sub usage {
+<<EOF
+Usage:
+$script_name -i contig_fasta_file -o output_dir -a abundance_cutoff -c OTU_cutoff -m check_chimera_flag
+
+Options:
+ -i input fasta file of contig
+ -o output dir
+ -c OTU cutoff, default 0.97
+ -m whether to perform chimera checking (true/false), default true
+ -a abundance cutoff, default 0.00005
+ small clusters < this size will be considered as noise and will be removed
+ if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are removed
+ -f 1 or 0, default 0
+ if set to 1, then use cd-hit-dup instead of cd-hit-est for stage 1 and 2 clustering
+ which is very fast
+ -R restart flag, if re-run at different abundance cutoff value or something,
+ with this parameter, program can skip the first n step and restart at certain step
+ values:
+ 0 default, start from the scratch cd-hit-dup
+ 1 cd-hit-est at 99.25
+ 2 cd-hit-est at 98.50
+ 3 filtering and cd-hit-est at 97%
+
+EOF
+}
+###### END usage
+
+sub nice_run {
+ my $str = shift;
+ print STDERR "$str\n";
+ my $cmd = `$str` unless ($debug_mode);
+ return $cmd;
+}
+##########
+
+sub count_clstrs_from_clstr_file {
+ my $clstr = shift;
+ my $n = `grep -c "^>" $clstr`;
+ $n =~ s/\s//g;
+ return $n;
+}
+
+sub count_seqs_from_clstr_file {
+ my $clstr = shift;
+ my $n = `grep -cv "^>" $clstr`;
+ $n =~ s/\s//g;
+ return $n;
+}
+
+sub count_seqs_from_fasta_file {
+ my $faa = shift;
+ my $n = `grep -c "^>" $faa`;
+ $n =~ s/\s//g;
+ return $n;
+}
+
diff --git a/usecases/Miseq-16S/clstr_2_OTU_table.pl b/usecases/Miseq-16S/clstr_2_OTU_table.pl
new file mode 100755
index 0000000..0ad13e3
--- /dev/null
+++ b/usecases/Miseq-16S/clstr_2_OTU_table.pl
@@ -0,0 +1,82 @@
+#!/usr/bin/perl
+#
+use Getopt::Std;
+getopts("i:s:S:o:f:j:",\%opts);
+
+my $input = $opts{i}; $input = "OTU.clstr" unless $input;
+my $output = $opts{o}; $output = "OTU.txt" unless ($output);
+my ($i, $j, $k, $str, $cmd, $ll);
+
+my %count = ();
+my %count_t = ();
+my %count_s = ();
+my $OTU_2_ann = ();
+my $tree_flag = 0; #### for greengene header format
+# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
+open(TMP, $input) || die "can not open $input";
+my $OTU=0;
+while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $OTU++;
+ }
+ else {
+ chop($ll);
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ my $id = $2;
+ if ($id =~ /^Sample\|([^\|]+)\|/) {
+ $sample_id = $1;
+ $sample_id{$sample_id}=1;
+ $count{$OTU}{$sample_id}++;
+ $count_t{$OTU}++;
+ $count_s{$sample_id}++;
+ }
+ else {
+ $OTU_2_ann{$OTU} = $id;
+ $tree_flag = 1 if ($id =~ /\|k__Bacteria;.p__/);
+ }
+ }
+ else {
+ die "format error $ll";
+ }
+ }
+}
+close(TMP);
+
+my @sample_ids = sort keys %sample_id;
+
+open(OUT1, "> $output") || die "can not write $output";
+print OUT1 "OTU";
+foreach $sample_id (@sample_ids){
+ print OUT1 "\t$sample_id";
+}
+if ($tree_flag) {
+ print OUT1 "\t", join("\t", qw/Kingdom Phylum Class Order Family Genus Species/);
+}
+#print OUT1 "\tTotal\n";
+print OUT1 "\tAnnotation\n";
+
+for ($i=1; $i<=$OTU; $i++){
+ $ann = "None";
+ if ($OTU_2_ann{$i}) { $ann = $OTU_2_ann{$i}; }
+ print OUT1 "OTU$i";
+ foreach $sample_id (@sample_ids){
+ $k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
+ print OUT1 "\t$k";
+ }
+ if ($tree_flag) {
+ my ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s);
+ if ($ann =~ /k__(\w+)/) {$tax_k = $1} else {$tax_k = "";}
+ if ($ann =~ /p__(\w+)/) {$tax_p = $1} else {$tax_p = "";}
+ if ($ann =~ /c__(\w+)/) {$tax_c = $1} else {$tax_c = "";}
+ if ($ann =~ /o__(\w+)/) {$tax_o = $1} else {$tax_o = "";}
+ if ($ann =~ /f__(\w+)/) {$tax_f = $1} else {$tax_f = "";}
+ if ($ann =~ /g__(\w+)/) {$tax_g = $1} else {$tax_g = "";}
+ if ($ann =~ /s__(\w+)/) {$tax_s = $1} else {$tax_s = "";}
+ print OUT1 "\t", join("\t", ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s));
+ }
+ #print OUT1 "\t$count_t{$i}";
+ print OUT1 "\t$ann\n";
+}
+close(OUT1);
+
+
diff --git a/usecases/Miseq-16S/filter-chimeric-and-small.pl b/usecases/Miseq-16S/filter-chimeric-and-small.pl
new file mode 100755
index 0000000..de303fe
--- /dev/null
+++ b/usecases/Miseq-16S/filter-chimeric-and-small.pl
@@ -0,0 +1,237 @@
+#!/usr/bin/perl
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:",\%opts);
+die usage() unless ($opts{k} and $opts{i} and $opts{j} and $opts{a} and $opts{f} and $opts{g} and $opts{o});
+
+my $input0 = $opts{k}; ## nr.clstr
+my $input = $opts{i}; ## R1 only clstr
+my $input2 = $opts{j}; ## R2 only clstr
+my $clstr_99 = $opts{a}; ## seq.99.clstr #### can be any 2nd -preclustering e.g. 98.5%
+my $seq_99 = $opts{f}; ## seq.99 - fasta file R1
+my $seq_992 = $opts{g}; ## seq.99 - fasta file R2
+my $output = $opts{o}; ## seq.99f
+my $abs_cutoff = $opts{c}; $abs_cutoff = 0.0001 unless ($abs_cutoff);
+my $output_2 = "$output.2"; ## seq.99f.2 -- R2
+my $output_cls = "$output.clstr"; ## seq.99f.clstr
+my $output_log = "$output.log"; ## seq.99f.log
+
+my ($i, $j, $k, $str, $cmd, $ll);
+
+my $num_total_seq;
+my %seq_nr_size;
+my %seqs_of_nr;
+open(LOG, "> $output_log") || die "can not open $output_log";
+open(TMP, $input0) || die "can not open $input0";
+if (1) {
+ my $rep;
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $rep = "";
+ }
+ else {
+ chop($ll);
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ $num_total_seq++;
+ if ($ll =~ /\*$/) { $rep=$id; $seq_nr_size{$rep}=0; $seqs_of_nr{$rep} = [];}
+ $seq_nr_size{$rep}++ if ($rep);
+ push(@{$seqs_of_nr{$rep}}, $id) if ($rep);
+ }
+ }
+ }
+}
+close(TMP);
+
+my %seq_R1_clstr;
+my %seq_R2_clstr;
+foreach my $f (($input, $input2)) {
+ open(TMP, $f) || die "can not open $f";
+ my $rep;
+
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $rep = "";
+ }
+ else {
+ chop($ll);
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ if ($ll =~ /\*$/) {
+ $rep=$id;
+ }
+ if ($rep) {
+ if ($f eq $input) { $seq_R1_clstr{$id} = $rep;}
+ else { $seq_R2_clstr{$id} = $rep;}
+ }
+ }
+ }
+ }
+ close(TMP);
+}
+
+#### open $clstr_99 first time
+open(TMP, $clstr_99) || die "can not open $clstr_99";
+%rep_2_otu = ();
+$OTU = -1;
+while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $OTU++;
+ }
+ else {
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ $rep_2_otu{$id} = $OTU;
+ }
+ }
+}
+close(TMP);
+
+my %chimeric_ids = ();
+#### those ids are candidates, if they are recurited by other non-chimeric clusters,
+#### then they are not chimeric anymore
+foreach $i (keys %seq_R1_clstr) {
+ my $rep1 = $seq_R1_clstr{$i};
+ my $rep2 = $seq_R2_clstr{$i};
+
+ next if ($rep1 eq $rep2);
+ next unless ($seq_nr_size{$rep1} >= $seq_nr_size{$i}*2);
+ next unless ($seq_nr_size{$rep2} >= $seq_nr_size{$i}*2);
+
+ my $OTU1 = $rep_2_otu{$rep1};
+ my $OTU2 = $rep_2_otu{$rep2};
+ next if ($OTU1 eq $OTU2);
+ $chimeric_ids{$i} = 1;
+}
+
+#### parse seq.99.clstr
+my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
+ $cutoff_clstr_size = 1 unless ($cutoff_clstr_size >= 1); #### singleton will be removed
+#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";
+
+open(TMP, $clstr_99) || die "can not open $clstr_99";
+open(OUT, "> $output_cls") || die "can not write to $output_cls";
+my %good_ids = ();
+my @seqs_this_cls = ();
+if (1) {
+ my $clstr_txt = "";
+ my $clstr_size = 0;
+ my $rep;
+
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ if ($clstr_txt) {
+ if (($clstr_size > $cutoff_clstr_size) and (not $chimeric_ids{$rep})) {
+ print OUT $clstr_txt;
+ $good_ids{$rep} = 1;
+ }
+ elsif ( $chimeric_ids{$rep} ) {
+ foreach $j (@seqs_this_cls) {
+ foreach $i ( @{ $seqs_of_nr{$j} } ) {
+ print LOG "$i\tChimeric_cluster\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\tOTU1:$rep_2_otu{$seq_R1_clstr{$rep}}\tOTU2:$rep_2_otu{$seq_R2_clstr{$rep}}\n";
+ }
+ }
+ }
+ else {
+ foreach $j (@seqs_this_cls) {
+ foreach $i ( @{ $seqs_of_nr{$j} } ) {
+ print LOG "$i\tSmall_cluster\t$rep\t$clstr_size\n";
+ }
+ }
+ }
+ }
+ $clstr_size = 0;
+ $clstr_txt = $ll;
+ $rep = "";
+ @seqs_this_cls=();
+ }
+ else {
+ $clstr_txt .= $ll;
+ chop($ll);
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ $clstr_size += $seq_nr_size{$id};
+ $rep=$id if ($ll =~ /\*$/);
+ push(@seqs_this_cls, $id);
+ }
+ }
+ }
+ if ($clstr_txt) {
+ if (($clstr_size > $cutoff_clstr_size) and (not $chimeric_ids{$rep})) {
+ print OUT $clstr_txt;
+ $good_ids{$rep} = 1;
+ }
+ elsif ( $chimeric_ids{$rep} ) {
+ foreach $j (@seqs_this_cls) {
+ foreach $i ( @{ $seqs_of_nr{$j} } ) {
+ print LOG "$i\tChimeric_cluster\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\tOTU1:$rep_2_otu{$seq_R1_clstr{$rep}}\tOTU2:$rep_2_otu{$seq_R2_clstr{$rep}}\n";
+ }
+ }
+ }
+ else {
+ foreach $j (@seqs_this_cls) {
+ foreach $i ( @{ $seqs_of_nr{$j} } ) {
+ print LOG "$i\tSmall_cluster\t$rep\t$clstr_size\n";
+ }
+ }
+ }
+ }
+}
+close(TMP);
+close(OUT);
+
+foreach my $f (($seq_99, $seq_992)) {
+ my $fout = ($f eq $seq_99) ? $output : $output_2;
+
+ open(TMP, $f) || die "can not open $f";
+ open(OUT, ">$fout") || die "can not write to $fout";
+
+ my $flag = 0;
+ while($ll = <TMP>) {
+ if ($ll =~ /^>/) {
+ $gi = substr($ll,1);
+ chop($gi);
+ $gi =~ s/\s.+$//;
+ $flag = ( $good_ids{$gi} ) ? 1 : 0;
+ }
+ print OUT $ll if ($flag);
+ }
+
+ close(TMP);
+ close(OUT);
+}
+
+
+close(LOG);
+
+sub usage {
+<<EOF
+Usage:
+$script_name -k seq.nr.clstr -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.99.clstr -f seq.99 -g seq.99.2 -o seq.99f
+
+Options:
+ -k input seq.nr.clstr
+ -i input seq.nr.R1.clstr
+ -j input seq.nr.R2.clstr
+ -a input seq.99.clstr
+ -f input seq.99
+ -g input seq.99.2
+ -o output
+ -c abundance cutoff, default $abs_cutoff
+ small clusters < this size will be considered as noise and will be removed
+ if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are removed
+
+EOF
+}
+###### END usage
+
diff --git a/usecases/Miseq-16S/filter-chimeric-by-ref.pl b/usecases/Miseq-16S/filter-chimeric-by-ref.pl
new file mode 100755
index 0000000..5e9b32a
--- /dev/null
+++ b/usecases/Miseq-16S/filter-chimeric-by-ref.pl
@@ -0,0 +1,207 @@
+#!/usr/bin/perl
+#### filter-chimeric-by-ref.pl: flag small sample clusters whose R1 and R2
+#### halves cluster to two different, larger clusters (chimera candidates),
+#### then write the cluster file and FASTA files with those clusters removed.
+#### NOTE(review): no "use strict; use warnings;" -- %opts, $gi etc. are
+#### package globals; adding strictures would require declaring them.
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+#### only -i -j -a -f -g -o (and -c) are used below; the other option letters
+#### in the getopts spec appear unused in this script
+getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:",\%opts);
+die usage() unless ($opts{i} and $opts{j} and $opts{a} and $opts{f} and $opts{g} and $opts{o});
+
+my $input = $opts{i}; ## R1 only clstr
+my $input2 = $opts{j}; ## R2 only clstr
+my $clstr_99 = $opts{a}; ## seq.97f-full.clstr #### can be any 2nd -preclustering e.g. 98.5%
+my $seq_99 = $opts{f}; ## seq.99 - fasta file R1
+my $seq_992 = $opts{g}; ## seq.99 - fasta file R2
+my $output = $opts{o}; ## seq.99f
+my $abs_cutoff = $opts{c}; $abs_cutoff = 0.01 unless ($abs_cutoff); #### small cluster will be checked for chimeric
+my $output_2 = "$output.2"; ## seq.99f.2 -- R2
+my $output_cls = "$output.clstr"; ## seq.99f.clstr
+my $output_log = "$output.log"; ## seq.99f.log
+
+#### shared scratch variables (file-scoped, reused by the loops below)
+my ($i, $j, $k, $str, $cmd, $ll);
+
+#### Pass 1: read the pre-clustering .clstr file ($clstr_99) and record, for
+#### each cluster representative, the cluster size and the member IDs.
+my $num_total_seq;
+my %seq_nr_size;
+my %seqs_of_rep;
+open(LOG, "> $output_log") || die "can not open $output_log";
+open(TMP, $clstr_99) || die "can not open $clstr_99";
+if (1) {
+ my $rep;
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $rep = "";
+ }
+ else {
+ chop($ll);
+ my $id;
+#### member lines look like "0  123nt, >ID... *" (cd-hit .clstr format)
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+#### only IDs prefixed "Sample" count toward the total (reference seqs excluded)
+ $num_total_seq++ if ($id =~ /^Sample/);
+ if ($ll =~ /\*$/) { $rep=$id; $seq_nr_size{$rep}=0; $seqs_of_rep{$rep} = [];}
+#### NOTE(review): members listed before the '*' representative line are
+#### skipped ($rep still "") -- assumes the representative comes first; verify
+ $seq_nr_size{$rep}++ if ($rep);
+ push(@{$seqs_of_rep{$rep}}, $id) if ($rep);
+ }
+ }
+ }
+}
+close(TMP);
+
+
+#### Pass 2: read the R1-only and R2-only .clstr files and map every sample
+#### sequence ID to its cluster representative in each read direction.
+my %seq_R1_clstr;
+my %seq_R2_clstr;
+foreach my $f (($input, $input2)) {
+ open(TMP, $f) || die "can not open $f";
+ my $rep;
+
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+ $rep = "";
+ }
+ else {
+ chop($ll);
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ if ($ll =~ /\*$/) {
+ $rep=$id;
+ }
+#### record the mapping only for sample sequences; %seq_R1_clstr for the
+#### first file (-i), %seq_R2_clstr for the second (-j)
+ if ($rep and ($id =~ /^Sample/) ) {
+ if ($f eq $input) { $seq_R1_clstr{$id} = $rep;}
+ else { $seq_R2_clstr{$id} = $rep;}
+ }
+ }
+ }
+ }
+ close(TMP);
+}
+
+#### Minimum cluster size below which a cluster is screened as a possible
+#### chimera; $abs_cutoff is a fraction of all sample sequences (at least 1).
+my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
+ $cutoff_clstr_size = 1 unless ($cutoff_clstr_size >= 1);
+#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";
+
+my %chimeric_ids = ();
+#### those ids are candidates, if they are recruited by other non-chimeric clusters,
+#### then they are not chimeric anymore
+#### A sequence is a chimera candidate when its R1 half and R2 half belong to
+#### two DIFFERENT clusters, neither led by itself, its own cluster is small
+#### (<= cutoff) and both parent clusters are at least twice its own size.
+foreach $i (keys %seq_nr_size) {
+ next unless ($i =~ /^Sample/);
+ my $rep1 = $seq_R1_clstr{$i};
+ my $rep2 = $seq_R2_clstr{$i};
+ next unless ($rep1 and $rep2);
+
+ next if ($rep1 eq $rep2);
+ next if ($rep1 eq $i);
+ next if ($rep2 eq $i);
+ next if ($seq_nr_size{$i} > $cutoff_clstr_size);
+ if (defined($seq_nr_size{$rep1})) { next unless ($seq_nr_size{$rep1} >= $seq_nr_size{$i}*2); }
+ if (defined($seq_nr_size{$rep2})) { next unless ($seq_nr_size{$rep2} >= $seq_nr_size{$i}*2); }
+
+ $chimeric_ids{$i} = 1;
+}
+
+#### parse seq.97fwref.clstr
+#### do chimeric checking for sample-only clusters
+#### Pass 3: re-read the pre-clustering file cluster by cluster. A cluster is
+#### written to $output_cls (and its representative recorded in %good_ids)
+#### when it contains at least one sample sequence and its representative is
+#### not a chimera candidate; chimeric clusters are logged to LOG; ref-only
+#### clusters are silently dropped.
+open(TMP, $clstr_99) || die "can not open $clstr_99";
+open(OUT, "> $output_cls") || die "can not write to $output_cls";
+my %good_ids = ();
+if (1) {
+ my $clstr_txt = "";
+ my $clstr_size = 0;
+ my $rep;
+ my $refonly = 1;
+
+ while($ll=<TMP>){
+ if ($ll =~ /^>/) {
+#### new cluster header: first flush the previously buffered cluster
+ if ($clstr_txt) {
+ if ( not $refonly ) {
+ if (not $chimeric_ids{$rep}) {
+ print OUT $clstr_txt;
+ $good_ids{$rep} = 1;
+ }
+ elsif ( $chimeric_ids{$rep} ) {
+ foreach $i ( @{ $seqs_of_rep{$rep} }) {
+ print LOG "Chimeric_cluster\t$i\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\n";
+ }
+ }
+ }
+ }
+ $clstr_size = 0;
+ $clstr_txt = $ll;
+ $rep = "";
+ $refonly = 1;
+ }
+ else {
+ $clstr_txt .= $ll;
+ chop($ll);
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ $clstr_size++;
+ $rep=$id if ($ll =~ /\*$/);
+ $refonly = 0 if ($id =~ /^Sample/);
+ }
+ }
+ }
+#### flush the final cluster (duplicate of the in-loop flush above)
+ if ($clstr_txt) {
+ if ( not $refonly ) {
+ if (not $chimeric_ids{$rep}) {
+ print OUT $clstr_txt;
+ $good_ids{$rep} = 1;
+ }
+ elsif ( $chimeric_ids{$rep} ) {
+ foreach $i ( @{ $seqs_of_rep{$rep} }) {
+ print LOG "Chimeric_cluster\t$i\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\n";
+ }
+ }
+ }
+ }
+
+}
+close(TMP);
+close(OUT);
+
+#### Write the filtered FASTA outputs: R1 ($seq_99 -> $output) and R2
+#### ($seq_992 -> $output_2), keeping only records whose ID is in %good_ids.
+foreach my $f (($seq_99, $seq_992)) {
+ my $fout = ($f eq $seq_99) ? $output : $output_2;
+
+ open(TMP, $f) || die "can not open $f";
+ open(OUT, ">$fout") || die "can not write to $fout";
+
+ my $flag = 0;
+ while($ll = <TMP>) {
+ if ($ll =~ /^>/) {
+#### ID = header text after '>' up to the first whitespace
+ $gi = substr($ll,1);
+ chop($gi);
+ $gi =~ s/\s.+$//;
+#### $flag persists until the next header, so whole records are kept/skipped
+ $flag = ( $good_ids{$gi} ) ? 1 : 0;
+ }
+ print OUT $ll if ($flag);
+ }
+
+ close(TMP);
+ close(OUT);
+}
+
+close(LOG);
+
+#### usage(): return the command-line help text as a string (here-doc);
+#### interpolates $script_name and the default $abs_cutoff from file scope.
+sub usage {
+<<EOF
+Usage:
+$script_name -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.97f-full.clstr -f seq.99 -g seq.99.2 -o seq.99f
+
+Options:
+ -i input seq.nr.R1.clstr
+ -j input seq.nr.R2.clstr
+ -a input seq.97f-full.clstr
+ -f input seq.99
+ -g input seq.99.2
+ -o output cluster without chimeric cluster, without ref-only cluster
+ -c abundance cutoff, default $abs_cutoff
+ small clusters < this size will be checked for chimeric and be removed if it is chimeric
+ if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are checked
+
+EOF
+}
+###### END usage
+
diff --git a/usecases/Miseq-16S/filter-nontop-ref.pl b/usecases/Miseq-16S/filter-nontop-ref.pl
new file mode 100755
index 0000000..71d8ec3
--- /dev/null
+++ b/usecases/Miseq-16S/filter-nontop-ref.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+#### filter-nontop-ref.pl: stream a cd-hit .clstr file (STDIN or argument
+#### files) and, within each cluster, keep every sample sequence line
+#### (ID starting with "Sample") plus only the single reference line with
+#### the highest percent identity; other reference lines are dropped.
+#### NOTE(review): no "use strict; use warnings;".
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+my ($i, $j, $k, $str, $cmd, $ll);
+
+my $clstr = "";
+my $best_ref = "";
+my $best_score = 0;
+
+#### NOTE(review): $refonly is declared here but never used below
+my $refonly = 1;
+while($ll=<>){
+ if ($ll =~ /^>/) {
+#### new cluster header: emit the previous cluster and its best reference
+ if ($clstr) {
+ print $clstr;
+ print $best_ref if ($best_ref);
+ }
+
+ $clstr = $ll;
+ $best_ref = "";
+ $best_score = 0;
+ }
+ else {
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ my $id = $2;
+ if ($id =~ /^Sample/) {
+ $clstr .= $ll;
+ }
+#### reference member lines end in ".../<identity>%".
+#### NOTE(review): the class [\d|\.] also matches a literal '|'; [\d.] was
+#### probably intended. A reference that is the cluster representative ends
+#### in '*' (no identity), so it is never selected as $best_ref here.
+ elsif ( $ll =~ /\/([\d|\.]+)%$/) {
+ my $iden = $1;
+ if ($iden > $best_score) {
+ $best_score = $iden;
+ $best_ref = $ll;
+ }
+ }
+ }
+ else {
+ print STDERR "format err: $ll";
+ }
+ }
+}
+
+#### flush the final cluster (the loop prints only when a next header is seen)
+ if ($clstr) {
+ print $clstr;
+ print $best_ref if ($best_ref);
+ }
diff --git a/usecases/Miseq-16S/filter-refonly-cluster.pl b/usecases/Miseq-16S/filter-refonly-cluster.pl
new file mode 100755
index 0000000..c4c8ab6
--- /dev/null
+++ b/usecases/Miseq-16S/filter-refonly-cluster.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+#### filter-refonly-cluster.pl: stream a cd-hit .clstr file (STDIN or
+#### argument files) and print only clusters containing at least one sample
+#### sequence (ID starting with "Sample"); clusters made up solely of
+#### reference sequences are dropped.
+
+use Getopt::Std;
+my $script_name = $0;
+my $script_dir = $0;
+ $script_dir =~ s/[^\/]+$//;
+ chop($script_dir);
+ $script_dir = "./" unless ($script_dir);
+
+my ($i, $j, $k, $str, $cmd, $ll);
+
+#### NOTE(review): these two are declared but never used in this script
+my $num_total_seq;
+my %seq_nr_size;
+
+if (1) {
+ my $clstr = "";
+ my $refonly = 1;
+ while($ll=<>){
+ if ($ll =~ /^>/) {
+#### new cluster header: emit the buffered cluster if it had a sample member
+ print $clstr unless ($refonly);
+ $clstr = $ll;
+ $refonly = 1;
+ }
+ else {
+ $clstr .= $ll;
+ my $id;
+ if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
+ $id = $2;
+ $refonly = 0 if ($id =~ /^Sample/);
+ }
+ }
+ }
+#### BUG FIX: flush the final cluster -- the loop above only prints when the
+#### next cluster header is seen, so the last cluster in the file was lost
+#### (compare the end-of-input flush in filter-nontop-ref.pl)
+ print $clstr if ($clstr and not $refonly);
+}
+
diff --git a/usecases/Miseq-16S/greengene-ann1.pl b/usecases/Miseq-16S/greengene-ann1.pl
new file mode 100755
index 0000000..fde835e
--- /dev/null
+++ b/usecases/Miseq-16S/greengene-ann1.pl
@@ -0,0 +1,75 @@
+#!/usr/bin/perl
+## =========================== NGS tools ==========================================
+## NGS tools for metagenomic sequence analysis
+## May also be used for other type NGS data analysis
+##
+## Weizhong Li, UCSD
+## liwz at sdsc.edu
+## http://weizhongli-lab.org/
+## ================================================================================
+
+#### greengene-ann1.pl: merge Greengene taxonomy annotations (-i) into the
+#### FASTA headers of the reference sequences (-j) and write the annotated,
+#### re-ordered FASTA to -o. See usage() below.
+use Getopt::Std;
+#### NOTE(review): "d:" appears twice in the getopts spec; only -i -j -o are used
+getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
+die usage() unless ($opts{i} and $opts{j} and $opts{o});
+my ($i, $j, $k, $cmd);
+my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
+my ($len, $lena, $lenb);
+
+my $file1 = $opts{i};
+my $fasta = $opts{j};
+my $output = $opts{o};
+
+#### read the taxonomy table: "<otu_id><whitespace><annotation>" per line;
+#### spaces inside the annotation are replaced by '.' so the annotation can
+#### live in a FASTA header without being split on whitespace downstream
+my %id_2_ann;
+open(TMP, $file1) || die "can not open $file1";
+while($ll=<TMP>){
+ chop($ll);
+ my ($id, $txt) = split(/\s+/, $ll, 2);
+ $txt =~ s/ /./g;
+ $id_2_ann{$id} = $txt;
+}
+close(TMP);
+
+#### read the FASTA: headers are numeric OTU ids; rename each to "id|annotation"
+#### when an annotation exists, and accumulate sequence lines (newlines kept)
+my %id_2_seq = ();
+my $id = "";
+open(TMP, $fasta) || die "can not open $fasta";
+while($ll=<TMP>){
+ if ($ll =~ /^>(\d+)/) {
+ chop($ll);
+ $id = $1;
+ $ann = $id_2_ann{$id};
+ $id = "$id|$ann" if ($ann);
+ }
+ else {
+ $id_2_seq{$id} .= $ll;
+ }
+}
+
+close(TMP);
+
+#### NOTE(review): this sorts by the LENGTH OF THE ID STRING (descending),
+#### not by sequence length -- confirm whether sequence length was intended
+my @ids = keys %id_2_seq;
+ @ids = sort {length($b) <=> length($a) } @ids;
+
+open(OUT, "> $output") || die "can not write to $output";
+foreach $id (@ids) {
+#### the stored sequence already ends with a newline, so none is added here
+ print OUT ">$id\n$id_2_seq{$id}";
+}
+close(OUT);
+
+
+
+#### usage(): return the help text (here-doc string); shown when -i/-j/-o missing
+sub usage {
+<<EOD;
+This script formats Greengene FASTA file for CD-HIT-OTU-MiSeq. You should download Greengene sequences
+from http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
+download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find
+gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta
+
+Run this script as $0 -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o gg_13_5_processed.fasta
+
+Options:
+======================
+ -i path for gg_13_5_otus/taxonomy/99_otu_taxonomy.txt
+ -j path for gg_13_5_otus/rep_set/99_otus.fasta
+ -o output FASTA file of formatted Greengene reference DB
+EOD
+}
diff --git a/usecases/Miseq-16S/pool_samples.pl b/usecases/Miseq-16S/pool_samples.pl
new file mode 100755
index 0000000..64ca669
--- /dev/null
+++ b/usecases/Miseq-16S/pool_samples.pl
@@ -0,0 +1,78 @@
+#!/usr/bin/perl
+#
+#### pool_samples.pl: concatenate the per-sample OTU result files listed in
+#### @file_list from each sample's "<sample>/<job>/" directory into a single
+#### pooled output directory. Samples come from a sample file (-s) or from
+#### the command line (-S).
+use Getopt::Std;
+getopts("s:S:o:f:j:",\%opts);
+
+die usage() unless ($opts{s} or $opts{S});
+
+my $output = $opts{o};
+ $output = "pooled" unless ($output);
+my $sample_in = $opts{s};
+my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
+my $job = $opts{j};
+ $job = "otu" unless ($job);
+
+my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
+
+my ($i, $j, $k, $cmd);
+$cmd = `mkdir $output` unless (-e $output);
+
+#### refuse to append into a directory that already holds pooled results
+foreach $i (@file_list) {
+ if (-e "$output/$i") {
+ die "output dir $output & file $output/$i already exist, please remove all files from $output and re-run\n";
+ }
+}
+
+######## parse NGS_samples
+#### sample file format: one sample per line, first whitespace-separated
+#### field is the sample ID; '#' lines and lines not starting with a word
+#### character are skipped
+my @NGS_samples = ();
+if (defined($sample_in)) {
+ open(TMP, $sample_in) || die "can not open $sample_in";
+ while($ll=<TMP>){
+ next if ($ll =~ /^#/);
+ next unless ($ll =~ /^\w/); chop($ll);
+ my ($id, @data) = split(/\s+/,$ll);
+ push(@NGS_samples, $id);
+ }
+ close(TMP);
+}
+elsif (defined($sample_command_in)) {
+#### -S form: samples separated by ',', fields within a sample by ':'
+ my @lls = split(/,/, $sample_command_in);
+ foreach $ll (@lls) {
+ my ($id, @data) = split(/:/, $ll);
+ push(@NGS_samples, $id);
+ }
+}
+else {
+ die "no input samples";
+}
+
+#### append each sample's copy of every result file onto the pooled target;
+#### missing source files only produce a warning
+foreach $i (@file_list) {
+ my $target = "$output/$i";
+ foreach $j (@NGS_samples) {
+ my $source = "$j/$job/$i";
+ if (-e $source) {
+ print STDERR "cat $source >> $target\n";
+ $cmd = `cat $source >> $target`;
+ }
+ else {
+ print STDERR "Warning, $source missing\n";
+ }
+ }
+}
+
+#### usage(): help text returned as a here-doc string
+sub usage {
+<<EOD;
+ $0 -s sample_file -o output_dir
+ -s sample data file, required unless -S is present
+ File format example
+#Sample data file example, TAB or space delimited for following lines
+Sample_ID1 sample_data_0 sample_data_1
+Sample_ID2 sample_data_0 sample_data_1
+Sample_ID3 sample_data_0 sample_data_1
+
+ -S sample data from command line, required unless -s is present
+ format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
+
+EOD
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/cd-hit.git
More information about the debian-med-commit
mailing list