[med-svn] [trinityrnaseq] 01/03: Imported Upstream version 2.1.1+dfsg

Thu Oct 22 16:41:21 UTC 2015

This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to branch master
in repository trinityrnaseq.

commit 8cee3206fc6febe582c996fb938823be20621765
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Thu Oct 22 08:27:31 2015 -0700

    Imported Upstream version 2.1.1+dfsg
---
 .gitmodules                                        |   3 +
 Trinity                                            |  42 +++++-
 trinity-plugins/htslib-1.2.1.tar.bz2               | Bin 0 -> 911326 bytes
 trinity-plugins/htslib.github.Aug182015.tar.gz     | Bin 0 -> 7316027 bytes
 trinity-plugins/samtools-0.1.19.tar.bz2            | Bin 0 -> 514507 bytes
 util/misc/fastQ_rand_subset.pl                     | 158 ++++++++++-----------
 ...subset.reservoir_sampling_reqiures_high_mem.pl} |   0
 util/misc/run_STAR.pl                              |  19 ++-
 .../GG_partitioned_trinity_aggregator.pl           |   2 +-
 9 files changed, 130 insertions(+), 94 deletions(-)

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8a1da23
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "trinityrnaseq.wiki"]
+	path = trinityrnaseq.wiki
+	url = https://github.com/trinityrnaseq/trinityrnaseq.wiki.git
diff --git a/Trinity b/Trinity
index a41d004..94f5c06 100755
--- a/Trinity
+++ b/Trinity
@@ -17,7 +17,7 @@ use Pipeliner;
 use Fasta_reader;
 use List::Util qw(min max);
 
-my $VERSION = "v2.1.0"; 
+my $VERSION = "v2.1.1"; 
 
 
 BEGIN {
@@ -1554,13 +1554,13 @@ sub run_chrysalis {
             $pipeliner->add_commands( new Command($cmd, "$iworm_min100_fa_file.bowtie_build.ok"));
 
             my $bowtie_sam_file = "$chrysalis_output_dir/iworm.bowtie.nameSorted.bam";
+	    my $samtools_max_memory = int($jellyfish_ram/$CPU);
             if ($long_reads){
-            	$cmd = "bash -c \" set -o pipefail;bowtie2 --local -a --threads $CPU -f $iworm_min100_fa_file $bowtie_reads_fa  | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";  
+            	$cmd = "bash -c \" set -o pipefail;bowtie2 --local -a --threads $CPU -f $iworm_min100_fa_file $bowtie_reads_fa  | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $samtools_max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";  
             }
             else{
-            	$cmd = "bash -c \" set -o pipefail; bowtie -a -m 20 --best --strata --threads $CPU  --chunkmbs 512 -q -S -f $iworm_min100_fa_file $bowtie_reads_fa  | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
-            }
-            
+            	$cmd = "bash -c \" set -o pipefail; bowtie -a -m 20 --best --strata --threads $CPU  --chunkmbs 512 -q -S -f $iworm_min100_fa_file $bowtie_reads_fa  | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $samtools_max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
+            }            
             
             $pipeliner->add_commands( new Command($cmd, "$bowtie_sam_file.ok"));
             
@@ -2080,10 +2080,13 @@ sub prep_seqs {
         # make fasta
         foreach my $f (@initial_files){
             my $fastool_cmd = "cat $f | $FASTOOL_DIR/fastool";
+            my $linecount_cmd = "cat $f | wc -l";
             if ($f=~/\.gz$/){
                 $fastool_cmd = "gunzip -c $f | $FASTOOL_DIR/fastool";
+                $linecount_cmd = "gunzip -c $f | wc -l";
             }elsif ($f=~/\.bz2$/){
                 $fastool_cmd = "bunzip2 -dkc $f | $FASTOOL_DIR/fastool";
+                $linecount_cmd = "bunzip2 -dkc $f | wc -l";
             }
             
             if ($SS_lib_type && $SS_lib_type eq "R") {
@@ -2092,6 +2095,9 @@ sub prep_seqs {
             $fastool_cmd .= " --illumina-trinity --to-fasta >> $file_prefix.fa 2> $f.readcount ";
             my $cmd = $fastool_cmd;
             &process_cmd($cmd);
+
+            &ensure_complete_FQtoFA_conversion($linecount_cmd, "$f.readcount");
+            
         }
     }
     elsif ($seqType eq "fa") {
@@ -3031,3 +3037,29 @@ sub run_ANANAS {
 
     exit(0);
 }
+
+
+####
+sub ensure_complete_FQtoFA_conversion {
+    my ($linecount_cmd, $fastool_count_file) = @_;
+
+    my $fastool_count_info = `cat $fastool_count_file`;
+    $fastool_count_info =~ /Sequences parsed: (\d+)/ or confess "Error, cannot parse $fastool_count_file";
+    my $fastool_count = $1;
+
+    
+    my $num_fq_lines = `$linecount_cmd`;
+    chomp $num_fq_lines;
+
+    my $num_fq_entries = $num_fq_lines / 4;
+
+    if ($num_fq_entries == $fastool_count) {
+        print STDERR "-conversion of $num_fq_entries from FQ to FA format succeeded.\n";
+    }
+    else {
+        confess "Error, counts of reads in FQ: $num_fq_entries (as per $linecount_cmd) doesn't match fastool's report of FA records: $fastool_count ";
+    }
+
+    return;
+}
+
diff --git a/trinity-plugins/htslib-1.2.1.tar.bz2 b/trinity-plugins/htslib-1.2.1.tar.bz2
new file mode 100644
index 0000000..0c82ade
Binary files /dev/null and b/trinity-plugins/htslib-1.2.1.tar.bz2 differ
diff --git a/trinity-plugins/htslib.github.Aug182015.tar.gz b/trinity-plugins/htslib.github.Aug182015.tar.gz
new file mode 100644
index 0000000..455614d
Binary files /dev/null and b/trinity-plugins/htslib.github.Aug182015.tar.gz differ
diff --git a/trinity-plugins/samtools-0.1.19.tar.bz2 b/trinity-plugins/samtools-0.1.19.tar.bz2
new file mode 100644
index 0000000..2db0d79
Binary files /dev/null and b/trinity-plugins/samtools-0.1.19.tar.bz2 differ
diff --git a/util/misc/fastQ_rand_subset.pl b/util/misc/fastQ_rand_subset.pl
index e5377d0..8b0f7e0 100755
--- a/util/misc/fastQ_rand_subset.pl
+++ b/util/misc/fastQ_rand_subset.pl
@@ -1,24 +1,5 @@
 #!/usr/bin/env perl
 
-## util/fastQ_rand_subset.pl
-## Purpose: Extracts out a specific number of random reads from an
-##   input file, making sure that left and right ends of the selected
-##   reads are paired
-## Usage: $0 left.fq right.fq num_entries
-##
-## This code implements reservoir sampling, which produces an unbiased
-## selection regardless of the number of entries (assuming an
-## appropriately uniform random distribution function).
-
-## See:
-## * https://en.wikipedia.org/wiki/Reservoir_sampling
-##
-## The current implementation requires the selected entries to be
-## stored in memory, but passes over the input file(s) only once. If
-## the array loading / reading is too slow or memory intensive, this
-## could be implemented in a two-pass fashion by first getting
-## indexes, and then getting the reads
-
 use strict;
 use warnings;
 
@@ -31,83 +12,88 @@ my $usage = "usage: $0 left.fq right.fq num_entries\n\n";
 
 my $left_fq = $ARGV[0] or die $usage;
 my $right_fq = $ARGV[1] or die $usage;
-my $num_entries = $ARGV[2] or die $usage;
+my $num_to_sample = $ARGV[2] or die $usage;
 
-my @selected_left_entries = ();
-my @selected_right_entries = ();
+srand();
 
 main: {
 
-  print STDERR "Selecting $num_entries entries...";
-
-  my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
-  my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
-
-  my $num_M_entries = $num_entries/1e6;
-  $num_M_entries .= "M";
-  my $base_left_fq = basename($left_fq);
-  my $base_right_fq = basename($right_fq);
-
-  open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
-  open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
-
-  srand();
-
-  my $num_skipped = 0;
-  my $num_output_entries = 0;
-  my $num_entries_read = 0;
-
-  my $left_entry = 0;
-  my $right_entry = 0;
-
-  while ($left_entry = $left_fq_reader->next()) {
-    $right_entry = $right_fq_reader->next();
-
-    unless ($left_entry && $right_entry) {
-      die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
-    }
-    unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
-      die "Error, core read names don't match: "
-        . "Left: " . $left_entry->get_core_read_name() . "\n"
-          . "Right: " . $right_entry->get_core_read_name() . "\n";
+    ## do reservoir sampling on indices instead of the fastq records themselves, then just extract the selected ones.
+
+    my $num_fq_entries = &get_num_fq_entries($left_fq);
+    my @selected_entries = (1..$num_to_sample);
+    
+    for (my $i = $num_to_sample + 1; $i <= $num_fq_entries; $i++) {
+        
+        my $rand_pos = rand($i);
+        if ($rand_pos < $num_to_sample) {
+            $selected_entries[$rand_pos] = $i;
+        }
     }
 
-    $num_entries_read++;
-
-    if($num_entries_read <= $num_entries){
-      # Populate reservoir with entries up to num_entries
-      push(@selected_left_entries, $left_entry);
-      push(@selected_right_entries, $right_entry);
-    } else {
-      # Randomly replace elements in the reservoir
-      # with a decreasing probability
-      my $swapPos = int(rand($num_entries_read));
-
-      if($swapPos < $num_entries){
-        $selected_left_entries[$swapPos] = $left_entry;
-        $selected_right_entries[$swapPos] = $right_entry;
-      }
+    my %indices_want = map { + $_ => 1 } @selected_entries;
+    @selected_entries = (); # free
+
+    
+    my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
+    my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
+    
+    my $num_M_entries = $num_to_sample/1e6;
+    $num_M_entries .= "M";
+    my $base_left_fq = basename($left_fq);
+    my $base_right_fq = basename($right_fq);
+    
+    open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
+    open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
+    
+    my $fq_index = 0;
+    while (my $left_entry = $left_fq_reader->next()) {
+        my $right_entry = $right_fq_reader->next();
+        
+        unless ($left_entry && $right_entry) {
+            die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
+        }
+        unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
+            die "Error, core read names don't match: "
+                . "Left: " . $left_entry->get_core_read_name() . "\n"
+                . "Right: " . $right_entry->get_core_read_name() . "\n";
+        }
+      
+        $fq_index++;
+
+        if ($indices_want{$fq_index}) {
+            print $left_ofh $left_entry->get_fastq_record();
+            print $right_ofh $right_entry->get_fastq_record();
+        }
     }
-  }
-
-  if ($num_entries > $num_entries_read) {
-    die "Error, num_entries $num_entries > total records available: $num_entries_read ";
-  }
+    
+    print STDERR " done.\n";
+    
+    close $left_ofh;
+    close $right_ofh;
+    
+    exit(0);
+}
 
-  # print selected entries to their respective files
-  foreach my $entry (@selected_left_entries){
-    print $left_ofh $entry->get_fastq_record();
-  }
-  foreach my $entry (@selected_right_entries){
-    print $right_ofh $entry->get_fastq_record();
-  }
+####
+sub get_num_fq_entries {
+    my ($fastq_file) = @_;
 
-  print STDERR " done.\n";
+    unless (-s $fastq_file) {
+        die "Error, cannot locate file $fastq_file";
+    }
 
-  close $left_ofh;
-  close $right_ofh;
+    my $linecount;
+    if ($fastq_file =~ /\.gz$/) {
+        $linecount = `gunzip -c $fastq_file | wc -l`;
+    }
+    else {
+        $linecount = `cat $fastq_file | wc -l`;
+    }
+    chomp $linecount;
+    
+    my $num_records = $linecount / 4;
 
-  exit(0);
+    return($num_records);
 }
 
-
diff --git a/util/misc/fastQ_rand_subset.pl b/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
similarity index 100%
copy from util/misc/fastQ_rand_subset.pl
copy to util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
diff --git a/util/misc/run_STAR.pl b/util/misc/run_STAR.pl
index fa8c269..40b5dbc 100755
--- a/util/misc/run_STAR.pl
+++ b/util/misc/run_STAR.pl
@@ -26,6 +26,8 @@ my $usage = <<__EOUSAGE__;
 #  --CPU <int>                 number of threads (default: 2)
 #  --out_prefix <string>       output prefix (default: star)
 #  --out_dir <string>          output directory (default: current working directory)
+#  --star_path <string>        full path to the STAR program to use.
+#  --patch <string>            genomic targets to patch the genome fasta with.
 #
 #######################################################################
 
@@ -46,6 +48,9 @@ my $gtf_file;
 my $out_dir;
 my $ADV = 0;
 
+my $star_path = "STAR";
+my $patch;
+
 &GetOptions( 'h' => \$help_flag,
              'genome=s' => \$genome,
              'reads=s' => \$reads,
@@ -54,7 +59,8 @@ my $ADV = 0;
              'G=s' => \$gtf_file,
              'out_dir=s' => \$out_dir,
              'ADV' => \$ADV,
-
+             'star_path=s' => \$star_path,
+             'patch=s' => \$patch,
     );
 
 
@@ -66,8 +72,12 @@ if ($help_flag) {
     die $usage;
 }
 
+if (@ARGV) {
+    die "Error, cannot recognize opts: @ARGV";
+}
+
 
-my $star_prog = `which STAR`;
+my $star_prog = `which $star_path`;
 chomp $star_prog;
 unless ($star_prog =~ /\w/) {
     die "Error, cannot locate STAR program. Be sure it's in your PATH setting.  ";
@@ -138,6 +148,11 @@ main: {
         . " --alignSJDBoverhangMin 10 "
         . " --chimSegmentReadGapMax parameter 3 "
         . " --limitBAMsortRAM 20000000000";
+
+    if ($patch) {
+        $cmd .= " --genomeFastaFiles $patch ";
+    }
+        
     
     #if ($ADV) {
         $cmd .= " --alignSJstitchMismatchNmax 5 -1 5 5 ";  #which allows for up to 5 mismatches for non-canonical GC/AG, and AT/AC junctions, and any number of mismatches for canonical junctions (the default values 0 -1 0 0 replicate the old behavior (from AlexD)
diff --git a/util/support_scripts/GG_partitioned_trinity_aggregator.pl b/util/support_scripts/GG_partitioned_trinity_aggregator.pl
index c0fccff..ce71959 100755
--- a/util/support_scripts/GG_partitioned_trinity_aggregator.pl
+++ b/util/support_scripts/GG_partitioned_trinity_aggregator.pl
@@ -23,7 +23,7 @@ while (<STDIN>) {
         open (my $fh, $filename) or die "Error, cannot open file $filename";
         while (<$fh>) {
             if (/>/) {
-                s/>/>{$token}${counter}\_/;
+                s/>/>${token}_${counter}\_/;
             }
             print;
         }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/trinityrnaseq.git