[med-svn] [trinityrnaseq] 01/03: Imported Upstream version 2.1.1+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Thu Oct 22 16:41:21 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository trinityrnaseq.
commit 8cee3206fc6febe582c996fb938823be20621765
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Thu Oct 22 08:27:31 2015 -0700
Imported Upstream version 2.1.1+dfsg
---
.gitmodules | 3 +
Trinity | 42 +++++-
trinity-plugins/htslib-1.2.1.tar.bz2 | Bin 0 -> 911326 bytes
trinity-plugins/htslib.github.Aug182015.tar.gz | Bin 0 -> 7316027 bytes
trinity-plugins/samtools-0.1.19.tar.bz2 | Bin 0 -> 514507 bytes
util/misc/fastQ_rand_subset.pl | 158 ++++++++++-----------
...subset.reservoir_sampling_reqiures_high_mem.pl} | 0
util/misc/run_STAR.pl | 19 ++-
.../GG_partitioned_trinity_aggregator.pl | 2 +-
9 files changed, 130 insertions(+), 94 deletions(-)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8a1da23
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "trinityrnaseq.wiki"]
+ path = trinityrnaseq.wiki
+ url = https://github.com/trinityrnaseq/trinityrnaseq.wiki.git
diff --git a/Trinity b/Trinity
index a41d004..94f5c06 100755
--- a/Trinity
+++ b/Trinity
@@ -17,7 +17,7 @@ use Pipeliner;
use Fasta_reader;
use List::Util qw(min max);
-my $VERSION = "v2.1.0";
+my $VERSION = "v2.1.1";
BEGIN {
@@ -1554,13 +1554,13 @@ sub run_chrysalis {
$pipeliner->add_commands( new Command($cmd, "$iworm_min100_fa_file.bowtie_build.ok"));
my $bowtie_sam_file = "$chrysalis_output_dir/iworm.bowtie.nameSorted.bam";
+ my $samtools_max_memory = int($jellyfish_ram/$CPU);
if ($long_reads){
- $cmd = "bash -c \" set -o pipefail;bowtie2 --local -a --threads $CPU -f $iworm_min100_fa_file $bowtie_reads_fa | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
+ $cmd = "bash -c \" set -o pipefail;bowtie2 --local -a --threads $CPU -f $iworm_min100_fa_file $bowtie_reads_fa | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $samtools_max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
}
else{
- $cmd = "bash -c \" set -o pipefail; bowtie -a -m 20 --best --strata --threads $CPU --chunkmbs 512 -q -S -f $iworm_min100_fa_file $bowtie_reads_fa | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
- }
-
+ $cmd = "bash -c \" set -o pipefail; bowtie -a -m 20 --best --strata --threads $CPU --chunkmbs 512 -q -S -f $iworm_min100_fa_file $bowtie_reads_fa | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $samtools_max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
+ }
$pipeliner->add_commands( new Command($cmd, "$bowtie_sam_file.ok"));
@@ -2080,10 +2080,13 @@ sub prep_seqs {
# make fasta
foreach my $f (@initial_files){
my $fastool_cmd = "cat $f | $FASTOOL_DIR/fastool";
+ my $linecount_cmd = "cat $f | wc -l";
if ($f=~/\.gz$/){
$fastool_cmd = "gunzip -c $f | $FASTOOL_DIR/fastool";
+ $linecount_cmd = "gunzip -c $f | wc -l";
}elsif ($f=~/\.bz2$/){
$fastool_cmd = "bunzip2 -dkc $f | $FASTOOL_DIR/fastool";
+ $linecount_cmd = "bunzip2 -dkc $f | wc -l";
}
if ($SS_lib_type && $SS_lib_type eq "R") {
@@ -2092,6 +2095,9 @@ sub prep_seqs {
$fastool_cmd .= " --illumina-trinity --to-fasta >> $file_prefix.fa 2> $f.readcount ";
my $cmd = $fastool_cmd;
&process_cmd($cmd);
+
+ &ensure_complete_FQtoFA_conversion($linecount_cmd, "$f.readcount");
+
}
}
elsif ($seqType eq "fa") {
@@ -3031,3 +3037,29 @@ sub run_ANANAS {
exit(0);
}
+
+
+####
+sub ensure_complete_FQtoFA_conversion {
+ my ($linecount_cmd, $fastool_count_file) = @_;
+
+ my $fastool_count_info = `cat $fastool_count_file`;
+ $fastool_count_info =~ /Sequences parsed: (\d+)/ or confess "Error, cannot parse $fastool_count_file";
+ my $fastool_count = $1;
+
+
+ my $num_fq_lines = `$linecount_cmd`;
+ chomp $num_fq_lines;
+
+ my $num_fq_entries = $num_fq_lines / 4;
+
+ if ($num_fq_entries == $fastool_count) {
+ print STDERR "-conversion of $num_fq_entries from FQ to FA format succeeded.\n";
+ }
+ else {
+ confess "Error, counts of reads in FQ: $num_fq_entries (as per $linecount_cmd) doesn't match fastool's report of FA records: $fastool_count ";
+ }
+
+ return;
+}
+
diff --git a/trinity-plugins/htslib-1.2.1.tar.bz2 b/trinity-plugins/htslib-1.2.1.tar.bz2
new file mode 100644
index 0000000..0c82ade
Binary files /dev/null and b/trinity-plugins/htslib-1.2.1.tar.bz2 differ
diff --git a/trinity-plugins/htslib.github.Aug182015.tar.gz b/trinity-plugins/htslib.github.Aug182015.tar.gz
new file mode 100644
index 0000000..455614d
Binary files /dev/null and b/trinity-plugins/htslib.github.Aug182015.tar.gz differ
diff --git a/trinity-plugins/samtools-0.1.19.tar.bz2 b/trinity-plugins/samtools-0.1.19.tar.bz2
new file mode 100644
index 0000000..2db0d79
Binary files /dev/null and b/trinity-plugins/samtools-0.1.19.tar.bz2 differ
diff --git a/util/misc/fastQ_rand_subset.pl b/util/misc/fastQ_rand_subset.pl
index e5377d0..8b0f7e0 100755
--- a/util/misc/fastQ_rand_subset.pl
+++ b/util/misc/fastQ_rand_subset.pl
@@ -1,24 +1,5 @@
#!/usr/bin/env perl
-## util/fastQ_rand_subset.pl
-## Purpose: Extracts out a specific number of random reads from an
-## input file, making sure that left and right ends of the selected
-## reads are paired
-## Usage: $0 left.fq right.fq num_entries
-##
-## This code implements reservoir sampling, which produces an unbiased
-## selection regardless of the number of entries (assuming an
-## appropriately uniform random distribution function).
-
-## See:
-## * https://en.wikipedia.org/wiki/Reservoir_sampling
-##
-## The current implementation requires the selected entries to be
-## stored in memory, but passes over the input file(s) only once. If
-## the array loading / reading is too slow or memory intensive, this
-## could be implemented in a two-pass fashion by first getting
-## indexes, and then getting the reads
-
use strict;
use warnings;
@@ -31,83 +12,88 @@ my $usage = "usage: $0 left.fq right.fq num_entries\n\n";
my $left_fq = $ARGV[0] or die $usage;
my $right_fq = $ARGV[1] or die $usage;
-my $num_entries = $ARGV[2] or die $usage;
+my $num_to_sample = $ARGV[2] or die $usage;
-my @selected_left_entries = ();
-my @selected_right_entries = ();
+srand();
main: {
- print STDERR "Selecting $num_entries entries...";
-
- my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
- my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
-
- my $num_M_entries = $num_entries/1e6;
- $num_M_entries .= "M";
- my $base_left_fq = basename($left_fq);
- my $base_right_fq = basename($right_fq);
-
- open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
- open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
-
- srand();
-
- my $num_skipped = 0;
- my $num_output_entries = 0;
- my $num_entries_read = 0;
-
- my $left_entry = 0;
- my $right_entry = 0;
-
- while ($left_entry = $left_fq_reader->next()) {
- $right_entry = $right_fq_reader->next();
-
- unless ($left_entry && $right_entry) {
- die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
- }
- unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
- die "Error, core read names don't match: "
- . "Left: " . $left_entry->get_core_read_name() . "\n"
- . "Right: " . $right_entry->get_core_read_name() . "\n";
+ ## do reservoir sampling on indices instead of the fastq records themselves, then just extract the selected ones.
+
+ my $num_fq_entries = &get_num_fq_entries($left_fq);
+ my @selected_entries = (1..$num_to_sample);
+
+ for (my $i = $num_to_sample + 1; $i <= $num_fq_entries; $i++) {
+
+ my $rand_pos = rand($i);
+ if ($rand_pos < $num_to_sample) {
+ $selected_entries[$rand_pos] = $i;
+ }
}
- $num_entries_read++;
-
- if($num_entries_read <= $num_entries){
- # Populate reservoir with entries up to num_entries
- push(@selected_left_entries, $left_entry);
- push(@selected_right_entries, $right_entry);
- } else {
- # Randomly replace elements in the reservoir
- # with a decreasing probability
- my $swapPos = int(rand($num_entries_read));
-
- if($swapPos < $num_entries){
- $selected_left_entries[$swapPos] = $left_entry;
- $selected_right_entries[$swapPos] = $right_entry;
- }
+ my %indices_want = map { + $_ => 1 } @selected_entries;
+ @selected_entries = (); # free
+
+
+ my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
+ my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
+
+ my $num_M_entries = $num_to_sample/1e6;
+ $num_M_entries .= "M";
+ my $base_left_fq = basename($left_fq);
+ my $base_right_fq = basename($right_fq);
+
+ open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
+ open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
+
+ my $fq_index = 0;
+ while (my $left_entry = $left_fq_reader->next()) {
+ my $right_entry = $right_fq_reader->next();
+
+ unless ($left_entry && $right_entry) {
+ die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
+ }
+ unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
+ die "Error, core read names don't match: "
+ . "Left: " . $left_entry->get_core_read_name() . "\n"
+ . "Right: " . $right_entry->get_core_read_name() . "\n";
+ }
+
+ $fq_index++;
+
+ if ($indices_want{$fq_index}) {
+ print $left_ofh $left_entry->get_fastq_record();
+ print $right_ofh $right_entry->get_fastq_record();
+ }
}
- }
-
- if ($num_entries > $num_entries_read) {
- die "Error, num_entries $num_entries > total records available: $num_entries_read ";
- }
+
+ print STDERR " done.\n";
+
+ close $left_ofh;
+ close $right_ofh;
+
+ exit(0);
+}
- # print selected entries to their respective files
- foreach my $entry (@selected_left_entries){
- print $left_ofh $entry->get_fastq_record();
- }
- foreach my $entry (@selected_right_entries){
- print $right_ofh $entry->get_fastq_record();
- }
+####
+sub get_num_fq_entries {
+ my ($fastq_file) = @_;
- print STDERR " done.\n";
+ unless (-s $fastq_file) {
+ die "Error, cannot locate file $fastq_file";
+ }
- close $left_ofh;
- close $right_ofh;
+ my $linecount;
+ if ($fastq_file =~ /\.gz$/) {
+ $linecount = `gunzip -c $fastq_file | wc -l`;
+ }
+ else {
+ $linecount = `cat $fastq_file | wc -l`;
+ }
+ chomp $linecount;
+
+ my $num_records = $linecount / 4;
- exit(0);
+ return($num_records);
}
-
diff --git a/util/misc/fastQ_rand_subset.pl b/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
similarity index 100%
copy from util/misc/fastQ_rand_subset.pl
copy to util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
diff --git a/util/misc/run_STAR.pl b/util/misc/run_STAR.pl
index fa8c269..40b5dbc 100755
--- a/util/misc/run_STAR.pl
+++ b/util/misc/run_STAR.pl
@@ -26,6 +26,8 @@ my $usage = <<__EOUSAGE__;
# --CPU <int> number of threads (default: 2)
# --out_prefix <string> output prefix (default: star)
# --out_dir <string> output directory (default: current working directory)
+# --star_path <string> full path to the STAR program to use.
+# --patch <string> genomic targets to patch the genome fasta with.
#
#######################################################################
@@ -46,6 +48,9 @@ my $gtf_file;
my $out_dir;
my $ADV = 0;
+my $star_path = "STAR";
+my $patch;
+
&GetOptions( 'h' => \$help_flag,
'genome=s' => \$genome,
'reads=s' => \$reads,
@@ -54,7 +59,8 @@ my $ADV = 0;
'G=s' => \$gtf_file,
'out_dir=s' => \$out_dir,
'ADV' => \$ADV,
-
+ 'star_path=s' => \$star_path,
+ 'patch=s' => \$patch,
);
@@ -66,8 +72,12 @@ if ($help_flag) {
die $usage;
}
+if (@ARGV) {
+ die "Error, cannot recognize opts: @ARGV";
+}
+
-my $star_prog = `which STAR`;
+my $star_prog = `which $star_path`;
chomp $star_prog;
unless ($star_prog =~ /\w/) {
die "Error, cannot locate STAR program. Be sure it's in your PATH setting. ";
@@ -138,6 +148,11 @@ main: {
. " --alignSJDBoverhangMin 10 "
. " --chimSegmentReadGapMax parameter 3 "
. " --limitBAMsortRAM 20000000000";
+
+ if ($patch) {
+ $cmd .= " --genomeFastaFiles $patch ";
+ }
+
#if ($ADV) {
$cmd .= " --alignSJstitchMismatchNmax 5 -1 5 5 "; #which allows for up to 5 mismatches for non-canonical GC/AG, and AT/AC junctions, and any number of mismatches for canonical junctions (the default values 0 -1 0 0 replicate the old behavior (from AlexD)
diff --git a/util/support_scripts/GG_partitioned_trinity_aggregator.pl b/util/support_scripts/GG_partitioned_trinity_aggregator.pl
index c0fccff..ce71959 100755
--- a/util/support_scripts/GG_partitioned_trinity_aggregator.pl
+++ b/util/support_scripts/GG_partitioned_trinity_aggregator.pl
@@ -23,7 +23,7 @@ while (<STDIN>) {
open (my $fh, $filename) or die "Error, cannot open file $filename";
while (<$fh>) {
if (/>/) {
- s/>/>{$token}${counter}\_/;
+ s/>/>${token}_${counter}\_/;
}
print;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/trinityrnaseq.git
More information about the debian-med-commit
mailing list