[med-svn] [trinityrnaseq] 02/03: Imported Upstream version 2.1.1+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Mon Feb 22 12:26:58 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository trinityrnaseq.
commit fdeb4cbd44702e2e7d21be1232e67c0d2790dfcb
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Mon Feb 22 04:09:09 2016 -0800
Imported Upstream version 2.1.1+dfsg
---
galaxy-plugin/EdgeR_differentialExpression.xml | 38 -
.../EdgeR_differentialExpression_wrapper.py | 40 -
.../GauravGalaxy/EdgeR_differentialExpression.xml | 38 -
.../EdgeR_differentialExpression_wrapper.py | 40 -
.../GauravGalaxy/RSEM_estimates_to_matrix.xml | 51 -
galaxy-plugin/GauravGalaxy/Trinity | 2707 --------------------
.../GauravGalaxy/__add_to_PATH_setting.txt | 1 -
.../abundance_estimation_to_matrix.xml | 42 -
.../abundance_estimation_to_matrix_wrapper.py | 40 -
.../GauravGalaxy/align_and_estimate_abundance.xml | 90 -
galaxy-plugin/GauravGalaxy/alignreads.xml | 138 -
galaxy-plugin/GauravGalaxy/analyze_diff_exp.xml | 41 -
.../GauravGalaxy/analyze_diff_exp_wrapper.py | 63 -
.../GauravGalaxy/bash_command_executer.py | 44 -
galaxy-plugin/GauravGalaxy/cat.xml | 41 -
galaxy-plugin/GauravGalaxy/diffExpress_edgeR.xml | 47 -
galaxy-plugin/GauravGalaxy/transcriptsToOrfs.xml | 53 -
galaxy-plugin/GauravGalaxy/trinityToolWrapper.py | 82 -
galaxy-plugin/GauravGalaxy/trinityrnaseq.xml | 127 -
.../trinityrnaseq.xml.Graham_version_022014 | 127 -
galaxy-plugin/GauravGalaxy/trinityrnaseq_norm.xml | 102 -
galaxy-plugin/RSEM_abundance_estimation.xml | 72 -
galaxy-plugin/RSEM_estimates_to_matrix.xml | 51 -
galaxy-plugin/Trinity | 2707 --------------------
galaxy-plugin/__add_to_PATH_setting.txt | 1 -
galaxy-plugin/abundance_estimation_to_matrix.xml | 42 -
.../abundance_estimation_to_matrix_wrapper.py | 40 -
galaxy-plugin/align_and_estimate_abundance.xml | 90 -
galaxy-plugin/alignreads.xml | 138 -
galaxy-plugin/analyze_diff_exp.xml | 41 -
galaxy-plugin/analyze_diff_exp_wrapper.py | 63 -
galaxy-plugin/bash_command_executer.py | 44 -
galaxy-plugin/cat.xml | 41 -
galaxy-plugin/diffExpress_edgeR.xml | 47 -
galaxy-plugin/transcriptsToOrfs.xml | 53 -
galaxy-plugin/trinityToolWrapper.py | 82 -
galaxy-plugin/trinityrnaseq.xml | 127 -
.../trinityrnaseq.xml.Graham_version_022014 | 127 -
galaxy-plugin/trinityrnaseq_norm.xml | 102 -
39 files changed, 7820 deletions(-)
diff --git a/galaxy-plugin/EdgeR_differentialExpression.xml b/galaxy-plugin/EdgeR_differentialExpression.xml
deleted file mode 100644
index 2636909..0000000
--- a/galaxy-plugin/EdgeR_differentialExpression.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<tool id="EdgeR_differentialExpression" name="EdgeR_differentialExpression" version="0.0.1">
-
- <description>Identify Differentially Expressed Transcripts Using EdgeR</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- EdgeR_differentialExpression_wrapper.py
- $counts_matrix
- $dispersion
-
- </command>
- <inputs>
-
- <param type="data" format="txt" name="counts_matrix" label="Matrix of RNA-Seq fragment counts for transcripts per condition" />
- <param type="data" format="fasta" name="transcripts_fasta_file" label="Transcripts fasta file corresponding to matrix" />
- <param type="float" name="dispersion" value="0.1" min="0" label="dispersion value" help="Dispersion value to be used in the negative binomial" />
-
- </inputs>
- <outputs>
-
- <data format="tar.gz" name="EdgeR_Archive" label="${tool.name} on ${on_string}: EdgeR_Results.tar.gz" from_work_dir="edgeR_results.tar.gz" />
-
- </outputs>
- <tests>
-
- <test>
- <param name="myname" value="This is just a simple test" />
-
- </test>
-
-
- </tests>
- <help>
- help info here.
- </help>
-</tool>
diff --git a/galaxy-plugin/EdgeR_differentialExpression_wrapper.py b/galaxy-plugin/EdgeR_differentialExpression_wrapper.py
deleted file mode 100644
index ff55cd1..0000000
--- a/galaxy-plugin/EdgeR_differentialExpression_wrapper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import sys, os, subprocess
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit(1)
-
-usage= "usage: " + " $counts_matrix" + " $dispersion"
-
-if len(sys.argv)<2:
- print "Require atleast two parameters"
-else:
- print "All good- command going ahead"
-print " "
-
-def run_command(cmd):
- print "The command used: " + cmd
- pipe=subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
- pipe.wait()
- ret= pipe.returncode
- if ret:
- print "command died: " + str(ret)
- print pipe.stderr.readlines()
- sys.exit(1)
- else:
- return
-print " "
-
-countmatrix= "counts_matrix"
-
-cmd= "cp " + sys.argv[1] + " " + countmatrix
-run_command(cmd)
-
-cmd= TRINITY_BASE_DIR + "/Analysis/DifferentialExpression/run_DE_analysis.pl "+ " --matrix "+ countmatrix + " --method edgeR " + " --output edgeR_results "+ " --dispersion " + sys.argv[2] + " --tar_gz_outdir"
-
-run_command(cmd)
-
-sys.exit(0)
diff --git a/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression.xml b/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression.xml
deleted file mode 100644
index 2636909..0000000
--- a/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<tool id="EdgeR_differentialExpression" name="EdgeR_differentialExpression" version="0.0.1">
-
- <description>Identify Differentially Expressed Transcripts Using EdgeR</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- EdgeR_differentialExpression_wrapper.py
- $counts_matrix
- $dispersion
-
- </command>
- <inputs>
-
- <param type="data" format="txt" name="counts_matrix" label="Matrix of RNA-Seq fragment counts for transcripts per condition" />
- <param type="data" format="fasta" name="transcripts_fasta_file" label="Transcripts fasta file corresponding to matrix" />
- <param type="float" name="dispersion" value="0.1" min="0" label="dispersion value" help="Dispersion value to be used in the negative binomial" />
-
- </inputs>
- <outputs>
-
- <data format="tar.gz" name="EdgeR_Archive" label="${tool.name} on ${on_string}: EdgeR_Results.tar.gz" from_work_dir="edgeR_results.tar.gz" />
-
- </outputs>
- <tests>
-
- <test>
- <param name="myname" value="This is just a simple test" />
-
- </test>
-
-
- </tests>
- <help>
- help info here.
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression_wrapper.py b/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression_wrapper.py
deleted file mode 100644
index ff55cd1..0000000
--- a/galaxy-plugin/GauravGalaxy/EdgeR_differentialExpression_wrapper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import sys, os, subprocess
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit(1)
-
-usage= "usage: " + " $counts_matrix" + " $dispersion"
-
-if len(sys.argv)<2:
- print "Require atleast two parameters"
-else:
- print "All good- command going ahead"
-print " "
-
-def run_command(cmd):
- print "The command used: " + cmd
- pipe=subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
- pipe.wait()
- ret= pipe.returncode
- if ret:
- print "command died: " + str(ret)
- print pipe.stderr.readlines()
- sys.exit(1)
- else:
- return
-print " "
-
-countmatrix= "counts_matrix"
-
-cmd= "cp " + sys.argv[1] + " " + countmatrix
-run_command(cmd)
-
-cmd= TRINITY_BASE_DIR + "/Analysis/DifferentialExpression/run_DE_analysis.pl "+ " --matrix "+ countmatrix + " --method edgeR " + " --output edgeR_results "+ " --dispersion " + sys.argv[2] + " --tar_gz_outdir"
-
-run_command(cmd)
-
-sys.exit(0)
diff --git a/galaxy-plugin/GauravGalaxy/RSEM_estimates_to_matrix.xml b/galaxy-plugin/GauravGalaxy/RSEM_estimates_to_matrix.xml
deleted file mode 100644
index 3057eb0..0000000
--- a/galaxy-plugin/GauravGalaxy/RSEM_estimates_to_matrix.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<tool id="RSEM_estimates_to_matrix" name="RSEM_estimates_to_matrix" version="0.0.1">
-
- <description>Join RSEM estimates from multiple samples into a single matrix</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/RSEM_util/merge_RSEM_counts_and_labels_single_table.pl
-
- --labels
- #for $entry in $RSEM_samples:
- ${entry.column_label}
- #end for
-
- --RSEM_counts
- #for $entry in $RSEM_samples:
- ${entry.file}
- #end for
-
- > RSEM.counts.matrix
-
- </command>
- <inputs>
-
- <repeat name="RSEM_samples" title="RSEM abundance estimates for samples">
- <param name="file" label="Add file" type="data" format="text"/>
- <param name="column_label" label="column label" type="text" />
- </repeat>
-
- </inputs>
- <outputs>
- <data format="text" name="counts_matrix" label="${tool.name} on ${on_string}: Counts Matrix" from_work_dir="RSEM.counts.matrix"/>
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/Trinity b/galaxy-plugin/GauravGalaxy/Trinity
deleted file mode 100755
index be72d56..0000000
--- a/galaxy-plugin/GauravGalaxy/Trinity
+++ /dev/null
@@ -1,2707 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-use threads;
-no strict qw(subs refs);
-
-use FindBin;
-use lib ("$FindBin::Bin/PerlLib");
-use File::Basename;
-use Time::localtime;
-use Cwd;
-use Carp;
-use COMMON;
-use Getopt::Long qw(:config no_ignore_case pass_through);
-
-BEGIN {
-
- $ENV{TRINITY_HOME} = "$FindBin::Bin";
-
-}
-
-use HTC::GridRunner;
-
-
-open (STDERR, ">&STDOUT"); ## capturing stderr and stdout in a single stdout stream
-
-# Site specific setup
-
-my $CPU_MAX = 64; # set higher at your own risk. Definitely don't set it higher than the number of cores available on your machine.
-
-my $KMER_SIZE = 25;
-my $MAX_KMER_SIZE = 32;
-
-my $INCHWORM_CUSTOM_PARAMS;
-
-# option list:
-my ($seqType, @left_files, @right_files, @single_files, $SS_lib_type, $min_contig_length,
- $group_pairs_distance, $jaccard_clip, $show_advanced_options,
- $output_directory, $prep_only
- );
-
-# What is allowed for the options. Put string to be displayed in '%allowed'; this
-# will be shown to the user via help and on error. Keys are the variable names.
-# Actual hash to be used for checking is auto-generated. Fancy regex inside map
-# is just to get rid of the syntactic sugar 'or' in the display string.
-
-my %allowed =
- ( seqType => 'fa, or fq'
- , kmer_method => 'jellyfish, meryl, or inchworm'
- );
-
-my %allowed_check;
-foreach my $all (keys %allowed) {
- my %h = map { (my $s = $_) =~ s/^or //; $s => 1 } split ', ', $allowed{$all};
- $allowed_check{$all} = \%h;
-}
-
-# defaults:
-
-$output_directory = &create_full_path("trinity_out_dir", 0);
-
-
-# butterfly opts
-$min_contig_length = 200;
-$group_pairs_distance = 500;
-my $path_reinforcement_distance;
-my $PE_path_reinforcement_distance = 75;
-my $SE_path_reinforcement_distance = 25;
-
-my $NO_RUN_BUTTERFLY_FLAG = 0;
-my $RERUN_BUTTERFLY_FLAG = 0;
-my $bfly_opts = "";
-my $bflyHeapSpaceMax = "10G";
-my $bflyHeapSpaceInit = "1G";
-
-my $BFLY_JAR = "";
-
-# butterfly path merging criteria
-my $NO_PATH_MERGING = 0;
-my $MIN_PER_ID_SAME_PATH; # leave these at the butterfy defaults
-my $MAX_DIFFS_SAME_PATH;
-my $MAX_INTERNAL_GAP_SAME_PATH;
-
-
-# misc opts
-my $min_kmer_cov = 1;
-my $meryl_opts = "";
-my $inchworm_cpu = 6;
-
-my $min_percent_read_iworm_kmers = -1; # experimental, off
-
-my $CPU = 2;
-my $bflyCPU;
-my $bflyCalculateCPU = 0;
-my $bflyGCThreads = 2;
-
-my $long_reads = "";
-
-
-## ADVANCED OPTIONS:
-
-my $no_meryl_flag = 0;
-
-## Chrysalis opts
-my $min_glue = 2;
-my $min_iso_ratio = 0.05;
-my $glue_factor = 0.05;
-my $max_reads_per_graph = 200000;
-my $max_reads_per_loop = 10000000;
-my $min_pct_read_mapping = 0;
-my $NO_RUN_QUANTIFYGRAPH_FLAG = 0;
-my $NO_RUN_CHRYSALIS_FLAG = 0;
-my $chrysalis_output_dir = "chrysalis";
-my $component_directory;
-
-my $help_flag;
-my $advanced_help_flag;
-my $SHOW_CITATION_FLAG = 0;
-
-my $VERSION = "trinityrnaseq_r20140717";
-my $show_version_flag = 0;
-
-## Kmer methods
-my $kmer_method = "";
-
-## Jellyfish
-my $max_memory;
-
-
-## Grid computing options:
-my $grid_conf_file;
-
-## Performance monitoring options
-my $pm_logfile = "Trinity.timing";
-my $pm_trinity_startstring;
-my $pm_trinity_endstring;
-my $pm_trinity_start=0;
-my $pm_trinity_end=0;
-my $pm_inchworm_start=0;
-my $pm_inchworm_end=0;
-my $pm_chrysalis_start=0;
-my $pm_chrysalis_end=0;
-my $pm_butterfly_start=0;
-my $pm_butterfly_end=0;
-my $pm_left_fa_size=0;
-my $pm_right_fa_size=0;
-my $pm_single_fa_size=0;
-my $pm_trinity_fa_size=0;
-my $pm_trinity_arguments="";
-my $pm_inchworm_kmers=0;
-my $pm_read_count=0;
-
-my $run_with_collectl = 0;
-# flush each second, record procs+rest every 5 secs, use only process subsystem
-my $collectl_param = "-F1 -i5:5 -sZ";
-my $collectl_output_directory = "collectl";
-my $collectl_pid = 0;
-my $collectl_out = "";
-my $collectl_titlename = "";
-my $start_dir = cwd();
-
-## misc other opts, mostly for testing purposes
-my $run_as_paired_flag = 0; ## in case we have paired reads in single fasta file, already oriented.
-my $weldmer_size = 48;
-my $FORCE_INCHWORM_KMER_METHOD = 0;
-
-my $TRIPLET_LOCK = 1;
-my $EXTENDED_TRIPLET_LOCK = 0;
-
-
-
-
-my $PARALLEL_IWORM_FLAG = 1;
-my $NO_PARALLEL_IWORM = 0;
-
-## Quality trimming params
-my $RUN_TRIMMOMATIC_FLAG = 0;
-my $trimmomatic_quality_trim_params = "LEADING:5 TRAILING:5 MINLEN:36";
-
-## Normalize reads
-my $NORMALIZE_READS_FLAG = 0;
-my $normalize_max_read_cov = 50;
-my $NORMALIZE_BY_READ_SET = 0;
-
-# Note: For the Trinity logo below the backslashes are quoted in order to keep
-# them from quoting the character that follows them. "\\" keeps "\ " from occurring.
-
-my $basic_usage = qq^
-
-
-###############################################################################
-#
-# ______ ____ ____ ____ ____ ______ __ __
-# | || \\ | || \\ | || || | |
-# | || D ) | | | _ | | | | || | |
-# |_| |_|| / | | | | | | | |_| |_|| ~ |
-# | | | \\ | | | | | | | | | |___, |
-# | | | . \\ | | | | | | | | | | |
-# |__| |__|\\_||____||__|__||____| |__| |____/
-#
-###############################################################################
-#
-# Required:
-#
-# --seqType <string> :type of reads: ( $allowed{seqType} )
-#
-# --JM <string> :(Jellyfish Memory) number of GB of system memory to use for
-# k-mer counting by jellyfish (eg. 10G) *include the 'G' char
-#
-# If paired reads:
-# --left <string> :left reads, one or more (separated by space)
-# --right <string> :right reads, one or more (separated by space)
-#
-# Or, if unpaired reads:
-# --single <string> :single reads, one or more (note, if single file contains pairs, can use flag: --run_as_paired )
-#
-####################################
-## Misc: #########################
-#
-# --SS_lib_type <string> :Strand-specific RNA-Seq read orientation.
-# if paired: RF or FR,
-# if single: F or R. (dUTP method = RF)
-# See web documentation.
-#
-# --CPU <int> :number of CPUs to use, default: $CPU
-# --min_contig_length <int> :minimum assembled contig length to report
-# (def=$min_contig_length)
-#
-# --genome <string> :genome guided mode, provide path to genome fasta file (see genome-guided param section under --show_full_usage_info)
-#
-# --jaccard_clip :option, set if you have paired reads and
-# you expect high gene density with UTR
-# overlap (use FASTQ input file format
-# for reads).
-# (note: jaccard_clip is an expensive
-# operation, so avoid using it unless
-# necessary due to finding excessive fusion
-# transcripts w/o it.)
-#
-# --trimmomatic :run Trimmomatic to quality trim reads
-# see '--quality_trimming_params' under full usage info for tailored settings.
-#
-#
-# --normalize_reads :run in silico normalization of reads. Defaults to max. read coverage of $normalize_max_read_cov.
-# see '--normalize_max_read_cov' under full usage info for tailored settings.
-#
-#
-# --output <string> :name of directory for output (will be
-# created if it doesn't already exist)
-# default( your current working directory: "$output_directory" )
-#
-# --full_cleanup :only retain the Trinity fasta file, rename as \${output_dir}.Trinity.fasta
-#
-# --cite :show the Trinity literature citation
-#
-# --version :reports Trinity version ($VERSION) and exits.
-#
-# --show_full_usage_info :show the many many more options available for running Trinity (expert usage).
-^;
-
-my $full_usage = qq^
-# --prep :Only prepare files (high I/O usage) and stop before kmer counting.
-#
-# --full_cleanup_ET :only retains assembly fasta file, error tolerant (ET)
-#
-# --no_cleanup :retain all intermediate input files.
-#
-####################################################
-# Inchworm and K-mer counting-related options: #####
-#
-# --min_kmer_cov <int> :min count for K-mers to be assembled by
-# Inchworm (default: $min_kmer_cov)
-# --inchworm_cpu <int> :number of CPUs to use for Inchworm, default is min(6, --CPU option)
-#
-# --no_run_inchworm :stop after running jellyfish, before inchworm.
-#
-###################################
-# Chrysalis-related options: ######
-#
-# --max_reads_per_graph <int> :maximum number of reads to anchor within
-# a single graph (default: $max_reads_per_graph)
-# --min_glue <int> :min number of reads needed to glue two inchworm contigs
-# together. (default: $min_glue)
-# --no_run_chrysalis :stop Trinity after Inchworm and before
-# running Chrysalis
-# --no_run_quantifygraph :stop Trinity just before running the
-# parallel QuantifyGraph computes, to
-# leverage a compute farm and massively
-# parallel execution..
-#
-# --chrysalis_output <string> :name of directory for chrysalis output (will be
-# created if it doesn't already exist)
-# default( "$chrysalis_output_dir" )
-#
-# --no_bowtie :dont run bowtie to use pair info in chrysalis clustering.
-#
-#####################################
-### Butterfly-related options: ####
-#
-# --bfly_opts <string> :additional parameters to pass through to butterfly
-# (see butterfly options: java -jar Butterfly.jar ).
-# (note: only for expert or experimental use. Commonly used parameters are exposed through this Trinity menu here).
-#
-# //////////////////////////////////
-# Alternative reconstruction modes:
-# Default mode is the 'regular' Butterfly transcript reconstruction by graph node extension.
-#
-# --PasaFly PASA-like algorithm for maximally-supported isoforms
-# or
-# --CuffFly Cufflinks-like algorithm to report minimum transcripts
-#
-#
-# Butterfly read-pair grouping settings (used for all reconstruction modes to define 'pair paths'):
-#
-# --group_pairs_distance <int> :maximum length expected between fragment pairs (default: $group_pairs_distance)
-# (reads outside this distance are treated as single-end)
-#
-# ///////////////////////////////////////////////
-# Butterfly default reconstruction mode settings. (no CuffFly or PasaFly custom settings are currently available).
-#
-# --path_reinforcement_distance <int> :minimum overlap of reads with growing transcript
-# path (default: PE: $PE_path_reinforcement_distance, SE: $SE_path_reinforcement_distance)
-# Set to 1 for the most lenient path extension requirements.
-#
-# --no_triplet_lock : (increase stringency of regular butterfly reconstruction (default: on))
-# lock triplet-supported nodes: node 'c' having read path 'A-B-C' disables 'Z-B-C' if no such read support exists.
-#
-# --extended_lock : (further increase the stringency of regular butterfy reconstruction)
-# extend the triplet lock to include longer range read path information.
-# ex. in extending path 'A-B-Z' to 'A-B-Z-D', we only find read support for 'A-B-C-D', that 'A-B-Z' extension to 'D' will be blocked.
-#
-#
-# /////////////////////////////////////////
-# Butterfly transcript reduction settings:
-#
-# --NO_EM_REDUCE : do not run the final EM step to rank transcripts and remove lower-ranking entries that lack unique read conent.
-#
-# --no_path_merging : all final transcript candidates are output (including SNP variations, however, some SNPs may be unphased)
-#
-# By default, alternative transcript candidates are merged (in reality, discarded) if they are found to be too similar, according to the following logic:
-#
-# (identity=(numberOfMatches/shorterLen) > 95.0% or if we have <= 2 mismatches) and if we have internal gap lengths <= 10
-#
-# with parameters as:
-#
-# --min_per_id_same_path <int> default: 95 min percent identity for two paths to be merged into single paths
-# --max_diffs_same_path <int> default: 2 max allowed differences encountered between path sequences to combine them
-# --max_internal_gap_same_path <int> default: 10 maximum number of internal consecutive gap characters allowed for paths to be merged into single paths.
-#
-# If, in a comparison between two alternative transcripts, they are found too similar, the transcript with the greatest cumulative
-# compatible read (pair-path) support is retained, and the other is discarded.
-#
-#
-# //////////////////////////////////////////////
-# Butterfly Java and parallel execution settings.
-#
-# --bflyHeapSpaceMax <string> :java max heap space setting for butterfly
-# (default: $bflyHeapSpaceMax) => yields command
-# 'java -Xmx$bflyHeapSpaceMax -jar Butterfly.jar ... \$bfly_opts'
-# --bflyHeapSpaceInit <string> :java initial hap space settings for
-# butterfly (default: $bflyHeapSpaceInit) => yields command
-# 'java -Xms$bflyHeapSpaceInit -jar Butterfly.jar ... \$bfly_opts'
-# --bflyGCThreads <int> :threads for garbage collection
-# (default: $bflyGCThreads))
-# --bflyCPU <int> :CPUs to use (default will be normal
-# number of CPUs; e.g., $CPU)
-# --bflyCalculateCPU :Calculate CPUs based on 80% of max_memory
-# divided by maxbflyHeapSpaceMax
-# --no_run_butterfly :stops after the Chrysalis stage. You'll
-# need to run the Butterfly computes
-# separately, such as on a computing grid.
-# Then, concatenate all the Butterfly assemblies by running:
-# 'find trinity_out_dir/ -name "\*allProbPaths.fasta" \
-# -exec cat {} + > trinity_out_dir/Trinity.fasta'
-#
-# --bfly_jar <string> : /path/to/Butterfly.jar, otherwise default
-# Trinity-installed version is used.
-#
-
-#
-################################################################################
-#### Quality Trimming Options ####
-#
-# --quality_trimming_params <string> defaults to: "$trimmomatic_quality_trim_params"
-#
-################################################################################
-#### In silico Read Normalization Options ###
-#
-# --normalize_max_read_cov <int> defaults to 50
-# --normalize_by_read_set run normalization separate for each pair of fastq files,
-# then one final normalization that combines the individual normalized reads.
-# Consider using this if RAM limitations are a consideration.
-#
-################################################################################
-#### Genome-guided de novo assembly
-#
-# * required:
-#
-# --genome_guided_max_intron <int> :maximum allowed intron length (also maximum fragment span on genome)
-#
-# --genome_guided_use_bam <string> :use a provided coord-sorted bam file as starting point. Otherwise, use gmap to align to the genome.
-#
-# * optional:
-#
-# --genome_guided_min_coverage <int> :minimum read coverage for identifying and expressed region of the genome. (default: 1)
-#
-# --genome_guided_min_reads_per_partition <int> :default min of 10 reads per partition
-#
-# --genome_guided_CPU <int> : number of threads for the individual genome-guided Trinity commands to use. (defaults to --CPU setting)
-#
-# --genome_guided_sort_buffer <string> : amount of RAM to dedicate to the initial prep of genome-guided read partitioning (defaults to --JM)
-#
-#
-# --GMAP_CPU <int> :defaults to --CPU setting.
-#
-# --genome_guided_just_prep : process stops after prepping the reads for assembly (prior to submitting to a computing grid for parallel execution)
-#
-#################################
-# Grid-computing options: #######
-#
-# --grid_conf_file <string> :configuration file for supported compute farms
-# ex. TRINITY_HOME/htc_conf/BroadInst_LSF.conf
-# currently supported computing gris: LSF, SGE
-#
-#
- ^;
-
-my $usage_synopsis = qq^
-###############################################################################
-#
-# *Note, a typical Trinity command might be:
-#
-# Trinity --seqType fq --JM 100G --left reads_1.fq --right reads_2.fq --CPU 6
-#
-#
-# and for Genome-guided Trinity:
-#
-# Trinity --genome genome.fasta \
-# --genome_guided_max_intron 10000 --genome_guided_sort_buffer 10G \
-# --genome_guided_CPU 4 \
-# --seqType fq --JM 2G --left reads_1.fq --right reads_2.fq --CPU 6
-# (and optionally provide your own bam file: --genome_guided_use_bam rnaseq_alignments.csorted.bam
-# or Trinity will run GSNAP to generate one. )
-#
-#
-# see: $FindBin::RealBin/sample_data/test_Trinity_Assembly/
-# for sample data and 'runMe.sh' for example Trinity execution
-# For more details, visit: http://trinityrnaseq.sf.net
-#
-###############################################################################
-
-
- ^;
-
-
-
-my $advanced_usage = <<_ADVANCEDUSAGE_;
-###################################################################################
- ## Not intended for users, instead for experimentation by developers ##
-###################################################################################
-#
-#
-# Inchworm-related options:
-#
-# --INCHWORM_CUSTOM_PARAMS <string> :additional parameters to be passed on to Inchworm
-# --FORCE_INCHWORM_KMER_METHOD :uses inchworm built-in kmer cataloger instead of jellyfish (not recommended)
-# --long_reads <string> :fasta file containing corrected pac bio reads
-# --NO_PARALLEL_IWORM : turn off parallel iworm assembly
-#
-#
-# Chyrsalis-related options:
-#
-# --min_pcnt_read_iworm_kmers <int> :min percentage of a read sequence that must be composed of inchworm kmers to be pursued
-# by chrysalis (default: $min_percent_read_iworm_kmers) note: off if < 0
-#
-# --min_iso_ratio <float> :min fraction of average kmer coverage between two iworm contigs
-# required for gluing. (default: $min_iso_ratio)
-# --glue_factor <float> :fraction of max (iworm pair coverage) for read glue support (default: $glue_factor)
-#
-# --max_reads_per_loop <int> :maximum number of reads to read into
-# memory at once (default: $max_reads_per_loop)
-# --min_pct_read_mapping <int> :minimum percent of a reads kmers that must map to an
-# inchworm bundle (aka. component) default: 0
-#
-# --bowtie_components :use bowtie2 to generate readsToTranscripts mappings
-#
-#
-# Other:
-# --monitoring :use collectl to monitor all steps of Trinity
-#
-# --compdir|component_directory : use a temporary or local directory for Components_bin
-#
-#
-
-
-_ADVANCEDUSAGE_
-
- ;
-
-
-my $ROOTDIR = "$FindBin::RealBin";
-my $UTILDIR = "$ROOTDIR/util";
-my $INCHWORM_DIR = "$ROOTDIR/Inchworm";
-my $CHRYSALIS_DIR = "$ROOTDIR/Chrysalis";
-my $BUTTERFLY_DIR = "$ROOTDIR/Butterfly";
-my $JELLYFISH_DIR = "$ROOTDIR/trinity-plugins/jellyfish";
-my $FASTOOL_DIR = "$ROOTDIR/trinity-plugins/fastool";
-my $COLLECTL_DIR = "$ROOTDIR/trinity-plugins/collectl/bin";
-my $COREUTILS_DIR = "$ROOTDIR/trinity-plugins/coreutils/bin";
-my $PARAFLY = "$ROOTDIR/trinity-plugins/parafly/bin/ParaFly";
-my $TRIMMOMATIC = "$ROOTDIR/trinity-plugins/Trimmomatic/trimmomatic.jar";
-
-my $usage = $basic_usage . $usage_synopsis;
-
-unless (@ARGV) {
- die "$usage\n";
-}
-
-# Log command line parameters for performance monitoring
-foreach (@ARGV) {
- $pm_trinity_arguments = $pm_trinity_arguments . " " . $_;
-};
-
-
-my $sort_exec = &COMMON::get_sort_exec($CPU);
-
-my $NO_FASTOOL = 0;
-my $NO_CLEANUP = 0;
-my $FULL_CLEANUP = 0;
-my $FULL_CLEANUP_ERROR_TOLERANT = 0; ## NOTE, THIS IS AN AWFUL IDEA... //FIXME: add propper error-handling mechanisms
-my $NO_BOWTIE = 0;
-
-
-my $BOWTIE_COMP = 0;
-
-my $NO_RUN_INCHWORM_FLAG = 0;
-
-my $JELLY_S;
-
-
-my $PASAFLY_MODE = 0;
-my $CUFFFLY_MODE = 0;
-
-my $full_usage_info_flag;
-
-my $NO_TRIPLET_LOCK;
-my $NO_EM_REDUCE;
-
-## Genome-guided params:
-my $genome_fasta_file;
-my $genome_guided_max_intron;
-my $genome_guided_use_bam;
-my $genome_guided_min_coverage = 1;
-my $genome_guided_min_reads_per_partition = 10;
-my $GMAP_CPU;
-my $genome_guided_CPU;
-my $genome_guided_sort_buffer;
-my $genome_guided_just_prep_flag = 0;
-
-my @ORIG_ARGS = @ARGV;
-
-&GetOptions(
-
- 'h|help' => \$help_flag,
- 'advanced_help' => \$advanced_help_flag,
- 'show_full_usage_info' => \$full_usage_info_flag,
-
- ## general opts
- "seqType=s" => \$seqType,
- "left=s{,}" => \@left_files,
- "right=s{,}" => \@right_files,
- "single=s{,}" => \@single_files,
-
- "SS_lib_type=s" => \$SS_lib_type,
-
- "long_reads=s" => \$long_reads,
-
- "output=s" => \$output_directory,
-
- "min_contig_length=i" => \$min_contig_length,
-
- "jaccard_clip" => \$jaccard_clip,
-
- "cite" => \$SHOW_CITATION_FLAG,
-
- 'CPU=i' => \$CPU,
-
- 'prep' => \$prep_only,
-
- 'KMER_SIZE=i' => \$KMER_SIZE,
-
-
- # Quality trimming:
- 'trimmomatic' => \$RUN_TRIMMOMATIC_FLAG,
- 'quality_trimming_params=s' => \$trimmomatic_quality_trim_params,
-
- # In silico read normalization
- 'normalize_reads' => \$NORMALIZE_READS_FLAG,
- 'normalize_max_read_cov=i' => \$normalize_max_read_cov,
- 'normalize_by_read_set' => \$NORMALIZE_BY_READ_SET,
-
-
- # Butterfly opts
- 'no_run_butterfly' => \$NO_RUN_BUTTERFLY_FLAG,
- 'no_triplet_lock' => \$NO_TRIPLET_LOCK,
- 'extended_lock' => \$EXTENDED_TRIPLET_LOCK,
- "group_pairs_distance=i" => \$group_pairs_distance,
- 'bfly_opts=s' => \$bfly_opts,
- 'bflyHeapSpaceMax=s' => \$bflyHeapSpaceMax,
- 'bflyHeapSpaceInit=s' => \$bflyHeapSpaceInit,
- 'bflyGCThreads=i' => \$bflyGCThreads,
- 'bflyCPU=i' => \$bflyCPU,
- 'bflyCalculateCPU' => \$bflyCalculateCPU,
- 'bfly_jar=s' => \$BFLY_JAR,
-
- 'path_reinforcement_distance=i' => \$path_reinforcement_distance,
- 'rerun_butterfly' => \$RERUN_BUTTERFLY_FLAG,
-
- 'NO_EM_REDUCE' => \$NO_EM_REDUCE,
- 'no_path_merging' => \$NO_PATH_MERGING,
- 'min_per_id_same_path=i' => \$MIN_PER_ID_SAME_PATH,
- 'max_diffs_same_path=i' => \$MAX_DIFFS_SAME_PATH,
- 'max_internal_gap_same_path=i' => \$MAX_INTERNAL_GAP_SAME_PATH,
-
-
- 'PasaFly' => \$PASAFLY_MODE,
- 'CuffFly' => \$CUFFFLY_MODE,
-
- # Inchworm & kmer catalog opts
-
- 'min_kmer_cov=i' => \$min_kmer_cov,
- 'inchworm_cpu=i' => \$inchworm_cpu,
- 'FORCE_INCHWORM_KMER_METHOD' => \$FORCE_INCHWORM_KMER_METHOD,
- 'INCHWORM_CUSTOM_PARAMS=s' => \$INCHWORM_CUSTOM_PARAMS,
- 'no_run_inchworm' => \$NO_RUN_INCHWORM_FLAG,
-
- # Jellyfish
- 'JM=s' => \$max_memory, # in GB
-
- # Chrysalis -related opts
- 'min_glue=i' => \$min_glue,
- 'glue_factor=f' => \$glue_factor,
- 'min_iso_ratio=f' => \$min_iso_ratio,
- 'min_pcnt_read_iworm_kmers=i' => \$min_percent_read_iworm_kmers,
- 'no_run_quantifygraph' => \$NO_RUN_QUANTIFYGRAPH_FLAG,
- 'max_reads_per_graph=i' => \$max_reads_per_graph,
- 'max_reads_per_loop=i' => \$max_reads_per_loop,
- 'no_run_chrysalis' => \$NO_RUN_CHRYSALIS_FLAG,
- 'min_pct_read_mapping=i' => \$min_pct_read_mapping,
- 'weldmer_size=i' => \$weldmer_size,
- "chrysalis_output=s" => \$chrysalis_output_dir,
- "no_bowtie" => \$NO_BOWTIE,
- "bowtie_comp" => \$BOWTIE_COMP,
-
- # Grid computing options
- 'grid_conf_file=s' => \$grid_conf_file,
-
- "show_advanced_options" => \$show_advanced_options,
-
-
- # misc
- 'run_as_paired' => \$run_as_paired_flag,
- 'no_fastool' => \$NO_FASTOOL,
- 'no_cleanup' => \$NO_CLEANUP,
- 'full_cleanup' => \$FULL_CLEANUP,
- 'version' => \$show_version_flag,
- 'monitoring' => \$run_with_collectl,
- 'full_cleanup_ET' => \$FULL_CLEANUP_ERROR_TOLERANT,
-
- # hidden (don't look here! ;)
- 'KMER_SIZE=i' => \$KMER_SIZE,
- 'jelly_s=i' => \$JELLY_S,
- 'compdir|component_directory=s' => \$component_directory,
- 'NO_PARALLEL_IWORM' => \$NO_PARALLEL_IWORM,
-
-
-
- # genome guided
- "genome=s" => \$genome_fasta_file,
- "genome_guided_max_intron=i" => \$genome_guided_max_intron,
- "genome_guided_use_bam=s" => \$genome_guided_use_bam,
- "genome_guided_min_coverage=i" => \$genome_guided_min_coverage,
- "genome_guided_min_reads_per_partition=i" => \$genome_guided_min_reads_per_partition,
- "genome_guided_CPU=i" => \$genome_guided_CPU,
- "GMAP_CPU=i" => \$GMAP_CPU,
- "genome_guided_sort_buffer=s" => \$genome_guided_sort_buffer,
- "genome_guided_just_prep" => \$genome_guided_just_prep_flag,
-
- );
-
-
-
-if ($SHOW_CITATION_FLAG) {
- &show_lit_citation();
- exit(0);
-}
-
-
-if ($full_usage_info_flag) {
- $usage = $basic_usage . $full_usage . $usage_synopsis;
- die "$usage\n";
-}
-
-
-if ($advanced_help_flag) {
- die "$advanced_usage\n";
-}
-if ($help_flag) {
- die "$usage\n";
-}
-
-if ($show_version_flag) {
- print "Trinity version: $VERSION\n";
- exit(1);
-}
-
-if ($NO_CLEANUP && $FULL_CLEANUP) {
- die "cannot set --no_cleanup and --full_cleanup as they contradict";
-}
-
-
-if ($KMER_SIZE > $MAX_KMER_SIZE) {
- die "Error, kmer size can be at most $MAX_KMER_SIZE ";
-}
-
-
-if ($NO_TRIPLET_LOCK) {
- $TRIPLET_LOCK = 0; # turn it off since on by default.
-}
-if ($NO_PARALLEL_IWORM) {
- # turn it off.
- $PARALLEL_IWORM_FLAG = 0;
-}
-
-my $MIN_IWORM_LEN = $KMER_SIZE;
-
-
-unless ($GMAP_CPU) {
- $GMAP_CPU = $CPU;
-}
-unless ($genome_guided_CPU) {
- $genome_guided_CPU = $CPU;
-}
-
-if (@ARGV) {
- die "Error, do not understand options: @ARGV\n";
-}
-
-if ($run_with_collectl && $^O !~ /linux/i) {
- print STDERR "WARNING, --monitoring can only be used on linux. Turning it off.\n\n";
- $run_with_collectl = 0;
-}
-
-unless ($BFLY_JAR) {
- $BFLY_JAR = "$BUTTERFLY_DIR/Butterfly.jar";
-}
-
-
-## Check options set:
-
-# Subroutine takes variable *reference* plus name of variable. Lower-cases
-# variable value and checks to see if it one of the allowed ones.
-# 'die' has new-line in order to keep line number from being shown to user.
-
-sub check_option {
- my ($option, $name) = @_;
- $$option = lc $$option;
- if ($$option eq '') {
- die "Error, option '--$name' is required.\n";
- }
- if (!defined $allowed_check{$name}{$$option}) {
- die "Error, option '--$name' ($$option) not one of $allowed{$name}\n";
- }
-}
-
-check_option( \$seqType, 'seqType' );
-
-my $USE_FASTOOL = 1; # by default, using fastool for fastq to fasta conversion
-if ($NO_FASTOOL) {
- $USE_FASTOOL = 0;
-}
-
-if ($SS_lib_type) {
- unless ($SS_lib_type =~ /^(R|F|RF|FR)$/) {
- die "Error, unrecognized SS_lib_type value of $SS_lib_type. Should be: F, R, RF, or FR\n";
- }
-}
-
-unless ( (@left_files && @right_files) || @single_files ) {
- die "Error, need either options 'left' and 'right' or option 'single'\n";
-}
-
-if (@left_files) {
- @left_files = split(",", join(",", @left_files));
-}
-if (@right_files) {
- @right_files = split(",", join(",", @right_files));
-}
-if (@single_files) {
- @single_files = split(",", join(",", @single_files));
-}
-
-
-if ($min_iso_ratio > 1) {
- die "Error, --min_iso_ratio should be <= 1 \n";
-}
-
-## keep the original 'xG' format string for the --JM option, then calculate the numerical value for max_memory
-my $JM_string = $max_memory; ## this one is used in the Chrysalis exec string
-if ($max_memory) {
- $max_memory =~ /^([\d\.]+)G$/ or die "Error, cannot parse max_memory value of $max_memory. Set it to 'xG' where x is a numerical value\n";
-
- $max_memory = $1;
- $max_memory *= 1024**3; # convert to from gig to bytes
-}
-else {
- die "Error, must specify max memory for jellyfish to use, eg. --JM 10G \n";
-}
-
-unless ($genome_guided_sort_buffer) {
- $genome_guided_sort_buffer = $JM_string;
-}
-
-## Try to remove stack limits
-if ($^O eq "linux") { # cannot set stacksize on newer macs for some reason...
-# &try_unlimit_stacksize();
-}
-
-my $curr_limit_settings = `/bin/sh -c 'ulimit -a' `;
-unless ($curr_limit_settings && $curr_limit_settings =~ /\w/) {
- $curr_limit_settings = `/bin/csh -c limit`; # backup, probably not needed.
-}
-
-print "Current settings:\n$curr_limit_settings\n\n";
-
-
-## Check Java version:
-unless ($NO_RUN_BUTTERFLY_FLAG || $NO_RUN_CHRYSALIS_FLAG) {
- my $java_version = `java -Xmx64m -version 2>&1 `;
- unless ($java_version =~ /(java|openjdk) version \"1\.[67]\./) {
- die "Error, Trinity requires access to Java version 1.6 or 1.7. Currently installed version is: $java_version";
- }
-}
-
-# Give the variable with memory size and a user-oriented name
-
-sub bfly_check {
- my ($mem, $name) = @_;
- my ($num, $type) = $mem =~ /^(\d+)([MG])$/;
- if (!defined $mem || !defined $type) {
- die "Error, $name must be set to a value of format: \\d+G or \\d+M (eg. 1G or 1000M)\n Currently: $mem\n";
- }
- return $type eq 'G' ? $num * 1024**3 : $num * 1024**2;
-}
-
-my $bflyHeapSpaceMaxBytes = bfly_check($bflyHeapSpaceMax , 'bflyHeapSpaceMax' );
-my $bflyHeapSpaceInitBytes = bfly_check($bflyHeapSpaceInit, 'bflyHeapSpaceInit');
-
-if ($bflyHeapSpaceInitBytes > $bflyHeapSpaceMaxBytes) {
- die "Error, bflyHeapSpaceInit ($bflyHeapSpaceInit) must be less or equal to bflyHeapSpaceMax ($bflyHeapSpaceMax).\n";
-}
-
-
-if ($CPU > $CPU_MAX) {
- print STDERR "Warning, --CPU $CPU might be excessive. Limiting it to $CPU_MAX for now.\n";
- $CPU = $CPU_MAX;
-}
-
-if ($inchworm_cpu > $CPU) {
- $inchworm_cpu = $CPU;
-}
-
-if ($bflyCalculateCPU && $max_memory) {
- $bflyCPU = int ($max_memory * 0.80 / $bflyHeapSpaceMaxBytes);
-}
-
-$bflyCPU = $CPU if !defined $bflyCPU;
-
-if ($bflyCPU > $CPU_MAX) {
- print STDERR "Warning, --bflyCPU $bflyCPU might be excessive. Limiting it to $CPU_MAX for now.\n";
- $bflyCPU = $CPU_MAX;
-}
-
-
-if (defined($bflyGCThreads) && $bflyGCThreads > 32) {
- die "Error, you probably want fewer than $bflyGCThreads java garbage collection threads. Try a number less than 32.";
-}
-
-
-if ($genome_fasta_file) {
- ## genome-guided mode.
- unless ($genome_guided_max_intron) {
- die "Error, must specifiy --genome_guided_max_intron <int> for genome-guided mode.\n";
- }
- unless ($genome_guided_use_bam) {
- ## check for gsnap software
- my @tools = qw(gmap_build gsnap);
- foreach my $tool (@tools) {
- my $path = `which $tool`;
- if ($path =~ /\w/) {
- print STDERR "Found $tool at $path\n";
- }
- else {
- die "Error, cannot locate tool: $tool, required for genome-guided pipeline.";
- }
- }
- }
-}
-
-
-
-$ENV{OMP_NUM_THREADS} = $CPU; ## for Inchworm and Chrysalis
-
-
-my $PAIRED_MODE = ( (@left_files && @right_files) || $run_as_paired_flag) ? 1:0;
-if ($PAIRED_MODE && (!$NO_RUN_CHRYSALIS_FLAG) && (!$NO_BOWTIE)) {
- ## be sure we can find 'bowtie', since we use it as part of the iworm pair scaffolding step
- my $bowtie_path = `which bowtie`;
- my $bowtie_build_path = `which bowtie-build`;
- if ($bowtie_path =~ /\w/ && $bowtie_build_path =~ /\w/) {
- print "Paired mode requires bowtie. Found bowtie at: $bowtie_path\n and bowtie-build at $bowtie_build_path\n\n";
- }
- else {
- die "Error, cannot find path to bowtie ($bowtie_path) or bowtie-build ($bowtie_build_path), which is now needed as part of Chrysalis' read scaffolding step. If you should choose to not run bowtie, include the --no_bowtie in your Trinity command.\n\n";
- }
-
- my $samtools_path = `which samtools`;
- if ($samtools_path =~ /\w/) {
- print "Found samtools at: $samtools_path\n";
- }
- else {
- die "Error, cannot find samtools. Please be sure samtools is installed and included in your PATH setting.\n";
- }
-
- unless ($path_reinforcement_distance) {
- $path_reinforcement_distance = $PE_path_reinforcement_distance;
- }
-}
-else {
- unless ($path_reinforcement_distance) {
- $path_reinforcement_distance = $SE_path_reinforcement_distance;
- }
-}
-
-
-my $MKDIR_OUTDIR_FLAG = 0; ## only purging output_directory if we create it in this run.
-
-
-## Regular run. Name the output based on the butterfly reconstruction mode.
-my $butterfly_output_filename = "Trinity.fasta";
-if ($PASAFLY_MODE) {
- $butterfly_output_filename = "Trinity.Pasafly.fasta";
-}
-elsif ($CUFFFLY_MODE) {
- $butterfly_output_filename = "Trinity.Cufffly.fasta";
-}
-
-main: {
- $ENV{OMP_NUM_THREADS} = $CPU;
-
-
- unless ($NO_RUN_BUTTERFLY_FLAG || $NO_RUN_CHRYSALIS_FLAG) {
- print STDERR "-since butterfly will eventually be run, lets test for proper execution of java\n";
- &test_java_failure_capture();
- }
-
- unless ($genome_fasta_file) {
-
- if (basename($chrysalis_output_dir) !~ /chrysalis/i) {
- die "Error, chrysalis output directory name must include 'chrysalis' in the name."; # lets try to prevent bad things from happening... (security issue)
- }
-
- if ($FULL_CLEANUP && basename($output_directory) !~ /\w/) {
- die "Error, working in full-cleanup mode. Specify a named directory for the output. The directory and contents are purged at end of a successful run.";
- }
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) { # genome-guided mode
-
- if (basename($output_directory) !~ /trinity/i) {
- die "Error, in genome-guided mode, the output directory name must include 'trinity' in the name (precautionary measure)";
- }
- $FULL_CLEANUP = 1;
-
- ## purge chrysalis directory from a previously failed run
- if (-d $output_directory) {
- print STDERR "WARNING: $output_directory exists. Since under full-cleanup mode, deleting this first before proceeding.\n:";
- &process_cmd("rm -rf $output_directory");
- }
- }
-
-
- if ($chrysalis_output_dir !~ /^\//) {
- $chrysalis_output_dir = "$output_directory/$chrysalis_output_dir";
- }
-
- $chrysalis_output_dir = &create_full_path($chrysalis_output_dir, 0);
-
- if ($component_directory){
- # does a component directory exist from a previous run?
- if (-e $chrysalis_output_dir.'/Component_bins'){
- if (-l $chrysalis_output_dir.'/Component_bins'){
- $component_directory = readlink($chrysalis_output_dir.'/Component_bins');
- }else{
- $component_directory = $chrysalis_output_dir.'/Component_bins';
- }
- warn "Reusing existing component directory $component_directory\n";
- }else{
- $component_directory .= "/Trinity.$$";
- mkdir($component_directory) || die ("component directory cannot be created or already exists!\n");
- die "Cannot create component directory $component_directory" unless -d $component_directory;
- $component_directory .= "/Component_bins";
- mkdir($component_directory) || die ("component directory cannot be created or already exists!\n");
- die "Cannot create component directory $component_directory" unless -d $component_directory;
- }
- # so that users know where it is/remember to remove it if manually done?
- symlink($component_directory,$chrysalis_output_dir.'/Component_bins') unless -e $chrysalis_output_dir.'/Component_bins';
- }else{
- $component_directory = $chrysalis_output_dir . '/Component_bins';
- $component_directory = &create_full_path($component_directory,0);
- }
- }
-
-
-
- ## create complete paths for input files:
- @left_files = &create_full_path(\@left_files, 1) if @left_files;
- @right_files = &create_full_path(\@right_files, 1) if @right_files;
- @single_files = &create_full_path(\@single_files, 1) if @single_files;
- $output_directory = &create_full_path($output_directory, 0);
- $long_reads = &create_full_path($long_reads, 1) if $long_reads;
- $genome_fasta_file = &create_full_path($genome_fasta_file, 1) if $genome_fasta_file;
- $genome_guided_use_bam = &create_full_path($genome_guided_use_bam, 1) if $genome_guided_use_bam;
-
- $grid_conf_file = &create_full_path($grid_conf_file, 1) if $grid_conf_file;
-
- unless (-d $output_directory) {
-
- &process_cmd("mkdir -p $output_directory");
- $MKDIR_OUTDIR_FLAG = 1;
- }
-
- if ((! $genome_fasta_file) && (! -d $chrysalis_output_dir)) {
- &process_cmd("mkdir -p $chrysalis_output_dir"); # note, won't be auto-cleaned up if not in the trinity_out_dir/
- }
-
- chdir ($output_directory) or die "Error, cannot cd to $output_directory";
-
- collectl_start() unless ($FULL_CLEANUP);
- &perfmon_start() unless ($FULL_CLEANUP);
-
- ##########################
- ## Run Quality Trimming
- ##########################
-
- if ($RUN_TRIMMOMATIC_FLAG) {
-
- print STDERR "---------------------------------------------------------------\n"
- . "------ Quality Trimming Via Trimmomatic ---------------------\n"
- . "<< $trimmomatic_quality_trim_params >>\n"
- . "---------------------------------------------------------------\n\n";
-
-
- unless ($seqType eq 'fq') {
- die "Error, cannot do quality trimming on fasta files, need fastq files.";
- }
-
- if (@left_files && @right_files) {
- my @trimmed_left_files;
- my @trimmed_right_files;
-
- while (@left_files) {
- my $left_file = shift @left_files;
- my $right_file = shift @right_files;
-
- my ($left_file_trimmed, $right_file_trimmed) = &run_trimmomatic_PE($left_file, $right_file, $trimmomatic_quality_trim_params);
- push (@trimmed_left_files, $left_file_trimmed);
- push (@trimmed_right_files, $right_file_trimmed);
- }
-
- @left_files = @trimmed_left_files;
- @right_files = @trimmed_right_files;
- }
- elsif (@single_files) {
- my @trimmed_single_files;
- foreach my $single_file (@single_files) {
- my $trimmed_single_file = &run_trimmomatic_SE($single_file, $trimmomatic_quality_trim_params);
- push (@trimmed_single_files, $trimmed_single_file);
- }
- @single_files = @trimmed_single_files;
- }
- }
-
- ##########################################
- ## In silico normalization
- ##########################################
-
- if ($NORMALIZE_READS_FLAG) {
-
- if (@left_files && @right_files) {
- my ($left_norm_file, $right_norm_file) = &run_normalization($normalize_max_read_cov, \@left_files, \@right_files);
- @left_files = ($left_norm_file);
- @right_files = ($right_norm_file);
- }
- elsif (@single_files) {
- @single_files = &run_normalization($normalize_max_read_cov, \@single_files);
- }
- }
-
- if ($genome_fasta_file) {
-
- if (@left_files && @right_files) {
- &run_genome_guided_Trinity(\@left_files, \@right_files);
- }
- else {
- &run_genome_guided_Trinity(\@single_files);
- }
-
-
- exit(0);
- }
-
-
-
- ## create inchworm file name
- my $inchworm_file = "inchworm.K$KMER_SIZE.L$MIN_IWORM_LEN";
- unless ($SS_lib_type) {
- $inchworm_file .= ".DS";
- }
- $inchworm_file .= ".fa";
- $inchworm_file = &create_full_path($inchworm_file, 0);
-
- my $trinity_target_fa = (@single_files) ? "single.fa" : "both.fa";
- my $inchworm_target_fa = $trinity_target_fa; # change this later if we have long_reads
-
-
- ## Don't prep the inputs if Inchworm already exists.... Resuming earlier operations.
- my $inchworm_finished_checkpoint_file = "$inchworm_file.finished";
- if (-s $inchworm_file && -e $inchworm_finished_checkpoint_file) {
- print "\n\n#######################################################################\n"
- . "Inchworm file: $inchworm_file detected.\n"
- . "Skipping Inchworm Step, Using Previous Inchworm Assembly\n"
- . "#######################################################################\n\n";
- #sleep(2);
- }
- else {
-
- ## Prep data for Inchworm
- my $count_of_reads;
- if (@left_files && @right_files) {
-
- unless (-s $trinity_target_fa && !-e "left.fa" && !-e "right.fa") {
-
- my ($left_SS_type, $right_SS_type);
- if ($SS_lib_type) {
- ($left_SS_type, $right_SS_type) = split(//, $SS_lib_type);
- }
- print("Converting input files. (in parallel)");
- my $thr1;
- my $thr2;
- if (!(-s "left.fa")) {
- $thr1 = threads->create('prep_seqs', \@left_files, $seqType, "left", $left_SS_type);
- } else {
- $thr1 = threads->create(sub { print ("left file exists, nothing to do");});
- }
- if (!(-s "right.fa")) {
- $thr2 = threads->create('prep_seqs', \@right_files, $seqType, "right", $right_SS_type);
- } else {
- $thr2 = threads->create(sub { print ("right file exists, nothing to do");});
- }
- @left_files = @{$thr1->join()};
- @right_files =@{$thr2->join()};
-
- if ($thr1->error() || $thr2->error()) {
- die "Error prepping sequences.";
- }
-
- print("Done converting input files.");
- ## Calculate input file sizes for performance monitoring
- # this should be set as the created fasta otherwise results will differ for same data passed as .fq and .fa?
- my $pm_temp = -s "left.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_left_fa_size = sprintf('%.0f', $pm_temp);
- $pm_temp = -s "right.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_right_fa_size = sprintf('%.0f', $pm_temp);
-
- &process_cmd("cat left.fa right.fa > $trinity_target_fa") unless (-s $trinity_target_fa && (-s $trinity_target_fa == ((-s "left.fa") + (-s "right.fa"))));
- unless (-s $trinity_target_fa == ((-s "left.fa") + (-s "right.fa"))){
- die "$trinity_target_fa is smaller (".(-s $trinity_target_fa)." bytes) than the combined size of left.fa and right.fa (".((-s "left.fa") + (-s "right.fa"))." bytes)\n";
- }
-
- # we keep if we have jaccard; delete later
- unlink ("left.fa", "right.fa") unless $jaccard_clip; # no longer needed now that we have 'both.fa', which is needed by chryaslis
- }
-
- foreach my $f ((@left_files,@right_files)){
- if (-s $f.'.readcount'){
- open (IN,$f.'.readcount');
- my $s = <IN>;
- close IN;
- $s=~/([0-9]+)$/;
- $count_of_reads += $1 if $1;
- }
- }
-
-
- }
- elsif (@single_files) {
-
- @single_files = @{&prep_seqs(\@single_files, $seqType, "single", $SS_lib_type) unless (-s "single.fa")};
- ## Calculate input file sizes for performance monitoring
- my $pm_temp = -s "single.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_single_fa_size = sprintf('%.0f', $pm_temp);
- foreach my $f (@single_files){
- if (-s $f.'.readcount'){
- open (IN,$f.'.readcount');
- my $s = <IN>;
- close IN;
- $s=~/([0-9]+)$/;
- $count_of_reads += $1 if $1;
- }
- }
- }
-
- else {
- die "not sure what to do. "; # should never get here.
- }
-
- if (!$count_of_reads){
- $count_of_reads = `wc -l < $inchworm_target_fa`;chomp($count_of_reads); #AP: grep is expensive; one test took 2h...!
- $count_of_reads/=2;
- }
- if ($long_reads) {
- $inchworm_target_fa .= ".wLongReads.fa";
- $count_of_reads += `grep -c '^>' $long_reads | wc -l`; #AP we don't know if these will be one single line
- &process_cmd("cat $long_reads $trinity_target_fa > $inchworm_target_fa");
- }
-
- open (my $ofh, ">$inchworm_target_fa.read_count") or die $!;
- print $ofh $count_of_reads."\n";
- close $ofh;
- }
-
- if ($prep_only){
- print "Data has been prepared. Exiting now as per user request\n";
- exit();
- }
-
- #################
- ## Inchworm step:
- $pm_inchworm_start = `date +%s`;
- unless (-s $inchworm_file && -e $inchworm_finished_checkpoint_file) {
-
-
- &run_inchworm($inchworm_file, $inchworm_target_fa, $SS_lib_type, $kmer_method);
- &process_cmd("touch $inchworm_finished_checkpoint_file");
- }
- $pm_inchworm_end = `date +%s`;
-
-
- unless (-s $inchworm_file) {
-
- ## No inchworm output under genome-guided flag, must be sparse data.
-
- if ($FULL_CLEANUP_ERROR_TOLERANT && $FULL_CLEANUP && -e $inchworm_file && -e $inchworm_finished_checkpoint_file) {
- ## GG-trinity mode, clean-up gracefully
- if ($MKDIR_OUTDIR_FLAG) {
- &process_cmd("rm -rf $component_directory") if $component_directory;
- &process_cmd("rm -rf $output_directory");
- }
- else {
- print STDERR "WARNING, cannot remove output directory $output_directory, since not created in this run. (safety precaution)\n";
- }
- exit(0);
- }
- else {
- die "Error, no Inchworm output is detected at: $inchworm_file";
- }
- }
-
-
- if ($jaccard_clip) {
-
- eval {
-
- if ($jaccard_clip && -s 'left.fa' && -s 'right.fa') {
- $inchworm_file = &run_jaccard_clip_left_right($inchworm_file, \@left_files, \@right_files, $seqType, $SS_lib_type);
- #$inchworm_file = &run_jaccard_clip_left_right($inchworm_file, $left_file, $right_file, $seqType, $SS_lib_type);
-
- }
- elsif ($jaccard_clip && -s 'single.fa') {
- $inchworm_file = &run_jaccard_clip_single_but_really_paired($inchworm_file, \@single_files, $seqType, $SS_lib_type);
- #$inchworm_file = &run_jaccard_clip_single_but_really_paired($inchworm_file, $single_file, $seqType, $SS_lib_type);
- }
- };
-
- if ($@) {
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## GG-trinity mode, clean up gracefully
- system("rm -rf $output_directory &"); # ignore file system errors on failed cleanup
- exit(0);
- }
- else {
- die "Error, jaccard-clip failed: $@";
- }
- }
- }
-
-
- if ($NO_RUN_CHRYSALIS_FLAG) {
- print "\n\n\n";
- print "#########################################################################\n";
- print "Inchworm is complete. --no_run_chrysalis was specified, so stopping here.\n";
- print "#########################################################################\n\n\n";
-
- exit(0);
- }
- $ENV{OMP_NUM_THREADS} = $CPU;
- ##################
- ## Chrysalis step:
-
- if ($min_percent_read_iworm_kmers > 0) {
-
- ### EXPERIMENTAL: DO NOT USE!
-
- $trinity_target_fa = &extract_reads_with_iworm_kmers($trinity_target_fa, $inchworm_file, $min_percent_read_iworm_kmers, $SS_lib_type);
-
- }
-
- ## butterfly commands can be reparameterized for exploring different assembly requirements
- ## chrysalis will just run or resume depending on what's already been processed.
- $pm_chrysalis_start = `date +%s`;
- my $butterfly_cmds = &run_chrysalis($inchworm_file, $inchworm_target_fa,
- $min_contig_length, $group_pairs_distance, $SS_lib_type, $trinity_target_fa);
- $pm_chrysalis_end = `date +%s`;
-
- print "Butterfly_cmds: $butterfly_cmds\n";
-
- if ($butterfly_cmds && -s $butterfly_cmds) {
-
- if ($NO_RUN_BUTTERFLY_FLAG) {
-
- print "\n\nYou've opted to run butterfly commands independently from this script, such as on a computing grid.\n\n";
- print "Butterfly commands to execute are available here:\n"
- . "\t$butterfly_cmds\n\n";
- print "After executing Butterfly commands, concatenate all Butterfly outputs by running:\n"
- . "\t\tfind $output_directory/ -name \"\*allProbPaths.fasta\" -exec cat {} + > $output_directory/Trinity.fasta\n\n\n";
-
- exit(0);
-
- }
- else {
-
- ## Run Butterfly
-
- print "Inchworm and Chrysalis complete. Butterfly commands to execute are provided here:\n"
- . $butterfly_cmds . "\n\n";
-
-
- print STDERR "---------------------------------------------------------------\n"
- . "-------------------- Butterfly --------------------------------\n"
- . "-- (Reconstruct transcripts from reads and de Bruijn graphs) --\n"
- . "---------------------------------------------------------------\n\n";
-
- $pm_butterfly_start = `date +%s`;
- if ($grid_conf_file) {
- my @bfly_cmds = `cat $butterfly_cmds`;
- chomp @bfly_cmds;
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "chrysalis/butterfly_on_grid.cacheSuccess");
- my $ret = $grid_runner->run_on_grid(@bfly_cmds);
- if ($ret) {
- die "Error, not all butterfly commands could complete successfully... cannot continue.";
- }
- }
- else {
- my $cmd = "$PARAFLY -c $butterfly_cmds -shuffle -CPU $bflyCPU -failed_cmds failed_butterfly_commands.$$.txt -v "; # shuffle them since the first ones are usually the longest-running ones.
- &process_cmd($cmd);
- }
- $pm_butterfly_end = `date +%s`;
-
- ## capture results:
- # my $cmd = 'find ./chrysalis -name "*allProbPaths.fasta" -exec cat {} + > Trinity.fasta.tmp';
- # no longer scan the file system... we know which files should exist
- my $cmd = "$UTILDIR/support_scripts/print_butterfly_assemblies.pl $chrysalis_output_dir/component_base_listing.txt > Trinity.fasta.tmp";
- &process_cmd($cmd);
-
- }
-
- }
-
- if ($FULL_CLEANUP) {
- print "Fully cleaning up.\n";
- $output_directory =~ s|/+$||g; # remove any trailing directory slash
-
- if (-s "Trinity.fasta.tmp") {
- rename("Trinity.fasta.tmp", "$output_directory.Trinity.fasta") or die "Error, cannot rename Trinity.fasta.tmp to $output_directory.Trinity.fasta";
-
- print "\n\n";
- print "###################################################################\n";
- print "Butterfly assemblies are written to $output_directory.Trinity.fasta\n";
- print "###################################################################\n\n\n";
-
- }
- else {
- print "\n\n";
- print "####################################\n";
- print "## No butterfly assemblies to report.\n";
- print "####################################\n\n\n";
- }
-
- if ($MKDIR_OUTDIR_FLAG) {
- system("rm -rf $output_directory &"); # ignore filesystem errors on failed cleanup
- }
- else {
- print STDERR "WARNING, cannot remove output directory $output_directory, since not created in this run. (safety precaution)\n";
- }
-
- }
- else {
-
-
- if (-s "Trinity.fasta.tmp") {
- rename("Trinity.fasta.tmp", $butterfly_output_filename) or die "Error, cannot rename Trinity.fasta.tmp to $butterfly_output_filename"; # now that process has finished.
- }
-
- if (-s $butterfly_output_filename) {
-
- print "\n\n";
- print "###################################################################\n";
- print "Butterfly assemblies are written to $output_directory/$butterfly_output_filename\n";
- print "###################################################################\n\n\n";
- }
- else {
- die "ERROR, no butterfly assemblies reported.";
- }
-
- }
-
- &perfmon_end() unless ($FULL_CLEANUP);
- exit(0);
-}
-
-
-####
-sub run_chrysalis {
- my ($inchworm_file, $reads_file,
- $min_contig_length, $group_pairs_distance, $SS_lib_type, $pairs_fa) = @_;
-
-
- my $butterfly_cmds = &create_full_path("$chrysalis_output_dir/butterfly_commands");
-
- my $quantify_graph_cmds = &create_full_path("$chrysalis_output_dir/quantifyGraph_commands");
-
- my $chrysalis_finished_checkpoint = "$chrysalis_output_dir/chrysalis.finished";
-
- if (-e $chrysalis_finished_checkpoint) {
-
- print "###################################################################\n";
- print "#### Chrysalis results already exist. Not rerunning Chrysalis. ####\n";
- print "###################################################################\n\n\n";
-
- #sleep(2);
-
- }
- else {
- ## run Chrysalis
-
- my $cmd = "$CHRYSALIS_DIR/Chrysalis -i $reads_file -iworm $inchworm_file -o $chrysalis_output_dir -cpu $CPU "
- . " -min_glue $min_glue -min_iso_ratio $min_iso_ratio -glue_factor $glue_factor -kmer_size " . ($KMER_SIZE-1) # chrysalis wants kmer overlap length
- . " -weldmer_size $weldmer_size "
- . " -min $min_contig_length -dist $group_pairs_distance -max_reads $max_reads_per_graph "
- . " -sort_exec \"$sort_exec\" "
- . " -sort_buffer_size $JM_string -max_mem_reads $max_reads_per_loop ";
-
- if ($SS_lib_type) {
- $cmd .= " -strand 1 ";
- }
-
- if ($PAIRED_MODE) {
- $cmd .= " -paired ";
- $cmd .= " -reads_for_pairs $pairs_fa ";
-
- if ($NO_BOWTIE) {
- $cmd .= " -no_pair_links ";
- }
-
- }
-
- if ($BOWTIE_COMP) {
- $cmd .= " -bowtie_comp ";
- }
-
- if ($min_pct_read_mapping) {
- $cmd .= " -min_pct_read_mapping $min_pct_read_mapping ";
- }
-
-
- $cmd .= " -butterfly $BFLY_JAR ";
-
- if ($NO_CLEANUP) {
- $cmd .= " -no_cleanup ";
- }
-
- $cmd .= " 2>&1 ";
-
- eval {
-
- &process_cmd($cmd);
-
- };
-
-
- if ($@) {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode - OK, not enough data that's worth pursuing.
- return("");
-
- }
-
- my $errmsg = "$curr_limit_settings\n";
- $errmsg .= "Error, the Chrysalis process failed:\n$@\n";
- croak $errmsg;
- }
-
-
- print "Chrysalis initial stage completed successfully.\n";
- &process_cmd("touch $chrysalis_finished_checkpoint");
- }
-
- ## partition the graphs and reads in prep for quantify graph and butterfly steps.
-
- unless (-s "$chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn") {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode - OK, not enough data that's worth pursuing.
- return("");
- }
-
- croak "Error, no deBruijn graphs generated based on inchworm contigs: $chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn";
- }
-
-
- my $partitioning_checkpoint_file = "$chrysalis_output_dir/file_partitioning.ok";
-
- my $cmd = "$UTILDIR/support_scripts/partition_chrysalis_graphs_n_reads.pl --deBruijns $chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn --componentReads $chrysalis_output_dir/readsToComponents.out.sort -N 1000 -L $min_contig_length --compdir $component_directory ";
-
- &process_cmd($cmd) unless (-e $partitioning_checkpoint_file);
-
- &process_cmd("touch $partitioning_checkpoint_file") unless (-e $partitioning_checkpoint_file);
-
- ## write the quantifygraph commands and butterfly commands
- my $component_base_listing_file = "$chrysalis_output_dir/component_base_listing.txt";
- unless (-s $component_base_listing_file) {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode
- return("");
- }
- croak "Error, component base listing file: $component_base_listing_file does not exist";
-
- }
-
-
- {
- open (my $bfly_cmds_ofh, ">$butterfly_cmds") or die $!;
- open (my $qgraph_cmd_ofh, ">$quantify_graph_cmds") or die $!;
-
-
- open (my $fh, $component_base_listing_file) or die $!;
- while (<$fh>) {
- chomp;
- my ($component_id, $base_filename) = split(/\t/);
-
-
- { # quantify graph command
-
- my $quantify_graph_cmd = "$CHRYSALIS_DIR/QuantifyGraph -g $base_filename.graph.tmp "
- . " -i $base_filename.reads.tmp "
- . " -o $base_filename.graph.out "
- . " -max_reads $max_reads_per_graph "
- . " -k " . ($KMER_SIZE - 1);
-
- if ($SS_lib_type) {
- $quantify_graph_cmd .= " -strand ";
- }
- if ($NO_CLEANUP) {
-
- $quantify_graph_cmd .= " -no_cleanup ";
- }
-
- print $qgraph_cmd_ofh $quantify_graph_cmd . "\n";
-
- }
-
- { # butterfly command
-
-
- my $bfly_cmd = "java -Xmx$bflyHeapSpaceMax -Xms$bflyHeapSpaceInit ";
-
- if (defined($bflyGCThreads)) {
- $bfly_cmd .= " -XX:ParallelGCThreads=$bflyGCThreads ";
- }
-
- $bfly_cmd .= " -jar $BFLY_JAR -N 100000 -L $min_contig_length -F $group_pairs_distance -C $base_filename.graph ";
-
- if ($bfly_opts) {
- $bfly_cmd .= " $bfly_opts ";
- }
-
- $bfly_cmd .= " --path_reinforcement_distance=$path_reinforcement_distance ";
-
- if ($TRIPLET_LOCK) {
- $bfly_cmd .= " --triplet-lock ";
- }
- if ($EXTENDED_TRIPLET_LOCK) {
- $bfly_cmd .= " --extended_triplet ";
- }
-
- unless ($NO_EM_REDUCE) {
- $bfly_cmd .= " --EM_REDUCE ";
- }
-
- if ($NO_PATH_MERGING) {
- $bfly_cmd .= " --no_path_merging ";
- }
- else {
- if (defined($MIN_PER_ID_SAME_PATH)) {
- $bfly_cmd .= " --min_per_id_same_path=$MIN_PER_ID_SAME_PATH ";
- }
- if (defined($MAX_DIFFS_SAME_PATH)) {
- $bfly_cmd .= " --max_diffs_same_path=$MAX_DIFFS_SAME_PATH ";
- }
- if (defined($MAX_INTERNAL_GAP_SAME_PATH)) {
- $bfly_cmd .= " --max_internal_gap_same_path=$MAX_INTERNAL_GAP_SAME_PATH ";
- }
- }
-
- if ($PASAFLY_MODE) {
- $bfly_cmd .= " --PasaFly ";
- }
- elsif ($CUFFFLY_MODE) {
- $bfly_cmd .= " --CuffFly ";
- }
-
- print $bfly_cmds_ofh $bfly_cmd . "\n";
-
- }
- }
- close $fh;
- close $bfly_cmds_ofh;
- close $qgraph_cmd_ofh;
-
- }
-
- # see if we need to run the quantifyGraph commands:
- if ($NO_RUN_QUANTIFYGRAPH_FLAG) {
-
- print "#############################################################################\n";
- print "## Ceasing Trinity prior to execution of massively parallel operations.\n";
- print "##\n";
- print "## To complete Trinity, execute the following sets of commands:\n";
- print "##\n";
- print "## First, run the Chrysalis QuantifyGraph commands in parallel:\n";
- print "## $quantify_graph_cmds\n";
- print "##\n";
- print "## Then, execute all the Butterfly commands:\n";
- print "## $butterfly_cmds\n";
- print "##\n";
- print "## And, finally, concatenate all Butterfly assemblies into a single output file:\n";
- print "##\n";
- print "## find $output_directory/ -name \"\*allProbPaths.fasta\" -exec cat {} + > $output_directory/Trinity.fasta\n";
- print "##\n";
- print "##############################################################################\n";
- print "\n\n";
-
- exit(0);
- }
- else {
-
-
- my $quantify_graph_cmds_finished = &create_full_path("$chrysalis_output_dir/quantifyGraph_commands.run.finished");
- if (! -e $quantify_graph_cmds_finished) {
- ## run it
-
- print STDERR "---------------------------------------------------\n"
- . "----------- Chrysalis: QuantifyGraph --------------\n"
- . "-- (Integrate mapped reads into de Bruijn graph) --\n"
- . "---------------------------------------------------\n\n";
-
-
- if ($grid_conf_file) {
- my @quantify_graph_cmds = `cat $quantify_graph_cmds`;
- chomp @quantify_graph_cmds;
-
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "chrysalis/chrysalis_quantify_graph_on_grid.cacheSuccess");
- my $ret = $grid_runner->run_on_grid(@quantify_graph_cmds);
- if ($ret) {
- die "Error, not all Chrysalis quantify_graph commands completed successfully. Cannot continue.";
- }
- }
- else {
-
- my $cmd = "$PARAFLY -c $quantify_graph_cmds -CPU $CPU -failed_cmds failed_quantify_graph_commands.$$.txt -v -shuffle ";
- &process_cmd($cmd);
- }
-
- # write checkpoint
- &process_cmd("touch $quantify_graph_cmds_finished");
- }
-
-
- return($butterfly_cmds);
-
- }
-
-
-
-}
-
-
-####
-sub run_inchworm {
- # Build the k-mer catalog (Jellyfish, unless $FORCE_INCHWORM_KMER_METHOD is set) and
- # run Inchworm on it, writing the contigs to $inchworm_outfile.
- #
- # Args:
- #   $inchworm_outfile     - final output path; Inchworm writes to "$inchworm_outfile.tmp"
- #                           and the file is renamed only after a successful run
- #   $reads                - reads file (treated as fasta: two lines per read)
- #   $strand_specific_flag - when false, "--DS" is added to the inchworm command
- #   $kmer_method          - accepted but not referenced anywhere in this sub
- #
- # Side effects: writes "$reads.read_count", jellyfish outputs and checkpoint files in
- # the current directory; may exit(0) after Jellyfish or exit(1) on Inchworm failure.
- my ($inchworm_outfile, $reads, $strand_specific_flag, $kmer_method) = @_;
-
-
- ## get count of number of reads to be assembled.
- # NOTE(review): $pm_read_count is only assigned when the count file is missing or
- # empty, so it is left untouched when an existing count file is reused.
- my $read_count_file = "$reads.read_count";
- if (! -s $read_count_file) {
- my $count_of_reads = `wc -l < $reads`;chomp($count_of_reads); #AP: grep is expensive; one test took 2h...!
- $count_of_reads/=2; # assume fasta; two lines per read
- $pm_read_count = $count_of_reads;
- open (my $ofh, ">$read_count_file") or die $!;
- print $ofh $count_of_reads."\n";
- close $ofh;
- }
-
-
- my $inchworm_cmd;
-
- my @tmp_files; # to be deleted after successful inchworm run.
-
-
- #####################################################
- ## Using Jellyfish kmer method
- #####################################################
-
- if (! $FORCE_INCHWORM_KMER_METHOD) {
-
- my $jelly_kmer_fa_file = "jellyfish.kmers.fa";
- # the checkpoint name embeds $min_kmer_cov, so changing that value re-runs Jellyfish
- my $jelly_finished_checkpoint_file = "jellyfish.$min_kmer_cov.finished";
- unless (-e $jelly_finished_checkpoint_file) {
-
-
- print STDERR "-------------------------------------------\n"
- . "----------- Jellyfish --------------------\n"
- . "-- (building a k-mer catalog from reads) --\n"
- . "-------------------------------------------\n\n";
-
-
- my $read_file_size = -s $reads;
-
- # hash size heuristic: (memory budget minus size of the reads file) / 7
- my $jelly_hash_size = int( ($max_memory - $read_file_size)/7); # decided upon by Rick Westerman
-
-
- if ($jelly_hash_size < 100e6) {
- $jelly_hash_size = 100e6; # seems reasonable for a min hash size as 100M
- }
-
- ## for testing
- if ($JELLY_S) {
- $jelly_hash_size = $JELLY_S;
- }
-
- my $cmd = "$JELLYFISH_DIR/bin/jellyfish count -t $CPU -m $KMER_SIZE -s $jelly_hash_size ";
-
- unless ($SS_lib_type) {
- ## count both strands
- $cmd .= " --canonical ";
- }
-
- $cmd .= " $reads";
-
- &process_cmd($cmd);
-
-
- # remove a stale k-mer fasta from an earlier (unfinished) attempt before dumping again
- if (-s $jelly_kmer_fa_file) {
- unlink($jelly_kmer_fa_file) or die "Error, cannot unlink $jelly_kmer_fa_file";
- }
-
- my $jelly_db = "mer_counts.jf";
-
- $cmd = "$JELLYFISH_DIR/bin/jellyfish dump -L $min_kmer_cov $jelly_db > $jelly_kmer_fa_file";
-
- &process_cmd($cmd);
-
-
- ## write a histogram of the kmer counts.
- $cmd = "$JELLYFISH_DIR/bin/jellyfish histo -t $CPU -o $jelly_kmer_fa_file.histo $jelly_db";
- &process_cmd($cmd);
-
-
- unlink($jelly_db);
-
- ## if got this far, consider jellyfish done.
- &process_cmd("touch $jelly_finished_checkpoint_file");
-
-
- # NOTE(review): this early exit sits inside the "checkpoint missing" branch, so it is
- # only reached on a run that actually executed Jellyfish - confirm that is intended.
- if ($NO_RUN_INCHWORM_FLAG) {
- print STDERR "WARNING: --no_run_inchworm parameter in effect. Stopping here prior to running inchworm.\n";
- exit(0);
- }
-
-
- }
-
-
- $inchworm_cmd = "$INCHWORM_DIR/bin/inchworm --kmers $jelly_kmer_fa_file --run_inchworm -K $KMER_SIZE -L $MIN_IWORM_LEN --monitor 1 ";
-
- # hold on to the jellyfish file - we might use it for other applications.
- #push (@tmp_files, $jelly_finished_checkpoint_file, $jelly_kmer_fa_file) unless $NO_CLEANUP;
-
- }
- else {
-
- ######################################################
- ## Using Inchworm kmer method (original, slow method)
- ######################################################
-
- $inchworm_cmd = "$INCHWORM_DIR/bin/inchworm --reads $reads --run_inchworm -K $KMER_SIZE -L $MIN_IWORM_LEN --monitor 1 ";
- if ($min_kmer_cov > 1) {
- $inchworm_cmd .= " --minKmerCount $min_kmer_cov ";
- }
- }
-
-
- ## finish constructing the inchworm command to execute
-
- unless ($strand_specific_flag) {
- $inchworm_cmd .= " --DS ";
- }
-
- # NOTE(review): --keep_tmp_files is passed when $NO_CLEANUP is *false*; this reads as
- # inverted relative to the variable name - confirm against inchworm's option semantics.
- unless ($NO_CLEANUP) {
- $inchworm_cmd .= " --keep_tmp_files ";
- }
-
-
- # a dedicated inchworm thread count, when set, takes precedence over $CPU
- my $num_threads = ($inchworm_cpu) ? $inchworm_cpu : $CPU;
- $inchworm_cmd .= " --num_threads $num_threads ";
-
- if ($PARALLEL_IWORM_FLAG) {
- $inchworm_cmd .= " --PARALLEL_IWORM ";
- }
-
- if ($INCHWORM_CUSTOM_PARAMS) {
- $inchworm_cmd .= " $INCHWORM_CUSTOM_PARAMS ";
- }
-
- #$inchworm_cmd .= " 2>inchworm.log > $inchworm_outfile.tmp";
- $inchworm_cmd .= " > $inchworm_outfile.tmp";
-
- print STDERR "----------------------------------------------\n"
- . "--------------- Inchworm ---------------------\n"
- . "-- (Linear contig construction from k-mers) --\n"
- . "----------------------------------------------\n\n";
-
-
- # process_cmd dies on a non-zero exit; trap it so a friendlier hint can be printed
- eval {
-
- &process_cmd($inchworm_cmd);;
- };
-
- if ($@) {
-
- print STDERR "$@\n";
- print "** The inchworm process failed.";
- print STDERR "\n\nIf it indicates bad_alloc(), then Inchworm ran out of memory. You'll need to either reduce the size of your data set or run Trinity on a server with more memory available.\n\n";
- exit(1);
- }
-
- rename("$inchworm_outfile.tmp", $inchworm_outfile) or die "Error, cannot rename $inchworm_outfile.tmp to $inchworm_outfile"; # now we know for sure it's done.
-
-
- return;
-
-}
-
-####
-sub prep_seqs {
-    # Assemble "$file_prefix.fa" from the given read files.
-    #
-    # Args:
-    #   $initial_files_ref - array ref of input read files; *.gz and *.bz2 inputs are
-    #                        decompressed next to the original (any stale copy is removed first)
-    #   $seqType           - "fq" (converted to fasta) or "fa"; "cfa"/"cfq" are rejected
-    #   $file_prefix       - the combined reads are written to "$file_prefix.fa"
-    #   $SS_lib_type       - when "R", reads are reverse-complemented during conversion
-    #
-    # Returns: array ref of the (decompressed) input file names, or nothing at all when
-    # "$file_prefix.fa" already exists (the work is assumed to have been done already).
-    my ($initial_files_ref, $seqType, $file_prefix, $SS_lib_type) = @_;
-    my @initial_files = @$initial_files_ref;
-    return if -e "$file_prefix.fa";
-
-    # decompress any compressed inputs, replacing the entry with the uncompressed name
-    for (my $i = 0; $i < scalar(@initial_files); $i++) {
-        my $f = $initial_files[$i];
-        if ($f =~ /\.gz$/) {
-            my $new = $f;
-            $new =~ s/\.gz$//;
-            unlink($new);
-            &process_cmd("gunzip -c $f > $new");
-            $initial_files[$i] = $new;
-        }
-        elsif ($f =~ /\.bz2$/) {
-            my $new = $f;
-            $new =~ s/\.bz2$//;
-            unlink($new);
-            &process_cmd("bunzip2 -dkc $f > $new");
-            $initial_files[$i] = $new;
-        }
-    }
-
-    # FIX: the archived text read 'join(" ", at initial_files)' - an e-mail address
-    # obfuscation artifact of '@initial_files' that is not valid Perl.
-    my $initial_file_str = join(" ", @initial_files);
-    my $reverse_reads = ($SS_lib_type && $SS_lib_type eq "R");
-
-    if ($seqType eq "fq") {
-        # make fasta; each converter appends to the shared output file
-        foreach my $f (@initial_files) {
-            my $perlcmd = "$UTILDIR/support_scripts/fastQ_to_fastA.pl -I $f ";
-            my $fastool_cmd = "$FASTOOL_DIR/fastool";
-            if ($reverse_reads) {
-                $perlcmd .= " --rev ";
-                $fastool_cmd .= " --rev ";
-            }
-            $fastool_cmd .= " --illumina-trinity --to-fasta $f >> $file_prefix.fa 2> $f.readcount ";
-            $perlcmd .= " >> $file_prefix.fa 2> $f.readcount ";
-            my $cmd = ($USE_FASTOOL) ? $fastool_cmd : $perlcmd;
-            &process_cmd($cmd);
-        }
-    }
-    elsif ($seqType eq "fa") {
-        if (scalar(@initial_files) == 1 && ! $reverse_reads) {
-            ## just symlink it here:
-            my $cmd = "ln -s $initial_file_str $file_prefix.fa";
-            &process_cmd($cmd);
-        }
-        elsif (scalar(@initial_files) > 1 && ! $reverse_reads) {
-            my $cmd = "cat $initial_file_str > $file_prefix.fa";
-            &process_cmd($cmd);
-        }
-        else {
-            # reverse-stranded library: revcomp every file into the shared output
-            foreach my $f (@initial_files) {
-                my $cmd = "$UTILDIR/support_scripts/revcomp_fasta.pl $f >> $file_prefix.fa";
-                &process_cmd($cmd);
-            }
-        }
-    }
-    elsif (($seqType eq "cfa") || ($seqType eq "cfq")) {
-        # FIX: logical '||' instead of the bitwise '|' the original used on two booleans
-        confess "cfa, cfq not supported";
-    }
-    return \@initial_files;
-}
-
-
-
-###
-sub create_full_path {
-    # Turn a file name, or every element of an array ref of file names, into an
-    # absolute path by prefixing the current working directory when needed.
-    #
-    # Args:
-    #   $file          - a path string, or an array ref of path strings (updated in place)
-    #   $verify_exists - when true, confess if a named file does not exist
-    #
-    # Returns: the absolute path (scalar input) or the list of absolute paths (array ref input).
-    my ($file, $verify_exists) = @_;
-
-    unless (ref($file) eq "ARRAY") {
-        confess "Error, cannot locate file: $file" if ($verify_exists && ! -e $file);
-        my $working_dir = cwd();
-        # anything not starting with "/" is taken to be relative to the working directory
-        return ($file =~ m|^/|) ? $file : $working_dir . "/$file";
-    }
-
-    # $entry aliases the array element, so the caller's array is rewritten in place
-    foreach my $entry (@$file) {
-        confess "Error, cannot locate file: $entry" if ($verify_exists && ! -e $entry);
-        $entry = &create_full_path($entry);
-    }
-    return @$file;
-}
-
-
-
-####
-sub process_cmd {
-    # Echo a shell command with a timestamp, run it via system(), and die if it
-    # returns a non-zero status. The elapsed wall-clock seconds are printed on success.
-    my ($cmd) = @_;
-
-    print &mytime() . "CMD: $cmd\n";
-
-    my $began = time();
-    my $status = system($cmd);
-    my $elapsed = time() - $began;
-
-    # $status is the raw value returned by system(), not the decoded exit code
-    die "Error, cmd: $cmd died with ret $status" if ($status);
-
-    print "CMD finished (" . $elapsed . " seconds)\n";
-
-    return;
-}
-
-
-####
-sub run_jaccard_clip_left_right {
-    # Run the inchworm transcript splitter (jaccard clipping) on paired left/right reads.
-    #
-    # Returns the path "$inchworm_file.clipped.fa"; an existing non-empty output is
-    # reused without re-running, and a missing output after the run is fatal.
-    my ($inchworm_file, $left_files_aref, $right_files_aref, $seqType, $SS_lib_type) = @_;
-
-    my $output_file = "$inchworm_file.clipped.fa";
-
-    if (-s $output_file) {
-        print STDERR "###### WARNING: $output_file already exists, skipping the jaccard-clip step, using already existing output: $output_file\n";
-        return($output_file);
-    }
-
-    # the splitter takes comma-separated file lists
-    my $left_csv = join(",", @$left_files_aref);
-    my $right_csv = join(",", @$right_files_aref);
-
-    my $cmd = "$UTILDIR/support_scripts/inchworm_transcript_splitter.pl --iworm $inchworm_file "
-            . " --left $left_csv --right $right_csv --seqType $seqType --CPU $CPU ";
-    $cmd .= " --SS_lib_type $SS_lib_type " if ($SS_lib_type);
-
-    &process_cmd($cmd);
-
-    croak "Error, jaccard clipping didn't produce the expected output file: $output_file"
-        unless (-s $output_file);
-
-    return($output_file);
-}
-
-
-
-####
-sub run_jaccard_clip_single_but_really_paired {
-    # Run the inchworm transcript splitter (jaccard clipping) on reads supplied as
-    # "single" files that are passed on via --single_but_really_paired.
-    #
-    # Returns the path "$inchworm_file.clipped.fa"; an existing non-empty output is
-    # reused without re-running, and a missing output after the run is fatal.
-    my ($inchworm_file, $single_files_aref, $seqType, $SS_lib_type) = @_;
-
-    my $output_file = "$inchworm_file.clipped.fa";
-
-    if (-s $output_file) {
-        print STDERR "###### WARNING: $output_file already exists, skipping the jaccard-clip step, using already existing output: $output_file\n";
-        return($output_file);
-    }
-
-    # the splitter takes a comma-separated file list
-    my $single_csv = join(",", @$single_files_aref);
-
-    my $cmd = "$UTILDIR/support_scripts/inchworm_transcript_splitter.pl --iworm $inchworm_file "
-            . " --single_but_really_paired $single_csv --seqType $seqType --CPU $CPU ";
-    $cmd .= " --SS_lib_type $SS_lib_type " if ($SS_lib_type);
-
-    &process_cmd($cmd);
-
-    croak "Error, jaccard clipping didn't produce the expected output file: $output_file"
-        unless (-s $output_file);
-
-    return($output_file);
-}
-
-####
-sub test_java_failure_capture {
-    # Verify that java is on the PATH and that a failing java process is reported
-    # as a failure: ExitTester.jar is run once with argument 0 (must succeed) and
-    # once with argument 1 (must be detected as failed). Exits 1 if either check fails.
-
-    print "#######################################\n";
-    print "Running Java Tests\n";
-
-    my $java_path = `which java`;
-    die "Error, cannot find 'java'. Please be sure it is available within your \${PATH} setting and then try again."
-        unless ($java_path);
-
-    my $exit_tester = "java -Xmx64m -jar $UTILDIR/support_scripts/ExitTester.jar";
-
-    # first run: expected to succeed
-    my $passing_cmd = "$exit_tester 0";
-    eval {
-        &process_cmd($passing_cmd);
-    };
-    if ($@) {
-        print STDERR "Error encountered in testing for running of a simple java application. ";
-        print "$@\n\n";
-        print STDERR "Please check your java configuration.\n";
-        exit(1);
-    }
-
-    # second run: expected to fail, and process_cmd must notice
-    my $failing_cmd = "$exit_tester 1";
-    eval {
-        &process_cmd($failing_cmd);
-    };
-    unless ($@) {
-        print STDERR "-we are unable to properly capture java failure status. Please be sure that java (or any wrapper around java that's being used) can properly capture and propagate failure status before proceeding.\n";
-        exit(1);
-    }
-    print "-we properly captured the java failure status, as needed. Looking good.\n";
-
-    print "Java tests succeeded.\n";
-    print "###################################\n\n";
-
-    return;
-}
-
-
-####
-sub extract_reads_with_iworm_kmers {
-    # Run pull_reads_with_kmers with --target $inchworm_file and --reads
-    # $trinity_target_fa, redirecting its output to a file whose name embeds the
-    # percentage threshold.
-    #
-    # Returns the extracted-reads file name; an existing non-empty file is reused.
-    my ($trinity_target_fa, $inchworm_file, $min_percent_read_containing_kmers, $SS_lib_type) = @_;
-
-    my $extracted_reads_file = "${trinity_target_fa}.${min_percent_read_containing_kmers}pcnt.iworm_extracted";
-
-    if (-s $extracted_reads_file) {
-        print STDERR "-warning, iworm kmer-extracted reads file already exists: $extracted_reads_file. Re-using it.\n";
-        return($extracted_reads_file);
-    }
-
-    my $cmd = "$INCHWORM_DIR/bin/pull_reads_with_kmers "
-            . "--target $inchworm_file "
-            . "--reads $trinity_target_fa "
-            . "--min_percent_read_containing_kmers $min_percent_read_containing_kmers ";
-
-    # without a strand-specific library type, request double-stranded mode
-    $cmd .= " --DS " unless ($SS_lib_type);
-
-    $cmd .= " > $extracted_reads_file ";
-
-    &process_cmd($cmd);
-
-    return($extracted_reads_file);
-}
-
-
-sub try_unlimit_stacksize {
- # Best-effort attempt to raise the stack size limit to unlimited using BSD::Resource.
- # Failure is not fatal: a warning with instructions is emitted instead.
-
- # from Ryan Thompson
- # string-eval so that a missing BSD::Resource module is caught at run time rather
- # than aborting compilation of the whole script
- eval "use BSD::Resource; setrlimit(RLIMIT_STACK, RLIM_INFINITY, RLIM_INFINITY); ";
-
- if( $@ ) {
- warn <<"EOF";
-
- $@
-
- Unable to set unlimited stack size. Please install the BSD::Resource
- Perl module to allow this script to set the stack size, or set it
- yourself in your shell before running Trinity (ignore this warning if
- you have set the stack limit in your shell). See the following URL for
- more information:
-
- http://trinityrnaseq.sourceforge.net/trinity_faq.html#ques_E
-
-EOF
-;
- }
- else {
- # NOTE(review): only a failed eval is detected; the return value of setrlimit is not checked
- print "Successfully set unlimited stack size.\n";
- print "###################################\n\n";
- }
- return;;
-}
-
-sub mytime() {
-    # Return the current local time as a log prefix of the form
-    #   "Monday, February 22, 2016: 04:09:09\t"
-    # (full weekday and month names, zero-padded hh:mm:ss, trailing tab).
-    #
-    # The clock is sampled exactly once. The previous version called localtime
-    # separately for every field, so a timestamp taken across a second/minute
-    # boundary could mix fields from two different instants.
-    my @month_names = qw(January February March April May June July August September October November December);
-    my @weekday_names = qw(Sunday Monday Tuesday Wednesday Thursday Friday Saturday);
-
-    # object form of localtime (Time::localtime), as already relied upon by this sub
-    my $now = localtime();
-
-    return sprintf("%s, %s %d, %d: %02d:%02d:%02d\t",
-                   $weekday_names[$now->wday],
-                   $month_names[$now->mon],
-                   $now->mday,
-                   $now->year + 1900,
-                   $now->hour,
-                   $now->min,
-                   $now->sec);
-}
-
-
-
-####
-sub show_lit_citation {
- # Print the literature citation for Trinity to STDOUT.
-
- print "\n\n* Trinity:\n"
- . "Full-length transcriptome assembly from RNA-Seq data without a reference genome.\n"
- . "Grabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L,\n"
- . "Raychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F,\n"
- . "Birren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A.\n"
- . "Nature Biotechnology 29, 644–652 (2011)\n"
- . "Paper: http://www.nature.com/nbt/journal/v29/n7/full/nbt.1883.html\n"
- . "Code: http://trinityrnaseq.sf.net\n\n\n";
-
- # Everything from the "=included_in_trinity" directive to "=cut" is a POD block:
- # it is skipped by the interpreter, so the bundled-tool citations are NOT printed.
-=included_in_trinity
-
------------------------------------------------------------------------------------------
------ Tools Below are Used Within Trinity Accordingly -----------------------------------
------------------------------------------------------------------------------------------
-
-* Fastool (for fast fastQ-to-fastA conversion)
-Francesco Strozzi
-Code: https://github.com/fstrozzi/Fastool
-
-* Jellyfish (for fast K-mer counting)
-A fast, lock-free approach for efficient parallel counting of occurrences of k-mers.
-Guillaume Marcais and Carl Kingsford.
-Bioinformatics (2011) 27(6): 764-770
-Paper: http://bioinformatics.oxfordjournals.org/content/27/6/764.long\n
-Code: http://www.cbcb.umd.edu/software/jellyfish
-
-* Trimmomatic
-Lohse M, Bolger AM, Nagel A, Fernie AR, Lunn JE, Stitt M, Usadel B. RobiNA: a
-user-friendly, integrated software solution for RNA-Seq-based transcriptomics.
-Nucleic Acids Res. 2012 Jul;40(Web Server issue):W622-7.
-Code: http://www.usadellab.org/cms/?page=trimmomatic
-
-
-=cut
-
- return;
-}
-
-# clean-up after normal termination, exit(), or die()
-END {
- # collectl_stop only acts when collectl monitoring was requested and a child pid was recorded
- &collectl_stop();
-}
-
-
-sub perfmon_start {
-    # Create (truncate) the performance log "$output_directory/$pm_logfile", write
-    # its header (version, compiler guess, command-line parameters) and record the
-    # start time in $pm_trinity_startstring / $pm_trinity_start.
-    my $log_path = "$output_directory/$pm_logfile";
-    open (my $log_fh, ">", $log_path) or die "Error, cannot write to: $log_path";
-
-    # inchworm linked against libgomp is reported as GCC, anything else as Intel
-    my $libgomp_line = `ldd $INCHWORM_DIR/bin/inchworm 2>/dev/null | grep "libgomp"`;
-    my $compiler = ($libgomp_line eq "") ? "Intel" : "GCC";
-
-    print $log_fh "Statistics:\n"
-                . "===========\n"
-                . "Trinity Version: $VERSION\n"
-                . "Compiler: $compiler\n"
-                . "Trinity Parameters: $pm_trinity_arguments\n";
-
-    # both values keep the trailing newline produced by `date`
-    $pm_trinity_startstring = `date`;
-    $pm_trinity_start = `date +%s`;
-    close ($log_fh);
-}
-
-sub perfmon_end {
-    # Record the end time and append the input/output sizes and per-stage runtimes
-    # to the performance log "$output_directory/$pm_logfile".
-    #
-    # The global end stamps and $pm_inchworm_kmers are always updated. The report
-    # itself is only written when the log file exists: the previous version kept
-    # printing to (and closing) a filehandle that had never been opened in that case.
-    $pm_trinity_endstring = `date`;
-    $pm_trinity_end = `date +%s`;
-    $pm_inchworm_kmers = `cat $output_directory/inchworm.kmer_count`;
-
-    my $log_path = "$output_directory/$pm_logfile";
-    return unless (-e $log_path);
-
-    open (my $log_fh, '>>', $log_path) or die "Error, cannot append to: $log_path";
-
-    if ($PAIRED_MODE) {
-        print $log_fh "Paired mode\n";
-        print $log_fh " Input data\n";
-        if (@left_files && @right_files) {
-            print $log_fh " Left.fasta $pm_left_fa_size MByte\n";
-            print $log_fh " Right.fasta $pm_right_fa_size MByte\n";
-        }
-        else {
-            print $log_fh " Single.fasta $pm_single_fa_size MByte\n";
-        }
-    }
-    else {
-        print $log_fh "Unpaired read mode\n";
-        print $log_fh " Input data\n";
-        print $log_fh " Single.fasta $pm_single_fa_size MByte\n";
-    }
-
-    # no explicit newline on the next two lines: the original relied on the values
-    # carrying their own line ending, and that is kept as-is
-    print $log_fh " Number of unique KMERs: $pm_inchworm_kmers";
-    print $log_fh " Number of reads: $pm_read_count";
-    print $log_fh " Output data\n";
-
-    # size of the final assembly in whole megabytes (0 when the file is missing)
-    my $fasta_bytes = -s "$output_directory/Trinity.fasta" || 0;
-    my $pm_trinity_fa_size = sprintf('%.0f', $fasta_bytes / 1024 / 1024);
-    print $log_fh " Trinity.fasta $pm_trinity_fa_size MByte\n\n";
-
-    print $log_fh "Runtime\n";
-    print $log_fh "=======\n";
-    print $log_fh "Start: $pm_trinity_startstring";
-    print $log_fh "End: $pm_trinity_endstring";
-
-    my $pm_trinity_time = $pm_trinity_end - $pm_trinity_start;
-    print $log_fh "Trinity $pm_trinity_time seconds\n";
-    my $pm_inchworm_time = $pm_inchworm_end - $pm_inchworm_start;
-    print $log_fh " Inchworm $pm_inchworm_time seconds\n";
-    my $pm_chrysalis_time = $pm_chrysalis_end - $pm_chrysalis_start;
-    print $log_fh " Chrysalis $pm_chrysalis_time seconds\n";
-    my $pm_butterfly_time = $pm_butterfly_end - $pm_butterfly_start;
-    print $log_fh " Butterfly $pm_butterfly_time seconds\n";
-
-    # whatever is not attributed to one of the three stages
-    my $pm_rest_time = $pm_trinity_time - $pm_butterfly_time - $pm_chrysalis_time - $pm_inchworm_time;
-    print $log_fh " Rest $pm_rest_time seconds\n";
-
-    close ($log_fh);
-}
-
-sub collectl_start {
- # Install a SIGINT handler and, when $run_with_collectl is set, start a collectl
- # process in a forked child, recording its pid in $collectl_pid.
- # Note that the INT handler is installed regardless of $run_with_collectl.
-
- # install signal handler to stop collectl on interrupt
- $SIG{INT} = sub { print "Trinity interrupted\n"; &collectl_stop(); exit(1); };
-
- if ($run_with_collectl){
- warn "STARTING COLLECTL\n";
- # any previous collectl output directory is removed and recreated empty
- $collectl_output_directory = "$start_dir/collectl";
- `rm -rf $collectl_output_directory `;
- $collectl_output_directory = &create_full_path($collectl_output_directory, 0);
- unless (-d $collectl_output_directory) {
- mkdir $collectl_output_directory or die "Error, cannot mkdir $collectl_output_directory";
- }
- my $collectl_userid = qx(id --user --real);
- chomp($collectl_userid);
- # "exec" inside the shell command replaces the shell with collectl itself
- my $cmd = "cd $collectl_output_directory && exec ${COLLECTL_DIR}/collectl $collectl_param --procfilt u$collectl_userid -f $collectl_output_directory/y";
- ## fork a child to run collectl
- $collectl_pid = fork();
- if (not defined $collectl_pid) {
- warn "FORK FAILED - NO COLLECTL PROCESS STARTED\n";
- } elsif ($collectl_pid == 0) {
- # child branch: despite the message text below, this process execs collectl
- warn "I'M THE CHILD RUNNING TRINITY\n";
- exec($cmd);
- # only reached if exec itself fails to start the command
- warn "COLLECTL FINISHED BEVORE KILL WAS CALLED\n";
- exit(0);
- } else {
- warn "I'M THE PARENT, COLLECTL_PID=$collectl_pid\n";
- }
- }
-}
-
-# finish collectl monitoring and create collectl plots
-sub collectl_stop {
-    # Stop the collectl child started by collectl_start (if any), wait for it, and
-    # run the post-processing scripts from $COLLECTL_DIR inside the collectl output
-    # directory. Does nothing unless monitoring was requested and a pid was recorded.
-    return unless ($run_with_collectl && $collectl_pid>0);
-
-    warn "TERMINATING COLLECTL, PID = $collectl_pid\n";
-    # try to be nice here as a hard kill will result in broken/unusable raw.gz file
-    system("sync");
-    foreach my $signal ("INT", "TERM") {
-        kill($signal, $collectl_pid);
-    }
-    waitpid($collectl_pid,0);
-
-    # the plotting scripts are run from inside the output directory; give up quietly otherwise
-    chdir($collectl_output_directory) or return;
-    foreach my $script ("make_data_files.sh", "timetable.sh") {
-        system("$COLLECTL_DIR/$script");
-    }
-    $collectl_titlename = "${VERSION} ${CPU} @{left_files}@{single_files}";
-    system("$COLLECTL_DIR/plot.sh \"$collectl_titlename\" ${CPU}");
-}
-
-####
-sub run_trimmomatic_PE {
-    # Quality-trim a pair of fastq files with Trimmomatic (PE mode).
-    #
-    # For each side, the paired (.P.qtrim) and unpaired (.U.qtrim) outputs are
-    # concatenated into "<basename>.PwU.qtrim.fq" in the current directory.
-    # Returns (trimmed_left_fq, trimmed_right_fq). A previous run is reused when both
-    # outputs and the "trimmomatic.ok" checkpoint exist.
-    my ($left_fq_file, $right_fq_file, $trimmomatic_params) = @_;
-
-    my $left_base = basename($left_fq_file);
-    my $right_base = basename($right_fq_file);
-
-    my $trimmed_left_fq = "$left_base.PwU.qtrim.fq";
-    my $trimmed_right_fq = "$right_base.PwU.qtrim.fq";
-    my $checkpoint = "trimmomatic.ok";
-
-    if (&files_exist($trimmed_left_fq, $trimmed_right_fq, $checkpoint)) {
-        my $banner = "###############################################################################\n";
-        print STDERR $banner;
-        print STDERR "#### Trimmomatic process was previously completed. Skipping it and using existing qual-trimmed files: $trimmed_left_fq, $trimmed_right_fq\n";
-        print STDERR $banner;
-
-        return($trimmed_left_fq, $trimmed_right_fq);
-    }
-
-    # direct Trimmomatic outputs: paired and unpaired survivors for each side
-    my ($left_paired, $left_unpaired) = ("$left_base.P.qtrim", "$left_base.U.qtrim");
-    my ($right_paired, $right_unpaired) = ("$right_base.P.qtrim", "$right_base.U.qtrim");
-
-    my $cmd = "java -jar $TRIMMOMATIC PE -threads $CPU -phred33 "
-            . " $left_fq_file $right_fq_file "
-            . " $left_paired $left_unpaired "
-            . " $right_paired $right_unpaired "
-            . " $trimmomatic_params ";
-    &process_cmd($cmd);
-
-    ## append the orphans so we can still use them in assembly
-    &process_cmd("cat $left_paired $left_unpaired > $trimmed_left_fq");
-    &process_cmd("cat $right_paired $right_unpaired > $trimmed_right_fq");
-
-    &process_cmd("touch $checkpoint");
-
-    # compress the trimmomatic direct outputs to conserve space (launched in the background):
-    &process_cmd("gzip $left_paired $left_unpaired $right_paired $right_unpaired &");
-
-    return($trimmed_left_fq, $trimmed_right_fq);
-}
-
-####
-sub run_trimmomatic_SE {
-    # Quality-trim a single fastq file with Trimmomatic (SE mode).
-    #
-    # Returns "<basename>.qtrim.fq" (written to the current directory). A previous
-    # run is reused when that file and the "trimmomatic.ok" checkpoint both exist.
-    my ($single_fq, $trimmomatic_params) = @_;
-
-    my $trimmed_fq = basename($single_fq) . ".qtrim.fq";
-    my $checkpoint = "trimmomatic.ok";
-
-    unless (&files_exist($trimmed_fq, $checkpoint)) {
-        my $cmd = "java -jar $TRIMMOMATIC SE -threads $CPU -phred33 "
-                . " $single_fq "
-                . " $trimmed_fq "
-                . " $trimmomatic_params ";
-        &process_cmd($cmd);
-
-        &process_cmd("touch $checkpoint");
-
-        return($trimmed_fq);
-    }
-
-    my $banner = "###############################################################################\n";
-    print STDERR $banner;
-    print STDERR "#### Trimmomatic process was previously completed. Skipping it and using existing qual-trimmed file: $trimmed_fq\n";
-    print STDERR $banner;
-
-    return($trimmed_fq);
-}
-
-####
-sub run_normalization {
- # Run in silico read normalization, either over all read files at once or - when
- # $NORMALIZE_BY_READ_SET is set - once per read set followed by a final merged round.
- #
- # Args:
- #   $max_read_coverage - passed through to normalize()
- #   @read_files        - one array ref (single reads) or two array refs (left, right)
- #
- # Returns: the list of normalized read files returned by normalize().
- my ($max_read_coverage, @read_files) = @_;
-
- if ($NORMALIZE_BY_READ_SET) {
-
- my ($reads_left_or_single_aref, $right_reads_aref) = @read_files;
-
- my @normalized_left_or_single;
- my @normalized_right;
-
- my $counter = 0;
- # NOTE: shift consumes the arrays behind the caller's references, leaving them empty
- while (@$reads_left_or_single_aref) {
- my $left_or_single_reads = shift @$reads_left_or_single_aref;
- my @reads_to_process = ([$left_or_single_reads]);
- if (ref $right_reads_aref) {
- my $right_reads = shift @$right_reads_aref;
- push (@reads_to_process, [$right_reads]);
- }
- $counter++;
- # each read set gets its own numbered output directory
- my $norm_out_dir = cwd() . "/norm_for_read_set_$counter";
- my @norm_read_files = &normalize($norm_out_dir, $max_read_coverage, @reads_to_process);
- push (@normalized_left_or_single, $norm_read_files[0]);
- if (scalar @norm_read_files == 2) {
- # PE norm
- push (@normalized_right, $norm_read_files[1]);
- }
-
- }
-
- ## now merge them in one final round:
- my $norm_merged_dir = cwd() . "/insilico_read_normalization_altogether";
- my @reads = (\@normalized_left_or_single);
- if (@normalized_right) {
- push (@reads, \@normalized_right);
- }
- my @ret_files = &normalize($norm_merged_dir, $max_read_coverage, @reads);
- return(@ret_files);
-
- }
- else {
- ## all at once.
- my $normalize_outdir = cwd() . "/insilico_read_normalization";
-
- my @ret_files = &normalize($normalize_outdir, $max_read_coverage, @read_files);
- return(@ret_files);
-
- }
-
-
-}
-
-####
-sub normalize {
-    # Run insilico_read_normalization.pl into $normalize_outdir.
-    #
-    # Args:
-    #   $normalize_outdir  - output directory handed to --output
-    #   $max_read_coverage - value handed to --max_cov
-    #   @read_files        - one array ref (single reads) or two array refs (left, right)
-    #
-    # Returns: the expected normalized file(s) inside $normalize_outdir. The run is
-    # skipped when those files and the "normalization.ok" checkpoint already exist.
-    my ($normalize_outdir, $max_read_coverage, @read_files) = @_;
-
-    print STDERR "---------------------------------------------------------------\n",
-                 "------------ In silico Read Normalization ---------------------\n",
-                 "-- (Removing Excess Reads Beyond $max_read_coverage Coverage --\n",
-                 "-- $normalize_outdir --\n",
-                 "---------------------------------------------------------------\n\n";
-
-    my $cmd = "$UTILDIR/insilico_read_normalization.pl --seqType $seqType --JM $JM_string "
-            . " --max_cov $max_read_coverage --CPU $CPU --output $normalize_outdir";
-    $cmd .= " --SS_lib_type $SS_lib_type " if ($SS_lib_type);
-    $cmd .= " --no_cleanup " if ($NO_CLEANUP);
-
-    # the number of array refs decides between paired and single mode
-    my $num_read_sets = scalar @read_files;
-    my @ret_files;
-    if ($num_read_sets == 1) {
-        $cmd .= " --single " . join(",", @{$read_files[0]});
-        @ret_files = ("$normalize_outdir/single.norm.$seqType");
-    }
-    elsif ($num_read_sets == 2) {
-        my $left_csv = join(",", @{$read_files[0]});
-        my $right_csv = join(",", @{$read_files[1]});
-        $cmd .= " --left $left_csv --right $right_csv"
-              . " --pairs_together --PARALLEL_STATS ";
-        @ret_files = ("$normalize_outdir/left.norm.$seqType", "$normalize_outdir/right.norm.$seqType");
-    }
-    else {
-        confess "how did we end up with " . scalar(@read_files) . " read files? @read_files\nNot sure what to do.... ";
-    }
-
-    my $checkpoint = "$normalize_outdir/normalization.ok";
-    unless (&files_exist(@ret_files, $checkpoint)) {
-        # do the normalization
-        &process_cmd($cmd);
-        &process_cmd("touch $checkpoint");
-        return(@ret_files);
-    }
-
-    my $banner = "###############################################################################\n";
-    print STDERR $banner;
-    print STDERR "#### Normalization process was previously completed. Skipping it and using existing normalized files: @ret_files\n";
-    print STDERR $banner;
-
-    return(@ret_files);
-}
-
-
-####
-sub files_exist {
-    # Return 1 when every named file exists (also for an empty list), 0 otherwise.
-    my @files = @_;
-
-    my @missing = grep { ! -e $_ } @files;
-
-    return(@missing ? 0 : 1);
-}
-
-####
-sub run_genome_guided_Trinity {
- # Genome-guided assembly driver: obtain a coordinate-sorted BAM (user supplied or
- # produced here with gsnap), partition the aligned reads, write one Trinity command
- # per partition, run the commands (grid or ParaFly) and collect the per-partition
- # assemblies into Trinity-GG.fasta. Each stage is guarded by a checkpoint file in
- # the current directory so that a re-run resumes where it stopped.
- #
- # Args:
- #   $left_files_aref  - array ref of left (or single) read files
- #   $right_files_aref - array ref of right read files; false for single-end data
- my ($left_files_aref, $right_files_aref) = @_;
-
-
- my $bam_file;
- if ($genome_guided_use_bam) {
- $bam_file = $genome_guided_use_bam;
- }
- else {
- ## run gsnap to align reads:
-
- $bam_file = "gsnap.coordSorted.bam";
-
- unless (-s "$bam_file" && -e "$bam_file.ok") {
-
- my @files;
- if ($left_files_aref && $right_files_aref) {
- # interleave as left1 right1 left2 right2 ...; shift empties the caller's arrays
- while (@$left_files_aref) {
- my $left_file = shift @$left_files_aref;
- my $right_file = shift @$right_files_aref;
- push (@files, $left_file, $right_file);
- }
- }
- else {
- @files = @$left_files_aref; # really single files
- }
-
- # .gz inputs become "<(zcat file)" process substitutions (hence the bash -c below)
- @files = &add_zcat_gz(@files);
-
-
- ## prep the genome
- my $cmd = "ln -sf $genome_fasta_file gsnap_target.fa";
- &process_cmd($cmd);
-
- if (-s "$genome_fasta_file.gmap") {
- &process_cmd("ln -sf $genome_fasta_file.gmap gsnap_target.gmap");
- }
- else {
-
- my $cmd = "gmap_build -k 13 -D . -d gsnap_target.gmap gsnap_target.fa ";
- # NOTE(review): the guard tests "target.gmap" while the index is built as
- # "gsnap_target.gmap" - looks like a stale name; confirm which one is intended.
- &process_cmd($cmd) unless (-e "target.gmap");
- }
-
- if (-s "$genome_fasta_file.fai") {
- &process_cmd("ln -sf $genome_fasta_file.fai gsnap_target.fa.fai");
- }
- else {
- my $cmd = "samtools faidx gsnap_target.fa";
- &process_cmd($cmd);
- }
-
- # pipefail makes a failure in any stage of the pipeline fail the whole command
- $cmd = "bash -c \"set -o pipefail; gsnap -d gsnap_target.gmap -D . -A sam --nofails -N 1 -t $GMAP_CPU -w $genome_guided_max_intron -n 20 @files | samtools view -bt gsnap_target.fa.fai - | samtools sort -o - - > $bam_file \"";
- &process_cmd($cmd);
-
- &process_cmd("touch $bam_file.ok"); # checkpoint
- }
-
- }
-
- ## partition the reads according to coverage piles:
-
- my $cmd = "$UTILDIR/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl --coord_sorted_SAM $bam_file -I $genome_guided_max_intron --sort_buffer $genome_guided_sort_buffer --CPU $CPU ";
-
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type $SS_lib_type ";
- }
- &process_cmd($cmd) unless (-e "partitions.ok");
-
- &process_cmd("touch partitions.ok") unless (-e "partitions.ok");
-
- ## generate list of the read files:
- $cmd = "find Dir_\* -name '*reads' > read_files.list";
-
- &process_cmd($cmd) unless (-s "read_files.list" && -e "read_files.list.ok");
- &process_cmd("touch read_files.list.ok") unless (-e "read_files.list.ok"); # checkpoint
-
- ##################################################
- ## write Trinity assembly commands for partitions:
- ##################################################
-
- $cmd = "$UTILDIR/support_scripts/GG_write_trinity_cmds.pl --reads_list_file read_files.list --CPU $genome_guided_CPU ";
- if ($run_as_paired_flag) {
- $cmd .= " --run_as_paired ";
- }
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type F "; # all sequences already reoriented
- }
-
- $cmd .= " --full_cleanup_ET --seqType fa ";
-
-
- # forward the original command-line arguments, dropping those that are already
- # set explicitly above or that do not apply to the per-partition runs
- my @potential_args = @ORIG_ARGS;
-
- while (@potential_args) {
- my $arg = shift @potential_args;
-
- # single value options that aren't needed:
- if ($arg =~ /run_as_paired|normalize_by_read_set|trimmomatic|normalize_reads|prep/) {
- next;
- }
-
- # value specified options that aren't needed
- # NOTE(review): the patterns below are unanchored substring matches, except
- # /^(CPU)$/ which only matches the bare word "CPU"; if @ORIG_ARGS holds the
- # dashed form ("--CPU") it would not be skipped here - verify against the caller.
- if ($arg =~ /seqType|left|right|single|genome|SS_lib_type|GMAP|quality_trimming|output|normalize_max_read_cov|grid_conf/
- ||
- # more precise identification of parameter
- $arg =~ /^(CPU)$/
-
- ) {
- # skipping these, already represented by opt configuration above.
- my $val = shift @potential_args;
- next;
- }
-
- if ($arg eq "--bfly_opts") {
- # wrap val in quotes
- my $val = shift @potential_args;
- $cmd .= "$arg \"$val\" ";
- }
- else {
- ## just passing it on.
- $cmd .= " $arg ";
- }
- }
-
- $cmd .= " > trinity_GG.cmds";
-
- &process_cmd($cmd) unless (-e "trinity_GG.cmds.ok");
- &process_cmd("touch trinity_GG.cmds.ok") unless (-e "trinity_GG.cmds.ok");
-
- if ($genome_guided_just_prep_flag) {
- print STDERR "###### Just prepping data for genome-guided assembly. Stopping here due to --genome_guided_just_prep invocation. #####\n\n";
- exit(0);
- }
-
- ## execute the commands:
- if ($grid_conf_file) {
- my @trin_GG_cmds = `cat trinity_GG.cmds`;
- chomp @trin_GG_cmds;
-
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "trinity_GG_cmds.htc_cache_success");
- my $ret = $grid_runner->run_on_grid(@trin_GG_cmds);
- if ($ret) {
- die "Error, not all Trinity-GG commands completed successfully. Cannot continue.";
- }
-
- }
- else {
- my $cmd = "$PARAFLY -c trinity_GG.cmds -CPU $CPU -v ";
- &process_cmd($cmd);
- }
-
- ## pull together the final outputs:
- # written to a .tmp name first so that Trinity-GG.fasta only appears once complete
- $cmd = "find Dir_* -name '*inity.fasta' | $UTILDIR/support_scripts/GG_trinity_accession_incrementer.pl > Trinity-GG.fasta.tmp";
- &process_cmd($cmd);
-
- # NOTE(review): the result of rename() is not checked here
- rename("Trinity-GG.fasta.tmp", "Trinity-GG.fasta"); # now that it's done.
-
- print STDERR "\n\nFinished. See Trinity-GG.fasta for reconstructed transcripts\n\n";
-
- return;
-}
-
-sub add_zcat_gz {
-    # Wrap every file name ending in ".gz" as a shell process substitution
-    # "<(zcat FILE)"; other names are passed through unchanged. The input list
-    # itself is not modified.
-    my (@in_files) = @_;
-
-    my @files = map { ($_ =~ /\.gz$/) ? "<(zcat $_)" : $_ } @in_files;
-
-    return(@files);
-}
diff --git a/galaxy-plugin/GauravGalaxy/__add_to_PATH_setting.txt b/galaxy-plugin/GauravGalaxy/__add_to_PATH_setting.txt
deleted file mode 100644
index 85feb78..0000000
--- a/galaxy-plugin/GauravGalaxy/__add_to_PATH_setting.txt
+++ /dev/null
@@ -1 +0,0 @@
-/usr/local/bin
diff --git a/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix.xml b/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix.xml
deleted file mode 100644
index 46ef57e..0000000
--- a/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<tool id="abundance_estimation_to_matrix" name="abundance_estimation_to_matrix" version="0.0.1">
- <!-- Galaxy tool definition: passes (file, column label) pairs from the RSEM_samples
-      repeat to abundance_estimation_to_matrix_wrapper.py and collects
-      matrix.counts.matrix from the working directory as the output dataset. -->
-
- <description>Join RSEM estimates from multiple samples into a single matrix</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- abundance_estimation_to_matrix_wrapper.py
- #for $q in $RSEM_samples
- ${q.file} "${q.column_label}"
- #end for
-
- </command>
- <inputs>
-
- <!-- one repeat entry per sample: the estimates file plus the label used for its matrix column -->
- <repeat name="RSEM_samples" title="RSEM abundance estimates for samples">
- <param name="file" label="Add file" type="data" format="text"/>
- <param name="column_label" label="column label" type="text" />
- </repeat>
-
- </inputs>
- <outputs>
- <data format="text" name="counts_matrix" label="${tool.name} on ${on_string}: Counts Matrix" from_work_dir="matrix.counts.matrix"/>
- </outputs>
- <tests>
-
-
- <!-- NOTE(review): none of the parameters below (target, aligner, paired_or_single,
-      library_type, input) is declared in <inputs> above, and no expected output is
-      listed; this test looks copied from a different tool - confirm before relying on it. -->
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix_wrapper.py b/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix_wrapper.py
deleted file mode 100644
index e906e3a..0000000
--- a/galaxy-plugin/GauravGalaxy/abundance_estimation_to_matrix_wrapper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os, string, subprocess
-
-#aliasing the filenames using the labels
-
-
-def run_command(command):
- print "Running command: " + command
-
- err_capture_file = open("my.stderr", 'w') # writing stderr to a file
- cmd_run = subprocess.Popen(args=command, shell=True, stderr=err_capture_file, stdout=sys.stdout)
- err = cmd_run.wait() # get exit code from command execution
- err_capture_file.close()
-
- if err:
- # report the error messages we captured, and exit non-zero
- sys.stderr.write("Error, cmd: " + command + " died with ret: " + `err`)
- for line in open(err_capture_file):
- sys.stderr.write(line)
- sys.exit(err)
- return
-
-label_list = [] # symlink files to the labels
-for i in range(1, len(sys.argv), 2):
- filename=sys.argv[i]
- label= sys.argv[i+1]
- cmd= "ln -sf " + filename + " " + label
- label_list.append(label)
- run_command(cmd)
-
-
-# run the abundance estimation script
-
-cmd = os.path.dirname(sys.argv[0]) + "/trinityToolWrapper.py " + " util/abundance_estimates_to_matrix.pl --est_method RSEM --cross_sample_fpkm_norm TMM " + " ".join(label_list)
-
-run_command(cmd)
-
-sys.exit(0)
-
diff --git a/galaxy-plugin/GauravGalaxy/align_and_estimate_abundance.xml b/galaxy-plugin/GauravGalaxy/align_and_estimate_abundance.xml
deleted file mode 100644
index d102646..0000000
--- a/galaxy-plugin/GauravGalaxy/align_and_estimate_abundance.xml
+++ /dev/null
@@ -1,90 +0,0 @@
-<tool id="RSEM_abundance_estimation" name="RSEM_abundance_estimation" version="0.0.1">
-
- <description>run RSEM to estimate transcript abundances</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/align_and_estimate_abundance.pl --transcripts $transcripts --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
-
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
-
- </command>
- <inputs>
- <param format="fasta" name="transcripts" type="data" label="transcripts_fasta" help="Fasta sequences for which reads are aligned." />
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
-
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="text" name="transcript_counts" label="${tool.name} on ${on_string}: Isoform Counts" from_work_dir="RSEM.isoforms.results"/>
- <data format="text" name="gene_counts" label="${tool.name} on ${on_string}: Gene counts" from_work_dir="RSEM.genes.results"/>
-
-
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/alignreads.xml b/galaxy-plugin/GauravGalaxy/alignreads.xml
deleted file mode 100644
index 6347a13..0000000
--- a/galaxy-plugin/GauravGalaxy/alignreads.xml
+++ /dev/null
@@ -1,138 +0,0 @@
-<tool id="alignreads" name="alignReads" version="0.0.1">
-
- <description>alignReads: short read alignment tool wrapper</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/alignReads.pl --target $target -o alignment --aligner $aligner_selection.aligner
-
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --max_dist_between_pairs $inputs.max_dist_between_pairs
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- ##if str($inputs.use_additional) == "yes":
- ## -- $inputs.additional_params
- ##end if
-
-
- ## direct to output
- > outCapture.txt
-
-
- </command>
- <inputs>
- <param format="fasta" name="target" type="data" label="target" help="Fasta sequences targeted for short-read alignment" />
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="max_dist_between_pairs" type="integer" value="2000" min="1" label="max_dist_between_pairs" help="Maximum length expected between fragment pairs as aligned to the target, including introns where relevant."/>
-
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- </when>
- </conditional>
-
- <conditional name="aligner_selection">
- <param name="aligner" type="select" label="Select alignment tool to run">
- <option value="bowtie">bowtie</option>
- <option value="bwa">bwa</option>
- <option value="blat">blat</option>
- </param>
- <when value="blat">
- <param name="max_intron_length" type="integer" value="10000" min = "1" label="maximum intron length" help="" />
- <param name="min_percent_identity" type="integer" value="95" min="1" label="minimum percent identity" help="" />
- </when>
- <when value="bwa">
- </when>
- <when value="bowtie">
- </when>
- </conditional>
-
-
- <!--
- <conditional name="use_additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="additional_params" type="text" value="" label="Additional command-line parameters to aligner" help="" />
- </when>
- </conditional>
-
- -->
-
- </inputs>
- <outputs>
- <data format="bam" name="coordSortedBam" label="${tool.name} on ${on_string}: COORD-sorted read alignments" from_work_dir="alignment/alignment.coordSorted.bam"/>
- <data format="bam" name="nameSortedBam" label="${tool.name} on ${on_string}: NAME-sorted read alignments" from_work_dir="alignment/alignment.nameSorted.bam"/>
-
- <!-- notes: need to retain:
- -the sample name for the alignment
- -coordinate-sorted vs. name-sorted bam file
- -paired vs. unpaired vs. strictly proper pairs (for RSEM)
- -strand-specific or not
- -->
-
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/analyze_diff_exp.xml b/galaxy-plugin/GauravGalaxy/analyze_diff_exp.xml
deleted file mode 100644
index 5e3c6d6..0000000
--- a/galaxy-plugin/GauravGalaxy/analyze_diff_exp.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<tool id="Analyze_Diff_Exp" name="Analyze_Differential_Expression" version="0.0.1">
-
- <description>Analyze differential expression</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- analyze_diff_exp_wrapper.py $EdgeRTarGz $TMM_Matrix_FPKM $Pvalue $Cvalue
-
- </command>
- <inputs>
- <param name="EdgeRTarGz" label="EdgeR tar gz file" type="data" format="file"/>
- <param name="TMM_Matrix_FPKM" label="TMM Normalized FPKM matrix" type="data" format="file" />
- <param name="Pvalue" label="P-value" value="0.05" type="float" />
- <param name="Cvalue" label="C-value" value="0" type="float" />
-
- </inputs>
- <outputs>
- <data format="data" name="diffExpr_matrix" label="${tool.name} on ${on_string}: Matrix" from_work_dir="diffExpr.matrix"/>
- <data format="data" name="diffExpr_correlation_matrix" label="${tool.name} on ${on_string}: Sample_Correlation_Matrix" from_work_dir="diffExpr.matrix.log2.sample_cor.dat"/>
- <data format="data" name="diffExpr_correlation_matrix_pdf" label="${tool.name} on ${on_string}: Sample_Correlation_Matrix_PDF" from_work_dir="diffExpr.matrix.log2.sample_cor_matrix.pdf"/>
- <data format="data" name="Heatmap" label="${tool.name} on ${on_string}: Heatmap" from_work_dir="diffExpr.matrix.log2.centered.genes_vs_samples_heatmap.pdf"/>
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/analyze_diff_exp_wrapper.py b/galaxy-plugin/GauravGalaxy/analyze_diff_exp_wrapper.py
deleted file mode 100644
index d4d7fae..0000000
--- a/galaxy-plugin/GauravGalaxy/analyze_diff_exp_wrapper.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys, os, subprocess
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit()
-
-usage= "usage: " + sys.argv[0] + " " + "edgeR.tar.gz " + "TMM_normalized_FPKM_matrix " + "P-value " + "C-value"
-print sys.argv
-print usage
-print " "
-
-if len(sys.argv)<5:
- print "Require atleast two parameters"
-else:
- print "All good- command going ahead"
-print " "
-
-Normalized_Matrix=sys.argv[2]
-Pvalue=sys.argv[3]
-Cvalue=sys.argv[4]
-
-def run_command(cmd):
- print "The command used: " + cmd
- pipe= subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
- pipe.wait()
- ret= pipe.returncode
- if ret:
- print "command died: " + str(ret)
- print pipe.stderr.readlines()
- sys.exit(1)
- else:
- return
-print " "
-
-Final_tar_gz= "edgeR.tar.gz"
-run_command("cp "+ sys.argv[1] + " " + "Final_tar_gz")
-run_command("tar -xvf " + "Final_tar_gz")
-run_command("mv " + "edgeR_results" + "/* ." )
-
-# run the analyze command
-cmd= TRINITY_BASE_DIR + "/Analysis/DifferentialExpression/analyze_diff_expr.pl "+ "--matrix " + Normalized_Matrix + " -P " + Pvalue + " -C " + Cvalue
-run_command(cmd)
-
-origMatrixName= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix"
-# diffExpr.P0.001_C2.0.matrix
-run_command("mv " + origMatrixName + " diffExpr.matrix")
-
-SampleCorName= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.sample_cor.dat"
-# diffExpr.P0.001_C2.0.matrix.log2.sample_cor.dat
-run_command("mv " + SampleCorName + " diffExpr.matrix.log2.sample_cor.dat")
-
-CorMatrix= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.sample_cor_matrix.pdf"
-# diffExpr.P0.001_C2.0.matrix.log2.sample_cor_matrix.pdf
-run_command("mv " + CorMatrix + " diffExpr.matrix.log2.sample_cor_matrix.pdf")
-
-Heatmap= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.centered.genes_vs_samples_heatmap.pdf"
-#diffExpr.P0.001_C2.0.matrix.log2.centered.genes_vs_samples_heatmap.pdf
-run_command("mv " + Heatmap + " diffExpr.matrix.log2.centered.genes_vs_samples_heatmap.pdf")
-
-sys.exit(0)
diff --git a/galaxy-plugin/GauravGalaxy/bash_command_executer.py b/galaxy-plugin/GauravGalaxy/bash_command_executer.py
deleted file mode 100755
index c6ab738..0000000
--- a/galaxy-plugin/GauravGalaxy/bash_command_executer.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-
-
-import sys, subprocess, os
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get command-line arguments
- args = sys.argv
- # Remove name of calling program, i.e. ./stderr_wrapper.py
- args.pop(0)
-
- # If there are no arguments left, we're done
- if len(args) == 0:
- return
-
- # If one needs to silence stdout
- #args.append( ">" )
- #args.append( "/dev/null" )
-
- cmdline = " ".join(args)
-
-
- try:
- # Run program
- err_capture = open("stderr.txt", 'w')
- proc = subprocess.Popen( args=cmdline, shell=True, stderr=err_capture, stdout=sys.stdout )
- returncode = proc.wait()
- err_capture.close()
-
-
- if returncode != 0:
- raise Exception
-
- except Exception:
- # Running Grinder failed: write error message to stderr
- err_text = open("stderr.txt").readlines()
- stop_err( "ERROR:\n" + "\n".join(err_text))
-
-
-if __name__ == "__main__": __main__()
diff --git a/galaxy-plugin/GauravGalaxy/cat.xml b/galaxy-plugin/GauravGalaxy/cat.xml
deleted file mode 100644
index 9aeaa50..0000000
--- a/galaxy-plugin/GauravGalaxy/cat.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<tool id="catbash" name="Concatenate datasets">
- <description>tail-to-head</description>
- <command interpreter="python">
- bash_command_executer.py cat
- $input1
- #for $q in $queries
- ${q.input2}
- #end for
- > $out_file1
- </command>
- <inputs>
- <param name="input1" type="data" label="Concatenate Dataset"/>
- <repeat name="queries" title="Dataset">
- <param name="input2" type="data" label="Select" />
- </repeat>
- </inputs>
- <outputs>
- <data name="out_file1" format="input" metadata_source="input1"/>
- </outputs>
- <tests>
- <test>
- <param name="input1" value="1.bed"/>
- <param name="input2" value="2.bed"/>
- <output name="out_file1" file="cat_wrapper_out1.bed"/>
- </test>
- <!--TODO: if possible, enhance the underlying test code to handle this test
- the problem is multiple params with the same name "input2"
- <test>
- <param name="input1" value="1.bed"/>
- <param name="input2" value="2.bed"/>
- <param name="input2" value="3.bed"/>
- <output name="out_file1" file="cat_wrapper_out2.bed"/>
- </test>
- -->
- </tests>
- <help>
-
- no help
-
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/diffExpress_edgeR.xml b/galaxy-plugin/GauravGalaxy/diffExpress_edgeR.xml
deleted file mode 100644
index 1a5b186..0000000
--- a/galaxy-plugin/GauravGalaxy/diffExpress_edgeR.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<tool id="diffExpress_edgeR" name="diffExpress_edgeR" version="0.0.1">
-
- <description>Identify Differentially Expressed Transcripts Using EdgeR</description>
- <requirements>
- <requirement type="package">edgeR</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py Analysis/DifferentialExpression/run_EdgeR.pl
- --matrix $counts_matrix
- --transcripts $transcripts_fasta_file
- --output edgeR_results
- --dispersion $dispersion
-
- > stdout.txt
-
-
- </command>
- <inputs>
-
- <param type="data" format="txt" name="counts_matrix" label="Matrix of RNA-Seq fragment counts for transcripts per condition" />
- <param type="data" format="fasta" name="transcripts_fasta_file" label="Transcripts fasta file corresponding to matrix" />
- <param type="float" name="dispersion" value="0.1" min="0" label="dispersion value" help="Dispersion value to be used in the negative binomial" />
-
- </inputs>
- <outputs>
-
- <data format="txt" name="diff_expressed_edgeR_results" label="${tool.name} on ${on_string}: differentially expressed transcripts per pair of conditions" from_work_dir="edgeR_results/all_diff_expression_results.txt" />
-
- <data format="txt" name="matrix_FPKM" label="${tool.name} on ${on_string}: matrix.TMM_normalized.FPKM" from_work_dir="edgeR_results/matrix.TMM_normalized.FPKM" />
-
- <data format="txt" name="TMM_info" label="${tool.name} on ${on_string}: TMM library size estimates" from_work_dir="edgeR_results/TMM_info.txt" />
-
- </outputs>
- <tests>
-
- <test>
- <param name="myname" value="This is just a simple test" />
-
- </test>
-
-
- </tests>
- <help>
- help info here.
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/transcriptsToOrfs.xml b/galaxy-plugin/GauravGalaxy/transcriptsToOrfs.xml
deleted file mode 100644
index 2afb8f6..0000000
--- a/galaxy-plugin/GauravGalaxy/transcriptsToOrfs.xml
+++ /dev/null
@@ -1,53 +0,0 @@
-<tool id="transcriptsToOrfs" name="transcriptsToOrfs" version="0.0.1">
-
- <description>Trinity Transcripts to Candidate Peptides</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
- trinityToolWrapper.py Analysis/Coding/transcripts_to_best_scoring_ORFs.pl
-
- -t $transcripts
- -m $min_prot_length
- --CPU $CPU
- --search_pfam "${ filter( lambda x: str( x[0] ) == str( $pfam_db ), $__app__.tool_data_tables[ 'pfam_databases' ].get_fields() )[0][-1] }"
-
- #if str($strand_specificity) == 'SS':
- -S
- #end if
-
- ## direct to output
- > output
-
- </command>
- <inputs>
-
- <param format="fasta" name="transcripts" type="data" label="Transcripts sequences in fastA format" help="" />
-
- <param name="min_prot_length" type="integer" label="Minimum peptide length (in amino acids)" value="100" min="50" help="" />
-
- <param name="strand_specificity" type="select" label="Strand specificity type">
- <option value="DS">NOT strand specific, examine both strands</option>
- <option value="SS">Strand specific, examine only top strand</option>
- </param>
-
- <param name="pfam_db" type="select" label="Pfam database">
- <options from_data_table="pfam_databases" />
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by hmmscan" />
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_pep_pfam" label="${tool.name} on ${on_string}: Pfam matches to Candidate Peptide Sequences" from_work_dir="longest_orfs.pep.pfam.dat"/>
- <data format="fasta" name="trinity_pep_seqs" label="${tool.name} on ${on_string}: Candidate Peptide Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.pep"/>
- <data format="bed" name="trinity_pep_coords" label = "${tool.name} on ${on_string} Candidate Peptide Coordinates" from_work_dir="best_candidates.eclipsed_orfs_removed.bed" />
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/trinityToolWrapper.py b/galaxy-plugin/GauravGalaxy/trinityToolWrapper.py
deleted file mode 100755
index 957ae12..0000000
--- a/galaxy-plugin/GauravGalaxy/trinityToolWrapper.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env python
-
-
-# borrowed from: http://wiki.g2.bx.psu.edu/Future/Job%20Failure%20When%20stderr and modified for use with Trinity tools.
-
-"""
-Wrapper that execute a program and its arguments but reports standard error
-messages only if the program exit status was not 0
-Example: ./stderr_wrapper.py myprog arg1 -f arg2
-"""
-
-import sys, subprocess, os
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit()
-
-
-
-# get bindir
-bindir = sys.argv[0]
-bindir = bindir.split("/")
-if len(bindir) > 1:
- bindir.pop()
- bindir = "/".join(bindir)
-else:
- bindir = "."
-
-
-## add locations of tools to path setting.
-TOOL_PATHS_FILE = bindir + "/__add_to_PATH_setting.txt";
-for line in open(TOOL_PATHS_FILE):
- line = line.rstrip()
- os.environ['PATH'] += ":" + line
-
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get command-line arguments
- args = sys.argv
- # Remove name of calling program, i.e. ./stderr_wrapper.py
- args.pop(0)
- # If there are no arguments left, we're done
- if len(args) == 0:
- return
-
- # If one needs to silence stdout
- #args.append( ">" )
- #args.append( "/dev/null" )
-
- args[0] = "".join([TRINITY_BASE_DIR, '/', args[0]]);
-
- cmdline = " ".join(args)
-
-
-
- try:
- # Run program
- err_capture = open("stderr.txt", 'w')
- proc = subprocess.Popen( args=cmdline, shell=True, stderr=err_capture, stdout=sys.stdout )
- returncode = proc.wait()
- err_capture.close()
-
-
- if returncode != 0:
- raise Exception
-
- except Exception:
- # Running Grinder failed: write error message to stderr
- err_text = open("stderr.txt").readlines()
- stop_err( "ERROR:\n" + "\n".join(err_text))
-
-
-if __name__ == "__main__": __main__()
diff --git a/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml b/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml
deleted file mode 100644
index 621ff5e..0000000
--- a/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml
+++ /dev/null
@@ -1,127 +0,0 @@
-<tool id="trinityrnaseq" name="Trinity" version="0.0.1">
-
- <!-- Written by Jeremy Goecks, now maintained here by bhaas -->
- <description>De novo assembly of RNA-Seq data Using Trinity</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
- trinityToolWrapper.py Trinity --max_memory $JM --CPU $CPU
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --group_pairs_distance $inputs.group_pairs_distance
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- #if str($additional_params.use_additional) == "yes":
- --min_kmer_cov $inputs.min_kmer_cov --max_reads_per_graph $inputs.max_reads_per_graph --bflyHeapSpaceMax $input.bflyHeapSpaceMax
- #if $inputs.bfly_opts != 'None':
- --bfly_opts " $inputs.bfly_opts "
- #end if
- #end if
-
-
- ## direct to output
- > $trinity_log
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- <option value="200G">200G</option>
- <option value="500G">500G</option>
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by Trinity" />
-
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="group_pairs_distance" type="integer" value="500" min="1" label="Group pairs distance" help="Maximum length expected between fragment pairs"/>
- <param name="path_reinforcement_distance" type="integer" value="75" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- <param name="path_reinforcement_distance" type="integer" value="40" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
- </when>
- </conditional>
-
- <conditional name="additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="min_kmer_cov" type="integer" value="1" min="1" label="inchworm_min_kmer_cov" help="Minimum kmer coverage required by Inchworm for initial contig construction" />
- <param name="max_reads_per_graph" type="integer" value="20000000" min="10000" label="chrysalis_max_reads_per_graph" help="Maximum number of reads to be anchored within each transcript graph by Chrysalis" />
-
-
- <param name="bfly_opts" type="text" value="None" label="bfly_opts" help="Options to pass on to Butterfly" />
- <param name="bflyHeapSpaceMax" type="select" label="bflyHeapSpaceMax" help="Java heap space maximum value for Butterfly">
- <option value="1G">1G</option>
- <option value="2G">2G</option>
- <option value="4G" selected="true">4G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- </param>
-
- <param name="min_contig_length" type="integer" value="200" min="1" label="Minimum Contig Length" help=""/>
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta" name="assembled_transcripts" label="${tool.name} on ${on_string}: Assembled Transcripts" from_work_dir="trinity_out_dir/Trinity.fasta"/>
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml.Graham_version_022014 b/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml.Graham_version_022014
deleted file mode 100644
index d92bb13..0000000
--- a/galaxy-plugin/GauravGalaxy/trinityrnaseq.xml.Graham_version_022014
+++ /dev/null
@@ -1,127 +0,0 @@
-<tool id="trinityrnaseq" name="Trinity" version="0.0.1">
-
- <!-- Written by Jeremy Goecks, now maintained here by bhaas -->
- <description>De novo assembly of RNA-Seq data Using Trinity</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command>
- Trinity.pl --JM $JM --CPU $CPU
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --group_pairs_distance $inputs.group_pairs_distance
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- #if str($additional_params.use_additional) == "yes":
- --min_kmer_cov $additional_params.min_kmer_cov --max_reads_per_graph $additional_params.max_reads_per_graph --bflyHeapSpaceMax $additional_params.bflyHeapSpaceMax
- #if $additional_params.bfly_opts != 'None':
- --bfly_opts " $additional_params.bfly_opts "
- #end if
- #end if
-
-
- ## direct to output
- > $trinity_log 2>&1
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- <option value="200G">200G</option>
- <option value="500G">500G</option>
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by Trinity" />
-
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="group_pairs_distance" type="integer" value="500" min="1" label="Group pairs distance" help="Maximum length expected between fragment pairs"/>
- <param name="path_reinforcement_distance" type="integer" value="75" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- <param name="path_reinforcement_distance" type="integer" value="40" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
- </when>
- </conditional>
-
- <conditional name="additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="min_kmer_cov" type="integer" value="1" min="1" label="inchworm_min_kmer_cov" help="Minimum kmer coverage required by Inchworm for initial contig construction" />
- <param name="max_reads_per_graph" type="integer" value="20000000" min="10000" label="chrysalis_max_reads_per_graph" help="Maximum number of reads to be anchored within each transcript graph by Chrysalis" />
-
-
- <param name="bfly_opts" type="text" value="None" label="bfly_opts" help="Options to pass on to Butterfly" />
- <param name="bflyHeapSpaceMax" type="select" label="bflyHeapSpaceMax" help="Java heap space maximum value for Butterfly">
- <option value="1G">1G</option>
- <option value="2G">2G</option>
- <option value="4G" selected="true">4G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- </param>
-
- <param name="min_contig_length" type="integer" value="200" min="1" label="Minimum Contig Length" help=""/>
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta" name="assembled_transcripts" label="${tool.name} on ${on_string}: Assembled Transcripts" from_work_dir="trinity_out_dir/Trinity.fasta"/>
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/GauravGalaxy/trinityrnaseq_norm.xml b/galaxy-plugin/GauravGalaxy/trinityrnaseq_norm.xml
deleted file mode 100644
index 8e13024..0000000
--- a/galaxy-plugin/GauravGalaxy/trinityrnaseq_norm.xml
+++ /dev/null
@@ -1,102 +0,0 @@
-<tool id="trinityrnaseq_norm" name="Trinity read normalization" version="0.0.1">
-<!--tool id="trinityrnaseq" name="Trinity" version="0.0.1" -->
-
- <!-- Written by Jeremy Goecks, modified by Josh Bowden for normalization proceedure, now maintained here by bhaas -->
- <description>Pre-process RNA-seq data to reduce coverage of highly covered areas</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="perl">
- trinityToolWrapper.py util/normalize_by_kmer_coverage.pl --JM $JM --max_cov $MAXCOV
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- --outleft $output_left --outright $output_right
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
-
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --outputsingle $output_single
- #end if
- --galaxy
-
- ## direct to output
- > $trinity_coverage_normalization_log
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- </param>
-
- <param name="MAXCOV" type="select" label="MAXCOV" help="Read coverage in terms of maximum covarge to keep">
- <option value="30">30</option>
- <option value="40">40</option>
- <option value="50">50</option>
- <option value="60">60</option>
- <option value="70">70</option>
- <option value="100">100</option>
- </param>
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
-
- </when>
- </conditional>
- </inputs>
-
- <outputs>
- <!-- I have not found a way to do condional outputs so all potential output files are specified and some will be empty -->
- <data format="txt" name="trinity_coverage_normalization_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta,fastq" name="output_left" label="${tool.name} on ${on_string}: Normalized left data" />
- <data format="fasta,fastq" name="output_right" label="${tool.name} on ${on_string}: Normalized right data" />
- <data format="fasta,fastq" name="output_single" label="${tool.name} on ${on_string}: Normalized data" />
- <!-- data format="fastq" name="normalized right dataset" label="${tool.name} on ${on_string}: Normalized right data " from_work_dir="${inputs.right_input}.${inputs.input.ext}.normalized_K25_C${MAXCOV}_pctSD100.fq"/-->
- </outputs>
- <tests>
- </tests>
- <help>
- Runs script $TRINITY_HOME/util/normalize_by_kmer_coverage.pl which reduces data sizes with minimal impact on recovered transcripts when used by Trinity.pl.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/RSEM_abundance_estimation.xml b/galaxy-plugin/RSEM_abundance_estimation.xml
deleted file mode 100644
index 0ad744e..0000000
--- a/galaxy-plugin/RSEM_abundance_estimation.xml
+++ /dev/null
@@ -1,72 +0,0 @@
-<tool id="RSEM_abundance_estimation" name="RSEM_abundance_estimation" version="0.0.1">
-
- <description>run RSEM to estimate transcript abundances</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/RSEM_util/run_RSEM.pl --transcripts $transcripts --name_sorted_bam $name_sorted_bam
-
- ## Inputs.
- #if str($read_type.paired_or_single) == "single":
- #if int($read_type.fragment_length) > 0:
- --fragment_length $read_type.fragment_length
- #end if
- #end if
-
-
- #if str($target_type) == "Other":
- --no_group_by_component
- #end if
-
-
- </command>
- <inputs>
- <param format="fasta" name="transcripts" type="data" label="transcripts_fasta" help="Fasta sequences for which reads are aligned." />
-
- <param name="target_type" type="select" label="transcript target type" help="">
- <option value="trinity_mode">Trinity</option>
- <option value="other">Other</option>
- </param>
-
-
- <conditional name="read_type">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- </when>
- <when value="single">
- <param name="fragment_length" type="integer" value="0" label="fragment_length" help="Mean fragment length. Note, for de novo assemblies, leave as zero (treated as unset)"/>
- </when>
- </conditional>
-
- <param format="bam" name="name_sorted_bam" type="data" label="name_sorted_bam_file" help=""/>
-
-
- </inputs>
- <outputs>
- <data format="text" name="transcript_counts" label="${tool.name} on ${on_string}: Isoform Counts" from_work_dir="RSEM.isoforms.results"/>
- <data format="text" name="gene_counts" label="${tool.name} on ${on_string}: Gene counts" from_work_dir="RSEM.genes.results"/>
-
-
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/RSEM_estimates_to_matrix.xml b/galaxy-plugin/RSEM_estimates_to_matrix.xml
deleted file mode 100644
index 3057eb0..0000000
--- a/galaxy-plugin/RSEM_estimates_to_matrix.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<tool id="RSEM_estimates_to_matrix" name="RSEM_estimates_to_matrix" version="0.0.1">
-
- <description>Join RSEM estimates from multiple samples into a single matrix</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/RSEM_util/merge_RSEM_counts_and_labels_single_table.pl
-
- --labels
- #for $entry in $RSEM_samples:
- ${entry.column_label}
- #end for
-
- --RSEM_counts
- #for $entry in $RSEM_samples:
- ${entry.file}
- #end for
-
- > RSEM.counts.matrix
-
- </command>
- <inputs>
-
- <repeat name="RSEM_samples" title="RSEM abundance estimates for samples">
- <param name="file" label="Add file" type="data" format="text"/>
- <param name="column_label" label="column label" type="text" />
- </repeat>
-
- </inputs>
- <outputs>
- <data format="text" name="counts_matrix" label="${tool.name} on ${on_string}: Counts Matrix" from_work_dir="RSEM.counts.matrix"/>
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/Trinity b/galaxy-plugin/Trinity
deleted file mode 100755
index be72d56..0000000
--- a/galaxy-plugin/Trinity
+++ /dev/null
@@ -1,2707 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-use threads;
-no strict qw(subs refs);
-
-use FindBin;
-use lib ("$FindBin::Bin/PerlLib");
-use File::Basename;
-use Time::localtime;
-use Cwd;
-use Carp;
-use COMMON;
-use Getopt::Long qw(:config no_ignore_case pass_through);
-
-BEGIN {
-
- $ENV{TRINITY_HOME} = "$FindBin::Bin";
-
-}
-
-use HTC::GridRunner;
-
-
-open (STDERR, ">&STDOUT"); ## capturing stderr and stdout in a single stdout stream
-
-# Site specific setup
-
-my $CPU_MAX = 64; # set higher at your own risk. Definitely don't set it higher than the number of cores available on your machine.
-
-my $KMER_SIZE = 25;
-my $MAX_KMER_SIZE = 32;
-
-my $INCHWORM_CUSTOM_PARAMS;
-
-# option list:
-my ($seqType, @left_files, @right_files, @single_files, $SS_lib_type, $min_contig_length,
- $group_pairs_distance, $jaccard_clip, $show_advanced_options,
- $output_directory, $prep_only
- );
-
-# What is allowed for the options. Put string to be displayed in '%allowed'; this
-# will be showed to the user via help and on error. Keys are the variable names.
-# Actual hash to be used for checking is auto-generated. Fancy regex inside map
-# is just to get rid of the syntaxical sugar 'or' in the display string.
-
-my %allowed =
- ( seqType => 'fa, or fq'
- , kmer_method => 'jellyfish, meryl, or inchworm'
- );
-
-my %allowed_check;
-foreach my $all (keys %allowed) {
- my %h = map { (my $s = $_) =~ s/^or //; $s => 1 } split ', ', $allowed{$all};
- $allowed_check{$all} = \%h;
-}
-
-# defaults:
-
-$output_directory = &create_full_path("trinity_out_dir", 0);
-
-
-# butterfly opts
-$min_contig_length = 200;
-$group_pairs_distance = 500;
-my $path_reinforcement_distance;
-my $PE_path_reinforcement_distance = 75;
-my $SE_path_reinforcement_distance = 25;
-
-my $NO_RUN_BUTTERFLY_FLAG = 0;
-my $RERUN_BUTTERFLY_FLAG = 0;
-my $bfly_opts = "";
-my $bflyHeapSpaceMax = "10G";
-my $bflyHeapSpaceInit = "1G";
-
-my $BFLY_JAR = "";
-
-# butterfly path merging criteria
-my $NO_PATH_MERGING = 0;
-my $MIN_PER_ID_SAME_PATH; # leave these at the butterfy defaults
-my $MAX_DIFFS_SAME_PATH;
-my $MAX_INTERNAL_GAP_SAME_PATH;
-
-
-# misc opts
-my $min_kmer_cov = 1;
-my $meryl_opts = "";
-my $inchworm_cpu = 6;
-
-my $min_percent_read_iworm_kmers = -1; # experimental, off
-
-my $CPU = 2;
-my $bflyCPU;
-my $bflyCalculateCPU = 0;
-my $bflyGCThreads = 2;
-
-my $long_reads = "";
-
-
-## ADVANCED OPTIONS:
-
-my $no_meryl_flag = 0;
-
-## Chrysalis opts
-my $min_glue = 2;
-my $min_iso_ratio = 0.05;
-my $glue_factor = 0.05;
-my $max_reads_per_graph = 200000;
-my $max_reads_per_loop = 10000000;
-my $min_pct_read_mapping = 0;
-my $NO_RUN_QUANTIFYGRAPH_FLAG = 0;
-my $NO_RUN_CHRYSALIS_FLAG = 0;
-my $chrysalis_output_dir = "chrysalis";
-my $component_directory;
-
-my $help_flag;
-my $advanced_help_flag;
-my $SHOW_CITATION_FLAG = 0;
-
-my $VERSION = "trinityrnaseq_r20140717";
-my $show_version_flag = 0;
-
-## Kmer methods
-my $kmer_method = "";
-
-## Jellyfish
-my $max_memory;
-
-
-## Grid computing options:
-my $grid_conf_file;
-
-## Performance monitoring options
-my $pm_logfile = "Trinity.timing";
-my $pm_trinity_startstring;
-my $pm_trinity_endstring;
-my $pm_trinity_start=0;
-my $pm_trinity_end=0;
-my $pm_inchworm_start=0;
-my $pm_inchworm_end=0;
-my $pm_chrysalis_start=0;
-my $pm_chrysalis_end=0;
-my $pm_butterfly_start=0;
-my $pm_butterfly_end=0;
-my $pm_left_fa_size=0;
-my $pm_right_fa_size=0;
-my $pm_single_fa_size=0;
-my $pm_trinity_fa_size=0;
-my $pm_trinity_arguments="";
-my $pm_inchworm_kmers=0;
-my $pm_read_count=0;
-
-my $run_with_collectl = 0;
-# flush each second, record procs+rest every 5 secs, use only process subsystem
-my $collectl_param = "-F1 -i5:5 -sZ";
-my $collectl_output_directory = "collectl";
-my $collectl_pid = 0;
-my $collectl_out = "";
-my $collectl_titlename = "";
-my $start_dir = cwd();
-
-## misc other opts, mostly for testing purposes
-my $run_as_paired_flag = 0; ## in case we have paired reads in single fasta file, already oriented.
-my $weldmer_size = 48;
-my $FORCE_INCHWORM_KMER_METHOD = 0;
-
-my $TRIPLET_LOCK = 1;
-my $EXTENDED_TRIPLET_LOCK = 0;
-
-
-
-
-my $PARALLEL_IWORM_FLAG = 1;
-my $NO_PARALLEL_IWORM = 0;
-
-## Quality trimming params
-my $RUN_TRIMMOMATIC_FLAG = 0;
-my $trimmomatic_quality_trim_params = "LEADING:5 TRAILING:5 MINLEN:36";
-
-## Normalize reads
-my $NORMALIZE_READS_FLAG = 0;
-my $normalize_max_read_cov = 50;
-my $NORMALIZE_BY_READ_SET = 0;
-
-# Note: For the Trinity logo below the backslashes are quoted in order to keep
-# them from quoting the character than follows them. "\\" keeps "\ " from occuring.
-
-my $basic_usage = qq^
-
-
-###############################################################################
-#
-# ______ ____ ____ ____ ____ ______ __ __
-# | || \\ | || \\ | || || | |
-# | || D ) | | | _ | | | | || | |
-# |_| |_|| / | | | | | | | |_| |_|| ~ |
-# | | | \\ | | | | | | | | | |___, |
-# | | | . \\ | | | | | | | | | | |
-# |__| |__|\\_||____||__|__||____| |__| |____/
-#
-###############################################################################
-#
-# Required:
-#
-# --seqType <string> :type of reads: ( $allowed{seqType} )
-#
-# --JM <string> :(Jellyfish Memory) number of GB of system memory to use for
-# k-mer counting by jellyfish (eg. 10G) *include the 'G' char
-#
-# If paired reads:
-# --left <string> :left reads, one or more (separated by space)
-# --right <string> :right reads, one or more (separated by space)
-#
-# Or, if unpaired reads:
-# --single <string> :single reads, one or more (note, if single file contains pairs, can use flag: --run_as_paired )
-#
-####################################
-## Misc: #########################
-#
-# --SS_lib_type <string> :Strand-specific RNA-Seq read orientation.
-# if paired: RF or FR,
-# if single: F or R. (dUTP method = RF)
-# See web documentation.
-#
-# --CPU <int> :number of CPUs to use, default: $CPU
-# --min_contig_length <int> :minimum assembled contig length to report
-# (def=$min_contig_length)
-#
-# --genome <string> :genome guided mode, provide path to genome fasta file (see genome-guided param section under --show_full_usage_info)
-#
-# --jaccard_clip :option, set if you have paired reads and
-# you expect high gene density with UTR
-# overlap (use FASTQ input file format
-# for reads).
-# (note: jaccard_clip is an expensive
-# operation, so avoid using it unless
-# necessary due to finding excessive fusion
-# transcripts w/o it.)
-#
-# --trimmomatic :run Trimmomatic to quality trim reads
-# see '--quality_trimming_params' under full usage info for tailored settings.
-#
-#
-# --normalize_reads :run in silico normalization of reads. Defaults to max. read coverage of $normalize_max_read_cov.
-# see '--normalize_max_read_cov' under full usage info for tailored settings.
-#
-#
-# --output <string> :name of directory for output (will be
-# created if it doesn't already exist)
-# default( your current working directory: "$output_directory" )
-#
-# --full_cleanup :only retain the Trinity fasta file, rename as \${output_dir}.Trinity.fasta
-#
-# --cite :show the Trinity literature citation
-#
-# --version :reports Trinity version ($VERSION) and exits.
-#
-# --show_full_usage_info :show the many many more options available for running Trinity (expert usage).
-^;
-
-my $full_usage = qq^
-# --prep :Only prepare files (high I/O usage) and stop before kmer counting.
-#
-# --full_cleanup_ET :only retains assembly fasta file, error tolerant (ET)
-#
-# --no_cleanup :retain all intermediate input files.
-#
-####################################################
-# Inchworm and K-mer counting-related options: #####
-#
-# --min_kmer_cov <int> :min count for K-mers to be assembled by
-# Inchworm (default: $min_kmer_cov)
-# --inchworm_cpu <int> :number of CPUs to use for Inchworm, default is min(6, --CPU option)
-#
-# --no_run_inchworm :stop after running jellyfish, before inchworm.
-#
-###################################
-# Chrysalis-related options: ######
-#
-# --max_reads_per_graph <int> :maximum number of reads to anchor within
-# a single graph (default: $max_reads_per_graph)
-# --min_glue <int> :min number of reads needed to glue two inchworm contigs
-# together. (default: $min_glue)
-# --no_run_chrysalis :stop Trinity after Inchworm and before
-# running Chrysalis
-# --no_run_quantifygraph :stop Trinity just before running the
-# parallel QuantifyGraph computes, to
-# leverage a compute farm and massively
-# parallel execution..
-#
-# --chrysalis_output <string> :name of directory for chrysalis output (will be
-# created if it doesn't already exist)
-# default( "$chrysalis_output_dir" )
-#
-# --no_bowtie :dont run bowtie to use pair info in chrysalis clustering.
-#
-#####################################
-### Butterfly-related options: ####
-#
-# --bfly_opts <string> :additional parameters to pass through to butterfly
-# (see butterfly options: java -jar Butterfly.jar ).
-# (note: only for expert or experimental use. Commonly used parameters are exposed through this Trinity menu here).
-#
-# //////////////////////////////////
-# Alternative reconstruction modes:
-# Default mode is the 'regular' Butterfly transcript reconstruction by graph node extension.
-#
-# --PasaFly PASA-like algorithm for maximally-supported isoforms
-# or
-# --CuffFly Cufflinks-like algorithm to report minimum transcripts
-#
-#
-# Butterfly read-pair grouping settings (used for all reconstruction modes to define 'pair paths'):
-#
-# --group_pairs_distance <int> :maximum length expected between fragment pairs (default: $group_pairs_distance)
-# (reads outside this distance are treated as single-end)
-#
-# ///////////////////////////////////////////////
-# Butterfly default reconstruction mode settings. (no CuffFly or PasaFly custom settings are currently available).
-#
-# --path_reinforcement_distance <int> :minimum overlap of reads with growing transcript
-# path (default: PE: $PE_path_reinforcement_distance, SE: $SE_path_reinforcement_distance)
-# Set to 1 for the most lenient path extension requirements.
-#
-# --no_triplet_lock : (increase stringency of regular butterfly reconstruction (default: on))
-# lock triplet-supported nodes: node 'c' having read path 'A-B-C' disables 'Z-B-C' if no such read support exists.
-#
-# --extended_lock : (further increase the stringency of regular butterfy reconstruction)
-# extend the triplet lock to include longer range read path information.
-# ex. in extending path 'A-B-Z' to 'A-B-Z-D', we only find read support for 'A-B-C-D', that 'A-B-Z' extension to 'D' will be blocked.
-#
-#
-# /////////////////////////////////////////
-# Butterfly transcript reduction settings:
-#
-# --NO_EM_REDUCE : do not run the final EM step to rank transcripts and remove lower-ranking entries that lack unique read conent.
-#
-# --no_path_merging : all final transcript candidates are output (including SNP variations, however, some SNPs may be unphased)
-#
-# By default, alternative transcript candidates are merged (in reality, discarded) if they are found to be too similar, according to the following logic:
-#
-# (identity=(numberOfMatches/shorterLen) > 95.0% or if we have <= 2 mismatches) and if we have internal gap lengths <= 10
-#
-# with parameters as:
-#
-# --min_per_id_same_path <int> default: 95 min percent identity for two paths to be merged into single paths
-# --max_diffs_same_path <int> default: 2 max allowed differences encountered between path sequences to combine them
-# --max_internal_gap_same_path <int> default: 10 maximum number of internal consecutive gap characters allowed for paths to be merged into single paths.
-#
-# If, in a comparison between two alternative transcripts, they are found too similar, the transcript with the greatest cumulative
-# compatible read (pair-path) support is retained, and the other is discarded.
-#
-#
-# //////////////////////////////////////////////
-# Butterfly Java and parallel execution settings.
-#
-# --bflyHeapSpaceMax <string> :java max heap space setting for butterfly
-# (default: $bflyHeapSpaceMax) => yields command
-# 'java -Xmx$bflyHeapSpaceMax -jar Butterfly.jar ... \$bfly_opts'
-# --bflyHeapSpaceInit <string> :java initial hap space settings for
-# butterfly (default: $bflyHeapSpaceInit) => yields command
-# 'java -Xms$bflyHeapSpaceInit -jar Butterfly.jar ... \$bfly_opts'
-# --bflyGCThreads <int> :threads for garbage collection
-# (default: $bflyGCThreads))
-# --bflyCPU <int> :CPUs to use (default will be normal
-# number of CPUs; e.g., $CPU)
-# --bflyCalculateCPU :Calculate CPUs based on 80% of max_memory
-# divided by maxbflyHeapSpaceMax
-# --no_run_butterfly :stops after the Chrysalis stage. You'll
-# need to run the Butterfly computes
-# separately, such as on a computing grid.
-# Then, concatenate all the Butterfly assemblies by running:
-# 'find trinity_out_dir/ -name "\*allProbPaths.fasta" \
-# -exec cat {} + > trinity_out_dir/Trinity.fasta'
-#
-# --bfly_jar <string> : /path/to/Butterfly.jar, otherwise default
-# Trinity-installed version is used.
-#
-
-#
-################################################################################
-#### Quality Trimming Options ####
-#
-# --quality_trimming_params <string> defaults to: "$trimmomatic_quality_trim_params"
-#
-################################################################################
-#### In silico Read Normalization Options ###
-#
-# --normalize_max_read_cov <int> defaults to 50
-# --normalize_by_read_set run normalization separate for each pair of fastq files,
-# then one final normalization that combines the individual normalized reads.
-# Consider using this if RAM limitations are a consideration.
-#
-################################################################################
-#### Genome-guided de novo assembly
-#
-# * required:
-#
-# --genome_guided_max_intron <int> :maximum allowed intron length (also maximum fragment span on genome)
-#
-# --genome_guided_use_bam <string> :use a provided coord-sorted bam file as starting point. Otherwise, use gmap to align to the genome.
-#
-# * optional:
-#
-# --genome_guided_min_coverage <int> :minimum read coverage for identifying and expressed region of the genome. (default: 1)
-#
-# --genome_guided_min_reads_per_partition <int> :default min of 10 reads per partition
-#
-# --genome_guided_CPU <int> : number of threads for the individual genome-guided Trinity commands to use. (defaults to --CPU setting)
-#
-# --genome_guided_sort_buffer <string> : amount of RAM to dedicate to the initial prep of genome-guided read partitioning (defaults to --JM)
-#
-#
-# --GMAP_CPU <int> :defaults to --CPU setting.
-#
-# --genome_guided_just_prep : process stops after prepping the reads for assembly (prior to submitting to a computing grid for parallel execution)
-#
-#################################
-# Grid-computing options: #######
-#
-# --grid_conf_file <string> :configuration file for supported compute farms
-# ex. TRINITY_HOME/htc_conf/BroadInst_LSF.conf
-# currently supported computing gris: LSF, SGE
-#
-#
- ^;
-
-my $usage_synopsis = qq^
-###############################################################################
-#
-# *Note, a typical Trinity command might be:
-#
-# Trinity --seqType fq --JM 100G --left reads_1.fq --right reads_2.fq --CPU 6
-#
-#
-# and for Genome-guided Trinity:
-#
-# Trinity --genome genome.fasta \
-# --genome_guided_max_intron 10000 --genome_guided_sort_buffer 10G \
-# --genome_guided_CPU 4 \
-# --seqType fq --JM 2G --left reads_1.fq --right reads_2.fq --CPU 6
-# (and optionally provide your own bam file: --genome_guided_use_bam rnaseq_alignments.csorted.bam
-# or Trinity will run GSNAP to generate one. )
-#
-#
-# see: $FindBin::RealBin/sample_data/test_Trinity_Assembly/
-# for sample data and 'runMe.sh' for example Trinity execution
-# For more details, visit: http://trinityrnaseq.sf.net
-#
-###############################################################################
-
-
- ^;
-
-
-
-my $advanced_usage = <<_ADVANCEDUSAGE_;
-###################################################################################
- ## Not intended for users, instead for experimentation by developers ##
-###################################################################################
-#
-#
-# Inchworm-related options:
-#
-# --INCHWORM_CUSTOM_PARAMS <string> :additional parameters to be passed on to Inchworm
-# --FORCE_INCHWORM_KMER_METHOD :uses inchworm built-in kmer cataloger instead of jellyfish (not recommended)
-# --long_reads <string> :fasta file containing corrected pac bio reads
-# --NO_PARALLEL_IWORM : turn off parallel iworm assembly
-#
-#
-# Chyrsalis-related options:
-#
-# --min_pcnt_read_iworm_kmers <int> :min percentage of a read sequence that must be composed of inchworm kmers to be pursued
-# by chrysalis (default: $min_percent_read_iworm_kmers) note: off if < 0
-#
-# --min_iso_ratio <float> :min fraction of average kmer coverage between two iworm contigs
-# required for gluing. (default: $min_iso_ratio)
-# --glue_factor <float> :fraction of max (iworm pair coverage) for read glue support (default: $glue_factor)
-#
-# --max_reads_per_loop <int> :maximum number of reads to read into
-# memory at once (default: $max_reads_per_loop)
-# --min_pct_read_mapping <int> :minimum percent of a reads kmers that must map to an
-# inchworm bundle (aka. component) default: 0
-#
-# --bowtie_components :use bowtie2 to generate readsToTranscripts mappings
-#
-#
-# Other:
-# --monitoring :use collectl to monitor all steps of Trinity
-#
-# --compdir|component_directory : use a temporary or local directory for Components_bin
-#
-#
-
-
-_ADVANCEDUSAGE_
-
- ;
-
-
-my $ROOTDIR = "$FindBin::RealBin";
-my $UTILDIR = "$ROOTDIR/util";
-my $INCHWORM_DIR = "$ROOTDIR/Inchworm";
-my $CHRYSALIS_DIR = "$ROOTDIR/Chrysalis";
-my $BUTTERFLY_DIR = "$ROOTDIR/Butterfly";
-my $JELLYFISH_DIR = "$ROOTDIR/trinity-plugins/jellyfish";
-my $FASTOOL_DIR = "$ROOTDIR/trinity-plugins/fastool";
-my $COLLECTL_DIR = "$ROOTDIR/trinity-plugins/collectl/bin";
-my $COREUTILS_DIR = "$ROOTDIR/trinity-plugins/coreutils/bin";
-my $PARAFLY = "$ROOTDIR/trinity-plugins/parafly/bin/ParaFly";
-my $TRIMMOMATIC = "$ROOTDIR/trinity-plugins/Trimmomatic/trimmomatic.jar";
-
-my $usage = $basic_usage . $usage_synopsis;
-
-unless (@ARGV) {
- die "$usage\n";
-}
-
-# Log command line parameters for performance monitoring
-foreach (@ARGV) {
- $pm_trinity_arguments = $pm_trinity_arguments . " " . $_;
-};
-
-
-my $sort_exec = &COMMON::get_sort_exec($CPU);
-
-my $NO_FASTOOL = 0;
-my $NO_CLEANUP = 0;
-my $FULL_CLEANUP = 0;
-my $FULL_CLEANUP_ERROR_TOLERANT = 0; ## NOTE, THIS IS AN AWFUL IDEA... //FIXME: add propper error-handling mechanisms
-my $NO_BOWTIE = 0;
-
-
-my $BOWTIE_COMP = 0;
-
-my $NO_RUN_INCHWORM_FLAG = 0;
-
-my $JELLY_S;
-
-
-my $PASAFLY_MODE = 0;
-my $CUFFFLY_MODE = 0;
-
-my $full_usage_info_flag;
-
-my $NO_TRIPLET_LOCK;
-my $NO_EM_REDUCE;
-
-## Genome-guided params:
-my $genome_fasta_file;
-my $genome_guided_max_intron;
-my $genome_guided_use_bam;
-my $genome_guided_min_coverage = 1;
-my $genome_guided_min_reads_per_partition = 10;
-my $GMAP_CPU;
-my $genome_guided_CPU;
-my $genome_guided_sort_buffer;
-my $genome_guided_just_prep_flag = 0;
-
-my @ORIG_ARGS = @ARGV;
-
-&GetOptions(
-
- 'h|help' => \$help_flag,
- 'advanced_help' => \$advanced_help_flag,
- 'show_full_usage_info' => \$full_usage_info_flag,
-
- ## general opts
- "seqType=s" => \$seqType,
- "left=s{,}" => \@left_files,
- "right=s{,}" => \@right_files,
- "single=s{,}" => \@single_files,
-
- "SS_lib_type=s" => \$SS_lib_type,
-
- "long_reads=s" => \$long_reads,
-
- "output=s" => \$output_directory,
-
- "min_contig_length=i" => \$min_contig_length,
-
- "jaccard_clip" => \$jaccard_clip,
-
- "cite" => \$SHOW_CITATION_FLAG,
-
- 'CPU=i' => \$CPU,
-
- 'prep' => \$prep_only,
-
- 'KMER_SIZE=i' => \$KMER_SIZE,
-
-
- # Quality trimming:
- 'trimmomatic' => \$RUN_TRIMMOMATIC_FLAG,
- 'quality_trimming_params=s' => \$trimmomatic_quality_trim_params,
-
- # In silico read normalization
- 'normalize_reads' => \$NORMALIZE_READS_FLAG,
- 'normalize_max_read_cov=i' => \$normalize_max_read_cov,
- 'normalize_by_read_set' => \$NORMALIZE_BY_READ_SET,
-
-
- # Butterfly opts
- 'no_run_butterfly' => \$NO_RUN_BUTTERFLY_FLAG,
- 'no_triplet_lock' => \$NO_TRIPLET_LOCK,
- 'extended_lock' => \$EXTENDED_TRIPLET_LOCK,
- "group_pairs_distance=i" => \$group_pairs_distance,
- 'bfly_opts=s' => \$bfly_opts,
- 'bflyHeapSpaceMax=s' => \$bflyHeapSpaceMax,
- 'bflyHeapSpaceInit=s' => \$bflyHeapSpaceInit,
- 'bflyGCThreads=i' => \$bflyGCThreads,
- 'bflyCPU=i' => \$bflyCPU,
- 'bflyCalculateCPU' => \$bflyCalculateCPU,
- 'bfly_jar=s' => \$BFLY_JAR,
-
- 'path_reinforcement_distance=i' => \$path_reinforcement_distance,
- 'rerun_butterfly' => \$RERUN_BUTTERFLY_FLAG,
-
- 'NO_EM_REDUCE' => \$NO_EM_REDUCE,
- 'no_path_merging' => \$NO_PATH_MERGING,
- 'min_per_id_same_path=i' => \$MIN_PER_ID_SAME_PATH,
- 'max_diffs_same_path=i' => \$MAX_DIFFS_SAME_PATH,
- 'max_internal_gap_same_path=i' => \$MAX_INTERNAL_GAP_SAME_PATH,
-
-
- 'PasaFly' => \$PASAFLY_MODE,
- 'CuffFly' => \$CUFFFLY_MODE,
-
- # Inchworm & kmer catalog opts
-
- 'min_kmer_cov=i' => \$min_kmer_cov,
- 'inchworm_cpu=i' => \$inchworm_cpu,
- 'FORCE_INCHWORM_KMER_METHOD' => \$FORCE_INCHWORM_KMER_METHOD,
- 'INCHWORM_CUSTOM_PARAMS=s' => \$INCHWORM_CUSTOM_PARAMS,
- 'no_run_inchworm' => \$NO_RUN_INCHWORM_FLAG,
-
- # Jellyfish
- 'JM=s' => \$max_memory, # in GB
-
- # Chrysalis -related opts
- 'min_glue=i' => \$min_glue,
- 'glue_factor=f' => \$glue_factor,
- 'min_iso_ratio=f' => \$min_iso_ratio,
- 'min_pcnt_read_iworm_kmers=i' => \$min_percent_read_iworm_kmers,
- 'no_run_quantifygraph' => \$NO_RUN_QUANTIFYGRAPH_FLAG,
- 'max_reads_per_graph=i' => \$max_reads_per_graph,
- 'max_reads_per_loop=i' => \$max_reads_per_loop,
- 'no_run_chrysalis' => \$NO_RUN_CHRYSALIS_FLAG,
- 'min_pct_read_mapping=i' => \$min_pct_read_mapping,
- 'weldmer_size=i' => \$weldmer_size,
- "chrysalis_output=s" => \$chrysalis_output_dir,
- "no_bowtie" => \$NO_BOWTIE,
- "bowtie_comp" => \$BOWTIE_COMP,
-
- # Grid computing options
- 'grid_conf_file=s' => \$grid_conf_file,
-
- "show_advanced_options" => \$show_advanced_options,
-
-
- # misc
- 'run_as_paired' => \$run_as_paired_flag,
- 'no_fastool' => \$NO_FASTOOL,
- 'no_cleanup' => \$NO_CLEANUP,
- 'full_cleanup' => \$FULL_CLEANUP,
- 'version' => \$show_version_flag,
- 'monitoring' => \$run_with_collectl,
- 'full_cleanup_ET' => \$FULL_CLEANUP_ERROR_TOLERANT,
-
- # hidden (don't look here! ;)
- 'KMER_SIZE=i' => \$KMER_SIZE,
- 'jelly_s=i' => \$JELLY_S,
- 'compdir|component_directory=s' => \$component_directory,
- 'NO_PARALLEL_IWORM' => \$NO_PARALLEL_IWORM,
-
-
-
- # genome guided
- "genome=s" => \$genome_fasta_file,
- "genome_guided_max_intron=i" => \$genome_guided_max_intron,
- "genome_guided_use_bam=s" => \$genome_guided_use_bam,
- "genome_guided_min_coverage=i" => \$genome_guided_min_coverage,
- "genome_guided_min_reads_per_partition=i" => \$genome_guided_min_reads_per_partition,
- "genome_guided_CPU=i" => \$genome_guided_CPU,
- "GMAP_CPU=i" => \$GMAP_CPU,
- "genome_guided_sort_buffer=s" => \$genome_guided_sort_buffer,
- "genome_guided_just_prep" => \$genome_guided_just_prep_flag,
-
- );
-
-
-
-if ($SHOW_CITATION_FLAG) {
- &show_lit_citation();
- exit(0);
-}
-
-
-if ($full_usage_info_flag) {
- $usage = $basic_usage . $full_usage . $usage_synopsis;
- die "$usage\n";
-}
-
-
-if ($advanced_help_flag) {
- die "$advanced_usage\n";
-}
-if ($help_flag) {
- die "$usage\n";
-}
-
-if ($show_version_flag) {
- print "Trinity version: $VERSION\n";
- exit(1);
-}
-
-if ($NO_CLEANUP && $FULL_CLEANUP) {
- die "cannot set --no_cleanup and --full_cleanup as they contradict";
-}
-
-
-if ($KMER_SIZE > $MAX_KMER_SIZE) {
- die "Error, kmer size can be at most $MAX_KMER_SIZE ";
-}
-
-
-if ($NO_TRIPLET_LOCK) {
- $TRIPLET_LOCK = 0; # turn it off since on by default.
-}
-if ($NO_PARALLEL_IWORM) {
- # turn it off.
- $PARALLEL_IWORM_FLAG = 0;
-}
-
-my $MIN_IWORM_LEN = $KMER_SIZE;
-
-
-unless ($GMAP_CPU) {
- $GMAP_CPU = $CPU;
-}
-unless ($genome_guided_CPU) {
- $genome_guided_CPU = $CPU;
-}
-
-if (@ARGV) {
- die "Error, do not understand options: @ARGV\n";
-}
-
-if ($run_with_collectl && $^O !~ /linux/i) {
- print STDERR "WARNING, --monitoring can only be used on linux. Turning it off.\n\n";
- $run_with_collectl = 0;
-}
-
-unless ($BFLY_JAR) {
- $BFLY_JAR = "$BUTTERFLY_DIR/Butterfly.jar";
-}
-
-
-## Check options set:
-
-# Subroutine takes variable *reference* plus name of variable. Lower-cases
-# variable value and checks to see if it one of the allowed ones.
-# 'die' has new-line in order to keep line number from being shown to user.
-
-sub check_option {
- my ($option, $name) = @_;
- $$option = lc $$option;
- if ($$option eq '') {
- die "Error, option '--$name' is required.\n";
- }
- if (!defined $allowed_check{$name}{$$option}) {
- die "Error, option '--$name' ($$option) not one of $allowed{$name}\n";
- }
-}
-
-check_option( \$seqType, 'seqType' );
-
-my $USE_FASTOOL = 1; # by default, using fastool for fastq to fasta conversion
-if ($NO_FASTOOL) {
- $USE_FASTOOL = 0;
-}
-
-if ($SS_lib_type) {
- unless ($SS_lib_type =~ /^(R|F|RF|FR)$/) {
- die "Error, unrecognized SS_lib_type value of $SS_lib_type. Should be: F, R, RF, or FR\n";
- }
-}
-
-unless ( (@left_files && @right_files) || @single_files ) {
- die "Error, need either options 'left' and 'right' or option 'single'\n";
-}
-
-if (@left_files) {
- @left_files = split(",", join(",", @left_files));
-}
-if (@right_files) {
- @right_files = split(",", join(",", @right_files));
-}
-if (@single_files) {
- @single_files = split(",", join(",", @single_files));
-}
-
-
-if ($min_iso_ratio > 1) {
- die "Error, --min_iso_ratio should be <= 1 \n";
-}
-
-## keep the original 'xG' format string for the --JM option, then calculate the numerical value for max_memory
-my $JM_string = $max_memory; ## this one is used in the Chrysalis exec string
-if ($max_memory) {
- $max_memory =~ /^([\d\.]+)G$/ or die "Error, cannot parse max_memory value of $max_memory. Set it to 'xG' where x is a numerical value\n";
-
- $max_memory = $1;
- $max_memory *= 1024**3; # convert to from gig to bytes
-}
-else {
- die "Error, must specify max memory for jellyfish to use, eg. --JM 10G \n";
-}
-
-unless ($genome_guided_sort_buffer) {
- $genome_guided_sort_buffer = $JM_string;
-}
-
-## Try to remove stack limits
-if ($^O eq "linux") { # cannot set stacksize on newer macs for some reason...
-# &try_unlimit_stacksize();
-}
-
-my $curr_limit_settings = `/bin/sh -c 'ulimit -a' `;
-unless ($curr_limit_settings && $curr_limit_settings =~ /\w/) {
- $curr_limit_settings = `/bin/csh -c limit`; # backup, probably not needed.
-}
-
-print "Current settings:\n$curr_limit_settings\n\n";
-
-
-## Check Java version:
-unless ($NO_RUN_BUTTERFLY_FLAG || $NO_RUN_CHRYSALIS_FLAG) {
- my $java_version = `java -Xmx64m -version 2>&1 `;
- unless ($java_version =~ /(java|openjdk) version \"1\.[67]\./) {
- die "Error, Trinity requires access to Java version 1.6 or 1.7. Currently installed version is: $java_version";
- }
-}
-
-# Give the variable with memory size and a user-oriented name
-
-sub bfly_check {
- my ($mem, $name) = @_;
- my ($num, $type) = $mem =~ /^(\d+)([MG])$/;
- if (!defined $mem || !defined $type) {
- die "Error, $name must be set to a value of format: \\d+G or \\d+M (eg. 1G or 1000M)\n Currently: $mem\n";
- }
- return $type eq 'G' ? $num * 1024**3 : $num * 1024**2;
-}
-
-my $bflyHeapSpaceMaxBytes = bfly_check($bflyHeapSpaceMax , 'bflyHeapSpaceMax' );
-my $bflyHeapSpaceInitBytes = bfly_check($bflyHeapSpaceInit, 'bflyHeapSpaceInit');
-
-if ($bflyHeapSpaceInitBytes > $bflyHeapSpaceMaxBytes) {
- die "Error, bflyHeapSpaceInit ($bflyHeapSpaceInit) must be less or equal to bflyHeapSpaceMax ($bflyHeapSpaceMax).\n";
-}
-
-
-if ($CPU > $CPU_MAX) {
- print STDERR "Warning, --CPU $CPU might be excessive. Limiting it to $CPU_MAX for now.\n";
- $CPU = $CPU_MAX;
-}
-
-if ($inchworm_cpu > $CPU) {
- $inchworm_cpu = $CPU;
-}
-
-if ($bflyCalculateCPU && $max_memory) {
- $bflyCPU = int ($max_memory * 0.80 / $bflyHeapSpaceMaxBytes);
-}
-
-$bflyCPU = $CPU if !defined $bflyCPU;
-
-if ($bflyCPU > $CPU_MAX) {
- print STDERR "Warning, --bflyCPU $bflyCPU might be excessive. Limiting it to $CPU_MAX for now.\n";
- $bflyCPU = $CPU_MAX;
-}
-
-
-if (defined($bflyGCThreads) && $bflyGCThreads > 32) {
- die "Error, you probably want fewer than $bflyGCThreads java garbage collection threads. Try a number less than 32.";
-}
-
-
-if ($genome_fasta_file) {
- ## genome-guided mode.
- unless ($genome_guided_max_intron) {
- die "Error, must specifiy --genome_guided_max_intron <int> for genome-guided mode.\n";
- }
- unless ($genome_guided_use_bam) {
- ## check for gsnap software
- my @tools = qw(gmap_build gsnap);
- foreach my $tool (@tools) {
- my $path = `which $tool`;
- if ($path =~ /\w/) {
- print STDERR "Found $tool at $path\n";
- }
- else {
- die "Error, cannot locate tool: $tool, required for genome-guided pipeline.";
- }
- }
- }
-}
-
-
-
-$ENV{OMP_NUM_THREADS} = $CPU; ## for Inchworm and Chrysalis
-
-
-my $PAIRED_MODE = ( (@left_files && @right_files) || $run_as_paired_flag) ? 1:0;
-if ($PAIRED_MODE && (!$NO_RUN_CHRYSALIS_FLAG) && (!$NO_BOWTIE)) {
- ## be sure we can find 'bowtie', since we use it as part of the iworm pair scaffolding step
- my $bowtie_path = `which bowtie`;
- my $bowtie_build_path = `which bowtie-build`;
- if ($bowtie_path =~ /\w/ && $bowtie_build_path =~ /\w/) {
- print "Paired mode requires bowtie. Found bowtie at: $bowtie_path\n and bowtie-build at $bowtie_build_path\n\n";
- }
- else {
- die "Error, cannot find path to bowtie ($bowtie_path) or bowtie-build ($bowtie_build_path), which is now needed as part of Chrysalis' read scaffolding step. If you should choose to not run bowtie, include the --no_bowtie in your Trinity command.\n\n";
- }
-
- my $samtools_path = `which samtools`;
- if ($samtools_path =~ /\w/) {
- print "Found samtools at: $samtools_path\n";
- }
- else {
- die "Error, cannot find samtools. Please be sure samtools is installed and included in your PATH setting.\n";
- }
-
- unless ($path_reinforcement_distance) {
- $path_reinforcement_distance = $PE_path_reinforcement_distance;
- }
-}
-else {
- unless ($path_reinforcement_distance) {
- $path_reinforcement_distance = $SE_path_reinforcement_distance;
- }
-}
-
-
-my $MKDIR_OUTDIR_FLAG = 0; ## only purging output_directory if we create it in this run.
-
-
-## Regular run. Name the output based on the butterfly reconstruction mode.
-my $butterfly_output_filename = "Trinity.fasta";
-if ($PASAFLY_MODE) {
- $butterfly_output_filename = "Trinity.Pasafly.fasta";
-}
-elsif ($CUFFFLY_MODE) {
- $butterfly_output_filename = "Trinity.Cufffly.fasta";
-}
-
-main: {
- $ENV{OMP_NUM_THREADS} = $CPU;
-
-
- unless ($NO_RUN_BUTTERFLY_FLAG || $NO_RUN_CHRYSALIS_FLAG) {
- print STDERR "-since butterfly will eventually be run, lets test for proper execution of java\n";
- &test_java_failure_capture();
- }
-
- unless ($genome_fasta_file) {
-
- if (basename($chrysalis_output_dir) !~ /chrysalis/i) {
- die "Error, chrysalis output directory name must include 'chrysalis' in the name."; # lets try to prevent bad things from happening... (security issue)
- }
-
- if ($FULL_CLEANUP && basename($output_directory) !~ /\w/) {
- die "Error, working in full-cleanup mode. Specify a named directory for the output. The directory and contents are purged at end of a successful run.";
- }
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) { # genome-guided mode
-
- if (basename($output_directory) !~ /trinity/i) {
- die "Error, in genome-guided mode, the output directory name must include 'trinity' in the name (precautionary measure)";
- }
- $FULL_CLEANUP = 1;
-
- ## purge chrysalis directory from a previously failed run
- if (-d $output_directory) {
- print STDERR "WARNING: $output_directory exists. Since under full-cleanup mode, deleting this first before proceeding.\n:";
- &process_cmd("rm -rf $output_directory");
- }
- }
-
-
- if ($chrysalis_output_dir !~ /^\//) {
- $chrysalis_output_dir = "$output_directory/$chrysalis_output_dir";
- }
-
- $chrysalis_output_dir = &create_full_path($chrysalis_output_dir, 0);
-
- if ($component_directory){
- # does a component directory exist from a previous run?
- if (-e $chrysalis_output_dir.'/Component_bins'){
- if (-l $chrysalis_output_dir.'/Component_bins'){
- $component_directory = readlink($chrysalis_output_dir.'/Component_bins');
- }else{
- $component_directory = $chrysalis_output_dir.'/Component_bins';
- }
- warn "Reusing existing component directory $component_directory\n";
- }else{
- $component_directory .= "/Trinity.$$";
- mkdir($component_directory) || die ("component directory cannot be created or already exists!\n");
- die "Cannot create component directory $component_directory" unless -d $component_directory;
- $component_directory .= "/Component_bins";
- mkdir($component_directory) || die ("component directory cannot be created or already exists!\n");
- die "Cannot create component directory $component_directory" unless -d $component_directory;
- }
- # so that users know where it is/remember to remove it if manually done?
- symlink($component_directory,$chrysalis_output_dir.'/Component_bins') unless -e $chrysalis_output_dir.'/Component_bins';
- }else{
- $component_directory = $chrysalis_output_dir . '/Component_bins';
- $component_directory = &create_full_path($component_directory,0);
- }
- }
-
-
-
- ## create complete paths for input files:
- @left_files = &create_full_path(\@left_files, 1) if @left_files;
- @right_files = &create_full_path(\@right_files, 1) if @right_files;
- @single_files = &create_full_path(\@single_files, 1) if @single_files;
- $output_directory = &create_full_path($output_directory, 0);
- $long_reads = &create_full_path($long_reads, 1) if $long_reads;
- $genome_fasta_file = &create_full_path($genome_fasta_file, 1) if $genome_fasta_file;
- $genome_guided_use_bam = &create_full_path($genome_guided_use_bam, 1) if $genome_guided_use_bam;
-
- $grid_conf_file = &create_full_path($grid_conf_file, 1) if $grid_conf_file;
-
- unless (-d $output_directory) {
-
- &process_cmd("mkdir -p $output_directory");
- $MKDIR_OUTDIR_FLAG = 1;
- }
-
- if ((! $genome_fasta_file) && (! -d $chrysalis_output_dir)) {
- &process_cmd("mkdir -p $chrysalis_output_dir"); # note, won't be auto-cleaned up if not in the trinity_out_dir/
- }
-
- chdir ($output_directory) or die "Error, cannot cd to $output_directory";
-
- collectl_start() unless ($FULL_CLEANUP);
- &perfmon_start() unless ($FULL_CLEANUP);
-
- ##########################
- ## Run Quality Trimming
- ##########################
-
- if ($RUN_TRIMMOMATIC_FLAG) {
-
- print STDERR "---------------------------------------------------------------\n"
- . "------ Quality Trimming Via Trimmomatic ---------------------\n"
- . "<< $trimmomatic_quality_trim_params >>\n"
- . "---------------------------------------------------------------\n\n";
-
-
- unless ($seqType eq 'fq') {
- die "Error, cannot do quality trimming on fasta files, need fastq files.";
- }
-
- if (@left_files && @right_files) {
- my @trimmed_left_files;
- my @trimmed_right_files;
-
- while (@left_files) {
- my $left_file = shift @left_files;
- my $right_file = shift @right_files;
-
- my ($left_file_trimmed, $right_file_trimmed) = &run_trimmomatic_PE($left_file, $right_file, $trimmomatic_quality_trim_params);
- push (@trimmed_left_files, $left_file_trimmed);
- push (@trimmed_right_files, $right_file_trimmed);
- }
-
- @left_files = @trimmed_left_files;
- @right_files = @trimmed_right_files;
- }
- elsif (@single_files) {
- my @trimmed_single_files;
- foreach my $single_file (@single_files) {
- my $trimmed_single_file = &run_trimmomatic_SE($single_file, $trimmomatic_quality_trim_params);
- push (@trimmed_single_files, $trimmed_single_file);
- }
- @single_files = @trimmed_single_files;
- }
- }
-
- ##########################################
- ## In silico normalization
- ##########################################
-
- if ($NORMALIZE_READS_FLAG) {
-
- if (@left_files && @right_files) {
- my ($left_norm_file, $right_norm_file) = &run_normalization($normalize_max_read_cov, \@left_files, \@right_files);
- @left_files = ($left_norm_file);
- @right_files = ($right_norm_file);
- }
- elsif (@single_files) {
- @single_files = &run_normalization($normalize_max_read_cov, \@single_files);
- }
- }
-
- if ($genome_fasta_file) {
-
- if (@left_files && @right_files) {
- &run_genome_guided_Trinity(\@left_files, \@right_files);
- }
- else {
- &run_genome_guided_Trinity(\@single_files);
- }
-
-
- exit(0);
- }
-
-
-
- ## create inchworm file name
- my $inchworm_file = "inchworm.K$KMER_SIZE.L$MIN_IWORM_LEN";
- unless ($SS_lib_type) {
- $inchworm_file .= ".DS";
- }
- $inchworm_file .= ".fa";
- $inchworm_file = &create_full_path($inchworm_file, 0);
-
- my $trinity_target_fa = (@single_files) ? "single.fa" : "both.fa";
- my $inchworm_target_fa = $trinity_target_fa; # change this later if we have long_reads
-
-
- ## Don't prep the inputs if Inchworm already exists.... Resuming earlier operations.
- my $inchworm_finished_checkpoint_file = "$inchworm_file.finished";
- if (-s $inchworm_file && -e $inchworm_finished_checkpoint_file) {
- print "\n\n#######################################################################\n"
- . "Inchworm file: $inchworm_file detected.\n"
- . "Skipping Inchworm Step, Using Previous Inchworm Assembly\n"
- . "#######################################################################\n\n";
- #sleep(2);
- }
- else {
-
- ## Prep data for Inchworm
- my $count_of_reads;
- if (@left_files && @right_files) {
-
- unless (-s $trinity_target_fa && !-e "left.fa" && !-e "right.fa") {
-
- my ($left_SS_type, $right_SS_type);
- if ($SS_lib_type) {
- ($left_SS_type, $right_SS_type) = split(//, $SS_lib_type);
- }
- print("Converting input files. (in parallel)");
- my $thr1;
- my $thr2;
- if (!(-s "left.fa")) {
- $thr1 = threads->create('prep_seqs', \@left_files, $seqType, "left", $left_SS_type);
- } else {
- $thr1 = threads->create(sub { print ("left file exists, nothing to do");});
- }
- if (!(-s "right.fa")) {
- $thr2 = threads->create('prep_seqs', \@right_files, $seqType, "right", $right_SS_type);
- } else {
- $thr2 = threads->create(sub { print ("right file exists, nothing to do");});
- }
- @left_files = @{$thr1->join()};
- @right_files =@{$thr2->join()};
-
- if ($thr1->error() || $thr2->error()) {
- die "Error prepping sequences.";
- }
-
- print("Done converting input files.");
- ## Calculate input file sizes for performance monitoring
- # this should be set as the created fasta otherwise results will differ for same data passed as .fq and .fa?
- my $pm_temp = -s "left.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_left_fa_size = sprintf('%.0f', $pm_temp);
- $pm_temp = -s "right.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_right_fa_size = sprintf('%.0f', $pm_temp);
-
- &process_cmd("cat left.fa right.fa > $trinity_target_fa") unless (-s $trinity_target_fa && (-s $trinity_target_fa == ((-s "left.fa") + (-s "right.fa"))));
- unless (-s $trinity_target_fa == ((-s "left.fa") + (-s "right.fa"))){
- die "$trinity_target_fa is smaller (".(-s $trinity_target_fa)." bytes) than the combined size of left.fa and right.fa (".((-s "left.fa") + (-s "right.fa"))." bytes)\n";
- }
-
- # we keep if we have jaccard; delete later
- unlink ("left.fa", "right.fa") unless $jaccard_clip; # no longer needed now that we have 'both.fa', which is needed by chryaslis
- }
-
- foreach my $f ((@left_files,@right_files)){
- if (-s $f.'.readcount'){
- open (IN,$f.'.readcount');
- my $s = <IN>;
- close IN;
- $s=~/([0-9]+)$/;
- $count_of_reads += $1 if $1;
- }
- }
-
-
- }
- elsif (@single_files) {
-
- @single_files = @{&prep_seqs(\@single_files, $seqType, "single", $SS_lib_type) unless (-s "single.fa")};
- ## Calculate input file sizes for performance monitoring
- my $pm_temp = -s "single.fa";
- $pm_temp = $pm_temp / 1024 / 1024;
- $pm_single_fa_size = sprintf('%.0f', $pm_temp);
- foreach my $f (@single_files){
- if (-s $f.'.readcount'){
- open (IN,$f.'.readcount');
- my $s = <IN>;
- close IN;
- $s=~/([0-9]+)$/;
- $count_of_reads += $1 if $1;
- }
- }
- }
-
- else {
- die "not sure what to do. "; # should never get here.
- }
-
- if (!$count_of_reads){
- $count_of_reads = `wc -l < $inchworm_target_fa`;chomp($count_of_reads); #AP: grep is expensive; one test took 2h...!
- $count_of_reads/=2;
- }
- if ($long_reads) {
- $inchworm_target_fa .= ".wLongReads.fa";
- $count_of_reads += `grep -c '^>' $long_reads | wc -l`; #AP we don't know if these will be one single line
- &process_cmd("cat $long_reads $trinity_target_fa > $inchworm_target_fa");
- }
-
- open (my $ofh, ">$inchworm_target_fa.read_count") or die $!;
- print $ofh $count_of_reads."\n";
- close $ofh;
- }
-
- if ($prep_only){
- print "Data has been prepared. Exiting now as per user request\n";
- exit();
- }
-
- #################
- ## Inchworm step:
- $pm_inchworm_start = `date +%s`;
- unless (-s $inchworm_file && -e $inchworm_finished_checkpoint_file) {
-
-
- &run_inchworm($inchworm_file, $inchworm_target_fa, $SS_lib_type, $kmer_method);
- &process_cmd("touch $inchworm_finished_checkpoint_file");
- }
- $pm_inchworm_end = `date +%s`;
-
-
- unless (-s $inchworm_file) {
-
- ## No inchworm output under genome-guided flag, must be sparse data.
-
- if ($FULL_CLEANUP_ERROR_TOLERANT && $FULL_CLEANUP && -e $inchworm_file && -e $inchworm_finished_checkpoint_file) {
- ## GG-trinity mode, clean-up gracefully
- if ($MKDIR_OUTDIR_FLAG) {
- &process_cmd("rm -rf $component_directory") if $component_directory;
- &process_cmd("rm -rf $output_directory");
- }
- else {
- print STDERR "WARNING, cannot remove output directory $output_directory, since not created in this run. (safety precaution)\n";
- }
- exit(0);
- }
- else {
- die "Error, no Inchworm output is detected at: $inchworm_file";
- }
- }
-
-
- if ($jaccard_clip) {
-
- eval {
-
- if ($jaccard_clip && -s 'left.fa' && -s 'right.fa') {
- $inchworm_file = &run_jaccard_clip_left_right($inchworm_file, \@left_files, \@right_files, $seqType, $SS_lib_type);
- #$inchworm_file = &run_jaccard_clip_left_right($inchworm_file, $left_file, $right_file, $seqType, $SS_lib_type);
-
- }
- elsif ($jaccard_clip && -s 'single.fa') {
- $inchworm_file = &run_jaccard_clip_single_but_really_paired($inchworm_file, \@single_files, $seqType, $SS_lib_type);
- #$inchworm_file = &run_jaccard_clip_single_but_really_paired($inchworm_file, $single_file, $seqType, $SS_lib_type);
- }
- };
-
- if ($@) {
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## GG-trinity mode, clean up gracefully
- system("rm -rf $output_directory &"); # ignore file system errors on failed cleanup
- exit(0);
- }
- else {
- die "Error, jaccard-clip failed: $@";
- }
- }
- }
-
-
- if ($NO_RUN_CHRYSALIS_FLAG) {
- print "\n\n\n";
- print "#########################################################################\n";
- print "Inchworm is complete. --no_run_chrysalis was specified, so stopping here.\n";
- print "#########################################################################\n\n\n";
-
- exit(0);
- }
- $ENV{OMP_NUM_THREADS} = $CPU;
- ##################
- ## Chrysalis step:
-
- if ($min_percent_read_iworm_kmers > 0) {
-
- ### EXPERIMENTAL: DO NOT USE!
-
- $trinity_target_fa = &extract_reads_with_iworm_kmers($trinity_target_fa, $inchworm_file, $min_percent_read_iworm_kmers, $SS_lib_type);
-
- }
-
- ## butterfly commands can be reparameterized for exploring different assembly requirements
- ## chrysalis will just run or resume depending on what's already been processed.
- $pm_chrysalis_start = `date +%s`;
- my $butterfly_cmds = &run_chrysalis($inchworm_file, $inchworm_target_fa,
- $min_contig_length, $group_pairs_distance, $SS_lib_type, $trinity_target_fa);
- $pm_chrysalis_end = `date +%s`;
-
- print "Butterfly_cmds: $butterfly_cmds\n";
-
- if ($butterfly_cmds && -s $butterfly_cmds) {
-
- if ($NO_RUN_BUTTERFLY_FLAG) {
-
- print "\n\nYou've opted to run butterfly commands independently from this script, such as on a computing grid.\n\n";
- print "Butterfly commands to execute are available here:\n"
- . "\t$butterfly_cmds\n\n";
- print "After executing Butterfly commands, concatenate all Butterfly outputs by running:\n"
- . "\t\tfind $output_directory/ -name \"\*allProbPaths.fasta\" -exec cat {} + > $output_directory/Trinity.fasta\n\n\n";
-
- exit(0);
-
- }
- else {
-
- ## Run Butterfly
-
- print "Inchworm and Chrysalis complete. Butterfly commands to execute are provided here:\n"
- . $butterfly_cmds . "\n\n";
-
-
- print STDERR "---------------------------------------------------------------\n"
- . "-------------------- Butterfly --------------------------------\n"
- . "-- (Reconstruct transcripts from reads and de Bruijn graphs) --\n"
- . "---------------------------------------------------------------\n\n";
-
- $pm_butterfly_start = `date +%s`;
- if ($grid_conf_file) {
- my @bfly_cmds = `cat $butterfly_cmds`;
- chomp @bfly_cmds;
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "chrysalis/butterfly_on_grid.cacheSuccess");
- my $ret = $grid_runner->run_on_grid(@bfly_cmds);
- if ($ret) {
- die "Error, not all butterfly commands could complete successfully... cannot continue.";
- }
- }
- else {
- my $cmd = "$PARAFLY -c $butterfly_cmds -shuffle -CPU $bflyCPU -failed_cmds failed_butterfly_commands.$$.txt -v "; # shuffle them since the first ones are usually the longest-running ones.
- &process_cmd($cmd);
- }
- $pm_butterfly_end = `date +%s`;
-
- ## capture results:
- # my $cmd = 'find ./chrysalis -name "*allProbPaths.fasta" -exec cat {} + > Trinity.fasta.tmp';
- # no longer scan the file system... we know which files should exist
- my $cmd = "$UTILDIR/support_scripts/print_butterfly_assemblies.pl $chrysalis_output_dir/component_base_listing.txt > Trinity.fasta.tmp";
- &process_cmd($cmd);
-
- }
-
- }
-
- if ($FULL_CLEANUP) {
- print "Fully cleaning up.\n";
- $output_directory =~ s|/+$||g; # remove any trailing directory slash
-
- if (-s "Trinity.fasta.tmp") {
- rename("Trinity.fasta.tmp", "$output_directory.Trinity.fasta") or die "Error, cannot rename Trinity.fasta.tmp to $output_directory.Trinity.fasta";
-
- print "\n\n";
- print "###################################################################\n";
- print "Butterfly assemblies are written to $output_directory.Trinity.fasta\n";
- print "###################################################################\n\n\n";
-
- }
- else {
- print "\n\n";
- print "####################################\n";
- print "## No butterfly assemblies to report.\n";
- print "####################################\n\n\n";
- }
-
- if ($MKDIR_OUTDIR_FLAG) {
- system("rm -rf $output_directory &"); # ignore filesystem errors on failed cleanup
- }
- else {
- print STDERR "WARNING, cannot remove output directory $output_directory, since not created in this run. (safety precaution)\n";
- }
-
- }
- else {
-
-
- if (-s "Trinity.fasta.tmp") {
- rename("Trinity.fasta.tmp", $butterfly_output_filename) or die "Error, cannot rename Trinity.fasta.tmp to $butterfly_output_filename"; # now that process has finished.
- }
-
- if (-s $butterfly_output_filename) {
-
- print "\n\n";
- print "###################################################################\n";
- print "Butterfly assemblies are written to $output_directory/$butterfly_output_filename\n";
- print "###################################################################\n\n\n";
- }
- else {
- die "ERROR, no butterfly assemblies reported.";
- }
-
- }
-
- &perfmon_end() unless ($FULL_CLEANUP);
- exit(0);
-}
-
-
-####
-sub run_chrysalis {
- my ($inchworm_file, $reads_file,
- $min_contig_length, $group_pairs_distance, $SS_lib_type, $pairs_fa) = @_;
-
-
- my $butterfly_cmds = &create_full_path("$chrysalis_output_dir/butterfly_commands");
-
- my $quantify_graph_cmds = &create_full_path("$chrysalis_output_dir/quantifyGraph_commands");
-
- my $chrysalis_finished_checkpoint = "$chrysalis_output_dir/chrysalis.finished";
-
- if (-e $chrysalis_finished_checkpoint) {
-
- print "###################################################################\n";
- print "#### Chrysalis results already exist. Not rerunning Chrysalis. ####\n";
- print "###################################################################\n\n\n";
-
- #sleep(2);
-
- }
- else {
- ## run Chrysalis
-
- my $cmd = "$CHRYSALIS_DIR/Chrysalis -i $reads_file -iworm $inchworm_file -o $chrysalis_output_dir -cpu $CPU "
- . " -min_glue $min_glue -min_iso_ratio $min_iso_ratio -glue_factor $glue_factor -kmer_size " . ($KMER_SIZE-1) # chrysalis wants kmer overlap length
- . " -weldmer_size $weldmer_size "
- . " -min $min_contig_length -dist $group_pairs_distance -max_reads $max_reads_per_graph "
- . " -sort_exec \"$sort_exec\" "
- . " -sort_buffer_size $JM_string -max_mem_reads $max_reads_per_loop ";
-
- if ($SS_lib_type) {
- $cmd .= " -strand 1 ";
- }
-
- if ($PAIRED_MODE) {
- $cmd .= " -paired ";
- $cmd .= " -reads_for_pairs $pairs_fa ";
-
- if ($NO_BOWTIE) {
- $cmd .= " -no_pair_links ";
- }
-
- }
-
- if ($BOWTIE_COMP) {
- $cmd .= " -bowtie_comp ";
- }
-
- if ($min_pct_read_mapping) {
- $cmd .= " -min_pct_read_mapping $min_pct_read_mapping ";
- }
-
-
- $cmd .= " -butterfly $BFLY_JAR ";
-
- if ($NO_CLEANUP) {
- $cmd .= " -no_cleanup ";
- }
-
- $cmd .= " 2>&1 ";
-
- eval {
-
- &process_cmd($cmd);
-
- };
-
-
- if ($@) {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode - OK, not enough data that's worth pursuing.
- return("");
-
- }
-
- my $errmsg = "$curr_limit_settings\n";
- $errmsg .= "Error, the Chrysalis process failed:\n$@\n";
- croak $errmsg;
- }
-
-
- print "Chrysalis initial stage completed successfully.\n";
- &process_cmd("touch $chrysalis_finished_checkpoint");
- }
-
- ## partition the graphs and reads in prep for quantify graph and butterfly steps.
-
- unless (-s "$chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn") {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode - OK, not enough data that's worth pursuing.
- return("");
- }
-
- croak "Error, no deBruijn graphs generated based on inchworm contigs: $chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn";
- }
-
-
- my $partitioning_checkpoint_file = "$chrysalis_output_dir/file_partitioning.ok";
-
- my $cmd = "$UTILDIR/support_scripts/partition_chrysalis_graphs_n_reads.pl --deBruijns $chrysalis_output_dir/bundled_iworm_contigs.fasta.deBruijn --componentReads $chrysalis_output_dir/readsToComponents.out.sort -N 1000 -L $min_contig_length --compdir $component_directory ";
-
- &process_cmd($cmd) unless (-e $partitioning_checkpoint_file);
-
- &process_cmd("touch $partitioning_checkpoint_file") unless (-e $partitioning_checkpoint_file);
-
- ## write the quantifygraph commands and butterfly commands
- my $component_base_listing_file = "$chrysalis_output_dir/component_base_listing.txt";
- unless (-s $component_base_listing_file) {
-
- if ($FULL_CLEANUP_ERROR_TOLERANT) {
- ## Trinity GG mode
- return("");
- }
- croak "Error, component base listing file: $component_base_listing_file does not exist";
-
- }
-
-
- {
- open (my $bfly_cmds_ofh, ">$butterfly_cmds") or die $!;
- open (my $qgraph_cmd_ofh, ">$quantify_graph_cmds") or die $!;
-
-
- open (my $fh, $component_base_listing_file) or die $!;
- while (<$fh>) {
- chomp;
- my ($component_id, $base_filename) = split(/\t/);
-
-
- { # quantify graph command
-
- my $quantify_graph_cmd = "$CHRYSALIS_DIR/QuantifyGraph -g $base_filename.graph.tmp "
- . " -i $base_filename.reads.tmp "
- . " -o $base_filename.graph.out "
- . " -max_reads $max_reads_per_graph "
- . " -k " . ($KMER_SIZE - 1);
-
- if ($SS_lib_type) {
- $quantify_graph_cmd .= " -strand ";
- }
- if ($NO_CLEANUP) {
-
- $quantify_graph_cmd .= " -no_cleanup ";
- }
-
- print $qgraph_cmd_ofh $quantify_graph_cmd . "\n";
-
- }
-
- { # butterfly command
-
-
- my $bfly_cmd = "java -Xmx$bflyHeapSpaceMax -Xms$bflyHeapSpaceInit ";
-
- if (defined($bflyGCThreads)) {
- $bfly_cmd .= " -XX:ParallelGCThreads=$bflyGCThreads ";
- }
-
- $bfly_cmd .= " -jar $BFLY_JAR -N 100000 -L $min_contig_length -F $group_pairs_distance -C $base_filename.graph ";
-
- if ($bfly_opts) {
- $bfly_cmd .= " $bfly_opts ";
- }
-
- $bfly_cmd .= " --path_reinforcement_distance=$path_reinforcement_distance ";
-
- if ($TRIPLET_LOCK) {
- $bfly_cmd .= " --triplet-lock ";
- }
- if ($EXTENDED_TRIPLET_LOCK) {
- $bfly_cmd .= " --extended_triplet ";
- }
-
- unless ($NO_EM_REDUCE) {
- $bfly_cmd .= " --EM_REDUCE ";
- }
-
- if ($NO_PATH_MERGING) {
- $bfly_cmd .= " --no_path_merging ";
- }
- else {
- if (defined($MIN_PER_ID_SAME_PATH)) {
- $bfly_cmd .= " --min_per_id_same_path=$MIN_PER_ID_SAME_PATH ";
- }
- if (defined($MAX_DIFFS_SAME_PATH)) {
- $bfly_cmd .= " --max_diffs_same_path=$MAX_DIFFS_SAME_PATH ";
- }
- if (defined($MAX_INTERNAL_GAP_SAME_PATH)) {
- $bfly_cmd .= " --max_internal_gap_same_path=$MAX_INTERNAL_GAP_SAME_PATH ";
- }
- }
-
- if ($PASAFLY_MODE) {
- $bfly_cmd .= " --PasaFly ";
- }
- elsif ($CUFFFLY_MODE) {
- $bfly_cmd .= " --CuffFly ";
- }
-
- print $bfly_cmds_ofh $bfly_cmd . "\n";
-
- }
- }
- close $fh;
- close $bfly_cmds_ofh;
- close $qgraph_cmd_ofh;
-
- }
-
- # see if we need to run the quantifyGraph commands:
- if ($NO_RUN_QUANTIFYGRAPH_FLAG) {
-
- print "#############################################################################\n";
- print "## Ceasing Trinity prior to execution of massively parallel operations.\n";
- print "##\n";
- print "## To complete Trinity, execute the following sets of commands:\n";
- print "##\n";
- print "## First, run the Chrysalis QuantifyGraph commands in parallel:\n";
- print "## $quantify_graph_cmds\n";
- print "##\n";
- print "## Then, execute all the Butterfly commands:\n";
- print "## $butterfly_cmds\n";
- print "##\n";
- print "## And, finally, concatenate all Butterfly assemblies into a single output file:\n";
- print "##\n";
- print "## find $output_directory/ -name \"\*allProbPaths.fasta\" -exec cat {} + > $output_directory/Trinity.fasta\n";
- print "##\n";
- print "##############################################################################\n";
- print "\n\n";
-
- exit(0);
- }
- else {
-
-
- my $quantify_graph_cmds_finished = &create_full_path("$chrysalis_output_dir/quantifyGraph_commands.run.finished");
- if (! -e $quantify_graph_cmds_finished) {
- ## run it
-
- print STDERR "---------------------------------------------------\n"
- . "----------- Chrysalis: QuantifyGraph --------------\n"
- . "-- (Integrate mapped reads into de Bruijn graph) --\n"
- . "---------------------------------------------------\n\n";
-
-
- if ($grid_conf_file) {
- my @quantify_graph_cmds = `cat $quantify_graph_cmds`;
- chomp @quantify_graph_cmds;
-
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "chrysalis/chrysalis_quantify_graph_on_grid.cacheSuccess");
- my $ret = $grid_runner->run_on_grid(@quantify_graph_cmds);
- if ($ret) {
- die "Error, not all Chrysalis quantify_graph commands completed successfully. Cannot continue.";
- }
- }
- else {
-
- my $cmd = "$PARAFLY -c $quantify_graph_cmds -CPU $CPU -failed_cmds failed_quantify_graph_commands.$$.txt -v -shuffle ";
- &process_cmd($cmd);
- }
-
- # write checkpoint
- &process_cmd("touch $quantify_graph_cmds_finished");
- }
-
-
- return($butterfly_cmds);
-
- }
-
-
-
-}
-
-
-####
-sub run_inchworm {
- my ($inchworm_outfile, $reads, $strand_specific_flag, $kmer_method) = @_;
-
-
- ## get count of number of reads to be assembled.
- my $read_count_file = "$reads.read_count";
- if (! -s $read_count_file) {
- my $count_of_reads = `wc -l < $reads`;chomp($count_of_reads); #AP: grep is expensive; one test took 2h...!
- $count_of_reads/=2; # assume fasta; two lines per read
- $pm_read_count = $count_of_reads;
- open (my $ofh, ">$read_count_file") or die $!;
- print $ofh $count_of_reads."\n";
- close $ofh;
- }
-
-
- my $inchworm_cmd;
-
- my @tmp_files; # to be deleted after successful inchworm run.
-
-
- #####################################################
- ## Using Jellyfish kmer method
- #####################################################
-
- if (! $FORCE_INCHWORM_KMER_METHOD) {
-
- my $jelly_kmer_fa_file = "jellyfish.kmers.fa";
- my $jelly_finished_checkpoint_file = "jellyfish.$min_kmer_cov.finished";
- unless (-e $jelly_finished_checkpoint_file) {
-
-
- print STDERR "-------------------------------------------\n"
- . "----------- Jellyfish --------------------\n"
- . "-- (building a k-mer catalog from reads) --\n"
- . "-------------------------------------------\n\n";
-
-
- my $read_file_size = -s $reads;
-
- my $jelly_hash_size = int( ($max_memory - $read_file_size)/7); # decided upon by Rick Westerman
-
-
- if ($jelly_hash_size < 100e6) {
- $jelly_hash_size = 100e6; # seems reasonable for a min hash size as 100M
- }
-
- ## for testing
- if ($JELLY_S) {
- $jelly_hash_size = $JELLY_S;
- }
-
- my $cmd = "$JELLYFISH_DIR/bin/jellyfish count -t $CPU -m $KMER_SIZE -s $jelly_hash_size ";
-
- unless ($SS_lib_type) {
- ## count both strands
- $cmd .= " --canonical ";
- }
-
- $cmd .= " $reads";
-
- &process_cmd($cmd);
-
-
- if (-s $jelly_kmer_fa_file) {
- unlink($jelly_kmer_fa_file) or die "Error, cannot unlink $jelly_kmer_fa_file";
- }
-
- my $jelly_db = "mer_counts.jf";
-
- $cmd = "$JELLYFISH_DIR/bin/jellyfish dump -L $min_kmer_cov $jelly_db > $jelly_kmer_fa_file";
-
- &process_cmd($cmd);
-
-
- ## write a histogram of the kmer counts.
- $cmd = "$JELLYFISH_DIR/bin/jellyfish histo -t $CPU -o $jelly_kmer_fa_file.histo $jelly_db";
- &process_cmd($cmd);
-
-
- unlink($jelly_db);
-
- ## if got this far, consider jellyfish done.
- &process_cmd("touch $jelly_finished_checkpoint_file");
-
-
- if ($NO_RUN_INCHWORM_FLAG) {
- print STDERR "WARNING: --no_run_inchworm parameter in effect. Stopping here prior to running inchworm.\n";
- exit(0);
- }
-
-
- }
-
-
- $inchworm_cmd = "$INCHWORM_DIR/bin/inchworm --kmers $jelly_kmer_fa_file --run_inchworm -K $KMER_SIZE -L $MIN_IWORM_LEN --monitor 1 ";
-
- # hold on to the jellyfish file - we might use it for other applications.
- #push (@tmp_files, $jelly_finished_checkpoint_file, $jelly_kmer_fa_file) unless $NO_CLEANUP;
-
- }
- else {
-
- ######################################################
- ## Using Inchworm kmer method (original, slow method)
- ######################################################
-
- $inchworm_cmd = "$INCHWORM_DIR/bin/inchworm --reads $reads --run_inchworm -K $KMER_SIZE -L $MIN_IWORM_LEN --monitor 1 ";
- if ($min_kmer_cov > 1) {
- $inchworm_cmd .= " --minKmerCount $min_kmer_cov ";
- }
- }
-
-
- ## finish constructing the inchworm command to execute
-
- unless ($strand_specific_flag) {
- $inchworm_cmd .= " --DS ";
- }
-
- unless ($NO_CLEANUP) {
- $inchworm_cmd .= " --keep_tmp_files ";
- }
-
-
- my $num_threads = ($inchworm_cpu) ? $inchworm_cpu : $CPU;
- $inchworm_cmd .= " --num_threads $num_threads ";
-
- if ($PARALLEL_IWORM_FLAG) {
- $inchworm_cmd .= " --PARALLEL_IWORM ";
- }
-
- if ($INCHWORM_CUSTOM_PARAMS) {
- $inchworm_cmd .= " $INCHWORM_CUSTOM_PARAMS ";
- }
-
- #$inchworm_cmd .= " 2>inchworm.log > $inchworm_outfile.tmp";
- $inchworm_cmd .= " > $inchworm_outfile.tmp";
-
- print STDERR "----------------------------------------------\n"
- . "--------------- Inchworm ---------------------\n"
- . "-- (Linear contig construction from k-mers) --\n"
- . "----------------------------------------------\n\n";
-
-
- eval {
-
- &process_cmd($inchworm_cmd);;
- };
-
- if ($@) {
-
- print STDERR "$@\n";
- print "** The inchworm process failed.";
- print STDERR "\n\nIf it indicates bad_alloc(), then Inchworm ran out of memory. You'll need to either reduce the size of your data set or run Trinity on a server with more memory available.\n\n";
- exit(1);
- }
-
- rename("$inchworm_outfile.tmp", $inchworm_outfile) or die "Error, cannot rename $inchworm_outfile.tmp to $inchworm_outfile"; # now we know for sure it's done.
-
-
- return;
-
-}
-
-####
-sub prep_seqs {
- my ($initial_files_ref, $seqType, $file_prefix, $SS_lib_type) = @_;
- my @initial_files = @$initial_files_ref;
- return if -e "$file_prefix.fa";
- for (my $i=0;$i<scalar(@initial_files);$i++){
- my $f = $initial_files[$i];
- if ($f=~/\.gz$/){
- my $new = $f;
- $new=~s/\.gz$//;
- unlink($new);
- &process_cmd("gunzip -c $f > $new");
- $initial_files[$i] = $new;
- }elsif ($f=~/\.bz2$/){
- my $new = $f;
- $new=~s/\.bz2$//;
- unlink($new);
- &process_cmd("bunzip2 -dkc $f > $new");
- $initial_files[$i] = $new;
- }
- }
-
- my $initial_file_str = join(" ",@initial_files);
- if ($seqType eq "fq") {
- # make fasta
- foreach my $f (@initial_files){
- my $perlcmd = "$UTILDIR/support_scripts/fastQ_to_fastA.pl -I $f ";
- my $fastool_cmd = "$FASTOOL_DIR/fastool";
- if ($SS_lib_type && $SS_lib_type eq "R") {
- $perlcmd .= " --rev ";
- $fastool_cmd .= " --rev ";
- }
- $fastool_cmd .= " --illumina-trinity --to-fasta $f >> $file_prefix.fa 2> $f.readcount ";
- $perlcmd .= " >> $file_prefix.fa 2> $f.readcount ";
- my $cmd = ($USE_FASTOOL) ? $fastool_cmd : $perlcmd;
- &process_cmd($cmd);
- }
- }
- elsif ($seqType eq "fa") {
- if (scalar(@initial_files) == 1 && (!$SS_lib_type || $SS_lib_type ne "R")) {
- ## just symlink it here:
- my $cmd = "ln -s $initial_file_str $file_prefix.fa";
- &process_cmd($cmd);
- }elsif(scalar(@initial_files) > 1 && (!$SS_lib_type || $SS_lib_type ne "R")){
- my $cmd = "cat $initial_file_str > $file_prefix.fa";
- &process_cmd($cmd);
- }else {
- #if ($SS_lib_type && $SS_lib_type eq "R") {
- foreach my $f (@initial_files){
- my $cmd = "$UTILDIR/support_scripts/revcomp_fasta.pl $f >> $file_prefix.fa";
- &process_cmd($cmd);
- }
- }
- }
- elsif (($seqType eq "cfa") | ($seqType eq "cfq")) {
- confess "cfa, cfq not supported";
- }
- return \@initial_files;
-}
-
-
-
-###
-sub create_full_path {
- my ($file, $verify_exists) = @_;
- if (ref($file) eq "ARRAY"){
- for (my $i=0;$i<scalar(@$file);$i++){
- my $filename = $file->[$i];
- if ($verify_exists && ! -e $filename) {
- confess "Error, cannot locate file: $filename";
- }
- $file->[$i] = &create_full_path($filename);
- }
- return @$file;
- }
- else {
- if ($verify_exists && ! -e $file) {
- confess "Error, cannot locate file: $file";
- }
- my $cwd = cwd();
- if ($file !~ m|^/|) { # must be a full path
- $file = $cwd . "/$file";
- }
- return($file);
- }
-}
-
-
-
-####
-sub process_cmd {
- my ($cmd) = @_;
-
- print &mytime."CMD: $cmd\n";
-
- my $start_time = time();
- my $ret = system($cmd);
- my $end_time = time();
-
- if ($ret) {
- die "Error, cmd: $cmd died with ret $ret";
- }
-
- print "CMD finished (" . ($end_time - $start_time) . " seconds)\n";
-
- return;
-}
-
-
-####
-sub run_jaccard_clip_left_right {
- my ($inchworm_file, $left_files_aref, $right_files_aref, $seqType, $SS_lib_type) = @_;
-
- my $output_file = "$inchworm_file.clipped.fa";
-
- if (-s $output_file) {
- print STDERR "###### WARNING: $output_file already exists, skipping the jaccard-clip step, using already existing output: $output_file\n";
- return($output_file);
- }
-
- my $cmd = "$UTILDIR/support_scripts/inchworm_transcript_splitter.pl --iworm $inchworm_file "
- . " --left " . join(",", @$left_files_aref) . " --right " . join(",", @$right_files_aref) . " --seqType $seqType --CPU $CPU ";
-
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type $SS_lib_type ";
- }
-
- &process_cmd($cmd);
-
- unless (-s $output_file) {
- croak "Error, jaccard clipping didn't produce the expected output file: $output_file";
- }
-
- return($output_file);
-}
-
-
-
-####
-sub run_jaccard_clip_single_but_really_paired {
- my ($inchworm_file, $single_files_aref, $seqType, $SS_lib_type) = @_;
-
- my $output_file = "$inchworm_file.clipped.fa";
-
- if (-s $output_file) {
- print STDERR "###### WARNING: $output_file already exists, skipping the jaccard-clip step, using already existing output: $output_file\n";
- return($output_file);
- }
-
- my $cmd = "$UTILDIR/support_scripts/inchworm_transcript_splitter.pl --iworm $inchworm_file "
- . " --single_but_really_paired " . join(",", @$single_files_aref) . " --seqType $seqType --CPU $CPU ";
-
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type $SS_lib_type ";
- }
-
- &process_cmd($cmd);
-
-
-
- unless (-s $output_file) {
- croak "Error, jaccard clipping didn't produce the expected output file: $output_file";
- }
-
- return($output_file);
-}
-
-####
-sub test_java_failure_capture {
-
- print "#######################################\n";
- print "Running Java Tests\n";
-
- my $java_prog = `which java`;
- unless ($java_prog) {
- die "Error, cannot find 'java'. Please be sure it is available within your \${PATH} setting and then try again.";
- }
-
-
- my $cmd = "java -Xmx64m -jar $UTILDIR/support_scripts/ExitTester.jar 0";
- eval {
- &process_cmd($cmd);
- };
- if ($@) {
- print STDERR "Error encountered in testing for running of a simple java application. ";
- print "$@\n\n";
- print STDERR "Please check your java configuration.\n";
- exit(1);
-
- }
-
- $cmd = "java -Xmx64m -jar $UTILDIR/support_scripts/ExitTester.jar 1";
- eval {
- &process_cmd($cmd);
- };
-
- if ($@) {
- print "-we properly captured the java failure status, as needed. Looking good.\n";
- }
- else {
- print STDERR "-we are unable to properly capture java failure status. Please be sure that java (or any wrapper around java that's being used) can properly capture and propagate failure status before proceeding.\n";
- exit(1);
- }
-
- print "Java tests succeeded.\n";
- print "###################################\n\n";
-
- return;
-}
-
-
-####
-sub extract_reads_with_iworm_kmers {
- my ($trinity_target_fa, $inchworm_file, $min_percent_read_containing_kmers, $SS_lib_type) = @_;
-
- my $extracted_reads_file = "$trinity_target_fa." . $min_percent_read_containing_kmers . "pcnt.iworm_extracted";
-
- my $cmd = "$INCHWORM_DIR/bin/pull_reads_with_kmers "
- . "--target $inchworm_file "
- . "--reads $trinity_target_fa "
- . "--min_percent_read_containing_kmers $min_percent_read_containing_kmers ";
-
- unless ($SS_lib_type) {
- $cmd .= " --DS ";
- }
-
- $cmd .= " > $extracted_reads_file ";
-
- if (-s $extracted_reads_file) {
- print STDERR "-warning, iworm kmer-extracted reads file already exists: $extracted_reads_file. Re-using it.\n";
- }
- else {
-
- &process_cmd($cmd);
- }
-
- return($extracted_reads_file);
-}
-
-
-sub try_unlimit_stacksize {
-
- # from Ryan Thompson
- eval "use BSD::Resource; setrlimit(RLIMIT_STACK, RLIM_INFINITY, RLIM_INFINITY); ";
-
- if( $@ ) {
- warn <<"EOF";
-
- $@
-
- Unable to set unlimited stack size. Please install the BSD::Resource
- Perl module to allow this script to set the stack size, or set it
- yourself in your shell before running Trinity (ignore this warning if
- you have set the stack limit in your shell). See the following URL for
- more information:
-
- http://trinityrnaseq.sourceforge.net/trinity_faq.html#ques_E
-
-EOF
-;
- }
- else {
- print "Successfully set unlimited stack size.\n";
- print "###################################\n\n";
- }
- return;;
-}
-
-sub mytime() {
- my @mabbr = qw(January February March April May June July August September October November December);
- my @wabbr = qw(Sunday Monday Tuesday Wednesday Thursday Friday Saturday);
- my $sec = localtime->sec() < 10 ? '0' . localtime->sec() : localtime->sec();
- my $min = localtime->min() < 10 ? '0' . localtime->min() : localtime->min();
- my $hour = localtime->hour() < 10 ? '0' . localtime->hour() : localtime->hour();
- my $wday = $wabbr[localtime->wday];
- my $mday = localtime->mday;
- my $mon = $mabbr[localtime->mon];
- my $year = localtime->year() + 1900;
- return "$wday, $mon $mday, $year: $hour:$min:$sec\t";
-}
-
-
-
-####
-sub show_lit_citation {
-
- print "\n\n* Trinity:\n"
- . "Full-length transcriptome assembly from RNA-Seq data without a reference genome.\n"
- . "Grabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L,\n"
- . "Raychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F,\n"
- . "Birren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A.\n"
- . "Nature Biotechnology 29, 644–652 (2011)\n"
- . "Paper: http://www.nature.com/nbt/journal/v29/n7/full/nbt.1883.html\n"
- . "Code: http://trinityrnaseq.sf.net\n\n\n";
-
-=included_in_trinity
-
------------------------------------------------------------------------------------------
------ Tools Below are Used Within Trinity Accordingly -----------------------------------
------------------------------------------------------------------------------------------
-
-* Fastool (for fast fastQ-to-fastA conversion)
-Francesco Strozzi
-Code: https://github.com/fstrozzi/Fastool
-
-* Jellyfish (for fast K-mer counting)
-A fast, lock-free approach for efficient parallel counting of occurrences of k-mers.
-Guillaume Marcais and Carl Kingsford.
-Bioinformatics (2011) 27(6): 764-770
-Paper: http://bioinformatics.oxfordjournals.org/content/27/6/764.long\n
-Code: http://www.cbcb.umd.edu/software/jellyfish
-
-* Trimmomatic
-Lohse M, Bolger AM, Nagel A, Fernie AR, Lunn JE, Stitt M, Usadel B. RobiNA: a
-user-friendly, integrated software solution for RNA-Seq-based transcriptomics.
-Nucleic Acids Res. 2012 Jul;40(Web Server issue):W622-7.
-Code: http://www.usadellab.org/cms/?page=trimmomatic
-
-
-=cut
-
- return;
-}
-
-# clean-up after normal termination, exit(), or die()
-END {
- &collectl_stop();
-}
-
-
-sub perfmon_start {
- open (FILE, ">", "$output_directory/$pm_logfile") or die "Error, cannot write to: $output_directory/$pm_logfile";
- print FILE "Statistics:\n";
- print FILE "===========\n";
- print FILE "Trinity Version: $VERSION\n";
- my $tempp="";
- $tempp=`ldd $INCHWORM_DIR/bin/inchworm 2>/dev/null | grep "libgomp"`;
- if ($tempp eq "") {
- print FILE "Compiler: Intel\n";
- } else {
- print FILE "Compiler: GCC\n";
- }
- print FILE "Trinity Parameters: $pm_trinity_arguments\n";
- $pm_trinity_startstring = `date`;
- $pm_trinity_start = `date +%s`;
- close (FILE);
-}
-
-sub perfmon_end {
- $pm_trinity_endstring = `date`;
- $pm_trinity_end = `date +%s`;
- my $timestamp = `date +%s`;
- if ( -e "$output_directory/$pm_logfile" ) {
- open (FILE, '>>', "$output_directory/$pm_logfile") or die;
- if ($PAIRED_MODE) {
- print FILE "Paired mode\n";
- print FILE " Input data\n";
- if (@left_files && @right_files) {
- print FILE " Left.fasta $pm_left_fa_size MByte\n";
- print FILE " Right.fasta $pm_right_fa_size MByte\n";
- } else {
- print FILE " Single.fasta $pm_single_fa_size MByte\n";
- }
- } else {
- print FILE "Unpaired read mode\n";
- print FILE " Input data\n";
- print FILE " Single.fasta $pm_single_fa_size MByte\n";
- }
- }
- $pm_inchworm_kmers = `cat $output_directory/inchworm.kmer_count`;
- print FILE " Number of unique KMERs: $pm_inchworm_kmers";
- print FILE " Number of reads: $pm_read_count";
- print FILE " Output data\n";
- my $pm_temp = -s "$output_directory/Trinity.fasta" || 0;
- $pm_temp = $pm_temp / 1024 / 1024;
- my $pm_trinity_fa_size = sprintf('%.0f', $pm_temp);
- print FILE " Trinity.fasta $pm_trinity_fa_size MByte\n\n";
- print FILE "Runtime\n";
- print FILE "=======\n";
- print FILE "Start: $pm_trinity_startstring";
- print FILE "End: $pm_trinity_endstring";
- my $pm_trinity_time = $pm_trinity_end - $pm_trinity_start;
- print FILE "Trinity $pm_trinity_time seconds\n";
- my $pm_inchworm_time = $pm_inchworm_end - $pm_inchworm_start;
- print FILE " Inchworm $pm_inchworm_time seconds\n";
- my $pm_chrysalis_time = $pm_chrysalis_end - $pm_chrysalis_start;
- print FILE " Chrysalis $pm_chrysalis_time seconds\n";
- my $pm_butterfly_time = $pm_butterfly_end - $pm_butterfly_start;
- print FILE " Butterfly $pm_butterfly_time seconds\n";
- my $pm_rest_time = $pm_trinity_time - $pm_butterfly_time - $pm_chrysalis_time - $pm_inchworm_time;
- print FILE " Rest $pm_rest_time seconds\n";
- close (FILE);
-}
-
-sub collectl_start {
- # install signal handler to stop collectl on interrupt
- $SIG{INT} = sub { print "Trinity interrupted\n"; &collectl_stop(); exit(1); };
-
- if ($run_with_collectl){
- warn "STARTING COLLECTL\n";
- $collectl_output_directory = "$start_dir/collectl";
- `rm -rf $collectl_output_directory `;
- $collectl_output_directory = &create_full_path($collectl_output_directory, 0);
- unless (-d $collectl_output_directory) {
- mkdir $collectl_output_directory or die "Error, cannot mkdir $collectl_output_directory";
- }
- my $collectl_userid = qx(id --user --real);
- chomp($collectl_userid);
- my $cmd = "cd $collectl_output_directory && exec ${COLLECTL_DIR}/collectl $collectl_param --procfilt u$collectl_userid -f $collectl_output_directory/y";
- ## fork a child to run collectl
- $collectl_pid = fork();
- if (not defined $collectl_pid) {
- warn "FORK FAILED - NO COLLECTL PROCESS STARTED\n";
- } elsif ($collectl_pid == 0) {
- warn "I'M THE CHILD RUNNING TRINITY\n";
- exec($cmd);
- warn "COLLECTL FINISHED BEVORE KILL WAS CALLED\n";
- exit(0);
- } else {
- warn "I'M THE PARENT, COLLECTL_PID=$collectl_pid\n";
- }
- }
-}
-
-# finish collectl monitoring and create collectl plots
-sub collectl_stop {
- if ($run_with_collectl && $collectl_pid>0) {
- warn "TERMINATING COLLECTL, PID = $collectl_pid\n";
- # try to be nice here as a hard kill will result in broken/unusable raw.gz file
- system("sync");
- kill("INT", $collectl_pid);
- kill("TERM", $collectl_pid);
- waitpid($collectl_pid,0);
- chdir($collectl_output_directory) or return;
- system("$COLLECTL_DIR/make_data_files.sh");
- system("$COLLECTL_DIR/timetable.sh");
- $collectl_titlename = "${VERSION} ${CPU} @{left_files}@{single_files}";
- system("$COLLECTL_DIR/plot.sh \"$collectl_titlename\" ${CPU}");
- }
-}
-
-####
-sub run_trimmomatic_PE {
- my ($left_fq_file, $right_fq_file, $trimmomatic_params) = @_;
-
- my $trimmed_left_file_base = basename($left_fq_file);
- my $trimmed_right_file_base = basename($right_fq_file);
-
- my ($trimmed_left_fq, $trimmed_right_fq) = ("$trimmed_left_file_base.PwU.qtrim.fq", "$trimmed_right_file_base.PwU.qtrim.fq");
- my $checkpoint = "trimmomatic.ok";
-
- if (&files_exist($trimmed_left_fq, $trimmed_right_fq, $checkpoint)) {
-
- print STDERR "###############################################################################\n";
- print STDERR "#### Trimmomatic process was previously completed. Skipping it and using existing qual-trimmed files: $trimmed_left_fq, $trimmed_right_fq\n";
- print STDERR "###############################################################################\n";
-
- return($trimmed_left_fq, $trimmed_right_fq);
- }
-
-
- my $cmd = "java -jar $TRIMMOMATIC PE -threads $CPU -phred33 "
- . " $left_fq_file $right_fq_file "
- . " $trimmed_left_file_base.P.qtrim $trimmed_left_file_base.U.qtrim "
- . " $trimmed_right_file_base.P.qtrim $trimmed_right_file_base.U.qtrim "
- . " $trimmomatic_params ";
-
- &process_cmd($cmd);
-
- ## append the orphans so we can still use them in assembly
- &process_cmd("cat $trimmed_left_file_base.P.qtrim $trimmed_left_file_base.U.qtrim > $trimmed_left_fq");
- &process_cmd("cat $trimmed_right_file_base.P.qtrim $trimmed_right_file_base.U.qtrim > $trimmed_right_fq");
-
- &process_cmd("touch $checkpoint");
-
- # compress the trimmomatic direct outputs to conserve space:
- &process_cmd("gzip $trimmed_left_file_base.P.qtrim $trimmed_left_file_base.U.qtrim $trimmed_right_file_base.P.qtrim $trimmed_right_file_base.U.qtrim &");
-
- return($trimmed_left_fq, $trimmed_right_fq);
-
-
-}
-
-####
-sub run_trimmomatic_SE {
- my ($single_fq, $trimmomatic_params) = @_;
-
-
- my $trimmed_fq = basename($single_fq) . ".qtrim.fq";
-
- my $checkpoint = "trimmomatic.ok";
-
- if (&files_exist($trimmed_fq, $checkpoint)) {
-
- print STDERR "###############################################################################\n";
- print STDERR "#### Trimmomatic process was previously completed. Skipping it and using existing qual-trimmed file: $trimmed_fq\n";
- print STDERR "###############################################################################\n";
-
- return($trimmed_fq);
- }
-
- my $cmd = "java -jar $TRIMMOMATIC SE -threads $CPU -phred33 "
- . " $single_fq "
- . " $trimmed_fq "
- . " $trimmomatic_params ";
-
- &process_cmd($cmd);
-
- &process_cmd("touch $checkpoint");
-
- return($trimmed_fq);
-}
-
-####
-sub run_normalization {
- my ($max_read_coverage, @read_files) = @_;
-
- if ($NORMALIZE_BY_READ_SET) {
-
- my ($reads_left_or_single_aref, $right_reads_aref) = @read_files;
-
- my @normalized_left_or_single;
- my @normalized_right;
-
- my $counter = 0;
- while (@$reads_left_or_single_aref) {
- my $left_or_single_reads = shift @$reads_left_or_single_aref;
- my @reads_to_process = ([$left_or_single_reads]);
- if (ref $right_reads_aref) {
- my $right_reads = shift @$right_reads_aref;
- push (@reads_to_process, [$right_reads]);
- }
- $counter++;
- my $norm_out_dir = cwd() . "/norm_for_read_set_$counter";
- my @norm_read_files = &normalize($norm_out_dir, $max_read_coverage, @reads_to_process);
- push (@normalized_left_or_single, $norm_read_files[0]);
- if (scalar @norm_read_files == 2) {
- # PE norm
- push (@normalized_right, $norm_read_files[1]);
- }
-
- }
-
- ## now merge them in one final round:
- my $norm_merged_dir = cwd() . "/insilico_read_normalization_altogether";
- my @reads = (\@normalized_left_or_single);
- if (@normalized_right) {
- push (@reads, \@normalized_right);
- }
- my @ret_files = &normalize($norm_merged_dir, $max_read_coverage, @reads);
- return(@ret_files);
-
- }
- else {
- ## all at once.
- my $normalize_outdir = cwd() . "/insilico_read_normalization";
-
- my @ret_files = &normalize($normalize_outdir, $max_read_coverage, @read_files);
- return(@ret_files);
-
- }
-
-
-}
-
-####
-sub normalize {
- my ($normalize_outdir, $max_read_coverage, @read_files) = @_;
-
- print STDERR "---------------------------------------------------------------\n"
- . "------------ In silico Read Normalization ---------------------\n"
- . "-- (Removing Excess Reads Beyond $max_read_coverage Coverage --\n"
- . "-- $normalize_outdir --\n"
- . "---------------------------------------------------------------\n\n";
-
-
-
- my $cmd = "$UTILDIR/insilico_read_normalization.pl --seqType $seqType --JM $JM_string "
- . " --max_cov $max_read_coverage --CPU $CPU --output $normalize_outdir";
-
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type $SS_lib_type ";
- }
-
- if ($NO_CLEANUP) {
- $cmd .= " --no_cleanup ";
- }
-
-
- my @ret_files;
- if (scalar @read_files == 2) {
- $cmd .= " --left " . join(",", @{$read_files[0]}) . " --right " . join(",", @{$read_files[1]})
- . " --pairs_together --PARALLEL_STATS ";
- @ret_files = ("$normalize_outdir/left.norm.$seqType", "$normalize_outdir/right.norm.$seqType");
-
- }
- elsif (scalar @read_files == 1) {
- $cmd .= " --single " . join(",", @{$read_files[0]});
- @ret_files = ("$normalize_outdir/single.norm.$seqType");
- }
- else {
- confess "how did we end up with " . scalar(@read_files) . " read files? @read_files\nNot sure what to do.... ";
- }
-
- my $checkpoint = "$normalize_outdir/normalization.ok";
- if (&files_exist(@ret_files, $checkpoint)) {
-
- print STDERR "###############################################################################\n";
- print STDERR "#### Normalization process was previously completed. Skipping it and using existing normalized files: @ret_files\n";
- print STDERR "###############################################################################\n";
-
- }
- else {
- # do the normalization
-
- &process_cmd($cmd);
-
- &process_cmd("touch $checkpoint");
- }
-
-
- return(@ret_files);
-}
-
-
-####
-sub files_exist {
- my @files = @_;
-
- foreach my $file (@files) {
- if (! -e $file) {
- return(0); # not exists
- }
- }
-
- return(1); # all exist
-}
-
-####
-sub run_genome_guided_Trinity {
- my ($left_files_aref, $right_files_aref) = @_;
-
-
- my $bam_file;
- if ($genome_guided_use_bam) {
- $bam_file = $genome_guided_use_bam;
- }
- else {
- ## run gsnap to align reads:
-
- $bam_file = "gsnap.coordSorted.bam";
-
- unless (-s "$bam_file" && -e "$bam_file.ok") {
-
- my @files;
- if ($left_files_aref && $right_files_aref) {
- while (@$left_files_aref) {
- my $left_file = shift @$left_files_aref;
- my $right_file = shift @$right_files_aref;
- push (@files, $left_file, $right_file);
- }
- }
- else {
- @files = @$left_files_aref; # really single files
- }
-
- @files = &add_zcat_gz(@files);
-
-
- ## prep the genome
- my $cmd = "ln -sf $genome_fasta_file gsnap_target.fa";
- &process_cmd($cmd);
-
- if (-s "$genome_fasta_file.gmap") {
- &process_cmd("ln -sf $genome_fasta_file.gmap gsnap_target.gmap");
- }
- else {
-
- my $cmd = "gmap_build -k 13 -D . -d gsnap_target.gmap gsnap_target.fa ";
- &process_cmd($cmd) unless (-e "target.gmap");
- }
-
- if (-s "$genome_fasta_file.fai") {
- &process_cmd("ln -sf $genome_fasta_file.fai gsnap_target.fa.fai");
- }
- else {
- my $cmd = "samtools faidx gsnap_target.fa";
- &process_cmd($cmd);
- }
-
- $cmd = "bash -c \"set -o pipefail; gsnap -d gsnap_target.gmap -D . -A sam --nofails -N 1 -t $GMAP_CPU -w $genome_guided_max_intron -n 20 @files | samtools view -bt gsnap_target.fa.fai - | samtools sort -o - - > $bam_file \"";
- &process_cmd($cmd);
-
- &process_cmd("touch $bam_file.ok"); # checkpoint
- }
-
- }
-
- ## partition the reads according to coverage piles:
-
- my $cmd = "$UTILDIR/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl --coord_sorted_SAM $bam_file -I $genome_guided_max_intron --sort_buffer $genome_guided_sort_buffer --CPU $CPU ";
-
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type $SS_lib_type ";
- }
- &process_cmd($cmd) unless (-e "partitions.ok");
-
- &process_cmd("touch partitions.ok") unless (-e "partitions.ok");
-
- ## generate list of the read files:
- $cmd = "find Dir_\* -name '*reads' > read_files.list";
-
- &process_cmd($cmd) unless (-s "read_files.list" && -e "read_files.list.ok");
- &process_cmd("touch read_files.list.ok") unless (-e "read_files.list.ok"); # checkpoint
-
- ##################################################
- ## write Trinity assembly commands for partitions:
- ##################################################
-
- $cmd = "$UTILDIR/support_scripts/GG_write_trinity_cmds.pl --reads_list_file read_files.list --CPU $genome_guided_CPU ";
- if ($run_as_paired_flag) {
- $cmd .= " --run_as_paired ";
- }
- if ($SS_lib_type) {
- $cmd .= " --SS_lib_type F "; # all sequences already reoriented
- }
-
- $cmd .= " --full_cleanup_ET --seqType fa ";
-
-
- my @potential_args = @ORIG_ARGS;
-
- while (@potential_args) {
- my $arg = shift @potential_args;
-
- # single value options that aren't needed:
- if ($arg =~ /run_as_paired|normalize_by_read_set|trimmomatic|normalize_reads|prep/) {
- next;
- }
-
- # value specified options that aren't needed
- if ($arg =~ /seqType|left|right|single|genome|SS_lib_type|GMAP|quality_trimming|output|normalize_max_read_cov|grid_conf/
- ||
- # more precise identification of parameter
- $arg =~ /^(CPU)$/
-
- ) {
- # skipping these, already represented by opt configuration above.
- my $val = shift @potential_args;
- next;
- }
-
- if ($arg eq "--bfly_opts") {
- # wrap val in quotes
- my $val = shift @potential_args;
- $cmd .= "$arg \"$val\" ";
- }
- else {
- ## just passing it on.
- $cmd .= " $arg ";
- }
- }
-
- $cmd .= " > trinity_GG.cmds";
-
- &process_cmd($cmd) unless (-e "trinity_GG.cmds.ok");
- &process_cmd("touch trinity_GG.cmds.ok") unless (-e "trinity_GG.cmds.ok");
-
- if ($genome_guided_just_prep_flag) {
- print STDERR "###### Just prepping data for genome-guided assembly. Stopping here due to --genome_guided_just_prep invocation. #####\n\n";
- exit(0);
- }
-
- ## execute the commands:
- if ($grid_conf_file) {
- my @trin_GG_cmds = `cat trinity_GG.cmds`;
- chomp @trin_GG_cmds;
-
- my $grid_runner = new HTC::GridRunner($grid_conf_file, "trinity_GG_cmds.htc_cache_success");
- my $ret = $grid_runner->run_on_grid(@trin_GG_cmds);
- if ($ret) {
- die "Error, not all Trinity-GG commands completed successfully. Cannot continue.";
- }
-
- }
- else {
- my $cmd = "$PARAFLY -c trinity_GG.cmds -CPU $CPU -v ";
- &process_cmd($cmd);
- }
-
- ## pull together the final outputs:
- $cmd = "find Dir_* -name '*inity.fasta' | $UTILDIR/support_scripts/GG_trinity_accession_incrementer.pl > Trinity-GG.fasta.tmp";
- &process_cmd($cmd);
-
- rename("Trinity-GG.fasta.tmp", "Trinity-GG.fasta"); # now that it's done.
-
- print STDERR "\n\nFinished. See Trinity-GG.fasta for reconstructed transcripts\n\n";
-
- return;
-}
-
-sub add_zcat_gz {
- my (@in_files) = @_;
-
- my @files;
-
- foreach my $file (@in_files) {
-
- if ($file =~ /\.gz$/) {
-
- $file = "<(zcat $file)";
-
- }
- push (@files, $file);
- }
-
- return(@files);
-}
diff --git a/galaxy-plugin/__add_to_PATH_setting.txt b/galaxy-plugin/__add_to_PATH_setting.txt
deleted file mode 100644
index 85feb78..0000000
--- a/galaxy-plugin/__add_to_PATH_setting.txt
+++ /dev/null
@@ -1 +0,0 @@
-/usr/local/bin
diff --git a/galaxy-plugin/abundance_estimation_to_matrix.xml b/galaxy-plugin/abundance_estimation_to_matrix.xml
deleted file mode 100644
index 46ef57e..0000000
--- a/galaxy-plugin/abundance_estimation_to_matrix.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<tool id="abundance_estimation_to_matrix" name="abundance_estimation_to_matrix" version="0.0.1">
-
- <description>Join RSEM estimates from multiple samples into a single matrix</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- abundance_estimation_to_matrix_wrapper.py
- #for $q in $RSEM_samples
- ${q.file} "${q.column_label}"
- #end for
-
- </command>
- <inputs>
-
- <repeat name="RSEM_samples" title="RSEM abundance estimates for samples">
- <param name="file" label="Add file" type="data" format="text"/>
- <param name="column_label" label="column label" type="text" />
- </repeat>
-
- </inputs>
- <outputs>
- <data format="text" name="counts_matrix" label="${tool.name} on ${on_string}: Counts Matrix" from_work_dir="matrix.counts.matrix"/>
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/abundance_estimation_to_matrix_wrapper.py b/galaxy-plugin/abundance_estimation_to_matrix_wrapper.py
deleted file mode 100644
index e906e3a..0000000
--- a/galaxy-plugin/abundance_estimation_to_matrix_wrapper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os, string, subprocess
-
-#aliasing the filenames using the labels
-
-
-def run_command(command):
- print "Running command: " + command
-
- err_capture_file = open("my.stderr", 'w') # writing stderr to a file
- cmd_run = subprocess.Popen(args=command, shell=True, stderr=err_capture_file, stdout=sys.stdout)
- err = cmd_run.wait() # get exit code from command execution
- err_capture_file.close()
-
- if err:
- # report the error messages we captured, and exit non-zero
- sys.stderr.write("Error, cmd: " + command + " died with ret: " + `err`)
- for line in open(err_capture_file):
- sys.stderr.write(line)
- sys.exit(err)
- return
-
-label_list = [] # symlink files to the labels
-for i in range(1, len(sys.argv), 2):
- filename=sys.argv[i]
- label= sys.argv[i+1]
- cmd= "ln -sf " + filename + " " + label
- label_list.append(label)
- run_command(cmd)
-
-
-# run the abundance estimation script
-
-cmd = os.path.dirname(sys.argv[0]) + "/trinityToolWrapper.py " + " util/abundance_estimates_to_matrix.pl --est_method RSEM --cross_sample_fpkm_norm TMM " + " ".join(label_list)
-
-run_command(cmd)
-
-sys.exit(0)
-
diff --git a/galaxy-plugin/align_and_estimate_abundance.xml b/galaxy-plugin/align_and_estimate_abundance.xml
deleted file mode 100644
index d102646..0000000
--- a/galaxy-plugin/align_and_estimate_abundance.xml
+++ /dev/null
@@ -1,90 +0,0 @@
-<tool id="RSEM_abundance_estimation" name="RSEM_abundance_estimation" version="0.0.1">
-
- <description>run RSEM to estimate transcript abundances</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/align_and_estimate_abundance.pl --transcripts $transcripts --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
-
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
-
- </command>
- <inputs>
- <param format="fasta" name="transcripts" type="data" label="transcripts_fasta" help="Fasta sequences for which reads are aligned." />
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
-
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="text" name="transcript_counts" label="${tool.name} on ${on_string}: Isoform Counts" from_work_dir="RSEM.isoforms.results"/>
- <data format="text" name="gene_counts" label="${tool.name} on ${on_string}: Gene counts" from_work_dir="RSEM.genes.results"/>
-
-
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/alignreads.xml b/galaxy-plugin/alignreads.xml
deleted file mode 100644
index 6347a13..0000000
--- a/galaxy-plugin/alignreads.xml
+++ /dev/null
@@ -1,138 +0,0 @@
-<tool id="alignreads" name="alignReads" version="0.0.1">
-
- <description>alignReads: short read alignment tool wrapper</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py util/alignReads.pl --target $target -o alignment --aligner $aligner_selection.aligner
-
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --max_dist_between_pairs $inputs.max_dist_between_pairs
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- ##if str($inputs.use_additional) == "yes":
- ## -- $inputs.additional_params
- ##end if
-
-
- ## direct to output
- > outCapture.txt
-
-
- </command>
- <inputs>
- <param format="fasta" name="target" type="data" label="target" help="Fasta sequences targeted for short-read alignment" />
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="max_dist_between_pairs" type="integer" value="2000" min="1" label="max_dist_between_pairs" help="Maximum length expected between fragment pairs as aligned to the target, including introns where relevant."/>
-
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- </when>
- </conditional>
-
- <conditional name="aligner_selection">
- <param name="aligner" type="select" label="Select alignment tool to run">
- <option value="bowtie">bowtie</option>
- <option value="bwa">bwa</option>
- <option value="blat">blat</option>
- </param>
- <when value="blat">
- <param name="max_intron_length" type="integer" value="10000" min = "1" label="maximum intron length" help="" />
- <param name="min_percent_identity" type="integer" value="95" min="1" label="minimum percent identity" help="" />
- </when>
- <when value="bwa">
- </when>
- <when value="bowtie">
- </when>
- </conditional>
-
-
- <!--
- <conditional name="use_additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="additional_params" type="text" value="" label="Additional command-line parameters to aligner" help="" />
- </when>
- </conditional>
-
- -->
-
- </inputs>
- <outputs>
- <data format="bam" name="coordSortedBam" label="${tool.name} on ${on_string}: COORD-sorted read alignments" from_work_dir="alignment/alignment.coordSorted.bam"/>
- <data format="bam" name="nameSortedBam" label="${tool.name} on ${on_string}: NAME-sorted read alignments" from_work_dir="alignment/alignment.nameSorted.bam"/>
-
- <!-- notes: need to retain:
- -the sample name for the alignment
- -coordinate-sorted vs. name-sorted bam file
- -paired vs. unpaired vs. strictly proper pairs (for RSEM)
- -strand-specific or not
- -->
-
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/analyze_diff_exp.xml b/galaxy-plugin/analyze_diff_exp.xml
deleted file mode 100644
index a28907f..0000000
--- a/galaxy-plugin/analyze_diff_exp.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<tool id="Analyze_Diff_Exp" name="Analyze_Differential_Expression" version="0.0.1">
-
- <description>Analyze differential expression</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
-
- analyze_diff_exp_wrapper.py $EdgeRTarGz $TMM_Matrix_FPKM $Pvalue $Cvalue
-
- </command>
- <inputs>
- <param name="EdgeRTarGz" label="EdgeR tar gz file" type="data" format="file"/>
- <param name="TMM_Matrix_FPKM" label="TMM Normalized FPKM matrix" type="data" format="file" />
- <param name="Pvalue" label="P-value" value="0.05" type="float" />
- <param name="Cvalue" label="C-value" value="0" type="float" />
-
- </inputs>
- <outputs>
- <data format="data" name="diffExpr_matrix" label="${tool.name} on ${on_string}: Diffexp" from_work_dir="diffExpr.matrix"/>
- <data format="data" name="diffExpr_correlation_matrix" label="${tool.name} on ${on_string}: Diffexp" from_work_dir="diffExpr.matrix.log2.sample_cor.dat"/>
- <data format="data" name="diffExpr_correlation_matrix_pdf" label="${tool.name} on ${on_string}: Diffexp" from_work_dir="diffExpr.matrix.log2.sample_cor_matrix.pdf"/>
- <data format="data" name="Heatmap" label="${tool.name} on ${on_string}: Diffexp" from_work_dir="diffExpr.matrix.log2.centered.genes_vs_samples_heatmap.pdf"/>
- </outputs>
- <tests>
-
-
- <test>
- <param name="target" value="trinity/Trinity.fasta" />
- <param name="aligner" value="bowtie" />
- <param name="paired_or_single" value="single" />
- <param name="library_type" value="None" />
- <param name="input" value="trinity/reads.left.fq" />
- </test>
-
-
- </tests>
- <help>
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/analyze_diff_exp_wrapper.py b/galaxy-plugin/analyze_diff_exp_wrapper.py
deleted file mode 100644
index d4d7fae..0000000
--- a/galaxy-plugin/analyze_diff_exp_wrapper.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys, os, subprocess
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit()
-
-usage= "usage: " + sys.argv[0] + " " + "edgeR.tar.gz " + "TMM_normalized_FPKM_matrix " + "P-value " + "C-value"
-print sys.argv
-print usage
-print " "
-
-if len(sys.argv)<5:
- print "Require atleast two parameters"
-else:
- print "All good- command going ahead"
-print " "
-
-Normalized_Matrix=sys.argv[2]
-Pvalue=sys.argv[3]
-Cvalue=sys.argv[4]
-
-def run_command(cmd):
- print "The command used: " + cmd
- pipe= subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
- pipe.wait()
- ret= pipe.returncode
- if ret:
- print "command died: " + str(ret)
- print pipe.stderr.readlines()
- sys.exit(1)
- else:
- return
-print " "
-
-Final_tar_gz= "edgeR.tar.gz"
-run_command("cp "+ sys.argv[1] + " " + "Final_tar_gz")
-run_command("tar -xvf " + "Final_tar_gz")
-run_command("mv " + "edgeR_results" + "/* ." )
-
-# run the analyze command
-cmd= TRINITY_BASE_DIR + "/Analysis/DifferentialExpression/analyze_diff_expr.pl "+ "--matrix " + Normalized_Matrix + " -P " + Pvalue + " -C " + Cvalue
-run_command(cmd)
-
-origMatrixName= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix"
-# diffExpr.P0.001_C2.0.matrix
-run_command("mv " + origMatrixName + " diffExpr.matrix")
-
-SampleCorName= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.sample_cor.dat"
-# diffExpr.P0.001_C2.0.matrix.log2.sample_cor.dat
-run_command("mv " + SampleCorName + " diffExpr.matrix.log2.sample_cor.dat")
-
-CorMatrix= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.sample_cor_matrix.pdf"
-# diffExpr.P0.001_C2.0.matrix.log2.sample_cor_matrix.pdf
-run_command("mv " + CorMatrix + " diffExpr.matrix.log2.sample_cor_matrix.pdf")
-
-Heatmap= "diffExpr.P" + Pvalue + "_" + "C" + Cvalue + ".matrix.log2.centered.genes_vs_samples_heatmap.pdf"
-#diffExpr.P0.001_C2.0.matrix.log2.centered.genes_vs_samples_heatmap.pdf
-run_command("mv " + Heatmap + " diffExpr.matrix.log2.centered.genes_vs_samples_heatmap.pdf")
-
-sys.exit(0)
diff --git a/galaxy-plugin/bash_command_executer.py b/galaxy-plugin/bash_command_executer.py
deleted file mode 100755
index c6ab738..0000000
--- a/galaxy-plugin/bash_command_executer.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-
-
-import sys, subprocess, os
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get command-line arguments
- args = sys.argv
- # Remove name of calling program, i.e. ./stderr_wrapper.py
- args.pop(0)
-
- # If there are no arguments left, we're done
- if len(args) == 0:
- return
-
- # If one needs to silence stdout
- #args.append( ">" )
- #args.append( "/dev/null" )
-
- cmdline = " ".join(args)
-
-
- try:
- # Run program
- err_capture = open("stderr.txt", 'w')
- proc = subprocess.Popen( args=cmdline, shell=True, stderr=err_capture, stdout=sys.stdout )
- returncode = proc.wait()
- err_capture.close()
-
-
- if returncode != 0:
- raise Exception
-
- except Exception:
- # Running Grinder failed: write error message to stderr
- err_text = open("stderr.txt").readlines()
- stop_err( "ERROR:\n" + "\n".join(err_text))
-
-
-if __name__ == "__main__": __main__()
diff --git a/galaxy-plugin/cat.xml b/galaxy-plugin/cat.xml
deleted file mode 100644
index 9aeaa50..0000000
--- a/galaxy-plugin/cat.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<tool id="catbash" name="Concatenate datasets">
- <description>tail-to-head</description>
- <command interpreter="python">
- bash_command_executer.py cat
- $input1
- #for $q in $queries
- ${q.input2}
- #end for
- > $out_file1
- </command>
- <inputs>
- <param name="input1" type="data" label="Concatenate Dataset"/>
- <repeat name="queries" title="Dataset">
- <param name="input2" type="data" label="Select" />
- </repeat>
- </inputs>
- <outputs>
- <data name="out_file1" format="input" metadata_source="input1"/>
- </outputs>
- <tests>
- <test>
- <param name="input1" value="1.bed"/>
- <param name="input2" value="2.bed"/>
- <output name="out_file1" file="cat_wrapper_out1.bed"/>
- </test>
- <!--TODO: if possible, enhance the underlying test code to handle this test
- the problem is multiple params with the same name "input2"
- <test>
- <param name="input1" value="1.bed"/>
- <param name="input2" value="2.bed"/>
- <param name="input2" value="3.bed"/>
- <output name="out_file1" file="cat_wrapper_out2.bed"/>
- </test>
- -->
- </tests>
- <help>
-
- no help
-
- </help>
-</tool>
diff --git a/galaxy-plugin/diffExpress_edgeR.xml b/galaxy-plugin/diffExpress_edgeR.xml
deleted file mode 100644
index 1a5b186..0000000
--- a/galaxy-plugin/diffExpress_edgeR.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<tool id="diffExpress_edgeR" name="diffExpress_edgeR" version="0.0.1">
-
- <description>Identify Differentially Expressed Transcripts Using EdgeR</description>
- <requirements>
- <requirement type="package">edgeR</requirement>
- </requirements>
- <command interpreter="python">
-
- trinityToolWrapper.py Analysis/DifferentialExpression/run_EdgeR.pl
- --matrix $counts_matrix
- --transcripts $transcripts_fasta_file
- --output edgeR_results
- --dispersion $dispersion
-
- > stdout.txt
-
-
- </command>
- <inputs>
-
- <param type="data" format="txt" name="counts_matrix" label="Matrix of RNA-Seq fragment counts for transcripts per condition" />
- <param type="data" format="fasta" name="transcripts_fasta_file" label="Transcripts fasta file corresponding to matrix" />
- <param type="float" name="dispersion" value="0.1" min="0" label="dispersion value" help="Dispersion value to be used in the negative binomial" />
-
- </inputs>
- <outputs>
-
- <data format="txt" name="diff_expressed_edgeR_results" label="${tool.name} on ${on_string}: differentially expressed transcripts per pair of conditions" from_work_dir="edgeR_results/all_diff_expression_results.txt" />
-
- <data format="txt" name="matrix_FPKM" label="${tool.name} on ${on_string}: matrix.TMM_normalized.FPKM" from_work_dir="edgeR_results/matrix.TMM_normalized.FPKM" />
-
- <data format="txt" name="TMM_info" label="${tool.name} on ${on_string}: TMM library size estimates" from_work_dir="edgeR_results/TMM_info.txt" />
-
- </outputs>
- <tests>
-
- <test>
- <param name="myname" value="This is just a simple test" />
-
- </test>
-
-
- </tests>
- <help>
- help info here.
- </help>
-</tool>
diff --git a/galaxy-plugin/transcriptsToOrfs.xml b/galaxy-plugin/transcriptsToOrfs.xml
deleted file mode 100644
index 2afb8f6..0000000
--- a/galaxy-plugin/transcriptsToOrfs.xml
+++ /dev/null
@@ -1,53 +0,0 @@
-<tool id="transcriptsToOrfs" name="transcriptsToOrfs" version="0.0.1">
-
- <description>Trinity Transcripts to Candidate Peptides</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
- trinityToolWrapper.py Analysis/Coding/transcripts_to_best_scoring_ORFs.pl
-
- -t $transcripts
- -m $min_prot_length
- --CPU $CPU
- --search_pfam "${ filter( lambda x: str( x[0] ) == str( $pfam_db ), $__app__.tool_data_tables[ 'pfam_databases' ].get_fields() )[0][-1] }"
-
- #if str($strand_specificity) == 'SS':
- -S
- #end if
-
- ## direct to output
- > output
-
- </command>
- <inputs>
-
- <param format="fasta" name="transcripts" type="data" label="Transcripts sequences in fastA format" help="" />
-
- <param name="min_prot_length" type="integer" label="Minimum peptide length (in amino acids)" value="100" min="50" help="" />
-
- <param name="strand_specificity" type="select" label="Strand specificity type">
- <option value="DS">NOT strand specific, examine both strands</option>
- <option value="SS">Strand specific, examine only top strand</option>
- </param>
-
- <param name="pfam_db" type="select" label="Pfam database">
- <options from_data_table="pfam_databases" />
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by hmmscan" />
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_pep_pfam" label="${tool.name} on ${on_string}: Pfam matches to Candidate Peptide Sequences" from_work_dir="longest_orfs.pep.pfam.dat"/>
- <data format="fasta" name="trinity_pep_seqs" label="${tool.name} on ${on_string}: Candidate Peptide Sequences" from_work_dir="best_candidates.eclipsed_orfs_removed.pep"/>
- <data format="bed" name="trinity_pep_coords" label = "${tool.name} on ${on_string} Candidate Peptide Coordinates" from_work_dir="best_candidates.eclipsed_orfs_removed.bed" />
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/trinityToolWrapper.py b/galaxy-plugin/trinityToolWrapper.py
deleted file mode 100755
index 957ae12..0000000
--- a/galaxy-plugin/trinityToolWrapper.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env python
-
-
-# borrowed from: http://wiki.g2.bx.psu.edu/Future/Job%20Failure%20When%20stderr and modified for use with Trinity tools.
-
-"""
-Wrapper that execute a program and its arguments but reports standard error
-messages only if the program exit status was not 0
-Example: ./stderr_wrapper.py myprog arg1 -f arg2
-"""
-
-import sys, subprocess, os
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-TRINITY_BASE_DIR = ""
-if os.environ.has_key('TRINITY_HOME'):
- TRINITY_BASE_DIR = os.environ['TRINITY_HOME'];
-else:
- sys.stderr.write("You must set the environmental variable TRINITY_BASE_DIR to the base installation directory of Trinity before running this");
- sys.exit()
-
-
-
-# get bindir
-bindir = sys.argv[0]
-bindir = bindir.split("/")
-if len(bindir) > 1:
- bindir.pop()
- bindir = "/".join(bindir)
-else:
- bindir = "."
-
-
-## add locations of tools to path setting.
-TOOL_PATHS_FILE = bindir + "/__add_to_PATH_setting.txt";
-for line in open(TOOL_PATHS_FILE):
- line = line.rstrip()
- os.environ['PATH'] += ":" + line
-
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get command-line arguments
- args = sys.argv
- # Remove name of calling program, i.e. ./stderr_wrapper.py
- args.pop(0)
- # If there are no arguments left, we're done
- if len(args) == 0:
- return
-
- # If one needs to silence stdout
- #args.append( ">" )
- #args.append( "/dev/null" )
-
- args[0] = "".join([TRINITY_BASE_DIR, '/', args[0]]);
-
- cmdline = " ".join(args)
-
-
-
- try:
- # Run program
- err_capture = open("stderr.txt", 'w')
- proc = subprocess.Popen( args=cmdline, shell=True, stderr=err_capture, stdout=sys.stdout )
- returncode = proc.wait()
- err_capture.close()
-
-
- if returncode != 0:
- raise Exception
-
- except Exception:
- # Running Grinder failed: write error message to stderr
- err_text = open("stderr.txt").readlines()
- stop_err( "ERROR:\n" + "\n".join(err_text))
-
-
-if __name__ == "__main__": __main__()
diff --git a/galaxy-plugin/trinityrnaseq.xml b/galaxy-plugin/trinityrnaseq.xml
deleted file mode 100644
index 621ff5e..0000000
--- a/galaxy-plugin/trinityrnaseq.xml
+++ /dev/null
@@ -1,127 +0,0 @@
-<tool id="trinityrnaseq" name="Trinity" version="0.0.1">
-
- <!-- Written by Jeremy Goecks, now maintained here by bhaas -->
- <description>De novo assembly of RNA-Seq data Using Trinity</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="python">
- trinityToolWrapper.py Trinity --max_memory $JM --CPU $CPU
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --group_pairs_distance $inputs.group_pairs_distance
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- #if str($additional_params.use_additional) == "yes":
- --min_kmer_cov $inputs.min_kmer_cov --max_reads_per_graph $inputs.max_reads_per_graph --bflyHeapSpaceMax $input.bflyHeapSpaceMax
- #if $inputs.bfly_opts != 'None':
- --bfly_opts " $inputs.bfly_opts "
- #end if
- #end if
-
-
- ## direct to output
- > $trinity_log
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- <option value="200G">200G</option>
- <option value="500G">500G</option>
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by Trinity" />
-
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="group_pairs_distance" type="integer" value="500" min="1" label="Group pairs distance" help="Maximum length expected between fragment pairs"/>
- <param name="path_reinforcement_distance" type="integer" value="75" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- <param name="path_reinforcement_distance" type="integer" value="40" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
- </when>
- </conditional>
-
- <conditional name="additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="min_kmer_cov" type="integer" value="1" min="1" label="inchworm_min_kmer_cov" help="Minimum kmer coverage required by Inchworm for initial contig construction" />
- <param name="max_reads_per_graph" type="integer" value="20000000" min="10000" label="chrysalis_max_reads_per_graph" help="Maximum number of reads to be anchored within each transcript graph by Chrysalis" />
-
-
- <param name="bfly_opts" type="text" value="None" label="bfly_opts" help="Options to pass on to Butterfly" />
- <param name="bflyHeapSpaceMax" type="select" label="bflyHeapSpaceMax" help="Java heap space maximum value for Butterfly">
- <option value="1G">1G</option>
- <option value="2G">2G</option>
- <option value="4G" selected="true">4G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- </param>
-
- <param name="min_contig_length" type="integer" value="200" min="1" label="Minimum Contig Length" help=""/>
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta" name="assembled_transcripts" label="${tool.name} on ${on_string}: Assembled Transcripts" from_work_dir="trinity_out_dir/Trinity.fasta"/>
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/trinityrnaseq.xml.Graham_version_022014 b/galaxy-plugin/trinityrnaseq.xml.Graham_version_022014
deleted file mode 100644
index d92bb13..0000000
--- a/galaxy-plugin/trinityrnaseq.xml.Graham_version_022014
+++ /dev/null
@@ -1,127 +0,0 @@
-<tool id="trinityrnaseq" name="Trinity" version="0.0.1">
-
- <!-- Written by Jeremy Goecks, now maintained here by bhaas -->
- <description>De novo assembly of RNA-Seq data Using Trinity</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command>
- Trinity.pl --JM $JM --CPU $CPU
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --group_pairs_distance $inputs.group_pairs_distance
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- #end if
-
- ## Additional parameters.
- #if str($additional_params.use_additional) == "yes":
- --min_kmer_cov $additional_params.min_kmer_cov --max_reads_per_graph $additional_params.max_reads_per_graph --bflyHeapSpaceMax $additional_params.bflyHeapSpaceMax
- #if $additional_params.bfly_opts != 'None':
- --bfly_opts " $additional_params.bfly_opts "
- #end if
- #end if
-
-
- ## direct to output
- > $trinity_log 2>&1
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- <option value="200G">200G</option>
- <option value="500G">500G</option>
- </param>
-
- <param name="CPU" type="integer" value="2" min="1" label="CPU" help="Number of CPUs to use by Trinity" />
-
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- <param name="group_pairs_distance" type="integer" value="500" min="1" label="Group pairs distance" help="Maximum length expected between fragment pairs"/>
- <param name="path_reinforcement_distance" type="integer" value="75" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
-
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
- <param name="path_reinforcement_distance" type="integer" value="40" min="1" label="Path reinforcement distance" help="Minimum read overlap required for path extension in the graph" />
- </when>
- </conditional>
-
- <conditional name="additional_params">
- <param name="use_additional" type="select" label="Use Additional Params?">
- <option value="no">No</option>
- <option value="yes">Yes</option>
- </param>
- <when value="no">
- </when>
- <when value="yes">
- <param name="min_kmer_cov" type="integer" value="1" min="1" label="inchworm_min_kmer_cov" help="Minimum kmer coverage required by Inchworm for initial contig construction" />
- <param name="max_reads_per_graph" type="integer" value="20000000" min="10000" label="chrysalis_max_reads_per_graph" help="Maximum number of reads to be anchored within each transcript graph by Chrysalis" />
-
-
- <param name="bfly_opts" type="text" value="None" label="bfly_opts" help="Options to pass on to Butterfly" />
- <param name="bflyHeapSpaceMax" type="select" label="bflyHeapSpaceMax" help="Java heap space maximum value for Butterfly">
- <option value="1G">1G</option>
- <option value="2G">2G</option>
- <option value="4G" selected="true">4G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- </param>
-
- <param name="min_contig_length" type="integer" value="200" min="1" label="Minimum Contig Length" help=""/>
- </when>
- </conditional>
-
-
- </inputs>
- <outputs>
- <data format="txt" name="trinity_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta" name="assembled_transcripts" label="${tool.name} on ${on_string}: Assembled Transcripts" from_work_dir="trinity_out_dir/Trinity.fasta"/>
- </outputs>
- <tests>
- </tests>
- <help>
- Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
diff --git a/galaxy-plugin/trinityrnaseq_norm.xml b/galaxy-plugin/trinityrnaseq_norm.xml
deleted file mode 100644
index 8e13024..0000000
--- a/galaxy-plugin/trinityrnaseq_norm.xml
+++ /dev/null
@@ -1,102 +0,0 @@
-<tool id="trinityrnaseq_norm" name="Trinity read normalization" version="0.0.1">
-<!--tool id="trinityrnaseq" name="Trinity" version="0.0.1" -->
-
- <!-- Written by Jeremy Goecks, modified by Josh Bowden for normalization procedure, now maintained here by bhaas -->
- <description>Pre-process RNA-seq data to reduce coverage of highly covered areas</description>
- <requirements>
- <requirement type="package">trinity</requirement>
- </requirements>
- <command interpreter="perl">
- trinityToolWrapper.py util/normalize_by_kmer_coverage.pl --JM $JM --max_cov $MAXCOV
-
- ## Inputs.
- #if str($inputs.paired_or_single) == "paired":
- --left $inputs.left_input --right $inputs.right_input
- --outleft $output_left --outright $output_right
- #if $inputs.left_input.ext == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
-
- #else:
- --single $inputs.input
- #if str($inputs.input.ext) == 'fa':
- --seqType fa
- #else:
- --seqType fq
- #end if
- #if str($inputs.library_type) != "None":
- --SS_lib_type $inputs.library_type
- #end if
- --outputsingle $output_single
- #end if
- --galaxy
-
- ## direct to output
- > $trinity_coverage_normalization_log
-
- </command>
- <inputs>
- <param name="JM" type="select" label="JM" help="Amount of memory to allocate to Jellyfish for Kmer catalog construction">
- <option value="1G">1G</option>
- <option value="10G">10G</option>
- <option value="20G">20G</option>
- <option value="50G">50G</option>
- <option value="100G">100G</option>
- </param>
-
- <param name="MAXCOV" type="select" label="MAXCOV" help="Read coverage in terms of maximum coverage to keep">
- <option value="30">30</option>
- <option value="40">40</option>
- <option value="50">50</option>
- <option value="60">60</option>
- <option value="70">70</option>
- <option value="100">100</option>
- </param>
-
- <conditional name="inputs">
- <param name="paired_or_single" type="select" label="Paired or Single-end data?">
- <option value="paired">Paired</option>
- <option value="single">Single</option>
- </param>
- <when value="paired">
- <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
- <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="FR">FR</option>
- <option value="RF">RF</option>
- </param>
- </when>
- <when value="single">
- <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
- <param name="library_type" type="select" label="Strand-specific Library Type">
- <option value="None">None</option>
- <option value="F">F</option>
- <option value="R">R</option>
- </param>
-
- </when>
- </conditional>
- </inputs>
-
- <outputs>
- <!-- I have not found a way to do conditional outputs, so all potential output files are specified and some will be empty -->
- <data format="txt" name="trinity_coverage_normalization_log" label="${tool.name} on ${on_string}: log" />
- <data format="fasta,fastq" name="output_left" label="${tool.name} on ${on_string}: Normalized left data" />
- <data format="fasta,fastq" name="output_right" label="${tool.name} on ${on_string}: Normalized right data" />
- <data format="fasta,fastq" name="output_single" label="${tool.name} on ${on_string}: Normalized data" />
- <!-- data format="fastq" name="normalized right dataset" label="${tool.name} on ${on_string}: Normalized right data " from_work_dir="${inputs.right_input}.${inputs.input.ext}.normalized_K25_C${MAXCOV}_pctSD100.fq"/-->
- </outputs>
- <tests>
- </tests>
- <help>
- Runs script $TRINITY_HOME/util/normalize_by_kmer_coverage.pl which reduces data sizes with minimal impact on recovered transcripts when used by Trinity.pl.
-
- .. _Trinity: http://trinityrnaseq.sourceforge.net
- </help>
-</tool>
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/trinityrnaseq.git
More information about the debian-med-commit
mailing list