[med-svn] [trinityrnaseq] 01/01: Imported Upstream version 2.2.0+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Fri May 20 13:31:09 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to annotated tag upstream/2.2.0+dfsg
in repository trinityrnaseq.
commit 0b7afc2e192e1a316ade19979e1c91c0826ea1b6
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Fri May 20 01:27:28 2016 -0700
Imported Upstream version 2.2.0+dfsg
---
Analysis/DifferentialExpression/PtR | 80 +-
Analysis/DifferentialExpression/ROKU.pl | 2 +-
.../TissueEnrichment/DE_graph_to_dot.pl | 55 +
.../DE_results_to_pairwise_summary.pl | 160 +++
.../TissueEnrichment/README.md | 5 +
.../pairwise_DE_summary_to_DE_classification.pl | 299 ++++++
.../DifferentialExpression/analyze_diff_expr.pl | 8 +-
.../cluster_sample_data/cleanme.pl | 2 +-
.../cut_tree_into_clusters.pl | 2 +-
.../define_clusters_by_cutting_tree.pl | 9 +-
.../diff_expr_analysis_to_heatmap_html.pl | 4 +-
Analysis/DifferentialExpression/diff_express.cgi | 2 +-
.../extract_GO_enriched_genes.pl | 2 +-
.../get_transcript_lengths.pl | 2 +-
Analysis/DifferentialExpression/merge_matrices.pl | 12 +-
.../remove_batch_effects_from_count_matrix.pl | 2 +-
.../replicates_to_sample_averages_matrix.pl | 2 +-
Analysis/DifferentialExpression/run_DE_analysis.pl | 10 +-
Analysis/DifferentialExpression/run_GOseq.pl | 2 +-
.../run_TMM_normalization_write_FPKM_matrix.pl | 4 +-
.../stratify_diff_expression.pl | 2 +-
.../subcluster_to_canvasXpress_html.pl | 2 +-
.../FL_trans_analysis_pipeline.pl | 4 +-
.../FL_reconstruction_analysis/compute_oracle.pl | 2 +-
.../tier_gene_trans_alignments.pl | 4 +-
.../util/blat_full_length_mappings.pl | 2 +-
.../util/blat_top_tier_genes.pl | 2 +-
Butterfly/src/src/TransAssembly_allProbPaths.java | 144 ++-
Release.Notes => Changelog.txt | 95 +-
Chrysalis/MakeDepend.cc | 2 +-
Chrysalis/Makefile | 10 +-
Chrysalis/analysis/TranscriptomeGraph.cc | 2 +-
Inchworm/src/IRKE.cpp | 122 +--
Inchworm/src/fastaToKmerCoverageStats.cpp | 11 +-
PerlLib/Fastq_reader.pm | 17 +-
PerlLib/test_htc_gridrunner_LSF.pl | 4 +-
PerlLib/test_htc_gridrunner_SGE.pl | 4 +-
README | 2 +-
README.md | 2 +-
Trinity | 47 +-
.../test_GraphFromFasta/runMe.sh | 1 +
sample_data/test_DE_analysis/Makefile | 6 +-
.../test_GOSeq_trinotate_pipe/Spombe/cleanme.pl | 2 +-
.../test_GOSeq_trinotate_pipe/Spombe/runMe.sh | 1 +
sample_data/test_GenomeGuidedTrinity/cleanme.pl | 2 +-
.../test_InSilicoReadNormalization/Makefile | 1 +
.../test_InSilicoReadNormalization/cleanme.pl | 10 +-
.../test_PE_normalization.mult_read_sets.sh | 11 +
sample_data/test_Inchworm/runMe_MPI.sh | 1 +
.../test_TissueSpecificityGraph/DE_results.tar.gz | Bin 0 -> 37622593 bytes
sample_data/test_TissueSpecificityGraph/Makefile | 10 +
.../transcripts.TMM.fpkm.avg_reps.matrix.gz | Bin 0 -> 3467094 bytes
sample_data/test_Trinity_Assembly/Makefile | 14 +-
.../align_reads_via_bowtie.sh | 3 +-
.../__indiv_ex_sample_derived/ex05/runMe.clean.sh | 1 +
.../__indiv_ex_sample_derived/ex05/runMe.sh | 1 +
.../__indiv_ex_sample_derived/ex09/runMe.sh | 1 +
sample_data/test_Trinity_Assembly/cleanme.pl | 4 +-
.../misc_run_tests/__runMe_include_long_reads.sh | 2 +-
sample_data/test_Trinity_Assembly/runMe.sh | 2 +-
.../test_align_and_estimate_abundance/Makefile | 19 +-
.../PAIRED_END_ABUNDANCE_ESTIMATION/Makefile | 41 +
.../cleanme.pl | 22 +-
.../misc_tests/drosoph_denovo.samples.txt | 6 +
.../misc_tests/drosoph_ref.samples.txt | 6 +
.../misc_tests/mouse_denovo.samples.txt | 6 +
.../misc_tests/mouse_ref.samples.txt | 6 +
.../misc_tests/schizo_denovo.samples.txt | 6 +
.../misc_tests/schizo_ref.samples.txt | 6 +
.../misc_tests/test_Drosoph_denovo.sh | 1 +
.../misc_tests/test_Drosoph_ref.sh | 1 +
.../misc_tests/test_Mouse_denovo.sh | 1 +
.../misc_tests/test_Mouse_ref.sh | 1 +
.../misc_tests/test_Schizo_denovo.sh | 1 +
.../misc_tests/test_Schizo_ref.sh | 1 +
.../PAIRED_END_ABUNDANCE_ESTIMATION/samples.txt | 8 +
.../SINGLE_END_ABUNDANCE_ESTIMATION/Makefile | 41 +
.../cleanme.pl | 23 +-
.../misc_tests/drosoph_denovo.samples.txt | 5 +
.../misc_tests/drosoph_ref.samples.txt | 6 +
.../misc_tests/mouse_denovo.samples.txt | 6 +
.../misc_tests/mouse_ref.samples.txt | 6 +
.../misc_tests/schizo_denovo.samples.txt | 6 +
.../misc_tests/schizo_ref.samples.txt | 6 +
.../misc_tests/test_Drosoph_denovo.sh | 1 +
.../misc_tests/test_Drosoph_ref.sh | 1 +
.../misc_tests/test_Mouse_denovo.sh | 1 +
.../misc_tests/test_Mouse_ref.sh | 1 +
.../misc_tests/test_Schizo_denovo.sh | 1 +
.../misc_tests/test_Schizo_ref.sh | 1 +
.../SINGLE_END_ABUNDANCE_ESTIMATION/samples.txt | 8 +
.../align_and_estimate_tester.pl | 68 +-
.../pairs.Rscript | 6 +
.../test_align_and_estimate_abundance/samples.txt | 4 -
sample_data/test_full_edgeR_pipeline/cleanme.pl | 2 +-
trinity-plugins/Makefile | 10 +-
.../fstrozzi-Fastool-7c3e034f05/Makefile | 5 +-
trinity-plugins/scaffold_iworm_contigs/Makefile | 4 +-
trinity-plugins/slclust/src/Makefile | 21 +-
util/SAM_nameSorted_to_uniq_count_stats.pl | 12 +-
util/TrinityStats.pl | 2 +-
util/abundance_estimates_to_matrix.pl | 84 +-
util/align_and_estimate_abundance.pl | 344 ++++---
util/analyze_blastPlus_topHit_coverage.pl | 2 +-
util/bowtie_PE_separate_then_join.pl | 6 +-
util/fasta_tool | 1054 +++++++++++++++++++-
util/filter_low_expr_transcripts.pl | 286 ++++++
util/insilico_read_normalization.pl | 13 +-
util/misc/Artemis/join_multi_wig_to_graph_plot.pl | 2 +-
util/misc/BLAT_to_SAM.pl | 6 +-
util/misc/ButterflyFastaToGraphDot.pl | 2 +-
util/misc/HiCpipe_nameSortedSam_to_raw.pl | 2 +-
util/misc/Monarch | 2 +-
util/misc/N50.pl | 2 +-
.../misc/SAM_coordsorted_max_reads_per_position.pl | 2 +-
util/misc/SAM_intron_extractor.pl | 2 +-
util/misc/SAM_pair_to_bed.pl | 2 +-
util/misc/SAM_sortAny_to_count_stats.pl | 2 +-
util/misc/SAM_toString.pl | 2 +-
util/misc/SAM_to_bed.pl | 2 +-
util/misc/SAM_to_fasta.pl | 2 +-
util/misc/TophatCufflinksWrapper.pl | 2 +-
util/misc/allele_simulator.pl | 2 +-
util/misc/average.pl | 2 +-
.../extract_bam_reads_per_target_gene.pl | 2 +-
.../extract_bam_reads_per_target_transcript.pl | 2 +-
util/misc/bam_gene_tests/write_trin_cmds.pl | 2 +-
util/misc/blast_outfmt6_group_segments.pl | 2 +-
..._outfmt6_group_segments.to_Markov_Clustering.pl | 2 +-
util/misc/blat_util/blat_sam_add_reads2.pl | 2 +-
util/misc/blat_util/blat_to_sam.pl | 4 +-
util/misc/blat_util/process_BLAT_alignments.pl | 4 +-
util/misc/blat_util/top_blat_sam_extractor.pl | 2 +-
util/misc/capture_orig_n_unmapped_reads.pl | 2 +-
util/misc/cdna_fasta_file_to_transcript_gtf.pl | 2 +-
util/misc/check_fastQ_pair_ordering.pl | 2 +-
util/misc/contig_ExN50_statistic.pl | 15 +-
util/misc/extract_fastQ_pairings.pl | 2 +-
util/misc/fastQ_rand_subset.pl | 2 +-
..._subset.reservoir_sampling_reqiures_high_mem.pl | 2 +-
util/misc/fasta_file_reformatter.pl | 2 +-
util/misc/fasta_filter_by_min_length.pl | 2 +-
util/misc/fasta_seq_length.pl | 25 +
util/misc/fasta_to_cmd_generator.pl | 2 +-
util/misc/fasta_write_sense_n_anti.pl | 2 +-
util/misc/fastq_interleave_pairs.pl | 2 +-
util/misc/fastq_unweave_pairs.pl | 2 +-
util/misc/gene_gff3_to_introns.pl | 2 +-
.../get_longest_isoform_seq_per_trinity_gene.pl | 2 +-
util/misc/gff3_file_to_cdna.pl | 2 +-
util/misc/gff3_file_utr_coverage_trimmer.pl | 2 +-
...f3_to_genome_feature_base_encoding.parse_SAM.pl | 2 +-
util/misc/gff3_to_genome_feature_base_encoding.pl | 2 +-
util/misc/gmap_gff3_chimera_jaccard_analyzer.pl | 2 +-
util/misc/gmap_gff3_to_percent_length_stats.pl | 2 +-
util/misc/gmap_native_to_format_converter.pl | 2 +-
util/misc/gtf_to_bed_format.pl | 2 +-
util/misc/gtf_to_introns.pl | 2 +-
util/misc/identify_distal_isoform_variations.pl | 2 +-
util/misc/illustrate_ref_comparison.pl | 2 +-
util/misc/jaccard_sam_pair_refiner.pl | 2 +-
util/misc/kmer_counter.pl | 2 +-
util/misc/m8_blastclust.pl | 2 +-
util/misc/map_gtf_transcripts_to_genome_annots.pl | 2 +-
util/misc/merge_blast_n_rsem_results.pl | 2 +-
util/misc/nameSorted_SAM_to_FastQ.pl | 2 +-
util/misc/pairwise_kmer_content_comparer.pl | 2 +-
util/misc/plot_ExN50_statistic.Rscript | 14 +
util/misc/plot_expressed_gene_dist.pl | 2 +-
util/misc/print_kmers.pl | 2 +-
util/misc/prop_pair_sam_refiner.pl | 2 +-
util/misc/run_GSNAP.pl | 27 +-
util/misc/run_HISAT.pl | 2 +-
util/misc/run_HiCpipe_bowtie.pl | 4 +-
util/misc/run_bowtie2.pl | 39 +
util/misc/run_read_simulator_per_fasta_entry.pl | 4 +-
util/misc/run_read_simulator_per_gene.pl | 2 +-
util/misc/run_trimmomatic_qual_trimming.pl | 4 +-
util/misc/simulate_illuminaPE_from_transcripts.pl | 2 +-
util/misc/simulate_reads_sam_and_fa.pl | 4 +-
util/misc/sixFrameTranslation.pl | 2 +-
util/misc/sort_fastq.pl | 2 +-
.../assess_intron_path_sensitivity.pl | 2 +-
util/misc/strip_fasta_header.pl | 2 +-
util/misc/transcript_coverage_UTR_trimmer.pl | 4 +-
util/misc/transcript_fasta_to_ORF_pics.pl | 2 +-
util/misc/transcript_gff3_to_bed.pl | 2 +-
util/misc/trinity_component_distribution.pl | 2 +-
util/run_DE_analysis_from_samples_file.pl | 2 +-
util/run_RSEM_from_samples_file.pl | 2 +-
util/run_Trinity_edgeR_pipeline.pl | 22 +-
util/run_Trinity_from_samples_file.pl | 2 +-
...AM_coordSorted_fragment_Read_coverage_writer.pl | 2 +-
.../SAM_coordSorted_fragment_coverage_writer2.pl | 2 +-
.../SAM_extract_properly_mapped_pairs.pl | 2 +-
.../SAM_extract_uniquely_mapped_reads.pl | 2 +-
.../SAM_filter_out_unmapped_reads.pl | 2 +-
util/support_scripts/SAM_ordered_pair_jaccard.pl | 2 +-
.../SAM_set_transcribed_orient_info.pl | 2 +-
util/support_scripts/SAM_strand_separator.pl | 2 +-
util/support_scripts/SAM_to_frag_coords.pl | 2 +-
.../define_SAM_coverage_partitions2.pl | 2 +-
util/support_scripts/define_coverage_partitions.pl | 2 +-
.../support_scripts/extract_reads_per_partition.pl | 2 +-
util/support_scripts/fastQ_to_fastA.pl | 2 +-
util/support_scripts/fasta_to_tab.pl | 2 +-
util/support_scripts/fragment_coverage_writer.pl | 2 +-
.../inchworm_transcript_splitter.pl | 2 +-
util/support_scripts/jaccard_fasta_clipper.pl | 2 +-
util/support_scripts/jaccard_wig_clipper.pl | 2 +-
.../merge_left_right_nameSorted_SAMs.pl | 2 +-
.../support_scripts/nbkc_merge_left_right_stats.pl | 4 +
.../ordered_fragment_coords_to_jaccard.pl | 2 +-
...aseq_alignments_for_genome_assisted_assembly.pl | 2 +-
util/support_scripts/run_TMM_scale_matrix.pl | 4 +-
.../run_UpperQuartileNormalization_matrix.pl | 8 +-
.../salmon_trans_to_gene_results.pl | 165 +++
util/support_scripts/tests/sample_data_tests.py | 56 ++
util/support_scripts/tests/test.py | 187 ----
util/support_scripts/tests/test_prep.py | 55 +-
util/support_scripts/tests/tests.py | 34 +-
.../write_partitioned_trinity_cmds.pl | 2 +-
222 files changed, 3447 insertions(+), 811 deletions(-)
diff --git a/Analysis/DifferentialExpression/PtR b/Analysis/DifferentialExpression/PtR
index 88539fb..28e47e2 100755
--- a/Analysis/DifferentialExpression/PtR
+++ b/Analysis/DifferentialExpression/PtR
@@ -7,8 +7,8 @@ use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use FindBin;
use File::Basename;
-my $min_rowSums = 10;
-my $min_colSums = 10;
+my $min_rowSums = 0;
+my $min_colSums = 0;
my $usage = <<__EOUSAGE__;
@@ -74,6 +74,7 @@ my $usage = <<__EOUSAGE__;
# --heatmap_scale_limits "<int,int>" cap scale intensity to low,high (ie. "-5,5")
# --heatmap_colorscheme <string> default is 'purple,black,yellow'
# a popular alternative is 'green,black,red'
+# Specify a two-color gradient like so: "black,yellow".
#
# # sample (column) labeling order
# --lexical_column_ordering order samples by column name lexical order.
@@ -496,7 +497,7 @@ my $HEATMAP_COLORS;
{
$heatmap_colorscheme =~ s/\s//g;
my @colors = split(/,/, $heatmap_colorscheme);
- unless (scalar @colors == 3) {
+ unless (scalar @colors == 3 || scalar @colors == 2) { # accept 'low,mid,high' or a two-color 'low,high' gradient; the old '!= 3 || != 2' test was always true, so the die below could never fire
die "Error, need three colors as 'low,mid,high', instead have " . scalar(@colors) . " : $heatmap_colorscheme";
}
@@ -538,9 +539,9 @@ main: {
# source these after potential data restoration above - in case they changed.
- $Rscript .= "source(\"$FindBin::Bin/R/heatmap.3.R\")\n";
- $Rscript .= "source(\"$FindBin::Bin/R/misc_rnaseq_funcs.R\")\n";
- $Rscript .= "source(\"$FindBin::Bin/R/pairs3.R\")\n";
+ $Rscript .= "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
+ $Rscript .= "source(\"$FindBin::RealBin/R/misc_rnaseq_funcs.R\")\n";
+ $Rscript .= "source(\"$FindBin::RealBin/R/pairs3.R\")\n";
@@ -1062,46 +1063,41 @@ main: {
# Zscale the genes across samples for Prin Component analysis
- $Rscript .= "# Z-scale the genes across all the samples for PCA\n";
- $Rscript .= "prin_comp_data = data\n";
-
- unless ($ZSCALE_ROWS) {
- $Rscript .= "for (i in 1:nrow(data)) {\n";
- $Rscript .= " d = data[i,]\n";
- $Rscript .= " d_mean = mean(d)\n";
- $Rscript .= " d = d - d_mean\n";
- $Rscript .= " d = d / sd(d)\n";
- $Rscript .= " prin_comp_data[i,] = d\n";
- $Rscript .= "}\n\n";
- }
+ $Rscript .= "# Z-scale and center the genes across all the samples for PCA\n";
+ $Rscript .= "prin_comp_data = initial_matrix\n"
+ . "prin_comp_data = log2(prin_comp_data+1)\n"
+ . "prin_comp_data = scale(prin_comp_data)\n"
+ . "prin_comp_data = t(scale(t(prin_comp_data), center=TRUE, scale=F)) # just center trans expr level, retain original effect size.\n"
+ . "pca = prcomp(t(prin_comp_data), center = FALSE, scale. = FALSE)\n";
- $Rscript .= "write.table(prin_comp_data, file=\"$output_prefix.ZscaleRows.dat\", quote=F, sep=\"\t\")\n" if $write_intermediate_data_tables_flag;
-
- $Rscript .= "pc = princomp(prin_comp_data, cor=TRUE)\n";
- $Rscript .= "pc_pct_variance = (pc\$sdev^2)/sum(pc\$sdev^2)\n";
+
+ $Rscript .= "write.table(prin_comp_data, file=\"$output_prefix.Princomp_log2_Zscale_centered.dat\", quote=F, sep=\"\t\")\n" if $write_intermediate_data_tables_flag;
+
+ #$Rscript .= "pc = princomp(prin_comp_data, cor=TRUE)\n";
+ $Rscript .= "pc_pct_variance = (pca\$sdev^2)/sum(pca\$sdev^2)\n";
$Rscript .= "def.par <- par(no.readonly = TRUE) # save default, for resetting...\n"
. "gridlayout = matrix(c(1:4),nrow=2,ncol=2, byrow=TRUE);\n"
. "layout(gridlayout, widths=c(1,1));\n";
if (1) {
## write out the PC info
- $Rscript .= "write.table(pc\$loadings, file=\"$output_prefix.ZscaleRows.PC.loadings\", quote=F, sep=\"\t\")\n";
- $Rscript .= "write.table(pc\$scores, file=\"$output_prefix.ZscaleRows.PC.scores\", quote=F, sep=\"\t\")\n";
+ $Rscript .= "write.table(pca\$rotation, file=\"$output_prefix.PCA.loadings\", quote=F, sep=\"\t\")\n";
+ $Rscript .= "write.table(pca\$x, file=\"$output_prefix.PCA.scores\", quote=F, sep=\"\t\")\n";
}
$Rscript .= "for (i in 1:(max($prin_comp,2)-1)) {\n" # one plot for each n,n+1 component comparison.
- . " xrange = range(pc\$loadings[,i])\n"
- . " yrange = range(pc\$loadings[,i+1])\n"
- . " samples_want = rownames(pc\$loadings) \%in\% sample_type_list[[sample_types[1]]]\n" # color according to sample
+ . " xrange = range(pca\$x[,i])\n"
+ . " yrange = range(pca\$x[,i+1])\n"
+ . " samples_want = rownames(pca\$x) \%in\% sample_type_list[[sample_types[1]]]\n" # color according to sample
. " pc_i_pct_var = sprintf(\"(%.2f%%)\", pc_pct_variance[i]*100)\n"
. " pc_i_1_pct_var = sprintf(\"(%.2f%%)\", pc_pct_variance[i+1]*100)\n"
- . " plot(pc\$loadings[samples_want,i], pc\$loadings[samples_want,i+1], xlab=paste('PC',i, pc_i_pct_var), ylab=paste('PC',i+1, pc_i_1_pct_var), xlim=xrange, ylim=yrange, col=sample_colors[1])\n"
+ . " plot(pca\$x[samples_want,i], pca\$x[samples_want,i+1], xlab=paste('PC',i, pc_i_pct_var), ylab=paste('PC',i+1, pc_i_1_pct_var), xlim=xrange, ylim=yrange, col=sample_colors[1])\n"
. " for (j in 2:nsamples) {\n"
- . " samples_want = rownames(pc\$loadings) \%in\% sample_type_list[[sample_types[j]]]\n"
- . " points(pc\$loadings[samples_want,i], pc\$loadings[samples_want,i+1], col=sample_colors[j], pch=j)\n"
+ . " samples_want = rownames(pca\$x) \%in\% sample_type_list[[sample_types[j]]]\n"
+ . " points(pca\$x[samples_want,i], pca\$x[samples_want,i+1], col=sample_colors[j], pch=j)\n"
. " }\n"
. " plot.new()\n"
. " legend('topleft', as.vector(sample_types), col=sample_colors, pch=1:nsamples, ncol=2)\n"
@@ -1122,7 +1118,7 @@ main: {
#$Rscript .= "dev.off();stop('debug')\n";
- $Rscript .= "pcscore_mat_vals = pc\$scores[,1:$prin_comp]\n";
+ $Rscript .= "pcscore_mat_vals = pca\$rotation[,1:$prin_comp]\n";
$Rscript .= "pcscore_mat = matrix_to_color_assignments(pcscore_mat_vals, col=colorpanel(256,'purple','black','yellow'), by='row')\n";
$Rscript .= "colnames(pcscore_mat) = paste('PC', 1:ncol(pcscore_mat))\n";
@@ -1806,15 +1802,16 @@ sub add_prin_comp_heatmaps {
. "uniq_genes = c()\n"
. "for (i in 1:$prin_comp) {\n"
. " ## get genes with extreme vals\n"
- . " print(paste('range', range(pc\$scores[,i])))\n"
- . " ordered_gene_indices = order(pc\$scores[,i])\n"
+ . " print(paste('range', range(pca\$rotation[,i])))\n"
+ . " ordered_gene_indices = order(pca\$rotation[,i])\n"
. " num_genes = length(ordered_gene_indices)\n"
. " extreme_ordered_gene_indices = unique(c(1:$num_top_genes_PC_extreme, (num_genes-$num_top_genes_PC_extreme):num_genes))\n"
+ . " print('extreme ordered gene indices')\n"
. " print(extreme_ordered_gene_indices)\n"
. " selected_gene_indices = ordered_gene_indices[extreme_ordered_gene_indices]\n"
. " print('selected gene indices');print(selected_gene_indices);\n"
- . " print('PC scores:');print(pc\$scores[selected_gene_indices,i])\n"
- . " selected_genes_matrix = data[selected_gene_indices,]\n"
+ . " print('PC scores:');print(pca\$rotation[selected_gene_indices,i])\n"
+ . " selected_genes_matrix = prin_comp_data[selected_gene_indices,]\n"
#. " print(selected_genes_matrix)\n"
. " pc_color_bar_vals = pcscore_mat_vals[selected_gene_indices,i]\n"
. " print(pc_color_bar_vals)\n"
@@ -1844,13 +1841,7 @@ sub add_prin_comp_heatmaps {
## Include a heatmap containing all selected genes across all PCs.
- $Rscript .= "all_selected_genes_matrix = data[uniq_genes,]\n";
- #if(! $LOG2) {
- # $Rscript .= "all_selected_genes_matrix = log2(all_selected_genes_matrix + 1)\n";
- #}
- if ($CENTER) {
- $Rscript .= "all_selected_genes_matrix = t(scale(t(all_selected_genes_matrix), scale=F))\n";
- }
+ $Rscript .= "all_selected_genes_matrix = prin_comp_data[uniq_genes,]\n";
$Rscript .= "write.table(all_selected_genes_matrix, file=paste(\"$output_prefix\", '.PC_all','_extreme',$num_top_genes_PC_extreme,'.matrix', sep=''), quote=F, sep=\"\t\")\n";
$Rscript .= "heatmap.3(all_selected_genes_matrix, col=greenred(256), scale='none', density.info=\"none\", trace=\"none\", key=TRUE, keysize=1.2, cexCol=1, margins=c(10,10), cex.main=0.75, cexRow=0.5, main=paste('heatmap for ALL selected ', $num_top_genes_PC_extreme, ' extreme of all PCs')";
@@ -1875,7 +1866,10 @@ sub add_top_loadings_pc_heatmap {
#kruskal-wallis test
# subtracting out PC
-
+
+ die "Error - temporarily discontinued option and will revisit later.";
+ ## TODO: update code based on new use of pca and include proper tests
+
my $Rscript = "abs_loadings = abs(pc\$scores[,1:$prin_comp])\n"
. "max_loadings = apply(abs_loadings, 1, max)\n"
. "ordered_loadings = rev(order(max_loadings))\n"
diff --git a/Analysis/DifferentialExpression/ROKU.pl b/Analysis/DifferentialExpression/ROKU.pl
index 704e6a9..1791de1 100755
--- a/Analysis/DifferentialExpression/ROKU.pl
+++ b/Analysis/DifferentialExpression/ROKU.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use BHStats;
my $usage = "usage: $0 fpkm.matrix\n\n";
diff --git a/Analysis/DifferentialExpression/TissueEnrichment/DE_graph_to_dot.pl b/Analysis/DifferentialExpression/TissueEnrichment/DE_graph_to_dot.pl
new file mode 100755
index 0000000..2b43948
--- /dev/null
+++ b/Analysis/DifferentialExpression/TissueEnrichment/DE_graph_to_dot.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $usage = "usage: $0 DE.graph\n\n";
+# The graph file path is the only argument; the script dies with the usage text when it is missing.
+my $DE_graph = $ARGV[0] or die $usage;
+
+
+=color_panel
+
+> colorpanel(10, 'black', 'purple', 'red')
+ [1] "#000000" "#28083C" "#501078" "#7818B4" "#A020F0" "#A020F0" "#B818B4"
+ [8] "#D01078" "#E7083C" "#FF0000"
+
+=cut
+
+# Ten-entry black -> purple -> red palette, matching the colorpanel() output quoted in the =color_panel block above.
+my @colors = (
+ "#000000", "#28083C", "#501078", "#7818B4", "#A020F0", "#A020F0", "#B818B4",
+ "#D01078", "#E7083C", "#FF0000"
+ );
+
+
+main: {
+    # Emit the first record of the DE graph file as a Graphviz digraph on STDOUT.
+    # Record layout: feature id, then tab-separated 'from,to,logFC' triples.
+    open (my $fh, "<", $DE_graph) or die "Error, cannot open file $DE_graph: $!";
+    my $line = <$fh>;
+    close $fh;
+    $line = "" unless defined $line; # empty file => empty digraph, no undef warnings
+    chomp $line;
+    print "digraph G {\n";
+
+    my @x = split(/\t/, $line);
+    shift @x; # drop the leading feature identifier
+    foreach my $pair (@x) {
+        my ($from, $to, $logFC) = split(/,/, $pair);
+        # round logFC to the nearest palette slot and clamp to the palette bounds,
+        # so a negative value can no longer index @colors from its end
+        my $color_index = int($logFC + 0.5);
+        if ($color_index < 0) { $color_index = 0; }
+        if ($color_index > $#colors) { $color_index = $#colors; }
+        my $color = $colors[$color_index];
+
+        print " $from->$to\[color=\"$color\"]\n";
+    }
+    print "}\n";
+
+    exit(0);
+}
+
+
+
diff --git a/Analysis/DifferentialExpression/TissueEnrichment/DE_results_to_pairwise_summary.pl b/Analysis/DifferentialExpression/TissueEnrichment/DE_results_to_pairwise_summary.pl
new file mode 100755
index 0000000..3d84697
--- /dev/null
+++ b/Analysis/DifferentialExpression/TissueEnrichment/DE_results_to_pairwise_summary.pl
@@ -0,0 +1,160 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Data::Dumper;
+
+my $usage = "usage: $0 sample_avg_expr.matrix edgeR_directory/ [FDR=0.05]\n\n";
+
+# positional arguments: matrix file, DE results directory, optional FDR ceiling
+my ($sample_expr_matrix, $DE_dir, $MAX_FDR) = @ARGV;
+$sample_expr_matrix or die $usage;
+$DE_dir or die $usage;
+$MAX_FDR = 0.05 unless defined $MAX_FDR;
+
+# every *.DE_results file found in the directory is one comparison to summarize
+my @DE_result_files = glob("$DE_dir/*.DE_results");
+if (! @DE_result_files) {
+    die "Error, cannot find \*.DE_results files at $DE_dir ";
+}
+
+main: {
+    # For every row with FDR <= $MAX_FDR in every pairwise DE result file, print
+    # the feature with both samples' log2(expr+1) values and their difference.
+    my %gene_to_sample_expr_val = &parse_expression_matrix($sample_expr_matrix);
+
+    print join("\t", "#feature", "sample_A", "sample_B", "log2(exprA)", "log2(exprB)", "logFC", "FDR") . "\n";
+
+    foreach my $DE_result_file (@DE_result_files) {
+        print STDERR "-processing DE file: $DE_result_file\n";
+
+        # sample names come from the '.<A>_vs_<B>.<tag>.DE_results' part of the file name
+        $DE_result_file =~ /\.([^\.\/]+)_vs_([^\.\/]+)\.[^\.]+\.DE_results/ or die "Error, cannot parse filename: $DE_result_file";
+        my $sample_A = $1;
+        my $sample_B = $2;
+
+        open (my $fh, "<", $DE_result_file) or die "Error, cannot open file $DE_result_file: $!";
+        my $header = <$fh>;
+        chomp $header;
+        my @header_fields = split(/\t/, $header);
+        my $FDR_field = undef;
+
+        my $line_counter = 0;
+        while(<$fh>) {
+            $line_counter++;
+            chomp;
+            my @x = split(/\t/);
+
+            # match up header with data fields.
+            if ($line_counter == 1) {
+                if (scalar(@header_fields) == scalar(@x) -1) {
+                    unshift(@header_fields, 'id'); # header is one field short: name the leading column
+                }
+                elsif (scalar(@header_fields) != scalar(@x)) {
+                    die "Error, disconnect between header line and data line, number of fields are unequal and header isn't one short:\n"
+                        . "header: @header_fields\n"
+                        . "line: @x\n";
+                }
+                for (my $i = 0; $i <= $#header_fields; $i++) {
+                    if ($header_fields[$i] eq 'FDR') {
+                        $FDR_field = $i;
+                    }
+                }
+                # test definedness, not truth: index 0 is a legal column position
+                unless (defined $FDR_field) {
+                    die "Error, couldn't identify column corresponding to FDR";
+                }
+            }
+
+            my $feature = $x[0];
+            my $feature_expr = $gene_to_sample_expr_val{$feature};
+            unless (defined $feature_expr) {
+                die "Error, no expression values stored for [$feature] ";
+            }
+
+            my $FDR = $x[$FDR_field];
+            if ($FDR <= $MAX_FDR) {
+                my $expr_sample_A = $feature_expr->{$sample_A};
+                my $expr_sample_B = $feature_expr->{$sample_B};
+
+                unless (defined $expr_sample_A && defined $expr_sample_B) {
+                    die "Error, no expr value for feature: $feature, $sample_A [$expr_sample_A] or $sample_B [$expr_sample_B] " . Dumper($gene_to_sample_expr_val{$feature});
+                }
+                my $log_expr_sample_A = log($expr_sample_A+1)/log(2);
+                my $log_expr_sample_B = log($expr_sample_B+1)/log(2);
+
+                my $log_FC = sprintf("%.2f", $log_expr_sample_A - $log_expr_sample_B);
+
+                print join("\t", $feature, $sample_A, $sample_B, $log_expr_sample_A, $log_expr_sample_B, $log_FC, $FDR) . "\n";
+            }
+        }
+        close $fh;
+    }
+
+    print STDERR "\nDone\n\n";
+    exit(0);
+}
+
+
+####
+sub parse_expression_matrix {
+    # Load a tab-delimited expression matrix into feature => { sample => value }.
+    # The header supplies the sample names (leading whitespace stripped), so each
+    # data row must hold exactly one more field -- the feature name -- than it.
+    # Returns the nested hash as a flat list; dies on unreadable or ragged input.
+    my ($expr_matrix_file) = @_;
+
+    print STDERR "\nReading matrix: $expr_matrix_file ... ";
+
+    # Count lines in-process instead of via `wc -l $file`: that command passed the
+    # file name through the shell unquoted, breaking on spaces or metacharacters.
+    open (my $count_fh, "<", $expr_matrix_file) or die "Error, cannot open file $expr_matrix_file: $!";
+    my $num_lines = 0;
+    $num_lines++ while (<$count_fh>);
+    close $count_fh;
+    $num_lines or die "Error, cannot count number of lines from: $expr_matrix_file";
+
+    print STDERR " $num_lines rows of matrix detected.\n\n";
+
+    my %gene_to_sample_expr_val;
+
+    # first line names the samples
+    open (my $fh, "<", $expr_matrix_file) or die "Error, cannot open file $expr_matrix_file: $!";
+    my $header = <$fh>;
+    chomp $header;
+    $header =~ s/^\s+//;
+    my @sample_names = split(/\t/, $header);
+
+    my $counter = 0;
+    while (<$fh>) {
+        chomp;
+        my @x = split(/\t/);
+        my $feature_name = shift @x;
+
+        # a ragged row would silently shift values onto the wrong samples
+        unless (scalar @x == scalar @sample_names) {
+            die "Error, number of samples: " . scalar (@sample_names) . " doesn't match number of values read: " . scalar(@x) . " ";
+        }
+
+        for (my $i = 0; $i <= $#sample_names; $i++) {
+            my $sample = $sample_names[$i];
+            my $val = $x[$i];
+
+            $gene_to_sample_expr_val{$feature_name}->{$sample} = $val;
+        }
+
+        # progress report every 10,000 rows
+        $counter++;
+        if ($counter % 10000 == 0) {
+            my $pct_done = sprintf("%.2f", $counter/$num_lines * 100);
+            print STDERR "\r[$pct_done %] matrix read. ";
+        }
+    }
+
+    close $fh;
+
+    print STDERR "\n\nDone reading matrix.\n";
+
+    return(%gene_to_sample_expr_val);
+}
+
diff --git a/Analysis/DifferentialExpression/TissueEnrichment/README.md b/Analysis/DifferentialExpression/TissueEnrichment/README.md
new file mode 100644
index 0000000..98f31a6
--- /dev/null
+++ b/Analysis/DifferentialExpression/TissueEnrichment/README.md
@@ -0,0 +1,5 @@
+## documentation stub (to be completed very soon!)
+
+trinityrnaseq/Analysis/DifferentialExpression/TissueEnrichment/DE_results_to_pairwise_summary.pl transcripts.TMM.fpkm.avg_reps.matrix . > DE.pairwise_summary
+
+trinityrnaseq/Analysis/DifferentialExpression/TissueEnrichment/pairwise_DE_summary_to_DE_classification.pl DE.pairwise_summary
diff --git a/Analysis/DifferentialExpression/TissueEnrichment/pairwise_DE_summary_to_DE_classification.pl b/Analysis/DifferentialExpression/TissueEnrichment/pairwise_DE_summary_to_DE_classification.pl
new file mode 100755
index 0000000..f0eb0d7
--- /dev/null
+++ b/Analysis/DifferentialExpression/TissueEnrichment/pairwise_DE_summary_to_DE_classification.pl
@@ -0,0 +1,299 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+
+my $usage = "usage: $0 summary.dat [outprefix=argv[0]]\n\n";
+my ($file, $out_prefix) = @ARGV; # summary file, then an optional output prefix
+$file or die $usage;
+$out_prefix ||= $file; # outputs are named after the input unless a prefix is given
+
+main: {
+ # Classifies every feature of the pairwise summary by the samples it is up/down in, writing <prefix>.class_up_priority, .class_down_priority, .graph and .class_up_priority.ordered_by_expression.
+ my %up_to_down;
+ my %feature_to_sample_expr_val;
+ # Input rows: feature, sampleA, sampleB, exprA, exprB, log fold change, and a seventh value stored below as Pval; lines starting with '#' are skipped.
+ open (my $fh, $file) or die $!;
+ while (<$fh>) {
+ if (/^\#/) { next; }
+ chomp;
+ my ($feature, $sampleA, $sampleB, $exprA, $exprB, $log_fold_change, $post_prob) = split(/\t/);
+
+
+ $feature_to_sample_expr_val{$feature}->{$sampleA} = $exprA;
+ $feature_to_sample_expr_val{$feature}->{$sampleB} = $exprB;
+ # sampleA is the 'up' sample only for a strictly positive fold change; zero or negative makes sampleB 'up'
+ my ($up, $down) = ($log_fold_change > 0) ? ($sampleA, $sampleB) : ($sampleB, $sampleA);
+ # keep only the magnitude: direction is already encoded by the up -> down key order
+ $log_fold_change = abs($log_fold_change);
+
+ $up_to_down{$feature}->{$up}->{$down} = { logFC => $log_fold_change,
+ Pval => $post_prob,
+ };
+
+
+
+ }
+
+ # three per-feature output files, each opened for (over)writing
+ my $class_up_outfile = "$out_prefix.class_up_priority";
+ open (my $class_up_ofh, ">$class_up_outfile") or die "Error, cannot write to $class_up_outfile";
+
+ my $class_down_outfile = "$out_prefix.class_down_priority";
+ open (my $class_down_ofh, ">$class_down_outfile") or die "Error, cannot write to $class_down_outfile";
+
+ my $graph_outfile = "$out_prefix.graph";
+ open (my $graph_ofh, ">$graph_outfile") or die "Error, cannot write to $graph_outfile";
+
+
+ my %up_cat_to_dat;
+ my %up_class_counter;
+ # keys() order is unspecified in Perl, so features are written in no particular order
+ foreach my $feature (keys %up_to_down) {
+
+ my ($top_up_list, $top_down_list) = &get_top_up_down_list($up_to_down{$feature}, 'up_priority');
+
+ my $up_class = join(",", sort @$top_up_list);
+ my $down_class = join(",", sort @$top_down_list);
+
+ print $class_up_ofh join("\t", $feature, $up_class, $down_class) . "\n";
+
+
+
+ &write_graph_entry($feature, $up_to_down{$feature}, $graph_ofh);
+ # mean (up_class_expr) and minimum (min_up_expr) expression over the samples in the top 'up' list
+ my $up_class_expr = 0;
+
+ my $min_up_expr = undef;
+
+ foreach my $up ( @$top_up_list) {
+ my $expr = $feature_to_sample_expr_val{$feature}->{$up};
+ $up_class_expr += $expr;
+
+ if ( (! defined $min_up_expr) || $min_up_expr > $expr) {
+ $min_up_expr = $expr;
+ }
+ }
+ $up_class_expr /= scalar(@$top_up_list);
+
+ # largest expression among the top 'down' samples; starts at 0, so only positive values can raise it
+ my $max_down_expr = 0;
+ foreach my $down (@$top_down_list) {
+ my $expr = $feature_to_sample_expr_val{$feature}->{$down};
+ if ($expr > $max_down_expr) {
+ $max_down_expr = $expr;
+ }
+ }
+
+ # priority: gap between the weakest 'up' and the strongest 'down' sample, relative to the latter
+ my $delta_expr = $min_up_expr - $max_down_expr;
+ my $priority = $delta_expr / ($max_down_expr + 1); # the 1 is a pseudocount to avoid ultra-small value comparisons.
+
+ push (@{$up_cat_to_dat{$up_class}}, { up_expr => $up_class_expr,
+ feature => $feature,
+ up_class => $up_class,
+ down_class => $down_class,
+
+ min_up_expr => $min_up_expr,
+ max_down_expr => $max_down_expr,
+
+ delta_expr => $delta_expr,
+
+ priority => $priority,
+
+ });
+
+
+ # tally how many features fall into each up-class (used to order the final report)
+ $up_class_counter{$up_class}++;
+
+
+
+ {
+ ## similarly look for transcripts that are downregulated as hallmark features
+
+ my ($top_up_list, $top_down_list) = &get_top_up_down_list($up_to_down{$feature}, 'down_priority');
+
+ my $up_class = join(",", sort @$top_up_list);
+ my $down_class = join(",", sort @$top_down_list);
+
+
+ print $class_down_ofh join("\t", $feature, $up_class, $down_class) . "\n";
+
+ }
+
+ }
+
+ close $class_up_ofh;
+ close $class_down_ofh;
+ close $graph_ofh;
+
+
+
+ ## make prioritized list
+ open (my $ofh_prioritized, ">$out_prefix.class_up_priority.ordered_by_expression") or die $!;
+
+ foreach my $up_class (reverse sort {$up_class_counter{$a}<=>$up_class_counter{$b}} keys %up_cat_to_dat) {
+ # classes holding the most features come first; within a class, highest priority first
+
+
+ my @feature_structs = @{$up_cat_to_dat{$up_class}};
+
+ @feature_structs = reverse sort {$a->{priority}<=>$b->{priority}} @feature_structs;
+
+
+ my $num_features = scalar(@feature_structs);
+ print $ofh_prioritized "## $up_class ($num_features)\n";
+
+ foreach my $feature_struct (@feature_structs) {
+
+ my $feature = $feature_struct->{feature};
+
+ my @up_classes = split(/,/, $feature_struct->{up_class});
+ my @down_classes = split(/,/, $feature_struct->{down_class});
+ # list samples from highest to lowest expression, each rendered as name(expr)
+ my @up_class_text;
+
+ @up_classes = reverse sort {$feature_to_sample_expr_val{$feature}->{$a} <=> $feature_to_sample_expr_val{$feature}->{$b}} @up_classes;
+
+ foreach my $up_class (@up_classes) {
+
+ my $expr = sprintf("%.2f", $feature_to_sample_expr_val{$feature}->{$up_class});
+ push (@up_class_text, "$up_class\($expr)");
+ }
+
+ my @down_class_text;
+ @down_classes = reverse sort {$feature_to_sample_expr_val{$feature}->{$a} <=> $feature_to_sample_expr_val{$feature}->{$b}} @down_classes;
+ foreach my $down_class (@down_classes) {
+ my $expr = sprintf("%.2f", $feature_to_sample_expr_val{$feature}->{$down_class});
+ push (@down_class_text, "$down_class\($expr)");
+ }
+ # NOTE(review): the newline is concatenated onto the last join() argument; the output equals appending it after the join
+ print $ofh_prioritized join("\t", $feature, join(",", @up_class_text), join(",", @down_class_text) . "\n");
+ }
+ }
+
+ close $ofh_prioritized;
+
+
+ exit(0);
+
+}
+
+####
+sub get_top_up_down_list {
+    # For one feature's up-sample => { down-sample => edge } map, find the
+    # sample(s) with the most outgoing edges and the union of their targets.
+    # Returns (\@top, \@bottom), each sorted by name. Under 'down_priority'
+    # the map is inverted first and the two lists are swapped before
+    # returning, so the first list always holds the 'up' samples.
+
+    my ($up_down_href, $priority_direction) = @_;
+
+    my $inverted = ($priority_direction eq 'down_priority');
+
+    if ($inverted) {
+        $up_down_href = &reverse_updown_list($up_down_href);
+    }
+
+    # one record per source sample: its targets and how many there are
+    my @structs;
+
+    for my $source (keys %$up_down_href) {
+
+        my @targets = keys %{ $up_down_href->{$source} };
+
+        my $record = {
+            up   => $source,
+            down => [@targets],
+            num  => scalar @targets,
+        };
+
+        push (@structs, $record);
+    }
+
+    # rank by target count, largest first; descending order puts every
+    # record tied for first place at the front of the list
+    my @ranked = sort { $b->{num} <=> $a->{num} } @structs;
+
+    my $leader = shift @ranked;
+    my $top_num = $leader->{num};
+
+    my @top_structs = ($leader, grep { $_->{num} == $top_num } @ranked);
+
+    # gather the winning sources and the distinct targets they reach
+    my @top;
+    my %bottom;
+
+    for my $winner (@top_structs) {
+
+        push (@top, $winner->{up});
+
+        for my $target (@{ $winner->{down} }) {
+            $bottom{$target}++;
+        }
+    }
+
+    @top = sort @top;
+
+    my @bottom = sort keys %bottom;
+
+    # the inverted map ranked the 'down' samples, so swap the lists back
+    if ($inverted) {
+
+        my @orig_top = @top;
+        @top = @bottom;
+        @bottom = @orig_top;
+
+    }
+
+
+    return(\@top, \@bottom);
+}
+
+
+####
+sub write_graph_entry {
+    # Append one tab-delimited record to the graph file: the feature id followed
+    # by an 'up,down,logFC' triple for every up -> down edge stored for it.
+    my ($feature, $graph_href, $graph_ofh) = @_;
+
+    my @nodes;
+    for my $up_sample (keys %$graph_href) {
+        my $down_href = $graph_href->{$up_sample};
+
+        for my $down_sample (keys %$down_href) {
+            my $logFC = $down_href->{$down_sample}->{logFC};
+            push (@nodes, join(",", $up_sample, $down_sample, $logFC));
+        }
+    }
+
+    # edge order follows hash iteration order and is therefore unspecified
+    my $record = join("\t", $feature, @nodes);
+    print $graph_ofh "$record\n";
+
+    return;
+}
+
+
+
+####
+sub reverse_updown_list {
+    # Invert a two-level up => { down => edge } hash into down => { up => edge }.
+    # Edge values are assigned as-is, so any references stay shared with the input.
+    # Returns a reference to the new hash; the input hash is not modified.
+    my ($up_down_href) = @_;
+
+    my %down_up_href;
+    for my $up (keys %$up_down_href) {
+        my $edges = $up_down_href->{$up};
+        # re-file each edge of this source under its target
+        for my $down (keys %$edges) {
+            $down_up_href{$down}->{$up} = $edges->{$down};
+        }
+    }
+
+    return(\%down_up_href);
+}
+
diff --git a/Analysis/DifferentialExpression/analyze_diff_expr.pl b/Analysis/DifferentialExpression/analyze_diff_expr.pl
index e451265..fce0ad5 100755
--- a/Analysis/DifferentialExpression/analyze_diff_expr.pl
+++ b/Analysis/DifferentialExpression/analyze_diff_expr.pl
@@ -232,7 +232,7 @@ main: {
sub cluster_diff_expressed_transcripts {
my ($diff_expr_matrix_file) = @_;
- my $cmd = "$FindBin::Bin/PtR -m $diff_expr_matrix_file --log2 --heatmap --min_colSums 0 --min_rowSums 0 --gene_dist euclidean --sample_dist euclidean --sample_cor_matrix --center_rows --save @ARGV";
+ my $cmd = "$FindBin::RealBin/PtR -m $diff_expr_matrix_file --log2 --heatmap --min_colSums 0 --min_rowSums 0 --gene_dist euclidean --sample_dist euclidean --sample_cor_matrix --center_rows --save @ARGV";
if ($samples_file) {
$cmd .= " -s $samples_file";
@@ -387,10 +387,10 @@ sub parse_result_files_find_diffExp {
## do GO enrichment analysis
if ($examine_GO_enrichment_flag) {
- my $cmd = "$FindBin::Bin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condA_up_subset_file";
+ my $cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condA_up_subset_file";
&process_cmd($cmd) if $countA;
- $cmd = "$FindBin::Bin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condB_up_subset_file";
+ $cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condB_up_subset_file";
&process_cmd($cmd) if $countB;
}
@@ -463,7 +463,7 @@ sub write_matrix_generate_heatmap {
}
close $ofh;
- my $cmd = "$FindBin::Bin/PtR -m $matrix_out_file -s $pairwise_samples_file --log2 --heatmap --min_colSums 0 --min_rowSums 0 --gene_dist euclidean --sample_dist euclidean @ARGV ";
+ my $cmd = "$FindBin::RealBin/PtR -m $matrix_out_file -s $pairwise_samples_file --log2 --heatmap --min_colSums 0 --min_rowSums 0 --gene_dist euclidean --sample_dist euclidean @ARGV ";
if ($samples_file) {
$cmd .= " -s $samples_file ";
diff --git a/Analysis/DifferentialExpression/cluster_sample_data/cleanme.pl b/Analysis/DifferentialExpression/cluster_sample_data/cleanme.pl
index 3380696..5e2be77 100755
--- a/Analysis/DifferentialExpression/cluster_sample_data/cleanme.pl
+++ b/Analysis/DifferentialExpression/cluster_sample_data/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
diff --git a/Analysis/DifferentialExpression/cut_tree_into_clusters.pl b/Analysis/DifferentialExpression/cut_tree_into_clusters.pl
index 4e3805e..65ac763 100755
--- a/Analysis/DifferentialExpression/cut_tree_into_clusters.pl
+++ b/Analysis/DifferentialExpression/cut_tree_into_clusters.pl
@@ -87,7 +87,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
- print $ofh "source(\"$FindBin::Bin/R/heatmap.3.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
diff --git a/Analysis/DifferentialExpression/define_clusters_by_cutting_tree.pl b/Analysis/DifferentialExpression/define_clusters_by_cutting_tree.pl
index c40f19d..94475d2 100755
--- a/Analysis/DifferentialExpression/define_clusters_by_cutting_tree.pl
+++ b/Analysis/DifferentialExpression/define_clusters_by_cutting_tree.pl
@@ -88,7 +88,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
- print $ofh "source(\"$FindBin::Bin/R/heatmap.3.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
@@ -114,8 +114,11 @@ main: {
print $ofh "gene_partition_assignments <- cutree(as.hclust(hc_genes), h=$pct_height/100*max(hc_genes\$height))\n";
$core_filename = "clusters_fixed_P_${pct_height}.heatmap";
$outdir = basename($R_data_file) . ".clusters_fixed_P_" . $pct_height;
-
}
+
+ # write gene order in heatmap clustering
+ print $ofh "write.table(gene_partition_assignments[hc_genes\$order], file=\"$core_filename.heatmap_gene_order.txt\", quote=F, sep='\t')\n";
+
print $ofh "max_cluster_count = max(gene_partition_assignments)\n";
print $ofh "outdir = \"" . $outdir . "\"\n";
@@ -186,7 +189,7 @@ main: {
chdir $outdir or die "Error, cannot cd into $outdir";
- my $cmd = "$FindBin::Bin/plot_expression_patterns.pl subcluster\*fpkm.matrix";
+ my $cmd = "$FindBin::RealBin/plot_expression_patterns.pl subcluster\*fpkm.matrix";
&process_cmd($cmd);
diff --git a/Analysis/DifferentialExpression/diff_expr_analysis_to_heatmap_html.pl b/Analysis/DifferentialExpression/diff_expr_analysis_to_heatmap_html.pl
index 0feb3a3..d47eda2 100755
--- a/Analysis/DifferentialExpression/diff_expr_analysis_to_heatmap_html.pl
+++ b/Analysis/DifferentialExpression/diff_expr_analysis_to_heatmap_html.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use CanvasXpress::Heatmap;
@@ -80,7 +80,7 @@ main: {
open (my $ofh, ">$R_script") or die "Error, cannot write to file $R_script";
- print $ofh "source(\"$FindBin::Bin/R/get_cluster_info.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/get_cluster_info.R\")\n";
print $ofh "get_cluster_info(\"$R_data_file\")\n";
close $ofh;
diff --git a/Analysis/DifferentialExpression/diff_express.cgi b/Analysis/DifferentialExpression/diff_express.cgi
index 5b6905a..b591769 100755
--- a/Analysis/DifferentialExpression/diff_express.cgi
+++ b/Analysis/DifferentialExpression/diff_express.cgi
@@ -8,7 +8,7 @@ use CGI::Carp qw(fatalsToBrowser);
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/PerlLib");
+use lib ("$FindBin::RealBin/PerlLib");
use CanvasXpress::Heatmap;
use BHStats;
diff --git a/Analysis/DifferentialExpression/extract_GO_enriched_genes.pl b/Analysis/DifferentialExpression/extract_GO_enriched_genes.pl
index 2ee582e..cccd62d 100755
--- a/Analysis/DifferentialExpression/extract_GO_enriched_genes.pl
+++ b/Analysis/DifferentialExpression/extract_GO_enriched_genes.pl
@@ -101,7 +101,7 @@ main: {
}
## generate a heatmap
- my $cmd = "$FindBin::Bin/PtR -m $fpkm_outfile --log2 --heatmap --gene_dist euclidean --sample_dist euclidean";
+ my $cmd = "$FindBin::RealBin/PtR -m $fpkm_outfile --log2 --heatmap --gene_dist euclidean --sample_dist euclidean";
if ($samples_file) {
$cmd .= " -s $samples_file ";
}
diff --git a/Analysis/DifferentialExpression/get_transcript_lengths.pl b/Analysis/DifferentialExpression/get_transcript_lengths.pl
index 99f7d0c..b65d49d 100755
--- a/Analysis/DifferentialExpression/get_transcript_lengths.pl
+++ b/Analysis/DifferentialExpression/get_transcript_lengths.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
diff --git a/Analysis/DifferentialExpression/merge_matrices.pl b/Analysis/DifferentialExpression/merge_matrices.pl
index 200faae..7d7e118 100755
--- a/Analysis/DifferentialExpression/merge_matrices.pl
+++ b/Analysis/DifferentialExpression/merge_matrices.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Data::Dumper;
@@ -24,11 +24,15 @@ my %genes;
main: {
foreach my $matrix (@matrices) {
-
- &parse_matrix($matrix);
+ if (-s $matrix) { # -s is true only when the file exists and has non-zero size
+ &parse_matrix($matrix);
+ }
+ else {
+ print STDERR "WARNING: cannot locate matrix file (or file is empty): $matrix\n";
+ }
}
-
+
## output new matrix:
my @colnames = sort keys %matrix;
diff --git a/Analysis/DifferentialExpression/remove_batch_effects_from_count_matrix.pl b/Analysis/DifferentialExpression/remove_batch_effects_from_count_matrix.pl
index 14669fb..bc9d165 100755
--- a/Analysis/DifferentialExpression/remove_batch_effects_from_count_matrix.pl
+++ b/Analysis/DifferentialExpression/remove_batch_effects_from_count_matrix.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Data::Dumper;
diff --git a/Analysis/DifferentialExpression/replicates_to_sample_averages_matrix.pl b/Analysis/DifferentialExpression/replicates_to_sample_averages_matrix.pl
index 2c68684..cef04d4 100755
--- a/Analysis/DifferentialExpression/replicates_to_sample_averages_matrix.pl
+++ b/Analysis/DifferentialExpression/replicates_to_sample_averages_matrix.pl
@@ -89,7 +89,7 @@ samples = read.table("$samples_file", header=F, check.names=F)
sample_types = as.vector(unique(samples[,1]))
nsamples = length(sample_types)
-data = read.table("$matrix_file", header=T, row.names=1, com='', nrows=5, check.names=F)
+data = read.table("$matrix_file", header=T, row.names=1, com='', nrows=10000, check.names=F)
classes = sapply(data,class)
data = read.table("$matrix_file", header=T, row.names=1, com='', colClasses=classes, check.names=F)
data = as.matrix(data)
diff --git a/Analysis/DifferentialExpression/run_DE_analysis.pl b/Analysis/DifferentialExpression/run_DE_analysis.pl
index 0a7fd4a..80e64b5 100755
--- a/Analysis/DifferentialExpression/run_DE_analysis.pl
+++ b/Analysis/DifferentialExpression/run_DE_analysis.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Data::Dumper;
@@ -413,7 +413,7 @@ sub run_edgeR_sample_pair {
print $ofh "write.table(tTags, file=\'$output_prefix.edgeR.DE_results\', sep='\t', quote=F, row.names=T)\n";
## generate MA and Volcano plots
- print $ofh "source(\"$FindBin::Bin/R/rnaseq_plot_funcs.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.edgeR.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "result_table = tTags\$table\n";
print $ofh "plot_MA_and_Volcano(result_table\$logCPM, result_table\$logFC, result_table\$FDR)\n";
@@ -502,7 +502,7 @@ sub run_DESeq2_sample_pair {
print $ofh "write.table(as.data.frame(res[order(res\$pvalue),]), file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE, row.names=F)\n";
## generate MA and Volcano plots
- print $ofh "source(\"$FindBin::Bin/R/rnaseq_plot_funcs.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.DESeq2.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(log2(res\$baseMean+1), res\$log2FoldChange, res\$padj)\n";
print $ofh "dev.off()\n";
@@ -577,7 +577,7 @@ sub run_limma_voom_sample_pair {
## generate MA and Volcano plots
print $ofh "# MA and volcano plots\n";
- print $ofh "source(\"$FindBin::Bin/R/rnaseq_plot_funcs.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.voom.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(tTags2\$logCPM, tTags\$logFC, tTags\$'adj.P.Val')\n";
print $ofh "dev.off()\n";
@@ -673,7 +673,7 @@ sub run_ROTS_sample_pair {
## generate MA and Volcano plots
print $ofh "# MA and volcano plots\n";
- print $ofh "source(\"$FindBin::Bin/R/rnaseq_plot_funcs.R\")\n";
+ print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.voom.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(final_table\$logCPM, final_table\$logFC, final_table\$FDR)\n";
print $ofh "dev.off()\n";
diff --git a/Analysis/DifferentialExpression/run_GOseq.pl b/Analysis/DifferentialExpression/run_GOseq.pl
index fdf03a5..aa4a5ba 100755
--- a/Analysis/DifferentialExpression/run_GOseq.pl
+++ b/Analysis/DifferentialExpression/run_GOseq.pl
@@ -155,7 +155,7 @@ main: {
close $ofh;
- my $cmd = "R --vanilla -q < $Rscript";
+ my $cmd = "R --no-save --no-restore --no-site-file --no-init-file --quiet < $Rscript";
my $ret = system($cmd);
if ($ret) {
die "Error, cmd: $cmd died with ret $ret";
diff --git a/Analysis/DifferentialExpression/run_TMM_normalization_write_FPKM_matrix.pl b/Analysis/DifferentialExpression/run_TMM_normalization_write_FPKM_matrix.pl
index a4a7634..b4caefb 100755
--- a/Analysis/DifferentialExpression/run_TMM_normalization_write_FPKM_matrix.pl
+++ b/Analysis/DifferentialExpression/run_TMM_normalization_write_FPKM_matrix.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Data::Dumper;
@@ -104,7 +104,7 @@ sub run_TMM {
my $tmm_norm_script = "__tmp_runTMM.R";
open (my $ofh, ">$tmm_norm_script") or die "Error, cannot write to $tmm_norm_script";
- #print $ofh "source(\"$FindBin::Bin/R/edgeR_funcs.R\")\n";
+ #print $ofh "source(\"$FindBin::RealBin/R/edgeR_funcs.R\")\n";
print $ofh "library(edgeR)\n\n";
diff --git a/Analysis/DifferentialExpression/stratify_diff_expression.pl b/Analysis/DifferentialExpression/stratify_diff_expression.pl
index 7d2efdc..cf0075e 100755
--- a/Analysis/DifferentialExpression/stratify_diff_expression.pl
+++ b/Analysis/DifferentialExpression/stratify_diff_expression.pl
@@ -21,7 +21,7 @@ for my $fold_change (1..8) {
for my $pvalue (2..10) {
- my $cmd = "$FindBin::Bin/analyze_diff_expr.pl --matrix $fpkm_matrix_file -C $fold_change -P 1e-$pvalue";
+ my $cmd = "$FindBin::RealBin/analyze_diff_expr.pl --matrix $fpkm_matrix_file -C $fold_change -P 1e-$pvalue";
&process_cmd($cmd);
diff --git a/Analysis/DifferentialExpression/subcluster_to_canvasXpress_html.pl b/Analysis/DifferentialExpression/subcluster_to_canvasXpress_html.pl
index 774fd20..b810e80 100755
--- a/Analysis/DifferentialExpression/subcluster_to_canvasXpress_html.pl
+++ b/Analysis/DifferentialExpression/subcluster_to_canvasXpress_html.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib("$FindBin::Bin/../TrinityWeb/cgi-bin/PerlLib/");
+use lib("$FindBin::RealBin/../TrinityWeb/cgi-bin/PerlLib/");
use CanvasXpress::Heatmap;
use CanvasXpress::PlotOnLoader;
use CanvasXpress::Line;
diff --git a/Analysis/FL_reconstruction_analysis/FL_trans_analysis_pipeline.pl b/Analysis/FL_reconstruction_analysis/FL_trans_analysis_pipeline.pl
index 7e939e0..2852580 100755
--- a/Analysis/FL_reconstruction_analysis/FL_trans_analysis_pipeline.pl
+++ b/Analysis/FL_reconstruction_analysis/FL_trans_analysis_pipeline.pl
@@ -8,7 +8,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use FindBin;
use File::Basename;
-$ENV{PATH} .= ":$FindBin::Bin/../../trinity-plugins/slclust/bin/";
+$ENV{PATH} .= ":$FindBin::RealBin/../../trinity-plugins/slclust/bin/";
my $help_flag;
@@ -87,7 +87,7 @@ if ($help_flag) {
die $usage;
}
-my $util_dir = "$FindBin::Bin/util";
+my $util_dir = "$FindBin::RealBin/util";
my $found_all_tools = 1;
my @required_tools = qw (blat slclust);
diff --git a/Analysis/FL_reconstruction_analysis/compute_oracle.pl b/Analysis/FL_reconstruction_analysis/compute_oracle.pl
index 240227c..c0e67cf 100755
--- a/Analysis/FL_reconstruction_analysis/compute_oracle.pl
+++ b/Analysis/FL_reconstruction_analysis/compute_oracle.pl
@@ -12,7 +12,7 @@ my $ref_transcripts_fasta = $ARGV[1] or die $usage;
my $SS_flag = $ARGV[2] || 0;
-my $cmd = "$FindBin::Bin/../../Inchworm/bin/inchworm "
+my $cmd = "$FindBin::RealBin/../../Inchworm/bin/inchworm "
. " --reads $reads_file "
. " --checkFastaPath $ref_transcripts_fasta ";
diff --git a/Analysis/FL_reconstruction_analysis/tier_gene_trans_alignments.pl b/Analysis/FL_reconstruction_analysis/tier_gene_trans_alignments.pl
index 6794e75..f553c39 100755
--- a/Analysis/FL_reconstruction_analysis/tier_gene_trans_alignments.pl
+++ b/Analysis/FL_reconstruction_analysis/tier_gene_trans_alignments.pl
@@ -16,14 +16,14 @@ my $SS = $ARGV[4] || 0;
main: {
## run blat:
- my $cmd = "$FindBin::Bin/../../util/process_BLAT_alignments.pl -g $genes_fasta -t $trans_fasta -I $max_intron --CPU $blat_cpu --KEEP_PSLX";
+ my $cmd = "$FindBin::RealBin/../../util/process_BLAT_alignments.pl -g $genes_fasta -t $trans_fasta -I $max_intron --CPU $blat_cpu --KEEP_PSLX";
&process_cmd($cmd);
$cmd = "cat blat_out_dir/*top_1 > blat.top_1.pslx";
&process_cmd($cmd);
- $cmd = "$FindBin::Bin/util/blat_top_tier_genes.pl blat.top_1.pslx $SS";
+ $cmd = "$FindBin::RealBin/util/blat_top_tier_genes.pl blat.top_1.pslx $SS";
&process_cmd($cmd);
exit(0);
diff --git a/Analysis/FL_reconstruction_analysis/util/blat_full_length_mappings.pl b/Analysis/FL_reconstruction_analysis/util/blat_full_length_mappings.pl
index 78c5dbb..0ee5970 100755
--- a/Analysis/FL_reconstruction_analysis/util/blat_full_length_mappings.pl
+++ b/Analysis/FL_reconstruction_analysis/util/blat_full_length_mappings.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use SingleLinkageClusterer;
use PSL_parser;
diff --git a/Analysis/FL_reconstruction_analysis/util/blat_top_tier_genes.pl b/Analysis/FL_reconstruction_analysis/util/blat_top_tier_genes.pl
index 55092a8..314f347 100755
--- a/Analysis/FL_reconstruction_analysis/util/blat_top_tier_genes.pl
+++ b/Analysis/FL_reconstruction_analysis/util/blat_top_tier_genes.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use PSL_parser;
require "overlapping_nucs.ph";
diff --git a/Butterfly/src/src/TransAssembly_allProbPaths.java b/Butterfly/src/src/TransAssembly_allProbPaths.java
index 33b2e44..929616c 100644
--- a/Butterfly/src/src/TransAssembly_allProbPaths.java
+++ b/Butterfly/src/src/TransAssembly_allProbPaths.java
@@ -905,7 +905,7 @@ public class TransAssembly_allProbPaths {
if (BFLY_GLOBALS.VERBOSE_LEVEL >= 15) {
- debugMes("Printing Pair Paths ------------------", 15);
+ debugMes("Printing Pair Paths Before DAG Overlap Layout ------------------", 15);
printPairPaths(combinedReadHash, "PairPaths at Init");
}
@@ -1192,6 +1192,22 @@ public class TransAssembly_allProbPaths {
HashMap<List<Integer>,HashMap<PairPath,Integer>> finalPathsToContainedReads = assignCompatibleReadsToPaths(FinalPaths_all,combinedReadHash);
+
+ if (BFLY_GLOBALS.VERBOSE_LEVEL >= 20) {
+
+ for (List<Integer> final_path : finalPathsToContainedReads.keySet()) {
+ HashMap<PairPath,Integer> contained_reads = finalPathsToContainedReads.get(final_path);
+ debugMes("PRELIM_FINAL_PATH:\n" + final_path + "\ncontains:", 20);
+ int sum_support = 0;
+ for (PairPath pp : contained_reads.keySet()) {
+ Integer read_support = contained_reads.get(pp);
+ debugMes(pp + "\tcount: " + read_support, 20);
+ sum_support += read_support;
+ }
+ debugMes("Total support: " + sum_support + "\n", 20);
+ }
+
+ }
HashMap<List<Integer>, Pair<Integer>> filtered_paths_to_keep = new HashMap<List<Integer>,Pair<Integer>>();
@@ -1374,10 +1390,10 @@ public class TransAssembly_allProbPaths {
DirectedSparseGraph<SeqVertex, SimpleEdge> graph, HashMap<List<Integer>,
HashMap<PairPath, Integer>> finalPathsToContainedReads,
HashMap<List<Integer>,Integer> separate_gene_ids) {
-
-
+
+
debugMes("SECTION\n====== ## BFLY_EM_REDUCE ## ==========\n\n", 5);
-
+
List<List<Integer>> all_paths = new ArrayList<List<Integer>>(finalPaths_all.keySet());
@@ -1429,7 +1445,7 @@ public class TransAssembly_allProbPaths {
}
-
+
private static HashMap<Integer, HashMap<PairPath, Integer>> create_DAG_from_OverlapLayout(
DirectedSparseGraph<SeqVertex, SimpleEdge> seqvertex_graph, HashMap<Integer, HashMap<PairPath, Integer>> combinedReadHash, String dot_file_prefix,
@@ -1473,16 +1489,29 @@ public class TransAssembly_allProbPaths {
Collections.reverse(paths); // want descending by path l
-
+ //////////////////////////////
// remove the contained reads
+ //////////////////////////////
+
+ //contained_path_to_containers: (key= the path contained, value = list of all other paths that fully contain it)
HashMap<List<Integer>,List<List<Integer>>> contained_path_to_containers = new HashMap<List<Integer>,List<List<Integer>>>();
List<List<Integer>> noncontained_paths = remove_containments(paths, contained_path_to_containers);
debugMes("Noncontained paths: " + noncontained_paths, 15);
+
+ //////////////////////////////
+ // find dispersed repeats ////
+ //////////////////////////////
+
HashSet<Integer> dispersed_repeat_nodes = find_dispersed_repeat_nodes(noncontained_paths);
+ ////////////////////////////
+ // build the overlap graph
+ ////////////////////////////
+
+
// build a graph of compatible paths.
List<Path> path_list = new ArrayList<Path>();
for (List<Integer> p : noncontained_paths) {
@@ -1518,6 +1547,10 @@ public class TransAssembly_allProbPaths {
writeDotFile(path_overlap_graph, dot_file_prefix + "_POG.PE_links_added.dot", graphName);
+ //////////////////////////////
+ // Breaking cycles
+ /////////////////////////////
+
int cycle_round = 0;
boolean breaking_cycles = true;
@@ -1550,8 +1583,11 @@ public class TransAssembly_allProbPaths {
HashMap<Path,PathWithOrig> orig_path_to_updated_path = convert_path_DAG_to_SeqVertex_DAG(path_overlap_graph,
pathMatches, seqvertex_graph, dot_file_prefix, graphName, createMiddleDotFiles);
+ // note, path_overlap_graph includes non-contained paths
+ // pairPathToReadSupport contains all paths
+
- combinedReadHash = update_PairPaths_using_overlapDAG_refined_paths(orig_path_to_updated_path, pairPathToReadSupport);
+ combinedReadHash = update_PairPaths_using_overlapDAG_refined_paths(orig_path_to_updated_path, pairPathToReadSupport, contained_path_to_containers);
@@ -1731,7 +1767,8 @@ public class TransAssembly_allProbPaths {
private static HashMap<Integer, HashMap<PairPath, Integer>> update_PairPaths_using_overlapDAG_refined_paths(
HashMap<Path, PathWithOrig> orig_path_to_updated_path,
- Map<PairPath, Integer> pairPathToReadSupport) {
+ Map<PairPath, Integer> pairPathToReadSupport,
+ HashMap<List<Integer>, List<List<Integer>>> contained_path_to_containers) {
// get the old-to-new listing in List<Integer> format for use with PairPath objects
@@ -1750,7 +1787,7 @@ public class TransAssembly_allProbPaths {
-
+ // get list of all old/new path pairs
List<PathWithOrig> revised_paths = new ArrayList<PathWithOrig>(orig_path_to_updated_path.values());
// now, create new pair paths based on updated mappings.
@@ -1763,36 +1800,56 @@ public class TransAssembly_allProbPaths {
Integer read_support = pairPathToReadSupport.get(pp);
+ debugMes("update_PairPaths_using_overlapDAG_refined_paths: orig_pp: " + pp + " has support: " + read_support, 20);
+
+
PairPath new_pp;
+ List<List<Integer>> p1_list = new ArrayList<List<Integer>>();
+
List<Integer> p1 = pp.getPath1();
if (old_to_new_path.containsKey(p1)) {
- p1 = old_to_new_path.get(p1);
+ p1_list.add(old_to_new_path.get(p1));
}
else {
- p1 = update_path_mappings(p1, revised_paths);
+ // might not be a unique path!! (eg. single original nodes now ending up in multiple places)
+ p1_list = get_all_possible_updated_path_mappings(p1, revised_paths);
+
+ debugMes("update_PairPaths_using_overlapDAG_refined_paths, p1: " + p1 + " mapped to: " + p1_list, 20);
+
}
+ List<List<Integer>> p2_list = new ArrayList<List<Integer>>();
if (pp.hasSecondPath()) {
List<Integer> p2 = pp.getPath2();
if (old_to_new_path.containsKey(p2)) {
p2 = old_to_new_path.get(p2);
+ p2_list.add(p2);
}
else {
- p2 = update_path_mappings(p2, revised_paths);
+ p2_list = get_all_possible_updated_path_mappings(p2, revised_paths);
+ }
+
+ // create new pair lists
+ for (List<Integer> p1_path : p1_list) {
+ for (List<Integer> p2_path : p2_list) {
+ new_pp = new PairPath(p1_path, p2_path);
+ updated_pairPaths.put(new_pp, read_support);
+ old_pp_to_new_pp.put(pp, new_pp); // FIXME: need to allow for multiple mappings here wrt long reads
+
+ }
}
- new_pp = new PairPath(p1, p2);
}
else {
- new_pp = new PairPath(p1);
+ // only individual paths
+ for (List<Integer>p1_path : p1_list) {
+ new_pp = new PairPath(p1_path);
+ updated_pairPaths.put(new_pp, read_support);
+ old_pp_to_new_pp.put(pp, new_pp);
+ }
}
- updated_pairPaths.put(new_pp, read_support);
-
-
- old_pp_to_new_pp.put(pp, new_pp);
-
}
update_long_read_path_mappings(old_pp_to_new_pp);
@@ -1861,12 +1918,43 @@ public class TransAssembly_allProbPaths {
throw new RuntimeException("Unable to remap read: " + p1 + " given: " + revised_paths);
}
+ private static List<List<Integer>> get_all_possible_updated_path_mappings(
+ List<Integer> p1,
+ List<PathWithOrig> revised_paths) {
+ // Collects every distinct vertex list obtained by aligning p1 against each entry of revised_paths; never returns an empty list (throws instead).
+ List<List<Integer>> all_path_mappings = new ArrayList<List<Integer>>();
+ // wrap the path to be remapped so it can be aligned against each revised path
+ PathWithOrig pwo_needs_updating = new PathWithOrig(p1);
+
+ for (PathWithOrig pwo : revised_paths) {
+ // NOTE(review): align_path_by_orig_id presumably returns null when p1 cannot be aligned to pwo -- confirm in PathWithOrig.
+ PathWithOrig updated_pwo = pwo_needs_updating.align_path_by_orig_id(pwo);
+ if (updated_pwo != null) {
+ List<Integer> updated_path = updated_pwo.getVertexList();
+ if (! all_path_mappings.contains(updated_path)) {
+ all_path_mappings.add(updated_path);
+ }
+ }
+ // the contains() check above keeps the result free of duplicate vertex lists, in first-seen order
+ }
+ if (all_path_mappings.isEmpty()) {
+ // no revised path yielded an alignment: treated as a fatal error rather than silently dropping the read path
+ throw new RuntimeException("Unable to remap read: " + p1 + " given: " + revised_paths);
+ }
+ else {
+ return(all_path_mappings);
+ }
+ }
+
private static HashMap<Path,PathWithOrig> convert_path_DAG_to_SeqVertex_DAG(
DirectedSparseGraph<Path, SimplePathNodeEdge> path_overlap_graph,
HashMap<String, PathOverlap> pathMatches,
- DirectedSparseGraph<SeqVertex, SimpleEdge> seqvertex_graph, String dot_file_prefix, String graphName, boolean createMiddleDotFiles) {
+ DirectedSparseGraph<SeqVertex, SimpleEdge> seqvertex_graph,
+ String dot_file_prefix,
+ String graphName,
+ boolean createMiddleDotFiles) {
debugMes("SECTION\n======== Convert Path-DAG to SeqVertex-DAG ============\n\n", 5);
@@ -6626,9 +6714,16 @@ HashMap<List<Integer>, Pair<Integer>> transcripts = new HashMap<List<Integer>,Pa
if (p.isCompatibleAndContainedBySinglePath(path)) {
if (! pathToContainedReads.containsKey(path)) {
pathToContainedReads.put(path, new HashMap<PairPath, Integer>());
+
}
+
+ debugMes("assignCompatibleReadsToPaths: " + p + " is compatible with " + path, 20);
+
pathToContainedReads.get(path).put(p, read_map.get(p));
}
+ else {
+ debugMes("assignCompatibleReadsToPaths: " + p + " is NOT compatible with " + path, 20);
+ }
}
}
@@ -10827,14 +10922,21 @@ HashMap<List<Integer>, Pair<Integer>> transcripts = new HashMap<List<Integer>,Pa
debugMes("removing path "+path2remove+" and keeping path "+path2keep,15);
+
+
if (!removeSimilarPaths.contains(path2remove))
removeSimilarPaths.add(path2remove);
+
if (PathReads.get(path2remove)!=null)
{
if (PathReads.get(path2keep)==null)
PathReads.put(path2keep, new HashMap<PairPath,Integer>());
- PathReads.get(path2keep).putAll(PathReads.get(path2remove));
+
+ // no longer assuming ownership of the other's reads, as this causes problems!
+ //PathReads.get(path2keep).putAll(PathReads.get(path2remove));
+
+
PathReads.remove(path2remove);
}
diff --git a/Release.Notes b/Changelog.txt
similarity index 90%
rename from Release.Notes
rename to Changelog.txt
index cc3fa27..c9bc3ba 100644
--- a/Release.Notes
+++ b/Changelog.txt
@@ -1,18 +1,52 @@
+# Release v2.2.0 March 17, 2016
-# next release
+ -Butterfly update: bugfix related to polynucleotide runs.
+ -util/SAM_nameSorted_to_uniq_count_stats.pl: count fragments instead of reads.
+ -util/abundance_estimates_to_matrix.pl: will output a matrix even if only a single sample is specified. Also, now can take a --samples_file containing a list of the target files to build the matrix from.
+ -util/align_and_estimate_abundance.pl: added support for salmon
+ -sample_data/test_align_and_estimate_abundance/: added examples and tests for single-end and paired-end abundance estimation
+
+
+# Release v2.1.1 Oct 15, 2015
-including -XX:ParallelGCThreads=$bflyGCThreads in ExitTester.jar execution.
-incorporating samtools-0.1.19 as plugin
-
+
+A few minor fixes:
+
+Memory is divided among the samtools threads.
+
+The Trinity contig identifiers for genome-guided assemblies are now formatted correctly (as compared to v2.1.0).
+
+We now run a check to ensure that the number of fastq records being converted to fasta by fastools matches (sanity check).
+
+
+
+# Release v2.1.0 Sept 29, 2015
+
+Abundance estimation: added support for kallisto and using TPMs now instead of FPKMs for downstream analyses.
+
+DE analysis: added support for Limma/Voom and ROTS, dropped support for DESeq(1) while keeping DESeq2. For edgeR w/o bio reps, user must define dispersion parameter.
+
+Minimal changes to the assembler, minor bug fixes, tackled most github 'issues' from last release.
+
+Trinity documentation was reorganized, revised, and moved to the wiki format.
# Release v2.0.6
-patch to autoconf for the inchworm build
+patch - had to 'autoconf --install' for the Inchworm build
+
+
# Release v2.0.5
-bugfix to properly fan out read files (they were inadvertently ending up in a single directory)
+Performance-related patch.
+
+Files containing reads to assemble are now properly being fanned out across a number of directories and files, instead of inadvertently co-localizing them all in a single directory. Performance improvements should be observed in the context of large data sets.
+
# Release v2.0.4
@@ -21,7 +55,10 @@
-additional testing built in
-use parallel samtools always (not just w/ v1.1, silly!)
-
+-Trimmomatic symlink set w/ capital T
+-additional testing built in
+-use parallel samtools always (not just w/ v1.1, silly!)
+-runtime latest-version checking added
## Release v2.0.3
@@ -33,7 +70,13 @@
-use Jellyfish for only phase 1 of Trinity, with inchworm doing its own kmer counting in phase 2 (faster this way).
-moved the HTC code over to the HPC GridRunner codebase and synched.
+-Bugfix to Butterfly that accounts for rare edge-cases resulting in fatal error: DAG contains a cycle
+-Jellyfish is now only used in the initial stage-1 of Trinity (read clustering phase), and Inchworm does the kmer counting in stage-2 (the assembly phase). This results in much faster runtimes, particularly on small data sets.
+
+-Trinity is much less verbose, especially in stage-2
+
+-Matt MacManes updated the Trimmomatic settings to those defined as optimal for trinity assembly.
@@ -71,6 +114,52 @@
-analyze_diff_expr: different options for ordering samples or replicates in the heatmap (useful for time series)
+
+The long awaited Trinity release is now available:
+https://github.com/trinityrnaseq/trinityrnaseq/releases
+
+This version has slightly improved assembly characteristics as compared to all previous versions of Trinity, as demonstrated from full-length transcript reconstruction stats as well as Detonate scores (to be shown later).
+
+Trinity v2.0 includes a number of significant changes as outlined below:
+
+Logistics:
+
+Trinity moves to github, with the new website location at: http://trinityrnaseq.github.io
+
+User support now occurs through the google group:
+https://groups.google.com/forum/#!forum/trinityrnaseq-users
+
+Software:
+
+-Trinity assembly now operates in two distinct phases (1): clustering reads and (2) assembly of reads. The phase (1) read clustering phase can be done by de novo read clustering (default) or in a genome-guided way (given a coordinate-sorted bam file). Phase (2) involves executing the complete Trinity process on each cluster of reads. For the de novo read clustering phase, existing Trinity components are used (Inchworm and Chrysalis), but that process will likely be replaced by an alterna [...]
+
+-the Butterfly algorithm has been extensively revised to better integrate long read support and to improve on the assembly of complex isoforms, particularly those containing internally repetitive sequences.
+
+Numerous minor changes and differences in usage - see web documentation. Most notable changes are:
+
+Trinity --max_memory instead of --JM, and simpler usage for the genome-guided method, which requires that the user provide a coordinate-sorted bam file with parameter: --genome_guided_bam.
+
+If you have error-corrected pacbio reads, you can incorporate them with the Trinity --long_reads parameter. Note, however, if you have strand-specific RNA-Seq, you'll need to be sure to first reorient your pacbio reads so that they are sense strand oriented (we do not have an automated process to do that yet). Also, note that this new feature continues to be experimental and additional work is underway to fully demonstrate the added value from incorporating the long read data.
+
+Note, the build process has changed slightly:
+To build Trinity, type 'make' in the base installation directory.
+To then build additional plugin components required for post-assembly analysis, type 'make plugins'.
+If under 'make plugins', the rsem build fails, simply visit the trinity_plugins/tmp.rsem directory and type 'make', then go back and resume the 'make plugins' in the base installation directory.
+
+
+
+
+
+
+
+
+
+
+
+
+#################################
+## Older Trinity v1 release notes
+############################
## Trinity release 2014-07-17
run_DE_analysis.pl
diff --git a/Chrysalis/MakeDepend.cc b/Chrysalis/MakeDepend.cc
index 1129872..ce901ce 100644
--- a/Chrysalis/MakeDepend.cc
+++ b/Chrysalis/MakeDepend.cc
@@ -420,7 +420,7 @@ void makefile_builder::DumpDependencies( const string &target )
// The vector of pairs count_per_provider[] simply reverses the map
// provider_counts[], copying each map element B->n to an ordered pair (n,B),
- // so that the resulting vector can be sorted by number of occurences, rather
+ // so that the resulting vector can be sorted by number of occurrences, rather
// than provider name. (Why not build the mapping as a vector in the first
// place? Efficiency: keeping the vector sorted while building up the map
// would require a lot of copying of data everytime a new provider name was
diff --git a/Chrysalis/Makefile b/Chrysalis/Makefile
index 61f7cff..e82ebe6 100644
--- a/Chrysalis/Makefile
+++ b/Chrysalis/Makefile
@@ -52,6 +52,7 @@
##############################################################################
DATE = $(shell date)
+BUILD_DATETIME ?= $(DATE) # allow for overrides to enable reproducible builds
OS_NAME = $(shell uname -s)
NODE_NAME = $(shell uname -n)
OS_RELEASE = $(shell uname -r)
@@ -277,7 +278,7 @@ XERCES_LIB =
# Base definitions:
SYS_DEFS = \
- -DMAKE_DATE='"$(DATE)"' \
+ -DMAKE_DATE='"$(BUILD_DATETIME)"' \
-DMAKE_OS_RELEASE='"$(OS_RELEASE)"' \
-DMAKE_RELEASE='"$(RELEASE)"' \
-DNEW_MAKEFILE
@@ -308,7 +309,7 @@ ifeq ($(COMPILER),g++)
endif
# Linking control (e.g. to link templates):
-SYS_LINK =
+SYS_LINK = $(LDFLAGS)
# Required libraries:
SYS_LIBS = -lm -pthread
@@ -380,7 +381,8 @@ CPP_OPTIONS = \
$(SYS_LANG) \
$(SYS_INCS) \
$(OMP_OPTIONS) \
- $(PTHREAD_OPTIONS)
+ $(PTHREAD_OPTIONS) \
+ $(CPPFLAGS)
LINK_OPTIONS = \
$(SYS_DEBUG) \
@@ -435,7 +437,7 @@ clean:
for file in $(EXECUTABLES); do rm -f $(BIN)/$$file; done
rm -f MakeDepend $(BIN)/MakeDepend contigs.out my.permanent.log.file \
core a.out Makefile.bak bsubin BasevectorTables.h $(BIN)/checkLock
- find $(OBJ) -name '*.o' -exec rm {} \;
+ find $(OBJ) -name '*.o' -exec rm {} \; || /bin/true
rm -rf cxx_repository
rm -f lib_*_temp.a
diff --git a/Chrysalis/analysis/TranscriptomeGraph.cc b/Chrysalis/analysis/TranscriptomeGraph.cc
index 174a857..bd07120 100644
--- a/Chrysalis/analysis/TranscriptomeGraph.cc
+++ b/Chrysalis/analysis/TranscriptomeGraph.cc
@@ -666,7 +666,7 @@ int TranscriptomeGraph(vecDNAVector & seq,
DNAVector & d = seq[0];
for (i=0; i<=d.isize()-k; i++) {
- fprintf(pOut, "%d\t%d\t1\t", i, i-1);
+ fprintf(pOut, "%lu\t%lu\t1\t", i, i-1);
//cout << i << "\t" << i-1 << "\t1\t";
for (size_t x=i; x<i+k; x++)
fprintf(pOut, "%c", d[x]);
diff --git a/Inchworm/src/IRKE.cpp b/Inchworm/src/IRKE.cpp
index c3c7ce5..55167eb 100644
--- a/Inchworm/src/IRKE.cpp
+++ b/Inchworm/src/IRKE.cpp
@@ -93,7 +93,7 @@ void IRKE::populate_Kmers_from_kmers(const string& fasta_filename) {
record_counter[i] = 0;
}
- cerr << "-reading Kmer occurences..." << endl;
+ cerr << "-reading Kmer occurrences..." << "\n";
start = time(NULL);
Fasta_reader fasta_reader(fasta_filename);
@@ -137,13 +137,13 @@ void IRKE::populate_Kmers_from_kmers(const string& fasta_filename) {
sum+= record_counter[i];
delete [] record_counter;
- cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end-start) << " seconds." << endl;
+ cerr << "\n" << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end-start) << " seconds." << "\n";
- cerr << endl << "TIMING KMER_DB_BUILDING " << (end-start) << " s." << endl;
+ cerr << "\n" << "TIMING KMER_DB_BUILDING " << (end-start) << " s." << "\n";
ofstream iworm_kmer_count_report_fh;
iworm_kmer_count_report_fh.open("inchworm.kmer_count");
- iworm_kmer_count_report_fh << kcounter.size() << endl;
+ iworm_kmer_count_report_fh << kcounter.size() << "\n";
iworm_kmer_count_report_fh.close();
@@ -168,7 +168,7 @@ void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassemb
}
- cerr << "-storing Kmers..." << endl;
+ cerr << "-storing Kmers..." << "\n";
start = time(NULL);
Fasta_reader fasta_reader(fasta_filename);
@@ -189,7 +189,7 @@ void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassemb
record_counter[myTid]++;
if (IRKE_COMMON::MONITOR >= 4) {
- cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << endl;;
+ cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";;
}
else if (IRKE_COMMON::MONITOR) {
if (myTid == 0 && record_counter[myTid] % 1000 == 0)
@@ -238,7 +238,7 @@ void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassemb
if (IRKE_COMMON::MONITOR >= 1) {
cerr << "Adding inchworm assembly " << accession
<< " K: " << kmer_val << " Cov: " << cov_val
- << " with coverage: " << normalized_coverage_val << endl;
+ << " with coverage: " << normalized_coverage_val << "\n";
}
if (cov_val < 1) {
stringstream err;
@@ -258,7 +258,7 @@ void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassemb
&&
record_counter[myTid]/omp_get_num_threads() % PRUNE_SINGLETON_READ_INTERVAL == 0) {
if (IRKE_COMMON::MONITOR >= 1) {
- cerr << "Reached singleton kmer pruning interval at read count: " << record_counter << endl;
+ cerr << "Reached singleton kmer pruning interval at read count: " << record_counter << "\n";
}
prune_kmers_min_count(1);
}
@@ -273,7 +273,7 @@ void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassemb
sum+= record_counter[i];
delete [] record_counter;
- cerr << endl << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << endl;
+ cerr << "\n" << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << "\n";
return;
@@ -325,7 +325,7 @@ void IRKE::traverse_path(KmerCounter& kcounter, Kmer_Occurence_Pair seed_kmer, K
Kmer_visitor& place_holder, float MIN_CONNECTIVITY_RATIO, unsigned int depth) {
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << "traverse_path, depth: " << depth << ", kmer: " << kcounter.get_kmer_string(seed_kmer.first) << endl;
+ cerr << "traverse_path, depth: " << depth << ", kmer: " << kcounter.get_kmer_string(seed_kmer.first) << "\n";
}
@@ -333,7 +333,7 @@ void IRKE::traverse_path(KmerCounter& kcounter, Kmer_Occurence_Pair seed_kmer, K
if (visitor.exists(seed_kmer.first)) {
// already visited
if (IRKE_COMMON::MONITOR >= 3) {
- cout << "\talready visited " << kcounter.get_kmer_string(seed_kmer.first) << endl;
+ cout << "\talready visited " << kcounter.get_kmer_string(seed_kmer.first) << "\n";
}
return;
@@ -387,7 +387,7 @@ string add_fasta_seq_line_breaks(string& sequence, int interval) {
fasta_seq << *it;
if (counter % interval == 0 && (it + 1) != sequence.end()) {
- fasta_seq << endl;
+ fasta_seq << "\n";
}
}
@@ -414,7 +414,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
if (! got_sorted_kmers_flag) {
stringstream error;
- error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl;
+ error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << "\n";
throw(error.str());
}
@@ -424,7 +424,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
unsigned long init_size = kcounter.size();
- cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << endl;
+ cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << "\n";
unsigned int kmer_length = kcounter.get_kmer_length();
ofstream coverage_writer;
@@ -471,7 +471,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
vector<iworm_tmp_file> tmpfiles;
int num_threads = omp_get_max_threads();
- cerr << "num threads set to: " << num_threads << endl;
+ cerr << "num threads set to: " << num_threads << "\n";
for (int i =0; i < num_threads; i++) {
iworm_tmp_file tmpfile_struct;
@@ -487,7 +487,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
itmp.fh = new ofstream();
itmp.fh->open(itmp.tmp_filename);
- cerr << "Done opening file. " << itmp.tmp_filename << endl;
+ cerr << "Done opening file. " << itmp.tmp_filename << "\n";
}
@@ -498,7 +498,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
#pragma omp parallel for private (myTid) schedule (dynamic, 1000)
for (unsigned int i = 0; i < kmers.size(); i++) {
- // cerr << "round: " << i << endl;
+ // cerr << "round: " << i << "\n";
myTid = omp_get_thread_num();
@@ -510,7 +510,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
stringstream error;
error << stacktrace() << "Error, Kcounter size has grown from " << init_size
- << " to " << kmer_counter_size << endl;
+ << " to " << kmer_counter_size << "\n";
throw (error.str());
}
@@ -526,15 +526,15 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
continue;
}
- // cout << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl;
+ // cout << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << "\n";
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl;
+ cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << "\n";
}
if (IRKE_COMMON::MONITOR >= 2) {
#pragma omp critical
- cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: " << kmer_count << endl;
+ cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: " << kmer_count << "\n";
}
unsigned int total_counts;
@@ -564,7 +564,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
/*
cout << "Inchworm-reconstructed sequence, length: " << sequence.length()
<< ", avgCov: " << avg_cov
- << " " << sequence << endl;
+ << " " << sequence << "\n";
*/
size_t contig_length = sequence.length();
@@ -572,10 +572,10 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
if (contig_length >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) {
- *(tmpfiles[myTid].fh) << total_counts << endl
- << avg_cov << endl
- << kmer_count << endl
- << sequence << endl;
+ *(tmpfiles[myTid].fh) << total_counts << "\n"
+ << avg_cov << "\n"
+ << kmer_count << "\n"
+ << sequence << "\n";
}
@@ -600,7 +600,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
}
if (IRKE_COMMON::MONITOR) {
- cerr << endl;
+ cerr << "\n";
}
if (WRITE_COVERAGE) {
@@ -639,7 +639,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
if (tmpreader.eof()) // apparently only happens on the read after the last line is read.
break;
- //cerr << "Read sequence: " << sequence << endl;
+ //cerr << "Read sequence: " << sequence << "\n";
unsigned int contig_hash = generateHash(sequence);
@@ -664,7 +664,7 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
sequence = add_fasta_seq_line_breaks(sequence, 60);
- cout << header << endl << sequence << endl;
+ cout << header << "\n" << sequence << "\n";
}
@@ -684,18 +684,18 @@ void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connecti
if (WRITE_COVERAGE) {
- coverage_writer << header << endl;
+ coverage_writer << header << "\n";
for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) {
coverage_writer << assembly_base_coverage[i];
if ( (i+1) % 30 == 0) {
- coverage_writer << endl;
+ coverage_writer << "\n";
}
else {
coverage_writer << " ";
}
}
- coverage_writer << endl;
+ coverage_writer << "\n";
}
}
@@ -721,7 +721,7 @@ bool IRKE::is_good_seed_kmer(kmer_int_type_t kmer, unsigned int kmer_count, unsi
// palindromic kmer, avoid palindromes as seeds
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. " << endl;
+ cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. " << "\n";
}
return(false);
@@ -730,7 +730,7 @@ bool IRKE::is_good_seed_kmer(kmer_int_type_t kmer, unsigned int kmer_count, unsi
if (kmer_count < MIN_SEED_COVERAGE) {
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "-seed has insufficient coverage, skipping" << endl;
+ cerr << "-seed has insufficient coverage, skipping" << "\n";
}
return(false);
@@ -742,7 +742,7 @@ bool IRKE::is_good_seed_kmer(kmer_int_type_t kmer, unsigned int kmer_count, unsi
if (entropy < MIN_SEED_ENTROPY) {
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "-skipping seed due to low entropy: " << entropy << endl;
+ cerr << "-skipping seed due to low entropy: " << entropy << "\n";
}
return(false);
@@ -774,7 +774,7 @@ vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t k
vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first;
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
+ cerr << "Forward path contains: " << forward_path.size() << " kmers. " << "\n";
}
@@ -783,7 +783,7 @@ vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t k
visitor.add(kmer);
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl;
+ cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << "\n";
}
}
@@ -795,9 +795,9 @@ vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t k
Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity);
if (IRKE_COMMON::MONITOR >= 2) {
vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first;
- cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
+ cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << "\n";
for (unsigned int i = 0; i < reverse_path.size(); i++) {
- cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl;
+ cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << "\n";
}
}
@@ -814,7 +814,7 @@ vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t k
Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor& visitor, float min_connectivity) {
- // cout << "inchworm" << endl;
+ // cout << "inchworm" << "\n";
Path_n_count_pair entire_path;
entire_path.second = 0; // init cumulative path coverage
@@ -833,11 +833,11 @@ Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_in
// terminate extension with probability of __DEVEL_rand_fracture_prob
float prob_to_fracture = rand() / (float) RAND_MAX;
- //cerr << "prob: " << prob_to_fracture << endl;
+ //cerr << "prob: " << prob_to_fracture << "\n";
if (prob_to_fracture <= IRKE_COMMON::__DEVEL_rand_fracture_prob) {
- // cerr << "Fracturing at iworm round: " << inchworm_round << " given P: " << prob_to_fracture << endl;
+ // cerr << "Fracturing at iworm round: " << inchworm_round << " given P: " << prob_to_fracture << "\n";
return(entire_path);
}
@@ -851,9 +851,9 @@ Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_in
}
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << endl << "Inchworm round(" << string(1,direction) << "): " << inchworm_round << " searching kmer: " << kmer << endl;
+ cerr << "\n" << "Inchworm round(" << string(1,direction) << "): " << inchworm_round << " searching kmer: " << kmer << "\n";
string kmer_str = kcounter.get_kmer_string(kmer);
- cerr << kcounter.describe_kmer(kmer_str) << endl;
+ cerr << kcounter.describe_kmer(kmer_str) << "\n";
}
visitor.erase(kmer); // seed kmer must be not visited already.
@@ -905,7 +905,7 @@ Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_in
}
if (IRKE_COMMON::MONITOR >= 3)
- cerr << "No extension possible." << endl << endl;
+ cerr << "No extension possible." << "\n" << "\n";
return(entire_path);
@@ -928,7 +928,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
Kmer_visitor& eliminator, unsigned int inchworm_round, unsigned int depth,
float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {
- // cout << "inchworm_step" << endl;
+ // cout << "inchworm_step" << "\n";
if (IRKE_COMMON::MONITOR >= 2) {
cerr << "\rinchworm: " << string(1,direction)
@@ -948,8 +948,8 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
) {
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << "base case, already visited or kmer doesn't exist." << endl;
- cerr << kmer.first << " already visited or doesn't exist. ending recursion at depth: " << depth << endl;
+ cerr << "base case, already visited or kmer doesn't exist." << "\n";
+ cerr << kmer.first << " already visited or doesn't exist. ending recursion at depth: " << depth << "\n";
}
return(best_path_n_pair);
@@ -959,7 +959,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
visitor.add(kmer.first);
if (PACMAN && depth > 0) {
- // cerr << "pacman eliminated kmer: " << kmer << endl;
+ // cerr << "pacman eliminated kmer: " << kmer << "\n";
eliminator.add(kmer.first);
}
@@ -977,7 +977,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
}
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << "Got " << kmer_candidates.size() << " kmer extension candidates." << endl;
+ cerr << "Got " << kmer_candidates.size() << " kmer extension candidates." << "\n";
}
bool tie = true;
@@ -999,7 +999,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
!visitor.exists(kmer_candidate.first) // avoid creating already visited kmers since they're unvisited below...
&& exceeds_min_connectivity(kcounter, kmer, kmer_candidate, MIN_CONNECTIVITY_RATIO) ) {
- //cout << endl << "\ttrying " << kmer_candidate << endl;
+ //cout << "\n" << "\ttrying " << kmer_candidate << "\n";
// recursive call here for extension
@@ -1025,7 +1025,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
int rand_index = rand() % paths.size();
tie = false;
if (IRKE_COMMON::MONITOR) {
- cerr << "IRKE_COMMON::__DEVEL_no_greedy_extend -- picking random path index: " << rand_index << " from size(): " << paths.size() << endl;
+ cerr << "IRKE_COMMON::__DEVEL_no_greedy_extend -- picking random path index: " << rand_index << " from size(): " << paths.size() << "\n";
}
best_path_n_pair = paths[rand_index];
}
@@ -1041,10 +1041,10 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
// got tie, two different paths and two different endpoints:
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << endl;
+ cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << "\n";
vector<unsigned int> v;
- cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
- cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
+ cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << "\n";
+ cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << "\n";
}
@@ -1054,7 +1054,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
int rand_index = rand() % 2;
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "IRKE_COMMON::__DEVEL_no_tie_breaking, so picking path: " << rand_index << " at random." << endl;
+ cerr << "IRKE_COMMON::__DEVEL_no_tie_breaking, so picking path: " << rand_index << " at random." << "\n";
}
best_path_n_pair = paths[rand_index];
@@ -1067,7 +1067,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
best_path_length = paths[0].first.size();
}
else {
- // cerr << "not able to delve further into the graph, though... Stopping here." << endl;
+ // cerr << "not able to delve further into the graph, though... Stopping here." << "\n";
tie = false;
best_path_n_pair = paths[0]; // pick one
}
@@ -1079,7 +1079,7 @@ Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Km
) {
if (IRKE_COMMON::MONITOR >= 3) {
- cerr << "Tied, but two different paths join to the same kmer. Choosing first path arbitrarily." << endl;
+ cerr << "Tied, but two different paths join to the same kmer. Choosing first path arbitrarily." << "\n";
}
tie = false;
best_path_n_pair = paths[0];
@@ -1228,7 +1228,7 @@ string IRKE::thread_sequence_through_graph(string& sequence) {
unsigned int kmer_length = kcounter.get_kmer_length();
if (sequence.length() < kmer_length) {
- cerr << "Sequence length: " << sequence.length() << " is too short to contain any kmers." << endl;
+ cerr << "Sequence length: " << sequence.length() << " is too short to contain any kmers." << "\n";
return("");
}
@@ -1238,7 +1238,7 @@ string IRKE::thread_sequence_through_graph(string& sequence) {
string kmer = sequence.substr(i, kmer_length);
- s << kcounter.describe_kmer(kmer) << endl;
+ s << kcounter.describe_kmer(kmer) << "\n";
}
return(s.str());
@@ -1358,7 +1358,7 @@ kmer_int_type_t IRKE::extract_best_seed(vector<kmer_int_type_t>& kmer_vec, KmerC
}
if (IRKE_COMMON::MONITOR >= 2) {
- cerr << "Parallel method found better seed: " << kcounter.get_kmer_string(best_seed) << " with count: " << best_kmer_count << endl;
+ cerr << "Parallel method found better seed: " << kcounter.get_kmer_string(best_seed) << " with count: " << best_kmer_count << "\n";
}
return(best_seed);
diff --git a/Inchworm/src/fastaToKmerCoverageStats.cpp b/Inchworm/src/fastaToKmerCoverageStats.cpp
index 1d8e287..7233e2e 100644
--- a/Inchworm/src/fastaToKmerCoverageStats.cpp
+++ b/Inchworm/src/fastaToKmerCoverageStats.cpp
@@ -168,7 +168,7 @@ void populate_kmer_counter(KmerCounter& kcounter, string& kmers_fasta_file) {
for (int i = 0; i < omp_get_max_threads(); i++) {
record_counter[i] = 0;
}
- cerr << "-reading Kmer occurences..." << endl;
+ cerr << "-reading Kmer occurrences..." << endl;
start = time(NULL);
Fasta_reader fasta_reader(kmers_fasta_file);
#pragma omp parallel private (myTid)
@@ -208,10 +208,17 @@ void populate_kmer_counter(KmerCounter& kcounter, string& kmers_fasta_file) {
}
vector<unsigned int> compute_kmer_coverage(string& sequence, KmerCounter& kcounter) {
- vector<unsigned int> coverage;
if(IRKE_COMMON::MONITOR) {
cerr << "processing sequence: " << sequence << endl;
}
+ if (sequence.length() < KMER_SIZE)
+ {
+ // Can't rely on length() - KMER_SIZE for this as length is unsigned
+ cerr << "Sequence: " << sequence << "is smaller than " << KMER_SIZE << " base pairs, skipping" << endl;
+ return vector<unsigned int>();
+ }
+
+ vector<unsigned int> coverage;
for (size_t i = 0; i <= sequence.length() - KMER_SIZE; i++) {
// cerr << "i: " << i << ", <= " << sequence.length() - KMER_SIZE << endl;
string kmer = sequence.substr(i, KMER_SIZE);
diff --git a/PerlLib/Fastq_reader.pm b/PerlLib/Fastq_reader.pm
index e6e9b02..3038fdc 100755
--- a/PerlLib/Fastq_reader.pm
+++ b/PerlLib/Fastq_reader.pm
@@ -23,14 +23,15 @@ sub new {
else {
if ( $fastqFile =~ /\.gz$/ ) {
open ($filehandle, "gunzip -c $fastqFile | ") or die "Error: Couldn't open compressed $fastqFile\n";
- }
- elsif ($fastqFile =~ /\.bz2$/) {
- open ($filehandle, "bunzip2 -c $fastqFile | ") or die "Error, couldn't open compressed $fastqFile $!";
-
- } else {
- open ($filehandle, $fastqFile) or die "Error: Couldn't open $fastqFile\n";
- }
-
+ }
+ elsif ($fastqFile =~ /\.xz$/) {
+ open($filehandle, "xz -c -d ${fastqFile} | ") or die "Error, couldn't open compressed $fastqFile $!";
+ }
+ elsif ($fastqFile =~ /\.bz2$/) {
+ open ($filehandle, "bunzip2 -c $fastqFile | ") or die "Error, couldn't open compressed $fastqFile $!";
+ } else {
+ open ($filehandle, $fastqFile) or die "Error: Couldn't open $fastqFile\n";
+ }
$self->{fastqFile} = $fastqFile;
}
diff --git a/PerlLib/test_htc_gridrunner_LSF.pl b/PerlLib/test_htc_gridrunner_LSF.pl
index 46e9816..0d770a4 100755
--- a/PerlLib/test_htc_gridrunner_LSF.pl
+++ b/PerlLib/test_htc_gridrunner_LSF.pl
@@ -3,11 +3,11 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin");
+use lib ("$FindBin::RealBin");
use HTC::GridRunner;
-my $config_file = "$FindBin::Bin/../htc_conf/BroadInst_LSF.test.conf";
+my $config_file = "$FindBin::RealBin/../htc_conf/BroadInst_LSF.test.conf";
main: {
diff --git a/PerlLib/test_htc_gridrunner_SGE.pl b/PerlLib/test_htc_gridrunner_SGE.pl
index 678479f..a0f78f3 100755
--- a/PerlLib/test_htc_gridrunner_SGE.pl
+++ b/PerlLib/test_htc_gridrunner_SGE.pl
@@ -3,11 +3,11 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin");
+use lib ("$FindBin::RealBin");
use HTC::GridRunner;
-my $config_file = "$FindBin::Bin/../htc_conf/BroadInst_SGE.test.conf";
+my $config_file = "$FindBin::RealBin/../htc_conf/BroadInst_SGE.test.conf";
main: {
diff --git a/README b/README
index 2648976..90305bb 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
All documentation for Trinity is provided at the Trinity website:
-http://TrinityRNASeq.sourceforge.net
+http://trinityrnaseq.github.io
diff --git a/README.md b/README.md
index f94d5f3..bc216bc 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
trinityrnaseq
=============
-Trinity RNA-Seq de novo transcriptome assembly
+Trinity RNA-Seq de novo transcriptome assembly see the main webpage [http://trinityrnaseq.github.io](http://trinityrnaseq.github.io)
diff --git a/Trinity b/Trinity
index 94f5c06..3845021 100755
--- a/Trinity
+++ b/Trinity
@@ -17,12 +17,12 @@ use Pipeliner;
use Fasta_reader;
use List::Util qw(min max);
-my $VERSION = "v2.1.1";
+my $VERSION = "v2.2.0";
BEGIN {
- $ENV{TRINITY_HOME} = "$FindBin::Bin";
+ $ENV{TRINITY_HOME} = "$FindBin::RealBin";
}
@@ -250,6 +250,7 @@ my $basic_usage = qq^
# (def=$min_contig_length)
#
# --long_reads <string> :fasta file containing error-corrected or circular consensus (CCS) pac bio reads
+# (** note: experimental parameter **, this functionality continues to be under development)
#
# --genome_guided_bam <string> :genome guided mode, provide path to coordinate-sorted bam file.
# (see genome-guided param section under --show_full_usage_info)
@@ -823,6 +824,7 @@ if ($show_version_flag) {
exit(1);
}
+
if ($NO_CLEANUP && $FULL_CLEANUP) {
die "cannot set --no_cleanup and --full_cleanup as they contradict";
}
@@ -1035,6 +1037,7 @@ elsif ($CUFFFLY_MODE) {
+
my $PARALLEL_SAMTOOLS_SORT_TOKEN = "-\@ $CPU";
@@ -1220,6 +1223,7 @@ main: {
if (-s "Trinity.fasta.tmp") {
rename("Trinity.fasta.tmp", "$output_directory.Trinity.fasta") or die "Error, cannot rename Trinity.fasta.tmp to $output_directory.Trinity.fasta";
+
print "\n\n"
. "###################################################################\n"
@@ -1248,10 +1252,10 @@ main: {
if (-s $butterfly_output_filename) {
- print "\n\n";
- print "###################################################################\n";
- print "Butterfly assemblies are written to $output_directory/$butterfly_output_filename\n";
- print "###################################################################\n\n\n" unless $TRINITY_COMPLETE_FLAG;
+ print "\n\n"
+ . "###################################################################\n"
+ . "Butterfly assemblies are written to $output_directory/$butterfly_output_filename\n"
+ . "###################################################################\n\n\n" unless $TRINITY_COMPLETE_FLAG;
}
else {
@@ -1554,7 +1558,7 @@ sub run_chrysalis {
$pipeliner->add_commands( new Command($cmd, "$iworm_min100_fa_file.bowtie_build.ok"));
my $bowtie_sam_file = "$chrysalis_output_dir/iworm.bowtie.nameSorted.bam";
- my $samtools_max_memory = int($jellyfish_ram/$CPU);
+ my $samtools_max_memory = int($jellyfish_ram/($CPU*2));
if ($long_reads){
$cmd = "bash -c \" set -o pipefail;bowtie2 --local -a --threads $CPU -f $iworm_min100_fa_file $bowtie_reads_fa | samtools view $PARALLEL_SAMTOOLS_SORT_TOKEN -F4 -Sb - | samtools sort -m $samtools_max_memory $PARALLEL_SAMTOOLS_SORT_TOKEN -no - - > $bowtie_sam_file\" ";
}
@@ -2084,15 +2088,28 @@ sub prep_seqs {
if ($f=~/\.gz$/){
$fastool_cmd = "gunzip -c $f | $FASTOOL_DIR/fastool";
$linecount_cmd = "gunzip -c $f | wc -l";
- }elsif ($f=~/\.bz2$/){
+ } elsif ($f=~/\.bz2$/){
$fastool_cmd = "bunzip2 -dkc $f | $FASTOOL_DIR/fastool";
$linecount_cmd = "bunzip2 -dkc $f | wc -l";
+ } elsif ($f =~ /\.xz/) {
+ $fastool_cmd = "xz -dc ${f} | ${FASTOOL_DIR}/fastool";
+ $linecount_cmd = "xz -dc ${f} | wc -l";
+ ## I would like to suggest that these if statements are not necessary if one just does
+ ## qx"less ${f} |" because less has smart input filters in place and will automagically
+ ## handle all the likely compression formats.
}
if ($SS_lib_type && $SS_lib_type eq "R") {
$fastool_cmd .= " --rev";
}
- $fastool_cmd .= " --illumina-trinity --to-fasta >> $file_prefix.fa 2> $f.readcount ";
+ my $style = "--illumina-trinity";
+ if ($file_prefix eq "left"){
+ $style = "--append /1";
+ }
+ elsif ($file_prefix eq "right"){
+ $style = "--append /2";
+ }
+ $fastool_cmd .= " $style --to-fasta >> $file_prefix.fa 2> $f.readcount ";
my $cmd = $fastool_cmd;
&process_cmd($cmd);
@@ -2108,7 +2125,9 @@ sub prep_seqs {
$cmd = "gunzip -c $initial_file_str >$file_prefix.fa";
}elsif ($initial_file_str=~/\.bz2$/){
$cmd = "bunzip2 -dkc $initial_file_str >$file_prefix.fa";
- }
+ } elsif ($initial_file_str =~ /\.xz$/) {
+ $cmd = "xz -dc ${initial_file_str} > ${file_prefix}.fa";
+ }
&process_cmd($cmd);
}elsif(scalar(@initial_files) > 1 && (!$SS_lib_type || $SS_lib_type ne "R")){
foreach my $f (@initial_files){
@@ -2117,6 +2136,8 @@ sub prep_seqs {
$cmd = "gunzip -c $f >> $file_prefix.fa";
}elsif ($f=~/\.bz2$/){
$cmd = "bunzip2 -dkc $f >> $file_prefix.fa";
+ } elsif ($f =~ /\.xz$/) {
+ $cmd = "xz -dc ${f} >> ${file_prefix}.fa";
}
&process_cmd($cmd);
}
@@ -2128,6 +2149,8 @@ sub prep_seqs {
$cmd = "gunzip -c $f | $UTILDIR/support_scripts/revcomp_fasta.pl >> $file_prefix.fa";
}elsif ($f=~/\.bz2$/){
$cmd = "bunzip2 -dkc $f | $UTILDIR/support_scripts/revcomp_fasta.pl >> $file_prefix.fa";
+ } elsif ($f =~ /\.xz$/) {
+ $cmd = "xz -dc ${f} | ${UTILDIR}/support_scripts/revcomp_fasta.pl >> ${file_prefix}.fa";
}
&process_cmd($cmd);
}
@@ -2926,6 +2949,10 @@ sub add_fifo_for_gzip {
}
elsif ($file =~ /\.gz$/) {
$file = "<(zcat $file)";
+ } elsif ($file =~ /\.xz$/) {
+ $file = "<(xzcat ${file})";
+ } elsif ($file =~ /\.bz2$/) {
+ $file = "<(bzcat ${file})";
}
}
diff --git a/sample_data/__regression_tests/test_GraphFromFasta/runMe.sh b/sample_data/__regression_tests/test_GraphFromFasta/runMe.sh
index 2fd10c9..aa66e48 100644
--- a/sample_data/__regression_tests/test_GraphFromFasta/runMe.sh
+++ b/sample_data/__regression_tests/test_GraphFromFasta/runMe.sh
@@ -1 +1,2 @@
+#!/bin/sh
../../../Chrysalis/GraphFromFasta -i inchworm.K25.L25.fa -r both.fa -min_contig_length 100 -min_glue 0 -glue_factor 0 -min_iso_ratio 0 -t 4 -k 24 -kk 48
diff --git a/sample_data/test_DE_analysis/Makefile b/sample_data/test_DE_analysis/Makefile
index 5208ab6..4a18e13 100644
--- a/sample_data/test_DE_analysis/Makefile
+++ b/sample_data/test_DE_analysis/Makefile
@@ -1,6 +1,6 @@
-test: test_edgeR test_DESeq2 test_voom test_ROTS
+test: test_edgeR test_DESeq2 test_voom test_ROTS test_PtR_PCA
test_edgeR:
../../Analysis/DifferentialExpression/run_DE_analysis.pl -m Trinity_trans.counts.matrix --method edgeR -s samples.txt -o edgeR_outdir
@@ -18,6 +18,10 @@ test_ROTS:
../../Analysis/DifferentialExpression/run_DE_analysis.pl -m Trinity_trans.counts.matrix --method ROTS -s samples.txt -o ROTS_outdir --ROTS_B 50 # make it go faster, use more in practice
cd ROTS_outdir && ../../../Analysis/DifferentialExpression/analyze_diff_expr.pl --matrix ../Trinity_trans.TMM.EXPR.matrix --samples ../samples.txt
+
+test_PtR_PCA:
+ ../../Analysis/DifferentialExpression/PtR -m Trinity_trans.TMM.EXPR.matrix -s samples.txt --prin_comp 3 --add_prin_comp_heatmaps 30
+
clean:
rm -rf ./edgeR_outdir
rm -rf ./DESeq2_outdir
diff --git a/sample_data/test_GOSeq_trinotate_pipe/Spombe/cleanme.pl b/sample_data/test_GOSeq_trinotate_pipe/Spombe/cleanme.pl
index 17e6f9e..b867e6d 100755
--- a/sample_data/test_GOSeq_trinotate_pipe/Spombe/cleanme.pl
+++ b/sample_data/test_GOSeq_trinotate_pipe/Spombe/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
diff --git a/sample_data/test_GOSeq_trinotate_pipe/Spombe/runMe.sh b/sample_data/test_GOSeq_trinotate_pipe/Spombe/runMe.sh
index 6f91066..f6079bc 100755
--- a/sample_data/test_GOSeq_trinotate_pipe/Spombe/runMe.sh
+++ b/sample_data/test_GOSeq_trinotate_pipe/Spombe/runMe.sh
@@ -1,3 +1,4 @@
+#!/bin/sh
../../../Analysis/DifferentialExpression/run_GOseq.pl --factor_labeling hs_induced_vs_log.factors --GO_assignments Trinotate_report.xls.trans.gene_ontology --lengths Trinity.seq_lengths
../../../Analysis/DifferentialExpression/run_GOseq.pl --factor_labeling ds_induced_vs_log.factors --GO_assignments Trinotate_report.xls.trans.gene_ontology --lengths Trinity.seq_lengths
diff --git a/sample_data/test_GenomeGuidedTrinity/cleanme.pl b/sample_data/test_GenomeGuidedTrinity/cleanme.pl
index cadbffa..6a792fd 100755
--- a/sample_data/test_GenomeGuidedTrinity/cleanme.pl
+++ b/sample_data/test_GenomeGuidedTrinity/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
diff --git a/sample_data/test_InSilicoReadNormalization/Makefile b/sample_data/test_InSilicoReadNormalization/Makefile
index dc468b1..00dad80 100644
--- a/sample_data/test_InSilicoReadNormalization/Makefile
+++ b/sample_data/test_InSilicoReadNormalization/Makefile
@@ -2,6 +2,7 @@
test:
./test_PE_normalization.sh
./test_SE_normalization.sh
+ ./test_PE_normalization.mult_read_sets.sh
clean:
./cleanme.pl
diff --git a/sample_data/test_InSilicoReadNormalization/cleanme.pl b/sample_data/test_InSilicoReadNormalization/cleanme.pl
index 1ea0739..69d84bc 100755
--- a/sample_data/test_InSilicoReadNormalization/cleanme.pl
+++ b/sample_data/test_InSilicoReadNormalization/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
@@ -20,15 +20,17 @@ my @files_to_keep = qw (cleanme.pl
test_PE_normalization.w_base_cov_stats.sh
test_PE_normalization.sh
+test_PE_normalization.mult_read_sets.sh
);
my %keep = map { + $_ => 1 } @files_to_keep;
-`rm -rf tmp_normalized_reads/`;
-`rm -rf single_tmp_norm_reads/`;
-`rm -rf tmp_PE_norm_dir`;
+`rm -rf ./tmp_normalized_reads/`;
+`rm -rf ./single_tmp_norm_reads/`;
+`rm -rf ./tmp_PE_norm_dir`;
+`rm -rf ./test_multi_read_sets_norm_outdir/`;
foreach my $file (<*>) {
diff --git a/sample_data/test_InSilicoReadNormalization/test_PE_normalization.mult_read_sets.sh b/sample_data/test_InSilicoReadNormalization/test_PE_normalization.mult_read_sets.sh
new file mode 100755
index 0000000..ebdae52
--- /dev/null
+++ b/sample_data/test_InSilicoReadNormalization/test_PE_normalization.mult_read_sets.sh
@@ -0,0 +1,11 @@
+#!/bin/bash -ve
+
+# just for testing purposes, use --max_cov 30 or higher for real applications.
+../../util/insilico_read_normalization.pl --JM 2G \
+ --left ../test_Trinity_Assembly/reads.left.fq.gz,../test_Trinity_Assembly/reads2.left.fq.gz \
+ --right ../test_Trinity_Assembly/reads.right.fq.gz,../test_Trinity_Assembly/reads2.right.fq.gz \
+ --seqType fq --max_cov 5 --pairs_together --no_cleanup --CPU 4 --tmp_dir_name tmp_PE_mult_norm_dir \
+ --output test_multi_read_sets_norm_outdir
+
+
+
diff --git a/sample_data/test_Inchworm/runMe_MPI.sh b/sample_data/test_Inchworm/runMe_MPI.sh
index a068773..4d16e13 100644
--- a/sample_data/test_Inchworm/runMe_MPI.sh
+++ b/sample_data/test_Inchworm/runMe_MPI.sh
@@ -1,3 +1,4 @@
+#!/bin/sh
if [ ! -e jellyfish.kmers.fa ]; then
gunzip -c jellyfish.kmers.fa.gz > jellyfish.kmers.fa
fi
diff --git a/sample_data/test_TissueSpecificityGraph/DE_results.tar.gz b/sample_data/test_TissueSpecificityGraph/DE_results.tar.gz
new file mode 100644
index 0000000..969e26c
Binary files /dev/null and b/sample_data/test_TissueSpecificityGraph/DE_results.tar.gz differ
diff --git a/sample_data/test_TissueSpecificityGraph/Makefile b/sample_data/test_TissueSpecificityGraph/Makefile
new file mode 100644
index 0000000..86e56f7
--- /dev/null
+++ b/sample_data/test_TissueSpecificityGraph/Makefile
@@ -0,0 +1,10 @@
+test:
+ tar xvf DE_results.tar.gz
+ gunzip -c transcripts.TMM.fpkm.avg_reps.matrix.gz > transcripts.TMM.fpkm.avg_reps.matrix
+ ../../Analysis/DifferentialExpression/TissueEnrichment/DE_results_to_pairwise_summary.pl transcripts.TMM.fpkm.avg_reps.matrix . > DE_pairwise_summary.txt
+ ../../Analysis/DifferentialExpression/TissueEnrichment/pairwise_DE_summary_to_DE_classification.pl DE_pairwise_summary.txt
+
+clean:
+ rm -f ./*.DE_results
+ rm -f ./DE_pairwise_summary.txt*
+ rm -f ./transcripts.TMM.fpkm.avg_reps.matrix
diff --git a/sample_data/test_TissueSpecificityGraph/transcripts.TMM.fpkm.avg_reps.matrix.gz b/sample_data/test_TissueSpecificityGraph/transcripts.TMM.fpkm.avg_reps.matrix.gz
new file mode 100644
index 0000000..44e5755
Binary files /dev/null and b/sample_data/test_TissueSpecificityGraph/transcripts.TMM.fpkm.avg_reps.matrix.gz differ
diff --git a/sample_data/test_Trinity_Assembly/Makefile b/sample_data/test_Trinity_Assembly/Makefile
index ded6e8e..cebf851 100644
--- a/sample_data/test_Trinity_Assembly/Makefile
+++ b/sample_data/test_Trinity_Assembly/Makefile
@@ -3,7 +3,7 @@ test_trinity:
./runMe.sh
-test: test_assembly test_abundance_estimation test_bowtie_PE_read_estimates
+test: test_assembly test_bowtie_PE_read_estimates
test_assembly:
@@ -13,18 +13,8 @@ test_assembly:
./misc_run_tests/__runMe_with_qual_trimming_and_normalization.sh
./misc_run_tests/__test_runMe_with_jaccard_clip.sh
./misc_run_tests/__runMe_with_qual_trimming_and_normalize_libs_separately.sh
+ ./misc_run_tests/__runMe_include_long_reads.sh
-test_abundance_estimation: test_RSEM test_eXpress test_kallisto
-
-test_RSEM:
- ../../util/align_and_estimate_abundance.pl --transcripts trinity_out_dir/Trinity.fasta --seqType fq --left reads.left.fq --right reads.right.fq --SS_lib_type RF --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference --output_dir RSEM_PE
-
-
-test_eXpress:
- ../../util/align_and_estimate_abundance.pl --transcripts trinity_out_dir/Trinity.fasta --seqType fq --left reads.left.fq --right reads.right.fq --SS_lib_type RF --est_method eXpress --aln_method bowtie2 --trinity_mode --prep_reference --output_dir eXpress_PE
-
-test_kallisto:
- ../../util/align_and_estimate_abundance.pl --transcripts trinity_out_dir/Trinity.fasta --seqType fq --left reads.left.fq --right reads.right.fq --est_method kallisto --trinity_mode --prep_reference --output_dir kallisto_PE
test_bowtie_PE_read_estimates:
../../util/bowtie_PE_separate_then_join.pl --seqType fq --left reads.left.fq --right reads.right.fq --target trinity_out_dir/Trinity.fasta --aligner bowtie
diff --git a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/align_reads_via_bowtie.sh b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/align_reads_via_bowtie.sh
index 2c161de..0893b63 100755
--- a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/align_reads_via_bowtie.sh
+++ b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/align_reads_via_bowtie.sh
@@ -1 +1,2 @@
-/home/unix/bhaas/SVN/trinityrnaseq/trunk/util/bowtie_PE_separate_then_join.pl --seqType fq --left ../reads.left.fq --right ../reads.right.fq --target refSeqs.fa --aligner bowtie -- -p 4 --all --best --strata -m 300
+#!/bin/bash
+/usr/lib/trinityrnaseq/util/bowtie_PE_separate_then_join.pl --seqType fq --left ../reads.left.fq --right ../reads.right.fq --target refSeqs.fa --aligner bowtie -- -p 4 --all --best --strata -m 300
diff --git a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.clean.sh b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.clean.sh
index 9bd8482..49fbaf0 100644
--- a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.clean.sh
+++ b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.clean.sh
@@ -1 +1,2 @@
+#!/bin/sh
../../../../Trinity.pl --seqType fq --left clean.left.fq --right clean.right.fq --seqType fq --JM 1G --bfly_opts "-V 15 --stderr "
diff --git a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.sh b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.sh
index 0644b52..21b8f1d 100644
--- a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.sh
+++ b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex05/runMe.sh
@@ -1 +1,2 @@
+#!/bin/sh
../../../../Trinity.pl --seqType fq --left ex5.reads.left.fq --right ex5.reads.right.fq --seqType fq --JM 1G --bfly_opts "-V 15 --stderr"
diff --git a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex09/runMe.sh b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex09/runMe.sh
index 0a3461d..8c0c804 100755
--- a/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex09/runMe.sh
+++ b/sample_data/test_Trinity_Assembly/__indiv_ex_sample_derived/ex09/runMe.sh
@@ -1,2 +1,3 @@
+#!/bin/sh
../../Trinity.pl --seqType fq --left ex9.reads.left.fq --right ex9.reads.right.fq --SS_lib_type RF --bfly_opts "--edge-thr=0.05 --stderr -V 18" --run_butterfly --output trinity_outdir
diff --git a/sample_data/test_Trinity_Assembly/cleanme.pl b/sample_data/test_Trinity_Assembly/cleanme.pl
index f6520ab..0e6392d 100755
--- a/sample_data/test_Trinity_Assembly/cleanme.pl
+++ b/sample_data/test_Trinity_Assembly/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
@@ -60,6 +60,8 @@ my %keep = map { + $_ => 1 } @files_to_keep;
`rm -rf ./eXpress_PE`;
`rm -rf ./kallisto_PE`;
`rm -rf ./RSEM_PE`;
+`rm -rf ./test_trinity_long_reads`;
+
foreach my $file (<*>) {
diff --git a/sample_data/test_Trinity_Assembly/misc_run_tests/__runMe_include_long_reads.sh b/sample_data/test_Trinity_Assembly/misc_run_tests/__runMe_include_long_reads.sh
index 9dda7b5..e22c41c 100755
--- a/sample_data/test_Trinity_Assembly/misc_run_tests/__runMe_include_long_reads.sh
+++ b/sample_data/test_Trinity_Assembly/misc_run_tests/__runMe_include_long_reads.sh
@@ -5,7 +5,7 @@
#######################################################
## use jellyfish
-../../Trinity --seqType fq --max_memory 2G --left reads.left.fq.gz --right reads.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup --long_reads longReads.fa
+../../Trinity --seqType fq --max_memory 2G --left reads.left.fq.gz --right reads.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup --long_reads longReads.fa --output test_trinity_long_reads
##### Done Running Trinity #####
diff --git a/sample_data/test_Trinity_Assembly/runMe.sh b/sample_data/test_Trinity_Assembly/runMe.sh
index fb16b44..669ac5d 100755
--- a/sample_data/test_Trinity_Assembly/runMe.sh
+++ b/sample_data/test_Trinity_Assembly/runMe.sh
@@ -23,7 +23,7 @@ fi
## Run Trinity to Generate Transcriptome Assemblies ##
#######################################################
-../../Trinity --seqType fq --max_memory 2G --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup
+../../Trinity --seqType fq --max_memory 2G --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup --normalize_reads
##### Done Running Trinity #####
diff --git a/sample_data/test_align_and_estimate_abundance/Makefile b/sample_data/test_align_and_estimate_abundance/Makefile
index 2c0d233..bc491d3 100644
--- a/sample_data/test_align_and_estimate_abundance/Makefile
+++ b/sample_data/test_align_and_estimate_abundance/Makefile
@@ -1,17 +1,14 @@
-test: test_RSEM test_eXpress test_kallisto
- ../../Analysis/DifferentialExpression/merge_matrices.pl RSEM-trans.TPM.not_cross_norm eXpress-trans.TPM.not_cross_norm kallisto-trans.TPM.not_cross_norm > combined.TPM.not_cross_norm.matrix
- ./plot_paired_comparisons.Rscript
-test_RSEM:
- ./align_and_estimate_tester.pl RSEM
+DIRS = PAIRED_END_ABUNDANCE_ESTIMATION SINGLE_END_ABUNDANCE_ESTIMATION
-test_eXpress:
- ./align_and_estimate_tester.pl eXpress
-
-test_kallisto:
- ./align_and_estimate_tester.pl kallisto
+test:
+ @for i in $(DIRS); do \
+ echo "Running example in $$i..."; \
+ (cd $$i; $(MAKE) test) || exit $$?; done
clean:
- ./cleanme.pl
+ @for i in $(DIRS); do \
+ echo "Running example in $$i..."; \
+ (cd $$i; $(MAKE) clean) || exit $$?; done
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/Makefile b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/Makefile
new file mode 100644
index 0000000..eb12951
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/Makefile
@@ -0,0 +1,41 @@
+SAMPLES="samples.txt"
+TRINITY_FASTA="../../test_DATA/Trinity.fasta"
+
+test: test_RSEM test_eXpress test_kallisto test_salmon
+ ../../../Analysis/DifferentialExpression/merge_matrices.pl RSEM-trans.TPM.not_cross_norm express-trans.TPM.not_cross_norm kallisto-trans.TPM.not_cross_norm salmon-quasi-trans.TPM.not_cross_norm salmon-fmd-trans.TPM.not_cross_norm > combined.TPM.not_cross_norm.matrix
+ ../plot_paired_comparisons.Rscript
+
+test_RSEM:
+ifeq ("", "$(shell which rsem-calculate-expression)")
+ echo WARNING: skipping RSEM test, cannot locate rsem-calculate-expression tool in PATH
+else
+ ../align_and_estimate_tester.pl RSEM ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_eXpress:
+ifeq ("", "$(shell which express)")
+ echo WARNING: skipping express test, cant locate express tool in PATH
+else
+ ../align_and_estimate_tester.pl express ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_kallisto:
+ifeq ("", "$(shell which kallisto)")
+ echo WARNING: cannot find kallisto installed in PATH, skipping kallisto test
+else
+ ../align_and_estimate_tester.pl kallisto ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_salmon:
+ifeq ("", "$(shell which salmon)")
+ echo WARNING: cannot find salmon installed in PATH, skipping salmon test
+else
+ ../align_and_estimate_tester.pl salmon-quasi ${SAMPLES} ${TRINITY_FASTA}
+ ../align_and_estimate_tester.pl salmon-fmd ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+
+clean:
+ ./cleanme.pl
+
+
diff --git a/sample_data/test_align_and_estimate_abundance/cleanme.pl b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/cleanme.pl
similarity index 56%
copy from sample_data/test_align_and_estimate_abundance/cleanme.pl
copy to sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/cleanme.pl
index b768bdc..7cd38c2 100755
--- a/sample_data/test_align_and_estimate_abundance/cleanme.pl
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/cleanme.pl
@@ -7,23 +7,31 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
my @files_to_keep = qw (
cleanme.pl
Makefile
-align_and_estimate_tester.pl
+align_and_estimate_tester_PAIRED_END.pl
samples.txt
-plot_paired_comparisons.Rscript
+
+schizo.samples.txt
+test_Schizo.sh
+
+mouse.samples.txt
+test_Mouse.sh
+
+drosoph.samples.txt
+test_Drosoph.sh
+
);
my %keep = map { + $_ => 1 } @files_to_keep;
-
foreach my $file (<*>) {
if (-f $file && ! $keep{$file}) {
@@ -34,8 +42,12 @@ foreach my $file (<*>) {
`rm -rf ./RSEM-*`;
-`rm -rf ./eXpress-*`;
+`rm -rf ./express-*`;
`rm -rf ./kallisto-*`;
+`rm -rf ./salmon-*`;
+`rm -rf Trinity.fasta.salmon*`;
+`rm -rf mm9_ucsc_genomeStudio_genes.gtf*`;
+`rm -rf SP2_GeneDB.290110.prot_coding.cds.nr*`;
exit(0);
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt
new file mode 100644
index 0000000..91ed8fb
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt
@@ -0,0 +1,6 @@
+DROSOPH /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.left.fa.gz /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.right.fa.gz
+
+
+--seqType fa
+--trinity_mode
+
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt
new file mode 100644
index 0000000..34100e2
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt
@@ -0,0 +1,6 @@
+DROSOPH /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.left.fa.gz /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.right.fa.gz
+
+
+--seqType fa
+--gene_trans_map /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/Annotations/drosoph.gene_trans_map
+
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt
new file mode 100644
index 0000000..28add80
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt
@@ -0,0 +1,6 @@
+MOUSE /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.left.fq /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.right.fq
+
+
+--SS_lib_type RF
+--seqType fq
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt
new file mode 100644
index 0000000..b29c3f3
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt
@@ -0,0 +1,6 @@
+MOUSE /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.left.fq /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.right.fq
+
+
+--SS_lib_type RF
+--seqType fq
+--gene_trans_map /seq/regev_genome_portal/RESOURCES/mouse/mm9/Annotations/mm9_ucsc_genomeStudio_genes/mm9_ucsc_genomeStudio_genes.gtf.gene_iso_map
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt
new file mode 100644
index 0000000..c1fb461
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt
@@ -0,0 +1,6 @@
+SCHIZO /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.left.fq.gz /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.right.fq.gz
+
+
+--SS_lib_type RF
+--seqType fq
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt
new file mode 100644
index 0000000..251d8ba
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt
@@ -0,0 +1,6 @@
+SCHIZO /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.left.fq.gz /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.right.fq.gz
+
+
+--SS_lib_type RF
+--seqType fq
+--gene_trans_map /home/unix/bhaas/utilities/schizo/SP2.gene_trans_map
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh
new file mode 100644
index 0000000..a8b3b90
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/trinity_out_dir/Trinity.fasta SAMPLES=misc_tests/drosoph_denovo.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh
new file mode 100644
index 0000000..2130fd2
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/Annotations/Drosoph_coding.cdna SAMPLES=misc_tests/drosoph_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh
new file mode 100644
index 0000000..159a1bb
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/trinity_out_dir/Trinity.fasta SAMPLES=misc_tests/mouse_denovo.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh
new file mode 100644
index 0000000..8924f25
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/regev_genome_portal/RESOURCES/mouse/mm9/Annotations/mm9_ucsc_genomeStudio_genes/mm9_ucsc_genomeStudio_genes.gtf.trans.fa SAMPLES=misc_tests/mouse_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh
new file mode 100644
index 0000000..cc6e694
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/trinity_out_dir/Trinity.fasta SAMPLES=misc_tests/schizo_denovo.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh
new file mode 100644
index 0000000..1de64f7
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/home/unix/bhaas/utilities/schizo/SP2_GeneDB.290110.prot_coding.cds.nr SAMPLES=misc_tests/schizo_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/samples.txt b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/samples.txt
new file mode 100644
index 0000000..5ba177a
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/PAIRED_END_ABUNDANCE_ESTIMATION/samples.txt
@@ -0,0 +1,8 @@
+heatshock ../../test_DATA/Sp_hs.10k.left.fq.gz ../../test_DATA/Sp_hs.10k.right.fq.gz
+plateau ../../test_DATA/Sp_plat.10k.right.fq.gz ../../test_DATA/Sp_plat.10k.left.fq.gz
+log_growth ../../test_DATA/Sp_log.10k.left.fq.gz ../../test_DATA/Sp_log.10k.right.fq.gz
+diauxic_shift ../../test_DATA/Sp_ds.10k.right.fq.gz ../../test_DATA/Sp_ds.10k.left.fq.gz
+
+--SS_lib_type RF
+--seqType fq
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/Makefile b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/Makefile
new file mode 100644
index 0000000..eb12951
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/Makefile
@@ -0,0 +1,41 @@
+SAMPLES="samples.txt"
+TRINITY_FASTA="../../test_DATA/Trinity.fasta"
+
+test: test_RSEM test_eXpress test_kallisto test_salmon
+ ../../../Analysis/DifferentialExpression/merge_matrices.pl RSEM-trans.TPM.not_cross_norm express-trans.TPM.not_cross_norm kallisto-trans.TPM.not_cross_norm salmon-quasi-trans.TPM.not_cross_norm salmon-fmd-trans.TPM.not_cross_norm > combined.TPM.not_cross_norm.matrix
+ ../plot_paired_comparisons.Rscript
+
+test_RSEM:
+ifeq ("", "$(shell which rsem-calculate-expression)")
+ echo WARNING: skipping RSEM test, cannot locate rsem-calculate-expression tool in PATH
+else
+ ../align_and_estimate_tester.pl RSEM ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_eXpress:
+ifeq ("", "$(shell which express)")
+ echo WARNING: skipping express test, cant locate express tool in PATH
+else
+ ../align_and_estimate_tester.pl express ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_kallisto:
+ifeq ("", "$(shell which kallisto)")
+ echo WARNING: cannot find kallisto installed in PATH, skipping kallisto test
+else
+ ../align_and_estimate_tester.pl kallisto ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+test_salmon:
+ifeq ("", "$(shell which salmon)")
+ echo WARNING: cannot find salmon installed in PATH, skipping salmon test
+else
+ ../align_and_estimate_tester.pl salmon-quasi ${SAMPLES} ${TRINITY_FASTA}
+ ../align_and_estimate_tester.pl salmon-fmd ${SAMPLES} ${TRINITY_FASTA}
+endif
+
+
+clean:
+ ./cleanme.pl
+
+
diff --git a/sample_data/test_align_and_estimate_abundance/cleanme.pl b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/cleanme.pl
similarity index 55%
rename from sample_data/test_align_and_estimate_abundance/cleanme.pl
rename to sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/cleanme.pl
index b768bdc..0d1a3fe 100755
--- a/sample_data/test_align_and_estimate_abundance/cleanme.pl
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/cleanme.pl
@@ -7,21 +7,30 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
my @files_to_keep = qw (
cleanme.pl
Makefile
-align_and_estimate_tester.pl
+align_and_estimate_tester_SINGLE_END.pl
samples.txt
-plot_paired_comparisons.Rscript
+
+test_Schizo.sh
+schizo.samples.txt
+
+test_Mouse.sh
+mouse.samples.txt
+
+test_Drosoph.sh
+drosoph.samples.txt
+
);
-my %keep = map { + $_ => 1 } @files_to_keep;
+my %keep = map { + $_ => 1 } @files_to_keep;
foreach my $file (<*>) {
@@ -34,8 +43,12 @@ foreach my $file (<*>) {
`rm -rf ./RSEM-*`;
-`rm -rf ./eXpress-*`;
+`rm -rf ./express-*`;
`rm -rf ./kallisto-*`;
+`rm -rf ./salmon-*`;
+`rm -rf Trinity.fasta.salmon*`;
+`rm -rf mm9_ucsc_genomeStudio_genes.gtf*`;
+`rm -rf SP2_GeneDB.290110.prot_coding.cds.nr*`;
exit(0);
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt
new file mode 100644
index 0000000..7572281
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_denovo.samples.txt
@@ -0,0 +1,5 @@
+DROSOPH /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.left.fa.gz
+
+
+--seqType fa
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt
new file mode 100644
index 0000000..a964778
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/drosoph_ref.samples.txt
@@ -0,0 +1,6 @@
+DROSOPH /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/Drosoph.10M.left.fa.gz
+
+
+--seqType fa
+--gene_trans_map /seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/Annotations/drosoph.gene_trans_map
+
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt
new file mode 100644
index 0000000..4eed62e
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_denovo.samples.txt
@@ -0,0 +1,6 @@
+MOUSE /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.left.fq
+
+
+--SS_lib_type R
+--seqType fq
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt
new file mode 100644
index 0000000..0f41deb
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/mouse_ref.samples.txt
@@ -0,0 +1,6 @@
+MOUSE /seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/10M.left.fq
+
+
+--SS_lib_type R
+--seqType fq
+--gene_trans_map /seq/regev_genome_portal/RESOURCES/mouse/mm9/Annotations/mm9_ucsc_genomeStudio_genes/mm9_ucsc_genomeStudio_genes.gtf.gene_iso_map
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt
new file mode 100644
index 0000000..792b7ac
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_denovo.samples.txt
@@ -0,0 +1,6 @@
+SCHIZO /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.left.fq.gz
+
+
+--SS_lib_type R
+--seqType fq
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt
new file mode 100644
index 0000000..013542a
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/schizo_ref.samples.txt
@@ -0,0 +1,6 @@
+SCHIZO /seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/10M.left.fq.gz
+
+
+--SS_lib_type RF
+--seqType fq
+--gene_trans_map /home/unix/bhaas/utilities/schizo/SP2.gene_trans_map
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh
new file mode 100644
index 0000000..984a8e5
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/10M/trinity_out_dir/Trinity.fasta SAMPLES=drosoph.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh
new file mode 100644
index 0000000..2130fd2
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Drosoph_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/DROSOPHILA/Annotations/Drosoph_coding.cdna SAMPLES=misc_tests/drosoph_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh
new file mode 100644
index 0000000..c3a2fc1
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/MOUSE_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel/trinity_out_dir/Trinity.fasta SAMPLES=mouse.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh
new file mode 100644
index 0000000..8924f25
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Mouse_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/regev_genome_portal/RESOURCES/mouse/mm9/Annotations/mm9_ucsc_genomeStudio_genes/mm9_ucsc_genomeStudio_genes.gtf.trans.fa SAMPLES=misc_tests/mouse_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh
new file mode 100644
index 0000000..ebef2e4
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_denovo.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/seq/RNASEQ/TRINITY_DEVEL_DATA/SCHIZO_RNASEQ_METHODS_ANALYSIS/10M/SS_trin_test/run_devel_trinity/trinity_out_dir/Trinity.fasta SAMPLES=schizo.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh
new file mode 100644
index 0000000..1de64f7
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/misc_tests/test_Schizo_ref.sh
@@ -0,0 +1 @@
+make TRINITY_FASTA=/home/unix/bhaas/utilities/schizo/SP2_GeneDB.290110.prot_coding.cds.nr SAMPLES=misc_tests/schizo_ref.samples.txt
diff --git a/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/samples.txt b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/samples.txt
new file mode 100644
index 0000000..948f920
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/SINGLE_END_ABUNDANCE_ESTIMATION/samples.txt
@@ -0,0 +1,8 @@
+heatshock ../../test_DATA/Sp_hs.10k.left.fq.gz
+plateau ../../test_DATA/Sp_plat.10k.right.fq.gz
+log_growth ../../test_DATA/Sp_log.10k.left.fq.gz
+diauxic_shift ../../test_DATA/Sp_ds.10k.right.fq.gz
+
+--seqType fq
+--SS_lib_type R
+--trinity_mode
diff --git a/sample_data/test_align_and_estimate_abundance/align_and_estimate_tester.pl b/sample_data/test_align_and_estimate_abundance/align_and_estimate_tester.pl
index c59bf9a..b45da2f 100755
--- a/sample_data/test_align_and_estimate_abundance/align_and_estimate_tester.pl
+++ b/sample_data/test_align_and_estimate_abundance/align_and_estimate_tester.pl
@@ -5,21 +5,21 @@ use warnings;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Process_cmd;
-my $usage = "usage: $0 (RSEM|eXpress|kallisto)\n\n";
+my $usage = "usage: $0 (RSEM|eXpress|kallisto|salmon-(fmd|quasi)) samples.txt Trinity.fasta\n\n";
+
my $method = $ARGV[0] or die $usage;
-unless ($method =~ /^(RSEM|eXpress|kallisto)$/) {
+unless ($method =~ /^(RSEM|eXpress|kallisto|salmon-(fmd|quasi))$/i) {
die $usage;
}
+my $samples_file = $ARGV[1] or die $usage;
+my $trinity_fasta = $ARGV[2] or die $usage;
-my $utildir = "$FindBin::Bin/../../util";
-
-my $samples_file = "samples.txt";
-my $trinity_fasta = "../test_DATA/Trinity.fasta";
+my $utildir = "$FindBin::RealBin/../../util";
main: {
@@ -28,26 +28,48 @@ main: {
$trinity_fasta = basename($trinity_fasta);
my @samples;
+ my @global_params;
{
open (my $fh, $samples_file) or die $!;
while (<$fh>) {
chomp;
- my ($sample_name, $left_fq, $right_fq) = split(/\s+/);
- $left_fq = &ensure_full_path($left_fq);
- $right_fq = &ensure_full_path($right_fq);
- push (@samples, [$sample_name, $left_fq, $right_fq]);
+ unless (/\w/) { next; }
+ if (/^\#/) { next; }
+ if (/^\-/) {
+ push (@global_params, $_);
+ }
+ else {
+ my ($sample_name, $left_fq, $right_fq) = split(/\s+/);
+ unless ($left_fq) {
+ die "Error, not able to parse line: $_";
+ }
+ $left_fq = &ensure_full_path($left_fq);
+ $right_fq = &ensure_full_path($right_fq) if $right_fq;
+
+ my @local_params;
+
+ if ($left_fq && $right_fq) {
+ push (@local_params, "--left $left_fq --right $right_fq");
+ }
+ else {
+ push (@local_params, "--single $left_fq")
+ }
+
+ push (@samples, [$sample_name, @local_params]);
+
+ }
}
close $fh;
}
-
+
my @trans_results;
my @gene_results;
foreach my $sample (@samples) {
- my ($sample_name, $left_fq, $right_fq) = @$sample;
+ my ($sample_name, @local_params) = @$sample;
my $cmd = "$utildir/align_and_estimate_abundance.pl --transcripts $trinity_fasta --prep_reference "
- . " --left $left_fq --right $right_fq --seqType fq --trinity_mode ";
+ . " @local_params @global_params";
my $outdir = "$method-$sample_name";
@@ -58,27 +80,31 @@ main: {
push (@gene_results, "$outdir/RSEM.genes.results");
}
- elsif ($method eq 'eXpress') {
+ elsif ($method =~ /eXpress/i) {
$cmd .= " --est_method eXpress --output_dir $outdir --aln_method bowtie2 ";
push (@trans_results, "$outdir/results.xprs");
push (@gene_results, "$outdir/results.xprs.genes");
}
elsif ($method eq 'kallisto') {
- $cmd .= " --est_method kallisto --output_dir kallisto-$sample_name ";
+ $cmd .= " --est_method kallisto --output_dir $outdir ";
push (@trans_results, "$outdir/abundance.tsv");
push (@gene_results, "$outdir/abundance.tsv.genes");
-
+ }
+ elsif($method =~ /salmon-(\w+)$/) {
+ my $salmon_idx_type = $1;
+ $cmd .= " --est_method salmon --salmon_idx_type $salmon_idx_type --output_dir $outdir";
+ push (@trans_results, "$outdir/quant.sf");
+ push (@gene_results, "$outdir/quant.sf.genes");
}
else {
# shouldn't ever get here.
die "error - method $method not recognized";
}
-
+
&process_cmd($cmd);
-
-
+
}
-
+
## generate matrices.
my $cmd = "$utildir/abundance_estimates_to_matrix.pl --est_method $method --out_prefix $method-trans --name_sample_by_basedir @trans_results";
&process_cmd($cmd);
diff --git a/sample_data/test_align_and_estimate_abundance/pairs.Rscript b/sample_data/test_align_and_estimate_abundance/pairs.Rscript
new file mode 100755
index 0000000..5764215
--- /dev/null
+++ b/sample_data/test_align_and_estimate_abundance/pairs.Rscript
@@ -0,0 +1,6 @@
+#!/usr/bin/env Rscript
+
+pdf("pairs.pdf")
+data = read.table("combined.TPM.not_cross_norm.matrix")
+data = log2(data+1)
+pairs(data, pch='.')
diff --git a/sample_data/test_align_and_estimate_abundance/samples.txt b/sample_data/test_align_and_estimate_abundance/samples.txt
deleted file mode 100644
index 4bce4f8..0000000
--- a/sample_data/test_align_and_estimate_abundance/samples.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-heatshock ../test_DATA/Sp_hs.10k.left.fq.gz ../test_DATA/Sp_hs.10k.right.fq.gz
-plateau ../test_DATA/Sp_plat.10k.right.fq.gz ../test_DATA/Sp_plat.10k.left.fq.gz
-log_growth ../test_DATA/Sp_log.10k.left.fq.gz ../test_DATA/Sp_log.10k.right.fq.gz
-diauxic_shift ../test_DATA/Sp_ds.10k.right.fq.gz ../test_DATA/Sp_ds.10k.left.fq.gz
diff --git a/sample_data/test_full_edgeR_pipeline/cleanme.pl b/sample_data/test_full_edgeR_pipeline/cleanme.pl
index 78235ce..efc796d 100755
--- a/sample_data/test_full_edgeR_pipeline/cleanme.pl
+++ b/sample_data/test_full_edgeR_pipeline/cleanme.pl
@@ -7,7 +7,7 @@ use FindBin;
## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
-chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+chdir $FindBin::RealBin or die "error, cannot cd to $FindBin::RealBin";
diff --git a/trinity-plugins/Makefile b/trinity-plugins/Makefile
index 4218185..6e6e788 100644
--- a/trinity-plugins/Makefile
+++ b/trinity-plugins/Makefile
@@ -10,6 +10,14 @@ FASTOOL_CODE=fstrozzi-Fastool-7c3e034f05
PARAFLY_CODE=parafly-code
TRIMMOMATIC_CODE=Trimmomatic-0.32
+
+UNAME_S=$(shell uname -s)
+ifeq ("${UNAME_S}", "Linux")
+ LTINFO="LIBPATH=-ltinfo"
+endif
+
+
+
trinity_essentials: jellyfish scaffold_iworm_contigs_target fastool_target parafly_target trimmomatic_target samtools
trimmomatic_target:
@@ -18,7 +26,7 @@ trimmomatic_target:
samtools:
tar xvf samtools-0.1.19.tar.bz2
- cd samtools-0.1.19 && $(MAKE) LIBPATH=-ltinfo
+ cd samtools-0.1.19 && $(MAKE) ${LTINFO}
mv samtools-0.1.19/samtools ./BIN/.
jellyfish:
diff --git a/trinity-plugins/fstrozzi-Fastool-7c3e034f05/Makefile b/trinity-plugins/fstrozzi-Fastool-7c3e034f05/Makefile
index 36b4299..f5ea2c3 100644
--- a/trinity-plugins/fstrozzi-Fastool-7c3e034f05/Makefile
+++ b/trinity-plugins/fstrozzi-Fastool-7c3e034f05/Makefile
@@ -1,8 +1,7 @@
-CC = gcc
-CFLAGS = -O2 -std=c99 -Werror
+CFLAGS += -O2 -std=c99 -Werror
all:kseq.h fastool.c
- $(CC) $(CFLAGS) fastool.c -o fastool
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) fastool.c -o fastool
clean:
rm -f *.o fastool
diff --git a/trinity-plugins/scaffold_iworm_contigs/Makefile b/trinity-plugins/scaffold_iworm_contigs/Makefile
index 4babd7c..202f39b 100644
--- a/trinity-plugins/scaffold_iworm_contigs/Makefile
+++ b/trinity-plugins/scaffold_iworm_contigs/Makefile
@@ -2,9 +2,9 @@ CXX = g++
prefix = ../htslib
ScaffoldIwormContigs:
- $(CXX) -I$(prefix) -L$(prefix) ScaffoldIwormContigs.cpp error_checker.cpp -lhts -lz -o scaffold_iworm_contigs
+ $(CXX) $(LDFLAGS) -I$(prefix) -L$(prefix) ScaffoldIwormContigs.cpp error_checker.cpp -lhts -o scaffold_iworm_contigs
clean:
- rm scaffold_iworm_contigs
+ rm -f scaffold_iworm_contigs
diff --git a/trinity-plugins/slclust/src/Makefile b/trinity-plugins/slclust/src/Makefile
index a9f8d38..aa134ab 100755
--- a/trinity-plugins/slclust/src/Makefile
+++ b/trinity-plugins/slclust/src/Makefile
@@ -17,11 +17,8 @@ OBJS = slcluster.o graph.o graphnode.o cmd_line_opts.o
# use ${LIBDIR} defined above if appropriate
LIBS =
-# How they are invoked on the compile line (eg: -lspecial)
-LLIBS =
-
# Local additions for the CFLAG options
-LOCAL_CFLAGS = -Wall
+LOCAL_CXXFLAGS = -Wall
#-static
@@ -34,20 +31,11 @@ BIN = ${PROJECT_ROOT}/bin
INCLUDE = ${PROJECT_ROOT}/include
DEBUG = DEBUG
-CFLAGS = -I${INCLUDE} ${LOCAL_CFLAGS}
-CC = g++ ${CFLAGS}
+CXXFLAGS += -I${INCLUDE} ${LOCAL_CXXFLAGS}
MAKEFILE = Makefile
-# Suffix rules
-
-.cc.o:
- ${CC} -c $<
-
-.cpp.o:
- ${CC} -c $<
-
# Target dependencies
@@ -59,12 +47,13 @@ install : ${EXECUTABLE}
clean :
- rm -f ${OBJS} core a.out *~ \#* ${EXECUTABLE} ${MAKEFILE}.bak
+ rm -f ${OBJS} core a.out *~ \#* ${EXECUTABLE} ${MAKEFILE}.bak \
+ ${BIN}/${EXECUTABLE}
${OBJ} : ${MAKEFILE}
${EXECUTABLE} : ${OBJS}
- ${CC} ${OBJS} ${LIBS} -o ${EXECUTABLE}
+ ${CXX} ${LDFLAGS} ${OBJS} ${LIBS} -o ${EXECUTABLE}
chmod 755 ${EXECUTABLE}
diff --git a/util/SAM_nameSorted_to_uniq_count_stats.pl b/util/SAM_nameSorted_to_uniq_count_stats.pl
index 40e7f07..4daba48 100755
--- a/util/SAM_nameSorted_to_uniq_count_stats.pl
+++ b/util/SAM_nameSorted_to_uniq_count_stats.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../PerlLib");
+use lib ("$FindBin::RealBin/../PerlLib");
use SAM_reader;
use SAM_entry;
@@ -94,7 +94,7 @@ main: {
print STDERR "\n\n";
- &process_pairs(\@reads, \%counts);
+ &process_pairs(\@reads, \%counts) if @reads;
my $sum_reads = 0;
@@ -108,8 +108,8 @@ main: {
print "$count_type\t$count\t" . sprintf("%.2f", $count/$sum_reads*100) . "\n";
}
print "\n";
- print "Total aligned reads: $sum_reads\n\n";
-
+ print "Total aligned rnaseq fragments: $sum_reads\n\n";
+
close $DEBUG_OFH if $DEBUG;
@@ -165,12 +165,12 @@ sub process_pairs {
my $class = "";
if ($got_proper_pair) {
- $counts_href->{proper_pairs} += 2;
+ $counts_href->{proper_pairs}++;
$class = "PP";
}
elsif ($got_left_read && $got_right_read) {
- $counts_href->{improper_pairs} += 2;
+ $counts_href->{improper_pairs}++;
$class = "IP";
}
elsif ($got_left_read) {
diff --git a/util/TrinityStats.pl b/util/TrinityStats.pl
index 2ca4dc3..593284d 100755
--- a/util/TrinityStats.pl
+++ b/util/TrinityStats.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../PerlLib");
+use lib ("$FindBin::RealBin/../PerlLib");
use Fasta_reader;
use BHStats;
diff --git a/util/abundance_estimates_to_matrix.pl b/util/abundance_estimates_to_matrix.pl
index e5a031f..c9b280b 100755
--- a/util/abundance_estimates_to_matrix.pl
+++ b/util/abundance_estimates_to_matrix.pl
@@ -11,9 +11,14 @@ my $usage = <<__EOUSAGE__;
############################################################
#
# Usage: $0 --est_method <method> sample1.results sample2.results ...
-# Required:
#
-# --est_method <string> RSEM|eXpress|kallisto (needs to know what format to expect)
+# or $0 --est_method <method> --samples_file file.listing_target_files.txt
+#
+# Note, if only a single input file is given, it's expected to contain the paths to all the target abundance estimation files.
+#
+# Required:
+#
+# --est_method <string> RSEM|eXpress|kallisto|salmon (needs to know what format to expect)
#
# Options:
#
@@ -24,6 +29,8 @@ my $usage = <<__EOUSAGE__;
#
# --out_prefix <string> default: 'matrix'
#
+# --samples_file <string> file containing a list of all the target files.
+#
############################################################
@@ -39,6 +46,7 @@ my $cross_sample_norm = "TMM";
my $name_sample_by_basedir = 0;
my $out_prefix = "matrix";
my $basedir_index = -2;
+my $samples_file = "";
&GetOptions('help|h' => \$help_flag,
'est_method=s' => \$est_method,
@@ -48,6 +56,7 @@ my $basedir_index = -2;
'out_prefix=s' => \$out_prefix,
'basedir_index=i' => \$basedir_index,
+ 'samples_file=s' => \$samples_file,
);
@@ -55,26 +64,27 @@ unless ($est_method && @ARGV) {
die $usage;
}
-unless ($est_method =~ /^(RSEM|eXpress|kallisto)$/i) {
+unless ($est_method =~ /^(RSEM|eXpress|kallisto|salmon)/i) {
die "Error, dont recognize --est_method $est_method ";
}
unless ($cross_sample_norm =~ /^(TMM|UpperQuartile|none)$/i) {
die "Error, dont recognize --cross_sample_norm $cross_sample_norm ";
}
-my @files = @ARGV;
-
-if (scalar @files == 1) {
+my @files;
- if (-s $files[0]) {
- # allow for a file listing the various files.
- @files = `cat $files[0]`;
- chomp @files;
- }
- else {
- die $usage;
- }
+if ($samples_file) {
+ # allow for a file listing the various files.
+ @files = `cat $samples_file`;
+ chomp @files;
}
+elsif (@ARGV) {
+ @files = @ARGV;
+}
+else {
+ die $usage;
+}
+
=data_formats
@@ -116,6 +126,14 @@ if (scalar @files == 1) {
3 est_counts
4 tpm
+
+## salmon:
+0 Name
+1 Length
+2 EffectiveLength
+3 TPM
+4 NumReads
+
=cut
;
@@ -140,8 +158,14 @@ elsif ($est_method =~ /^kallisto$/i) {
$fpkm_field = "tpm";
$tpm_field = "tpm";
}
+elsif ($est_method =~ /^salmon/) {
+ $acc_field = "Name";
+ $counts_field = "NumReads";
+ $fpkm_field = "TPM";
+ $tpm_field = "TPM";
+}
else {
- die "Error, dont recognize --est_method $est_method ";
+ die "Error, dont recognize --est_method [$est_method] ";
}
main: {
@@ -236,18 +260,28 @@ main: {
close $ofh_counts;
close $ofh_TPM;
- if ($cross_sample_norm =~ /^TMM$/i) {
- my $cmd = "$FindBin::Bin/support_scripts/run_TMM_scale_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
- &process_cmd($cmd);
- }
- elsif ($cross_sample_norm =~ /^UpperQuartile$/) {
- my $cmd = "$FindBin::Bin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
- &process_cmd($cmd);
+
+ if (scalar @files > 1) {
+ ## more than one sample
+
+ if ($cross_sample_norm =~ /^TMM$/i) {
+ my $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
+ &process_cmd($cmd);
+ }
+ elsif ($cross_sample_norm =~ /^UpperQuartile$/) {
+ my $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
+ &process_cmd($cmd);
+ }
+ elsif ($cross_sample_norm =~ /^none$/i) {
+ print STDERR "-not performing cross-sample normalization.\n";
+ }
}
- elsif ($cross_sample_norm =~ /^none$/i) {
- print STDERR "-not performing cross-sample normalization.\n";
+ else {
+ unless (scalar @files == 1) {
+ die "Error, no target samples. Shouldn't get here.";
+ }
+ print STDERR "Warning, only one sample, so not performing cross-sample normalization\n";
}
-
print STDERR "Done.\n\n";
exit(0);
diff --git a/util/align_and_estimate_abundance.pl b/util/align_and_estimate_abundance.pl
index 182dc12..1fd0688 100755
--- a/util/align_and_estimate_abundance.pl
+++ b/util/align_and_estimate_abundance.pl
@@ -10,10 +10,6 @@ use Carp;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
-my $RSEM_DIR = "$FindBin::Bin/../trinity-plugins/rsem";
-$ENV{PATH} = "$RSEM_DIR:$ENV{PATH}"; # be sure to use the included rsem package over other ones installed.
-
-
my %aligner_params = (
@@ -57,7 +53,10 @@ my %aligner_params = (
my $rsem_add_opts = "";
my $eXpress_add_opts = "";
my $kallisto_add_opts = "";
-
+my $salmon_add_opts= "";
+my $salmon_idx_type = 'quasi';
+my $salmon_quasi_kmer_length = 31;
+my $salmon_fmd_kmer_length = 19;
my $usage = <<__EOUSAGE__;
@@ -68,16 +67,16 @@ my $usage = <<__EOUSAGE__;
#
# If Paired-end:
#
-# --left <string>
-# --right <string>
+# --left <string>
+# --right <string>
#
-# or Single-end:
+# or Single-end:
#
-# --single <string>
+# --single <string>
#
# --est_method <string> abundance estimation method.
# alignment_based: RSEM|eXpress
-# alignment_free: kallisto
+# alignment_free: kallisto|salmon
#
# --output_dir <string> write all files to output directory
#
@@ -89,38 +88,65 @@ my $usage = <<__EOUSAGE__;
# Optional:
#
# --SS_lib_type <string> strand-specific library type: paired('RF' or 'FR'), single('F' or 'R').
+# (note, no strand-specific mode for kallisto)
#
# --thread_count number of threads to use (default = 4)
#
-# --max_ins_size <int> maximum insert size (bowtie -X parameter, default: 800)
-#
# --debug retain intermediate files
#
-#
# --gene_trans_map <string> file containing 'gene(tab)transcript' identifiers per line.
# or
# --trinity_mode Setting --trinity_mode will automatically generate the gene_trans_map and use it.
#
#
-# --prep_reference prep reference set for eXpress (builds bowtie index, etc)
+# --prep_reference prep reference (builds target index)
#
# --output_prefix <string> prefix for output files. Defaults to --est_method setting.
#
#
-# if alignment_based method:
-# --coordsort_bam provide coord-sorted bam in addition to the default (unsorted) bam.
+########################################
#
-# --show_full_usage_info provide more detailed usage info for customizing the alignment or abundance estimation parameters.
+# Parameters for single-end reads:
#
-#############################
+# --fragment_length <int> specify RNA-Seq fragment length (default: 200)
+# --fragment_std <int> fragment length standard deviation (default: 80)
+#
+########################################
+#
+# bowtie-related parameters: (note, tool-specific settings are further below)
+#
+# --max_ins_size <int> maximum insert size (bowtie -X parameter, default: 800)
+# --coordsort_bam provide coord-sorted bam in addition to the default (unsorted) bam.
+#
+########################################
# RSEM opts:
-# --fragment_length <int> optionally specify fragment length (not seq length, but frag size ie. 300) for SE reads.
#
+# --bowtie_RSEM <string> if using 'bowtie', default: \"$aligner_params{bowtie_RSEM}\"
+# --bowtie2_RSEM <string> if using 'bowtie2', default: \"$aligner_params{bowtie2_RSEM}\"
# --include_rsem_bam provide the RSEM enhanced bam file including posterior probabilities of read assignments.
+# --rsem_add_opts <string> additional parameters to pass on to rsem-calculate-expression
#
-#########################################################################
+##########################################################################
+# eXpress opts:
+#
+# --bowtie_eXpress <string> default: \"$aligner_params{bowtie_eXpress}\"
+# --bowtie2_eXpress <string> default: \"$aligner_params{bowtie2_eXpress}\"
+# --eXpress_add_opts <string> default: "$eXpress_add_opts"
+#
+##########################################################################
+# kallisto opts:
+#
+# --kallisto_add_opts <string> default: $kallisto_add_opts
+#
+##########################################################################
+#
+# salmon opts:
#
-# Example usage:
+# --salmon_idx_type <string> quasi|fmd (default: $salmon_idx_type)
+# --salmon_add_opts <string> default: $salmon_add_opts
+#
+#
+# Example usage
#
# ## Just prepare the reference for alignment and abundance estimation
#
@@ -128,11 +154,11 @@ my $usage = <<__EOUSAGE__;
#
# ## Run the alignment and abundance estimation (assumes reference has already been prepped, errors-out if prepped reference not located.)
#
-# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method RSEM --aln_method bowtie --trinity_mode
+# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method RSEM --aln_method bowtie --trinity_mode --output_dir rsem_outdir
#
## ## prep the reference and run the alignment/estimation
#
-# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference
+# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method RSEM --aln_method bowtie --trinity_mode --prep_reference --output_dir rsem_outdir
#
#########################################################################
@@ -143,39 +169,6 @@ __EOUSAGE__
-my $advanced_usage_info = <<__EOADVANCEDUSAGE__;
-
-
-#############################################################
-## Customizing alignment and abundance estimation parameters.
-#############################################################
-#
-# Default alignment parameters are:
-#
-# --bowtie_RSEM <string> default: \"$aligner_params{bowtie_RSEM}\"
-# --bowtie2_RSEM <string> default: \"$aligner_params{bowtie2_RSEM}\"
-#
-# --bowtie_eXpress <string> default: \"$aligner_params{bowtie_eXpress}\"
-# --bowtie2_eXpress <string> default: \"$aligner_params{bowtie2_eXpress}\"
-#
-# Options to pass on to RSEM or eXpress
-#
-# --rsem_add_opts <string> default: "$rsem_add_opts"
-#
-# --eXpress_add_opts <string> default: "$eXpress_add_opts"
-#
-# --kallisto_add_opts <string> default: $kallisto_add_opts
-
-# * note, options for handling strand-specific reads are already taken care of internally, so no need to
-# pass on those parameters.
-#
-##############################################################
-
-__EOADVANCEDUSAGE__
-
- ;
-
-my $show_full_usage_info;
my $output_dir;
my $help_flag;
@@ -195,10 +188,10 @@ my $max_ins_size = 800;
my $est_method;
my $aln_method = "";
-
my $retain_sorted_bam_file = 0;
-my $fragment_length = "";
+my $fragment_length = 200;
+my $fragment_std = 80;
my $output_prefix = "";
@@ -240,13 +233,11 @@ my $coordsort_bam_flag = 0;
## devel opts
'prep_reference' => \$prep_reference,
- ## rsem opts
+ # opts for single-end reads
'fragment_length=i' => \$fragment_length,
+ 'fragment_std=i' => \$fragment_std,
#
- 'show_full_usage_info' => \$show_full_usage_info,
-
-
'bowtie_RSEM=s' => \($aligner_params{'bowtie_RSEM'}),
'bowtie2_RSEM=s' => \($aligner_params{'bowtie2_RSEM'}),
'bowtie_eXpress=s' => \($aligner_params{'bowtie_eXpress'}),
@@ -255,10 +246,15 @@ my $coordsort_bam_flag = 0;
'rsem_add_opts=s' => \$rsem_add_opts,
'eXpress_add_opts=s' => \$eXpress_add_opts,
'kallisto_add_opts=s' => \$kallisto_add_opts,
+ 'salmon_add_opts=s' => \$salmon_add_opts,
'coordsort_bam' => \$coordsort_bam_flag,
+
+ 'salmon_idx_type=s' => \$salmon_idx_type,
+ 'salmon_quasi_kmer_length=i' => \$salmon_quasi_kmer_length,
+ 'salmon_fmd_kmer_length=i' => \$salmon_fmd_kmer_length,
- );
+ );
@@ -269,23 +265,21 @@ if (@ARGV) {
if ($help_flag) {
die $usage;
}
-if ($show_full_usage_info) {
- die "$usage\n\n$advanced_usage_info\n\n";
-}
+
unless ($est_method) {
die $usage;
}
-my @EST_METHODS = qw(RSEM eXpress kallisto);
-my %ALIGNMENT_BASED_EST_METHODS = map { + $_ => 1 } qw (RSEM eXpress);
-my %ALIGNMENT_FREE_EST_METHODS = map { + $_ => 1 } qw (kallisto);
+my @EST_METHODS = qw(RSEM express kallisto salmon);
+my %ALIGNMENT_BASED_EST_METHODS = map { + $_ => 1 } qw (RSEM express eXpress);
+my %ALIGNMENT_FREE_EST_METHODS = map { + $_ => 1 } qw (kallisto salmon);
unless ($output_dir) {
die "Error, must specify output directory name via: --output_dir ";
}
-unless (($est_method && $prep_reference && $transcripts) ## just prep reference
+unless (($est_method && $prep_reference && $transcripts && (! ($single||$left||$right)) ) ## just prep reference
|| ($transcripts && $est_method && $seqType && ($single || ($left && $right))) # do alignment
@@ -315,14 +309,20 @@ else {
}
-unless ($est_method =~ /^(RSEM|eXpress|kallisto|none)$/) {
- die "Error, --est_method @EST_METHODS only, and capitalization matters. :) \n";
+unless ($est_method =~ /^(RSEM|express|kallisto|salmon|none)$/i) {
+ die "Error, --est_method @EST_METHODS only\n";
}
$left = &create_full_path($left) if $left;
$right = &create_full_path($right) if $right;
-$single = &create_full_path($single) if $single;
+
+if ($single) {
+ $single = &create_full_path($single);
+ unless ($fragment_length) {
+ die "Error, specify --fragment_length for single-end reads (note, not the length of the read but the mean fragment length)\n\n";
+ }
+}
$transcripts = &create_full_path($transcripts);
@@ -370,13 +370,17 @@ if ( $thread_count !~ /^\d+$/ ) {
if ($est_method =~ /^RSEM$/i) {
push (@tools, 'rsem-calculate-expression');
}
- elsif ($est_method =~ /^eXpress$/i) {
+ elsif ($est_method =~ /^express$/i) {
push (@tools, 'express');
}
elsif ($est_method eq 'kallisto') {
push (@tools, 'kallisto');
}
+ elsif ($est_method eq 'salmon') {
+ push (@tools, 'salmon');
+ }
+
foreach my $tool (@tools) {
my $p = `which $tool`;
unless ($p =~ /\w/) {
@@ -395,7 +399,7 @@ main: {
if ($trinity_mode && ! $gene_trans_map_file) {
$gene_trans_map_file = "$transcripts.gene_trans_map";
- my $cmd = "$FindBin::Bin/support_scripts/get_Trinity_gene_to_trans_map.pl $transcripts > $gene_trans_map_file";
+ my $cmd = "$FindBin::RealBin/support_scripts/get_Trinity_gene_to_trans_map.pl $transcripts > $gene_trans_map_file";
&process_cmd($cmd) unless (-e $gene_trans_map_file);
}
@@ -419,6 +423,13 @@ sub run_alignment_FREE_estimation {
if ($est_method eq "kallisto") {
&run_kallisto();
}
+ elsif ($est_method eq "salmon") {
+ &run_salmon();
+ }
+ else {
+ die "Error, not recognizing est_method: $est_method";
+ # shouldn't get here
+ }
}
@@ -510,60 +521,71 @@ sub run_alignment_BASED_estimation {
#####################
## Run alignments
#####################
-
- unless (-d $output_dir) {
- system("mkdir -p $output_dir");
- }
- chdir $output_dir or die "Error, cannot cd to output directory $output_dir";
-
my $prefix = $output_prefix;
if ($prefix) {
$prefix .= "."; # add separator in filename
}
-
- my $read_type = ($seqType eq "fq") ? "-q" : "-f";
-
- ## run bowtie
- my $bowtie_cmd;
my $bam_file = "${prefix}${aln_method}.bam";
my $bam_file_ok = "$bam_file.ok";
if ($PROCESSING_EXISTING_BAM_FLAG) {
$bam_file = $aln_method;
+ $bam_file = &create_full_path($bam_file);
unless (-e $bam_file_ok) {
&process_cmd("touch $bam_file_ok");
}
}
+
+
+ unless (-d $output_dir) {
+ system("mkdir -p $output_dir");
+ }
+ chdir $output_dir or die "Error, cannot cd to output directory $output_dir";
+
+
+ my $read_type = ($seqType eq "fq") ? "-q" : "-f";
+
if ($left && $right) {
$paired_flag = 1;
}
- if ($aln_method eq 'bowtie') {
- if ($left && $right) {
- $bowtie_cmd = "set -o pipefail && bowtie $read_type " . $aligner_params{"${aln_method}_${est_method}"} . " -X $max_ins_size -S -p $thread_count $db_index_name -1 $left -2 $right | samtools view -F 4 -S -b -o $bam_file -";
-
- }
- else {
- $bowtie_cmd = "set -o pipefail && bowtie $read_type " . $aligner_params{"${aln_method}_${est_method}"} . " -S -p $thread_count $db_index_name $single | samtools view -F 4 -S -b -o $bam_file -";
- }
- }
- elsif ($aln_method eq 'bowtie2') {
+ if (! $PROCESSING_EXISTING_BAM_FLAG) {
+ ## run bowtie
- if ($left && $right) {
+ ##############
+ ## Align reads
- $bowtie_cmd = "set -o pipefail && bowtie2 " . $aligner_params{"${aln_method}_${est_method}"} . " $read_type -X $max_ins_size -x $db_index_name -1 $left -2 $right -p $thread_count | samtools view -F 4 -S -b -o $bam_file -";
+ my $bowtie_cmd;
+
+ if ($aln_method eq 'bowtie') {
+ if ($left && $right) {
+ ## PE alignment
+ $bowtie_cmd = "set -o pipefail && bowtie $read_type " . $aligner_params{"${aln_method}_${est_method}"} . " -X $max_ins_size -S -p $thread_count $db_index_name -1 $left -2 $right | samtools view -F 4 -S -b -o $bam_file -";
+
+ }
+ else {
+ # SE alignment
+ $bowtie_cmd = "set -o pipefail && bowtie $read_type " . $aligner_params{"${aln_method}_${est_method}"} . " -S -p $thread_count $db_index_name $single | samtools view -F 4 -S -b -o $bam_file -";
+ }
}
- else {
-
- $bowtie_cmd = "set -o pipefail && bowtie2 " . $aligner_params{"${aln_method}_${est_method}"} . " $read_type -x $db_index_name -U $single -p $thread_count | samtools view -F 4 -S -b -o $bam_file -";
+ elsif ($aln_method eq 'bowtie2') {
+
+ if ($left && $right) {
+ ## PE alignment
+ $bowtie_cmd = "set -o pipefail && bowtie2 " . $aligner_params{"${aln_method}_${est_method}"} . " $read_type -X $max_ins_size -x $db_index_name -1 $left -2 $right -p $thread_count | samtools view -F 4 -S -b -o $bam_file -";
+ }
+ else {
+ # SE alignment
+ $bowtie_cmd = "set -o pipefail && bowtie2 " . $aligner_params{"${aln_method}_${est_method}"} . " $read_type -x $db_index_name -U $single -p $thread_count | samtools view -F 4 -S -b -o $bam_file -";
+ }
}
+
+ &process_cmd($bowtie_cmd) unless (-s $bam_file && -e $bam_file_ok);
+
+ &process_cmd("touch $bam_file_ok") unless (-e $bam_file_ok);
}
- &process_cmd($bowtie_cmd) unless (-s $bam_file && -e $bam_file_ok);
-
- &process_cmd("touch $bam_file_ok") unless (-e $bam_file_ok);
-
- if ($est_method eq "eXpress") {
+ if ($est_method =~ /express/i) {
&run_eXpress($bam_file);
}
elsif ($est_method eq "RSEM") {
@@ -629,14 +651,19 @@ sub run_eXpress {
}
## run eXpress
- my $express_cmd = "express $SS_opt $eXpress_add_opts $transcripts";
+ my $fraglength_param = "";
+ if ($single) {
+ $fraglength_param = "--frag-len-mean $fragment_length --frag-len-stddev $fragment_std";
+ }
+
+ my $express_cmd = "express $SS_opt $fraglength_param $eXpress_add_opts $transcripts";
my $cmd = "$express_cmd $bam_file";
&process_cmd($cmd);
if ($gene_trans_map_file) {
- my $cmd = "$FindBin::Bin/support_scripts/eXpress_trans_to_gene_results.pl results.xprs $gene_trans_map_file > results.xprs.genes";
+ my $cmd = "$FindBin::RealBin/support_scripts/eXpress_trans_to_gene_results.pl results.xprs $gene_trans_map_file > results.xprs.genes";
&process_cmd($cmd);
}
@@ -655,13 +682,11 @@ sub run_RSEM {
my $keep_intermediate_files_opt = ($DEBUG_flag) ? "--keep-intermediate-files" : "";
- if ($fragment_length) {
- $fragment_length = "--fragment-length-mean $fragment_length";
- }
- else {
- $fragment_length = "";
+ my $fraglength_info_txt = "";
+ if ($single) {
+ $fraglength_info_txt = "--fragment-length-mean $fragment_length --fragment-length-sd $fragment_std";
}
-
+
my $SS_opt = "";
if ($SS_lib_type) {
if ($SS_lib_type =~ /^F/) {
@@ -685,8 +710,8 @@ sub run_RSEM {
my $cmd = "rsem-calculate-expression $no_qualities_string "
. "$paired_flag_text "
. " $rsem_add_opts "
- . "-p 4 "
- . "$fragment_length "
+ . "-p $thread_count "
+ . "$fraglength_info_txt "
. "$keep_intermediate_files_opt "
. "$SS_opt $rsem_bam_flag "
. "--bam $bam_file "
@@ -706,6 +731,10 @@ sub run_RSEM {
sub process_cmd {
my ($cmd) = @_;
+ unless ($cmd) {
+ confess "Error, no cmd specified";
+ }
+
print STDERR "CMD: $cmd\n";
my $ret = system("bash", "-o", "pipefail", "-c", $cmd);
@@ -785,17 +814,94 @@ sub run_kallisto {
&process_cmd($cmd);
}
elsif ($single) {
- my $cmd = "kallisto quant -l $fragment_length -i $kallisto_index -o $output_dir $kallisto_add_opts $single";
+ my $cmd = "kallisto quant --single -l $fragment_length -s $fragment_std -i $kallisto_index -o $output_dir $kallisto_add_opts $single";
&process_cmd($cmd);
}
- if ($gene_trans_map_file) {
+ if ( ($left || $single) && $gene_trans_map_file) {
+
+ my $cmd = "$FindBin::RealBin/support_scripts/kallisto_trans_to_gene_results.pl $output_dir/abundance.tsv $gene_trans_map_file > $output_dir/abundance.tsv.genes";
+ &process_cmd($cmd);
+ }
+
+
+ return;
+}
+
+
+
+####
+sub run_salmon {
+
+ my $salmon_index = "$transcripts.salmon_${salmon_idx_type}.idx";
+
+ if ( (! $prep_reference) && (! -e $salmon_index)) {
+ confess "Error, no salmon index file: $salmon_index, and --prep_reference not set. Re-run with --prep_reference";
+ }
+ if ($prep_reference && ! -e $salmon_index) {
+
+ ## Prep salmon index
+ my $cmd;
+
+ if ($salmon_idx_type eq 'quasi') {
+ $cmd = "salmon index -t $transcripts -i $salmon_index --type quasi -k $salmon_quasi_kmer_length -p $thread_count";
+ }
+ elsif ($salmon_idx_type eq 'fmd') {
+ $cmd = "salmon index -t $transcripts -i $salmon_index --type fmd -p $thread_count";
+ }
+ else {
+ die "Error, not recognizing idx type: $salmon_idx_type";
+ }
- my $cmd = "$FindBin::Bin/support_scripts/kallisto_trans_to_gene_results.pl $output_dir/abundance.tsv $gene_trans_map_file > $output_dir/abundance.tsv.genes";
&process_cmd($cmd);
}
+ my $outdir = $output_dir; #"$output_dir.$salmon_idx_type";
+
+
+ if ($left && $right) {
+ ## PE mode
+ my $cmd;
+ my $libtype = ($SS_lib_type) ? "IS" . substr($SS_lib_type, 0, 1) : "IU";
+
+ if ($salmon_idx_type eq 'quasi') {
+ $cmd = "salmon quant -i $salmon_index -l $libtype -1 $left -2 $right -o $outdir $salmon_add_opts -p $thread_count";
+ }
+ elsif ($salmon_idx_type eq 'fmd') {
+ $cmd = "salmon quant -i $salmon_index -l $libtype -1 $left -2 $right -k $salmon_fmd_kmer_length -o $outdir $salmon_add_opts -p $thread_count";
+ }
+ else {
+ die "Error, not recognizing salmon_idx_type: $salmon_idx_type";
+ }
+
+ &process_cmd($cmd);
+
+ }
+ elsif ($single) {
+ my $libtype = ($SS_lib_type) ? "S" . substr($SS_lib_type, 0, 1) : "U";
+ my $cmd;
+
+ if ($salmon_idx_type eq 'quasi') {
+ $cmd = "salmon quant -i $salmon_index -l $libtype -r $single -o $outdir $salmon_add_opts -p $thread_count";
+ }
+ elsif ($salmon_idx_type eq 'fmd') {
+ $cmd = "salmon quant -i $salmon_index -l $libtype -r $single -k $salmon_fmd_kmer_length -o $outdir $salmon_add_opts -p $thread_count";
+ }
+ else {
+ die "Error, not recognizing salmon_idx_type: $salmon_idx_type";
+ }
+
+ &process_cmd($cmd);
+
+ }
+
+ if ( ($left || $single) && $gene_trans_map_file) {
+
+ my $cmd = "$FindBin::RealBin/support_scripts/salmon_trans_to_gene_results.pl $output_dir/quant.sf $gene_trans_map_file > $output_dir/quant.sf.genes";
+ &process_cmd($cmd);
+ }
+
return;
}
diff --git a/util/analyze_blastPlus_topHit_coverage.pl b/util/analyze_blastPlus_topHit_coverage.pl
index fcdd214..ca9daf0 100755
--- a/util/analyze_blastPlus_topHit_coverage.pl
+++ b/util/analyze_blastPlus_topHit_coverage.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../PerlLib");
+use lib ("$FindBin::RealBin/../PerlLib");
use Fasta_reader;
use Data::Dumper;
diff --git a/util/bowtie_PE_separate_then_join.pl b/util/bowtie_PE_separate_then_join.pl
index eee9b97..82f7938 100755
--- a/util/bowtie_PE_separate_then_join.pl
+++ b/util/bowtie_PE_separate_then_join.pl
@@ -11,7 +11,7 @@ use Data::Dumper;
use Getopt::Long qw(:config no_ignore_case bundling);
-$ENV{PATH} .= "\:$FindBin::Bin/../trinity-plugins/rsem/sam/"; # include samtools in path, already included in rsem build.
+$ENV{PATH} .= "\:$FindBin::RealBin/../trinity-plugins/rsem/sam/"; # include samtools in path, already included in rsem build.
$ENV{LC_ALL} = 'C'; # critical for proper sorting using [system "sort -k1,1 ..."] within the perl script
@@ -214,7 +214,7 @@ unless ($aligner eq "bowtie") {
}
-my $util_dir = "$FindBin::Bin/../util/support_scripts";
+my $util_dir = "$FindBin::RealBin/../util/support_scripts";
my ($start_dir, $work_dir, $num_hits);
@@ -657,7 +657,7 @@ sub make_RSEM_bam {
if ($RUN_RSEM) {
- my $cmd = "$FindBin::Bin/align_and_estimate_abundance.pl --est_method RSEM --aln_method $rsem_bam "
+ my $cmd = "$FindBin::RealBin/align_and_estimate_abundance.pl --est_method RSEM --aln_method $rsem_bam "
. " --transcripts $target_db --seqType $seqType ";
if ($left_file && $right_file) {
diff --git a/util/fasta_tool b/util/fasta_tool
deleted file mode 120000
index 272b5bd..0000000
--- a/util/fasta_tool
+++ /dev/null
@@ -1 +0,0 @@
-../trinity-plugins/GAL_0.2.1/fasta_tool
\ No newline at end of file
diff --git a/util/fasta_tool b/util/fasta_tool
new file mode 100755
index 0000000..42ba349
--- /dev/null
+++ b/util/fasta_tool
@@ -0,0 +1,1053 @@
+#!/usr/bin/env perl
+
+eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
+ if 0; # not running under some shell
+
+eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
+ if 0; # not running under some shell
+
+use strict;
+use warnings;
+use FindBin;
+use lib "$FindBin::RealBin/../lib";
+use lib "$FindBin::RealBin/../perl/lib";
+use Getopt::Long;
+use Bio::SeqIO;
+use IO::All;
+
+
+
+#-----------------------------------------------------------------------------
+#----------------------------------- MAIN ------------------------------------
+#-----------------------------------------------------------------------------
+my $usage = "
+
+Synopsis:
+
+fasta_tool [-options] fasta_file
+
+Description:
+The script takes a fasta file and can search it, reformat it, and manipulate it
+in a large variety of ways that can prove very very usful. For options that
+provide the ability to evaluate code, use Perl.
+
+Options:
+ --summary
+ For functions that can report data for every sequence (nt_count),
+ use this flag to report only summary data for all sequences combined.
+
+ --chunks <integer>
+ Break up a single fasta file into the given number of chunks
+
+ --split
+ Split a multi-fasta into individual files. One for each fasta.
+
+ --eval_code <code>
+ Run the given code on (\$seq_obj, \$sequence or \$header). If the code
+ block returns a positive value then the sequence is printed. This can be
+ used to build complex and custom filters.
+
+ --eval_all <code>
+ Run the given code on (\$seq_obj, \$sequence or \$header). Prints all
+ sequences regardless of the return value of the evaled code. This can
+ but used to perform operations (e.g. soft to hard masking with
+ s/[a-z]/N/g, but still print every sequence even if it's unaltered.
+
+ --extract_ids <id_file.txt>
+ Extract all of the sequences who's IDs are found in the given file.
+
+ --grep_header <pattern>
+ Grep through a multi fasta file and print out only the fasta
+ sequences that have a match in the header. Use grepv_header for
+ negation.
+
+ --grep_seq <pattern>
+ Grep throught a multi fasta file and print out only the fasta
+ sequences that have a match in the sequence. Use grepv_seq for
+ negation.
+
+ --wrap <integer>
+ Wrap the sequence output to a given number of columns.
+
+ --translate <string>
+ Translate a given nucleotide sequence to protein sequence.
+ Accepts 0,1,2 (for the phase) or 'maker' if you want to use the
+ frame from MAKER produced headers
+
+ --trim_maker_utr
+ Prints MAKER produced transcipts without the leading and trailing
+ UTR sequence
+
+ --seq_only
+ Print only the sequence (without the header) to STDOUT. This
+ can also be accomplished with grep -v '>' fasta_file.
+
+ --nt_count
+ Print the number and percentage of every nt/aa found in the
+ sequence.
+
+ --length
+ Print the length of each sequence.
+
+ --total_length
+ Print the total length of all sequences.
+
+ --n50
+ Calculate the N-50 (http://en.wikipedia.org/wiki/N50_statistic)
+ of the sequences in the file.
+
+ --tab
+ Print the header and sequence on the same line separated by a tab.
+
+ --table
+ Print in table format rather than fasta format.
+
+ --print
+ Print the sequence. Use in conjuction with 'wrap' or other formatting
+ commands to reformat the sequence.
+
+ --reverse
+ Reverse the order of the sequences in a fasta file.
+
+ --rev_seq
+ Reverse the order of the nt/aa in each sequence.
+
+ --comp_seq
+ Complement the nucleotide sequence.
+
+ --rev_comp
+ Reverse compliment a sequence. Same as --rev_seq && --comp_seq together.
+
+ --uniq
+
+ Print only uniq sequences. This method only compares complete
+ sequences.
+
+ --uniq_sub
+
+ Print only uniq sequences, but also check that shorter sequences
+ are not perfect substrings of longer sequences.
+
+ --shuffle_order
+ Randomize the order of sequences in a multi-fasta file.
+
+ --shuffle_seq
+ Randomize the order of the nt/aa in each sequence.
+
+ --shuffle_codon
+ Randomize the order of the codons in a nucleotide sequence.
+
+ --shuffle_pick
+ Pick a given number of sequences from a multi-fasta file.
+
+ --select
+ Pass in a file with IDs and return sequences with these IDs.
+
+ --remove
+ Pass in a file with IDs and remove sequences with these IDs.
+
+ --swap_ids
+ Pass in a file with two columns of IDs and map the IDs in the
+ fasta headers from the first column of the ID file to the second
+ column of the ID file. If an ID in the fasta header is not found
+ in the first column of the ID file then issue a warning, but leave
+ the ID unmapped.
+
+ --fix_prot
+ Fix protein fasta files for use as blast database. Removes spaces
+ and '*' and replaces any non amino acid codes with C.
+
+ --subseq
+ Grab a sub-sequence from a fasta file based on coordinates. The
+ requested coordinates are in the form seqid:start-end;
+
+
+";
+
+# Option targets. Each scalar stays undefined unless the matching switch is
+# present on the command line.
+my ($summary, $chunks, $split, $eval_code, $eval_all, $extract_ids,
+    $grep_header, $grepv_header, $grep_seq, $grepv_seq, $wrap, $count,
+    $translate, $seq_only, $nt_count, $length, $total_length, $n50,
+    $tab, $reverse, $rev_seq, $comp_seq, $rev_comp, $uniq, $uniq_sub,
+    $shuffle_order, $shuffle_seq, $shuffle_codon, $shuffle_pick,
+    $select_file, $remove_file, $print, $mRNAseq, $EST,
+    $trim_maker_utr, $table, $swap_ids, $fix_prot, $subseq, $tile);
+
+# GetOptions() returns false when it meets an unknown switch or a value of the
+# wrong type. Stop with the usage text in that case instead of carrying on
+# with a partially parsed command line.
+GetOptions('summary'        => \$summary,
+           'chunks=i'       => \$chunks,
+           'split'          => \$split,
+           'eval_code=s'    => \$eval_code,
+           'eval_all=s'     => \$eval_all,
+           'extract_ids=s'  => \$extract_ids,
+           'grep_header=s'  => \$grep_header,
+           'grep_seq=s'     => \$grep_seq,
+           'grepv_header=s' => \$grepv_header,
+           'grepv_seq=s'    => \$grepv_seq,
+           'wrap=i'         => \$wrap,
+           'count'          => \$count,
+           'translate=s'    => \$translate,
+           'trim_maker_utr' => \$trim_maker_utr,
+           'seq_only'       => \$seq_only,
+           'nt_count'       => \$nt_count,
+           'length'         => \$length,
+           'total_length'   => \$total_length,
+           'n50'            => \$n50,
+           'tab'            => \$tab,
+           'table'          => \$table,
+           'print'          => \$print,
+           'reverse'        => \$reverse,
+           'rev_seq'        => \$rev_seq,
+           'comp_seq'       => \$comp_seq,
+           'rev_comp'       => \$rev_comp,
+           'uniq'           => \$uniq,
+           'uniq_sub'       => \$uniq_sub,
+           'shuffle_order'  => \$shuffle_order,
+           'shuffle_seq'    => \$shuffle_seq,
+           'shuffle_codon'  => \$shuffle_codon,
+           'shuffle_pick=i' => \$shuffle_pick,
+           'remove=s'       => \$remove_file,
+           'select=s'       => \$select_file,
+           'fix_prot'       => \$fix_prot,
+           'mRNAseq'        => \$mRNAseq,
+           'EST'            => \$EST,
+           'swap_ids=s'     => \$swap_ids,
+           'subseq=s'       => \$subseq,
+           'tile=s'         => \$tile,
+    ) or die $usage;
+
+# The fasta input is either the first non-option argument or, when STDIN is
+# not a terminal, whatever is piped in. With neither, show the usage text.
+my $file = shift;
+unless (($file && -r $file) || ! -t STDIN){
+    print $usage;
+    exit;
+}
+
+# --rev_comp is shorthand for --rev_seq plus --comp_seq.
+if ($rev_comp) {$rev_seq++; $comp_seq++}
+
+my $warning = "\n\n" . ('#' x 40) . "\nThis function is not yet thouroughly tested!!\n" . ('#' x 40) . "\n\n";
+
+warn $warning if grep {$_} ($extract_ids,
+                            $reverse,
+                            $rev_seq,
+                            $comp_seq,
+                            $shuffle_order,
+                            $shuffle_seq,
+                            $shuffle_codon,
+                            $shuffle_pick,
+                           );
+
+# These functions handle their own printing; plain printing is only switched
+# on when none of them was requested.
+$print++ unless grep {$_} ($chunks,
+                           $split,
+                           $eval_code,
+                           $eval_all,
+                           $extract_ids,
+                           $grep_header,
+                           $grep_seq,
+                           $grepv_header,
+                           $grepv_seq,
+                           $count,
+                           $translate,
+                           $seq_only,
+                           $nt_count,
+                           $length,
+                           $total_length,
+                           $n50,
+                           $reverse,
+                           $rev_seq,
+                           $comp_seq,
+                           $uniq,
+                           $uniq_sub,
+                           $shuffle_order,
+                           $shuffle_seq,
+                           $shuffle_codon,
+                           $shuffle_pick,
+                           $remove_file,
+                           $select_file,
+                           $mRNAseq,
+                           $EST,
+                           $swap_ids,
+                           $subseq,
+                           $tile,
+                          );
+
+# --summary only has an effect inside nt_count, so it implies --nt_count.
+$nt_count++ if $summary;
+
+# Anything other than a non-negative integer or 'maker' falls back to phase 0.
+if(defined $translate && $translate !~ /^\d+$/ && $translate ne 'maker'){
+    $translate = 0;
+}
+
+my $IN;
+if (! $file && ! -t STDIN) {
+    open ($IN, "<&=STDIN") or die "Can't open STDIN\n";
+}
+else {
+    # Three-argument open: the file name is taken literally, so a leading
+    # '>' or a trailing '|' in the name can never be read as a mode or pipe.
+    open ($IN, '<', $file) or die "Can't open $file for reading: $!\n";
+}
+
+#Bioperl object for main fasta input file.
+my $seq_io = Bio::SeqIO->new(-fh     => $IN,
+                             -format => 'Fasta');
+
+# Dispatch: run every requested operation in this fixed order. All handlers
+# read from the single shared $seq_io stream, so the first one that consumes
+# it leaves nothing for the handlers after it.
+chunks($file, $chunks)       if $chunks;
+split_fasta()                if $split;
+eval_code($eval_code)        if $eval_code;
+eval_all($eval_all)          if $eval_all;
+extract_ids($extract_ids)    if $extract_ids;
+grep_header($grep_header)    if $grep_header;
+grep_seq($grep_seq)          if $grep_seq;
+grepv_header($grepv_header)  if $grepv_header;
+grepv_seq($grepv_seq)        if $grepv_seq;
+translate()                  if defined($translate);
+trim_maker_utr()             if $trim_maker_utr;
+seq_only()                   if $seq_only;
+nt_count()                   if $nt_count;
+seq_length()                 if $length;
+total_length()               if $total_length;
+n50()                        if $n50;
+tab()                        if $tab;
+reverse_order()              if $reverse;
+# rev_comp() consults $rev_seq and $comp_seq itself, so one call covers
+# --rev_seq, --comp_seq and --rev_comp.
+rev_comp()                   if $rev_seq || $comp_seq;
+uniq()                       if $uniq;
+uniq_sub()                   if $uniq_sub;
+shuffle_order()              if $shuffle_order;
+shuffle_seq()                if $shuffle_seq;
+shuffle_codon()              if $shuffle_codon;
+shuffle_pick($shuffle_pick)  if $shuffle_pick;
+remove_ids($remove_file)     if $remove_file;
+select_ids($select_file)     if $select_file;
+swap_ids($swap_ids)          if $swap_ids;
+subseq($file, $subseq)       if $subseq;
+# --tile was parsed and suppressed default printing, but nothing ever called
+# its handler, so the option produced no output at all.
+tile_seq($tile)              if $tile;
+fix_prot()                   if $fix_prot;
+print_seq()                  if $print;
+mRNAseq()                    if $mRNAseq;
+EST()                        if $EST;
+# NOTE(review): --count is accepted by GetOptions but no handler for it is
+# visible in this file; confirm whether one was intended.
+
+#-----------------------------------------------------------------------------
+#-------------------------------- SUBROUTINES --------------------------------
+#-----------------------------------------------------------------------------
+# Distribute the records of the input over several fasta files named
+# <input minus extension>_<NN>.fasta. A new output file is started whenever
+# the current one has grown beyond the per-chunk byte budget.
+#   $file   - path of the input fasta (used for naming and sizing)
+#   $chunks - requested number of chunks
+sub chunks {
+    my ($file, $chunks) = @_;
+
+    # Base name of the output files: the input name without its last extension.
+    (my $base_name = $file) =~ s/\.[^\.]*$//;
+
+    # Byte budget per chunk, derived from the size of the input file.
+    my $input_bytes = io($file)->size;
+    my $chunk_size  = int($input_bytes / $chunks) + ($input_bytes % $chunks);
+
+    # Width of the zero-padded counter in the output file names.
+    my $digits = int(log($chunks) / log(10)) + 1;
+
+    my $next_index = make_file_counter();
+
+    my ($writer, $current_file);
+    while ( my $record = $seq_io->next_seq() ) {
+        # Open the first output file, or roll over to the next one once the
+        # current file is larger than the budget.
+        unless ($writer && io($current_file)->size <= $chunk_size) {
+            ($writer, $current_file) =
+                get_output_stream($base_name, $digits, $next_index);
+        }
+        $writer->write_seq($record);
+    }
+}
+#-----------------------------------------------------------------------------
+# Open the next numbered chunk file for writing.
+#   $outfile_base - path prefix of the output files
+#   $digits       - width the counter is zero-padded to
+#   $file_counter - code ref returning the next counter value
+# Returns (Bio::SeqIO writer, file name).
+sub get_output_stream{
+    my ($outfile_base, $digits, $file_counter) = @_;
+
+    my $suffix    = sprintf "%0${digits}s", $file_counter->();
+    my $file_name = $outfile_base . "_$suffix" . '.fasta';
+
+    my $writer = Bio::SeqIO->new(-file   => ">$file_name",
+                                 -format => 'fasta');
+    return ($writer, $file_name);
+}
+#-----------------------------------------------------------------------------
+# Build a counter closure: each call returns the current value (starting at
+# 0) and then advances it by one. Every closure has its own private state.
+sub make_file_counter {
+    my $next_value = 0;
+    return sub { return $next_value++; };
+}
+#-----------------------------------------------------------------------------
+# Write every record of the input to its own file, named <display_id>.fasta
+# and created relative to the current directory.
+sub split_fasta {
+    while ( my $record = $seq_io->next_seq() ) {
+        my $target = $record->display_id . ".fasta";
+
+        # One single-record writer per sequence.
+        my $writer = Bio::SeqIO->new(-file   => ">$target",
+                                     -format => 'fasta');
+        $writer->write_seq($record);
+    }
+}
+#-----------------------------------------------------------------------------
+# Filter records with user-supplied Perl: $code is evaluated once per record
+# and the record is printed only when the evaluation returns a true value.
+# The lexicals $seq_obj, $header and $seq are in scope for the evaluated
+# code, so their names are part of the interface and must not be renamed.
+# NOTE(review): the usage text mentions \$sequence, but the variable visible
+# to the evaluated code is \$seq.
+# Dies if the evaluated code raises an error.
+sub eval_code {
+    my ($code) = @_;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+
+        my $return_value = eval $code;
+        if ($@) {
+            die "Fatal Error in code ref:\n$@\n";
+        }
+
+        print_this_seq($header, $seq) if $return_value;
+    }
+}
+#-----------------------------------------------------------------------------s
+# Transform records with user-supplied Perl: $code is evaluated once per
+# record and the record is printed afterwards whatever the code returned, so
+# edits the code makes to $header or $seq show up in the output.
+# The lexicals $seq_obj, $header and $seq are in scope for the evaluated
+# code, so their names are part of the interface and must not be renamed.
+# Dies if the evaluated code raises an error.
+sub eval_all {
+    my ($code) = @_;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+
+        eval $code;
+        if ($@) {
+            die "Fatal Error in code ref:\n$@\n";
+        }
+
+        print_this_seq($header, $seq);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print the records whose display_id is listed in $id_file.
+#   $id_file - text file with one ID per line; blank lines and lines starting
+#              with '#' are ignored.
+# Dies if the ID file cannot be opened.
+sub extract_ids {
+
+    my $id_file = shift;
+
+    open(my $IN, '<', $id_file) or die "Can't open $id_file for reading\n$!\n";
+
+    # Read the IDs line by line. The line terminator has to be removed,
+    # otherwise no key can ever equal a display_id, and skipped lines must
+    # contribute nothing to the hash.
+    my %ids;
+    while (my $line = <$IN>) {
+        $line =~ s/\r?\n\z//;
+        next if $line eq '' || $line =~ /^\#/;
+        $ids{$line} = 1;
+    }
+    close $IN;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $id     = $seq_obj->display_id;
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+
+        if (exists $ids{$id}) {
+            print_this_seq($header, $seq);
+        }
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record whose header matches the regular expression $pattern.
+# The header is the "display_id description" string built by get_header();
+# it already contains the description, so nothing more is appended here
+# (appending it again made every printed header repeat its description).
+sub grep_header {
+    my ($pattern) = @_;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+
+        if ($header =~ /$pattern/) {
+            print_this_seq($header, $seq);
+        }
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record whose header ("display_id description") does NOT match
+# the regular expression $pattern.
+sub grepv_header {
+    my $pattern = shift;
+
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+
+        next if $title =~ /$pattern/;
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+{
+    # Running index shared by mRNAseq() and EST(); it numbers the generated
+    # "sequence_N" headers.
+    my $i = 0;
+
+    # Emit random fragments of each input sequence: one fragment per started
+    # block of five residues, each 50 residues long, or the whole sequence
+    # when it is shorter than that.
+    sub mRNAseq {
+        my $size = 50;
+
+        while (my $record = $seq_io->next_seq) {
+            my $residues = $record->seq;
+            my $len      = length($residues);
+            my $range    = $len - $size;
+
+            my $drawn = 0;
+            while ($drawn < $len / 5) {
+                my ($start, $end);
+                if ($range >= 0) {
+                    $start = int(rand($range));
+                    $end   = $start + $size - 1;
+                }
+                else {
+                    # Sequence shorter than the fragment size: take all of it.
+                    ($start, $end) = (0, $len - 1);
+                }
+
+                my $fragment_length = $end - $start + 1;
+                my $title           = "sequence_" . $i++;
+                print_this_seq($title, substr($residues, $start, $fragment_length));
+                $drawn++;
+            }
+        }
+    }
+#-----------------------------------------------------------------------------
+    # Emit up to three random pieces per input sequence (an internal piece, a
+    # piece from the start and a piece running to the end); only pieces
+    # longer than 250 are printed. Sequences shorter than 500 are printed
+    # whole.
+    sub EST {
+
+        while (my $record = $seq_io->next_seq) {
+            my $residues = $record->seq;
+            my $len      = length($residues);
+
+            my $range = $len - 500;
+            my $min   = 250;
+
+            if ($range < 0 || $min < 0) {
+                print_this_seq("sequence_" . $i++, $residues);
+                next;
+            }
+
+            # Four random draws, taken in this order.
+            my $A     = int(rand($range) + $min);
+            my $start = int(rand($range) + $min);
+            my $end   = int(rand($range));
+            my $B     = int(rand($range) + $min);
+
+            # (offset, length) of each candidate piece, in output order.
+            my @candidates = (
+                [$start, abs($end - $start + 1)],
+                [0,      abs($A - 0 + 1)],
+                [$B,     abs($len - $B + 1)],
+            );
+
+            for my $candidate (@candidates) {
+                my ($offset, $l) = @{$candidate};
+                next unless $l > 250;
+                print_this_seq("sequence_" . $i++, substr($residues, $offset, $l));
+            }
+        }
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record whose sequence, with all whitespace removed, matches
+# the regular expression $pattern.
+sub grep_seq {
+    my $pattern = shift;
+
+    while (my $record = $seq_io->next_seq) {
+        my $title = get_header($record);
+        (my $residues = $record->seq) =~ s/\s//g;
+
+        print_this_seq($title, $residues) if $residues =~ /$pattern/;
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record whose sequence, with all whitespace removed, does NOT
+# match the regular expression $pattern.
+sub grepv_seq {
+    my $pattern = shift;
+
+    while (my $record = $seq_io->next_seq) {
+        my $title = get_header($record);
+        (my $residues = $record->seq) =~ s/\s//g;
+
+        next if $residues =~ /$pattern/;
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+# Clean protein records: whitespace and '*' are removed, every character
+# outside the accepted residue set is replaced by 'C', and records that end
+# up empty are dropped.
+sub fix_prot {
+    while (my $record = $seq_io->next_seq) {
+        my $title   = get_header($record);
+        my $protein = $record->seq;
+
+        $protein =~ s/[\s\*]//g;
+        $protein =~ s/[^abcdefghiklmnpqrstvwyzxABCDEFGHIKLMNPQRSTVWYZX\-\n]/C/g;
+
+        #skip empty fasta entries
+        print_this_seq($title, $protein) unless $protein eq '';
+    }
+}
+#-----------------------------------------------------------------------------
+# Translate each nucleotide record to protein and print the peptide up to
+# (not including) the first stop codon.
+# The shift comes from the file-level $translate: a number is used directly,
+# 'maker' reads it from the "offset:N" tag in the header. The shift is split
+# into a reading frame (shift mod 3) and a count of whole codons to skip.
+sub translate {
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+
+        my $shift = $translate;
+        if($translate eq 'maker'){
+            # Use the capture only when the match succeeded; $1 is not
+            # reliable after a failed match.
+            if ($header =~ /offset:(\d+)/) {
+                $shift = $1;
+            }
+            else {
+                warn "WARN : no_maker_offset : ($header) translating from offset 0\n";
+                $shift = 0;
+            }
+        }
+
+        my $frame  = $shift % 3;
+        my $offset = ($shift - $frame)/3;
+
+        my $pep_seq = $seq_obj->translate(-frame => $frame)->seq;
+        $pep_seq = substr($pep_seq, $offset);
+        $pep_seq =~ s/^([^\*]+).*/$1/;
+        print_this_seq($header, $pep_seq);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print each transcript cut down to the stretch that starts at the header's
+# "offset:N" position and runs through the first stop codon (inclusive).
+# Dies when a header carries no "offset:N" tag.
+sub trim_maker_utr {
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+
+        # Take the offset from the match result itself; $1 is not reliable
+        # after a failed match.
+        my ($maker_offset) = $header =~ /offset:(\d+)/;
+
+        die "ERROR: These do not appear to be MAKER produced transcripts\n"
+            if(! defined $maker_offset);
+        my $frame  = ($maker_offset % 3);
+        my $offset = ($maker_offset - $frame)/3; #peptide offset without frame
+
+        my $tra_seq = $seq_obj->seq;
+        my $pep_seq = $seq_obj->translate(-frame => $frame)->seq;
+
+        $pep_seq = substr($pep_seq, $offset);
+        $pep_seq =~ s/^([^\*]+\*?).*/$1/;
+        $offset = 3 * $offset + $frame;    #make transcript offset
+        my $length = 3 * length($pep_seq); #length of substring to get
+        # Never ask for more bases than the transcript has after the offset.
+        my $fix = $offset + $length - length($tra_seq);
+        $length -= $fix if($fix > 0);
+        $tra_seq = substr($tra_seq, $offset, $length);
+
+        print_this_seq($header, $tra_seq);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print only the sequence of each record (no header line), wrapped to $wrap
+# columns when --wrap was given.
+sub seq_only {
+    while (my $record = $seq_io->next_seq) {
+        my $residues = $record->seq;
+        if ($wrap) {
+            $residues = wrap_seq($residues, $wrap);
+        }
+        print "$residues\n";
+    }
+}
+#-----------------------------------------------------------------------------
+# Report how often each residue character occurs, as a count and as a
+# percentage, for every sequence and for all sequences combined. With
+# --summary only the combined report is printed.
+sub nt_count {
+    my %all_seq_count;
+    my $total_count = 0;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my %this_seq_count;
+        my $this_count = 0;
+        my $id  = $seq_obj->display_id;
+        my $seq = $seq_obj->seq;
+        $seq =~ s/\s//g;
+        for my $nt (split //, $seq) {
+            $all_seq_count{$nt}++;
+            $this_seq_count{$nt}++;
+            $this_count++;
+            $total_count++;
+        }
+
+        next if $summary;
+        print "$id:\n";
+        _print_nt_report(\%this_seq_count, $this_count);
+        print "\n\n";
+    }
+
+    print "All sequences combined:\n";
+    _print_nt_report(\%all_seq_count, $total_count);
+    print "\n";
+    print "Total nts\t$total_count\n";
+}
+
+# Print one report: a ruler, one "char<TAB>count<TAB>percent%" row per
+# residue character, then the same kind of row for each grouped total.
+#   $counts - hash ref, residue character => number of occurrences
+#   $total  - number of residues the percentages are relative to
+sub _print_nt_report {
+    my ($counts, $total) = @_;
+
+    print '-' x 80;
+    print "\n";
+
+    for my $nt (sort keys %{$counts}) {
+        printf "%s\t%s\t%.4f%%\n",
+            $nt, $counts->{$nt}, $counts->{$nt} / $total * 100;
+    }
+
+    # Grouped totals. A group only appears when at least one of its members
+    # has a non-zero count.
+    my %report;
+    my $add_group = sub {
+        my ($key, $source, @members) = @_;
+        for my $member (@members) {
+            $report{$key} += $source->{$member} if $source->{$member};
+        }
+    };
+
+    $add_group->('aA', $counts, qw(a A));
+    $add_group->('tT', $counts, qw(t T));
+    $add_group->('gG', $counts, qw(g G));
+    $add_group->('cC', $counts, qw(c C));
+
+    $add_group->('aAtT',     \%report, qw(aA tT));
+    $add_group->('gGcC',     \%report, qw(gG cC));
+    $add_group->('aAtTgGcC', \%report, qw(aAtT gGcC));
+
+    $add_group->('atgc', $counts, qw(a t g c));
+    $add_group->('nN',   $counts, qw(n N));
+    # 'atgc' and 'nN' are grouped totals, so they have to be read from the
+    # report; the per-character counts never contain those keys.
+    $add_group->('atgcnN', \%report, qw(atgc nN));
+
+    for my $key (sort keys %report) {
+        printf "%s\t%s\t%.4f%%\n",
+            $key, $report{$key}, $report{$key} / $total * 100;
+    }
+}
+#-----------------------------------------------------------------------------
+# Print "display_id<TAB>length" for every record, followed by a "Total" row
+# when the input holds more than one record.
+sub seq_length {
+    my $records_seen = 0;
+    my $sum;
+
+    while (my $record = $seq_io->next_seq) {
+        my $name  = $record->display_id;
+        my $bases = $record->length;
+
+        $sum += $bases;
+        $records_seen += 1;
+        print $name . "\t" . $bases . "\n";
+    }
+
+    if ($records_seen > 1) {
+        print "Total\t$sum\n";
+    }
+}
+#-----------------------------------------------------------------------------
+# Print the summed length of all records on a single line.
+sub total_length {
+    # Start from 0 so that an input without records prints "0" rather than an
+    # undefined value.
+    my $total_length = 0;
+    while (my $seq_obj = $seq_io->next_seq) {
+        $total_length += $seq_obj->length;
+    }
+    print $total_length . "\n";
+}
+#-----------------------------------------------------------------------------
+# Print the N50 of the input: walking the sequence lengths from longest to
+# shortest, the length at which the running total first reaches at least
+# half of the summed length of all sequences. Prints 0 for an empty input.
+sub n50 {
+    my $total_length = 0;
+    my @lengths;
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $length = $seq_obj->length;
+        $total_length += $length;
+        push @lengths, $length;
+    }
+
+    my $cumulative_length = 0;
+    my $n50 = 0;
+    for my $length (sort {$b <=> $a} @lengths) {
+        $cumulative_length += $length;
+        # The N50 is the length of an actual sequence, so it is reported as
+        # is and not averaged with a neighbouring length.
+        if ($cumulative_length >= $total_length / 2) {
+            $n50 = $length;
+            last;
+        }
+    }
+    print $n50 . "\n";
+}
+#-----------------------------------------------------------------------------
+# Print each record on a single line as "header<TAB>sequence", with all
+# whitespace stripped from the sequence.
+sub tab {
+    while (my $record = $seq_io->next_seq) {
+        my $title = get_header($record);
+        (my $residues = $record->seq) =~ s/[\s\n\t]//g;
+        print $title . "\t" . $residues . "\n";
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record through print_this_seq(), which applies the --wrap,
+# --tab and --table formatting.
+sub print_seq {
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print the records in the opposite order from the input. All records are
+# held in memory until the input has been read completely.
+sub reverse_order {
+    my @records;
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+        push @records, [$title, $residues];
+    }
+
+    for my $entry (reverse @records) {
+        my ($title, $residues) = @{$entry};
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record reversed (when $rev_seq is set) and/or complemented
+# (when $comp_seq is set); with both set the result is the reverse
+# complement. IUPAC ambiguity codes are complemented as well; characters
+# outside the table (for example N) are left unchanged.
+sub rev_comp{
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+        $seq = reverse $seq if $rev_seq;
+        if ($comp_seq) {
+            # Both lists sit on one line and have the same length. Split over
+            # two lines, the line break and padding became part of the search
+            # list and were translated to the last replacement character.
+            $seq =~ tr/acgtrymkswhdbvACGTRYMKSWHDBV/tgcayrkmswdhvbTGCAYRKMSWDHVB/;
+        }
+        print_this_seq($header, $seq);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print each distinct sequence once: the first record carrying a given
+# sequence is printed, later records with exactly the same sequence are
+# dropped.
+sub uniq{
+    my %already_printed;
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+
+        # The post-increment yields a false value only on first sight.
+        next if $already_printed{$residues}++;
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print only sequences that are not contained in another sequence: records
+# are sorted by length and a record is skipped (with a warning on STDERR)
+# when its sequence occurs inside any record sorted after it, which also
+# removes exact duplicates.
+sub uniq_sub {
+    my @seqs;
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $header = get_header($seq_obj);
+        my $seq    = $seq_obj->seq;
+        push @seqs, [$header, $seq];
+    }
+
+    @seqs = sort {length $a->[1] <=> length $b->[1]} @seqs;
+
+  OUTER:
+    for my $outer_idx (0 .. $#seqs) {
+        my $candidate = $seqs[$outer_idx][1];
+        for my $inner_idx ($outer_idx + 1 .. $#seqs) {
+            # Literal substring test. Using the sequence as a regular
+            # expression misreads characters such as '*' in protein
+            # sequences as pattern syntax.
+            if (index($seqs[$inner_idx][1], $candidate) >= 0) {
+                print STDERR "WARN : skipping_sequence : ($seqs[$outer_idx][0]) " .
+                    "$seqs[$outer_idx][1]\n";
+                next OUTER;
+            }
+        }
+        print_this_seq($seqs[$outer_idx][0], $candidate);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print the records in random order. All records are held in memory and
+# permuted in place with shuffle().
+sub shuffle_order {
+    my @records;
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+        push @records, [$title, $residues];
+    }
+
+    shuffle(\@records);
+
+    for my $entry (@records) {
+        my ($title, $residues) = @{$entry};
+        print_this_seq($title, $residues);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record with the characters of its sequence randomly permuted.
+sub shuffle_seq {
+    while (my $record = $seq_io->next_seq) {
+        my $title   = get_header($record);
+        my @letters = split //, $record->seq;
+
+        shuffle(\@letters);
+
+        my $scrambled = join '', @letters;
+        print_this_seq($title, $scrambled);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print every record with its codons (consecutive three-character groups)
+# randomly permuted. Only complete groups are captured, so a trailing one or
+# two characters are not part of the output.
+sub shuffle_codon {
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+
+        my @triplets = $residues =~ /(.{3})/g;
+        shuffle(\@triplets);
+
+        print_this_seq($title, join('', @triplets));
+    }
+}
+#-----------------------------------------------------------------------------
+# Print $shuffle_pick randomly chosen records, drawn without replacement.
+# When fewer records are available, all of them are printed.
+sub shuffle_pick {
+    my ($shuffle_pick) = @_;
+
+    my @pool;
+    while (my $record = $seq_io->next_seq) {
+        my $title    = get_header($record);
+        my $residues = $record->seq;
+        push @pool, {seq    => $residues,
+                     header => $title,
+                    };
+    }
+
+    # Each draw removes one random element from the pool; drawing from an
+    # empty pool adds nothing.
+    my @chosen;
+    my $draws_left = $shuffle_pick;
+    while ($draws_left-- > 0) {
+        my $position = int(rand(scalar @pool));
+        push @chosen, splice(@pool, $position, 1);
+    }
+
+    print_this_seq($_->{header}, $_->{seq}) for @chosen;
+}
+#-----------------------------------------------------------------------------
+# Print every record except those whose display_id is listed in
+# $remove_file (one ID per line). Dies if the file cannot be opened.
+sub remove_ids {
+    my ($remove_file) = @_;
+
+    open (my $list_fh, '<', $remove_file) or die "Can't open $remove_file for reading\n";
+
+    my %blocked;
+    while (my $line = <$list_fh>) {
+        chomp $line;
+        $blocked{$line} = 1;
+    }
+
+    while (my $record = $seq_io->next_seq) {
+        my $name = $record->display_id;
+        unless ($blocked{$name}) {
+            my $title    = get_header($record);
+            my $residues = $record->seq;
+            print_this_seq($title, $residues);
+        }
+    }
+}
+#-----------------------------------------------------------------------------
+# Print only the records whose display_id is listed in $select_file (one ID
+# per line). Dies if the file cannot be opened.
+sub select_ids {
+    my ($select_file) = @_;
+
+    open (my $list_fh, '<', $select_file) or die "Can't open $select_file for reading\n";
+
+    my %wanted;
+    while (my $line = <$list_fh>) {
+        chomp $line;
+        $wanted{$line} = 1;
+    }
+
+    while (my $record = $seq_io->next_seq) {
+        my $name = $record->display_id;
+        if ($wanted{$name}) {
+            my $title    = get_header($record);
+            my $residues = $record->seq;
+            print_this_seq($title, $residues);
+        }
+    }
+}
+#-----------------------------------------------------------------------------
+# Print one record using the file-level formatting switches:
+#   --table : "id<TAB>SEQUENCE", where id is the header up to its first
+#             whitespace and the sequence is upper-cased
+#   --tab   : ">header<TAB>sequence"
+#   default : fasta, wrapped to $wrap columns when --wrap is set
+sub print_this_seq {
+    my ($header, $seq) = @_;
+
+    if($table){
+        chomp $seq;
+        # \S+ stops at whitespace only. The previous class [^\s+] also
+        # excluded a literal '+', which cut IDs containing that character.
+        my ($id) = $header =~ /^(\S+)/;
+        $id = '' unless defined $id;
+        print join("\t", $id, uc($seq))."\n";
+    }
+    else{
+        $seq = wrap_seq($seq, $wrap) if $wrap;
+        chomp $seq;
+        my $join = $tab ? "\t" : "\n";
+        print join $join, (">$header", "$seq\n");
+    }
+}
+#-----------------------------------------------------------------------------
+# Return $seq folded into rows of $wrap characters joined by newlines, with
+# no trailing newline. Whitespace is stripped before folding. When $wrap is
+# not positive the sequence is only chomped.
+sub wrap_seq {
+    my ($seq, $wrap) = @_;
+
+    if ($wrap > 0) {
+        $seq =~ s/\s//g;
+
+        my @rows;
+        for (my $pos = 0; $pos < length($seq); $pos += $wrap) {
+            push @rows, substr($seq, $pos, $wrap);
+        }
+        $seq = join "\n", @rows;
+    }
+    chomp $seq;
+    return $seq;
+}
+#-----------------------------------------------------------------------------
+# Build the header text of a record: its display_id and its description,
+# separated by a single space.
+sub get_header {
+    my ($record) = @_;
+
+    my $name        = $record->display_id;
+    my $description = $record->description;
+    return $name . " " . $description;
+}
+#-----------------------------------------------------------------------------
+# Randomly permute the referenced array in place (Fisher-Yates): working
+# from the last position down, each position is swapped with a randomly
+# chosen position at or before it.
+sub shuffle {
+    my ($items) = @_;
+
+    my $remaining = scalar @{$items};
+    while ($remaining > 1) {
+        my $pick = int rand($remaining);
+        $remaining--;
+        @{$items}[$remaining, $pick] = @{$items}[$pick, $remaining];
+    }
+}
+#-----------------------------------------------------------------------------
+# Rename records using a two-column, tab-separated map file: when a record's
+# display_id equals a first-column value, its whole header is replaced by
+# the matching second-column value. Records without a mapping keep their
+# header and, as described in the usage text, a warning is issued for them.
+# Dies if the map file cannot be opened.
+sub swap_ids {
+
+    my $id_file = shift;
+    open (my $IN, '<', $id_file) or die "Can't open $id_file for reading\n";
+    my %ids;
+    while (my $line = <$IN>) {
+        chomp $line;
+        my ($id1, $id2) = split /\t/, $line;
+        # A usable line has both columns; anything else would map an ID to an
+        # undefined header.
+        unless (defined $id1 && length $id1 && defined $id2) {
+            warn "WARN : skipping_malformed_id_line : ($line)\n" if $line =~ /\S/;
+            next;
+        }
+        $ids{$id1} = $id2;
+    }
+    close $IN;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $id     = $seq_obj->display_id;
+        my $header = get_header($seq_obj);
+        if (exists $ids{$id}) {
+            $header = $ids{$id};
+        }
+        else {
+            warn "WARN : id_not_mapped : ($id)\n";
+        }
+        my $seq = $seq_obj->seq;
+        print_this_seq($header, $seq);
+    }
+}
+#-----------------------------------------------------------------------------
+# Print the sub-sequence addressed by $coordinates ("seqid:start-end") from
+# the fasta file $file, followed by a newline. The sequence is fetched
+# through Bio::DB::Fasta.
+sub subseq {
+
+    my ($file, $coordinates) = @_;
+
+    require Bio::DB::Fasta;
+    my $fasta = Bio::DB::Fasta->new($file);
+
+    my ($seqid, $start, $end);
+    if ($coordinates =~ /^(.+):(\d+)-(\d+)$/) {
+        # Anchor on the trailing ":start-end" so that sequence IDs which
+        # themselves contain ':' or '-' stay in one piece.
+        ($seqid, $start, $end) = ($1, $2, $3);
+    }
+    else {
+        # Anything else (for example a bare seqid) is split as before.
+        ($seqid, $start, $end) = split /[:-]/, $coordinates;
+    }
+
+    print $fasta->seq($seqid, $start, $end);
+    print "\n";
+}
+#-----------------------------------------------------------------------------
+
+# Print overlapping windows ("tiles") of every sequence as fasta records
+# named "id:start-end", with 1-based inclusive coordinates.
+#   $tile - "length,step"; length defaults to 50 and step to 1
+# Sequences shorter than the tile length produce no output.
+# Dies when length or step is not a positive integer.
+sub tile_seq {
+
+    my $tile = shift;
+
+    my ($tile_length, $step) = split /,/, $tile;
+    $tile_length ||= 50;
+    $step        ||= 1;
+
+    # A negative or non-numeric step would keep the loop below from ending.
+    die "ERROR: --tile expects 'length,step' as positive integers\n"
+        unless $tile_length =~ /^\d+$/ && $step =~ /^\d+$/;
+
+    while (my $seq_obj = $seq_io->next_seq) {
+        my $id         = $seq_obj->display_id;
+        my $seq        = $seq_obj->seq;
+        my $seq_length = length($seq);
+
+        # $start is 1-based to match the header. The last full tile starts
+        # at $seq_length - $tile_length + 1.
+        for (my $start = 1; $start <= $seq_length - $tile_length + 1; $start += $step) {
+            my $end    = $start + $tile_length - 1;
+            # substr() offsets are 0-based, hence the "- 1".
+            my $subseq = substr($seq, $start - 1, $tile_length);
+            print ">$id:$start-$end\n$subseq\n";
+        }
+    }
+}
+
+#-----------------------------------------------------------------------------
diff --git a/util/filter_low_expr_transcripts.pl b/util/filter_low_expr_transcripts.pl
new file mode 100755
index 0000000..7c3a66a
--- /dev/null
+++ b/util/filter_low_expr_transcripts.pl
@@ -0,0 +1,286 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Carp;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+use FindBin;
+use lib ("$FindBin::RealBin/../PerlLib");
+use Fasta_reader;
+
+my $help_flag;
+
+# Usage text; printed via die() for --help or when required options are missing.
+my $usage = <<__EOUSAGE__;
+
+##########################################################################################
+#
+# --matrix|m <string> expression matrix (TPM or FPKM, *not* raw counts)
+#
+# --transcripts|t <string> transcripts fasta file (eg. Trinity.fasta)
+#
+#
+# # expression level filter:
+#
+# --min_expr_any <float> minimum expression level required across any sample (default: 0)
+#
+# # Isoform-level filtering
+#
+# --min_pct_dom_iso <int> minimum percent of dominant isoform expression (default: off)
+# or
+# --highest_iso_only only retain the most highly expressed isoform per gene (default: off)
+# (mutually exclusive with --min_pct_dom_iso param)
+#
+# # requires gene-to-transcript mappings
+#
+# --trinity_mode targets are Trinity-assembled transcripts
+# or
+# --gene_to_trans_map <string> file containing gene-to-transcript mappings
+# (format is: gene(tab)transcript )
+#
+#########################################################################################
+
+
+__EOUSAGE__
+
+ ;
+
+
+my $matrix_file;
+my $transcripts_file;
+my $min_expr_any = 0;
+# Left undefined unless given on the command line: the defined() tests in this
+# script treat a defined value as "isoform percentage filtering was requested".
+my $min_pct_dom_iso;
+my $highest_iso_only_flag = 0;
+my $trinity_mode_flag = 0;
+my $gene_to_trans_map_file;
+
+&GetOptions ( 'help|h' => \$help_flag,
+
+              'matrix|m=s' => \$matrix_file,
+              'transcripts|t=s' => \$transcripts_file,
+
+              'min_expr_any=f' => \$min_expr_any,
+              # documented name first; --min_pct_iso is kept as an alias for existing command lines
+              'min_pct_dom_iso|min_pct_iso=i' => \$min_pct_dom_iso,
+              'highest_iso_only' => \$highest_iso_only_flag,
+
+              'trinity_mode' => \$trinity_mode_flag,
+              'gene_to_trans_map=s' => \$gene_to_trans_map_file,
+
+    );
+
+if ($help_flag) {
+    die $usage;
+}
+
+# Both input files and at least one filtering criterion are required.
+unless ($matrix_file && $transcripts_file &&
+        ($min_expr_any || defined($min_pct_dom_iso) || $highest_iso_only_flag) ) {
+
+    die $usage;
+}
+
+if ( (defined($min_pct_dom_iso) || $highest_iso_only_flag) && ! ($trinity_mode_flag || $gene_to_trans_map_file) ) {
+    die "Error, if --min_pct_dom_iso or --highest_iso_only, must also specify either --trinity_mode or --gene_to_trans_map";
+}
+
+if (defined($min_pct_dom_iso) && $highest_iso_only_flag) {
+    die "Error, --min_pct_dom_iso and --highest_iso_only are mutually exclusive parameters. ";
+}
+
+
+main: {
+    # Load per-transcript expression summaries, optionally annotate them with
+    # per-gene isoform percentages, then print (to STDOUT) the FASTA records that
+    # pass every requested filter; a retention summary is written to STDERR.
+    my %expr_vals = &parse_expr_matrix($matrix_file);
+
+    if (defined($min_pct_dom_iso) || $highest_iso_only_flag) {
+        my %gene_to_iso_map = ($trinity_mode_flag)
+            ? &parse_Trinity_gene_mapping($transcripts_file)
+            : &parse_gene_trans_map_file($gene_to_trans_map_file);
+
+        &add_pct_iso_stats(\%expr_vals, \%gene_to_iso_map);
+    }
+
+    my $total_records = 0;
+    my $retained_records = 0;
+
+    my $fasta_reader = Fasta_reader->new($transcripts_file);
+    while (my $seq_obj = $fasta_reader->next()) {
+
+        $total_records++;
+
+        my $acc = $seq_obj->get_accession();
+        my $info_struct = $expr_vals{$acc} or die "Error, no expression record stored for acc: [$acc]";
+
+        # Values may be undefined when isoform stats were not computed for this
+        # transcript; compare them as 0 and print them as empty strings, which is
+        # what Perl does implicitly, but without 'uninitialized' warnings.
+        my $max_expr = defined($info_struct->{max_expr}) ? $info_struct->{max_expr} : 0;
+        my $top_iso_flag = defined($info_struct->{top_iso}) ? $info_struct->{top_iso} : "";
+        my $pct_iso_expr = defined($info_struct->{pct_iso_expr}) ? $info_struct->{pct_iso_expr} : "";
+        my $pct_dom_iso_expr = defined($info_struct->{pct_dom_iso_expr}) ? $info_struct->{pct_dom_iso_expr} : "";
+
+        my $keep_flag = 1;
+
+        if ($min_expr_any && $max_expr < $min_expr_any) {
+            $keep_flag = 0;
+        }
+        if (defined($min_pct_dom_iso) && (! $top_iso_flag) && ($pct_dom_iso_expr || 0) < $min_pct_dom_iso) {
+            # Note: the dominant isoform of a gene is always kept, even when its
+            # percentage is below $min_pct_dom_iso, so the gene is never discarded altogether.
+            $keep_flag = 0;
+        }
+        if ($highest_iso_only_flag && ! $top_iso_flag) {
+            $keep_flag = 0;
+        }
+
+        if ($keep_flag) {
+            $retained_records++;
+            my $fasta_record = $seq_obj->get_FASTA_format();
+            chomp $fasta_record;
+            my ($header_line, @seq_lines) = split(/\n/, $fasta_record);
+            # tack on the pct expr info onto the header
+            $header_line .= " top_iso:$top_iso_flag pct_iso_expr=$pct_iso_expr pct_dom_iso_expr=$pct_dom_iso_expr";
+
+            print join("\n", $header_line, @seq_lines) . "\n";
+        }
+    }
+
+    # Guard the percentage against an empty transcripts file (division by zero).
+    my $pct_records_retained = sprintf("%.2f", $total_records ? $retained_records / $total_records * 100 : 0);
+    print STDERR "\n\n\tRetained $retained_records / $total_records = $pct_records_retained\% of total transcripts.\n\n\n";
+
+    exit(0);
+}
+
+####
+sub add_pct_iso_stats {
+    # For every gene in %$gene_to_iso_map_href, annotate each isoform's record in
+    # %$expr_vals_href (modified in place) with:
+    #   pct_iso_expr     - isoform sum_expr as a percent of the gene's summed sum_expr
+    #   pct_dom_iso_expr - isoform sum_expr as a percent of the gene's highest sum_expr
+    #   top_iso          - set to 1 for the single most highly expressed isoform
+    my ($expr_vals_href, $gene_to_iso_map_href) = @_;
+
+    foreach my $gene (keys %$gene_to_iso_map_href) {
+        my @isoforms = keys %{$gene_to_iso_map_href->{$gene}};
+        next unless @isoforms;
+
+        if (scalar @isoforms == 1) {
+            # only one isoform, so must be 100% of that gene.
+            $expr_vals_href->{ $isoforms[0] }->{pct_iso_expr} = 100;
+            $expr_vals_href->{ $isoforms[0] }->{pct_dom_iso_expr} = 100;
+            $expr_vals_href->{ $isoforms[0] }->{top_iso} = 1;
+            next;
+        }
+
+        # first, collect each isoform's expression plus the gene-level sum and maximum
+        my %iso_expr;
+        my $gene_sum_expr = 0;
+        my $dominant_iso_expr = 0;
+        foreach my $iso (@isoforms) {
+            my $expr = $expr_vals_href->{$iso}->{sum_expr};
+            unless (defined $expr) {
+                # isoform is in the gene map but has no sum_expr stored for it
+                print STDERR "Warning, no expression value stored for isoform: [$iso] of gene: [$gene], treating as zero\n";
+                $expr = 0;
+            }
+            $iso_expr{$iso} = $expr;
+            $dominant_iso_expr = $expr if $expr > $dominant_iso_expr;
+            $gene_sum_expr += $expr;
+        }
+
+        # now compute pct iso; a gene with no expression at all yields 0.00 for
+        # every isoform rather than dying on a division by zero
+        foreach my $iso (@isoforms) {
+            my $pct_iso = ($gene_sum_expr > 0) ? $iso_expr{$iso} / $gene_sum_expr * 100 : 0;
+            my $pct_dom_iso = ($dominant_iso_expr > 0) ? $iso_expr{$iso} / $dominant_iso_expr * 100 : 0;
+            $expr_vals_href->{$iso}->{pct_iso_expr} = sprintf("%.2f", $pct_iso);
+            $expr_vals_href->{$iso}->{pct_dom_iso_expr} = sprintf("%.2f", $pct_dom_iso);
+        }
+
+        # set top iso from the unrounded values (ties broken by name for reproducibility)
+        my ($top_isoform) = sort { $iso_expr{$b} <=> $iso_expr{$a} || $a cmp $b } @isoforms;
+        $expr_vals_href->{$top_isoform}->{top_iso} = 1;
+    }
+}
+
+
+####
+sub parse_expr_matrix {
+    # Read a tab-delimited expression matrix (header row, then one row per
+    # accession) and return a hash: accession => { max_expr, sum_expr, ... }.
+    my ($matrix_file) = @_;
+
+    my %expr_vals;
+    # 3-arg open: the file name is never interpreted as a mode or a pipe.
+    open (my $fh, '<', $matrix_file) or die "Error, cannot open file $matrix_file: $!";
+    my $header = <$fh>;    # column header row is read and discarded
+    while (my $line = <$fh>) {
+        chomp $line;
+        next unless $line =~ /\S/;    # tolerate blank lines (e.g. at end of file)
+        my ($acc, @expr) = split(/\t/, $line);
+
+        my $max_val = 0;
+        my $sum = 0;
+        foreach my $expr_val (@expr) {
+            $sum += $expr_val;
+            $max_val = $expr_val if $expr_val > $max_val;
+        }
+
+        $expr_vals{$acc} = {
+            max_expr         => $max_val,
+            sum_expr         => $sum,
+            pct_iso_expr     => undef,    # set later
+            pct_dom_iso_expr => undef,    # set later
+            top_iso          => 0,        # set later to 1 for the gene's top isoform
+        };
+    }
+    close $fh;
+
+    return(%expr_vals);
+}
+
+####
+sub parse_Trinity_gene_mapping {
+    # Derive gene => { transcript accession => 1 } from the FASTA headers of a
+    # Trinity assembly: the gene ID is the accession minus its trailing "_i<N>".
+    my ($transcripts_file) = @_;
+
+    my %gene_to_iso_map;
+    # 3-arg open: the file name is never interpreted as a mode or a pipe.
+    open (my $fh, '<', $transcripts_file) or die "Error, cannot open file $transcripts_file: $!";
+    while (my $line = <$fh>) {
+        next unless $line =~ /^>(\S+)/;    # only header lines carry accessions
+        my $acc = $1;
+        $acc =~ /^(\S+)_i\d+$/ or die "Error, cannot parse Trinity accession: $acc";
+        my $gene_id = $1;
+        $gene_to_iso_map{$gene_id}->{$acc} = 1;
+    }
+    close $fh;
+
+    return(%gene_to_iso_map);
+}
+
+####
+sub parse_gene_trans_map_file {
+    # Read "gene<TAB>transcript" lines; return gene => { transcript => 1 }.
+    my ($gene_to_trans_map_file) = @_;
+
+    my %gene_to_iso_map;
+    open (my $fh, '<', $gene_to_trans_map_file) or die "Error, cannot open file $gene_to_trans_map_file: $!";
+    while (my $line = <$fh>) {
+        chomp $line;
+        my ($gene, $trans) = split(/\t/, $line);
+        # skip blank or single-column lines instead of storing undef keys
+        next unless defined($gene) && length($gene) && defined($trans) && length($trans);
+        $gene_to_iso_map{$gene}->{$trans} = 1;
+    }
+    close $fh;
+    return (%gene_to_iso_map);
+}
+
diff --git a/util/insilico_read_normalization.pl b/util/insilico_read_normalization.pl
index 16ba3ac..15729be 100755
--- a/util/insilico_read_normalization.pl
+++ b/util/insilico_read_normalization.pl
@@ -6,7 +6,7 @@ use threads;
no strict qw(subs refs);
use FindBin;
-use lib ("$FindBin::Bin/../PerlLib");
+use lib ("$FindBin::RealBin/../PerlLib");
use File::Basename;
use Cwd;
use Carp;
@@ -63,7 +63,7 @@ my $usage = <<_EOUSAGE_;
#
#
# If paired reads:
-# --left <string> :left reads
+# --left <string> :left reads (if specifying multiple files, list them as comma-delimited. eg. leftA.fq,leftB.fq,...)
# --right <string> :right reads
#
# Or, if unpaired reads:
@@ -245,6 +245,9 @@ if ($max_memory) {
$sort_mem = $max_memory;
if ($PARALLEL_STATS) {
$sort_mem = int($sort_mem/2);
+ unless ($sort_mem > 1) {
+ $sort_mem = 1;
+ }
}
$sort_mem .= "G";
@@ -612,7 +615,7 @@ sub run_jellyfish {
sub prep_seqs {
my ($initial_file, $seqType, $file_prefix, $SS_lib_type) = @_;
- ($initial_file) = &add_fifo_for_gzip($initial_file) if $initial_file =~ /\.gz$/;
+ ($initial_file) = &add_fifo_for_gzip($initial_file) if $initial_file =~ /\.gz$|\.xz$|\.bz2$/;
if ($seqType eq "fq") {
# make fasta
@@ -947,6 +950,10 @@ sub add_fifo_for_gzip {
}
elsif ($file =~ /\.gz$/) {
$file = "<(gunzip -c $file)";
+ } elsif ($file =~ /\.xz$/) {
+ $file = "<(xz -d -c ${file})";
+ } elsif ($file =~ /\.bz2$/) {
+ $file = "<(bunzip2 -dc ${file})";
}
}
diff --git a/util/misc/Artemis/join_multi_wig_to_graph_plot.pl b/util/misc/Artemis/join_multi_wig_to_graph_plot.pl
index c9a244e..b9b9b60 100755
--- a/util/misc/Artemis/join_multi_wig_to_graph_plot.pl
+++ b/util/misc/Artemis/join_multi_wig_to_graph_plot.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use WigParser;
diff --git a/util/misc/BLAT_to_SAM.pl b/util/misc/BLAT_to_SAM.pl
index 2d52968..4ca81f8 100755
--- a/util/misc/BLAT_to_SAM.pl
+++ b/util/misc/BLAT_to_SAM.pl
@@ -171,8 +171,8 @@ if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) {
die "Error, SS_lib_type must be one of the following: (F, R, FR, RF) ";
}
-my $UTIL_DIR = "$FindBin::Bin/../support_scripts";
-my $BLAT_UTIL_DIR = "$FindBin::Bin/blat_util";
+my $UTIL_DIR = "$FindBin::RealBin/../support_scripts";
+my $BLAT_UTIL_DIR = "$FindBin::RealBin/blat_util";
my ($start_dir, $work_dir);
@@ -387,7 +387,7 @@ main: {
# report splice junctions and remove short terminal exons that are more likely noise.
- my $cmd = "$FindBin::Bin/../../Inchworm/bin/cigar_tweaker $outfile_basename.pre.coordSorted.sam target.fa $trim_short_terminal_segment_length | sort -T . -S $sort_buffer_size -k 3,3 -k 4,4n > $outfile_basename.coordSorted.spliceAdjust.sam";
+ my $cmd = "$FindBin::RealBin/../../Inchworm/bin/cigar_tweaker $outfile_basename.pre.coordSorted.sam target.fa $trim_short_terminal_segment_length | sort -T . -S $sort_buffer_size -k 3,3 -k 4,4n > $outfile_basename.coordSorted.spliceAdjust.sam";
&process_cmd($cmd) unless (-e "$outfile_basename.coordSorted.spliceAdjust.sam.finished");
$cmd = "touch $outfile_basename.coordSorted.spliceAdjust.sam.finished";
&process_cmd($cmd) unless (-e "$outfile_basename.coordSorted.spliceAdjust.sam.finished");
diff --git a/util/misc/ButterflyFastaToGraphDot.pl b/util/misc/ButterflyFastaToGraphDot.pl
index 7c30715..a25008e 100755
--- a/util/misc/ButterflyFastaToGraphDot.pl
+++ b/util/misc/ButterflyFastaToGraphDot.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib", "$FindBin::Bin/../../PerlLib/KmerGraphLib");
+use lib ("$FindBin::RealBin/../../PerlLib", "$FindBin::RealBin/../../PerlLib/KmerGraphLib");
use Fasta_reader;
use StringGraph;
diff --git a/util/misc/HiCpipe_nameSortedSam_to_raw.pl b/util/misc/HiCpipe_nameSortedSam_to_raw.pl
index 9e91dc0..9948245 100755
--- a/util/misc/HiCpipe_nameSortedSam_to_raw.pl
+++ b/util/misc/HiCpipe_nameSortedSam_to_raw.pl
@@ -5,7 +5,7 @@ use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
use Data::Dumper;
diff --git a/util/misc/Monarch b/util/misc/Monarch
index c90b2d4..f5aa833 100755
--- a/util/misc/Monarch
+++ b/util/misc/Monarch
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib", "$FindBin::Bin/../../PerlLib/KmerGraphLib");
+use lib ("$FindBin::RealBin/../../PerlLib", "$FindBin::RealBin/../../PerlLib/KmerGraphLib");
use Fasta_reader;
diff --git a/util/misc/N50.pl b/util/misc/N50.pl
index 189224c..9d23888 100755
--- a/util/misc/N50.pl
+++ b/util/misc/N50.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
diff --git a/util/misc/SAM_coordsorted_max_reads_per_position.pl b/util/misc/SAM_coordsorted_max_reads_per_position.pl
index f9664d3..8a5811a 100755
--- a/util/misc/SAM_coordsorted_max_reads_per_position.pl
+++ b/util/misc/SAM_coordsorted_max_reads_per_position.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/SAM_intron_extractor.pl b/util/misc/SAM_intron_extractor.pl
index 124e1ff..331f1ab 100755
--- a/util/misc/SAM_intron_extractor.pl
+++ b/util/misc/SAM_intron_extractor.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/SAM_pair_to_bed.pl b/util/misc/SAM_pair_to_bed.pl
index de49824..4c85907 100755
--- a/util/misc/SAM_pair_to_bed.pl
+++ b/util/misc/SAM_pair_to_bed.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
use Overlap_piler;
diff --git a/util/misc/SAM_sortAny_to_count_stats.pl b/util/misc/SAM_sortAny_to_count_stats.pl
index f5c88d2..46c444f 100755
--- a/util/misc/SAM_sortAny_to_count_stats.pl
+++ b/util/misc/SAM_sortAny_to_count_stats.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/SAM_toString.pl b/util/misc/SAM_toString.pl
index a17e79e..e3f43e4 100755
--- a/util/misc/SAM_toString.pl
+++ b/util/misc/SAM_toString.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/SAM_to_bed.pl b/util/misc/SAM_to_bed.pl
index 378f555..2a41422 100755
--- a/util/misc/SAM_to_bed.pl
+++ b/util/misc/SAM_to_bed.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/SAM_to_fasta.pl b/util/misc/SAM_to_fasta.pl
index a5aec64..7a11664 100755
--- a/util/misc/SAM_to_fasta.pl
+++ b/util/misc/SAM_to_fasta.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
use Nuc_translator;
diff --git a/util/misc/TophatCufflinksWrapper.pl b/util/misc/TophatCufflinksWrapper.pl
index 149e1a0..d280a10 100755
--- a/util/misc/TophatCufflinksWrapper.pl
+++ b/util/misc/TophatCufflinksWrapper.pl
@@ -92,7 +92,7 @@ if ($GTF_annots) {
main: {
- my $util_dir = "$FindBin::Bin/..";
+ my $util_dir = "$FindBin::RealBin/..";
#############################
## align reads using Tophat
diff --git a/util/misc/allele_simulator.pl b/util/misc/allele_simulator.pl
index 388379f..e42611d 100755
--- a/util/misc/allele_simulator.pl
+++ b/util/misc/allele_simulator.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib("$FindBin::Bin/../../PerlLib");
+use lib("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use List::Util qw(min max);
use Data::Dumper;
diff --git a/util/misc/average.pl b/util/misc/average.pl
index 11c83eb..09256e2 100755
--- a/util/misc/average.pl
+++ b/util/misc/average.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use BHStats;
my $count = 0;
diff --git a/util/misc/bam_gene_tests/extract_bam_reads_per_target_gene.pl b/util/misc/bam_gene_tests/extract_bam_reads_per_target_gene.pl
index 6449d71..58fc788 100755
--- a/util/misc/bam_gene_tests/extract_bam_reads_per_target_gene.pl
+++ b/util/misc/bam_gene_tests/extract_bam_reads_per_target_gene.pl
@@ -8,7 +8,7 @@ use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use File::Basename;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use Nuc_translator;
use SAM_reader;
diff --git a/util/misc/bam_gene_tests/extract_bam_reads_per_target_transcript.pl b/util/misc/bam_gene_tests/extract_bam_reads_per_target_transcript.pl
index a522be7..e655397 100755
--- a/util/misc/bam_gene_tests/extract_bam_reads_per_target_transcript.pl
+++ b/util/misc/bam_gene_tests/extract_bam_reads_per_target_transcript.pl
@@ -8,7 +8,7 @@ use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use File::Basename;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use Nuc_translator;
use SAM_reader;
diff --git a/util/misc/bam_gene_tests/write_trin_cmds.pl b/util/misc/bam_gene_tests/write_trin_cmds.pl
index c8df5c8..8e2edfb 100755
--- a/util/misc/bam_gene_tests/write_trin_cmds.pl
+++ b/util/misc/bam_gene_tests/write_trin_cmds.pl
@@ -70,7 +70,7 @@ while (<$fh>) {
my $file = pop @x;
- my $cmd = "$FindBin::Bin/../../../Trinity --single \"$file\" --output \"$file.trinity.$out_token\" $trin_args ";
+ my $cmd = "$FindBin::RealBin/../../../Trinity --single \"$file\" --output \"$file.trinity.$out_token\" $trin_args ";
print "$cmd\n";
}
diff --git a/util/misc/blast_outfmt6_group_segments.pl b/util/misc/blast_outfmt6_group_segments.pl
index 604b6c2..d015acc 100755
--- a/util/misc/blast_outfmt6_group_segments.pl
+++ b/util/misc/blast_outfmt6_group_segments.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib "$FindBin::Bin/../../PerlLib";
+use lib "$FindBin::RealBin/../../PerlLib";
use Fasta_reader;
use List::Util qw(min max);
use Overlap_piler;
diff --git a/util/misc/blast_outfmt6_group_segments.to_Markov_Clustering.pl b/util/misc/blast_outfmt6_group_segments.to_Markov_Clustering.pl
index be3c82f..de58107 100755
--- a/util/misc/blast_outfmt6_group_segments.to_Markov_Clustering.pl
+++ b/util/misc/blast_outfmt6_group_segments.to_Markov_Clustering.pl
@@ -6,7 +6,7 @@ use warnings;
use Carp;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use FindBin;
-use lib "$FindBin::Bin/../../PerlLib";
+use lib "$FindBin::RealBin/../../PerlLib";
use Pipeliner;
use File::Basename;
diff --git a/util/misc/blat_util/blat_sam_add_reads2.pl b/util/misc/blat_util/blat_sam_add_reads2.pl
index 88f9d90..4177bf8 100755
--- a/util/misc/blat_util/blat_sam_add_reads2.pl
+++ b/util/misc/blat_util/blat_sam_add_reads2.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use Nuc_translator;
my $usage = "usage: $0 blat.psl.nameSorted.sam reads.tab.nameSorted\n\n";
diff --git a/util/misc/blat_util/blat_to_sam.pl b/util/misc/blat_util/blat_to_sam.pl
index 2652d21..4afaabd 100755
--- a/util/misc/blat_util/blat_to_sam.pl
+++ b/util/misc/blat_util/blat_to_sam.pl
@@ -72,7 +72,7 @@ unless ($genome_fa && $reads_fa) {
main: {
- my $util_dir = "$FindBin::Bin/../util";
+ my $util_dir = "$FindBin::RealBin/../util";
my $cmd = "$util_dir/fasta_to_tab.pl $reads_fa > $reads_fa.tab";
&process_cmd($cmd) unless (-s "$reads_fa.tab");
@@ -97,7 +97,7 @@ main: {
$cmd = "$util_dir/top_blat_sam_extractor.pl $reads_fa.psl.sam.wReads $top_hits $min_per_ID > $reads_fa.psl.sam.wReads.top";
&process_cmd($cmd);
- $cmd = "$FindBin::Bin/cigar_tweaker $reads_fa.psl.sam.wReads.top $genome_fa > $reads_fa.psl.sam.wReads.top.tweaked";
+ $cmd = "$FindBin::RealBin/cigar_tweaker $reads_fa.psl.sam.wReads.top $genome_fa > $reads_fa.psl.sam.wReads.top.tweaked";
&process_cmd($cmd);
$cmd = "sort -T . -S 2G -k 3,3 -k 4,4n $reads_fa.psl.sam.wReads.top.tweaked > $reads_fa.psl.sam.wReads.top.tweaked.coordSorted.sam";
diff --git a/util/misc/blat_util/process_BLAT_alignments.pl b/util/misc/blat_util/process_BLAT_alignments.pl
index bf3bca6..6d63718 100755
--- a/util/misc/blat_util/process_BLAT_alignments.pl
+++ b/util/misc/blat_util/process_BLAT_alignments.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use strict;
use warnings;
@@ -66,7 +66,7 @@ my $genome_db = $opt_g;
my $transcript_db = $opt_t;
my $output_prefix = $opt_o || "blat";
my $blat_path = "blat";
-my $util_dir = $FindBin::Bin;
+my $util_dir = $FindBin::RealBin;
unless ($genome_db && $transcript_db) {
die "$usage\n";
diff --git a/util/misc/blat_util/top_blat_sam_extractor.pl b/util/misc/blat_util/top_blat_sam_extractor.pl
index 7de148f..d8842d5 100755
--- a/util/misc/blat_util/top_blat_sam_extractor.pl
+++ b/util/misc/blat_util/top_blat_sam_extractor.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/capture_orig_n_unmapped_reads.pl b/util/misc/capture_orig_n_unmapped_reads.pl
index faad3b4..b923f15 100644
--- a/util/misc/capture_orig_n_unmapped_reads.pl
+++ b/util/misc/capture_orig_n_unmapped_reads.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Cwd;
use Carp;
diff --git a/util/misc/cdna_fasta_file_to_transcript_gtf.pl b/util/misc/cdna_fasta_file_to_transcript_gtf.pl
index 2172776..fbdeb54 100755
--- a/util/misc/cdna_fasta_file_to_transcript_gtf.pl
+++ b/util/misc/cdna_fasta_file_to_transcript_gtf.pl
@@ -5,7 +5,7 @@ use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
diff --git a/util/misc/check_fastQ_pair_ordering.pl b/util/misc/check_fastQ_pair_ordering.pl
index 332de3e..1733384 100755
--- a/util/misc/check_fastQ_pair_ordering.pl
+++ b/util/misc/check_fastQ_pair_ordering.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
my $usage = "usage: $0 left.fq right.fq\n\n";
diff --git a/util/misc/contig_ExN50_statistic.pl b/util/misc/contig_ExN50_statistic.pl
index ade42b5..09e0dea 100755
--- a/util/misc/contig_ExN50_statistic.pl
+++ b/util/misc/contig_ExN50_statistic.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 EXPR.matrix Trinity.fasta\n\n";
@@ -109,15 +109,16 @@ while (@trans) {
# ensure that we do E100
if (%Estats_wanted) {
-
- my $min_max_expr = &get_min_max(@captured);
- my $N50 = &calc_N50(@captured);
- my $num_trans = scalar(@captured);
- print "E100\t$min_max_expr\t$N50\t$num_trans\n";
+ if (exists $Estats_wanted{"100"}) {
+ my $min_max_expr = &get_min_max(@captured);
+ my $N50 = &calc_N50(@captured);
+ my $num_trans = scalar(@captured);
+
+ print "E100\t$min_max_expr\t$N50\t$num_trans\n";
+ }
}
-
exit(0);
diff --git a/util/misc/extract_fastQ_pairings.pl b/util/misc/extract_fastQ_pairings.pl
index dc395bd..623c0b2 100755
--- a/util/misc/extract_fastQ_pairings.pl
+++ b/util/misc/extract_fastQ_pairings.pl
@@ -6,7 +6,7 @@ use warnings;
use Data::Dumper;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
my $DEBUG = 0;
diff --git a/util/misc/fastQ_rand_subset.pl b/util/misc/fastQ_rand_subset.pl
index 8b0f7e0..8b957c9 100755
--- a/util/misc/fastQ_rand_subset.pl
+++ b/util/misc/fastQ_rand_subset.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
use File::Basename;
diff --git a/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl b/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
index e5377d0..eedc324 100755
--- a/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
+++ b/util/misc/fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl
@@ -23,7 +23,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
use File::Basename;
diff --git a/util/misc/fasta_file_reformatter.pl b/util/misc/fasta_file_reformatter.pl
index e98b867..8f0be71 100755
--- a/util/misc/fasta_file_reformatter.pl
+++ b/util/misc/fasta_file_reformatter.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 fasta\n";
diff --git a/util/misc/fasta_filter_by_min_length.pl b/util/misc/fasta_filter_by_min_length.pl
index 5e233e2..167ccab 100755
--- a/util/misc/fasta_filter_by_min_length.pl
+++ b/util/misc/fasta_filter_by_min_length.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
diff --git a/util/misc/fasta_seq_length.pl b/util/misc/fasta_seq_length.pl
new file mode 100755
index 0000000..c3337d5
--- /dev/null
+++ b/util/misc/fasta_seq_length.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+
+# RealBin (symlink-resolved) so PerlLib is found when this script is run via a symlink.
+use lib ("$FindBin::RealBin/../../PerlLib");
+use Fasta_reader;
+
+# Prints a two-column, tab-delimited table (accession, sequence length) for a FASTA file.
+my $usage = "usage: $0 fastaFile\n\n";
+
+my $file = $ARGV[0] or die $usage;
+my $fasta_reader = Fasta_reader->new($file);
+
+print join("\t", "fasta_entry", "length") . "\n";
+while (my $seq_obj = $fasta_reader->next()) {
+    my $sequence = $seq_obj->get_sequence();
+    my $accession = $seq_obj->get_accession();
+    print join("\t", $accession, length($sequence)) . "\n";
+}
+
+exit(0);
+
diff --git a/util/misc/fasta_to_cmd_generator.pl b/util/misc/fasta_to_cmd_generator.pl
index 1baccd3..53f0dea 100755
--- a/util/misc/fasta_to_cmd_generator.pl
+++ b/util/misc/fasta_to_cmd_generator.pl
@@ -2,7 +2,7 @@
use strict;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Getopt::Std;
use strict;
diff --git a/util/misc/fasta_write_sense_n_anti.pl b/util/misc/fasta_write_sense_n_anti.pl
index 7d7dfd6..a44b88a 100755
--- a/util/misc/fasta_write_sense_n_anti.pl
+++ b/util/misc/fasta_write_sense_n_anti.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Nuc_translator;
diff --git a/util/misc/fastq_interleave_pairs.pl b/util/misc/fastq_interleave_pairs.pl
index 7f20c6d..ec749ab 100755
--- a/util/misc/fastq_interleave_pairs.pl
+++ b/util/misc/fastq_interleave_pairs.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
diff --git a/util/misc/fastq_unweave_pairs.pl b/util/misc/fastq_unweave_pairs.pl
index 9a84da5..b69a93d 100755
--- a/util/misc/fastq_unweave_pairs.pl
+++ b/util/misc/fastq_unweave_pairs.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
diff --git a/util/misc/gene_gff3_to_introns.pl b/util/misc/gene_gff3_to_introns.pl
index 833fdc4..cd96479 100755
--- a/util/misc/gene_gff3_to_introns.pl
+++ b/util/misc/gene_gff3_to_introns.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use Fasta_reader;
use GFF3_utils;
diff --git a/util/misc/get_longest_isoform_seq_per_trinity_gene.pl b/util/misc/get_longest_isoform_seq_per_trinity_gene.pl
index 2ce55ee..53b35bb 100755
--- a/util/misc/get_longest_isoform_seq_per_trinity_gene.pl
+++ b/util/misc/get_longest_isoform_seq_per_trinity_gene.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
print STDERR "\n\n\tNOTE - longest transcript isn't always the best transcript!... consider filtering based on relative expression support ... \n\n";
diff --git a/util/misc/gff3_file_to_cdna.pl b/util/misc/gff3_file_to_cdna.pl
index 4fed058..4b6a8d9 100755
--- a/util/misc/gff3_file_to_cdna.pl
+++ b/util/misc/gff3_file_to_cdna.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use Fasta_reader;
use GFF3_utils;
diff --git a/util/misc/gff3_file_utr_coverage_trimmer.pl b/util/misc/gff3_file_utr_coverage_trimmer.pl
index 25855cc..632a3e0 100755
--- a/util/misc/gff3_file_utr_coverage_trimmer.pl
+++ b/util/misc/gff3_file_utr_coverage_trimmer.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use Fasta_reader;
use GFF3_utils;
diff --git a/util/misc/gff3_to_genome_feature_base_encoding.parse_SAM.pl b/util/misc/gff3_to_genome_feature_base_encoding.parse_SAM.pl
index fd60201..abf5d1c 100755
--- a/util/misc/gff3_to_genome_feature_base_encoding.parse_SAM.pl
+++ b/util/misc/gff3_to_genome_feature_base_encoding.parse_SAM.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use GFF3_utils;
use SAM_reader;
diff --git a/util/misc/gff3_to_genome_feature_base_encoding.pl b/util/misc/gff3_to_genome_feature_base_encoding.pl
index f541f8b..6fc6893 100755
--- a/util/misc/gff3_to_genome_feature_base_encoding.pl
+++ b/util/misc/gff3_to_genome_feature_base_encoding.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use GFF3_utils;
use Data::Dumper;
diff --git a/util/misc/gmap_gff3_chimera_jaccard_analyzer.pl b/util/misc/gmap_gff3_chimera_jaccard_analyzer.pl
index aef9dd9..2a1abfa 100755
--- a/util/misc/gmap_gff3_chimera_jaccard_analyzer.pl
+++ b/util/misc/gmap_gff3_chimera_jaccard_analyzer.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use GFF3_alignment_utils;
use WigParser;
use Data::Dumper;
diff --git a/util/misc/gmap_gff3_to_percent_length_stats.pl b/util/misc/gmap_gff3_to_percent_length_stats.pl
index cceb585..e802bba 100755
--- a/util/misc/gmap_gff3_to_percent_length_stats.pl
+++ b/util/misc/gmap_gff3_to_percent_length_stats.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 gmap.gff3 transcripts.fasta\n\n";
diff --git a/util/misc/gmap_native_to_format_converter.pl b/util/misc/gmap_native_to_format_converter.pl
index 79438d3..c6ad56f 100755
--- a/util/misc/gmap_native_to_format_converter.pl
+++ b/util/misc/gmap_native_to_format_converter.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
my $usage = "usage: $0 file.gmap (BED|GTF)\n\n";
diff --git a/util/misc/gtf_to_bed_format.pl b/util/misc/gtf_to_bed_format.pl
index 95a55f3..1d0bc66 100755
--- a/util/misc/gtf_to_bed_format.pl
+++ b/util/misc/gtf_to_bed_format.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
my $usage = "usage: $0 transcripts.gtf\n\n";
diff --git a/util/misc/gtf_to_introns.pl b/util/misc/gtf_to_introns.pl
index 09098ce..aea9bea 100755
--- a/util/misc/gtf_to_introns.pl
+++ b/util/misc/gtf_to_introns.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
use Fasta_reader;
diff --git a/util/misc/identify_distal_isoform_variations.pl b/util/misc/identify_distal_isoform_variations.pl
index 6c94823..e5b1101 100755
--- a/util/misc/identify_distal_isoform_variations.pl
+++ b/util/misc/identify_distal_isoform_variations.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib", "$FindBin::Bin/PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib", "$FindBin::RealBin/PerlLib");
use Gene_obj;
use GFF3_utils;
diff --git a/util/misc/illustrate_ref_comparison.pl b/util/misc/illustrate_ref_comparison.pl
index 9f359fb..ba827ed 100755
--- a/util/misc/illustrate_ref_comparison.pl
+++ b/util/misc/illustrate_ref_comparison.pl
@@ -5,7 +5,7 @@ use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib/");
+use lib ("$FindBin::RealBin/../../PerlLib/");
use Ascii_genome_illustrator;
use Cwd;
diff --git a/util/misc/jaccard_sam_pair_refiner.pl b/util/misc/jaccard_sam_pair_refiner.pl
index a3a1697..68b5e7e 100755
--- a/util/misc/jaccard_sam_pair_refiner.pl
+++ b/util/misc/jaccard_sam_pair_refiner.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/kmer_counter.pl b/util/misc/kmer_counter.pl
index 848f486..06b1e1a 100755
--- a/util/misc/kmer_counter.pl
+++ b/util/misc/kmer_counter.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use Carp;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Ktree;
use Nuc_translator;
diff --git a/util/misc/m8_blastclust.pl b/util/misc/m8_blastclust.pl
index 2999606..57f090a 100755
--- a/util/misc/m8_blastclust.pl
+++ b/util/misc/m8_blastclust.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Carp;
use Getopt::Long qw(:config no_ignore_case bundling);
diff --git a/util/misc/map_gtf_transcripts_to_genome_annots.pl b/util/misc/map_gtf_transcripts_to_genome_annots.pl
index e47866a..1db13b7 100755
--- a/util/misc/map_gtf_transcripts_to_genome_annots.pl
+++ b/util/misc/map_gtf_transcripts_to_genome_annots.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib/");
+use lib ("$FindBin::RealBin/../../PerlLib/");
use Gene_obj;
use GFF3_utils;
use GTF_utils;
diff --git a/util/misc/merge_blast_n_rsem_results.pl b/util/misc/merge_blast_n_rsem_results.pl
index ec223f4..f5e4c9e 100755
--- a/util/misc/merge_blast_n_rsem_results.pl
+++ b/util/misc/merge_blast_n_rsem_results.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 rsem.out blast.outfmt6 [transcripts.fasta]\n\n";
diff --git a/util/misc/nameSorted_SAM_to_FastQ.pl b/util/misc/nameSorted_SAM_to_FastQ.pl
index 1254d00..f253388 100755
--- a/util/misc/nameSorted_SAM_to_FastQ.pl
+++ b/util/misc/nameSorted_SAM_to_FastQ.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/pairwise_kmer_content_comparer.pl b/util/misc/pairwise_kmer_content_comparer.pl
index 2d7a88a..38bf39a 100755
--- a/util/misc/pairwise_kmer_content_comparer.pl
+++ b/util/misc/pairwise_kmer_content_comparer.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 file.fasta [kmer_length=25]\n\n";
diff --git a/util/misc/plot_ExN50_statistic.Rscript b/util/misc/plot_ExN50_statistic.Rscript
new file mode 100755
index 0000000..bef4887
--- /dev/null
+++ b/util/misc/plot_ExN50_statistic.Rscript
@@ -0,0 +1,14 @@
+#!/usr/bin/env Rscript
+# Plot the E.N50 column of the table given as the first argument to <input>.plot.pdf
+args<-commandArgs(TRUE)
+
+dat_filename = args[1]
+pdf_filename = paste(dat_filename, ".plot.pdf", sep='')
+pdf(pdf_filename)
+data = read.table(dat_filename, com='', header=T, row.names=1)
+plot(data$E.N50, xlab="Pct. Ex.", ylab="N50 contig length", col='blue', t='b')
+dev.off()  # close the PDF device so the file is complete before it is reported
+# cat() prints to stdout and returns NULL; paste() builds the text so it reaches stderr
+write(paste("ExN50 data plotted as:", pdf_filename), stderr())
+quit(save = "no", status = 0, runLast = FALSE)
+
diff --git a/util/misc/plot_expressed_gene_dist.pl b/util/misc/plot_expressed_gene_dist.pl
index 713345a..7120dd2 100755
--- a/util/misc/plot_expressed_gene_dist.pl
+++ b/util/misc/plot_expressed_gene_dist.pl
@@ -11,7 +11,7 @@ my $fpkm_file = $ARGV[0] or die $usage;
my $Rscript = "$fpkm_file.R";
open (my $ofh, ">$Rscript");
-print $ofh "source(\"$FindBin::Bin/R/expression_analysis_lib.R\")\n";
+print $ofh "source(\"$FindBin::RealBin/R/expression_analysis_lib.R\")\n";
print $ofh "pdf(\"$fpkm_file.genes_vs_minFPKM.pdf\")\n";
print $ofh "plot_expressed_gene_counts(\"$fpkm_file\", title=\"expressed transcripts vs. min FPKM\", fpkm_range=seq(0,5,0.01), outfile=\"$fpkm_file.genes_vs_minFPKM.dat\")\n";
print $ofh "dev.off()\n";
diff --git a/util/misc/print_kmers.pl b/util/misc/print_kmers.pl
index c9711ac..7d6b958 100755
--- a/util/misc/print_kmers.pl
+++ b/util/misc/print_kmers.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 file.fa [kmer_length=25]\n\n";
diff --git a/util/misc/prop_pair_sam_refiner.pl b/util/misc/prop_pair_sam_refiner.pl
index 6f93741..5fae6c9 100755
--- a/util/misc/prop_pair_sam_refiner.pl
+++ b/util/misc/prop_pair_sam_refiner.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/misc/run_GSNAP.pl b/util/misc/run_GSNAP.pl
index 2ad4337..1284c6d 100755
--- a/util/misc/run_GSNAP.pl
+++ b/util/misc/run_GSNAP.pl
@@ -109,7 +109,9 @@ main: {
my $gsnap_use_sarray = ($no_sarray) ? "--use-sarray=0" : "";
- $reads = &add_zcat_fifo($reads);
+ if ($reads =~ /\.gz$/) {
+ $reads .= " --gunzip";
+ }
my $require_proper_pairs = "";
if ($proper_pairs_only_flag) {
@@ -127,29 +129,6 @@ main: {
exit(0);
}
-
-####
-sub add_zcat_fifo {
- my ($reads) = @_;
-
- my @adj_reads_list;
-
- foreach my $reads_file (split(/\s+/, $reads) ) {
- if ($reads_file =~ /\.gz$/) {
- $reads_file = "<(zcat $reads_file)";
- }
- push (@adj_reads_list, $reads_file);
- }
-
- my $adj_reads = join(" ", @adj_reads_list);
-
- return($adj_reads);
-}
-
-
-
-
-
####
sub process_cmd {
my ($cmd) = @_;
diff --git a/util/misc/run_HISAT.pl b/util/misc/run_HISAT.pl
index 3a7252d..82e0972 100755
--- a/util/misc/run_HISAT.pl
+++ b/util/misc/run_HISAT.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib("$FindBin::Bin/../../PerlLib");
+use lib("$FindBin::RealBin/../../PerlLib");
use Pipeliner;
use File::Basename;
use Cwd;
diff --git a/util/misc/run_HiCpipe_bowtie.pl b/util/misc/run_HiCpipe_bowtie.pl
index 890bd82..baa6411 100755
--- a/util/misc/run_HiCpipe_bowtie.pl
+++ b/util/misc/run_HiCpipe_bowtie.pl
@@ -21,12 +21,12 @@ while ($output_dir =~ m|/$|) {
main: {
## run bowtie
- my $cmd = "$FindBin::Bin/../alignReads.pl --target $genome_file --left $left_fq_file --right $right_fq_file "
+ my $cmd = "$FindBin::RealBin/../alignReads.pl --target $genome_file --left $left_fq_file --right $right_fq_file "
. " --seqType fq --aligner bowtie -o $output_dir --max_dist_between_pairs 900000000 --no_rsem --retain_intermediate_files "
. " -- -a -m 1 --best --strata -p 4 --chunkmbs 512 ";
&process_cmd($cmd) unless (-s "$output_dir/$output_dir.nameSorted.sam");
- $cmd = "$FindBin::Bin/HiCpipe_nameSortedSam_to_raw.pl $output_dir/$output_dir.nameSorted.sam > $output_dir/$output_dir.raw";
+ $cmd = "$FindBin::RealBin/HiCpipe_nameSortedSam_to_raw.pl $output_dir/$output_dir.nameSorted.sam > $output_dir/$output_dir.raw";
&process_cmd($cmd) unless (-s "$output_dir/$output_dir.raw");
diff --git a/util/misc/run_bowtie2.pl b/util/misc/run_bowtie2.pl
new file mode 100755
index 0000000..eb8d095
--- /dev/null
+++ b/util/misc/run_bowtie2.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/env perl
+# Align reads to a target sequence with bowtie2 (--local), building the index first if needed.
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::RealBin/../../PerlLib");
+use Process_cmd;
+
+my $usage = "usage: $0 target.seq reads_1.fq [reads_2.fq]\n\n"
+    . " and you can pipe it into samtools to make a bam file:\n\n"
+    . "\t | samtools view -Sb - | samtools sort - myoutputbamMinusExtension\n\n";
+
+my $target_seq = $ARGV[0] or die $usage;
+my $reads_1_fq = $ARGV[1] or die $usage;
+my $reads_2_fq = $ARGV[2];
+
+main: {
+    # Build the bowtie2 index unless "$target_seq.1.bt2" already exists and is non-empty.
+    unless (-s "$target_seq.1.bt2") {
+        my $cmd = "bowtie2-build $target_seq $target_seq 1>&2 ";
+        &process_cmd($cmd);
+    }
+    # Read file names containing .fq or .fastq are passed as FASTQ (-q); anything else as FASTA (-f).
+    my $format = ($reads_1_fq =~ /\.(fq|fastq)/) ? "-q" : "-f";
+
+    my $bowtie2_cmd = "bowtie2 --local --no-unal -x $target_seq $format ";
+    if ($reads_2_fq) {
+        $bowtie2_cmd .= " -1 $reads_1_fq -2 $reads_2_fq ";
+    }
+    else {
+        $bowtie2_cmd .= " -U $reads_1_fq ";
+    }
+
+
+    &process_cmd($bowtie2_cmd);
+
+    exit(0);
+}
+
diff --git a/util/misc/run_read_simulator_per_fasta_entry.pl b/util/misc/run_read_simulator_per_fasta_entry.pl
index f589a37..6771e33 100755
--- a/util/misc/run_read_simulator_per_fasta_entry.pl
+++ b/util/misc/run_read_simulator_per_fasta_entry.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
@@ -41,7 +41,7 @@ main: {
my $outfile = "$sim_out_dir/$outdir/$outdir.reads.fa";
- my $cmd = "$FindBin::Bin/simulate_illuminaPE_from_transcripts.pl --transcripts $template_file --out_prefix $template_file";
+ my $cmd = "$FindBin::RealBin/simulate_illuminaPE_from_transcripts.pl --transcripts $template_file --out_prefix $template_file";
if ($require_proper_pairs_flag) {
$cmd .= " --require_proper_pairs";
}
diff --git a/util/misc/run_read_simulator_per_gene.pl b/util/misc/run_read_simulator_per_gene.pl
index 31e424f..550f2eb 100755
--- a/util/misc/run_read_simulator_per_gene.pl
+++ b/util/misc/run_read_simulator_per_gene.pl
@@ -69,7 +69,7 @@ main: {
my $outfile = "$sim_out_dir/$outdir/$outdir.reads.fa";
- my $cmd = "$FindBin::Bin/../simulate_illuminaPE_from_transcripts.pl --transcripts $template_file --SS > $outfile";
+ my $cmd = "$FindBin::RealBin/../simulate_illuminaPE_from_transcripts.pl --transcripts $template_file --SS > $outfile";
&process_cmd($cmd);
$gene_counter++;
diff --git a/util/misc/run_trimmomatic_qual_trimming.pl b/util/misc/run_trimmomatic_qual_trimming.pl
index 81c11d9..f0784c3 100755
--- a/util/misc/run_trimmomatic_qual_trimming.pl
+++ b/util/misc/run_trimmomatic_qual_trimming.pl
@@ -72,7 +72,7 @@ main: {
if ($left && $right) {
- $cmd = "java -jar $FindBin::Bin/../../trinity-plugins/Trimmomatic/trimmomatic.jar PE -threads $threads -phred33 "
+ $cmd = "java -jar $FindBin::RealBin/../../trinity-plugins/Trimmomatic/trimmomatic.jar PE -threads $threads -phred33 "
. " $left $right "
. " $left.P.qtrim.fq $left.U.qtrim.fq "
. " $right.P.qtrim.fq $right.U.qtrim.fq "
@@ -80,7 +80,7 @@ main: {
}
else {
- $cmd = "java -jar $FindBin::Bin/../../trinity-plugins/Trimmomatic/trimmomatic.jar SE -threads $threads -phred33 "
+ $cmd = "java -jar $FindBin::RealBin/../../trinity-plugins/Trimmomatic/trimmomatic.jar SE -threads $threads -phred33 "
. " $single "
. " $single.qtrim.fq "
. " $trim_params ";
diff --git a/util/misc/simulate_illuminaPE_from_transcripts.pl b/util/misc/simulate_illuminaPE_from_transcripts.pl
index 400d09f..566f214 100755
--- a/util/misc/simulate_illuminaPE_from_transcripts.pl
+++ b/util/misc/simulate_illuminaPE_from_transcripts.pl
@@ -5,7 +5,7 @@ use warnings;
use Carp;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Nuc_translator;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
diff --git a/util/misc/simulate_reads_sam_and_fa.pl b/util/misc/simulate_reads_sam_and_fa.pl
index caaca6a..1037ddb 100755
--- a/util/misc/simulate_reads_sam_and_fa.pl
+++ b/util/misc/simulate_reads_sam_and_fa.pl
@@ -353,7 +353,7 @@ main: {
=strand_sep_trans
if ($SS_lib_type) {
- $cmd = "$FindBin::Bin/../support_scripts/SAM_strand_separator.pl $trans_sam_outfile.coordSorted.bam $SS_lib_type";
+ $cmd = "$FindBin::RealBin/../support_scripts/SAM_strand_separator.pl $trans_sam_outfile.coordSorted.bam $SS_lib_type";
&process_cmd($cmd);
foreach my $sam_file ("$trans_sam_outfile.coordSorted.bam.+.sam", "$trans_sam_outfile.coordSorted.bam.-.sam") {
@@ -398,7 +398,7 @@ main: {
=strand_sep_genome
if ($SS_lib_type) {
- $cmd = "$FindBin::Bin/../support_scripts/SAM_strand_separator.pl $genome_sam_outfile.coordSorted.bam $SS_lib_type";
+ $cmd = "$FindBin::RealBin/../support_scripts/SAM_strand_separator.pl $genome_sam_outfile.coordSorted.bam $SS_lib_type";
&process_cmd($cmd);
foreach my $sam_file ("$genome_sam_outfile.coordSorted.bam.+.sam", "$genome_sam_outfile.coordSorted.bam.-.sam") {
diff --git a/util/misc/sixFrameTranslation.pl b/util/misc/sixFrameTranslation.pl
index 3d93a2d..598c73d 100755
--- a/util/misc/sixFrameTranslation.pl
+++ b/util/misc/sixFrameTranslation.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use Nuc_translator;
diff --git a/util/misc/sort_fastq.pl b/util/misc/sort_fastq.pl
index cf47363..3fa7bfc 100755
--- a/util/misc/sort_fastq.pl
+++ b/util/misc/sort_fastq.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
my $usage = "usage: $0 file.fastq\n\n";
diff --git a/util/misc/splice_path_analysis/assess_intron_path_sensitivity.pl b/util/misc/splice_path_analysis/assess_intron_path_sensitivity.pl
index c2b874c..ca698f0 100755
--- a/util/misc/splice_path_analysis/assess_intron_path_sensitivity.pl
+++ b/util/misc/splice_path_analysis/assess_intron_path_sensitivity.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../../PerlLib");
+use lib ("$FindBin::RealBin/../../../PerlLib");
use Gene_obj;
use GFF3_utils;
use BED_utils;
diff --git a/util/misc/strip_fasta_header.pl b/util/misc/strip_fasta_header.pl
index f796c42..18618cb 100755
--- a/util/misc/strip_fasta_header.pl
+++ b/util/misc/strip_fasta_header.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use Carp;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 file.fasta\n\n";
diff --git a/util/misc/transcript_coverage_UTR_trimmer.pl b/util/misc/transcript_coverage_UTR_trimmer.pl
index 05fda3f..89decf2 100755
--- a/util/misc/transcript_coverage_UTR_trimmer.pl
+++ b/util/misc/transcript_coverage_UTR_trimmer.pl
@@ -7,7 +7,7 @@ use threads;
use FindBin;
use Getopt::Long qw(:config no_ignore_case bundling);
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use WigParser;
use Fasta_reader;
use Statistics::Descriptive;
@@ -71,7 +71,7 @@ if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) {
die "Error, invalid --SS_lib_type, only F, R, FR, or RF are possible values";
}
-my $UTIL_DIR = "$FindBin::Bin/";
+my $UTIL_DIR = "$FindBin::RealBin/";
main: {
diff --git a/util/misc/transcript_fasta_to_ORF_pics.pl b/util/misc/transcript_fasta_to_ORF_pics.pl
index badb978..a4b95e3 100755
--- a/util/misc/transcript_fasta_to_ORF_pics.pl
+++ b/util/misc/transcript_fasta_to_ORF_pics.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use strict;
use warnings;
diff --git a/util/misc/transcript_gff3_to_bed.pl b/util/misc/transcript_gff3_to_bed.pl
index 759485d..f03486b 100755
--- a/util/misc/transcript_gff3_to_bed.pl
+++ b/util/misc/transcript_gff3_to_bed.pl
@@ -5,7 +5,7 @@ use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;
my $usage = "usage: $0 alignments.gff3\n\n";
diff --git a/util/misc/trinity_component_distribution.pl b/util/misc/trinity_component_distribution.pl
index fb72273..99f7cd3 100755
--- a/util/misc/trinity_component_distribution.pl
+++ b/util/misc/trinity_component_distribution.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use POSIX qw (ceil);
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 Trinity.fasta [length_bin_size=100] [out_prefix='dist']\n\n";
diff --git a/util/run_DE_analysis_from_samples_file.pl b/util/run_DE_analysis_from_samples_file.pl
index 4cbd356..ccb04b1 100755
--- a/util/run_DE_analysis_from_samples_file.pl
+++ b/util/run_DE_analysis_from_samples_file.pl
@@ -8,7 +8,7 @@ use Cwd;
######################################################
## Set to base directory of the Trinity installation:
-my $BASEDIR = "$FindBin::Bin/../";
+my $BASEDIR = "$FindBin::RealBin/../";
######################################################
my $usage = <<__EOUSAGE__;
diff --git a/util/run_RSEM_from_samples_file.pl b/util/run_RSEM_from_samples_file.pl
index 67d0ccd..2a86ab4 100755
--- a/util/run_RSEM_from_samples_file.pl
+++ b/util/run_RSEM_from_samples_file.pl
@@ -8,7 +8,7 @@ use Cwd;
######################################################
## Set to base directory of the Trinity installation:
-my $BASEDIR = "$FindBin::Bin/../";
+my $BASEDIR = "$FindBin::RealBin/../";
######################################################
my $usage = <<__EOUSAGE__;
diff --git a/util/run_Trinity_edgeR_pipeline.pl b/util/run_Trinity_edgeR_pipeline.pl
index b7a30de..5b2f03b 100755
--- a/util/run_Trinity_edgeR_pipeline.pl
+++ b/util/run_Trinity_edgeR_pipeline.pl
@@ -8,12 +8,19 @@ use Cwd;
######################################################
## Set to base directory of the Trinity installation:
-my $BASEDIR = "$FindBin::Bin/../";
+my $BASEDIR = "$FindBin::RealBin/../";
######################################################
my $usage = <<__EOUSAGE__;
-
+#############################
+#
+# Note, if you already have a Trinity assembly and it exists as:
+#
+# trinity_out_dir/Trinity.fasta
+#
+# Then, it will be used as the target for expression and DE analysis.
+#
##########################################################################################################
#
# Required:
@@ -92,8 +99,15 @@ my $reads_ALL_left_fq = "reads.ALL.left.fq";
my $reads_ALL_right_fq = "reads.ALL.right.fq";
my $REGENERATE_ALL_FQ = 1;
-if (-s $reads_ALL_left_fq && -s $reads_ALL_right_fq) {
- $REGENERATE_ALL_FQ = 0;
+if (-s $reads_ALL_left_fq || -s $reads_ALL_right_fq) {
+
+ if (-s $reads_ALL_left_fq == -s $reads_ALL_right_fq)
+ {
+ $REGENERATE_ALL_FQ = 0;
+ }
+ else {
+ unlink($reads_ALL_left_fq, $reads_ALL_right_fq);
+ }
}
diff --git a/util/run_Trinity_from_samples_file.pl b/util/run_Trinity_from_samples_file.pl
index 3e048d2..21af809 100755
--- a/util/run_Trinity_from_samples_file.pl
+++ b/util/run_Trinity_from_samples_file.pl
@@ -8,7 +8,7 @@ use Cwd;
######################################################
## Set to base directory of the Trinity installation:
-my $BASEDIR = "$FindBin::Bin/../";
+my $BASEDIR = "$FindBin::RealBin/../";
######################################################
my $usage = <<__EOUSAGE__;
diff --git a/util/support_scripts/SAM_coordSorted_fragment_Read_coverage_writer.pl b/util/support_scripts/SAM_coordSorted_fragment_Read_coverage_writer.pl
index 1e48ca2..835fdf5 100755
--- a/util/support_scripts/SAM_coordSorted_fragment_Read_coverage_writer.pl
+++ b/util/support_scripts/SAM_coordSorted_fragment_Read_coverage_writer.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_coordSorted_fragment_coverage_writer2.pl b/util/support_scripts/SAM_coordSorted_fragment_coverage_writer2.pl
index 189df0c..dc7134e 100755
--- a/util/support_scripts/SAM_coordSorted_fragment_coverage_writer2.pl
+++ b/util/support_scripts/SAM_coordSorted_fragment_coverage_writer2.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_extract_properly_mapped_pairs.pl b/util/support_scripts/SAM_extract_properly_mapped_pairs.pl
index cc632f1..7350e0f 100755
--- a/util/support_scripts/SAM_extract_properly_mapped_pairs.pl
+++ b/util/support_scripts/SAM_extract_properly_mapped_pairs.pl
@@ -2,7 +2,7 @@
use strict;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_extract_uniquely_mapped_reads.pl b/util/support_scripts/SAM_extract_uniquely_mapped_reads.pl
index b191144..2ab7e96 100755
--- a/util/support_scripts/SAM_extract_uniquely_mapped_reads.pl
+++ b/util/support_scripts/SAM_extract_uniquely_mapped_reads.pl
@@ -2,7 +2,7 @@
use strict;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_filter_out_unmapped_reads.pl b/util/support_scripts/SAM_filter_out_unmapped_reads.pl
index fa728f5..2091dee 100755
--- a/util/support_scripts/SAM_filter_out_unmapped_reads.pl
+++ b/util/support_scripts/SAM_filter_out_unmapped_reads.pl
@@ -2,7 +2,7 @@
use strict;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_ordered_pair_jaccard.pl b/util/support_scripts/SAM_ordered_pair_jaccard.pl
index dd04bf8..4836926 100755
--- a/util/support_scripts/SAM_ordered_pair_jaccard.pl
+++ b/util/support_scripts/SAM_ordered_pair_jaccard.pl
@@ -84,7 +84,7 @@ if (@ARGV) {
}
-my $util_dir = "$FindBin::Bin";
+my $util_dir = "$FindBin::RealBin";
main: {
diff --git a/util/support_scripts/SAM_set_transcribed_orient_info.pl b/util/support_scripts/SAM_set_transcribed_orient_info.pl
index e06a8e1..87014ca 100755
--- a/util/support_scripts/SAM_set_transcribed_orient_info.pl
+++ b/util/support_scripts/SAM_set_transcribed_orient_info.pl
@@ -2,7 +2,7 @@
use strict;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_strand_separator.pl b/util/support_scripts/SAM_strand_separator.pl
index 18812fe..972f736 100755
--- a/util/support_scripts/SAM_strand_separator.pl
+++ b/util/support_scripts/SAM_strand_separator.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use Carp;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/SAM_to_frag_coords.pl b/util/support_scripts/SAM_to_frag_coords.pl
index ba31ebd..1c00cfc 100755
--- a/util/support_scripts/SAM_to_frag_coords.pl
+++ b/util/support_scripts/SAM_to_frag_coords.pl
@@ -8,7 +8,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/define_SAM_coverage_partitions2.pl b/util/support_scripts/define_SAM_coverage_partitions2.pl
index 9b81ff7..e2bfafd 100755
--- a/util/support_scripts/define_SAM_coverage_partitions2.pl
+++ b/util/support_scripts/define_SAM_coverage_partitions2.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use WigParser;
my $usage = "usage: $0 strand_coverage.wig strand[+-]\n\n";
diff --git a/util/support_scripts/define_coverage_partitions.pl b/util/support_scripts/define_coverage_partitions.pl
index bfc72f4..ae1300d 100755
--- a/util/support_scripts/define_coverage_partitions.pl
+++ b/util/support_scripts/define_coverage_partitions.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use WigParser;
my $usage = "usage: $0 strand_coverage.wig min_coverage strand[+-]\n\n";
diff --git a/util/support_scripts/extract_reads_per_partition.pl b/util/support_scripts/extract_reads_per_partition.pl
index a04a75e..c44992c 100755
--- a/util/support_scripts/extract_reads_per_partition.pl
+++ b/util/support_scripts/extract_reads_per_partition.pl
@@ -8,7 +8,7 @@ use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use File::Basename;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Nuc_translator;
use SAM_reader;
diff --git a/util/support_scripts/fastQ_to_fastA.pl b/util/support_scripts/fastQ_to_fastA.pl
index cd0e40c..13484dd 100755
--- a/util/support_scripts/fastQ_to_fastA.pl
+++ b/util/support_scripts/fastQ_to_fastA.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Nuc_translator;
use IO::Uncompress::Gunzip;
diff --git a/util/support_scripts/fasta_to_tab.pl b/util/support_scripts/fasta_to_tab.pl
index b483a9a..b04da09 100755
--- a/util/support_scripts/fasta_to_tab.pl
+++ b/util/support_scripts/fasta_to_tab.pl
@@ -3,7 +3,7 @@
use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 [multiFastaFile] [NO_FULL_HEADER_FLAG=0]\n\n";
diff --git a/util/support_scripts/fragment_coverage_writer.pl b/util/support_scripts/fragment_coverage_writer.pl
index 24c9e96..6cbabfc 100755
--- a/util/support_scripts/fragment_coverage_writer.pl
+++ b/util/support_scripts/fragment_coverage_writer.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/inchworm_transcript_splitter.pl b/util/support_scripts/inchworm_transcript_splitter.pl
index f6b6baf..5318d55 100755
--- a/util/support_scripts/inchworm_transcript_splitter.pl
+++ b/util/support_scripts/inchworm_transcript_splitter.pl
@@ -11,7 +11,7 @@ use Cwd;
$ENV{LC_ALL} = 'C';
-my $util_dir = "$FindBin::Bin/../../util/support_scripts";
+my $util_dir = "$FindBin::RealBin/../../util/support_scripts";
my $usage = <<_EOUSAGE_;
diff --git a/util/support_scripts/jaccard_fasta_clipper.pl b/util/support_scripts/jaccard_fasta_clipper.pl
index e306b64..aa4db3f 100755
--- a/util/support_scripts/jaccard_fasta_clipper.pl
+++ b/util/support_scripts/jaccard_fasta_clipper.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
my $usage = "usage: $0 transcripts.fasta jaccard_clips.wig\n\n";
diff --git a/util/support_scripts/jaccard_wig_clipper.pl b/util/support_scripts/jaccard_wig_clipper.pl
index 2465f18..7d8b428 100755
--- a/util/support_scripts/jaccard_wig_clipper.pl
+++ b/util/support_scripts/jaccard_wig_clipper.pl
@@ -7,7 +7,7 @@ use Carp;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use WigParser;
diff --git a/util/support_scripts/merge_left_right_nameSorted_SAMs.pl b/util/support_scripts/merge_left_right_nameSorted_SAMs.pl
index 1dda428..ba31d89 100755
--- a/util/support_scripts/merge_left_right_nameSorted_SAMs.pl
+++ b/util/support_scripts/merge_left_right_nameSorted_SAMs.pl
@@ -5,7 +5,7 @@ use warnings;
use Carp;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/nbkc_merge_left_right_stats.pl b/util/support_scripts/nbkc_merge_left_right_stats.pl
index 64d3c15..ac98131 100755
--- a/util/support_scripts/nbkc_merge_left_right_stats.pl
+++ b/util/support_scripts/nbkc_merge_left_right_stats.pl
@@ -49,6 +49,8 @@ main: {
print STDERR "-opening $left_stats_file\n";
if ($left_stats_file =~ /\.gz$/) {
open ($left_fh, "gunzip -c $left_stats_file | ") or die $!;
+ } elsif ($left_stats_file =~ /\.xz$/) {
+ open(${left_fh}, "xz -cd ${left_stats_file} | ") or die $!;
}
else {
open ($left_fh, $left_stats_file) or die $!;
@@ -57,6 +59,8 @@ main: {
print STDERR "-opening $right_stats_file\n";
if ($right_stats_file =~ /\.gz$/) {
open ($right_fh, "gunzip -c $right_stats_file | ") or die $!;
+ } elsif ($right_stats_file =~ /\.xz$/) {
+ open (${right_fh}, "xz -dc ${right_stats_file} | ") or die $!;
}
else {
open ($right_fh, $right_stats_file) or die $!;
diff --git a/util/support_scripts/ordered_fragment_coords_to_jaccard.pl b/util/support_scripts/ordered_fragment_coords_to_jaccard.pl
index f192dab..9b14222 100755
--- a/util/support_scripts/ordered_fragment_coords_to_jaccard.pl
+++ b/util/support_scripts/ordered_fragment_coords_to_jaccard.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use FindBin;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use SAM_reader;
use SAM_entry;
diff --git a/util/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl b/util/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl
index 402e3b0..880f96b 100755
--- a/util/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl
+++ b/util/support_scripts/prep_rnaseq_alignments_for_genome_assisted_assembly.pl
@@ -90,7 +90,7 @@ if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) {
die "Error, invalid --SS_lib_type, only F, R, FR, or RF are possible values";
}
-my $UTIL_DIR = "$FindBin::Bin/";
+my $UTIL_DIR = "$FindBin::RealBin/";
main: {
diff --git a/util/support_scripts/run_TMM_scale_matrix.pl b/util/support_scripts/run_TMM_scale_matrix.pl
index 58103ab..6fd326e 100755
--- a/util/support_scripts/run_TMM_scale_matrix.pl
+++ b/util/support_scripts/run_TMM_scale_matrix.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Data::Dumper;
my $usage = <<__EOUSAGE__;
@@ -67,7 +67,7 @@ sub run_TMM {
my $tmm_norm_script = "__tmp_runTMM.R";
open (my $ofh, ">$tmm_norm_script") or die "Error, cannot write to $tmm_norm_script";
- #print $ofh "source(\"$FindBin::Bin/R/edgeR_funcs.R\")\n";
+ #print $ofh "source(\"$FindBin::RealBin/R/edgeR_funcs.R\")\n";
print $ofh "library(edgeR)\n\n";
diff --git a/util/support_scripts/run_UpperQuartileNormalization_matrix.pl b/util/support_scripts/run_UpperQuartileNormalization_matrix.pl
index 729a41e..14a59f1 100755
--- a/util/support_scripts/run_UpperQuartileNormalization_matrix.pl
+++ b/util/support_scripts/run_UpperQuartileNormalization_matrix.pl
@@ -7,7 +7,7 @@ use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use FindBin;
use File::Basename;
-use lib ("$FindBin::Bin/../../PerlLib");
+use lib ("$FindBin::RealBin/../../PerlLib");
use Data::Dumper;
@@ -76,7 +76,7 @@ sub upper_quartile_normalize {
my $tmm_norm_script = "__tmp_upper_quart_norm.R";
open (my $ofh, ">$tmm_norm_script") or die "Error, cannot write to $tmm_norm_script";
- #print $ofh "source(\"$FindBin::Bin/R/edgeR_funcs.R\")\n";
+ #print $ofh "source(\"$FindBin::RealBin/R/edgeR_funcs.R\")\n";
print $ofh "data = read.table(\"$matrix_file\", header=T, row.names=1, com='')\n";
print $ofh "get_upper_quartile = function(vec) {\n"
@@ -90,11 +90,11 @@ sub upper_quartile_normalize {
print $ofh "mean_upp_quart = mean(upp_quartiles)\n";
print $ofh "m = m * mean_upp_quart\n";
- print $ofh "write.table(m, file=\"$matrix_file.upper_quartile_norm.matrix\", quote=F, sep=\"\\t\")\n";
+ print $ofh "write.table(m, quote=F, sep=\"\\t\")\n";
close $ofh;
- &process_cmd("R --vanilla -q < $tmm_norm_script 1>&2 ");
+ &process_cmd("R --vanilla -q --slave < $tmm_norm_script ");
return;
}
diff --git a/util/support_scripts/salmon_trans_to_gene_results.pl b/util/support_scripts/salmon_trans_to_gene_results.pl
new file mode 100755
index 0000000..b776d38
--- /dev/null
+++ b/util/support_scripts/salmon_trans_to_gene_results.pl
@@ -0,0 +1,165 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Data::Dumper;
+
+my $usage = "\n\nusage: $0 quant.sf gene_to_trans_map_file.txt\n\n\n";
+
+my $quant_sf = $ARGV[0] or die $usage;
+my $gene_to_trans_map_file = $ARGV[1] or die $usage;
+
+
+main: {
+
+ my %trans_to_gene_info;
+ {
+ open (my $fh, $gene_to_trans_map_file) or die "Error, cannot open file $gene_to_trans_map_file";
+ while (<$fh>) {
+ unless (/\w/) { next; }
+ chomp;
+ my ($gene, $trans, @rest) = split(/\s+/);
+ unless ($gene && $trans) {
+ die "Error, cannot extract gene & trans relationship from line $_ of file $gene_to_trans_map_file";
+ }
+ $trans_to_gene_info{$trans} = $gene;
+ }
+ close $fh;
+ }
+
+
+ open (my $fh, $quant_sf) or die "Error, cannot open file $quant_sf";
+ my $header = <$fh>;
+ chomp $header;
+ my %field_index;
+ my @fields = split(/\t/, $header);
+ {
+
+ for (my $i = 0; $i <= $#fields; $i++) {
+ my $field = $fields[$i];
+ $field_index{$field} = $i;
+ }
+ }
+
+
+ my %gene_data;
+ while (<$fh>) {
+ chomp;
+
+ # quant.sf format:
+ #
+ #Name Length EffectiveLength TPM NumReads
+ #TRINITY_DN10_c0_g1_i1 334 67.2849 3125.31 7
+ #TRINITY_DN11_c0_g1_i1 319 55.1277 0 0
+ #TRINITY_DN12_c0_g1_i1 244 244 1231.18 10
+ #TRINITY_DN17_c0_g1_i1 229 229 393.549 3
+ #TRINITY_DN18_c0_g1_i1 633 360.371 593.619 7.12107
+
+ my @x = split(/\t/);
+
+ my $trans_id = $x[ $field_index{Name} ];
+ my $tpm = $x[ $field_index{TPM} ];
+ my $length = $x[ $field_index{Length} ];
+ my $eff_length = $x[ $field_index{EffectiveLength} ];
+ my $est_counts = $x[ $field_index{NumReads} ];
+
+ my $gene = $trans_to_gene_info{$trans_id} or die "Error, cannot find gene identifier for transcript [$trans_id] ";
+
+ push (@{$gene_data{$gene}}, { Name => $trans_id,
+ TPM => $tpm,
+ Length => $length,
+ EffectiveLength => $eff_length,
+ NumReads => $est_counts,
+ });
+
+
+ }
+ close $fh;
+
+
+ ## Output gene summaries:
+
+ print $header . "\n";
+
+ foreach my $gene (keys %gene_data) {
+ my @trans_structs = @{$gene_data{$gene}};
+
+ my @trans_ids;
+ my $sum_counts = 0;
+ my $sum_tpm = 0;
+
+ my $counts_per_len_sum = 0;
+ my $counts_per_eff_len_sum = 0;
+
+ my $sum_lengths = 0;
+ my $sum_eff_lengths = 0;
+
+ my $num_trans = scalar(@trans_structs);
+
+
+ foreach my $struct (@trans_structs) {
+
+ #print Dumper($struct);
+
+ my $trans_id = $struct->{Name};
+ my $tpm = $struct->{TPM};
+ my $length = $struct->{Length};
+
+ my $eff_length = $struct->{EffectiveLength};
+ my $est_counts = $struct->{NumReads};
+
+ unless ($eff_length > 0) {
+ $eff_length = 1; # cannot have zero length feature!
+ }
+
+ unless ($length > 0 && $eff_length > 0) {
+ die "Error, length: $length, eff_length: $eff_length" . Dumper($struct);
+ }
+
+ $sum_lengths += $length;
+ $sum_eff_lengths += $eff_length;
+
+ $counts_per_len_sum += $est_counts/$length;
+
+ $counts_per_eff_len_sum += $est_counts/$eff_length;
+
+ $sum_counts += $est_counts;
+ $sum_tpm += $tpm;
+ }
+
+ my $gene_length = $sum_lengths / $num_trans;
+ my $gene_eff_length = $sum_eff_lengths / $num_trans;
+ if ($sum_counts) {
+ # set lengths as weighted by expression of isoforms.
+ eval {
+ $gene_length = $sum_counts / $counts_per_len_sum;
+ $gene_eff_length = $sum_counts / $counts_per_eff_len_sum;
+ };
+ if ($@) {
+ print STDERR "$@\n" . Dumper(\@trans_structs);
+ die;
+ }
+ }
+
+ my %gene_info = ( Name => $gene,
+ TPM => sprintf("%.2f", $sum_tpm),
+ Length => sprintf("%.2f", $gene_length),
+ EffectiveLength => sprintf("%.2f", $gene_eff_length),
+ NumReads => sprintf("%.2f", $sum_counts),
+
+ );
+
+ my @vals;
+ foreach my $field (@fields) {
+ my $result = $gene_info{$field};
+ unless (defined $result) {
+ $result = "NA";
+ }
+ push (@vals, $result);
+ }
+ print join("\t", @vals) . "\n";
+ }
+
+
+ exit(0);
+}
diff --git a/util/support_scripts/tests/sample_data_tests.py b/util/support_scripts/tests/sample_data_tests.py
new file mode 100644
index 0000000..1989a61
--- /dev/null
+++ b/util/support_scripts/tests/sample_data_tests.py
@@ -0,0 +1,56 @@
+from Bio import SeqIO
+import unittest
+import os
+
+class TestTrinitySampleData(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ cls.sampledata_dir = os.environ["TRINITY_SAMPLEDATA"]
+
+ def test_genome_guided(self):
+ seq_count = self.count_sequences('test_GenomeGuidedTrinity', 'test_GG_use_bam_trinity_outdir',
+ 'Trinity-GG.fasta')
+ self.assertTrue(50 <= seq_count <= 60, msg='Found %s sequences' % seq_count)
+
+ def test_genome_guided_with_jaccard_clipping(self):
+ seq_count = self.count_sequences('test_GenomeGuidedTrinity', 'test_Schizo_trinityGG_jaccard_RF_outdir',
+ 'Trinity-GG.fasta')
+ self.assertTrue(60 <= seq_count <= 80, msg='Found %s sequences' % seq_count)
+
+ def test_paired_end_normalization(self):
+ seq_count = self.count_sequences('test_InSilicoReadNormalization',
+ 'reads.left.fq.gz.normalized_K25_C5_pctSD200.fq')
+ self.assertTrue(35 <= seq_count <= 50, msg='Found %s sequences' % seq_count)
+ seq_count = self.count_sequences('test_InSilicoReadNormalization',
+ 'reads.right.fq.gz.normalized_K25_C5_pctSD200.fq')
+ self.assertTrue(30 <= seq_count <= 40, msg='Found %s sequences' % seq_count)
+ seq_count = self.count_sequences('test_InSilicoReadNormalization',
+ 'reads.single.fq.normalized_K25_C5_pctSD200.fq')
+ self.assertTrue(60 <= seq_count <= 65, msg='Found %s sequences' % seq_count)
+
+ def test_trinity_assembly(self):
+ seq_count = self.count_sequences('test_Trinity_Assembly', 'trinity_out_dir', 'Trinity.fasta')
+ self.assertTrue(100 <= seq_count <= 120, msg='Found %s sequences' % seq_count)
+
+ def test_DE_analysis_EdgeR(self):
+ check_file = os.path.join(self.sampledata_dir, 'test_DE_analysis', 'edgeR_outdir', 'numDE_feature_counts.P0.001_C2.matrix')
+ self.assertTrue(os.path.isfile(check_file))
+
+ def test_align_and_estimate_abundance(self):
+ check_file = os.path.join(self.sampledata_dir, 'test_align_and_estimate_abundance', 'RSEM-gene.counts.matrix')
+ self.assertTrue(os.path.isfile(check_file))
+
+ def test_full_edgeR_pipeline(self):
+ check_file = os.path.join(self.sampledata_dir, 'test_full_edgeR_pipeline', 'read_content_analysis', 'read_content_analysis.nameSorted.bam')
+ self.assertTrue(os.path.isfile(check_file))
+
+
+### Helper methods
+ def count_sequences(self, *paths):
+ gg = os.path.join(self.sampledata_dir, *paths)
+ handle = open(gg, "rU")
+ seq_count = len([x for x in SeqIO.parse(handle, "fasta")])
+ handle.close()
+ return seq_count
+
diff --git a/util/support_scripts/tests/test.py b/util/support_scripts/tests/test.py
deleted file mode 100755
index 06fbb6b..0000000
--- a/util/support_scripts/tests/test.py
+++ /dev/null
@@ -1,187 +0,0 @@
-import subprocess
-from Bio import SeqIO
-import unittest
-import shutil
-import os
-import time
-# dfsfd
-# Prereqs:
-# module load bowtie/0.12.8
-# module load java
-# module load samtools
-# Trinity
-# Copy the .gz files in sample_data/test_Trinity_Assembly to current directory
-# Run using nosetests
-MEM_FLAG = "--max_memory 2G"
-TEMP_FILES = ['both.fa', 'inchworm.K25.L25.fa', 'jellyfish.kmers.fa']
-
-
-class TestTrinity(unittest.TestCase):
-
- @classmethod
- def setUpClass(cls):
- try:
- os.remove('coverage.log')
- except:
- pass
-
- def tearDown(self):
- shutil.rmtree('trinity_out_dir', True)
-
- def test_sample_data_seq_count(self):
- self.trinity(
- "Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup" % MEM_FLAG)
- handle = open("trinity_out_dir/Trinity.fasta", "rU")
- seq_count = len([x for x in SeqIO.parse(handle, "fasta")])
- handle.close()
- self.assertTrue(75 <= seq_count <= 100, msg='Found %s sequences' % seq_count)
-
- def test_sample_data_trimmed_and_normalized(self):
- self.trinity(
- "Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --trimmomatic --normalize_reads --no_cleanup" % MEM_FLAG)
- handle = open("trinity_out_dir/Trinity.fasta", "rU")
- seq_count = len([x for x in SeqIO.parse(handle, "fasta")])
- handle.close()
- self.assertTrue(75 <= seq_count <= 85, msg='Found %s sequences' % seq_count)
-
- def test_no_cleanup_leaves_temp_files(self):
- self.trinity(
- "Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup" % MEM_FLAG)
- for f in TEMP_FILES:
- self.assertTrue(os.path.exists("trinity_out_dir/%s" % f), msg="%s not found with no_cleanup" % f)
-
- def test_cleanup_removes_temp_files(self):
- self.trinity(
- "Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --full_cleanup" % MEM_FLAG)
- time.sleep(5) # Make sure the system has time to recognize the directory is gone
- self.assertFalse(os.path.exists("trinity_out_dir"), msg="Did full_cleanup but trinity_out_dir exists")
- self.assertTrue(os.path.isfile("trinity_out_dir.Trinity.fasta"),
- msg="Did full_cleanup but output file not created")
-
- def test_single_end_with_rf_lib_type_error(self):
- try:
- subprocess.call("Trinity --seqType fq --single reads.left.fq --SS_lib_type RF", shell=True)
- except subprocess.CalledProcessError as e:
- self.assertTrue("Error, with --single reads, the --SS_lib_type can be 'F' or 'R' only." in e.output)
-
- def test_single_end_with_fq(self):
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F" % MEM_FLAG)
-
- def test_no_run_chrysalis(self):
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_run_chrysalis" % MEM_FLAG)
- self.assertEquals(0, len(os.listdir('trinity_out_dir/chrysalis')))
-
- def test_no_run_inchworm(self):
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_run_inchworm" % MEM_FLAG)
- self.assertFalse(os.path.isfile("trinity_out_dir/inchworm.K25.L25.fa.finished"),
- msg="Inchworm appears to have run although no_run_inchworm was specified")
- self.assertTrue(os.path.isfile("trinity_out_dir/jellyfish.kmers.fa"),
- msg="jellyfish.kmers.fa was not created")
-
- def test_no_bowtie(self):
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_bowtie" % MEM_FLAG)
- self.assertFalse(os.path.isfile("trinity_out_dir/bowtie.nameSorted.bam"),
- msg="Bowtie appears to have run although no_bowtie was specified")
-
- def test_no_distributed_trinity_exec(self):
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_distributed_trinity_exec" % MEM_FLAG)
- self.assertTrue(os.path.isfile("trinity_out_dir/inchworm.K25.L25.fa.finished"),
- msg="Inchworm did not appear to run with no_distributed_trinity_exec flag")
- self.assertTrue(os.path.isfile("trinity_out_dir/jellyfish.kmers.fa.histo"),
- msg="Jellyfish did not appear to run with no_distributed_trinity_exec flag")
- self.assertFalse(os.path.isfile("trinity_out_dir/Trinity.fasta"),
- msg="Trinity.fasta created with no_distributed_trinity_exec")
-
- def test_single_end_with_fa_and_reverse(self):
- self.fq2fa()
- self.trinity("Trinity %s --seqType fa --single reads.fa --SS_lib_type R" % MEM_FLAG)
-
- def test_output_correctly_changes_dir(self):
- shutil.rmtree('trinity_test', True)
- self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --output trinity_test" % MEM_FLAG)
- self.assertTrue(os.path.exists("trinity_test"), msg="Changed output directory but it was not created")
- shutil.rmtree('trinity_test', True)
-
- def test_scaffold_iworm_contigs(self):
- os.environ['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] + ':../src/trinity-plugins/htslib'
- exe = "../src/trinity-plugins/scaffold_iworm_contigs/scaffold_iworm_contigs"
- bamfile = "iworm.bowtie.nameSorted.bam"
- ifile = "inchworm.K25.L25.fa"
- f = subprocess.check_output([exe, bamfile, ifile]).split('\n')
- expected_result = [['a340;25', '339', 'a9;40', '8', '41'],
- ['a719;8', '718', 'a832;15', '831', '33'],
- ['a1;43', '0', 'a346;23', '345', '31'],
- ['a346;23', '345', 'a9;40', '8', '26'],
- ['a339;142', '338', 'a37;14', '36', '25'],
- ['a3;61', '2', 'a432;9', '431', '23'],
- ['a345;34', '344', 'a40;12', '39', '21'],
- ['a354;96', '353', 'a368;13', '367', '18'],
- ['a689;4', '688', 'a774;6', '773', '13']]
-
- actual_result = [line.split('\t') for line in f if line]
- self.assertEquals(expected_result, actual_result[0:9])
- actual_lengths = [int(s[4].strip()) for s in actual_result]
- expected_order = list(reversed(sorted(actual_lengths)))
- self.assertEquals(expected_order, actual_lengths)
-
- def test_Inchworm_handles_compressed_files(self):
- self.trinity('Trinity %s --seqType fq --single reads.left.fq.gz --SS_lib_type F --no_run_chrysalis' % MEM_FLAG);
- num_lines = sum(1 for line in open('trinity_out_dir/inchworm.K25.L25.fa'))
- self.assertTrue(2875 <= num_lines <= 3100, msg='Found %s lines' % num_lines)
-
-### information tests
- def test_cite(self):
- expected = '\n\n* Trinity:\nFull-length transcriptome assembly from RNA-Seq data without a reference genome.\nGrabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L,\nRaychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F,\nBirren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A.\nNature Biotechnology 29, 644\xe2\x80\x93652 (2011)\nPaper: http://www.nature.com/nbt/journal/v29/n7/full/nbt.1883.html\nCode: http://trinityrna [...]
- cite = subprocess.check_output(["Trinity", "--cite"])
- self.assertEqual(expected, cite)
-
- def test_version(self):
- try:
- subprocess.check_output(["Trinity", "--version"])
- self.fail("Version returned 0 errorcode!")
- except subprocess.CalledProcessError as e:
- self.assertTrue('Trinity version: __TRINITY_VERSION_TAG__' in e.output)
- self.assertTrue('using Trinity devel version. Note, latest production release is: v2.0.6' in e.output)
-
-
- def test_show_full_usage_info(self):
- try:
- subprocess.check_output(["Trinity", "--show_full_usage_info"])
- except subprocess.CalledProcessError as e:
- self.assertTrue("Inchworm and K-mer counting-related options" in e.output)
- self.assertTrue("Chrysalis-related options" in e.output)
- self.assertTrue("Butterfly-related options" in e.output)
- self.assertTrue("Quality Trimming Options" in e.output)
- self.assertTrue("In silico Read Normalization Options" in e.output)
-
-### Invalid command line tests
- def test_no_JM_specified_error(self):
- error = self.get_error("Trinity --seqType fq --single reads.left.fq --SS_lib_type F")
- self.assertTrue("Error, must specify max memory for jellyfish to use, eg. --max_memory 10G" in error)
-
- def test_invalid_option_error(self):
- error = self.get_error("Trinity --squidward")
- self.assertTrue("ERROR, don't recognize parameter: --squidward" in error)
-
- def test_set_no_cleanup_and_full_cleanup_error(self):
- error = self.get_error("Trinity --no_cleanup --full_cleanup")
- self.assertTrue("cannot set --no_cleanup and --full_cleanup as they contradict" in error)
-
-
-### Helper methods
- def trinity(self, cmdline):
- with open("coverage.log", 'a') as file_out:
- subprocess.call(cmdline,shell=True, stdout=file_out)
-
- def get_error(self, cmd):
- try:
- subprocess.check_output(cmd.split(' '))
- except subprocess.CalledProcessError as e:
- return e.output
-
- def fq2fa(self):
- handle = open("reads.left.fq", "rU")
- records = [x for x in SeqIO.parse(handle, "fastq")]
- handle.close()
- SeqIO.write(records, "reads.fa", "fasta")
-
diff --git a/util/support_scripts/tests/test_prep.py b/util/support_scripts/tests/test_prep.py
index 261ca09..c5d07ec 100644
--- a/util/support_scripts/tests/test_prep.py
+++ b/util/support_scripts/tests/test_prep.py
@@ -11,7 +11,7 @@ from Bio import SeqIO
# clear, gzip, bzip
-class TestTrinity(unittest.TestCase):
+class TestTrinityPrepFlag(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -26,75 +26,79 @@ class TestTrinity(unittest.TestCase):
def test_fastq(self):
self.trinity("left1.fq", "fq")
- self.assertEquals(30575, self.count_seqs())
+ self.assertEquals(30575, self.count_seqs(), "Unexpected sequence count")
def test_fastq_gz(self):
self.trinity("left1.fq.gz", "fq")
- self.assertEquals(30575, self.count_seqs())
+ self.assertEquals(30575, self.count_seqs(), "Unexpected sequence count")
def test_fastq_bz2(self):
self.trinity("left1.fq.bz2", "fq")
- self.assertEquals(30575, self.count_seqs())
+ self.assertEquals(30575, self.count_seqs(), "Unexpected sequence count")
def test_fastq_multiple_files_single(self):
self.trinity("left1.fq,left1.fq.gz", "fq")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fastq_multiple_files_single_bz2(self):
self.trinity("left1.fq.bz2,left1.fq.gz", "fq")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fastq_multiple_files_single_reverse(self):
self.trinity("left1.fq,left1.fq.gz", "fq", True)
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fasta(self):
self.trinity("left1.fa")
- self.assertEquals(30575, self.count_seqs())
+ self.assertEquals(30575, self.count_seqs(), "Unexpected sequence count")
def test_fasta_gz(self):
self.trinity("left1.fa.gz")
- self.assertEquals(30575, self.count_seqs())
+ self.assertEquals(30575, self.count_seqs(), "Unexpected sequence count")
def test_fasta_multiple_files_single(self):
self.trinity("left1.fa,left1.fa.gz")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fasta_multiple_files_single_reverse(self):
self.trinity("left1.fa,left1.fa.gz", reverse=True)
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_paired_fastq(self):
self.trinity("left1.fq", "fq", morefiles="right1.fq")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_paired_fastq_gz(self):
self.trinity("left1.fq.gz", "fq", morefiles="right1.fq.gz")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fastq_multiple_files_paired(self):
self.trinity("left1.fq,left1.fq.gz", "fq", morefiles="right1.fq,right1.fq.gz")
- self.assertEquals(122300, self.count_seqs())
+ self.assertEquals(122300, self.count_seqs(), "Unexpected sequence count")
def test_fastq_multiple_files_paired_reverse(self):
self.trinity("left1.fq,left1.fq.gz", "fq", reverse=True, morefiles="right1.fq,right1.fq.gz")
- self.assertEquals(122300, self.count_seqs())
+ self.assertEquals(122300, self.count_seqs(), "Unexpected sequence count")
def test_fasta_paired(self):
self.trinity("left1.fa", morefiles="right1.fa")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
+
+ def test_paired_sequences_have_1_or_2_extension(self):
+ self.trinity("sra_test.fq", morefiles="sra_test2.fq", seqtype='fq')
+ self.assertEquals(0, self.count_bad_endings(), "Found sequences with bad endings")
def test_fasta_gz_paired(self):
self.trinity("left1.fa.gz", morefiles="right1.fa.gz")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fasta_multiple_files_paired(self):
self.trinity("left1.fa,left1.fa.gz", morefiles="right1.fa,right1.fa.gz")
- self.assertEquals(61150, self.count_seqs())
+ self.assertEquals(61150, self.count_seqs(), "Unexpected sequence count")
def test_fasta_multiple_files_paired(self):
self.trinity("left1.fa,left1.fa.gz", morefiles="right1.fa,right1.fa.gz", reverse=True)
- self.assertEquals(122300, self.count_seqs())
+ self.assertEquals(122300, self.count_seqs(), "Unexpected sequence count")
def trinity(self, files, seqtype='fa', reverse=False, morefiles=None):
if morefiles:
@@ -105,7 +109,7 @@ class TestTrinity(unittest.TestCase):
cmdline = tpl % (files, seqtype)
if reverse:
cmdline += " --SS_lib_type " + ('RF' if morefiles else 'R')
- print cmdline
+ print "Command line:", cmdline
with open("coverage.log", 'a') as file_out:
subprocess.call(cmdline,shell=True, stdout=file_out)
@@ -120,3 +124,14 @@ class TestTrinity(unittest.TestCase):
handle.close()
return seq_count
+ def count_bad_endings(self):
+ f = "trinity_out_dir/single.fa"
+ if os.path.isfile(f):
+ handle = open(f, "rU")
+ else:
+ handle = open("trinity_out_dir/both.fa", "rU")
+
+ seq_count = len(list(x for x in SeqIO.parse(handle, "fasta") if not (x.id.endswith('/1') or x.id.endswith('/2'))))
+ handle.close()
+ return seq_count
+
diff --git a/util/support_scripts/tests/tests.py b/util/support_scripts/tests/tests.py
index 2137524..819dd57 100644
--- a/util/support_scripts/tests/tests.py
+++ b/util/support_scripts/tests/tests.py
@@ -29,28 +29,32 @@ class TestTrinity(unittest.TestCase):
shutil.rmtree('trinity_out_dir', True)
def test_sample_data_seq_count(self):
+ print "When assembling the sample data, the number of sequences assembled should be between 75 and 100"
self.trinity(
"Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup" % MEM_FLAG)
handle = open("trinity_out_dir/Trinity.fasta", "rU")
seq_count = len([x for x in SeqIO.parse(handle, "fasta")])
handle.close()
- self.assertTrue(75 <= seq_count <= 100, msg='Found %s sequences' % seq_count)
+ self.assertTrue(85 <= seq_count <= 110, msg='Found %s sequences' % seq_count)
def test_sample_data_trimmed_and_normalized(self):
+ print "When assembling the sample data with the --trimmomatic --normalize_reads flags, the number of sequences assembled should be between 75 and 85"
self.trinity(
"Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --trimmomatic --normalize_reads --no_cleanup" % MEM_FLAG)
handle = open("trinity_out_dir/Trinity.fasta", "rU")
seq_count = len([x for x in SeqIO.parse(handle, "fasta")])
handle.close()
- self.assertTrue(75 <= seq_count <= 85, msg='Found %s sequences' % seq_count)
+ self.assertTrue(85 <= seq_count <= 100, msg='Found %s sequences' % seq_count)
def test_no_cleanup_leaves_temp_files(self):
+ print "The --no_cleanup flag should ensure that the output directory is left behind"
self.trinity(
"Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --no_cleanup" % MEM_FLAG)
for f in TEMP_FILES:
self.assertTrue(os.path.exists("trinity_out_dir/%s" % f), msg="%s not found with no_cleanup" % f)
def test_cleanup_removes_temp_files(self):
+ print "The --full_cleanup flag should ensure that the output directory is gone but the output file remains"
self.trinity(
"Trinity --seqType fq %s --left reads.left.fq.gz,reads2.left.fq.gz --right reads.right.fq.gz,reads2.right.fq.gz --SS_lib_type RF --CPU 4 --full_cleanup" % MEM_FLAG)
time.sleep(5) # Make sure the system has time to recognize the directory is gone
@@ -59,19 +63,23 @@ class TestTrinity(unittest.TestCase):
msg="Did full_cleanup but output file not created")
def test_single_end_with_rf_lib_type_error(self):
+ print "Single reads with an SS_lib_type of RF should result in an error"
try:
subprocess.call("Trinity --seqType fq --single reads.left.fq --SS_lib_type RF", shell=True)
except subprocess.CalledProcessError as e:
self.assertTrue("Error, with --single reads, the --SS_lib_type can be 'F' or 'R' only." in e.output)
def test_single_end_with_fq(self):
+ print "Single reads with FQ file should succeed"
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F" % MEM_FLAG)
def test_no_run_chrysalis(self):
+ print "The --no_run_chrysalis flag should result in no chrysalis subdirectory in the output directory"
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_run_chrysalis" % MEM_FLAG)
self.assertEquals(0, len(os.listdir('trinity_out_dir/chrysalis')))
def test_no_run_inchworm(self):
+ print "The --no_run_inchworm flag should result in no inchworm.finished file"
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_run_inchworm" % MEM_FLAG)
self.assertFalse(os.path.isfile("trinity_out_dir/inchworm.K25.L25.fa.finished"),
msg="Inchworm appears to have run although no_run_inchworm was specified")
@@ -79,11 +87,13 @@ class TestTrinity(unittest.TestCase):
msg="jellyfish.kmers.fa was not created")
def test_no_bowtie(self):
+ print "The --no_bowtie flag should result in no bowtie.nameSorted.bam file"
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_bowtie" % MEM_FLAG)
self.assertFalse(os.path.isfile("trinity_out_dir/bowtie.nameSorted.bam"),
msg="Bowtie appears to have run although no_bowtie was specified")
def test_no_distributed_trinity_exec(self):
+ print "The --no_distributed_trinity_exec flag should run Jellyfish but not create an output file"
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --no_distributed_trinity_exec" % MEM_FLAG)
self.assertTrue(os.path.isfile("trinity_out_dir/inchworm.K25.L25.fa.finished"),
msg="Inchworm did not appear to run with no_distributed_trinity_exec flag")
@@ -93,16 +103,19 @@ class TestTrinity(unittest.TestCase):
msg="Trinity.fasta created with no_distributed_trinity_exec")
def test_single_end_with_fa_and_reverse(self):
+ print "The --no_distributed_trinity_exec flag should run Jellyfish but not create an output file"
self.fq2fa()
self.trinity("Trinity %s --seqType fa --single reads.fa --SS_lib_type R" % MEM_FLAG)
def test_output_correctly_changes_dir(self):
+ print "The --output flag should change the output directory"
shutil.rmtree('trinity_test', True)
self.trinity("Trinity %s --seqType fq --single reads.left.fq --SS_lib_type F --output trinity_test" % MEM_FLAG)
self.assertTrue(os.path.exists("trinity_test"), msg="Changed output directory but it was not created")
shutil.rmtree('trinity_test', True)
def test_scaffold_iworm_contigs(self):
+ print "scaffold_iworm_contigs works as expected"
os.environ['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] + ':../src/trinity-plugins/htslib'
exe = "../src/trinity-plugins/scaffold_iworm_contigs/scaffold_iworm_contigs"
bamfile = "iworm.bowtie.nameSorted.bam"
@@ -125,26 +138,30 @@ class TestTrinity(unittest.TestCase):
self.assertEquals(expected_order, actual_lengths)
def test_Inchworm_handles_compressed_files(self):
+ print "A compressed single file should be handlred correctly by Inchworm"
self.trinity('Trinity %s --seqType fq --single reads.left.fq.gz --SS_lib_type F --no_run_chrysalis' % MEM_FLAG);
num_lines = sum(1 for line in open('trinity_out_dir/inchworm.K25.L25.fa'))
- self.assertTrue(2900 <= num_lines <= 3100, msg='Found %s lines' % num_lines)
+ self.assertTrue(2850 <= num_lines <= 3100, msg='Found %s lines' % num_lines)
### information tests
def test_cite(self):
+ print "Cite flag should return citing information"
expected = '\n\n* Trinity:\nFull-length transcriptome assembly from RNA-Seq data without a reference genome.\nGrabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L,\nRaychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F,\nBirren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A.\nNature Biotechnology 29, 644\xe2\x80\x93652 (2011)\nPaper: http://www.nature.com/nbt/journal/v29/n7/full/nbt.1883.html\nCode: http://trinityrna [...]
cite = subprocess.check_output(["Trinity", "--cite"])
self.assertEqual(expected, cite)
def test_version(self):
+ print "Version flag should return version information"
try:
subprocess.check_output(["Trinity", "--version"])
self.fail("Version returned 0 errorcode!")
except subprocess.CalledProcessError as e:
self.assertTrue('Trinity version: __TRINITY_VERSION_TAG__' in e.output)
- self.assertTrue('using Trinity devel version. Note, latest production release is: v2.0.6' in e.output)
+ self.assertTrue('using Trinity devel version. Note, latest production release is: v2.1.1' in e.output)
def test_show_full_usage_info(self):
+ print "show_full_usage_info flag has several option sections"
try:
subprocess.check_output(["Trinity", "--show_full_usage_info"])
except subprocess.CalledProcessError as e:
@@ -156,22 +173,29 @@ class TestTrinity(unittest.TestCase):
### Invalid command line tests
def test_no_JM_specified_error(self):
+ print "max_memory flag is required"
error = self.get_error("Trinity --seqType fq --single reads.left.fq --SS_lib_type F")
self.assertTrue("Error, must specify max memory for jellyfish to use, eg. --max_memory 10G" in error)
def test_invalid_option_error(self):
+ print "Invalid options result in an error"
error = self.get_error("Trinity --squidward")
- self.assertTrue("Error, do not understand options: --squidward" in error)
+ self.assertTrue("ERROR, don't recognize parameter: --squidward" in error)
def test_set_no_cleanup_and_full_cleanup_error(self):
+ print "Setting no_cleanup and full_cleanup together results in an error"
error = self.get_error("Trinity --no_cleanup --full_cleanup")
self.assertTrue("cannot set --no_cleanup and --full_cleanup as they contradict" in error)
### Helper methods
def trinity(self, cmdline):
+ print "Command line:", cmdline
with open("coverage.log", 'a') as file_out:
+ file_out.write("COMMAND: %s\n" % cmdline)
+ file_out.flush()
subprocess.call(cmdline,shell=True, stdout=file_out)
+ file_out.write("TEST COMPLETE\n")
def get_error(self, cmd):
try:
diff --git a/util/support_scripts/write_partitioned_trinity_cmds.pl b/util/support_scripts/write_partitioned_trinity_cmds.pl
index 68bf2aa..4dc39cd 100755
--- a/util/support_scripts/write_partitioned_trinity_cmds.pl
+++ b/util/support_scripts/write_partitioned_trinity_cmds.pl
@@ -71,7 +71,7 @@ while (<$fh>) {
my $file = pop @x;
- my $cmd = "$FindBin::Bin/../../Trinity --single \"$file\" --output \"$file.out\" $trin_args ";
+ my $cmd = "$FindBin::RealBin/../../Trinity --single \"$file\" --output \"$file.out\" $trin_args ";
print "$cmd\n";
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/trinityrnaseq.git
More information about the debian-med-commit
mailing list